From 9a8e3e0dd010d130bc009077fea6395ddc414440 Mon Sep 17 00:00:00 2001 From: Joel Sherrill Date: Thu, 8 Mar 2012 08:17:55 -0600 Subject: Revert move of contrib until more sorted out --- contrib/altq/rtems/freebsd/altq/altq.h | 204 + contrib/altq/rtems/freebsd/altq/altq_cbq.c | 1187 +++ contrib/altq/rtems/freebsd/altq/altq_cbq.h | 221 + contrib/altq/rtems/freebsd/altq/altq_cdnr.c | 1393 ++++ contrib/altq/rtems/freebsd/altq/altq_cdnr.h | 335 + contrib/altq/rtems/freebsd/altq/altq_classq.h | 206 + contrib/altq/rtems/freebsd/altq/altq_hfsc.c | 2279 ++++++ contrib/altq/rtems/freebsd/altq/altq_hfsc.h | 320 + contrib/altq/rtems/freebsd/altq/altq_priq.c | 1055 +++ contrib/altq/rtems/freebsd/altq/altq_priq.h | 170 + contrib/altq/rtems/freebsd/altq/altq_red.c | 1503 ++++ contrib/altq/rtems/freebsd/altq/altq_red.h | 198 + contrib/altq/rtems/freebsd/altq/altq_rio.c | 855 +++ contrib/altq/rtems/freebsd/altq/altq_rio.h | 144 + contrib/altq/rtems/freebsd/altq/altq_rmclass.c | 1843 +++++ contrib/altq/rtems/freebsd/altq/altq_rmclass.h | 266 + .../altq/rtems/freebsd/altq/altq_rmclass_debug.h | 112 + contrib/altq/rtems/freebsd/altq/altq_subr.c | 2032 +++++ contrib/altq/rtems/freebsd/altq/altq_var.h | 265 + contrib/altq/rtems/freebsd/altq/altqconf.h | 29 + contrib/altq/rtems/freebsd/altq/if_altq.h | 191 + contrib/pf/rtems/freebsd/net/if_pflog.c | 438 ++ contrib/pf/rtems/freebsd/net/if_pflog.h | 103 + contrib/pf/rtems/freebsd/net/if_pfsync.c | 2331 ++++++ contrib/pf/rtems/freebsd/net/if_pfsync.h | 375 + contrib/pf/rtems/freebsd/net/pf.c | 7771 ++++++++++++++++++++ contrib/pf/rtems/freebsd/net/pf_if.c | 950 +++ contrib/pf/rtems/freebsd/net/pf_ioctl.c | 3896 ++++++++++ contrib/pf/rtems/freebsd/net/pf_mtag.h | 82 + contrib/pf/rtems/freebsd/net/pf_norm.c | 2062 ++++++ contrib/pf/rtems/freebsd/net/pf_osfp.c | 640 ++ contrib/pf/rtems/freebsd/net/pf_ruleset.c | 433 ++ contrib/pf/rtems/freebsd/net/pf_subr.c | 170 + contrib/pf/rtems/freebsd/net/pf_table.c | 2363 ++++++ contrib/pf/rtems/freebsd/net/pfvar.h | 1866 +++++ contrib/pf/rtems/freebsd/netinet/in4_cksum.c | 122 + 36 files changed, 38410 insertions(+) create mode 100644 contrib/altq/rtems/freebsd/altq/altq.h create mode 100644 contrib/altq/rtems/freebsd/altq/altq_cbq.c create mode 100644 contrib/altq/rtems/freebsd/altq/altq_cbq.h create mode 100644 contrib/altq/rtems/freebsd/altq/altq_cdnr.c create mode 100644 contrib/altq/rtems/freebsd/altq/altq_cdnr.h create mode 100644 contrib/altq/rtems/freebsd/altq/altq_classq.h create mode 100644 contrib/altq/rtems/freebsd/altq/altq_hfsc.c create mode 100644 contrib/altq/rtems/freebsd/altq/altq_hfsc.h create mode 100644 contrib/altq/rtems/freebsd/altq/altq_priq.c create mode 100644 contrib/altq/rtems/freebsd/altq/altq_priq.h create mode 100644 contrib/altq/rtems/freebsd/altq/altq_red.c create mode 100644 contrib/altq/rtems/freebsd/altq/altq_red.h create mode 100644 contrib/altq/rtems/freebsd/altq/altq_rio.c create mode 100644 contrib/altq/rtems/freebsd/altq/altq_rio.h create mode 100644 contrib/altq/rtems/freebsd/altq/altq_rmclass.c create mode 100644 contrib/altq/rtems/freebsd/altq/altq_rmclass.h create mode 100644 contrib/altq/rtems/freebsd/altq/altq_rmclass_debug.h create mode 100644 contrib/altq/rtems/freebsd/altq/altq_subr.c create mode 100644 contrib/altq/rtems/freebsd/altq/altq_var.h create mode 100644 contrib/altq/rtems/freebsd/altq/altqconf.h create mode 100644 contrib/altq/rtems/freebsd/altq/if_altq.h create mode 100644 contrib/pf/rtems/freebsd/net/if_pflog.c create mode 100644 
contrib/pf/rtems/freebsd/net/if_pflog.h
 create mode 100644 contrib/pf/rtems/freebsd/net/if_pfsync.c
 create mode 100644 contrib/pf/rtems/freebsd/net/if_pfsync.h
 create mode 100644 contrib/pf/rtems/freebsd/net/pf.c
 create mode 100644 contrib/pf/rtems/freebsd/net/pf_if.c
 create mode 100644 contrib/pf/rtems/freebsd/net/pf_ioctl.c
 create mode 100644 contrib/pf/rtems/freebsd/net/pf_mtag.h
 create mode 100644 contrib/pf/rtems/freebsd/net/pf_norm.c
 create mode 100644 contrib/pf/rtems/freebsd/net/pf_osfp.c
 create mode 100644 contrib/pf/rtems/freebsd/net/pf_ruleset.c
 create mode 100644 contrib/pf/rtems/freebsd/net/pf_subr.c
 create mode 100644 contrib/pf/rtems/freebsd/net/pf_table.c
 create mode 100644 contrib/pf/rtems/freebsd/net/pfvar.h
 create mode 100644 contrib/pf/rtems/freebsd/netinet/in4_cksum.c

diff --git a/contrib/altq/rtems/freebsd/altq/altq.h b/contrib/altq/rtems/freebsd/altq/altq.h
new file mode 100644
index 00000000..78ec2d8c
--- /dev/null
+++ b/contrib/altq/rtems/freebsd/altq/altq.h
@@ -0,0 +1,204 @@
+/* $FreeBSD$ */
+/* $KAME: altq.h,v 1.10 2003/07/10 12:07:47 kjc Exp $ */
+
+/*
+ * Copyright (C) 1998-2003
+ *	Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _ALTQ_ALTQ_HH_
+#define _ALTQ_ALTQ_HH_
+
+#if 0
+/*
+ * allow altq-3 (altqd(8) and /dev/altq) to coexist with the new pf-based altq.
+ * altq3 is mainly for research experiments. pf-based altq is for daily use.
+ */ +#define ALTQ3_COMPAT /* for compatibility with altq-3 */ +#define ALTQ3_CLFIER_COMPAT /* for compatibility with altq-3 classifier */ +#endif + +#ifdef ALTQ3_COMPAT +#include +#include +#include +#include + +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif +#endif /* ALTQ3_COMPAT */ + +/* altq discipline type */ +#define ALTQT_NONE 0 /* reserved */ +#define ALTQT_CBQ 1 /* cbq */ +#define ALTQT_WFQ 2 /* wfq */ +#define ALTQT_AFMAP 3 /* afmap */ +#define ALTQT_FIFOQ 4 /* fifoq */ +#define ALTQT_RED 5 /* red */ +#define ALTQT_RIO 6 /* rio */ +#define ALTQT_LOCALQ 7 /* local use */ +#define ALTQT_HFSC 8 /* hfsc */ +#define ALTQT_CDNR 9 /* traffic conditioner */ +#define ALTQT_BLUE 10 /* blue */ +#define ALTQT_PRIQ 11 /* priority queue */ +#define ALTQT_JOBS 12 /* JoBS */ +#define ALTQT_MAX 13 /* should be max discipline type + 1 */ + +#ifdef ALTQ3_COMPAT +struct altqreq { + char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */ + u_long arg; /* request-specific argument */ +}; +#endif + +/* simple token backet meter profile */ +struct tb_profile { + u_int rate; /* rate in bit-per-sec */ + u_int depth; /* depth in bytes */ +}; + +#ifdef ALTQ3_COMPAT +struct tbrreq { + char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */ + struct tb_profile tb_prof; /* token bucket profile */ +}; + +#ifdef ALTQ3_CLFIER_COMPAT +/* + * common network flow info structure + */ +struct flowinfo { + u_char fi_len; /* total length */ + u_char fi_family; /* address family */ + u_int8_t fi_data[46]; /* actually longer; address family + specific flow info. */ +}; + +/* + * flow info structure for internet protocol family. + * (currently this is the only protocol family supported) + */ +struct flowinfo_in { + u_char fi_len; /* sizeof(struct flowinfo_in) */ + u_char fi_family; /* AF_INET */ + u_int8_t fi_proto; /* IPPROTO_XXX */ + u_int8_t fi_tos; /* type-of-service */ + struct in_addr fi_dst; /* dest address */ + struct in_addr fi_src; /* src address */ + u_int16_t fi_dport; /* dest port */ + u_int16_t fi_sport; /* src port */ + u_int32_t fi_gpi; /* generalized port id for ipsec */ + u_int8_t _pad[28]; /* make the size equal to + flowinfo_in6 */ +}; + +#ifdef SIN6_LEN +struct flowinfo_in6 { + u_char fi6_len; /* sizeof(struct flowinfo_in6) */ + u_char fi6_family; /* AF_INET6 */ + u_int8_t fi6_proto; /* IPPROTO_XXX */ + u_int8_t fi6_tclass; /* traffic class */ + u_int32_t fi6_flowlabel; /* ipv6 flowlabel */ + u_int16_t fi6_dport; /* dest port */ + u_int16_t fi6_sport; /* src port */ + u_int32_t fi6_gpi; /* generalized port id */ + struct in6_addr fi6_dst; /* dest address */ + struct in6_addr fi6_src; /* src address */ +}; +#endif /* INET6 */ + +/* + * flow filters for AF_INET and AF_INET6 + */ +struct flow_filter { + int ff_ruleno; + struct flowinfo_in ff_flow; + struct { + struct in_addr mask_dst; + struct in_addr mask_src; + u_int8_t mask_tos; + u_int8_t _pad[3]; + } ff_mask; + u_int8_t _pad2[24]; /* make the size equal to flow_filter6 */ +}; + +#ifdef SIN6_LEN +struct flow_filter6 { + int ff_ruleno; + struct flowinfo_in6 ff_flow6; + struct { + struct in6_addr mask6_dst; + struct in6_addr mask6_src; + u_int8_t mask6_tclass; + u_int8_t _pad[3]; + } ff_mask6; +}; +#endif /* INET6 */ +#endif /* ALTQ3_CLFIER_COMPAT */ +#endif /* ALTQ3_COMPAT */ + +/* + * generic packet counter + */ +struct pktcntr { + u_int64_t packets; + u_int64_t bytes; +}; + +#define PKTCNTR_ADD(cntr, len) \ + do { (cntr)->packets++; (cntr)->bytes += len; } while (/*CONSTCOND*/ 0) + +#ifdef ALTQ3_COMPAT +/* + * altq related ioctls + */ +#define ALTQGTYPE _IOWR('q', 
0, struct altqreq) /* get queue type */ +#if 0 +/* + * these ioctls are currently discipline-specific but could be shared + * in the future. + */ +#define ALTQATTACH _IOW('q', 1, struct altqreq) /* attach discipline */ +#define ALTQDETACH _IOW('q', 2, struct altqreq) /* detach discipline */ +#define ALTQENABLE _IOW('q', 3, struct altqreq) /* enable discipline */ +#define ALTQDISABLE _IOW('q', 4, struct altqreq) /* disable discipline*/ +#define ALTQCLEAR _IOW('q', 5, struct altqreq) /* (re)initialize */ +#define ALTQCONFIG _IOWR('q', 6, struct altqreq) /* set config params */ +#define ALTQADDCLASS _IOWR('q', 7, struct altqreq) /* add a class */ +#define ALTQMODCLASS _IOWR('q', 8, struct altqreq) /* modify a class */ +#define ALTQDELCLASS _IOWR('q', 9, struct altqreq) /* delete a class */ +#define ALTQADDFILTER _IOWR('q', 10, struct altqreq) /* add a filter */ +#define ALTQDELFILTER _IOWR('q', 11, struct altqreq) /* delete a filter */ +#define ALTQGETSTATS _IOWR('q', 12, struct altqreq) /* get statistics */ +#define ALTQGETCNTR _IOWR('q', 13, struct altqreq) /* get a pkt counter */ +#endif /* 0 */ +#define ALTQTBRSET _IOW('q', 14, struct tbrreq) /* set tb regulator */ +#define ALTQTBRGET _IOWR('q', 15, struct tbrreq) /* get tb regulator */ +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL +#include +#endif + +#endif /* _ALTQ_ALTQ_HH_ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_cbq.c b/contrib/altq/rtems/freebsd/altq/altq_cbq.c new file mode 100644 index 00000000..27454d47 --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_cbq.c @@ -0,0 +1,1187 @@ +#include + +/* $FreeBSD$ */ +/* $KAME: altq_cbq.c,v 1.19 2003/09/17 14:23:25 kjc Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include +#if (__FreeBSD__ != 2) +#include +#ifdef __FreeBSD__ +#include +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ALTQ3_COMPAT +#include +#include +#endif + +#include +#include + +#include +#include +#include +#ifdef ALTQ3_COMPAT +#include +#endif + +#ifdef ALTQ3_COMPAT +/* + * Local Data structures. 
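/*
 * Illustrative userland sketch, not part of this patch: programming the
 * token-bucket regulator through the ALTQ3_COMPAT ioctls declared in
 * altq.h above (struct tbrreq, ALTQTBRSET).  The control-device path,
 * the function name and the error handling are assumptions made for
 * the example.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/if.h>             /* IFNAMSIZ */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
/* plus the altq.h definitions shown above */

static int
tbr_set_sketch(const char *ifname, u_int rate_bps, u_int depth_bytes)
{
        struct tbrreq req;
        int fd, ret;

        /* the altq3 control device; this path is an assumption */
        if ((fd = open("/dev/altq/altq", O_RDWR)) < 0)
                return (-1);
        memset(&req, 0, sizeof(req));
        strncpy(req.ifname, ifname, IFNAMSIZ - 1);
        req.tb_prof.rate = rate_bps;     /* tb_profile rate is bits/sec */
        req.tb_prof.depth = depth_bytes; /* depth bounds the burst size */
        ret = ioctl(fd, ALTQTBRSET, &req);
        close(fd);
        return (ret);
}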
+ */ +static cbq_state_t *cbq_list = NULL; +#endif + +/* + * Forward Declarations. + */ +static int cbq_class_destroy(cbq_state_t *, struct rm_class *); +static struct rm_class *clh_to_clp(cbq_state_t *, u_int32_t); +static int cbq_clear_interface(cbq_state_t *); +static int cbq_request(struct ifaltq *, int, void *); +static int cbq_enqueue(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); +static struct mbuf *cbq_dequeue(struct ifaltq *, int); +static void cbqrestart(struct ifaltq *); +static void get_class_stats(class_stats_t *, struct rm_class *); +static void cbq_purge(cbq_state_t *); +#ifdef ALTQ3_COMPAT +static int cbq_add_class(struct cbq_add_class *); +static int cbq_delete_class(struct cbq_delete_class *); +static int cbq_modify_class(struct cbq_modify_class *); +static int cbq_class_create(cbq_state_t *, struct cbq_add_class *, + struct rm_class *, struct rm_class *); +static int cbq_clear_hierarchy(struct cbq_interface *); +static int cbq_set_enable(struct cbq_interface *, int); +static int cbq_ifattach(struct cbq_interface *); +static int cbq_ifdetach(struct cbq_interface *); +static int cbq_getstats(struct cbq_getstats *); + +static int cbq_add_filter(struct cbq_add_filter *); +static int cbq_delete_filter(struct cbq_delete_filter *); +#endif /* ALTQ3_COMPAT */ + +/* + * int + * cbq_class_destroy(cbq_mod_state_t *, struct rm_class *) - This + * function destroys a given traffic class. Before destroying + * the class, all traffic for that class is released. + */ +static int +cbq_class_destroy(cbq_state_t *cbqp, struct rm_class *cl) +{ + int i; + + /* delete the class */ + rmc_delete_class(&cbqp->ifnp, cl); + + /* + * free the class handle + */ + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == cl) + cbqp->cbq_class_tbl[i] = NULL; + + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; +#ifdef ALTQ3_COMPAT + if (cl == cbqp->ifnp.ctl_) + cbqp->ifnp.ctl_ = NULL; +#endif + return (0); +} + +/* convert class handle to class pointer */ +static struct rm_class * +clh_to_clp(cbq_state_t *cbqp, u_int32_t chandle) +{ + int i; + struct rm_class *cl; + + if (chandle == 0) + return (NULL); + /* + * first, try optimistically the slot matching the lower bits of + * the handle. if it fails, do the linear table search. 
+ */ + i = chandle % CBQ_MAX_CLASSES; + if ((cl = cbqp->cbq_class_tbl[i]) != NULL && + cl->stats_.handle == chandle) + return (cl); + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if ((cl = cbqp->cbq_class_tbl[i]) != NULL && + cl->stats_.handle == chandle) + return (cl); + return (NULL); +} + +static int +cbq_clear_interface(cbq_state_t *cbqp) +{ + int again, i; + struct rm_class *cl; + +#ifdef ALTQ3_CLFIER_COMPAT + /* free the filters for this interface */ + acc_discard_filters(&cbqp->cbq_classifier, NULL, 1); +#endif + + /* clear out the classes now */ + do { + again = 0; + for (i = 0; i < CBQ_MAX_CLASSES; i++) { + if ((cl = cbqp->cbq_class_tbl[i]) != NULL) { + if (is_a_parent_class(cl)) + again++; + else { + cbq_class_destroy(cbqp, cl); + cbqp->cbq_class_tbl[i] = NULL; + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; +#ifdef ALTQ3_COMPAT + if (cl == cbqp->ifnp.ctl_) + cbqp->ifnp.ctl_ = NULL; +#endif + } + } + } + } while (again); + + return (0); +} + +static int +cbq_request(struct ifaltq *ifq, int req, void *arg) +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + switch (req) { + case ALTRQ_PURGE: + cbq_purge(cbqp); + break; + } + return (0); +} + +/* copy the stats info in rm_class to class_states_t */ +static void +get_class_stats(class_stats_t *statsp, struct rm_class *cl) +{ + statsp->xmit_cnt = cl->stats_.xmit_cnt; + statsp->drop_cnt = cl->stats_.drop_cnt; + statsp->over = cl->stats_.over; + statsp->borrows = cl->stats_.borrows; + statsp->overactions = cl->stats_.overactions; + statsp->delays = cl->stats_.delays; + + statsp->depth = cl->depth_; + statsp->priority = cl->pri_; + statsp->maxidle = cl->maxidle_; + statsp->minidle = cl->minidle_; + statsp->offtime = cl->offtime_; + statsp->qmax = qlimit(cl->q_); + statsp->ns_per_byte = cl->ns_per_byte_; + statsp->wrr_allot = cl->w_allotment_; + statsp->qcnt = qlen(cl->q_); + statsp->avgidle = cl->avgidle_; + + statsp->qtype = qtype(cl->q_); +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + red_getstats(cl->red_, &statsp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + rio_getstats((rio_t *)cl->red_, &statsp->red[0]); +#endif +} + +int +cbq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + error = altq_attach(&ifp->if_snd, ALTQT_CBQ, a->altq_disc, + cbq_enqueue, cbq_dequeue, cbq_request, NULL, NULL); + splx(s); + return (error); +} + +int +cbq_add_altq(struct pf_altq *a) +{ + cbq_state_t *cbqp; + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENODEV); + + /* allocate and initialize cbq_state_t */ + cbqp = malloc(sizeof(cbq_state_t), M_DEVBUF, M_WAITOK); + if (cbqp == NULL) + return (ENOMEM); + bzero(cbqp, sizeof(cbq_state_t)); + CALLOUT_INIT(&cbqp->cbq_callout); + cbqp->cbq_qlen = 0; + cbqp->ifnp.ifq_ = &ifp->if_snd; /* keep the ifq */ + + /* keep the state in pf_altq */ + a->altq_disc = cbqp; + + return (0); +} + +int +cbq_remove_altq(struct pf_altq *a) +{ + cbq_state_t *cbqp; + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + cbq_clear_interface(cbqp); + + if (cbqp->ifnp.default_) + cbq_class_destroy(cbqp, cbqp->ifnp.default_); + if (cbqp->ifnp.root_) + cbq_class_destroy(cbqp, cbqp->ifnp.root_); + + /* deallocate cbq_state_t */ + free(cbqp, M_DEVBUF); 
+ + return (0); +} + +int +cbq_add_queue(struct pf_altq *a) +{ + struct rm_class *borrow, *parent; + cbq_state_t *cbqp; + struct rm_class *cl; + struct cbq_opts *opts; + int i; + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + if (a->qid == 0) + return (EINVAL); + + /* + * find a free slot in the class table. if the slot matching + * the lower bits of qid is free, use this slot. otherwise, + * use the first free slot. + */ + i = a->qid % CBQ_MAX_CLASSES; + if (cbqp->cbq_class_tbl[i] != NULL) { + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == NULL) + break; + if (i == CBQ_MAX_CLASSES) + return (EINVAL); + } + + opts = &a->pq_u.cbq_opts; + /* check parameters */ + if (a->priority >= CBQ_MAXPRI) + return (EINVAL); + + /* Get pointers to parent and borrow classes. */ + parent = clh_to_clp(cbqp, a->parent_qid); + if (opts->flags & CBQCLF_BORROW) + borrow = parent; + else + borrow = NULL; + + /* + * A class must borrow from it's parent or it can not + * borrow at all. Hence, borrow can be null. + */ + if (parent == NULL && (opts->flags & CBQCLF_ROOTCLASS) == 0) { + printf("cbq_add_queue: no parent class!\n"); + return (EINVAL); + } + + if ((borrow != parent) && (borrow != NULL)) { + printf("cbq_add_class: borrow class != parent\n"); + return (EINVAL); + } + + /* + * check parameters + */ + switch (opts->flags & CBQCLF_CLASSMASK) { + case CBQCLF_ROOTCLASS: + if (parent != NULL) + return (EINVAL); + if (cbqp->ifnp.root_) + return (EINVAL); + break; + case CBQCLF_DEFCLASS: + if (cbqp->ifnp.default_) + return (EINVAL); + break; + case 0: + if (a->qid == 0) + return (EINVAL); + break; + default: + /* more than two flags bits set */ + return (EINVAL); + } + + /* + * create a class. if this is a root class, initialize the + * interface. + */ + if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_ROOTCLASS) { + rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, opts->ns_per_byte, + cbqrestart, a->qlimit, RM_MAXQUEUED, + opts->maxidle, opts->minidle, opts->offtime, + opts->flags); + cl = cbqp->ifnp.root_; + } else { + cl = rmc_newclass(a->priority, + &cbqp->ifnp, opts->ns_per_byte, + rmc_delay_action, a->qlimit, parent, borrow, + opts->maxidle, opts->minidle, opts->offtime, + opts->pktsize, opts->flags); + } + if (cl == NULL) + return (ENOMEM); + + /* return handle to user space. */ + cl->stats_.handle = a->qid; + cl->stats_.depth = cl->depth_; + + /* save the allocated class */ + cbqp->cbq_class_tbl[i] = cl; + + if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_DEFCLASS) + cbqp->ifnp.default_ = cl; + + return (0); +} + +int +cbq_remove_queue(struct pf_altq *a) +{ + struct rm_class *cl; + cbq_state_t *cbqp; + int i; + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + + if ((cl = clh_to_clp(cbqp, a->qid)) == NULL) + return (EINVAL); + + /* if we are a parent class, then return an error. 
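/*
 * Illustrative sketch, not part of this patch: how pf(4)-driven
 * configuration reaches cbq_add_queue() above, one pf_altq record per
 * class with the root class first.  It assumes the caller already ran
 * cbq_add_altq() on the same interface and filled in ifname, qid,
 * priority, qlimit and the cbq_opts rate fields (ns_per_byte and
 * friends); only the class-role flags are shown here.
 */
static int
cbq_setup_sketch(struct pf_altq *root, struct pf_altq *deflt)
{
        int error;

        /* CBQCLF_ROOTCLASS makes cbq_add_queue() call rmc_init() */
        root->pq_u.cbq_opts.flags = CBQCLF_ROOTCLASS;
        if ((error = cbq_add_queue(root)) != 0)
                return (error);

        /* unclassified packets fall into the default class */
        deflt->parent_qid = root->qid;
        deflt->pq_u.cbq_opts.flags = CBQCLF_DEFCLASS | CBQCLF_BORROW;
        return (cbq_add_queue(deflt));
}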
*/ + if (is_a_parent_class(cl)) + return (EINVAL); + + /* delete the class */ + rmc_delete_class(&cbqp->ifnp, cl); + + /* + * free the class handle + */ + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == cl) { + cbqp->cbq_class_tbl[i] = NULL; + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; + break; + } + + return (0); +} + +int +cbq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + cbq_state_t *cbqp; + struct rm_class *cl; + class_stats_t stats; + int error = 0; + + if ((cbqp = altq_lookup(a->ifname, ALTQT_CBQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(cbqp, a->qid)) == NULL) + return (EINVAL); + + if (*nbytes < sizeof(stats)) + return (EINVAL); + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = sizeof(stats); + return (0); +} + +/* + * int + * cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pattr) + * - Queue data packets. + * + * cbq_enqueue is set to ifp->if_altqenqueue and called by an upper + * layer (e.g. ether_output). cbq_enqueue queues the given packet + * to the cbq, then invokes the driver's start routine. + * + * Assumptions: called in splimp + * Returns: 0 if the queueing is successful. + * ENOBUFS if a packet dropping occurred as a result of + * the queueing. + */ + +static int +cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + struct rm_class *cl; + struct pf_mtag *t; + int len; + + IFQ_LOCK_ASSERT(ifq); + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ +#if defined(__NetBSD__) || defined(__OpenBSD__)\ + || (defined(__FreeBSD__) && __FreeBSD_version >= 501113) + printf("altq: packet for %s does not have pkthdr\n", + ifq->altq_ifp->if_xname); +#else + printf("altq: packet for %s%d does not have pkthdr\n", + ifq->altq_ifp->if_name, ifq->altq_ifp->if_unit); +#endif + m_freem(m); + return (ENOBUFS); + } + cl = NULL; + if ((t = pf_find_mtag(m)) != NULL) + cl = clh_to_clp(cbqp, t->qid); +#ifdef ALTQ3_COMPAT + else if ((ifq->altq_flags & ALTQF_CLASSIFY) && pktattr != NULL) + cl = pktattr->pattr_class; +#endif + if (cl == NULL) { + cl = cbqp->ifnp.default_; + if (cl == NULL) { + m_freem(m); + return (ENOBUFS); + } + } +#ifdef ALTQ3_COMPAT + if (pktattr != NULL) + cl->pktattr_ = pktattr; /* save proto hdr used by ECN */ + else +#endif + cl->pktattr_ = NULL; + len = m_pktlen(m); + if (rmc_queue_packet(cl, m) != 0) { + /* drop occurred. some mbuf was freed in rmc_queue_packet. */ + PKTCNTR_ADD(&cl->stats_.drop_cnt, len); + return (ENOBUFS); + } + + /* successfully queued. */ + ++cbqp->cbq_qlen; + IFQ_INC_LEN(ifq); + return (0); +} + +static struct mbuf * +cbq_dequeue(struct ifaltq *ifq, int op) +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + struct mbuf *m; + + IFQ_LOCK_ASSERT(ifq); + + m = rmc_dequeue_next(&cbqp->ifnp, op); + + if (m && op == ALTDQ_REMOVE) { + --cbqp->cbq_qlen; /* decrement # of packets in cbq */ + IFQ_DEC_LEN(ifq); + + /* Update the class. */ + rmc_update_class_util(&cbqp->ifnp); + } + return (m); +} + +/* + * void + * cbqrestart(queue_t *) - Restart sending of data. + * called from rmc_restart in splimp via timeout after waking up + * a suspended class. 
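/*
 * Illustrative driver-side sketch, not part of this patch: with CBQ
 * attached to ifp->if_snd, a driver's start routine drains the queue
 * through the standard IFQ_* macros, which reach cbq_dequeue() above
 * via the ALTQ hooks.  The hardware transmit step is elided and the
 * function name is invented.
 */
static void
example_if_start(struct ifnet *ifp)
{
        struct mbuf *m;

        for (;;) {
                IFQ_DEQUEUE(&ifp->if_snd, m);   /* ends up in cbq_dequeue() */
                if (m == NULL)
                        break;
                /* hand m to the hardware here; freed only as a placeholder */
                m_freem(m);
        }
}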
+ *	Returns:	NONE
+ */
+
+static void
+cbqrestart(struct ifaltq *ifq)
+{
+	cbq_state_t	*cbqp;
+	struct ifnet	*ifp;
+
+	IFQ_LOCK_ASSERT(ifq);
+
+	if (!ALTQ_IS_ENABLED(ifq))
+		/* cbq must have been detached */
+		return;
+
+	if ((cbqp = (cbq_state_t *)ifq->altq_disc) == NULL)
+		/* should not happen */
+		return;
+
+	ifp = ifq->altq_ifp;
+	if (ifp->if_start &&
+	    cbqp->cbq_qlen > 0 && (ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
+		IFQ_UNLOCK(ifq);
+		(*ifp->if_start)(ifp);
+		IFQ_LOCK(ifq);
+	}
+}
+
+static void cbq_purge(cbq_state_t *cbqp)
+{
+	struct rm_class	*cl;
+	int		 i;
+
+	for (i = 0; i < CBQ_MAX_CLASSES; i++)
+		if ((cl = cbqp->cbq_class_tbl[i]) != NULL)
+			rmc_dropall(cl);
+	if (ALTQ_IS_ENABLED(cbqp->ifnp.ifq_))
+		cbqp->ifnp.ifq_->ifq_len = 0;
+}
+#ifdef ALTQ3_COMPAT
+
+static int
+cbq_add_class(acp)
+	struct cbq_add_class *acp;
+{
+	char		*ifacename;
+	struct rm_class	*borrow, *parent;
+	cbq_state_t	*cbqp;
+
+	ifacename = acp->cbq_iface.cbq_ifacename;
+	if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL)
+		return (EBADF);
+
+	/* check parameters */
+	if (acp->cbq_class.priority >= CBQ_MAXPRI ||
+	    acp->cbq_class.maxq > CBQ_MAXQSIZE)
+		return (EINVAL);
+
+	/* Get pointers to parent and borrow classes. */
+	parent = clh_to_clp(cbqp, acp->cbq_class.parent_class_handle);
+	borrow = clh_to_clp(cbqp, acp->cbq_class.borrow_class_handle);
+
+	/*
+	 * A class must borrow from its parent or it cannot
+	 * borrow at all.  Hence, borrow can be null.
+	 */
+	if (parent == NULL && (acp->cbq_class.flags & CBQCLF_ROOTCLASS) == 0) {
+		printf("cbq_add_class: no parent class!\n");
+		return (EINVAL);
+	}
+
+	if ((borrow != parent) && (borrow != NULL)) {
+		printf("cbq_add_class: borrow class != parent\n");
+		return (EINVAL);
+	}
+
+	return cbq_class_create(cbqp, acp, parent, borrow);
+}
+
+static int
+cbq_delete_class(dcp)
+	struct cbq_delete_class *dcp;
+{
+	char		*ifacename;
+	struct rm_class	*cl;
+	cbq_state_t	*cbqp;
+
+	ifacename = dcp->cbq_iface.cbq_ifacename;
+	if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL)
+		return (EBADF);
+
+	if ((cl = clh_to_clp(cbqp, dcp->cbq_class_handle)) == NULL)
+		return (EINVAL);
+
+	/* if we are a parent class, then return an error. */
+	if (is_a_parent_class(cl))
+		return (EINVAL);
+
+	/* if a filter has a reference to this class delete the filter */
+	acc_discard_filters(&cbqp->cbq_classifier, cl, 0);
+
+	return cbq_class_destroy(cbqp, cl);
+}
+
+static int
+cbq_modify_class(acp)
+	struct cbq_modify_class *acp;
+{
+	char		*ifacename;
+	struct rm_class	*cl;
+	cbq_state_t	*cbqp;
+
+	ifacename = acp->cbq_iface.cbq_ifacename;
+	if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL)
+		return (EBADF);
+
+	/* Get pointer to this class */
+	if ((cl = clh_to_clp(cbqp, acp->cbq_class_handle)) == NULL)
+		return (EINVAL);
+
+	if (rmc_modclass(cl, acp->cbq_class.nano_sec_per_byte,
+	    acp->cbq_class.maxq, acp->cbq_class.maxidle,
+	    acp->cbq_class.minidle, acp->cbq_class.offtime,
+	    acp->cbq_class.pktsize) < 0)
+		return (EINVAL);
+	return (0);
+}
+
+/*
+ * struct rm_class *
+ * cbq_class_create(cbq_mod_state_t *cbqp, struct cbq_add_class *acp,
+ *		struct rm_class *parent, struct rm_class *borrow)
+ *
+ * This function creates a new traffic class in the CBQ class hierarchy
+ * with the given parameters.  The class created is either the root,
+ * default, or a new dynamic class.  If CBQ is not initialized, the
+ * root class will be created.
+ */
+static int
+cbq_class_create(cbqp, acp, parent, borrow)
+	cbq_state_t *cbqp;
+	struct cbq_add_class *acp;
+	struct rm_class *parent, *borrow;
+{
+	struct rm_class	*cl;
+	cbq_class_spec_t *spec = &acp->cbq_class;
+	u_int32_t	chandle;
+	int		i;
+
+	/*
+	 * allocate class handle
+	 */
+	for (i = 1; i < CBQ_MAX_CLASSES; i++)
+		if (cbqp->cbq_class_tbl[i] == NULL)
+			break;
+	if (i == CBQ_MAX_CLASSES)
+		return (EINVAL);
+	chandle = i;	/* use the slot number as class handle */
+
+	/*
+	 * create a class.  if this is a root class, initialize the
+	 * interface.
+	 */
+	if ((spec->flags & CBQCLF_CLASSMASK) == CBQCLF_ROOTCLASS) {
+		rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, spec->nano_sec_per_byte,
+		    cbqrestart, spec->maxq, RM_MAXQUEUED,
+		    spec->maxidle, spec->minidle, spec->offtime,
+		    spec->flags);
+		cl = cbqp->ifnp.root_;
+	} else {
+		cl = rmc_newclass(spec->priority,
+		    &cbqp->ifnp, spec->nano_sec_per_byte,
+		    rmc_delay_action, spec->maxq, parent, borrow,
+		    spec->maxidle, spec->minidle, spec->offtime,
+		    spec->pktsize, spec->flags);
+	}
+	if (cl == NULL)
+		return (ENOMEM);
+
+	/* return handle to user space. */
+	acp->cbq_class_handle = chandle;
+
+	cl->stats_.handle = chandle;
+	cl->stats_.depth = cl->depth_;
+
+	/* save the allocated class */
+	cbqp->cbq_class_tbl[i] = cl;
+
+	if ((spec->flags & CBQCLF_CLASSMASK) == CBQCLF_DEFCLASS)
+		cbqp->ifnp.default_ = cl;
+	if ((spec->flags & CBQCLF_CLASSMASK) == CBQCLF_CTLCLASS)
+		cbqp->ifnp.ctl_ = cl;
+
+	return (0);
+}
+
+static int
+cbq_add_filter(afp)
+	struct cbq_add_filter *afp;
+{
+	char		*ifacename;
+	cbq_state_t	*cbqp;
+	struct rm_class	*cl;
+
+	ifacename = afp->cbq_iface.cbq_ifacename;
+	if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL)
+		return (EBADF);
+
+	/* Get the pointer to class. */
+	if ((cl = clh_to_clp(cbqp, afp->cbq_class_handle)) == NULL)
+		return (EINVAL);
+
+	return acc_add_filter(&cbqp->cbq_classifier, &afp->cbq_filter,
+	    cl, &afp->cbq_filter_handle);
+}
+
+static int
+cbq_delete_filter(dfp)
+	struct cbq_delete_filter *dfp;
+{
+	char		*ifacename;
+	cbq_state_t	*cbqp;
+
+	ifacename = dfp->cbq_iface.cbq_ifacename;
+	if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL)
+		return (EBADF);
+
+	return acc_delete_filter(&cbqp->cbq_classifier,
+	    dfp->cbq_filter_handle);
+}
+
+/*
+ * cbq_clear_hierarchy deletes all classes and their filters on the
+ * given interface.
+ */
+static int
+cbq_clear_hierarchy(ifacep)
+	struct cbq_interface *ifacep;
+{
+	char		*ifacename;
+	cbq_state_t	*cbqp;
+
+	ifacename = ifacep->cbq_ifacename;
+	if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL)
+		return (EBADF);
+
+	return cbq_clear_interface(cbqp);
+}
+
+/*
+ * static int
+ * cbq_set_enable(struct cbq_interface *ep, int enable) - this function
+ *	processes the ioctl request to enable class based queueing.  It
+ *	searches the list of interfaces for the specified interface and
+ *	then enables CBQ on that interface.
+ *
+ * Returns:	0, for no error.
+ *		EBADF, for specified interface not found.
+ */ + +static int +cbq_set_enable(ep, enable) + struct cbq_interface *ep; + int enable; +{ + int error = 0; + cbq_state_t *cbqp; + char *ifacename; + + ifacename = ep->cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + switch (enable) { + case ENABLE: + if (cbqp->ifnp.root_ == NULL || cbqp->ifnp.default_ == NULL || + cbqp->ifnp.ctl_ == NULL) { + if (cbqp->ifnp.root_ == NULL) + printf("No Root Class for %s\n", ifacename); + if (cbqp->ifnp.default_ == NULL) + printf("No Default Class for %s\n", ifacename); + if (cbqp->ifnp.ctl_ == NULL) + printf("No Control Class for %s\n", ifacename); + error = EINVAL; + } else if ((error = altq_enable(cbqp->ifnp.ifq_)) == 0) { + cbqp->cbq_qlen = 0; + } + break; + + case DISABLE: + error = altq_disable(cbqp->ifnp.ifq_); + break; + } + return (error); +} + +static int +cbq_getstats(gsp) + struct cbq_getstats *gsp; +{ + char *ifacename; + int i, n, nclasses; + cbq_state_t *cbqp; + struct rm_class *cl; + class_stats_t stats, *usp; + int error = 0; + + ifacename = gsp->iface.cbq_ifacename; + nclasses = gsp->nclasses; + usp = gsp->stats; + + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + if (nclasses <= 0) + return (EINVAL); + + for (n = 0, i = 0; n < nclasses && i < CBQ_MAX_CLASSES; n++, i++) { + while ((cl = cbqp->cbq_class_tbl[i]) == NULL) + if (++i >= CBQ_MAX_CLASSES) + goto out; + + get_class_stats(&stats, cl); + stats.handle = cl->stats_.handle; + + if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, + sizeof(stats))) != 0) + return (error); + } + + out: + gsp->nclasses = n; + return (error); +} + +static int +cbq_ifattach(ifacep) + struct cbq_interface *ifacep; +{ + int error = 0; + char *ifacename; + cbq_state_t *new_cbqp; + struct ifnet *ifp; + + ifacename = ifacep->cbq_ifacename; + if ((ifp = ifunit(ifacename)) == NULL) + return (ENXIO); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENXIO); + + /* allocate and initialize cbq_state_t */ + new_cbqp = malloc(sizeof(cbq_state_t), M_DEVBUF, M_WAITOK); + if (new_cbqp == NULL) + return (ENOMEM); + bzero(new_cbqp, sizeof(cbq_state_t)); + CALLOUT_INIT(&new_cbqp->cbq_callout); + + new_cbqp->cbq_qlen = 0; + new_cbqp->ifnp.ifq_ = &ifp->if_snd; /* keep the ifq */ + + /* + * set CBQ to this ifnet structure. + */ + error = altq_attach(&ifp->if_snd, ALTQT_CBQ, new_cbqp, + cbq_enqueue, cbq_dequeue, cbq_request, + &new_cbqp->cbq_classifier, acc_classify); + if (error) { + free(new_cbqp, M_DEVBUF); + return (error); + } + + /* prepend to the list of cbq_state_t's. */ + new_cbqp->cbq_next = cbq_list; + cbq_list = new_cbqp; + + return (0); +} + +static int +cbq_ifdetach(ifacep) + struct cbq_interface *ifacep; +{ + char *ifacename; + cbq_state_t *cbqp; + + ifacename = ifacep->cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + (void)cbq_set_enable(ifacep, DISABLE); + + cbq_clear_interface(cbqp); + + /* remove CBQ from the ifnet structure. */ + (void)altq_detach(cbqp->ifnp.ifq_); + + /* remove from the list of cbq_state_t's. 
+ */
+	if (cbq_list == cbqp)
+		cbq_list = cbqp->cbq_next;
+	else {
+		cbq_state_t *cp;
+
+		for (cp = cbq_list; cp != NULL; cp = cp->cbq_next)
+			if (cp->cbq_next == cbqp) {
+				cp->cbq_next = cbqp->cbq_next;
+				break;
+			}
+		ASSERT(cp != NULL);
+	}
+
+	/* deallocate cbq_state_t */
+	free(cbqp, M_DEVBUF);
+
+	return (0);
+}
+
+/*
+ * cbq device interface
+ */
+
+altqdev_decl(cbq);
+
+int
+cbqopen(dev, flag, fmt, p)
+	dev_t dev;
+	int flag, fmt;
+#if (__FreeBSD_version > 500000)
+	struct thread *p;
+#else
+	struct proc *p;
+#endif
+{
+	return (0);
+}
+
+int
+cbqclose(dev, flag, fmt, p)
+	dev_t dev;
+	int flag, fmt;
+#if (__FreeBSD_version > 500000)
+	struct thread *p;
+#else
+	struct proc *p;
+#endif
+{
+	struct ifnet *ifp;
+	struct cbq_interface iface;
+	int err, error = 0;
+
+	while (cbq_list) {
+		ifp = cbq_list->ifnp.ifq_->altq_ifp;
+#if defined(__NetBSD__) || defined(__OpenBSD__)\
+    || (defined(__FreeBSD__) && __FreeBSD_version >= 501113)
+		sprintf(iface.cbq_ifacename, "%s", ifp->if_xname);
+#else
+		sprintf(iface.cbq_ifacename,
+		    "%s%d", ifp->if_name, ifp->if_unit);
+#endif
+		err = cbq_ifdetach(&iface);
+		if (err != 0 && error == 0)
+			error = err;
+	}
+
+	return (error);
+}
+
+int
+cbqioctl(dev, cmd, addr, flag, p)
+	dev_t dev;
+	ioctlcmd_t cmd;
+	caddr_t addr;
+	int flag;
+#if (__FreeBSD_version > 500000)
+	struct thread *p;
+#else
+	struct proc *p;
+#endif
+{
+	int	error = 0;
+
+	/* check cmd for superuser only */
+	switch (cmd) {
+	case CBQ_GETSTATS:
+		/* currently only command that an ordinary user can call */
+		break;
+	default:
+#if (__FreeBSD_version > 700000)
+		error = priv_check(p, PRIV_ALTQ_MANAGE);
+#elif (__FreeBSD_version > 400000)
+		error = suser(p);
+#else
+		error = suser(p->p_ucred, &p->p_acflag);
+#endif
+		if (error)
+			return (error);
+		break;
+	}
+
+	switch (cmd) {
+
+	case CBQ_ENABLE:
+		error = cbq_set_enable((struct cbq_interface *)addr, ENABLE);
+		break;
+
+	case CBQ_DISABLE:
+		error = cbq_set_enable((struct cbq_interface *)addr, DISABLE);
+		break;
+
+	case CBQ_ADD_FILTER:
+		error = cbq_add_filter((struct cbq_add_filter *)addr);
+		break;
+
+	case CBQ_DEL_FILTER:
+		error = cbq_delete_filter((struct cbq_delete_filter *)addr);
+		break;
+
+	case CBQ_ADD_CLASS:
+		error = cbq_add_class((struct cbq_add_class *)addr);
+		break;
+
+	case CBQ_DEL_CLASS:
+		error = cbq_delete_class((struct cbq_delete_class *)addr);
+		break;
+
+	case CBQ_MODIFY_CLASS:
+		error = cbq_modify_class((struct cbq_modify_class *)addr);
+		break;
+
+	case CBQ_CLEAR_HIERARCHY:
+		error = cbq_clear_hierarchy((struct cbq_interface *)addr);
+		break;
+
+	case CBQ_IF_ATTACH:
+		error = cbq_ifattach((struct cbq_interface *)addr);
+		break;
+
+	case CBQ_IF_DETACH:
+		error = cbq_ifdetach((struct cbq_interface *)addr);
+		break;
+
+	case CBQ_GETSTATS:
+		error = cbq_getstats((struct cbq_getstats *)addr);
+		break;
+
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	return error;
+}
+
+#if 0
+/* for debug */
+static void cbq_class_dump(int);
+
+static void cbq_class_dump(i)
+	int i;
+{
+	struct rm_class *cl;
+	rm_class_stats_t *s;
+	struct _class_queue_ *q;
+
+	if (cbq_list == NULL) {
+		printf("cbq_class_dump: no cbq_state found\n");
+		return;
+	}
+	cl = cbq_list->cbq_class_tbl[i];
+
+	printf("class %d cl=%p\n", i, cl);
+	if (cl != NULL) {
+		s = &cl->stats_;
+		q = cl->q_;
+
+		printf("pri=%d, depth=%d, maxrate=%d, allotment=%d\n",
+		    cl->pri_, cl->depth_, cl->maxrate_, cl->allotment_);
+		printf("w_allotment=%d, bytes_alloc=%d, avgidle=%d, maxidle=%d\n",
+		    cl->w_allotment_, cl->bytes_alloc_, cl->avgidle_,
+		    cl->maxidle_);
+
printf("minidle=%d, offtime=%d, sleeping=%d, leaf=%d\n", + cl->minidle_, cl->offtime_, cl->sleeping_, cl->leaf_); + printf("handle=%d, depth=%d, packets=%d, bytes=%d\n", + s->handle, s->depth, + (int)s->xmit_cnt.packets, (int)s->xmit_cnt.bytes); + printf("over=%d\n, borrows=%d, drops=%d, overactions=%d, delays=%d\n", + s->over, s->borrows, (int)s->drop_cnt.packets, + s->overactions, s->delays); + printf("tail=%p, head=%p, qlen=%d, qlim=%d, qthresh=%d,qtype=%d\n", + q->tail_, q->head_, q->qlen_, q->qlim_, + q->qthresh_, q->qtype_); + } +} +#endif /* 0 */ + +#ifdef KLD_MODULE + +static struct altqsw cbq_sw = + {"cbq", cbqopen, cbqclose, cbqioctl}; + +ALTQ_MODULE(altq_cbq, ALTQT_CBQ, &cbq_sw); +MODULE_DEPEND(altq_cbq, altq_red, 1, 1, 1); +MODULE_DEPEND(altq_cbq, altq_rio, 1, 1, 1); + +#endif /* KLD_MODULE */ +#endif /* ALTQ3_COMPAT */ + +#endif /* ALTQ_CBQ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_cbq.h b/contrib/altq/rtems/freebsd/altq/altq_cbq.h new file mode 100644 index 00000000..ecc730c7 --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_cbq.h @@ -0,0 +1,221 @@ +/* $KAME: altq_cbq.h,v 1.12 2003/10/03 05:05:15 kjc Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. 
+ */ + +#ifndef _ALTQ_ALTQ_CBQ_HH_ +#define _ALTQ_ALTQ_CBQ_HH_ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define NULL_CLASS_HANDLE 0 + +/* class flags should be same as class flags in rm_class.h */ +#define CBQCLF_RED 0x0001 /* use RED */ +#define CBQCLF_ECN 0x0002 /* use RED/ECN */ +#define CBQCLF_RIO 0x0004 /* use RIO */ +#define CBQCLF_FLOWVALVE 0x0008 /* use flowvalve (aka penalty-box) */ +#define CBQCLF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define CBQCLF_BORROW 0x0020 /* borrow from parent */ + +/* class flags only for root class */ +#define CBQCLF_WRR 0x0100 /* weighted-round robin */ +#define CBQCLF_EFFICIENT 0x0200 /* work-conserving */ + +/* class flags for special classes */ +#define CBQCLF_ROOTCLASS 0x1000 /* root class */ +#define CBQCLF_DEFCLASS 0x2000 /* default class */ +#ifdef ALTQ3_COMPAT +#define CBQCLF_CTLCLASS 0x4000 /* control class */ +#endif +#define CBQCLF_CLASSMASK 0xf000 /* class mask */ + +#define CBQ_MAXQSIZE 200 +#define CBQ_MAXPRI RM_MAXPRIO + +typedef struct _cbq_class_stats_ { + u_int32_t handle; + u_int depth; + + struct pktcntr xmit_cnt; /* packets sent in this class */ + struct pktcntr drop_cnt; /* dropped packets */ + u_int over; /* # times went over limit */ + u_int borrows; /* # times tried to borrow */ + u_int overactions; /* # times invoked overlimit action */ + u_int delays; /* # times invoked delay actions */ + + /* other static class parameters useful for debugging */ + int priority; + int maxidle; + int minidle; + int offtime; + int qmax; + int ns_per_byte; + int wrr_allot; + + int qcnt; /* # packets in queue */ + int avgidle; + + /* red and rio related info */ + int qtype; + struct redstats red[3]; +} class_stats_t; + +#ifdef ALTQ3_COMPAT +/* + * Define structures associated with IOCTLS for cbq. + */ + +/* + * Define the CBQ interface structure. This must be included in all + * IOCTL's such that the CBQ driver may find the appropriate CBQ module + * associated with the network interface to be affected. + */ +struct cbq_interface { + char cbq_ifacename[IFNAMSIZ]; +}; + +typedef struct cbq_class_spec { + u_int priority; + u_int nano_sec_per_byte; + u_int maxq; + u_int maxidle; + int minidle; + u_int offtime; + u_int32_t parent_class_handle; + u_int32_t borrow_class_handle; + + u_int pktsize; + int flags; +} cbq_class_spec_t; + +struct cbq_add_class { + struct cbq_interface cbq_iface; + + cbq_class_spec_t cbq_class; + u_int32_t cbq_class_handle; +}; + +struct cbq_delete_class { + struct cbq_interface cbq_iface; + u_int32_t cbq_class_handle; +}; + +struct cbq_modify_class { + struct cbq_interface cbq_iface; + + cbq_class_spec_t cbq_class; + u_int32_t cbq_class_handle; +}; + +struct cbq_add_filter { + struct cbq_interface cbq_iface; + u_int32_t cbq_class_handle; + struct flow_filter cbq_filter; + + u_long cbq_filter_handle; +}; + +struct cbq_delete_filter { + struct cbq_interface cbq_iface; + u_long cbq_filter_handle; +}; + +/* number of classes are returned in nclasses field */ +struct cbq_getstats { + struct cbq_interface iface; + int nclasses; + class_stats_t *stats; +}; + +/* + * Define IOCTLs for CBQ. 
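/*
 * Illustrative userland sketch, not part of this patch: fetching
 * per-class statistics with struct cbq_getstats above and the
 * CBQ_GETSTATS ioctl defined just below.  The kernel rewrites nclasses
 * to the number of entries actually copied out (see cbq_getstats() in
 * altq_cbq.c).  The descriptor is assumed to come from opening the cbq
 * control device; headers are as in the earlier userland sketch.
 */
static int
cbq_stats_sketch(int fd, const char *ifname,
    class_stats_t *buf, int *nclassesp)
{
        struct cbq_getstats gs;

        memset(&gs, 0, sizeof(gs));
        strncpy(gs.iface.cbq_ifacename, ifname, IFNAMSIZ - 1);
        gs.nclasses = *nclassesp;       /* in: capacity of buf */
        gs.stats = buf;
        if (ioctl(fd, CBQ_GETSTATS, &gs) < 0)
                return (-1);
        *nclassesp = gs.nclasses;       /* out: number of classes filled */
        return (0);
}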
+ */ +#define CBQ_IF_ATTACH _IOW('Q', 1, struct cbq_interface) +#define CBQ_IF_DETACH _IOW('Q', 2, struct cbq_interface) +#define CBQ_ENABLE _IOW('Q', 3, struct cbq_interface) +#define CBQ_DISABLE _IOW('Q', 4, struct cbq_interface) +#define CBQ_CLEAR_HIERARCHY _IOW('Q', 5, struct cbq_interface) +#define CBQ_ADD_CLASS _IOWR('Q', 7, struct cbq_add_class) +#define CBQ_DEL_CLASS _IOW('Q', 8, struct cbq_delete_class) +#define CBQ_MODIFY_CLASS _IOWR('Q', 9, struct cbq_modify_class) +#define CBQ_ADD_FILTER _IOWR('Q', 10, struct cbq_add_filter) +#define CBQ_DEL_FILTER _IOW('Q', 11, struct cbq_delete_filter) +#define CBQ_GETSTATS _IOWR('Q', 12, struct cbq_getstats) +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL +/* + * Define macros only good for kernel drivers and modules. + */ +#define CBQ_WATCHDOG (hz / 20) +#define CBQ_TIMEOUT 10 +#define CBQ_LS_TIMEOUT (20 * hz / 1000) + +#define CBQ_MAX_CLASSES 256 + +#ifdef ALTQ3_COMPAT +#define CBQ_MAX_FILTERS 256 + +#define DISABLE 0x00 +#define ENABLE 0x01 +#endif /* ALTQ3_COMPAT */ + +/* + * Define State structures. + */ +typedef struct cbqstate { +#ifdef ALTQ3_COMPAT + struct cbqstate *cbq_next; +#endif + int cbq_qlen; /* # of packets in cbq */ + struct rm_class *cbq_class_tbl[CBQ_MAX_CLASSES]; + + struct rm_ifdat ifnp; + struct callout cbq_callout; /* for timeouts */ +#ifdef ALTQ3_CLFIER_COMPAT + struct acc_classifier cbq_classifier; +#endif +} cbq_state_t; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* !_ALTQ_ALTQ_CBQ_HH_ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_cdnr.c b/contrib/altq/rtems/freebsd/altq/altq_cdnr.c new file mode 100644 index 00000000..636d4b79 --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_cdnr.c @@ -0,0 +1,1393 @@ +#include + +/* $FreeBSD$ */ +/* $KAME: altq_cdnr.c,v 1.14 2003/09/05 22:40:36 itojun Exp $ */ + +/* + * Copyright (C) 1999-2002 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include +#if (__FreeBSD__ != 2) +#include +#ifdef __FreeBSD__ +#include +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif + +#include +#ifdef ALTQ3_COMPAT +#include +#endif +#include + +#ifdef ALTQ3_COMPAT +/* + * diffserv traffic conditioning module + */ + +int altq_cdnr_enabled = 0; + +/* traffic conditioner is enabled by ALTQ_CDNR option in opt_altq.h */ +#ifdef ALTQ_CDNR + +/* cdnr_list keeps all cdnr's allocated. */ +static LIST_HEAD(, top_cdnr) tcb_list; + +static int altq_cdnr_input(struct mbuf *, int); +static struct top_cdnr *tcb_lookup(char *ifname); +static struct cdnr_block *cdnr_handle2cb(u_long); +static u_long cdnr_cb2handle(struct cdnr_block *); +static void *cdnr_cballoc(struct top_cdnr *, int, + struct tc_action *(*)(struct cdnr_block *, struct cdnr_pktinfo *)); +static void cdnr_cbdestroy(void *); +static int tca_verify_action(struct tc_action *); +static void tca_import_action(struct tc_action *, struct tc_action *); +static void tca_invalidate_action(struct tc_action *); + +static int generic_element_destroy(struct cdnr_block *); +static struct top_cdnr *top_create(struct ifaltq *); +static int top_destroy(struct top_cdnr *); +static struct cdnr_block *element_create(struct top_cdnr *, struct tc_action *); +static int element_destroy(struct cdnr_block *); +static void tb_import_profile(struct tbe *, struct tb_profile *); +static struct tbmeter *tbm_create(struct top_cdnr *, struct tb_profile *, + struct tc_action *, struct tc_action *); +static int tbm_destroy(struct tbmeter *); +static struct tc_action *tbm_input(struct cdnr_block *, struct cdnr_pktinfo *); +static struct trtcm *trtcm_create(struct top_cdnr *, + struct tb_profile *, struct tb_profile *, + struct tc_action *, struct tc_action *, struct tc_action *, + int); +static int trtcm_destroy(struct trtcm *); +static struct tc_action *trtcm_input(struct cdnr_block *, struct cdnr_pktinfo *); +static struct tswtcm *tswtcm_create(struct top_cdnr *, + u_int32_t, u_int32_t, u_int32_t, + struct tc_action *, struct tc_action *, struct tc_action *); +static int tswtcm_destroy(struct tswtcm *); +static struct tc_action *tswtcm_input(struct cdnr_block *, struct cdnr_pktinfo *); + +static int cdnrcmd_if_attach(char *); +static int cdnrcmd_if_detach(char *); +static int cdnrcmd_add_element(struct cdnr_add_element *); +static int cdnrcmd_delete_element(struct cdnr_delete_element *); +static int cdnrcmd_add_filter(struct cdnr_add_filter *); +static int cdnrcmd_delete_filter(struct cdnr_delete_filter *); +static int cdnrcmd_add_tbm(struct cdnr_add_tbmeter *); +static int cdnrcmd_modify_tbm(struct cdnr_modify_tbmeter *); +static int cdnrcmd_tbm_stats(struct cdnr_tbmeter_stats *); +static int cdnrcmd_add_trtcm(struct cdnr_add_trtcm *); +static int cdnrcmd_modify_trtcm(struct cdnr_modify_trtcm *); +static int cdnrcmd_tcm_stats(struct cdnr_tcm_stats *); +static int cdnrcmd_add_tswtcm(struct cdnr_add_tswtcm *); +static int cdnrcmd_modify_tswtcm(struct cdnr_modify_tswtcm *); +static int cdnrcmd_get_stats(struct cdnr_get_stats *); + +altqdev_decl(cdnr); + +/* + * top level input function called from ip_input. + * should be called before converting header fields to host-byte-order. 
+ */ +int +altq_cdnr_input(m, af) + struct mbuf *m; + int af; /* address family */ +{ + struct ifnet *ifp; + struct ip *ip; + struct top_cdnr *top; + struct tc_action *tca; + struct cdnr_block *cb; + struct cdnr_pktinfo pktinfo; + + ifp = m->m_pkthdr.rcvif; + if (!ALTQ_IS_CNDTNING(&ifp->if_snd)) + /* traffic conditioner is not enabled on this interface */ + return (1); + + top = ifp->if_snd.altq_cdnr; + + ip = mtod(m, struct ip *); +#ifdef INET6 + if (af == AF_INET6) { + u_int32_t flowlabel; + + flowlabel = ((struct ip6_hdr *)ip)->ip6_flow; + pktinfo.pkt_dscp = (ntohl(flowlabel) >> 20) & DSCP_MASK; + } else +#endif + pktinfo.pkt_dscp = ip->ip_tos & DSCP_MASK; + pktinfo.pkt_len = m_pktlen(m); + + tca = NULL; + + cb = acc_classify(&top->tc_classifier, m, af); + if (cb != NULL) + tca = &cb->cb_action; + + if (tca == NULL) + tca = &top->tc_block.cb_action; + + while (1) { + PKTCNTR_ADD(&top->tc_cnts[tca->tca_code], pktinfo.pkt_len); + + switch (tca->tca_code) { + case TCACODE_PASS: + return (1); + case TCACODE_DROP: + m_freem(m); + return (0); + case TCACODE_RETURN: + return (0); + case TCACODE_MARK: +#ifdef INET6 + if (af == AF_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + flowlabel = (tca->tca_dscp << 20) | + (flowlabel & ~(DSCP_MASK << 20)); + ip6->ip6_flow = htonl(flowlabel); + } else +#endif + ip->ip_tos = tca->tca_dscp | + (ip->ip_tos & DSCP_CUMASK); + return (1); + case TCACODE_NEXT: + cb = tca->tca_next; + tca = (*cb->cb_input)(cb, &pktinfo); + break; + case TCACODE_NONE: + default: + return (1); + } + } +} + +static struct top_cdnr * +tcb_lookup(ifname) + char *ifname; +{ + struct top_cdnr *top; + struct ifnet *ifp; + + if ((ifp = ifunit(ifname)) != NULL) + LIST_FOREACH(top, &tcb_list, tc_next) + if (top->tc_ifq->altq_ifp == ifp) + return (top); + return (NULL); +} + +static struct cdnr_block * +cdnr_handle2cb(handle) + u_long handle; +{ + struct cdnr_block *cb; + + cb = (struct cdnr_block *)handle; + if (handle != ALIGN(cb)) + return (NULL); + + if (cb == NULL || cb->cb_handle != handle) + return (NULL); + return (cb); +} + +static u_long +cdnr_cb2handle(cb) + struct cdnr_block *cb; +{ + return (cb->cb_handle); +} + +static void * +cdnr_cballoc(top, type, input_func) + struct top_cdnr *top; + int type; + struct tc_action *(*input_func)(struct cdnr_block *, + struct cdnr_pktinfo *); +{ + struct cdnr_block *cb; + int size; + + switch (type) { + case TCETYPE_TOP: + size = sizeof(struct top_cdnr); + break; + case TCETYPE_ELEMENT: + size = sizeof(struct cdnr_block); + break; + case TCETYPE_TBMETER: + size = sizeof(struct tbmeter); + break; + case TCETYPE_TRTCM: + size = sizeof(struct trtcm); + break; + case TCETYPE_TSWTCM: + size = sizeof(struct tswtcm); + break; + default: + return (NULL); + } + + cb = malloc(size, M_DEVBUF, M_WAITOK); + if (cb == NULL) + return (NULL); + bzero(cb, size); + + cb->cb_len = size; + cb->cb_type = type; + cb->cb_ref = 0; + cb->cb_handle = (u_long)cb; + if (top == NULL) + cb->cb_top = (struct top_cdnr *)cb; + else + cb->cb_top = top; + + if (input_func != NULL) { + /* + * if this cdnr has an action function, + * make tc_action to call itself. 
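/*
 * Worked sketch, not part of this patch, of the DSCP arithmetic in
 * altq_cdnr_input() above, assuming the usual altq_cdnr.h value
 * DSCP_MASK == 0xfc.  The first 32-bit word of the IPv6 header is
 *
 *      version(4 bits) | traffic class(8 bits) | flow label(20 bits)
 *
 * so shifting the host-order word right by 20 drops the flow label and
 * leaves the traffic class in the low byte; masking with 0xfc then
 * keeps the six DSCP bits in the same position they occupy in the
 * IPv4 TOS octet.
 */
static u_int8_t
dscp_from_ip6_flow_sketch(u_int32_t ip6_flow_net)
{
        u_int32_t w = ntohl(ip6_flow_net);

        /* e.g. EF (DSCP 46, binary 101110) comes back as 0xb8 */
        return ((w >> 20) & DSCP_MASK);
}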
+ */ + cb->cb_action.tca_code = TCACODE_NEXT; + cb->cb_action.tca_next = cb; + cb->cb_input = input_func; + } else + cb->cb_action.tca_code = TCACODE_NONE; + + /* if this isn't top, register the element to the top level cdnr */ + if (top != NULL) + LIST_INSERT_HEAD(&top->tc_elements, cb, cb_next); + + return ((void *)cb); +} + +static void +cdnr_cbdestroy(cblock) + void *cblock; +{ + struct cdnr_block *cb = cblock; + + /* delete filters belonging to this cdnr */ + acc_discard_filters(&cb->cb_top->tc_classifier, cb, 0); + + /* remove from the top level cdnr */ + if (cb->cb_top != cblock) + LIST_REMOVE(cb, cb_next); + + free(cb, M_DEVBUF); +} + +/* + * conditioner common destroy routine + */ +static int +generic_element_destroy(cb) + struct cdnr_block *cb; +{ + int error = 0; + + switch (cb->cb_type) { + case TCETYPE_TOP: + error = top_destroy((struct top_cdnr *)cb); + break; + case TCETYPE_ELEMENT: + error = element_destroy(cb); + break; + case TCETYPE_TBMETER: + error = tbm_destroy((struct tbmeter *)cb); + break; + case TCETYPE_TRTCM: + error = trtcm_destroy((struct trtcm *)cb); + break; + case TCETYPE_TSWTCM: + error = tswtcm_destroy((struct tswtcm *)cb); + break; + default: + error = EINVAL; + } + return (error); +} + +static int +tca_verify_action(utca) + struct tc_action *utca; +{ + switch (utca->tca_code) { + case TCACODE_PASS: + case TCACODE_DROP: + case TCACODE_MARK: + /* these are ok */ + break; + + case TCACODE_HANDLE: + /* verify handle value */ + if (cdnr_handle2cb(utca->tca_handle) == NULL) + return (-1); + break; + + case TCACODE_NONE: + case TCACODE_RETURN: + case TCACODE_NEXT: + default: + /* should not be passed from a user */ + return (-1); + } + return (0); +} + +static void +tca_import_action(ktca, utca) + struct tc_action *ktca, *utca; +{ + struct cdnr_block *cb; + + *ktca = *utca; + if (ktca->tca_code == TCACODE_HANDLE) { + cb = cdnr_handle2cb(ktca->tca_handle); + if (cb == NULL) { + ktca->tca_code = TCACODE_NONE; + return; + } + ktca->tca_code = TCACODE_NEXT; + ktca->tca_next = cb; + cb->cb_ref++; + } else if (ktca->tca_code == TCACODE_MARK) { + ktca->tca_dscp &= DSCP_MASK; + } + return; +} + +static void +tca_invalidate_action(tca) + struct tc_action *tca; +{ + struct cdnr_block *cb; + + if (tca->tca_code == TCACODE_NEXT) { + cb = tca->tca_next; + if (cb == NULL) + return; + cb->cb_ref--; + } + tca->tca_code = TCACODE_NONE; +} + +/* + * top level traffic conditioner + */ +static struct top_cdnr * +top_create(ifq) + struct ifaltq *ifq; +{ + struct top_cdnr *top; + + if ((top = cdnr_cballoc(NULL, TCETYPE_TOP, NULL)) == NULL) + return (NULL); + + top->tc_ifq = ifq; + /* set default action for the top level conditioner */ + top->tc_block.cb_action.tca_code = TCACODE_PASS; + + LIST_INSERT_HEAD(&tcb_list, top, tc_next); + + ifq->altq_cdnr = top; + + return (top); +} + +static int +top_destroy(top) + struct top_cdnr *top; +{ + struct cdnr_block *cb; + + if (ALTQ_IS_CNDTNING(top->tc_ifq)) + ALTQ_CLEAR_CNDTNING(top->tc_ifq); + top->tc_ifq->altq_cdnr = NULL; + + /* + * destroy all the conditioner elements belonging to this interface + */ + while ((cb = LIST_FIRST(&top->tc_elements)) != NULL) { + while (cb != NULL && cb->cb_ref > 0) + cb = LIST_NEXT(cb, cb_next); + if (cb != NULL) + generic_element_destroy(cb); + } + + LIST_REMOVE(top, tc_next); + + cdnr_cbdestroy(top); + + /* if there is no active conditioner, remove the input hook */ + if (altq_input != NULL) { + LIST_FOREACH(top, &tcb_list, tc_next) + if (ALTQ_IS_CNDTNING(top->tc_ifq)) + break; + if (top == NULL) + 
altq_input = NULL; + } + + return (0); +} + +/* + * simple tc elements without input function (e.g., dropper and makers). + */ +static struct cdnr_block * +element_create(top, action) + struct top_cdnr *top; + struct tc_action *action; +{ + struct cdnr_block *cb; + + if (tca_verify_action(action) < 0) + return (NULL); + + if ((cb = cdnr_cballoc(top, TCETYPE_ELEMENT, NULL)) == NULL) + return (NULL); + + tca_import_action(&cb->cb_action, action); + + return (cb); +} + +static int +element_destroy(cb) + struct cdnr_block *cb; +{ + if (cb->cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&cb->cb_action); + + cdnr_cbdestroy(cb); + return (0); +} + +/* + * internal representation of token bucket parameters + * rate: byte_per_unittime << 32 + * (((bits_per_sec) / 8) << 32) / machclk_freq + * depth: byte << 32 + * + */ +#define TB_SHIFT 32 +#define TB_SCALE(x) ((u_int64_t)(x) << TB_SHIFT) +#define TB_UNSCALE(x) ((x) >> TB_SHIFT) + +static void +tb_import_profile(tb, profile) + struct tbe *tb; + struct tb_profile *profile; +{ + tb->rate = TB_SCALE(profile->rate / 8) / machclk_freq; + tb->depth = TB_SCALE(profile->depth); + if (tb->rate > 0) + tb->filluptime = tb->depth / tb->rate; + else + tb->filluptime = 0xffffffffffffffffLL; + tb->token = tb->depth; + tb->last = read_machclk(); +} + +/* + * simple token bucket meter + */ +static struct tbmeter * +tbm_create(top, profile, in_action, out_action) + struct top_cdnr *top; + struct tb_profile *profile; + struct tc_action *in_action, *out_action; +{ + struct tbmeter *tbm = NULL; + + if (tca_verify_action(in_action) < 0 + || tca_verify_action(out_action) < 0) + return (NULL); + + if ((tbm = cdnr_cballoc(top, TCETYPE_TBMETER, + tbm_input)) == NULL) + return (NULL); + + tb_import_profile(&tbm->tb, profile); + + tca_import_action(&tbm->in_action, in_action); + tca_import_action(&tbm->out_action, out_action); + + return (tbm); +} + +static int +tbm_destroy(tbm) + struct tbmeter *tbm; +{ + if (tbm->cdnrblk.cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&tbm->in_action); + tca_invalidate_action(&tbm->out_action); + + cdnr_cbdestroy(tbm); + return (0); +} + +static struct tc_action * +tbm_input(cb, pktinfo) + struct cdnr_block *cb; + struct cdnr_pktinfo *pktinfo; +{ + struct tbmeter *tbm = (struct tbmeter *)cb; + u_int64_t len; + u_int64_t interval, now; + + len = TB_SCALE(pktinfo->pkt_len); + + if (tbm->tb.token < len) { + now = read_machclk(); + interval = now - tbm->tb.last; + if (interval >= tbm->tb.filluptime) + tbm->tb.token = tbm->tb.depth; + else { + tbm->tb.token += interval * tbm->tb.rate; + if (tbm->tb.token > tbm->tb.depth) + tbm->tb.token = tbm->tb.depth; + } + tbm->tb.last = now; + } + + if (tbm->tb.token < len) { + PKTCNTR_ADD(&tbm->out_cnt, pktinfo->pkt_len); + return (&tbm->out_action); + } + + tbm->tb.token -= len; + PKTCNTR_ADD(&tbm->in_cnt, pktinfo->pkt_len); + return (&tbm->in_action); +} + +/* + * two rate three color marker + * as described in draft-heinanen-diffserv-trtcm-01.txt + */ +static struct trtcm * +trtcm_create(top, cmtd_profile, peak_profile, + green_action, yellow_action, red_action, coloraware) + struct top_cdnr *top; + struct tb_profile *cmtd_profile, *peak_profile; + struct tc_action *green_action, *yellow_action, *red_action; + int coloraware; +{ + struct trtcm *tcm = NULL; + + if (tca_verify_action(green_action) < 0 + || tca_verify_action(yellow_action) < 0 + || tca_verify_action(red_action) < 0) + return (NULL); + + if ((tcm = cdnr_cballoc(top, TCETYPE_TRTCM, + trtcm_input)) == NULL) + return (NULL); + + 
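	/*
+	 * tb_import_profile() converts the user-visible profile (rate in
+	 * bits/sec, depth in bytes) into the scaled fixed-point form
+	 * defined above: rate = ((bits/sec / 8) << TB_SHIFT) / machclk_freq.
+	 * illustration with hypothetical numbers: at 1 Mbps and a 1 GHz
+	 * machclk_freq, rate = (125000 << 32) / 10^9 ~= 536870 scaled bytes
+	 * per machine-clock count, and a 1500-byte packet costs
+	 * TB_SCALE(1500) tokens.
+	 */
+	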
tb_import_profile(&tcm->cmtd_tb, cmtd_profile); + tb_import_profile(&tcm->peak_tb, peak_profile); + + tca_import_action(&tcm->green_action, green_action); + tca_import_action(&tcm->yellow_action, yellow_action); + tca_import_action(&tcm->red_action, red_action); + + /* set dscps to use */ + if (tcm->green_action.tca_code == TCACODE_MARK) + tcm->green_dscp = tcm->green_action.tca_dscp & DSCP_MASK; + else + tcm->green_dscp = DSCP_AF11; + if (tcm->yellow_action.tca_code == TCACODE_MARK) + tcm->yellow_dscp = tcm->yellow_action.tca_dscp & DSCP_MASK; + else + tcm->yellow_dscp = DSCP_AF12; + if (tcm->red_action.tca_code == TCACODE_MARK) + tcm->red_dscp = tcm->red_action.tca_dscp & DSCP_MASK; + else + tcm->red_dscp = DSCP_AF13; + + tcm->coloraware = coloraware; + + return (tcm); +} + +static int +trtcm_destroy(tcm) + struct trtcm *tcm; +{ + if (tcm->cdnrblk.cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&tcm->green_action); + tca_invalidate_action(&tcm->yellow_action); + tca_invalidate_action(&tcm->red_action); + + cdnr_cbdestroy(tcm); + return (0); +} + +static struct tc_action * +trtcm_input(cb, pktinfo) + struct cdnr_block *cb; + struct cdnr_pktinfo *pktinfo; +{ + struct trtcm *tcm = (struct trtcm *)cb; + u_int64_t len; + u_int64_t interval, now; + u_int8_t color; + + len = TB_SCALE(pktinfo->pkt_len); + if (tcm->coloraware) { + color = pktinfo->pkt_dscp; + if (color != tcm->yellow_dscp && color != tcm->red_dscp) + color = tcm->green_dscp; + } else { + /* if color-blind, precolor it as green */ + color = tcm->green_dscp; + } + + now = read_machclk(); + if (tcm->cmtd_tb.token < len) { + interval = now - tcm->cmtd_tb.last; + if (interval >= tcm->cmtd_tb.filluptime) + tcm->cmtd_tb.token = tcm->cmtd_tb.depth; + else { + tcm->cmtd_tb.token += interval * tcm->cmtd_tb.rate; + if (tcm->cmtd_tb.token > tcm->cmtd_tb.depth) + tcm->cmtd_tb.token = tcm->cmtd_tb.depth; + } + tcm->cmtd_tb.last = now; + } + if (tcm->peak_tb.token < len) { + interval = now - tcm->peak_tb.last; + if (interval >= tcm->peak_tb.filluptime) + tcm->peak_tb.token = tcm->peak_tb.depth; + else { + tcm->peak_tb.token += interval * tcm->peak_tb.rate; + if (tcm->peak_tb.token > tcm->peak_tb.depth) + tcm->peak_tb.token = tcm->peak_tb.depth; + } + tcm->peak_tb.last = now; + } + + if (color == tcm->red_dscp || tcm->peak_tb.token < len) { + pktinfo->pkt_dscp = tcm->red_dscp; + PKTCNTR_ADD(&tcm->red_cnt, pktinfo->pkt_len); + return (&tcm->red_action); + } + + if (color == tcm->yellow_dscp || tcm->cmtd_tb.token < len) { + pktinfo->pkt_dscp = tcm->yellow_dscp; + tcm->peak_tb.token -= len; + PKTCNTR_ADD(&tcm->yellow_cnt, pktinfo->pkt_len); + return (&tcm->yellow_action); + } + + pktinfo->pkt_dscp = tcm->green_dscp; + tcm->cmtd_tb.token -= len; + tcm->peak_tb.token -= len; + PKTCNTR_ADD(&tcm->green_cnt, pktinfo->pkt_len); + return (&tcm->green_action); +} + +/* + * time sliding window three color marker + * as described in draft-fang-diffserv-tc-tswtcm-00.txt + */ +static struct tswtcm * +tswtcm_create(top, cmtd_rate, peak_rate, avg_interval, + green_action, yellow_action, red_action) + struct top_cdnr *top; + u_int32_t cmtd_rate, peak_rate, avg_interval; + struct tc_action *green_action, *yellow_action, *red_action; +{ + struct tswtcm *tsw; + + if (tca_verify_action(green_action) < 0 + || tca_verify_action(yellow_action) < 0 + || tca_verify_action(red_action) < 0) + return (NULL); + + if ((tsw = cdnr_cballoc(top, TCETYPE_TSWTCM, + tswtcm_input)) == NULL) + return (NULL); + + tca_import_action(&tsw->green_action, green_action); + 
tca_import_action(&tsw->yellow_action, yellow_action); + tca_import_action(&tsw->red_action, red_action); + + /* set dscps to use */ + if (tsw->green_action.tca_code == TCACODE_MARK) + tsw->green_dscp = tsw->green_action.tca_dscp & DSCP_MASK; + else + tsw->green_dscp = DSCP_AF11; + if (tsw->yellow_action.tca_code == TCACODE_MARK) + tsw->yellow_dscp = tsw->yellow_action.tca_dscp & DSCP_MASK; + else + tsw->yellow_dscp = DSCP_AF12; + if (tsw->red_action.tca_code == TCACODE_MARK) + tsw->red_dscp = tsw->red_action.tca_dscp & DSCP_MASK; + else + tsw->red_dscp = DSCP_AF13; + + /* convert rates from bits/sec to bytes/sec */ + tsw->cmtd_rate = cmtd_rate / 8; + tsw->peak_rate = peak_rate / 8; + tsw->avg_rate = 0; + + /* timewin is converted from msec to machine clock unit */ + tsw->timewin = (u_int64_t)machclk_freq * avg_interval / 1000; + + return (tsw); +} + +static int +tswtcm_destroy(tsw) + struct tswtcm *tsw; +{ + if (tsw->cdnrblk.cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&tsw->green_action); + tca_invalidate_action(&tsw->yellow_action); + tca_invalidate_action(&tsw->red_action); + + cdnr_cbdestroy(tsw); + return (0); +} + +static struct tc_action * +tswtcm_input(cb, pktinfo) + struct cdnr_block *cb; + struct cdnr_pktinfo *pktinfo; +{ + struct tswtcm *tsw = (struct tswtcm *)cb; + int len; + u_int32_t avg_rate; + u_int64_t interval, now, tmp; + + /* + * rate estimator + */ + len = pktinfo->pkt_len; + now = read_machclk(); + + interval = now - tsw->t_front; + /* + * calculate average rate: + * avg = (avg * timewin + pkt_len)/(timewin + interval) + * pkt_len needs to be multiplied by machclk_freq in order to + * get (bytes/sec). + * note: when avg_rate (bytes/sec) and timewin (machclk unit) are + * less than 32 bits, the following 64-bit operation has enough + * precision. 
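+	 * worked example (hypothetical numbers): with machclk_freq = 10^9,
+	 * timewin = 5*10^8 (a 500 msec averaging interval) and a current
+	 * avg_rate of 1250000 bytes/sec, a 1500-byte packet arriving
+	 * 1.2*10^6 ticks (1.2 msec) after the previous one carries exactly
+	 * the average rate (1500 bytes / 1.2 msec = 1250000 bytes/sec);
+	 * indeed (1250000*5*10^8 + 1500*10^9) / (5*10^8 + 1.2*10^6)
+	 * = 1250000, leaving the estimate unchanged.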
+ */ + tmp = ((u_int64_t)tsw->avg_rate * tsw->timewin + + (u_int64_t)len * machclk_freq) / (tsw->timewin + interval); + tsw->avg_rate = avg_rate = (u_int32_t)tmp; + tsw->t_front = now; + + /* + * marker + */ + if (avg_rate > tsw->cmtd_rate) { + u_int32_t randval = arc4random() % avg_rate; + + if (avg_rate > tsw->peak_rate) { + if (randval < avg_rate - tsw->peak_rate) { + /* mark red */ + pktinfo->pkt_dscp = tsw->red_dscp; + PKTCNTR_ADD(&tsw->red_cnt, len); + return (&tsw->red_action); + } else if (randval < avg_rate - tsw->cmtd_rate) + goto mark_yellow; + } else { + /* peak_rate >= avg_rate > cmtd_rate */ + if (randval < avg_rate - tsw->cmtd_rate) { + mark_yellow: + pktinfo->pkt_dscp = tsw->yellow_dscp; + PKTCNTR_ADD(&tsw->yellow_cnt, len); + return (&tsw->yellow_action); + } + } + } + + /* mark green */ + pktinfo->pkt_dscp = tsw->green_dscp; + PKTCNTR_ADD(&tsw->green_cnt, len); + return (&tsw->green_action); +} + +/* + * ioctl requests + */ +static int +cdnrcmd_if_attach(ifname) + char *ifname; +{ + struct ifnet *ifp; + struct top_cdnr *top; + + if ((ifp = ifunit(ifname)) == NULL) + return (EBADF); + + if (ifp->if_snd.altq_cdnr != NULL) + return (EBUSY); + + if ((top = top_create(&ifp->if_snd)) == NULL) + return (ENOMEM); + return (0); +} + +static int +cdnrcmd_if_detach(ifname) + char *ifname; +{ + struct top_cdnr *top; + + if ((top = tcb_lookup(ifname)) == NULL) + return (EBADF); + + return top_destroy(top); +} + +static int +cdnrcmd_add_element(ap) + struct cdnr_add_element *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + cb = element_create(top, &ap->action); + if (cb == NULL) + return (EINVAL); + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(cb); + return (0); +} + +static int +cdnrcmd_delete_element(ap) + struct cdnr_delete_element *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + if (cb->cb_type != TCETYPE_ELEMENT) + return generic_element_destroy(cb); + + return element_destroy(cb); +} + +static int +cdnrcmd_add_filter(ap) + struct cdnr_add_filter *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + return acc_add_filter(&top->tc_classifier, &ap->filter, + cb, &ap->filter_handle); +} + +static int +cdnrcmd_delete_filter(ap) + struct cdnr_delete_filter *ap; +{ + struct top_cdnr *top; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + return acc_delete_filter(&top->tc_classifier, ap->filter_handle); +} + +static int +cdnrcmd_add_tbm(ap) + struct cdnr_add_tbmeter *ap; +{ + struct top_cdnr *top; + struct tbmeter *tbm; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + tbm = tbm_create(top, &ap->profile, &ap->in_action, &ap->out_action); + if (tbm == NULL) + return (EINVAL); + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(&tbm->cdnrblk); + return (0); +} + +static int +cdnrcmd_modify_tbm(ap) + struct cdnr_modify_tbmeter *ap; +{ + struct tbmeter *tbm; + + if ((tbm = (struct tbmeter *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + tb_import_profile(&tbm->tb, &ap->profile); + + return (0); +} + +static int +cdnrcmd_tbm_stats(ap) + struct 
cdnr_tbmeter_stats *ap; +{ + struct tbmeter *tbm; + + if ((tbm = (struct tbmeter *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + ap->in_cnt = tbm->in_cnt; + ap->out_cnt = tbm->out_cnt; + + return (0); +} + +static int +cdnrcmd_add_trtcm(ap) + struct cdnr_add_trtcm *ap; +{ + struct top_cdnr *top; + struct trtcm *tcm; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + tcm = trtcm_create(top, &ap->cmtd_profile, &ap->peak_profile, + &ap->green_action, &ap->yellow_action, + &ap->red_action, ap->coloraware); + if (tcm == NULL) + return (EINVAL); + + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(&tcm->cdnrblk); + return (0); +} + +static int +cdnrcmd_modify_trtcm(ap) + struct cdnr_modify_trtcm *ap; +{ + struct trtcm *tcm; + + if ((tcm = (struct trtcm *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + tb_import_profile(&tcm->cmtd_tb, &ap->cmtd_profile); + tb_import_profile(&tcm->peak_tb, &ap->peak_profile); + + return (0); +} + +static int +cdnrcmd_tcm_stats(ap) + struct cdnr_tcm_stats *ap; +{ + struct cdnr_block *cb; + + if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + if (cb->cb_type == TCETYPE_TRTCM) { + struct trtcm *tcm = (struct trtcm *)cb; + + ap->green_cnt = tcm->green_cnt; + ap->yellow_cnt = tcm->yellow_cnt; + ap->red_cnt = tcm->red_cnt; + } else if (cb->cb_type == TCETYPE_TSWTCM) { + struct tswtcm *tsw = (struct tswtcm *)cb; + + ap->green_cnt = tsw->green_cnt; + ap->yellow_cnt = tsw->yellow_cnt; + ap->red_cnt = tsw->red_cnt; + } else + return (EINVAL); + + return (0); +} + +static int +cdnrcmd_add_tswtcm(ap) + struct cdnr_add_tswtcm *ap; +{ + struct top_cdnr *top; + struct tswtcm *tsw; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + if (ap->cmtd_rate > ap->peak_rate) + return (EINVAL); + + tsw = tswtcm_create(top, ap->cmtd_rate, ap->peak_rate, + ap->avg_interval, &ap->green_action, + &ap->yellow_action, &ap->red_action); + if (tsw == NULL) + return (EINVAL); + + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(&tsw->cdnrblk); + return (0); +} + +static int +cdnrcmd_modify_tswtcm(ap) + struct cdnr_modify_tswtcm *ap; +{ + struct tswtcm *tsw; + + if ((tsw = (struct tswtcm *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + if (ap->cmtd_rate > ap->peak_rate) + return (EINVAL); + + /* convert rates from bits/sec to bytes/sec */ + tsw->cmtd_rate = ap->cmtd_rate / 8; + tsw->peak_rate = ap->peak_rate / 8; + tsw->avg_rate = 0; + + /* timewin is converted from msec to machine clock unit */ + tsw->timewin = (u_int64_t)machclk_freq * ap->avg_interval / 1000; + + return (0); +} + +static int +cdnrcmd_get_stats(ap) + struct cdnr_get_stats *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + struct tbmeter *tbm; + struct trtcm *tcm; + struct tswtcm *tsw; + struct tce_stats tce, *usp; + int error, n, nskip, nelements; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + /* copy action stats */ + bcopy(top->tc_cnts, ap->cnts, sizeof(ap->cnts)); + + /* stats for each element */ + nelements = ap->nelements; + usp = ap->tce_stats; + if (nelements <= 0 || usp == NULL) + return (0); + + nskip = ap->nskip; + n = 0; + LIST_FOREACH(cb, &top->tc_elements, cb_next) { + if (nskip > 0) { + nskip--; + continue; + } + + bzero(&tce, sizeof(tce)); + tce.tce_handle = cb->cb_handle; + tce.tce_type = cb->cb_type; + switch (cb->cb_type) { + case TCETYPE_TBMETER: + tbm = (struct tbmeter *)cb; + 
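			/* a token-bucket meter exports two counters: in/out of profile */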
+			tce.tce_cnts[0] = tbm->in_cnt;
+			tce.tce_cnts[1] = tbm->out_cnt;
+			break;
+		case TCETYPE_TRTCM:
+			tcm = (struct trtcm *)cb;
+			tce.tce_cnts[0] = tcm->green_cnt;
+			tce.tce_cnts[1] = tcm->yellow_cnt;
+			tce.tce_cnts[2] = tcm->red_cnt;
+			break;
+		case TCETYPE_TSWTCM:
+			tsw = (struct tswtcm *)cb;
+			tce.tce_cnts[0] = tsw->green_cnt;
+			tce.tce_cnts[1] = tsw->yellow_cnt;
+			tce.tce_cnts[2] = tsw->red_cnt;
+			break;
+		default:
+			continue;
+		}
+
+		if ((error = copyout((caddr_t)&tce, (caddr_t)usp++,
+		    sizeof(tce))) != 0)
+			return (error);
+
+		if (++n == nelements)
+			break;
+	}
+	ap->nelements = n;
+
+	return (0);
+}
+
+/*
+ * conditioner device interface
+ */
+int
+cdnropen(dev, flag, fmt, p)
+	dev_t dev;
+	int flag, fmt;
+#if (__FreeBSD_version > 500000)
+	struct thread *p;
+#else
+	struct proc *p;
+#endif
+{
+	if (machclk_freq == 0)
+		init_machclk();
+
+	if (machclk_freq == 0) {
+		printf("cdnr: no cpu clock available!\n");
+		return (ENXIO);
+	}
+
+	/* everything will be done when the queueing scheme is attached. */
+	return 0;
+}
+
+int
+cdnrclose(dev, flag, fmt, p)
+	dev_t dev;
+	int flag, fmt;
+#if (__FreeBSD_version > 500000)
+	struct thread *p;
+#else
+	struct proc *p;
+#endif
+{
+	struct top_cdnr *top;
+	int err, error = 0;
+
+	while ((top = LIST_FIRST(&tcb_list)) != NULL) {
+		/* destroy all */
+		err = top_destroy(top);
+		if (err != 0 && error == 0)
+			error = err;
+	}
+	altq_input = NULL;
+
+	return (error);
+}
+
+int
+cdnrioctl(dev, cmd, addr, flag, p)
+	dev_t dev;
+	ioctlcmd_t cmd;
+	caddr_t addr;
+	int flag;
+#if (__FreeBSD_version > 500000)
+	struct thread *p;
+#else
+	struct proc *p;
+#endif
+{
+	struct top_cdnr *top;
+	struct cdnr_interface *ifacep;
+	int s, error = 0;
+
+	/* check super-user privilege */
+	switch (cmd) {
+	case CDNR_GETSTATS:
+		break;
+	default:
+#if (__FreeBSD_version > 700000)
+		if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0)
+#elif (__FreeBSD_version > 400000)
+		if ((error = suser(p)) != 0)
+#else
+		if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
+#endif
+			return (error);
+		break;
+	}
+
+#ifdef __NetBSD__
+	s = splnet();
+#else
+	s = splimp();
+#endif
+	switch (cmd) {
+
+	case CDNR_IF_ATTACH:
+		ifacep = (struct cdnr_interface *)addr;
+		error = cdnrcmd_if_attach(ifacep->cdnr_ifname);
+		break;
+
+	case CDNR_IF_DETACH:
+		ifacep = (struct cdnr_interface *)addr;
+		error = cdnrcmd_if_detach(ifacep->cdnr_ifname);
+		break;
+
+	case CDNR_ENABLE:
+	case CDNR_DISABLE:
+		ifacep = (struct cdnr_interface *)addr;
+		if ((top = tcb_lookup(ifacep->cdnr_ifname)) == NULL) {
+			error = EBADF;
+			break;
+		}
+
+		switch (cmd) {
+
+		case CDNR_ENABLE:
+			ALTQ_SET_CNDTNING(top->tc_ifq);
+			if (altq_input == NULL)
+				altq_input = altq_cdnr_input;
+			break;
+
+		case CDNR_DISABLE:
+			ALTQ_CLEAR_CNDTNING(top->tc_ifq);
+			LIST_FOREACH(top, &tcb_list, tc_next)
+				if (ALTQ_IS_CNDTNING(top->tc_ifq))
+					break;
+			if (top == NULL)
+				altq_input = NULL;
+			break;
+		}
+		break;
+
+	case CDNR_ADD_ELEM:
+		error = cdnrcmd_add_element((struct cdnr_add_element *)addr);
+		break;
+
+	case CDNR_DEL_ELEM:
+		error = cdnrcmd_delete_element((struct cdnr_delete_element *)addr);
+		break;
+
+	case CDNR_ADD_TBM:
+		error = cdnrcmd_add_tbm((struct cdnr_add_tbmeter *)addr);
+		break;
+
+	case CDNR_MOD_TBM:
+		error = cdnrcmd_modify_tbm((struct cdnr_modify_tbmeter *)addr);
+		break;
+
+	case CDNR_TBM_STATS:
+		error = cdnrcmd_tbm_stats((struct cdnr_tbmeter_stats *)addr);
+		break;
+
+	case CDNR_ADD_TCM:
+		error = cdnrcmd_add_trtcm((struct cdnr_add_trtcm *)addr);
+		break;
+
+	case CDNR_MOD_TCM:
+		error = cdnrcmd_modify_trtcm((struct 
cdnr_modify_trtcm *)addr); + break; + + case CDNR_TCM_STATS: + error = cdnrcmd_tcm_stats((struct cdnr_tcm_stats *)addr); + break; + + case CDNR_ADD_FILTER: + error = cdnrcmd_add_filter((struct cdnr_add_filter *)addr); + break; + + case CDNR_DEL_FILTER: + error = cdnrcmd_delete_filter((struct cdnr_delete_filter *)addr); + break; + + case CDNR_GETSTATS: + error = cdnrcmd_get_stats((struct cdnr_get_stats *)addr); + break; + + case CDNR_ADD_TSW: + error = cdnrcmd_add_tswtcm((struct cdnr_add_tswtcm *)addr); + break; + + case CDNR_MOD_TSW: + error = cdnrcmd_modify_tswtcm((struct cdnr_modify_tswtcm *)addr); + break; + + default: + error = EINVAL; + break; + } + splx(s); + + return error; +} + +#ifdef KLD_MODULE + +static struct altqsw cdnr_sw = + {"cdnr", cdnropen, cdnrclose, cdnrioctl}; + +ALTQ_MODULE(altq_cdnr, ALTQT_CDNR, &cdnr_sw); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ3_COMPAT */ +#endif /* ALTQ_CDNR */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_cdnr.h b/contrib/altq/rtems/freebsd/altq/altq_cdnr.h new file mode 100644 index 00000000..002e3c38 --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_cdnr.h @@ -0,0 +1,335 @@ +/* $KAME: altq_cdnr.h,v 1.9 2003/07/10 12:07:48 kjc Exp $ */ + +/* + * Copyright (C) 1999-2002 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ALTQ_ALTQ_CDNR_HH_ +#define _ALTQ_ALTQ_CDNR_HH_ + +#include + +/* + * traffic conditioner element types + */ +#define TCETYPE_NONE 0 +#define TCETYPE_TOP 1 /* top level conditioner */ +#define TCETYPE_ELEMENT 2 /* a simple tc element */ +#define TCETYPE_TBMETER 3 /* token bucket meter */ +#define TCETYPE_TRTCM 4 /* (two-rate) three color marker */ +#define TCETYPE_TSWTCM 5 /* time sliding window 3-color maker */ + +/* + * traffic conditioner action + */ +struct cdnr_block; + +struct tc_action { + int tca_code; /* e.g., TCACODE_PASS */ + /* tca_code dependent variable */ + union { + u_long un_value; /* template */ + u_int8_t un_dscp; /* diffserv code point */ + u_long un_handle; /* tc action handle */ + struct cdnr_block *un_next; /* next tc element block */ + } tca_un; +}; +#define tca_value tca_un.un_value +#define tca_dscp tca_un.un_dscp +#define tca_handle tca_un.un_handle +#define tca_next tca_un.un_next + +#define TCACODE_NONE 0 /* action is not set */ +#define TCACODE_PASS 1 /* pass this packet */ +#define TCACODE_DROP 2 /* discard this packet */ +#define TCACODE_RETURN 3 /* do not process this packet */ +#define TCACODE_MARK 4 /* mark dscp */ +#define TCACODE_HANDLE 5 /* take action specified by handle */ +#define TCACODE_NEXT 6 /* take action in the next tc element */ +#define TCACODE_MAX 6 + +#define CDNR_NULL_HANDLE 0 + +struct cdnr_interface { + char cdnr_ifname[IFNAMSIZ]; /* interface name (e.g., fxp0) */ +}; + +/* simple element operations */ +struct cdnr_add_element { + struct cdnr_interface iface; + struct tc_action action; + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_delete_element { + struct cdnr_interface iface; + u_long cdnr_handle; +}; + +/* token-bucket meter operations */ +struct cdnr_add_tbmeter { + struct cdnr_interface iface; + struct tb_profile profile; + struct tc_action in_action; + struct tc_action out_action; + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_modify_tbmeter { + struct cdnr_interface iface; + u_long cdnr_handle; + struct tb_profile profile; +}; + +struct cdnr_tbmeter_stats { + struct cdnr_interface iface; + u_long cdnr_handle; + struct pktcntr in_cnt; + struct pktcntr out_cnt; +}; + +/* two-rate three-color marker operations */ +struct cdnr_add_trtcm { + struct cdnr_interface iface; + struct tb_profile cmtd_profile; /* profile for committed tb */ + struct tb_profile peak_profile; /* profile for peak tb */ + struct tc_action green_action; /* action for green packets */ + struct tc_action yellow_action; /* action for yellow packets */ + struct tc_action red_action; /* action for red packets */ + int coloraware; /* color-aware/color-blind */ + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_modify_trtcm { + struct cdnr_interface iface; + u_long cdnr_handle; + struct tb_profile cmtd_profile; /* profile for committed tb */ + struct tb_profile peak_profile; /* profile for peak tb */ + int coloraware; /* color-aware/color-blind */ +}; + +struct cdnr_tcm_stats { + struct cdnr_interface iface; + u_long cdnr_handle; + struct pktcntr green_cnt; + struct pktcntr yellow_cnt; + struct pktcntr red_cnt; +}; + +/* time sliding window three-color marker operations */ +struct cdnr_add_tswtcm { + struct cdnr_interface iface; + u_int32_t cmtd_rate; /* committed rate (bits/sec) */ + u_int32_t peak_rate; /* peak rate (bits/sec) */ + u_int32_t avg_interval; /* averaging interval (msec) */ + struct tc_action green_action; /* action for green packets */ + struct tc_action yellow_action; /* action for yellow 
packets */ + struct tc_action red_action; /* action for red packets */ + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_modify_tswtcm { + struct cdnr_interface iface; + u_long cdnr_handle; + u_int32_t cmtd_rate; /* committed rate (bits/sec) */ + u_int32_t peak_rate; /* peak rate (bits/sec) */ + u_int32_t avg_interval; /* averaging interval (msec) */ +}; + +struct cdnr_add_filter { + struct cdnr_interface iface; + u_long cdnr_handle; +#ifdef ALTQ3_CLFIER_COMPAT + struct flow_filter filter; +#endif + u_long filter_handle; /* return value */ +}; + +struct cdnr_delete_filter { + struct cdnr_interface iface; + u_long filter_handle; +}; + +struct tce_stats { + u_long tce_handle; /* tc element handle */ + int tce_type; /* e.g., TCETYPE_ELEMENT */ + struct pktcntr tce_cnts[3]; /* tcm returns 3 counters */ +}; + +struct cdnr_get_stats { + struct cdnr_interface iface; + struct pktcntr cnts[TCACODE_MAX+1]; + + /* element stats */ + int nskip; /* skip # of elements */ + int nelements; /* # of element stats (WR) */ + struct tce_stats *tce_stats; /* pointer to stats array */ +}; + +#define CDNR_IF_ATTACH _IOW('Q', 1, struct cdnr_interface) +#define CDNR_IF_DETACH _IOW('Q', 2, struct cdnr_interface) +#define CDNR_ENABLE _IOW('Q', 3, struct cdnr_interface) +#define CDNR_DISABLE _IOW('Q', 4, struct cdnr_interface) +#define CDNR_ADD_FILTER _IOWR('Q', 10, struct cdnr_add_filter) +#define CDNR_DEL_FILTER _IOW('Q', 11, struct cdnr_delete_filter) +#define CDNR_GETSTATS _IOWR('Q', 12, struct cdnr_get_stats) +#define CDNR_ADD_ELEM _IOWR('Q', 30, struct cdnr_add_element) +#define CDNR_DEL_ELEM _IOW('Q', 31, struct cdnr_delete_element) +#define CDNR_ADD_TBM _IOWR('Q', 32, struct cdnr_add_tbmeter) +#define CDNR_MOD_TBM _IOW('Q', 33, struct cdnr_modify_tbmeter) +#define CDNR_TBM_STATS _IOWR('Q', 34, struct cdnr_tbmeter_stats) +#define CDNR_ADD_TCM _IOWR('Q', 35, struct cdnr_add_trtcm) +#define CDNR_MOD_TCM _IOWR('Q', 36, struct cdnr_modify_trtcm) +#define CDNR_TCM_STATS _IOWR('Q', 37, struct cdnr_tcm_stats) +#define CDNR_ADD_TSW _IOWR('Q', 38, struct cdnr_add_tswtcm) +#define CDNR_MOD_TSW _IOWR('Q', 39, struct cdnr_modify_tswtcm) + +#ifndef DSCP_EF +/* diffserve code points */ +#define DSCP_MASK 0xfc +#define DSCP_CUMASK 0x03 +#define DSCP_EF 0xb8 +#define DSCP_AF11 0x28 +#define DSCP_AF12 0x30 +#define DSCP_AF13 0x38 +#define DSCP_AF21 0x48 +#define DSCP_AF22 0x50 +#define DSCP_AF23 0x58 +#define DSCP_AF31 0x68 +#define DSCP_AF32 0x70 +#define DSCP_AF33 0x78 +#define DSCP_AF41 0x88 +#define DSCP_AF42 0x90 +#define DSCP_AF43 0x98 +#define AF_CLASSMASK 0xe0 +#define AF_DROPPRECMASK 0x18 +#endif + +#ifdef _KERNEL + +/* + * packet information passed to the input function of tc elements + */ +struct cdnr_pktinfo { + int pkt_len; /* packet length */ + u_int8_t pkt_dscp; /* diffserv code point */ +}; + +/* + * traffic conditioner control block common to all types of tc elements + */ +struct cdnr_block { + LIST_ENTRY(cdnr_block) cb_next; + int cb_len; /* size of this tc element */ + int cb_type; /* cdnr block type */ + int cb_ref; /* reference count of this element */ + u_long cb_handle; /* handle of this tc element */ + struct top_cdnr *cb_top; /* back pointer to top */ + struct tc_action cb_action; /* top level action for this tcb */ + struct tc_action *(*cb_input)(struct cdnr_block *, + struct cdnr_pktinfo *); +}; + +/* + * top level traffic conditioner structure for an interface + */ +struct top_cdnr { + struct cdnr_block tc_block; + + LIST_ENTRY(top_cdnr) tc_next; + struct ifaltq *tc_ifq; + + LIST_HEAD(, 
cdnr_block) tc_elements; +#ifdef ALTQ3_CLFIER_COMPAT + struct acc_classifier tc_classifier; +#endif + struct pktcntr tc_cnts[TCACODE_MAX+1]; +}; + +/* token bucket element */ +struct tbe { + u_int64_t rate; + u_int64_t depth; + + u_int64_t token; + u_int64_t filluptime; + u_int64_t last; +}; + +/* token bucket meter structure */ +struct tbmeter { + struct cdnr_block cdnrblk; /* conditioner block */ + struct tbe tb; /* token bucket */ + struct tc_action in_action; /* actions for IN/OUT */ + struct tc_action out_action; /* actions for IN/OUT */ + struct pktcntr in_cnt; /* statistics for IN/OUT */ + struct pktcntr out_cnt; /* statistics for IN/OUT */ +}; + +/* two-rate three-color marker structure */ +struct trtcm { + struct cdnr_block cdnrblk; /* conditioner block */ + struct tbe cmtd_tb; /* committed tb profile */ + struct tbe peak_tb; /* peak tb profile */ + struct tc_action green_action; + struct tc_action yellow_action; + struct tc_action red_action; + int coloraware; + u_int8_t green_dscp; + u_int8_t yellow_dscp; + u_int8_t red_dscp; + struct pktcntr green_cnt; + struct pktcntr yellow_cnt; + struct pktcntr red_cnt; +}; + +/* time sliding window three-color marker structure */ +struct tswtcm { + struct cdnr_block cdnrblk; /* conditioner block */ + + u_int32_t avg_rate; /* average rate (bytes/sec) */ + u_int64_t t_front; /* timestamp of last update */ + + u_int64_t timewin; /* average interval */ + u_int32_t cmtd_rate; /* committed target rate */ + u_int32_t peak_rate; /* peak target rate */ + struct tc_action green_action; + struct tc_action yellow_action; + struct tc_action red_action; + u_int8_t green_dscp; + u_int8_t yellow_dscp; + u_int8_t red_dscp; + struct pktcntr green_cnt; + struct pktcntr yellow_cnt; + struct pktcntr red_cnt; +}; + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_CDNR_HH_ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_classq.h b/contrib/altq/rtems/freebsd/altq/altq_classq.h new file mode 100644 index 00000000..c3cfea37 --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_classq.h @@ -0,0 +1,206 @@ +/* $KAME: altq_classq.h,v 1.6 2003/01/07 07:33:38 kjc Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * class queue definitions extracted from rm_class.h. + */ +#ifndef _ALTQ_ALTQ_CLASSQ_HH_ +#define _ALTQ_ALTQ_CLASSQ_HH_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Packet Queue types: RED or DROPHEAD. + */ +#define Q_DROPHEAD 0x00 +#define Q_RED 0x01 +#define Q_RIO 0x02 +#define Q_DROPTAIL 0x03 + +#ifdef _KERNEL + +/* + * Packet Queue structures and macros to manipulate them. + */ +struct _class_queue_ { + struct mbuf *tail_; /* Tail of packet queue */ + int qlen_; /* Queue length (in number of packets) */ + int qlim_; /* Queue limit (in number of packets*) */ + int qtype_; /* Queue type */ +}; + +typedef struct _class_queue_ class_queue_t; + +#define qtype(q) (q)->qtype_ /* Get queue type */ +#define qlimit(q) (q)->qlim_ /* Max packets to be queued */ +#define qlen(q) (q)->qlen_ /* Current queue length. */ +#define qtail(q) (q)->tail_ /* Tail of the queue */ +#define qhead(q) ((q)->tail_ ? (q)->tail_->m_nextpkt : NULL) + +#define qempty(q) ((q)->qlen_ == 0) /* Is the queue empty?? */ +#define q_is_red(q) ((q)->qtype_ == Q_RED) /* Is the queue a red queue */ +#define q_is_rio(q) ((q)->qtype_ == Q_RIO) /* Is the queue a rio queue */ +#define q_is_red_or_rio(q) ((q)->qtype_ == Q_RED || (q)->qtype_ == Q_RIO) + +#if !defined(__GNUC__) || defined(ALTQ_DEBUG) + +extern void _addq(class_queue_t *, struct mbuf *); +extern struct mbuf *_getq(class_queue_t *); +extern struct mbuf *_getq_tail(class_queue_t *); +extern struct mbuf *_getq_random(class_queue_t *); +extern void _removeq(class_queue_t *, struct mbuf *); +extern void _flushq(class_queue_t *); + +#else /* __GNUC__ && !ALTQ_DEBUG */ +/* + * inlined versions + */ +static __inline void +_addq(class_queue_t *q, struct mbuf *m) +{ + struct mbuf *m0; + + if ((m0 = qtail(q)) != NULL) + m->m_nextpkt = m0->m_nextpkt; + else + m0 = m; + m0->m_nextpkt = m; + qtail(q) = m; + qlen(q)++; +} + +static __inline struct mbuf * +_getq(class_queue_t *q) +{ + struct mbuf *m, *m0; + + if ((m = qtail(q)) == NULL) + return (NULL); + if ((m0 = m->m_nextpkt) != m) + m->m_nextpkt = m0->m_nextpkt; + else + qtail(q) = NULL; + qlen(q)--; + m0->m_nextpkt = NULL; + return (m0); +} + +/* drop a packet at the tail of the queue */ +static __inline struct mbuf * +_getq_tail(class_queue_t *q) +{ + struct mbuf *m, *m0, *prev; + + if ((m = m0 = qtail(q)) == NULL) + return NULL; + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else + qtail(q) = prev; + qlen(q)--; + m->m_nextpkt = NULL; + return (m); +} + +/* randomly select a packet in the queue */ +static __inline struct mbuf * +_getq_random(class_queue_t *q) +{ + struct mbuf *m; + int i, n; + + if ((m = qtail(q)) == NULL) + return NULL; + if (m->m_nextpkt == m) + qtail(q) = NULL; + else { + struct mbuf *prev = NULL; + + n = random() % qlen(q) + 1; + for (i = 0; i < n; i++) { + prev = m; + m = m->m_nextpkt; + } + prev->m_nextpkt = m->m_nextpkt; + if (m == qtail(q)) + qtail(q) = prev; + } + qlen(q)--; 
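+	/* the ring is already re-linked around m; clear its stale link */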
+ m->m_nextpkt = NULL; + return (m); +} + +static __inline void +_removeq(class_queue_t *q, struct mbuf *m) +{ + struct mbuf *m0, *prev; + + m0 = qtail(q); + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else if (qtail(q) == m) + qtail(q) = prev; + qlen(q)--; +} + +static __inline void +_flushq(class_queue_t *q) +{ + struct mbuf *m; + + while ((m = _getq(q)) != NULL) + m_freem(m); +} + +#endif /* __GNUC__ && !ALTQ_DEBUG */ + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_CLASSQ_HH_ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_hfsc.c b/contrib/altq/rtems/freebsd/altq/altq_hfsc.c new file mode 100644 index 00000000..6a97b3eb --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_hfsc.c @@ -0,0 +1,2279 @@ +#include + +/* $FreeBSD$ */ +/* $KAME: altq_hfsc.c,v 1.24 2003/12/05 05:40:46 kjc Exp $ */ + +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +/* + * H-FSC is described in Proceedings of SIGCOMM'97, + * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, + * Real-Time and Priority Service" + * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. + * + * Oleg Cherevko added the upperlimit for link-sharing. + * when a class has an upperlimit, the fit-time is computed from the + * upperlimit service curve. the link-sharing scheduler does not schedule + * a class whose fit-time exceeds the current time. 
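+ *
+ * (summary of the parameters used below: a service curve is specified
+ * by (m1, d, m2), slope m1 in bits/sec for the first d msec of a
+ * backlogged period and slope m2 afterwards.  sc2isc() converts these
+ * into scaled internal slopes so that runtime curve evaluation reduces
+ * to 64-bit multiply/shift arithmetic in seg_x2y()/seg_y2x().)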
+ */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include +#if (__FreeBSD__ != 2) +#include +#ifdef __FreeBSD__ +#include +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +#ifdef ALTQ_HFSC /* hfsc is enabled by ALTQ_HFSC option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#if 1 /* ALTQ3_COMPAT */ +#include +#include +#include +#endif /* ALTQ3_COMPAT */ + +#include +#include + +#include +#include +#include +#ifdef ALTQ3_COMPAT +#include +#endif + +/* + * function prototypes + */ +static int hfsc_clear_interface(struct hfsc_if *); +static int hfsc_request(struct ifaltq *, int, void *); +static void hfsc_purge(struct hfsc_if *); +static struct hfsc_class *hfsc_class_create(struct hfsc_if *, + struct service_curve *, struct service_curve *, struct service_curve *, + struct hfsc_class *, int, int, int); +static int hfsc_class_destroy(struct hfsc_class *); +static struct hfsc_class *hfsc_nextclass(struct hfsc_class *); +static int hfsc_enqueue(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); +static struct mbuf *hfsc_dequeue(struct ifaltq *, int); + +static int hfsc_addq(struct hfsc_class *, struct mbuf *); +static struct mbuf *hfsc_getq(struct hfsc_class *); +static struct mbuf *hfsc_pollq(struct hfsc_class *); +static void hfsc_purgeq(struct hfsc_class *); + +static void update_cfmin(struct hfsc_class *); +static void set_active(struct hfsc_class *, int); +static void set_passive(struct hfsc_class *); + +static void init_ed(struct hfsc_class *, int); +static void update_ed(struct hfsc_class *, int); +static void update_d(struct hfsc_class *, int); +static void init_vf(struct hfsc_class *, int); +static void update_vf(struct hfsc_class *, int, u_int64_t); +static ellist_t *ellist_alloc(void); +static void ellist_destroy(ellist_t *); +static void ellist_insert(struct hfsc_class *); +static void ellist_remove(struct hfsc_class *); +static void ellist_update(struct hfsc_class *); +struct hfsc_class *ellist_get_mindl(ellist_t *, u_int64_t); +static actlist_t *actlist_alloc(void); +static void actlist_destroy(actlist_t *); +static void actlist_insert(struct hfsc_class *); +static void actlist_remove(struct hfsc_class *); +static void actlist_update(struct hfsc_class *); + +static struct hfsc_class *actlist_firstfit(struct hfsc_class *, + u_int64_t); + +static __inline u_int64_t seg_x2y(u_int64_t, u_int64_t); +static __inline u_int64_t seg_y2x(u_int64_t, u_int64_t); +static __inline u_int64_t m2sm(u_int); +static __inline u_int64_t m2ism(u_int); +static __inline u_int64_t d2dx(u_int); +static u_int sm2m(u_int64_t); +static u_int dx2d(u_int64_t); + +static void sc2isc(struct service_curve *, struct internal_sc *); +static void rtsc_init(struct runtime_sc *, struct internal_sc *, + u_int64_t, u_int64_t); +static u_int64_t rtsc_y2x(struct runtime_sc *, u_int64_t); +static u_int64_t rtsc_x2y(struct runtime_sc *, u_int64_t); +static void rtsc_min(struct runtime_sc *, struct internal_sc *, + u_int64_t, u_int64_t); + +static void get_class_stats(struct hfsc_classstats *, + struct hfsc_class *); +static struct hfsc_class *clh_to_clp(struct hfsc_if *, u_int32_t); + + +#ifdef ALTQ3_COMPAT +static struct hfsc_if *hfsc_attach(struct ifaltq *, u_int); +static int hfsc_detach(struct hfsc_if *); +static int hfsc_class_modify(struct hfsc_class *, struct service_curve *, + struct service_curve *, struct service_curve *); + +static int hfsccmd_if_attach(struct hfsc_attach *); +static int hfsccmd_if_detach(struct hfsc_interface *); +static int 
hfsccmd_add_class(struct hfsc_add_class *); +static int hfsccmd_delete_class(struct hfsc_delete_class *); +static int hfsccmd_modify_class(struct hfsc_modify_class *); +static int hfsccmd_add_filter(struct hfsc_add_filter *); +static int hfsccmd_delete_filter(struct hfsc_delete_filter *); +static int hfsccmd_class_stats(struct hfsc_class_stats *); + +altqdev_decl(hfsc); +#endif /* ALTQ3_COMPAT */ + +/* + * macros + */ +#define is_a_parent_class(cl) ((cl)->cl_children != NULL) + +#define HT_INFINITY 0xffffffffffffffffLL /* infinite time value */ + +#ifdef ALTQ3_COMPAT +/* hif_list keeps all hfsc_if's allocated. */ +static struct hfsc_if *hif_list = NULL; +#endif /* ALTQ3_COMPAT */ + +int +hfsc_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + error = altq_attach(&ifp->if_snd, ALTQT_HFSC, a->altq_disc, + hfsc_enqueue, hfsc_dequeue, hfsc_request, NULL, NULL); + splx(s); + return (error); +} + +int +hfsc_add_altq(struct pf_altq *a) +{ + struct hfsc_if *hif; + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENODEV); + + hif = malloc(sizeof(struct hfsc_if), M_DEVBUF, M_WAITOK); + if (hif == NULL) + return (ENOMEM); + bzero(hif, sizeof(struct hfsc_if)); + + hif->hif_eligible = ellist_alloc(); + if (hif->hif_eligible == NULL) { + free(hif, M_DEVBUF); + return (ENOMEM); + } + + hif->hif_ifq = &ifp->if_snd; + + /* keep the state in pf_altq */ + a->altq_disc = hif; + + return (0); +} + +int +hfsc_remove_altq(struct pf_altq *a) +{ + struct hfsc_if *hif; + + if ((hif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + (void)hfsc_clear_interface(hif); + (void)hfsc_class_destroy(hif->hif_rootclass); + + ellist_destroy(hif->hif_eligible); + + free(hif, M_DEVBUF); + + return (0); +} + +int +hfsc_add_queue(struct pf_altq *a) +{ + struct hfsc_if *hif; + struct hfsc_class *cl, *parent; + struct hfsc_opts *opts; + struct service_curve rtsc, lssc, ulsc; + + if ((hif = a->altq_disc) == NULL) + return (EINVAL); + + opts = &a->pq_u.hfsc_opts; + + if (a->parent_qid == HFSC_NULLCLASS_HANDLE && + hif->hif_rootclass == NULL) + parent = NULL; + else if ((parent = clh_to_clp(hif, a->parent_qid)) == NULL) + return (EINVAL); + + if (a->qid == 0) + return (EINVAL); + + if (clh_to_clp(hif, a->qid) != NULL) + return (EBUSY); + + rtsc.m1 = opts->rtsc_m1; + rtsc.d = opts->rtsc_d; + rtsc.m2 = opts->rtsc_m2; + lssc.m1 = opts->lssc_m1; + lssc.d = opts->lssc_d; + lssc.m2 = opts->lssc_m2; + ulsc.m1 = opts->ulsc_m1; + ulsc.d = opts->ulsc_d; + ulsc.m2 = opts->ulsc_m2; + + cl = hfsc_class_create(hif, &rtsc, &lssc, &ulsc, + parent, a->qlimit, opts->flags, a->qid); + if (cl == NULL) + return (ENOMEM); + + return (0); +} + +int +hfsc_remove_queue(struct pf_altq *a) +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + + if ((hif = a->altq_disc) == NULL) + return (EINVAL); + + if ((cl = clh_to_clp(hif, a->qid)) == NULL) + return (EINVAL); + + return (hfsc_class_destroy(cl)); +} + +int +hfsc_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + struct hfsc_classstats stats; + int error = 0; + + if ((hif = altq_lookup(a->ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, a->qid)) == NULL) + return (EINVAL); + + if (*nbytes < sizeof(stats)) + return (EINVAL); + + get_class_stats(&stats, cl); + + if 
((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = sizeof(stats); + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes except the root class. + */ +static int +hfsc_clear_interface(struct hfsc_if *hif) +{ + struct hfsc_class *cl; + +#ifdef ALTQ3_COMPAT + /* free the filters for this interface */ + acc_discard_filters(&hif->hif_classifier, NULL, 1); +#endif + + /* clear out the classes */ + while (hif->hif_rootclass != NULL && + (cl = hif->hif_rootclass->cl_children) != NULL) { + /* + * remove the first leaf class found in the hierarchy + * then start over + */ + for (; cl != NULL; cl = hfsc_nextclass(cl)) { + if (!is_a_parent_class(cl)) { + (void)hfsc_class_destroy(cl); + break; + } + } + } + + return (0); +} + +static int +hfsc_request(struct ifaltq *ifq, int req, void *arg) +{ + struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + switch (req) { + case ALTRQ_PURGE: + hfsc_purge(hif); + break; + } + return (0); +} + +/* discard all the queued packets on the interface */ +static void +hfsc_purge(struct hfsc_if *hif) +{ + struct hfsc_class *cl; + + for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + if (ALTQ_IS_ENABLED(hif->hif_ifq)) + hif->hif_ifq->ifq_len = 0; +} + +struct hfsc_class * +hfsc_class_create(struct hfsc_if *hif, struct service_curve *rsc, + struct service_curve *fsc, struct service_curve *usc, + struct hfsc_class *parent, int qlimit, int flags, int qid) +{ + struct hfsc_class *cl, *p; + int i, s; + + if (hif->hif_classes >= HFSC_MAX_CLASSES) + return (NULL); + +#ifndef ALTQ_RED + if (flags & HFCF_RED) { +#ifdef ALTQ_DEBUG + printf("hfsc_class_create: RED not configured for HFSC!\n"); +#endif + return (NULL); + } +#endif + + cl = malloc(sizeof(struct hfsc_class), M_DEVBUF, M_WAITOK); + if (cl == NULL) + return (NULL); + bzero(cl, sizeof(struct hfsc_class)); + + cl->cl_q = malloc(sizeof(class_queue_t), M_DEVBUF, M_WAITOK); + if (cl->cl_q == NULL) + goto err_ret; + bzero(cl->cl_q, sizeof(class_queue_t)); + + cl->cl_actc = actlist_alloc(); + if (cl->cl_actc == NULL) + goto err_ret; + + if (qlimit == 0) + qlimit = 50; /* use default */ + qlimit(cl->cl_q) = qlimit; + qtype(cl->cl_q) = Q_DROPTAIL; + qlen(cl->cl_q) = 0; + cl->cl_flags = flags; +#ifdef ALTQ_RED + if (flags & (HFCF_RED|HFCF_RIO)) { + int red_flags, red_pkttime; + u_int m2; + + m2 = 0; + if (rsc != NULL && rsc->m2 > m2) + m2 = rsc->m2; + if (fsc != NULL && fsc->m2 > m2) + m2 = fsc->m2; + if (usc != NULL && usc->m2 > m2) + m2 = usc->m2; + + red_flags = 0; + if (flags & HFCF_ECN) + red_flags |= REDF_ECN; +#ifdef ALTQ_RIO + if (flags & HFCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + if (m2 < 8) + red_pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + red_pkttime = (int64_t)hif->hif_ifq->altq_ifp->if_mtu + * 1000 * 1000 * 1000 / (m2 / 8); + if (flags & HFCF_RED) { + cl->cl_red = red_alloc(0, 0, + qlimit(cl->cl_q) * 10/100, + qlimit(cl->cl_q) * 30/100, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RED; + } +#ifdef ALTQ_RIO + else { + cl->cl_red = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RIO; + } +#endif + } +#endif /* ALTQ_RED */ + + if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0)) { + cl->cl_rsc = malloc(sizeof(struct internal_sc), + M_DEVBUF, M_WAITOK); + if (cl->cl_rsc == NULL) + goto err_ret; + sc2isc(rsc, cl->cl_rsc); + 
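		/*
+		 * seed the runtime deadline/eligible curves at the origin;
+		 * init_ed() later re-anchors them with rtsc_min() at
+		 * (cur_time, cl_cumul) each time the class becomes active.
+		 */
+		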
rtsc_init(&cl->cl_deadline, cl->cl_rsc, 0, 0); + rtsc_init(&cl->cl_eligible, cl->cl_rsc, 0, 0); + } + if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0)) { + cl->cl_fsc = malloc(sizeof(struct internal_sc), + M_DEVBUF, M_WAITOK); + if (cl->cl_fsc == NULL) + goto err_ret; + sc2isc(fsc, cl->cl_fsc); + rtsc_init(&cl->cl_virtual, cl->cl_fsc, 0, 0); + } + if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0)) { + cl->cl_usc = malloc(sizeof(struct internal_sc), + M_DEVBUF, M_WAITOK); + if (cl->cl_usc == NULL) + goto err_ret; + sc2isc(usc, cl->cl_usc); + rtsc_init(&cl->cl_ulimit, cl->cl_usc, 0, 0); + } + + cl->cl_id = hif->hif_classid++; + cl->cl_handle = qid; + cl->cl_hif = hif; + cl->cl_parent = parent; + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + IFQ_LOCK(hif->hif_ifq); + hif->hif_classes++; + + /* + * find a free slot in the class table. if the slot matching + * the lower bits of qid is free, use this slot. otherwise, + * use the first free slot. + */ + i = qid % HFSC_MAX_CLASSES; + if (hif->hif_class_tbl[i] == NULL) + hif->hif_class_tbl[i] = cl; + else { + for (i = 0; i < HFSC_MAX_CLASSES; i++) + if (hif->hif_class_tbl[i] == NULL) { + hif->hif_class_tbl[i] = cl; + break; + } + if (i == HFSC_MAX_CLASSES) { + IFQ_UNLOCK(hif->hif_ifq); + splx(s); + goto err_ret; + } + } + + if (flags & HFCF_DEFAULTCLASS) + hif->hif_defaultclass = cl; + + if (parent == NULL) { + /* this is root class */ + hif->hif_rootclass = cl; + } else { + /* add this class to the children list of the parent */ + if ((p = parent->cl_children) == NULL) + parent->cl_children = cl; + else { + while (p->cl_siblings != NULL) + p = p->cl_siblings; + p->cl_siblings = cl; + } + } + IFQ_UNLOCK(hif->hif_ifq); + splx(s); + + return (cl); + + err_ret: + if (cl->cl_actc != NULL) + actlist_destroy(cl->cl_actc); + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + if (cl->cl_fsc != NULL) + free(cl->cl_fsc, M_DEVBUF); + if (cl->cl_rsc != NULL) + free(cl->cl_rsc, M_DEVBUF); + if (cl->cl_usc != NULL) + free(cl->cl_usc, M_DEVBUF); + if (cl->cl_q != NULL) + free(cl->cl_q, M_DEVBUF); + free(cl, M_DEVBUF); + return (NULL); +} + +static int +hfsc_class_destroy(struct hfsc_class *cl) +{ + int i, s; + + if (cl == NULL) + return (0); + + if (is_a_parent_class(cl)) + return (EBUSY); + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + IFQ_LOCK(cl->cl_hif->hif_ifq); + +#ifdef ALTQ3_COMPAT + /* delete filters referencing to this class */ + acc_discard_filters(&cl->cl_hif->hif_classifier, cl, 0); +#endif /* ALTQ3_COMPAT */ + + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + + if (cl->cl_parent == NULL) { + /* this is root class */ + } else { + struct hfsc_class *p = cl->cl_parent->cl_children; + + if (p == cl) + cl->cl_parent->cl_children = cl->cl_siblings; + else do { + if (p->cl_siblings == cl) { + p->cl_siblings = cl->cl_siblings; + break; + } + } while ((p = p->cl_siblings) != NULL); + ASSERT(p != NULL); + } + + for (i = 0; i < HFSC_MAX_CLASSES; i++) + if (cl->cl_hif->hif_class_tbl[i] == cl) { + cl->cl_hif->hif_class_tbl[i] = NULL; + break; + } + + cl->cl_hif->hif_classes--; + IFQ_UNLOCK(cl->cl_hif->hif_ifq); + splx(s); + + actlist_destroy(cl->cl_actc); + + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + + 
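	/* drop the interface-level references to this class under the queue lock */
+	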
IFQ_LOCK(cl->cl_hif->hif_ifq); + if (cl == cl->cl_hif->hif_rootclass) + cl->cl_hif->hif_rootclass = NULL; + if (cl == cl->cl_hif->hif_defaultclass) + cl->cl_hif->hif_defaultclass = NULL; + IFQ_UNLOCK(cl->cl_hif->hif_ifq); + + if (cl->cl_usc != NULL) + free(cl->cl_usc, M_DEVBUF); + if (cl->cl_fsc != NULL) + free(cl->cl_fsc, M_DEVBUF); + if (cl->cl_rsc != NULL) + free(cl->cl_rsc, M_DEVBUF); + free(cl->cl_q, M_DEVBUF); + free(cl, M_DEVBUF); + + return (0); +} + +/* + * hfsc_nextclass returns the next class in the tree. + * usage: + * for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) + * do_something; + */ +static struct hfsc_class * +hfsc_nextclass(struct hfsc_class *cl) +{ + if (cl->cl_children != NULL) + cl = cl->cl_children; + else if (cl->cl_siblings != NULL) + cl = cl->cl_siblings; + else { + while ((cl = cl->cl_parent) != NULL) + if (cl->cl_siblings) { + cl = cl->cl_siblings; + break; + } + } + + return (cl); +} + +/* + * hfsc_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. + */ +static int +hfsc_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; + struct hfsc_class *cl; + struct pf_mtag *t; + int len; + + IFQ_LOCK_ASSERT(ifq); + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ +#if defined(__NetBSD__) || defined(__OpenBSD__)\ + || (defined(__FreeBSD__) && __FreeBSD_version >= 501113) + printf("altq: packet for %s does not have pkthdr\n", + ifq->altq_ifp->if_xname); +#else + printf("altq: packet for %s%d does not have pkthdr\n", + ifq->altq_ifp->if_name, ifq->altq_ifp->if_unit); +#endif + m_freem(m); + return (ENOBUFS); + } + cl = NULL; + if ((t = pf_find_mtag(m)) != NULL) + cl = clh_to_clp(hif, t->qid); +#ifdef ALTQ3_COMPAT + else if ((ifq->altq_flags & ALTQF_CLASSIFY) && pktattr != NULL) + cl = pktattr->pattr_class; +#endif + if (cl == NULL || is_a_parent_class(cl)) { + cl = hif->hif_defaultclass; + if (cl == NULL) { + m_freem(m); + return (ENOBUFS); + } + } +#ifdef ALTQ3_COMPAT + if (pktattr != NULL) + cl->cl_pktattr = pktattr; /* save proto hdr used by ECN */ + else +#endif + cl->cl_pktattr = NULL; + len = m_pktlen(m); + if (hfsc_addq(cl, m) != 0) { + /* drop occurred. mbuf was freed in hfsc_addq. */ + PKTCNTR_ADD(&cl->cl_stats.drop_cnt, len); + return (ENOBUFS); + } + IFQ_INC_LEN(ifq); + cl->cl_hif->hif_packets++; + + /* successfully queued. */ + if (qlen(cl->cl_q) == 1) + set_active(cl, m_pktlen(m)); + + return (0); +} + +/* + * hfsc_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. 
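+ * (this contract is implemented with hif->hif_pollcache: ALTDQ_POLL
+ * records the class it selected and the following ALTDQ_REMOVE
+ * consumes the cached class instead of re-running the scheduling
+ * decision.)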
+ */ +static struct mbuf * +hfsc_dequeue(struct ifaltq *ifq, int op) +{ + struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; + struct hfsc_class *cl; + struct mbuf *m; + int len, next_len; + int realtime = 0; + u_int64_t cur_time; + + IFQ_LOCK_ASSERT(ifq); + + if (hif->hif_packets == 0) + /* no packet in the tree */ + return (NULL); + + cur_time = read_machclk(); + + if (op == ALTDQ_REMOVE && hif->hif_pollcache != NULL) { + + cl = hif->hif_pollcache; + hif->hif_pollcache = NULL; + /* check if the class was scheduled by real-time criteria */ + if (cl->cl_rsc != NULL) + realtime = (cl->cl_e <= cur_time); + } else { + /* + * if there are eligible classes, use real-time criteria. + * find the class with the minimum deadline among + * the eligible classes. + */ + if ((cl = ellist_get_mindl(hif->hif_eligible, cur_time)) + != NULL) { + realtime = 1; + } else { +#ifdef ALTQ_DEBUG + int fits = 0; +#endif + /* + * use link-sharing criteria + * get the class with the minimum vt in the hierarchy + */ + cl = hif->hif_rootclass; + while (is_a_parent_class(cl)) { + + cl = actlist_firstfit(cl, cur_time); + if (cl == NULL) { +#ifdef ALTQ_DEBUG + if (fits > 0) + printf("%d fit but none found\n",fits); +#endif + return (NULL); + } + /* + * update parent's cl_cvtmin. + * don't update if the new vt is smaller. + */ + if (cl->cl_parent->cl_cvtmin < cl->cl_vt) + cl->cl_parent->cl_cvtmin = cl->cl_vt; +#ifdef ALTQ_DEBUG + fits++; +#endif + } + } + + if (op == ALTDQ_POLL) { + hif->hif_pollcache = cl; + m = hfsc_pollq(cl); + return (m); + } + } + + m = hfsc_getq(cl); + if (m == NULL) + panic("hfsc_dequeue:"); + len = m_pktlen(m); + cl->cl_hif->hif_packets--; + IFQ_DEC_LEN(ifq); + PKTCNTR_ADD(&cl->cl_stats.xmit_cnt, len); + + update_vf(cl, len, cur_time); + if (realtime) + cl->cl_cumul += len; + + if (!qempty(cl->cl_q)) { + if (cl->cl_rsc != NULL) { + /* update ed */ + next_len = m_pktlen(qhead(cl->cl_q)); + + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } + } else { + /* the class becomes passive */ + set_passive(cl); + } + + return (m); +} + +static int +hfsc_addq(struct hfsc_class *cl, struct mbuf *m) +{ + +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_addq((rio_t *)cl->cl_red, cl->cl_q, + m, cl->cl_pktattr); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr); +#endif + if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) { + m_freem(m); + return (-1); + } + + if (cl->cl_flags & HFCF_CLEARDSCP) + write_dsfield(m, cl->cl_pktattr, 0); + + _addq(cl->cl_q, m); + + return (0); +} + +static struct mbuf * +hfsc_getq(struct hfsc_class *cl) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_getq((rio_t *)cl->cl_red, cl->cl_q); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_getq(cl->cl_red, cl->cl_q); +#endif + return _getq(cl->cl_q); +} + +static struct mbuf * +hfsc_pollq(struct hfsc_class *cl) +{ + return qhead(cl->cl_q); +} + +static void +hfsc_purgeq(struct hfsc_class *cl) +{ + struct mbuf *m; + + if (qempty(cl->cl_q)) + return; + + while ((m = _getq(cl->cl_q)) != NULL) { + PKTCNTR_ADD(&cl->cl_stats.drop_cnt, m_pktlen(m)); + m_freem(m); + cl->cl_hif->hif_packets--; + IFQ_DEC_LEN(cl->cl_hif->hif_ifq); + } + ASSERT(qlen(cl->cl_q) == 0); + + update_vf(cl, 0, 0); /* remove cl from the actlist */ + set_passive(cl); +} + +static void +set_active(struct hfsc_class *cl, int len) +{ + if (cl->cl_rsc != NULL) + init_ed(cl, len); + if (cl->cl_fsc != NULL) + init_vf(cl, len); + + cl->cl_stats.period++; +} + +static 
void +set_passive(struct hfsc_class *cl) +{ + if (cl->cl_rsc != NULL) + ellist_remove(cl); + + /* + * actlist is now handled in update_vf() so that update_vf(cl, 0, 0) + * needs to be called explicitly to remove a class from actlist + */ +} + +static void +init_ed(struct hfsc_class *cl, int next_len) +{ + u_int64_t cur_time; + + cur_time = read_machclk(); + + /* update the deadline curve */ + rtsc_min(&cl->cl_deadline, cl->cl_rsc, cur_time, cl->cl_cumul); + + /* + * update the eligible curve. + * for concave, it is equal to the deadline curve. + * for convex, it is a linear curve with slope m2. + */ + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc->sm1 <= cl->cl_rsc->sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + + /* compute e and d */ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + ellist_insert(cl); +} + +static void +update_ed(struct hfsc_class *cl, int next_len) +{ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + ellist_update(cl); +} + +static void +update_d(struct hfsc_class *cl, int next_len) +{ + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); +} + +static void +init_vf(struct hfsc_class *cl, int len) +{ + struct hfsc_class *max_cl, *p; + u_int64_t vt, f, cur_time; + int go_active; + + cur_time = 0; + go_active = 1; + for ( ; cl->cl_parent != NULL; cl = cl->cl_parent) { + + if (go_active && cl->cl_nactive++ == 0) + go_active = 1; + else + go_active = 0; + + if (go_active) { + max_cl = actlist_last(cl->cl_parent->cl_actc); + if (max_cl != NULL) { + /* + * set vt to the average of the min and max + * classes. if the parent's period didn't + * change, don't decrease vt of the class. + */ + vt = max_cl->cl_vt; + if (cl->cl_parent->cl_cvtmin != 0) + vt = (cl->cl_parent->cl_cvtmin + vt)/2; + + if (cl->cl_parent->cl_vtperiod != + cl->cl_parentperiod || vt > cl->cl_vt) + cl->cl_vt = vt; + } else { + /* + * first child for a new parent backlog period. + * add parent's cvtmax to vtoff of children + * to make a new vt (vtoff + vt) larger than + * the vt in the last period for all children. 
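+ *
+ * condensed, the vt rule for a newly active class is as below,
+ * a sketch over plain integers rather than the real class fields:
+ *
+ *	u_int64_t
+ *	new_child_vt(u_int64_t max_vt, u_int64_t cvtmin, int have_active,
+ *	    u_int64_t old_vt, int same_period)
+ *	{
+ *		u_int64_t vt;
+ *
+ *		if (!have_active)
+ *			return (0);		// new backlog period: restart at 0
+ *		vt = max_vt;			// vt of the busiest sibling
+ *		if (cvtmin != 0)
+ *			vt = (cvtmin + vt) / 2;	// pull toward the minimum
+ *		if (same_period && old_vt > vt)
+ *			return (old_vt);	// never decrease within a period
+ *		return (vt);
+ *	}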
+ */ + vt = cl->cl_parent->cl_cvtmax; + for (p = cl->cl_parent->cl_children; p != NULL; + p = p->cl_siblings) + p->cl_vtoff += vt; + cl->cl_vt = 0; + cl->cl_parent->cl_cvtmax = 0; + cl->cl_parent->cl_cvtmin = 0; + } + cl->cl_initvt = cl->cl_vt; + + /* update the virtual curve */ + vt = cl->cl_vt + cl->cl_vtoff; + rtsc_min(&cl->cl_virtual, cl->cl_fsc, vt, cl->cl_total); + if (cl->cl_virtual.x == vt) { + cl->cl_virtual.x -= cl->cl_vtoff; + cl->cl_vtoff = 0; + } + cl->cl_vtadj = 0; + + cl->cl_vtperiod++; /* increment vt period */ + cl->cl_parentperiod = cl->cl_parent->cl_vtperiod; + if (cl->cl_parent->cl_nactive == 0) + cl->cl_parentperiod++; + cl->cl_f = 0; + + actlist_insert(cl); + + if (cl->cl_usc != NULL) { + /* class has upper limit curve */ + if (cur_time == 0) + cur_time = read_machclk(); + + /* update the ulimit curve */ + rtsc_min(&cl->cl_ulimit, cl->cl_usc, cur_time, + cl->cl_total); + /* compute myf */ + cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, + cl->cl_total); + cl->cl_myfadj = 0; + } + } + + if (cl->cl_myf > cl->cl_cfmin) + f = cl->cl_myf; + else + f = cl->cl_cfmin; + if (f != cl->cl_f) { + cl->cl_f = f; + update_cfmin(cl->cl_parent); + } + } +} + +static void +update_vf(struct hfsc_class *cl, int len, u_int64_t cur_time) +{ + u_int64_t f, myf_bound, delta; + int go_passive; + + go_passive = qempty(cl->cl_q); + + for (; cl->cl_parent != NULL; cl = cl->cl_parent) { + + cl->cl_total += len; + + if (cl->cl_fsc == NULL || cl->cl_nactive == 0) + continue; + + if (go_passive && --cl->cl_nactive == 0) + go_passive = 1; + else + go_passive = 0; + + if (go_passive) { + /* no more active child, going passive */ + + /* update cvtmax of the parent class */ + if (cl->cl_vt > cl->cl_parent->cl_cvtmax) + cl->cl_parent->cl_cvtmax = cl->cl_vt; + + /* remove this class from the vt list */ + actlist_remove(cl); + + update_cfmin(cl->cl_parent); + + continue; + } + + /* + * update vt and f + */ + cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + - cl->cl_vtoff + cl->cl_vtadj; + + /* + * if vt of the class is smaller than cvtmin, + * the class was skipped in the past due to non-fit. + * if so, we need to adjust vtadj. + */ + if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { + cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; + cl->cl_vt = cl->cl_parent->cl_cvtmin; + } + + /* update the vt list */ + actlist_update(cl); + + if (cl->cl_usc != NULL) { + cl->cl_myf = cl->cl_myfadj + + rtsc_y2x(&cl->cl_ulimit, cl->cl_total); + + /* + * if myf lags behind by more than one clock tick + * from the current time, adjust myfadj to prevent + * a rate-limited class from going greedy. + * in a steady state under rate-limiting, myf + * fluctuates within one clock tick. 
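+ *
+ * the adjustment in isolation, as a sketch (cur_time and per_tick
+ * are in machine clock units, matching read_machclk() and
+ * machclk_per_tick):
+ *
+ *	void
+ *	clamp_myf(u_int64_t cur_time, u_int64_t per_tick,
+ *	    u_int64_t *myf, u_int64_t *myfadj)
+ *	{
+ *		u_int64_t bound, delta;
+ *
+ *		bound = cur_time - per_tick;
+ *		if (*myf < bound) {
+ *			delta = cur_time - *myf;
+ *			*myfadj += delta;	// remember the shift
+ *			*myf += delta;		// forgive the idle history
+ *		}
+ *	}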
+ */ + myf_bound = cur_time - machclk_per_tick; + if (cl->cl_myf < myf_bound) { + delta = cur_time - cl->cl_myf; + cl->cl_myfadj += delta; + cl->cl_myf += delta; + } + } + + /* cl_f is max(cl_myf, cl_cfmin) */ + if (cl->cl_myf > cl->cl_cfmin) + f = cl->cl_myf; + else + f = cl->cl_cfmin; + if (f != cl->cl_f) { + cl->cl_f = f; + update_cfmin(cl->cl_parent); + } + } +} + +static void +update_cfmin(struct hfsc_class *cl) +{ + struct hfsc_class *p; + u_int64_t cfmin; + + if (TAILQ_EMPTY(cl->cl_actc)) { + cl->cl_cfmin = 0; + return; + } + cfmin = HT_INFINITY; + TAILQ_FOREACH(p, cl->cl_actc, cl_actlist) { + if (p->cl_f == 0) { + cl->cl_cfmin = 0; + return; + } + if (p->cl_f < cfmin) + cfmin = p->cl_f; + } + cl->cl_cfmin = cfmin; +} + +/* + * TAILQ based ellist and actlist implementation + * (ion wanted to make a calendar queue based implementation) + */ +/* + * eligible list holds backlogged classes being sorted by their eligible times. + * there is one eligible list per interface. + */ + +static ellist_t * +ellist_alloc(void) +{ + ellist_t *head; + + head = malloc(sizeof(ellist_t), M_DEVBUF, M_WAITOK); + TAILQ_INIT(head); + return (head); +} + +static void +ellist_destroy(ellist_t *head) +{ + free(head, M_DEVBUF); +} + +static void +ellist_insert(struct hfsc_class *cl) +{ + struct hfsc_if *hif = cl->cl_hif; + struct hfsc_class *p; + + /* check the last entry first */ + if ((p = TAILQ_LAST(hif->hif_eligible, _eligible)) == NULL || + p->cl_e <= cl->cl_e) { + TAILQ_INSERT_TAIL(hif->hif_eligible, cl, cl_ellist); + return; + } + + TAILQ_FOREACH(p, hif->hif_eligible, cl_ellist) { + if (cl->cl_e < p->cl_e) { + TAILQ_INSERT_BEFORE(p, cl, cl_ellist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +static void +ellist_remove(struct hfsc_class *cl) +{ + struct hfsc_if *hif = cl->cl_hif; + + TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist); +} + +static void +ellist_update(struct hfsc_class *cl) +{ + struct hfsc_if *hif = cl->cl_hif; + struct hfsc_class *p, *last; + + /* + * the eligible time of a class increases monotonically. + * if the next entry has a larger eligible time, nothing to do. + */ + p = TAILQ_NEXT(cl, cl_ellist); + if (p == NULL || cl->cl_e <= p->cl_e) + return; + + /* check the last entry */ + last = TAILQ_LAST(hif->hif_eligible, _eligible); + ASSERT(last != NULL); + if (last->cl_e <= cl->cl_e) { + TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist); + TAILQ_INSERT_TAIL(hif->hif_eligible, cl, cl_ellist); + return; + } + + /* + * the new position must be between the next entry + * and the last entry + */ + while ((p = TAILQ_NEXT(p, cl_ellist)) != NULL) { + if (cl->cl_e < p->cl_e) { + TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist); + TAILQ_INSERT_BEFORE(p, cl, cl_ellist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +/* find the class with the minimum deadline among the eligible classes */ +struct hfsc_class * +ellist_get_mindl(ellist_t *head, u_int64_t cur_time) +{ + struct hfsc_class *p, *cl = NULL; + + TAILQ_FOREACH(p, head, cl_ellist) { + if (p->cl_e > cur_time) + break; + if (cl == NULL || p->cl_d < cl->cl_d) + cl = p; + } + return (cl); +} + +/* + * active children list holds backlogged child classes being sorted + * by their virtual time. + * each intermediate class has one active children list. 
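+ *
+ * both lists use the same sorted-insert pattern: check the tail
+ * first, since keys mostly increase, then fall back to a linear
+ * scan. a self-contained model over <sys/queue.h>; the node and
+ * key names are illustrative:
+ *
+ *	struct node {
+ *		u_int64_t key;
+ *		TAILQ_ENTRY(node) link;
+ *	};
+ *	TAILQ_HEAD(nlist, node);
+ *
+ *	void
+ *	sorted_insert(struct nlist *head, struct node *n)
+ *	{
+ *		struct node *p;
+ *
+ *		p = TAILQ_LAST(head, nlist);
+ *		if (p == NULL || p->key <= n->key) {
+ *			TAILQ_INSERT_TAIL(head, n, link);	// common case
+ *			return;
+ *		}
+ *		TAILQ_FOREACH(p, head, link)
+ *			if (n->key < p->key) {
+ *				TAILQ_INSERT_BEFORE(p, n, link);
+ *				return;
+ *			}
+ *	}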
+ */ +static actlist_t * +actlist_alloc(void) +{ + actlist_t *head; + + head = malloc(sizeof(actlist_t), M_DEVBUF, M_WAITOK); + TAILQ_INIT(head); + return (head); +} + +static void +actlist_destroy(actlist_t *head) +{ + free(head, M_DEVBUF); +} +static void +actlist_insert(struct hfsc_class *cl) +{ + struct hfsc_class *p; + + /* check the last entry first */ + if ((p = TAILQ_LAST(cl->cl_parent->cl_actc, _active)) == NULL + || p->cl_vt <= cl->cl_vt) { + TAILQ_INSERT_TAIL(cl->cl_parent->cl_actc, cl, cl_actlist); + return; + } + + TAILQ_FOREACH(p, cl->cl_parent->cl_actc, cl_actlist) { + if (cl->cl_vt < p->cl_vt) { + TAILQ_INSERT_BEFORE(p, cl, cl_actlist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +static void +actlist_remove(struct hfsc_class *cl) +{ + TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist); +} + +static void +actlist_update(struct hfsc_class *cl) +{ + struct hfsc_class *p, *last; + + /* + * the virtual time of a class increases monotonically during its + * backlogged period. + * if the next entry has a larger virtual time, nothing to do. + */ + p = TAILQ_NEXT(cl, cl_actlist); + if (p == NULL || cl->cl_vt < p->cl_vt) + return; + + /* check the last entry */ + last = TAILQ_LAST(cl->cl_parent->cl_actc, _active); + ASSERT(last != NULL); + if (last->cl_vt <= cl->cl_vt) { + TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist); + TAILQ_INSERT_TAIL(cl->cl_parent->cl_actc, cl, cl_actlist); + return; + } + + /* + * the new position must be between the next entry + * and the last entry + */ + while ((p = TAILQ_NEXT(p, cl_actlist)) != NULL) { + if (cl->cl_vt < p->cl_vt) { + TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist); + TAILQ_INSERT_BEFORE(p, cl, cl_actlist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +static struct hfsc_class * +actlist_firstfit(struct hfsc_class *cl, u_int64_t cur_time) +{ + struct hfsc_class *p; + + TAILQ_FOREACH(p, cl->cl_actc, cl_actlist) { + if (p->cl_f <= cur_time) + return (p); + } + return (NULL); +} + +/* + * service curve support functions + * + * external service curve parameters + * m: bits/sec + * d: msec + * internal service curve parameters + * sm: (bytes/tsc_interval) << SM_SHIFT + * ism: (tsc_count/byte) << ISM_SHIFT + * dx: tsc_count + * + * SM_SHIFT and ISM_SHIFT are scaled in order to keep effective digits. + * we should be able to handle 100K-1Gbps linkspeed with 200Hz-1GHz CPU + * speed. SM_SHIFT and ISM_SHIFT are selected to have at least 3 effective + * digits in decimal using the following table. 
+ * + * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps + * ----------+------------------------------------------------------- + * bytes/nsec 12.5e-6 125e-6 1250e-6 12500e-6 125000e-6 + * sm(500MHz) 25.0e-6 250e-6 2500e-6 25000e-6 250000e-6 + * sm(200MHz) 62.5e-6 625e-6 6250e-6 62500e-6 625000e-6 + * + * nsec/byte 80000 8000 800 80 8 + * ism(500MHz) 40000 4000 400 40 4 + * ism(200MHz) 16000 1600 160 16 1.6 + */ +#define SM_SHIFT 24 +#define ISM_SHIFT 10 + +#define SM_MASK ((1LL << SM_SHIFT) - 1) +#define ISM_MASK ((1LL << ISM_SHIFT) - 1) + +static __inline u_int64_t +seg_x2y(u_int64_t x, u_int64_t sm) +{ + u_int64_t y; + + /* + * compute + * y = x * sm >> SM_SHIFT + * but divide it for the upper and lower bits to avoid overflow + */ + y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT); + return (y); +} + +static __inline u_int64_t +seg_y2x(u_int64_t y, u_int64_t ism) +{ + u_int64_t x; + + if (y == 0) + x = 0; + else if (ism == HT_INFINITY) + x = HT_INFINITY; + else { + x = (y >> ISM_SHIFT) * ism + + (((y & ISM_MASK) * ism) >> ISM_SHIFT); + } + return (x); +} + +static __inline u_int64_t +m2sm(u_int m) +{ + u_int64_t sm; + + sm = ((u_int64_t)m << SM_SHIFT) / 8 / machclk_freq; + return (sm); +} + +static __inline u_int64_t +m2ism(u_int m) +{ + u_int64_t ism; + + if (m == 0) + ism = HT_INFINITY; + else + ism = ((u_int64_t)machclk_freq << ISM_SHIFT) * 8 / m; + return (ism); +} + +static __inline u_int64_t +d2dx(u_int d) +{ + u_int64_t dx; + + dx = ((u_int64_t)d * machclk_freq) / 1000; + return (dx); +} + +static u_int +sm2m(u_int64_t sm) +{ + u_int64_t m; + + m = (sm * 8 * machclk_freq) >> SM_SHIFT; + return ((u_int)m); +} + +static u_int +dx2d(u_int64_t dx) +{ + u_int64_t d; + + d = dx * 1000 / machclk_freq; + return ((u_int)d); +} + +static void +sc2isc(struct service_curve *sc, struct internal_sc *isc) +{ + isc->sm1 = m2sm(sc->m1); + isc->ism1 = m2ism(sc->m1); + isc->dx = d2dx(sc->d); + isc->dy = seg_x2y(isc->dx, isc->sm1); + isc->sm2 = m2sm(sc->m2); + isc->ism2 = m2ism(sc->m2); +} + +/* + * initialize the runtime service curve with the given internal + * service curve starting at (x, y). + */ +static void +rtsc_init(struct runtime_sc *rtsc, struct internal_sc * isc, u_int64_t x, + u_int64_t y) +{ + rtsc->x = x; + rtsc->y = y; + rtsc->sm1 = isc->sm1; + rtsc->ism1 = isc->ism1; + rtsc->dx = isc->dx; + rtsc->dy = isc->dy; + rtsc->sm2 = isc->sm2; + rtsc->ism2 = isc->ism2; +} + +/* + * calculate the y-projection of the runtime service curve by the + * given x-projection value + */ +static u_int64_t +rtsc_y2x(struct runtime_sc *rtsc, u_int64_t y) +{ + u_int64_t x; + + if (y < rtsc->y) + x = rtsc->x; + else if (y <= rtsc->y + rtsc->dy) { + /* x belongs to the 1st segment */ + if (rtsc->dy == 0) + x = rtsc->x + rtsc->dx; + else + x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1); + } else { + /* x belongs to the 2nd segment */ + x = rtsc->x + rtsc->dx + + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2); + } + return (x); +} + +static u_int64_t +rtsc_x2y(struct runtime_sc *rtsc, u_int64_t x) +{ + u_int64_t y; + + if (x <= rtsc->x) + y = rtsc->y; + else if (x <= rtsc->x + rtsc->dx) + /* y belongs to the 1st segment */ + y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1); + else + /* y belongs to the 2nd segment */ + y = rtsc->y + rtsc->dy + + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2); + return (y); +} + +/* + * update the runtime service curve by taking the minimum of the current + * runtime service curve and the service curve starting at (x, y). 
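+ *
+ * the minimum computation below, like the projections above, leans
+ * on seg_x2y()'s split multiply: splitting x at SM_SHIFT bits gives
+ * exactly floor(x * sm / 2^SM_SHIFT) while keeping every intermediate
+ * product within 64 bits. a quick userland check of the identity,
+ * with values small enough that the naive product also fits and the
+ * two sides can be compared:
+ *
+ *	#include <assert.h>
+ *	#include <stdint.h>
+ *
+ *	#define SHIFT	24
+ *	#define MASK	((1ULL << SHIFT) - 1)
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		uint64_t x = 123456789ULL, sm = 987654ULL;
+ *		uint64_t split = (x >> SHIFT) * sm +
+ *		    (((x & MASK) * sm) >> SHIFT);
+ *
+ *		assert(split == (x * sm) >> SHIFT);
+ *		return (0);
+ *	}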
+ */ +static void +rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u_int64_t x, + u_int64_t y) +{ + u_int64_t y1, y2, dx, dy; + + if (isc->sm1 <= isc->sm2) { + /* service curve is convex */ + y1 = rtsc_x2y(rtsc, x); + if (y1 < y) + /* the current rtsc is smaller */ + return; + rtsc->x = x; + rtsc->y = y; + return; + } + + /* + * service curve is concave + * compute the two y values of the current rtsc + * y1: at x + * y2: at (x + dx) + */ + y1 = rtsc_x2y(rtsc, x); + if (y1 <= y) { + /* rtsc is below isc, no change to rtsc */ + return; + } + + y2 = rtsc_x2y(rtsc, x + isc->dx); + if (y2 >= y + isc->dy) { + /* rtsc is above isc, replace rtsc by isc */ + rtsc->x = x; + rtsc->y = y; + rtsc->dx = isc->dx; + rtsc->dy = isc->dy; + return; + } + + /* + * the two curves intersect + * compute the offsets (dx, dy) using the reverse + * function of seg_x2y() + * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y) + */ + dx = ((y1 - y) << SM_SHIFT) / (isc->sm1 - isc->sm2); + /* + * check if (x, y1) belongs to the 1st segment of rtsc. + * if so, add the offset. + */ + if (rtsc->x + rtsc->dx > x) + dx += rtsc->x + rtsc->dx - x; + dy = seg_x2y(dx, isc->sm1); + + rtsc->x = x; + rtsc->y = y; + rtsc->dx = dx; + rtsc->dy = dy; + return; +} + +static void +get_class_stats(struct hfsc_classstats *sp, struct hfsc_class *cl) +{ + sp->class_id = cl->cl_id; + sp->class_handle = cl->cl_handle; + + if (cl->cl_rsc != NULL) { + sp->rsc.m1 = sm2m(cl->cl_rsc->sm1); + sp->rsc.d = dx2d(cl->cl_rsc->dx); + sp->rsc.m2 = sm2m(cl->cl_rsc->sm2); + } else { + sp->rsc.m1 = 0; + sp->rsc.d = 0; + sp->rsc.m2 = 0; + } + if (cl->cl_fsc != NULL) { + sp->fsc.m1 = sm2m(cl->cl_fsc->sm1); + sp->fsc.d = dx2d(cl->cl_fsc->dx); + sp->fsc.m2 = sm2m(cl->cl_fsc->sm2); + } else { + sp->fsc.m1 = 0; + sp->fsc.d = 0; + sp->fsc.m2 = 0; + } + if (cl->cl_usc != NULL) { + sp->usc.m1 = sm2m(cl->cl_usc->sm1); + sp->usc.d = dx2d(cl->cl_usc->dx); + sp->usc.m2 = sm2m(cl->cl_usc->sm2); + } else { + sp->usc.m1 = 0; + sp->usc.d = 0; + sp->usc.m2 = 0; + } + + sp->total = cl->cl_total; + sp->cumul = cl->cl_cumul; + + sp->d = cl->cl_d; + sp->e = cl->cl_e; + sp->vt = cl->cl_vt; + sp->f = cl->cl_f; + + sp->initvt = cl->cl_initvt; + sp->vtperiod = cl->cl_vtperiod; + sp->parentperiod = cl->cl_parentperiod; + sp->nactive = cl->cl_nactive; + sp->vtoff = cl->cl_vtoff; + sp->cvtmax = cl->cl_cvtmax; + sp->myf = cl->cl_myf; + sp->cfmin = cl->cl_cfmin; + sp->cvtmin = cl->cl_cvtmin; + sp->myfadj = cl->cl_myfadj; + sp->vtadj = cl->cl_vtadj; + + sp->cur_time = read_machclk(); + sp->machclk_freq = machclk_freq; + + sp->qlength = qlen(cl->cl_q); + sp->qlimit = qlimit(cl->cl_q); + sp->xmit_cnt = cl->cl_stats.xmit_cnt; + sp->drop_cnt = cl->cl_stats.drop_cnt; + sp->period = cl->cl_stats.period; + + sp->qtype = qtype(cl->cl_q); +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); +#endif +} + +/* convert a class handle to the corresponding class pointer */ +static struct hfsc_class * +clh_to_clp(struct hfsc_if *hif, u_int32_t chandle) +{ + int i; + struct hfsc_class *cl; + + if (chandle == 0) + return (NULL); + /* + * first, try optimistically the slot matching the lower bits of + * the handle. if it fails, do the linear table search. 
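+ *
+ * a standalone model of the lookup: direct-map probe on the low
+ * bits first, full scan as the fallback (TBLSIZE and struct entry
+ * are illustrative). the probe hits whenever handles are assigned
+ * from free slot indices, as hfsccmd_add_class does below:
+ *
+ *	#define TBLSIZE	64
+ *
+ *	struct entry { u_int32_t handle; };
+ *
+ *	struct entry *
+ *	lookup(struct entry *tbl[TBLSIZE], u_int32_t handle)
+ *	{
+ *		struct entry *e;
+ *		int i;
+ *
+ *		if (handle == 0)
+ *			return (NULL);
+ *		e = tbl[handle % TBLSIZE];	// optimistic probe
+ *		if (e != NULL && e->handle == handle)
+ *			return (e);
+ *		for (i = 0; i < TBLSIZE; i++)	// slow path
+ *			if ((e = tbl[i]) != NULL && e->handle == handle)
+ *				return (e);
+ *		return (NULL);
+ *	}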
+ */ + i = chandle % HFSC_MAX_CLASSES; + if ((cl = hif->hif_class_tbl[i]) != NULL && cl->cl_handle == chandle) + return (cl); + for (i = 0; i < HFSC_MAX_CLASSES; i++) + if ((cl = hif->hif_class_tbl[i]) != NULL && + cl->cl_handle == chandle) + return (cl); + return (NULL); +} + +#ifdef ALTQ3_COMPAT +static struct hfsc_if * +hfsc_attach(ifq, bandwidth) + struct ifaltq *ifq; + u_int bandwidth; +{ + struct hfsc_if *hif; + + hif = malloc(sizeof(struct hfsc_if), M_DEVBUF, M_WAITOK); + if (hif == NULL) + return (NULL); + bzero(hif, sizeof(struct hfsc_if)); + + hif->hif_eligible = ellist_alloc(); + if (hif->hif_eligible == NULL) { + free(hif, M_DEVBUF); + return NULL; + } + + hif->hif_ifq = ifq; + + /* add this state to the hfsc list */ + hif->hif_next = hif_list; + hif_list = hif; + + return (hif); +} + +static int +hfsc_detach(hif) + struct hfsc_if *hif; +{ + (void)hfsc_clear_interface(hif); + (void)hfsc_class_destroy(hif->hif_rootclass); + + /* remove this interface from the hif list */ + if (hif_list == hif) + hif_list = hif->hif_next; + else { + struct hfsc_if *h; + + for (h = hif_list; h != NULL; h = h->hif_next) + if (h->hif_next == hif) { + h->hif_next = hif->hif_next; + break; + } + ASSERT(h != NULL); + } + + ellist_destroy(hif->hif_eligible); + + free(hif, M_DEVBUF); + + return (0); +} + +static int +hfsc_class_modify(cl, rsc, fsc, usc) + struct hfsc_class *cl; + struct service_curve *rsc, *fsc, *usc; +{ + struct internal_sc *rsc_tmp, *fsc_tmp, *usc_tmp; + u_int64_t cur_time; + int s; + + rsc_tmp = fsc_tmp = usc_tmp = NULL; + if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0) && + cl->cl_rsc == NULL) { + rsc_tmp = malloc(sizeof(struct internal_sc), + M_DEVBUF, M_WAITOK); + if (rsc_tmp == NULL) + return (ENOMEM); + } + if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0) && + cl->cl_fsc == NULL) { + fsc_tmp = malloc(sizeof(struct internal_sc), + M_DEVBUF, M_WAITOK); + if (fsc_tmp == NULL) { + free(rsc_tmp); + return (ENOMEM); + } + } + if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0) && + cl->cl_usc == NULL) { + usc_tmp = malloc(sizeof(struct internal_sc), + M_DEVBUF, M_WAITOK); + if (usc_tmp == NULL) { + free(rsc_tmp); + free(fsc_tmp); + return (ENOMEM); + } + } + + cur_time = read_machclk(); +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + IFQ_LOCK(cl->cl_hif->hif_ifq); + + if (rsc != NULL) { + if (rsc->m1 == 0 && rsc->m2 == 0) { + if (cl->cl_rsc != NULL) { + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + free(cl->cl_rsc, M_DEVBUF); + cl->cl_rsc = NULL; + } + } else { + if (cl->cl_rsc == NULL) + cl->cl_rsc = rsc_tmp; + sc2isc(rsc, cl->cl_rsc); + rtsc_init(&cl->cl_deadline, cl->cl_rsc, cur_time, + cl->cl_cumul); + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc->sm1 <= cl->cl_rsc->sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + } + } + + if (fsc != NULL) { + if (fsc->m1 == 0 && fsc->m2 == 0) { + if (cl->cl_fsc != NULL) { + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + free(cl->cl_fsc, M_DEVBUF); + cl->cl_fsc = NULL; + } + } else { + if (cl->cl_fsc == NULL) + cl->cl_fsc = fsc_tmp; + sc2isc(fsc, cl->cl_fsc); + rtsc_init(&cl->cl_virtual, cl->cl_fsc, cl->cl_vt, + cl->cl_total); + } + } + + if (usc != NULL) { + if (usc->m1 == 0 && usc->m2 == 0) { + if (cl->cl_usc != NULL) { + free(cl->cl_usc, M_DEVBUF); + cl->cl_usc = NULL; + cl->cl_myf = 0; + } + } else { + if (cl->cl_usc == NULL) + cl->cl_usc = usc_tmp; + sc2isc(usc, cl->cl_usc); + rtsc_init(&cl->cl_ulimit, cl->cl_usc, cur_time, + cl->cl_total); + } + } + + if (!qempty(cl->cl_q)) { + if (cl->cl_rsc != NULL) + 
update_ed(cl, m_pktlen(qhead(cl->cl_q))); + if (cl->cl_fsc != NULL) + update_vf(cl, 0, cur_time); + /* is this enough? */ + } + + IFQ_UNLOCK(cl->cl_hif->hif_ifq); + splx(s); + + return (0); +} + +/* + * hfsc device interface + */ +int +hfscopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + if (machclk_freq == 0) + init_machclk(); + + if (machclk_freq == 0) { + printf("hfsc: no cpu clock available!\n"); + return (ENXIO); + } + + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +hfscclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct hfsc_if *hif; + int err, error = 0; + + while ((hif = hif_list) != NULL) { + /* destroy all */ + if (ALTQ_IS_ENABLED(hif->hif_ifq)) + altq_disable(hif->hif_ifq); + + err = altq_detach(hif->hif_ifq); + if (err == 0) + err = hfsc_detach(hif); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +hfscioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct hfsc_if *hif; + struct hfsc_interface *ifacep; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case HFSC_GETSTATS: + break; + default: +#if (__FreeBSD_version > 700000) + if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) + return (error); +#elsif (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + + case HFSC_IF_ATTACH: + error = hfsccmd_if_attach((struct hfsc_attach *)addr); + break; + + case HFSC_IF_DETACH: + error = hfsccmd_if_detach((struct hfsc_interface *)addr); + break; + + case HFSC_ENABLE: + case HFSC_DISABLE: + case HFSC_CLEAR_HIERARCHY: + ifacep = (struct hfsc_interface *)addr; + if ((hif = altq_lookup(ifacep->hfsc_ifname, + ALTQT_HFSC)) == NULL) { + error = EBADF; + break; + } + + switch (cmd) { + + case HFSC_ENABLE: + if (hif->hif_defaultclass == NULL) { +#ifdef ALTQ_DEBUG + printf("hfsc: no default class\n"); +#endif + error = EINVAL; + break; + } + error = altq_enable(hif->hif_ifq); + break; + + case HFSC_DISABLE: + error = altq_disable(hif->hif_ifq); + break; + + case HFSC_CLEAR_HIERARCHY: + hfsc_clear_interface(hif); + break; + } + break; + + case HFSC_ADD_CLASS: + error = hfsccmd_add_class((struct hfsc_add_class *)addr); + break; + + case HFSC_DEL_CLASS: + error = hfsccmd_delete_class((struct hfsc_delete_class *)addr); + break; + + case HFSC_MOD_CLASS: + error = hfsccmd_modify_class((struct hfsc_modify_class *)addr); + break; + + case HFSC_ADD_FILTER: + error = hfsccmd_add_filter((struct hfsc_add_filter *)addr); + break; + + case HFSC_DEL_FILTER: + error = hfsccmd_delete_filter((struct hfsc_delete_filter *)addr); + break; + + case HFSC_GETSTATS: + error = hfsccmd_class_stats((struct hfsc_class_stats *)addr); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int +hfsccmd_if_attach(ap) + struct hfsc_attach *ap; +{ + struct hfsc_if *hif; + struct ifnet *ifp; + int error; + + if ((ifp = ifunit(ap->iface.hfsc_ifname)) == NULL) + return (ENXIO); + + if ((hif = hfsc_attach(&ifp->if_snd, ap->bandwidth)) == NULL) + return (ENOMEM); + + /* + * set HFSC to this ifnet structure. 
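+ *
+ * the shape of the idiom is allocate, register, and undo the
+ * allocation if registration fails. a generic sketch, with
+ * discipline_alloc/register_ops/discipline_free as hypothetical
+ * helpers standing in for hfsc_attach/altq_attach/hfsc_detach:
+ *
+ *	int
+ *	attach_or_unwind(struct ifaltq *ifq)
+ *	{
+ *		void *disc;
+ *		int error;
+ *
+ *		if ((disc = discipline_alloc(ifq)) == NULL)
+ *			return (ENOMEM);
+ *		if ((error = register_ops(ifq, disc)) != 0)
+ *			discipline_free(disc);	// undo on failure
+ *		return (error);
+ *	}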
+ */ + if ((error = altq_attach(&ifp->if_snd, ALTQT_HFSC, hif, + hfsc_enqueue, hfsc_dequeue, hfsc_request, + &hif->hif_classifier, acc_classify)) != 0) + (void)hfsc_detach(hif); + + return (error); +} + +static int +hfsccmd_if_detach(ap) + struct hfsc_interface *ap; +{ + struct hfsc_if *hif; + int error; + + if ((hif = altq_lookup(ap->hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if (ALTQ_IS_ENABLED(hif->hif_ifq)) + altq_disable(hif->hif_ifq); + + if ((error = altq_detach(hif->hif_ifq))) + return (error); + + return hfsc_detach(hif); +} + +static int +hfsccmd_add_class(ap) + struct hfsc_add_class *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl, *parent; + int i; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if (ap->parent_handle == HFSC_NULLCLASS_HANDLE && + hif->hif_rootclass == NULL) + parent = NULL; + else if ((parent = clh_to_clp(hif, ap->parent_handle)) == NULL) + return (EINVAL); + + /* assign a class handle (use a free slot number for now) */ + for (i = 1; i < HFSC_MAX_CLASSES; i++) + if (hif->hif_class_tbl[i] == NULL) + break; + if (i == HFSC_MAX_CLASSES) + return (EBUSY); + + if ((cl = hfsc_class_create(hif, &ap->service_curve, NULL, NULL, + parent, ap->qlimit, ap->flags, i)) == NULL) + return (ENOMEM); + + /* return a class handle to the user */ + ap->class_handle = i; + + return (0); +} + +static int +hfsccmd_delete_class(ap) + struct hfsc_delete_class *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) + return (EINVAL); + + return hfsc_class_destroy(cl); +} + +static int +hfsccmd_modify_class(ap) + struct hfsc_modify_class *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + struct service_curve *rsc = NULL; + struct service_curve *fsc = NULL; + struct service_curve *usc = NULL; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) + return (EINVAL); + + if (ap->sctype & HFSC_REALTIMESC) + rsc = &ap->service_curve; + if (ap->sctype & HFSC_LINKSHARINGSC) + fsc = &ap->service_curve; + if (ap->sctype & HFSC_UPPERLIMITSC) + usc = &ap->service_curve; + + return hfsc_class_modify(cl, rsc, fsc, usc); +} + +static int +hfsccmd_add_filter(ap) + struct hfsc_add_filter *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) + return (EINVAL); + + if (is_a_parent_class(cl)) { +#ifdef ALTQ_DEBUG + printf("hfsccmd_add_filter: not a leaf class!\n"); +#endif + return (EINVAL); + } + + return acc_add_filter(&hif->hif_classifier, &ap->filter, + cl, &ap->filter_handle); +} + +static int +hfsccmd_delete_filter(ap) + struct hfsc_delete_filter *ap; +{ + struct hfsc_if *hif; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + return acc_delete_filter(&hif->hif_classifier, + ap->filter_handle); +} + +static int +hfsccmd_class_stats(ap) + struct hfsc_class_stats *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + struct hfsc_classstats stats, *usp; + int n, nclasses, error; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + ap->cur_time = read_machclk(); + ap->machclk_freq = machclk_freq; + ap->hif_classes = hif->hif_classes; + ap->hif_packets = 
hif->hif_packets; + + /* skip the first N classes in the tree */ + nclasses = ap->nskip; + for (cl = hif->hif_rootclass, n = 0; cl != NULL && n < nclasses; + cl = hfsc_nextclass(cl), n++) + ; + if (n != nclasses) + return (EINVAL); + + /* then, read the next N classes in the tree */ + nclasses = ap->nclasses; + usp = ap->stats; + for (n = 0; cl != NULL && n < nclasses; cl = hfsc_nextclass(cl), n++) { + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, + sizeof(stats))) != 0) + return (error); + } + + ap->nclasses = n; + + return (0); +} + +#ifdef KLD_MODULE + +static struct altqsw hfsc_sw = + {"hfsc", hfscopen, hfscclose, hfscioctl}; + +ALTQ_MODULE(altq_hfsc, ALTQT_HFSC, &hfsc_sw); +MODULE_DEPEND(altq_hfsc, altq_red, 1, 1, 1); +MODULE_DEPEND(altq_hfsc, altq_rio, 1, 1, 1); + +#endif /* KLD_MODULE */ +#endif /* ALTQ3_COMPAT */ + +#endif /* ALTQ_HFSC */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_hfsc.h b/contrib/altq/rtems/freebsd/altq/altq_hfsc.h new file mode 100644 index 00000000..29ce60bf --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_hfsc.h @@ -0,0 +1,320 @@ +/* $KAME: altq_hfsc.h,v 1.12 2003/12/05 05:40:46 kjc Exp $ */ + +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. 
+ */ +#ifndef _ALTQ_ALTQ_HFSC_HH_ +#define _ALTQ_ALTQ_HFSC_HH_ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct service_curve { + u_int m1; /* slope of the first segment in bits/sec */ + u_int d; /* the x-projection of the first segment in msec */ + u_int m2; /* slope of the second segment in bits/sec */ +}; + +/* special class handles */ +#define HFSC_NULLCLASS_HANDLE 0 +#define HFSC_MAX_CLASSES 64 + +/* hfsc class flags */ +#define HFCF_RED 0x0001 /* use RED */ +#define HFCF_ECN 0x0002 /* use RED/ECN */ +#define HFCF_RIO 0x0004 /* use RIO */ +#define HFCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define HFCF_DEFAULTCLASS 0x1000 /* default class */ + +/* service curve types */ +#define HFSC_REALTIMESC 1 +#define HFSC_LINKSHARINGSC 2 +#define HFSC_UPPERLIMITSC 4 +#define HFSC_DEFAULTSC (HFSC_REALTIMESC|HFSC_LINKSHARINGSC) + +struct hfsc_classstats { + u_int class_id; + u_int32_t class_handle; + struct service_curve rsc; + struct service_curve fsc; + struct service_curve usc; /* upper limit service curve */ + + u_int64_t total; /* total work in bytes */ + u_int64_t cumul; /* cumulative work in bytes + done by real-time criteria */ + u_int64_t d; /* deadline */ + u_int64_t e; /* eligible time */ + u_int64_t vt; /* virtual time */ + u_int64_t f; /* fit time for upper-limit */ + + /* info helpful for debugging */ + u_int64_t initvt; /* init virtual time */ + u_int64_t vtoff; /* cl_vt_ipoff */ + u_int64_t cvtmax; /* cl_maxvt */ + u_int64_t myf; /* cl_myf */ + u_int64_t cfmin; /* cl_mincf */ + u_int64_t cvtmin; /* cl_mincvt */ + u_int64_t myfadj; /* cl_myfadj */ + u_int64_t vtadj; /* cl_vtadj */ + u_int64_t cur_time; + u_int32_t machclk_freq; + + u_int qlength; + u_int qlimit; + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int period; + + u_int vtperiod; /* vt period sequence no */ + u_int parentperiod; /* parent's vt period seqno */ + int nactive; /* number of active children */ + + /* red and rio related info */ + int qtype; + struct redstats red[3]; +}; + +#ifdef ALTQ3_COMPAT +struct hfsc_interface { + char hfsc_ifname[IFNAMSIZ]; /* interface name (e.g., fxp0) */ +}; + +struct hfsc_attach { + struct hfsc_interface iface; + u_int bandwidth; /* link bandwidth in bits/sec */ +}; + +struct hfsc_add_class { + struct hfsc_interface iface; + u_int32_t parent_handle; + struct service_curve service_curve; + int qlimit; + int flags; + + u_int32_t class_handle; /* return value */ +}; + +struct hfsc_delete_class { + struct hfsc_interface iface; + u_int32_t class_handle; +}; + +struct hfsc_modify_class { + struct hfsc_interface iface; + u_int32_t class_handle; + struct service_curve service_curve; + int sctype; +}; + +struct hfsc_add_filter { + struct hfsc_interface iface; + u_int32_t class_handle; + struct flow_filter filter; + + u_long filter_handle; /* return value */ +}; + +struct hfsc_delete_filter { + struct hfsc_interface iface; + u_long filter_handle; +}; + +struct hfsc_class_stats { + struct hfsc_interface iface; + int nskip; /* skip # of classes */ + int nclasses; /* # of class stats (WR) */ + u_int64_t cur_time; /* current time */ + u_int32_t machclk_freq; /* machine clock frequency */ + u_int hif_classes; /* # of classes in the tree */ + u_int hif_packets; /* # of packets in the tree */ + struct hfsc_classstats *stats; /* pointer to stats array */ +}; + +#define HFSC_IF_ATTACH _IOW('Q', 1, struct hfsc_attach) +#define HFSC_IF_DETACH _IOW('Q', 2, struct hfsc_interface) +#define HFSC_ENABLE _IOW('Q', 3, struct hfsc_interface) 
+#define	HFSC_DISABLE		_IOW('Q', 4, struct hfsc_interface)
+#define	HFSC_CLEAR_HIERARCHY	_IOW('Q', 5, struct hfsc_interface)
+#define	HFSC_ADD_CLASS		_IOWR('Q', 7, struct hfsc_add_class)
+#define	HFSC_DEL_CLASS		_IOW('Q', 8, struct hfsc_delete_class)
+#define	HFSC_MOD_CLASS		_IOW('Q', 9, struct hfsc_modify_class)
+#define	HFSC_ADD_FILTER		_IOWR('Q', 10, struct hfsc_add_filter)
+#define	HFSC_DEL_FILTER		_IOW('Q', 11, struct hfsc_delete_filter)
+#define	HFSC_GETSTATS		_IOWR('Q', 12, struct hfsc_class_stats)
+#endif /* ALTQ3_COMPAT */
+
+#ifdef _KERNEL
+/*
+ * kernel internal service curve representation
+ * coordinates are given by 64-bit unsigned integers.
+ * x-axis: unit is clock count.  for the intel x86 architecture,
+ * the raw Pentium TSC (Timestamp Counter) value is used.
+ * virtual time is also calculated in this time scale.
+ * y-axis: unit is byte.
+ *
+ * the service curve parameters are converted to the internal
+ * representation.
+ * the slope values are scaled to avoid overflow.
+ * the inverse slope values as well as the y-projection of the 1st
+ * segment are kept in order to avoid 64-bit divide operations
+ * that are expensive on 32-bit architectures.
+ *
+ * note: the Intel Pentium TSC does not wrap around for several
+ * thousand years.
+ * x-axis doesn't wrap around for 1089 years with 1GHz clock.
+ * y-axis doesn't wrap around for 4358 years with 1Gbps bandwidth.
+ */
+
+/* kernel internal representation of a service curve */
+struct internal_sc {
+	u_int64_t	sm1;	/* scaled slope of the 1st segment */
+	u_int64_t	ism1;	/* scaled inverse-slope of the 1st segment */
+	u_int64_t	dx;	/* the x-projection of the 1st segment */
+	u_int64_t	dy;	/* the y-projection of the 1st segment */
+	u_int64_t	sm2;	/* scaled slope of the 2nd segment */
+	u_int64_t	ism2;	/* scaled inverse-slope of the 2nd segment */
+};
+
+/* runtime service curve */
+struct runtime_sc {
+	u_int64_t	x;	/* current starting position on x-axis */
+	u_int64_t	y;	/* current starting position on y-axis */
+	u_int64_t	sm1;	/* scaled slope of the 1st segment */
+	u_int64_t	ism1;	/* scaled inverse-slope of the 1st segment */
+	u_int64_t	dx;	/* the x-projection of the 1st segment */
+	u_int64_t	dy;	/* the y-projection of the 1st segment */
+	u_int64_t	sm2;	/* scaled slope of the 2nd segment */
+	u_int64_t	ism2;	/* scaled inverse-slope of the 2nd segment */
+};
+
+/* for TAILQ based ellist and actlist implementation */
+struct hfsc_class;
+typedef TAILQ_HEAD(_eligible, hfsc_class) ellist_t;
+typedef TAILQ_ENTRY(hfsc_class) elentry_t;
+typedef TAILQ_HEAD(_active, hfsc_class) actlist_t;
+typedef TAILQ_ENTRY(hfsc_class) actentry_t;
+#define	ellist_first(s)		TAILQ_FIRST(s)
+#define	actlist_first(s)	TAILQ_FIRST(s)
+#define	actlist_last(s)		TAILQ_LAST(s, _active)
+
+struct hfsc_class {
+	u_int		cl_id;		/* class id (just for debug) */
+	u_int32_t	cl_handle;	/* class handle */
+	struct hfsc_if	*cl_hif;	/* back pointer to struct hfsc_if */
+	int		cl_flags;	/* misc flags */
+
+	struct hfsc_class *cl_parent;	/* parent class */
+	struct hfsc_class *cl_siblings;	/* sibling classes */
+	struct hfsc_class *cl_children;	/* child classes */
+
+	class_queue_t	*cl_q;		/* class queue structure */
+	struct red	*cl_red;	/* RED state */
+	struct altq_pktattr *cl_pktattr; /* saved header used by ECN */
+
+	u_int64_t	cl_total;	/* total work in bytes */
+	u_int64_t	cl_cumul;	/* cumulative work in bytes
+					   done by real-time criteria */
+	u_int64_t	cl_d;		/* deadline */
+	u_int64_t	cl_e;		/* eligible time */
+	u_int64_t	cl_vt;		/* virtual time */
+	u_int64_t	cl_f;
/* time when this class will fit for + link-sharing, max(myf, cfmin) */ + u_int64_t cl_myf; /* my fit-time (as calculated from this + class's own upperlimit curve) */ + u_int64_t cl_myfadj; /* my fit-time adjustment + (to cancel history dependence) */ + u_int64_t cl_cfmin; /* earliest children's fit-time (used + with cl_myf to obtain cl_f) */ + u_int64_t cl_cvtmin; /* minimal virtual time among the + children fit for link-sharing + (monotonic within a period) */ + u_int64_t cl_vtadj; /* intra-period cumulative vt + adjustment */ + u_int64_t cl_vtoff; /* inter-period cumulative vt offset */ + u_int64_t cl_cvtmax; /* max child's vt in the last period */ + + u_int64_t cl_initvt; /* init virtual time (for debugging) */ + + struct internal_sc *cl_rsc; /* internal real-time service curve */ + struct internal_sc *cl_fsc; /* internal fair service curve */ + struct internal_sc *cl_usc; /* internal upperlimit service curve */ + struct runtime_sc cl_deadline; /* deadline curve */ + struct runtime_sc cl_eligible; /* eligible curve */ + struct runtime_sc cl_virtual; /* virtual curve */ + struct runtime_sc cl_ulimit; /* upperlimit curve */ + + u_int cl_vtperiod; /* vt period sequence no */ + u_int cl_parentperiod; /* parent's vt period seqno */ + int cl_nactive; /* number of active children */ + actlist_t *cl_actc; /* active children list */ + + actentry_t cl_actlist; /* active children list entry */ + elentry_t cl_ellist; /* eligible list entry */ + + struct { + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int period; + } cl_stats; +}; + +/* + * hfsc interface state + */ +struct hfsc_if { + struct hfsc_if *hif_next; /* interface state list */ + struct ifaltq *hif_ifq; /* backpointer to ifaltq */ + struct hfsc_class *hif_rootclass; /* root class */ + struct hfsc_class *hif_defaultclass; /* default class */ + struct hfsc_class *hif_class_tbl[HFSC_MAX_CLASSES]; + struct hfsc_class *hif_pollcache; /* cache for poll operation */ + + u_int hif_classes; /* # of classes in the tree */ + u_int hif_packets; /* # of packets in the tree */ + u_int hif_classid; /* class id sequence number */ + + ellist_t *hif_eligible; /* eligible list */ + +#ifdef ALTQ3_CLFIER_COMPAT + struct acc_classifier hif_classifier; +#endif +}; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_HFSC_HH_ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_priq.c b/contrib/altq/rtems/freebsd/altq/altq_priq.c new file mode 100644 index 00000000..de7f5f0a --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_priq.c @@ -0,0 +1,1055 @@ +#include + +/* $FreeBSD$ */ +/* $KAME: altq_priq.c,v 1.11 2003/09/17 14:23:25 kjc Exp $ */ +/* + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * priority queue + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include +#if (__FreeBSD__ != 2) +#include +#ifdef __FreeBSD__ +#include +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +#ifdef ALTQ_PRIQ /* priq is enabled by ALTQ_PRIQ option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#ifdef ALTQ3_COMPAT +#include +#endif +#include + +/* + * function prototypes + */ +#ifdef ALTQ3_COMPAT +static struct priq_if *priq_attach(struct ifaltq *, u_int); +static int priq_detach(struct priq_if *); +#endif +static int priq_clear_interface(struct priq_if *); +static int priq_request(struct ifaltq *, int, void *); +static void priq_purge(struct priq_if *); +static struct priq_class *priq_class_create(struct priq_if *, int, int, int, + int); +static int priq_class_destroy(struct priq_class *); +static int priq_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +static struct mbuf *priq_dequeue(struct ifaltq *, int); + +static int priq_addq(struct priq_class *, struct mbuf *); +static struct mbuf *priq_getq(struct priq_class *); +static struct mbuf *priq_pollq(struct priq_class *); +static void priq_purgeq(struct priq_class *); + +#ifdef ALTQ3_COMPAT +static int priqcmd_if_attach(struct priq_interface *); +static int priqcmd_if_detach(struct priq_interface *); +static int priqcmd_add_class(struct priq_add_class *); +static int priqcmd_delete_class(struct priq_delete_class *); +static int priqcmd_modify_class(struct priq_modify_class *); +static int priqcmd_add_filter(struct priq_add_filter *); +static int priqcmd_delete_filter(struct priq_delete_filter *); +static int priqcmd_class_stats(struct priq_class_stats *); +#endif /* ALTQ3_COMPAT */ + +static void get_class_stats(struct priq_classstats *, struct priq_class *); +static struct priq_class *clh_to_clp(struct priq_if *, u_int32_t); + +#ifdef ALTQ3_COMPAT +altqdev_decl(priq); + +/* pif_list keeps all priq_if's allocated. 
*/ +static struct priq_if *pif_list = NULL; +#endif /* ALTQ3_COMPAT */ + +int +priq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + error = altq_attach(&ifp->if_snd, ALTQT_PRIQ, a->altq_disc, + priq_enqueue, priq_dequeue, priq_request, NULL, NULL); + splx(s); + return (error); +} + +int +priq_add_altq(struct pf_altq *a) +{ + struct priq_if *pif; + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENODEV); + + pif = malloc(sizeof(struct priq_if), + M_DEVBUF, M_WAITOK); + if (pif == NULL) + return (ENOMEM); + bzero(pif, sizeof(struct priq_if)); + pif->pif_bandwidth = a->ifbandwidth; + pif->pif_maxpri = -1; + pif->pif_ifq = &ifp->if_snd; + + /* keep the state in pf_altq */ + a->altq_disc = pif; + + return (0); +} + +int +priq_remove_altq(struct pf_altq *a) +{ + struct priq_if *pif; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + (void)priq_clear_interface(pif); + + free(pif, M_DEVBUF); + return (0); +} + +int +priq_add_queue(struct pf_altq *a) +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + + /* check parameters */ + if (a->priority >= PRIQ_MAXPRI) + return (EINVAL); + if (a->qid == 0) + return (EINVAL); + if (pif->pif_classes[a->priority] != NULL) + return (EBUSY); + if (clh_to_clp(pif, a->qid) != NULL) + return (EBUSY); + + cl = priq_class_create(pif, a->priority, a->qlimit, + a->pq_u.priq_opts.flags, a->qid); + if (cl == NULL) + return (ENOMEM); + + return (0); +} + +int +priq_remove_queue(struct pf_altq *a) +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + + if ((cl = clh_to_clp(pif, a->qid)) == NULL) + return (EINVAL); + + return (priq_class_destroy(cl)); +} + +int +priq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct priq_if *pif; + struct priq_class *cl; + struct priq_classstats stats; + int error = 0; + + if ((pif = altq_lookup(a->ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, a->qid)) == NULL) + return (EINVAL); + + if (*nbytes < sizeof(stats)) + return (EINVAL); + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = sizeof(stats); + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes. 
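+ *
+ * this undoes the pf-driven configuration path above, which in
+ * condensed form looks like the sketch below (values illustrative;
+ * in the kernel pf(4) drives these entry points from its ioctl
+ * handlers):
+ *
+ *	struct pf_altq a;
+ *
+ *	bzero(&a, sizeof(a));
+ *	strlcpy(a.ifname, "em0", sizeof(a.ifname));
+ *	priq_add_altq(&a);	// allocates the priq_if, sets a.altq_disc
+ *	a.qid = 1;		// nonzero queue id
+ *	a.priority = 5;		// 0 .. PRIQ_MAXPRI-1
+ *	a.qlimit = 50;		// queue length limit (0 picks the default)
+ *	priq_add_queue(&a);	// creates one class
+ *	priq_pfattach(&a);	// hooks enqueue/dequeue into the ifnet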
+ */ +static int +priq_clear_interface(struct priq_if *pif) +{ + struct priq_class *cl; + int pri; + +#ifdef ALTQ3_CLFIER_COMPAT + /* free the filters for this interface */ + acc_discard_filters(&pif->pif_classifier, NULL, 1); +#endif + + /* clear out the classes */ + for (pri = 0; pri <= pif->pif_maxpri; pri++) + if ((cl = pif->pif_classes[pri]) != NULL) + priq_class_destroy(cl); + + return (0); +} + +static int +priq_request(struct ifaltq *ifq, int req, void *arg) +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + switch (req) { + case ALTRQ_PURGE: + priq_purge(pif); + break; + } + return (0); +} + +/* discard all the queued packets on the interface */ +static void +priq_purge(struct priq_if *pif) +{ + struct priq_class *cl; + int pri; + + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + if ((cl = pif->pif_classes[pri]) != NULL && !qempty(cl->cl_q)) + priq_purgeq(cl); + } + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + pif->pif_ifq->ifq_len = 0; +} + +static struct priq_class * +priq_class_create(struct priq_if *pif, int pri, int qlimit, int flags, int qid) +{ + struct priq_class *cl; + int s; + +#ifndef ALTQ_RED + if (flags & PRCF_RED) { +#ifdef ALTQ_DEBUG + printf("priq_class_create: RED not configured for PRIQ!\n"); +#endif + return (NULL); + } +#endif + + if ((cl = pif->pif_classes[pri]) != NULL) { + /* modify the class instead of creating a new one */ +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + IFQ_LOCK(cl->cl_pif->pif_ifq); + if (!qempty(cl->cl_q)) + priq_purgeq(cl); + IFQ_UNLOCK(cl->cl_pif->pif_ifq); + splx(s); +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } else { + cl = malloc(sizeof(struct priq_class), + M_DEVBUF, M_WAITOK); + if (cl == NULL) + return (NULL); + bzero(cl, sizeof(struct priq_class)); + + cl->cl_q = malloc(sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (cl->cl_q == NULL) + goto err_ret; + bzero(cl->cl_q, sizeof(class_queue_t)); + } + + pif->pif_classes[pri] = cl; + if (flags & PRCF_DEFAULTCLASS) + pif->pif_default = cl; + if (qlimit == 0) + qlimit = 50; /* use default */ + qlimit(cl->cl_q) = qlimit; + qtype(cl->cl_q) = Q_DROPTAIL; + qlen(cl->cl_q) = 0; + cl->cl_flags = flags; + cl->cl_pri = pri; + if (pri > pif->pif_maxpri) + pif->pif_maxpri = pri; + cl->cl_pif = pif; + cl->cl_handle = qid; + +#ifdef ALTQ_RED + if (flags & (PRCF_RED|PRCF_RIO)) { + int red_flags, red_pkttime; + + red_flags = 0; + if (flags & PRCF_ECN) + red_flags |= REDF_ECN; +#ifdef ALTQ_RIO + if (flags & PRCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + if (pif->pif_bandwidth < 8) + red_pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu + * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8); +#ifdef ALTQ_RIO + if (flags & PRCF_RIO) { + cl->cl_red = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RIO; + } else +#endif + if (flags & PRCF_RED) { + cl->cl_red = red_alloc(0, 0, + qlimit(cl->cl_q) * 10/100, + qlimit(cl->cl_q) * 30/100, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RED; + } + } +#endif /* ALTQ_RED */ + + return (cl); + + err_ret: + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + if (cl->cl_q != NULL) + free(cl->cl_q, 
M_DEVBUF); + free(cl, M_DEVBUF); + return (NULL); +} + +static int +priq_class_destroy(struct priq_class *cl) +{ + struct priq_if *pif; + int s, pri; + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + IFQ_LOCK(cl->cl_pif->pif_ifq); + +#ifdef ALTQ3_CLFIER_COMPAT + /* delete filters referencing to this class */ + acc_discard_filters(&cl->cl_pif->pif_classifier, cl, 0); +#endif + + if (!qempty(cl->cl_q)) + priq_purgeq(cl); + + pif = cl->cl_pif; + pif->pif_classes[cl->cl_pri] = NULL; + if (pif->pif_maxpri == cl->cl_pri) { + for (pri = cl->cl_pri; pri >= 0; pri--) + if (pif->pif_classes[pri] != NULL) { + pif->pif_maxpri = pri; + break; + } + if (pri < 0) + pif->pif_maxpri = -1; + } + IFQ_UNLOCK(cl->cl_pif->pif_ifq); + splx(s); + + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + free(cl->cl_q, M_DEVBUF); + free(cl, M_DEVBUF); + return (0); +} + +/* + * priq_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. + */ +static int +priq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + struct priq_class *cl; + struct pf_mtag *t; + int len; + + IFQ_LOCK_ASSERT(ifq); + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ +#if defined(__NetBSD__) || defined(__OpenBSD__)\ + || (defined(__FreeBSD__) && __FreeBSD_version >= 501113) + printf("altq: packet for %s does not have pkthdr\n", + ifq->altq_ifp->if_xname); +#else + printf("altq: packet for %s%d does not have pkthdr\n", + ifq->altq_ifp->if_name, ifq->altq_ifp->if_unit); +#endif + m_freem(m); + return (ENOBUFS); + } + cl = NULL; + if ((t = pf_find_mtag(m)) != NULL) + cl = clh_to_clp(pif, t->qid); +#ifdef ALTQ3_COMPAT + else if ((ifq->altq_flags & ALTQF_CLASSIFY) && pktattr != NULL) + cl = pktattr->pattr_class; +#endif + if (cl == NULL) { + cl = pif->pif_default; + if (cl == NULL) { + m_freem(m); + return (ENOBUFS); + } + } +#ifdef ALTQ3_COMPAT + if (pktattr != NULL) + cl->cl_pktattr = pktattr; /* save proto hdr used by ECN */ + else +#endif + cl->cl_pktattr = NULL; + len = m_pktlen(m); + if (priq_addq(cl, m) != 0) { + /* drop occurred. mbuf was freed in priq_addq. */ + PKTCNTR_ADD(&cl->cl_dropcnt, len); + return (ENOBUFS); + } + IFQ_INC_LEN(ifq); + + /* successfully queued. */ + return (0); +} + +/* + * priq_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. 
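+ *
+ * the selection itself is plain strict priority: scan from the
+ * highest configured priority down and take the first backlogged
+ * class. a standalone model, where qlen[] stands in for the
+ * per-class queues:
+ *
+ *	struct pq {
+ *		int maxpri;
+ *		int qlen[PRIQ_MAXPRI];
+ *	};
+ *
+ *	int
+ *	pick_pri(const struct pq *p)
+ *	{
+ *		int pri;
+ *
+ *		for (pri = p->maxpri; pri >= 0; pri--)
+ *			if (p->qlen[pri] > 0)
+ *				return (pri);	// highest backlogged class
+ *		return (-1);			// all queues empty
+ *	}
+ *
+ * lower priorities are served only while every higher class is
+ * empty, so low classes can starve under sustained load; that is
+ * by design for this discipline.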
+ */ +static struct mbuf * +priq_dequeue(struct ifaltq *ifq, int op) +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + struct priq_class *cl; + struct mbuf *m; + int pri; + + IFQ_LOCK_ASSERT(ifq); + + if (IFQ_IS_EMPTY(ifq)) + /* no packet in the queue */ + return (NULL); + + for (pri = pif->pif_maxpri; pri >= 0; pri--) { + if ((cl = pif->pif_classes[pri]) != NULL && + !qempty(cl->cl_q)) { + if (op == ALTDQ_POLL) + return (priq_pollq(cl)); + + m = priq_getq(cl); + if (m != NULL) { + IFQ_DEC_LEN(ifq); + if (qempty(cl->cl_q)) + cl->cl_period++; + PKTCNTR_ADD(&cl->cl_xmitcnt, m_pktlen(m)); + } + return (m); + } + } + return (NULL); +} + +static int +priq_addq(struct priq_class *cl, struct mbuf *m) +{ + +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_addq((rio_t *)cl->cl_red, cl->cl_q, m, + cl->cl_pktattr); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr); +#endif + if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) { + m_freem(m); + return (-1); + } + + if (cl->cl_flags & PRCF_CLEARDSCP) + write_dsfield(m, cl->cl_pktattr, 0); + + _addq(cl->cl_q, m); + + return (0); +} + +static struct mbuf * +priq_getq(struct priq_class *cl) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_getq((rio_t *)cl->cl_red, cl->cl_q); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_getq(cl->cl_red, cl->cl_q); +#endif + return _getq(cl->cl_q); +} + +static struct mbuf * +priq_pollq(cl) + struct priq_class *cl; +{ + return qhead(cl->cl_q); +} + +static void +priq_purgeq(struct priq_class *cl) +{ + struct mbuf *m; + + if (qempty(cl->cl_q)) + return; + + while ((m = _getq(cl->cl_q)) != NULL) { + PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(m)); + m_freem(m); + } + ASSERT(qlen(cl->cl_q) == 0); +} + +static void +get_class_stats(struct priq_classstats *sp, struct priq_class *cl) +{ + sp->class_handle = cl->cl_handle; + sp->qlength = qlen(cl->cl_q); + sp->qlimit = qlimit(cl->cl_q); + sp->period = cl->cl_period; + sp->xmitcnt = cl->cl_xmitcnt; + sp->dropcnt = cl->cl_dropcnt; + + sp->qtype = qtype(cl->cl_q); +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); +#endif + +} + +/* convert a class handle to the corresponding class pointer */ +static struct priq_class * +clh_to_clp(struct priq_if *pif, u_int32_t chandle) +{ + struct priq_class *cl; + int idx; + + if (chandle == 0) + return (NULL); + + for (idx = pif->pif_maxpri; idx >= 0; idx--) + if ((cl = pif->pif_classes[idx]) != NULL && + cl->cl_handle == chandle) + return (cl); + + return (NULL); +} + + +#ifdef ALTQ3_COMPAT + +static struct priq_if * +priq_attach(ifq, bandwidth) + struct ifaltq *ifq; + u_int bandwidth; +{ + struct priq_if *pif; + + pif = malloc(sizeof(struct priq_if), + M_DEVBUF, M_WAITOK); + if (pif == NULL) + return (NULL); + bzero(pif, sizeof(struct priq_if)); + pif->pif_bandwidth = bandwidth; + pif->pif_maxpri = -1; + pif->pif_ifq = ifq; + + /* add this state to the priq list */ + pif->pif_next = pif_list; + pif_list = pif; + + return (pif); +} + +static int +priq_detach(pif) + struct priq_if *pif; +{ + (void)priq_clear_interface(pif); + + /* remove this interface from the pif list */ + if (pif_list == pif) + pif_list = pif->pif_next; + else { + struct priq_if *p; + + for (p = pif_list; p != NULL; p = p->pif_next) + if (p->pif_next == pif) { + p->pif_next = pif->pif_next; + break; + } + ASSERT(p != NULL); + } + + free(pif, M_DEVBUF); + 
return (0); +} + +/* + * priq device interface + */ +int +priqopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +priqclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct priq_if *pif; + int err, error = 0; + + while ((pif = pif_list) != NULL) { + /* destroy all */ + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + altq_disable(pif->pif_ifq); + + err = altq_detach(pif->pif_ifq); + if (err == 0) + err = priq_detach(pif); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +priqioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct priq_if *pif; + struct priq_interface *ifacep; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case PRIQ_GETSTATS: + break; + default: +#if (__FreeBSD_version > 700000) + if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) + return (error); +#elsif (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + + case PRIQ_IF_ATTACH: + error = priqcmd_if_attach((struct priq_interface *)addr); + break; + + case PRIQ_IF_DETACH: + error = priqcmd_if_detach((struct priq_interface *)addr); + break; + + case PRIQ_ENABLE: + case PRIQ_DISABLE: + case PRIQ_CLEAR: + ifacep = (struct priq_interface *)addr; + if ((pif = altq_lookup(ifacep->ifname, + ALTQT_PRIQ)) == NULL) { + error = EBADF; + break; + } + + switch (cmd) { + case PRIQ_ENABLE: + if (pif->pif_default == NULL) { +#ifdef ALTQ_DEBUG + printf("priq: no default class\n"); +#endif + error = EINVAL; + break; + } + error = altq_enable(pif->pif_ifq); + break; + + case PRIQ_DISABLE: + error = altq_disable(pif->pif_ifq); + break; + + case PRIQ_CLEAR: + priq_clear_interface(pif); + break; + } + break; + + case PRIQ_ADD_CLASS: + error = priqcmd_add_class((struct priq_add_class *)addr); + break; + + case PRIQ_DEL_CLASS: + error = priqcmd_delete_class((struct priq_delete_class *)addr); + break; + + case PRIQ_MOD_CLASS: + error = priqcmd_modify_class((struct priq_modify_class *)addr); + break; + + case PRIQ_ADD_FILTER: + error = priqcmd_add_filter((struct priq_add_filter *)addr); + break; + + case PRIQ_DEL_FILTER: + error = priqcmd_delete_filter((struct priq_delete_filter *)addr); + break; + + case PRIQ_GETSTATS: + error = priqcmd_class_stats((struct priq_class_stats *)addr); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int +priqcmd_if_attach(ap) + struct priq_interface *ap; +{ + struct priq_if *pif; + struct ifnet *ifp; + int error; + + if ((ifp = ifunit(ap->ifname)) == NULL) + return (ENXIO); + + if ((pif = priq_attach(&ifp->if_snd, ap->arg)) == NULL) + return (ENOMEM); + + /* + * set PRIQ to this ifnet structure. 
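+ *
+ * a userland sketch of driving this ALTQ3 interface in the order
+ * the handlers above expect (the /dev/altq/priq node is assumed
+ * from the usual KAME altq setup; "em0" is illustrative; error
+ * handling elided):
+ *
+ *	struct priq_interface pi;
+ *	struct priq_add_class ac;
+ *	int fd = open("/dev/altq/priq", O_RDWR);
+ *
+ *	bzero(&pi, sizeof(pi));
+ *	strlcpy(pi.ifname, "em0", sizeof(pi.ifname));
+ *	ioctl(fd, PRIQ_IF_ATTACH, &pi);
+ *	bzero(&ac, sizeof(ac));
+ *	ac.iface = pi;
+ *	ac.pri = 0;
+ *	ac.flags = PRCF_DEFAULTCLASS;	// PRIQ_ENABLE requires a default
+ *	ioctl(fd, PRIQ_ADD_CLASS, &ac);
+ *	ioctl(fd, PRIQ_ENABLE, &pi);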
+ */ + if ((error = altq_attach(&ifp->if_snd, ALTQT_PRIQ, pif, + priq_enqueue, priq_dequeue, priq_request, + &pif->pif_classifier, acc_classify)) != 0) + (void)priq_detach(pif); + + return (error); +} + +static int +priqcmd_if_detach(ap) + struct priq_interface *ap; +{ + struct priq_if *pif; + int error; + + if ((pif = altq_lookup(ap->ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + altq_disable(pif->pif_ifq); + + if ((error = altq_detach(pif->pif_ifq))) + return (error); + + return priq_detach(pif); +} + +static int +priqcmd_add_class(ap) + struct priq_add_class *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + int qid; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if (ap->pri < 0 || ap->pri >= PRIQ_MAXPRI) + return (EINVAL); + if (pif->pif_classes[ap->pri] != NULL) + return (EBUSY); + + qid = ap->pri + 1; + if ((cl = priq_class_create(pif, ap->pri, + ap->qlimit, ap->flags, qid)) == NULL) + return (ENOMEM); + + /* return a class handle to the user */ + ap->class_handle = cl->cl_handle; + + return (0); +} + +static int +priqcmd_delete_class(ap) + struct priq_delete_class *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) + return (EINVAL); + + return priq_class_destroy(cl); +} + +static int +priqcmd_modify_class(ap) + struct priq_modify_class *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if (ap->pri < 0 || ap->pri >= PRIQ_MAXPRI) + return (EINVAL); + + if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) + return (EINVAL); + + /* + * if priority is changed, move the class to the new priority + */ + if (pif->pif_classes[ap->pri] != cl) { + if (pif->pif_classes[ap->pri] != NULL) + return (EEXIST); + pif->pif_classes[cl->cl_pri] = NULL; + pif->pif_classes[ap->pri] = cl; + cl->cl_pri = ap->pri; + } + + /* call priq_class_create to change class parameters */ + if ((cl = priq_class_create(pif, ap->pri, + ap->qlimit, ap->flags, ap->class_handle)) == NULL) + return (ENOMEM); + return 0; +} + +static int +priqcmd_add_filter(ap) + struct priq_add_filter *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) + return (EINVAL); + + return acc_add_filter(&pif->pif_classifier, &ap->filter, + cl, &ap->filter_handle); +} + +static int +priqcmd_delete_filter(ap) + struct priq_delete_filter *ap; +{ + struct priq_if *pif; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + return acc_delete_filter(&pif->pif_classifier, + ap->filter_handle); +} + +static int +priqcmd_class_stats(ap) + struct priq_class_stats *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + struct priq_classstats stats, *usp; + int pri, error; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + ap->maxpri = pif->pif_maxpri; + + /* then, read the next N classes in the tree */ + usp = ap->stats; + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + cl = pif->pif_classes[pri]; + if (cl != NULL) + get_class_stats(&stats, cl); + else + bzero(&stats, sizeof(stats)); + if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, + sizeof(stats))) != 0) + return (error); + } + return (0); +} + +#ifdef 
KLD_MODULE + +static struct altqsw priq_sw = + {"priq", priqopen, priqclose, priqioctl}; + +ALTQ_MODULE(altq_priq, ALTQT_PRIQ, &priq_sw); +MODULE_DEPEND(altq_priq, altq_red, 1, 1, 1); +MODULE_DEPEND(altq_priq, altq_rio, 1, 1, 1); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ3_COMPAT */ +#endif /* ALTQ_PRIQ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_priq.h b/contrib/altq/rtems/freebsd/altq/altq_priq.h new file mode 100644 index 00000000..8f456c91 --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_priq.h @@ -0,0 +1,170 @@ +/* $KAME: altq_priq.h,v 1.7 2003/10/03 05:05:15 kjc Exp $ */ +/* + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ALTQ_ALTQ_PRIQ_HH_ +#define _ALTQ_ALTQ_PRIQ_HH_ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define PRIQ_MAXPRI 16 /* upper limit of the number of priorities */ + +#ifdef ALTQ3_COMPAT +struct priq_interface { + char ifname[IFNAMSIZ]; /* interface name (e.g., fxp0) */ + u_long arg; /* request-specific argument */ +}; + +struct priq_add_class { + struct priq_interface iface; + int pri; /* priority (0 is the lowest) */ + int qlimit; /* queue size limit */ + int flags; /* misc flags (see below) */ + + u_int32_t class_handle; /* return value */ +}; +#endif /* ALTQ3_COMPAT */ + +/* priq class flags */ +#define PRCF_RED 0x0001 /* use RED */ +#define PRCF_ECN 0x0002 /* use RED/ECN */ +#define PRCF_RIO 0x0004 /* use RIO */ +#define PRCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define PRCF_DEFAULTCLASS 0x1000 /* default class */ + +/* special class handles */ +#define PRIQ_NULLCLASS_HANDLE 0 + +#ifdef ALTQ3_COMPAT +struct priq_delete_class { + struct priq_interface iface; + u_int32_t class_handle; +}; + +struct priq_modify_class { + struct priq_interface iface; + u_int32_t class_handle; + int pri; + int qlimit; + int flags; +}; + +struct priq_add_filter { + struct priq_interface iface; + u_int32_t class_handle; + struct flow_filter filter; + + u_long filter_handle; /* return value */ +}; + +struct priq_delete_filter { + struct priq_interface iface; + u_long filter_handle; +}; +#endif /* ALTQ3_COMPAT */ + +struct priq_classstats { + u_int32_t class_handle; + + u_int qlength; + u_int qlimit; + u_int period; + struct pktcntr xmitcnt; /* transmitted packet counter */ + struct pktcntr dropcnt; /* dropped packet counter */ + + /* red and rio related info */ + int qtype; + struct redstats red[3]; /* rio has 3 red stats */ +}; + +#ifdef ALTQ3_COMPAT +struct priq_class_stats { + struct priq_interface iface; + int maxpri; /* in/out */ + + struct priq_classstats *stats; /* pointer to stats array */ +}; + +#define PRIQ_IF_ATTACH _IOW('Q', 1, struct priq_interface) +#define PRIQ_IF_DETACH _IOW('Q', 2, struct priq_interface) +#define PRIQ_ENABLE _IOW('Q', 3, struct priq_interface) +#define PRIQ_DISABLE _IOW('Q', 4, struct priq_interface) +#define PRIQ_CLEAR _IOW('Q', 5, struct priq_interface) +#define PRIQ_ADD_CLASS _IOWR('Q', 7, struct priq_add_class) +#define PRIQ_DEL_CLASS _IOW('Q', 8, struct priq_delete_class) +#define PRIQ_MOD_CLASS _IOW('Q', 9, struct priq_modify_class) +#define PRIQ_ADD_FILTER _IOWR('Q', 10, struct priq_add_filter) +#define PRIQ_DEL_FILTER _IOW('Q', 11, struct priq_delete_filter) +#define PRIQ_GETSTATS _IOWR('Q', 12, struct priq_class_stats) + +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL + +struct priq_class { + u_int32_t cl_handle; /* class handle */ + class_queue_t *cl_q; /* class queue structure */ + struct red *cl_red; /* RED state */ + int cl_pri; /* priority */ + int cl_flags; /* class flags */ + struct priq_if *cl_pif; /* back pointer to pif */ + struct altq_pktattr *cl_pktattr; /* saved header used by ECN */ + + /* statistics */ + u_int cl_period; /* backlog period */ + struct pktcntr cl_xmitcnt; /* transmitted packet counter */ + struct pktcntr cl_dropcnt; /* dropped packet counter */ +}; + +/* + * priq interface state + */ +struct priq_if { + struct priq_if *pif_next; /* interface state list */ + struct ifaltq *pif_ifq; /* backpointer to ifaltq */ + u_int pif_bandwidth; /* link bandwidth in bps */ + int pif_maxpri; /* max priority in use */ + struct priq_class *pif_default; /* default class */ + struct 
priq_class *pif_classes[PRIQ_MAXPRI]; /* classes */ +#ifdef ALTQ3_CLFIER_COMPAT + struct acc_classifier pif_classifier; /* classifier */ +#endif +}; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_PRIQ_HH_ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_red.c b/contrib/altq/rtems/freebsd/altq/altq_red.c new file mode 100644 index 00000000..0b76e3d7 --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_red.c @@ -0,0 +1,1503 @@ +#include + +/* $FreeBSD$ */ +/* $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include +#if (__FreeBSD__ != 2) +#include +#ifdef __FreeBSD__ +#include +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#if 1 /* ALTQ3_COMPAT */ +#include +#include +#include +#ifdef ALTQ_FLOWVALVE +#include +#include +#endif +#endif /* ALTQ3_COMPAT */ + +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif + +#include +#include +#include +#ifdef ALTQ3_COMPAT +#include +#ifdef ALTQ_FLOWVALVE +#include +#endif +#endif + +/* + * ALTQ/RED (Random Early Detection) implementation using 32-bit + * fixed-point calculation. + * + * written by kjc using the ns code as a reference. + * you can learn more about red and ns from Sally's home page at + * http://www-nrg.ee.lbl.gov/floyd/ + * + * most of the red parameter values are fixed in this implementation + * to prevent fixed-point overflow/underflow. + * if you change the parameters, watch out for overflow/underflow! + * + * the parameters used are recommended values by Sally. + * the corresponding ns config looks like: + * q_weight=0.00195 + * minthresh=5 maxthresh=15 queue-size=60 + * linterm=30 + * dropmech=drop-tail + * bytes=false (can't be handled by 32-bit fixed-point) + * doubleq=false dqthresh=false + * wait=true + */ +/* + * alternative red parameters for a slow link. + * + * assume the queue length grows from zero to L and stays at L; it takes + * N packets for q_avg to reach 63% of L. + * when q_weight is 0.002, N is about 500 packets. + * for a slow link like dial-up, 500 packets take more than 1 minute! + * when q_weight is 0.008, N is about 127 packets. + * when q_weight is 0.016, N is about 63 packets. + * bursts of 50 packets are allowed for 0.002, bursts of 25 packets + * are allowed for 0.016. + * see Sally's paper for more details. + */ +/* normal red parameters */ +#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ + /* q_weight = 0.00195 */ + +/* red parameters for a slow link */ +#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ + /* q_weight = 0.0078125 */ + +/* red parameters for a very slow link (e.g., dialup) */ +#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ + /* q_weight = 0.015625 */ + +/* fixed-point uses 12 fractional bits */ +#define FP_SHIFT 12 /* fixed-point shift */ + +/* red parameters for drop probability */ +#define INV_P_MAX 10 /* inverse of max drop probability */ +#define TH_MIN 5 /* min threshold */ +#define TH_MAX 15 /* max threshold */ + +#define RED_LIMIT 60 /* default max queue length */ +#define RED_STATS /* collect statistics */ + +/* + * our default policy for forced-drop is drop-tail. + * (in altq-1.1.2 or earlier, the default was random-drop. + * but it makes more sense to punish the cause of the surge.) + * to switch to the random-drop policy, define "RED_RANDOM_DROP".
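+ * (under drop-tail the arriving packet itself is freed, while + * RED_RANDOM_DROP picks a victim already in the queue with + * _getq_random(); see red_addq() below.)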
+ */ + +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_FLOWVALVE +/* + * flow-valve is an extension to protect red from unresponsive flows + * and to promote end-to-end congestion control. + * flow-valve observes the average drop rates of the flows that have + * experienced packet drops in the recent past. + * when the average drop rate exceeds the threshold, the flow is + * blocked by the flow-valve. the trapped flow should back off + * exponentially to escape from the flow-valve. + */ +#ifdef RED_RANDOM_DROP +#error "random-drop can't be used with flow-valve!" +#endif +#endif /* ALTQ_FLOWVALVE */ + +/* red_list keeps all red_queue_t's allocated. */ +static red_queue_t *red_list = NULL; + +#endif /* ALTQ3_COMPAT */ + +/* default red parameter values */ +static int default_th_min = TH_MIN; +static int default_th_max = TH_MAX; +static int default_inv_pmax = INV_P_MAX; + +#ifdef ALTQ3_COMPAT +/* internal function prototypes */ +static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +static struct mbuf *red_dequeue(struct ifaltq *, int); +static int red_request(struct ifaltq *, int, void *); +static void red_purgeq(red_queue_t *); +static int red_detach(red_queue_t *); +#ifdef ALTQ_FLOWVALVE +static __inline struct fve *flowlist_lookup(struct flowvalve *, + struct altq_pktattr *, struct timeval *); +static __inline struct fve *flowlist_reclaim(struct flowvalve *, + struct altq_pktattr *); +static __inline void flowlist_move_to_head(struct flowvalve *, struct fve *); +static __inline int fv_p2f(struct flowvalve *, int); +#if 0 /* XXX: make the compiler happy (fv_alloc unused) */ +static struct flowvalve *fv_alloc(struct red *); +#endif +static void fv_destroy(struct flowvalve *); +static int fv_checkflow(struct flowvalve *, struct altq_pktattr *, + struct fve **); +static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *, + struct fve *); +#endif +#endif /* ALTQ3_COMPAT */ + +/* + * red support routines + */ +red_t * +red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags, + int pkttime) +{ + red_t *rp; + int w, i; + int npkts_per_sec; + + rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK); + if (rp == NULL) + return (NULL); + bzero(rp, sizeof(red_t)); + + rp->red_avg = 0; + rp->red_idle = 1; + + if (weight == 0) + rp->red_weight = W_WEIGHT; + else + rp->red_weight = weight; + if (inv_pmax == 0) + rp->red_inv_pmax = default_inv_pmax; + else + rp->red_inv_pmax = inv_pmax; + if (th_min == 0) + rp->red_thmin = default_th_min; + else + rp->red_thmin = th_min; + if (th_max == 0) + rp->red_thmax = default_th_max; + else + rp->red_thmax = th_max; + + rp->red_flags = flags; + + if (pkttime == 0) + /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ + rp->red_pkttime = 800; + else + rp->red_pkttime = pkttime; + + if (weight == 0) { + /* when the link is very slow, adjust red parameters */ + npkts_per_sec = 1000000 / rp->red_pkttime; + if (npkts_per_sec < 50) { + /* up to about 400Kbps */ + rp->red_weight = W_WEIGHT_2; + } else if (npkts_per_sec < 300) { + /* up to about 2.4Mbps */ + rp->red_weight = W_WEIGHT_1; + } + } + + /* calculate wshift. weight must be a power of 2 */ + w = rp->red_weight; + for (i = 0; w > 1; i++) + w = w >> 1; + rp->red_wshift = i; + w = 1 << rp->red_wshift; + if (w != rp->red_weight) { + printf("invalid weight value %d for red! use %d\n", + rp->red_weight, w); + rp->red_weight = w; + } + + /* + * thmin_s and thmax_s are scaled versions of th_min and th_max + * to be compared with avg.
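+ * e.g., with the default W_WEIGHT of 512 red_wshift is 9, so with + * FP_SHIFT of 12 a th_min of 5 is stored as 5 << 21.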
+ */ + rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); + rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); + + /* + * precompute probability denominator + * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point + */ + rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) + * rp->red_inv_pmax) << FP_SHIFT; + + /* allocate weight table */ + rp->red_wtab = wtab_alloc(rp->red_weight); + + microtime(&rp->red_last); + return (rp); +} + +void +red_destroy(red_t *rp) +{ +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_FLOWVALVE + if (rp->red_flowvalve != NULL) + fv_destroy(rp->red_flowvalve); +#endif +#endif /* ALTQ3_COMPAT */ + wtab_destroy(rp->red_wtab); + free(rp, M_DEVBUF); +} + +void +red_getstats(red_t *rp, struct redstats *sp) +{ + sp->q_avg = rp->red_avg >> rp->red_wshift; + sp->xmit_cnt = rp->red_stats.xmit_cnt; + sp->drop_cnt = rp->red_stats.drop_cnt; + sp->drop_forced = rp->red_stats.drop_forced; + sp->drop_unforced = rp->red_stats.drop_unforced; + sp->marked_packets = rp->red_stats.marked_packets; +} + +int +red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, + struct altq_pktattr *pktattr) +{ + int avg, droptype; + int n; +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_FLOWVALVE + struct fve *fve = NULL; + + if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0) + if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) { + m_freem(m); + return (-1); + } +#endif +#endif /* ALTQ3_COMPAT */ + + avg = rp->red_avg; + + /* + * if we were idle, we pretend that n packets arrived during + * the idle period. + */ + if (rp->red_idle) { + struct timeval now; + int t; + + rp->red_idle = 0; + microtime(&now); + t = (now.tv_sec - rp->red_last.tv_sec); + if (t > 60) { + /* + * being idle for more than 1 minute, set avg to zero. + * this prevents t from overflow. + */ + avg = 0; + } else { + t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); + n = t / rp->red_pkttime - 1; + + /* the following line does (avg = (1 - Wq)^n * avg) */ + if (n > 0) + avg = (avg >> FP_SHIFT) * + pow_w(rp->red_wtab, n); + } + } + + /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ + avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); + rp->red_avg = avg; /* save the new value */ + + /* + * red_count keeps a tally of arriving traffic that has not + * been dropped. + */ + rp->red_count++; + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (avg >= rp->red_thmin_s && qlen(q) > 1) { + if (avg >= rp->red_thmax_s) { + /* avg >= th_max: forced drop */ + droptype = DTYPE_FORCED; + } else if (rp->red_old == 0) { + /* first exceeds th_min */ + rp->red_count = 1; + rp->red_old = 1; + } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, + rp->red_probd, rp->red_count)) { + /* mark or drop by red */ + if ((rp->red_flags & REDF_ECN) && + mark_ecn(m, pktattr, rp->red_flags)) { + /* successfully marked. do not drop. */ + rp->red_count = 0; +#ifdef RED_STATS + rp->red_stats.marked_packets++; +#endif + } else { + /* unforced drop by red */ + droptype = DTYPE_EARLY; + } + } + } else { + /* avg < th_min */ + rp->red_old = 0; + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + +#ifdef RED_RANDOM_DROP + /* if successful or forced drop, enqueue this packet. */ + if (droptype != DTYPE_EARLY) + _addq(q, m); +#else + /* if successful, enqueue this packet. 
*/ + if (droptype == DTYPE_NODROP) + _addq(q, m); +#endif + if (droptype != DTYPE_NODROP) { + if (droptype == DTYPE_EARLY) { + /* drop the incoming packet */ +#ifdef RED_STATS + rp->red_stats.drop_unforced++; +#endif + } else { + /* forced drop, select a victim packet in the queue. */ +#ifdef RED_RANDOM_DROP + m = _getq_random(q); +#endif +#ifdef RED_STATS + rp->red_stats.drop_forced++; +#endif + } +#ifdef RED_STATS + PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m)); +#endif + rp->red_count = 0; +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_FLOWVALVE + if (rp->red_flowvalve != NULL) + fv_dropbyred(rp->red_flowvalve, pktattr, fve); +#endif +#endif /* ALTQ3_COMPAT */ + m_freem(m); + return (-1); + } + /* successfully queued */ +#ifdef RED_STATS + PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m)); +#endif + return (0); +} + +/* + * early-drop probability is calculated as follows: + * prob = p_max * (avg - th_min) / (th_max - th_min) + * prob_a = prob / (2 - count*prob) + * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) + * here prob_a increases as successive undrop count increases. + * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), + * becomes 1 when (count >= (2 / prob))). + */ +int +drop_early(int fp_len, int fp_probd, int count) +{ + int d; /* denominator of drop-probability */ + + d = fp_probd - count * fp_len; + if (d <= 0) + /* count exceeds the hard limit: drop or mark */ + return (1); + + /* + * now the range of d is [1..600] in fixed-point. (when + * th_max-th_min=10 and p_max=1/30) + * drop probability = (avg - TH_MIN) / d + */ + + if ((arc4random() % d) < fp_len) { + /* drop or mark */ + return (1); + } + /* no drop/mark */ + return (0); +} + +/* + * try to mark CE bit to the packet. + * returns 1 if successfully marked, 0 otherwise. + */ +int +mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags) +{ + struct mbuf *m0; + struct pf_mtag *at; + void *hdr; + int af; + + at = pf_find_mtag(m); + if (at != NULL) { + af = at->af; + hdr = at->hdr; +#ifdef ALTQ3_COMPAT + } else if (pktattr != NULL) { + af = pktattr->pattr_af; + hdr = pktattr->pattr_hdr; +#endif /* ALTQ3_COMPAT */ + } else + return (0); + + if (af != AF_INET && af != AF_INET6) + return (0); + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)hdr >= m0->m_data) && + ((caddr_t)hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, tag info is stale */ + return (0); + } + + switch (af) { + case AF_INET: + if (flags & REDF_ECN4) { + struct ip *ip = hdr; + u_int8_t otos; + int sum; + + if (ip->ip_v != 4) + return (0); /* version mismatch! */ + + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) + return (0); /* not-ECT */ + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + return (1); /* already marked */ + + /* + * ecn-capable but not marked, + * mark CE and update checksum + */ + otos = ip->ip_tos; + ip->ip_tos |= IPTOS_ECN_CE; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += (~otos & 0xffff) + ip->ip_tos; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + ip->ip_sum = htons(~sum & 0xffff); + return (1); + } + break; +#ifdef INET6 + case AF_INET6: + if (flags & REDF_ECN6) { + struct ip6_hdr *ip6 = hdr; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return (0); /* version mismatch! 
*/ + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == + (IPTOS_ECN_NOTECT << 20)) + return (0); /* not-ECT */ + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == + (IPTOS_ECN_CE << 20)) + return (1); /* already marked */ + /* + * ecn-capable but not marked, mark CE + */ + flowlabel |= (IPTOS_ECN_CE << 20); + ip6->ip6_flow = htonl(flowlabel); + return (1); + } + break; +#endif /* INET6 */ + } + + /* not marked */ + return (0); +} + +struct mbuf * +red_getq(rp, q) + red_t *rp; + class_queue_t *q; +{ + struct mbuf *m; + + if ((m = _getq(q)) == NULL) { + if (rp->red_idle == 0) { + rp->red_idle = 1; + microtime(&rp->red_last); + } + return NULL; + } + + rp->red_idle = 0; + return (m); +} + +/* + * helper routine to calibrate avg during idle. + * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point + * here Wq = 1/weight and the code assumes Wq is close to zero. + * + * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. + */ +static struct wtab *wtab_list = NULL; /* pointer to wtab list */ + +struct wtab * +wtab_alloc(int weight) +{ + struct wtab *w; + int i; + + for (w = wtab_list; w != NULL; w = w->w_next) + if (w->w_weight == weight) { + w->w_refcount++; + return (w); + } + + w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK); + if (w == NULL) + panic("wtab_alloc: malloc failed!"); + bzero(w, sizeof(struct wtab)); + w->w_weight = weight; + w->w_refcount = 1; + w->w_next = wtab_list; + wtab_list = w; + + /* initialize the weight table */ + w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; + for (i = 1; i < 32; i++) { + w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; + if (w->w_tab[i] == 0 && w->w_param_max == 0) + w->w_param_max = 1 << i; + } + + return (w); +} + +int +wtab_destroy(struct wtab *w) +{ + struct wtab *prev; + + if (--w->w_refcount > 0) + return (0); + + if (wtab_list == w) + wtab_list = w->w_next; + else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next) + if (prev->w_next == w) { + prev->w_next = w->w_next; + break; + } + + free(w, M_DEVBUF); + return (0); +} + +int32_t +pow_w(struct wtab *w, int n) +{ + int i, bit; + int32_t val; + + if (n >= w->w_param_max) + return (0); + + val = 1 << FP_SHIFT; + if (n <= 0) + return (val); + + bit = 1; + i = 0; + while (n) { + if (n & bit) { + val = (val * w->w_tab[i]) >> FP_SHIFT; + n &= ~bit; + } + i++; + bit <<= 1; + } + return (val); +} + +#ifdef ALTQ3_COMPAT +/* + * red device interface + */ +altqdev_decl(red); + +int +redopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + /* everything will be done when the queueing scheme is attached. 
*/ + return 0; +} + +int +redclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + red_queue_t *rqp; + int err, error = 0; + + while ((rqp = red_list) != NULL) { + /* destroy all */ + err = red_detach(rqp); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +redioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + red_queue_t *rqp; + struct red_interface *ifacep; + struct ifnet *ifp; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case RED_GETSTATS: + break; + default: +#if (__FreeBSD_version > 700000) + if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) +#elif (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) +#endif + return (error); + break; + } + + switch (cmd) { + + case RED_ENABLE: + ifacep = (struct red_interface *)addr; + if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + error = altq_enable(rqp->rq_ifq); + break; + + case RED_DISABLE: + ifacep = (struct red_interface *)addr; + if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + error = altq_disable(rqp->rq_ifq); + break; + + case RED_IF_ATTACH: + ifp = ifunit(((struct red_interface *)addr)->red_ifname); + if (ifp == NULL) { + error = ENXIO; + break; + } + + /* allocate and initialize red_queue_t */ + rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK); + if (rqp == NULL) { + error = ENOMEM; + break; + } + bzero(rqp, sizeof(red_queue_t)); + + rqp->rq_q = malloc(sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (rqp->rq_q == NULL) { + free(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + bzero(rqp->rq_q, sizeof(class_queue_t)); + + rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0); + if (rqp->rq_red == NULL) { + free(rqp->rq_q, M_DEVBUF); + free(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + + rqp->rq_ifq = &ifp->if_snd; + qtail(rqp->rq_q) = NULL; + qlen(rqp->rq_q) = 0; + qlimit(rqp->rq_q) = RED_LIMIT; + qtype(rqp->rq_q) = Q_RED; + + /* + * set RED to this ifnet structure.
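+ * (altq_attach() installs red_enqueue/red_dequeue/red_request on + * if_snd; if it fails, the red_queue_t set up above is rolled back.)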
+ */ + error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp, + red_enqueue, red_dequeue, red_request, + NULL, NULL); + if (error) { + red_destroy(rqp->rq_red); + free(rqp->rq_q, M_DEVBUF); + free(rqp, M_DEVBUF); + break; + } + + /* add this state to the red list */ + rqp->rq_next = red_list; + red_list = rqp; + break; + + case RED_IF_DETACH: + ifacep = (struct red_interface *)addr; + if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + error = red_detach(rqp); + break; + + case RED_GETSTATS: + do { + struct red_stats *q_stats; + red_t *rp; + + q_stats = (struct red_stats *)addr; + if ((rqp = altq_lookup(q_stats->iface.red_ifname, + ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + + q_stats->q_len = qlen(rqp->rq_q); + q_stats->q_limit = qlimit(rqp->rq_q); + + rp = rqp->rq_red; + q_stats->q_avg = rp->red_avg >> rp->red_wshift; + q_stats->xmit_cnt = rp->red_stats.xmit_cnt; + q_stats->drop_cnt = rp->red_stats.drop_cnt; + q_stats->drop_forced = rp->red_stats.drop_forced; + q_stats->drop_unforced = rp->red_stats.drop_unforced; + q_stats->marked_packets = rp->red_stats.marked_packets; + + q_stats->weight = rp->red_weight; + q_stats->inv_pmax = rp->red_inv_pmax; + q_stats->th_min = rp->red_thmin; + q_stats->th_max = rp->red_thmax; + +#ifdef ALTQ_FLOWVALVE + if (rp->red_flowvalve != NULL) { + struct flowvalve *fv = rp->red_flowvalve; + q_stats->fv_flows = fv->fv_flows; + q_stats->fv_pass = fv->fv_stats.pass; + q_stats->fv_predrop = fv->fv_stats.predrop; + q_stats->fv_alloc = fv->fv_stats.alloc; + q_stats->fv_escape = fv->fv_stats.escape; + } else { +#endif /* ALTQ_FLOWVALVE */ + q_stats->fv_flows = 0; + q_stats->fv_pass = 0; + q_stats->fv_predrop = 0; + q_stats->fv_alloc = 0; + q_stats->fv_escape = 0; +#ifdef ALTQ_FLOWVALVE + } +#endif /* ALTQ_FLOWVALVE */ + } while (/*CONSTCOND*/ 0); + break; + + case RED_CONFIG: + do { + struct red_conf *fc; + red_t *new; + int s, limit; + + fc = (struct red_conf *)addr; + if ((rqp = altq_lookup(fc->iface.red_ifname, + ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + new = red_alloc(fc->red_weight, + fc->red_inv_pmax, + fc->red_thmin, + fc->red_thmax, + fc->red_flags, + fc->red_pkttime); + if (new == NULL) { + error = ENOMEM; + break; + } + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + red_purgeq(rqp); + limit = fc->red_limit; + if (limit < fc->red_thmax) + limit = fc->red_thmax; + qlimit(rqp->rq_q) = limit; + fc->red_limit = limit; /* write back the new value */ + + red_destroy(rqp->rq_red); + rqp->rq_red = new; + + splx(s); + + /* write back new values */ + fc->red_limit = limit; + fc->red_inv_pmax = rqp->rq_red->red_inv_pmax; + fc->red_thmin = rqp->rq_red->red_thmin; + fc->red_thmax = rqp->rq_red->red_thmax; + + } while (/*CONSTCOND*/ 0); + break; + + case RED_SETDEFAULTS: + do { + struct redparams *rp; + + rp = (struct redparams *)addr; + + default_th_min = rp->th_min; + default_th_max = rp->th_max; + default_inv_pmax = rp->inv_pmax; + } while (/*CONSTCOND*/ 0); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int +red_detach(rqp) + red_queue_t *rqp; +{ + red_queue_t *tmp; + int error = 0; + + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + altq_disable(rqp->rq_ifq); + + if ((error = altq_detach(rqp->rq_ifq))) + return (error); + + if (red_list == rqp) + red_list = rqp->rq_next; + else { + for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next) + if (tmp->rq_next == rqp) { + tmp->rq_next = rqp->rq_next; + break; + } + if (tmp == NULL) + printf("red_detach: no state 
found in red_list!\n"); + } + + red_destroy(rqp->rq_red); + free(rqp->rq_q, M_DEVBUF); + free(rqp, M_DEVBUF); + return (error); +} + +/* + * enqueue routine: + * + * returns: 0 when successfully queued. + * ENOBUFS when drop occurs. + */ +static int +red_enqueue(ifq, m, pktattr) + struct ifaltq *ifq; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0) + return ENOBUFS; + ifq->ifq_len++; + return 0; +} + +/* + * dequeue routine: + * must be called in splimp. + * + * returns: mbuf dequeued. + * NULL when no packet is available in the queue. + */ + +static struct mbuf * +red_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; + struct mbuf *m; + + IFQ_LOCK_ASSERT(ifq); + + if (op == ALTDQ_POLL) + return qhead(rqp->rq_q); + + /* op == ALTDQ_REMOVE */ + m = red_getq(rqp->rq_red, rqp->rq_q); + if (m != NULL) + ifq->ifq_len--; + return (m); +} + +static int +red_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + switch (req) { + case ALTRQ_PURGE: + red_purgeq(rqp); + break; + } + return (0); +} + +static void +red_purgeq(rqp) + red_queue_t *rqp; +{ + _flushq(rqp->rq_q); + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + rqp->rq_ifq->ifq_len = 0; +} + +#ifdef ALTQ_FLOWVALVE + +#define FV_PSHIFT 7 /* weight of average drop rate -- 1/128 */ +#define FV_PSCALE(x) ((x) << FV_PSHIFT) +#define FV_PUNSCALE(x) ((x) >> FV_PSHIFT) +#define FV_FSHIFT 5 /* weight of average fraction -- 1/32 */ +#define FV_FSCALE(x) ((x) << FV_FSHIFT) +#define FV_FUNSCALE(x) ((x) >> FV_FSHIFT) + +#define FV_TIMER (3 * hz) /* timer value for garbage collector */ +#define FV_FLOWLISTSIZE 64 /* how many flows in flowlist */ + +#define FV_N 10 /* update fve_f every FV_N packets */ + +#define FV_BACKOFFTHRESH 1 /* backoff threshold interval in second */ +#define FV_TTHRESH 3 /* time threshold to delete fve */ +#define FV_ALPHA 5 /* extra packet count */ + +#define FV_STATS + +#if (__FreeBSD_version > 300000) +#define FV_TIMESTAMP(tp) getmicrotime(tp) +#else +#define FV_TIMESTAMP(tp) { (*(tp)) = time; } +#endif + +/* + * Brtt table: 127 entry table to convert drop rate (p) to + * the corresponding bandwidth fraction (f) + * the following equation is implemented to use scaled values, + * fve_p and fve_f, in the fixed point format. 
+ * + * Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p)) + * f = Brtt(p) / (max_th + alpha) + */ +#define BRTT_SIZE 128 +#define BRTT_SHIFT 12 +#define BRTT_MASK 0x0007f000 +#define BRTT_PMAX (1 << (FV_PSHIFT + FP_SHIFT)) + +const int brtt_tab[BRTT_SIZE] = { + 0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728, + 392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361, + 225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333, + 145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612, + 98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957, + 67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440, + 47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184, + 33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611, + 24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062, + 18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487, + 14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222, + 10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844, + 8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079, + 6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746, + 5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722, + 4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924 +}; + +static __inline struct fve * +flowlist_lookup(fv, pktattr, now) + struct flowvalve *fv; + struct altq_pktattr *pktattr; + struct timeval *now; +{ + struct fve *fve; + int flows; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif + struct timeval tthresh; + + if (pktattr == NULL) + return (NULL); + + tthresh.tv_sec = now->tv_sec - FV_TTHRESH; + flows = 0; + /* + * search the flow list + */ + switch (pktattr->pattr_af) { + case AF_INET: + ip = (struct ip *)pktattr->pattr_hdr; + TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){ + if (fve->fve_lastdrop.tv_sec == 0) + break; + if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) { + fve->fve_lastdrop.tv_sec = 0; + break; + } + if (fve->fve_flow.flow_af == AF_INET && + fve->fve_flow.flow_ip.ip_src.s_addr == + ip->ip_src.s_addr && + fve->fve_flow.flow_ip.ip_dst.s_addr == + ip->ip_dst.s_addr) + return (fve); + flows++; + } + break; +#ifdef INET6 + case AF_INET6: + ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){ + if (fve->fve_lastdrop.tv_sec == 0) + break; + if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) { + fve->fve_lastdrop.tv_sec = 0; + break; + } + if (fve->fve_flow.flow_af == AF_INET6 && + IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src, + &ip6->ip6_src) && + IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst, + &ip6->ip6_dst)) + return (fve); + flows++; + } + break; +#endif /* INET6 */ + + default: + /* unknown protocol. no drop. */ + return (NULL); + } + fv->fv_flows = flows; /* save the number of active fve's */ + return (NULL); +} + +static __inline struct fve * +flowlist_reclaim(fv, pktattr) + struct flowvalve *fv; + struct altq_pktattr *pktattr; +{ + struct fve *fve; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif + + /* + * get an entry from the tail of the LRU list. 
+ */ + fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead); + + switch (pktattr->pattr_af) { + case AF_INET: + ip = (struct ip *)pktattr->pattr_hdr; + fve->fve_flow.flow_af = AF_INET; + fve->fve_flow.flow_ip.ip_src = ip->ip_src; + fve->fve_flow.flow_ip.ip_dst = ip->ip_dst; + break; +#ifdef INET6 + case AF_INET6: + ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + fve->fve_flow.flow_af = AF_INET6; + fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src; + fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst; + break; +#endif + } + + fve->fve_state = Green; + fve->fve_p = 0.0; + fve->fve_f = 0.0; + fve->fve_ifseq = fv->fv_ifseq - 1; + fve->fve_count = 0; + + fv->fv_flows++; +#ifdef FV_STATS + fv->fv_stats.alloc++; +#endif + return (fve); +} + +static __inline void +flowlist_move_to_head(fv, fve) + struct flowvalve *fv; + struct fve *fve; +{ + if (TAILQ_FIRST(&fv->fv_flowlist) != fve) { + TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru); + TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru); + } +} + +#if 0 /* XXX: make the compiler happy (fv_alloc unused) */ +/* + * allocate flowvalve structure + */ +static struct flowvalve * +fv_alloc(rp) + struct red *rp; +{ + struct flowvalve *fv; + struct fve *fve; + int i, num; + + num = FV_FLOWLISTSIZE; + fv = malloc(sizeof(struct flowvalve), + M_DEVBUF, M_WAITOK); + if (fv == NULL) + return (NULL); + bzero(fv, sizeof(struct flowvalve)); + + fv->fv_fves = malloc(sizeof(struct fve) * num, + M_DEVBUF, M_WAITOK); + if (fv->fv_fves == NULL) { + free(fv, M_DEVBUF); + return (NULL); + } + bzero(fv->fv_fves, sizeof(struct fve) * num); + + fv->fv_flows = 0; + TAILQ_INIT(&fv->fv_flowlist); + for (i = 0; i < num; i++) { + fve = &fv->fv_fves[i]; + fve->fve_lastdrop.tv_sec = 0; + TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru); + } + + /* initialize drop rate threshold in scaled fixed-point */ + fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax; + + /* initialize drop rate to fraction table */ + fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, + M_DEVBUF, M_WAITOK); + if (fv->fv_p2ftab == NULL) { + free(fv->fv_fves, M_DEVBUF); + free(fv, M_DEVBUF); + return (NULL); + } + /* + * create the p2f table. + * (shift is used to keep the precision) + */ + for (i = 1; i < BRTT_SIZE; i++) { + int f; + + f = brtt_tab[i] << 8; + fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8; + } + + return (fv); +} +#endif + +static void fv_destroy(fv) + struct flowvalve *fv; +{ + free(fv->fv_p2ftab, M_DEVBUF); + free(fv->fv_fves, M_DEVBUF); + free(fv, M_DEVBUF); +} + +static __inline int +fv_p2f(fv, p) + struct flowvalve *fv; + int p; +{ + int val, f; + + if (p >= BRTT_PMAX) + f = fv->fv_p2ftab[BRTT_SIZE-1]; + else if ((val = (p & BRTT_MASK))) + f = fv->fv_p2ftab[(val >> BRTT_SHIFT)]; + else + f = fv->fv_p2ftab[1]; + return (f); +} + +/* + * check if an arriving packet should be pre-dropped. + * called from red_addq() when a packet arrives. + * returns 1 when the packet should be pre-dropped. + * should be called in splimp. 
+ */ +static int +fv_checkflow(fv, pktattr, fcache) + struct flowvalve *fv; + struct altq_pktattr *pktattr; + struct fve **fcache; +{ + struct fve *fve; + struct timeval now; + + fv->fv_ifseq++; + FV_TIMESTAMP(&now); + + if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL) + /* no matching entry in the flowlist */ + return (0); + + *fcache = fve; + + /* update fraction f for every FV_N packets */ + if (++fve->fve_count == FV_N) { + /* + * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f + */ + fve->fve_f = + (FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq) + + fve->fve_f - FV_FUNSCALE(fve->fve_f); + fve->fve_ifseq = fv->fv_ifseq; + fve->fve_count = 0; + } + + /* + * overpumping test + */ + if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) { + int fthresh; + + /* calculate a threshold */ + fthresh = fv_p2f(fv, fve->fve_p); + if (fve->fve_f > fthresh) + fve->fve_state = Red; + } + + if (fve->fve_state == Red) { + /* + * backoff test + */ + if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) { + /* no drop for at least FV_BACKOFFTHRESH sec */ + fve->fve_p = 0; + fve->fve_state = Green; +#ifdef FV_STATS + fv->fv_stats.escape++; +#endif + } else { + /* block this flow */ + flowlist_move_to_head(fv, fve); + fve->fve_lastdrop = now; +#ifdef FV_STATS + fv->fv_stats.predrop++; +#endif + return (1); + } + } + + /* + * p = (1 - Wp) * p + */ + fve->fve_p -= FV_PUNSCALE(fve->fve_p); + if (fve->fve_p < 0) + fve->fve_p = 0; +#ifdef FV_STATS + fv->fv_stats.pass++; +#endif + return (0); +} + +/* + * called from red_addq when a packet is dropped by red. + * should be called in splimp. + */ +static void fv_dropbyred(fv, pktattr, fcache) + struct flowvalve *fv; + struct altq_pktattr *pktattr; + struct fve *fcache; +{ + struct fve *fve; + struct timeval now; + + if (pktattr == NULL) + return; + FV_TIMESTAMP(&now); + + if (fcache != NULL) + /* the fve of this packet is already cached */ + fve = fcache; + else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL) + fve = flowlist_reclaim(fv, pktattr); + + flowlist_move_to_head(fv, fve); + + /* + * update p: the following line cancels the update + * in fv_checkflow() and calculate + * p = Wp + (1 - Wp) * p + */ + fve->fve_p = (1 << FP_SHIFT) + fve->fve_p; + + fve->fve_lastdrop = now; +} + +#endif /* ALTQ_FLOWVALVE */ + +#ifdef KLD_MODULE + +static struct altqsw red_sw = + {"red", redopen, redclose, redioctl}; + +ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw); +MODULE_VERSION(altq_red, 1); + +#endif /* KLD_MODULE */ +#endif /* ALTQ3_COMPAT */ + +#endif /* ALTQ_RED */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_red.h b/contrib/altq/rtems/freebsd/altq/altq_red.h new file mode 100644 index 00000000..0876464c --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_red.h @@ -0,0 +1,198 @@ +/* $KAME: altq_red.h,v 1.8 2003/07/10 12:07:49 kjc Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ALTQ_ALTQ_RED_HH_ +#define _ALTQ_ALTQ_RED_HH_ + +#include + +#ifdef ALTQ3_COMPAT +struct red_interface { + char red_ifname[IFNAMSIZ]; +}; + +struct red_stats { + struct red_interface iface; + int q_len; + int q_avg; + + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; + + /* static red parameters */ + int q_limit; + int weight; + int inv_pmax; + int th_min; + int th_max; + + /* flowvalve related stuff */ + u_int fv_flows; + u_int fv_pass; + u_int fv_predrop; + u_int fv_alloc; + u_int fv_escape; +}; + +struct red_conf { + struct red_interface iface; + int red_weight; /* weight for EWMA */ + int red_inv_pmax; /* inverse of max drop probability */ + int red_thmin; /* red min threshold */ + int red_thmax; /* red max threshold */ + int red_limit; /* max queue length */ + int red_pkttime; /* average packet time in usec */ + int red_flags; /* see below */ +}; +#endif /* ALTQ3_COMPAT */ + +/* red flags */ +#define REDF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define REDF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define REDF_ECN (REDF_ECN4 | REDF_ECN6) +#define REDF_FLOWVALVE 0x04 /* use flowvalve (aka penalty-box) */ + +/* + * simpler versions of red parameters and statistics used by other + * disciplines (e.g., CBQ) + */ +struct redparams { + int th_min; /* red min threshold */ + int th_max; /* red max threshold */ + int inv_pmax; /* inverse of max drop probability */ +}; + +struct redstats { + int q_avg; + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; +}; + +#ifdef ALTQ3_COMPAT +/* + * IOCTLs for RED + */ +#define RED_IF_ATTACH _IOW('Q', 1, struct red_interface) +#define RED_IF_DETACH _IOW('Q', 2, struct red_interface) +#define RED_ENABLE _IOW('Q', 3, struct red_interface) +#define RED_DISABLE _IOW('Q', 4, struct red_interface) +#define RED_CONFIG _IOWR('Q', 6, struct red_conf) +#define RED_GETSTATS _IOWR('Q', 12, struct red_stats) +#define RED_SETDEFAULTS _IOW('Q', 30, struct redparams) +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL + +#ifdef ALTQ3_COMPAT +struct flowvalve; +#endif + +/* weight table structure for idle time calibration */ +struct wtab { + struct wtab *w_next; + int w_weight; + int w_param_max; + int w_refcount; + int32_t w_tab[32]; +}; + +typedef struct red { + int red_pkttime; /* average packet time in micro sec + used for idle calibration */ + int red_flags; /* red flags */ + + /* red parameters */ + int red_weight; /* weight for EWMA */ + int red_inv_pmax; /* inverse of max drop probability */ + int red_thmin; /* red min threshold */ + int red_thmax; /* red max threshold */ + + /* variables for internal use */ + int red_wshift; 
/* log(red_weight) */ + int red_thmin_s; /* th_min scaled by avgshift */ + int red_thmax_s; /* th_max scaled by avgshift */ + int red_probd; /* drop probability denominator */ + + int red_avg; /* queue len avg scaled by avgshift */ + int red_count; /* packet count since last dropped/ + marked packet */ + int red_idle; /* queue was empty */ + int red_old; /* avg is above th_min */ + struct wtab *red_wtab; /* weight table */ + struct timeval red_last; /* time when the queue becomes idle */ + +#ifdef ALTQ3_COMPAT + struct flowvalve *red_flowvalve; /* flowvalve state */ +#endif + + struct { + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; + } red_stats; +} red_t; + +#ifdef ALTQ3_COMPAT +typedef struct red_queue { + struct red_queue *rq_next; /* next red_state in the list */ + struct ifaltq *rq_ifq; /* backpointer to ifaltq */ + + class_queue_t *rq_q; + + red_t *rq_red; +} red_queue_t; +#endif /* ALTQ3_COMPAT */ + +/* red drop types */ +#define DTYPE_NODROP 0 /* no drop */ +#define DTYPE_FORCED 1 /* a "forced" drop */ +#define DTYPE_EARLY 2 /* an "unforced" (early) drop */ + +extern red_t *red_alloc(int, int, int, int, int, int); +extern void red_destroy(red_t *); +extern void red_getstats(red_t *, struct redstats *); +extern int red_addq(red_t *, class_queue_t *, struct mbuf *, + struct altq_pktattr *); +extern struct mbuf *red_getq(red_t *, class_queue_t *); +extern int drop_early(int, int, int); +extern int mark_ecn(struct mbuf *, struct altq_pktattr *, int); +extern struct wtab *wtab_alloc(int); +extern int wtab_destroy(struct wtab *); +extern int32_t pow_w(struct wtab *, int); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_RED_HH_ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_rio.c b/contrib/altq/rtems/freebsd/altq/altq_rio.c new file mode 100644 index 00000000..5055a2e0 --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_rio.c @@ -0,0 +1,855 @@ +#include + +/* $FreeBSD$ */ +/* $KAME: altq_rio.c,v 1.17 2003/07/10 12:07:49 kjc Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include +#if (__FreeBSD__ != 2) +#include +#ifdef __FreeBSD__ +#include +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_RIO /* rio is enabled by ALTQ_RIO option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#if 1 /* ALTQ3_COMPAT */ +#include +#include +#include +#endif + +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif + +#include +#include +#include +#include +#include +#ifdef ALTQ3_COMPAT +#include +#endif + +/* + * RIO: RED with IN/OUT bit + * described in + * "Explicit Allocation of Best Effort Packet Delivery Service" + * David D. Clark and Wenjia Fang, MIT Lab for Computer Science + * http://diffserv.lcs.mit.edu/Papers/exp-alloc-ddc-wf.{ps,pdf} + * + * this implementation is extended to support more than 2 drop precedence + * values as described in RFC2597 (Assured Forwarding PHB Group). + * + */ +/* + * AF DS (differentiated service) codepoints. + * (classes can be mapped to CBQ or H-FSC classes.) 
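+ * RFC2597 defines four AF classes, each with three drop precedences; + * the codepoint layout and the resulting bit patterns are: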
+ * + * 0 1 2 3 4 5 6 7 + * +---+---+---+---+---+---+---+---+ + * | CLASS |DropPre| 0 | CU | + * +---+---+---+---+---+---+---+---+ + * + * class 1: 001 + * class 2: 010 + * class 3: 011 + * class 4: 100 + * + * low drop prec: 01 + * medium drop prec: 10 + * high drop prec: 11 + */ + +/* normal red parameters */ +#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ + /* q_weight = 0.00195 */ + +/* red parameters for a slow link */ +#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ + /* q_weight = 0.0078125 */ + +/* red parameters for a very slow link (e.g., dialup) */ +#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ + /* q_weight = 0.015625 */ + +/* fixed-point uses 12 fractional bits */ +#define FP_SHIFT 12 /* fixed-point shift */ + +/* red parameters for drop probability */ +#define INV_P_MAX 10 /* inverse of max drop probability */ +#define TH_MIN 5 /* min threshold */ +#define TH_MAX 15 /* max threshold */ + +#define RIO_LIMIT 60 /* default max queue length */ +#define RIO_STATS /* collect statistics */ + +#define TV_DELTA(a, b, delta) { \ + register int xxs; \ + \ + delta = (a)->tv_usec - (b)->tv_usec; \ + if ((xxs = (a)->tv_sec - (b)->tv_sec) != 0) { \ + if (xxs < 0) { \ + delta = 60000000; \ + } else if (xxs > 4) { \ + if (xxs > 60) \ + delta = 60000000; \ + else \ + delta += xxs * 1000000; \ + } else while (xxs > 0) { \ + delta += 1000000; \ + xxs--; \ + } \ + } \ +} + +#ifdef ALTQ3_COMPAT +/* rio_list keeps all rio_queue_t's allocated. */ +static rio_queue_t *rio_list = NULL; +#endif +/* default rio parameter values */ +static struct redparams default_rio_params[RIO_NDROPPREC] = { + /* th_min, th_max, inv_pmax */ + { TH_MAX * 2 + TH_MIN, TH_MAX * 3, INV_P_MAX }, /* low drop precedence */ + { TH_MAX + TH_MIN, TH_MAX * 2, INV_P_MAX }, /* medium drop precedence */ + { TH_MIN, TH_MAX, INV_P_MAX } /* high drop precedence */ +}; + +/* internal function prototypes */ +static int dscp2index(u_int8_t); +#ifdef ALTQ3_COMPAT +static int rio_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +static struct mbuf *rio_dequeue(struct ifaltq *, int); +static int rio_request(struct ifaltq *, int, void *); +static int rio_detach(rio_queue_t *); + +/* + * rio device interface + */ +altqdev_decl(rio); + +#endif /* ALTQ3_COMPAT */ + +rio_t * +rio_alloc(int weight, struct redparams *params, int flags, int pkttime) +{ + rio_t *rp; + int w, i; + int npkts_per_sec; + + rp = malloc(sizeof(rio_t), M_DEVBUF, M_WAITOK); + if (rp == NULL) + return (NULL); + bzero(rp, sizeof(rio_t)); + + rp->rio_flags = flags; + if (pkttime == 0) + /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ + rp->rio_pkttime = 800; + else + rp->rio_pkttime = pkttime; + + if (weight != 0) + rp->rio_weight = weight; + else { + /* use default */ + rp->rio_weight = W_WEIGHT; + + /* when the link is very slow, adjust red parameters */ + npkts_per_sec = 1000000 / rp->rio_pkttime; + if (npkts_per_sec < 50) { + /* up to about 400Kbps */ + rp->rio_weight = W_WEIGHT_2; + } else if (npkts_per_sec < 300) { + /* up to about 2.4Mbps */ + rp->rio_weight = W_WEIGHT_1; + } + } + + /* calculate wshift. weight must be a power of 2 */ + w = rp->rio_weight; + for (i = 0; w > 1; i++) + w = w >> 1; + rp->rio_wshift = i; + w = 1 << rp->rio_wshift; + if (w != rp->rio_weight) { + printf("invalid weight value %d for red!
use %d\n", + rp->rio_weight, w); + rp->rio_weight = w; + } + + /* allocate weight table */ + rp->rio_wtab = wtab_alloc(rp->rio_weight); + + for (i = 0; i < RIO_NDROPPREC; i++) { + struct dropprec_state *prec = &rp->rio_precstate[i]; + + prec->avg = 0; + prec->idle = 1; + + if (params == NULL || params[i].inv_pmax == 0) + prec->inv_pmax = default_rio_params[i].inv_pmax; + else + prec->inv_pmax = params[i].inv_pmax; + if (params == NULL || params[i].th_min == 0) + prec->th_min = default_rio_params[i].th_min; + else + prec->th_min = params[i].th_min; + if (params == NULL || params[i].th_max == 0) + prec->th_max = default_rio_params[i].th_max; + else + prec->th_max = params[i].th_max; + + /* + * th_min_s and th_max_s are scaled versions of th_min + * and th_max to be compared with avg. + */ + prec->th_min_s = prec->th_min << (rp->rio_wshift + FP_SHIFT); + prec->th_max_s = prec->th_max << (rp->rio_wshift + FP_SHIFT); + + /* + * precompute probability denominator + * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point + */ + prec->probd = (2 * (prec->th_max - prec->th_min) + * prec->inv_pmax) << FP_SHIFT; + + microtime(&prec->last); + } + + return (rp); +} + +void +rio_destroy(rio_t *rp) +{ + wtab_destroy(rp->rio_wtab); + free(rp, M_DEVBUF); +} + +void +rio_getstats(rio_t *rp, struct redstats *sp) +{ + int i; + + for (i = 0; i < RIO_NDROPPREC; i++) { + bcopy(&rp->q_stats[i], sp, sizeof(struct redstats)); + sp->q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift; + sp++; + } +} + +#if (RIO_NDROPPREC == 3) +/* + * internally, a drop precedence value is converted to an index + * starting from 0. + */ +static int +dscp2index(u_int8_t dscp) +{ + int dpindex = dscp & AF_DROPPRECMASK; + + if (dpindex == 0) + return (0); + return ((dpindex >> 3) - 1); +} +#endif + +#if 1 +/* + * kludge: when a packet is dequeued, we need to know its drop precedence + * in order to keep the queue length of each drop precedence. + * use m_pkthdr.rcvif to pass this info. + */ +#define RIOM_SET_PRECINDEX(m, idx) \ + do { (m)->m_pkthdr.rcvif = (void *)((long)(idx)); } while (0) +#define RIOM_GET_PRECINDEX(m) \ + ({ long idx; idx = (long)((m)->m_pkthdr.rcvif); \ + (m)->m_pkthdr.rcvif = NULL; idx; }) +#endif + +int +rio_addq(rio_t *rp, class_queue_t *q, struct mbuf *m, + struct altq_pktattr *pktattr) +{ + int avg, droptype; + u_int8_t dsfield, odsfield; + int dpindex, i, n, t; + struct timeval now; + struct dropprec_state *prec; + + dsfield = odsfield = read_dsfield(m, pktattr); + dpindex = dscp2index(dsfield); + + /* + * update avg of the precedence states whose drop precedence + * is larger than or equal to the drop precedence of the packet + */ + now.tv_sec = 0; + for (i = dpindex; i < RIO_NDROPPREC; i++) { + prec = &rp->rio_precstate[i]; + avg = prec->avg; + if (prec->idle) { + prec->idle = 0; + if (now.tv_sec == 0) + microtime(&now); + t = (now.tv_sec - prec->last.tv_sec); + if (t > 60) + avg = 0; + else { + t = t * 1000000 + + (now.tv_usec - prec->last.tv_usec); + n = t / rp->rio_pkttime; + /* calculate (avg = (1 - Wq)^n * avg) */ + if (n > 0) + avg = (avg >> FP_SHIFT) * + pow_w(rp->rio_wtab, n); + } + } + + /* run estimator. (avg is scaled by WEIGHT in fixed-point) */ + avg += (prec->qlen << FP_SHIFT) - (avg >> rp->rio_wshift); + prec->avg = avg; /* save the new value */ + /* + * count keeps a tally of arriving traffic that has not + * been dropped. 
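+ * (a sketch of how count is used, following altq_red.c's
+ * drop_early(): with x = (avg - th_min_s) >> rio_wshift, an
+ * arriving packet is dropped with probability roughly
+ * x / (probd - count * x), so the drop probability rises as the
+ * run of accepted packets grows, spreading early drops out in
+ * time instead of clustering them.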
+ */ + prec->count++; + } + + prec = &rp->rio_precstate[dpindex]; + avg = prec->avg; + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (avg >= prec->th_min_s && prec->qlen > 1) { + if (avg >= prec->th_max_s) { + /* avg >= th_max: forced drop */ + droptype = DTYPE_FORCED; + } else if (prec->old == 0) { + /* first exceeds th_min */ + prec->count = 1; + prec->old = 1; + } else if (drop_early((avg - prec->th_min_s) >> rp->rio_wshift, + prec->probd, prec->count)) { + /* unforced drop by red */ + droptype = DTYPE_EARLY; + } + } else { + /* avg < th_min */ + prec->old = 0; + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + + if (droptype != DTYPE_NODROP) { + /* always drop incoming packet (as opposed to randomdrop) */ + for (i = dpindex; i < RIO_NDROPPREC; i++) + rp->rio_precstate[i].count = 0; +#ifdef RIO_STATS + if (droptype == DTYPE_EARLY) + rp->q_stats[dpindex].drop_unforced++; + else + rp->q_stats[dpindex].drop_forced++; + PKTCNTR_ADD(&rp->q_stats[dpindex].drop_cnt, m_pktlen(m)); +#endif + m_freem(m); + return (-1); + } + + for (i = dpindex; i < RIO_NDROPPREC; i++) + rp->rio_precstate[i].qlen++; + + /* save drop precedence index in mbuf hdr */ + RIOM_SET_PRECINDEX(m, dpindex); + + if (rp->rio_flags & RIOF_CLEARDSCP) + dsfield &= ~DSCP_MASK; + + if (dsfield != odsfield) + write_dsfield(m, pktattr, dsfield); + + _addq(q, m); + +#ifdef RIO_STATS + PKTCNTR_ADD(&rp->q_stats[dpindex].xmit_cnt, m_pktlen(m)); +#endif + return (0); +} + +struct mbuf * +rio_getq(rio_t *rp, class_queue_t *q) +{ + struct mbuf *m; + int dpindex, i; + + if ((m = _getq(q)) == NULL) + return NULL; + + dpindex = RIOM_GET_PRECINDEX(m); + for (i = dpindex; i < RIO_NDROPPREC; i++) { + if (--rp->rio_precstate[i].qlen == 0) { + if (rp->rio_precstate[i].idle == 0) { + rp->rio_precstate[i].idle = 1; + microtime(&rp->rio_precstate[i].last); + } + } + } + return (m); +} + +#ifdef ALTQ3_COMPAT +int +rioopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + /* everything will be done when the queueing scheme is attached. 
*/
+ return 0;
+}
+
+int
+rioclose(dev, flag, fmt, p)
+ dev_t dev;
+ int flag, fmt;
+#if (__FreeBSD_version > 500000)
+ struct thread *p;
+#else
+ struct proc *p;
+#endif
+{
+ rio_queue_t *rqp;
+ int err, error = 0;
+
+ while ((rqp = rio_list) != NULL) {
+ /* destroy all */
+ err = rio_detach(rqp);
+ if (err != 0 && error == 0)
+ error = err;
+ }
+
+ return error;
+}
+
+int
+rioioctl(dev, cmd, addr, flag, p)
+ dev_t dev;
+ ioctlcmd_t cmd;
+ caddr_t addr;
+ int flag;
+#if (__FreeBSD_version > 500000)
+ struct thread *p;
+#else
+ struct proc *p;
+#endif
+{
+ rio_queue_t *rqp;
+ struct rio_interface *ifacep;
+ struct ifnet *ifp;
+ int error = 0;
+
+ /* check super-user privilege */
+ switch (cmd) {
+ case RIO_GETSTATS:
+ break;
+ default:
+#if (__FreeBSD_version > 700000)
+ if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0)
+ return (error);
+#elif (__FreeBSD_version > 400000)
+ if ((error = suser(p)) != 0)
+ return (error);
+#else
+ if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
+ return (error);
+#endif
+ break;
+ }
+
+ switch (cmd) {
+
+ case RIO_ENABLE:
+ ifacep = (struct rio_interface *)addr;
+ if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) {
+ error = EBADF;
+ break;
+ }
+ error = altq_enable(rqp->rq_ifq);
+ break;
+
+ case RIO_DISABLE:
+ ifacep = (struct rio_interface *)addr;
+ if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) {
+ error = EBADF;
+ break;
+ }
+ error = altq_disable(rqp->rq_ifq);
+ break;
+
+ case RIO_IF_ATTACH:
+ ifp = ifunit(((struct rio_interface *)addr)->rio_ifname);
+ if (ifp == NULL) {
+ error = ENXIO;
+ break;
+ }
+
+ /* allocate and initialize rio_queue_t */
+ rqp = malloc(sizeof(rio_queue_t), M_DEVBUF, M_WAITOK);
+ if (rqp == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ bzero(rqp, sizeof(rio_queue_t));
+
+ rqp->rq_q = malloc(sizeof(class_queue_t),
+ M_DEVBUF, M_WAITOK);
+ if (rqp->rq_q == NULL) {
+ free(rqp, M_DEVBUF);
+ error = ENOMEM;
+ break;
+ }
+ bzero(rqp->rq_q, sizeof(class_queue_t));
+
+ rqp->rq_rio = rio_alloc(0, NULL, 0, 0);
+ if (rqp->rq_rio == NULL) {
+ free(rqp->rq_q, M_DEVBUF);
+ free(rqp, M_DEVBUF);
+ error = ENOMEM;
+ break;
+ }
+
+ rqp->rq_ifq = &ifp->if_snd;
+ qtail(rqp->rq_q) = NULL;
+ qlen(rqp->rq_q) = 0;
+ qlimit(rqp->rq_q) = RIO_LIMIT;
+ qtype(rqp->rq_q) = Q_RIO;
+
+ /*
+ * set RIO to this ifnet structure.
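+ *
+ * A hypothetical altq3-style caller (an illustrative sketch; the
+ * device path and interface name here are assumptions, not part of
+ * this file) reaches this case roughly as follows:
+ *
+ *	struct rio_interface iface;
+ *	int fd = open("/dev/altq/rio", O_RDWR);
+ *	strncpy(iface.rio_ifname, "fxp0", IFNAMSIZ);
+ *	if (ioctl(fd, RIO_IF_ATTACH, &iface) == 0)
+ *		(void)ioctl(fd, RIO_ENABLE, &iface);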
+ */ + error = altq_attach(rqp->rq_ifq, ALTQT_RIO, rqp, + rio_enqueue, rio_dequeue, rio_request, + NULL, NULL); + if (error) { + rio_destroy(rqp->rq_rio); + free(rqp->rq_q, M_DEVBUF); + free(rqp, M_DEVBUF); + break; + } + + /* add this state to the rio list */ + rqp->rq_next = rio_list; + rio_list = rqp; + break; + + case RIO_IF_DETACH: + ifacep = (struct rio_interface *)addr; + if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + error = rio_detach(rqp); + break; + + case RIO_GETSTATS: + do { + struct rio_stats *q_stats; + rio_t *rp; + int i; + + q_stats = (struct rio_stats *)addr; + if ((rqp = altq_lookup(q_stats->iface.rio_ifname, + ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + + rp = rqp->rq_rio; + + q_stats->q_limit = qlimit(rqp->rq_q); + q_stats->weight = rp->rio_weight; + q_stats->flags = rp->rio_flags; + + for (i = 0; i < RIO_NDROPPREC; i++) { + q_stats->q_len[i] = rp->rio_precstate[i].qlen; + bcopy(&rp->q_stats[i], &q_stats->q_stats[i], + sizeof(struct redstats)); + q_stats->q_stats[i].q_avg = + rp->rio_precstate[i].avg >> rp->rio_wshift; + + q_stats->q_params[i].inv_pmax + = rp->rio_precstate[i].inv_pmax; + q_stats->q_params[i].th_min + = rp->rio_precstate[i].th_min; + q_stats->q_params[i].th_max + = rp->rio_precstate[i].th_max; + } + } while (/*CONSTCOND*/ 0); + break; + + case RIO_CONFIG: + do { + struct rio_conf *fc; + rio_t *new; + int s, limit, i; + + fc = (struct rio_conf *)addr; + if ((rqp = altq_lookup(fc->iface.rio_ifname, + ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + + new = rio_alloc(fc->rio_weight, &fc->q_params[0], + fc->rio_flags, fc->rio_pkttime); + if (new == NULL) { + error = ENOMEM; + break; + } + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + _flushq(rqp->rq_q); + limit = fc->rio_limit; + if (limit < fc->q_params[RIO_NDROPPREC-1].th_max) + limit = fc->q_params[RIO_NDROPPREC-1].th_max; + qlimit(rqp->rq_q) = limit; + + rio_destroy(rqp->rq_rio); + rqp->rq_rio = new; + + splx(s); + + /* write back new values */ + fc->rio_limit = limit; + for (i = 0; i < RIO_NDROPPREC; i++) { + fc->q_params[i].inv_pmax = + rqp->rq_rio->rio_precstate[i].inv_pmax; + fc->q_params[i].th_min = + rqp->rq_rio->rio_precstate[i].th_min; + fc->q_params[i].th_max = + rqp->rq_rio->rio_precstate[i].th_max; + } + } while (/*CONSTCOND*/ 0); + break; + + case RIO_SETDEFAULTS: + do { + struct redparams *rp; + int i; + + rp = (struct redparams *)addr; + for (i = 0; i < RIO_NDROPPREC; i++) + default_rio_params[i] = rp[i]; + } while (/*CONSTCOND*/ 0); + break; + + default: + error = EINVAL; + break; + } + + return error; +} + +static int +rio_detach(rqp) + rio_queue_t *rqp; +{ + rio_queue_t *tmp; + int error = 0; + + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + altq_disable(rqp->rq_ifq); + + if ((error = altq_detach(rqp->rq_ifq))) + return (error); + + if (rio_list == rqp) + rio_list = rqp->rq_next; + else { + for (tmp = rio_list; tmp != NULL; tmp = tmp->rq_next) + if (tmp->rq_next == rqp) { + tmp->rq_next = rqp->rq_next; + break; + } + if (tmp == NULL) + printf("rio_detach: no state found in rio_list!\n"); + } + + rio_destroy(rqp->rq_rio); + free(rqp->rq_q, M_DEVBUF); + free(rqp, M_DEVBUF); + return (error); +} + +/* + * rio support routines + */ +static int +rio_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + switch (req) { + case ALTRQ_PURGE: + _flushq(rqp->rq_q); + if (ALTQ_IS_ENABLED(ifq)) + ifq->ifq_len = 0; + break; 
+ } + return (0); +} + +/* + * enqueue routine: + * + * returns: 0 when successfully queued. + * ENOBUFS when drop occurs. + */ +static int +rio_enqueue(ifq, m, pktattr) + struct ifaltq *ifq; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; + int error = 0; + + IFQ_LOCK_ASSERT(ifq); + + if (rio_addq(rqp->rq_rio, rqp->rq_q, m, pktattr) == 0) + ifq->ifq_len++; + else + error = ENOBUFS; + return error; +} + +/* + * dequeue routine: + * must be called in splimp. + * + * returns: mbuf dequeued. + * NULL when no packet is available in the queue. + */ + +static struct mbuf * +rio_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; + struct mbuf *m = NULL; + + IFQ_LOCK_ASSERT(ifq); + + if (op == ALTDQ_POLL) + return qhead(rqp->rq_q); + + m = rio_getq(rqp->rq_rio, rqp->rq_q); + if (m != NULL) + ifq->ifq_len--; + return m; +} + +#ifdef KLD_MODULE + +static struct altqsw rio_sw = + {"rio", rioopen, rioclose, rioioctl}; + +ALTQ_MODULE(altq_rio, ALTQT_RIO, &rio_sw); +MODULE_VERSION(altq_rio, 1); +MODULE_DEPEND(altq_rio, altq_red, 1, 1, 1); + +#endif /* KLD_MODULE */ +#endif /* ALTQ3_COMPAT */ + +#endif /* ALTQ_RIO */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_rio.h b/contrib/altq/rtems/freebsd/altq/altq_rio.h new file mode 100644 index 00000000..b27951e2 --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_rio.h @@ -0,0 +1,144 @@ +/* $KAME: altq_rio.h,v 1.9 2003/07/10 12:07:49 kjc Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ALTQ_ALTQ_RIO_HH_ +#define _ALTQ_ALTQ_RIO_HH_ + +#include + +/* + * RIO: RED with IN/OUT bit + * (extended to support more than 2 drop precedence values) + */ +#define RIO_NDROPPREC 3 /* number of drop precedence values */ + +#ifdef ALTQ3_COMPAT +struct rio_interface { + char rio_ifname[IFNAMSIZ]; +}; + +struct rio_stats { + struct rio_interface iface; + int q_len[RIO_NDROPPREC]; + struct redstats q_stats[RIO_NDROPPREC]; + + /* static red parameters */ + int q_limit; + int weight; + int flags; + struct redparams q_params[RIO_NDROPPREC]; +}; + +struct rio_conf { + struct rio_interface iface; + struct redparams q_params[RIO_NDROPPREC]; + int rio_weight; /* weight for EWMA */ + int rio_limit; /* max queue length */ + int rio_pkttime; /* average packet time in usec */ + int rio_flags; /* see below */ +}; +#endif /* ALTQ3_COMPAT */ + +/* rio flags */ +#define RIOF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define RIOF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define RIOF_ECN (RIOF_ECN4 | RIOF_ECN6) +#define RIOF_CLEARDSCP 0x200 /* clear diffserv codepoint */ + +#ifdef ALTQ3_COMPAT +/* + * IOCTLs for RIO + */ +#define RIO_IF_ATTACH _IOW('Q', 1, struct rio_interface) +#define RIO_IF_DETACH _IOW('Q', 2, struct rio_interface) +#define RIO_ENABLE _IOW('Q', 3, struct rio_interface) +#define RIO_DISABLE _IOW('Q', 4, struct rio_interface) +#define RIO_CONFIG _IOWR('Q', 6, struct rio_conf) +#define RIO_GETSTATS _IOWR('Q', 12, struct rio_stats) +#define RIO_SETDEFAULTS _IOW('Q', 30, struct redparams[RIO_NDROPPREC]) +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL + +typedef struct rio { + /* per drop precedence structure */ + struct dropprec_state { + /* red parameters */ + int inv_pmax; /* inverse of max drop probability */ + int th_min; /* red min threshold */ + int th_max; /* red max threshold */ + + /* variables for internal use */ + int th_min_s; /* th_min scaled by avgshift */ + int th_max_s; /* th_max scaled by avgshift */ + int probd; /* drop probability denominator */ + + int qlen; /* queue length */ + int avg; /* (scaled) queue length average */ + int count; /* packet count since the last dropped/ + marked packet */ + int idle; /* queue was empty */ + int old; /* avg is above th_min */ + struct timeval last; /* timestamp when queue becomes idle */ + } rio_precstate[RIO_NDROPPREC]; + + int rio_wshift; /* log(red_weight) */ + int rio_weight; /* weight for EWMA */ + struct wtab *rio_wtab; /* weight table */ + + int rio_pkttime; /* average packet time in micro sec + used for idle calibration */ + int rio_flags; /* rio flags */ + + u_int8_t rio_codepoint; /* codepoint value to tag packets */ + u_int8_t rio_codepointmask; /* codepoint mask bits */ + + struct redstats q_stats[RIO_NDROPPREC]; /* statistics */ +} rio_t; + +#ifdef ALTQ3_COMPAT +typedef struct rio_queue { + struct rio_queue *rq_next; /* next red_state in the list */ + struct ifaltq *rq_ifq; /* backpointer to ifaltq */ + + class_queue_t *rq_q; + + rio_t *rq_rio; +} rio_queue_t; +#endif /* ALTQ3_COMPAT */ + +extern rio_t *rio_alloc(int, struct redparams *, int, int); +extern void rio_destroy(rio_t *); +extern void rio_getstats(rio_t *, struct redstats *); +extern int rio_addq(rio_t *, class_queue_t *, struct mbuf *, + struct altq_pktattr *); +extern struct mbuf *rio_getq(rio_t *, class_queue_t *); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_RIO_HH_ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_rmclass.c b/contrib/altq/rtems/freebsd/altq/altq_rmclass.c new file mode 100644 index 
00000000..027f2a2a --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_rmclass.c @@ -0,0 +1,1843 @@ +#include + +/* $FreeBSD$ */ +/* $KAME: altq_rmclass.c,v 1.18 2003/11/06 06:32:53 kjc Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * LBL code modified by speer@eng.sun.com, May 1977. + * For questions and/or comments, please send mail to cbq@ee.lbl.gov + */ + +#ident "@(#)rm_class.c 1.48 97/12/05 SMI" + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include +#if (__FreeBSD__ != 2) +#include +#ifdef __FreeBSD__ +#include +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#ifdef ALTQ3_COMPAT +#include +#endif + +#include +#ifdef ALTQ3_COMPAT +#include +#include +#include +#endif + +#include +#include +#include +#include +#include + +/* + * Local Macros + */ + +#define reset_cutoff(ifd) { ifd->cutoff_ = RM_MAXDEPTH; } + +/* + * Local routines. 
+ */
+
+static int rmc_satisfied(struct rm_class *, struct timeval *);
+static void rmc_wrr_set_weights(struct rm_ifdat *);
+static void rmc_depth_compute(struct rm_class *);
+static void rmc_depth_recompute(rm_class_t *);
+
+static mbuf_t *_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
+static mbuf_t *_rmc_prr_dequeue_next(struct rm_ifdat *, int);
+
+static int _rmc_addq(rm_class_t *, mbuf_t *);
+static void _rmc_dropq(rm_class_t *);
+static mbuf_t *_rmc_getq(rm_class_t *);
+static mbuf_t *_rmc_pollq(rm_class_t *);
+
+static int rmc_under_limit(struct rm_class *, struct timeval *);
+static void rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
+static void rmc_drop_action(struct rm_class *);
+static void rmc_restart(struct rm_class *);
+static void rmc_root_overlimit(struct rm_class *, struct rm_class *);
+
+#define BORROW_OFFTIME
+/*
+ * BORROW_OFFTIME (experimental):
+ * borrow the offtime of the class borrowing from.
+ * the reason is that when its own offtime is set, the class is unable
+ * to borrow much, especially when cutoff is taking effect.
+ * but when the borrowed class is overloaded (avgidle is close to minidle),
+ * use the borrowing class's offtime to avoid overload.
+ */
+#define ADJUST_CUTOFF
+/*
+ * ADJUST_CUTOFF (experimental):
+ * if no underlimit class is found due to cutoff, increase cutoff and
+ * retry the scheduling loop.
+ * also, don't invoke delay_actions while cutoff is taking effect,
+ * since a sleeping class won't have a chance to be scheduled in the
+ * next loop.
+ *
+ * now the heuristics for setting the top-level variable (cutoff_) become:
+ * 1. if a packet arrives for a not-overlimit class, set cutoff
+ * to the depth of the class.
+ * 2. if cutoff is i, and a packet arrives for an overlimit class
+ * with an underlimit ancestor at a lower level than i (say j),
+ * then set cutoff to j.
+ * 3. at scheduling a packet, if there is no underlimit class
+ * due to the current cutoff level, increase cutoff by 1 and
+ * then try to schedule again.
+ */
+
+/*
+ * rm_class_t *
+ * rmc_newclass(...) - Create a new resource management class at priority
+ * 'pri' on the interface given by 'ifd'.
+ *
+ * nsecPerByte is the data rate of the interface in nanoseconds/byte.
+ * E.g., 800 for a 10Mb/s ethernet. If the class gets less
+ * than 100% of the bandwidth, this number should be the
+ * 'effective' rate for the class. Let f be the
+ * bandwidth fraction allocated to this class, and let
+ * nsPerByte be the data rate of the output link in
+ * nanoseconds/byte. Then nsecPerByte is set to
+ * nsPerByte / f. E.g., 1600 (= 800 / .5)
+ * for a class that gets 50% of an ethernet's bandwidth.
+ *
+ * action the routine to call when the class is over limit.
+ *
+ * maxq max allowable queue size for class (in packets).
+ *
+ * parent parent class pointer.
+ *
+ * borrow class to borrow from (should be either 'parent' or null).
+ *
+ * maxidle max value allowed for class 'idle' time estimate (this
+ * parameter determines how large an initial burst of packets
+ * can be before overlimit action is invoked).
+ *
+ * offtime how long 'delay' action will delay when class goes over
+ * limit (this parameter determines the steady-state burst
+ * size when a class is running over its limit).
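+ *
+ * As a worked example of the computation described next (numbers are
+ * illustrative, assuming RM_FILTER_GAIN = 5 so g = 1 - 1/32): for a
+ * 10Mb/s ethernet (nsPerByte = 800) with f = 0.5, s = 1000 bytes and
+ * a burst allowance of b = 16 packets,
+ *
+ *	ptime   = 1000 * 800 * (1 - 0.5) / 0.5 = 800000 ns
+ *	g^b     ~= 0.60
+ *	maxidle = 800000 * (1 - 0.60) / 0.60 ~= 530000 ns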
+ *
+ * Maxidle and offtime have to be computed from the following: If the
+ * average packet size is s, the bandwidth fraction allocated to this
+ * class is f, we want to allow b packet bursts, and the gain of the
+ * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
+ *
+ * ptime = s * nsPerByte * (1 - f) / f
+ * maxidle = ptime * (1 - g^b) / g^b
+ * minidle = -ptime * (1 / (f - 1))
+ * offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
+ *
+ * Operationally, it's convenient to specify maxidle & offtime in units
+ * independent of the link bandwidth so the maxidle & offtime passed to
+ * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
+ * (The constant factor is a scale factor needed to make the parameters
+ * integers. This scaling also means that the 'unscaled' values of
+ * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
+ * not nanoseconds.) Also note that the 'idle' filter computation keeps
+ * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
+ * maxidle also must be scaled upward by this value. Thus, the passed
+ * values for maxidle and offtime can be computed as follows:
+ *
+ * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
+ * offtime = offtime * 8 / (1000 * nsecPerByte)
+ *
+ * When USE_HRTIME is employed, then maxidle and offtime become:
+ * maxidle = maxidle * (8.0 / nsecPerByte);
+ * offtime = offtime * (8.0 / nsecPerByte);
+ */
+struct rm_class *
+rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
+ void (*action)(rm_class_t *, rm_class_t *), int maxq,
+ struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
+ int minidle, u_int offtime, int pktsize, int flags)
+{
+ struct rm_class *cl;
+ struct rm_class *peer;
+ int s;
+
+ if (pri >= RM_MAXPRIO)
+ return (NULL);
+#ifndef ALTQ_RED
+ if (flags & RMCF_RED) {
+#ifdef ALTQ_DEBUG
+ printf("rmc_newclass: RED not configured for CBQ!\n");
+#endif
+ return (NULL);
+ }
+#endif
+#ifndef ALTQ_RIO
+ if (flags & RMCF_RIO) {
+#ifdef ALTQ_DEBUG
+ printf("rmc_newclass: RIO not configured for CBQ!\n");
+#endif
+ return (NULL);
+ }
+#endif
+
+ cl = malloc(sizeof(struct rm_class),
+ M_DEVBUF, M_WAITOK);
+ if (cl == NULL)
+ return (NULL);
+ bzero(cl, sizeof(struct rm_class));
+ CALLOUT_INIT(&cl->callout_);
+ cl->q_ = malloc(sizeof(class_queue_t),
+ M_DEVBUF, M_WAITOK);
+ if (cl->q_ == NULL) {
+ free(cl, M_DEVBUF);
+ return (NULL);
+ }
+ bzero(cl->q_, sizeof(class_queue_t));
+
+ /*
+ * Class initialization.
+ */ + cl->children_ = NULL; + cl->parent_ = parent; + cl->borrow_ = borrow; + cl->leaf_ = 1; + cl->ifdat_ = ifd; + cl->pri_ = pri; + cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ + cl->depth_ = 0; + cl->qthresh_ = 0; + cl->ns_per_byte_ = nsecPerByte; + + qlimit(cl->q_) = maxq; + qtype(cl->q_) = Q_DROPHEAD; + qlen(cl->q_) = 0; + cl->flags_ = flags; + +#if 1 /* minidle is also scaled in ALTQ */ + cl->minidle_ = (minidle * (int)nsecPerByte) / 8; + if (cl->minidle_ > 0) + cl->minidle_ = 0; +#else + cl->minidle_ = minidle; +#endif + cl->maxidle_ = (maxidle * nsecPerByte) / 8; + if (cl->maxidle_ == 0) + cl->maxidle_ = 1; +#if 1 /* offtime is also scaled in ALTQ */ + cl->avgidle_ = cl->maxidle_; + cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; + if (cl->offtime_ == 0) + cl->offtime_ = 1; +#else + cl->avgidle_ = 0; + cl->offtime_ = (offtime * nsecPerByte) / 8; +#endif + cl->overlimit = action; + +#ifdef ALTQ_RED + if (flags & (RMCF_RED|RMCF_RIO)) { + int red_flags, red_pkttime; + + red_flags = 0; + if (flags & RMCF_ECN) + red_flags |= REDF_ECN; + if (flags & RMCF_FLOWVALVE) + red_flags |= REDF_FLOWVALVE; +#ifdef ALTQ_RIO + if (flags & RMCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + red_pkttime = nsecPerByte * pktsize / 1000; + + if (flags & RMCF_RED) { + cl->red_ = red_alloc(0, 0, + qlimit(cl->q_) * 10/100, + qlimit(cl->q_) * 30/100, + red_flags, red_pkttime); + if (cl->red_ != NULL) + qtype(cl->q_) = Q_RED; + } +#ifdef ALTQ_RIO + else { + cl->red_ = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->red_ != NULL) + qtype(cl->q_) = Q_RIO; + } +#endif + } +#endif /* ALTQ_RED */ + + /* + * put the class into the class tree + */ +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + IFQ_LOCK(ifd->ifq_); + if ((peer = ifd->active_[pri]) != NULL) { + /* find the last class at this pri */ + cl->peer_ = peer; + while (peer->peer_ != ifd->active_[pri]) + peer = peer->peer_; + peer->peer_ = cl; + } else { + ifd->active_[pri] = cl; + cl->peer_ = cl; + } + + if (cl->parent_) { + cl->next_ = parent->children_; + parent->children_ = cl; + parent->leaf_ = 0; + } + + /* + * Compute the depth of this class and its ancestors in the class + * hierarchy. + */ + rmc_depth_compute(cl); + + /* + * If CBQ's WRR is enabled, then initialize the class WRR state. 
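+ * (illustrative numbers: two classes at this priority, each with
+ * allotment_ = 625000 bytes/sec (5Mb/s) and maxpkt_ = 1500, give
+ * M_[pri] = 1250000 / (2 * 1500) = 416 and w_allotment_ =
+ * 625000 / 416 ~= 1502 bytes, i.e. roughly one full-size packet
+ * per WRR round per class; see rmc_wrr_set_weights() below.)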
+ */ + if (ifd->wrr_) { + ifd->num_[pri]++; + ifd->alloc_[pri] += cl->allotment_; + rmc_wrr_set_weights(ifd); + } + IFQ_UNLOCK(ifd->ifq_); + splx(s); + return (cl); +} + +int +rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle, + int minidle, u_int offtime, int pktsize) +{ + struct rm_ifdat *ifd; + u_int old_allotment; + int s; + + ifd = cl->ifdat_; + old_allotment = cl->allotment_; + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + IFQ_LOCK(ifd->ifq_); + cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ + cl->qthresh_ = 0; + cl->ns_per_byte_ = nsecPerByte; + + qlimit(cl->q_) = maxq; + +#if 1 /* minidle is also scaled in ALTQ */ + cl->minidle_ = (minidle * nsecPerByte) / 8; + if (cl->minidle_ > 0) + cl->minidle_ = 0; +#else + cl->minidle_ = minidle; +#endif + cl->maxidle_ = (maxidle * nsecPerByte) / 8; + if (cl->maxidle_ == 0) + cl->maxidle_ = 1; +#if 1 /* offtime is also scaled in ALTQ */ + cl->avgidle_ = cl->maxidle_; + cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; + if (cl->offtime_ == 0) + cl->offtime_ = 1; +#else + cl->avgidle_ = 0; + cl->offtime_ = (offtime * nsecPerByte) / 8; +#endif + + /* + * If CBQ's WRR is enabled, then initialize the class WRR state. + */ + if (ifd->wrr_) { + ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment; + rmc_wrr_set_weights(ifd); + } + IFQ_UNLOCK(ifd->ifq_); + splx(s); + return (0); +} + +/* + * static void + * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes + * the appropriate run robin weights for the CBQ weighted round robin + * algorithm. + * + * Returns: NONE + */ + +static void +rmc_wrr_set_weights(struct rm_ifdat *ifd) +{ + int i; + struct rm_class *cl, *clh; + + for (i = 0; i < RM_MAXPRIO; i++) { + /* + * This is inverted from that of the simulator to + * maintain precision. + */ + if (ifd->num_[i] == 0) + ifd->M_[i] = 0; + else + ifd->M_[i] = ifd->alloc_[i] / + (ifd->num_[i] * ifd->maxpkt_); + /* + * Compute the weighted allotment for each class. + * This takes the expensive div instruction out + * of the main loop for the wrr scheduling path. + * These only get recomputed when a class comes or + * goes. + */ + if (ifd->active_[i] != NULL) { + clh = cl = ifd->active_[i]; + do { + /* safe-guard for slow link or alloc_ == 0 */ + if (ifd->M_[i] == 0) + cl->w_allotment_ = 0; + else + cl->w_allotment_ = cl->allotment_ / + ifd->M_[i]; + cl = cl->peer_; + } while ((cl != NULL) && (cl != clh)); + } + } +} + +int +rmc_get_weight(struct rm_ifdat *ifd, int pri) +{ + if ((pri >= 0) && (pri < RM_MAXPRIO)) + return (ifd->M_[pri]); + else + return (0); +} + +/* + * static void + * rmc_depth_compute(struct rm_class *cl) - This function computes the + * appropriate depth of class 'cl' and its ancestors. + * + * Returns: NONE + */ + +static void +rmc_depth_compute(struct rm_class *cl) +{ + rm_class_t *t = cl, *p; + + /* + * Recompute the depth for the branch of the tree. + */ + while (t != NULL) { + p = t->parent_; + if (p && (t->depth_ >= p->depth_)) { + p->depth_ = t->depth_ + 1; + t = p; + } else + t = NULL; + } +} + +/* + * static void + * rmc_depth_recompute(struct rm_class *cl) - This function re-computes + * the depth of the tree after a class has been deleted. 
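+ * (example: for root -> A -> leaf with depths 2, 1, 0, deleting the
+ * leaf makes A a leaf again (depth_ 0), and the walk up from A
+ * shrinks the root's depth_ to 1.)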
+ * + * Returns: NONE + */ + +static void +rmc_depth_recompute(rm_class_t *cl) +{ +#if 1 /* ALTQ */ + rm_class_t *p, *t; + + p = cl; + while (p != NULL) { + if ((t = p->children_) == NULL) { + p->depth_ = 0; + } else { + int cdepth = 0; + + while (t != NULL) { + if (t->depth_ > cdepth) + cdepth = t->depth_; + t = t->next_; + } + + if (p->depth_ == cdepth + 1) + /* no change to this parent */ + return; + + p->depth_ = cdepth + 1; + } + + p = p->parent_; + } +#else + rm_class_t *t; + + if (cl->depth_ >= 1) { + if (cl->children_ == NULL) { + cl->depth_ = 0; + } else if ((t = cl->children_) != NULL) { + while (t != NULL) { + if (t->children_ != NULL) + rmc_depth_recompute(t); + t = t->next_; + } + } else + rmc_depth_compute(cl); + } +#endif +} + +/* + * void + * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This + * function deletes a class from the link-sharing structure and frees + * all resources associated with the class. + * + * Returns: NONE + */ + +void +rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl) +{ + struct rm_class *p, *head, *previous; + int s; + + ASSERT(cl->children_ == NULL); + + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + IFQ_LOCK(ifd->ifq_); + /* + * Free packets in the packet queue. + * XXX - this may not be a desired behavior. Packets should be + * re-queued. + */ + rmc_dropall(cl); + + /* + * If the class has a parent, then remove the class from the + * class from the parent's children chain. + */ + if (cl->parent_ != NULL) { + head = cl->parent_->children_; + p = previous = head; + if (head->next_ == NULL) { + ASSERT(head == cl); + cl->parent_->children_ = NULL; + cl->parent_->leaf_ = 1; + } else while (p != NULL) { + if (p == cl) { + if (cl == head) + cl->parent_->children_ = cl->next_; + else + previous->next_ = cl->next_; + cl->next_ = NULL; + p = NULL; + } else { + previous = p; + p = p->next_; + } + } + } + + /* + * Delete class from class priority peer list. + */ + if ((p = ifd->active_[cl->pri_]) != NULL) { + /* + * If there is more than one member of this priority + * level, then look for class(cl) in the priority level. + */ + if (p != p->peer_) { + while (p->peer_ != cl) + p = p->peer_; + p->peer_ = cl->peer_; + + if (ifd->active_[cl->pri_] == cl) + ifd->active_[cl->pri_] = cl->peer_; + } else { + ASSERT(p == cl); + ifd->active_[cl->pri_] = NULL; + } + } + + /* + * Recompute the WRR weights. + */ + if (ifd->wrr_) { + ifd->alloc_[cl->pri_] -= cl->allotment_; + ifd->num_[cl->pri_]--; + rmc_wrr_set_weights(ifd); + } + + /* + * Re-compute the depth of the tree. + */ +#if 1 /* ALTQ */ + rmc_depth_recompute(cl->parent_); +#else + rmc_depth_recompute(ifd->root_); +#endif + + IFQ_UNLOCK(ifd->ifq_); + splx(s); + + /* + * Free the class structure. + */ + if (cl->red_ != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + rio_destroy((rio_t *)cl->red_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + red_destroy(cl->red_); +#endif + } + free(cl->q_, M_DEVBUF); + free(cl, M_DEVBUF); +} + + +/* + * void + * rmc_init(...) - Initialize the resource management data structures + * associated with the output portion of interface 'ifp'. 'ifd' is + * where the structures will be built (for backwards compatibility, the + * structures aren't kept in the ifnet struct). 'nsecPerByte' + * gives the link speed (inverse of bandwidth) in nanoseconds/byte. + * 'restart' is the driver-specific routine that the generic 'delay + * until under limit' action will call to restart output. 
`maxq' + * is the queue size of the 'link' & 'default' classes. 'maxqueued' + * is the maximum number of packets that the resource management + * code will allow to be queued 'downstream' (this is typically 1). + * + * Returns: NONE + */ + +void +rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte, + void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle, + int minidle, u_int offtime, int flags) +{ + int i, mtu; + + /* + * Initialize the CBQ tracing/debug facility. + */ + CBQTRACEINIT(); + + bzero((char *)ifd, sizeof (*ifd)); + mtu = ifq->altq_ifp->if_mtu; + ifd->ifq_ = ifq; + ifd->restart = restart; + ifd->maxqueued_ = maxqueued; + ifd->ns_per_byte_ = nsecPerByte; + ifd->maxpkt_ = mtu; + ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0; + ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0; +#if 1 + ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16; + if (mtu * nsecPerByte > 10 * 1000000) + ifd->maxiftime_ /= 4; +#endif + + reset_cutoff(ifd); + CBQTRACE(rmc_init, 'INIT', ifd->cutoff_); + + /* + * Initialize the CBQ's WRR state. + */ + for (i = 0; i < RM_MAXPRIO; i++) { + ifd->alloc_[i] = 0; + ifd->M_[i] = 0; + ifd->num_[i] = 0; + ifd->na_[i] = 0; + ifd->active_[i] = NULL; + } + + /* + * Initialize current packet state. + */ + ifd->qi_ = 0; + ifd->qo_ = 0; + for (i = 0; i < RM_MAXQUEUED; i++) { + ifd->class_[i] = NULL; + ifd->curlen_[i] = 0; + ifd->borrowed_[i] = NULL; + } + + /* + * Create the root class of the link-sharing structure. + */ + if ((ifd->root_ = rmc_newclass(0, ifd, + nsecPerByte, + rmc_root_overlimit, maxq, 0, 0, + maxidle, minidle, offtime, + 0, 0)) == NULL) { + printf("rmc_init: root class not allocated\n"); + return ; + } + ifd->root_->depth_ = 0; +} + +/* + * void + * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by + * mbuf 'm' to queue for resource class 'cl'. This routine is called + * by a driver's if_output routine. This routine must be called with + * output packet completion interrupts locked out (to avoid racing with + * rmc_dequeue_next). + * + * Returns: 0 on successful queueing + * -1 when packet drop occurs + */ +int +rmc_queue_packet(struct rm_class *cl, mbuf_t *m) +{ + struct timeval now; + struct rm_ifdat *ifd = cl->ifdat_; + int cpri = cl->pri_; + int is_empty = qempty(cl->q_); + + RM_GETTIME(now); + if (ifd->cutoff_ > 0) { + if (TV_LT(&cl->undertime_, &now)) { + if (ifd->cutoff_ > cl->depth_) + ifd->cutoff_ = cl->depth_; + CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_); + } +#if 1 /* ALTQ */ + else { + /* + * the class is overlimit. if the class has + * underlimit ancestors, set cutoff to the lowest + * depth among them. 
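+ * (e.g., illustratively: if a depth-0 leaf is overlimit but its
+ * depth-2 grandparent is underlimit, cutoff_ drops to 2, which
+ * keeps borrowing from the grandparent possible at dequeue time.)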
+ */
+ struct rm_class *borrow = cl->borrow_;
+
+ while (borrow != NULL &&
+ borrow->depth_ < ifd->cutoff_) {
+ if (TV_LT(&borrow->undertime_, &now)) {
+ ifd->cutoff_ = borrow->depth_;
+ CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
+ break;
+ }
+ borrow = borrow->borrow_;
+ }
+ }
+#else /* !ALTQ */
+ else if ((ifd->cutoff_ > 1) && cl->borrow_) {
+ if (TV_LT(&cl->borrow_->undertime_, &now)) {
+ ifd->cutoff_ = cl->borrow_->depth_;
+ CBQTRACE(rmc_queue_packet, 'ffob',
+ cl->borrow_->depth_);
+ }
+ }
+#endif /* !ALTQ */
+ }
+
+ if (_rmc_addq(cl, m) < 0)
+ /* failed */
+ return (-1);
+
+ if (is_empty) {
+ CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
+ ifd->na_[cpri]++;
+ }
+
+ if (qlen(cl->q_) > qlimit(cl->q_)) {
+ /* note: qlimit can be set to 0 or 1 */
+ rmc_drop_action(cl);
+ return (-1);
+ }
+ return (0);
+}
+
+/*
+ * void
+ * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
+ * classes to see if they are satisfied.
+ */
+
+static void
+rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
+{
+ int i;
+ rm_class_t *p, *bp;
+
+ for (i = RM_MAXPRIO - 1; i >= 0; i--) {
+ if ((bp = ifd->active_[i]) != NULL) {
+ p = bp;
+ do {
+ if (!rmc_satisfied(p, now)) {
+ ifd->cutoff_ = p->depth_;
+ return;
+ }
+ p = p->peer_;
+ } while (p != bp);
+ }
+ }
+
+ reset_cutoff(ifd);
+}
+
+/*
+ * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
+ */
+
+static int
+rmc_satisfied(struct rm_class *cl, struct timeval *now)
+{
+ rm_class_t *p;
+
+ if (cl == NULL)
+ return (1);
+ if (TV_LT(now, &cl->undertime_))
+ return (1);
+ if (cl->depth_ == 0) {
+ if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
+ return (0);
+ else
+ return (1);
+ }
+ if (cl->children_ != NULL) {
+ p = cl->children_;
+ while (p != NULL) {
+ if (!rmc_satisfied(p, now))
+ return (0);
+ p = p->next_;
+ }
+ }
+
+ return (1);
+}
+
+/*
+ * Return 1 if class 'cl' is under limit or can borrow from a parent,
+ * 0 if overlimit. As a side-effect, this routine will invoke the
+ * class overlimit action if the class is overlimit.
+ */
+
+static int
+rmc_under_limit(struct rm_class *cl, struct timeval *now)
+{
+ rm_class_t *p = cl;
+ rm_class_t *top;
+ struct rm_ifdat *ifd = cl->ifdat_;
+
+ ifd->borrowed_[ifd->qi_] = NULL;
+ /*
+ * If cl is the root class, then always return that it is
+ * underlimit. Otherwise, check to see if the class is underlimit.
+ */
+ if (cl->parent_ == NULL)
+ return (1);
+
+ if (cl->sleeping_) {
+ if (TV_LT(now, &cl->undertime_))
+ return (0);
+
+ CALLOUT_STOP(&cl->callout_);
+ cl->sleeping_ = 0;
+ cl->undertime_.tv_sec = 0;
+ return (1);
+ }
+
+ top = NULL;
+ while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
+ if (((cl = cl->borrow_) == NULL) ||
+ (cl->depth_ > ifd->cutoff_)) {
+#ifdef ADJUST_CUTOFF
+ if (cl != NULL)
+ /* cutoff is taking effect, just
+ return false without calling
+ the delay action. */
+ return (0);
+#endif
+#ifdef BORROW_OFFTIME
+ /*
+ * check if the class can borrow offtime too.
+ * borrow offtime from the top of the borrow
+ * chain if the top class is not overloaded.
+ */
+ if (cl != NULL) {
+ /* cutoff is taking effect, use this class as top.
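+ top is discarded again below when its avgidle_
+ has fallen to minidle_, i.e. when the would-be
+ lender is itself overloaded, so its offtime is
+ then not borrowed.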
*/ + top = cl; + CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_); + } + if (top != NULL && top->avgidle_ == top->minidle_) + top = NULL; + p->overtime_ = *now; + (p->overlimit)(p, top); +#else + p->overtime_ = *now; + (p->overlimit)(p, NULL); +#endif + return (0); + } + top = cl; + } + + if (cl != p) + ifd->borrowed_[ifd->qi_] = cl; + return (1); +} + +/* + * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to + * Packet-by-packet round robin. + * + * The heart of the weighted round-robin scheduler, which decides which + * class next gets to send a packet. Highest priority first, then + * weighted round-robin within priorites. + * + * Each able-to-send class gets to send until its byte allocation is + * exhausted. Thus, the active pointer is only changed after a class has + * exhausted its allocation. + * + * If the scheduler finds no class that is underlimit or able to borrow, + * then the first class found that had a nonzero queue and is allowed to + * borrow gets to send. + */ + +static mbuf_t * +_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op) +{ + struct rm_class *cl = NULL, *first = NULL; + u_int deficit; + int cpri; + mbuf_t *m; + struct timeval now; + + RM_GETTIME(now); + + /* + * if the driver polls the top of the queue and then removes + * the polled packet, we must return the same packet. + */ + if (op == ALTDQ_REMOVE && ifd->pollcache_) { + cl = ifd->pollcache_; + cpri = cl->pri_; + if (ifd->efficient_) { + /* check if this class is overlimit */ + if (cl->undertime_.tv_sec != 0 && + rmc_under_limit(cl, &now) == 0) + first = cl; + } + ifd->pollcache_ = NULL; + goto _wrr_out; + } + else { + /* mode == ALTDQ_POLL || pollcache == NULL */ + ifd->pollcache_ = NULL; + ifd->borrowed_[ifd->qi_] = NULL; + } +#ifdef ADJUST_CUTOFF + _again: +#endif + for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { + if (ifd->na_[cpri] == 0) + continue; + deficit = 0; + /* + * Loop through twice for a priority level, if some class + * was unable to send a packet the first round because + * of the weighted round-robin mechanism. + * During the second loop at this level, deficit==2. + * (This second loop is not needed if for every class, + * "M[cl->pri_])" times "cl->allotment" is greater than + * the byte size for the largest packet in the class.) + */ + _wrr_loop: + cl = ifd->active_[cpri]; + ASSERT(cl != NULL); + do { + if ((deficit < 2) && (cl->bytes_alloc_ <= 0)) + cl->bytes_alloc_ += cl->w_allotment_; + if (!qempty(cl->q_)) { + if ((cl->undertime_.tv_sec == 0) || + rmc_under_limit(cl, &now)) { + if (cl->bytes_alloc_ > 0 || deficit > 1) + goto _wrr_out; + + /* underlimit but no alloc */ + deficit = 1; +#if 1 + ifd->borrowed_[ifd->qi_] = NULL; +#endif + } + else if (first == NULL && cl->borrow_ != NULL) + first = cl; /* borrowing candidate */ + } + + cl->bytes_alloc_ = 0; + cl = cl->peer_; + } while (cl != ifd->active_[cpri]); + + if (deficit == 1) { + /* first loop found an underlimit class with deficit */ + /* Loop on same priority level, with new deficit. */ + deficit = 2; + goto _wrr_loop; + } + } + +#ifdef ADJUST_CUTOFF + /* + * no underlimit class found. if cutoff is taking effect, + * increase cutoff and try again. + */ + if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { + ifd->cutoff_++; + CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_); + goto _again; + } +#endif /* ADJUST_CUTOFF */ + /* + * If LINK_EFFICIENCY is turned on, then the first overlimit + * class we encounter will send a packet if all the classes + * of the link-sharing structure are overlimit. 
+ */ + reset_cutoff(ifd); + CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_); + + if (!ifd->efficient_ || first == NULL) + return (NULL); + + cl = first; + cpri = cl->pri_; +#if 0 /* too time-consuming for nothing */ + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; +#endif + ifd->borrowed_[ifd->qi_] = cl->borrow_; + ifd->cutoff_ = cl->borrow_->depth_; + + /* + * Deque the packet and do the book keeping... + */ + _wrr_out: + if (op == ALTDQ_REMOVE) { + m = _rmc_getq(cl); + if (m == NULL) + panic("_rmc_wrr_dequeue_next"); + if (qempty(cl->q_)) + ifd->na_[cpri]--; + + /* + * Update class statistics and link data. + */ + if (cl->bytes_alloc_ > 0) + cl->bytes_alloc_ -= m_pktlen(m); + + if ((cl->bytes_alloc_ <= 0) || first == cl) + ifd->active_[cl->pri_] = cl->peer_; + else + ifd->active_[cl->pri_] = cl; + + ifd->class_[ifd->qi_] = cl; + ifd->curlen_[ifd->qi_] = m_pktlen(m); + ifd->now_[ifd->qi_] = now; + ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; + ifd->queued_++; + } else { + /* mode == ALTDQ_PPOLL */ + m = _rmc_pollq(cl); + ifd->pollcache_ = cl; + } + return (m); +} + +/* + * Dequeue & return next packet from the highest priority class that + * has a packet to send & has enough allocation to send it. This + * routine is called by a driver whenever it needs a new packet to + * output. + */ +static mbuf_t * +_rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op) +{ + mbuf_t *m; + int cpri; + struct rm_class *cl, *first = NULL; + struct timeval now; + + RM_GETTIME(now); + + /* + * if the driver polls the top of the queue and then removes + * the polled packet, we must return the same packet. + */ + if (op == ALTDQ_REMOVE && ifd->pollcache_) { + cl = ifd->pollcache_; + cpri = cl->pri_; + ifd->pollcache_ = NULL; + goto _prr_out; + } else { + /* mode == ALTDQ_POLL || pollcache == NULL */ + ifd->pollcache_ = NULL; + ifd->borrowed_[ifd->qi_] = NULL; + } +#ifdef ADJUST_CUTOFF + _again: +#endif + for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { + if (ifd->na_[cpri] == 0) + continue; + cl = ifd->active_[cpri]; + ASSERT(cl != NULL); + do { + if (!qempty(cl->q_)) { + if ((cl->undertime_.tv_sec == 0) || + rmc_under_limit(cl, &now)) + goto _prr_out; + if (first == NULL && cl->borrow_ != NULL) + first = cl; + } + cl = cl->peer_; + } while (cl != ifd->active_[cpri]); + } + +#ifdef ADJUST_CUTOFF + /* + * no underlimit class found. if cutoff is taking effect, increase + * cutoff and try again. + */ + if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { + ifd->cutoff_++; + goto _again; + } +#endif /* ADJUST_CUTOFF */ + /* + * If LINK_EFFICIENCY is turned on, then the first overlimit + * class we encounter will send a packet if all the classes + * of the link-sharing structure are overlimit. + */ + reset_cutoff(ifd); + if (!ifd->efficient_ || first == NULL) + return (NULL); + + cl = first; + cpri = cl->pri_; +#if 0 /* too time-consuming for nothing */ + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; +#endif + ifd->borrowed_[ifd->qi_] = cl->borrow_; + ifd->cutoff_ = cl->borrow_->depth_; + + /* + * Deque the packet and do the book keeping... 
+ */ + _prr_out: + if (op == ALTDQ_REMOVE) { + m = _rmc_getq(cl); + if (m == NULL) + panic("_rmc_prr_dequeue_next"); + if (qempty(cl->q_)) + ifd->na_[cpri]--; + + ifd->active_[cpri] = cl->peer_; + + ifd->class_[ifd->qi_] = cl; + ifd->curlen_[ifd->qi_] = m_pktlen(m); + ifd->now_[ifd->qi_] = now; + ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; + ifd->queued_++; + } else { + /* mode == ALTDQ_POLL */ + m = _rmc_pollq(cl); + ifd->pollcache_ = cl; + } + return (m); +} + +/* + * mbuf_t * + * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function + * is invoked by the packet driver to get the next packet to be + * dequeued and output on the link. If WRR is enabled, then the + * WRR dequeue next routine will determine the next packet to sent. + * Otherwise, packet-by-packet round robin is invoked. + * + * Returns: NULL, if a packet is not available or if all + * classes are overlimit. + * + * Otherwise, Pointer to the next packet. + */ + +mbuf_t * +rmc_dequeue_next(struct rm_ifdat *ifd, int mode) +{ + if (ifd->queued_ >= ifd->maxqueued_) + return (NULL); + else if (ifd->wrr_) + return (_rmc_wrr_dequeue_next(ifd, mode)); + else + return (_rmc_prr_dequeue_next(ifd, mode)); +} + +/* + * Update the utilization estimate for the packet that just completed. + * The packet's class & the parent(s) of that class all get their + * estimators updated. This routine is called by the driver's output- + * packet-completion interrupt service routine. + */ + +/* + * a macro to approximate "divide by 1000" that gives 0.000999, + * if a value has enough effective digits. + * (on pentium, mul takes 9 cycles but div takes 46!) + */ +#define NSEC_TO_USEC(t) (((t) >> 10) + ((t) >> 16) + ((t) >> 17)) +void +rmc_update_class_util(struct rm_ifdat *ifd) +{ + int idle, avgidle, pktlen; + int pkt_time, tidle; + rm_class_t *cl, *borrowed; + rm_class_t *borrows; + struct timeval *nowp; + + /* + * Get the most recent completed class. + */ + if ((cl = ifd->class_[ifd->qo_]) == NULL) + return; + + pktlen = ifd->curlen_[ifd->qo_]; + borrowed = ifd->borrowed_[ifd->qo_]; + borrows = borrowed; + + PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); + + /* + * Run estimator on class and its ancestors. + */ + /* + * rm_update_class_util is designed to be called when the + * transfer is completed from a xmit complete interrupt, + * but most drivers don't implement an upcall for that. + * so, just use estimated completion time. + * as a result, ifd->qi_ and ifd->qo_ are always synced. + */ + nowp = &ifd->now_[ifd->qo_]; + /* get pkt_time (for link) in usec */ +#if 1 /* use approximation */ + pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_; + pkt_time = NSEC_TO_USEC(pkt_time); +#else + pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000; +#endif +#if 1 /* ALTQ4PPP */ + if (TV_LT(nowp, &ifd->ifnow_)) { + int iftime; + + /* + * make sure the estimated completion time does not go + * too far. it can happen when the link layer supports + * data compression or the interface speed is set to + * a much lower value. 
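+ * (numeric sketch with assumed values: mtu = 1500 and
+ * ns_per_byte_ = 800 give maxiftime_ = 1500 * 800 / 1000 * 16 =
+ * 19200 us in rmc_init() above, so ifnow_ may run at most about
+ * 19ms ahead of the real clock.)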
+ */ + TV_DELTA(&ifd->ifnow_, nowp, iftime); + if (iftime+pkt_time < ifd->maxiftime_) { + TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); + } else { + TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_); + } + } else { + TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); + } +#else + if (TV_LT(nowp, &ifd->ifnow_)) { + TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); + } else { + TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); + } +#endif + + while (cl != NULL) { + TV_DELTA(&ifd->ifnow_, &cl->last_, idle); + if (idle >= 2000000) + /* + * this class is idle enough, reset avgidle. + * (TV_DELTA returns 2000000 us when delta is large.) + */ + cl->avgidle_ = cl->maxidle_; + + /* get pkt_time (for class) in usec */ +#if 1 /* use approximation */ + pkt_time = pktlen * cl->ns_per_byte_; + pkt_time = NSEC_TO_USEC(pkt_time); +#else + pkt_time = pktlen * cl->ns_per_byte_ / 1000; +#endif + idle -= pkt_time; + + avgidle = cl->avgidle_; + avgidle += idle - (avgidle >> RM_FILTER_GAIN); + cl->avgidle_ = avgidle; + + /* Are we overlimit ? */ + if (avgidle <= 0) { + CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle); +#if 1 /* ALTQ */ + /* + * need some lower bound for avgidle, otherwise + * a borrowing class gets unbounded penalty. + */ + if (avgidle < cl->minidle_) + avgidle = cl->avgidle_ = cl->minidle_; +#endif + /* set next idle to make avgidle 0 */ + tidle = pkt_time + + (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN); + TV_ADD_DELTA(nowp, tidle, &cl->undertime_); + ++cl->stats_.over; + } else { + cl->avgidle_ = + (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle; + cl->undertime_.tv_sec = 0; + if (cl->sleeping_) { + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + } + } + + if (borrows != NULL) { + if (borrows != cl) + ++cl->stats_.borrows; + else + borrows = NULL; + } + cl->last_ = ifd->ifnow_; + cl->last_pkttime_ = pkt_time; + +#if 1 + if (cl->parent_ == NULL) { + /* take stats of root class */ + PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); + } +#endif + + cl = cl->parent_; + } + + /* + * Check to see if cutoff needs to set to a new level. + */ + cl = ifd->class_[ifd->qo_]; + if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) { +#if 1 /* ALTQ */ + if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) { + rmc_tl_satisfied(ifd, nowp); + CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); + } else { + ifd->cutoff_ = borrowed->depth_; + CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_); + } +#else /* !ALTQ */ + if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) { + reset_cutoff(ifd); +#ifdef notdef + rmc_tl_satisfied(ifd, &now); +#endif + CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); + } else { + ifd->cutoff_ = borrowed->depth_; + CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_); + } +#endif /* !ALTQ */ + } + + /* + * Release class slot + */ + ifd->borrowed_[ifd->qo_] = NULL; + ifd->class_[ifd->qo_] = NULL; + ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_; + ifd->queued_--; +} + +/* + * void + * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific) + * over-limit action routines. These get invoked by rmc_under_limit() + * if a class with packets to send if over its bandwidth limit & can't + * borrow from a parent class. 
+ *
+ * Returns: NONE
+ */
+
+static void
+rmc_drop_action(struct rm_class *cl)
+{
+ struct rm_ifdat *ifd = cl->ifdat_;
+
+ ASSERT(qlen(cl->q_) > 0);
+ _rmc_dropq(cl);
+ if (qempty(cl->q_))
+ ifd->na_[cl->pri_]--;
+}
+
+void rmc_dropall(struct rm_class *cl)
+{
+ struct rm_ifdat *ifd = cl->ifdat_;
+
+ if (!qempty(cl->q_)) {
+ _flushq(cl->q_);
+
+ ifd->na_[cl->pri_]--;
+ }
+}
+
+#if (__FreeBSD_version > 300000)
+/* hzto() is removed from FreeBSD-3.0 */
+static int hzto(struct timeval *);
+
+static int
+hzto(tv)
+ struct timeval *tv;
+{
+ struct timeval t2;
+
+ getmicrotime(&t2);
+ t2.tv_sec = tv->tv_sec - t2.tv_sec;
+ t2.tv_usec = tv->tv_usec - t2.tv_usec;
+ return (tvtohz(&t2));
+}
+#endif /* __FreeBSD_version > 300000 */
+
+/*
+ * void
+ * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
+ * delay action routine. It is invoked via rmc_under_limit when the
+ * packet is discovered to be overlimit.
+ *
+ * If the delay action is the result of a borrow class being overlimit,
+ * then delay for the offtime of the borrowing class that is overlimit.
+ *
+ * Returns: NONE
+ */
+
+void
+rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
+{
+ int delay, t, extradelay;
+
+ cl->stats_.overactions++;
+ TV_DELTA(&cl->undertime_, &cl->overtime_, delay);
+#ifndef BORROW_OFFTIME
+ delay += cl->offtime_;
+#endif
+
+ if (!cl->sleeping_) {
+ CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
+#ifdef BORROW_OFFTIME
+ if (borrow != NULL)
+ extradelay = borrow->offtime_;
+ else
+#endif
+ extradelay = cl->offtime_;
+
+#ifdef ALTQ
+ /*
+ * XXX recalculate suspend time:
+ * current undertime is (tidle + pkt_time) calculated
+ * from the last transmission.
+ * tidle: time required to bring avgidle back to 0
+ * pkt_time: target waiting time for this class
+ * we need to replace pkt_time by offtime
+ */
+ extradelay -= cl->last_pkttime_;
+#endif
+ if (extradelay > 0) {
+ TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
+ delay += extradelay;
+ }
+
+ cl->sleeping_ = 1;
+ cl->stats_.delays++;
+
+ /*
+ * Since packets are phased randomly with respect to the
+ * clock, 1 tick (the next clock tick) can be an arbitrarily
+ * short time so we have to wait for at least two ticks.
+ * NOTE: If there's no other traffic, we need the timer as
+ * a 'backstop' to restart this class.
+ */
+ if (delay > tick * 2) {
+#ifdef __FreeBSD__
+ /* FreeBSD rounds up the tick */
+ t = hzto(&cl->undertime_);
+#else
+ /* other BSDs round down the tick */
+ t = hzto(&cl->undertime_) + 1;
+#endif
+ } else
+ t = 2;
+ CALLOUT_RESET(&cl->callout_, t,
+ (timeout_t *)rmc_restart, (caddr_t)cl);
+ }
+}
+
+/*
+ * void
+ * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
+ * called by the system timer code & is responsible for checking if the
+ * class is still sleeping (it might have been restarted as a side
+ * effect of the queue scan on a packet arrival) and, if so, restarting
+ * output for the class. Inspecting the class state & restarting output
+ * require locking the class structure. In general the driver is
+ * responsible for locking but this is the only routine that is not
+ * called directly or indirectly from the interface driver so it has to
+ * know about system locking conventions. Under bsd, locking is done
+ * by raising IPL to splimp so that's what's implemented here. On a
+ * different system this would probably need to be changed.
+
+/*
+ * void
+ * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
+ *	called by the system timer code & is responsible for checking if the
+ *	class is still sleeping (it might have been restarted as a side
+ *	effect of the queue scan on a packet arrival) and, if so, restarting
+ *	output for the class.  Inspecting the class state & restarting output
+ *	require locking the class structure.  In general the driver is
+ *	responsible for locking but this is the only routine that is not
+ *	called directly or indirectly from the interface driver so it has
+ *	to know about system locking conventions.  Under BSD, locking is done
+ *	by raising IPL to splimp so that's what's implemented here.  On a
+ *	different system this would probably need to be changed.
+ *
+ *	Returns: NONE
+ */
+
+static void
+rmc_restart(struct rm_class *cl)
+{
+	struct rm_ifdat	*ifd = cl->ifdat_;
+	int		 s;
+
+#ifdef __NetBSD__
+	s = splnet();
+#else
+	s = splimp();
+#endif
+	IFQ_LOCK(ifd->ifq_);
+	if (cl->sleeping_) {
+		cl->sleeping_ = 0;
+		cl->undertime_.tv_sec = 0;
+
+		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
+			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
+			(ifd->restart)(ifd->ifq_);
+		}
+	}
+	IFQ_UNLOCK(ifd->ifq_);
+	splx(s);
+}
+
+/*
+ * void
+ * rmc_root_overlimit(struct rm_class *cl) - This is the generic overlimit
+ *	handling routine for the root class of the link sharing structure.
+ *
+ *	Returns: NONE
+ */
+
+static void
+rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow)
+{
+	panic("rmc_root_overlimit");
+}
+
+/*
+ * Packet Queue handling routines.  Eventually, these will localize the
+ * effects on the code of whether queues are RED queues or drop-tail
+ * queues.
+ */
+
+static int
+_rmc_addq(rm_class_t *cl, mbuf_t *m)
+{
+#ifdef ALTQ_RIO
+	if (q_is_rio(cl->q_))
+		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
+#endif
+#ifdef ALTQ_RED
+	if (q_is_red(cl->q_))
+		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
+#endif /* ALTQ_RED */
+
+	if (cl->flags_ & RMCF_CLEARDSCP)
+		write_dsfield(m, cl->pktattr_, 0);
+
+	_addq(cl->q_, m);
+	return (0);
+}
+
+/* note: _rmc_dropq is not called for red */
+static void
+_rmc_dropq(rm_class_t *cl)
+{
+	mbuf_t	*m;
+
+	if ((m = _getq(cl->q_)) != NULL)
+		m_freem(m);
+}
+
+static mbuf_t *
+_rmc_getq(rm_class_t *cl)
+{
+#ifdef ALTQ_RIO
+	if (q_is_rio(cl->q_))
+		return rio_getq((rio_t *)cl->red_, cl->q_);
+#endif
+#ifdef ALTQ_RED
+	if (q_is_red(cl->q_))
+		return red_getq(cl->red_, cl->q_);
+#endif
+	return _getq(cl->q_);
+}
+
+static mbuf_t *
+_rmc_pollq(rm_class_t *cl)
+{
+	return qhead(cl->q_);
+}
+
+#ifdef CBQ_TRACE
+
+struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
+struct cbqtrace		*cbqtrace_ptr = NULL;
+int			 cbqtrace_count;
+
+/*
+ * DDB hook to trace cbq events:
+ *  the last 1024 events are held in a circular buffer.
+ *  use "call cbqtrace_dump(N)" to display 20 events from the Nth event.
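+ *  each record prints roughly as "[count] function: cccc: object",
+ *  where "cccc" is the four-character tag passed to CBQTRACE()
+ *  (e.g. 'yled' from rmc_delay_action above); see rmc_funcname()
+ *  below for the address-to-name mapping.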
+ */ +void cbqtrace_dump(int); +static char *rmc_funcname(void *); + +static struct rmc_funcs { + void *func; + char *name; +} rmc_funcs[] = +{ + rmc_init, "rmc_init", + rmc_queue_packet, "rmc_queue_packet", + rmc_under_limit, "rmc_under_limit", + rmc_update_class_util, "rmc_update_class_util", + rmc_delay_action, "rmc_delay_action", + rmc_restart, "rmc_restart", + _rmc_wrr_dequeue_next, "_rmc_wrr_dequeue_next", + NULL, NULL +}; + +static char *rmc_funcname(void *func) +{ + struct rmc_funcs *fp; + + for (fp = rmc_funcs; fp->func != NULL; fp++) + if (fp->func == func) + return (fp->name); + return ("unknown"); +} + +void cbqtrace_dump(int counter) +{ + int i, *p; + char *cp; + + counter = counter % NCBQTRACE; + p = (int *)&cbqtrace_buffer[counter]; + + for (i=0; i<20; i++) { + printf("[0x%x] ", *p++); + printf("%s: ", rmc_funcname((void *)*p++)); + cp = (char *)p++; + printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]); + printf("%d\n",*p++); + + if (p >= (int *)&cbqtrace_buffer[NCBQTRACE]) + p = (int *)cbqtrace_buffer; + } +} +#endif /* CBQ_TRACE */ +#endif /* ALTQ_CBQ */ + +#if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ) +#if !defined(__GNUC__) || defined(ALTQ_DEBUG) + +void +_addq(class_queue_t *q, mbuf_t *m) +{ + mbuf_t *m0; + + if ((m0 = qtail(q)) != NULL) + m->m_nextpkt = m0->m_nextpkt; + else + m0 = m; + m0->m_nextpkt = m; + qtail(q) = m; + qlen(q)++; +} + +mbuf_t * +_getq(class_queue_t *q) +{ + mbuf_t *m, *m0; + + if ((m = qtail(q)) == NULL) + return (NULL); + if ((m0 = m->m_nextpkt) != m) + m->m_nextpkt = m0->m_nextpkt; + else { + ASSERT(qlen(q) == 1); + qtail(q) = NULL; + } + qlen(q)--; + m0->m_nextpkt = NULL; + return (m0); +} + +/* drop a packet at the tail of the queue */ +mbuf_t * +_getq_tail(class_queue_t *q) +{ + mbuf_t *m, *m0, *prev; + + if ((m = m0 = qtail(q)) == NULL) + return NULL; + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) { + ASSERT(qlen(q) == 1); + qtail(q) = NULL; + } else + qtail(q) = prev; + qlen(q)--; + m->m_nextpkt = NULL; + return (m); +} + +/* randomly select a packet in the queue */ +mbuf_t * +_getq_random(class_queue_t *q) +{ + struct mbuf *m; + int i, n; + + if ((m = qtail(q)) == NULL) + return NULL; + if (m->m_nextpkt == m) { + ASSERT(qlen(q) == 1); + qtail(q) = NULL; + } else { + struct mbuf *prev = NULL; + + n = arc4random() % qlen(q) + 1; + for (i = 0; i < n; i++) { + prev = m; + m = m->m_nextpkt; + } + prev->m_nextpkt = m->m_nextpkt; + if (m == qtail(q)) + qtail(q) = prev; + } + qlen(q)--; + m->m_nextpkt = NULL; + return (m); +} + +void +_removeq(class_queue_t *q, mbuf_t *m) +{ + mbuf_t *m0, *prev; + + m0 = qtail(q); + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else if (qtail(q) == m) + qtail(q) = prev; + qlen(q)--; +} + +void +_flushq(class_queue_t *q) +{ + mbuf_t *m; + + while ((m = _getq(q)) != NULL) + m_freem(m); + ASSERT(qlen(q) == 0); +} + +#endif /* !__GNUC__ || ALTQ_DEBUG */ +#endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_rmclass.h b/contrib/altq/rtems/freebsd/altq/altq_rmclass.h new file mode 100644 index 00000000..19693173 --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_rmclass.h @@ -0,0 +1,266 @@ +/* $KAME: altq_rmclass.h,v 1.10 2003/08/20 23:30:23 itojun Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. 
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the Network Research
+ *	Group at Lawrence Berkeley Laboratory.
+ * 4. Neither the name of the University nor of the Laboratory may be used
+ *    to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _ALTQ_ALTQ_RMCLASS_HH_
+#define _ALTQ_ALTQ_RMCLASS_HH_
+
+#include
+
+/* #pragma ident "@(#)rm_class.h  1.20 97/10/23 SMI" */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	RM_MAXPRIO	8	/* Max priority */
+
+#ifdef _KERNEL
+
+typedef struct mbuf		mbuf_t;
+typedef struct rm_ifdat		rm_ifdat_t;
+typedef struct rm_class		rm_class_t;
+
+struct red;
+
+/*
+ * Macros for dealing with time values.  We assume all times are
+ * 'timevals'.  `microtime' is used to get the best available clock
+ * resolution.  If `microtime' *doesn't* return a value that's about
+ * ten times smaller than the average packet time on the fastest
+ * link that will use these routines, a slightly different clock
+ * scheme than this one should be used.
+ * (Bias due to truncation error in this scheme will overestimate utilization
+ * and discriminate against high bandwidth classes.  To remove this bias an
+ * integrator needs to be added.  The simplest integrator uses a history of
+ * 10 * avg.packet.time / min.tick.time packet completion entries.  This is
+ * straightforward to add but we don't want to pay the extra memory
+ * traffic to maintain it if it's not necessary (occasionally a vendor
+ * accidentally builds a workstation with a decent clock - e.g., Sun & HP).)
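+ * (A rough worked example of that requirement: a 1500-byte packet
+ * on a 1 Gbps link takes about 12 us to transmit, so the ~1 us
+ * granularity of `microtime' is indeed about ten times smaller;
+ * on a 10 Gbps link the same packet takes ~1.2 us and a finer
+ * clock source would be called for.)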
+ */ + +#define RM_GETTIME(now) microtime(&now) + +#define TV_LT(a, b) (((a)->tv_sec < (b)->tv_sec) || \ + (((a)->tv_usec < (b)->tv_usec) && ((a)->tv_sec <= (b)->tv_sec))) + +#define TV_DELTA(a, b, delta) { \ + register int xxs; \ + \ + delta = (a)->tv_usec - (b)->tv_usec; \ + if ((xxs = (a)->tv_sec - (b)->tv_sec)) { \ + switch (xxs) { \ + default: \ + /* if (xxs < 0) \ + printf("rm_class: bogus time values\n"); */ \ + delta = 0; \ + /* fall through */ \ + case 2: \ + delta += 1000000; \ + /* fall through */ \ + case 1: \ + delta += 1000000; \ + break; \ + } \ + } \ +} + +#define TV_ADD_DELTA(a, delta, res) { \ + register int xxus = (a)->tv_usec + (delta); \ + \ + (res)->tv_sec = (a)->tv_sec; \ + while (xxus >= 1000000) { \ + ++((res)->tv_sec); \ + xxus -= 1000000; \ + } \ + (res)->tv_usec = xxus; \ +} + +#define RM_TIMEOUT 2 /* 1 Clock tick. */ + +#if 1 +#define RM_MAXQUEUED 1 /* this isn't used in ALTQ/CBQ */ +#else +#define RM_MAXQUEUED 16 /* Max number of packets downstream of CBQ */ +#endif +#define RM_MAXQUEUE 64 /* Max queue length */ +#define RM_FILTER_GAIN 5 /* log2 of gain, e.g., 5 => 31/32 */ +#define RM_POWER (1 << RM_FILTER_GAIN) +#define RM_MAXDEPTH 32 +#define RM_NS_PER_SEC (1000000000) + +typedef struct _rm_class_stats_ { + u_int handle; + u_int depth; + + struct pktcntr xmit_cnt; /* packets sent in this class */ + struct pktcntr drop_cnt; /* dropped packets */ + u_int over; /* # times went over limit */ + u_int borrows; /* # times tried to borrow */ + u_int overactions; /* # times invoked overlimit action */ + u_int delays; /* # times invoked delay actions */ +} rm_class_stats_t; + +/* + * CBQ Class state structure + */ +struct rm_class { + class_queue_t *q_; /* Queue of packets */ + rm_ifdat_t *ifdat_; + int pri_; /* Class priority. */ + int depth_; /* Class depth */ + u_int ns_per_byte_; /* NanoSeconds per byte. */ + u_int maxrate_; /* Bytes per second for this class. */ + u_int allotment_; /* Fraction of link bandwidth. */ + u_int w_allotment_; /* Weighted allotment for WRR */ + int bytes_alloc_; /* Allocation for round of WRR */ + + int avgidle_; + int maxidle_; + int minidle_; + int offtime_; + int sleeping_; /* != 0 if delaying */ + int qthresh_; /* Queue threshold for formal link sharing */ + int leaf_; /* Note whether leaf class or not.*/ + + rm_class_t *children_; /* Children of this class */ + rm_class_t *next_; /* Next pointer, used if child */ + + rm_class_t *peer_; /* Peer class */ + rm_class_t *borrow_; /* Borrow class */ + rm_class_t *parent_; /* Parent class */ + + void (*overlimit)(struct rm_class *, struct rm_class *); + void (*drop)(struct rm_class *); /* Class drop action. */ + + struct red *red_; /* RED state pointer */ + struct altq_pktattr *pktattr_; /* saved hdr used by RED/ECN */ + int flags_; + + int last_pkttime_; /* saved pkt_time */ + struct timeval undertime_; /* time can next send */ + struct timeval last_; /* time last packet sent */ + struct timeval overtime_; + struct callout callout_; /* for timeout() calls */ + + rm_class_stats_t stats_; /* Class Statistics */ +}; + +/* + * CBQ Interface state + */ +struct rm_ifdat { + int queued_; /* # pkts queued downstream */ + int efficient_; /* Link Efficency bit */ + int wrr_; /* Enable Weighted Round-Robin */ + u_long ns_per_byte_; /* Link byte speed. */ + int maxqueued_; /* Max packets to queue */ + int maxpkt_; /* Max packet size. */ + int qi_; /* In/out pointers for downstream */ + int qo_; /* packets */ + + /* + * Active class state and WRR state. 
+ */ + rm_class_t *active_[RM_MAXPRIO]; /* Active cl's in each pri */ + int na_[RM_MAXPRIO]; /* # of active cl's in a pri */ + int num_[RM_MAXPRIO]; /* # of cl's per pri */ + int alloc_[RM_MAXPRIO]; /* Byte Allocation */ + u_long M_[RM_MAXPRIO]; /* WRR weights. */ + + /* + * Network Interface/Solaris Queue state pointer. + */ + struct ifaltq *ifq_; + rm_class_t *default_; /* Default Pkt class, BE */ + rm_class_t *root_; /* Root Link class. */ + rm_class_t *ctl_; /* Control Traffic class. */ + void (*restart)(struct ifaltq *); /* Restart routine. */ + + /* + * Current packet downstream packet state and dynamic state. + */ + rm_class_t *borrowed_[RM_MAXQUEUED]; /* Class borrowed last */ + rm_class_t *class_[RM_MAXQUEUED]; /* class sending */ + int curlen_[RM_MAXQUEUED]; /* Current pktlen */ + struct timeval now_[RM_MAXQUEUED]; /* Current packet time. */ + int is_overlimit_[RM_MAXQUEUED];/* Current packet time. */ + + int cutoff_; /* Cut-off depth for borrowing */ + + struct timeval ifnow_; /* expected xmit completion time */ +#if 1 /* ALTQ4PPP */ + int maxiftime_; /* max delay inside interface */ +#endif + rm_class_t *pollcache_; /* cached rm_class by poll operation */ +}; + +/* flags for rmc_init and rmc_newclass */ +/* class flags */ +#define RMCF_RED 0x0001 +#define RMCF_ECN 0x0002 +#define RMCF_RIO 0x0004 +#define RMCF_FLOWVALVE 0x0008 /* use flowvalve (aka penalty-box) */ +#define RMCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ + +/* flags for rmc_init */ +#define RMCF_WRR 0x0100 +#define RMCF_EFFICIENT 0x0200 + +#define is_a_parent_class(cl) ((cl)->children_ != NULL) + +extern rm_class_t *rmc_newclass(int, struct rm_ifdat *, u_int, + void (*)(struct rm_class *, struct rm_class *), + int, struct rm_class *, struct rm_class *, + u_int, int, u_int, int, int); +extern void rmc_delete_class(struct rm_ifdat *, struct rm_class *); +extern int rmc_modclass(struct rm_class *, u_int, int, + u_int, int, u_int, int); +extern void rmc_init(struct ifaltq *, struct rm_ifdat *, u_int, + void (*)(struct ifaltq *), + int, int, u_int, int, u_int, int); +extern int rmc_queue_packet(struct rm_class *, mbuf_t *); +extern mbuf_t *rmc_dequeue_next(struct rm_ifdat *, int); +extern void rmc_update_class_util(struct rm_ifdat *); +extern void rmc_delay_action(struct rm_class *, struct rm_class *); +extern void rmc_dropall(struct rm_class *); +extern int rmc_get_weight(struct rm_ifdat *, int); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_RMCLASS_HH_ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_rmclass_debug.h b/contrib/altq/rtems/freebsd/altq/altq_rmclass_debug.h new file mode 100644 index 00000000..6723a4b7 --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_rmclass_debug.h @@ -0,0 +1,112 @@ +/* $KAME: altq_rmclass_debug.h,v 1.3 2002/11/29 04:36:24 kjc Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. + */ + +#ifndef _ALTQ_ALTQ_RMCLASS_DEBUG_HH_ +#define _ALTQ_ALTQ_RMCLASS_DEBUG_HH_ + +/* #pragma ident "@(#)rm_class_debug.h 1.7 98/05/04 SMI" */ + +/* + * Cbq debugging macros + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef CBQ_TRACE +#ifndef NCBQTRACE +#define NCBQTRACE (16 * 1024) +#endif + +/* + * To view the trace output, using adb, type: + * adb -k /dev/ksyms /dev/mem , then type + * cbqtrace_count/D to get the count, then type + * cbqtrace_buffer,0tcount/Dp4C" "Xn + * This will dump the trace buffer from 0 to count. + */ +/* + * in ALTQ, "call cbqtrace_dump(N)" from DDB to display 20 events + * from Nth event in the circular buffer. + */ + +struct cbqtrace { + int count; + int function; /* address of function */ + int trace_action; /* descriptive 4 characters */ + int object; /* object operated on */ +}; + +extern struct cbqtrace cbqtrace_buffer[]; +extern struct cbqtrace *cbqtrace_ptr; +extern int cbqtrace_count; + +#define CBQTRACEINIT() { \ + if (cbqtrace_ptr == NULL) \ + cbqtrace_ptr = cbqtrace_buffer; \ + else { \ + cbqtrace_ptr = cbqtrace_buffer; \ + bzero((void *)cbqtrace_ptr, sizeof(cbqtrace_buffer)); \ + cbqtrace_count = 0; \ + } \ +} + +#define LOCK_TRACE() splimp() +#define UNLOCK_TRACE(x) splx(x) + +#define CBQTRACE(func, act, obj) { \ + int __s = LOCK_TRACE(); \ + int *_p = &cbqtrace_ptr->count; \ + *_p++ = ++cbqtrace_count; \ + *_p++ = (int)(func); \ + *_p++ = (int)(act); \ + *_p++ = (int)(obj); \ + if ((struct cbqtrace *)(void *)_p >= &cbqtrace_buffer[NCBQTRACE])\ + cbqtrace_ptr = cbqtrace_buffer; \ + else \ + cbqtrace_ptr = (struct cbqtrace *)(void *)_p; \ + UNLOCK_TRACE(__s); \ + } +#else + +/* If no tracing, define no-ops */ +#define CBQTRACEINIT() +#define CBQTRACE(a, b, c) + +#endif /* !CBQ_TRACE */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_RMCLASS_DEBUG_HH_ */ diff --git a/contrib/altq/rtems/freebsd/altq/altq_subr.c b/contrib/altq/rtems/freebsd/altq/altq_subr.c new file mode 100644 index 00000000..ab6adc7f --- /dev/null +++ b/contrib/altq/rtems/freebsd/altq/altq_subr.c @@ -0,0 +1,2032 @@ +#include + +/* $FreeBSD$ */ +/* $KAME: altq_subr.c,v 1.21 2003/11/06 06:32:53 kjc Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include +#include +#ifdef __FreeBSD__ +#include +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#ifdef __FreeBSD__ +#include +#endif + +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#include + +#include +#include +#ifdef ALTQ3_COMPAT +#include +#endif + +/* machine dependent clock related includes */ +#ifdef __FreeBSD__ +#if __FreeBSD__ < 3 +#include /* for FreeBSD-2.2.8 to get i586_ctr_freq */ +#endif +#include +#include +#include +#include +#endif +#if defined(__i386__) +#include /* for pentium tsc */ +#include /* for CPUID_TSC */ +#ifdef __FreeBSD__ +#include /* for cpu_feature */ +#elif defined(__NetBSD__) || defined(__OpenBSD__) +#include /* for cpu_feature */ +#endif +#endif /* __i386__ */ + +/* + * internal function prototypes + */ +static void tbr_timeout(void *); +int (*altq_input)(struct mbuf *, int) = NULL; +static struct mbuf *tbr_dequeue(struct ifaltq *, int); +static int tbr_timer = 0; /* token bucket regulator timer */ +#if !defined(__FreeBSD__) || (__FreeBSD_version < 600000) +static struct callout tbr_callout = CALLOUT_INITIALIZER; +#else +static struct callout tbr_callout; +#endif + +#ifdef ALTQ3_CLFIER_COMPAT +static int extract_ports4(struct mbuf *, struct ip *, struct flowinfo_in *); +#ifdef INET6 +static int extract_ports6(struct mbuf *, struct ip6_hdr *, + struct flowinfo_in6 *); +#endif +static int apply_filter4(u_int32_t, struct flow_filter *, + struct flowinfo_in *); +static int apply_ppfilter4(u_int32_t, struct flow_filter *, + struct flowinfo_in *); +#ifdef INET6 +static int apply_filter6(u_int32_t, struct flow_filter6 *, + struct flowinfo_in6 *); +#endif +static int apply_tosfilter4(u_int32_t, struct flow_filter *, + struct flowinfo_in *); +static u_long get_filt_handle(struct acc_classifier *, int); +static struct acc_filter *filth_to_filtp(struct acc_classifier *, u_long); +static u_int32_t filt2fibmask(struct flow_filter *); + +static void ip4f_cache(struct ip *, struct flowinfo_in *); +static int ip4f_lookup(struct ip *, struct flowinfo_in *); +static int ip4f_init(void); +static struct ip4_frag *ip4f_alloc(void); +static void ip4f_free(struct ip4_frag *); +#endif /* ALTQ3_CLFIER_COMPAT */ + +/* + * alternate queueing support routines + */ + +/* look up the queue state by the interface name and the queueing type. 
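+ * An illustrative use (the interface name is hypothetical):
+ *
+ *	cbq_state_t *cbqp = altq_lookup("em0", ALTQT_CBQ);
+ *
+ * returns the CBQ discipline state attached to em0's if_snd, or
+ * NULL if no such interface exists or a different discipline is
+ * attached.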
*/ +void * +altq_lookup(name, type) + char *name; + int type; +{ + struct ifnet *ifp; + + if ((ifp = ifunit(name)) != NULL) { + /* read if_snd unlocked */ + if (type != ALTQT_NONE && ifp->if_snd.altq_type == type) + return (ifp->if_snd.altq_disc); + } + + return NULL; +} + +int +altq_attach(ifq, type, discipline, enqueue, dequeue, request, clfier, classify) + struct ifaltq *ifq; + int type; + void *discipline; + int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *); + struct mbuf *(*dequeue)(struct ifaltq *, int); + int (*request)(struct ifaltq *, int, void *); + void *clfier; + void *(*classify)(void *, struct mbuf *, int); +{ + IFQ_LOCK(ifq); + if (!ALTQ_IS_READY(ifq)) { + IFQ_UNLOCK(ifq); + return ENXIO; + } + +#ifdef ALTQ3_COMPAT + /* + * pfaltq can override the existing discipline, but altq3 cannot. + * check these if clfier is not NULL (which implies altq3). + */ + if (clfier != NULL) { + if (ALTQ_IS_ENABLED(ifq)) { + IFQ_UNLOCK(ifq); + return EBUSY; + } + if (ALTQ_IS_ATTACHED(ifq)) { + IFQ_UNLOCK(ifq); + return EEXIST; + } + } +#endif + ifq->altq_type = type; + ifq->altq_disc = discipline; + ifq->altq_enqueue = enqueue; + ifq->altq_dequeue = dequeue; + ifq->altq_request = request; + ifq->altq_clfier = clfier; + ifq->altq_classify = classify; + ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED); +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_KLD + altq_module_incref(type); +#endif +#endif + IFQ_UNLOCK(ifq); + return 0; +} + +int +altq_detach(ifq) + struct ifaltq *ifq; +{ + IFQ_LOCK(ifq); + + if (!ALTQ_IS_READY(ifq)) { + IFQ_UNLOCK(ifq); + return ENXIO; + } + if (ALTQ_IS_ENABLED(ifq)) { + IFQ_UNLOCK(ifq); + return EBUSY; + } + if (!ALTQ_IS_ATTACHED(ifq)) { + IFQ_UNLOCK(ifq); + return (0); + } +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_KLD + altq_module_declref(ifq->altq_type); +#endif +#endif + + ifq->altq_type = ALTQT_NONE; + ifq->altq_disc = NULL; + ifq->altq_enqueue = NULL; + ifq->altq_dequeue = NULL; + ifq->altq_request = NULL; + ifq->altq_clfier = NULL; + ifq->altq_classify = NULL; + ifq->altq_flags &= ALTQF_CANTCHANGE; + + IFQ_UNLOCK(ifq); + return 0; +} + +int +altq_enable(ifq) + struct ifaltq *ifq; +{ + int s; + + IFQ_LOCK(ifq); + + if (!ALTQ_IS_READY(ifq)) { + IFQ_UNLOCK(ifq); + return ENXIO; + } + if (ALTQ_IS_ENABLED(ifq)) { + IFQ_UNLOCK(ifq); + return 0; + } + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + IFQ_PURGE_NOLOCK(ifq); + ASSERT(ifq->ifq_len == 0); + ifq->ifq_drv_maxlen = 0; /* disable bulk dequeue */ + ifq->altq_flags |= ALTQF_ENABLED; + if (ifq->altq_clfier != NULL) + ifq->altq_flags |= ALTQF_CLASSIFY; + splx(s); + + IFQ_UNLOCK(ifq); + return 0; +} + +int +altq_disable(ifq) + struct ifaltq *ifq; +{ + int s; + + IFQ_LOCK(ifq); + if (!ALTQ_IS_ENABLED(ifq)) { + IFQ_UNLOCK(ifq); + return 0; + } + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + IFQ_PURGE_NOLOCK(ifq); + ASSERT(ifq->ifq_len == 0); + ifq->altq_flags &= ~(ALTQF_ENABLED|ALTQF_CLASSIFY); + splx(s); + + IFQ_UNLOCK(ifq); + return 0; +} + +#ifdef ALTQ_DEBUG +void +altq_assert(file, line, failedexpr) + const char *file, *failedexpr; + int line; +{ + (void)printf("altq assertion \"%s\" failed: file \"%s\", line %d\n", + failedexpr, file, line); + panic("altq assertion"); + /* NOTREACHED */ +} +#endif + +/* + * internal representation of token bucket parameters + * rate: byte_per_unittime << 32 + * (((bits_per_sec) / 8) << 32) / machclk_freq + * depth: byte << 32 + * + */ +#define TBR_SHIFT 32 +#define TBR_SCALE(x) ((int64_t)(x) << TBR_SHIFT) +#define TBR_UNSCALE(x) ((x) >> 
TBR_SHIFT) + +static struct mbuf * +tbr_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + struct tb_regulator *tbr; + struct mbuf *m; + int64_t interval; + u_int64_t now; + + IFQ_LOCK_ASSERT(ifq); + tbr = ifq->altq_tbr; + if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) { + /* if this is a remove after poll, bypass tbr check */ + } else { + /* update token only when it is negative */ + if (tbr->tbr_token <= 0) { + now = read_machclk(); + interval = now - tbr->tbr_last; + if (interval >= tbr->tbr_filluptime) + tbr->tbr_token = tbr->tbr_depth; + else { + tbr->tbr_token += interval * tbr->tbr_rate; + if (tbr->tbr_token > tbr->tbr_depth) + tbr->tbr_token = tbr->tbr_depth; + } + tbr->tbr_last = now; + } + /* if token is still negative, don't allow dequeue */ + if (tbr->tbr_token <= 0) + return (NULL); + } + + if (ALTQ_IS_ENABLED(ifq)) + m = (*ifq->altq_dequeue)(ifq, op); + else { + if (op == ALTDQ_POLL) + _IF_POLL(ifq, m); + else + _IF_DEQUEUE(ifq, m); + } + + if (m != NULL && op == ALTDQ_REMOVE) + tbr->tbr_token -= TBR_SCALE(m_pktlen(m)); + tbr->tbr_lastop = op; + return (m); +} + +/* + * set a token bucket regulator. + * if the specified rate is zero, the token bucket regulator is deleted. + */ +int +tbr_set(ifq, profile) + struct ifaltq *ifq; + struct tb_profile *profile; +{ + struct tb_regulator *tbr, *otbr; + + if (tbr_dequeue_ptr == NULL) + tbr_dequeue_ptr = tbr_dequeue; + + if (machclk_freq == 0) + init_machclk(); + if (machclk_freq == 0) { + printf("tbr_set: no cpu clock available!\n"); + return (ENXIO); + } + + IFQ_LOCK(ifq); + if (profile->rate == 0) { + /* delete this tbr */ + if ((tbr = ifq->altq_tbr) == NULL) { + IFQ_UNLOCK(ifq); + return (ENOENT); + } + ifq->altq_tbr = NULL; + free(tbr, M_DEVBUF); + IFQ_UNLOCK(ifq); + return (0); + } + + IFQ_UNLOCK(ifq); + tbr = malloc(sizeof(struct tb_regulator), + M_DEVBUF, M_WAITOK); + if (tbr == NULL) { /* can not happen */ + IFQ_UNLOCK(ifq); + return (ENOMEM); + } + bzero(tbr, sizeof(struct tb_regulator)); + + tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq; + tbr->tbr_depth = TBR_SCALE(profile->depth); + if (tbr->tbr_rate > 0) + tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate; + else + tbr->tbr_filluptime = 0xffffffffffffffffLL; + tbr->tbr_token = tbr->tbr_depth; + tbr->tbr_last = read_machclk(); + tbr->tbr_lastop = ALTDQ_REMOVE; + + IFQ_LOCK(ifq); + otbr = ifq->altq_tbr; + ifq->altq_tbr = tbr; /* set the new tbr */ + + if (otbr != NULL) + free(otbr, M_DEVBUF); + else { + if (tbr_timer == 0) { + CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); + tbr_timer = 1; + } + } + IFQ_UNLOCK(ifq); + return (0); +} + +/* + * tbr_timeout goes through the interface list, and kicks the drivers + * if necessary. 
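+ *
+ * (A worked example of the fixed-point parameters set up in
+ * tbr_set() above, with illustrative numbers: for profile->rate =
+ * 10 Mbps and machclk_freq = 1 GHz,
+ *
+ *	tbr_rate = TBR_SCALE(10000000 / 8) / 10^9 ~= 5368709,
+ *
+ * i.e. about 0.00125 bytes per machine-clock tick once unscaled
+ * by TBR_SHIFT -- 1.25 bytes/usec, which is 10 Mbps again.)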
+ * + * MPSAFE + */ +static void +tbr_timeout(arg) + void *arg; +{ +#if defined(__FreeBSD__) + VNET_ITERATOR_DECL(vnet_iter); +#endif + struct ifnet *ifp; + int active, s; + + active = 0; +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif +#if defined(__FreeBSD__) && (__FreeBSD_version >= 500000) + IFNET_RLOCK_NOSLEEP(); + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); +#endif + for (ifp = TAILQ_FIRST(&V_ifnet); ifp; + ifp = TAILQ_NEXT(ifp, if_list)) { + /* read from if_snd unlocked */ + if (!TBR_IS_ENABLED(&ifp->if_snd)) + continue; + active++; + if (!IFQ_IS_EMPTY(&ifp->if_snd) && + ifp->if_start != NULL) + (*ifp->if_start)(ifp); + } +#if defined(__FreeBSD__) && (__FreeBSD_version >= 500000) + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); + IFNET_RUNLOCK_NOSLEEP(); +#endif + splx(s); + if (active > 0) + CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); + else + tbr_timer = 0; /* don't need tbr_timer anymore */ +#if defined(__alpha__) && !defined(ALTQ_NOPCC) + { + /* + * XXX read out the machine dependent clock once a second + * to detect counter wrap-around. + */ + static u_int cnt; + + if (++cnt >= hz) { + (void)read_machclk(); + cnt = 0; + } + } +#endif /* __alpha__ && !ALTQ_NOPCC */ +} + +/* + * get token bucket regulator profile + */ +int +tbr_get(ifq, profile) + struct ifaltq *ifq; + struct tb_profile *profile; +{ + struct tb_regulator *tbr; + + IFQ_LOCK(ifq); + if ((tbr = ifq->altq_tbr) == NULL) { + profile->rate = 0; + profile->depth = 0; + } else { + profile->rate = + (u_int)TBR_UNSCALE(tbr->tbr_rate * 8 * machclk_freq); + profile->depth = (u_int)TBR_UNSCALE(tbr->tbr_depth); + } + IFQ_UNLOCK(ifq); + return (0); +} + +/* + * attach a discipline to the interface. if one already exists, it is + * overridden. + * Locking is done in the discipline specific attach functions. Basically + * they call back to altq_attach which takes care of the attach and locking. + */ +int +altq_pfattach(struct pf_altq *a) +{ + int error = 0; + + switch (a->scheduler) { + case ALTQT_NONE: + break; +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_pfattach(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_pfattach(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_pfattach(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * detach a discipline from the interface. + * it is possible that the discipline was already overridden by another + * discipline. + */ +int +altq_pfdetach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error = 0; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + + /* if this discipline is no longer referenced, just return */ + /* read unlocked from if_snd */ + if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc) + return (0); + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + /* read unlocked from if_snd, _disable and _detach take care */ + if (ALTQ_IS_ENABLED(&ifp->if_snd)) + error = altq_disable(&ifp->if_snd); + if (error == 0) + error = altq_detach(&ifp->if_snd); + splx(s); + + return (error); +} + +/* + * add a discipline or a queue + * Locking is done in the discipline specific functions with regards to + * malloc with WAITOK, also it is not yet clear which lock to use. 
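+ *
+ * (Typical caller, for illustration: pf(4)'s DIOCADDALTQ ioctl
+ * hands each loaded altq/queue rule to altq_add(); entries that
+ * name a queue (a->qname[0] != 0) are dispatched on to
+ * altq_add_queue() below.)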
+ */ +int +altq_add(struct pf_altq *a) +{ + int error = 0; + + if (a->qname[0] != 0) + return (altq_add_queue(a)); + + if (machclk_freq == 0) + init_machclk(); + if (machclk_freq == 0) + panic("altq_add: no cpu clock"); + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_add_altq(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_add_altq(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_add_altq(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * remove a discipline or a queue + * It is yet unclear what lock to use to protect this operation, the + * discipline specific functions will determine and grab it + */ +int +altq_remove(struct pf_altq *a) +{ + int error = 0; + + if (a->qname[0] != 0) + return (altq_remove_queue(a)); + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_remove_altq(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_remove_altq(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_remove_altq(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * add a queue to the discipline + * It is yet unclear what lock to use to protect this operation, the + * discipline specific functions will determine and grab it + */ +int +altq_add_queue(struct pf_altq *a) +{ + int error = 0; + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_add_queue(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_add_queue(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_add_queue(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * remove a queue from the discipline + * It is yet unclear what lock to use to protect this operation, the + * discipline specific functions will determine and grab it + */ +int +altq_remove_queue(struct pf_altq *a) +{ + int error = 0; + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_remove_queue(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_remove_queue(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_remove_queue(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * get queue statistics + * Locking is done in the discipline specific functions with regards to + * copyout operations, also it is not yet clear which lock to use. 
+ */ +int +altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + int error = 0; + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_getqstats(a, ubuf, nbytes); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_getqstats(a, ubuf, nbytes); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_getqstats(a, ubuf, nbytes); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * read and write diffserv field in IPv4 or IPv6 header + */ +u_int8_t +read_dsfield(m, pktattr) + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + struct mbuf *m0; + u_int8_t ds_field = 0; + + if (pktattr == NULL || + (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) + return ((u_int8_t)0); + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if ((pktattr->pattr_hdr >= m0->m_data) && + (pktattr->pattr_hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, pattr_hdr is stale */ + pktattr->pattr_af = AF_UNSPEC; +#ifdef ALTQ_DEBUG + printf("read_dsfield: can't locate header!\n"); +#endif + return ((u_int8_t)0); + } + + if (pktattr->pattr_af == AF_INET) { + struct ip *ip = (struct ip *)pktattr->pattr_hdr; + + if (ip->ip_v != 4) + return ((u_int8_t)0); /* version mismatch! */ + ds_field = ip->ip_tos; + } +#ifdef INET6 + else if (pktattr->pattr_af == AF_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return ((u_int8_t)0); /* version mismatch! */ + ds_field = (flowlabel >> 20) & 0xff; + } +#endif + return (ds_field); +} + +void +write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, u_int8_t dsfield) +{ + struct mbuf *m0; + + if (pktattr == NULL || + (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) + return; + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if ((pktattr->pattr_hdr >= m0->m_data) && + (pktattr->pattr_hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, pattr_hdr is stale */ + pktattr->pattr_af = AF_UNSPEC; +#ifdef ALTQ_DEBUG + printf("write_dsfield: can't locate header!\n"); +#endif + return; + } + + if (pktattr->pattr_af == AF_INET) { + struct ip *ip = (struct ip *)pktattr->pattr_hdr; + u_int8_t old; + int32_t sum; + + if (ip->ip_v != 4) + return; /* version mismatch! */ + old = ip->ip_tos; + dsfield |= old & 3; /* leave CU bits */ + if (old == dsfield) + return; + ip->ip_tos = dsfield; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += 0xff00 + (~old & 0xff) + dsfield; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + + ip->ip_sum = htons(~sum & 0xffff); + } +#ifdef INET6 + else if (pktattr->pattr_af == AF_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return; /* version mismatch! */ + flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20); + ip6->ip6_flow = htonl(flowlabel); + } +#endif + return; +} + + +/* + * high resolution clock support taking advantage of a machine dependent + * high resolution time counter (e.g., timestamp counter of intel pentium). 
+ * we assume + * - 64-bit-long monotonically-increasing counter + * - frequency range is 100M-4GHz (CPU speed) + */ +/* if pcc is not available or disabled, emulate 256MHz using microtime() */ +#define MACHCLK_SHIFT 8 + +int machclk_usepcc; +u_int32_t machclk_freq; +u_int32_t machclk_per_tick; + +#ifdef __alpha__ +#ifdef __FreeBSD__ +extern u_int32_t cycles_per_sec; /* alpha cpu clock frequency */ +#elif defined(__NetBSD__) || defined(__OpenBSD__) +extern u_int64_t cycles_per_usec; /* alpha cpu clock frequency */ +#endif +#endif /* __alpha__ */ +#if defined(__i386__) && defined(__NetBSD__) +extern u_int64_t cpu_tsc_freq; +#endif /* __alpha__ */ + +#if (__FreeBSD_version >= 700035) +/* Update TSC freq with the value indicated by the caller. */ +static void +tsc_freq_changed(void *arg, const struct cf_level *level, int status) +{ + /* If there was an error during the transition, don't do anything. */ + if (status != 0) + return; + +#if (__FreeBSD_version >= 701102) && (defined(__amd64__) || defined(__i386__)) + /* If TSC is P-state invariant, don't do anything. */ + if (tsc_is_invariant) + return; +#endif + + /* Total setting for this level gives the new frequency in MHz. */ + init_machclk(); +} +EVENTHANDLER_DEFINE(cpufreq_post_change, tsc_freq_changed, NULL, + EVENTHANDLER_PRI_LAST); +#endif /* __FreeBSD_version >= 700035 */ + +static void +init_machclk_setup(void) +{ +#if (__FreeBSD_version >= 600000) + callout_init(&tbr_callout, 0); +#endif + + machclk_usepcc = 1; + +#if (!defined(__i386__) && !defined(__alpha__)) || defined(ALTQ_NOPCC) + machclk_usepcc = 0; +#endif +#if defined(__FreeBSD__) && defined(SMP) + machclk_usepcc = 0; +#endif +#if defined(__NetBSD__) && defined(MULTIPROCESSOR) + machclk_usepcc = 0; +#endif +#ifdef __i386__ + /* check if TSC is available */ + if (machclk_usepcc == 1 && ((cpu_feature & CPUID_TSC) == 0 || + tsc_is_broken)) + machclk_usepcc = 0; +#endif +} + +void +init_machclk(void) +{ + static int called; + + /* Call one-time initialization function. */ + if (!called) { + init_machclk_setup(); + called = 1; + } + + if (machclk_usepcc == 0) { + /* emulate 256MHz using microtime() */ + machclk_freq = 1000000 << MACHCLK_SHIFT; + machclk_per_tick = machclk_freq / hz; +#ifdef ALTQ_DEBUG + printf("altq: emulate %uHz cpu clock\n", machclk_freq); +#endif + return; + } + + /* + * if the clock frequency (of Pentium TSC or Alpha PCC) is + * accessible, just use it. + */ +#ifdef __i386__ +#ifdef __FreeBSD__ +#if (__FreeBSD_version > 300000) + machclk_freq = tsc_freq; +#else + machclk_freq = i586_ctr_freq; +#endif +#elif defined(__NetBSD__) + machclk_freq = (u_int32_t)cpu_tsc_freq; +#elif defined(__OpenBSD__) && (defined(I586_CPU) || defined(I686_CPU)) + machclk_freq = pentium_mhz * 1000000; +#endif +#elif defined(__alpha__) +#ifdef __FreeBSD__ + machclk_freq = cycles_per_sec; +#elif defined(__NetBSD__) || defined(__OpenBSD__) + machclk_freq = (u_int32_t)(cycles_per_usec * 1000000); +#endif +#endif /* __alpha__ */ + + /* + * if we don't know the clock frequency, measure it. 
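+ * (Illustration: the fallback below sleeps for about one second
+ * and samples the counter around the sleep; if end - start =
+ * 2.4e9 over a measured 1.0e6 usec, machclk_freq comes out as
+ * ~2.4 GHz.  The microtime() emulation path instead reports a
+ * synthetic 1000000 << 8 = 256 MHz.)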
+ */ + if (machclk_freq == 0) { + static int wait; + struct timeval tv_start, tv_end; + u_int64_t start, end, diff; + int timo; + + microtime(&tv_start); + start = read_machclk(); + timo = hz; /* 1 sec */ + (void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo); + microtime(&tv_end); + end = read_machclk(); + diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000 + + tv_end.tv_usec - tv_start.tv_usec; + if (diff != 0) + machclk_freq = (u_int)((end - start) * 1000000 / diff); + } + + machclk_per_tick = machclk_freq / hz; + +#ifdef ALTQ_DEBUG + printf("altq: CPU clock: %uHz\n", machclk_freq); +#endif +} + +#if defined(__OpenBSD__) && defined(__i386__) +static __inline u_int64_t +rdtsc(void) +{ + u_int64_t rv; + __asm __volatile(".byte 0x0f, 0x31" : "=A" (rv)); + return (rv); +} +#endif /* __OpenBSD__ && __i386__ */ + +u_int64_t +read_machclk(void) +{ + u_int64_t val; + + if (machclk_usepcc) { +#if defined(__i386__) + val = rdtsc(); +#elif defined(__alpha__) + static u_int32_t last_pcc, upper; + u_int32_t pcc; + + /* + * for alpha, make a 64bit counter value out of the 32bit + * alpha processor cycle counter. + * read_machclk must be called within a half of its + * wrap-around cycle (about 5 sec for 400MHz cpu) to properly + * detect a counter wrap-around. + * tbr_timeout calls read_machclk once a second. + */ + pcc = (u_int32_t)alpha_rpcc(); + if (pcc <= last_pcc) + upper++; + last_pcc = pcc; + val = ((u_int64_t)upper << 32) + pcc; +#else + panic("read_machclk"); +#endif + } else { + struct timeval tv; + + microtime(&tv); + val = (((u_int64_t)(tv.tv_sec - boottime.tv_sec) * 1000000 + + tv.tv_usec) << MACHCLK_SHIFT); + } + return (val); +} + +#ifdef ALTQ3_CLFIER_COMPAT + +#ifndef IPPROTO_ESP +#define IPPROTO_ESP 50 /* encapsulating security payload */ +#endif +#ifndef IPPROTO_AH +#define IPPROTO_AH 51 /* authentication header */ +#endif + +/* + * extract flow information from a given packet. + * filt_mask shows flowinfo fields required. + * we assume the ip header is in one mbuf, and addresses and ports are + * in network byte order. + */ +int +altq_extractflow(m, af, flow, filt_bmask) + struct mbuf *m; + int af; + struct flowinfo *flow; + u_int32_t filt_bmask; +{ + + switch (af) { + case PF_INET: { + struct flowinfo_in *fin; + struct ip *ip; + + ip = mtod(m, struct ip *); + + if (ip->ip_v != 4) + break; + + fin = (struct flowinfo_in *)flow; + fin->fi_len = sizeof(struct flowinfo_in); + fin->fi_family = AF_INET; + + fin->fi_proto = ip->ip_p; + fin->fi_tos = ip->ip_tos; + + fin->fi_src.s_addr = ip->ip_src.s_addr; + fin->fi_dst.s_addr = ip->ip_dst.s_addr; + + if (filt_bmask & FIMB4_PORTS) + /* if port info is required, extract port numbers */ + extract_ports4(m, ip, fin); + else { + fin->fi_sport = 0; + fin->fi_dport = 0; + fin->fi_gpi = 0; + } + return (1); + } + +#ifdef INET6 + case PF_INET6: { + struct flowinfo_in6 *fin6; + struct ip6_hdr *ip6; + + ip6 = mtod(m, struct ip6_hdr *); + /* should we check the ip version? */ + + fin6 = (struct flowinfo_in6 *)flow; + fin6->fi6_len = sizeof(struct flowinfo_in6); + fin6->fi6_family = AF_INET6; + + fin6->fi6_proto = ip6->ip6_nxt; + fin6->fi6_tclass = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + + fin6->fi6_flowlabel = ip6->ip6_flow & htonl(0x000fffff); + fin6->fi6_src = ip6->ip6_src; + fin6->fi6_dst = ip6->ip6_dst; + + if ((filt_bmask & FIMB6_PORTS) || + ((filt_bmask & FIMB6_PROTO) + && ip6->ip6_nxt > IPPROTO_IPV6)) + /* + * if port info is required, or proto is required + * but there are option headers, extract port + * and protocol numbers. 
+ */ + extract_ports6(m, ip6, fin6); + else { + fin6->fi6_sport = 0; + fin6->fi6_dport = 0; + fin6->fi6_gpi = 0; + } + return (1); + } +#endif /* INET6 */ + + default: + break; + } + + /* failed */ + flow->fi_len = sizeof(struct flowinfo); + flow->fi_family = AF_UNSPEC; + return (0); +} + +/* + * helper routine to extract port numbers + */ +/* structure for ipsec and ipv6 option header template */ +struct _opt6 { + u_int8_t opt6_nxt; /* next header */ + u_int8_t opt6_hlen; /* header extension length */ + u_int16_t _pad; + u_int32_t ah_spi; /* security parameter index + for authentication header */ +}; + +/* + * extract port numbers from a ipv4 packet. + */ +static int +extract_ports4(m, ip, fin) + struct mbuf *m; + struct ip *ip; + struct flowinfo_in *fin; +{ + struct mbuf *m0; + u_short ip_off; + u_int8_t proto; + int off; + + fin->fi_sport = 0; + fin->fi_dport = 0; + fin->fi_gpi = 0; + + ip_off = ntohs(ip->ip_off); + /* if it is a fragment, try cached fragment info */ + if (ip_off & IP_OFFMASK) { + ip4f_lookup(ip, fin); + return (1); + } + + /* locate the mbuf containing the protocol header */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)ip >= m0->m_data) && + ((caddr_t)ip < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { +#ifdef ALTQ_DEBUG + printf("extract_ports4: can't locate header! ip=%p\n", ip); +#endif + return (0); + } + off = ((caddr_t)ip - m0->m_data) + (ip->ip_hl << 2); + proto = ip->ip_p; + +#ifdef ALTQ_IPSEC + again: +#endif + while (off >= m0->m_len) { + off -= m0->m_len; + m0 = m0->m_next; + if (m0 == NULL) + return (0); /* bogus ip_hl! */ + } + if (m0->m_len < off + 4) + return (0); + + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: { + struct udphdr *udp; + + udp = (struct udphdr *)(mtod(m0, caddr_t) + off); + fin->fi_sport = udp->uh_sport; + fin->fi_dport = udp->uh_dport; + fin->fi_proto = proto; + } + break; + +#ifdef ALTQ_IPSEC + case IPPROTO_ESP: + if (fin->fi_gpi == 0){ + u_int32_t *gpi; + + gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); + fin->fi_gpi = *gpi; + } + fin->fi_proto = proto; + break; + + case IPPROTO_AH: { + /* get next header and header length */ + struct _opt6 *opt6; + + opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); + proto = opt6->opt6_nxt; + off += 8 + (opt6->opt6_hlen * 4); + if (fin->fi_gpi == 0 && m0->m_len >= off + 8) + fin->fi_gpi = opt6->ah_spi; + } + /* goto the next header */ + goto again; +#endif /* ALTQ_IPSEC */ + + default: + fin->fi_proto = proto; + return (0); + } + + /* if this is a first fragment, cache it. */ + if (ip_off & IP_MF) + ip4f_cache(ip, fin); + + return (1); +} + +#ifdef INET6 +static int +extract_ports6(m, ip6, fin6) + struct mbuf *m; + struct ip6_hdr *ip6; + struct flowinfo_in6 *fin6; +{ + struct mbuf *m0; + int off; + u_int8_t proto; + + fin6->fi6_gpi = 0; + fin6->fi6_sport = 0; + fin6->fi6_dport = 0; + + /* locate the mbuf containing the protocol header */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)ip6 >= m0->m_data) && + ((caddr_t)ip6 < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { +#ifdef ALTQ_DEBUG + printf("extract_ports6: can't locate header! 
ip6=%p\n", ip6); +#endif + return (0); + } + off = ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr); + + proto = ip6->ip6_nxt; + do { + while (off >= m0->m_len) { + off -= m0->m_len; + m0 = m0->m_next; + if (m0 == NULL) + return (0); + } + if (m0->m_len < off + 4) + return (0); + + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: { + struct udphdr *udp; + + udp = (struct udphdr *)(mtod(m0, caddr_t) + off); + fin6->fi6_sport = udp->uh_sport; + fin6->fi6_dport = udp->uh_dport; + fin6->fi6_proto = proto; + } + return (1); + + case IPPROTO_ESP: + if (fin6->fi6_gpi == 0) { + u_int32_t *gpi; + + gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); + fin6->fi6_gpi = *gpi; + } + fin6->fi6_proto = proto; + return (1); + + case IPPROTO_AH: { + /* get next header and header length */ + struct _opt6 *opt6; + + opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); + if (fin6->fi6_gpi == 0 && m0->m_len >= off + 8) + fin6->fi6_gpi = opt6->ah_spi; + proto = opt6->opt6_nxt; + off += 8 + (opt6->opt6_hlen * 4); + /* goto the next header */ + break; + } + + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: { + /* get next header and header length */ + struct _opt6 *opt6; + + opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); + proto = opt6->opt6_nxt; + off += (opt6->opt6_hlen + 1) * 8; + /* goto the next header */ + break; + } + + case IPPROTO_FRAGMENT: + /* ipv6 fragmentations are not supported yet */ + default: + fin6->fi6_proto = proto; + return (0); + } + } while (1); + /*NOTREACHED*/ +} +#endif /* INET6 */ + +/* + * altq common classifier + */ +int +acc_add_filter(classifier, filter, class, phandle) + struct acc_classifier *classifier; + struct flow_filter *filter; + void *class; + u_long *phandle; +{ + struct acc_filter *afp, *prev, *tmp; + int i, s; + +#ifdef INET6 + if (filter->ff_flow.fi_family != AF_INET && + filter->ff_flow.fi_family != AF_INET6) + return (EINVAL); +#else + if (filter->ff_flow.fi_family != AF_INET) + return (EINVAL); +#endif + + afp = malloc(sizeof(struct acc_filter), + M_DEVBUF, M_WAITOK); + if (afp == NULL) + return (ENOMEM); + bzero(afp, sizeof(struct acc_filter)); + + afp->f_filter = *filter; + afp->f_class = class; + + i = ACC_WILDCARD_INDEX; + if (filter->ff_flow.fi_family == AF_INET) { + struct flow_filter *filter4 = &afp->f_filter; + + /* + * if address is 0, it's a wildcard. if address mask + * isn't set, use full mask. + */ + if (filter4->ff_flow.fi_dst.s_addr == 0) + filter4->ff_mask.mask_dst.s_addr = 0; + else if (filter4->ff_mask.mask_dst.s_addr == 0) + filter4->ff_mask.mask_dst.s_addr = 0xffffffff; + if (filter4->ff_flow.fi_src.s_addr == 0) + filter4->ff_mask.mask_src.s_addr = 0; + else if (filter4->ff_mask.mask_src.s_addr == 0) + filter4->ff_mask.mask_src.s_addr = 0xffffffff; + + /* clear extra bits in addresses */ + filter4->ff_flow.fi_dst.s_addr &= + filter4->ff_mask.mask_dst.s_addr; + filter4->ff_flow.fi_src.s_addr &= + filter4->ff_mask.mask_src.s_addr; + + /* + * if dst address is a wildcard, use hash-entry + * ACC_WILDCARD_INDEX. 
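+ * (For example, a filter whose dst is 10.1.2.3 with a full /32
+ * mask is hashed into bucket ACC_GET_HASH_INDEX(10.1.2.3), while
+ * a filter with dst "any" -- or any mask shorter than /32 -- goes
+ * to ACC_WILDCARD_INDEX, the bucket scanned on the second pass in
+ * acc_classify().  The address is illustrative.)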
+ */ + if (filter4->ff_mask.mask_dst.s_addr != 0xffffffff) + i = ACC_WILDCARD_INDEX; + else + i = ACC_GET_HASH_INDEX(filter4->ff_flow.fi_dst.s_addr); + } +#ifdef INET6 + else if (filter->ff_flow.fi_family == AF_INET6) { + struct flow_filter6 *filter6 = + (struct flow_filter6 *)&afp->f_filter; +#ifndef IN6MASK0 /* taken from kame ipv6 */ +#define IN6MASK0 {{{ 0, 0, 0, 0 }}} +#define IN6MASK128 {{{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }}} + const struct in6_addr in6mask0 = IN6MASK0; + const struct in6_addr in6mask128 = IN6MASK128; +#endif + + if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_dst)) + filter6->ff_mask6.mask6_dst = in6mask0; + else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_dst)) + filter6->ff_mask6.mask6_dst = in6mask128; + if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_src)) + filter6->ff_mask6.mask6_src = in6mask0; + else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_src)) + filter6->ff_mask6.mask6_src = in6mask128; + + /* clear extra bits in addresses */ + for (i = 0; i < 16; i++) + filter6->ff_flow6.fi6_dst.s6_addr[i] &= + filter6->ff_mask6.mask6_dst.s6_addr[i]; + for (i = 0; i < 16; i++) + filter6->ff_flow6.fi6_src.s6_addr[i] &= + filter6->ff_mask6.mask6_src.s6_addr[i]; + + if (filter6->ff_flow6.fi6_flowlabel == 0) + i = ACC_WILDCARD_INDEX; + else + i = ACC_GET_HASH_INDEX(filter6->ff_flow6.fi6_flowlabel); + } +#endif /* INET6 */ + + afp->f_handle = get_filt_handle(classifier, i); + + /* update filter bitmask */ + afp->f_fbmask = filt2fibmask(filter); + classifier->acc_fbmask |= afp->f_fbmask; + + /* + * add this filter to the filter list. + * filters are ordered from the highest rule number. + */ +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + prev = NULL; + LIST_FOREACH(tmp, &classifier->acc_filters[i], f_chain) { + if (tmp->f_filter.ff_ruleno > afp->f_filter.ff_ruleno) + prev = tmp; + else + break; + } + if (prev == NULL) + LIST_INSERT_HEAD(&classifier->acc_filters[i], afp, f_chain); + else + LIST_INSERT_AFTER(prev, afp, f_chain); + splx(s); + + *phandle = afp->f_handle; + return (0); +} + +int +acc_delete_filter(classifier, handle) + struct acc_classifier *classifier; + u_long handle; +{ + struct acc_filter *afp; + int s; + + if ((afp = filth_to_filtp(classifier, handle)) == NULL) + return (EINVAL); + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + LIST_REMOVE(afp, f_chain); + splx(s); + + free(afp, M_DEVBUF); + + /* todo: update filt_bmask */ + + return (0); +} + +/* + * delete filters referencing to the specified class. + * if the all flag is not 0, delete all the filters. 
+ */ +int +acc_discard_filters(classifier, class, all) + struct acc_classifier *classifier; + void *class; + int all; +{ + struct acc_filter *afp; + int i, s; + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + for (i = 0; i < ACC_FILTER_TABLESIZE; i++) { + do { + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if (all || afp->f_class == class) { + LIST_REMOVE(afp, f_chain); + free(afp, M_DEVBUF); + /* start again from the head */ + break; + } + } while (afp != NULL); + } + splx(s); + + if (all) + classifier->acc_fbmask = 0; + + return (0); +} + +void * +acc_classify(clfier, m, af) + void *clfier; + struct mbuf *m; + int af; +{ + struct acc_classifier *classifier; + struct flowinfo flow; + struct acc_filter *afp; + int i; + + classifier = (struct acc_classifier *)clfier; + altq_extractflow(m, af, &flow, classifier->acc_fbmask); + + if (flow.fi_family == AF_INET) { + struct flowinfo_in *fp = (struct flowinfo_in *)&flow; + + if ((classifier->acc_fbmask & FIMB4_ALL) == FIMB4_TOS) { + /* only tos is used */ + LIST_FOREACH(afp, + &classifier->acc_filters[ACC_WILDCARD_INDEX], + f_chain) + if (apply_tosfilter4(afp->f_fbmask, + &afp->f_filter, fp)) + /* filter matched */ + return (afp->f_class); + } else if ((classifier->acc_fbmask & + (~(FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) & FIMB4_ALL)) + == 0) { + /* only proto and ports are used */ + LIST_FOREACH(afp, + &classifier->acc_filters[ACC_WILDCARD_INDEX], + f_chain) + if (apply_ppfilter4(afp->f_fbmask, + &afp->f_filter, fp)) + /* filter matched */ + return (afp->f_class); + } else { + /* get the filter hash entry from its dest address */ + i = ACC_GET_HASH_INDEX(fp->fi_dst.s_addr); + do { + /* + * go through this loop twice. first for dst + * hash, second for wildcards. + */ + LIST_FOREACH(afp, &classifier->acc_filters[i], + f_chain) + if (apply_filter4(afp->f_fbmask, + &afp->f_filter, fp)) + /* filter matched */ + return (afp->f_class); + + /* + * check again for filters with a dst addr + * wildcard. + * (daddr == 0 || dmask != 0xffffffff). + */ + if (i != ACC_WILDCARD_INDEX) + i = ACC_WILDCARD_INDEX; + else + break; + } while (1); + } + } +#ifdef INET6 + else if (flow.fi_family == AF_INET6) { + struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)&flow; + + /* get the filter hash entry from its flow ID */ + if (fp6->fi6_flowlabel != 0) + i = ACC_GET_HASH_INDEX(fp6->fi6_flowlabel); + else + /* flowlable can be zero */ + i = ACC_WILDCARD_INDEX; + + /* go through this loop twice. first for flow hash, second + for wildcards. */ + do { + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if (apply_filter6(afp->f_fbmask, + (struct flow_filter6 *)&afp->f_filter, + fp6)) + /* filter matched */ + return (afp->f_class); + + /* + * check again for filters with a wildcard. 
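+ * (e.g. a packet carrying flow label 0x12345 first probes bucket
+ * ACC_GET_HASH_INDEX(0x12345) and then falls back to
+ * ACC_WILDCARD_INDEX -- the same two-pass scan as the IPv4 case
+ * above.  The label value is illustrative.)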
+ */ + if (i != ACC_WILDCARD_INDEX) + i = ACC_WILDCARD_INDEX; + else + break; + } while (1); + } +#endif /* INET6 */ + + /* no filter matched */ + return (NULL); +} + +static int +apply_filter4(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter *filt; + struct flowinfo_in *pkt; +{ + if (filt->ff_flow.fi_family != AF_INET) + return (0); + if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) + return (0); + if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) + return (0); + if ((fbmask & FIMB4_DADDR) && + filt->ff_flow.fi_dst.s_addr != + (pkt->fi_dst.s_addr & filt->ff_mask.mask_dst.s_addr)) + return (0); + if ((fbmask & FIMB4_SADDR) && + filt->ff_flow.fi_src.s_addr != + (pkt->fi_src.s_addr & filt->ff_mask.mask_src.s_addr)) + return (0); + if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) + return (0); + if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != + (pkt->fi_tos & filt->ff_mask.mask_tos)) + return (0); + if ((fbmask & FIMB4_GPI) && filt->ff_flow.fi_gpi != (pkt->fi_gpi)) + return (0); + /* match */ + return (1); +} + +/* + * filter matching function optimized for a common case that checks + * only protocol and port numbers + */ +static int +apply_ppfilter4(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter *filt; + struct flowinfo_in *pkt; +{ + if (filt->ff_flow.fi_family != AF_INET) + return (0); + if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) + return (0); + if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) + return (0); + if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) + return (0); + /* match */ + return (1); +} + +/* + * filter matching function only for tos field. + */ +static int +apply_tosfilter4(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter *filt; + struct flowinfo_in *pkt; +{ + if (filt->ff_flow.fi_family != AF_INET) + return (0); + if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != + (pkt->fi_tos & filt->ff_mask.mask_tos)) + return (0); + /* match */ + return (1); +} + +#ifdef INET6 +static int +apply_filter6(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter6 *filt; + struct flowinfo_in6 *pkt; +{ + int i; + + if (filt->ff_flow6.fi6_family != AF_INET6) + return (0); + if ((fbmask & FIMB6_FLABEL) && + filt->ff_flow6.fi6_flowlabel != pkt->fi6_flowlabel) + return (0); + if ((fbmask & FIMB6_PROTO) && + filt->ff_flow6.fi6_proto != pkt->fi6_proto) + return (0); + if ((fbmask & FIMB6_SPORT) && + filt->ff_flow6.fi6_sport != pkt->fi6_sport) + return (0); + if ((fbmask & FIMB6_DPORT) && + filt->ff_flow6.fi6_dport != pkt->fi6_dport) + return (0); + if (fbmask & FIMB6_SADDR) { + for (i = 0; i < 4; i++) + if (filt->ff_flow6.fi6_src.s6_addr32[i] != + (pkt->fi6_src.s6_addr32[i] & + filt->ff_mask6.mask6_src.s6_addr32[i])) + return (0); + } + if (fbmask & FIMB6_DADDR) { + for (i = 0; i < 4; i++) + if (filt->ff_flow6.fi6_dst.s6_addr32[i] != + (pkt->fi6_dst.s6_addr32[i] & + filt->ff_mask6.mask6_dst.s6_addr32[i])) + return (0); + } + if ((fbmask & FIMB6_TCLASS) && + filt->ff_flow6.fi6_tclass != + (pkt->fi6_tclass & filt->ff_mask6.mask6_tclass)) + return (0); + if ((fbmask & FIMB6_GPI) && + filt->ff_flow6.fi6_gpi != pkt->fi6_gpi) + return (0); + /* match */ + return (1); +} +#endif /* INET6 */ + +/* + * filter handle: + * bit 20-28: index to the filter hash table + * bit 0-19: unique id in the hash bucket. 
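+ *
+ * (Worked example with illustrative values: bucket index i = 5 and
+ * per-bucket id 0xabc pack as (5 << 20) | 0xabc = 0x500abc;
+ * ACC_GET_HINDEX() later recovers the 5, so filth_to_filtp() only
+ * has to scan that one bucket.)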
+ */ +static u_long +get_filt_handle(classifier, i) + struct acc_classifier *classifier; + int i; +{ + static u_long handle_number = 1; + u_long handle; + struct acc_filter *afp; + + while (1) { + handle = handle_number++ & 0x000fffff; + + if (LIST_EMPTY(&classifier->acc_filters[i])) + break; + + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if ((afp->f_handle & 0x000fffff) == handle) + break; + if (afp == NULL) + break; + /* this handle is already used, try again */ + } + + return ((i << 20) | handle); +} + +/* convert filter handle to filter pointer */ +static struct acc_filter * +filth_to_filtp(classifier, handle) + struct acc_classifier *classifier; + u_long handle; +{ + struct acc_filter *afp; + int i; + + i = ACC_GET_HINDEX(handle); + + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if (afp->f_handle == handle) + return (afp); + + return (NULL); +} + +/* create flowinfo bitmask */ +static u_int32_t +filt2fibmask(filt) + struct flow_filter *filt; +{ + u_int32_t mask = 0; +#ifdef INET6 + struct flow_filter6 *filt6; +#endif + + switch (filt->ff_flow.fi_family) { + case AF_INET: + if (filt->ff_flow.fi_proto != 0) + mask |= FIMB4_PROTO; + if (filt->ff_flow.fi_tos != 0) + mask |= FIMB4_TOS; + if (filt->ff_flow.fi_dst.s_addr != 0) + mask |= FIMB4_DADDR; + if (filt->ff_flow.fi_src.s_addr != 0) + mask |= FIMB4_SADDR; + if (filt->ff_flow.fi_sport != 0) + mask |= FIMB4_SPORT; + if (filt->ff_flow.fi_dport != 0) + mask |= FIMB4_DPORT; + if (filt->ff_flow.fi_gpi != 0) + mask |= FIMB4_GPI; + break; +#ifdef INET6 + case AF_INET6: + filt6 = (struct flow_filter6 *)filt; + + if (filt6->ff_flow6.fi6_proto != 0) + mask |= FIMB6_PROTO; + if (filt6->ff_flow6.fi6_tclass != 0) + mask |= FIMB6_TCLASS; + if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_dst)) + mask |= FIMB6_DADDR; + if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_src)) + mask |= FIMB6_SADDR; + if (filt6->ff_flow6.fi6_sport != 0) + mask |= FIMB6_SPORT; + if (filt6->ff_flow6.fi6_dport != 0) + mask |= FIMB6_DPORT; + if (filt6->ff_flow6.fi6_gpi != 0) + mask |= FIMB6_GPI; + if (filt6->ff_flow6.fi6_flowlabel != 0) + mask |= FIMB6_FLABEL; + break; +#endif /* INET6 */ + } + return (mask); +} + + +/* + * helper functions to handle IPv4 fragments. + * currently only in-sequence fragments are handled. + * - fragment info is cached in a LRU list. + * - when a first fragment is found, cache its flow info. + * - when a non-first fragment is found, lookup the cache. + */ + +struct ip4_frag { + TAILQ_ENTRY(ip4_frag) ip4f_chain; + char ip4f_valid; + u_short ip4f_id; + struct flowinfo_in ip4f_info; +}; + +static TAILQ_HEAD(ip4f_list, ip4_frag) ip4f_list; /* IPv4 fragment cache */ + +#define IP4F_TABSIZE 16 /* IPv4 fragment cache size */ + + +static void +ip4f_cache(ip, fin) + struct ip *ip; + struct flowinfo_in *fin; +{ + struct ip4_frag *fp; + + if (TAILQ_EMPTY(&ip4f_list)) { + /* first time call, allocate fragment cache entries. */ + if (ip4f_init() < 0) + /* allocation failed! 
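+			 * (nothing could be allocated, so this flow's
+			 * port numbers simply will not be cached)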
 */
+			return;
+	}
+
+	fp = ip4f_alloc();
+	fp->ip4f_id = ip->ip_id;
+	fp->ip4f_info.fi_proto = ip->ip_p;
+	fp->ip4f_info.fi_src.s_addr = ip->ip_src.s_addr;
+	fp->ip4f_info.fi_dst.s_addr = ip->ip_dst.s_addr;
+
+	/* save port numbers */
+	fp->ip4f_info.fi_sport = fin->fi_sport;
+	fp->ip4f_info.fi_dport = fin->fi_dport;
+	fp->ip4f_info.fi_gpi = fin->fi_gpi;
+}
+
+static int
+ip4f_lookup(ip, fin)
+	struct ip *ip;
+	struct flowinfo_in *fin;
+{
+	struct ip4_frag *fp;
+
+	for (fp = TAILQ_FIRST(&ip4f_list); fp != NULL && fp->ip4f_valid;
+	    fp = TAILQ_NEXT(fp, ip4f_chain))
+		if (ip->ip_id == fp->ip4f_id &&
+		    ip->ip_src.s_addr == fp->ip4f_info.fi_src.s_addr &&
+		    ip->ip_dst.s_addr == fp->ip4f_info.fi_dst.s_addr &&
+		    ip->ip_p == fp->ip4f_info.fi_proto) {
+
+			/* found the matching entry */
+			fin->fi_sport = fp->ip4f_info.fi_sport;
+			fin->fi_dport = fp->ip4f_info.fi_dport;
+			fin->fi_gpi = fp->ip4f_info.fi_gpi;
+
+			if ((ntohs(ip->ip_off) & IP_MF) == 0)
+				/* this is the last fragment,
+				   release the entry. */
+				ip4f_free(fp);
+
+			return (1);
+		}
+
+	/* no matching entry found */
+	return (0);
+}
+
+static int
+ip4f_init(void)
+{
+	struct ip4_frag *fp;
+	int i;
+
+	TAILQ_INIT(&ip4f_list);
+	for (i = 0; i < IP4F_TABSIZE; i++) {
+		fp = malloc(sizeof(struct ip4_frag),
+		    M_DEVBUF, M_NOWAIT);
+		if (fp == NULL) {
+			printf("ip4f_init: can't alloc %dth entry!\n", i);
+			if (i == 0)
+				return (-1);
+			return (0);
+		}
+		fp->ip4f_valid = 0;
+		TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
+	}
+	return (0);
+}
+
+static struct ip4_frag *
+ip4f_alloc(void)
+{
+	struct ip4_frag *fp;
+
+	/* reclaim an entry at the tail, put it at the head */
+	fp = TAILQ_LAST(&ip4f_list, ip4f_list);
+	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
+	fp->ip4f_valid = 1;
+	TAILQ_INSERT_HEAD(&ip4f_list, fp, ip4f_chain);
+	return (fp);
+}
+
+static void
+ip4f_free(fp)
+	struct ip4_frag *fp;
+{
+	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
+	fp->ip4f_valid = 0;
+	TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
+}
+
+#endif /* ALTQ3_CLFIER_COMPAT */
diff --git a/contrib/altq/rtems/freebsd/altq/altq_var.h b/contrib/altq/rtems/freebsd/altq/altq_var.h
new file mode 100644
index 00000000..58384e15
--- /dev/null
+++ b/contrib/altq/rtems/freebsd/altq/altq_var.h
@@ -0,0 +1,265 @@
+/* $FreeBSD$ */
+/* $KAME: altq_var.h,v 1.16 2003/10/03 05:05:15 kjc Exp $ */
+
+/*
+ * Copyright (C) 1998-2003
+ * Sony Computer Science Laboratories Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _ALTQ_ALTQ_VAR_HH_
+#define _ALTQ_ALTQ_VAR_HH_
+
+#ifdef _KERNEL
+
+#include <rtems/freebsd/local/opt_altq.h>
+#include <rtems/freebsd/local/opt_inet.h>
+#include <rtems/freebsd/local/opt_inet6.h>
+
+#ifdef ALTQ3_CLFIER_COMPAT
+/*
+ * filter structure for altq common classifier
+ */
+struct acc_filter {
+	LIST_ENTRY(acc_filter)	f_chain;
+	void		*f_class;	/* pointer to the class */
+	u_long		f_handle;	/* filter id */
+	u_int32_t	f_fbmask;	/* filter bitmask */
+	struct flow_filter f_filter;	/* filter value */
+};
+
+/*
+ * XXX ACC_FILTER_TABLESIZE can't be larger than 2048 unless we fix
+ * the handle assignment.
+ */
+#define ACC_FILTER_TABLESIZE	(256+1)
+#define ACC_FILTER_MASK		(ACC_FILTER_TABLESIZE - 2)
+#define ACC_WILDCARD_INDEX	(ACC_FILTER_TABLESIZE - 1)
+#ifdef __GNUC__
+#define ACC_GET_HASH_INDEX(addr) \
+	({int x = (addr) + ((addr) >> 16); (x + (x >> 8)) & ACC_FILTER_MASK;})
+#else
+#define ACC_GET_HASH_INDEX(addr) \
+	(((addr) + ((addr) >> 8) + ((addr) >> 16) + ((addr) >> 24)) \
+	 & ACC_FILTER_MASK)
+#endif
+#define ACC_GET_HINDEX(handle)	((handle) >> 20)
+
+#if (__FreeBSD_version > 500000)
+#define ACC_LOCK_INIT(ac)	mtx_init(&(ac)->acc_mtx, "classifier", MTX_DEF)
+#define ACC_LOCK_DESTROY(ac)	mtx_destroy(&(ac)->acc_mtx)
+#define ACC_LOCK(ac)		mtx_lock(&(ac)->acc_mtx)
+#define ACC_UNLOCK(ac)		mtx_unlock(&(ac)->acc_mtx)
+#else
+#define ACC_LOCK_INIT(ac)
+#define ACC_LOCK_DESTROY(ac)
+#define ACC_LOCK(ac)
+#define ACC_UNLOCK(ac)
+#endif
+
+struct acc_classifier {
+	u_int32_t	acc_fbmask;
+	LIST_HEAD(filt, acc_filter)	acc_filters[ACC_FILTER_TABLESIZE];
+
+#if (__FreeBSD_version > 500000)
+	struct	mtx acc_mtx;
+#endif
+};
+
+/*
+ * flowinfo mask bits used by classifier
+ */
+/* for ipv4 */
+#define FIMB4_PROTO	0x0001
+#define FIMB4_TOS	0x0002
+#define FIMB4_DADDR	0x0004
+#define FIMB4_SADDR	0x0008
+#define FIMB4_DPORT	0x0010
+#define FIMB4_SPORT	0x0020
+#define FIMB4_GPI	0x0040
+#define FIMB4_ALL	0x007f
+/* for ipv6 */
+#define FIMB6_PROTO	0x0100
+#define FIMB6_TCLASS	0x0200
+#define FIMB6_DADDR	0x0400
+#define FIMB6_SADDR	0x0800
+#define FIMB6_DPORT	0x1000
+#define FIMB6_SPORT	0x2000
+#define FIMB6_GPI	0x4000
+#define FIMB6_FLABEL	0x8000
+#define FIMB6_ALL	0xff00
+
+#define FIMB_ALL	(FIMB4_ALL|FIMB6_ALL)
+
+#define FIMB4_PORTS	(FIMB4_DPORT|FIMB4_SPORT|FIMB4_GPI)
+#define FIMB6_PORTS	(FIMB6_DPORT|FIMB6_SPORT|FIMB6_GPI)
+#endif /* ALTQ3_CLFIER_COMPAT */
+
+/*
+ * machine dependent clock
+ * a 64bit high resolution time counter.
+ */
+extern int machclk_usepcc;
+extern u_int32_t machclk_freq;
+extern u_int32_t machclk_per_tick;
+extern void init_machclk(void);
+extern u_int64_t read_machclk(void);
+
+/*
+ * debug support
+ */
+#ifdef ALTQ_DEBUG
+#ifdef __STDC__
+#define ASSERT(e)	((e) ? (void)0 : altq_assert(__FILE__, __LINE__, #e))
+#else	/* PCC */
+#define ASSERT(e)	((e) ? (void)0 : altq_assert(__FILE__, __LINE__, "e"))
+#endif
+#else
+#define ASSERT(e)	((void)0)
+#endif
+
+/*
+ * misc stuff for compatibility
+ */
+/* ioctl cmd type */
+#if defined(__FreeBSD__) && (__FreeBSD__ < 3)
+typedef int ioctlcmd_t;
+#else
+typedef u_long ioctlcmd_t;
+#endif
+
+/*
+ * queue macros:
+ * the interface of TAILQ_LAST macro changed after the introduction
+ * of softupdate. redefine it here to make it work with pre-2.2.7.
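+ * (one user is ip4f_alloc() in altq_subr.c, which reclaims the LRU
+ * fragment-cache entry via TAILQ_LAST(&ip4f_list, ip4f_list).)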
+ */
+#undef TAILQ_LAST
+#define TAILQ_LAST(head, headname) \
+	(*(((struct headname *)((head)->tqh_last))->tqh_last))
+
+#ifndef TAILQ_EMPTY
+#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL)
+#endif
+#ifndef TAILQ_FOREACH
+#define TAILQ_FOREACH(var, head, field) \
+	for (var = TAILQ_FIRST(head); var; var = TAILQ_NEXT(var, field))
+#endif
+
+/* macro for timeout/untimeout */
+#if (__FreeBSD_version > 300000) || defined(__NetBSD__)
+/* use callout */
+#include <rtems/freebsd/sys/callout.h>
+
+#if (__FreeBSD_version > 500000)
+#define CALLOUT_INIT(c)		callout_init((c), 0)
+#else
+#define CALLOUT_INIT(c)		callout_init((c))
+#endif
+#define CALLOUT_RESET(c,t,f,a)	callout_reset((c),(t),(f),(a))
+#define CALLOUT_STOP(c)		callout_stop((c))
+#if !defined(CALLOUT_INITIALIZER) && (__FreeBSD_version < 600000)
+#define CALLOUT_INITIALIZER	{ { { NULL } }, 0, NULL, NULL, 0 }
+#endif
+#elif defined(__OpenBSD__)
+#include <rtems/freebsd/sys/timeout.h>
+/* callout structure as a wrapper of struct timeout */
+struct callout {
+	struct timeout	c_to;
+};
+#define CALLOUT_INIT(c)		do { bzero((c), sizeof(*(c))); } while (/*CONSTCOND*/ 0)
+#define CALLOUT_RESET(c,t,f,a)	do { if (!timeout_initialized(&(c)->c_to)) \
+					timeout_set(&(c)->c_to, (f), (a)); \
+				     timeout_add(&(c)->c_to, (t)); } while (/*CONSTCOND*/ 0)
+#define CALLOUT_STOP(c)		timeout_del(&(c)->c_to)
+#define CALLOUT_INITIALIZER	{ { { NULL }, NULL, NULL, 0, 0 } }
+#else
+/* use old-style timeout/untimeout */
+/* dummy callout structure */
+struct callout {
+	void		*c_arg;		/* function argument */
+	void		(*c_func)(void *);	/* function to call */
+};
+#define CALLOUT_INIT(c)		do { bzero((c), sizeof(*(c))); } while (/*CONSTCOND*/ 0)
+#define CALLOUT_RESET(c,t,f,a)	do { (c)->c_arg = (a); \
+					(c)->c_func = (f); \
+					timeout((f),(a),(t)); } while (/*CONSTCOND*/ 0)
+#define CALLOUT_STOP(c)		untimeout((c)->c_func,(c)->c_arg)
+#define CALLOUT_INITIALIZER	{ NULL, NULL }
+#endif
+#if !defined(__FreeBSD__)
+typedef void (timeout_t)(void *);
+#endif
+
+#define m_pktlen(m)		((m)->m_pkthdr.len)
+
+struct ifnet; struct mbuf;
+struct pf_altq;
+#ifdef ALTQ3_CLFIER_COMPAT
+struct flowinfo;
+#endif
+
+void	*altq_lookup(char *, int);
+#ifdef ALTQ3_CLFIER_COMPAT
+int	altq_extractflow(struct mbuf *, int, struct flowinfo *, u_int32_t);
+int	acc_add_filter(struct acc_classifier *, struct flow_filter *,
+	    void *, u_long *);
+int	acc_delete_filter(struct acc_classifier *, u_long);
+int	acc_discard_filters(struct acc_classifier *, void *, int);
+void	*acc_classify(void *, struct mbuf *, int);
+#endif
+u_int8_t read_dsfield(struct mbuf *, struct altq_pktattr *);
+void	write_dsfield(struct mbuf *, struct altq_pktattr *, u_int8_t);
+void	altq_assert(const char *, int, const char *);
+int	tbr_set(struct ifaltq *, struct tb_profile *);
+int	tbr_get(struct ifaltq *, struct tb_profile *);
+
+int	altq_pfattach(struct pf_altq *);
+int	altq_pfdetach(struct pf_altq *);
+int	altq_add(struct pf_altq *);
+int	altq_remove(struct pf_altq *);
+int	altq_add_queue(struct pf_altq *);
+int	altq_remove_queue(struct pf_altq *);
+int	altq_getqstats(struct pf_altq *, void *, int *);
+
+int	cbq_pfattach(struct pf_altq *);
+int	cbq_add_altq(struct pf_altq *);
+int	cbq_remove_altq(struct pf_altq *);
+int	cbq_add_queue(struct pf_altq *);
+int	cbq_remove_queue(struct pf_altq *);
+int	cbq_getqstats(struct pf_altq *, void *, int *);
+
+int	priq_pfattach(struct pf_altq *);
+int	priq_add_altq(struct pf_altq *);
+int	priq_remove_altq(struct pf_altq *);
+int	priq_add_queue(struct pf_altq *);
+int	priq_remove_queue(struct pf_altq *);
+int	priq_getqstats(struct pf_altq *, void *, int *);
+
+int	hfsc_pfattach(struct pf_altq *);
+int	hfsc_add_altq(struct pf_altq *);
+int	hfsc_remove_altq(struct pf_altq *);
+int	hfsc_add_queue(struct pf_altq *);
+int	hfsc_remove_queue(struct pf_altq *);
+int	hfsc_getqstats(struct pf_altq *, void *, int *);
+
+#endif /* _KERNEL */
+#endif /* _ALTQ_ALTQ_VAR_HH_ */
diff --git a/contrib/altq/rtems/freebsd/altq/altqconf.h b/contrib/altq/rtems/freebsd/altq/altqconf.h
new file mode 100644
index 00000000..69f8d0b7
--- /dev/null
+++ b/contrib/altq/rtems/freebsd/altq/altqconf.h
@@ -0,0 +1,29 @@
+/* $OpenBSD: altqconf.h,v 1.1 2001/06/27 05:28:36 kjc Exp $ */
+/* $NetBSD: altqconf.h,v 1.2 2001/05/30 11:57:16 mrg Exp $ */
+
+#if defined(_KERNEL_OPT) || defined(__OpenBSD__)
+
+#if defined(_KERNEL_OPT)
+#include
+#endif
+
+#include <rtems/freebsd/sys/conf.h>
+
+#ifdef ALTQ
+#define NALTQ 1
+#else
+#define NALTQ 0
+#endif
+
+cdev_decl(altq);
+
+#ifdef __OpenBSD__
+#define cdev_altq_init(c,n) { \
+	dev_init(c,n,open), dev_init(c,n,close), (dev_type_read((*))) enodev, \
+	(dev_type_write((*))) enodev, dev_init(c,n,ioctl), \
+	(dev_type_stop((*))) enodev, 0, (dev_type_select((*))) enodev, \
+	(dev_type_mmap((*))) enodev }
+#else
+#define cdev_altq_init(x,y)	cdev__oci_init(x,y)
+#endif
+#endif /* defined(_KERNEL_OPT) || defined(__OpenBSD__) */
diff --git a/contrib/altq/rtems/freebsd/altq/if_altq.h b/contrib/altq/rtems/freebsd/altq/if_altq.h
new file mode 100644
index 00000000..ddc2b08f
--- /dev/null
+++ b/contrib/altq/rtems/freebsd/altq/if_altq.h
@@ -0,0 +1,191 @@
+/* $FreeBSD$ */
+/* $KAME: if_altq.h,v 1.11 2003/07/10 12:07:50 kjc Exp $ */
+
+/*
+ * Copyright (C) 1997-2003
+ * Sony Computer Science Laboratories Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _ALTQ_IF_ALTQ_HH_
+#define _ALTQ_IF_ALTQ_HH_
+
+#if (defined(__FreeBSD__) && __FreeBSD_version >= 500000)
+#include <rtems/freebsd/sys/lock.h>	/* XXX */
+#include <rtems/freebsd/sys/mutex.h>	/* XXX */
+#include <rtems/freebsd/sys/event.h>	/* XXX */
+#endif
+
+#ifdef _KERNEL_OPT
+#include
+#endif
+
+struct altq_pktattr; struct tb_regulator; struct top_cdnr;
+
+/*
+ * Structure defining a queue for a network interface.
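+ * The leading fields intentionally mirror struct ifqueue, so code
+ * written against the classical ifqueue layout keeps working when
+ * handed an ifaltq.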
+ */ +struct ifaltq { + /* fields compatible with struct ifqueue */ + struct mbuf *ifq_head; + struct mbuf *ifq_tail; + int ifq_len; + int ifq_maxlen; + int ifq_drops; +#if (defined(__FreeBSD__) && __FreeBSD_version >= 500000) + struct mtx ifq_mtx; +#endif + + /* driver owned queue (used for bulk dequeue and prepend) UNLOCKED */ + struct mbuf *ifq_drv_head; + struct mbuf *ifq_drv_tail; + int ifq_drv_len; + int ifq_drv_maxlen; + + /* alternate queueing related fields */ + int altq_type; /* discipline type */ + int altq_flags; /* flags (e.g. ready, in-use) */ + void *altq_disc; /* for discipline-specific use */ + struct ifnet *altq_ifp; /* back pointer to interface */ + + int (*altq_enqueue)(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); + struct mbuf *(*altq_dequeue)(struct ifaltq *, int); + int (*altq_request)(struct ifaltq *, int, void *); + + /* classifier fields */ + void *altq_clfier; /* classifier-specific use */ + void *(*altq_classify)(void *, struct mbuf *, int); + + /* token bucket regulator */ + struct tb_regulator *altq_tbr; + + /* input traffic conditioner (doesn't belong to the output queue...) */ + struct top_cdnr *altq_cdnr; +}; + + +#ifdef _KERNEL + +/* + * packet attributes used by queueing disciplines. + * pattr_class is a discipline-dependent scheduling class that is + * set by a classifier. + * pattr_hdr and pattr_af may be used by a discipline to access + * the header within a mbuf. (e.g. ECN needs to update the CE bit) + * note that pattr_hdr could be stale after m_pullup, though link + * layer output routines usually don't use m_pullup. link-level + * compression also invalidates these fields. thus, pattr_hdr needs + * to be verified when a discipline touches the header. + */ +struct altq_pktattr { + void *pattr_class; /* sched class set by classifier */ + int pattr_af; /* address family */ + caddr_t pattr_hdr; /* saved header position in mbuf */ +}; + +/* + * mbuf tag to carry a queue id (and hints for ECN). + */ +struct altq_tag { + u_int32_t qid; /* queue id */ + /* hints for ecn */ + int af; /* address family */ + void *hdr; /* saved header position in mbuf */ +}; + +/* + * a token-bucket regulator limits the rate that a network driver can + * dequeue packets from the output queue. + * modern cards are able to buffer a large amount of packets and dequeue + * too many packets at a time. this bursty dequeue behavior makes it + * impossible to schedule packets by queueing disciplines. + * a token-bucket is used to control the burst size in a device + * independent manner. 
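+ *
+ * a typical poll-and-dequeue transmit loop looks roughly like this
+ * (sketch only; driver_has_room() stands for whatever free-descriptor
+ * check the driver performs):
+ *
+ *	ALTQ_POLL(&ifp->if_snd, m);
+ *	if (m == NULL || !driver_has_room(sc))
+ *		return;
+ *	ALTQ_DEQUEUE(&ifp->if_snd, m);
+ *
+ * tbr_lastop (below) records whether the regulator last saw a poll or
+ * a dequeue, so a poll and the dequeue that commits the same packet
+ * are accounted for only once.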
+ */ +struct tb_regulator { + int64_t tbr_rate; /* (scaled) token bucket rate */ + int64_t tbr_depth; /* (scaled) token bucket depth */ + + int64_t tbr_token; /* (scaled) current token */ + int64_t tbr_filluptime; /* (scaled) time to fill up bucket */ + u_int64_t tbr_last; /* last time token was updated */ + + int tbr_lastop; /* last dequeue operation type + needed for poll-and-dequeue */ +}; + +/* if_altqflags */ +#define ALTQF_READY 0x01 /* driver supports alternate queueing */ +#define ALTQF_ENABLED 0x02 /* altq is in use */ +#define ALTQF_CLASSIFY 0x04 /* classify packets */ +#define ALTQF_CNDTNING 0x08 /* altq traffic conditioning is enabled */ +#define ALTQF_DRIVER1 0x40 /* driver specific */ + +/* if_altqflags set internally only: */ +#define ALTQF_CANTCHANGE (ALTQF_READY) + +/* altq_dequeue 2nd arg */ +#define ALTDQ_REMOVE 1 /* dequeue mbuf from the queue */ +#define ALTDQ_POLL 2 /* don't dequeue mbuf from the queue */ + +/* altq request types (currently only purge is defined) */ +#define ALTRQ_PURGE 1 /* purge all packets */ + +#define ALTQ_IS_READY(ifq) ((ifq)->altq_flags & ALTQF_READY) +#define ALTQ_IS_ENABLED(ifq) ((ifq)->altq_flags & ALTQF_ENABLED) +#define ALTQ_NEEDS_CLASSIFY(ifq) ((ifq)->altq_flags & ALTQF_CLASSIFY) +#define ALTQ_IS_CNDTNING(ifq) ((ifq)->altq_flags & ALTQF_CNDTNING) + +#define ALTQ_SET_CNDTNING(ifq) ((ifq)->altq_flags |= ALTQF_CNDTNING) +#define ALTQ_CLEAR_CNDTNING(ifq) ((ifq)->altq_flags &= ~ALTQF_CNDTNING) +#define ALTQ_IS_ATTACHED(ifq) ((ifq)->altq_disc != NULL) + +#define ALTQ_ENQUEUE(ifq, m, pa, err) \ + (err) = (*(ifq)->altq_enqueue)((ifq),(m),(pa)) +#define ALTQ_DEQUEUE(ifq, m) \ + (m) = (*(ifq)->altq_dequeue)((ifq), ALTDQ_REMOVE) +#define ALTQ_POLL(ifq, m) \ + (m) = (*(ifq)->altq_dequeue)((ifq), ALTDQ_POLL) +#define ALTQ_PURGE(ifq) \ + (void)(*(ifq)->altq_request)((ifq), ALTRQ_PURGE, (void *)0) +#define ALTQ_IS_EMPTY(ifq) ((ifq)->ifq_len == 0) +#define TBR_IS_ENABLED(ifq) ((ifq)->altq_tbr != NULL) + +extern int altq_attach(struct ifaltq *, int, void *, + int (*)(struct ifaltq *, struct mbuf *, + struct altq_pktattr *), + struct mbuf *(*)(struct ifaltq *, int), + int (*)(struct ifaltq *, int, void *), + void *, + void *(*)(void *, struct mbuf *, int)); +extern int altq_detach(struct ifaltq *); +extern int altq_enable(struct ifaltq *); +extern int altq_disable(struct ifaltq *); +extern struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int); +extern int (*altq_input)(struct mbuf *, int); +#if 0 /* ALTQ3_CLFIER_COMPAT */ +void altq_etherclassify(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +#endif +#endif /* _KERNEL */ + +#endif /* _ALTQ_IF_ALTQ_HH_ */ diff --git a/contrib/pf/rtems/freebsd/net/if_pflog.c b/contrib/pf/rtems/freebsd/net/if_pflog.c new file mode 100644 index 00000000..cad97218 --- /dev/null +++ b/contrib/pf/rtems/freebsd/net/if_pflog.c @@ -0,0 +1,438 @@ +#include + +/* $OpenBSD: if_pflog.c,v 1.22 2006/12/15 09:31:20 otto Exp $ */ +/* + * The authors of this code are John Ioannidis (ji@tla.org), + * Angelos D. Keromytis (kermit@csd.uch.gr) and + * Niels Provos (provos@physnet.uni-hamburg.de). + * + * This code was written by John Ioannidis for BSD/OS in Athens, Greece, + * in November 1995. + * + * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996, + * by Angelos D. Keromytis. + * + * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis + * and Niels Provos. + * + * Copyright (C) 1995, 1996, 1997, 1998 by John Ioannidis, Angelos D. Keromytis + * and Niels Provos. 
+ * Copyright (c) 2001, Angelos D. Keromytis, Niels Provos. + * + * Permission to use, copy, and modify this software with or without fee + * is hereby granted, provided that this entire notice is included in + * all copies of any software which is or includes a copy or + * modification of this software. + * You may use this code under the GNU public license if you so wish. Please + * contribute changes back to the authors under this freer than GPL license + * so that we may further the use of strong encryption without limitations to + * all. + * + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE + * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR + * PURPOSE. + */ + +#ifdef __FreeBSD__ +#include +#include +#include +#include + +#include +__FBSDID("$FreeBSD$"); + +#ifdef DEV_BPF +#define NBPFILTER DEV_BPF +#else +#define NBPFILTER 0 +#endif + +#ifdef DEV_PFLOG +#define NPFLOG DEV_PFLOG +#else +#define NPFLOG 0 +#endif + +#else /* ! __FreeBSD__ */ +#include +#include +#endif /* __FreeBSD__ */ + +#include +#include +#include +#include +#include +#ifdef __FreeBSD__ +#include +#include +#include +#include +#include +#else +#include +#endif + +#include +#ifdef __FreeBSD__ +#include +#endif +#include +#include +#include + +#ifdef INET +#include +#include +#include +#include +#endif + +#ifdef INET6 +#ifndef INET +#include +#endif +#include +#endif /* INET6 */ + +#include +#include + +#ifdef INET +#ifdef __FreeBSD__ +#include +#endif +#endif + +#define PFLOGMTU (32768 + MHLEN + MLEN) + +#ifdef PFLOGDEBUG +#define DPRINTF(x) do { if (pflogdebug) printf x ; } while (0) +#else +#define DPRINTF(x) +#endif + +void pflogattach(int); +int pflogoutput(struct ifnet *, struct mbuf *, struct sockaddr *, + struct route *); +int pflogioctl(struct ifnet *, u_long, caddr_t); +void pflogstart(struct ifnet *); +#ifdef __FreeBSD__ +static int pflog_clone_create(struct if_clone *, int, caddr_t); +static void pflog_clone_destroy(struct ifnet *); +#else +int pflog_clone_create(struct if_clone *, int); +int pflog_clone_destroy(struct ifnet *); +#endif + +LIST_HEAD(, pflog_softc) pflogif_list; +#ifdef __FreeBSD__ +IFC_SIMPLE_DECLARE(pflog, 1); +#else +struct if_clone pflog_cloner = + IF_CLONE_INITIALIZER("pflog", pflog_clone_create, pflog_clone_destroy); +#endif + +struct ifnet *pflogifs[PFLOGIFS_MAX]; /* for fast access */ + +#ifndef __FreeBSD__ +extern int ifqmaxlen; +#endif + +void +pflogattach(int npflog) +{ + int i; + LIST_INIT(&pflogif_list); + for (i = 0; i < PFLOGIFS_MAX; i++) + pflogifs[i] = NULL; +#ifndef __FreeBSD__ + (void) pflog_clone_create(&pflog_cloner, 0); +#endif + if_clone_attach(&pflog_cloner); +} + +#ifdef __FreeBSD__ +static int +pflog_clone_create(struct if_clone *ifc, int unit, caddr_t param) +#else +int +pflog_clone_create(struct if_clone *ifc, int unit) +#endif +{ + struct ifnet *ifp; + struct pflog_softc *pflogif; + int s; + + if (unit >= PFLOGIFS_MAX) + return (EINVAL); + + if ((pflogif = malloc(sizeof(*pflogif), M_DEVBUF, M_NOWAIT)) == NULL) + return (ENOMEM); + bzero(pflogif, sizeof(*pflogif)); + + pflogif->sc_unit = unit; +#ifdef __FreeBSD__ + ifp = pflogif->sc_ifp = if_alloc(IFT_PFLOG); + if (ifp == NULL) { + free(pflogif, M_DEVBUF); + return (ENOSPC); + } + if_initname(ifp, ifc->ifc_name, unit); +#else + ifp = &pflogif->sc_if; + snprintf(ifp->if_xname, sizeof ifp->if_xname, "pflog%d", unit); +#endif + ifp->if_softc = pflogif; + 
ifp->if_mtu = PFLOGMTU; + ifp->if_ioctl = pflogioctl; + ifp->if_output = pflogoutput; + ifp->if_start = pflogstart; +#ifndef __FreeBSD__ + ifp->if_type = IFT_PFLOG; +#endif + ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifp->if_hdrlen = PFLOG_HDRLEN; + if_attach(ifp); +#ifndef __FreeBSD__ + if_alloc_sadl(ifp); +#endif + +#if NBPFILTER > 0 +#ifdef __FreeBSD__ + bpfattach(ifp, DLT_PFLOG, PFLOG_HDRLEN); +#else + bpfattach(&pflogif->sc_if.if_bpf, ifp, DLT_PFLOG, PFLOG_HDRLEN); +#endif +#endif + + s = splnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + LIST_INSERT_HEAD(&pflogif_list, pflogif, sc_list); + pflogifs[unit] = ifp; +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + + return (0); +} + +#ifdef __FreeBSD__ +static void +pflog_clone_destroy(struct ifnet *ifp) +#else +int +pflog_clone_destroy(struct ifnet *ifp) +#endif +{ + struct pflog_softc *pflogif = ifp->if_softc; + int s; + + s = splnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + pflogifs[pflogif->sc_unit] = NULL; + LIST_REMOVE(pflogif, sc_list); +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + +#if NBPFILTER > 0 + bpfdetach(ifp); +#endif + if_detach(ifp); +#ifdef __FreeBSD__ + if_free(ifp); +#endif + free(pflogif, M_DEVBUF); +#ifndef __FreeBSD__ + return (0); +#endif +} + +/* + * Start output on the pflog interface. + */ +void +pflogstart(struct ifnet *ifp) +{ + struct mbuf *m; +#ifndef __FreeBSD__ + int s; +#endif + + for (;;) { +#ifdef __FreeBSD__ + IF_LOCK(&ifp->if_snd); + _IF_DROP(&ifp->if_snd); + _IF_DEQUEUE(&ifp->if_snd, m); + IF_UNLOCK(&ifp->if_snd); +#else + s = splnet(); + IF_DROP(&ifp->if_snd); + IF_DEQUEUE(&ifp->if_snd, m); + splx(s); +#endif + + if (m == NULL) + return; + else + m_freem(m); + } +} + +int +pflogoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct route *ro) +{ + m_freem(m); + return (0); +} + +/* ARGSUSED */ +int +pflogioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + switch (cmd) { + case SIOCSIFADDR: + case SIOCAIFADDR: + case SIOCSIFDSTADDR: + case SIOCSIFFLAGS: +#ifdef __FreeBSD__ + if (ifp->if_flags & IFF_UP) + ifp->if_drv_flags |= IFF_DRV_RUNNING; + else + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; +#else + if (ifp->if_flags & IFF_UP) + ifp->if_flags |= IFF_RUNNING; + else + ifp->if_flags &= ~IFF_RUNNING; +#endif + break; + default: + return (EINVAL); + } + + return (0); +} + +int +pflog_packet(struct pfi_kif *kif, struct mbuf *m, sa_family_t af, u_int8_t dir, + u_int8_t reason, struct pf_rule *rm, struct pf_rule *am, + struct pf_ruleset *ruleset, struct pf_pdesc *pd) +{ +#if NBPFILTER > 0 + struct ifnet *ifn; + struct pfloghdr hdr; + + if (kif == NULL || m == NULL || rm == NULL || pd == NULL) + return (-1); + + if ((ifn = pflogifs[rm->logif]) == NULL || !ifn->if_bpf) + return (0); + + bzero(&hdr, sizeof(hdr)); + hdr.length = PFLOG_REAL_HDRLEN; + hdr.af = af; + hdr.action = rm->action; + hdr.reason = reason; + memcpy(hdr.ifname, kif->pfik_name, sizeof(hdr.ifname)); + + if (am == NULL) { + hdr.rulenr = htonl(rm->nr); + hdr.subrulenr = -1; + } else { + hdr.rulenr = htonl(am->nr); + hdr.subrulenr = htonl(rm->nr); + if (ruleset != NULL && ruleset->anchor != NULL) + strlcpy(hdr.ruleset, ruleset->anchor->name, + sizeof(hdr.ruleset)); + } + if (rm->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done) +#ifdef __FreeBSD__ + /* + * XXX: This should not happen as we force an early lookup + * via debug.pfugidhack + */ + ; /* empty */ +#else + pd->lookup.done = pf_socket_lookup(dir, pd); +#endif + if (pd->lookup.done > 0) { + hdr.uid = pd->lookup.uid; + hdr.pid = pd->lookup.pid; + } else 
{ + hdr.uid = UID_MAX; + hdr.pid = NO_PID; + } + hdr.rule_uid = rm->cuid; + hdr.rule_pid = rm->cpid; + hdr.dir = dir; + +#ifdef INET + if (af == AF_INET && dir == PF_OUT) { + struct ip *ip; + + ip = mtod(m, struct ip *); + ip->ip_sum = 0; + ip->ip_sum = in_cksum(m, ip->ip_hl << 2); + } +#endif /* INET */ + + ifn->if_opackets++; + ifn->if_obytes += m->m_pkthdr.len; +#ifdef __FreeBSD__ + BPF_MTAP2(ifn, &hdr, PFLOG_HDRLEN, m); +#else + bpf_mtap_hdr(ifn->if_bpf, (char *)&hdr, PFLOG_HDRLEN, m, + BPF_DIRECTION_OUT); +#endif +#endif + + return (0); +} + +#ifdef __FreeBSD__ +static int +pflog_modevent(module_t mod, int type, void *data) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + pflogattach(1); + PF_LOCK(); + pflog_packet_ptr = pflog_packet; + PF_UNLOCK(); + break; + case MOD_UNLOAD: + PF_LOCK(); + pflog_packet_ptr = NULL; + PF_UNLOCK(); + if_clone_detach(&pflog_cloner); + break; + default: + error = EINVAL; + break; + } + + return error; +} + +static moduledata_t pflog_mod = { "pflog", pflog_modevent, 0 }; + +#define PFLOG_MODVER 1 + +DECLARE_MODULE(pflog, pflog_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); +MODULE_VERSION(pflog, PFLOG_MODVER); +MODULE_DEPEND(pflog, pf, PF_MODVER, PF_MODVER, PF_MODVER); +#endif /* __FreeBSD__ */ diff --git a/contrib/pf/rtems/freebsd/net/if_pflog.h b/contrib/pf/rtems/freebsd/net/if_pflog.h new file mode 100644 index 00000000..9e9efbef --- /dev/null +++ b/contrib/pf/rtems/freebsd/net/if_pflog.h @@ -0,0 +1,103 @@ +/* $FreeBSD$ */ +/* $OpenBSD: if_pflog.h,v 1.14 2006/10/25 11:27:01 henning Exp $ */ +/* + * Copyright 2001 Niels Provos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _NET_IF_PFLOG_HH_ +#define _NET_IF_PFLOG_HH_ + +#define PFLOGIFS_MAX 16 + +#ifdef _KERNEL +struct pflog_softc { +#ifdef __FreeBSD__ + struct ifnet *sc_ifp; /* the interface pointer */ +#else + struct ifnet sc_if; /* the interface */ +#endif + int sc_unit; + LIST_ENTRY(pflog_softc) sc_list; +}; +#endif /* _KERNEL */ + +#define PFLOG_RULESET_NAME_SIZE 16 + +struct pfloghdr { + u_int8_t length; + sa_family_t af; + u_int8_t action; + u_int8_t reason; + char ifname[IFNAMSIZ]; + char ruleset[PFLOG_RULESET_NAME_SIZE]; + u_int32_t rulenr; + u_int32_t subrulenr; + uid_t uid; + pid_t pid; + uid_t rule_uid; + pid_t rule_pid; + u_int8_t dir; + u_int8_t pad[3]; +}; + +#define PFLOG_HDRLEN sizeof(struct pfloghdr) +/* minus pad, also used as a signature */ +#define PFLOG_REAL_HDRLEN offsetof(struct pfloghdr, pad) + +/* XXX remove later when old format logs are no longer needed */ +struct old_pfloghdr { + u_int32_t af; + char ifname[IFNAMSIZ]; + short rnr; + u_short reason; + u_short action; + u_short dir; +}; +#define OLD_PFLOG_HDRLEN sizeof(struct old_pfloghdr) + +#ifdef _KERNEL + +#ifdef __FreeBSD__ +struct pf_rule; +struct pf_ruleset; +struct pfi_kif; +struct pf_pdesc; + +typedef int pflog_packet_t(struct pfi_kif *, struct mbuf *, sa_family_t, + u_int8_t, u_int8_t, struct pf_rule *, struct pf_rule *, + struct pf_ruleset *, struct pf_pdesc *); +extern pflog_packet_t *pflog_packet_ptr; +#define PFLOG_PACKET(i,x,a,b,c,d,e,f,g,h) do { \ + if (pflog_packet_ptr != NULL) \ + pflog_packet_ptr(i,a,b,c,d,e,f,g,h); \ +} while (0) +#else /* ! __FreeBSD__ */ +#if NPFLOG > 0 +#define PFLOG_PACKET(i,x,a,b,c,d,e,f,g,h) pflog_packet(i,a,b,c,d,e,f,g,h) +#else +#define PFLOG_PACKET(i,x,a,b,c,d,e,f,g,h) ((void)0) +#endif /* NPFLOG > 0 */ +#endif /* __FreeBSD__ */ +#endif /* _KERNEL */ +#endif /* _NET_IF_PFLOG_HH_ */ diff --git a/contrib/pf/rtems/freebsd/net/if_pfsync.c b/contrib/pf/rtems/freebsd/net/if_pfsync.c new file mode 100644 index 00000000..3a48046e --- /dev/null +++ b/contrib/pf/rtems/freebsd/net/if_pfsync.c @@ -0,0 +1,2331 @@ +#include + +/* $OpenBSD: if_pfsync.c,v 1.73 2006/11/16 13:13:38 henning Exp $ */ + +/* + * Copyright (c) 2002 Michael Shalayeff + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef __FreeBSD__ +#include +#include +#include +#include +#include + +#include +__FBSDID("$FreeBSD$"); + +#ifdef DEV_BPF +#define NBPFILTER DEV_BPF +#else +#define NBPFILTER 0 +#endif + +#ifdef DEV_PFSYNC +#define NPFSYNC DEV_PFSYNC +#else +#define NPFSYNC 0 +#endif + +#ifdef DEV_CARP +#define NCARP DEV_CARP +#else +#define NCARP 0 +#endif +#endif /* __FreeBSD__ */ + +#include +#ifdef __FreeBSD__ +#include +#endif +#include +#include +#include +#include +#include +#ifdef __FreeBSD__ +#include +#include +#include +#include +#include +#include +#include +#include +#else +#include +#include +#endif +#include + +#include +#ifdef __FreeBSD__ +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +#ifdef INET +#include +#include +#include +#include +#endif + +#ifdef INET6 +#include +#endif /* INET6 */ + +#ifndef __FreeBSD__ +#include +#endif +#if NCARP > 0 +#include +#endif + +#include +#include + +#ifndef __FreeBSD__ +#include +#include +#endif + +#define PFSYNC_MINMTU \ + (sizeof(struct pfsync_header) + sizeof(struct pf_state)) + +#ifdef PFSYNCDEBUG +#define DPRINTF(x) do { if (pfsyncdebug) printf x ; } while (0) +int pfsyncdebug; +#else +#define DPRINTF(x) +#endif + +struct pfsync_softc *pfsyncif = NULL; +struct pfsyncstats pfsyncstats; +#ifdef __FreeBSD__ +SYSCTL_DECL(_net_inet_pfsync); +SYSCTL_STRUCT(_net_inet_pfsync, 0, stats, CTLFLAG_RW, + &pfsyncstats, pfsyncstats, + "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)"); +#endif + +void pfsyncattach(int); +#ifdef __FreeBSD__ +int pfsync_clone_create(struct if_clone *, int, caddr_t); +void pfsync_clone_destroy(struct ifnet *); +#else +int pfsync_clone_create(struct if_clone *, int); +int pfsync_clone_destroy(struct ifnet *); +#endif +void pfsync_setmtu(struct pfsync_softc *, int); +int pfsync_alloc_scrub_memory(struct pfsync_state_peer *, + struct pf_state_peer *); +int pfsync_insert_net_state(struct pfsync_state *, u_int8_t); +#ifdef PFSYNC_TDB +void pfsync_update_net_tdb(struct pfsync_tdb *); +#endif +int pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *, + struct route *); +int pfsyncioctl(struct ifnet *, u_long, caddr_t); +void pfsyncstart(struct ifnet *); + +struct mbuf *pfsync_get_mbuf(struct pfsync_softc *, u_int8_t, void **); +int pfsync_request_update(struct pfsync_state_upd *, struct in_addr *); +int pfsync_sendout(struct pfsync_softc *); +#ifdef PFSYNC_TDB +int pfsync_tdb_sendout(struct pfsync_softc *); +#endif +int pfsync_sendout_mbuf(struct pfsync_softc *, struct mbuf *); +void pfsync_timeout(void *); +#ifdef PFSYNC_TDB +void pfsync_tdb_timeout(void *); +#endif +void pfsync_send_bus(struct pfsync_softc *, u_int8_t); +void pfsync_bulk_update(void *); +void pfsync_bulkfail(void *); + +#ifdef __FreeBSD__ +void pfsync_ifdetach(void *, struct ifnet *); +void pfsync_senddef(void *, int); + +/* XXX: ugly */ +#define betoh64 (unsigned long long)be64toh +#define timeout_del callout_stop +#endif + +int pfsync_sync_ok; +#ifndef __FreeBSD__ +extern int ifqmaxlen; +#endif + +#ifdef __FreeBSD__ +IFC_SIMPLE_DECLARE(pfsync, 1); +#else +struct if_clone pfsync_cloner = + IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy); +#endif + +void +pfsyncattach(int npfsync) +{ + if_clone_attach(&pfsync_cloner); +} + +int +#ifdef __FreeBSD__ +pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param) +#else +pfsync_clone_create(struct if_clone *ifc, int unit) +#endif +{ + struct ifnet *ifp; + + if (unit != 0) + return (EINVAL); + + pfsync_sync_ok = 1; + if 
((pfsyncif = malloc(sizeof(*pfsyncif), M_DEVBUF, M_NOWAIT)) == NULL) + return (ENOMEM); + bzero(pfsyncif, sizeof(*pfsyncif)); +#ifdef __FreeBSD__ + if ((pfsyncif->sc_imo.imo_membership = (struct in_multi **)malloc( + (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_DEVBUF, + M_NOWAIT)) == NULL) { + free(pfsyncif, M_DEVBUF); + return (ENOSPC); + } + pfsyncif->sc_imo.imo_mfilters = NULL; + pfsyncif->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS; + pfsyncif->sc_imo.imo_multicast_vif = -1; + + ifp = pfsyncif->sc_ifp = if_alloc(IFT_PFSYNC); + if (ifp == NULL) { + free(pfsyncif->sc_imo.imo_membership, M_DEVBUF); + free(pfsyncif, M_DEVBUF); + return (ENOSPC); + } + if_initname(ifp, ifc->ifc_name, unit); + + pfsyncif->sc_detachtag = EVENTHANDLER_REGISTER(ifnet_departure_event, + pfsync_ifdetach, pfsyncif, EVENTHANDLER_PRI_ANY); + if (pfsyncif->sc_detachtag == NULL) { + if_free(ifp); + free(pfsyncif->sc_imo.imo_membership, M_DEVBUF); + free(pfsyncif, M_DEVBUF); + return (ENOSPC); + } + + pfsyncif->sc_ifq.ifq_maxlen = ifqmaxlen; + mtx_init(&pfsyncif->sc_ifq.ifq_mtx, ifp->if_xname, + "pfsync send queue", MTX_DEF); + TASK_INIT(&pfsyncif->sc_send_task, 0, pfsync_senddef, pfsyncif); +#endif + pfsyncif->sc_mbuf = NULL; + pfsyncif->sc_mbuf_net = NULL; +#ifdef PFSYNC_TDB + pfsyncif->sc_mbuf_tdb = NULL; +#endif + pfsyncif->sc_statep.s = NULL; + pfsyncif->sc_statep_net.s = NULL; +#ifdef PFSYNC_TDB + pfsyncif->sc_statep_tdb.t = NULL; +#endif + pfsyncif->sc_maxupdates = 128; +#ifdef __FreeBSD__ + pfsyncif->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP); + pfsyncif->sc_sendaddr.s_addr = htonl(INADDR_PFSYNC_GROUP); +#else + pfsyncif->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP; + pfsyncif->sc_sendaddr.s_addr = INADDR_PFSYNC_GROUP; +#endif + pfsyncif->sc_ureq_received = 0; + pfsyncif->sc_ureq_sent = 0; + pfsyncif->sc_bulk_send_next = NULL; + pfsyncif->sc_bulk_terminator = NULL; +#ifndef __FreeBSD__ + ifp = &pfsyncif->sc_if; + snprintf(ifp->if_xname, sizeof ifp->if_xname, "pfsync%d", unit); +#endif + ifp->if_softc = pfsyncif; + ifp->if_ioctl = pfsyncioctl; + ifp->if_output = pfsyncoutput; + ifp->if_start = pfsyncstart; + ifp->if_type = IFT_PFSYNC; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifp->if_hdrlen = PFSYNC_HDRLEN; + pfsync_setmtu(pfsyncif, ETHERMTU); +#ifdef __FreeBSD__ + callout_init(&pfsyncif->sc_tmo, CALLOUT_MPSAFE); +#ifdef PFSYNC_TDB + callout_init(&pfsyncif->sc_tdb_tmo, CALLOUT_MPSAFE); +#endif + callout_init(&pfsyncif->sc_bulk_tmo, CALLOUT_MPSAFE); + callout_init(&pfsyncif->sc_bulkfail_tmo, CALLOUT_MPSAFE); +#else + timeout_set(&pfsyncif->sc_tmo, pfsync_timeout, pfsyncif); + timeout_set(&pfsyncif->sc_tdb_tmo, pfsync_tdb_timeout, pfsyncif); + timeout_set(&pfsyncif->sc_bulk_tmo, pfsync_bulk_update, pfsyncif); + timeout_set(&pfsyncif->sc_bulkfail_tmo, pfsync_bulkfail, pfsyncif); +#endif + if_attach(ifp); +#ifndef __FreeBSD__ + if_alloc_sadl(ifp); +#endif + +#if NCARP > 0 + if_addgroup(ifp, "carp"); +#endif + +#if NBPFILTER > 0 +#ifdef __FreeBSD__ + bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN); +#else + bpfattach(&pfsyncif->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN); +#endif +#endif + + return (0); +} + +#ifdef __FreeBSD__ +void +#else +int +#endif +pfsync_clone_destroy(struct ifnet *ifp) +{ +#ifdef __FreeBSD__ + EVENTHANDLER_DEREGISTER(ifnet_departure_event, pfsyncif->sc_detachtag); + callout_stop(&pfsyncif->sc_tmo); +#ifdef PFSYNC_TDB + callout_stop(&pfsyncif->sc_tdb_tmo); +#endif + callout_stop(&pfsyncif->sc_bulk_tmo); + callout_stop(&pfsyncif->sc_bulkfail_tmo); + /* XXX: more? 
*/ +#endif + +#if NBPFILTER > 0 + bpfdetach(ifp); +#endif + if_detach(ifp); +#ifdef __FreeBSD__ + if_free(ifp); + free(pfsyncif->sc_imo.imo_membership, M_DEVBUF); +#endif + free(pfsyncif, M_DEVBUF); + pfsyncif = NULL; +#ifndef __FreeBSD__ + return (0); +#endif +} + +/* + * Start output on the pfsync interface. + */ +void +pfsyncstart(struct ifnet *ifp) +{ + struct mbuf *m; +#ifndef __FreeBSD__ + int s; +#endif + + for (;;) { +#ifdef __FreeBSD__ + IF_LOCK(&ifp->if_snd); + _IF_DROP(&ifp->if_snd); + _IF_DEQUEUE(&ifp->if_snd, m); + IF_UNLOCK(&ifp->if_snd); +#else + s = splnet(); + IF_DROP(&ifp->if_snd); + IF_DEQUEUE(&ifp->if_snd, m); + splx(s); +#endif + + if (m == NULL) + return; + else + m_freem(m); + } +} + +int +pfsync_alloc_scrub_memory(struct pfsync_state_peer *s, + struct pf_state_peer *d) +{ + if (s->scrub.scrub_flag && d->scrub == NULL) { + d->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT); + if (d->scrub == NULL) + return (ENOMEM); + bzero(d->scrub, sizeof(*d->scrub)); + } + + return (0); +} + +int +pfsync_insert_net_state(struct pfsync_state *sp, u_int8_t chksum_flag) +{ + struct pf_state *st = NULL; + struct pf_rule *r = NULL; + struct pfi_kif *kif; + + if (sp->creatorid == 0 && pf_status.debug >= PF_DEBUG_MISC) { + printf("pfsync_insert_net_state: invalid creator id:" + " %08x\n", ntohl(sp->creatorid)); + return (EINVAL); + } + + kif = pfi_kif_get(sp->ifname); + if (kif == NULL) { + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync_insert_net_state: " + "unknown interface: %s\n", sp->ifname); + /* skip this state */ + return (0); + } + + /* + * If the ruleset checksums match, it's safe to associate the state + * with the rule of that number. + */ + if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) && chksum_flag) + r = pf_main_ruleset.rules[ + PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)]; + else + r = &pf_default_rule; + + if (!r->max_states || r->states < r->max_states) + st = pool_get(&pf_state_pl, PR_NOWAIT); + if (st == NULL) { + pfi_kif_unref(kif, PFI_KIF_REF_NONE); + return (ENOMEM); + } + bzero(st, sizeof(*st)); + + /* allocate memory for scrub info */ + if (pfsync_alloc_scrub_memory(&sp->src, &st->src) || + pfsync_alloc_scrub_memory(&sp->dst, &st->dst)) { + pfi_kif_unref(kif, PFI_KIF_REF_NONE); + if (st->src.scrub) + pool_put(&pf_state_scrub_pl, st->src.scrub); + pool_put(&pf_state_pl, st); + return (ENOMEM); + } + + st->rule.ptr = r; + /* XXX get pointers to nat_rule and anchor */ + + /* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */ + r->states++; + + /* fill in the rest of the state entry */ + pf_state_host_ntoh(&sp->lan, &st->lan); + pf_state_host_ntoh(&sp->gwy, &st->gwy); + pf_state_host_ntoh(&sp->ext, &st->ext); + + pf_state_peer_ntoh(&sp->src, &st->src); + pf_state_peer_ntoh(&sp->dst, &st->dst); + + bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr)); + st->creation = time_second - ntohl(sp->creation); + st->expire = ntohl(sp->expire) + time_second; + + st->af = sp->af; + st->proto = sp->proto; + st->direction = sp->direction; + st->log = sp->log; + st->timeout = sp->timeout; + st->state_flags = sp->state_flags; + + bcopy(sp->id, &st->id, sizeof(st->id)); + st->creatorid = sp->creatorid; + st->sync_flags = PFSTATE_FROMSYNC; + + if (pf_insert_state(kif, st)) { + pfi_kif_unref(kif, PFI_KIF_REF_NONE); + /* XXX when we have nat_rule/anchors, use STATE_DEC_COUNTERS */ + r->states--; + if (st->dst.scrub) + pool_put(&pf_state_scrub_pl, st->dst.scrub); + if (st->src.scrub) + pool_put(&pf_state_scrub_pl, st->src.scrub); + 
pool_put(&pf_state_pl, st); + return (EINVAL); + } + + return (0); +} + +void +#ifdef __FreeBSD__ +pfsync_input(struct mbuf *m, __unused int off) +#else +pfsync_input(struct mbuf *m, ...) +#endif +{ + struct ip *ip = mtod(m, struct ip *); + struct pfsync_header *ph; + struct pfsync_softc *sc = pfsyncif; + struct pf_state *st; + struct pf_state_cmp key; + struct pfsync_state *sp; + struct pfsync_state_upd *up; + struct pfsync_state_del *dp; + struct pfsync_state_clr *cp; + struct pfsync_state_upd_req *rup; + struct pfsync_state_bus *bus; +#ifdef PFSYNC_TDB + struct pfsync_tdb *pt; +#endif + struct in_addr src; + struct mbuf *mp; + int iplen, action, error, i, s, count, offp, sfail, stale = 0; + u_int8_t chksum_flag = 0; + + pfsyncstats.pfsyncs_ipackets++; + + /* verify that we have a sync interface configured */ + if (!sc || !sc->sc_sync_ifp || !pf_status.running) + goto done; + + /* verify that the packet came in on the right interface */ + if (sc->sc_sync_ifp != m->m_pkthdr.rcvif) { + pfsyncstats.pfsyncs_badif++; + goto done; + } + + /* verify that the IP TTL is 255. */ + if (ip->ip_ttl != PFSYNC_DFLTTL) { + pfsyncstats.pfsyncs_badttl++; + goto done; + } + + iplen = ip->ip_hl << 2; + + if (m->m_pkthdr.len < iplen + sizeof(*ph)) { + pfsyncstats.pfsyncs_hdrops++; + goto done; + } + + if (iplen + sizeof(*ph) > m->m_len) { + if ((m = m_pullup(m, iplen + sizeof(*ph))) == NULL) { + pfsyncstats.pfsyncs_hdrops++; + goto done; + } + ip = mtod(m, struct ip *); + } + ph = (struct pfsync_header *)((char *)ip + iplen); + + /* verify the version */ + if (ph->version != PFSYNC_VERSION) { + pfsyncstats.pfsyncs_badver++; + goto done; + } + + action = ph->action; + count = ph->count; + + /* make sure it's a valid action code */ + if (action >= PFSYNC_ACT_MAX) { + pfsyncstats.pfsyncs_badact++; + goto done; + } + + /* Cheaper to grab this now than having to mess with mbufs later */ + src = ip->ip_src; + + if (!bcmp(&ph->pf_chksum, &pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH)) + chksum_flag++; + + switch (action) { + case PFSYNC_ACT_CLR: { + struct pf_state *nexts; + struct pfi_kif *kif; + u_int32_t creatorid; + if ((mp = m_pulldown(m, iplen + sizeof(*ph), + sizeof(*cp), &offp)) == NULL) { + pfsyncstats.pfsyncs_badlen++; + return; + } + cp = (struct pfsync_state_clr *)(mp->m_data + offp); + creatorid = cp->creatorid; + + s = splsoftnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + if (cp->ifname[0] == '\0') { + for (st = RB_MIN(pf_state_tree_id, &tree_id); + st; st = nexts) { + nexts = RB_NEXT(pf_state_tree_id, &tree_id, st); + if (st->creatorid == creatorid) { + st->sync_flags |= PFSTATE_FROMSYNC; + pf_unlink_state(st); + } + } + } else { + if ((kif = pfi_kif_get(cp->ifname)) == NULL) { +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + return; + } + for (st = RB_MIN(pf_state_tree_lan_ext, + &kif->pfik_lan_ext); st; st = nexts) { + nexts = RB_NEXT(pf_state_tree_lan_ext, + &kif->pfik_lan_ext, st); + if (st->creatorid == creatorid) { + st->sync_flags |= PFSTATE_FROMSYNC; + pf_unlink_state(st); + } + } + } +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + + break; + } + case PFSYNC_ACT_INS: + if ((mp = m_pulldown(m, iplen + sizeof(*ph), + count * sizeof(*sp), &offp)) == NULL) { + pfsyncstats.pfsyncs_badlen++; + return; + } + + s = splsoftnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + for (i = 0, sp = (struct pfsync_state *)(mp->m_data + offp); + i < count; i++, sp++) { + /* check for invalid values */ + if (sp->timeout >= PFTM_MAX || + sp->src.state > PF_TCPS_PROXY_DST || + sp->dst.state > 
PF_TCPS_PROXY_DST || + sp->direction > PF_OUT || + (sp->af != AF_INET && sp->af != AF_INET6)) { + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync_insert: PFSYNC_ACT_INS: " + "invalid value\n"); + pfsyncstats.pfsyncs_badstate++; + continue; + } + + if ((error = pfsync_insert_net_state(sp, + chksum_flag))) { + if (error == ENOMEM) { +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + goto done; + } + continue; + } + } +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + break; + case PFSYNC_ACT_UPD: + if ((mp = m_pulldown(m, iplen + sizeof(*ph), + count * sizeof(*sp), &offp)) == NULL) { + pfsyncstats.pfsyncs_badlen++; + return; + } + + s = splsoftnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + for (i = 0, sp = (struct pfsync_state *)(mp->m_data + offp); + i < count; i++, sp++) { + int flags = PFSYNC_FLAG_STALE; + + /* check for invalid values */ + if (sp->timeout >= PFTM_MAX || + sp->src.state > PF_TCPS_PROXY_DST || + sp->dst.state > PF_TCPS_PROXY_DST) { + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync_insert: PFSYNC_ACT_UPD: " + "invalid value\n"); + pfsyncstats.pfsyncs_badstate++; + continue; + } + + bcopy(sp->id, &key.id, sizeof(key.id)); + key.creatorid = sp->creatorid; + + st = pf_find_state_byid(&key); + if (st == NULL) { + /* insert the update */ + if (pfsync_insert_net_state(sp, chksum_flag)) + pfsyncstats.pfsyncs_badstate++; + continue; + } + sfail = 0; + if (st->proto == IPPROTO_TCP) { + /* + * The state should never go backwards except + * for syn-proxy states. Neither should the + * sequence window slide backwards. + */ + if (st->src.state > sp->src.state && + (st->src.state < PF_TCPS_PROXY_SRC || + sp->src.state >= PF_TCPS_PROXY_SRC)) + sfail = 1; + else if (SEQ_GT(st->src.seqlo, + ntohl(sp->src.seqlo))) + sfail = 3; + else if (st->dst.state > sp->dst.state) { + /* There might still be useful + * information about the src state here, + * so import that part of the update, + * then "fail" so we send the updated + * state back to the peer who is missing + * our what we know. */ + pf_state_peer_ntoh(&sp->src, &st->src); + /* XXX do anything with timeouts? */ + sfail = 7; + flags = 0; + } else if (st->dst.state >= TCPS_SYN_SENT && + SEQ_GT(st->dst.seqlo, ntohl(sp->dst.seqlo))) + sfail = 4; + } else { + /* + * Non-TCP protocol state machine always go + * forwards + */ + if (st->src.state > sp->src.state) + sfail = 5; + else if (st->dst.state > sp->dst.state) + sfail = 6; + } + if (sfail) { + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: %s stale update " + "(%d) id: %016llx " + "creatorid: %08x\n", + (sfail < 7 ? "ignoring" + : "partial"), sfail, + betoh64(st->id), + ntohl(st->creatorid)); + pfsyncstats.pfsyncs_badstate++; + + if (!(sp->sync_flags & PFSTATE_STALE)) { + /* we have a better state, send it */ + if (sc->sc_mbuf != NULL && !stale) + pfsync_sendout(sc); + stale++; + if (!st->sync_flags) + pfsync_pack_state( + PFSYNC_ACT_UPD, st, flags); + } + continue; + } + pfsync_alloc_scrub_memory(&sp->dst, &st->dst); + pf_state_peer_ntoh(&sp->src, &st->src); + pf_state_peer_ntoh(&sp->dst, &st->dst); + st->expire = ntohl(sp->expire) + time_second; + st->timeout = sp->timeout; + } + if (stale && sc->sc_mbuf != NULL) + pfsync_sendout(sc); +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + break; + /* + * It's not strictly necessary for us to support the "uncompressed" + * delete action, but it's relatively simple and maintains consistency. 
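+	 * (PFSYNC_ACT_DEL_C below is the compressed variant, which
+	 * carries just enough to key the state: its id and creator id.)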
+ */ + case PFSYNC_ACT_DEL: + if ((mp = m_pulldown(m, iplen + sizeof(*ph), + count * sizeof(*sp), &offp)) == NULL) { + pfsyncstats.pfsyncs_badlen++; + return; + } + + s = splsoftnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + for (i = 0, sp = (struct pfsync_state *)(mp->m_data + offp); + i < count; i++, sp++) { + bcopy(sp->id, &key.id, sizeof(key.id)); + key.creatorid = sp->creatorid; + + st = pf_find_state_byid(&key); + if (st == NULL) { + pfsyncstats.pfsyncs_badstate++; + continue; + } + st->sync_flags |= PFSTATE_FROMSYNC; + pf_unlink_state(st); + } +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + break; + case PFSYNC_ACT_UPD_C: { + int update_requested = 0; + + if ((mp = m_pulldown(m, iplen + sizeof(*ph), + count * sizeof(*up), &offp)) == NULL) { + pfsyncstats.pfsyncs_badlen++; + return; + } + + s = splsoftnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + for (i = 0, up = (struct pfsync_state_upd *)(mp->m_data + offp); + i < count; i++, up++) { + /* check for invalid values */ + if (up->timeout >= PFTM_MAX || + up->src.state > PF_TCPS_PROXY_DST || + up->dst.state > PF_TCPS_PROXY_DST) { + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync_insert: " + "PFSYNC_ACT_UPD_C: " + "invalid value\n"); + pfsyncstats.pfsyncs_badstate++; + continue; + } + + bcopy(up->id, &key.id, sizeof(key.id)); + key.creatorid = up->creatorid; + + st = pf_find_state_byid(&key); + if (st == NULL) { + /* We don't have this state. Ask for it. */ + error = pfsync_request_update(up, &src); + if (error == ENOMEM) { +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + goto done; + } + update_requested = 1; + pfsyncstats.pfsyncs_badstate++; + continue; + } + sfail = 0; + if (st->proto == IPPROTO_TCP) { + /* + * The state should never go backwards except + * for syn-proxy states. Neither should the + * sequence window slide backwards. 
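+				 * e.g. an update whose src.seqlo lies behind
+				 * the one we already hold trips the SEQ_GT()
+				 * check below and is treated as stale (sfail 3).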
+ */ + if (st->src.state > up->src.state && + (st->src.state < PF_TCPS_PROXY_SRC || + up->src.state >= PF_TCPS_PROXY_SRC)) + sfail = 1; + else if (st->dst.state > up->dst.state) + sfail = 2; + else if (SEQ_GT(st->src.seqlo, + ntohl(up->src.seqlo))) + sfail = 3; + else if (st->dst.state >= TCPS_SYN_SENT && + SEQ_GT(st->dst.seqlo, ntohl(up->dst.seqlo))) + sfail = 4; + } else { + /* + * Non-TCP protocol state machine always go + * forwards + */ + if (st->src.state > up->src.state) + sfail = 5; + else if (st->dst.state > up->dst.state) + sfail = 6; + } + if (sfail) { + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: ignoring stale update " + "(%d) id: %016llx " + "creatorid: %08x\n", sfail, + betoh64(st->id), + ntohl(st->creatorid)); + pfsyncstats.pfsyncs_badstate++; + + /* we have a better state, send it out */ + if ((!stale || update_requested) && + sc->sc_mbuf != NULL) { + pfsync_sendout(sc); + update_requested = 0; + } + stale++; + if (!st->sync_flags) + pfsync_pack_state(PFSYNC_ACT_UPD, st, + PFSYNC_FLAG_STALE); + continue; + } + pfsync_alloc_scrub_memory(&up->dst, &st->dst); + pf_state_peer_ntoh(&up->src, &st->src); + pf_state_peer_ntoh(&up->dst, &st->dst); + st->expire = ntohl(up->expire) + time_second; + st->timeout = up->timeout; + } + if ((update_requested || stale) && sc->sc_mbuf) + pfsync_sendout(sc); +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + break; + } + case PFSYNC_ACT_DEL_C: + if ((mp = m_pulldown(m, iplen + sizeof(*ph), + count * sizeof(*dp), &offp)) == NULL) { + pfsyncstats.pfsyncs_badlen++; + return; + } + + s = splsoftnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + for (i = 0, dp = (struct pfsync_state_del *)(mp->m_data + offp); + i < count; i++, dp++) { + bcopy(dp->id, &key.id, sizeof(key.id)); + key.creatorid = dp->creatorid; + + st = pf_find_state_byid(&key); + if (st == NULL) { + pfsyncstats.pfsyncs_badstate++; + continue; + } + st->sync_flags |= PFSTATE_FROMSYNC; + pf_unlink_state(st); + } +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + break; + case PFSYNC_ACT_INS_F: + case PFSYNC_ACT_DEL_F: + /* not implemented */ + break; + case PFSYNC_ACT_UREQ: + if ((mp = m_pulldown(m, iplen + sizeof(*ph), + count * sizeof(*rup), &offp)) == NULL) { + pfsyncstats.pfsyncs_badlen++; + return; + } + + s = splsoftnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + if (sc->sc_mbuf != NULL) + pfsync_sendout(sc); + for (i = 0, + rup = (struct pfsync_state_upd_req *)(mp->m_data + offp); + i < count; i++, rup++) { + bcopy(rup->id, &key.id, sizeof(key.id)); + key.creatorid = rup->creatorid; + + if (key.id == 0 && key.creatorid == 0) { + sc->sc_ureq_received = time_uptime; + if (sc->sc_bulk_send_next == NULL) + sc->sc_bulk_send_next = + TAILQ_FIRST(&state_list); + sc->sc_bulk_terminator = sc->sc_bulk_send_next; + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: received " + "bulk update request\n"); + pfsync_send_bus(sc, PFSYNC_BUS_START); +#ifdef __FreeBSD__ + callout_reset(&sc->sc_bulk_tmo, 1 * hz, + pfsync_bulk_update, pfsyncif); +#else + timeout_add(&sc->sc_bulk_tmo, 1 * hz); +#endif + } else { + st = pf_find_state_byid(&key); + if (st == NULL) { + pfsyncstats.pfsyncs_badstate++; + continue; + } + if (!st->sync_flags) + pfsync_pack_state(PFSYNC_ACT_UPD, + st, 0); + } + } + if (sc->sc_mbuf != NULL) + pfsync_sendout(sc); +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + break; + case PFSYNC_ACT_BUS: + /* If we're not waiting for a bulk update, who cares. 
*/ + if (sc->sc_ureq_sent == 0) + break; + + if ((mp = m_pulldown(m, iplen + sizeof(*ph), + sizeof(*bus), &offp)) == NULL) { + pfsyncstats.pfsyncs_badlen++; + return; + } + bus = (struct pfsync_state_bus *)(mp->m_data + offp); + switch (bus->status) { + case PFSYNC_BUS_START: +#ifdef __FreeBSD__ + callout_reset(&sc->sc_bulkfail_tmo, + pf_pool_limits[PF_LIMIT_STATES].limit / + (PFSYNC_BULKPACKETS * sc->sc_maxcount), + pfsync_bulkfail, pfsyncif); +#else + timeout_add(&sc->sc_bulkfail_tmo, + pf_pool_limits[PF_LIMIT_STATES].limit / + (PFSYNC_BULKPACKETS * sc->sc_maxcount)); +#endif + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: received bulk " + "update start\n"); + break; + case PFSYNC_BUS_END: + if (time_uptime - ntohl(bus->endtime) >= + sc->sc_ureq_sent) { + /* that's it, we're happy */ + sc->sc_ureq_sent = 0; + sc->sc_bulk_tries = 0; + timeout_del(&sc->sc_bulkfail_tmo); +#if NCARP > 0 + if (!pfsync_sync_ok) +#ifdef __FreeBSD__ +#ifdef CARP_ADVANCED + carp_group_demote_adj(sc->sc_ifp, -1); +#endif +#else + carp_group_demote_adj(&sc->sc_if, -1); +#endif +#endif + pfsync_sync_ok = 1; + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: received valid " + "bulk update end\n"); + } else { + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: received invalid " + "bulk update end: bad timestamp\n"); + } + break; + } + break; +#ifdef PFSYNC_TDB + case PFSYNC_ACT_TDB_UPD: + if ((mp = m_pulldown(m, iplen + sizeof(*ph), + count * sizeof(*pt), &offp)) == NULL) { + pfsyncstats.pfsyncs_badlen++; + return; + } + s = splsoftnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + for (i = 0, pt = (struct pfsync_tdb *)(mp->m_data + offp); + i < count; i++, pt++) + pfsync_update_net_tdb(pt); +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + break; +#endif + } + +done: + if (m) + m_freem(m); +} + +int +pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct route *ro) +{ + m_freem(m); + return (0); +} + +/* ARGSUSED */ +int +pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ +#ifndef __FreeBSD__ + struct proc *p = curproc; +#endif + struct pfsync_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *)data; + struct ip_moptions *imo = &sc->sc_imo; + struct pfsyncreq pfsyncr; + struct ifnet *sifp; + int s, error; + + switch (cmd) { + case SIOCSIFADDR: + case SIOCAIFADDR: + case SIOCSIFDSTADDR: + case SIOCSIFFLAGS: +#ifdef __FreeBSD__ + if (ifp->if_flags & IFF_UP) + ifp->if_drv_flags |= IFF_DRV_RUNNING; + else + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; +#else + if (ifp->if_flags & IFF_UP) + ifp->if_flags |= IFF_RUNNING; + else + ifp->if_flags &= ~IFF_RUNNING; +#endif + break; + case SIOCSIFMTU: + if (ifr->ifr_mtu < PFSYNC_MINMTU) + return (EINVAL); + if (ifr->ifr_mtu > MCLBYTES) + ifr->ifr_mtu = MCLBYTES; + s = splnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + if (ifr->ifr_mtu < ifp->if_mtu) + pfsync_sendout(sc); + pfsync_setmtu(sc, ifr->ifr_mtu); +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + break; + case SIOCGETPFSYNC: + bzero(&pfsyncr, sizeof(pfsyncr)); + if (sc->sc_sync_ifp) + strlcpy(pfsyncr.pfsyncr_syncdev, + sc->sc_sync_ifp->if_xname, IFNAMSIZ); + pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer; + pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates; + if ((error = copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)))) + return (error); + break; + case SIOCSETPFSYNC: +#ifdef __FreeBSD__ + if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0) +#else + if ((error = suser(p, p->p_acflag)) != 0) +#endif + return (error); + if ((error = 
copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr)))) + return (error); + +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + if (pfsyncr.pfsyncr_syncpeer.s_addr == 0) +#ifdef __FreeBSD__ + sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP); +#else + sc->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP; +#endif + else + sc->sc_sync_peer.s_addr = + pfsyncr.pfsyncr_syncpeer.s_addr; + + if (pfsyncr.pfsyncr_maxupdates > 255) +#ifdef __FreeBSD__ + { + PF_UNLOCK(); +#endif + return (EINVAL); +#ifdef __FreeBSD__ + } +#endif + sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates; + + if (pfsyncr.pfsyncr_syncdev[0] == 0) { + sc->sc_sync_ifp = NULL; + if (sc->sc_mbuf_net != NULL) { + /* Don't keep stale pfsync packets around. */ + s = splnet(); + m_freem(sc->sc_mbuf_net); + sc->sc_mbuf_net = NULL; + sc->sc_statep_net.s = NULL; + splx(s); + } +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + if (imo->imo_num_memberships > 0) { + in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); + imo->imo_multicast_ifp = NULL; + } + break; + } + +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + if ((sifp = ifunit(pfsyncr.pfsyncr_syncdev)) == NULL) + return (EINVAL); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + + s = splnet(); +#ifdef __FreeBSD__ + if (sifp->if_mtu < sc->sc_ifp->if_mtu || +#else + if (sifp->if_mtu < sc->sc_if.if_mtu || +#endif + (sc->sc_sync_ifp != NULL && + sifp->if_mtu < sc->sc_sync_ifp->if_mtu) || + sifp->if_mtu < MCLBYTES - sizeof(struct ip)) + pfsync_sendout(sc); + sc->sc_sync_ifp = sifp; + +#ifdef __FreeBSD__ + pfsync_setmtu(sc, sc->sc_ifp->if_mtu); +#else + pfsync_setmtu(sc, sc->sc_if.if_mtu); +#endif + + if (imo->imo_num_memberships > 0) { +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + imo->imo_multicast_ifp = NULL; + } + + if (sc->sc_sync_ifp && +#ifdef __FreeBSD__ + sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) { +#else + sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) { +#endif + struct in_addr addr; + + if (!(sc->sc_sync_ifp->if_flags & IFF_MULTICAST)) { + sc->sc_sync_ifp = NULL; +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + return (EADDRNOTAVAIL); + } + +#ifdef __FreeBSD__ + addr.s_addr = htonl(INADDR_PFSYNC_GROUP); +#else + addr.s_addr = INADDR_PFSYNC_GROUP; +#endif + +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + if ((imo->imo_membership[0] = + in_addmulti(&addr, sc->sc_sync_ifp)) == NULL) { + sc->sc_sync_ifp = NULL; + splx(s); + return (ENOBUFS); + } +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + imo->imo_num_memberships++; + imo->imo_multicast_ifp = sc->sc_sync_ifp; + imo->imo_multicast_ttl = PFSYNC_DFLTTL; + imo->imo_multicast_loop = 0; + } + + if (sc->sc_sync_ifp || +#ifdef __FreeBSD__ + sc->sc_sendaddr.s_addr != htonl(INADDR_PFSYNC_GROUP)) { +#else + sc->sc_sendaddr.s_addr != INADDR_PFSYNC_GROUP) { +#endif + /* Request a full state table update. 
*/ + sc->sc_ureq_sent = time_uptime; +#if NCARP > 0 + if (pfsync_sync_ok) +#ifdef __FreeBSD__ +#ifdef CARP_ADVANCED + carp_group_demote_adj(sc->sc_ifp, 1); +#endif +#else + carp_group_demote_adj(&sc->sc_if, 1); +#endif +#endif + pfsync_sync_ok = 0; + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: requesting bulk update\n"); +#ifdef __FreeBSD__ + callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, + pfsync_bulkfail, pfsyncif); +#else + timeout_add(&sc->sc_bulkfail_tmo, 5 * hz); +#endif + error = pfsync_request_update(NULL, NULL); + if (error == ENOMEM) { +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + return (ENOMEM); + } + pfsync_sendout(sc); + } +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); + + break; + + default: + return (ENOTTY); + } + + return (0); +} + +void +pfsync_setmtu(struct pfsync_softc *sc, int mtu_req) +{ + int mtu; + + if (sc->sc_sync_ifp && sc->sc_sync_ifp->if_mtu < mtu_req) + mtu = sc->sc_sync_ifp->if_mtu; + else + mtu = mtu_req; + + sc->sc_maxcount = (mtu - sizeof(struct pfsync_header)) / + sizeof(struct pfsync_state); + if (sc->sc_maxcount > 254) + sc->sc_maxcount = 254; +#ifdef __FreeBSD__ + sc->sc_ifp->if_mtu = sizeof(struct pfsync_header) + +#else + sc->sc_if.if_mtu = sizeof(struct pfsync_header) + +#endif + sc->sc_maxcount * sizeof(struct pfsync_state); +} + +struct mbuf * +pfsync_get_mbuf(struct pfsync_softc *sc, u_int8_t action, void **sp) +{ + struct pfsync_header *h; + struct mbuf *m; + int len; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) { +#ifdef __FreeBSD__ + sc->sc_ifp->if_oerrors++; +#else + sc->sc_if.if_oerrors++; +#endif + return (NULL); + } + + switch (action) { + case PFSYNC_ACT_CLR: + len = sizeof(struct pfsync_header) + + sizeof(struct pfsync_state_clr); + break; + case PFSYNC_ACT_UPD_C: + len = (sc->sc_maxcount * sizeof(struct pfsync_state_upd)) + + sizeof(struct pfsync_header); + break; + case PFSYNC_ACT_DEL_C: + len = (sc->sc_maxcount * sizeof(struct pfsync_state_del)) + + sizeof(struct pfsync_header); + break; + case PFSYNC_ACT_UREQ: + len = (sc->sc_maxcount * sizeof(struct pfsync_state_upd_req)) + + sizeof(struct pfsync_header); + break; + case PFSYNC_ACT_BUS: + len = sizeof(struct pfsync_header) + + sizeof(struct pfsync_state_bus); + break; +#ifdef PFSYNC_TDB + case PFSYNC_ACT_TDB_UPD: + len = (sc->sc_maxcount * sizeof(struct pfsync_tdb)) + + sizeof(struct pfsync_header); + break; +#endif + default: + len = (sc->sc_maxcount * sizeof(struct pfsync_state)) + + sizeof(struct pfsync_header); + break; + } + + if (len > MHLEN) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); +#ifdef __FreeBSD__ + sc->sc_ifp->if_oerrors++; +#else + sc->sc_if.if_oerrors++; +#endif + return (NULL); + } + m->m_data += (MCLBYTES - len) &~ (sizeof(long) - 1); + } else + MH_ALIGN(m, len); + + m->m_pkthdr.rcvif = NULL; + m->m_pkthdr.len = m->m_len = sizeof(struct pfsync_header); + h = mtod(m, struct pfsync_header *); + h->version = PFSYNC_VERSION; + h->af = 0; + h->count = 0; + h->action = action; +#ifndef PFSYNC_TDB + if (action != PFSYNC_ACT_TDB_UPD) +#endif + bcopy(&pf_status.pf_chksum, &h->pf_chksum, + PF_MD5_DIGEST_LENGTH); + + *sp = (void *)((char *)h + PFSYNC_HDRLEN); +#ifdef PFSYNC_TDB + if (action == PFSYNC_ACT_TDB_UPD) +#ifdef __FreeBSD__ + callout_reset(&sc->sc_tdb_tmo, hz, pfsync_tdb_timeout, + pfsyncif); +#else + timeout_add(&sc->sc_tdb_tmo, hz); +#endif + else +#endif +#ifdef __FreeBSD__ + callout_reset(&sc->sc_tmo, hz, pfsync_timeout, pfsyncif); +#else + timeout_add(&sc->sc_tmo, hz); +#endif + return (m); +} + 
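+/*
+ * Usage sketch (illustrative only; it mirrors what pfsync_pack_state()
+ * and friends do below): pfsync_get_mbuf() returns an mbuf holding just
+ * a pfsync_header and points *sp at the payload area.  The caller then
+ * appends fixed-size records, growing the mbuf and the header count for
+ * each record it fills in:
+ *
+ *	union sc_statep sp;
+ *	struct mbuf *m;
+ *
+ *	m = pfsync_get_mbuf(sc, PFSYNC_ACT_UPD, (void *)&sp.s);
+ *	if (m != NULL) {
+ *		bzero(sp.s, sizeof(*sp.s));	// fill in one state record
+ *		m->m_pkthdr.len = m->m_len += sizeof(struct pfsync_state);
+ *		mtod(m, struct pfsync_header *)->count++;
+ *	}
+ */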
+int +pfsync_pack_state(u_int8_t action, struct pf_state *st, int flags) +{ + struct ifnet *ifp = NULL; + struct pfsync_softc *sc = pfsyncif; + struct pfsync_header *h, *h_net; + struct pfsync_state *sp = NULL; + struct pfsync_state_upd *up = NULL; + struct pfsync_state_del *dp = NULL; + struct pf_rule *r; + u_long secs; + int s, ret = 0; + u_int8_t i = 255, newaction = 0; + + if (sc == NULL) + return (0); +#ifdef __FreeBSD__ + ifp = sc->sc_ifp; +#else + ifp = &sc->sc_if; +#endif + + /* + * If a packet falls in the forest and there's nobody around to + * hear, does it make a sound? + */ + if (ifp->if_bpf == NULL && sc->sc_sync_ifp == NULL && +#ifdef __FreeBSD__ + sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) { +#else + sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) { +#endif + /* Don't leave any stale pfsync packets hanging around. */ + if (sc->sc_mbuf != NULL) { + m_freem(sc->sc_mbuf); + sc->sc_mbuf = NULL; + sc->sc_statep.s = NULL; + } + return (0); + } + + if (action >= PFSYNC_ACT_MAX) + return (EINVAL); + + s = splnet(); +#ifdef __FreeBSD__ + PF_ASSERT(MA_OWNED); +#endif + if (sc->sc_mbuf == NULL) { + if ((sc->sc_mbuf = pfsync_get_mbuf(sc, action, + (void *)&sc->sc_statep.s)) == NULL) { + splx(s); + return (ENOMEM); + } + h = mtod(sc->sc_mbuf, struct pfsync_header *); + } else { + h = mtod(sc->sc_mbuf, struct pfsync_header *); + if (h->action != action) { + pfsync_sendout(sc); + if ((sc->sc_mbuf = pfsync_get_mbuf(sc, action, + (void *)&sc->sc_statep.s)) == NULL) { + splx(s); + return (ENOMEM); + } + h = mtod(sc->sc_mbuf, struct pfsync_header *); + } else { + /* + * If it's an update, look in the packet to see if + * we already have an update for the state. + */ + if (action == PFSYNC_ACT_UPD && sc->sc_maxupdates) { + struct pfsync_state *usp = + (void *)((char *)h + PFSYNC_HDRLEN); + + for (i = 0; i < h->count; i++) { + if (!memcmp(usp->id, &st->id, + PFSYNC_ID_LEN) && + usp->creatorid == st->creatorid) { + sp = usp; + sp->updates++; + break; + } + usp++; + } + } + } + } + + secs = time_second; + + st->pfsync_time = time_uptime; + + if (sp == NULL) { + /* not a "duplicate" update */ + i = 255; + sp = sc->sc_statep.s++; + sc->sc_mbuf->m_pkthdr.len = + sc->sc_mbuf->m_len += sizeof(struct pfsync_state); + h->count++; + bzero(sp, sizeof(*sp)); + + bcopy(&st->id, sp->id, sizeof(sp->id)); + sp->creatorid = st->creatorid; + + strlcpy(sp->ifname, st->u.s.kif->pfik_name, sizeof(sp->ifname)); + pf_state_host_hton(&st->lan, &sp->lan); + pf_state_host_hton(&st->gwy, &sp->gwy); + pf_state_host_hton(&st->ext, &sp->ext); + + bcopy(&st->rt_addr, &sp->rt_addr, sizeof(sp->rt_addr)); + + sp->creation = htonl(secs - st->creation); + pf_state_counter_hton(st->packets[0], sp->packets[0]); + pf_state_counter_hton(st->packets[1], sp->packets[1]); + pf_state_counter_hton(st->bytes[0], sp->bytes[0]); + pf_state_counter_hton(st->bytes[1], sp->bytes[1]); + if ((r = st->rule.ptr) == NULL) + sp->rule = htonl(-1); + else + sp->rule = htonl(r->nr); + if ((r = st->anchor.ptr) == NULL) + sp->anchor = htonl(-1); + else + sp->anchor = htonl(r->nr); + sp->af = st->af; + sp->proto = st->proto; + sp->direction = st->direction; + sp->log = st->log; + sp->state_flags = st->state_flags; + sp->timeout = st->timeout; + + if (flags & PFSYNC_FLAG_STALE) + sp->sync_flags |= PFSTATE_STALE; + } + + pf_state_peer_hton(&st->src, &sp->src); + pf_state_peer_hton(&st->dst, &sp->dst); + + if (st->expire <= secs) + sp->expire = htonl(0); + else + sp->expire = htonl(st->expire - secs); + + /* do we need to build "compressed" 
actions for network transfer? */ + if (sc->sc_sync_ifp && flags & PFSYNC_FLAG_COMPRESS) { + switch (action) { + case PFSYNC_ACT_UPD: + newaction = PFSYNC_ACT_UPD_C; + break; + case PFSYNC_ACT_DEL: + newaction = PFSYNC_ACT_DEL_C; + break; + default: + /* by default we just send the uncompressed states */ + break; + } + } + + if (newaction) { + if (sc->sc_mbuf_net == NULL) { + if ((sc->sc_mbuf_net = pfsync_get_mbuf(sc, newaction, + (void *)&sc->sc_statep_net.s)) == NULL) { + splx(s); + return (ENOMEM); + } + } + h_net = mtod(sc->sc_mbuf_net, struct pfsync_header *); + + switch (newaction) { + case PFSYNC_ACT_UPD_C: + if (i != 255) { + up = (void *)((char *)h_net + + PFSYNC_HDRLEN + (i * sizeof(*up))); + up->updates++; + } else { + h_net->count++; + sc->sc_mbuf_net->m_pkthdr.len = + sc->sc_mbuf_net->m_len += sizeof(*up); + up = sc->sc_statep_net.u++; + + bzero(up, sizeof(*up)); + bcopy(&st->id, up->id, sizeof(up->id)); + up->creatorid = st->creatorid; + } + up->timeout = st->timeout; + up->expire = sp->expire; + up->src = sp->src; + up->dst = sp->dst; + break; + case PFSYNC_ACT_DEL_C: + sc->sc_mbuf_net->m_pkthdr.len = + sc->sc_mbuf_net->m_len += sizeof(*dp); + dp = sc->sc_statep_net.d++; + h_net->count++; + + bzero(dp, sizeof(*dp)); + bcopy(&st->id, dp->id, sizeof(dp->id)); + dp->creatorid = st->creatorid; + break; + } + } + + if (h->count == sc->sc_maxcount || + (sc->sc_maxupdates && (sp->updates >= sc->sc_maxupdates))) + ret = pfsync_sendout(sc); + + splx(s); + return (ret); +} + +/* This must be called in splnet() */ +int +pfsync_request_update(struct pfsync_state_upd *up, struct in_addr *src) +{ + struct ifnet *ifp = NULL; + struct pfsync_header *h; + struct pfsync_softc *sc = pfsyncif; + struct pfsync_state_upd_req *rup; + int ret = 0; + + if (sc == NULL) + return (0); + +#ifdef __FreeBSD__ + ifp = sc->sc_ifp; +#else + ifp = &sc->sc_if; +#endif + if (sc->sc_mbuf == NULL) { + if ((sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_UREQ, + (void *)&sc->sc_statep.s)) == NULL) + return (ENOMEM); + h = mtod(sc->sc_mbuf, struct pfsync_header *); + } else { + h = mtod(sc->sc_mbuf, struct pfsync_header *); + if (h->action != PFSYNC_ACT_UREQ) { + pfsync_sendout(sc); + if ((sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_UREQ, + (void *)&sc->sc_statep.s)) == NULL) + return (ENOMEM); + h = mtod(sc->sc_mbuf, struct pfsync_header *); + } + } + + if (src != NULL) + sc->sc_sendaddr = *src; + sc->sc_mbuf->m_pkthdr.len = sc->sc_mbuf->m_len += sizeof(*rup); + h->count++; + rup = sc->sc_statep.r++; + bzero(rup, sizeof(*rup)); + if (up != NULL) { + bcopy(up->id, rup->id, sizeof(rup->id)); + rup->creatorid = up->creatorid; + } + + if (h->count == sc->sc_maxcount) + ret = pfsync_sendout(sc); + + return (ret); +} + +int +pfsync_clear_states(u_int32_t creatorid, char *ifname) +{ + struct ifnet *ifp = NULL; + struct pfsync_softc *sc = pfsyncif; + struct pfsync_state_clr *cp; + int s, ret; + + if (sc == NULL) + return (0); + +#ifdef __FreeBSD__ + ifp = sc->sc_ifp; +#else + ifp = &sc->sc_if; +#endif +#ifdef __FreeBSD__ + PF_ASSERT(MA_OWNED); +#endif + s = splnet(); + if (sc->sc_mbuf != NULL) + pfsync_sendout(sc); + if ((sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_CLR, + (void *)&sc->sc_statep.c)) == NULL) { + splx(s); + return (ENOMEM); + } + sc->sc_mbuf->m_pkthdr.len = sc->sc_mbuf->m_len += sizeof(*cp); + cp = sc->sc_statep.c; + cp->creatorid = creatorid; + if (ifname != NULL) + strlcpy(cp->ifname, ifname, IFNAMSIZ); + + ret = (pfsync_sendout(sc)); + splx(s); + return (ret); +} + +void +pfsync_timeout(void *v) +{ + 
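+	/*
+	 * Flush handler for the one second (hz) callout armed by
+	 * pfsync_get_mbuf(): if nothing else has sent the accumulating
+	 * packet by now, push it out so a partially filled update does
+	 * not linger.
+	 */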
struct pfsync_softc *sc = v; + int s; + + s = splnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + pfsync_sendout(sc); +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); +} + +#ifdef PFSYNC_TDB +void +pfsync_tdb_timeout(void *v) +{ + struct pfsync_softc *sc = v; + int s; + + s = splnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + pfsync_tdb_sendout(sc); +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + splx(s); +} +#endif + +/* This must be called in splnet() */ +void +pfsync_send_bus(struct pfsync_softc *sc, u_int8_t status) +{ + struct pfsync_state_bus *bus; + +#ifdef __FreeBSD__ + PF_ASSERT(MA_OWNED); +#endif + if (sc->sc_mbuf != NULL) + pfsync_sendout(sc); + + if (pfsync_sync_ok && + (sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_BUS, + (void *)&sc->sc_statep.b)) != NULL) { + sc->sc_mbuf->m_pkthdr.len = sc->sc_mbuf->m_len += sizeof(*bus); + bus = sc->sc_statep.b; + bus->creatorid = pf_status.hostid; + bus->status = status; + bus->endtime = htonl(time_uptime - sc->sc_ureq_received); + pfsync_sendout(sc); + } +} + +void +pfsync_bulk_update(void *v) +{ + struct pfsync_softc *sc = v; + int s, i = 0; + struct pf_state *state; + + s = splnet(); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + if (sc->sc_mbuf != NULL) + pfsync_sendout(sc); + + /* + * Grab at most PFSYNC_BULKPACKETS worth of states which have not + * been sent since the latest request was made. + */ + state = sc->sc_bulk_send_next; + if (state) + do { + /* send state update if syncable and not already sent */ + if (!state->sync_flags + && state->timeout < PFTM_MAX + && state->pfsync_time <= sc->sc_ureq_received) { + pfsync_pack_state(PFSYNC_ACT_UPD, state, 0); + i++; + } + + /* figure next state to send */ + state = TAILQ_NEXT(state, u.s.entry_list); + + /* wrap to start of list if we hit the end */ + if (!state) + state = TAILQ_FIRST(&state_list); + } while (i < sc->sc_maxcount * PFSYNC_BULKPACKETS && + state != sc->sc_bulk_terminator); + + if (!state || state == sc->sc_bulk_terminator) { + /* we're done */ + pfsync_send_bus(sc, PFSYNC_BUS_END); + sc->sc_ureq_received = 0; + sc->sc_bulk_send_next = NULL; + sc->sc_bulk_terminator = NULL; + timeout_del(&sc->sc_bulk_tmo); + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: bulk update complete\n"); + } else { + /* look again for more in a bit */ +#ifdef __FreeBSD__ + callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, + pfsyncif); +#else + timeout_add(&sc->sc_bulk_tmo, 1); +#endif + sc->sc_bulk_send_next = state; + } + if (sc->sc_mbuf != NULL) + pfsync_sendout(sc); + splx(s); +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif +} + +void +pfsync_bulkfail(void *v) +{ + struct pfsync_softc *sc = v; + int s, error; + +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) { + /* Try again in a bit */ +#ifdef __FreeBSD__ + callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulkfail, + pfsyncif); +#else + timeout_add(&sc->sc_bulkfail_tmo, 5 * hz); +#endif + s = splnet(); + error = pfsync_request_update(NULL, NULL); + if (error == ENOMEM) { + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: cannot allocate mbufs for " + "bulk update\n"); + } else + pfsync_sendout(sc); + splx(s); + } else { + /* Pretend like the transfer was ok */ + sc->sc_ureq_sent = 0; + sc->sc_bulk_tries = 0; +#if NCARP > 0 + if (!pfsync_sync_ok) +#ifdef __FreeBSD__ +#ifdef CARP_ADVANCED + carp_group_demote_adj(sc->sc_ifp, -1); +#endif +#else + carp_group_demote_adj(&sc->sc_if, -1); +#endif +#endif + pfsync_sync_ok = 1; + if (pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: 
failed to receive " + "bulk update status\n"); + timeout_del(&sc->sc_bulkfail_tmo); + } +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif +} + +/* This must be called in splnet() */ +int +pfsync_sendout(struct pfsync_softc *sc) +{ +#if NBPFILTER > 0 +#ifdef __FreeBSD__ + struct ifnet *ifp = sc->sc_ifp; +#else + struct ifnet *ifp = &sc->sc_if; +#endif +#endif + struct mbuf *m; + +#ifdef __FreeBSD__ + PF_ASSERT(MA_OWNED); +#endif + timeout_del(&sc->sc_tmo); + + if (sc->sc_mbuf == NULL) + return (0); + m = sc->sc_mbuf; + sc->sc_mbuf = NULL; + sc->sc_statep.s = NULL; + +#if NBPFILTER > 0 + if (ifp->if_bpf) +#ifdef __FreeBSD__ + BPF_MTAP(ifp, m); +#else + bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT); +#endif +#endif + + if (sc->sc_mbuf_net) { + m_freem(m); + m = sc->sc_mbuf_net; + sc->sc_mbuf_net = NULL; + sc->sc_statep_net.s = NULL; + } + + return pfsync_sendout_mbuf(sc, m); +} + +#ifdef PFSYNC_TDB +int +pfsync_tdb_sendout(struct pfsync_softc *sc) +{ +#if NBPFILTER > 0 +#ifdef __FreeBSD__ + struct ifnet *ifp = sc->sc_ifp; +#else + struct ifnet *ifp = &sc->sc_if; +#endif +#endif + struct mbuf *m; + +#ifdef __FreeBSD__ + PF_ASSERT(MA_OWNED); +#endif + timeout_del(&sc->sc_tdb_tmo); + + if (sc->sc_mbuf_tdb == NULL) + return (0); + m = sc->sc_mbuf_tdb; + sc->sc_mbuf_tdb = NULL; + sc->sc_statep_tdb.t = NULL; + +#if NBPFILTER > 0 + if (ifp->if_bpf) +#ifdef __FreeBSD__ + BPF_MTAP(ifp, m); +#else + bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT); +#endif +#endif + + return pfsync_sendout_mbuf(sc, m); +} +#endif + +int +pfsync_sendout_mbuf(struct pfsync_softc *sc, struct mbuf *m) +{ + struct sockaddr sa; + struct ip *ip; + +#ifdef __FreeBSD__ + PF_ASSERT(MA_OWNED); +#endif + if (sc->sc_sync_ifp || +#ifdef __FreeBSD__ + sc->sc_sync_peer.s_addr != htonl(INADDR_PFSYNC_GROUP)) { +#else + sc->sc_sync_peer.s_addr != INADDR_PFSYNC_GROUP) { +#endif + M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); + if (m == NULL) { + pfsyncstats.pfsyncs_onomem++; + return (0); + } + ip = mtod(m, struct ip *); + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + ip->ip_tos = IPTOS_LOWDELAY; +#ifdef __FreeBSD__ + ip->ip_len = m->m_pkthdr.len; +#else + ip->ip_len = htons(m->m_pkthdr.len); +#endif + ip->ip_id = htons(ip_randomid()); +#ifdef __FreeBSD__ + ip->ip_off = IP_DF; +#else + ip->ip_off = htons(IP_DF); +#endif + ip->ip_ttl = PFSYNC_DFLTTL; + ip->ip_p = IPPROTO_PFSYNC; + ip->ip_sum = 0; + + bzero(&sa, sizeof(sa)); + ip->ip_src.s_addr = INADDR_ANY; + +#ifdef __FreeBSD__ + if (sc->sc_sendaddr.s_addr == htonl(INADDR_PFSYNC_GROUP)) +#else + if (sc->sc_sendaddr.s_addr == INADDR_PFSYNC_GROUP) +#endif + m->m_flags |= M_MCAST; + ip->ip_dst = sc->sc_sendaddr; + sc->sc_sendaddr.s_addr = sc->sc_sync_peer.s_addr; + + pfsyncstats.pfsyncs_opackets++; + +#ifdef __FreeBSD__ + if (!IF_HANDOFF(&sc->sc_ifq, m, NULL)) + pfsyncstats.pfsyncs_oerrors++; + taskqueue_enqueue(taskqueue_thread, &pfsyncif->sc_send_task); +#else + if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) + pfsyncstats.pfsyncs_oerrors++; +#endif + } else + m_freem(m); + + return (0); +} + +#ifdef PFSYNC_TDB +/* Update an in-kernel tdb. Silently fail if no tdb is found. 
 */
+void
+pfsync_update_net_tdb(struct pfsync_tdb *pt)
+{
+	struct tdb *tdb;
+	int s;
+
+	/* check for invalid values */
+	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
+	    (pt->dst.sa.sa_family != AF_INET &&
+	    pt->dst.sa.sa_family != AF_INET6))
+		goto bad;
+
+	s = spltdb();
+	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
+	if (tdb) {
+		pt->rpl = ntohl(pt->rpl);
+		pt->cur_bytes = betoh64(pt->cur_bytes);
+
+		/* Neither replay nor byte counter should ever decrease. */
+		if (pt->rpl < tdb->tdb_rpl ||
+		    pt->cur_bytes < tdb->tdb_cur_bytes) {
+			splx(s);
+			goto bad;
+		}
+
+		tdb->tdb_rpl = pt->rpl;
+		tdb->tdb_cur_bytes = pt->cur_bytes;
+	}
+	splx(s);
+	return;
+
+ bad:
+	if (pf_status.debug >= PF_DEBUG_MISC)
+		printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
+		    "invalid value\n");
+	pfsyncstats.pfsyncs_badstate++;
+	return;
+}
+
+/* One of our local tdbs has been updated, need to sync rpl with others */
+int
+pfsync_update_tdb(struct tdb *tdb, int output)
+{
+	struct ifnet *ifp = NULL;
+	struct pfsync_softc *sc = pfsyncif;
+	struct pfsync_header *h;
+	struct pfsync_tdb *pt = NULL;
+	int s, i, ret = 0;
+
+	if (sc == NULL)
+		return (0);
+
+#ifdef __FreeBSD__
+	ifp = sc->sc_ifp;
+#else
+	ifp = &sc->sc_if;
+#endif
+	if (ifp->if_bpf == NULL && sc->sc_sync_ifp == NULL &&
+#ifdef __FreeBSD__
+	    sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
+#else
+	    sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
+#endif
+		/* Don't leave any stale pfsync packets hanging around. */
+		if (sc->sc_mbuf_tdb != NULL) {
+			m_freem(sc->sc_mbuf_tdb);
+			sc->sc_mbuf_tdb = NULL;
+			sc->sc_statep_tdb.t = NULL;
+		}
+		return (0);
+	}
+
+#ifdef __FreeBSD__
+	PF_ASSERT(MA_OWNED);
+#endif
+	s = splnet();
+	if (sc->sc_mbuf_tdb == NULL) {
+		if ((sc->sc_mbuf_tdb = pfsync_get_mbuf(sc, PFSYNC_ACT_TDB_UPD,
+		    (void *)&sc->sc_statep_tdb.t)) == NULL) {
+			splx(s);
+			return (ENOMEM);
+		}
+		h = mtod(sc->sc_mbuf_tdb, struct pfsync_header *);
+	} else {
+		h = mtod(sc->sc_mbuf_tdb, struct pfsync_header *);
+		if (h->action != PFSYNC_ACT_TDB_UPD) {
+			/*
+			 * XXX will never happen as long as there's
+			 * only one "TDB action".
+			 */
+			pfsync_tdb_sendout(sc);
+			sc->sc_mbuf_tdb = pfsync_get_mbuf(sc,
+			    PFSYNC_ACT_TDB_UPD, (void *)&sc->sc_statep_tdb.t);
+			if (sc->sc_mbuf_tdb == NULL) {
+				splx(s);
+				return (ENOMEM);
+			}
+			h = mtod(sc->sc_mbuf_tdb, struct pfsync_header *);
+		} else if (sc->sc_maxupdates) {
+			/*
+			 * If it's an update, look in the packet to see if
+			 * we already have an update for the state.
+			 */
+			struct pfsync_tdb *u =
+			    (void *)((char *)h + PFSYNC_HDRLEN);
+
+			for (i = 0; !pt && i < h->count; i++) {
+				if (tdb->tdb_spi == u->spi &&
+				    tdb->tdb_sproto == u->sproto &&
+				    !bcmp(&tdb->tdb_dst, &u->dst,
+				    SA_LEN(&u->dst.sa))) {
+					pt = u;
+					pt->updates++;
+				}
+				u++;
+			}
+		}
+	}
+
+	if (pt == NULL) {
+		/* not a "duplicate" update */
+		pt = sc->sc_statep_tdb.t++;
+		sc->sc_mbuf_tdb->m_pkthdr.len =
+		    sc->sc_mbuf_tdb->m_len += sizeof(struct pfsync_tdb);
+		h->count++;
+		bzero(pt, sizeof(*pt));
+
+		pt->spi = tdb->tdb_spi;
+		memcpy(&pt->dst, &tdb->tdb_dst, sizeof pt->dst);
+		pt->sproto = tdb->tdb_sproto;
+	}
+
+	/*
+	 * When a failover happens, the master's rpl is probably above
+	 * what we see here (we may be up to a second late), so
+	 * increase it a bit for outbound tdbs to manage most such
+	 * situations.
+	 *
+	 * For now, just add an offset that is likely to be larger
+	 * than the number of packets we can see in one second.  The RFC
+	 * just says the next packet must have a higher seq value.
+	 *
+	 * XXX What is a good algorithm for this?  We could use
+	 * a rate-determined increase, but to know it, we would have
+	 * to extend struct tdb.
+	 * XXX pt->rpl can wrap over MAXINT, but if so the real tdb
+	 * will soon be replaced anyway.  For now, just don't handle
+	 * this edge case.
+	 */
+#define RPL_INCR 16384
+	pt->rpl = htonl(tdb->tdb_rpl + (output ? RPL_INCR : 0));
+	pt->cur_bytes = htobe64(tdb->tdb_cur_bytes);
+
+	if (h->count == sc->sc_maxcount ||
+	    (sc->sc_maxupdates && (pt->updates >= sc->sc_maxupdates)))
+		ret = pfsync_tdb_sendout(sc);
+
+	splx(s);
+	return (ret);
+}
+#endif /* PFSYNC_TDB */
+
+#ifdef __FreeBSD__
+void
+pfsync_ifdetach(void *arg, struct ifnet *ifp)
+{
+	struct pfsync_softc *sc = (struct pfsync_softc *)arg;
+	struct ip_moptions *imo;
+
+	if (sc == NULL || sc->sc_sync_ifp != ifp)
+		return;		/* not for us; unlocked read */
+
+	PF_LOCK();
+
+	/* Deal with a member interface going away from under us. */
+	sc->sc_sync_ifp = NULL;
+	if (sc->sc_mbuf_net != NULL) {
+		m_freem(sc->sc_mbuf_net);
+		sc->sc_mbuf_net = NULL;
+		sc->sc_statep_net.s = NULL;
+	}
+	imo = &sc->sc_imo;
+	if (imo->imo_num_memberships > 0) {
+		KASSERT(imo->imo_num_memberships == 1,
+		    ("%s: imo_num_memberships != 1", __func__));
+		/*
+		 * Our event handler is always called after protocol
+		 * domains have been detached from the underlying ifnet.
+		 * Do not call in_delmulti(); we held a single reference
+		 * which the protocol domain has purged in in_purgemaddrs().
+		 */
+		PF_UNLOCK();
+		imo->imo_membership[--imo->imo_num_memberships] = NULL;
+		PF_LOCK();
+		imo->imo_multicast_ifp = NULL;
+	}
+
+	PF_UNLOCK();
+}
+
+void
+pfsync_senddef(void *arg, __unused int pending)
+{
+	struct pfsync_softc *sc = (struct pfsync_softc *)arg;
+	struct mbuf *m;
+
+	for (;;) {
+		IF_DEQUEUE(&sc->sc_ifq, m);
+		if (m == NULL)
+			break;
+		/* Deal with a member interface going away from under us. */
+		if (sc->sc_sync_ifp == NULL) {
+			pfsyncstats.pfsyncs_oerrors++;
+			m_freem(m);
+			continue;
+		}
+		if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL))
+			pfsyncstats.pfsyncs_oerrors++;
+	}
+}
+
+static int
+pfsync_modevent(module_t mod, int type, void *data)
+{
+	int error = 0;
+
+	switch (type) {
+	case MOD_LOAD:
+		pfsyncattach(0);
+		break;
+	case MOD_UNLOAD:
+		if_clone_detach(&pfsync_cloner);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	return error;
+}
+
+static moduledata_t pfsync_mod = {
+	"pfsync",
+	pfsync_modevent,
+	0
+};
+
+#define PFSYNC_MODVER 1
+
+DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+MODULE_VERSION(pfsync, PFSYNC_MODVER);
+MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);
+#endif /* __FreeBSD__ */
diff --git a/contrib/pf/rtems/freebsd/net/if_pfsync.h b/contrib/pf/rtems/freebsd/net/if_pfsync.h
new file mode 100644
index 00000000..e3e6caf9
--- /dev/null
+++ b/contrib/pf/rtems/freebsd/net/if_pfsync.h
@@ -0,0 +1,375 @@
+/* $FreeBSD$ */
+/* $OpenBSD: if_pfsync.h,v 1.30 2006/10/31 14:49:01 henning Exp $ */
+
+/*
+ * Copyright (c) 2001 Michael Shalayeff
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _NET_IF_PFSYNC_HH_ +#define _NET_IF_PFSYNC_HH_ + + +#define PFSYNC_ID_LEN sizeof(u_int64_t) + +struct pfsync_state_scrub { + u_int16_t pfss_flags; + u_int8_t pfss_ttl; /* stashed TTL */ +#define PFSYNC_SCRUB_FLAG_VALID 0x01 + u_int8_t scrub_flag; + u_int32_t pfss_ts_mod; /* timestamp modulation */ +} __packed; + +struct pfsync_state_host { + struct pf_addr addr; + u_int16_t port; + u_int16_t pad[3]; +} __packed; + +struct pfsync_state_peer { + struct pfsync_state_scrub scrub; /* state is scrubbed */ + u_int32_t seqlo; /* Max sequence number sent */ + u_int32_t seqhi; /* Max the other end ACKd + win */ + u_int32_t seqdiff; /* Sequence number modulator */ + u_int16_t max_win; /* largest window (pre scaling) */ + u_int16_t mss; /* Maximum segment size option */ + u_int8_t state; /* active state level */ + u_int8_t wscale; /* window scaling factor */ + u_int8_t pad[6]; +} __packed; + +struct pfsync_state { + u_int32_t id[2]; + char ifname[IFNAMSIZ]; + struct pfsync_state_host lan; + struct pfsync_state_host gwy; + struct pfsync_state_host ext; + struct pfsync_state_peer src; + struct pfsync_state_peer dst; + struct pf_addr rt_addr; + u_int32_t rule; + u_int32_t anchor; + u_int32_t nat_rule; + u_int32_t creation; + u_int32_t expire; + u_int32_t packets[2][2]; + u_int32_t bytes[2][2]; + u_int32_t creatorid; + sa_family_t af; + u_int8_t proto; + u_int8_t direction; + u_int8_t log; + u_int8_t state_flags; + u_int8_t timeout; + u_int8_t sync_flags; + u_int8_t updates; +} __packed; + +#define PFSYNC_FLAG_COMPRESS 0x01 +#define PFSYNC_FLAG_STALE 0x02 + +#ifdef PFSYNC_TDB +struct pfsync_tdb { + u_int32_t spi; + union sockaddr_union dst; + u_int32_t rpl; + u_int64_t cur_bytes; + u_int8_t sproto; + u_int8_t updates; + u_int8_t pad[2]; +} __packed; +#endif + +struct pfsync_state_upd { + u_int32_t id[2]; + struct pfsync_state_peer src; + struct pfsync_state_peer dst; + u_int32_t creatorid; + u_int32_t expire; + u_int8_t timeout; + u_int8_t updates; + u_int8_t pad[6]; +} __packed; + +struct pfsync_state_del { + u_int32_t id[2]; + u_int32_t creatorid; + struct { + u_int8_t state; + } src; + struct { + u_int8_t state; + } dst; + u_int8_t pad[2]; +} __packed; + +struct pfsync_state_upd_req { + u_int32_t id[2]; + u_int32_t creatorid; + u_int32_t pad; +} __packed; + +struct pfsync_state_clr { + char ifname[IFNAMSIZ]; + u_int32_t creatorid; + u_int32_t pad; +} __packed; + +struct pfsync_state_bus { + u_int32_t creatorid; + u_int32_t endtime; + u_int8_t status; +#define PFSYNC_BUS_START 1 +#define PFSYNC_BUS_END 2 + u_int8_t pad[7]; +} __packed; + +#ifdef _KERNEL + +union sc_statep { + struct pfsync_state *s; + struct pfsync_state_upd *u; + struct pfsync_state_del *d; + struct pfsync_state_clr *c; + struct pfsync_state_bus *b; + struct 
pfsync_state_upd_req	*r;
+};
+
+#ifdef PFSYNC_TDB
+union sc_tdb_statep {
+	struct pfsync_tdb	*t;
+};
+#endif
+
+extern int	pfsync_sync_ok;
+
+struct pfsync_softc {
+#ifdef __FreeBSD__
+	struct ifnet		*sc_ifp;
+#else
+	struct ifnet		sc_if;
+#endif
+	struct ifnet		*sc_sync_ifp;
+
+	struct ip_moptions	sc_imo;
+#ifdef __FreeBSD__
+	struct callout		sc_tmo;
+#ifdef PFSYNC_TDB
+	struct callout		sc_tdb_tmo;
+#endif
+	struct callout		sc_bulk_tmo;
+	struct callout		sc_bulkfail_tmo;
+#else
+	struct timeout		sc_tmo;
+	struct timeout		sc_tdb_tmo;
+	struct timeout		sc_bulk_tmo;
+	struct timeout		sc_bulkfail_tmo;
+#endif
+	struct in_addr		sc_sync_peer;
+	struct in_addr		sc_sendaddr;
+	struct mbuf		*sc_mbuf;	/* current cumulative mbuf */
+	struct mbuf		*sc_mbuf_net;	/* current cumulative mbuf */
+#ifdef PFSYNC_TDB
+	struct mbuf		*sc_mbuf_tdb;	/* ditto for TDB updates */
+#endif
+#ifdef __FreeBSD__
+	struct ifqueue		sc_ifq;
+	struct task		sc_send_task;
+#endif
+	union sc_statep		sc_statep;
+	union sc_statep		sc_statep_net;
+#ifdef PFSYNC_TDB
+	union sc_tdb_statep	sc_statep_tdb;
+#endif
+	u_int32_t		sc_ureq_received;
+	u_int32_t		sc_ureq_sent;
+	struct pf_state		*sc_bulk_send_next;
+	struct pf_state		*sc_bulk_terminator;
+	int			sc_bulk_tries;
+	int			sc_maxcount;	/* number of states in mtu */
+	int			sc_maxupdates;	/* number of updates/state */
+#ifdef __FreeBSD__
+	eventhandler_tag	sc_detachtag;
+#endif
+};
+
+extern struct pfsync_softc	*pfsyncif;
+#endif
+
+
+struct pfsync_header {
+	u_int8_t version;
+#define	PFSYNC_VERSION	3
+	u_int8_t af;
+	u_int8_t action;
+#define	PFSYNC_ACT_CLR		0	/* clear all states */
+#define	PFSYNC_ACT_INS		1	/* insert state */
+#define	PFSYNC_ACT_UPD		2	/* update state */
+#define	PFSYNC_ACT_DEL		3	/* delete state */
+#define	PFSYNC_ACT_UPD_C	4	/* "compressed" state update */
+#define	PFSYNC_ACT_DEL_C	5	/* "compressed" state delete */
+#define	PFSYNC_ACT_INS_F	6	/* insert fragment */
+#define	PFSYNC_ACT_DEL_F	7	/* delete fragments */
+#define	PFSYNC_ACT_UREQ		8	/* request "uncompressed" state */
+#define	PFSYNC_ACT_BUS		9	/* Bulk Update Status */
+#define	PFSYNC_ACT_TDB_UPD	10	/* TDB replay counter update */
+#define	PFSYNC_ACT_MAX		11
+	u_int8_t count;
+	u_int8_t pf_chksum[PF_MD5_DIGEST_LENGTH];
+} __packed;
+
+#define	PFSYNC_BULKPACKETS	1	/* # of packets per timeout */
+#define	PFSYNC_MAX_BULKTRIES	12
+#define	PFSYNC_HDRLEN	sizeof(struct pfsync_header)
+#define	PFSYNC_ACTIONS \
+	"CLR ST", "INS ST", "UPD ST", "DEL ST", \
+	"UPD ST COMP", "DEL ST COMP", "INS FR", "DEL FR", \
+	"UPD REQ", "BLK UPD STAT", "TDB UPD"
+
+#define	PFSYNC_DFLTTL		255
+
+struct pfsyncstats {
+	u_int64_t	pfsyncs_ipackets;	/* total input packets, IPv4 */
+	u_int64_t	pfsyncs_ipackets6;	/* total input packets, IPv6 */
+	u_int64_t	pfsyncs_badif;		/* not the right interface */
+	u_int64_t	pfsyncs_badttl;		/* TTL is not PFSYNC_DFLTTL */
+	u_int64_t	pfsyncs_hdrops;		/* packets shorter than hdr */
+	u_int64_t	pfsyncs_badver;		/* bad (incl unsupp) version */
+	u_int64_t	pfsyncs_badact;		/* bad action */
+	u_int64_t	pfsyncs_badlen;		/* data length does not match */
+	u_int64_t	pfsyncs_badauth;	/* bad authentication */
+	u_int64_t	pfsyncs_stale;		/* stale state */
+	u_int64_t	pfsyncs_badval;		/* bad values */
+	u_int64_t	pfsyncs_badstate;	/* insert/lookup failed */
+
+	u_int64_t	pfsyncs_opackets;	/* total output packets, IPv4 */
+	u_int64_t	pfsyncs_opackets6;	/* total output packets, IPv6 */
+	u_int64_t	pfsyncs_onomem;		/* no memory for an mbuf */
+	u_int64_t	pfsyncs_oerrors;	/* ip output error */
+};
+
+/*
+ * Configuration structure for SIOCSETPFSYNC
SIOCGETPFSYNC + */ +struct pfsyncreq { + char pfsyncr_syncdev[IFNAMSIZ]; + struct in_addr pfsyncr_syncpeer; + int pfsyncr_maxupdates; + int pfsyncr_authlevel; +}; + +#ifdef __FreeBSD__ +#define SIOCSETPFSYNC _IOW('i', 247, struct ifreq) +#define SIOCGETPFSYNC _IOWR('i', 248, struct ifreq) +#endif + +#define pf_state_peer_hton(s,d) do { \ + (d)->seqlo = htonl((s)->seqlo); \ + (d)->seqhi = htonl((s)->seqhi); \ + (d)->seqdiff = htonl((s)->seqdiff); \ + (d)->max_win = htons((s)->max_win); \ + (d)->mss = htons((s)->mss); \ + (d)->state = (s)->state; \ + (d)->wscale = (s)->wscale; \ + if ((s)->scrub) { \ + (d)->scrub.pfss_flags = \ + htons((s)->scrub->pfss_flags & PFSS_TIMESTAMP); \ + (d)->scrub.pfss_ttl = (s)->scrub->pfss_ttl; \ + (d)->scrub.pfss_ts_mod = htonl((s)->scrub->pfss_ts_mod);\ + (d)->scrub.scrub_flag = PFSYNC_SCRUB_FLAG_VALID; \ + } \ +} while (0) + +#define pf_state_peer_ntoh(s,d) do { \ + (d)->seqlo = ntohl((s)->seqlo); \ + (d)->seqhi = ntohl((s)->seqhi); \ + (d)->seqdiff = ntohl((s)->seqdiff); \ + (d)->max_win = ntohs((s)->max_win); \ + (d)->mss = ntohs((s)->mss); \ + (d)->state = (s)->state; \ + (d)->wscale = (s)->wscale; \ + if ((s)->scrub.scrub_flag == PFSYNC_SCRUB_FLAG_VALID && \ + (d)->scrub != NULL) { \ + (d)->scrub->pfss_flags = \ + ntohs((s)->scrub.pfss_flags) & PFSS_TIMESTAMP; \ + (d)->scrub->pfss_ttl = (s)->scrub.pfss_ttl; \ + (d)->scrub->pfss_ts_mod = ntohl((s)->scrub.pfss_ts_mod);\ + } \ +} while (0) + +#define pf_state_host_hton(s,d) do { \ + bcopy(&(s)->addr, &(d)->addr, sizeof((d)->addr)); \ + (d)->port = (s)->port; \ +} while (0) + +#define pf_state_host_ntoh(s,d) do { \ + bcopy(&(s)->addr, &(d)->addr, sizeof((d)->addr)); \ + (d)->port = (s)->port; \ +} while (0) + +#define pf_state_counter_hton(s,d) do { \ + d[0] = htonl((s>>32)&0xffffffff); \ + d[1] = htonl(s&0xffffffff); \ +} while (0) + +#define pf_state_counter_ntoh(s,d) do { \ + d = ntohl(s[0]); \ + d = d<<32; \ + d += ntohl(s[1]); \ +} while (0) + +#ifdef _KERNEL +#ifdef __FreeBSD__ +void pfsync_input(struct mbuf *, __unused int); +#else +void pfsync_input(struct mbuf *, ...); +#endif +int pfsync_clear_states(u_int32_t, char *); +int pfsync_pack_state(u_int8_t, struct pf_state *, int); +#define pfsync_insert_state(st) do { \ + if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) || \ + (st->proto == IPPROTO_PFSYNC)) \ + st->sync_flags |= PFSTATE_NOSYNC; \ + else if (!st->sync_flags) \ + pfsync_pack_state(PFSYNC_ACT_INS, (st), \ + PFSYNC_FLAG_COMPRESS); \ + st->sync_flags &= ~PFSTATE_FROMSYNC; \ +} while (0) +#define pfsync_update_state(st) do { \ + if (!st->sync_flags) \ + pfsync_pack_state(PFSYNC_ACT_UPD, (st), \ + PFSYNC_FLAG_COMPRESS); \ + st->sync_flags &= ~PFSTATE_FROMSYNC; \ +} while (0) +#define pfsync_delete_state(st) do { \ + if (!st->sync_flags) \ + pfsync_pack_state(PFSYNC_ACT_DEL, (st), \ + PFSYNC_FLAG_COMPRESS); \ +} while (0) +#ifdef PFSYNC_TDB +int pfsync_update_tdb(struct tdb *, int); +#endif +#endif + +#endif /* _NET_IF_PFSYNC_HH_ */ diff --git a/contrib/pf/rtems/freebsd/net/pf.c b/contrib/pf/rtems/freebsd/net/pf.c new file mode 100644 index 00000000..ea1a642a --- /dev/null +++ b/contrib/pf/rtems/freebsd/net/pf.c @@ -0,0 +1,7771 @@ +#include + +/* $OpenBSD: pf.c,v 1.527 2007/02/22 15:23:23 pyr Exp $ */ +/* add: $OpenBSD: pf.c,v 1.559 2007/09/18 18:45:59 markus Exp $ */ + +/* + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002,2003 Henning Brauer + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. + * + */ + +#ifdef __FreeBSD__ +#include +#include + +#include +__FBSDID("$FreeBSD$"); +#endif + +#ifdef __FreeBSD__ +#include +#include + +#ifdef DEV_BPF +#define NBPFILTER DEV_BPF +#else +#define NBPFILTER 0 +#endif + +#ifdef DEV_PFLOG +#define NPFLOG DEV_PFLOG +#else +#define NPFLOG 0 +#endif + +#ifdef DEV_PFSYNC +#define NPFSYNC DEV_PFSYNC +#else +#define NPFSYNC 0 +#endif + +#else +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __FreeBSD__ +#include +#include +#else +#include +#endif +#include +#ifdef __FreeBSD__ +#include +#include +#include +#else +#include +#endif + +#include +#include +#include +#include +#ifndef __FreeBSD__ +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __FreeBSD__ +#include +#endif +#include +#include + +#if NPFSYNC > 0 +#include +#endif /* NPFSYNC > 0 */ + +#ifdef INET6 +#include +#include +#include +#include +#ifdef __FreeBSD__ +#include +#include +#endif +#endif /* INET6 */ + +#ifdef __FreeBSD__ +#include +#include +#include +#include + +extern int ip_optcopy(struct ip *, struct ip *); +extern int debug_pfugidhack; +#endif + +#define DPFPRINTF(n, x) if (pf_status.debug >= (n)) printf x + +/* + * Global variables + */ + +struct pf_altqqueue pf_altqs[2]; +struct pf_palist pf_pabuf; +struct pf_altqqueue *pf_altqs_active; +struct pf_altqqueue *pf_altqs_inactive; +struct pf_status pf_status; + +u_int32_t ticket_altqs_active; +u_int32_t ticket_altqs_inactive; +int altqs_inactive_open; +u_int32_t ticket_pabuf; + +struct pf_anchor_stackframe { + struct pf_ruleset *rs; + struct pf_rule *r; + struct pf_anchor_node *parent; + struct pf_anchor *child; +} pf_anchor_stack[64]; + +#ifdef __FreeBSD__ +uma_zone_t pf_src_tree_pl, pf_rule_pl; +uma_zone_t pf_state_pl, pf_altq_pl, pf_pooladdr_pl; +#else +struct pool pf_src_tree_pl, pf_rule_pl; +struct pool 
pf_state_pl, pf_altq_pl, pf_pooladdr_pl; +#endif + +void pf_print_host(struct pf_addr *, u_int16_t, u_int8_t); + +void pf_init_threshold(struct pf_threshold *, u_int32_t, + u_int32_t); +void pf_add_threshold(struct pf_threshold *); +int pf_check_threshold(struct pf_threshold *); + +void pf_change_ap(struct pf_addr *, u_int16_t *, + u_int16_t *, u_int16_t *, struct pf_addr *, + u_int16_t, u_int8_t, sa_family_t); +int pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *, + struct tcphdr *, struct pf_state_peer *); +#ifdef INET6 +void pf_change_a6(struct pf_addr *, u_int16_t *, + struct pf_addr *, u_int8_t); +#endif /* INET6 */ +void pf_change_icmp(struct pf_addr *, u_int16_t *, + struct pf_addr *, struct pf_addr *, u_int16_t, + u_int16_t *, u_int16_t *, u_int16_t *, + u_int16_t *, u_int8_t, sa_family_t); +#ifdef __FreeBSD__ +void pf_send_tcp(struct mbuf *, + const struct pf_rule *, sa_family_t, +#else +void pf_send_tcp(const struct pf_rule *, sa_family_t, +#endif + const struct pf_addr *, const struct pf_addr *, + u_int16_t, u_int16_t, u_int32_t, u_int32_t, + u_int8_t, u_int16_t, u_int16_t, u_int8_t, int, + u_int16_t, struct ether_header *, struct ifnet *); +void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, + sa_family_t, struct pf_rule *); +struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *, + int, int, struct pfi_kif *, + struct pf_addr *, u_int16_t, struct pf_addr *, + u_int16_t, int); +struct pf_rule *pf_get_translation(struct pf_pdesc *, struct mbuf *, + int, int, struct pfi_kif *, struct pf_src_node **, + struct pf_addr *, u_int16_t, + struct pf_addr *, u_int16_t, + struct pf_addr *, u_int16_t *); +int pf_test_tcp(struct pf_rule **, struct pf_state **, + int, struct pfi_kif *, struct mbuf *, int, + void *, struct pf_pdesc *, struct pf_rule **, +#ifdef __FreeBSD__ + struct pf_ruleset **, struct ifqueue *, + struct inpcb *); +#else + struct pf_ruleset **, struct ifqueue *); +#endif +int pf_test_udp(struct pf_rule **, struct pf_state **, + int, struct pfi_kif *, struct mbuf *, int, + void *, struct pf_pdesc *, struct pf_rule **, +#ifdef __FreeBSD__ + struct pf_ruleset **, struct ifqueue *, + struct inpcb *); +#else + struct pf_ruleset **, struct ifqueue *); +#endif +int pf_test_icmp(struct pf_rule **, struct pf_state **, + int, struct pfi_kif *, struct mbuf *, int, + void *, struct pf_pdesc *, struct pf_rule **, + struct pf_ruleset **, struct ifqueue *); +int pf_test_other(struct pf_rule **, struct pf_state **, + int, struct pfi_kif *, struct mbuf *, int, void *, + struct pf_pdesc *, struct pf_rule **, + struct pf_ruleset **, struct ifqueue *); +int pf_test_fragment(struct pf_rule **, int, + struct pfi_kif *, struct mbuf *, void *, + struct pf_pdesc *, struct pf_rule **, + struct pf_ruleset **); +int pf_tcp_track_full(struct pf_state_peer *, + struct pf_state_peer *, struct pf_state **, + struct pfi_kif *, struct mbuf *, int, + struct pf_pdesc *, u_short *, int *); +int pf_tcp_track_sloppy(struct pf_state_peer *, + struct pf_state_peer *, struct pf_state **, + struct pf_pdesc *, u_short *); +int pf_test_state_tcp(struct pf_state **, int, + struct pfi_kif *, struct mbuf *, int, + void *, struct pf_pdesc *, u_short *); +int pf_test_state_udp(struct pf_state **, int, + struct pfi_kif *, struct mbuf *, int, + void *, struct pf_pdesc *); +int pf_test_state_icmp(struct pf_state **, int, + struct pfi_kif *, struct mbuf *, int, + void *, struct pf_pdesc *, u_short *); +int pf_test_state_other(struct pf_state **, int, + struct pfi_kif *, struct pf_pdesc *); +int 
pf_match_tag(struct mbuf *, struct pf_rule *, + struct pf_mtag *, int *); +int pf_step_out_of_anchor(int *, struct pf_ruleset **, + int, struct pf_rule **, struct pf_rule **, + int *); +void pf_hash(struct pf_addr *, struct pf_addr *, + struct pf_poolhashkey *, sa_family_t); +int pf_map_addr(u_int8_t, struct pf_rule *, + struct pf_addr *, struct pf_addr *, + struct pf_addr *, struct pf_src_node **); +int pf_get_sport(sa_family_t, u_int8_t, struct pf_rule *, + struct pf_addr *, struct pf_addr *, u_int16_t, + struct pf_addr *, u_int16_t*, u_int16_t, u_int16_t, + struct pf_src_node **); +void pf_route(struct mbuf **, struct pf_rule *, int, + struct ifnet *, struct pf_state *, + struct pf_pdesc *); +void pf_route6(struct mbuf **, struct pf_rule *, int, + struct ifnet *, struct pf_state *, + struct pf_pdesc *); +#ifdef __FreeBSD__ +/* XXX: import */ +#else +int pf_socket_lookup(int, struct pf_pdesc *); +#endif +u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t, + sa_family_t); +u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t, + sa_family_t); +u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t, + u_int16_t); +void pf_set_rt_ifp(struct pf_state *, + struct pf_addr *); +int pf_check_proto_cksum(struct mbuf *, int, int, + u_int8_t, sa_family_t); +int pf_addr_wrap_neq(struct pf_addr_wrap *, + struct pf_addr_wrap *); +struct pf_state *pf_find_state_recurse(struct pfi_kif *, + struct pf_state_cmp *, u_int8_t); +int pf_src_connlimit(struct pf_state **); +int pf_check_congestion(struct ifqueue *); + +#ifdef __FreeBSD__ +int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len); + +extern int pf_end_threads; + +struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX]; +#else +extern struct pool pfr_ktable_pl; +extern struct pool pfr_kentry_pl; + +struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX] = { + { &pf_state_pl, PFSTATE_HIWAT }, + { &pf_src_tree_pl, PFSNODE_HIWAT }, + { &pf_frent_pl, PFFRAG_FRENT_HIWAT }, + { &pfr_ktable_pl, PFR_KTABLE_HIWAT }, + { &pfr_kentry_pl, PFR_KENTRY_HIWAT } +}; +#endif + +#define STATE_LOOKUP() \ + do { \ + if (direction == PF_IN) \ + *state = pf_find_state_recurse( \ + kif, &key, PF_EXT_GWY); \ + else \ + *state = pf_find_state_recurse( \ + kif, &key, PF_LAN_EXT); \ + if (*state == NULL || (*state)->timeout == PFTM_PURGE) \ + return (PF_DROP); \ + if (direction == PF_OUT && \ + (((*state)->rule.ptr->rt == PF_ROUTETO && \ + (*state)->rule.ptr->direction == PF_OUT) || \ + ((*state)->rule.ptr->rt == PF_REPLYTO && \ + (*state)->rule.ptr->direction == PF_IN)) && \ + (*state)->rt_kif != NULL && \ + (*state)->rt_kif != kif) \ + return (PF_PASS); \ + } while (0) + +#define STATE_TRANSLATE(s) \ + (s)->lan.addr.addr32[0] != (s)->gwy.addr.addr32[0] || \ + ((s)->af == AF_INET6 && \ + ((s)->lan.addr.addr32[1] != (s)->gwy.addr.addr32[1] || \ + (s)->lan.addr.addr32[2] != (s)->gwy.addr.addr32[2] || \ + (s)->lan.addr.addr32[3] != (s)->gwy.addr.addr32[3])) || \ + (s)->lan.port != (s)->gwy.port + +#define BOUND_IFACE(r, k) \ + ((r)->rule_flag & PFRULE_IFBOUND) ? 
(k) : pfi_all + +#define STATE_INC_COUNTERS(s) \ + do { \ + s->rule.ptr->states++; \ + if (s->anchor.ptr != NULL) \ + s->anchor.ptr->states++; \ + if (s->nat_rule.ptr != NULL) \ + s->nat_rule.ptr->states++; \ + } while (0) + +#define STATE_DEC_COUNTERS(s) \ + do { \ + if (s->nat_rule.ptr != NULL) \ + s->nat_rule.ptr->states--; \ + if (s->anchor.ptr != NULL) \ + s->anchor.ptr->states--; \ + s->rule.ptr->states--; \ + } while (0) + +struct pf_src_tree tree_src_tracking; + +struct pf_state_tree_id tree_id; +struct pf_state_queue state_list; + +#ifdef __FreeBSD__ +static int pf_src_compare(struct pf_src_node *, struct pf_src_node *); +static int pf_state_compare_lan_ext(struct pf_state *, struct pf_state *); +static int pf_state_compare_ext_gwy(struct pf_state *, struct pf_state *); +static int pf_state_compare_id(struct pf_state *, struct pf_state *); +#endif + +RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare); +RB_GENERATE(pf_state_tree_lan_ext, pf_state, + u.s.entry_lan_ext, pf_state_compare_lan_ext); +RB_GENERATE(pf_state_tree_ext_gwy, pf_state, + u.s.entry_ext_gwy, pf_state_compare_ext_gwy); +RB_GENERATE(pf_state_tree_id, pf_state, + u.s.entry_id, pf_state_compare_id); + +#ifdef __FreeBSD__ +static int +#else +static __inline int +#endif +pf_src_compare(struct pf_src_node *a, struct pf_src_node *b) +{ + int diff; + + if (a->rule.ptr > b->rule.ptr) + return (1); + if (a->rule.ptr < b->rule.ptr) + return (-1); + if ((diff = a->af - b->af) != 0) + return (diff); + switch (a->af) { +#ifdef INET + case AF_INET: + if (a->addr.addr32[0] > b->addr.addr32[0]) + return (1); + if (a->addr.addr32[0] < b->addr.addr32[0]) + return (-1); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (a->addr.addr32[3] > b->addr.addr32[3]) + return (1); + if (a->addr.addr32[3] < b->addr.addr32[3]) + return (-1); + if (a->addr.addr32[2] > b->addr.addr32[2]) + return (1); + if (a->addr.addr32[2] < b->addr.addr32[2]) + return (-1); + if (a->addr.addr32[1] > b->addr.addr32[1]) + return (1); + if (a->addr.addr32[1] < b->addr.addr32[1]) + return (-1); + if (a->addr.addr32[0] > b->addr.addr32[0]) + return (1); + if (a->addr.addr32[0] < b->addr.addr32[0]) + return (-1); + break; +#endif /* INET6 */ + } + return (0); +} + +#ifdef __FreeBSD__ +static int +#else +static __inline int +#endif +pf_state_compare_lan_ext(struct pf_state *a, struct pf_state *b) +{ + int diff; + + if ((diff = a->proto - b->proto) != 0) + return (diff); + if ((diff = a->af - b->af) != 0) + return (diff); + switch (a->af) { +#ifdef INET + case AF_INET: + if (a->lan.addr.addr32[0] > b->lan.addr.addr32[0]) + return (1); + if (a->lan.addr.addr32[0] < b->lan.addr.addr32[0]) + return (-1); + if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) + return (1); + if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) + return (-1); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (a->lan.addr.addr32[3] > b->lan.addr.addr32[3]) + return (1); + if (a->lan.addr.addr32[3] < b->lan.addr.addr32[3]) + return (-1); + if (a->ext.addr.addr32[3] > b->ext.addr.addr32[3]) + return (1); + if (a->ext.addr.addr32[3] < b->ext.addr.addr32[3]) + return (-1); + if (a->lan.addr.addr32[2] > b->lan.addr.addr32[2]) + return (1); + if (a->lan.addr.addr32[2] < b->lan.addr.addr32[2]) + return (-1); + if (a->ext.addr.addr32[2] > b->ext.addr.addr32[2]) + return (1); + if (a->ext.addr.addr32[2] < b->ext.addr.addr32[2]) + return (-1); + if (a->lan.addr.addr32[1] > b->lan.addr.addr32[1]) + return (1); + if (a->lan.addr.addr32[1] < b->lan.addr.addr32[1]) + 
return (-1); + if (a->ext.addr.addr32[1] > b->ext.addr.addr32[1]) + return (1); + if (a->ext.addr.addr32[1] < b->ext.addr.addr32[1]) + return (-1); + if (a->lan.addr.addr32[0] > b->lan.addr.addr32[0]) + return (1); + if (a->lan.addr.addr32[0] < b->lan.addr.addr32[0]) + return (-1); + if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) + return (1); + if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) + return (-1); + break; +#endif /* INET6 */ + } + + if ((diff = a->lan.port - b->lan.port) != 0) + return (diff); + if ((diff = a->ext.port - b->ext.port) != 0) + return (diff); + + return (0); +} + +#ifdef __FreeBSD__ +static int +#else +static __inline int +#endif +pf_state_compare_ext_gwy(struct pf_state *a, struct pf_state *b) +{ + int diff; + + if ((diff = a->proto - b->proto) != 0) + return (diff); + if ((diff = a->af - b->af) != 0) + return (diff); + switch (a->af) { +#ifdef INET + case AF_INET: + if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) + return (1); + if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) + return (-1); + if (a->gwy.addr.addr32[0] > b->gwy.addr.addr32[0]) + return (1); + if (a->gwy.addr.addr32[0] < b->gwy.addr.addr32[0]) + return (-1); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (a->ext.addr.addr32[3] > b->ext.addr.addr32[3]) + return (1); + if (a->ext.addr.addr32[3] < b->ext.addr.addr32[3]) + return (-1); + if (a->gwy.addr.addr32[3] > b->gwy.addr.addr32[3]) + return (1); + if (a->gwy.addr.addr32[3] < b->gwy.addr.addr32[3]) + return (-1); + if (a->ext.addr.addr32[2] > b->ext.addr.addr32[2]) + return (1); + if (a->ext.addr.addr32[2] < b->ext.addr.addr32[2]) + return (-1); + if (a->gwy.addr.addr32[2] > b->gwy.addr.addr32[2]) + return (1); + if (a->gwy.addr.addr32[2] < b->gwy.addr.addr32[2]) + return (-1); + if (a->ext.addr.addr32[1] > b->ext.addr.addr32[1]) + return (1); + if (a->ext.addr.addr32[1] < b->ext.addr.addr32[1]) + return (-1); + if (a->gwy.addr.addr32[1] > b->gwy.addr.addr32[1]) + return (1); + if (a->gwy.addr.addr32[1] < b->gwy.addr.addr32[1]) + return (-1); + if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) + return (1); + if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) + return (-1); + if (a->gwy.addr.addr32[0] > b->gwy.addr.addr32[0]) + return (1); + if (a->gwy.addr.addr32[0] < b->gwy.addr.addr32[0]) + return (-1); + break; +#endif /* INET6 */ + } + + if ((diff = a->ext.port - b->ext.port) != 0) + return (diff); + if ((diff = a->gwy.port - b->gwy.port) != 0) + return (diff); + + return (0); +} + +#ifdef __FreeBSD__ +static int +#else +static __inline int +#endif +pf_state_compare_id(struct pf_state *a, struct pf_state *b) +{ + if (a->id > b->id) + return (1); + if (a->id < b->id) + return (-1); + if (a->creatorid > b->creatorid) + return (1); + if (a->creatorid < b->creatorid) + return (-1); + + return (0); +} + +#ifdef INET6 +void +pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + dst->addr32[0] = src->addr32[0]; + break; +#endif /* INET */ + case AF_INET6: + dst->addr32[0] = src->addr32[0]; + dst->addr32[1] = src->addr32[1]; + dst->addr32[2] = src->addr32[2]; + dst->addr32[3] = src->addr32[3]; + break; + } +} +#endif /* INET6 */ + +struct pf_state * +pf_find_state_byid(struct pf_state_cmp *key) +{ + pf_status.fcounters[FCNT_STATE_SEARCH]++; + return (RB_FIND(pf_state_tree_id, &tree_id, (struct pf_state *)key)); +} + +struct pf_state * +pf_find_state_recurse(struct pfi_kif *kif, struct pf_state_cmp *key, u_int8_t tree) +{ + struct pf_state *s; + + 
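+	/*
+	 * Check the per-interface tree first, then fall back to the
+	 * pfi_all tree, since BOUND_IFACE() files states from rules
+	 * that are not interface-bound under pfi_all at insertion time.
+	 */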
pf_status.fcounters[FCNT_STATE_SEARCH]++; + + switch (tree) { + case PF_LAN_EXT: + if ((s = RB_FIND(pf_state_tree_lan_ext, &kif->pfik_lan_ext, + (struct pf_state *)key)) != NULL) + return (s); + if ((s = RB_FIND(pf_state_tree_lan_ext, &pfi_all->pfik_lan_ext, + (struct pf_state *)key)) != NULL) + return (s); + return (NULL); + case PF_EXT_GWY: + if ((s = RB_FIND(pf_state_tree_ext_gwy, &kif->pfik_ext_gwy, + (struct pf_state *)key)) != NULL) + return (s); + if ((s = RB_FIND(pf_state_tree_ext_gwy, &pfi_all->pfik_ext_gwy, + (struct pf_state *)key)) != NULL) + return (s); + return (NULL); + default: + panic("pf_find_state_recurse"); + } +} + +struct pf_state * +pf_find_state_all(struct pf_state_cmp *key, u_int8_t tree, int *more) +{ + struct pf_state *s, *ss = NULL; + struct pfi_kif *kif; + + pf_status.fcounters[FCNT_STATE_SEARCH]++; + + switch (tree) { + case PF_LAN_EXT: + TAILQ_FOREACH(kif, &pfi_statehead, pfik_w_states) { + s = RB_FIND(pf_state_tree_lan_ext, + &kif->pfik_lan_ext, (struct pf_state *)key); + if (s == NULL) + continue; + if (more == NULL) + return (s); + ss = s; + (*more)++; + } + return (ss); + case PF_EXT_GWY: + TAILQ_FOREACH(kif, &pfi_statehead, pfik_w_states) { + s = RB_FIND(pf_state_tree_ext_gwy, + &kif->pfik_ext_gwy, (struct pf_state *)key); + if (s == NULL) + continue; + if (more == NULL) + return (s); + ss = s; + (*more)++; + } + return (ss); + default: + panic("pf_find_state_all"); + } +} + +void +pf_init_threshold(struct pf_threshold *threshold, + u_int32_t limit, u_int32_t seconds) +{ + threshold->limit = limit * PF_THRESHOLD_MULT; + threshold->seconds = seconds; + threshold->count = 0; + threshold->last = time_second; +} + +void +pf_add_threshold(struct pf_threshold *threshold) +{ + u_int32_t t = time_second, diff = t - threshold->last; + + if (diff >= threshold->seconds) + threshold->count = 0; + else + threshold->count -= threshold->count * diff / + threshold->seconds; + threshold->count += PF_THRESHOLD_MULT; + threshold->last = t; +} + +int +pf_check_threshold(struct pf_threshold *threshold) +{ + return (threshold->count > threshold->limit); +} + +int +pf_src_connlimit(struct pf_state **state) +{ + struct pf_state *s; + int bad = 0; + + (*state)->src_node->conn++; + (*state)->src.tcp_est = 1; + pf_add_threshold(&(*state)->src_node->conn_rate); + + if ((*state)->rule.ptr->max_src_conn && + (*state)->rule.ptr->max_src_conn < + (*state)->src_node->conn) { + pf_status.lcounters[LCNT_SRCCONN]++; + bad++; + } + + if ((*state)->rule.ptr->max_src_conn_rate.limit && + pf_check_threshold(&(*state)->src_node->conn_rate)) { + pf_status.lcounters[LCNT_SRCCONNRATE]++; + bad++; + } + + if (!bad) + return (0); + + if ((*state)->rule.ptr->overload_tbl) { + struct pfr_addr p; + u_int32_t killed = 0; + + pf_status.lcounters[LCNT_OVERLOAD_TABLE]++; + if (pf_status.debug >= PF_DEBUG_MISC) { + printf("pf_src_connlimit: blocking address "); + pf_print_host(&(*state)->src_node->addr, 0, + (*state)->af); + } + + bzero(&p, sizeof(p)); + p.pfra_af = (*state)->af; + switch ((*state)->af) { +#ifdef INET + case AF_INET: + p.pfra_net = 32; + p.pfra_ip4addr = (*state)->src_node->addr.v4; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + p.pfra_net = 128; + p.pfra_ip6addr = (*state)->src_node->addr.v6; + break; +#endif /* INET6 */ + } + + pfr_insert_kentry((*state)->rule.ptr->overload_tbl, + &p, time_second); + + /* kill existing states if that's required. 
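+	 * (Editorial aside on pf_add_threshold() and pf_check_threshold()
+	 * above, not upstream text: the counters are fixed-point, scaled
+	 * by PF_THRESHOLD_MULT (1000 in pfvar.h), so "max-src-conn-rate
+	 * 10/5" stores limit = 10000; each new connection adds 1000 after
+	 * a linear decay of count * diff / seconds.  Worked example: 11
+	 * connections inside one 5 s window leave count at roughly 11000,
+	 * which exceeds 10000, so the overload handling below kicks in.)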
*/ + if ((*state)->rule.ptr->flush) { + pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++; + + RB_FOREACH(s, pf_state_tree_id, &tree_id) { + /* + * Kill states from this source. (Only those + * from the same rule if PF_FLUSH_GLOBAL is not + * set) + */ + if (s->af == (*state)->af && + (((*state)->direction == PF_OUT && + PF_AEQ(&(*state)->src_node->addr, + &s->lan.addr, s->af)) || + ((*state)->direction == PF_IN && + PF_AEQ(&(*state)->src_node->addr, + &s->ext.addr, s->af))) && + ((*state)->rule.ptr->flush & + PF_FLUSH_GLOBAL || + (*state)->rule.ptr == s->rule.ptr)) { + s->timeout = PFTM_PURGE; + s->src.state = s->dst.state = + TCPS_CLOSED; + killed++; + } + } + if (pf_status.debug >= PF_DEBUG_MISC) + printf(", %u states killed", killed); + } + if (pf_status.debug >= PF_DEBUG_MISC) + printf("\n"); + } + + /* kill this state */ + (*state)->timeout = PFTM_PURGE; + (*state)->src.state = (*state)->dst.state = TCPS_CLOSED; + return (1); +} + +int +pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule, + struct pf_addr *src, sa_family_t af) +{ + struct pf_src_node k; + + if (*sn == NULL) { + k.af = af; + PF_ACPY(&k.addr, src, af); + if (rule->rule_flag & PFRULE_RULESRCTRACK || + rule->rpool.opts & PF_POOL_STICKYADDR) + k.rule.ptr = rule; + else + k.rule.ptr = NULL; + pf_status.scounters[SCNT_SRC_NODE_SEARCH]++; + *sn = RB_FIND(pf_src_tree, &tree_src_tracking, &k); + } + if (*sn == NULL) { + if (!rule->max_src_nodes || + rule->src_nodes < rule->max_src_nodes) + (*sn) = pool_get(&pf_src_tree_pl, PR_NOWAIT); + else + pf_status.lcounters[LCNT_SRCNODES]++; + if ((*sn) == NULL) + return (-1); + bzero(*sn, sizeof(struct pf_src_node)); + + pf_init_threshold(&(*sn)->conn_rate, + rule->max_src_conn_rate.limit, + rule->max_src_conn_rate.seconds); + + (*sn)->af = af; + if (rule->rule_flag & PFRULE_RULESRCTRACK || + rule->rpool.opts & PF_POOL_STICKYADDR) + (*sn)->rule.ptr = rule; + else + (*sn)->rule.ptr = NULL; + PF_ACPY(&(*sn)->addr, src, af); + if (RB_INSERT(pf_src_tree, + &tree_src_tracking, *sn) != NULL) { + if (pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: src_tree insert failed: "); + pf_print_host(&(*sn)->addr, 0, af); + printf("\n"); + } + pool_put(&pf_src_tree_pl, *sn); + return (-1); + } + (*sn)->creation = time_second; + (*sn)->ruletype = rule->action; + if ((*sn)->rule.ptr != NULL) + (*sn)->rule.ptr->src_nodes++; + pf_status.scounters[SCNT_SRC_NODE_INSERT]++; + pf_status.src_nodes++; + } else { + if (rule->max_src_states && + (*sn)->states >= rule->max_src_states) { + pf_status.lcounters[LCNT_SRCSTATES]++; + return (-1); + } + } + return (0); +} + +int +pf_insert_state(struct pfi_kif *kif, struct pf_state *state) +{ + /* Thou MUST NOT insert multiple duplicate keys */ + state->u.s.kif = kif; + if (RB_INSERT(pf_state_tree_lan_ext, &kif->pfik_lan_ext, state)) { + if (pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: state insert failed: tree_lan_ext"); + printf(" lan: "); + pf_print_host(&state->lan.addr, state->lan.port, + state->af); + printf(" gwy: "); + pf_print_host(&state->gwy.addr, state->gwy.port, + state->af); + printf(" ext: "); + pf_print_host(&state->ext.addr, state->ext.port, + state->af); + if (state->sync_flags & PFSTATE_FROMSYNC) + printf(" (from sync)"); + printf("\n"); + } + return (-1); + } + + if (RB_INSERT(pf_state_tree_ext_gwy, &kif->pfik_ext_gwy, state)) { + if (pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: state insert failed: tree_ext_gwy"); + printf(" lan: "); + pf_print_host(&state->lan.addr, state->lan.port, + state->af); + printf(" gwy: "); + 
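+			/*
+			 * Editorial note (not upstream text): pf_insert_state()
+			 * is all-or-nothing - a state must end up in all three
+			 * trees (lan_ext, ext_gwy, id) or in none, so when a
+			 * later RB_INSERT fails, the earlier insertions are
+			 * RB_REMOVEd before returning -1, as in this branch.
+			 */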
pf_print_host(&state->gwy.addr, state->gwy.port, + state->af); + printf(" ext: "); + pf_print_host(&state->ext.addr, state->ext.port, + state->af); + if (state->sync_flags & PFSTATE_FROMSYNC) + printf(" (from sync)"); + printf("\n"); + } + RB_REMOVE(pf_state_tree_lan_ext, &kif->pfik_lan_ext, state); + return (-1); + } + + if (state->id == 0 && state->creatorid == 0) { + state->id = htobe64(pf_status.stateid++); + state->creatorid = pf_status.hostid; + } + if (RB_INSERT(pf_state_tree_id, &tree_id, state) != NULL) { + if (pf_status.debug >= PF_DEBUG_MISC) { +#ifdef __FreeBSD__ + printf("pf: state insert failed: " + "id: %016llx creatorid: %08x", + (long long)be64toh(state->id), + ntohl(state->creatorid)); +#else + printf("pf: state insert failed: " + "id: %016llx creatorid: %08x", + betoh64(state->id), ntohl(state->creatorid)); +#endif + if (state->sync_flags & PFSTATE_FROMSYNC) + printf(" (from sync)"); + printf("\n"); + } + RB_REMOVE(pf_state_tree_lan_ext, &kif->pfik_lan_ext, state); + RB_REMOVE(pf_state_tree_ext_gwy, &kif->pfik_ext_gwy, state); + return (-1); + } + TAILQ_INSERT_TAIL(&state_list, state, u.s.entry_list); + pf_status.fcounters[FCNT_STATE_INSERT]++; + pf_status.states++; + pfi_kif_ref(kif, PFI_KIF_REF_STATE); +#if NPFSYNC + pfsync_insert_state(state); +#endif + return (0); +} + +void +pf_purge_thread(void *v) +{ + int nloops = 0, s; +#ifdef __FreeBSD__ + int locked; +#endif + + for (;;) { + tsleep(pf_purge_thread, PWAIT, "pftm", 1 * hz); + +#ifdef __FreeBSD__ + sx_slock(&pf_consistency_lock); + PF_LOCK(); + locked = 0; + + if (pf_end_threads) { + PF_UNLOCK(); + sx_sunlock(&pf_consistency_lock); + sx_xlock(&pf_consistency_lock); + PF_LOCK(); + pf_purge_expired_states(pf_status.states, 1); + pf_purge_expired_fragments(); + pf_purge_expired_src_nodes(1); + pf_end_threads++; + + sx_xunlock(&pf_consistency_lock); + PF_UNLOCK(); + wakeup(pf_purge_thread); + kproc_exit(0); + } +#endif + s = splsoftnet(); + + /* process a fraction of the state table every second */ +#ifdef __FreeBSD__ + if(!pf_purge_expired_states(1 + (pf_status.states + / pf_default_rule.timeout[PFTM_INTERVAL]), 0)) { + PF_UNLOCK(); + sx_sunlock(&pf_consistency_lock); + sx_xlock(&pf_consistency_lock); + PF_LOCK(); + locked = 1; + + pf_purge_expired_states(1 + (pf_status.states + / pf_default_rule.timeout[PFTM_INTERVAL]), 1); + } +#else + pf_purge_expired_states(1 + (pf_status.states + / pf_default_rule.timeout[PFTM_INTERVAL])); +#endif + + /* purge other expired types every PFTM_INTERVAL seconds */ + if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) { + pf_purge_expired_fragments(); + if (!pf_purge_expired_src_nodes(locked)) { + PF_UNLOCK(); + sx_sunlock(&pf_consistency_lock); + sx_xlock(&pf_consistency_lock); + PF_LOCK(); + locked = 1; + pf_purge_expired_src_nodes(1); + } + nloops = 0; + } + + splx(s); +#ifdef __FreeBSD__ + PF_UNLOCK(); + if (locked) + sx_xunlock(&pf_consistency_lock); + else + sx_sunlock(&pf_consistency_lock); +#endif + } +} + +u_int32_t +pf_state_expires(const struct pf_state *state) +{ + u_int32_t timeout; + u_int32_t start; + u_int32_t end; + u_int32_t states; + + /* handle all PFTM_* > PFTM_MAX here */ + if (state->timeout == PFTM_PURGE) + return (time_second); + if (state->timeout == PFTM_UNTIL_PACKET) + return (0); +#ifdef __FreeBSD__ + KASSERT(state->timeout != PFTM_UNLINKED, + ("pf_state_expires: timeout == PFTM_UNLINKED")); + KASSERT((state->timeout < PFTM_MAX), + ("pf_state_expires: timeout > PFTM_MAX")); +#else + KASSERT(state->timeout != PFTM_UNLINKED); + KASSERT(state->timeout < 
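+	/*
+	 * Editorial note (not upstream text): the adaptive-timeout logic
+	 * below scales the configured timeout linearly as the state table
+	 * fills.  Worked example: with adaptive.start 6000, adaptive.end
+	 * 12000 and 9000 current states, a timeout of 24 hours is scaled
+	 * by (12000 - 9000) / (12000 - 6000), i.e. halved; at or beyond
+	 * adaptive.end the state expires immediately.
+	 */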
PFTM_MAX); +#endif + timeout = state->rule.ptr->timeout[state->timeout]; + if (!timeout) + timeout = pf_default_rule.timeout[state->timeout]; + start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START]; + if (start) { + end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END]; + states = state->rule.ptr->states; + } else { + start = pf_default_rule.timeout[PFTM_ADAPTIVE_START]; + end = pf_default_rule.timeout[PFTM_ADAPTIVE_END]; + states = pf_status.states; + } + if (end && states > start && start < end) { + if (states < end) + return (state->expire + timeout * (end - states) / + (end - start)); + else + return (time_second); + } + return (state->expire + timeout); +} + +#ifdef __FreeBSD__ +int +pf_purge_expired_src_nodes(int waslocked) +#else +void +pf_purge_expired_src_nodes(int waslocked) +#endif +{ + struct pf_src_node *cur, *next; + int locked = waslocked; + + for (cur = RB_MIN(pf_src_tree, &tree_src_tracking); cur; cur = next) { + next = RB_NEXT(pf_src_tree, &tree_src_tracking, cur); + + if (cur->states <= 0 && cur->expire <= time_second) { + if (! locked) { +#ifdef __FreeBSD__ + if (!sx_try_upgrade(&pf_consistency_lock)) + return (0); +#else + rw_enter_write(&pf_consistency_lock); +#endif + next = RB_NEXT(pf_src_tree, + &tree_src_tracking, cur); + locked = 1; + } + if (cur->rule.ptr != NULL) { + cur->rule.ptr->src_nodes--; + if (cur->rule.ptr->states <= 0 && + cur->rule.ptr->max_src_nodes <= 0) + pf_rm_rule(NULL, cur->rule.ptr); + } + RB_REMOVE(pf_src_tree, &tree_src_tracking, cur); + pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; + pf_status.src_nodes--; + pool_put(&pf_src_tree_pl, cur); + } + } + + if (locked && !waslocked) +#ifdef __FreeBSD__ + sx_downgrade(&pf_consistency_lock); +#else + rw_exit_write(&pf_consistency_lock); +#endif + +#ifdef __FreeBSD__ + return (1); +#endif +} + +void +pf_src_tree_remove_state(struct pf_state *s) +{ + u_int32_t timeout; + + if (s->src_node != NULL) { + if (s->proto == IPPROTO_TCP) { + if (s->src.tcp_est) + --s->src_node->conn; + } + if (--s->src_node->states <= 0) { + timeout = s->rule.ptr->timeout[PFTM_SRC_NODE]; + if (!timeout) + timeout = + pf_default_rule.timeout[PFTM_SRC_NODE]; + s->src_node->expire = time_second + timeout; + } + } + if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) { + if (--s->nat_src_node->states <= 0) { + timeout = s->rule.ptr->timeout[PFTM_SRC_NODE]; + if (!timeout) + timeout = + pf_default_rule.timeout[PFTM_SRC_NODE]; + s->nat_src_node->expire = time_second + timeout; + } + } + s->src_node = s->nat_src_node = NULL; +} + +/* callers should be at splsoftnet */ +void +pf_unlink_state(struct pf_state *cur) +{ +#ifdef __FreeBSD__ + if (cur->local_flags & PFSTATE_EXPIRING) + return; + cur->local_flags |= PFSTATE_EXPIRING; +#endif + if (cur->src.state == PF_TCPS_PROXY_DST) { +#ifdef __FreeBSD__ + pf_send_tcp(NULL, cur->rule.ptr, cur->af, +#else + pf_send_tcp(cur->rule.ptr, cur->af, +#endif + &cur->ext.addr, &cur->lan.addr, + cur->ext.port, cur->lan.port, + cur->src.seqhi, cur->src.seqlo + 1, + TH_RST|TH_ACK, 0, 0, 0, 1, cur->tag, NULL, NULL); + } + RB_REMOVE(pf_state_tree_ext_gwy, + &cur->u.s.kif->pfik_ext_gwy, cur); + RB_REMOVE(pf_state_tree_lan_ext, + &cur->u.s.kif->pfik_lan_ext, cur); + RB_REMOVE(pf_state_tree_id, &tree_id, cur); +#if NPFSYNC + if (cur->creatorid == pf_status.hostid) + pfsync_delete_state(cur); +#endif + cur->timeout = PFTM_UNLINKED; + pf_src_tree_remove_state(cur); +} + +/* callers should be at splsoftnet and hold the + * write_lock on pf_consistency_lock */ +void +pf_free_state(struct pf_state *cur) 
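+/*
+ * Editorial note (not upstream text): teardown is two-phase.
+ * pf_unlink_state() above detaches a state from the lookup trees at
+ * splsoftnet and marks it PFTM_UNLINKED; pf_free_state() below, run
+ * with the write lock on pf_consistency_lock, drops the rule
+ * accounting and returns the state to its pool.  The KASSERT below
+ * enforces that states are only freed after being unlinked.
+ */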
+{ +#if NPFSYNC + if (pfsyncif != NULL && + (pfsyncif->sc_bulk_send_next == cur || + pfsyncif->sc_bulk_terminator == cur)) + return; +#endif +#ifdef __FreeBSD__ + KASSERT(cur->timeout == PFTM_UNLINKED, + ("pf_free_state: cur->timeout != PFTM_UNLINKED")); +#else + KASSERT(cur->timeout == PFTM_UNLINKED); +#endif + if (--cur->rule.ptr->states <= 0 && + cur->rule.ptr->src_nodes <= 0) + pf_rm_rule(NULL, cur->rule.ptr); + if (cur->nat_rule.ptr != NULL) + if (--cur->nat_rule.ptr->states <= 0 && + cur->nat_rule.ptr->src_nodes <= 0) + pf_rm_rule(NULL, cur->nat_rule.ptr); + if (cur->anchor.ptr != NULL) + if (--cur->anchor.ptr->states <= 0) + pf_rm_rule(NULL, cur->anchor.ptr); + pf_normalize_tcp_cleanup(cur); + pfi_kif_unref(cur->u.s.kif, PFI_KIF_REF_STATE); + TAILQ_REMOVE(&state_list, cur, u.s.entry_list); + if (cur->tag) + pf_tag_unref(cur->tag); + pool_put(&pf_state_pl, cur); + pf_status.fcounters[FCNT_STATE_REMOVALS]++; + pf_status.states--; +} + +#ifdef __FreeBSD__ +int +pf_purge_expired_states(u_int32_t maxcheck, int waslocked) +#else +void +pf_purge_expired_states(u_int32_t maxcheck) +#endif +{ + static struct pf_state *cur = NULL; + struct pf_state *next; +#ifdef __FreeBSD__ + int locked = waslocked; +#else + int locked = 0; +#endif + + while (maxcheck--) { + /* wrap to start of list when we hit the end */ + if (cur == NULL) { + cur = TAILQ_FIRST(&state_list); + if (cur == NULL) + break; /* list empty */ + } + + /* get next state, as cur may get deleted */ + next = TAILQ_NEXT(cur, u.s.entry_list); + + if (cur->timeout == PFTM_UNLINKED) { + /* free unlinked state */ + if (! locked) { +#ifdef __FreeBSD__ + if (!sx_try_upgrade(&pf_consistency_lock)) + return (0); +#else + rw_enter_write(&pf_consistency_lock); +#endif + locked = 1; + } + pf_free_state(cur); + } else if (pf_state_expires(cur) <= time_second) { + /* unlink and free expired state */ + pf_unlink_state(cur); + if (! locked) { +#ifdef __FreeBSD__ + if (!sx_try_upgrade(&pf_consistency_lock)) + return (0); +#else + rw_enter_write(&pf_consistency_lock); +#endif + locked = 1; + } + pf_free_state(cur); + } + cur = next; + } + +#ifdef __FreeBSD__ + if (!waslocked && locked) + sx_downgrade(&pf_consistency_lock); + + return (1); +#else + if (locked) + rw_exit_write(&pf_consistency_lock); +#endif +} + +int +pf_tbladdr_setup(struct pf_ruleset *rs, struct pf_addr_wrap *aw) +{ + if (aw->type != PF_ADDR_TABLE) + return (0); + if ((aw->p.tbl = pfr_attach_table(rs, aw->v.tblname)) == NULL) + return (1); + return (0); +} + +void +pf_tbladdr_remove(struct pf_addr_wrap *aw) +{ + if (aw->type != PF_ADDR_TABLE || aw->p.tbl == NULL) + return; + pfr_detach_table(aw->p.tbl); + aw->p.tbl = NULL; +} + +void +pf_tbladdr_copyout(struct pf_addr_wrap *aw) +{ + struct pfr_ktable *kt = aw->p.tbl; + + if (aw->type != PF_ADDR_TABLE || kt == NULL) + return; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + aw->p.tbl = NULL; + aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ? 
+ kt->pfrkt_cnt : -1; +} + +void +pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: { + u_int32_t a = ntohl(addr->addr32[0]); + printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255, + (a>>8)&255, a&255); + if (p) { + p = ntohs(p); + printf(":%u", p); + } + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + u_int16_t b; + u_int8_t i, curstart = 255, curend = 0, + maxstart = 0, maxend = 0; + for (i = 0; i < 8; i++) { + if (!addr->addr16[i]) { + if (curstart == 255) + curstart = i; + else + curend = i; + } else { + if (curstart) { + if ((curend - curstart) > + (maxend - maxstart)) { + maxstart = curstart; + maxend = curend; + curstart = 255; + } + } + } + } + for (i = 0; i < 8; i++) { + if (i >= maxstart && i <= maxend) { + if (maxend != 7) { + if (i == maxstart) + printf(":"); + } else { + if (i == maxend) + printf(":"); + } + } else { + b = ntohs(addr->addr16[i]); + printf("%x", b); + if (i < 7) + printf(":"); + } + } + if (p) { + p = ntohs(p); + printf("[%u]", p); + } + break; + } +#endif /* INET6 */ + } +} + +void +pf_print_state(struct pf_state *s) +{ + switch (s->proto) { + case IPPROTO_TCP: + printf("TCP "); + break; + case IPPROTO_UDP: + printf("UDP "); + break; + case IPPROTO_ICMP: + printf("ICMP "); + break; + case IPPROTO_ICMPV6: + printf("ICMPV6 "); + break; + default: + printf("%u ", s->proto); + break; + } + pf_print_host(&s->lan.addr, s->lan.port, s->af); + printf(" "); + pf_print_host(&s->gwy.addr, s->gwy.port, s->af); + printf(" "); + pf_print_host(&s->ext.addr, s->ext.port, s->af); + printf(" [lo=%u high=%u win=%u modulator=%u", s->src.seqlo, + s->src.seqhi, s->src.max_win, s->src.seqdiff); + if (s->src.wscale && s->dst.wscale) + printf(" wscale=%u", s->src.wscale & PF_WSCALE_MASK); + printf("]"); + printf(" [lo=%u high=%u win=%u modulator=%u", s->dst.seqlo, + s->dst.seqhi, s->dst.max_win, s->dst.seqdiff); + if (s->src.wscale && s->dst.wscale) + printf(" wscale=%u", s->dst.wscale & PF_WSCALE_MASK); + printf("]"); + printf(" %u:%u", s->src.state, s->dst.state); +} + +void +pf_print_flags(u_int8_t f) +{ + if (f) + printf(" "); + if (f & TH_FIN) + printf("F"); + if (f & TH_SYN) + printf("S"); + if (f & TH_RST) + printf("R"); + if (f & TH_PUSH) + printf("P"); + if (f & TH_ACK) + printf("A"); + if (f & TH_URG) + printf("U"); + if (f & TH_ECE) + printf("E"); + if (f & TH_CWR) + printf("W"); +} + +#define PF_SET_SKIP_STEPS(i) \ + do { \ + while (head[i] != cur) { \ + head[i]->skip[i].ptr = cur; \ + head[i] = TAILQ_NEXT(head[i], entries); \ + } \ + } while (0) + +void +pf_calc_skip_steps(struct pf_rulequeue *rules) +{ + struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT]; + int i; + + cur = TAILQ_FIRST(rules); + prev = cur; + for (i = 0; i < PF_SKIP_COUNT; ++i) + head[i] = cur; + while (cur != NULL) { + + if (cur->kif != prev->kif || cur->ifnot != prev->ifnot) + PF_SET_SKIP_STEPS(PF_SKIP_IFP); + if (cur->direction != prev->direction) + PF_SET_SKIP_STEPS(PF_SKIP_DIR); + if (cur->af != prev->af) + PF_SET_SKIP_STEPS(PF_SKIP_AF); + if (cur->proto != prev->proto) + PF_SET_SKIP_STEPS(PF_SKIP_PROTO); + if (cur->src.neg != prev->src.neg || + pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr)) + PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR); + if (cur->src.port[0] != prev->src.port[0] || + cur->src.port[1] != prev->src.port[1] || + cur->src.port_op != prev->src.port_op) + PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT); + if (cur->dst.neg != prev->dst.neg || + pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr)) + 
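+		/*
+		 * Editorial note (not upstream text): skip steps are pf's
+		 * shortcut for linear rule evaluation - for each PF_SKIP_*
+		 * criterion every rule points at the next rule whose value
+		 * actually differs, so the matching loops (see
+		 * pf_match_translation() below) can hop over whole runs of
+		 * rules that would all fail for the same reason, e.g. a
+		 * non-matching interface or address family.
+		 */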
PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR); + if (cur->dst.port[0] != prev->dst.port[0] || + cur->dst.port[1] != prev->dst.port[1] || + cur->dst.port_op != prev->dst.port_op) + PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT); + + prev = cur; + cur = TAILQ_NEXT(cur, entries); + } + for (i = 0; i < PF_SKIP_COUNT; ++i) + PF_SET_SKIP_STEPS(i); +} + +int +pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2) +{ + if (aw1->type != aw2->type) + return (1); + switch (aw1->type) { + case PF_ADDR_ADDRMASK: + if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0)) + return (1); + if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0)) + return (1); + return (0); + case PF_ADDR_DYNIFTL: + return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt); + case PF_ADDR_NOROUTE: + case PF_ADDR_URPFFAILED: + return (0); + case PF_ADDR_TABLE: + return (aw1->p.tbl != aw2->p.tbl); + case PF_ADDR_RTLABEL: + return (aw1->v.rtlabel != aw2->v.rtlabel); + default: + printf("invalid address type: %d\n", aw1->type); + return (1); + } +} + +u_int16_t +pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp) +{ + u_int32_t l; + + if (udp && !cksum) + return (0x0000); + l = cksum + old - new; + l = (l >> 16) + (l & 65535); + l = l & 65535; + if (udp && !l) + return (0xFFFF); + return (l); +} + +void +pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc, + struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af) +{ + struct pf_addr ao; + u_int16_t po = *p; + + PF_ACPY(&ao, a, af); + PF_ACPY(a, an, af); + + *p = pn; + + switch (af) { +#ifdef INET + case AF_INET: + *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, + ao.addr16[0], an->addr16[0], 0), + ao.addr16[1], an->addr16[1], 0); + *p = pn; + *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc, + ao.addr16[0], an->addr16[0], u), + ao.addr16[1], an->addr16[1], u), + po, pn, u); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc, + ao.addr16[0], an->addr16[0], u), + ao.addr16[1], an->addr16[1], u), + ao.addr16[2], an->addr16[2], u), + ao.addr16[3], an->addr16[3], u), + ao.addr16[4], an->addr16[4], u), + ao.addr16[5], an->addr16[5], u), + ao.addr16[6], an->addr16[6], u), + ao.addr16[7], an->addr16[7], u), + po, pn, u); + break; +#endif /* INET6 */ + } +} + + +/* Changes a u_int32_t. 
Uses a void * so there are no align restrictions */ +void +pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u) +{ + u_int32_t ao; + + memcpy(&ao, a, sizeof(ao)); + memcpy(a, &an, sizeof(u_int32_t)); + *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u), + ao % 65536, an % 65536, u); +} + +#ifdef INET6 +void +pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u) +{ + struct pf_addr ao; + + PF_ACPY(&ao, a, AF_INET6); + PF_ACPY(a, an, AF_INET6); + + *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(*c, + ao.addr16[0], an->addr16[0], u), + ao.addr16[1], an->addr16[1], u), + ao.addr16[2], an->addr16[2], u), + ao.addr16[3], an->addr16[3], u), + ao.addr16[4], an->addr16[4], u), + ao.addr16[5], an->addr16[5], u), + ao.addr16[6], an->addr16[6], u), + ao.addr16[7], an->addr16[7], u); +} +#endif /* INET6 */ + +void +pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa, + struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c, + u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af) +{ + struct pf_addr oia, ooa; + + PF_ACPY(&oia, ia, af); + PF_ACPY(&ooa, oa, af); + + /* Change inner protocol port, fix inner protocol checksum. */ + if (ip != NULL) { + u_int16_t oip = *ip; + u_int32_t opc = 0; /* make the compiler happy */ + + if (pc != NULL) + opc = *pc; + *ip = np; + if (pc != NULL) + *pc = pf_cksum_fixup(*pc, oip, *ip, u); + *ic = pf_cksum_fixup(*ic, oip, *ip, 0); + if (pc != NULL) + *ic = pf_cksum_fixup(*ic, opc, *pc, 0); + } + /* Change inner ip address, fix inner ip and icmp checksums. */ + PF_ACPY(ia, na, af); + switch (af) { +#ifdef INET + case AF_INET: { + u_int32_t oh2c = *h2c; + + *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c, + oia.addr16[0], ia->addr16[0], 0), + oia.addr16[1], ia->addr16[1], 0); + *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, + oia.addr16[0], ia->addr16[0], 0), + oia.addr16[1], ia->addr16[1], 0); + *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0); + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(*ic, + oia.addr16[0], ia->addr16[0], u), + oia.addr16[1], ia->addr16[1], u), + oia.addr16[2], ia->addr16[2], u), + oia.addr16[3], ia->addr16[3], u), + oia.addr16[4], ia->addr16[4], u), + oia.addr16[5], ia->addr16[5], u), + oia.addr16[6], ia->addr16[6], u), + oia.addr16[7], ia->addr16[7], u); + break; +#endif /* INET6 */ + } + /* Change outer ip address, fix outer ip or icmpv6 checksum. 
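+	 * (Editorial aside, not upstream text: pf_cksum_fixup() is the
+	 * classic incremental one's-complement checksum update, cf.
+	 * RFC 1624, HC' = HC + m - m': add the old 16-bit word, subtract
+	 * the new one and fold the carry back in,
+	 *	l = cksum + old - new; l = (l >> 16) + (l & 65535);
+	 * with a UDP special case that keeps 0 meaning "no checksum" and
+	 * maps a computed 0 to 0xffff.)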
*/ + PF_ACPY(oa, na, af); + switch (af) { +#ifdef INET + case AF_INET: + *hc = pf_cksum_fixup(pf_cksum_fixup(*hc, + ooa.addr16[0], oa->addr16[0], 0), + ooa.addr16[1], oa->addr16[1], 0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(*ic, + ooa.addr16[0], oa->addr16[0], u), + ooa.addr16[1], oa->addr16[1], u), + ooa.addr16[2], oa->addr16[2], u), + ooa.addr16[3], oa->addr16[3], u), + ooa.addr16[4], oa->addr16[4], u), + ooa.addr16[5], oa->addr16[5], u), + ooa.addr16[6], oa->addr16[6], u), + ooa.addr16[7], oa->addr16[7], u); + break; +#endif /* INET6 */ + } +} + + +/* + * Need to modulate the sequence numbers in the TCP SACK option + * (credits to Krzysztof Pfaff for report and patch) + */ +int +pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd, + struct tcphdr *th, struct pf_state_peer *dst) +{ + int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen; +#ifdef __FreeBSD__ + u_int8_t opts[TCP_MAXOLEN], *opt = opts; +#else + u_int8_t opts[MAX_TCPOPTLEN], *opt = opts; +#endif + int copyback = 0, i, olen; + struct sackblk sack; + +#define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2) + if (hlen < TCPOLEN_SACKLEN || + !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af)) + return 0; + + while (hlen >= TCPOLEN_SACKLEN) { + olen = opt[1]; + switch (*opt) { + case TCPOPT_EOL: /* FALLTHROUGH */ + case TCPOPT_NOP: + opt++; + hlen--; + break; + case TCPOPT_SACK: + if (olen > hlen) + olen = hlen; + if (olen >= TCPOLEN_SACKLEN) { + for (i = 2; i + TCPOLEN_SACK <= olen; + i += TCPOLEN_SACK) { + memcpy(&sack, &opt[i], sizeof(sack)); + pf_change_a(&sack.start, &th->th_sum, + htonl(ntohl(sack.start) - + dst->seqdiff), 0); + pf_change_a(&sack.end, &th->th_sum, + htonl(ntohl(sack.end) - + dst->seqdiff), 0); + memcpy(&opt[i], &sack, sizeof(sack)); + } + copyback = 1; + } + /* FALLTHROUGH */ + default: + if (olen < 2) + olen = 2; + hlen -= olen; + opt += olen; + } + } + + if (copyback) +#ifdef __FreeBSD__ + m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts); +#else + m_copyback(m, off + sizeof(*th), thoptlen, opts); +#endif + return (copyback); +} + +void +#ifdef __FreeBSD__ +pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af, +#else +pf_send_tcp(const struct pf_rule *r, sa_family_t af, +#endif + const struct pf_addr *saddr, const struct pf_addr *daddr, + u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack, + u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag, + u_int16_t rtag, struct ether_header *eh, struct ifnet *ifp) +{ + struct mbuf *m; + int len, tlen; +#ifdef INET + struct ip *h; +#endif /* INET */ +#ifdef INET6 + struct ip6_hdr *h6; +#endif /* INET6 */ + struct tcphdr *th; + char *opt; + struct pf_mtag *pf_mtag; + +#ifdef __FreeBSD__ + KASSERT( +#ifdef INET + af == AF_INET +#else + 0 +#endif + || +#ifdef INET6 + af == AF_INET6 +#else + 0 +#endif + , ("Unsupported AF %d", af)); + len = 0; + th = NULL; +#ifdef INET + h = NULL; +#endif +#ifdef INET6 + h6 = NULL; +#endif +#endif + + /* maximum segment size tcp option */ + tlen = sizeof(struct tcphdr); + if (mss) + tlen += 4; + + switch (af) { +#ifdef INET + case AF_INET: + len = sizeof(struct ip) + tlen; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + len = sizeof(struct ip6_hdr) + tlen; + break; +#endif /* INET6 */ + } + + /* create outgoing mbuf */ + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; +#ifdef 
__FreeBSD__ +#ifdef MAC + if (replyto) + mac_netinet_firewall_reply(replyto, m); + else + mac_netinet_firewall_send(m); +#else + (void)replyto; +#endif +#endif + if ((pf_mtag = pf_get_mtag(m)) == NULL) { + m_freem(m); + return; + } + if (tag) +#ifdef __FreeBSD__ + m->m_flags |= M_SKIP_FIREWALL; +#else + pf_mtag->flags |= PF_TAG_GENERATED; +#endif + + pf_mtag->tag = rtag; + + if (r != NULL && r->rtableid >= 0) +#ifdef __FreeBSD__ + { + M_SETFIB(m, r->rtableid); +#endif + pf_mtag->rtableid = r->rtableid; +#ifdef __FreeBSD__ + } +#endif +#ifdef ALTQ + if (r != NULL && r->qid) { + pf_mtag->qid = r->qid; + /* add hints for ecn */ + pf_mtag->af = af; + pf_mtag->hdr = mtod(m, struct ip *); + } +#endif /* ALTQ */ + m->m_data += max_linkhdr; + m->m_pkthdr.len = m->m_len = len; + m->m_pkthdr.rcvif = NULL; + bzero(m->m_data, len); + switch (af) { +#ifdef INET + case AF_INET: + h = mtod(m, struct ip *); + + /* IP header fields included in the TCP checksum */ + h->ip_p = IPPROTO_TCP; + h->ip_len = htons(tlen); + h->ip_src.s_addr = saddr->v4.s_addr; + h->ip_dst.s_addr = daddr->v4.s_addr; + + th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip)); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + h6 = mtod(m, struct ip6_hdr *); + + /* IP header fields included in the TCP checksum */ + h6->ip6_nxt = IPPROTO_TCP; + h6->ip6_plen = htons(tlen); + memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr)); + memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr)); + + th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr)); + break; +#endif /* INET6 */ + } + + /* TCP header */ + th->th_sport = sport; + th->th_dport = dport; + th->th_seq = htonl(seq); + th->th_ack = htonl(ack); + th->th_off = tlen >> 2; + th->th_flags = flags; + th->th_win = htons(win); + + if (mss) { + opt = (char *)(th + 1); + opt[0] = TCPOPT_MAXSEG; + opt[1] = 4; + HTONS(mss); + bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2); + } + + switch (af) { +#ifdef INET + case AF_INET: + /* TCP checksum */ + th->th_sum = in_cksum(m, len); + + /* Finish the IP header */ + h->ip_v = 4; + h->ip_hl = sizeof(*h) >> 2; + h->ip_tos = IPTOS_LOWDELAY; +#ifdef __FreeBSD__ + h->ip_off = V_path_mtu_discovery ? IP_DF : 0; + h->ip_len = len; +#else + h->ip_off = htons(ip_mtudisc ? IP_DF : 0); + h->ip_len = htons(len); +#endif + h->ip_ttl = ttl ? ttl : V_ip_defttl; + h->ip_sum = 0; + if (eh == NULL) { +#ifdef __FreeBSD__ + PF_UNLOCK(); + ip_output(m, (void *)NULL, (void *)NULL, 0, + (void *)NULL, (void *)NULL); + PF_LOCK(); +#else /* ! __FreeBSD__ */ + ip_output(m, (void *)NULL, (void *)NULL, 0, + (void *)NULL, (void *)NULL); +#endif + } else { + struct route ro; + struct rtentry rt; + struct ether_header *e = (void *)ro.ro_dst.sa_data; + + if (ifp == NULL) { + m_freem(m); + return; + } + rt.rt_ifp = ifp; + ro.ro_rt = &rt; + ro.ro_dst.sa_len = sizeof(ro.ro_dst); + ro.ro_dst.sa_family = pseudo_AF_HDRCMPLT; + bcopy(eh->ether_dhost, e->ether_shost, ETHER_ADDR_LEN); + bcopy(eh->ether_shost, e->ether_dhost, ETHER_ADDR_LEN); + e->ether_type = eh->ether_type; +#ifdef __FreeBSD__ + PF_UNLOCK(); + /* XXX_IMPORT: later */ + ip_output(m, (void *)NULL, &ro, 0, + (void *)NULL, (void *)NULL); + PF_LOCK(); +#else /* ! 
__FreeBSD__ */ + ip_output(m, (void *)NULL, &ro, IP_ROUTETOETHER, + (void *)NULL, (void *)NULL); +#endif + } + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + /* TCP checksum */ + th->th_sum = in6_cksum(m, IPPROTO_TCP, + sizeof(struct ip6_hdr), tlen); + + h6->ip6_vfc |= IPV6_VERSION; + h6->ip6_hlim = IPV6_DEFHLIM; + +#ifdef __FreeBSD__ + PF_UNLOCK(); + ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); + PF_LOCK(); +#else + ip6_output(m, NULL, NULL, 0, NULL, NULL); +#endif + break; +#endif /* INET6 */ + } +} + +void +pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af, + struct pf_rule *r) +{ + struct pf_mtag *pf_mtag; + struct mbuf *m0; +#ifdef __FreeBSD__ + struct ip *ip; +#endif + +#ifdef __FreeBSD__ + m0 = m_copypacket(m, M_DONTWAIT); + if (m0 == NULL) + return; +#else + m0 = m_copy(m, 0, M_COPYALL); +#endif + if ((pf_mtag = pf_get_mtag(m0)) == NULL) + return; +#ifdef __FreeBSD__ + /* XXX: revisit */ + m0->m_flags |= M_SKIP_FIREWALL; +#else + pf_mtag->flags |= PF_TAG_GENERATED; +#endif + + if (r->rtableid >= 0) +#ifdef __FreeBSD__ + { + M_SETFIB(m0, r->rtableid); +#endif + pf_mtag->rtableid = r->rtableid; +#ifdef __FreeBSD__ + } +#endif + +#ifdef ALTQ + if (r->qid) { + pf_mtag->qid = r->qid; + /* add hints for ecn */ + pf_mtag->af = af; + pf_mtag->hdr = mtod(m0, struct ip *); + } +#endif /* ALTQ */ + + switch (af) { +#ifdef INET + case AF_INET: +#ifdef __FreeBSD__ + /* icmp_error() expects host byte ordering */ + ip = mtod(m0, struct ip *); + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); + PF_UNLOCK(); + icmp_error(m0, type, code, 0, 0); + PF_LOCK(); +#else + icmp_error(m0, type, code, 0, 0); +#endif + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + icmp6_error(m0, type, code, 0); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + break; +#endif /* INET6 */ + } +} + +/* + * Return 1 if the addresses a and b match (with mask m), otherwise return 0. + * If n is 0, they match if they are equal. If n is != 0, they match if they + * are different. 
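+ * Editorial worked example (not upstream text): with a = 10.0.0.5,
+ * m = 255.255.255.0 and b = 10.0.0.200, (a & m) == (b & m) holds, so
+ * n == 0 yields 1 (match) while n != 0, the negated form, yields 0.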
+ */ +int +pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m, + struct pf_addr *b, sa_family_t af) +{ + int match = 0; + + switch (af) { +#ifdef INET + case AF_INET: + if ((a->addr32[0] & m->addr32[0]) == + (b->addr32[0] & m->addr32[0])) + match++; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (((a->addr32[0] & m->addr32[0]) == + (b->addr32[0] & m->addr32[0])) && + ((a->addr32[1] & m->addr32[1]) == + (b->addr32[1] & m->addr32[1])) && + ((a->addr32[2] & m->addr32[2]) == + (b->addr32[2] & m->addr32[2])) && + ((a->addr32[3] & m->addr32[3]) == + (b->addr32[3] & m->addr32[3]))) + match++; + break; +#endif /* INET6 */ + } + if (match) { + if (n) + return (0); + else + return (1); + } else { + if (n) + return (1); + else + return (0); + } +} + +int +pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p) +{ + switch (op) { + case PF_OP_IRG: + return ((p > a1) && (p < a2)); + case PF_OP_XRG: + return ((p < a1) || (p > a2)); + case PF_OP_RRG: + return ((p >= a1) && (p <= a2)); + case PF_OP_EQ: + return (p == a1); + case PF_OP_NE: + return (p != a1); + case PF_OP_LT: + return (p < a1); + case PF_OP_LE: + return (p <= a1); + case PF_OP_GT: + return (p > a1); + case PF_OP_GE: + return (p >= a1); + } + return (0); /* never reached */ +} + +int +pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p) +{ + NTOHS(a1); + NTOHS(a2); + NTOHS(p); + return (pf_match(op, a1, a2, p)); +} + +int +pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u) +{ + if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE) + return (0); + return (pf_match(op, a1, a2, u)); +} + +int +pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g) +{ + if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE) + return (0); + return (pf_match(op, a1, a2, g)); +} + +#ifndef __FreeBSD__ +struct pf_mtag * +pf_find_mtag(struct mbuf *m) +{ + struct m_tag *mtag; + + if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) == NULL) + return (NULL); + + return ((struct pf_mtag *)(mtag + 1)); +} + +struct pf_mtag * +pf_get_mtag(struct mbuf *m) +{ + struct m_tag *mtag; + + if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) == NULL) { + mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag), + M_NOWAIT); + if (mtag == NULL) + return (NULL); + bzero(mtag + 1, sizeof(struct pf_mtag)); + m_tag_prepend(m, mtag); + } + + return ((struct pf_mtag *)(mtag + 1)); +} +#endif + +int +pf_match_tag(struct mbuf *m, struct pf_rule *r, struct pf_mtag *pf_mtag, + int *tag) +{ + if (*tag == -1) + *tag = pf_mtag->tag; + + return ((!r->match_tag_not && r->match_tag == *tag) || + (r->match_tag_not && r->match_tag != *tag)); +} + +int +pf_tag_packet(struct mbuf *m, struct pf_mtag *pf_mtag, int tag, int rtableid) +{ + if (tag <= 0 && rtableid < 0) + return (0); + + if (pf_mtag == NULL) + if ((pf_mtag = pf_get_mtag(m)) == NULL) + return (1); + if (tag > 0) + pf_mtag->tag = tag; + if (rtableid >= 0) +#ifdef __FreeBSD__ + { + M_SETFIB(m, rtableid); +#endif + pf_mtag->rtableid = rtableid; +#ifdef __FreeBSD__ + } +#endif + + return (0); +} + +static void +pf_step_into_anchor(int *depth, struct pf_ruleset **rs, int n, + struct pf_rule **r, struct pf_rule **a, int *match) +{ + struct pf_anchor_stackframe *f; + + (*r)->anchor->match = 0; + if (match) + *match = 0; + if (*depth >= sizeof(pf_anchor_stack) / + sizeof(pf_anchor_stack[0])) { + printf("pf_step_into_anchor: stack overflow\n"); + *r = TAILQ_NEXT(*r, entries); + return; + } else if (*depth == 0 && a != NULL) + *a = *r; + f = pf_anchor_stack + (*depth)++; + f->rs = *rs; + f->r = *r; + if 
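+	/*
+	 * Editorial note (not upstream text): anchors are nested rulesets
+	 * evaluated without recursion.  pf_anchor_stack holds one frame
+	 * per nesting level (saved ruleset, rule and, for wildcard
+	 * anchors, the current child); this function pushes a frame and
+	 * descends, and pf_step_out_of_anchor() below pops frames,
+	 * stepping through the remaining children of a wildcard anchor
+	 * via RB_NEXT before resuming at the parent rule.
+	 */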
((*r)->anchor_wildcard) { + f->parent = &(*r)->anchor->children; + if ((f->child = RB_MIN(pf_anchor_node, f->parent)) == + NULL) { + *r = NULL; + return; + } + *rs = &f->child->ruleset; + } else { + f->parent = NULL; + f->child = NULL; + *rs = &(*r)->anchor->ruleset; + } + *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); +} + +int +pf_step_out_of_anchor(int *depth, struct pf_ruleset **rs, int n, + struct pf_rule **r, struct pf_rule **a, int *match) +{ + struct pf_anchor_stackframe *f; + int quick = 0; + + do { + if (*depth <= 0) + break; + f = pf_anchor_stack + *depth - 1; + if (f->parent != NULL && f->child != NULL) { + if (f->child->match || + (match != NULL && *match)) { + f->r->anchor->match = 1; + *match = 0; + } + f->child = RB_NEXT(pf_anchor_node, f->parent, f->child); + if (f->child != NULL) { + *rs = &f->child->ruleset; + *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); + if (*r == NULL) + continue; + else + break; + } + } + (*depth)--; + if (*depth == 0 && a != NULL) + *a = NULL; + *rs = f->rs; + if (f->r->anchor->match || (match != NULL && *match)) + quick = f->r->quick; + *r = TAILQ_NEXT(f->r, entries); + } while (*r == NULL); + + return (quick); +} + +#ifdef INET6 +void +pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr, + struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | + ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); + break; +#endif /* INET */ + case AF_INET6: + naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | + ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); + naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) | + ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]); + naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) | + ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]); + naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) | + ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]); + break; + } +} + +void +pf_addr_inc(struct pf_addr *addr, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); + break; +#endif /* INET */ + case AF_INET6: + if (addr->addr32[3] == 0xffffffff) { + addr->addr32[3] = 0; + if (addr->addr32[2] == 0xffffffff) { + addr->addr32[2] = 0; + if (addr->addr32[1] == 0xffffffff) { + addr->addr32[1] = 0; + addr->addr32[0] = + htonl(ntohl(addr->addr32[0]) + 1); + } else + addr->addr32[1] = + htonl(ntohl(addr->addr32[1]) + 1); + } else + addr->addr32[2] = + htonl(ntohl(addr->addr32[2]) + 1); + } else + addr->addr32[3] = + htonl(ntohl(addr->addr32[3]) + 1); + break; + } +} +#endif /* INET6 */ + +#define mix(a,b,c) \ + do { \ + a -= b; a -= c; a ^= (c >> 13); \ + b -= c; b -= a; b ^= (a << 8); \ + c -= a; c -= b; c ^= (b >> 13); \ + a -= b; a -= c; a ^= (c >> 12); \ + b -= c; b -= a; b ^= (a << 16); \ + c -= a; c -= b; c ^= (b >> 5); \ + a -= b; a -= c; a ^= (c >> 3); \ + b -= c; b -= a; b ^= (a << 10); \ + c -= a; c -= b; c ^= (b >> 15); \ + } while (0) + +/* + * hash function based on bridge_hash in if_bridge.c + */ +void +pf_hash(struct pf_addr *inaddr, struct pf_addr *hash, + struct pf_poolhashkey *key, sa_family_t af) +{ + u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0]; + + switch (af) { +#ifdef INET + case AF_INET: + a += inaddr->addr32[0]; + b += key->key32[1]; + mix(a, b, c); + hash->addr32[0] = c + key->key32[2]; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + a += inaddr->addr32[0]; + b += 
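+		/*
+		 * Editorial note (not upstream text): mix() above is the
+		 * 96-bit mixing step of a Bob Jenkins style hash (the
+		 * comment credits bridge_hash in if_bridge.c), seeded with
+		 * the golden-ratio constant 0x9e3779b9 and keyed with the
+		 * per-pool pf_poolhashkey, so source-hash mappings are
+		 * stable for a given key but differ between pools.
+		 */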
inaddr->addr32[2]; + mix(a, b, c); + hash->addr32[0] = c; + a += inaddr->addr32[1]; + b += inaddr->addr32[3]; + c += key->key32[1]; + mix(a, b, c); + hash->addr32[1] = c; + a += inaddr->addr32[2]; + b += inaddr->addr32[1]; + c += key->key32[2]; + mix(a, b, c); + hash->addr32[2] = c; + a += inaddr->addr32[3]; + b += inaddr->addr32[0]; + c += key->key32[3]; + mix(a, b, c); + hash->addr32[3] = c; + break; +#endif /* INET6 */ + } +} + +int +pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr, + struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn) +{ + unsigned char hash[16]; + struct pf_pool *rpool = &r->rpool; + struct pf_addr *raddr = &rpool->cur->addr.v.a.addr; + struct pf_addr *rmask = &rpool->cur->addr.v.a.mask; + struct pf_pooladdr *acur = rpool->cur; + struct pf_src_node k; + + if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR && + (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) { + k.af = af; + PF_ACPY(&k.addr, saddr, af); + if (r->rule_flag & PFRULE_RULESRCTRACK || + r->rpool.opts & PF_POOL_STICKYADDR) + k.rule.ptr = r; + else + k.rule.ptr = NULL; + pf_status.scounters[SCNT_SRC_NODE_SEARCH]++; + *sn = RB_FIND(pf_src_tree, &tree_src_tracking, &k); + if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) { + PF_ACPY(naddr, &(*sn)->raddr, af); + if (pf_status.debug >= PF_DEBUG_MISC) { + printf("pf_map_addr: src tracking maps "); + pf_print_host(&k.addr, 0, af); + printf(" to "); + pf_print_host(naddr, 0, af); + printf("\n"); + } + return (0); + } + } + + if (rpool->cur->addr.type == PF_ADDR_NOROUTE) + return (1); + if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + switch (af) { +#ifdef INET + case AF_INET: + if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 && + (rpool->opts & PF_POOL_TYPEMASK) != + PF_POOL_ROUNDROBIN) + return (1); + raddr = &rpool->cur->addr.p.dyn->pfid_addr4; + rmask = &rpool->cur->addr.p.dyn->pfid_mask4; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 && + (rpool->opts & PF_POOL_TYPEMASK) != + PF_POOL_ROUNDROBIN) + return (1); + raddr = &rpool->cur->addr.p.dyn->pfid_addr6; + rmask = &rpool->cur->addr.p.dyn->pfid_mask6; + break; +#endif /* INET6 */ + } + } else if (rpool->cur->addr.type == PF_ADDR_TABLE) { + if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN) + return (1); /* unsupported */ + } else { + raddr = &rpool->cur->addr.v.a.addr; + rmask = &rpool->cur->addr.v.a.mask; + } + + switch (rpool->opts & PF_POOL_TYPEMASK) { + case PF_POOL_NONE: + PF_ACPY(naddr, raddr, af); + break; + case PF_POOL_BITMASK: + PF_POOLMASK(naddr, raddr, rmask, saddr, af); + break; + case PF_POOL_RANDOM: + if (init_addr != NULL && PF_AZERO(init_addr, af)) { + switch (af) { +#ifdef INET + case AF_INET: + rpool->counter.addr32[0] = htonl(arc4random()); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (rmask->addr32[3] != 0xffffffff) + rpool->counter.addr32[3] = + htonl(arc4random()); + else + break; + if (rmask->addr32[2] != 0xffffffff) + rpool->counter.addr32[2] = + htonl(arc4random()); + else + break; + if (rmask->addr32[1] != 0xffffffff) + rpool->counter.addr32[1] = + htonl(arc4random()); + else + break; + if (rmask->addr32[0] != 0xffffffff) + rpool->counter.addr32[0] = + htonl(arc4random()); + break; +#endif /* INET6 */ + } + PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); + PF_ACPY(init_addr, naddr, af); + + } else { + PF_AINC(&rpool->counter, af); + PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); + } + break; + case PF_POOL_SRCHASH: + pf_hash(saddr, (struct 
pf_addr *)&hash, &rpool->key, af); + PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af); + break; + case PF_POOL_ROUNDROBIN: + if (rpool->cur->addr.type == PF_ADDR_TABLE) { + if (!pfr_pool_get(rpool->cur->addr.p.tbl, + &rpool->tblidx, &rpool->counter, + &raddr, &rmask, af)) + goto get_addr; + } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, + &rpool->tblidx, &rpool->counter, + &raddr, &rmask, af)) + goto get_addr; + } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af)) + goto get_addr; + + try_next: + if ((rpool->cur = TAILQ_NEXT(rpool->cur, entries)) == NULL) + rpool->cur = TAILQ_FIRST(&rpool->list); + if (rpool->cur->addr.type == PF_ADDR_TABLE) { + rpool->tblidx = -1; + if (pfr_pool_get(rpool->cur->addr.p.tbl, + &rpool->tblidx, &rpool->counter, + &raddr, &rmask, af)) { + /* table contains no address of type 'af' */ + if (rpool->cur != acur) + goto try_next; + return (1); + } + } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + rpool->tblidx = -1; + if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, + &rpool->tblidx, &rpool->counter, + &raddr, &rmask, af)) { + /* table contains no address of type 'af' */ + if (rpool->cur != acur) + goto try_next; + return (1); + } + } else { + raddr = &rpool->cur->addr.v.a.addr; + rmask = &rpool->cur->addr.v.a.mask; + PF_ACPY(&rpool->counter, raddr, af); + } + + get_addr: + PF_ACPY(naddr, &rpool->counter, af); + if (init_addr != NULL && PF_AZERO(init_addr, af)) + PF_ACPY(init_addr, naddr, af); + PF_AINC(&rpool->counter, af); + break; + } + if (*sn != NULL) + PF_ACPY(&(*sn)->raddr, naddr, af); + + if (pf_status.debug >= PF_DEBUG_MISC && + (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) { + printf("pf_map_addr: selected address "); + pf_print_host(naddr, 0, af); + printf("\n"); + } + + return (0); +} + +int +pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, + struct pf_addr *saddr, struct pf_addr *daddr, u_int16_t dport, + struct pf_addr *naddr, u_int16_t *nport, u_int16_t low, u_int16_t high, + struct pf_src_node **sn) +{ + struct pf_state_cmp key; + struct pf_addr init_addr; + u_int16_t cut; + + bzero(&init_addr, sizeof(init_addr)); + if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn)) + return (1); + + if (proto == IPPROTO_ICMP) { + low = 1; + high = 65535; + } + + do { + key.af = af; + key.proto = proto; + PF_ACPY(&key.ext.addr, daddr, key.af); + PF_ACPY(&key.gwy.addr, naddr, key.af); + key.ext.port = dport; + + /* + * port search; start random, step; + * similar 2 portloop in in_pcbbind + */ + if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP || + proto == IPPROTO_ICMP)) { + key.gwy.port = dport; + if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == NULL) + return (0); + } else if (low == 0 && high == 0) { + key.gwy.port = *nport; + if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == NULL) + return (0); + } else if (low == high) { + key.gwy.port = htons(low); + if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == NULL) { + *nport = htons(low); + return (0); + } + } else { + u_int16_t tmp; + + if (low > high) { + tmp = low; + low = high; + high = tmp; + } + /* low < high */ + cut = htonl(arc4random()) % (1 + high - low) + low; + /* low <= cut <= high */ + for (tmp = cut; tmp <= high; ++(tmp)) { + key.gwy.port = htons(tmp); + if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == + NULL) { + *nport = htons(tmp); + return (0); + } + } + for (tmp = cut - 1; tmp >= low; --(tmp)) { + key.gwy.port = htons(tmp); + if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == + NULL) { + 
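+				/*
+				 * Editorial note (not upstream text): the
+				 * source-port search starts at a random cut in
+				 * [low, high], probes upward to high, then (as
+				 * here) downward from cut - 1 to low, taking
+				 * the first port with no colliding state; if
+				 * both scans fail, pf_map_addr() supplies the
+				 * next pool address and the loop repeats until
+				 * the pool wraps around to init_addr.
+				 */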
*nport = htons(tmp); + return (0); + } + } + } + + switch (r->rpool.opts & PF_POOL_TYPEMASK) { + case PF_POOL_RANDOM: + case PF_POOL_ROUNDROBIN: + if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn)) + return (1); + break; + case PF_POOL_NONE: + case PF_POOL_SRCHASH: + case PF_POOL_BITMASK: + default: + return (1); + } + } while (! PF_AEQ(&init_addr, naddr, af) ); + + return (1); /* none available */ +} + +struct pf_rule * +pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, + int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport, + struct pf_addr *daddr, u_int16_t dport, int rs_num) +{ + struct pf_rule *r, *rm = NULL; + struct pf_ruleset *ruleset = NULL; + int tag = -1; + int rtableid = -1; + int asd = 0; + + r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr); + while (r && rm == NULL) { + struct pf_rule_addr *src = NULL, *dst = NULL; + struct pf_addr_wrap *xdst = NULL; + + if (r->action == PF_BINAT && direction == PF_IN) { + src = &r->dst; + if (r->rpool.cur != NULL) + xdst = &r->rpool.cur->addr; + } else { + src = &r->src; + dst = &r->dst; + } + + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != pd->af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&src->addr, saddr, pd->af, + src->neg, kif)) + r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR : + PF_SKIP_DST_ADDR].ptr; + else if (src->port_op && !pf_match_port(src->port_op, + src->port[0], src->port[1], sport)) + r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT : + PF_SKIP_DST_PORT].ptr; + else if (dst != NULL && + PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL)) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af, + 0, NULL)) + r = TAILQ_NEXT(r, entries); + else if (dst != NULL && dst->port_op && + !pf_match_port(dst->port_op, dst->port[0], + dst->port[1], dport)) + r = r->skip[PF_SKIP_DST_PORT].ptr; + else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) + r = TAILQ_NEXT(r, entries); + else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto != + IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m, + off, pd->hdr.tcp), r->os_fingerprint))) + r = TAILQ_NEXT(r, entries); + else { + if (r->tag) + tag = r->tag; + if (r->rtableid >= 0) + rtableid = r->rtableid; + if (r->anchor == NULL) { + rm = r; + } else + pf_step_into_anchor(&asd, &ruleset, rs_num, + &r, NULL, NULL); + } + if (r == NULL) + pf_step_out_of_anchor(&asd, &ruleset, rs_num, &r, + NULL, NULL); + } + if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) + return (NULL); + if (rm != NULL && (rm->action == PF_NONAT || + rm->action == PF_NORDR || rm->action == PF_NOBINAT)) + return (NULL); + return (rm); +} + +struct pf_rule * +pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, + struct pfi_kif *kif, struct pf_src_node **sn, + struct pf_addr *saddr, u_int16_t sport, + struct pf_addr *daddr, u_int16_t dport, + struct pf_addr *naddr, u_int16_t *nport) +{ + struct pf_rule *r = NULL; + + if (direction == PF_OUT) { + r = pf_match_translation(pd, m, off, direction, kif, saddr, + sport, daddr, dport, PF_RULESET_BINAT); + if (r == NULL) + r = pf_match_translation(pd, m, off, direction, kif, + saddr, sport, daddr, dport, PF_RULESET_NAT); + } else { + r = pf_match_translation(pd, m, off, direction, kif, saddr, + 
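+		/*
+		 * Editorial note (not upstream text): translation rulesets
+		 * are consulted in a fixed order per direction - BINAT then
+		 * NAT for outbound packets, RDR then BINAT for inbound - and
+		 * a matching "no nat"/"no rdr"/"no binat" rule makes the
+		 * switch below return NULL, i.e. no translation at all.
+		 */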
sport, daddr, dport, PF_RULESET_RDR); + if (r == NULL) + r = pf_match_translation(pd, m, off, direction, kif, + saddr, sport, daddr, dport, PF_RULESET_BINAT); + } + + if (r != NULL) { + switch (r->action) { + case PF_NONAT: + case PF_NOBINAT: + case PF_NORDR: + return (NULL); + case PF_NAT: + if (pf_get_sport(pd->af, pd->proto, r, saddr, + daddr, dport, naddr, nport, r->rpool.proxy_port[0], + r->rpool.proxy_port[1], sn)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: NAT proxy port allocation " + "(%u-%u) failed\n", + r->rpool.proxy_port[0], + r->rpool.proxy_port[1])); + return (NULL); + } + break; + case PF_BINAT: + switch (direction) { + case PF_OUT: + if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){ + switch (pd->af) { +#ifdef INET + case AF_INET: + if (r->rpool.cur->addr.p.dyn-> + pfid_acnt4 < 1) + return (NULL); + PF_POOLMASK(naddr, + &r->rpool.cur->addr.p.dyn-> + pfid_addr4, + &r->rpool.cur->addr.p.dyn-> + pfid_mask4, + saddr, AF_INET); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (r->rpool.cur->addr.p.dyn-> + pfid_acnt6 < 1) + return (NULL); + PF_POOLMASK(naddr, + &r->rpool.cur->addr.p.dyn-> + pfid_addr6, + &r->rpool.cur->addr.p.dyn-> + pfid_mask6, + saddr, AF_INET6); + break; +#endif /* INET6 */ + } + } else + PF_POOLMASK(naddr, + &r->rpool.cur->addr.v.a.addr, + &r->rpool.cur->addr.v.a.mask, + saddr, pd->af); + break; + case PF_IN: + if (r->src.addr.type == PF_ADDR_DYNIFTL) { + switch (pd->af) { +#ifdef INET + case AF_INET: + if (r->src.addr.p.dyn-> + pfid_acnt4 < 1) + return (NULL); + PF_POOLMASK(naddr, + &r->src.addr.p.dyn-> + pfid_addr4, + &r->src.addr.p.dyn-> + pfid_mask4, + daddr, AF_INET); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (r->src.addr.p.dyn-> + pfid_acnt6 < 1) + return (NULL); + PF_POOLMASK(naddr, + &r->src.addr.p.dyn-> + pfid_addr6, + &r->src.addr.p.dyn-> + pfid_mask6, + daddr, AF_INET6); + break; +#endif /* INET6 */ + } + } else + PF_POOLMASK(naddr, + &r->src.addr.v.a.addr, + &r->src.addr.v.a.mask, daddr, + pd->af); + break; + } + break; + case PF_RDR: { + if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn)) + return (NULL); + if ((r->rpool.opts & PF_POOL_TYPEMASK) == + PF_POOL_BITMASK) + PF_POOLMASK(naddr, naddr, + &r->rpool.cur->addr.v.a.mask, daddr, + pd->af); + + if (r->rpool.proxy_port[1]) { + u_int32_t tmp_nport; + + tmp_nport = ((ntohs(dport) - + ntohs(r->dst.port[0])) % + (r->rpool.proxy_port[1] - + r->rpool.proxy_port[0] + 1)) + + r->rpool.proxy_port[0]; + + /* wrap around if necessary */ + if (tmp_nport > 65535) + tmp_nport -= 65535; + *nport = htons((u_int16_t)tmp_nport); + } else if (r->rpool.proxy_port[0]) + *nport = htons(r->rpool.proxy_port[0]); + break; + } + default: + return (NULL); + } + } + + return (r); +} + +int +#ifdef __FreeBSD__ +pf_socket_lookup(int direction, struct pf_pdesc *pd, struct inpcb *inp_arg) +#else +pf_socket_lookup(int direction, struct pf_pdesc *pd) +#endif +{ + struct pf_addr *saddr, *daddr; + u_int16_t sport, dport; +#ifdef __FreeBSD__ + struct inpcbinfo *pi; +#else + struct inpcbtable *tb; +#endif + struct inpcb *inp; + + if (pd == NULL) + return (-1); + pd->lookup.uid = UID_MAX; + pd->lookup.gid = GID_MAX; + pd->lookup.pid = NO_PID; /* XXX: revisit */ +#ifdef __FreeBSD__ + if (inp_arg != NULL) { + INP_LOCK_ASSERT(inp_arg); + pd->lookup.uid = inp_arg->inp_cred->cr_uid; + pd->lookup.gid = inp_arg->inp_cred->cr_groups[0]; + return (1); + } +#endif + switch (pd->proto) { + case IPPROTO_TCP: + if (pd->hdr.tcp == NULL) + return (-1); + sport = pd->hdr.tcp->th_sport; + dport = pd->hdr.tcp->th_dport; 
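+		/*
+		 * Editorial note (not upstream text): this lookup backs the
+		 * "user" and "group" rule options.  The packet's 4-tuple is
+		 * resolved against the TCP or UDP pcb table (endpoints
+		 * swapped for outbound packets) and the owning socket's
+		 * credentials land in pd->lookup for pf_match_uid() and
+		 * pf_match_gid() above.
+		 */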
+#ifdef __FreeBSD__ + pi = &V_tcbinfo; +#else + tb = &tcbtable; +#endif + break; + case IPPROTO_UDP: + if (pd->hdr.udp == NULL) + return (-1); + sport = pd->hdr.udp->uh_sport; + dport = pd->hdr.udp->uh_dport; +#ifdef __FreeBSD__ + pi = &V_udbinfo; +#else + tb = &udbtable; +#endif + break; + default: + return (-1); + } + if (direction == PF_IN) { + saddr = pd->src; + daddr = pd->dst; + } else { + u_int16_t p; + + p = sport; + sport = dport; + dport = p; + saddr = pd->dst; + daddr = pd->src; + } + switch (pd->af) { +#ifdef INET + case AF_INET: +#ifdef __FreeBSD__ + INP_INFO_RLOCK(pi); /* XXX LOR */ + inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4, + dport, 0, NULL); + if (inp == NULL) { + inp = in_pcblookup_hash(pi, saddr->v4, sport, + daddr->v4, dport, INPLOOKUP_WILDCARD, NULL); + if(inp == NULL) { + INP_INFO_RUNLOCK(pi); + return (-1); + } + } +#else + inp = in_pcbhashlookup(tb, saddr->v4, sport, daddr->v4, dport); + if (inp == NULL) { + inp = in_pcblookup_listen(tb, daddr->v4, dport, 0); + if (inp == NULL) + return (-1); + } +#endif + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: +#ifdef __FreeBSD__ + INP_INFO_RLOCK(pi); + inp = in6_pcblookup_hash(pi, &saddr->v6, sport, + &daddr->v6, dport, 0, NULL); + if (inp == NULL) { + inp = in6_pcblookup_hash(pi, &saddr->v6, sport, + &daddr->v6, dport, INPLOOKUP_WILDCARD, NULL); + if (inp == NULL) { + INP_INFO_RUNLOCK(pi); + return (-1); + } + } +#else + inp = in6_pcbhashlookup(tb, &saddr->v6, sport, &daddr->v6, + dport); + if (inp == NULL) { + inp = in6_pcblookup_listen(tb, &daddr->v6, dport, 0); + if (inp == NULL) + return (-1); + } +#endif + break; +#endif /* INET6 */ + + default: + return (-1); + } +#ifdef __FreeBSD__ + pd->lookup.uid = inp->inp_cred->cr_uid; + pd->lookup.gid = inp->inp_cred->cr_groups[0]; + INP_INFO_RUNLOCK(pi); +#else + pd->lookup.uid = inp->inp_socket->so_euid; + pd->lookup.gid = inp->inp_socket->so_egid; + pd->lookup.pid = inp->inp_socket->so_cpid; +#endif + return (1); +} + +u_int8_t +pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) +{ + int hlen; + u_int8_t hdr[60]; + u_int8_t *opt, optlen; + u_int8_t wscale = 0; + + hlen = th_off << 2; /* hlen <= sizeof(hdr) */ + if (hlen <= sizeof(struct tcphdr)) + return (0); + if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) + return (0); + opt = hdr + sizeof(struct tcphdr); + hlen -= sizeof(struct tcphdr); + while (hlen >= 3) { + switch (*opt) { + case TCPOPT_EOL: + case TCPOPT_NOP: + ++opt; + --hlen; + break; + case TCPOPT_WINDOW: + wscale = opt[2]; + if (wscale > TCP_MAX_WINSHIFT) + wscale = TCP_MAX_WINSHIFT; + wscale |= PF_WSCALE_FLAG; + /* FALLTHROUGH */ + default: + optlen = opt[1]; + if (optlen < 2) + optlen = 2; + hlen -= optlen; + opt += optlen; + break; + } + } + return (wscale); +} + +u_int16_t +pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) +{ + int hlen; + u_int8_t hdr[60]; + u_int8_t *opt, optlen; + u_int16_t mss = V_tcp_mssdflt; + + hlen = th_off << 2; /* hlen <= sizeof(hdr) */ + if (hlen <= sizeof(struct tcphdr)) + return (0); + if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) + return (0); + opt = hdr + sizeof(struct tcphdr); + hlen -= sizeof(struct tcphdr); + while (hlen >= TCPOLEN_MAXSEG) { + switch (*opt) { + case TCPOPT_EOL: + case TCPOPT_NOP: + ++opt; + --hlen; + break; + case TCPOPT_MAXSEG: + bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2); + NTOHS(mss); + /* FALLTHROUGH */ + default: + optlen = opt[1]; + if (optlen < 2) + optlen = 2; + hlen -= optlen; + opt += optlen; + break; + } + } + return 
(mss); +} + +u_int16_t +pf_calc_mss(struct pf_addr *addr, sa_family_t af, u_int16_t offer) +{ +#ifdef INET + struct sockaddr_in *dst; + struct route ro; +#endif /* INET */ +#ifdef INET6 + struct sockaddr_in6 *dst6; + struct route_in6 ro6; +#endif /* INET6 */ + struct rtentry *rt = NULL; + int hlen = 0; /* make the compiler happy */ + u_int16_t mss = V_tcp_mssdflt; + + switch (af) { +#ifdef INET + case AF_INET: + hlen = sizeof(struct ip); + bzero(&ro, sizeof(ro)); + dst = (struct sockaddr_in *)&ro.ro_dst; + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = addr->v4; +#ifdef __FreeBSD__ +#ifdef RTF_PRCLONING + rtalloc_ign(&ro, (RTF_CLONING | RTF_PRCLONING)); +#else /* !RTF_PRCLONING */ + in_rtalloc_ign(&ro, 0, 0); +#endif +#else /* ! __FreeBSD__ */ + rtalloc_noclone(&ro, NO_CLONING); +#endif + rt = ro.ro_rt; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + hlen = sizeof(struct ip6_hdr); + bzero(&ro6, sizeof(ro6)); + dst6 = (struct sockaddr_in6 *)&ro6.ro_dst; + dst6->sin6_family = AF_INET6; + dst6->sin6_len = sizeof(*dst6); + dst6->sin6_addr = addr->v6; +#ifdef __FreeBSD__ +#ifdef RTF_PRCLONING + rtalloc_ign((struct route *)&ro6, + (RTF_CLONING | RTF_PRCLONING)); +#else /* !RTF_PRCLONING */ + rtalloc_ign((struct route *)&ro6, 0); +#endif +#else /* ! __FreeBSD__ */ + rtalloc_noclone((struct route *)&ro6, NO_CLONING); +#endif + rt = ro6.ro_rt; + break; +#endif /* INET6 */ + } + + if (rt && rt->rt_ifp) { + mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr); + mss = max(V_tcp_mssdflt, mss); + RTFREE(rt); + } + mss = min(mss, offer); + mss = max(mss, 64); /* sanity - at least max opt space */ + return (mss); +} + +void +pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr) +{ + struct pf_rule *r = s->rule.ptr; + + s->rt_kif = NULL; + if (!r->rt || r->rt == PF_FASTROUTE) + return; + switch (s->af) { +#ifdef INET + case AF_INET: + pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL, + &s->nat_src_node); + s->rt_kif = r->rpool.cur->kif; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL, + &s->nat_src_node); + s->rt_kif = r->rpool.cur->kif; + break; +#endif /* INET6 */ + } +} + +int +pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction, + struct pfi_kif *kif, struct mbuf *m, int off, void *h, +#ifdef __FreeBSD__ + struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, + struct ifqueue *ifq, struct inpcb *inp) +#else + struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, + struct ifqueue *ifq) +#endif +{ + struct pf_rule *nr = NULL; + struct pf_addr *saddr = pd->src, *daddr = pd->dst; + struct tcphdr *th = pd->hdr.tcp; + u_int16_t bport, nport = 0; + sa_family_t af = pd->af; + struct pf_rule *r, *a = NULL; + struct pf_ruleset *ruleset = NULL; + struct pf_src_node *nsn = NULL; + u_short reason; + int rewrite = 0; + int tag = -1, rtableid = -1; + u_int16_t mss = V_tcp_mssdflt; + int asd = 0; + int match = 0; + + if (pf_check_congestion(ifq)) { + REASON_SET(&reason, PFRES_CONGEST); + return (PF_DROP); + } + +#ifdef __FreeBSD__ + if (inp != NULL) + pd->lookup.done = pf_socket_lookup(direction, pd, inp); + else if (debug_pfugidhack) { + PF_UNLOCK(); + DPFPRINTF(PF_DEBUG_MISC, ("pf: unlocked lookup\n")); + pd->lookup.done = pf_socket_lookup(direction, pd, inp); + PF_LOCK(); + } +#endif + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); + + if (direction == PF_OUT) { + bport = nport = th->th_sport; + /* check outgoing packet for BINAT/NAT */ + 
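+#if 0
+/*
+ * Editor's note -- illustration only, not part of the imported pf
+ * source.  pf_get_translation() below consults the binat/nat/rdr
+ * rulesets and, on a match, hands back the replacement address and
+ * port; pf_change_ap() then rewrites the header in place and repairs
+ * the IP and TCP/UDP checksums incrementally, one 16-bit word at a
+ * time, rather than recomputing them over the whole packet.  The core
+ * arithmetic is that of pf_cksum_fixup(), shown here without that
+ * function's special case for UDP's optional zero checksum:
+ */
+static u_int16_t
+cksum_word_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new)
+{
+	u_int32_t l;
+
+	l = cksum + old - new;		/* fold out old word, fold in new */
+	l = (l >> 16) + (l & 65535);	/* carry back into the low word */
+	return (l & 65535);
+}
+#endif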
if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn, + saddr, th->th_sport, daddr, th->th_dport, + &pd->naddr, &nport)) != NULL) { + PF_ACPY(&pd->baddr, saddr, af); + pf_change_ap(saddr, &th->th_sport, pd->ip_sum, + &th->th_sum, &pd->naddr, nport, 0, af); + rewrite++; + if (nr->natpass) + r = NULL; + pd->nat_rule = nr; + } + } else { + bport = nport = th->th_dport; + /* check incoming packet for BINAT/RDR */ + if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn, + saddr, th->th_sport, daddr, th->th_dport, + &pd->naddr, &nport)) != NULL) { + PF_ACPY(&pd->baddr, daddr, af); + pf_change_ap(daddr, &th->th_dport, pd->ip_sum, + &th->th_sum, &pd->naddr, nport, 0, af); + rewrite++; + if (nr->natpass) + r = NULL; + pd->nat_rule = nr; + } + } + + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != IPPROTO_TCP) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, saddr, af, + r->src.neg, kif)) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (r->src.port_op && !pf_match_port(r->src.port_op, + r->src.port[0], r->src.port[1], th->th_sport)) + r = r->skip[PF_SKIP_SRC_PORT].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, + r->dst.neg, NULL)) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->dst.port_op && !pf_match_port(r->dst.port_op, + r->dst.port[0], r->dst.port[1], th->th_dport)) + r = r->skip[PF_SKIP_DST_PORT].ptr; + else if (r->tos && !(r->tos == pd->tos)) + r = TAILQ_NEXT(r, entries); + else if (r->rule_flag & PFRULE_FRAGMENT) + r = TAILQ_NEXT(r, entries); + else if ((r->flagset & th->th_flags) != r->flags) + r = TAILQ_NEXT(r, entries); + else if (r->uid.op && (pd->lookup.done || (pd->lookup.done = +#ifdef __FreeBSD__ + pf_socket_lookup(direction, pd, inp), 1)) && +#else + pf_socket_lookup(direction, pd), 1)) && +#endif + !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1], + pd->lookup.uid)) + r = TAILQ_NEXT(r, entries); + else if (r->gid.op && (pd->lookup.done || (pd->lookup.done = +#ifdef __FreeBSD__ + pf_socket_lookup(direction, pd, inp), 1)) && +#else + pf_socket_lookup(direction, pd), 1)) && +#endif + !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1], + pd->lookup.gid)) + r = TAILQ_NEXT(r, entries); + else if (r->prob && r->prob <= arc4random()) + r = TAILQ_NEXT(r, entries); + else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) + r = TAILQ_NEXT(r, entries); + else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match( + pf_osfp_fingerprint(pd, m, off, th), r->os_fingerprint)) + r = TAILQ_NEXT(r, entries); + else { + if (r->tag) + tag = r->tag; + if (r->rtableid >= 0) + rtableid = r->rtableid; + if (r->anchor == NULL) { + match = 1; + *rm = r; + *am = a; + *rsm = ruleset; + if ((*rm)->quick) + break; + r = TAILQ_NEXT(r, entries); + } else + pf_step_into_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match); + } + if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match)) + break; + } + r = *rm; + a = *am; + ruleset = *rsm; + + REASON_SET(&reason, PFRES_MATCH); + + if (r->log || (nr != NULL && nr->natpass && nr->log)) { + if (rewrite) +#ifdef __FreeBSD__ + m_copyback(m, off, sizeof(*th), (caddr_t)th); +#else + m_copyback(m, off, sizeof(*th), th); +#endif + PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? 
r : nr, + a, ruleset, pd); + } + + if ((r->action == PF_DROP) && + ((r->rule_flag & PFRULE_RETURNRST) || + (r->rule_flag & PFRULE_RETURNICMP) || + (r->rule_flag & PFRULE_RETURN))) { + /* undo NAT changes, if they have taken place */ + if (nr != NULL) { + if (direction == PF_OUT) { + pf_change_ap(saddr, &th->th_sport, pd->ip_sum, + &th->th_sum, &pd->baddr, bport, 0, af); + rewrite++; + } else { + pf_change_ap(daddr, &th->th_dport, pd->ip_sum, + &th->th_sum, &pd->baddr, bport, 0, af); + rewrite++; + } + } + if (((r->rule_flag & PFRULE_RETURNRST) || + (r->rule_flag & PFRULE_RETURN)) && + !(th->th_flags & TH_RST)) { + u_int32_t ack = ntohl(th->th_seq) + pd->p_len; + + if (th->th_flags & TH_SYN) + ack++; + if (th->th_flags & TH_FIN) + ack++; +#ifdef __FreeBSD__ + pf_send_tcp(m, r, af, pd->dst, +#else + pf_send_tcp(r, af, pd->dst, +#endif + pd->src, th->th_dport, th->th_sport, + ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0, + r->return_ttl, 1, 0, pd->eh, kif->pfik_ifp); + } else if ((af == AF_INET) && r->return_icmp) + pf_send_icmp(m, r->return_icmp >> 8, + r->return_icmp & 255, af, r); + else if ((af == AF_INET6) && r->return_icmp6) + pf_send_icmp(m, r->return_icmp6 >> 8, + r->return_icmp6 & 255, af, r); + } + + if (r->action == PF_DROP) + return (PF_DROP); + + if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) { + REASON_SET(&reason, PFRES_MEMORY); + return (PF_DROP); + } + + if (r->keep_state || nr != NULL || + (pd->flags & PFDESC_TCP_NORM)) { + /* create new state */ + u_int16_t len; + struct pf_state *s = NULL; + struct pf_src_node *sn = NULL; + + len = pd->tot_len - off - (th->th_off << 2); + + /* check maximums */ + if (r->max_states && (r->states >= r->max_states)) { + pf_status.lcounters[LCNT_STATES]++; + REASON_SET(&reason, PFRES_MAXSTATES); + goto cleanup; + } + /* src node for filter rule */ + if ((r->rule_flag & PFRULE_SRCTRACK || + r->rpool.opts & PF_POOL_STICKYADDR) && + pf_insert_src_node(&sn, r, saddr, af) != 0) { + REASON_SET(&reason, PFRES_SRCLIMIT); + goto cleanup; + } + /* src node for translation rule */ + if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && + ((direction == PF_OUT && + pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) || + (pf_insert_src_node(&nsn, nr, saddr, af) != 0))) { + REASON_SET(&reason, PFRES_SRCLIMIT); + goto cleanup; + } + s = pool_get(&pf_state_pl, PR_NOWAIT); + if (s == NULL) { + REASON_SET(&reason, PFRES_MEMORY); +cleanup: + if (sn != NULL && sn->states == 0 && sn->expire == 0) { + RB_REMOVE(pf_src_tree, &tree_src_tracking, sn); + pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; + pf_status.src_nodes--; + pool_put(&pf_src_tree_pl, sn); + } + if (nsn != sn && nsn != NULL && nsn->states == 0 && + nsn->expire == 0) { + RB_REMOVE(pf_src_tree, &tree_src_tracking, nsn); + pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; + pf_status.src_nodes--; + pool_put(&pf_src_tree_pl, nsn); + } + return (PF_DROP); + } + bzero(s, sizeof(*s)); + s->rule.ptr = r; + s->nat_rule.ptr = nr; + s->anchor.ptr = a; + STATE_INC_COUNTERS(s); + if (r->allow_opts) + s->state_flags |= PFSTATE_ALLOWOPTS; + if (r->rule_flag & PFRULE_STATESLOPPY) + s->state_flags |= PFSTATE_SLOPPY; + s->log = r->log & PF_LOG_ALL; + if (nr != NULL) + s->log |= nr->log & PF_LOG_ALL; + s->proto = IPPROTO_TCP; + s->direction = direction; + s->af = af; + if (direction == PF_OUT) { + PF_ACPY(&s->gwy.addr, saddr, af); + s->gwy.port = th->th_sport; /* sport */ + PF_ACPY(&s->ext.addr, daddr, af); + s->ext.port = th->th_dport; + if (nr != NULL) { + PF_ACPY(&s->lan.addr, &pd->baddr, af); + s->lan.port = 
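+#if 0
+/*
+ * Editor's note -- illustration only, not part of the imported pf
+ * source.  A state records three address/port pairs: "lan" (the
+ * internal host before translation), "gwy" (what that host looks like
+ * on the wire after NAT -- saddr has already been rewritten at this
+ * point, and pd->baddr holds the pre-NAT original) and "ext" (the
+ * remote peer).  Later packets find the state through whichever pair
+ * is visible in them, which is how pf_test_state_tcp() fills its
+ * lookup key further below:
+ */
+static void
+state_key_fill(struct pf_state_cmp *key, int direction, sa_family_t af,
+    struct pf_addr *src, u_int16_t sport, struct pf_addr *dst, u_int16_t dport)
+{
+	key->af = af;
+	if (direction == PF_IN) {
+		PF_ACPY(&key->ext.addr, src, af);	/* remote peer */
+		PF_ACPY(&key->gwy.addr, dst, af);	/* wire-side self */
+		key->ext.port = sport;
+		key->gwy.port = dport;
+	} else {
+		PF_ACPY(&key->lan.addr, src, af);	/* pre-NAT self */
+		PF_ACPY(&key->ext.addr, dst, af);
+		key->lan.port = sport;
+		key->ext.port = dport;
+	}
+}
+#endif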
bport; + } else { + PF_ACPY(&s->lan.addr, &s->gwy.addr, af); + s->lan.port = s->gwy.port; + } + } else { + PF_ACPY(&s->lan.addr, daddr, af); + s->lan.port = th->th_dport; + PF_ACPY(&s->ext.addr, saddr, af); + s->ext.port = th->th_sport; + if (nr != NULL) { + PF_ACPY(&s->gwy.addr, &pd->baddr, af); + s->gwy.port = bport; + } else { + PF_ACPY(&s->gwy.addr, &s->lan.addr, af); + s->gwy.port = s->lan.port; + } + } + + s->src.seqlo = ntohl(th->th_seq); + s->src.seqhi = s->src.seqlo + len + 1; + if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && + r->keep_state == PF_STATE_MODULATE) { + /* Generate sequence number modulator */ +#ifdef __FreeBSD__ + while ((s->src.seqdiff = + pf_new_isn(s) - s->src.seqlo) == 0) + ; +#else + while ((s->src.seqdiff = + tcp_rndiss_next() - s->src.seqlo) == 0) + ; +#endif + pf_change_a(&th->th_seq, &th->th_sum, + htonl(s->src.seqlo + s->src.seqdiff), 0); + rewrite = 1; + } else + s->src.seqdiff = 0; + if (th->th_flags & TH_SYN) { + s->src.seqhi++; + s->src.wscale = pf_get_wscale(m, off, th->th_off, af); + } + s->src.max_win = MAX(ntohs(th->th_win), 1); + if (s->src.wscale & PF_WSCALE_MASK) { + /* Remove scale factor from initial window */ + int win = s->src.max_win; + win += 1 << (s->src.wscale & PF_WSCALE_MASK); + s->src.max_win = (win - 1) >> + (s->src.wscale & PF_WSCALE_MASK); + } + if (th->th_flags & TH_FIN) + s->src.seqhi++; + s->dst.seqhi = 1; + s->dst.max_win = 1; + s->src.state = TCPS_SYN_SENT; + s->dst.state = TCPS_CLOSED; + s->creation = time_second; + s->expire = time_second; + s->timeout = PFTM_TCP_FIRST_PACKET; + pf_set_rt_ifp(s, saddr); + if (sn != NULL) { + s->src_node = sn; + s->src_node->states++; + } + if (nsn != NULL) { + PF_ACPY(&nsn->raddr, &pd->naddr, af); + s->nat_src_node = nsn; + s->nat_src_node->states++; + } + if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m, + off, pd, th, &s->src, &s->dst)) { + REASON_SET(&reason, PFRES_MEMORY); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + pool_put(&pf_state_pl, s); + return (PF_DROP); + } + if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub && + pf_normalize_tcp_stateful(m, off, pd, &reason, th, s, + &s->src, &s->dst, &rewrite)) { + /* This really shouldn't happen!!! 
*/ + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_normalize_tcp_stateful failed on first pkt")); + pf_normalize_tcp_cleanup(s); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + pool_put(&pf_state_pl, s); + return (PF_DROP); + } + if (pf_insert_state(BOUND_IFACE(r, kif), s)) { + pf_normalize_tcp_cleanup(s); + REASON_SET(&reason, PFRES_STATEINS); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + pool_put(&pf_state_pl, s); + return (PF_DROP); + } else + *sm = s; + if (tag > 0) { + pf_tag_ref(tag); + s->tag = tag; + } + if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && + r->keep_state == PF_STATE_SYNPROXY) { + s->src.state = PF_TCPS_PROXY_SRC; + if (nr != NULL) { + if (direction == PF_OUT) { + pf_change_ap(saddr, &th->th_sport, + pd->ip_sum, &th->th_sum, &pd->baddr, + bport, 0, af); + } else { + pf_change_ap(daddr, &th->th_dport, + pd->ip_sum, &th->th_sum, &pd->baddr, + bport, 0, af); + } + } + s->src.seqhi = htonl(arc4random()); + /* Find mss option */ + mss = pf_get_mss(m, off, th->th_off, af); + mss = pf_calc_mss(saddr, af, mss); + mss = pf_calc_mss(daddr, af, mss); + s->src.mss = mss; +#ifdef __FreeBSD__ + pf_send_tcp(NULL, r, af, daddr, saddr, th->th_dport, +#else + pf_send_tcp(r, af, daddr, saddr, th->th_dport, +#endif + th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1, + TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL, NULL); + REASON_SET(&reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } + } + + /* copy back packet headers if we performed NAT operations */ + if (rewrite) + m_copyback(m, off, sizeof(*th), (caddr_t)th); + + return (PF_PASS); +} + +int +pf_test_udp(struct pf_rule **rm, struct pf_state **sm, int direction, + struct pfi_kif *kif, struct mbuf *m, int off, void *h, +#ifdef __FreeBSD__ + struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, + struct ifqueue *ifq, struct inpcb *inp) +#else + struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, + struct ifqueue *ifq) +#endif +{ + struct pf_rule *nr = NULL; + struct pf_addr *saddr = pd->src, *daddr = pd->dst; + struct udphdr *uh = pd->hdr.udp; + u_int16_t bport, nport = 0; + sa_family_t af = pd->af; + struct pf_rule *r, *a = NULL; + struct pf_ruleset *ruleset = NULL; + struct pf_src_node *nsn = NULL; + u_short reason; + int rewrite = 0; + int tag = -1, rtableid = -1; + int asd = 0; + int match = 0; + + if (pf_check_congestion(ifq)) { + REASON_SET(&reason, PFRES_CONGEST); + return (PF_DROP); + } + +#ifdef __FreeBSD__ + if (inp != NULL) + pd->lookup.done = pf_socket_lookup(direction, pd, inp); + else if (debug_pfugidhack) { + PF_UNLOCK(); + DPFPRINTF(PF_DEBUG_MISC, ("pf: unlocked lookup\n")); + pd->lookup.done = pf_socket_lookup(direction, pd, inp); + PF_LOCK(); + } +#endif + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); + + if (direction == PF_OUT) { + bport = nport = uh->uh_sport; + /* check outgoing packet for BINAT/NAT */ + if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn, + saddr, uh->uh_sport, daddr, uh->uh_dport, + &pd->naddr, &nport)) != NULL) { + PF_ACPY(&pd->baddr, saddr, af); + pf_change_ap(saddr, &uh->uh_sport, pd->ip_sum, + &uh->uh_sum, &pd->naddr, nport, 1, af); + rewrite++; + if (nr->natpass) + r = NULL; + pd->nat_rule = nr; + } + } else { + bport = nport = uh->uh_dport; + /* check incoming packet for BINAT/RDR */ + if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn, + saddr, uh->uh_sport, daddr, uh->uh_dport, &pd->naddr, + &nport)) != NULL) { + PF_ACPY(&pd->baddr, daddr, af); + pf_change_ap(daddr, &uh->uh_dport, pd->ip_sum, + 
&uh->uh_sum, &pd->naddr, nport, 1, af); + rewrite++; + if (nr->natpass) + r = NULL; + pd->nat_rule = nr; + } + } + + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != IPPROTO_UDP) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, saddr, af, + r->src.neg, kif)) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (r->src.port_op && !pf_match_port(r->src.port_op, + r->src.port[0], r->src.port[1], uh->uh_sport)) + r = r->skip[PF_SKIP_SRC_PORT].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, + r->dst.neg, NULL)) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->dst.port_op && !pf_match_port(r->dst.port_op, + r->dst.port[0], r->dst.port[1], uh->uh_dport)) + r = r->skip[PF_SKIP_DST_PORT].ptr; + else if (r->tos && !(r->tos == pd->tos)) + r = TAILQ_NEXT(r, entries); + else if (r->rule_flag & PFRULE_FRAGMENT) + r = TAILQ_NEXT(r, entries); + else if (r->uid.op && (pd->lookup.done || (pd->lookup.done = +#ifdef __FreeBSD__ + pf_socket_lookup(direction, pd, inp), 1)) && +#else + pf_socket_lookup(direction, pd), 1)) && +#endif + !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1], + pd->lookup.uid)) + r = TAILQ_NEXT(r, entries); + else if (r->gid.op && (pd->lookup.done || (pd->lookup.done = +#ifdef __FreeBSD__ + pf_socket_lookup(direction, pd, inp), 1)) && +#else + pf_socket_lookup(direction, pd), 1)) && +#endif + !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1], + pd->lookup.gid)) + r = TAILQ_NEXT(r, entries); + else if (r->prob && r->prob <= arc4random()) + r = TAILQ_NEXT(r, entries); + else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) + r = TAILQ_NEXT(r, entries); + else if (r->os_fingerprint != PF_OSFP_ANY) + r = TAILQ_NEXT(r, entries); + else { + if (r->tag) + tag = r->tag; + if (r->rtableid >= 0) + rtableid = r->rtableid; + if (r->anchor == NULL) { + match = 1; + *rm = r; + *am = a; + *rsm = ruleset; + if ((*rm)->quick) + break; + r = TAILQ_NEXT(r, entries); + } else + pf_step_into_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match); + } + if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match)) + break; + } + r = *rm; + a = *am; + ruleset = *rsm; + + REASON_SET(&reason, PFRES_MATCH); + + if (r->log || (nr != NULL && nr->natpass && nr->log)) { + if (rewrite) +#ifdef __FreeBSD__ + m_copyback(m, off, sizeof(*uh), (caddr_t)uh); +#else + m_copyback(m, off, sizeof(*uh), uh); +#endif + PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? 
r : nr, + a, ruleset, pd); + } + + if ((r->action == PF_DROP) && + ((r->rule_flag & PFRULE_RETURNICMP) || + (r->rule_flag & PFRULE_RETURN))) { + /* undo NAT changes, if they have taken place */ + if (nr != NULL) { + if (direction == PF_OUT) { + pf_change_ap(saddr, &uh->uh_sport, pd->ip_sum, + &uh->uh_sum, &pd->baddr, bport, 1, af); + rewrite++; + } else { + pf_change_ap(daddr, &uh->uh_dport, pd->ip_sum, + &uh->uh_sum, &pd->baddr, bport, 1, af); + rewrite++; + } + } + if ((af == AF_INET) && r->return_icmp) + pf_send_icmp(m, r->return_icmp >> 8, + r->return_icmp & 255, af, r); + else if ((af == AF_INET6) && r->return_icmp6) + pf_send_icmp(m, r->return_icmp6 >> 8, + r->return_icmp6 & 255, af, r); + } + + if (r->action == PF_DROP) + return (PF_DROP); + + if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) { + REASON_SET(&reason, PFRES_MEMORY); + return (PF_DROP); + } + + if (r->keep_state || nr != NULL) { + /* create new state */ + struct pf_state *s = NULL; + struct pf_src_node *sn = NULL; + + /* check maximums */ + if (r->max_states && (r->states >= r->max_states)) { + pf_status.lcounters[LCNT_STATES]++; + REASON_SET(&reason, PFRES_MAXSTATES); + goto cleanup; + } + /* src node for filter rule */ + if ((r->rule_flag & PFRULE_SRCTRACK || + r->rpool.opts & PF_POOL_STICKYADDR) && + pf_insert_src_node(&sn, r, saddr, af) != 0) { + REASON_SET(&reason, PFRES_SRCLIMIT); + goto cleanup; + } + /* src node for translation rule */ + if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && + ((direction == PF_OUT && + pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) || + (pf_insert_src_node(&nsn, nr, saddr, af) != 0))) { + REASON_SET(&reason, PFRES_SRCLIMIT); + goto cleanup; + } + s = pool_get(&pf_state_pl, PR_NOWAIT); + if (s == NULL) { + REASON_SET(&reason, PFRES_MEMORY); +cleanup: + if (sn != NULL && sn->states == 0 && sn->expire == 0) { + RB_REMOVE(pf_src_tree, &tree_src_tracking, sn); + pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; + pf_status.src_nodes--; + pool_put(&pf_src_tree_pl, sn); + } + if (nsn != sn && nsn != NULL && nsn->states == 0 && + nsn->expire == 0) { + RB_REMOVE(pf_src_tree, &tree_src_tracking, nsn); + pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; + pf_status.src_nodes--; + pool_put(&pf_src_tree_pl, nsn); + } + return (PF_DROP); + } + bzero(s, sizeof(*s)); + s->rule.ptr = r; + s->nat_rule.ptr = nr; + s->anchor.ptr = a; + STATE_INC_COUNTERS(s); + if (r->allow_opts) + s->state_flags |= PFSTATE_ALLOWOPTS; + if (r->rule_flag & PFRULE_STATESLOPPY) + s->state_flags |= PFSTATE_SLOPPY; + s->log = r->log & PF_LOG_ALL; + if (nr != NULL) + s->log |= nr->log & PF_LOG_ALL; + s->proto = IPPROTO_UDP; + s->direction = direction; + s->af = af; + if (direction == PF_OUT) { + PF_ACPY(&s->gwy.addr, saddr, af); + s->gwy.port = uh->uh_sport; + PF_ACPY(&s->ext.addr, daddr, af); + s->ext.port = uh->uh_dport; + if (nr != NULL) { + PF_ACPY(&s->lan.addr, &pd->baddr, af); + s->lan.port = bport; + } else { + PF_ACPY(&s->lan.addr, &s->gwy.addr, af); + s->lan.port = s->gwy.port; + } + } else { + PF_ACPY(&s->lan.addr, daddr, af); + s->lan.port = uh->uh_dport; + PF_ACPY(&s->ext.addr, saddr, af); + s->ext.port = uh->uh_sport; + if (nr != NULL) { + PF_ACPY(&s->gwy.addr, &pd->baddr, af); + s->gwy.port = bport; + } else { + PF_ACPY(&s->gwy.addr, &s->lan.addr, af); + s->gwy.port = s->lan.port; + } + } + s->src.state = PFUDPS_SINGLE; + s->dst.state = PFUDPS_NO_TRAFFIC; + s->creation = time_second; + s->expire = time_second; + s->timeout = PFTM_UDP_FIRST_PACKET; + pf_set_rt_ifp(s, saddr); + if (sn != NULL) { + 
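+#if 0
+/*
+ * Editor's note -- illustration only, not part of the imported pf
+ * source.  With source tracking or a sticky-address pool the state
+ * keeps a reference on a pf_src_node and bumps its ->states counter
+ * (done just below).  The node may only be freed once nothing
+ * references it; the error path above already showed the convention,
+ * sketched here as a helper (the real teardown runs from the purge
+ * timer, not from a refcount drop):
+ */
+static void
+src_node_release(struct pf_src_node *sn)
+{
+	if (sn == NULL || --sn->states > 0 || sn->expire != 0)
+		return;
+	RB_REMOVE(pf_src_tree, &tree_src_tracking, sn);
+	pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
+	pf_status.src_nodes--;
+	pool_put(&pf_src_tree_pl, sn);
+}
+#endif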
s->src_node = sn; + s->src_node->states++; + } + if (nsn != NULL) { + PF_ACPY(&nsn->raddr, &pd->naddr, af); + s->nat_src_node = nsn; + s->nat_src_node->states++; + } + if (pf_insert_state(BOUND_IFACE(r, kif), s)) { + REASON_SET(&reason, PFRES_STATEINS); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + pool_put(&pf_state_pl, s); + return (PF_DROP); + } else + *sm = s; + if (tag > 0) { + pf_tag_ref(tag); + s->tag = tag; + } + } + + /* copy back packet headers if we performed NAT operations */ + if (rewrite) + m_copyback(m, off, sizeof(*uh), (caddr_t)uh); + + return (PF_PASS); +} + +int +pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction, + struct pfi_kif *kif, struct mbuf *m, int off, void *h, + struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, + struct ifqueue *ifq) +{ + struct pf_rule *nr = NULL; + struct pf_addr *saddr = pd->src, *daddr = pd->dst; + struct pf_rule *r, *a = NULL; + struct pf_ruleset *ruleset = NULL; + struct pf_src_node *nsn = NULL; + u_short reason; + u_int16_t icmpid = 0, bport, nport = 0; + sa_family_t af = pd->af; + u_int8_t icmptype = 0; /* make the compiler happy */ + u_int8_t icmpcode = 0; /* make the compiler happy */ + int state_icmp = 0; + int tag = -1, rtableid = -1; +#ifdef INET6 + int rewrite = 0; +#endif /* INET6 */ + int asd = 0; + int match = 0; + + if (pf_check_congestion(ifq)) { + REASON_SET(&reason, PFRES_CONGEST); + return (PF_DROP); + } + + switch (pd->proto) { +#ifdef INET + case IPPROTO_ICMP: + icmptype = pd->hdr.icmp->icmp_type; + icmpcode = pd->hdr.icmp->icmp_code; + icmpid = pd->hdr.icmp->icmp_id; + + if (icmptype == ICMP_UNREACH || + icmptype == ICMP_SOURCEQUENCH || + icmptype == ICMP_REDIRECT || + icmptype == ICMP_TIMXCEED || + icmptype == ICMP_PARAMPROB) + state_icmp++; + break; +#endif /* INET */ +#ifdef INET6 + case IPPROTO_ICMPV6: + icmptype = pd->hdr.icmp6->icmp6_type; + icmpcode = pd->hdr.icmp6->icmp6_code; + icmpid = pd->hdr.icmp6->icmp6_id; + + if (icmptype == ICMP6_DST_UNREACH || + icmptype == ICMP6_PACKET_TOO_BIG || + icmptype == ICMP6_TIME_EXCEEDED || + icmptype == ICMP6_PARAM_PROB) + state_icmp++; + break; +#endif /* INET6 */ + } + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); + + if (direction == PF_OUT) { + bport = nport = icmpid; + /* check outgoing packet for BINAT/NAT */ + if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn, + saddr, icmpid, daddr, icmpid, &pd->naddr, &nport)) != + NULL) { + PF_ACPY(&pd->baddr, saddr, af); + switch (af) { +#ifdef INET + case AF_INET: + pf_change_a(&saddr->v4.s_addr, pd->ip_sum, + pd->naddr.v4.s_addr, 0); + pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( + pd->hdr.icmp->icmp_cksum, icmpid, nport, 0); + pd->hdr.icmp->icmp_id = nport; + m_copyback(m, off, ICMP_MINLEN, + (caddr_t)pd->hdr.icmp); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, + &pd->naddr, 0); + rewrite++; + break; +#endif /* INET6 */ + } + if (nr->natpass) + r = NULL; + pd->nat_rule = nr; + } + } else { + bport = nport = icmpid; + /* check incoming packet for BINAT/RDR */ + if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn, + saddr, icmpid, daddr, icmpid, &pd->naddr, &nport)) != + NULL) { + PF_ACPY(&pd->baddr, daddr, af); + switch (af) { +#ifdef INET + case AF_INET: + pf_change_a(&daddr->v4.s_addr, + pd->ip_sum, pd->naddr.v4.s_addr, 0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, + &pd->naddr, 0); + rewrite++; + break; +#endif 
/* INET6 */ + } + if (nr->natpass) + r = NULL; + pd->nat_rule = nr; + } + } + + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, saddr, af, + r->src.neg, kif)) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, + r->dst.neg, NULL)) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->type && r->type != icmptype + 1) + r = TAILQ_NEXT(r, entries); + else if (r->code && r->code != icmpcode + 1) + r = TAILQ_NEXT(r, entries); + else if (r->tos && !(r->tos == pd->tos)) + r = TAILQ_NEXT(r, entries); + else if (r->rule_flag & PFRULE_FRAGMENT) + r = TAILQ_NEXT(r, entries); + else if (r->prob && r->prob <= arc4random()) + r = TAILQ_NEXT(r, entries); + else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) + r = TAILQ_NEXT(r, entries); + else if (r->os_fingerprint != PF_OSFP_ANY) + r = TAILQ_NEXT(r, entries); + else { + if (r->tag) + tag = r->tag; + if (r->rtableid >= 0) + rtableid = r->rtableid; + if (r->anchor == NULL) { + match = 1; + *rm = r; + *am = a; + *rsm = ruleset; + if ((*rm)->quick) + break; + r = TAILQ_NEXT(r, entries); + } else + pf_step_into_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match); + } + if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match)) + break; + } + r = *rm; + a = *am; + ruleset = *rsm; + + REASON_SET(&reason, PFRES_MATCH); + + if (r->log || (nr != NULL && nr->natpass && nr->log)) { +#ifdef INET6 + if (rewrite) + m_copyback(m, off, sizeof(struct icmp6_hdr), + (caddr_t)pd->hdr.icmp6); +#endif /* INET6 */ + PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? 
r : nr, + a, ruleset, pd); + } + + if (r->action != PF_PASS) + return (PF_DROP); + + if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) { + REASON_SET(&reason, PFRES_MEMORY); + return (PF_DROP); + } + + if (!state_icmp && (r->keep_state || nr != NULL)) { + /* create new state */ + struct pf_state *s = NULL; + struct pf_src_node *sn = NULL; + + /* check maximums */ + if (r->max_states && (r->states >= r->max_states)) { + pf_status.lcounters[LCNT_STATES]++; + REASON_SET(&reason, PFRES_MAXSTATES); + goto cleanup; + } + /* src node for filter rule */ + if ((r->rule_flag & PFRULE_SRCTRACK || + r->rpool.opts & PF_POOL_STICKYADDR) && + pf_insert_src_node(&sn, r, saddr, af) != 0) { + REASON_SET(&reason, PFRES_SRCLIMIT); + goto cleanup; + } + /* src node for translation rule */ + if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && + ((direction == PF_OUT && + pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) || + (pf_insert_src_node(&nsn, nr, saddr, af) != 0))) { + REASON_SET(&reason, PFRES_SRCLIMIT); + goto cleanup; + } + s = pool_get(&pf_state_pl, PR_NOWAIT); + if (s == NULL) { + REASON_SET(&reason, PFRES_MEMORY); +cleanup: + if (sn != NULL && sn->states == 0 && sn->expire == 0) { + RB_REMOVE(pf_src_tree, &tree_src_tracking, sn); + pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; + pf_status.src_nodes--; + pool_put(&pf_src_tree_pl, sn); + } + if (nsn != sn && nsn != NULL && nsn->states == 0 && + nsn->expire == 0) { + RB_REMOVE(pf_src_tree, &tree_src_tracking, nsn); + pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; + pf_status.src_nodes--; + pool_put(&pf_src_tree_pl, nsn); + } + return (PF_DROP); + } + bzero(s, sizeof(*s)); + s->rule.ptr = r; + s->nat_rule.ptr = nr; + s->anchor.ptr = a; + STATE_INC_COUNTERS(s); + if (r->allow_opts) + s->state_flags |= PFSTATE_ALLOWOPTS; + if (r->rule_flag & PFRULE_STATESLOPPY) + s->state_flags |= PFSTATE_SLOPPY; + s->log = r->log & PF_LOG_ALL; + if (nr != NULL) + s->log |= nr->log & PF_LOG_ALL; + s->proto = pd->proto; + s->direction = direction; + s->af = af; + if (direction == PF_OUT) { + PF_ACPY(&s->gwy.addr, saddr, af); + s->gwy.port = nport; + PF_ACPY(&s->ext.addr, daddr, af); + s->ext.port = 0; + if (nr != NULL) { + PF_ACPY(&s->lan.addr, &pd->baddr, af); + s->lan.port = bport; + } else { + PF_ACPY(&s->lan.addr, &s->gwy.addr, af); + s->lan.port = s->gwy.port; + } + } else { + PF_ACPY(&s->lan.addr, daddr, af); + s->lan.port = nport; + PF_ACPY(&s->ext.addr, saddr, af); + s->ext.port = 0; + if (nr != NULL) { + PF_ACPY(&s->gwy.addr, &pd->baddr, af); + s->gwy.port = bport; + } else { + PF_ACPY(&s->gwy.addr, &s->lan.addr, af); + s->gwy.port = s->lan.port; + } + } + s->creation = time_second; + s->expire = time_second; + s->timeout = PFTM_ICMP_FIRST_PACKET; + pf_set_rt_ifp(s, saddr); + if (sn != NULL) { + s->src_node = sn; + s->src_node->states++; + } + if (nsn != NULL) { + PF_ACPY(&nsn->raddr, &pd->naddr, af); + s->nat_src_node = nsn; + s->nat_src_node->states++; + } + if (pf_insert_state(BOUND_IFACE(r, kif), s)) { + REASON_SET(&reason, PFRES_STATEINS); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + pool_put(&pf_state_pl, s); + return (PF_DROP); + } else + *sm = s; + if (tag > 0) { + pf_tag_ref(tag); + s->tag = tag; + } + } + +#ifdef INET6 + /* copy back packet headers if we performed IPv6 NAT operations */ + if (rewrite) + m_copyback(m, off, sizeof(struct icmp6_hdr), + (caddr_t)pd->hdr.icmp6); +#endif /* INET6 */ + + return (PF_PASS); +} + +int +pf_test_other(struct pf_rule **rm, struct pf_state **sm, int direction, + struct pfi_kif *kif, struct mbuf 
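+#if 0
+/*
+ * Editor's note -- illustration only, not part of the imported pf
+ * source.  ICMP has no ports, so pf_test_icmp() above keys the state
+ * on the echo identifier instead (icmpid feeds gwy.port/lan.port).
+ * Rewriting the identifier under NAT then needs only the same
+ * incremental checksum repair used for TCP/UDP ports, as performed
+ * above on pd->hdr.icmp:
+ */
+static void
+icmp_rewrite_id(struct icmp *ic, u_int16_t new_id)
+{
+	ic->icmp_cksum = pf_cksum_fixup(ic->icmp_cksum,
+	    ic->icmp_id, new_id, 0);
+	ic->icmp_id = new_id;
+}
+#endif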
*m, int off, void *h, struct pf_pdesc *pd, + struct pf_rule **am, struct pf_ruleset **rsm, struct ifqueue *ifq) +{ + struct pf_rule *nr = NULL; + struct pf_rule *r, *a = NULL; + struct pf_ruleset *ruleset = NULL; + struct pf_src_node *nsn = NULL; + struct pf_addr *saddr = pd->src, *daddr = pd->dst; + sa_family_t af = pd->af; + u_short reason; + int tag = -1, rtableid = -1; + int asd = 0; + int match = 0; + + if (pf_check_congestion(ifq)) { + REASON_SET(&reason, PFRES_CONGEST); + return (PF_DROP); + } + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); + + if (direction == PF_OUT) { + /* check outgoing packet for BINAT/NAT */ + if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn, + saddr, 0, daddr, 0, &pd->naddr, NULL)) != NULL) { + PF_ACPY(&pd->baddr, saddr, af); + switch (af) { +#ifdef INET + case AF_INET: + pf_change_a(&saddr->v4.s_addr, pd->ip_sum, + pd->naddr.v4.s_addr, 0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + PF_ACPY(saddr, &pd->naddr, af); + break; +#endif /* INET6 */ + } + if (nr->natpass) + r = NULL; + pd->nat_rule = nr; + } + } else { + /* check incoming packet for BINAT/RDR */ + if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn, + saddr, 0, daddr, 0, &pd->naddr, NULL)) != NULL) { + PF_ACPY(&pd->baddr, daddr, af); + switch (af) { +#ifdef INET + case AF_INET: + pf_change_a(&daddr->v4.s_addr, + pd->ip_sum, pd->naddr.v4.s_addr, 0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + PF_ACPY(daddr, &pd->naddr, af); + break; +#endif /* INET6 */ + } + if (nr->natpass) + r = NULL; + pd->nat_rule = nr; + } + } + + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, + r->src.neg, kif)) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, + r->dst.neg, NULL)) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->tos && !(r->tos == pd->tos)) + r = TAILQ_NEXT(r, entries); + else if (r->rule_flag & PFRULE_FRAGMENT) + r = TAILQ_NEXT(r, entries); + else if (r->prob && r->prob <= arc4random()) + r = TAILQ_NEXT(r, entries); + else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) + r = TAILQ_NEXT(r, entries); + else if (r->os_fingerprint != PF_OSFP_ANY) + r = TAILQ_NEXT(r, entries); + else { + if (r->tag) + tag = r->tag; + if (r->rtableid >= 0) + rtableid = r->rtableid; + if (r->anchor == NULL) { + match = 1; + *rm = r; + *am = a; + *rsm = ruleset; + if ((*rm)->quick) + break; + r = TAILQ_NEXT(r, entries); + } else + pf_step_into_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match); + } + if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match)) + break; + } + r = *rm; + a = *am; + ruleset = *rsm; + + REASON_SET(&reason, PFRES_MATCH); + + if (r->log || (nr != NULL && nr->natpass && nr->log)) + PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? 
r : nr, + a, ruleset, pd); + + if ((r->action == PF_DROP) && + ((r->rule_flag & PFRULE_RETURNICMP) || + (r->rule_flag & PFRULE_RETURN))) { + struct pf_addr *a = NULL; + + if (nr != NULL) { + if (direction == PF_OUT) + a = saddr; + else + a = daddr; + } + if (a != NULL) { + switch (af) { +#ifdef INET + case AF_INET: + pf_change_a(&a->v4.s_addr, pd->ip_sum, + pd->baddr.v4.s_addr, 0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + PF_ACPY(a, &pd->baddr, af); + break; +#endif /* INET6 */ + } + } + if ((af == AF_INET) && r->return_icmp) + pf_send_icmp(m, r->return_icmp >> 8, + r->return_icmp & 255, af, r); + else if ((af == AF_INET6) && r->return_icmp6) + pf_send_icmp(m, r->return_icmp6 >> 8, + r->return_icmp6 & 255, af, r); + } + + if (r->action != PF_PASS) + return (PF_DROP); + + if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) { + REASON_SET(&reason, PFRES_MEMORY); + return (PF_DROP); + } + + if (r->keep_state || nr != NULL) { + /* create new state */ + struct pf_state *s = NULL; + struct pf_src_node *sn = NULL; + + /* check maximums */ + if (r->max_states && (r->states >= r->max_states)) { + pf_status.lcounters[LCNT_STATES]++; + REASON_SET(&reason, PFRES_MAXSTATES); + goto cleanup; + } + /* src node for filter rule */ + if ((r->rule_flag & PFRULE_SRCTRACK || + r->rpool.opts & PF_POOL_STICKYADDR) && + pf_insert_src_node(&sn, r, saddr, af) != 0) { + REASON_SET(&reason, PFRES_SRCLIMIT); + goto cleanup; + } + /* src node for translation rule */ + if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && + ((direction == PF_OUT && + pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) || + (pf_insert_src_node(&nsn, nr, saddr, af) != 0))) { + REASON_SET(&reason, PFRES_SRCLIMIT); + goto cleanup; + } + s = pool_get(&pf_state_pl, PR_NOWAIT); + if (s == NULL) { + REASON_SET(&reason, PFRES_MEMORY); +cleanup: + if (sn != NULL && sn->states == 0 && sn->expire == 0) { + RB_REMOVE(pf_src_tree, &tree_src_tracking, sn); + pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; + pf_status.src_nodes--; + pool_put(&pf_src_tree_pl, sn); + } + if (nsn != sn && nsn != NULL && nsn->states == 0 && + nsn->expire == 0) { + RB_REMOVE(pf_src_tree, &tree_src_tracking, nsn); + pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; + pf_status.src_nodes--; + pool_put(&pf_src_tree_pl, nsn); + } + return (PF_DROP); + } + bzero(s, sizeof(*s)); + s->rule.ptr = r; + s->nat_rule.ptr = nr; + s->anchor.ptr = a; + STATE_INC_COUNTERS(s); + if (r->allow_opts) + s->state_flags |= PFSTATE_ALLOWOPTS; + if (r->rule_flag & PFRULE_STATESLOPPY) + s->state_flags |= PFSTATE_SLOPPY; + s->log = r->log & PF_LOG_ALL; + if (nr != NULL) + s->log |= nr->log & PF_LOG_ALL; + s->proto = pd->proto; + s->direction = direction; + s->af = af; + if (direction == PF_OUT) { + PF_ACPY(&s->gwy.addr, saddr, af); + PF_ACPY(&s->ext.addr, daddr, af); + if (nr != NULL) + PF_ACPY(&s->lan.addr, &pd->baddr, af); + else + PF_ACPY(&s->lan.addr, &s->gwy.addr, af); + } else { + PF_ACPY(&s->lan.addr, daddr, af); + PF_ACPY(&s->ext.addr, saddr, af); + if (nr != NULL) + PF_ACPY(&s->gwy.addr, &pd->baddr, af); + else + PF_ACPY(&s->gwy.addr, &s->lan.addr, af); + } + s->src.state = PFOTHERS_SINGLE; + s->dst.state = PFOTHERS_NO_TRAFFIC; + s->creation = time_second; + s->expire = time_second; + s->timeout = PFTM_OTHER_FIRST_PACKET; + pf_set_rt_ifp(s, saddr); + if (sn != NULL) { + s->src_node = sn; + s->src_node->states++; + } + if (nsn != NULL) { + PF_ACPY(&nsn->raddr, &pd->naddr, af); + s->nat_src_node = nsn; + s->nat_src_node->states++; + } + if (pf_insert_state(BOUND_IFACE(r, 
kif), s)) { + REASON_SET(&reason, PFRES_STATEINS); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + pool_put(&pf_state_pl, s); + return (PF_DROP); + } else + *sm = s; + if (tag > 0) { + pf_tag_ref(tag); + s->tag = tag; + } + } + + return (PF_PASS); +} + +int +pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif, + struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am, + struct pf_ruleset **rsm) +{ + struct pf_rule *r, *a = NULL; + struct pf_ruleset *ruleset = NULL; + sa_family_t af = pd->af; + u_short reason; + int tag = -1; + int asd = 0; + int match = 0; + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, + r->src.neg, kif)) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, + r->dst.neg, NULL)) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->tos && !(r->tos == pd->tos)) + r = TAILQ_NEXT(r, entries); + else if (r->os_fingerprint != PF_OSFP_ANY) + r = TAILQ_NEXT(r, entries); + else if (pd->proto == IPPROTO_UDP && + (r->src.port_op || r->dst.port_op)) + r = TAILQ_NEXT(r, entries); + else if (pd->proto == IPPROTO_TCP && + (r->src.port_op || r->dst.port_op || r->flagset)) + r = TAILQ_NEXT(r, entries); + else if ((pd->proto == IPPROTO_ICMP || + pd->proto == IPPROTO_ICMPV6) && + (r->type || r->code)) + r = TAILQ_NEXT(r, entries); + else if (r->prob && r->prob <= arc4random()) + r = TAILQ_NEXT(r, entries); + else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) + r = TAILQ_NEXT(r, entries); + else { + if (r->anchor == NULL) { + match = 1; + *rm = r; + *am = a; + *rsm = ruleset; + if ((*rm)->quick) + break; + r = TAILQ_NEXT(r, entries); + } else + pf_step_into_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match); + } + if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match)) + break; + } + r = *rm; + a = *am; + ruleset = *rsm; + + REASON_SET(&reason, PFRES_MATCH); + + if (r->log) + PFLOG_PACKET(kif, h, m, af, direction, reason, r, a, ruleset, + pd); + + if (r->action != PF_PASS) + return (PF_DROP); + + if (pf_tag_packet(m, pd->pf_mtag, tag, -1)) { + REASON_SET(&reason, PFRES_MEMORY); + return (PF_DROP); + } + + return (PF_PASS); +} + +int +pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst, + struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off, + struct pf_pdesc *pd, u_short *reason, int *copyback) +{ + struct tcphdr *th = pd->hdr.tcp; + u_int16_t win = ntohs(th->th_win); + u_int32_t ack, end, seq, orig_seq; + u_int8_t sws, dws; + int ackskew; + + if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) { + sws = src->wscale & PF_WSCALE_MASK; + dws = dst->wscale & PF_WSCALE_MASK; + } else + sws = dws = 0; + + /* + * Sequence tracking algorithm from Guido van Rooij's paper: + * http://www.madison-gurkha.com/publications/tcp_filtering/ + * tcp_filtering.ps + */ + + orig_seq = seq = ntohl(th->th_seq); + if (src->seqlo == 0) { + /* First packet from this end. 
Set its state */ + + if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) && + src->scrub == NULL) { + if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) { + REASON_SET(reason, PFRES_MEMORY); + return (PF_DROP); + } + } + + /* Deferred generation of sequence number modulator */ + if (dst->seqdiff && !src->seqdiff) { +#ifdef __FreeBSD__ + while ((src->seqdiff = pf_new_isn(*state) - seq) == 0) + ; +#else + while ((src->seqdiff = tcp_rndiss_next() - seq) == 0) + ; +#endif + ack = ntohl(th->th_ack) - dst->seqdiff; + pf_change_a(&th->th_seq, &th->th_sum, htonl(seq + + src->seqdiff), 0); + pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0); + *copyback = 1; + } else { + ack = ntohl(th->th_ack); + } + + end = seq + pd->p_len; + if (th->th_flags & TH_SYN) { + end++; + if (dst->wscale & PF_WSCALE_FLAG) { + src->wscale = pf_get_wscale(m, off, th->th_off, + pd->af); + if (src->wscale & PF_WSCALE_FLAG) { + /* Remove scale factor from initial + * window */ + sws = src->wscale & PF_WSCALE_MASK; + win = ((u_int32_t)win + (1 << sws) - 1) + >> sws; + dws = dst->wscale & PF_WSCALE_MASK; + } else { + /* fixup other window */ + dst->max_win <<= dst->wscale & + PF_WSCALE_MASK; + /* in case of a retrans SYN|ACK */ + dst->wscale = 0; + } + } + } + if (th->th_flags & TH_FIN) + end++; + + src->seqlo = seq; + if (src->state < TCPS_SYN_SENT) + src->state = TCPS_SYN_SENT; + + /* + * May need to slide the window (seqhi may have been set by + * the crappy stack check or if we picked up the connection + * after establishment) + */ + if (src->seqhi == 1 || + SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) + src->seqhi = end + MAX(1, dst->max_win << dws); + if (win > src->max_win) + src->max_win = win; + + } else { + ack = ntohl(th->th_ack) - dst->seqdiff; + if (src->seqdiff) { + /* Modulate sequence numbers */ + pf_change_a(&th->th_seq, &th->th_sum, htonl(seq + + src->seqdiff), 0); + pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0); + *copyback = 1; + } + end = seq + pd->p_len; + if (th->th_flags & TH_SYN) + end++; + if (th->th_flags & TH_FIN) + end++; + } + + if ((th->th_flags & TH_ACK) == 0) { + /* Let it pass through the ack skew check */ + ack = dst->seqlo; + } else if ((ack == 0 && + (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) || + /* broken tcp stacks do not set ack */ + (dst->state < TCPS_SYN_SENT)) { + /* + * Many stacks (ours included) will set the ACK number in an + * FIN|ACK if the SYN times out -- no sequence to ACK. + */ + ack = dst->seqlo; + } + + if (seq == end) { + /* Ease sequencing restrictions on no data packets */ + seq = src->seqlo; + end = seq; + } + + ackskew = dst->seqlo - ack; + + + /* + * Need to demodulate the sequence numbers in any TCP SACK options + * (Selective ACK). We could optionally validate the SACK values + * against the current ACK window, either forwards or backwards, but + * I'm not confident that SACK has been implemented properly + * everywhere. It wouldn't surprise me if several stacks accidently + * SACK too far backwards of previously ACKed data. There really aren't + * any security implications of bad SACKing unless the target stack + * doesn't validate the option length correctly. Someone trying to + * spoof into a TCP connection won't bother blindly sending SACK + * options anyway. 
+ */ + if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) { + if (pf_modulate_sack(m, off, pd, th, dst)) + *copyback = 1; + } + + +#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ + if (SEQ_GEQ(src->seqhi, end) && + /* Last octet inside other's window space */ + SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) && + /* Retrans: not more than one window back */ + (ackskew >= -MAXACKWINDOW) && + /* Acking not more than one reassembled fragment backwards */ + (ackskew <= (MAXACKWINDOW << sws)) && + /* Acking not more than one window forward */ + ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo || + (orig_seq == src->seqlo + 1) || (pd->flags & PFDESC_IP_REAS) == 0)) { + /* Require an exact/+1 sequence match on resets when possible */ + + if (dst->scrub || src->scrub) { + if (pf_normalize_tcp_stateful(m, off, pd, reason, th, + *state, src, dst, copyback)) + return (PF_DROP); + } + + /* update max window */ + if (src->max_win < win) + src->max_win = win; + /* synchronize sequencing */ + if (SEQ_GT(end, src->seqlo)) + src->seqlo = end; + /* slide the window of what the other end can send */ + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) + dst->seqhi = ack + MAX((win << sws), 1); + + + /* update states */ + if (th->th_flags & TH_SYN) + if (src->state < TCPS_SYN_SENT) + src->state = TCPS_SYN_SENT; + if (th->th_flags & TH_FIN) + if (src->state < TCPS_CLOSING) + src->state = TCPS_CLOSING; + if (th->th_flags & TH_ACK) { + if (dst->state == TCPS_SYN_SENT) { + dst->state = TCPS_ESTABLISHED; + if (src->state == TCPS_ESTABLISHED && + (*state)->src_node != NULL && + pf_src_connlimit(state)) { + REASON_SET(reason, PFRES_SRCLIMIT); + return (PF_DROP); + } + } else if (dst->state == TCPS_CLOSING) + dst->state = TCPS_FIN_WAIT_2; + } + if (th->th_flags & TH_RST) + src->state = dst->state = TCPS_TIME_WAIT; + + /* update expire time */ + (*state)->expire = time_second; + if (src->state >= TCPS_FIN_WAIT_2 && + dst->state >= TCPS_FIN_WAIT_2) + (*state)->timeout = PFTM_TCP_CLOSED; + else if (src->state >= TCPS_CLOSING && + dst->state >= TCPS_CLOSING) + (*state)->timeout = PFTM_TCP_FIN_WAIT; + else if (src->state < TCPS_ESTABLISHED || + dst->state < TCPS_ESTABLISHED) + (*state)->timeout = PFTM_TCP_OPENING; + else if (src->state >= TCPS_CLOSING || + dst->state >= TCPS_CLOSING) + (*state)->timeout = PFTM_TCP_CLOSING; + else + (*state)->timeout = PFTM_TCP_ESTABLISHED; + + /* Fall through to PASS packet */ + + } else if ((dst->state < TCPS_SYN_SENT || + dst->state >= TCPS_FIN_WAIT_2 || + src->state >= TCPS_FIN_WAIT_2) && + SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) && + /* Within a window forward of the originating packet */ + SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { + /* Within a window backward of the originating packet */ + + /* + * This currently handles three situations: + * 1) Stupid stacks will shotgun SYNs before their peer + * replies. + * 2) When PF catches an already established stream (the + * firewall rebooted, the state table was flushed, routes + * changed...) + * 3) Packets get funky immediately after the connection + * closes (this should catch Solaris spurious ACK|FINs + * that web servers like to spew after a close) + * + * This must be a little more careful than the above code + * since packet floods will also be caught here. We don't + * update the TTL here to mitigate the damage of a packet + * flood and so the same code can handle awkward establishment + * and a loosened connection close. 
+ * In the establishment case, a correct peer response will + * validate the connection, go through the normal state code + * and keep updating the state TTL. + */ + + if (pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: loose state match: "); + pf_print_state(*state); + pf_print_flags(th->th_flags); + printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " + "pkts=%llu:%llu\n", seq, orig_seq, ack, pd->p_len, +#ifdef __FreeBSD__ + ackskew, (unsigned long long)(*state)->packets[0], + (unsigned long long)(*state)->packets[1]); +#else + ackskew, (*state)->packets[0], + (*state)->packets[1]); +#endif + } + + if (dst->scrub || src->scrub) { + if (pf_normalize_tcp_stateful(m, off, pd, reason, th, + *state, src, dst, copyback)) + return (PF_DROP); + } + + /* update max window */ + if (src->max_win < win) + src->max_win = win; + /* synchronize sequencing */ + if (SEQ_GT(end, src->seqlo)) + src->seqlo = end; + /* slide the window of what the other end can send */ + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) + dst->seqhi = ack + MAX((win << sws), 1); + + /* + * Cannot set dst->seqhi here since this could be a shotgunned + * SYN and not an already established connection. + */ + + if (th->th_flags & TH_FIN) + if (src->state < TCPS_CLOSING) + src->state = TCPS_CLOSING; + if (th->th_flags & TH_RST) + src->state = dst->state = TCPS_TIME_WAIT; + + /* Fall through to PASS packet */ + + } else { + if ((*state)->dst.state == TCPS_SYN_SENT && + (*state)->src.state == TCPS_SYN_SENT) { + /* Send RST for state mismatches during handshake */ + if (!(th->th_flags & TH_RST)) +#ifdef __FreeBSD__ + pf_send_tcp(m, (*state)->rule.ptr, pd->af, +#else + pf_send_tcp((*state)->rule.ptr, pd->af, +#endif + pd->dst, pd->src, th->th_dport, + th->th_sport, ntohl(th->th_ack), 0, + TH_RST, 0, 0, + (*state)->rule.ptr->return_ttl, 1, 0, + pd->eh, kif->pfik_ifp); + src->seqlo = 0; + src->seqhi = 1; + src->max_win = 1; + } else if (pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: BAD state: "); + pf_print_state(*state); + pf_print_flags(th->th_flags); + printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " +#ifdef notyet + "pkts=%llu:%llu dir=%s,%s\n", +#else + "pkts=%llu:%llu%s\n", +#endif + seq, orig_seq, ack, pd->p_len, ackskew, +#ifdef __FreeBSD__ + (unsigned long long)(*state)->packets[0], + (unsigned long long)(*state)->packets[1], +#else + (*state)->packets[0], (*state)->packets[1], +#endif +#ifdef notyet + direction == PF_IN ? "in" : "out", + direction == (*state)->direction ? "fwd" : "rev"); +#else + ""); +#endif + printf("pf: State failure on: %c %c %c %c | %c %c\n", + SEQ_GEQ(src->seqhi, end) ? ' ' : '1', + SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ? + ' ': '2', + (ackskew >= -MAXACKWINDOW) ? ' ' : '3', + (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4', + SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5', + SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' 
' :'6'); + } + REASON_SET(reason, PFRES_BADSTATE); + return (PF_DROP); + } + + /* Any packets which have gotten here are to be passed */ + return (PF_PASS); +} + +int +pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst, + struct pf_state **state, struct pf_pdesc *pd, u_short *reason) +{ + struct tcphdr *th = pd->hdr.tcp; + + if (th->th_flags & TH_SYN) + if (src->state < TCPS_SYN_SENT) + src->state = TCPS_SYN_SENT; + if (th->th_flags & TH_FIN) + if (src->state < TCPS_CLOSING) + src->state = TCPS_CLOSING; + if (th->th_flags & TH_ACK) { + if (dst->state == TCPS_SYN_SENT) { + dst->state = TCPS_ESTABLISHED; + if (src->state == TCPS_ESTABLISHED && + (*state)->src_node != NULL && + pf_src_connlimit(state)) { + REASON_SET(reason, PFRES_SRCLIMIT); + return (PF_DROP); + } + } else if (dst->state == TCPS_CLOSING) { + dst->state = TCPS_FIN_WAIT_2; + } else if (src->state == TCPS_SYN_SENT && + dst->state < TCPS_SYN_SENT) { + /* + * Handle a special sloppy case where we only see one + * half of the connection. If there is a ACK after + * the initial SYN without ever seeing a packet from + * the destination, set the connection to established. + */ + dst->state = src->state = TCPS_ESTABLISHED; + if ((*state)->src_node != NULL && + pf_src_connlimit(state)) { + REASON_SET(reason, PFRES_SRCLIMIT); + return (PF_DROP); + } + } else if (src->state == TCPS_CLOSING && + dst->state == TCPS_ESTABLISHED && + dst->seqlo == 0) { + /* + * Handle the closing of half connections where we + * don't see the full bidirectional FIN/ACK+ACK + * handshake. + */ + dst->state = TCPS_CLOSING; + } + } + if (th->th_flags & TH_RST) + src->state = dst->state = TCPS_TIME_WAIT; + + /* update expire time */ + (*state)->expire = time_second; + if (src->state >= TCPS_FIN_WAIT_2 && + dst->state >= TCPS_FIN_WAIT_2) + (*state)->timeout = PFTM_TCP_CLOSED; + else if (src->state >= TCPS_CLOSING && + dst->state >= TCPS_CLOSING) + (*state)->timeout = PFTM_TCP_FIN_WAIT; + else if (src->state < TCPS_ESTABLISHED || + dst->state < TCPS_ESTABLISHED) + (*state)->timeout = PFTM_TCP_OPENING; + else if (src->state >= TCPS_CLOSING || + dst->state >= TCPS_CLOSING) + (*state)->timeout = PFTM_TCP_CLOSING; + else + (*state)->timeout = PFTM_TCP_ESTABLISHED; + + return (PF_PASS); +} + + +int +pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, + struct mbuf *m, int off, void *h, struct pf_pdesc *pd, + u_short *reason) +{ + struct pf_state_cmp key; + struct tcphdr *th = pd->hdr.tcp; + int copyback = 0; + struct pf_state_peer *src, *dst; + + key.af = pd->af; + key.proto = IPPROTO_TCP; + if (direction == PF_IN) { + PF_ACPY(&key.ext.addr, pd->src, key.af); + PF_ACPY(&key.gwy.addr, pd->dst, key.af); + key.ext.port = th->th_sport; + key.gwy.port = th->th_dport; + } else { + PF_ACPY(&key.lan.addr, pd->src, key.af); + PF_ACPY(&key.ext.addr, pd->dst, key.af); + key.lan.port = th->th_sport; + key.ext.port = th->th_dport; + } + + STATE_LOOKUP(); + + if (direction == (*state)->direction) { + src = &(*state)->src; + dst = &(*state)->dst; + } else { + src = &(*state)->dst; + dst = &(*state)->src; + } + + if ((*state)->src.state == PF_TCPS_PROXY_SRC) { + if (direction != (*state)->direction) { + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } + if (th->th_flags & TH_SYN) { + if (ntohl(th->th_seq) != (*state)->src.seqlo) { + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_DROP); + } +#ifdef __FreeBSD__ + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, +#else + pf_send_tcp((*state)->rule.ptr, pd->af, 
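+/*
+ * Editor's note -- illustration only, not part of the imported pf
+ * source.  In synproxy mode pf first completes the handshake with the
+ * client itself (PF_TCPS_PROXY_SRC, answered with a random ISN), then
+ * runs its own handshake to the server (PF_TCPS_PROXY_DST, handled
+ * just below).  Each leg starts from a different ISN, so once both
+ * legs are up the code stitches them together with constant offsets
+ * and lets the ordinary sequence-modulation path do the rest.
+ * Roughly, ignoring the octet each SYN consumes, with client ISN C,
+ * pf's client-leg ISN P (src.seqhi), pf's server-leg ISN Q
+ * (dst.seqhi) and server ISN S (dst.seqlo):
+ *
+ *	client -> server:  th_seq += (Q - C)	(src.seqdiff)
+ *	                   th_ack -= (P - S)	(dst.seqdiff)
+ *	server -> client:  the same two shifts, applied the other
+ *	                   way around
+ */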
pd->dst, +#endif + pd->src, th->th_dport, th->th_sport, + (*state)->src.seqhi, ntohl(th->th_seq) + 1, + TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, + 0, NULL, NULL); + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } else if (!(th->th_flags & TH_ACK) || + (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || + (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_DROP); + } else if ((*state)->src_node != NULL && + pf_src_connlimit(state)) { + REASON_SET(reason, PFRES_SRCLIMIT); + return (PF_DROP); + } else + (*state)->src.state = PF_TCPS_PROXY_DST; + } + if ((*state)->src.state == PF_TCPS_PROXY_DST) { + struct pf_state_host *src, *dst; + + if (direction == PF_OUT) { + src = &(*state)->gwy; + dst = &(*state)->ext; + } else { + src = &(*state)->ext; + dst = &(*state)->lan; + } + if (direction == (*state)->direction) { + if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) || + (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || + (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_DROP); + } + (*state)->src.max_win = MAX(ntohs(th->th_win), 1); + if ((*state)->dst.seqhi == 1) + (*state)->dst.seqhi = htonl(arc4random()); +#ifdef __FreeBSD__ + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, + &src->addr, +#else + pf_send_tcp((*state)->rule.ptr, pd->af, &src->addr, +#endif + &dst->addr, src->port, dst->port, + (*state)->dst.seqhi, 0, TH_SYN, 0, + (*state)->src.mss, 0, 0, (*state)->tag, NULL, NULL); + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } else if (((th->th_flags & (TH_SYN|TH_ACK)) != + (TH_SYN|TH_ACK)) || + (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) { + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_DROP); + } else { + (*state)->dst.max_win = MAX(ntohs(th->th_win), 1); + (*state)->dst.seqlo = ntohl(th->th_seq); +#ifdef __FreeBSD__ + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, +#else + pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst, +#endif + pd->src, th->th_dport, th->th_sport, + ntohl(th->th_ack), ntohl(th->th_seq) + 1, + TH_ACK, (*state)->src.max_win, 0, 0, 0, + (*state)->tag, NULL, NULL); +#ifdef __FreeBSD__ + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, + &src->addr, +#else + pf_send_tcp((*state)->rule.ptr, pd->af, &src->addr, +#endif + &dst->addr, src->port, dst->port, + (*state)->src.seqhi + 1, (*state)->src.seqlo + 1, + TH_ACK, (*state)->dst.max_win, 0, 0, 1, + 0, NULL, NULL); + (*state)->src.seqdiff = (*state)->dst.seqhi - + (*state)->src.seqlo; + (*state)->dst.seqdiff = (*state)->src.seqhi - + (*state)->dst.seqlo; + (*state)->src.seqhi = (*state)->src.seqlo + + (*state)->dst.max_win; + (*state)->dst.seqhi = (*state)->dst.seqlo + + (*state)->src.max_win; + (*state)->src.wscale = (*state)->dst.wscale = 0; + (*state)->src.state = (*state)->dst.state = + TCPS_ESTABLISHED; + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } + } + + if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) && + dst->state >= TCPS_FIN_WAIT_2 && + src->state >= TCPS_FIN_WAIT_2) { + if (pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: state reuse "); + pf_print_state(*state); + pf_print_flags(th->th_flags); + printf("\n"); + } + /* XXX make sure it's the same direction ?? 
*/ + (*state)->src.state = (*state)->dst.state = TCPS_CLOSED; + pf_unlink_state(*state); + *state = NULL; + return (PF_DROP); + } + + if ((*state)->state_flags & PFSTATE_SLOPPY) { + if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP) + return (PF_DROP); + } else { + if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason, + ©back) == PF_DROP) + return (PF_DROP); + } + + /* translate source/destination address, if necessary */ + if (STATE_TRANSLATE(*state)) { + if (direction == PF_OUT) + pf_change_ap(pd->src, &th->th_sport, pd->ip_sum, + &th->th_sum, &(*state)->gwy.addr, + (*state)->gwy.port, 0, pd->af); + else + pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum, + &th->th_sum, &(*state)->lan.addr, + (*state)->lan.port, 0, pd->af); + m_copyback(m, off, sizeof(*th), (caddr_t)th); + } else if (copyback) { + /* Copyback sequence modulation or stateful scrub changes */ + m_copyback(m, off, sizeof(*th), (caddr_t)th); + } + + return (PF_PASS); +} + +int +pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, + struct mbuf *m, int off, void *h, struct pf_pdesc *pd) +{ + struct pf_state_peer *src, *dst; + struct pf_state_cmp key; + struct udphdr *uh = pd->hdr.udp; + + key.af = pd->af; + key.proto = IPPROTO_UDP; + if (direction == PF_IN) { + PF_ACPY(&key.ext.addr, pd->src, key.af); + PF_ACPY(&key.gwy.addr, pd->dst, key.af); + key.ext.port = uh->uh_sport; + key.gwy.port = uh->uh_dport; + } else { + PF_ACPY(&key.lan.addr, pd->src, key.af); + PF_ACPY(&key.ext.addr, pd->dst, key.af); + key.lan.port = uh->uh_sport; + key.ext.port = uh->uh_dport; + } + + STATE_LOOKUP(); + + if (direction == (*state)->direction) { + src = &(*state)->src; + dst = &(*state)->dst; + } else { + src = &(*state)->dst; + dst = &(*state)->src; + } + + /* update states */ + if (src->state < PFUDPS_SINGLE) + src->state = PFUDPS_SINGLE; + if (dst->state == PFUDPS_SINGLE) + dst->state = PFUDPS_MULTIPLE; + + /* update expire time */ + (*state)->expire = time_second; + if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE) + (*state)->timeout = PFTM_UDP_MULTIPLE; + else + (*state)->timeout = PFTM_UDP_SINGLE; + + /* translate source/destination address, if necessary */ + if (STATE_TRANSLATE(*state)) { + if (direction == PF_OUT) + pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum, + &uh->uh_sum, &(*state)->gwy.addr, + (*state)->gwy.port, 1, pd->af); + else + pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum, + &uh->uh_sum, &(*state)->lan.addr, + (*state)->lan.port, 1, pd->af); + m_copyback(m, off, sizeof(*uh), (caddr_t)uh); + } + + return (PF_PASS); +} + +int +pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, + struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) +{ + struct pf_addr *saddr = pd->src, *daddr = pd->dst; + u_int16_t icmpid = 0; /* make the compiler happy */ + u_int16_t *icmpsum = NULL; /* make the compiler happy */ + u_int8_t icmptype = 0; /* make the compiler happy */ + int state_icmp = 0; + struct pf_state_cmp key; + + switch (pd->proto) { +#ifdef INET + case IPPROTO_ICMP: + icmptype = pd->hdr.icmp->icmp_type; + icmpid = pd->hdr.icmp->icmp_id; + icmpsum = &pd->hdr.icmp->icmp_cksum; + + if (icmptype == ICMP_UNREACH || + icmptype == ICMP_SOURCEQUENCH || + icmptype == ICMP_REDIRECT || + icmptype == ICMP_TIMXCEED || + icmptype == ICMP_PARAMPROB) + state_icmp++; + break; +#endif /* INET */ +#ifdef INET6 + case IPPROTO_ICMPV6: + icmptype = pd->hdr.icmp6->icmp6_type; + icmpid = pd->hdr.icmp6->icmp6_id; + icmpsum = 
&pd->hdr.icmp6->icmp6_cksum; + + if (icmptype == ICMP6_DST_UNREACH || + icmptype == ICMP6_PACKET_TOO_BIG || + icmptype == ICMP6_TIME_EXCEEDED || + icmptype == ICMP6_PARAM_PROB) + state_icmp++; + break; +#endif /* INET6 */ + } + + if (!state_icmp) { + + /* + * ICMP query/reply message not related to a TCP/UDP packet. + * Search for an ICMP state. + */ + key.af = pd->af; + key.proto = pd->proto; + if (direction == PF_IN) { + PF_ACPY(&key.ext.addr, pd->src, key.af); + PF_ACPY(&key.gwy.addr, pd->dst, key.af); + key.ext.port = 0; + key.gwy.port = icmpid; + } else { + PF_ACPY(&key.lan.addr, pd->src, key.af); + PF_ACPY(&key.ext.addr, pd->dst, key.af); + key.lan.port = icmpid; + key.ext.port = 0; + } + + STATE_LOOKUP(); + + (*state)->expire = time_second; + (*state)->timeout = PFTM_ICMP_ERROR_REPLY; + + /* translate source/destination address, if necessary */ + if (STATE_TRANSLATE(*state)) { + if (direction == PF_OUT) { + switch (pd->af) { +#ifdef INET + case AF_INET: + pf_change_a(&saddr->v4.s_addr, + pd->ip_sum, + (*state)->gwy.addr.v4.s_addr, 0); + pd->hdr.icmp->icmp_cksum = + pf_cksum_fixup( + pd->hdr.icmp->icmp_cksum, icmpid, + (*state)->gwy.port, 0); + pd->hdr.icmp->icmp_id = + (*state)->gwy.port; + m_copyback(m, off, ICMP_MINLEN, + (caddr_t)pd->hdr.icmp); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + pf_change_a6(saddr, + &pd->hdr.icmp6->icmp6_cksum, + &(*state)->gwy.addr, 0); + m_copyback(m, off, + sizeof(struct icmp6_hdr), + (caddr_t)pd->hdr.icmp6); + break; +#endif /* INET6 */ + } + } else { + switch (pd->af) { +#ifdef INET + case AF_INET: + pf_change_a(&daddr->v4.s_addr, + pd->ip_sum, + (*state)->lan.addr.v4.s_addr, 0); + pd->hdr.icmp->icmp_cksum = + pf_cksum_fixup( + pd->hdr.icmp->icmp_cksum, icmpid, + (*state)->lan.port, 0); + pd->hdr.icmp->icmp_id = + (*state)->lan.port; + m_copyback(m, off, ICMP_MINLEN, + (caddr_t)pd->hdr.icmp); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + pf_change_a6(daddr, + &pd->hdr.icmp6->icmp6_cksum, + &(*state)->lan.addr, 0); + m_copyback(m, off, + sizeof(struct icmp6_hdr), + (caddr_t)pd->hdr.icmp6); + break; +#endif /* INET6 */ + } + } + } + + return (PF_PASS); + + } else { + /* + * ICMP error message in response to a TCP/UDP packet. + * Extract the inner TCP/UDP header and search for that state. 
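+		 *
+		 * For the IPv4 case the offsets computed below
+		 * correspond to this layout (the embedded datagram is
+		 * truncated by the sender of the ICMP error, so only
+		 * its leading bytes are available):
+		 *
+		 *	outer IP  ICMP hdr  inner IP (h2)  inner TCP/UDP
+		 *	0         off       ipoff2         off2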
+ */ + + struct pf_pdesc pd2; +#ifdef INET + struct ip h2; +#endif /* INET */ +#ifdef INET6 + struct ip6_hdr h2_6; + int terminal = 0; +#endif /* INET6 */ + int ipoff2 = 0; /* make the compiler happy */ + int off2 = 0; /* make the compiler happy */ + + pd2.af = pd->af; + switch (pd->af) { +#ifdef INET + case AF_INET: + /* offset of h2 in mbuf chain */ + ipoff2 = off + ICMP_MINLEN; + + if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2), + NULL, reason, pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(ip)\n")); + return (PF_DROP); + } + /* + * ICMP error messages don't refer to non-first + * fragments + */ + if (h2.ip_off & htons(IP_OFFMASK)) { + REASON_SET(reason, PFRES_FRAG); + return (PF_DROP); + } + + /* offset of protocol header that follows h2 */ + off2 = ipoff2 + (h2.ip_hl << 2); + + pd2.proto = h2.ip_p; + pd2.src = (struct pf_addr *)&h2.ip_src; + pd2.dst = (struct pf_addr *)&h2.ip_dst; + pd2.ip_sum = &h2.ip_sum; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + ipoff2 = off + sizeof(struct icmp6_hdr); + + if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6), + NULL, reason, pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(ip6)\n")); + return (PF_DROP); + } + pd2.proto = h2_6.ip6_nxt; + pd2.src = (struct pf_addr *)&h2_6.ip6_src; + pd2.dst = (struct pf_addr *)&h2_6.ip6_dst; + pd2.ip_sum = NULL; + off2 = ipoff2 + sizeof(h2_6); + do { + switch (pd2.proto) { + case IPPROTO_FRAGMENT: + /* + * ICMPv6 error messages for + * non-first fragments + */ + REASON_SET(reason, PFRES_FRAG); + return (PF_DROP); + case IPPROTO_AH: + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: { + /* get next header and header length */ + struct ip6_ext opt6; + + if (!pf_pull_hdr(m, off2, &opt6, + sizeof(opt6), NULL, reason, + pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMPv6 short opt\n")); + return (PF_DROP); + } + if (pd2.proto == IPPROTO_AH) + off2 += (opt6.ip6e_len + 2) * 4; + else + off2 += (opt6.ip6e_len + 1) * 8; + pd2.proto = opt6.ip6e_nxt; + /* goto the next header */ + break; + } + default: + terminal++; + break; + } + } while (!terminal); + break; +#endif /* INET6 */ +#ifdef __FreeBSD__ + default: + panic("AF not supported: %d", pd->af); +#endif + } + + switch (pd2.proto) { + case IPPROTO_TCP: { + struct tcphdr th; + u_int32_t seq; + struct pf_state_peer *src, *dst; + u_int8_t dws; + int copyback = 0; + + /* + * Only the first 8 bytes of the TCP header can be + * expected. Don't access any TCP header fields after + * th_seq, an ackskew test is not possible. 
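+			 * (Those 8 bytes cover exactly th_sport,
+			 * th_dport and th_seq, 2 + 2 + 4 bytes, which
+			 * is why only the sequence number is checked
+			 * against the window below.)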
+ */ + if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason, + pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(tcp)\n")); + return (PF_DROP); + } + + key.af = pd2.af; + key.proto = IPPROTO_TCP; + if (direction == PF_IN) { + PF_ACPY(&key.ext.addr, pd2.dst, key.af); + PF_ACPY(&key.gwy.addr, pd2.src, key.af); + key.ext.port = th.th_dport; + key.gwy.port = th.th_sport; + } else { + PF_ACPY(&key.lan.addr, pd2.dst, key.af); + PF_ACPY(&key.ext.addr, pd2.src, key.af); + key.lan.port = th.th_dport; + key.ext.port = th.th_sport; + } + + STATE_LOOKUP(); + + if (direction == (*state)->direction) { + src = &(*state)->dst; + dst = &(*state)->src; + } else { + src = &(*state)->src; + dst = &(*state)->dst; + } + + if (src->wscale && dst->wscale) + dws = dst->wscale & PF_WSCALE_MASK; + else + dws = 0; + + /* Demodulate sequence number */ + seq = ntohl(th.th_seq) - src->seqdiff; + if (src->seqdiff) { + pf_change_a(&th.th_seq, icmpsum, + htonl(seq), 0); + copyback = 1; + } + + if (!((*state)->state_flags & PFSTATE_SLOPPY) && + (!SEQ_GEQ(src->seqhi, seq) || + !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) { + if (pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: BAD ICMP %d:%d ", + icmptype, pd->hdr.icmp->icmp_code); + pf_print_host(pd->src, 0, pd->af); + printf(" -> "); + pf_print_host(pd->dst, 0, pd->af); + printf(" state: "); + pf_print_state(*state); + printf(" seq=%u\n", seq); + } + REASON_SET(reason, PFRES_BADSTATE); + return (PF_DROP); + } + + if (STATE_TRANSLATE(*state)) { + if (direction == PF_IN) { + pf_change_icmp(pd2.src, &th.th_sport, + daddr, &(*state)->lan.addr, + (*state)->lan.port, NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, pd2.af); + } else { + pf_change_icmp(pd2.dst, &th.th_dport, + saddr, &(*state)->gwy.addr, + (*state)->gwy.port, NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, pd2.af); + } + copyback = 1; + } + + if (copyback) { + switch (pd2.af) { +#ifdef INET + case AF_INET: + m_copyback(m, off, ICMP_MINLEN, + (caddr_t)pd->hdr.icmp); + m_copyback(m, ipoff2, sizeof(h2), + (caddr_t)&h2); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + m_copyback(m, off, + sizeof(struct icmp6_hdr), + (caddr_t)pd->hdr.icmp6); + m_copyback(m, ipoff2, sizeof(h2_6), + (caddr_t)&h2_6); + break; +#endif /* INET6 */ + } + m_copyback(m, off2, 8, (caddr_t)&th); + } + + return (PF_PASS); + break; + } + case IPPROTO_UDP: { + struct udphdr uh; + + if (!pf_pull_hdr(m, off2, &uh, sizeof(uh), + NULL, reason, pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(udp)\n")); + return (PF_DROP); + } + + key.af = pd2.af; + key.proto = IPPROTO_UDP; + if (direction == PF_IN) { + PF_ACPY(&key.ext.addr, pd2.dst, key.af); + PF_ACPY(&key.gwy.addr, pd2.src, key.af); + key.ext.port = uh.uh_dport; + key.gwy.port = uh.uh_sport; + } else { + PF_ACPY(&key.lan.addr, pd2.dst, key.af); + PF_ACPY(&key.ext.addr, pd2.src, key.af); + key.lan.port = uh.uh_dport; + key.ext.port = uh.uh_sport; + } + + STATE_LOOKUP(); + + if (STATE_TRANSLATE(*state)) { + if (direction == PF_IN) { + pf_change_icmp(pd2.src, &uh.uh_sport, + daddr, &(*state)->lan.addr, + (*state)->lan.port, &uh.uh_sum, + pd2.ip_sum, icmpsum, + pd->ip_sum, 1, pd2.af); + } else { + pf_change_icmp(pd2.dst, &uh.uh_dport, + saddr, &(*state)->gwy.addr, + (*state)->gwy.port, &uh.uh_sum, + pd2.ip_sum, icmpsum, + pd->ip_sum, 1, pd2.af); + } + switch (pd2.af) { +#ifdef INET + case AF_INET: + m_copyback(m, off, ICMP_MINLEN, + (caddr_t)pd->hdr.icmp); + m_copyback(m, ipoff2, sizeof(h2), + (caddr_t)&h2); + break; +#endif /* 
INET */ +#ifdef INET6 + case AF_INET6: + m_copyback(m, off, + sizeof(struct icmp6_hdr), + (caddr_t)pd->hdr.icmp6); + m_copyback(m, ipoff2, sizeof(h2_6), + (caddr_t)&h2_6); + break; +#endif /* INET6 */ + } + m_copyback(m, off2, sizeof(uh), + (caddr_t)&uh); + } + + return (PF_PASS); + break; + } +#ifdef INET + case IPPROTO_ICMP: { + struct icmp iih; + + if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN, + NULL, reason, pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short i" + "(icmp)\n")); + return (PF_DROP); + } + + key.af = pd2.af; + key.proto = IPPROTO_ICMP; + if (direction == PF_IN) { + PF_ACPY(&key.ext.addr, pd2.dst, key.af); + PF_ACPY(&key.gwy.addr, pd2.src, key.af); + key.ext.port = 0; + key.gwy.port = iih.icmp_id; + } else { + PF_ACPY(&key.lan.addr, pd2.dst, key.af); + PF_ACPY(&key.ext.addr, pd2.src, key.af); + key.lan.port = iih.icmp_id; + key.ext.port = 0; + } + + STATE_LOOKUP(); + + if (STATE_TRANSLATE(*state)) { + if (direction == PF_IN) { + pf_change_icmp(pd2.src, &iih.icmp_id, + daddr, &(*state)->lan.addr, + (*state)->lan.port, NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, AF_INET); + } else { + pf_change_icmp(pd2.dst, &iih.icmp_id, + saddr, &(*state)->gwy.addr, + (*state)->gwy.port, NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, AF_INET); + } + m_copyback(m, off, ICMP_MINLEN, + (caddr_t)pd->hdr.icmp); + m_copyback(m, ipoff2, sizeof(h2), + (caddr_t)&h2); + m_copyback(m, off2, ICMP_MINLEN, + (caddr_t)&iih); + } + + return (PF_PASS); + break; + } +#endif /* INET */ +#ifdef INET6 + case IPPROTO_ICMPV6: { + struct icmp6_hdr iih; + + if (!pf_pull_hdr(m, off2, &iih, + sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(icmp6)\n")); + return (PF_DROP); + } + + key.af = pd2.af; + key.proto = IPPROTO_ICMPV6; + if (direction == PF_IN) { + PF_ACPY(&key.ext.addr, pd2.dst, key.af); + PF_ACPY(&key.gwy.addr, pd2.src, key.af); + key.ext.port = 0; + key.gwy.port = iih.icmp6_id; + } else { + PF_ACPY(&key.lan.addr, pd2.dst, key.af); + PF_ACPY(&key.ext.addr, pd2.src, key.af); + key.lan.port = iih.icmp6_id; + key.ext.port = 0; + } + + STATE_LOOKUP(); + + if (STATE_TRANSLATE(*state)) { + if (direction == PF_IN) { + pf_change_icmp(pd2.src, &iih.icmp6_id, + daddr, &(*state)->lan.addr, + (*state)->lan.port, NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, AF_INET6); + } else { + pf_change_icmp(pd2.dst, &iih.icmp6_id, + saddr, &(*state)->gwy.addr, + (*state)->gwy.port, NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, AF_INET6); + } + m_copyback(m, off, sizeof(struct icmp6_hdr), + (caddr_t)pd->hdr.icmp6); + m_copyback(m, ipoff2, sizeof(h2_6), + (caddr_t)&h2_6); + m_copyback(m, off2, sizeof(struct icmp6_hdr), + (caddr_t)&iih); + } + + return (PF_PASS); + break; + } +#endif /* INET6 */ + default: { + key.af = pd2.af; + key.proto = pd2.proto; + if (direction == PF_IN) { + PF_ACPY(&key.ext.addr, pd2.dst, key.af); + PF_ACPY(&key.gwy.addr, pd2.src, key.af); + key.ext.port = 0; + key.gwy.port = 0; + } else { + PF_ACPY(&key.lan.addr, pd2.dst, key.af); + PF_ACPY(&key.ext.addr, pd2.src, key.af); + key.lan.port = 0; + key.ext.port = 0; + } + + STATE_LOOKUP(); + + if (STATE_TRANSLATE(*state)) { + if (direction == PF_IN) { + pf_change_icmp(pd2.src, NULL, + daddr, &(*state)->lan.addr, + 0, NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, pd2.af); + } else { + pf_change_icmp(pd2.dst, NULL, + saddr, &(*state)->gwy.addr, + 0, NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, pd2.af); + } + switch (pd2.af) { +#ifdef INET + case AF_INET: + m_copyback(m, 
off, ICMP_MINLEN, + (caddr_t)pd->hdr.icmp); + m_copyback(m, ipoff2, sizeof(h2), + (caddr_t)&h2); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + m_copyback(m, off, + sizeof(struct icmp6_hdr), + (caddr_t)pd->hdr.icmp6); + m_copyback(m, ipoff2, sizeof(h2_6), + (caddr_t)&h2_6); + break; +#endif /* INET6 */ + } + } + + return (PF_PASS); + break; + } + } + } +} + +int +pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif, + struct pf_pdesc *pd) +{ + struct pf_state_peer *src, *dst; + struct pf_state_cmp key; + + key.af = pd->af; + key.proto = pd->proto; + if (direction == PF_IN) { + PF_ACPY(&key.ext.addr, pd->src, key.af); + PF_ACPY(&key.gwy.addr, pd->dst, key.af); + key.ext.port = 0; + key.gwy.port = 0; + } else { + PF_ACPY(&key.lan.addr, pd->src, key.af); + PF_ACPY(&key.ext.addr, pd->dst, key.af); + key.lan.port = 0; + key.ext.port = 0; + } + + STATE_LOOKUP(); + + if (direction == (*state)->direction) { + src = &(*state)->src; + dst = &(*state)->dst; + } else { + src = &(*state)->dst; + dst = &(*state)->src; + } + + /* update states */ + if (src->state < PFOTHERS_SINGLE) + src->state = PFOTHERS_SINGLE; + if (dst->state == PFOTHERS_SINGLE) + dst->state = PFOTHERS_MULTIPLE; + + /* update expire time */ + (*state)->expire = time_second; + if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE) + (*state)->timeout = PFTM_OTHER_MULTIPLE; + else + (*state)->timeout = PFTM_OTHER_SINGLE; + + /* translate source/destination address, if necessary */ + if (STATE_TRANSLATE(*state)) { + if (direction == PF_OUT) + switch (pd->af) { +#ifdef INET + case AF_INET: + pf_change_a(&pd->src->v4.s_addr, + pd->ip_sum, (*state)->gwy.addr.v4.s_addr, + 0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + PF_ACPY(pd->src, &(*state)->gwy.addr, pd->af); + break; +#endif /* INET6 */ + } + else + switch (pd->af) { +#ifdef INET + case AF_INET: + pf_change_a(&pd->dst->v4.s_addr, + pd->ip_sum, (*state)->lan.addr.v4.s_addr, + 0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + PF_ACPY(pd->dst, &(*state)->lan.addr, pd->af); + break; +#endif /* INET6 */ + } + } + + return (PF_PASS); +} + +/* + * ipoff and off are measured from the start of the mbuf chain. + * h must be at "ipoff" on the mbuf chain. 
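+ *
+ * Returns p on success; on failure it sets *actionp/*reasonp (when
+ * non-NULL) and returns NULL, so a typical caller, as in the protocol
+ * switch of pf_test(), reads:
+ *
+ *	struct tcphdr th;
+ *
+ *	pd.hdr.tcp = &th;
+ *	if (!pf_pull_hdr(m, off, &th, sizeof(th),
+ *	    &action, &reason, AF_INET)) {
+ *		log = action != PF_PASS;
+ *		goto done;
+ *	}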
+ */ +void * +pf_pull_hdr(struct mbuf *m, int off, void *p, int len, + u_short *actionp, u_short *reasonp, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: { + struct ip *h = mtod(m, struct ip *); + u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; + + if (fragoff) { + if (fragoff >= len) + ACTION_SET(actionp, PF_PASS); + else { + ACTION_SET(actionp, PF_DROP); + REASON_SET(reasonp, PFRES_FRAG); + } + return (NULL); + } + if (m->m_pkthdr.len < off + len || + ntohs(h->ip_len) < off + len) { + ACTION_SET(actionp, PF_DROP); + REASON_SET(reasonp, PFRES_SHORT); + return (NULL); + } + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + + if (m->m_pkthdr.len < off + len || + (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) < + (unsigned)(off + len)) { + ACTION_SET(actionp, PF_DROP); + REASON_SET(reasonp, PFRES_SHORT); + return (NULL); + } + break; + } +#endif /* INET6 */ + } + m_copydata(m, off, len, p); + return (p); +} + +int +pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif) +{ + struct sockaddr_in *dst; + int ret = 1; + int check_mpath; +#ifndef __FreeBSD__ + extern int ipmultipath; +#endif +#ifdef INET6 +#ifndef __FreeBSD__ + extern int ip6_multipath; +#endif + struct sockaddr_in6 *dst6; + struct route_in6 ro; +#else + struct route ro; +#endif + struct radix_node *rn; + struct rtentry *rt; + struct ifnet *ifp; + + check_mpath = 0; + bzero(&ro, sizeof(ro)); + switch (af) { + case AF_INET: + dst = satosin(&ro.ro_dst); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = addr->v4; +#ifndef __FreeBSD__ /* MULTIPATH_ROUTING */ + if (ipmultipath) + check_mpath = 1; +#endif + break; +#ifdef INET6 + case AF_INET6: + dst6 = (struct sockaddr_in6 *)&ro.ro_dst; + dst6->sin6_family = AF_INET6; + dst6->sin6_len = sizeof(*dst6); + dst6->sin6_addr = addr->v6; +#ifndef __FreeBSD__ /* MULTIPATH_ROUTING */ + if (ip6_multipath) + check_mpath = 1; +#endif + break; +#endif /* INET6 */ + default: + return (0); + } + + /* Skip checks for ipsec interfaces */ + if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) + goto out; + +#ifdef __FreeBSD__ +/* XXX MRT not always INET */ /* stick with table 0 though */ + if (af == AF_INET) + in_rtalloc_ign((struct route *)&ro, 0, 0); + else + rtalloc_ign((struct route *)&ro, 0); +#else /* ! 
__FreeBSD__ */ + rtalloc_noclone((struct route *)&ro, NO_CLONING); +#endif + + if (ro.ro_rt != NULL) { + /* No interface given, this is a no-route check */ + if (kif == NULL) + goto out; + + if (kif->pfik_ifp == NULL) { + ret = 0; + goto out; + } + + /* Perform uRPF check if passed input interface */ + ret = 0; + rn = (struct radix_node *)ro.ro_rt; + do { + rt = (struct rtentry *)rn; +#ifndef __FreeBSD__ /* CARPDEV */ + if (rt->rt_ifp->if_type == IFT_CARP) + ifp = rt->rt_ifp->if_carpdev; + else +#endif + ifp = rt->rt_ifp; + + if (kif->pfik_ifp == ifp) + ret = 1; +#ifdef __FreeBSD__ /* MULTIPATH_ROUTING */ + rn = NULL; +#else + rn = rn_mpath_next(rn); +#endif + } while (check_mpath == 1 && rn != NULL && ret == 0); + } else + ret = 0; +out: + if (ro.ro_rt != NULL) + RTFREE(ro.ro_rt); + return (ret); +} + +int +pf_rtlabel_match(struct pf_addr *addr, sa_family_t af, struct pf_addr_wrap *aw) +{ + struct sockaddr_in *dst; +#ifdef INET6 + struct sockaddr_in6 *dst6; + struct route_in6 ro; +#else + struct route ro; +#endif + int ret = 0; + + bzero(&ro, sizeof(ro)); + switch (af) { + case AF_INET: + dst = satosin(&ro.ro_dst); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = addr->v4; + break; +#ifdef INET6 + case AF_INET6: + dst6 = (struct sockaddr_in6 *)&ro.ro_dst; + dst6->sin6_family = AF_INET6; + dst6->sin6_len = sizeof(*dst6); + dst6->sin6_addr = addr->v6; + break; +#endif /* INET6 */ + default: + return (0); + } + +#ifdef __FreeBSD__ +# ifdef RTF_PRCLONING + rtalloc_ign((struct route *)&ro, (RTF_CLONING|RTF_PRCLONING)); +# else /* !RTF_PRCLONING */ + if (af == AF_INET) + in_rtalloc_ign((struct route *)&ro, 0, 0); + else + rtalloc_ign((struct route *)&ro, 0); +# endif +#else /* ! __FreeBSD__ */ + rtalloc_noclone((struct route *)&ro, NO_CLONING); +#endif + + if (ro.ro_rt != NULL) { +#ifdef __FreeBSD__ + /* XXX_IMPORT: later */ +#else + if (ro.ro_rt->rt_labelid == aw->v.rtlabel) + ret = 1; +#endif + RTFREE(ro.ro_rt); + } + + return (ret); +} + +#ifdef INET + +void +pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, + struct pf_state *s, struct pf_pdesc *pd) +{ + struct mbuf *m0, *m1; + struct route iproute; + struct route *ro = NULL; + struct sockaddr_in *dst; + struct ip *ip; + struct ifnet *ifp = NULL; + struct pf_addr naddr; + struct pf_src_node *sn = NULL; + int error = 0; +#ifdef __FreeBSD__ + int sw_csum; +#endif +#ifdef IPSEC + struct m_tag *mtag; +#endif /* IPSEC */ + + if (m == NULL || *m == NULL || r == NULL || + (dir != PF_IN && dir != PF_OUT) || oifp == NULL) + panic("pf_route: invalid parameters"); + + if (pd->pf_mtag->routed++ > 3) { + m0 = *m; + *m = NULL; + goto bad; + } + + if (r->rt == PF_DUPTO) { +#ifdef __FreeBSD__ + if ((m0 = m_dup(*m, M_DONTWAIT)) == NULL) +#else + if ((m0 = m_copym2(*m, 0, M_COPYALL, M_NOWAIT)) == NULL) +#endif + return; + } else { + if ((r->rt == PF_REPLYTO) == (r->direction == dir)) + return; + m0 = *m; + } + + if (m0->m_len < sizeof(struct ip)) { + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_route: m0->m_len < sizeof(struct ip)\n")); + goto bad; + } + + ip = mtod(m0, struct ip *); + + ro = &iproute; + bzero((caddr_t)ro, sizeof(*ro)); + dst = satosin(&ro->ro_dst); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = ip->ip_dst; + + if (r->rt == PF_FASTROUTE) { + in_rtalloc(ro, 0); + if (ro->ro_rt == 0) { + KMOD_IPSTAT_INC(ips_noroute); + goto bad; + } + + ifp = ro->ro_rt->rt_ifp; + ro->ro_rt->rt_use++; + + if (ro->ro_rt->rt_flags & RTF_GATEWAY) + dst = satosin(ro->ro_rt->rt_gateway); + } 
else { + if (TAILQ_EMPTY(&r->rpool.list)) { + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_route: TAILQ_EMPTY(&r->rpool.list)\n")); + goto bad; + } + if (s == NULL) { + pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src, + &naddr, NULL, &sn); + if (!PF_AZERO(&naddr, AF_INET)) + dst->sin_addr.s_addr = naddr.v4.s_addr; + ifp = r->rpool.cur->kif ? + r->rpool.cur->kif->pfik_ifp : NULL; + } else { + if (!PF_AZERO(&s->rt_addr, AF_INET)) + dst->sin_addr.s_addr = + s->rt_addr.v4.s_addr; + ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; + } + } + if (ifp == NULL) + goto bad; + + if (oifp != ifp) { +#ifdef __FreeBSD__ + PF_UNLOCK(); + if (pf_test(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) { + PF_LOCK(); + goto bad; + } else if (m0 == NULL) { + PF_LOCK(); + goto done; + } + PF_LOCK(); +#else + if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS) + goto bad; + else if (m0 == NULL) + goto done; +#endif + if (m0->m_len < sizeof(struct ip)) { + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_route: m0->m_len < sizeof(struct ip)\n")); + goto bad; + } + ip = mtod(m0, struct ip *); + } + +#ifdef __FreeBSD__ + /* Copied from FreeBSD 5.1-CURRENT ip_output. */ + m0->m_pkthdr.csum_flags |= CSUM_IP; + sw_csum = m0->m_pkthdr.csum_flags & ~ifp->if_hwassist; + if (sw_csum & CSUM_DELAY_DATA) { + /* + * XXX: in_delayed_cksum assumes HBO for ip->ip_len (at least) + */ + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); /* XXX: needed? */ + in_delayed_cksum(m0); + HTONS(ip->ip_len); + HTONS(ip->ip_off); + sw_csum &= ~CSUM_DELAY_DATA; + } + m0->m_pkthdr.csum_flags &= ifp->if_hwassist; + + if (ntohs(ip->ip_len) <= ifp->if_mtu || + (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || + (ifp->if_hwassist & CSUM_FRAGMENT && + ((ip->ip_off & htons(IP_DF)) == 0))) { + /* + * ip->ip_len = htons(ip->ip_len); + * ip->ip_off = htons(ip->ip_off); + */ + ip->ip_sum = 0; + if (sw_csum & CSUM_DELAY_IP) { + /* From KAME */ + if (ip->ip_v == IPVERSION && + (ip->ip_hl << 2) == sizeof(*ip)) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(m0, ip->ip_hl << 2); + } + } + PF_UNLOCK(); + error = (*ifp->if_output)(ifp, m0, sintosa(dst), ro); + PF_LOCK(); + goto done; + } + +#else + /* Copied from ip_output. */ +#ifdef IPSEC + /* + * If deferred crypto processing is needed, check that the + * interface supports it. + */ + if ((mtag = m_tag_find(m0, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL)) + != NULL && (ifp->if_capabilities & IFCAP_IPSEC) == 0) { + /* Notify IPsec to do its own crypto. */ + ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1)); + goto bad; + } +#endif /* IPSEC */ + + /* Catch routing changes wrt. hardware checksumming for TCP or UDP. 
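+	 * If the packet was rerouted to an interface that cannot do the
+	 * checksum in hardware (or sits on a bridge), finish the
+	 * deferred checksum in software with in_delayed_cksum() and
+	 * clear the offload flag before the packet goes out.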
*/ + if (m0->m_pkthdr.csum_flags & M_TCPV4_CSUM_OUT) { + if (!(ifp->if_capabilities & IFCAP_CSUM_TCPv4) || + ifp->if_bridge != NULL) { + in_delayed_cksum(m0); + m0->m_pkthdr.csum_flags &= ~M_TCPV4_CSUM_OUT; /* Clear */ + } + } else if (m0->m_pkthdr.csum_flags & M_UDPV4_CSUM_OUT) { + if (!(ifp->if_capabilities & IFCAP_CSUM_UDPv4) || + ifp->if_bridge != NULL) { + in_delayed_cksum(m0); + m0->m_pkthdr.csum_flags &= ~M_UDPV4_CSUM_OUT; /* Clear */ + } + } + + if (ntohs(ip->ip_len) <= ifp->if_mtu) { + if ((ifp->if_capabilities & IFCAP_CSUM_IPv4) && + ifp->if_bridge == NULL) { + m0->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; + KMOD_IPSTAT_INC(ips_outhwcsum); + } else { + ip->ip_sum = 0; + ip->ip_sum = in_cksum(m0, ip->ip_hl << 2); + } + /* Update relevant hardware checksum stats for TCP/UDP */ + if (m0->m_pkthdr.csum_flags & M_TCPV4_CSUM_OUT) + KMOD_TCPSTAT_INC(tcps_outhwcsum); + else if (m0->m_pkthdr.csum_flags & M_UDPV4_CSUM_OUT) + KMOD_UDPSTAT_INC(udps_outhwcsum); + error = (*ifp->if_output)(ifp, m0, sintosa(dst), NULL); + goto done; + } +#endif + /* + * Too large for interface; fragment if possible. + * Must be able to put at least 8 bytes per fragment. + */ + if (ip->ip_off & htons(IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) { + KMOD_IPSTAT_INC(ips_cantfrag); + if (r->rt != PF_DUPTO) { +#ifdef __FreeBSD__ + /* icmp_error() expects host byte ordering */ + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); + PF_UNLOCK(); + icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, + ifp->if_mtu); + PF_LOCK(); +#else + icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, + ifp->if_mtu); +#endif + goto done; + } else + goto bad; + } + + m1 = m0; +#ifdef __FreeBSD__ + /* + * XXX: is cheaper + less error prone than own function + */ + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); + error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist, sw_csum); +#else + error = ip_fragment(m0, ifp, ifp->if_mtu); +#endif + if (error) { +#ifndef __FreeBSD__ /* ip_fragment does not do m_freem() on FreeBSD */ + m0 = NULL; +#endif + goto bad; + } + + for (m0 = m1; m0; m0 = m1) { + m1 = m0->m_nextpkt; + m0->m_nextpkt = 0; +#ifdef __FreeBSD__ + if (error == 0) { + PF_UNLOCK(); + error = (*ifp->if_output)(ifp, m0, sintosa(dst), + NULL); + PF_LOCK(); + } else +#else + if (error == 0) + error = (*ifp->if_output)(ifp, m0, sintosa(dst), + NULL); + else +#endif + m_freem(m0); + } + + if (error == 0) + KMOD_IPSTAT_INC(ips_fragmented); + +done: + if (r->rt != PF_DUPTO) + *m = NULL; + if (ro == &iproute && ro->ro_rt) + RTFREE(ro->ro_rt); + return; + +bad: + m_freem(m0); + goto done; +} +#endif /* INET */ + +#ifdef INET6 +void +pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, + struct pf_state *s, struct pf_pdesc *pd) +{ + struct mbuf *m0; + struct route_in6 ip6route; + struct route_in6 *ro; + struct sockaddr_in6 *dst; + struct ip6_hdr *ip6; + struct ifnet *ifp = NULL; + struct pf_addr naddr; + struct pf_src_node *sn = NULL; + int error = 0; + + if (m == NULL || *m == NULL || r == NULL || + (dir != PF_IN && dir != PF_OUT) || oifp == NULL) + panic("pf_route6: invalid parameters"); + + if (pd->pf_mtag->routed++ > 3) { + m0 = *m; + *m = NULL; + goto bad; + } + + if (r->rt == PF_DUPTO) { +#ifdef __FreeBSD__ + if ((m0 = m_dup(*m, M_DONTWAIT)) == NULL) +#else + if ((m0 = m_copym2(*m, 0, M_COPYALL, M_NOWAIT)) == NULL) +#endif + return; + } else { + if ((r->rt == PF_REPLYTO) == (r->direction == dir)) + return; + m0 = *m; + } + + if (m0->m_len < sizeof(struct ip6_hdr)) { + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_route6: m0->m_len < 
sizeof(struct ip6_hdr)\n")); + goto bad; + } + ip6 = mtod(m0, struct ip6_hdr *); + + ro = &ip6route; + bzero((caddr_t)ro, sizeof(*ro)); + dst = (struct sockaddr_in6 *)&ro->ro_dst; + dst->sin6_family = AF_INET6; + dst->sin6_len = sizeof(*dst); + dst->sin6_addr = ip6->ip6_dst; + + /* Cheat. XXX why only in the v6 case??? */ + if (r->rt == PF_FASTROUTE) { +#ifdef __FreeBSD__ + m0->m_flags |= M_SKIP_FIREWALL; + PF_UNLOCK(); + ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); + PF_LOCK(); +#else + mtag = m_tag_get(PACKET_TAG_PF_GENERATED, 0, M_NOWAIT); + if (mtag == NULL) + goto bad; + m_tag_prepend(m0, mtag); + pd->pf_mtag->flags |= PF_TAG_GENERATED; + ip6_output(m0, NULL, NULL, 0, NULL, NULL); +#endif + return; + } + + if (TAILQ_EMPTY(&r->rpool.list)) { + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_route6: TAILQ_EMPTY(&r->rpool.list)\n")); + goto bad; + } + if (s == NULL) { + pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src, + &naddr, NULL, &sn); + if (!PF_AZERO(&naddr, AF_INET6)) + PF_ACPY((struct pf_addr *)&dst->sin6_addr, + &naddr, AF_INET6); + ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; + } else { + if (!PF_AZERO(&s->rt_addr, AF_INET6)) + PF_ACPY((struct pf_addr *)&dst->sin6_addr, + &s->rt_addr, AF_INET6); + ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; + } + if (ifp == NULL) + goto bad; + + if (oifp != ifp) { +#ifdef __FreeBSD__ + PF_UNLOCK(); + if (pf_test6(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) { + PF_LOCK(); + goto bad; + } else if (m0 == NULL) { + PF_LOCK(); + goto done; + } + PF_LOCK(); +#else + if (pf_test6(PF_OUT, ifp, &m0, NULL) != PF_PASS) + goto bad; + else if (m0 == NULL) + goto done; +#endif + if (m0->m_len < sizeof(struct ip6_hdr)) { + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n")); + goto bad; + } + ip6 = mtod(m0, struct ip6_hdr *); + } + + /* + * If the packet is too large for the outgoing interface, + * send back an icmp6 error. + */ + if (IN6_IS_SCOPE_EMBED(&dst->sin6_addr)) + dst->sin6_addr.s6_addr16[1] = htons(ifp->if_index); + if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) { +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + error = nd6_output(ifp, ifp, m0, dst, NULL); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + } else { + in6_ifstat_inc(ifp, ifs6_in_toobig); +#ifdef __FreeBSD__ + if (r->rt != PF_DUPTO) { + PF_UNLOCK(); + icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); + PF_LOCK(); + } else +#else + if (r->rt != PF_DUPTO) + icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); + else +#endif + goto bad; + } + +done: + if (r->rt != PF_DUPTO) + *m = NULL; + return; + +bad: + m_freem(m0); + goto done; +} +#endif /* INET6 */ + + +#ifdef __FreeBSD__ +/* + * FreeBSD supports cksum offloads for the following drivers. + * em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4), + * ti(4), txp(4), xl(4) + * + * CSUM_DATA_VALID | CSUM_PSEUDO_HDR : + * network driver performed cksum including pseudo header, need to verify + * csum_data + * CSUM_DATA_VALID : + * network driver performed cksum, needs to additional pseudo header + * cksum computation with partial csum_data(i.e. lack of H/W support for + * pseudo header, for instance hme(4), sk(4) and possibly gem(4)) + * + * After validating the cksum of packet, set both flag CSUM_DATA_VALID and + * CSUM_PSEUDO_HDR in order to avoid recomputation of the cksum in upper + * TCP/UDP layer. + * Also, set csum_data to 0xffff to force cksum validation. 
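+ *
+ * In the CSUM_DATA_VALID-only case the driver leaves the 1's
+ * complement sum of the payload in csum_data and the pseudo header is
+ * folded in below, along the lines of
+ *
+ *	sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ *	    htonl((u_short)len + m->m_pkthdr.csum_data + p));
+ *	sum ^= 0xffff;
+ *
+ * so a result of 0 (0xffff before the flip) means the packet
+ * checksummed correctly.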
+ */ +int +pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af) +{ + u_int16_t sum = 0; + int hw_assist = 0; + struct ip *ip; + + if (off < sizeof(struct ip) || len < sizeof(struct udphdr)) + return (1); + if (m->m_pkthdr.len < off + len) + return (1); + + switch (p) { + case IPPROTO_TCP: + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { + sum = m->m_pkthdr.csum_data; + } else { + ip = mtod(m, struct ip *); + sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl((u_short)len + + m->m_pkthdr.csum_data + IPPROTO_TCP)); + } + sum ^= 0xffff; + ++hw_assist; + } + break; + case IPPROTO_UDP: + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { + sum = m->m_pkthdr.csum_data; + } else { + ip = mtod(m, struct ip *); + sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl((u_short)len + + m->m_pkthdr.csum_data + IPPROTO_UDP)); + } + sum ^= 0xffff; + ++hw_assist; + } + break; + case IPPROTO_ICMP: +#ifdef INET6 + case IPPROTO_ICMPV6: +#endif /* INET6 */ + break; + default: + return (1); + } + + if (!hw_assist) { + switch (af) { + case AF_INET: + if (p == IPPROTO_ICMP) { + if (m->m_len < off) + return (1); + m->m_data += off; + m->m_len -= off; + sum = in_cksum(m, len); + m->m_data -= off; + m->m_len += off; + } else { + if (m->m_len < sizeof(struct ip)) + return (1); + sum = in4_cksum(m, p, off, len); + } + break; +#ifdef INET6 + case AF_INET6: + if (m->m_len < sizeof(struct ip6_hdr)) + return (1); + sum = in6_cksum(m, p, off, len); + break; +#endif /* INET6 */ + default: + return (1); + } + } + if (sum) { + switch (p) { + case IPPROTO_TCP: + { + KMOD_TCPSTAT_INC(tcps_rcvbadsum); + break; + } + case IPPROTO_UDP: + { + KMOD_UDPSTAT_INC(udps_badsum); + break; + } + case IPPROTO_ICMP: + { + KMOD_ICMPSTAT_INC(icps_checksum); + break; + } +#ifdef INET6 + case IPPROTO_ICMPV6: + { + KMOD_ICMP6STAT_INC(icp6s_checksum); + break; + } +#endif /* INET6 */ + } + return (1); + } else { + if (p == IPPROTO_TCP || p == IPPROTO_UDP) { + m->m_pkthdr.csum_flags |= + (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + m->m_pkthdr.csum_data = 0xffff; + } + } + return (0); +} +#else /* !__FreeBSD__ */ +/* + * check protocol (tcp/udp/icmp/icmp6) checksum and set mbuf flag + * off is the offset where the protocol header starts + * len is the total length of protocol header plus payload + * returns 0 when the checksum is valid, otherwise returns 1. 
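+ * The verdict is cached in the mbuf with the per-protocol
+ * M_*_CSUM_IN_OK/_BAD flags, so a packet that traverses pf more than
+ * once (e.g. once on the inbound and once on the outbound interface
+ * when forwarding) is only verified the first time.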
+ */ +int +pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, + sa_family_t af) +{ + u_int16_t flag_ok, flag_bad; + u_int16_t sum; + + switch (p) { + case IPPROTO_TCP: + flag_ok = M_TCP_CSUM_IN_OK; + flag_bad = M_TCP_CSUM_IN_BAD; + break; + case IPPROTO_UDP: + flag_ok = M_UDP_CSUM_IN_OK; + flag_bad = M_UDP_CSUM_IN_BAD; + break; + case IPPROTO_ICMP: +#ifdef INET6 + case IPPROTO_ICMPV6: +#endif /* INET6 */ + flag_ok = flag_bad = 0; + break; + default: + return (1); + } + if (m->m_pkthdr.csum_flags & flag_ok) + return (0); + if (m->m_pkthdr.csum_flags & flag_bad) + return (1); + if (off < sizeof(struct ip) || len < sizeof(struct udphdr)) + return (1); + if (m->m_pkthdr.len < off + len) + return (1); + switch (af) { +#ifdef INET + case AF_INET: + if (p == IPPROTO_ICMP) { + if (m->m_len < off) + return (1); + m->m_data += off; + m->m_len -= off; + sum = in_cksum(m, len); + m->m_data -= off; + m->m_len += off; + } else { + if (m->m_len < sizeof(struct ip)) + return (1); + sum = in4_cksum(m, p, off, len); + } + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (m->m_len < sizeof(struct ip6_hdr)) + return (1); + sum = in6_cksum(m, p, off, len); + break; +#endif /* INET6 */ + default: + return (1); + } + if (sum) { + m->m_pkthdr.csum_flags |= flag_bad; + switch (p) { + case IPPROTO_TCP: + KMOD_TCPSTAT_INC(tcps_rcvbadsum); + break; + case IPPROTO_UDP: + KMOD_UDPSTAT_INC(udps_badsum); + break; + case IPPROTO_ICMP: + KMOD_ICMPSTAT_INC(icps_checksum); + break; +#ifdef INET6 + case IPPROTO_ICMPV6: + KMOD_ICMP6STAT_INC(icp6s_checksum); + break; +#endif /* INET6 */ + } + return (1); + } + m->m_pkthdr.csum_flags |= flag_ok; + return (0); +} +#endif /* __FreeBSD__ */ + +#ifdef INET +int +#ifdef __FreeBSD__ +pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, + struct ether_header *eh, struct inpcb *inp) +#else +pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, + struct ether_header *eh) +#endif +{ + struct pfi_kif *kif; + u_short action, reason = 0, log = 0; + struct mbuf *m = *m0; + struct ip *h = NULL; /* make the compiler happy */ + struct pf_rule *a = NULL, *r = &pf_default_rule, *tr, *nr; + struct pf_state *s = NULL; + struct pf_ruleset *ruleset = NULL; + struct pf_pdesc pd; + int off, dirndx, pqid = 0; + +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + if (!pf_status.running) +#ifdef __FreeBSD__ + { + PF_UNLOCK(); +#endif + return (PF_PASS); +#ifdef __FreeBSD__ + } +#endif + + memset(&pd, 0, sizeof(pd)); + if ((pd.pf_mtag = pf_get_mtag(m)) == NULL) { +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_test: pf_get_mtag returned NULL\n")); + return (PF_DROP); + } +#ifdef __FreeBSD__ + if (m->m_flags & M_SKIP_FIREWALL) { + PF_UNLOCK(); + return (PF_PASS); + } +#else + if (pd.pf_mtag->flags & PF_TAG_GENERATED) + return (PF_PASS); +#endif + +#ifdef __FreeBSD__ + /* XXX_IMPORT: later */ +#else + if (ifp->if_type == IFT_CARP && ifp->if_carpdev) + ifp = ifp->if_carpdev; +#endif + + kif = (struct pfi_kif *)ifp->if_pf_kif; + if (kif == NULL) { +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname)); + return (PF_DROP); + } + if (kif->pfik_flags & PFI_IFLAG_SKIP) { +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + return (PF_PASS); + } + +#ifdef __FreeBSD__ + M_ASSERTPKTHDR(m); +#else +#ifdef DIAGNOSTIC + if ((m->m_flags & M_PKTHDR) == 0) + panic("non-M_PKTHDR is passed to pf_test"); +#endif /* DIAGNOSTIC */ +#endif /* __FreeBSD__ */ + + if (m->m_pkthdr.len < (int)sizeof(*h)) { + action = 
PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + log = 1; + goto done; + } + + /* We do IP header normalization and packet reassembly here */ + if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) { + action = PF_DROP; + goto done; + } + m = *m0; + h = mtod(m, struct ip *); + + off = h->ip_hl << 2; + if (off < (int)sizeof(*h)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + log = 1; + goto done; + } + + pd.src = (struct pf_addr *)&h->ip_src; + pd.dst = (struct pf_addr *)&h->ip_dst; + PF_ACPY(&pd.baddr, dir == PF_OUT ? pd.src : pd.dst, AF_INET); + pd.ip_sum = &h->ip_sum; + pd.proto = h->ip_p; + pd.af = AF_INET; + pd.tos = h->ip_tos; + pd.tot_len = ntohs(h->ip_len); + pd.eh = eh; + + /* handle fragments that didn't get reassembled by normalization */ + if (h->ip_off & htons(IP_MF | IP_OFFMASK)) { + action = pf_test_fragment(&r, dir, kif, m, h, + &pd, &a, &ruleset); + goto done; + } + + switch (h->ip_p) { + + case IPPROTO_TCP: { + struct tcphdr th; + + pd.hdr.tcp = &th; + if (!pf_pull_hdr(m, off, &th, sizeof(th), + &action, &reason, AF_INET)) { + log = action != PF_PASS; + goto done; + } + if (dir == PF_IN && pf_check_proto_cksum(m, off, + ntohs(h->ip_len) - off, IPPROTO_TCP, AF_INET)) { + REASON_SET(&reason, PFRES_PROTCKSUM); + action = PF_DROP; + goto done; + } + pd.p_len = pd.tot_len - off - (th.th_off << 2); + if ((th.th_flags & TH_ACK) && pd.p_len == 0) + pqid = 1; + action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); + if (action == PF_DROP) + goto done; + action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, + &reason); + if (action == PF_PASS) { +#if NPFSYNC + pfsync_update_state(s); +#endif /* NPFSYNC */ + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) +#ifdef __FreeBSD__ + action = pf_test_tcp(&r, &s, dir, kif, + m, off, h, &pd, &a, &ruleset, NULL, inp); +#else + action = pf_test_tcp(&r, &s, dir, kif, + m, off, h, &pd, &a, &ruleset, &ipintrq); +#endif + break; + } + + case IPPROTO_UDP: { + struct udphdr uh; + + pd.hdr.udp = &uh; + if (!pf_pull_hdr(m, off, &uh, sizeof(uh), + &action, &reason, AF_INET)) { + log = action != PF_PASS; + goto done; + } + if (dir == PF_IN && uh.uh_sum && pf_check_proto_cksum(m, + off, ntohs(h->ip_len) - off, IPPROTO_UDP, AF_INET)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_PROTCKSUM); + goto done; + } + if (uh.uh_dport == 0 || + ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || + ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + goto done; + } + action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); + if (action == PF_PASS) { +#if NPFSYNC + pfsync_update_state(s); +#endif /* NPFSYNC */ + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) +#ifdef __FreeBSD__ + action = pf_test_udp(&r, &s, dir, kif, + m, off, h, &pd, &a, &ruleset, NULL, inp); +#else + action = pf_test_udp(&r, &s, dir, kif, + m, off, h, &pd, &a, &ruleset, &ipintrq); +#endif + break; + } + + case IPPROTO_ICMP: { + struct icmp ih; + + pd.hdr.icmp = &ih; + if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN, + &action, &reason, AF_INET)) { + log = action != PF_PASS; + goto done; + } + if (dir == PF_IN && pf_check_proto_cksum(m, off, + ntohs(h->ip_len) - off, IPPROTO_ICMP, AF_INET)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_PROTCKSUM); + goto done; + } + action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, + &reason); + if (action == PF_PASS) { +#if NPFSYNC + pfsync_update_state(s); +#endif /* NPFSYNC */ + r = s->rule.ptr; + a = s->anchor.ptr; + log = 
s->log; + } else if (s == NULL) +#ifdef __FreeBSD__ + action = pf_test_icmp(&r, &s, dir, kif, + m, off, h, &pd, &a, &ruleset, NULL); +#else + action = pf_test_icmp(&r, &s, dir, kif, + m, off, h, &pd, &a, &ruleset, &ipintrq); +#endif + break; + } + + default: + action = pf_test_state_other(&s, dir, kif, &pd); + if (action == PF_PASS) { +#if NPFSYNC + pfsync_update_state(s); +#endif /* NPFSYNC */ + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) +#ifdef __FreeBSD__ + action = pf_test_other(&r, &s, dir, kif, m, off, h, + &pd, &a, &ruleset, NULL); +#else + action = pf_test_other(&r, &s, dir, kif, m, off, h, + &pd, &a, &ruleset, &ipintrq); +#endif + break; + } + +done: + if (action == PF_PASS && h->ip_hl > 5 && + !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_IPOPTIONS); + log = 1; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: dropping packet with ip options\n")); + } + + if ((s && s->tag) || r->rtableid) + pf_tag_packet(m, pd.pf_mtag, s ? s->tag : 0, r->rtableid); + +#ifdef ALTQ + if (action == PF_PASS && r->qid) { + if (pqid || (pd.tos & IPTOS_LOWDELAY)) + pd.pf_mtag->qid = r->pqid; + else + pd.pf_mtag->qid = r->qid; + /* add hints for ecn */ + pd.pf_mtag->af = AF_INET; + pd.pf_mtag->hdr = h; + } +#endif /* ALTQ */ + + /* + * connections redirected to loopback should not match sockets + * bound specifically to loopback due to security implications, + * see tcp_input() and in_pcblookup_listen(). + */ + if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || + pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && + (s->nat_rule.ptr->action == PF_RDR || + s->nat_rule.ptr->action == PF_BINAT) && + (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) + pd.pf_mtag->flags |= PF_TAG_TRANSLATE_LOCALHOST; + + if (log) { + struct pf_rule *lr; + + if (s != NULL && s->nat_rule.ptr != NULL && + s->nat_rule.ptr->log & PF_LOG_ALL) + lr = s->nat_rule.ptr; + else + lr = r; + PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, lr, a, ruleset, + &pd); + } + + kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len; + kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++; + + if (action == PF_PASS || r->action == PF_DROP) { + dirndx = (dir == PF_OUT); + r->packets[dirndx]++; + r->bytes[dirndx] += pd.tot_len; + if (a != NULL) { + a->packets[dirndx]++; + a->bytes[dirndx] += pd.tot_len; + } + if (s != NULL) { + if (s->nat_rule.ptr != NULL) { + s->nat_rule.ptr->packets[dirndx]++; + s->nat_rule.ptr->bytes[dirndx] += pd.tot_len; + } + if (s->src_node != NULL) { + s->src_node->packets[dirndx]++; + s->src_node->bytes[dirndx] += pd.tot_len; + } + if (s->nat_src_node != NULL) { + s->nat_src_node->packets[dirndx]++; + s->nat_src_node->bytes[dirndx] += pd.tot_len; + } + dirndx = (dir == s->direction) ? 0 : 1; + s->packets[dirndx]++; + s->bytes[dirndx] += pd.tot_len; + } + tr = r; + nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; + if (nr != NULL) { + struct pf_addr *x; + /* + * XXX: we need to make sure that the addresses + * passed to pfr_update_stats() are the same than + * the addresses used during matching (pfr_match) + */ + if (r == &pf_default_rule) { + tr = nr; + x = (s == NULL || s->direction == dir) ? + &pd.baddr : &pd.naddr; + } else + x = (s == NULL || s->direction == dir) ? 
+ &pd.naddr : &pd.baddr; + if (x == &pd.baddr || s == NULL) { + /* we need to change the address */ + if (dir == PF_OUT) + pd.src = x; + else + pd.dst = x; + } + } + if (tr->src.addr.type == PF_ADDR_TABLE) + pfr_update_stats(tr->src.addr.p.tbl, (s == NULL || + s->direction == dir) ? pd.src : pd.dst, pd.af, + pd.tot_len, dir == PF_OUT, r->action == PF_PASS, + tr->src.neg); + if (tr->dst.addr.type == PF_ADDR_TABLE) + pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL || + s->direction == dir) ? pd.dst : pd.src, pd.af, + pd.tot_len, dir == PF_OUT, r->action == PF_PASS, + tr->dst.neg); + } + + + if (action == PF_SYNPROXY_DROP) { + m_freem(*m0); + *m0 = NULL; + action = PF_PASS; + } else if (r->rt) + /* pf_route can free the mbuf causing *m0 to become NULL */ + pf_route(m0, r, dir, ifp, s, &pd); + +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + + return (action); +} +#endif /* INET */ + +#ifdef INET6 +int +#ifdef __FreeBSD__ +pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, + struct ether_header *eh, struct inpcb *inp) +#else +pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, + struct ether_header *eh) +#endif +{ + struct pfi_kif *kif; + u_short action, reason = 0, log = 0; + struct mbuf *m = *m0, *n = NULL; + struct ip6_hdr *h; + struct pf_rule *a = NULL, *r = &pf_default_rule, *tr, *nr; + struct pf_state *s = NULL; + struct pf_ruleset *ruleset = NULL; + struct pf_pdesc pd; + int off, terminal = 0, dirndx, rh_cnt = 0; + +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + + if (!pf_status.running) +#ifdef __FreeBSD__ + { + PF_UNLOCK(); +#endif + return (PF_PASS); +#ifdef __FreeBSD__ + } +#endif + + memset(&pd, 0, sizeof(pd)); + if ((pd.pf_mtag = pf_get_mtag(m)) == NULL) { +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_test6: pf_get_mtag returned NULL\n")); + return (PF_DROP); + } + if (pd.pf_mtag->flags & PF_TAG_GENERATED) + return (PF_PASS); + +#ifdef __FreeBSD__ + /* XXX_IMPORT: later */ +#else + if (ifp->if_type == IFT_CARP && ifp->if_carpdev) + ifp = ifp->if_carpdev; +#endif + + kif = (struct pfi_kif *)ifp->if_pf_kif; + if (kif == NULL) { +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname)); + return (PF_DROP); + } + if (kif->pfik_flags & PFI_IFLAG_SKIP) { +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + return (PF_PASS); + } + +#ifdef __FreeBSD__ + M_ASSERTPKTHDR(m); +#else +#ifdef DIAGNOSTIC + if ((m->m_flags & M_PKTHDR) == 0) + panic("non-M_PKTHDR is passed to pf_test6"); +#endif /* DIAGNOSTIC */ +#endif + +#ifdef __FreeBSD__ + h = NULL; /* make the compiler happy */ +#endif + + if (m->m_pkthdr.len < (int)sizeof(*h)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + log = 1; + goto done; + } + + /* We do IP header normalization and packet reassembly here */ + if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) { + action = PF_DROP; + goto done; + } + m = *m0; + h = mtod(m, struct ip6_hdr *); + +#if 1 + /* + * we do not support jumbogram yet. if we keep going, zero ip6_plen + * will do something bad, so drop the packet for now. + */ + if (htons(h->ip6_plen) == 0) { + action = PF_DROP; + REASON_SET(&reason, PFRES_NORM); /*XXX*/ + goto done; + } +#endif + + pd.src = (struct pf_addr *)&h->ip6_src; + pd.dst = (struct pf_addr *)&h->ip6_dst; + PF_ACPY(&pd.baddr, dir == PF_OUT ? 
pd.src : pd.dst, AF_INET6); + pd.ip_sum = NULL; + pd.af = AF_INET6; + pd.tos = 0; + pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr); + pd.eh = eh; + + off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr); + pd.proto = h->ip6_nxt; + do { + switch (pd.proto) { + case IPPROTO_FRAGMENT: + action = pf_test_fragment(&r, dir, kif, m, h, + &pd, &a, &ruleset); + if (action == PF_DROP) + REASON_SET(&reason, PFRES_FRAG); + goto done; + case IPPROTO_ROUTING: { + struct ip6_rthdr rthdr; + + if (rh_cnt++) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: IPv6 more than one rthdr\n")); + action = PF_DROP; + REASON_SET(&reason, PFRES_IPOPTIONS); + log = 1; + goto done; + } + if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL, + &reason, pd.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: IPv6 short rthdr\n")); + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + log = 1; + goto done; + } + if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: IPv6 rthdr0\n")); + action = PF_DROP; + REASON_SET(&reason, PFRES_IPOPTIONS); + log = 1; + goto done; + } + /* fallthrough */ + } + case IPPROTO_AH: + case IPPROTO_HOPOPTS: + case IPPROTO_DSTOPTS: { + /* get next header and header length */ + struct ip6_ext opt6; + + if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6), + NULL, &reason, pd.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: IPv6 short opt\n")); + action = PF_DROP; + log = 1; + goto done; + } + if (pd.proto == IPPROTO_AH) + off += (opt6.ip6e_len + 2) * 4; + else + off += (opt6.ip6e_len + 1) * 8; + pd.proto = opt6.ip6e_nxt; + /* goto the next header */ + break; + } + default: + terminal++; + break; + } + } while (!terminal); + + /* if there's no routing header, use unmodified mbuf for checksumming */ + if (!n) + n = m; + + switch (pd.proto) { + + case IPPROTO_TCP: { + struct tcphdr th; + + pd.hdr.tcp = &th; + if (!pf_pull_hdr(m, off, &th, sizeof(th), + &action, &reason, AF_INET6)) { + log = action != PF_PASS; + goto done; + } + if (dir == PF_IN && pf_check_proto_cksum(n, off, + ntohs(h->ip6_plen) - (off - sizeof(struct ip6_hdr)), + IPPROTO_TCP, AF_INET6)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_PROTCKSUM); + goto done; + } + pd.p_len = pd.tot_len - off - (th.th_off << 2); + action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); + if (action == PF_DROP) + goto done; + action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, + &reason); + if (action == PF_PASS) { +#if NPFSYNC + pfsync_update_state(s); +#endif /* NPFSYNC */ + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) +#ifdef __FreeBSD__ + action = pf_test_tcp(&r, &s, dir, kif, + m, off, h, &pd, &a, &ruleset, NULL, inp); +#else + action = pf_test_tcp(&r, &s, dir, kif, + m, off, h, &pd, &a, &ruleset, &ip6intrq); +#endif + break; + } + + case IPPROTO_UDP: { + struct udphdr uh; + + pd.hdr.udp = &uh; + if (!pf_pull_hdr(m, off, &uh, sizeof(uh), + &action, &reason, AF_INET6)) { + log = action != PF_PASS; + goto done; + } + if (dir == PF_IN && uh.uh_sum && pf_check_proto_cksum(n, + off, ntohs(h->ip6_plen) - (off - sizeof(struct ip6_hdr)), + IPPROTO_UDP, AF_INET6)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_PROTCKSUM); + goto done; + } + if (uh.uh_dport == 0 || + ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || + ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + goto done; + } + action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); + if (action == PF_PASS) { +#if NPFSYNC + pfsync_update_state(s); +#endif /* NPFSYNC */ + r = s->rule.ptr; + a = 
s->anchor.ptr; + log = s->log; + } else if (s == NULL) +#ifdef __FreeBSD__ + action = pf_test_udp(&r, &s, dir, kif, + m, off, h, &pd, &a, &ruleset, NULL, inp); +#else + action = pf_test_udp(&r, &s, dir, kif, + m, off, h, &pd, &a, &ruleset, &ip6intrq); +#endif + break; + } + + case IPPROTO_ICMPV6: { + struct icmp6_hdr ih; + + pd.hdr.icmp6 = &ih; + if (!pf_pull_hdr(m, off, &ih, sizeof(ih), + &action, &reason, AF_INET6)) { + log = action != PF_PASS; + goto done; + } + if (dir == PF_IN && pf_check_proto_cksum(n, off, + ntohs(h->ip6_plen) - (off - sizeof(struct ip6_hdr)), + IPPROTO_ICMPV6, AF_INET6)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_PROTCKSUM); + goto done; + } + action = pf_test_state_icmp(&s, dir, kif, + m, off, h, &pd, &reason); + if (action == PF_PASS) { +#if NPFSYNC + pfsync_update_state(s); +#endif /* NPFSYNC */ + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) +#ifdef __FreeBSD__ + action = pf_test_icmp(&r, &s, dir, kif, + m, off, h, &pd, &a, &ruleset, NULL); +#else + action = pf_test_icmp(&r, &s, dir, kif, + m, off, h, &pd, &a, &ruleset, &ip6intrq); +#endif + break; + } + + default: + action = pf_test_state_other(&s, dir, kif, &pd); + if (action == PF_PASS) { +#if NPFSYNC + pfsync_update_state(s); +#endif /* NPFSYNC */ + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) +#ifdef __FreeBSD__ + action = pf_test_other(&r, &s, dir, kif, m, off, h, + &pd, &a, &ruleset, NULL); +#else + action = pf_test_other(&r, &s, dir, kif, m, off, h, + &pd, &a, &ruleset, &ip6intrq); +#endif + break; + } + +done: + /* handle dangerous IPv6 extension headers. */ + if (action == PF_PASS && rh_cnt && + !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_IPOPTIONS); + log = 1; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: dropping packet with dangerous v6 headers\n")); + } + + if ((s && s->tag) || r->rtableid) + pf_tag_packet(m, pd.pf_mtag, s ? 
s->tag : 0, r->rtableid); + +#ifdef ALTQ + if (action == PF_PASS && r->qid) { + if (pd.tos & IPTOS_LOWDELAY) + pd.pf_mtag->qid = r->pqid; + else + pd.pf_mtag->qid = r->qid; + /* add hints for ecn */ + pd.pf_mtag->af = AF_INET6; + pd.pf_mtag->hdr = h; + } +#endif /* ALTQ */ + + if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || + pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && + (s->nat_rule.ptr->action == PF_RDR || + s->nat_rule.ptr->action == PF_BINAT) && + IN6_IS_ADDR_LOOPBACK(&pd.dst->v6)) + pd.pf_mtag->flags |= PF_TAG_TRANSLATE_LOCALHOST; + + if (log) { + struct pf_rule *lr; + + if (s != NULL && s->nat_rule.ptr != NULL && + s->nat_rule.ptr->log & PF_LOG_ALL) + lr = s->nat_rule.ptr; + else + lr = r; + PFLOG_PACKET(kif, h, m, AF_INET6, dir, reason, lr, a, ruleset, + &pd); + } + + kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len; + kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++; + + if (action == PF_PASS || r->action == PF_DROP) { + dirndx = (dir == PF_OUT); + r->packets[dirndx]++; + r->bytes[dirndx] += pd.tot_len; + if (a != NULL) { + a->packets[dirndx]++; + a->bytes[dirndx] += pd.tot_len; + } + if (s != NULL) { + if (s->nat_rule.ptr != NULL) { + s->nat_rule.ptr->packets[dirndx]++; + s->nat_rule.ptr->bytes[dirndx] += pd.tot_len; + } + if (s->src_node != NULL) { + s->src_node->packets[dirndx]++; + s->src_node->bytes[dirndx] += pd.tot_len; + } + if (s->nat_src_node != NULL) { + s->nat_src_node->packets[dirndx]++; + s->nat_src_node->bytes[dirndx] += pd.tot_len; + } + dirndx = (dir == s->direction) ? 0 : 1; + s->packets[dirndx]++; + s->bytes[dirndx] += pd.tot_len; + } + tr = r; + nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; + if (nr != NULL) { + struct pf_addr *x; + /* + * XXX: we need to make sure that the addresses + * passed to pfr_update_stats() are the same than + * the addresses used during matching (pfr_match) + */ + if (r == &pf_default_rule) { + tr = nr; + x = (s == NULL || s->direction == dir) ? + &pd.baddr : &pd.naddr; + } else { + x = (s == NULL || s->direction == dir) ? + &pd.naddr : &pd.baddr; + } + if (x == &pd.baddr || s == NULL) { + if (dir == PF_OUT) + pd.src = x; + else + pd.dst = x; + } + } + if (tr->src.addr.type == PF_ADDR_TABLE) + pfr_update_stats(tr->src.addr.p.tbl, (s == NULL || + s->direction == dir) ? pd.src : pd.dst, pd.af, + pd.tot_len, dir == PF_OUT, r->action == PF_PASS, + tr->src.neg); + if (tr->dst.addr.type == PF_ADDR_TABLE) + pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL || + s->direction == dir) ? 
pd.dst : pd.src, pd.af, + pd.tot_len, dir == PF_OUT, r->action == PF_PASS, + tr->dst.neg); + } + + + if (action == PF_SYNPROXY_DROP) { + m_freem(*m0); + *m0 = NULL; + action = PF_PASS; + } else if (r->rt) + /* pf_route6 can free the mbuf causing *m0 to become NULL */ + pf_route6(m0, r, dir, ifp, s, &pd); + +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + return (action); +} +#endif /* INET6 */ + +int +pf_check_congestion(struct ifqueue *ifq) +{ +#ifdef __FreeBSD__ + /* XXX_IMPORT: later */ + return (0); +#else + if (ifq->ifq_congestion) + return (1); + else + return (0); +#endif +} diff --git a/contrib/pf/rtems/freebsd/net/pf_if.c b/contrib/pf/rtems/freebsd/net/pf_if.c new file mode 100644 index 00000000..f286da5b --- /dev/null +++ b/contrib/pf/rtems/freebsd/net/pf_if.c @@ -0,0 +1,950 @@ +#include + +/* $OpenBSD: pf_if.c,v 1.46 2006/12/13 09:01:59 itojun Exp $ */ + +/* + * Copyright 2005 Henning Brauer + * Copyright 2005 Ryan McBride + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2003 Cedric Berger + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#if defined(__FreeBSD__) +#include +#include + +#include +__FBSDID("$FreeBSD$"); +#endif + +#include +#include +#ifdef __FreeBSD__ +#include +#endif +#include +#include +#include +#include +#include +#ifndef __FreeBSD__ +#include +#endif +#include + +#include +#include +#ifdef __FreeBSD__ +#include +#endif + +#include +#include +#include +#include +#include + +#include + +#ifdef INET6 +#include +#endif /* INET6 */ + +struct pfi_kif *pfi_all = NULL; +struct pfi_statehead pfi_statehead; +#ifdef __FreeBSD__ +uma_zone_t pfi_addr_pl; +#else +struct pool pfi_addr_pl; +#endif +struct pfi_ifhead pfi_ifs; +long pfi_update = 1; +struct pfr_addr *pfi_buffer; +int pfi_buffer_cnt; +int pfi_buffer_max; +#ifdef __FreeBSD__ +eventhandler_tag pfi_attach_cookie = NULL; +eventhandler_tag pfi_detach_cookie = NULL; +eventhandler_tag pfi_attach_group_cookie = NULL; +eventhandler_tag pfi_change_group_cookie = NULL; +eventhandler_tag pfi_detach_group_cookie = NULL; +eventhandler_tag pfi_ifaddr_event_cookie = NULL; +#endif + +void pfi_kif_update(struct pfi_kif *); +void pfi_dynaddr_update(struct pfi_dynaddr *dyn); +void pfi_table_update(struct pfr_ktable *, struct pfi_kif *, + int, int); +void pfi_kifaddr_update(void *); +void pfi_instance_add(struct ifnet *, int, int); +void pfi_address_add(struct sockaddr *, int, int); +int pfi_if_compare(struct pfi_kif *, struct pfi_kif *); +int pfi_skip_if(const char *, struct pfi_kif *); +int pfi_unmask(void *); +#ifdef __FreeBSD__ +void pfi_attach_ifnet_event(void * __unused, struct ifnet *); +void pfi_detach_ifnet_event(void * __unused, struct ifnet *); +void pfi_attach_group_event(void * __unused, struct ifg_group *); +void pfi_change_group_event(void * __unused, char *); +void pfi_detach_group_event(void * __unused, struct ifg_group *); +void pfi_ifaddr_event(void * __unused, struct ifnet *); + +#endif + +RB_PROTOTYPE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare); +RB_GENERATE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare); + +#define PFI_BUFFER_MAX 0x10000 +#define PFI_MTYPE M_IFADDR + +void +pfi_initialize(void) +{ + + if (pfi_all != NULL) /* already initialized */ + return; + + TAILQ_INIT(&pfi_statehead); +#ifndef __FreeBSD__ + pool_init(&pfi_addr_pl, sizeof(struct pfi_dynaddr), 0, 0, 0, + "pfiaddrpl", &pool_allocator_nointr); +#endif + pfi_buffer_max = 64; + pfi_buffer = malloc(pfi_buffer_max * sizeof(*pfi_buffer), + PFI_MTYPE, M_WAITOK); + + if ((pfi_all = pfi_kif_get(IFG_ALL)) == NULL) + panic("pfi_kif_get for pfi_all failed"); + +#ifdef __FreeBSD__ + struct ifg_group *ifg; + struct ifnet *ifp; + + IFNET_RLOCK(); + TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) + pfi_attach_ifgroup(ifg); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) + pfi_attach_ifnet(ifp); + IFNET_RUNLOCK(); + + pfi_attach_cookie = EVENTHANDLER_REGISTER(ifnet_arrival_event, + pfi_attach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY); + pfi_detach_cookie = EVENTHANDLER_REGISTER(ifnet_departure_event, + pfi_detach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY); + pfi_attach_group_cookie = EVENTHANDLER_REGISTER(group_attach_event, + pfi_attach_group_event, NULL, EVENTHANDLER_PRI_ANY); + pfi_change_group_cookie = EVENTHANDLER_REGISTER(group_change_event, + pfi_change_group_event, NULL, EVENTHANDLER_PRI_ANY); + pfi_detach_group_cookie = EVENTHANDLER_REGISTER(group_detach_event, + pfi_detach_group_event, NULL, EVENTHANDLER_PRI_ANY); + pfi_ifaddr_event_cookie = EVENTHANDLER_REGISTER(ifaddr_event, + pfi_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY); +#endif +} + +#ifdef __FreeBSD__ +void +pfi_cleanup(void) +{ + struct 
pfi_kif *p; + + PF_UNLOCK(); + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, pfi_attach_cookie); + EVENTHANDLER_DEREGISTER(ifnet_departure_event, pfi_detach_cookie); + EVENTHANDLER_DEREGISTER(group_attach_event, pfi_attach_group_cookie); + EVENTHANDLER_DEREGISTER(group_change_event, pfi_change_group_cookie); + EVENTHANDLER_DEREGISTER(group_detach_event, pfi_detach_group_cookie); + EVENTHANDLER_DEREGISTER(ifaddr_event, pfi_ifaddr_event_cookie); + PF_LOCK(); + + pfi_all = NULL; + while ((p = RB_MIN(pfi_ifhead, &pfi_ifs))) { + if (p->pfik_rules || p->pfik_states) { + printf("pfi_cleanup: dangling refs for %s\n", + p->pfik_name); + } + + RB_REMOVE(pfi_ifhead, &pfi_ifs, p); + free(p, PFI_MTYPE); + } + + free(pfi_buffer, PFI_MTYPE); +} +#endif + +struct pfi_kif * +pfi_kif_get(const char *kif_name) +{ + struct pfi_kif *kif; + struct pfi_kif_cmp s; + + bzero(&s, sizeof(s)); + strlcpy(s.pfik_name, kif_name, sizeof(s.pfik_name)); + if ((kif = RB_FIND(pfi_ifhead, &pfi_ifs, (struct pfi_kif *)&s)) != NULL) + return (kif); + + /* create new one */ +#ifdef __FreeBSD__ + if ((kif = malloc(sizeof(*kif), PFI_MTYPE, M_NOWAIT)) == NULL) +#else + if ((kif = malloc(sizeof(*kif), PFI_MTYPE, M_DONTWAIT)) == NULL) +#endif + return (NULL); + + bzero(kif, sizeof(*kif)); + strlcpy(kif->pfik_name, kif_name, sizeof(kif->pfik_name)); +#ifdef __FreeBSD__ + /* + * It seems that time_second is still uninitialized when pf sets the + * interface statistics clear time during boot if pf was statically + * linked into the kernel. Instead of storing the bogus time value, + * have pfi_get_ifaces handle this case. In pfi_get_ifaces it uses + * boottime.tv_sec if it sees that the time is 0. + */ + kif->pfik_tzero = time_second > 1 ? time_second : 0; +#else + kif->pfik_tzero = time_second; +#endif + TAILQ_INIT(&kif->pfik_dynaddrs); + + RB_INSERT(pfi_ifhead, &pfi_ifs, kif); + return (kif); +} + +void +pfi_kif_ref(struct pfi_kif *kif, enum pfi_kif_refs what) +{ + switch (what) { + case PFI_KIF_REF_RULE: + kif->pfik_rules++; + break; + case PFI_KIF_REF_STATE: + if (!kif->pfik_states++) + TAILQ_INSERT_TAIL(&pfi_statehead, kif, pfik_w_states); + break; + default: + panic("pfi_kif_ref with unknown type"); + } +} + +void +pfi_kif_unref(struct pfi_kif *kif, enum pfi_kif_refs what) +{ + if (kif == NULL) + return; + + switch (what) { + case PFI_KIF_REF_NONE: + break; + case PFI_KIF_REF_RULE: + if (kif->pfik_rules <= 0) { + printf("pfi_kif_unref: rules refcount <= 0\n"); + return; + } + kif->pfik_rules--; + break; + case PFI_KIF_REF_STATE: + if (kif->pfik_states <= 0) { + printf("pfi_kif_unref: state refcount <= 0\n"); + return; + } + if (!--kif->pfik_states) + TAILQ_REMOVE(&pfi_statehead, kif, pfik_w_states); + break; + default: + panic("pfi_kif_unref with unknown type"); + } + + if (kif->pfik_ifp != NULL || kif->pfik_group != NULL || kif == pfi_all) + return; + + if (kif->pfik_rules || kif->pfik_states) + return; + + RB_REMOVE(pfi_ifhead, &pfi_ifs, kif); + free(kif, PFI_MTYPE); +} + +int +pfi_kif_match(struct pfi_kif *rule_kif, struct pfi_kif *packet_kif) +{ + struct ifg_list *p; + + if (rule_kif == NULL || rule_kif == packet_kif) + return (1); + + if (rule_kif->pfik_group != NULL) + TAILQ_FOREACH(p, &packet_kif->pfik_ifp->if_groups, ifgl_next) + if (p->ifgl_group == rule_kif->pfik_group) + return (1); + + return (0); +} + +void +pfi_attach_ifnet(struct ifnet *ifp) +{ + struct pfi_kif *kif; + int s; + + pfi_initialize(); + s = splsoftnet(); + pfi_update++; + if ((kif = pfi_kif_get(ifp->if_xname)) == NULL) + panic("pfi_kif_get failed"); + + 
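+ /* + * XXX (editor's note): at this point pfi_kif_get() has either found the + * interface in the pfi_ifs red-black tree or allocated a fresh entry, so + * kif is known to be valid; the two assignments that follow cross-link + * the kif and the ifnet so that per-interface rules and dynamic addresses + * can be resolved from either side. + */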
kif->pfik_ifp = ifp; + ifp->if_pf_kif = (caddr_t)kif; + +#ifndef __FreeBSD__ + if ((kif->pfik_ah_cookie = hook_establish(ifp->if_addrhooks, 1, + pfi_kifaddr_update, kif)) == NULL) + panic("pfi_attach_ifnet: cannot allocate '%s' address hook", + ifp->if_xname); +#endif + + pfi_kif_update(kif); + + splx(s); +} + +void +pfi_detach_ifnet(struct ifnet *ifp) +{ + int s; + struct pfi_kif *kif; + + if ((kif = (struct pfi_kif *)ifp->if_pf_kif) == NULL) + return; + + s = splsoftnet(); + pfi_update++; +#ifndef __FreeBSD__ + hook_disestablish(ifp->if_addrhooks, kif->pfik_ah_cookie); +#endif + pfi_kif_update(kif); + + kif->pfik_ifp = NULL; + ifp->if_pf_kif = NULL; + pfi_kif_unref(kif, PFI_KIF_REF_NONE); + splx(s); +} + +void +pfi_attach_ifgroup(struct ifg_group *ifg) +{ + struct pfi_kif *kif; + int s; + + pfi_initialize(); + s = splsoftnet(); + pfi_update++; + if ((kif = pfi_kif_get(ifg->ifg_group)) == NULL) + panic("pfi_kif_get failed"); + + kif->pfik_group = ifg; + ifg->ifg_pf_kif = (caddr_t)kif; + + splx(s); +} + +void +pfi_detach_ifgroup(struct ifg_group *ifg) +{ + int s; + struct pfi_kif *kif; + + if ((kif = (struct pfi_kif *)ifg->ifg_pf_kif) == NULL) + return; + + s = splsoftnet(); + pfi_update++; + + kif->pfik_group = NULL; + ifg->ifg_pf_kif = NULL; + pfi_kif_unref(kif, PFI_KIF_REF_NONE); + splx(s); +} + +void +pfi_group_change(const char *group) +{ + struct pfi_kif *kif; + int s; + + s = splsoftnet(); + pfi_update++; + if ((kif = pfi_kif_get(group)) == NULL) + panic("pfi_kif_get failed"); + + pfi_kif_update(kif); + + splx(s); +} + +int +pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + switch (dyn->pfid_acnt4) { + case 0: + return (0); + case 1: + return (PF_MATCHA(0, &dyn->pfid_addr4, + &dyn->pfid_mask4, a, AF_INET)); + default: + return (pfr_match_addr(dyn->pfid_kt, a, AF_INET)); + } + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + switch (dyn->pfid_acnt6) { + case 0: + return (0); + case 1: + return (PF_MATCHA(0, &dyn->pfid_addr6, + &dyn->pfid_mask6, a, AF_INET6)); + default: + return (pfr_match_addr(dyn->pfid_kt, a, AF_INET6)); + } + break; +#endif /* INET6 */ + default: + return (0); + } +} + +int +pfi_dynaddr_setup(struct pf_addr_wrap *aw, sa_family_t af) +{ + struct pfi_dynaddr *dyn; + char tblname[PF_TABLE_NAME_SIZE]; + struct pf_ruleset *ruleset = NULL; + int s, rv = 0; + + if (aw->type != PF_ADDR_DYNIFTL) + return (0); + if ((dyn = pool_get(&pfi_addr_pl, PR_NOWAIT)) == NULL) + return (1); + bzero(dyn, sizeof(*dyn)); + + s = splsoftnet(); + if (!strcmp(aw->v.ifname, "self")) + dyn->pfid_kif = pfi_kif_get(IFG_ALL); + else + dyn->pfid_kif = pfi_kif_get(aw->v.ifname); + if (dyn->pfid_kif == NULL) { + rv = 1; + goto _bad; + } + pfi_kif_ref(dyn->pfid_kif, PFI_KIF_REF_RULE); + + dyn->pfid_net = pfi_unmask(&aw->v.a.mask); + if (af == AF_INET && dyn->pfid_net == 32) + dyn->pfid_net = 128; + strlcpy(tblname, aw->v.ifname, sizeof(tblname)); + if (aw->iflags & PFI_AFLAG_NETWORK) + strlcat(tblname, ":network", sizeof(tblname)); + if (aw->iflags & PFI_AFLAG_BROADCAST) + strlcat(tblname, ":broadcast", sizeof(tblname)); + if (aw->iflags & PFI_AFLAG_PEER) + strlcat(tblname, ":peer", sizeof(tblname)); + if (aw->iflags & PFI_AFLAG_NOALIAS) + strlcat(tblname, ":0", sizeof(tblname)); + if (dyn->pfid_net != 128) + snprintf(tblname + strlen(tblname), + sizeof(tblname) - strlen(tblname), "/%d", dyn->pfid_net); + if ((ruleset = pf_find_or_create_ruleset(PF_RESERVED_ANCHOR)) == NULL) { + rv = 1; + goto _bad; + } + + if 
((dyn->pfid_kt = pfr_attach_table(ruleset, tblname)) == NULL) { + rv = 1; + goto _bad; + } + + dyn->pfid_kt->pfrkt_flags |= PFR_TFLAG_ACTIVE; + dyn->pfid_iflags = aw->iflags; + dyn->pfid_af = af; + + TAILQ_INSERT_TAIL(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry); + aw->p.dyn = dyn; + pfi_kif_update(dyn->pfid_kif); + splx(s); + return (0); + +_bad: + if (dyn->pfid_kt != NULL) + pfr_detach_table(dyn->pfid_kt); + if (ruleset != NULL) + pf_remove_if_empty_ruleset(ruleset); + if (dyn->pfid_kif != NULL) + pfi_kif_unref(dyn->pfid_kif, PFI_KIF_REF_RULE); + pool_put(&pfi_addr_pl, dyn); + splx(s); + return (rv); +} + +void +pfi_kif_update(struct pfi_kif *kif) +{ + struct ifg_list *ifgl; + struct pfi_dynaddr *p; + + /* update all dynaddr */ + TAILQ_FOREACH(p, &kif->pfik_dynaddrs, entry) + pfi_dynaddr_update(p); + + /* again for all groups kif is member of */ + if (kif->pfik_ifp != NULL) + TAILQ_FOREACH(ifgl, &kif->pfik_ifp->if_groups, ifgl_next) + pfi_kif_update((struct pfi_kif *) + ifgl->ifgl_group->ifg_pf_kif); +} + +void +pfi_dynaddr_update(struct pfi_dynaddr *dyn) +{ + struct pfi_kif *kif; + struct pfr_ktable *kt; + + if (dyn == NULL || dyn->pfid_kif == NULL || dyn->pfid_kt == NULL) + panic("pfi_dynaddr_update"); + + kif = dyn->pfid_kif; + kt = dyn->pfid_kt; + + if (kt->pfrkt_larg != pfi_update) { + /* this table needs to be brought up-to-date */ + pfi_table_update(kt, kif, dyn->pfid_net, dyn->pfid_iflags); + kt->pfrkt_larg = pfi_update; + } + pfr_dynaddr_update(kt, dyn); +} + +void +pfi_table_update(struct pfr_ktable *kt, struct pfi_kif *kif, int net, int flags) +{ + int e, size2 = 0; + struct ifg_member *ifgm; + + pfi_buffer_cnt = 0; + + if (kif->pfik_ifp != NULL) + pfi_instance_add(kif->pfik_ifp, net, flags); + else if (kif->pfik_group != NULL) + TAILQ_FOREACH(ifgm, &kif->pfik_group->ifg_members, ifgm_next) + pfi_instance_add(ifgm->ifgm_ifp, net, flags); + + if ((e = pfr_set_addrs(&kt->pfrkt_t, pfi_buffer, pfi_buffer_cnt, &size2, + NULL, NULL, NULL, 0, PFR_TFLAG_ALLMASK))) + printf("pfi_table_update: cannot set %d new addresses " + "into table %s: %d\n", pfi_buffer_cnt, kt->pfrkt_name, e); +} + +void +pfi_instance_add(struct ifnet *ifp, int net, int flags) +{ + struct ifaddr *ia; + int got4 = 0, got6 = 0; + int net2, af; + + if (ifp == NULL) + return; + TAILQ_FOREACH(ia, &ifp->if_addrlist, ifa_list) { + if (ia->ifa_addr == NULL) + continue; + af = ia->ifa_addr->sa_family; + if (af != AF_INET && af != AF_INET6) + continue; +#ifdef __FreeBSD__ + /* + * XXX: For point-to-point interfaces, (ifname:0) and IPv4, + * jump over addresses without a proper route to work + * around a problem with ppp not fully removing the + * address used during IPCP. 
+ */ + if ((ifp->if_flags & IFF_POINTOPOINT) && + !(ia->ifa_flags & IFA_ROUTE) && + (flags & PFI_AFLAG_NOALIAS) && (af == AF_INET)) + continue; +#endif + if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6) + continue; + if ((flags & PFI_AFLAG_BROADCAST) && + !(ifp->if_flags & IFF_BROADCAST)) + continue; + if ((flags & PFI_AFLAG_PEER) && + !(ifp->if_flags & IFF_POINTOPOINT)) + continue; + if ((flags & PFI_AFLAG_NETWORK) && af == AF_INET6 && + IN6_IS_ADDR_LINKLOCAL( + &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr)) + continue; + if (flags & PFI_AFLAG_NOALIAS) { + if (af == AF_INET && got4) + continue; + if (af == AF_INET6 && got6) + continue; + } + if (af == AF_INET) + got4 = 1; + else if (af == AF_INET6) + got6 = 1; + net2 = net; + if (net2 == 128 && (flags & PFI_AFLAG_NETWORK)) { + if (af == AF_INET) + net2 = pfi_unmask(&((struct sockaddr_in *) + ia->ifa_netmask)->sin_addr); + else if (af == AF_INET6) + net2 = pfi_unmask(&((struct sockaddr_in6 *) + ia->ifa_netmask)->sin6_addr); + } + if (af == AF_INET && net2 > 32) + net2 = 32; + if (flags & PFI_AFLAG_BROADCAST) + pfi_address_add(ia->ifa_broadaddr, af, net2); + else if (flags & PFI_AFLAG_PEER) + pfi_address_add(ia->ifa_dstaddr, af, net2); + else + pfi_address_add(ia->ifa_addr, af, net2); + } +} + +void +pfi_address_add(struct sockaddr *sa, int af, int net) +{ + struct pfr_addr *p; + int i; + + if (pfi_buffer_cnt >= pfi_buffer_max) { + int new_max = pfi_buffer_max * 2; + + if (new_max > PFI_BUFFER_MAX) { + printf("pfi_address_add: address buffer full (%d/%d)\n", + pfi_buffer_cnt, PFI_BUFFER_MAX); + return; + } + p = malloc(new_max * sizeof(*pfi_buffer), PFI_MTYPE, +#ifdef __FreeBSD__ + M_NOWAIT); +#else + M_DONTWAIT); +#endif + if (p == NULL) { + printf("pfi_address_add: no memory to grow buffer " + "(%d/%d)\n", pfi_buffer_cnt, PFI_BUFFER_MAX); + return; + } + memcpy(p, pfi_buffer, pfi_buffer_max * sizeof(*pfi_buffer)); + /* no need to zero buffer */ + free(pfi_buffer, PFI_MTYPE); + pfi_buffer = p; + pfi_buffer_max = new_max; + } + if (af == AF_INET && net > 32) + net = 128; + p = pfi_buffer + pfi_buffer_cnt++; + bzero(p, sizeof(*p)); + p->pfra_af = af; + p->pfra_net = net; + if (af == AF_INET) + p->pfra_ip4addr = ((struct sockaddr_in *)sa)->sin_addr; + else if (af == AF_INET6) { + p->pfra_ip6addr = ((struct sockaddr_in6 *)sa)->sin6_addr; + if (IN6_IS_SCOPE_EMBED(&p->pfra_ip6addr)) + p->pfra_ip6addr.s6_addr16[1] = 0; + } + /* mask network address bits */ + if (net < 128) + ((caddr_t)p)[p->pfra_net/8] &= ~(0xFF >> (p->pfra_net%8)); + for (i = (p->pfra_net+7)/8; i < sizeof(p->pfra_u); i++) + ((caddr_t)p)[i] = 0; +} + +void +pfi_dynaddr_remove(struct pf_addr_wrap *aw) +{ + int s; + + if (aw->type != PF_ADDR_DYNIFTL || aw->p.dyn == NULL || + aw->p.dyn->pfid_kif == NULL || aw->p.dyn->pfid_kt == NULL) + return; + + s = splsoftnet(); + TAILQ_REMOVE(&aw->p.dyn->pfid_kif->pfik_dynaddrs, aw->p.dyn, entry); + pfi_kif_unref(aw->p.dyn->pfid_kif, PFI_KIF_REF_RULE); + aw->p.dyn->pfid_kif = NULL; + pfr_detach_table(aw->p.dyn->pfid_kt); + aw->p.dyn->pfid_kt = NULL; + pool_put(&pfi_addr_pl, aw->p.dyn); + aw->p.dyn = NULL; + splx(s); +} + +void +pfi_dynaddr_copyout(struct pf_addr_wrap *aw) +{ + if (aw->type != PF_ADDR_DYNIFTL || aw->p.dyn == NULL || + aw->p.dyn->pfid_kif == NULL) + return; + aw->p.dyncnt = aw->p.dyn->pfid_acnt4 + aw->p.dyn->pfid_acnt6; +} + +void +pfi_kifaddr_update(void *v) +{ + int s; + struct pfi_kif *kif = (struct pfi_kif *)v; + + s = splsoftnet(); + pfi_update++; + pfi_kif_update(kif); + splx(s); +} + +int +pfi_if_compare(struct 
pfi_kif *p, struct pfi_kif *q) +{ + return (strncmp(p->pfik_name, q->pfik_name, IFNAMSIZ)); +} + +void +pfi_fill_oldstatus(struct pf_status *pfs) +{ + struct pfi_kif *p; + struct pfi_kif_cmp key; + int i, j, k, s; + + strlcpy(key.pfik_name, pfs->ifname, sizeof(key.pfik_name)); + s = splsoftnet(); + p = RB_FIND(pfi_ifhead, &pfi_ifs, (struct pfi_kif *)&key); + if (p == NULL) { + splx(s); + return; + } + bzero(pfs->pcounters, sizeof(pfs->pcounters)); + bzero(pfs->bcounters, sizeof(pfs->bcounters)); + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) { + pfs->pcounters[i][j][k] = + p->pfik_packets[i][j][k]; + pfs->bcounters[i][j] += + p->pfik_bytes[i][j][k]; + } + splx(s); +} + +int +pfi_clr_istats(const char *name) +{ + struct pfi_kif *p; + int s; + + s = splsoftnet(); + RB_FOREACH(p, pfi_ifhead, &pfi_ifs) { + if (pfi_skip_if(name, p)) + continue; + bzero(p->pfik_packets, sizeof(p->pfik_packets)); + bzero(p->pfik_bytes, sizeof(p->pfik_bytes)); + p->pfik_tzero = time_second; + } + splx(s); + + return (0); +} + +int +pfi_get_ifaces(const char *name, struct pfi_kif *buf, int *size) +{ + struct pfi_kif *p, *nextp; + int s, n = 0; +#ifdef __FreeBSD__ + int error; +#endif + + s = splsoftnet(); + for (p = RB_MIN(pfi_ifhead, &pfi_ifs); p; p = nextp) { + nextp = RB_NEXT(pfi_ifhead, &pfi_ifs, p); + if (pfi_skip_if(name, p)) + continue; + if (*size > n++) { + if (!p->pfik_tzero) + p->pfik_tzero = time_second; + pfi_kif_ref(p, PFI_KIF_REF_RULE); +#ifdef __FreeBSD__ + PF_COPYOUT(p, buf++, sizeof(*buf), error); + if (error) { +#else + if (copyout(p, buf++, sizeof(*buf))) { +#endif + pfi_kif_unref(p, PFI_KIF_REF_RULE); + splx(s); + return (EFAULT); + } + nextp = RB_NEXT(pfi_ifhead, &pfi_ifs, p); + pfi_kif_unref(p, PFI_KIF_REF_RULE); + } + } + splx(s); + *size = n; + return (0); +} + +int +pfi_skip_if(const char *filter, struct pfi_kif *p) +{ + int n; + + if (filter == NULL || !*filter) + return (0); + if (!strcmp(p->pfik_name, filter)) + return (0); /* exact match */ + n = strlen(filter); + if (n < 1 || n >= IFNAMSIZ) + return (1); /* sanity check */ + if (filter[n-1] >= '0' && filter[n-1] <= '9') + return (1); /* only do exact match in that case */ + if (strncmp(p->pfik_name, filter, n)) + return (1); /* prefix doesn't match */ + return (p->pfik_name[n] < '0' || p->pfik_name[n] > '9'); +} + +int +pfi_set_flags(const char *name, int flags) +{ + struct pfi_kif *p; + int s; + + s = splsoftnet(); + RB_FOREACH(p, pfi_ifhead, &pfi_ifs) { + if (pfi_skip_if(name, p)) + continue; + p->pfik_flags |= flags; + } + splx(s); + return (0); +} + +int +pfi_clear_flags(const char *name, int flags) +{ + struct pfi_kif *p; + int s; + + s = splsoftnet(); + RB_FOREACH(p, pfi_ifhead, &pfi_ifs) { + if (pfi_skip_if(name, p)) + continue; + p->pfik_flags &= ~flags; + } + splx(s); + return (0); +} + +/* from pf_print_state.c */ +int +pfi_unmask(void *addr) +{ + struct pf_addr *m = addr; + int i = 31, j = 0, b = 0; + u_int32_t tmp; + + while (j < 4 && m->addr32[j] == 0xffffffff) { + b += 32; + j++; + } + if (j < 4) { + tmp = ntohl(m->addr32[j]); + for (i = 31; tmp & (1 << i); --i) + b++; + } + return (b); +} + +#ifdef __FreeBSD__ +void +pfi_attach_ifnet_event(void *arg __unused, struct ifnet *ifp) +{ + PF_LOCK(); + pfi_attach_ifnet(ifp); +#ifdef ALTQ + pf_altq_ifnet_event(ifp, 0); +#endif + PF_UNLOCK(); +} + +void +pfi_detach_ifnet_event(void *arg __unused, struct ifnet *ifp) +{ + PF_LOCK(); + pfi_detach_ifnet(ifp); +#ifdef ALTQ + pf_altq_ifnet_event(ifp, 1); +#endif + PF_UNLOCK(); +} + +void 
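+/* + * XXX (editor's note): pfi_unmask() above turns a contiguous netmask into + * a CIDR prefix length by counting leading one bits. Worked example for an + * IPv4 /24 mask: addr32[0] is not 0xffffffff, so the first loop is skipped; + * ntohl() yields 0xffffff00 and the bit loop counts bits 31 down to 8, + * returning b = 24. An all-ones IPv6 mask takes the first loop four times + * and returns 128. + */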
+pfi_attach_group_event(void *arg __unused, struct ifg_group *ifg) +{ + PF_LOCK(); + pfi_attach_ifgroup(ifg); + PF_UNLOCK(); +} + +void +pfi_change_group_event(void *arg __unused, char *gname) +{ + PF_LOCK(); + pfi_group_change(gname); + PF_UNLOCK(); +} + +void +pfi_detach_group_event(void *arg __unused, struct ifg_group *ifg) +{ + PF_LOCK(); + pfi_detach_ifgroup(ifg); + PF_UNLOCK(); +} + +void +pfi_ifaddr_event(void *arg __unused, struct ifnet *ifp) +{ + PF_LOCK(); + if (ifp && ifp->if_pf_kif) + pfi_kifaddr_update(ifp->if_pf_kif); + PF_UNLOCK(); +} +#endif /* __FreeBSD__ */ diff --git a/contrib/pf/rtems/freebsd/net/pf_ioctl.c b/contrib/pf/rtems/freebsd/net/pf_ioctl.c new file mode 100644 index 00000000..21032fa8 --- /dev/null +++ b/contrib/pf/rtems/freebsd/net/pf_ioctl.c @@ -0,0 +1,3896 @@ +#include + +/* $OpenBSD: pf_ioctl.c,v 1.175 2007/02/26 22:47:43 deraadt Exp $ */ + +/* + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002,2003 Henning Brauer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. 
+ * + */ + +#ifdef __FreeBSD__ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#ifdef DEV_BPF +#define NBPFILTER DEV_BPF +#else +#define NBPFILTER 0 +#endif + +#ifdef DEV_PFLOG +#define NPFLOG DEV_PFLOG +#else +#define NPFLOG 0 +#endif + +#ifdef DEV_PFSYNC +#define NPFSYNC DEV_PFSYNC +#else +#define NPFSYNC 0 +#endif + +#else +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __FreeBSD__ +#include +#include +#include +#include +#else +#include +#include +#endif +#include +#include +#include +#ifndef __FreeBSD__ +#include +#include +#endif + +#include +#include +#ifdef __FreeBSD__ +#include +#endif + +#include +#include +#include +#include +#include +#include + +#ifdef __FreeBSD__ +#include +#else +#include +#include +#endif +#include + +#if NPFSYNC > 0 +#include +#endif /* NPFSYNC > 0 */ + +#include + +#ifdef INET6 +#include +#include +#endif /* INET6 */ + +#ifdef ALTQ +#include +#endif + +#ifdef __FreeBSD__ +#include +#include +#include +#include +#endif /* __FreeBSD__ */ + +#ifdef __FreeBSD__ +void init_zone_var(void); +void cleanup_pf_zone(void); +int pfattach(void); +#else +void pfattach(int); +void pf_thread_create(void *); +int pfopen(dev_t, int, int, struct proc *); +int pfclose(dev_t, int, int, struct proc *); +#endif +struct pf_pool *pf_get_pool(char *, u_int32_t, u_int8_t, u_int32_t, + u_int8_t, u_int8_t, u_int8_t); + +void pf_mv_pool(struct pf_palist *, struct pf_palist *); +void pf_empty_pool(struct pf_palist *); +#ifdef __FreeBSD__ +int pfioctl(struct cdev *, u_long, caddr_t, int, struct thread *); +#else +int pfioctl(struct cdev *, u_long, caddr_t, int, struct proc *); +#endif +#ifdef ALTQ +int pf_begin_altq(u_int32_t *); +int pf_rollback_altq(u_int32_t); +int pf_commit_altq(u_int32_t); +int pf_enable_altq(struct pf_altq *); +int pf_disable_altq(struct pf_altq *); +#endif /* ALTQ */ +int pf_begin_rules(u_int32_t *, int, const char *); +int pf_rollback_rules(u_int32_t, int, char *); +int pf_setup_pfsync_matching(struct pf_ruleset *); +void pf_hash_rule(MD5_CTX *, struct pf_rule *); +void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *); +int pf_commit_rules(u_int32_t, int, char *); + +struct pf_rule pf_default_rule; +#ifdef __FreeBSD__ +struct sx pf_consistency_lock; +SX_SYSINIT(pf_consistency_lock, &pf_consistency_lock, "pf_statetbl_lock"); +#else +struct rwlock pf_consistency_lock = RWLOCK_INITIALIZER; +#endif +#ifdef ALTQ +static int pf_altq_running; +#endif + +#define TAGID_MAX 50000 +TAILQ_HEAD(pf_tags, pf_tagname) pf_tags = TAILQ_HEAD_INITIALIZER(pf_tags), + pf_qids = TAILQ_HEAD_INITIALIZER(pf_qids); + +#if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE) +#error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE +#endif +u_int16_t tagname2tag(struct pf_tags *, char *); +void tag2tagname(struct pf_tags *, u_int16_t, char *); +void tag_unref(struct pf_tags *, u_int16_t); +int pf_rtlabel_add(struct pf_addr_wrap *); +void pf_rtlabel_remove(struct pf_addr_wrap *); +void pf_rtlabel_copyout(struct pf_addr_wrap *); + +#define DPFPRINTF(n, x) if (pf_status.debug >= (n)) printf x + + +#ifdef __FreeBSD__ +static struct cdev *pf_dev; + +/* + * XXX - These are new and need to be checked when moving to a new version + */ +static void pf_clear_states(void); +static int pf_clear_tables(void); +static void pf_clear_srcnodes(void); +/* + * XXX - These are new and need to be checked when moving to a new version + */ + +/* + * Wrapper functions for pfil(9) hooks + */ +static 
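+/* + * XXX (editor's note): these wrappers only adapt pf's packet tests to the + * pfil(9) hook signature; hook_pf() and dehook_pf() below attach and detach + * them on the inet (and, with INET6, inet6) pfil heads, which is how + * DIOCSTART and DIOCSTOP splice pf into and out of the FreeBSD packet path. + */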
int pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp, + int dir, struct inpcb *inp); +static int pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp, + int dir, struct inpcb *inp); +#ifdef INET6 +static int pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, + int dir, struct inpcb *inp); +static int pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp, + int dir, struct inpcb *inp); +#endif + +static int hook_pf(void); +static int dehook_pf(void); +static int shutdown_pf(void); +static int pf_load(void); +static int pf_unload(void); + +static struct cdevsw pf_cdevsw = { + .d_ioctl = pfioctl, + .d_name = PF_NAME, + .d_version = D_VERSION, +}; + +static volatile int pf_pfil_hooked = 0; +int pf_end_threads = 0; +struct mtx pf_task_mtx; +pflog_packet_t *pflog_packet_ptr = NULL; + +int debug_pfugidhack = 0; +SYSCTL_INT(_debug, OID_AUTO, pfugidhack, CTLFLAG_RW, &debug_pfugidhack, 0, + "Enable/disable pf user/group rules mpsafe hack"); + +void +init_pf_mutex(void) +{ + mtx_init(&pf_task_mtx, "pf task mtx", NULL, MTX_DEF); +} + +void +destroy_pf_mutex(void) +{ + mtx_destroy(&pf_task_mtx); +} + +void +init_zone_var(void) +{ + pf_src_tree_pl = pf_rule_pl = NULL; + pf_state_pl = pf_altq_pl = pf_pooladdr_pl = NULL; + pf_frent_pl = pf_frag_pl = pf_cache_pl = pf_cent_pl = NULL; + pf_state_scrub_pl = NULL; + pfr_ktable_pl = pfr_kentry_pl = NULL; +} + +void +cleanup_pf_zone(void) +{ + UMA_DESTROY(pf_src_tree_pl); + UMA_DESTROY(pf_rule_pl); + UMA_DESTROY(pf_state_pl); + UMA_DESTROY(pf_altq_pl); + UMA_DESTROY(pf_pooladdr_pl); + UMA_DESTROY(pf_frent_pl); + UMA_DESTROY(pf_frag_pl); + UMA_DESTROY(pf_cache_pl); + UMA_DESTROY(pf_cent_pl); + UMA_DESTROY(pfr_ktable_pl); + UMA_DESTROY(pfr_kentry_pl2); + UMA_DESTROY(pfr_kentry_pl); + UMA_DESTROY(pf_state_scrub_pl); + UMA_DESTROY(pfi_addr_pl); +} + +int +pfattach(void) +{ + u_int32_t *my_timeout = pf_default_rule.timeout; + int error = 1; + + do { + UMA_CREATE(pf_src_tree_pl,struct pf_src_node, "pfsrctrpl"); + UMA_CREATE(pf_rule_pl, struct pf_rule, "pfrulepl"); + UMA_CREATE(pf_state_pl, struct pf_state, "pfstatepl"); + UMA_CREATE(pf_altq_pl, struct pf_altq, "pfaltqpl"); + UMA_CREATE(pf_pooladdr_pl, struct pf_pooladdr, "pfpooladdrpl"); + UMA_CREATE(pfr_ktable_pl, struct pfr_ktable, "pfrktable"); + UMA_CREATE(pfr_kentry_pl, struct pfr_kentry, "pfrkentry"); + UMA_CREATE(pfr_kentry_pl2, struct pfr_kentry, "pfrkentry2"); + UMA_CREATE(pf_frent_pl, struct pf_frent, "pffrent"); + UMA_CREATE(pf_frag_pl, struct pf_fragment, "pffrag"); + UMA_CREATE(pf_cache_pl, struct pf_fragment, "pffrcache"); + UMA_CREATE(pf_cent_pl, struct pf_frcache, "pffrcent"); + UMA_CREATE(pf_state_scrub_pl, struct pf_state_scrub, + "pfstatescrub"); + UMA_CREATE(pfi_addr_pl, struct pfi_dynaddr, "pfiaddrpl"); + error = 0; + } while(0); + if (error) { + cleanup_pf_zone(); + return (error); + } + pfr_initialize(); + pfi_initialize(); + if ( (error = pf_osfp_initialize()) ) { + cleanup_pf_zone(); + pf_osfp_cleanup(); + return (error); + } + + pf_pool_limits[PF_LIMIT_STATES].pp = pf_state_pl; + pf_pool_limits[PF_LIMIT_STATES].limit = PFSTATE_HIWAT; + pf_pool_limits[PF_LIMIT_SRC_NODES].pp = pf_src_tree_pl; + pf_pool_limits[PF_LIMIT_SRC_NODES].limit = PFSNODE_HIWAT; + pf_pool_limits[PF_LIMIT_FRAGS].pp = pf_frent_pl; + pf_pool_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT; + pf_pool_limits[PF_LIMIT_TABLES].pp = pfr_ktable_pl; + pf_pool_limits[PF_LIMIT_TABLES].limit = PFR_KTABLE_HIWAT; + pf_pool_limits[PF_LIMIT_TABLE_ENTRIES].pp = pfr_kentry_pl; + 
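+ /* + * XXX (editor's note): pf_pool_limits[] binds each PF_LIMIT_* knob to its + * backing UMA zone together with a default high-water mark (PFSTATE_HIWAT, + * PFSNODE_HIWAT, ...); the DIOCGETLIMIT/DIOCSETLIMIT ioctls read and update + * these, and uma_zone_set_max() below enforces the cap on the state zone. + */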
pf_pool_limits[PF_LIMIT_TABLE_ENTRIES].limit = PFR_KENTRY_HIWAT; + uma_zone_set_max(pf_pool_limits[PF_LIMIT_STATES].pp, + pf_pool_limits[PF_LIMIT_STATES].limit); + + RB_INIT(&tree_src_tracking); + RB_INIT(&pf_anchors); + pf_init_ruleset(&pf_main_ruleset); + TAILQ_INIT(&pf_altqs[0]); + TAILQ_INIT(&pf_altqs[1]); + TAILQ_INIT(&pf_pabuf); + pf_altqs_active = &pf_altqs[0]; + pf_altqs_inactive = &pf_altqs[1]; + TAILQ_INIT(&state_list); + + /* default rule should never be garbage collected */ + pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next; + pf_default_rule.action = PF_PASS; + pf_default_rule.nr = -1; + pf_default_rule.rtableid = -1; + + /* initialize default timeouts */ + my_timeout[PFTM_TCP_FIRST_PACKET] = PFTM_TCP_FIRST_PACKET_VAL; + my_timeout[PFTM_TCP_OPENING] = PFTM_TCP_OPENING_VAL; + my_timeout[PFTM_TCP_ESTABLISHED] = PFTM_TCP_ESTABLISHED_VAL; + my_timeout[PFTM_TCP_CLOSING] = PFTM_TCP_CLOSING_VAL; + my_timeout[PFTM_TCP_FIN_WAIT] = PFTM_TCP_FIN_WAIT_VAL; + my_timeout[PFTM_TCP_CLOSED] = PFTM_TCP_CLOSED_VAL; + my_timeout[PFTM_UDP_FIRST_PACKET] = PFTM_UDP_FIRST_PACKET_VAL; + my_timeout[PFTM_UDP_SINGLE] = PFTM_UDP_SINGLE_VAL; + my_timeout[PFTM_UDP_MULTIPLE] = PFTM_UDP_MULTIPLE_VAL; + my_timeout[PFTM_ICMP_FIRST_PACKET] = PFTM_ICMP_FIRST_PACKET_VAL; + my_timeout[PFTM_ICMP_ERROR_REPLY] = PFTM_ICMP_ERROR_REPLY_VAL; + my_timeout[PFTM_OTHER_FIRST_PACKET] = PFTM_OTHER_FIRST_PACKET_VAL; + my_timeout[PFTM_OTHER_SINGLE] = PFTM_OTHER_SINGLE_VAL; + my_timeout[PFTM_OTHER_MULTIPLE] = PFTM_OTHER_MULTIPLE_VAL; + my_timeout[PFTM_FRAG] = PFTM_FRAG_VAL; + my_timeout[PFTM_INTERVAL] = PFTM_INTERVAL_VAL; + my_timeout[PFTM_SRC_NODE] = PFTM_SRC_NODE_VAL; + my_timeout[PFTM_TS_DIFF] = PFTM_TS_DIFF_VAL; + my_timeout[PFTM_ADAPTIVE_START] = PFSTATE_ADAPT_START; + my_timeout[PFTM_ADAPTIVE_END] = PFSTATE_ADAPT_END; + + pf_normalize_init(); + bzero(&pf_status, sizeof(pf_status)); + pf_status.debug = PF_DEBUG_URGENT; + + pf_pfil_hooked = 0; + + /* XXX do our best to avoid a conflict */ + pf_status.hostid = arc4random(); + + if (kproc_create(pf_purge_thread, NULL, NULL, 0, 0, "pfpurge")) + return (ENXIO); + + return (error); +} +#else /* !__FreeBSD__ */ +void +pfattach(int num) +{ + u_int32_t *timeout = pf_default_rule.timeout; + + pool_init(&pf_rule_pl, sizeof(struct pf_rule), 0, 0, 0, "pfrulepl", + &pool_allocator_nointr); + pool_init(&pf_src_tree_pl, sizeof(struct pf_src_node), 0, 0, 0, + "pfsrctrpl", NULL); + pool_init(&pf_state_pl, sizeof(struct pf_state), 0, 0, 0, "pfstatepl", + NULL); + pool_init(&pf_altq_pl, sizeof(struct pf_altq), 0, 0, 0, "pfaltqpl", + &pool_allocator_nointr); + pool_init(&pf_pooladdr_pl, sizeof(struct pf_pooladdr), 0, 0, 0, + "pfpooladdrpl", &pool_allocator_nointr); + pfr_initialize(); + pfi_initialize(); + pf_osfp_initialize(); + + pool_sethardlimit(pf_pool_limits[PF_LIMIT_STATES].pp, + pf_pool_limits[PF_LIMIT_STATES].limit, NULL, 0); + + if (ctob(physmem) <= 100*1024*1024) + pf_pool_limits[PF_LIMIT_TABLE_ENTRIES].limit = + PFR_KENTRY_HIWAT_SMALL; + + RB_INIT(&tree_src_tracking); + RB_INIT(&pf_anchors); + pf_init_ruleset(&pf_main_ruleset); + TAILQ_INIT(&pf_altqs[0]); + TAILQ_INIT(&pf_altqs[1]); + TAILQ_INIT(&pf_pabuf); + pf_altqs_active = &pf_altqs[0]; + pf_altqs_inactive = &pf_altqs[1]; + TAILQ_INIT(&state_list); + + /* default rule should never be garbage collected */ + pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next; + pf_default_rule.action = PF_PASS; + pf_default_rule.nr = -1; + pf_default_rule.rtableid = -1; + + /* initialize default timeouts */ + 
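+ /* + * XXX (editor's note): each PFTM_*_VAL below is the default number of + * seconds a state entry may remain in the corresponding protocol phase + * before it is purged (e.g. a TCP state that has seen only the initial + * packet lives PFTM_TCP_FIRST_PACKET_VAL seconds); DIOCSETTIMEOUT can + * override these defaults at run time. + */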
timeout[PFTM_TCP_FIRST_PACKET] = PFTM_TCP_FIRST_PACKET_VAL; + timeout[PFTM_TCP_OPENING] = PFTM_TCP_OPENING_VAL; + timeout[PFTM_TCP_ESTABLISHED] = PFTM_TCP_ESTABLISHED_VAL; + timeout[PFTM_TCP_CLOSING] = PFTM_TCP_CLOSING_VAL; + timeout[PFTM_TCP_FIN_WAIT] = PFTM_TCP_FIN_WAIT_VAL; + timeout[PFTM_TCP_CLOSED] = PFTM_TCP_CLOSED_VAL; + timeout[PFTM_UDP_FIRST_PACKET] = PFTM_UDP_FIRST_PACKET_VAL; + timeout[PFTM_UDP_SINGLE] = PFTM_UDP_SINGLE_VAL; + timeout[PFTM_UDP_MULTIPLE] = PFTM_UDP_MULTIPLE_VAL; + timeout[PFTM_ICMP_FIRST_PACKET] = PFTM_ICMP_FIRST_PACKET_VAL; + timeout[PFTM_ICMP_ERROR_REPLY] = PFTM_ICMP_ERROR_REPLY_VAL; + timeout[PFTM_OTHER_FIRST_PACKET] = PFTM_OTHER_FIRST_PACKET_VAL; + timeout[PFTM_OTHER_SINGLE] = PFTM_OTHER_SINGLE_VAL; + timeout[PFTM_OTHER_MULTIPLE] = PFTM_OTHER_MULTIPLE_VAL; + timeout[PFTM_FRAG] = PFTM_FRAG_VAL; + timeout[PFTM_INTERVAL] = PFTM_INTERVAL_VAL; + timeout[PFTM_SRC_NODE] = PFTM_SRC_NODE_VAL; + timeout[PFTM_TS_DIFF] = PFTM_TS_DIFF_VAL; + timeout[PFTM_ADAPTIVE_START] = PFSTATE_ADAPT_START; + timeout[PFTM_ADAPTIVE_END] = PFSTATE_ADAPT_END; + + pf_normalize_init(); + bzero(&pf_status, sizeof(pf_status)); + pf_status.debug = PF_DEBUG_URGENT; + + /* XXX do our best to avoid a conflict */ + pf_status.hostid = arc4random(); + + /* require process context to purge states, so perform in a thread */ + kproc_create_deferred(pf_thread_create, NULL); +} + +void +pf_thread_create(void *v) +{ + if (kproc_create(pf_purge_thread, NULL, NULL, "pfpurge")) + panic("pfpurge thread"); +} + +int +pfopen(struct cdev *dev, int flags, int fmt, struct proc *p) +{ + if (dev2unit(dev) >= 1) + return (ENXIO); + return (0); +} + +int +pfclose(struct cdev *dev, int flags, int fmt, struct proc *p) +{ + if (dev2unit(dev) >= 1) + return (ENXIO); + return (0); +} +#endif /* __FreeBSD__ */ + +struct pf_pool * +pf_get_pool(char *anchor, u_int32_t ticket, u_int8_t rule_action, + u_int32_t rule_number, u_int8_t r_last, u_int8_t active, + u_int8_t check_ticket) +{ + struct pf_ruleset *ruleset; + struct pf_rule *rule; + int rs_num; + + ruleset = pf_find_ruleset(anchor); + if (ruleset == NULL) + return (NULL); + rs_num = pf_get_ruleset_number(rule_action); + if (rs_num >= PF_RULESET_MAX) + return (NULL); + if (active) { + if (check_ticket && ticket != + ruleset->rules[rs_num].active.ticket) + return (NULL); + if (r_last) + rule = TAILQ_LAST(ruleset->rules[rs_num].active.ptr, + pf_rulequeue); + else + rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); + } else { + if (check_ticket && ticket != + ruleset->rules[rs_num].inactive.ticket) + return (NULL); + if (r_last) + rule = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr, + pf_rulequeue); + else + rule = TAILQ_FIRST(ruleset->rules[rs_num].inactive.ptr); + } + if (!r_last) { + while ((rule != NULL) && (rule->nr != rule_number)) + rule = TAILQ_NEXT(rule, entries); + } + if (rule == NULL) + return (NULL); + + return (&rule->rpool); +} + +void +pf_mv_pool(struct pf_palist *poola, struct pf_palist *poolb) +{ + struct pf_pooladdr *mv_pool_pa; + + while ((mv_pool_pa = TAILQ_FIRST(poola)) != NULL) { + TAILQ_REMOVE(poola, mv_pool_pa, entries); + TAILQ_INSERT_TAIL(poolb, mv_pool_pa, entries); + } +} + +void +pf_empty_pool(struct pf_palist *poola) +{ + struct pf_pooladdr *empty_pool_pa; + + while ((empty_pool_pa = TAILQ_FIRST(poola)) != NULL) { + pfi_dynaddr_remove(&empty_pool_pa->addr); + pf_tbladdr_remove(&empty_pool_pa->addr); + pfi_kif_unref(empty_pool_pa->kif, PFI_KIF_REF_RULE); + TAILQ_REMOVE(poola, empty_pool_pa, entries); + pool_put(&pf_pooladdr_pl, 
empty_pool_pa); + } +} + +void +pf_rm_rule(struct pf_rulequeue *rulequeue, struct pf_rule *rule) +{ + if (rulequeue != NULL) { + if (rule->states <= 0) { + /* + * XXX - we need to remove the table *before* detaching + * the rule to make sure the table code does not delete + * the anchor under our feet. + */ + pf_tbladdr_remove(&rule->src.addr); + pf_tbladdr_remove(&rule->dst.addr); + if (rule->overload_tbl) + pfr_detach_table(rule->overload_tbl); + } + TAILQ_REMOVE(rulequeue, rule, entries); + rule->entries.tqe_prev = NULL; + rule->nr = -1; + } + + if (rule->states > 0 || rule->src_nodes > 0 || + rule->entries.tqe_prev != NULL) + return; + pf_tag_unref(rule->tag); + pf_tag_unref(rule->match_tag); +#ifdef ALTQ + if (rule->pqid != rule->qid) + pf_qid_unref(rule->pqid); + pf_qid_unref(rule->qid); +#endif + pf_rtlabel_remove(&rule->src.addr); + pf_rtlabel_remove(&rule->dst.addr); + pfi_dynaddr_remove(&rule->src.addr); + pfi_dynaddr_remove(&rule->dst.addr); + if (rulequeue == NULL) { + pf_tbladdr_remove(&rule->src.addr); + pf_tbladdr_remove(&rule->dst.addr); + if (rule->overload_tbl) + pfr_detach_table(rule->overload_tbl); + } + pfi_kif_unref(rule->kif, PFI_KIF_REF_RULE); + pf_anchor_remove(rule); + pf_empty_pool(&rule->rpool.list); + pool_put(&pf_rule_pl, rule); +} + +u_int16_t +tagname2tag(struct pf_tags *head, char *tagname) +{ + struct pf_tagname *tag, *p = NULL; + u_int16_t new_tagid = 1; + + TAILQ_FOREACH(tag, head, entries) + if (strcmp(tagname, tag->name) == 0) { + tag->ref++; + return (tag->tag); + } + + /* + * to avoid fragmentation, we do a linear search from the beginning + * and take the first free slot we find. if there is none or the list + * is empty, append a new entry at the end. + */ + + /* new entry */ + if (!TAILQ_EMPTY(head)) + for (p = TAILQ_FIRST(head); p != NULL && + p->tag == new_tagid; p = TAILQ_NEXT(p, entries)) + new_tagid = p->tag + 1; + + if (new_tagid > TAGID_MAX) + return (0); + + /* allocate and fill new struct pf_tagname */ + tag = (struct pf_tagname *)malloc(sizeof(struct pf_tagname), + M_TEMP, M_NOWAIT); + if (tag == NULL) + return (0); + bzero(tag, sizeof(struct pf_tagname)); + strlcpy(tag->name, tagname, sizeof(tag->name)); + tag->tag = new_tagid; + tag->ref++; + + if (p != NULL) /* insert new entry before p */ + TAILQ_INSERT_BEFORE(p, tag, entries); + else /* either list empty or no free slot in between */ + TAILQ_INSERT_TAIL(head, tag, entries); + + return (tag->tag); +} + +void +tag2tagname(struct pf_tags *head, u_int16_t tagid, char *p) +{ + struct pf_tagname *tag; + + TAILQ_FOREACH(tag, head, entries) + if (tag->tag == tagid) { + strlcpy(p, tag->name, PF_TAG_NAME_SIZE); + return; + } +} + +void +tag_unref(struct pf_tags *head, u_int16_t tag) +{ + struct pf_tagname *p, *next; + + if (tag == 0) + return; + + for (p = TAILQ_FIRST(head); p != NULL; p = next) { + next = TAILQ_NEXT(p, entries); + if (tag == p->tag) { + if (--p->ref == 0) { + TAILQ_REMOVE(head, p, entries); + free(p, M_TEMP); + } + break; + } + } +} + +u_int16_t +pf_tagname2tag(char *tagname) +{ + return (tagname2tag(&pf_tags, tagname)); +} + +void +pf_tag2tagname(u_int16_t tagid, char *p) +{ + tag2tagname(&pf_tags, tagid, p); +} + +void +pf_tag_ref(u_int16_t tag) +{ + struct pf_tagname *t; + + TAILQ_FOREACH(t, &pf_tags, entries) + if (t->tag == tag) + break; + if (t != NULL) + t->ref++; +} + +void +pf_tag_unref(u_int16_t tag) +{ + tag_unref(&pf_tags, tag); +} + +int +pf_rtlabel_add(struct pf_addr_wrap *a) +{ +#ifdef __FreeBSD__ + /* XXX_IMPORT: later */ + return (0); +#else + if (a->type == 
PF_ADDR_RTLABEL && + (a->v.rtlabel = rtlabel_name2id(a->v.rtlabelname)) == 0) + return (-1); + return (0); +#endif +} + +void +pf_rtlabel_remove(struct pf_addr_wrap *a) +{ +#ifdef __FreeBSD__ + /* XXX_IMPORT: later */ +#else + if (a->type == PF_ADDR_RTLABEL) + rtlabel_unref(a->v.rtlabel); +#endif +} + +void +pf_rtlabel_copyout(struct pf_addr_wrap *a) +{ +#ifdef __FreeBSD__ + /* XXX_IMPORT: later */ + if (a->type == PF_ADDR_RTLABEL && a->v.rtlabel) + strlcpy(a->v.rtlabelname, "?", sizeof(a->v.rtlabelname)); +#else + const char *name; + + if (a->type == PF_ADDR_RTLABEL && a->v.rtlabel) { + if ((name = rtlabel_id2name(a->v.rtlabel)) == NULL) + strlcpy(a->v.rtlabelname, "?", + sizeof(a->v.rtlabelname)); + else + strlcpy(a->v.rtlabelname, name, + sizeof(a->v.rtlabelname)); + } +#endif +} + +#ifdef ALTQ +u_int32_t +pf_qname2qid(char *qname) +{ + return ((u_int32_t)tagname2tag(&pf_qids, qname)); +} + +void +pf_qid2qname(u_int32_t qid, char *p) +{ + tag2tagname(&pf_qids, (u_int16_t)qid, p); +} + +void +pf_qid_unref(u_int32_t qid) +{ + tag_unref(&pf_qids, (u_int16_t)qid); +} + +int +pf_begin_altq(u_int32_t *ticket) +{ + struct pf_altq *altq; + int error = 0; + + /* Purge the old altq list */ + while ((altq = TAILQ_FIRST(pf_altqs_inactive)) != NULL) { + TAILQ_REMOVE(pf_altqs_inactive, altq, entries); +#ifdef __FreeBSD__ + if (altq->qname[0] == 0 && + (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { +#else + if (altq->qname[0] == 0) { +#endif + /* detach and destroy the discipline */ + error = altq_remove(altq); + } else + pf_qid_unref(altq->qid); + pool_put(&pf_altq_pl, altq); + } + if (error) + return (error); + *ticket = ++ticket_altqs_inactive; + altqs_inactive_open = 1; + return (0); +} + +int +pf_rollback_altq(u_int32_t ticket) +{ + struct pf_altq *altq; + int error = 0; + + if (!altqs_inactive_open || ticket != ticket_altqs_inactive) + return (0); + /* Purge the old altq list */ + while ((altq = TAILQ_FIRST(pf_altqs_inactive)) != NULL) { + TAILQ_REMOVE(pf_altqs_inactive, altq, entries); +#ifdef __FreeBSD__ + if (altq->qname[0] == 0 && + (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { +#else + if (altq->qname[0] == 0) { +#endif + /* detach and destroy the discipline */ + error = altq_remove(altq); + } else + pf_qid_unref(altq->qid); + pool_put(&pf_altq_pl, altq); + } + altqs_inactive_open = 0; + return (error); +} + +int +pf_commit_altq(u_int32_t ticket) +{ + struct pf_altqqueue *old_altqs; + struct pf_altq *altq; + int s, err, error = 0; + + if (!altqs_inactive_open || ticket != ticket_altqs_inactive) + return (EBUSY); + + /* swap altqs, keep the old. 
*/ + s = splsoftnet(); + old_altqs = pf_altqs_active; + pf_altqs_active = pf_altqs_inactive; + pf_altqs_inactive = old_altqs; + ticket_altqs_active = ticket_altqs_inactive; + + /* Attach new disciplines */ + TAILQ_FOREACH(altq, pf_altqs_active, entries) { +#ifdef __FreeBSD__ + if (altq->qname[0] == 0 && + (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { +#else + if (altq->qname[0] == 0) { +#endif + /* attach the discipline */ + error = altq_pfattach(altq); + if (error == 0 && pf_altq_running) + error = pf_enable_altq(altq); + if (error != 0) { + splx(s); + return (error); + } + } + } + + /* Purge the old altq list */ + while ((altq = TAILQ_FIRST(pf_altqs_inactive)) != NULL) { + TAILQ_REMOVE(pf_altqs_inactive, altq, entries); +#ifdef __FreeBSD__ + if (altq->qname[0] == 0 && + (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { +#else + if (altq->qname[0] == 0) { +#endif + /* detach and destroy the discipline */ + if (pf_altq_running) + error = pf_disable_altq(altq); + err = altq_pfdetach(altq); + if (err != 0 && error == 0) + error = err; + err = altq_remove(altq); + if (err != 0 && error == 0) + error = err; + } else + pf_qid_unref(altq->qid); + pool_put(&pf_altq_pl, altq); + } + splx(s); + + altqs_inactive_open = 0; + return (error); +} + +int +pf_enable_altq(struct pf_altq *altq) +{ + struct ifnet *ifp; + struct tb_profile tb; + int s, error = 0; + + if ((ifp = ifunit(altq->ifname)) == NULL) + return (EINVAL); + + if (ifp->if_snd.altq_type != ALTQT_NONE) + error = altq_enable(&ifp->if_snd); + + /* set tokenbucket regulator */ + if (error == 0 && ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { + tb.rate = altq->ifbandwidth; + tb.depth = altq->tbrsize; + s = splnet(); +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + error = tbr_set(&ifp->if_snd, &tb); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + splx(s); + } + + return (error); +} + +int +pf_disable_altq(struct pf_altq *altq) +{ + struct ifnet *ifp; + struct tb_profile tb; + int s, error; + + if ((ifp = ifunit(altq->ifname)) == NULL) + return (EINVAL); + + /* + * when the discipline is no longer referenced, it was overridden + * by a new one. if so, just return. 
+ */ + if (altq->altq_disc != ifp->if_snd.altq_disc) + return (0); + + error = altq_disable(&ifp->if_snd); + + if (error == 0) { + /* clear tokenbucket regulator */ + tb.rate = 0; + s = splnet(); +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + error = tbr_set(&ifp->if_snd, &tb); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + splx(s); + } + + return (error); +} + +#ifdef __FreeBSD__ +void +pf_altq_ifnet_event(struct ifnet *ifp, int remove) +{ + struct ifnet *ifp1; + struct pf_altq *a1, *a2, *a3; + u_int32_t ticket; + int error = 0; + + /* Interrupt userland queue modifications */ + if (altqs_inactive_open) + pf_rollback_altq(ticket_altqs_inactive); + + /* Start new altq ruleset */ + if (pf_begin_altq(&ticket)) + return; + + /* Copy the current active set */ + TAILQ_FOREACH(a1, pf_altqs_active, entries) { + a2 = pool_get(&pf_altq_pl, PR_NOWAIT); + if (a2 == NULL) { + error = ENOMEM; + break; + } + bcopy(a1, a2, sizeof(struct pf_altq)); + + if (a2->qname[0] != 0) { + if ((a2->qid = pf_qname2qid(a2->qname)) == 0) { + error = EBUSY; + pool_put(&pf_altq_pl, a2); + break; + } + a2->altq_disc = NULL; + TAILQ_FOREACH(a3, pf_altqs_inactive, entries) { + if (strncmp(a3->ifname, a2->ifname, + IFNAMSIZ) == 0 && a3->qname[0] == 0) { + a2->altq_disc = a3->altq_disc; + break; + } + } + } + /* Deactivate the interface in question */ + a2->local_flags &= ~PFALTQ_FLAG_IF_REMOVED; + if ((ifp1 = ifunit(a2->ifname)) == NULL || + (remove && ifp1 == ifp)) { + a2->local_flags |= PFALTQ_FLAG_IF_REMOVED; + } else { + PF_UNLOCK(); + error = altq_add(a2); + PF_LOCK(); + + if (ticket != ticket_altqs_inactive) + error = EBUSY; + + if (error) { + pool_put(&pf_altq_pl, a2); + break; + } + } + + TAILQ_INSERT_TAIL(pf_altqs_inactive, a2, entries); + } + + if (error != 0) + pf_rollback_altq(ticket); + else + pf_commit_altq(ticket); +} +#endif +#endif /* ALTQ */ + +int +pf_begin_rules(u_int32_t *ticket, int rs_num, const char *anchor) +{ + struct pf_ruleset *rs; + struct pf_rule *rule; + + if (rs_num < 0 || rs_num >= PF_RULESET_MAX) + return (EINVAL); + rs = pf_find_or_create_ruleset(anchor); + if (rs == NULL) + return (EINVAL); + while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) { + pf_rm_rule(rs->rules[rs_num].inactive.ptr, rule); + rs->rules[rs_num].inactive.rcount--; + } + *ticket = ++rs->rules[rs_num].inactive.ticket; + rs->rules[rs_num].inactive.open = 1; + return (0); +} + +int +pf_rollback_rules(u_int32_t ticket, int rs_num, char *anchor) +{ + struct pf_ruleset *rs; + struct pf_rule *rule; + + if (rs_num < 0 || rs_num >= PF_RULESET_MAX) + return (EINVAL); + rs = pf_find_ruleset(anchor); + if (rs == NULL || !rs->rules[rs_num].inactive.open || + rs->rules[rs_num].inactive.ticket != ticket) + return (0); + while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) { + pf_rm_rule(rs->rules[rs_num].inactive.ptr, rule); + rs->rules[rs_num].inactive.rcount--; + } + rs->rules[rs_num].inactive.open = 0; + return (0); +} + +#define PF_MD5_UPD(st, elm) \ + MD5Update(ctx, (u_int8_t *) &(st)->elm, sizeof((st)->elm)) + +#define PF_MD5_UPD_STR(st, elm) \ + MD5Update(ctx, (u_int8_t *) (st)->elm, strlen((st)->elm)) + +#define PF_MD5_UPD_HTONL(st, elm, stor) do { \ + (stor) = htonl((st)->elm); \ + MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int32_t));\ +} while (0) + +#define PF_MD5_UPD_HTONS(st, elm, stor) do { \ + (stor) = htons((st)->elm); \ + MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int16_t));\ +} while (0) + +void +pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr) +{ + PF_MD5_UPD(pfr, 
addr.type); + switch (pfr->addr.type) { + case PF_ADDR_DYNIFTL: + PF_MD5_UPD(pfr, addr.v.ifname); + PF_MD5_UPD(pfr, addr.iflags); + break; + case PF_ADDR_TABLE: + PF_MD5_UPD(pfr, addr.v.tblname); + break; + case PF_ADDR_ADDRMASK: + /* XXX ignore af? */ + PF_MD5_UPD(pfr, addr.v.a.addr.addr32); + PF_MD5_UPD(pfr, addr.v.a.mask.addr32); + break; + case PF_ADDR_RTLABEL: + PF_MD5_UPD(pfr, addr.v.rtlabelname); + break; + } + + PF_MD5_UPD(pfr, port[0]); + PF_MD5_UPD(pfr, port[1]); + PF_MD5_UPD(pfr, neg); + PF_MD5_UPD(pfr, port_op); +} + +void +pf_hash_rule(MD5_CTX *ctx, struct pf_rule *rule) +{ + u_int16_t x; + u_int32_t y; + + pf_hash_rule_addr(ctx, &rule->src); + pf_hash_rule_addr(ctx, &rule->dst); + PF_MD5_UPD_STR(rule, label); + PF_MD5_UPD_STR(rule, ifname); + PF_MD5_UPD_STR(rule, match_tagname); + PF_MD5_UPD_HTONS(rule, match_tag, x); /* dup? */ + PF_MD5_UPD_HTONL(rule, os_fingerprint, y); + PF_MD5_UPD_HTONL(rule, prob, y); + PF_MD5_UPD_HTONL(rule, uid.uid[0], y); + PF_MD5_UPD_HTONL(rule, uid.uid[1], y); + PF_MD5_UPD(rule, uid.op); + PF_MD5_UPD_HTONL(rule, gid.gid[0], y); + PF_MD5_UPD_HTONL(rule, gid.gid[1], y); + PF_MD5_UPD(rule, gid.op); + PF_MD5_UPD_HTONL(rule, rule_flag, y); + PF_MD5_UPD(rule, action); + PF_MD5_UPD(rule, direction); + PF_MD5_UPD(rule, af); + PF_MD5_UPD(rule, quick); + PF_MD5_UPD(rule, ifnot); + PF_MD5_UPD(rule, match_tag_not); + PF_MD5_UPD(rule, natpass); + PF_MD5_UPD(rule, keep_state); + PF_MD5_UPD(rule, proto); + PF_MD5_UPD(rule, type); + PF_MD5_UPD(rule, code); + PF_MD5_UPD(rule, flags); + PF_MD5_UPD(rule, flagset); + PF_MD5_UPD(rule, allow_opts); + PF_MD5_UPD(rule, rt); + PF_MD5_UPD(rule, tos); +} + +int +pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) +{ + struct pf_ruleset *rs; + struct pf_rule *rule, **old_array; + struct pf_rulequeue *old_rules; + int s, error; + u_int32_t old_rcount; + + if (rs_num < 0 || rs_num >= PF_RULESET_MAX) + return (EINVAL); + rs = pf_find_ruleset(anchor); + if (rs == NULL || !rs->rules[rs_num].inactive.open || + ticket != rs->rules[rs_num].inactive.ticket) + return (EBUSY); + + /* Calculate checksum for the main ruleset */ + if (rs == &pf_main_ruleset) { + error = pf_setup_pfsync_matching(rs); + if (error != 0) + return (error); + } + + /* Swap rules, keep the old. */ + s = splsoftnet(); + old_rules = rs->rules[rs_num].active.ptr; + old_rcount = rs->rules[rs_num].active.rcount; + old_array = rs->rules[rs_num].active.ptr_array; + + rs->rules[rs_num].active.ptr = + rs->rules[rs_num].inactive.ptr; + rs->rules[rs_num].active.ptr_array = + rs->rules[rs_num].inactive.ptr_array; + rs->rules[rs_num].active.rcount = + rs->rules[rs_num].inactive.rcount; + rs->rules[rs_num].inactive.ptr = old_rules; + rs->rules[rs_num].inactive.ptr_array = old_array; + rs->rules[rs_num].inactive.rcount = old_rcount; + + rs->rules[rs_num].active.ticket = + rs->rules[rs_num].inactive.ticket; + pf_calc_skip_steps(rs->rules[rs_num].active.ptr); + + + /* Purge the old rule list. 
*/ + while ((rule = TAILQ_FIRST(old_rules)) != NULL) + pf_rm_rule(old_rules, rule); + if (rs->rules[rs_num].inactive.ptr_array) + free(rs->rules[rs_num].inactive.ptr_array, M_TEMP); + rs->rules[rs_num].inactive.ptr_array = NULL; + rs->rules[rs_num].inactive.rcount = 0; + rs->rules[rs_num].inactive.open = 0; + pf_remove_if_empty_ruleset(rs); + splx(s); + return (0); +} + +int +pf_setup_pfsync_matching(struct pf_ruleset *rs) +{ + MD5_CTX ctx; + struct pf_rule *rule; + int rs_cnt; + u_int8_t digest[PF_MD5_DIGEST_LENGTH]; + + MD5Init(&ctx); + for (rs_cnt = 0; rs_cnt < PF_RULESET_MAX; rs_cnt++) { + /* XXX PF_RULESET_SCRUB as well? */ + if (rs_cnt == PF_RULESET_SCRUB) + continue; + + if (rs->rules[rs_cnt].inactive.ptr_array) + free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP); + rs->rules[rs_cnt].inactive.ptr_array = NULL; + + if (rs->rules[rs_cnt].inactive.rcount) { + rs->rules[rs_cnt].inactive.ptr_array = + malloc(sizeof(caddr_t) * + rs->rules[rs_cnt].inactive.rcount, + M_TEMP, M_NOWAIT); + + if (!rs->rules[rs_cnt].inactive.ptr_array) + return (ENOMEM); + } + + TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr, + entries) { + pf_hash_rule(&ctx, rule); + (rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule; + } + } + + MD5Final(digest, &ctx); + memcpy(pf_status.pf_chksum, digest, sizeof(pf_status.pf_chksum)); + return (0); +} + +int +#ifdef __FreeBSD__ +pfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) +#else +pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) +#endif +{ + struct pf_pooladdr *pa = NULL; + struct pf_pool *pool = NULL; +#ifndef __FreeBSD__ + int s; +#endif + int error = 0; + + /* XXX keep in sync with switch() below */ +#ifdef __FreeBSD__ + if (securelevel_gt(td->td_ucred, 2)) +#else + if (securelevel > 1) +#endif + switch (cmd) { + case DIOCGETRULES: + case DIOCGETRULE: + case DIOCGETADDRS: + case DIOCGETADDR: + case DIOCGETSTATE: + case DIOCSETSTATUSIF: + case DIOCGETSTATUS: + case DIOCCLRSTATUS: + case DIOCNATLOOK: + case DIOCSETDEBUG: + case DIOCGETSTATES: + case DIOCGETTIMEOUT: + case DIOCCLRRULECTRS: + case DIOCGETLIMIT: + case DIOCGETALTQS: + case DIOCGETALTQ: + case DIOCGETQSTATS: + case DIOCGETRULESETS: + case DIOCGETRULESET: + case DIOCRGETTABLES: + case DIOCRGETTSTATS: + case DIOCRCLRTSTATS: + case DIOCRCLRADDRS: + case DIOCRADDADDRS: + case DIOCRDELADDRS: + case DIOCRSETADDRS: + case DIOCRGETADDRS: + case DIOCRGETASTATS: + case DIOCRCLRASTATS: + case DIOCRTSTADDRS: + case DIOCOSFPGET: + case DIOCGETSRCNODES: + case DIOCCLRSRCNODES: + case DIOCIGETIFACES: +#ifdef __FreeBSD__ + case DIOCGIFSPEED: +#endif + case DIOCSETIFFLAG: + case DIOCCLRIFFLAG: + break; + case DIOCRCLRTABLES: + case DIOCRADDTABLES: + case DIOCRDELTABLES: + case DIOCRSETTFLAGS: + if (((struct pfioc_table *)addr)->pfrio_flags & + PFR_FLAG_DUMMY) + break; /* dummy operation ok */ + return (EPERM); + default: + return (EPERM); + } + + if (!(flags & FWRITE)) + switch (cmd) { + case DIOCGETRULES: + case DIOCGETADDRS: + case DIOCGETADDR: + case DIOCGETSTATE: + case DIOCGETSTATUS: + case DIOCGETSTATES: + case DIOCGETTIMEOUT: + case DIOCGETLIMIT: + case DIOCGETALTQS: + case DIOCGETALTQ: + case DIOCGETQSTATS: + case DIOCGETRULESETS: + case DIOCGETRULESET: + case DIOCNATLOOK: + case DIOCRGETTABLES: + case DIOCRGETTSTATS: + case DIOCRGETADDRS: + case DIOCRGETASTATS: + case DIOCRTSTADDRS: + case DIOCOSFPGET: + case DIOCGETSRCNODES: + case DIOCIGETIFACES: +#ifdef __FreeBSD__ + case DIOCGIFSPEED: +#endif + break; + case DIOCRCLRTABLES: + case DIOCRADDTABLES: 
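+ /* + * XXX (editor's note): the commands in this group mutate tables, so + * without FWRITE they are tolerated only when PFR_FLAG_DUMMY is set, + * i.e. when the caller asked for a no-op dry run; any command not listed + * in this switch is rejected outright with EACCES. + */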
+	case DIOCRDELTABLES:
+	case DIOCRCLRTSTATS:
+	case DIOCRCLRADDRS:
+	case DIOCRADDADDRS:
+	case DIOCRDELADDRS:
+	case DIOCRSETADDRS:
+	case DIOCRSETTFLAGS:
+		if (((struct pfioc_table *)addr)->pfrio_flags &
+		    PFR_FLAG_DUMMY) {
+			flags |= FWRITE; /* need write lock for dummy */
+			break; /* dummy operation ok */
+		}
+		return (EACCES);
+	case DIOCGETRULE:
+		if (((struct pfioc_rule *)addr)->action == PF_GET_CLR_CNTR)
+			return (EACCES);
+		break;
+	default:
+		return (EACCES);
+	}
+
+	if (flags & FWRITE)
+#ifdef __FreeBSD__
+		sx_xlock(&pf_consistency_lock);
+	else
+		sx_slock(&pf_consistency_lock);
+#else
+		rw_enter_write(&pf_consistency_lock);
+	else
+		rw_enter_read(&pf_consistency_lock);
+#endif
+
+#ifdef __FreeBSD__
+	PF_LOCK();
+#else
+	s = splsoftnet();
+#endif
+	switch (cmd) {
+
+	case DIOCSTART:
+		if (pf_status.running)
+			error = EEXIST;
+		else {
+#ifdef __FreeBSD__
+			PF_UNLOCK();
+			error = hook_pf();
+			PF_LOCK();
+			if (error) {
+				DPFPRINTF(PF_DEBUG_MISC,
+				    ("pf: pfil registration failed\n"));
+				break;
+			}
+#endif
+			pf_status.running = 1;
+			pf_status.since = time_second;
+			if (pf_status.stateid == 0) {
+				pf_status.stateid = time_second;
+				pf_status.stateid = pf_status.stateid << 32;
+			}
+			DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n"));
+		}
+		break;
+
+	case DIOCSTOP:
+		if (!pf_status.running)
+			error = ENOENT;
+		else {
+			pf_status.running = 0;
+#ifdef __FreeBSD__
+			PF_UNLOCK();
+			error = dehook_pf();
+			PF_LOCK();
+			if (error) {
+				pf_status.running = 1;
+				DPFPRINTF(PF_DEBUG_MISC,
+				    ("pf: pfil unregistration failed\n"));
+			}
+#endif
+			pf_status.since = time_second;
+			DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n"));
+		}
+		break;
+
+	case DIOCADDRULE: {
+		struct pfioc_rule *pr = (struct pfioc_rule *)addr;
+		struct pf_ruleset *ruleset;
+		struct pf_rule *rule, *tail;
+		struct pf_pooladdr *pa;
+		int rs_num;
+
+		pr->anchor[sizeof(pr->anchor) - 1] = 0;
+		ruleset = pf_find_ruleset(pr->anchor);
+		if (ruleset == NULL) {
+			error = EINVAL;
+			break;
+		}
+		rs_num = pf_get_ruleset_number(pr->rule.action);
+		if (rs_num >= PF_RULESET_MAX) {
+			error = EINVAL;
+			break;
+		}
+		if (pr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
+			error = EINVAL;
+			break;
+		}
+		if (pr->ticket != ruleset->rules[rs_num].inactive.ticket) {
+#ifdef __FreeBSD__
+			DPFPRINTF(PF_DEBUG_MISC,
+			    ("ticket: %d != [%d]%d\n", pr->ticket, rs_num,
+			    ruleset->rules[rs_num].inactive.ticket));
+#endif
+			error = EBUSY;
+			break;
+		}
+		if (pr->pool_ticket != ticket_pabuf) {
+#ifdef __FreeBSD__
+			DPFPRINTF(PF_DEBUG_MISC,
+			    ("pool_ticket: %d != %d\n", pr->pool_ticket,
+			    ticket_pabuf));
+#endif
+			error = EBUSY;
+			break;
+		}
+		rule = pool_get(&pf_rule_pl, PR_NOWAIT);
+		if (rule == NULL) {
+			error = ENOMEM;
+			break;
+		}
+		bcopy(&pr->rule, rule, sizeof(struct pf_rule));
+#ifdef __FreeBSD__
+		rule->cuid = td->td_ucred->cr_ruid;
+		rule->cpid = td->td_proc ?
td->td_proc->p_pid : 0; +#else + rule->cuid = p->p_cred->p_ruid; + rule->cpid = p->p_pid; +#endif + rule->anchor = NULL; + rule->kif = NULL; + TAILQ_INIT(&rule->rpool.list); + /* initialize refcounting */ + rule->states = 0; + rule->src_nodes = 0; + rule->entries.tqe_prev = NULL; +#ifndef INET + if (rule->af == AF_INET) { + pool_put(&pf_rule_pl, rule); + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#ifndef INET6 + if (rule->af == AF_INET6) { + pool_put(&pf_rule_pl, rule); + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + tail = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr, + pf_rulequeue); + if (tail) + rule->nr = tail->nr + 1; + else + rule->nr = 0; + if (rule->ifname[0]) { + rule->kif = pfi_kif_get(rule->ifname); + if (rule->kif == NULL) { + pool_put(&pf_rule_pl, rule); + error = EINVAL; + break; + } + pfi_kif_ref(rule->kif, PFI_KIF_REF_RULE); + } + +#ifdef __FreeBSD__ /* ROUTING */ + if (rule->rtableid > 0 && rule->rtableid > rt_numfibs) +#else + if (rule->rtableid > 0 && !rtable_exists(rule->rtableid)) +#endif + error = EBUSY; + +#ifdef ALTQ + /* set queue IDs */ + if (rule->qname[0] != 0) { + if ((rule->qid = pf_qname2qid(rule->qname)) == 0) + error = EBUSY; + else if (rule->pqname[0] != 0) { + if ((rule->pqid = + pf_qname2qid(rule->pqname)) == 0) + error = EBUSY; + } else + rule->pqid = rule->qid; + } +#endif + if (rule->tagname[0]) + if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0) + error = EBUSY; + if (rule->match_tagname[0]) + if ((rule->match_tag = + pf_tagname2tag(rule->match_tagname)) == 0) + error = EBUSY; + if (rule->rt && !rule->direction) + error = EINVAL; +#if NPFLOG > 0 +#ifdef __FreeBSD__ + if (!rule->log) + rule->logif = 0; +#endif + if (rule->logif >= PFLOGIFS_MAX) + error = EINVAL; +#endif + if (pf_rtlabel_add(&rule->src.addr) || + pf_rtlabel_add(&rule->dst.addr)) + error = EBUSY; + if (pfi_dynaddr_setup(&rule->src.addr, rule->af)) + error = EINVAL; + if (pfi_dynaddr_setup(&rule->dst.addr, rule->af)) + error = EINVAL; + if (pf_tbladdr_setup(ruleset, &rule->src.addr)) + error = EINVAL; + if (pf_tbladdr_setup(ruleset, &rule->dst.addr)) + error = EINVAL; + if (pf_anchor_setup(rule, ruleset, pr->anchor_call)) + error = EINVAL; + TAILQ_FOREACH(pa, &pf_pabuf, entries) + if (pf_tbladdr_setup(ruleset, &pa->addr)) + error = EINVAL; + + if (rule->overload_tblname[0]) { + if ((rule->overload_tbl = pfr_attach_table(ruleset, + rule->overload_tblname)) == NULL) + error = EINVAL; + else + rule->overload_tbl->pfrkt_flags |= + PFR_TFLAG_ACTIVE; + } + + pf_mv_pool(&pf_pabuf, &rule->rpool.list); + if (((((rule->action == PF_NAT) || (rule->action == PF_RDR) || + (rule->action == PF_BINAT)) && rule->anchor == NULL) || + (rule->rt > PF_FASTROUTE)) && + (TAILQ_FIRST(&rule->rpool.list) == NULL)) + error = EINVAL; + + if (error) { + pf_rm_rule(NULL, rule); + break; + } + +#ifdef __FreeBSD__ + if (!debug_pfugidhack && (rule->uid.op || rule->gid.op || + rule->log & PF_LOG_SOCKET_LOOKUP)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: debug.pfugidhack enabled\n")); + debug_pfugidhack = 1; + } +#endif + + rule->rpool.cur = TAILQ_FIRST(&rule->rpool.list); + rule->evaluations = rule->packets[0] = rule->packets[1] = + rule->bytes[0] = rule->bytes[1] = 0; + TAILQ_INSERT_TAIL(ruleset->rules[rs_num].inactive.ptr, + rule, entries); + ruleset->rules[rs_num].inactive.rcount++; + break; + } + + case DIOCGETRULES: { + struct pfioc_rule *pr = (struct pfioc_rule *)addr; + struct pf_ruleset *ruleset; + struct pf_rule *tail; + int rs_num; + + pr->anchor[sizeof(pr->anchor) - 1] = 0; + ruleset = 
pf_find_ruleset(pr->anchor); + if (ruleset == NULL) { + error = EINVAL; + break; + } + rs_num = pf_get_ruleset_number(pr->rule.action); + if (rs_num >= PF_RULESET_MAX) { + error = EINVAL; + break; + } + tail = TAILQ_LAST(ruleset->rules[rs_num].active.ptr, + pf_rulequeue); + if (tail) + pr->nr = tail->nr + 1; + else + pr->nr = 0; + pr->ticket = ruleset->rules[rs_num].active.ticket; + break; + } + + case DIOCGETRULE: { + struct pfioc_rule *pr = (struct pfioc_rule *)addr; + struct pf_ruleset *ruleset; + struct pf_rule *rule; + int rs_num, i; + + pr->anchor[sizeof(pr->anchor) - 1] = 0; + ruleset = pf_find_ruleset(pr->anchor); + if (ruleset == NULL) { + error = EINVAL; + break; + } + rs_num = pf_get_ruleset_number(pr->rule.action); + if (rs_num >= PF_RULESET_MAX) { + error = EINVAL; + break; + } + if (pr->ticket != ruleset->rules[rs_num].active.ticket) { + error = EBUSY; + break; + } + rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); + while ((rule != NULL) && (rule->nr != pr->nr)) + rule = TAILQ_NEXT(rule, entries); + if (rule == NULL) { + error = EBUSY; + break; + } + bcopy(rule, &pr->rule, sizeof(struct pf_rule)); + if (pf_anchor_copyout(ruleset, rule, pr)) { + error = EBUSY; + break; + } + pfi_dynaddr_copyout(&pr->rule.src.addr); + pfi_dynaddr_copyout(&pr->rule.dst.addr); + pf_tbladdr_copyout(&pr->rule.src.addr); + pf_tbladdr_copyout(&pr->rule.dst.addr); + pf_rtlabel_copyout(&pr->rule.src.addr); + pf_rtlabel_copyout(&pr->rule.dst.addr); + for (i = 0; i < PF_SKIP_COUNT; ++i) + if (rule->skip[i].ptr == NULL) + pr->rule.skip[i].nr = -1; + else + pr->rule.skip[i].nr = + rule->skip[i].ptr->nr; + + if (pr->action == PF_GET_CLR_CNTR) { + rule->evaluations = 0; + rule->packets[0] = rule->packets[1] = 0; + rule->bytes[0] = rule->bytes[1] = 0; + } + break; + } + + case DIOCCHANGERULE: { + struct pfioc_rule *pcr = (struct pfioc_rule *)addr; + struct pf_ruleset *ruleset; + struct pf_rule *oldrule = NULL, *newrule = NULL; + u_int32_t nr = 0; + int rs_num; + + if (!(pcr->action == PF_CHANGE_REMOVE || + pcr->action == PF_CHANGE_GET_TICKET) && + pcr->pool_ticket != ticket_pabuf) { + error = EBUSY; + break; + } + + if (pcr->action < PF_CHANGE_ADD_HEAD || + pcr->action > PF_CHANGE_GET_TICKET) { + error = EINVAL; + break; + } + ruleset = pf_find_ruleset(pcr->anchor); + if (ruleset == NULL) { + error = EINVAL; + break; + } + rs_num = pf_get_ruleset_number(pcr->rule.action); + if (rs_num >= PF_RULESET_MAX) { + error = EINVAL; + break; + } + + if (pcr->action == PF_CHANGE_GET_TICKET) { + pcr->ticket = ++ruleset->rules[rs_num].active.ticket; + break; + } else { + if (pcr->ticket != + ruleset->rules[rs_num].active.ticket) { + error = EINVAL; + break; + } + if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) { + error = EINVAL; + break; + } + } + + if (pcr->action != PF_CHANGE_REMOVE) { + newrule = pool_get(&pf_rule_pl, PR_NOWAIT); + if (newrule == NULL) { + error = ENOMEM; + break; + } + bcopy(&pcr->rule, newrule, sizeof(struct pf_rule)); +#ifdef __FreeBSD__ + newrule->cuid = td->td_ucred->cr_ruid; + newrule->cpid = td->td_proc ? 
td->td_proc->p_pid : 0; +#else + newrule->cuid = p->p_cred->p_ruid; + newrule->cpid = p->p_pid; +#endif + TAILQ_INIT(&newrule->rpool.list); + /* initialize refcounting */ + newrule->states = 0; + newrule->entries.tqe_prev = NULL; +#ifndef INET + if (newrule->af == AF_INET) { + pool_put(&pf_rule_pl, newrule); + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#ifndef INET6 + if (newrule->af == AF_INET6) { + pool_put(&pf_rule_pl, newrule); + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + if (newrule->ifname[0]) { + newrule->kif = pfi_kif_get(newrule->ifname); + if (newrule->kif == NULL) { + pool_put(&pf_rule_pl, newrule); + error = EINVAL; + break; + } + pfi_kif_ref(newrule->kif, PFI_KIF_REF_RULE); + } else + newrule->kif = NULL; + + if (newrule->rtableid > 0 && +#ifdef __FreeBSD__ /* ROUTING */ + newrule->rtableid > rt_numfibs) +#else + !rtable_exists(newrule->rtableid)) +#endif + error = EBUSY; + +#ifdef ALTQ + /* set queue IDs */ + if (newrule->qname[0] != 0) { + if ((newrule->qid = + pf_qname2qid(newrule->qname)) == 0) + error = EBUSY; + else if (newrule->pqname[0] != 0) { + if ((newrule->pqid = + pf_qname2qid(newrule->pqname)) == 0) + error = EBUSY; + } else + newrule->pqid = newrule->qid; + } +#endif /* ALTQ */ + if (newrule->tagname[0]) + if ((newrule->tag = + pf_tagname2tag(newrule->tagname)) == 0) + error = EBUSY; + if (newrule->match_tagname[0]) + if ((newrule->match_tag = pf_tagname2tag( + newrule->match_tagname)) == 0) + error = EBUSY; + if (newrule->rt && !newrule->direction) + error = EINVAL; +#ifdef __FreeBSD__ +#if NPFLOG > 0 + if (!newrule->log) + newrule->logif = 0; + if (newrule->logif >= PFLOGIFS_MAX) + error = EINVAL; +#endif +#endif + if (pf_rtlabel_add(&newrule->src.addr) || + pf_rtlabel_add(&newrule->dst.addr)) + error = EBUSY; + if (pfi_dynaddr_setup(&newrule->src.addr, newrule->af)) + error = EINVAL; + if (pfi_dynaddr_setup(&newrule->dst.addr, newrule->af)) + error = EINVAL; + if (pf_tbladdr_setup(ruleset, &newrule->src.addr)) + error = EINVAL; + if (pf_tbladdr_setup(ruleset, &newrule->dst.addr)) + error = EINVAL; + if (pf_anchor_setup(newrule, ruleset, pcr->anchor_call)) + error = EINVAL; + TAILQ_FOREACH(pa, &pf_pabuf, entries) + if (pf_tbladdr_setup(ruleset, &pa->addr)) + error = EINVAL; + + if (newrule->overload_tblname[0]) { + if ((newrule->overload_tbl = pfr_attach_table( + ruleset, newrule->overload_tblname)) == + NULL) + error = EINVAL; + else + newrule->overload_tbl->pfrkt_flags |= + PFR_TFLAG_ACTIVE; + } + + pf_mv_pool(&pf_pabuf, &newrule->rpool.list); + if (((((newrule->action == PF_NAT) || + (newrule->action == PF_RDR) || + (newrule->action == PF_BINAT) || + (newrule->rt > PF_FASTROUTE)) && + !newrule->anchor)) && + (TAILQ_FIRST(&newrule->rpool.list) == NULL)) + error = EINVAL; + + if (error) { + pf_rm_rule(NULL, newrule); + break; + } + +#ifdef __FreeBSD__ + if (!debug_pfugidhack && (newrule->uid.op || + newrule->gid.op || + newrule->log & PF_LOG_SOCKET_LOOKUP)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: debug.pfugidhack enabled\n")); + debug_pfugidhack = 1; + } +#endif + + newrule->rpool.cur = TAILQ_FIRST(&newrule->rpool.list); + newrule->evaluations = 0; + newrule->packets[0] = newrule->packets[1] = 0; + newrule->bytes[0] = newrule->bytes[1] = 0; + } + pf_empty_pool(&pf_pabuf); + + if (pcr->action == PF_CHANGE_ADD_HEAD) + oldrule = TAILQ_FIRST( + ruleset->rules[rs_num].active.ptr); + else if (pcr->action == PF_CHANGE_ADD_TAIL) + oldrule = TAILQ_LAST( + ruleset->rules[rs_num].active.ptr, pf_rulequeue); + else { + oldrule = TAILQ_FIRST( + 
ruleset->rules[rs_num].active.ptr); + while ((oldrule != NULL) && (oldrule->nr != pcr->nr)) + oldrule = TAILQ_NEXT(oldrule, entries); + if (oldrule == NULL) { + if (newrule != NULL) + pf_rm_rule(NULL, newrule); + error = EINVAL; + break; + } + } + + if (pcr->action == PF_CHANGE_REMOVE) { + pf_rm_rule(ruleset->rules[rs_num].active.ptr, oldrule); + ruleset->rules[rs_num].active.rcount--; + } else { + if (oldrule == NULL) + TAILQ_INSERT_TAIL( + ruleset->rules[rs_num].active.ptr, + newrule, entries); + else if (pcr->action == PF_CHANGE_ADD_HEAD || + pcr->action == PF_CHANGE_ADD_BEFORE) + TAILQ_INSERT_BEFORE(oldrule, newrule, entries); + else + TAILQ_INSERT_AFTER( + ruleset->rules[rs_num].active.ptr, + oldrule, newrule, entries); + ruleset->rules[rs_num].active.rcount++; + } + + nr = 0; + TAILQ_FOREACH(oldrule, + ruleset->rules[rs_num].active.ptr, entries) + oldrule->nr = nr++; + + ruleset->rules[rs_num].active.ticket++; + + pf_calc_skip_steps(ruleset->rules[rs_num].active.ptr); + pf_remove_if_empty_ruleset(ruleset); + + break; + } + + case DIOCCLRSTATES: { + struct pf_state *state, *nexts; + struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr; + int killed = 0; + + for (state = RB_MIN(pf_state_tree_id, &tree_id); state; + state = nexts) { + nexts = RB_NEXT(pf_state_tree_id, &tree_id, state); + + if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname, + state->u.s.kif->pfik_name)) { +#if NPFSYNC + /* don't send out individual delete messages */ + state->sync_flags = PFSTATE_NOSYNC; +#endif + pf_unlink_state(state); + killed++; + } + } + psk->psk_af = killed; +#if NPFSYNC + pfsync_clear_states(pf_status.hostid, psk->psk_ifname); +#endif + break; + } + + case DIOCKILLSTATES: { + struct pf_state *state, *nexts; + struct pf_state_host *src, *dst; + struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr; + int killed = 0; + + for (state = RB_MIN(pf_state_tree_id, &tree_id); state; + state = nexts) { + nexts = RB_NEXT(pf_state_tree_id, &tree_id, state); + + if (state->direction == PF_OUT) { + src = &state->lan; + dst = &state->ext; + } else { + src = &state->ext; + dst = &state->lan; + } + if ((!psk->psk_af || state->af == psk->psk_af) + && (!psk->psk_proto || psk->psk_proto == + state->proto) && + PF_MATCHA(psk->psk_src.neg, + &psk->psk_src.addr.v.a.addr, + &psk->psk_src.addr.v.a.mask, + &src->addr, state->af) && + PF_MATCHA(psk->psk_dst.neg, + &psk->psk_dst.addr.v.a.addr, + &psk->psk_dst.addr.v.a.mask, + &dst->addr, state->af) && + (psk->psk_src.port_op == 0 || + pf_match_port(psk->psk_src.port_op, + psk->psk_src.port[0], psk->psk_src.port[1], + src->port)) && + (psk->psk_dst.port_op == 0 || + pf_match_port(psk->psk_dst.port_op, + psk->psk_dst.port[0], psk->psk_dst.port[1], + dst->port)) && + (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname, + state->u.s.kif->pfik_name))) { +#if NPFSYNC > 0 + /* send immediate delete of state */ + pfsync_delete_state(state); + state->sync_flags |= PFSTATE_NOSYNC; +#endif + pf_unlink_state(state); + killed++; + } + } + psk->psk_af = killed; + break; + } + + case DIOCADDSTATE: { + struct pfioc_state *ps = (struct pfioc_state *)addr; + struct pf_state *state; + struct pfi_kif *kif; + + if (ps->state.timeout >= PFTM_MAX && + ps->state.timeout != PFTM_UNTIL_PACKET) { + error = EINVAL; + break; + } + state = pool_get(&pf_state_pl, PR_NOWAIT); + if (state == NULL) { + error = ENOMEM; + break; + } + kif = pfi_kif_get(ps->state.u.ifname); + if (kif == NULL) { + pool_put(&pf_state_pl, state); + error = ENOENT; + break; + } + bcopy(&ps->state, state, 
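/*
 * The DIOCKILLSTATES case above matches states on af, proto, masked
 * src/dst addresses, port operators and interface name, and reuses
 * psk_af on the way out to report the kill count. A hypothetical
 * userland sketch, in the spirit of "pfctl -k", killing all IPv4
 * states from one source host ("pfd" is an open /dev/pf descriptor,
 * error handling elided):
 *
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *
 *	struct pfioc_state_kill psk;
 *	memset(&psk, 0, sizeof(psk));
 *	psk.psk_af = AF_INET;
 *	inet_pton(AF_INET, "10.0.0.1", &psk.psk_src.addr.v.a.addr.v4);
 *	memset(&psk.psk_src.addr.v.a.mask, 0xff, sizeof(struct in_addr));
 *	ioctl(pfd, DIOCKILLSTATES, &psk);
 *	printf("killed %u states\n", psk.psk_af);
 */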
sizeof(struct pf_state)); + bzero(&state->u, sizeof(state->u)); + state->rule.ptr = &pf_default_rule; + state->nat_rule.ptr = NULL; + state->anchor.ptr = NULL; + state->rt_kif = NULL; + state->creation = time_second; + state->pfsync_time = 0; + state->packets[0] = state->packets[1] = 0; + state->bytes[0] = state->bytes[1] = 0; + + if (pf_insert_state(kif, state)) { + pfi_kif_unref(kif, PFI_KIF_REF_NONE); + pool_put(&pf_state_pl, state); + error = ENOMEM; + } + break; + } + + case DIOCGETSTATE: { + struct pfioc_state *ps = (struct pfioc_state *)addr; + struct pf_state *state; + u_int32_t nr; + int secs; + + nr = 0; + RB_FOREACH(state, pf_state_tree_id, &tree_id) { + if (nr >= ps->nr) + break; + nr++; + } + if (state == NULL) { + error = EBUSY; + break; + } + secs = time_second; + bcopy(state, &ps->state, sizeof(ps->state)); + strlcpy(ps->state.u.ifname, state->u.s.kif->pfik_name, + sizeof(ps->state.u.ifname)); + ps->state.rule.nr = state->rule.ptr->nr; + ps->state.nat_rule.nr = (state->nat_rule.ptr == NULL) ? + -1 : state->nat_rule.ptr->nr; + ps->state.anchor.nr = (state->anchor.ptr == NULL) ? + -1 : state->anchor.ptr->nr; + ps->state.creation = secs - ps->state.creation; + ps->state.expire = pf_state_expires(state); + if (ps->state.expire > secs) + ps->state.expire -= secs; + else + ps->state.expire = 0; + break; + } + + case DIOCGETSTATES: { + struct pfioc_states *ps = (struct pfioc_states *)addr; + struct pf_state *state; + struct pf_state *p, *pstore; + u_int32_t nr = 0; + int space = ps->ps_len; + + if (space == 0) { + nr = pf_status.states; + ps->ps_len = sizeof(struct pf_state) * nr; + break; + } + +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + + p = ps->ps_states; + + state = TAILQ_FIRST(&state_list); + while (state) { + if (state->timeout != PFTM_UNLINKED) { + int secs = time_second; + + if ((nr+1) * sizeof(*p) > (unsigned)ps->ps_len) + break; + + bcopy(state, pstore, sizeof(*pstore)); + strlcpy(pstore->u.ifname, + state->u.s.kif->pfik_name, + sizeof(pstore->u.ifname)); + pstore->rule.nr = state->rule.ptr->nr; + pstore->nat_rule.nr = (state->nat_rule.ptr == + NULL) ? -1 : state->nat_rule.ptr->nr; + pstore->anchor.nr = (state->anchor.ptr == + NULL) ? 
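/*
 * DIOCGETSTATES uses the common two-call sizing protocol: a first call
 * with ps_len == 0 only reports the buffer size required, the second
 * call copies the states out and rewrites ps_len to what was actually
 * used. A hypothetical userland sketch ("pfd" as before):
 *
 *	struct pfioc_states ps;
 *	memset(&ps, 0, sizeof(ps));
 *	ioctl(pfd, DIOCGETSTATES, &ps);		/* ps.ps_len <- bytes needed */
 *	ps.ps_buf = malloc(ps.ps_len);
 *	ioctl(pfd, DIOCGETSTATES, &ps);		/* fills the buffer */
 *	n = ps.ps_len / sizeof(struct pf_state);
 *
 * Recomputing the count from the second ps_len matters: states can
 * expire between the two calls, so the first answer is only an upper
 * bound.
 */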
-1 : state->anchor.ptr->nr; + pstore->creation = secs - pstore->creation; + pstore->expire = pf_state_expires(state); + if (pstore->expire > secs) + pstore->expire -= secs; + else + pstore->expire = 0; +#ifdef __FreeBSD__ + PF_COPYOUT(pstore, p, sizeof(*p), error); +#else + error = copyout(pstore, p, sizeof(*p)); +#endif + if (error) { + free(pstore, M_TEMP); + goto fail; + } + p++; + nr++; + } + state = TAILQ_NEXT(state, u.s.entry_list); + } + + ps->ps_len = sizeof(struct pf_state) * nr; + + free(pstore, M_TEMP); + break; + } + + case DIOCGETSTATUS: { + struct pf_status *s = (struct pf_status *)addr; + bcopy(&pf_status, s, sizeof(struct pf_status)); + pfi_fill_oldstatus(s); + break; + } + + case DIOCSETSTATUSIF: { + struct pfioc_if *pi = (struct pfioc_if *)addr; + + if (pi->ifname[0] == 0) { + bzero(pf_status.ifname, IFNAMSIZ); + break; + } + if (ifunit(pi->ifname) == NULL) { + error = EINVAL; + break; + } + strlcpy(pf_status.ifname, pi->ifname, IFNAMSIZ); + break; + } + + case DIOCCLRSTATUS: { + bzero(pf_status.counters, sizeof(pf_status.counters)); + bzero(pf_status.fcounters, sizeof(pf_status.fcounters)); + bzero(pf_status.scounters, sizeof(pf_status.scounters)); + pf_status.since = time_second; + if (*pf_status.ifname) + pfi_clr_istats(pf_status.ifname); + break; + } + + case DIOCNATLOOK: { + struct pfioc_natlook *pnl = (struct pfioc_natlook *)addr; + struct pf_state *state; + struct pf_state_cmp key; + int m = 0, direction = pnl->direction; + + key.af = pnl->af; + key.proto = pnl->proto; + + if (!pnl->proto || + PF_AZERO(&pnl->saddr, pnl->af) || + PF_AZERO(&pnl->daddr, pnl->af) || + ((pnl->proto == IPPROTO_TCP || + pnl->proto == IPPROTO_UDP) && + (!pnl->dport || !pnl->sport))) + error = EINVAL; + else { + /* + * userland gives us source and dest of connection, + * reverse the lookup so we ask for what happens with + * the return traffic, enabling us to find it in the + * state tree. 
+ */ + if (direction == PF_IN) { + PF_ACPY(&key.ext.addr, &pnl->daddr, pnl->af); + key.ext.port = pnl->dport; + PF_ACPY(&key.gwy.addr, &pnl->saddr, pnl->af); + key.gwy.port = pnl->sport; + state = pf_find_state_all(&key, PF_EXT_GWY, &m); + } else { + PF_ACPY(&key.lan.addr, &pnl->daddr, pnl->af); + key.lan.port = pnl->dport; + PF_ACPY(&key.ext.addr, &pnl->saddr, pnl->af); + key.ext.port = pnl->sport; + state = pf_find_state_all(&key, PF_LAN_EXT, &m); + } + if (m > 1) + error = E2BIG; /* more than one state */ + else if (state != NULL) { + if (direction == PF_IN) { + PF_ACPY(&pnl->rsaddr, &state->lan.addr, + state->af); + pnl->rsport = state->lan.port; + PF_ACPY(&pnl->rdaddr, &pnl->daddr, + pnl->af); + pnl->rdport = pnl->dport; + } else { + PF_ACPY(&pnl->rdaddr, &state->gwy.addr, + state->af); + pnl->rdport = state->gwy.port; + PF_ACPY(&pnl->rsaddr, &pnl->saddr, + pnl->af); + pnl->rsport = pnl->sport; + } + } else + error = ENOENT; + } + break; + } + + case DIOCSETTIMEOUT: { + struct pfioc_tm *pt = (struct pfioc_tm *)addr; + int old; + + if (pt->timeout < 0 || pt->timeout >= PFTM_MAX || + pt->seconds < 0) { + error = EINVAL; + goto fail; + } + old = pf_default_rule.timeout[pt->timeout]; + if (pt->timeout == PFTM_INTERVAL && pt->seconds == 0) + pt->seconds = 1; + pf_default_rule.timeout[pt->timeout] = pt->seconds; + if (pt->timeout == PFTM_INTERVAL && pt->seconds < old) + wakeup(pf_purge_thread); + pt->seconds = old; + break; + } + + case DIOCGETTIMEOUT: { + struct pfioc_tm *pt = (struct pfioc_tm *)addr; + + if (pt->timeout < 0 || pt->timeout >= PFTM_MAX) { + error = EINVAL; + goto fail; + } + pt->seconds = pf_default_rule.timeout[pt->timeout]; + break; + } + + case DIOCGETLIMIT: { + struct pfioc_limit *pl = (struct pfioc_limit *)addr; + + if (pl->index < 0 || pl->index >= PF_LIMIT_MAX) { + error = EINVAL; + goto fail; + } + pl->limit = pf_pool_limits[pl->index].limit; + break; + } + + case DIOCSETLIMIT: { + struct pfioc_limit *pl = (struct pfioc_limit *)addr; + int old_limit; + + if (pl->index < 0 || pl->index >= PF_LIMIT_MAX || + pf_pool_limits[pl->index].pp == NULL) { + error = EINVAL; + goto fail; + } +#ifdef __FreeBSD__ + uma_zone_set_max(pf_pool_limits[pl->index].pp, pl->limit); +#else + if (pool_sethardlimit(pf_pool_limits[pl->index].pp, + pl->limit, NULL, 0) != 0) { + error = EBUSY; + goto fail; + } +#endif + old_limit = pf_pool_limits[pl->index].limit; + pf_pool_limits[pl->index].limit = pl->limit; + pl->limit = old_limit; + break; + } + + case DIOCSETDEBUG: { + u_int32_t *level = (u_int32_t *)addr; + + pf_status.debug = *level; + break; + } + + case DIOCCLRRULECTRS: { + /* obsoleted by DIOCGETRULE with action=PF_GET_CLR_CNTR */ + struct pf_ruleset *ruleset = &pf_main_ruleset; + struct pf_rule *rule; + + TAILQ_FOREACH(rule, + ruleset->rules[PF_RULESET_FILTER].active.ptr, entries) { + rule->evaluations = 0; + rule->packets[0] = rule->packets[1] = 0; + rule->bytes[0] = rule->bytes[1] = 0; + } + break; + } + +#ifdef __FreeBSD__ + case DIOCGIFSPEED: { + struct pf_ifspeed *psp = (struct pf_ifspeed *)addr; + struct pf_ifspeed ps; + struct ifnet *ifp; + + if (psp->ifname[0] != 0) { + /* Can we completely trust user-land? 
*/ + strlcpy(ps.ifname, psp->ifname, IFNAMSIZ); + ifp = ifunit(ps.ifname); + if (ifp != NULL) + psp->baudrate = ifp->if_baudrate; + else + error = EINVAL; + } else + error = EINVAL; + break; + } +#endif /* __FreeBSD__ */ + +#ifdef ALTQ + case DIOCSTARTALTQ: { + struct pf_altq *altq; + + /* enable all altq interfaces on active list */ + TAILQ_FOREACH(altq, pf_altqs_active, entries) { +#ifdef __FreeBSD__ + if (altq->qname[0] == 0 && (altq->local_flags & + PFALTQ_FLAG_IF_REMOVED) == 0) { +#else + if (altq->qname[0] == 0) { +#endif + error = pf_enable_altq(altq); + if (error != 0) + break; + } + } + if (error == 0) + pf_altq_running = 1; + DPFPRINTF(PF_DEBUG_MISC, ("altq: started\n")); + break; + } + + case DIOCSTOPALTQ: { + struct pf_altq *altq; + + /* disable all altq interfaces on active list */ + TAILQ_FOREACH(altq, pf_altqs_active, entries) { +#ifdef __FreeBSD__ + if (altq->qname[0] == 0 && (altq->local_flags & + PFALTQ_FLAG_IF_REMOVED) == 0) { +#else + if (altq->qname[0] == 0) { +#endif + error = pf_disable_altq(altq); + if (error != 0) + break; + } + } + if (error == 0) + pf_altq_running = 0; + DPFPRINTF(PF_DEBUG_MISC, ("altq: stopped\n")); + break; + } + + case DIOCADDALTQ: { + struct pfioc_altq *pa = (struct pfioc_altq *)addr; + struct pf_altq *altq, *a; + + if (pa->ticket != ticket_altqs_inactive) { + error = EBUSY; + break; + } + altq = pool_get(&pf_altq_pl, PR_NOWAIT); + if (altq == NULL) { + error = ENOMEM; + break; + } + bcopy(&pa->altq, altq, sizeof(struct pf_altq)); +#ifdef __FreeBSD__ + altq->local_flags = 0; +#endif + + /* + * if this is for a queue, find the discipline and + * copy the necessary fields + */ + if (altq->qname[0] != 0) { + if ((altq->qid = pf_qname2qid(altq->qname)) == 0) { + error = EBUSY; + pool_put(&pf_altq_pl, altq); + break; + } + altq->altq_disc = NULL; + TAILQ_FOREACH(a, pf_altqs_inactive, entries) { + if (strncmp(a->ifname, altq->ifname, + IFNAMSIZ) == 0 && a->qname[0] == 0) { + altq->altq_disc = a->altq_disc; + break; + } + } + } + +#ifdef __FreeBSD__ + struct ifnet *ifp; + + if ((ifp = ifunit(altq->ifname)) == NULL) { + altq->local_flags |= PFALTQ_FLAG_IF_REMOVED; + } else { + PF_UNLOCK(); +#endif + error = altq_add(altq); +#ifdef __FreeBSD__ + PF_LOCK(); + } +#endif + if (error) { + pool_put(&pf_altq_pl, altq); + break; + } + + TAILQ_INSERT_TAIL(pf_altqs_inactive, altq, entries); + bcopy(altq, &pa->altq, sizeof(struct pf_altq)); + break; + } + + case DIOCGETALTQS: { + struct pfioc_altq *pa = (struct pfioc_altq *)addr; + struct pf_altq *altq; + + pa->nr = 0; + TAILQ_FOREACH(altq, pf_altqs_active, entries) + pa->nr++; + pa->ticket = ticket_altqs_active; + break; + } + + case DIOCGETALTQ: { + struct pfioc_altq *pa = (struct pfioc_altq *)addr; + struct pf_altq *altq; + u_int32_t nr; + + if (pa->ticket != ticket_altqs_active) { + error = EBUSY; + break; + } + nr = 0; + altq = TAILQ_FIRST(pf_altqs_active); + while ((altq != NULL) && (nr < pa->nr)) { + altq = TAILQ_NEXT(altq, entries); + nr++; + } + if (altq == NULL) { + error = EBUSY; + break; + } + bcopy(altq, &pa->altq, sizeof(struct pf_altq)); + break; + } + + case DIOCCHANGEALTQ: + /* CHANGEALTQ not supported yet! 
*/ + error = ENODEV; + break; + + case DIOCGETQSTATS: { + struct pfioc_qstats *pq = (struct pfioc_qstats *)addr; + struct pf_altq *altq; + u_int32_t nr; + int nbytes; + + if (pq->ticket != ticket_altqs_active) { + error = EBUSY; + break; + } + nbytes = pq->nbytes; + nr = 0; + altq = TAILQ_FIRST(pf_altqs_active); + while ((altq != NULL) && (nr < pq->nr)) { + altq = TAILQ_NEXT(altq, entries); + nr++; + } + if (altq == NULL) { + error = EBUSY; + break; + } +#ifdef __FreeBSD__ + if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) { + error = ENXIO; + break; + } + PF_UNLOCK(); +#endif + error = altq_getqstats(altq, pq->buf, &nbytes); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + if (error == 0) { + pq->scheduler = altq->scheduler; + pq->nbytes = nbytes; + } + break; + } +#endif /* ALTQ */ + + case DIOCBEGINADDRS: { + struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + + pf_empty_pool(&pf_pabuf); + pp->ticket = ++ticket_pabuf; + break; + } + + case DIOCADDADDR: { + struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + + if (pp->ticket != ticket_pabuf) { + error = EBUSY; + break; + } +#ifndef INET + if (pp->af == AF_INET) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#ifndef INET6 + if (pp->af == AF_INET6) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + if (pp->addr.addr.type != PF_ADDR_ADDRMASK && + pp->addr.addr.type != PF_ADDR_DYNIFTL && + pp->addr.addr.type != PF_ADDR_TABLE) { + error = EINVAL; + break; + } + pa = pool_get(&pf_pooladdr_pl, PR_NOWAIT); + if (pa == NULL) { + error = ENOMEM; + break; + } + bcopy(&pp->addr, pa, sizeof(struct pf_pooladdr)); + if (pa->ifname[0]) { + pa->kif = pfi_kif_get(pa->ifname); + if (pa->kif == NULL) { + pool_put(&pf_pooladdr_pl, pa); + error = EINVAL; + break; + } + pfi_kif_ref(pa->kif, PFI_KIF_REF_RULE); + } + if (pfi_dynaddr_setup(&pa->addr, pp->af)) { + pfi_dynaddr_remove(&pa->addr); + pfi_kif_unref(pa->kif, PFI_KIF_REF_RULE); + pool_put(&pf_pooladdr_pl, pa); + error = EINVAL; + break; + } + TAILQ_INSERT_TAIL(&pf_pabuf, pa, entries); + break; + } + + case DIOCGETADDRS: { + struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + + pp->nr = 0; + pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, + pp->r_num, 0, 1, 0); + if (pool == NULL) { + error = EBUSY; + break; + } + TAILQ_FOREACH(pa, &pool->list, entries) + pp->nr++; + break; + } + + case DIOCGETADDR: { + struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + u_int32_t nr = 0; + + pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, + pp->r_num, 0, 1, 1); + if (pool == NULL) { + error = EBUSY; + break; + } + pa = TAILQ_FIRST(&pool->list); + while ((pa != NULL) && (nr < pp->nr)) { + pa = TAILQ_NEXT(pa, entries); + nr++; + } + if (pa == NULL) { + error = EBUSY; + break; + } + bcopy(pa, &pp->addr, sizeof(struct pf_pooladdr)); + pfi_dynaddr_copyout(&pp->addr.addr); + pf_tbladdr_copyout(&pp->addr.addr); + pf_rtlabel_copyout(&pp->addr.addr); + break; + } + + case DIOCCHANGEADDR: { + struct pfioc_pooladdr *pca = (struct pfioc_pooladdr *)addr; + struct pf_pooladdr *oldpa = NULL, *newpa = NULL; + struct pf_ruleset *ruleset; + + if (pca->action < PF_CHANGE_ADD_HEAD || + pca->action > PF_CHANGE_REMOVE) { + error = EINVAL; + break; + } + if (pca->addr.addr.type != PF_ADDR_ADDRMASK && + pca->addr.addr.type != PF_ADDR_DYNIFTL && + pca->addr.addr.type != PF_ADDR_TABLE) { + error = EINVAL; + break; + } + + ruleset = pf_find_ruleset(pca->anchor); + if (ruleset == NULL) { + error = EBUSY; + break; + } + pool = pf_get_pool(pca->anchor, pca->ticket, 
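/*
 * DIOCGETADDRS/DIOCGETADDR above follow the same count-then-index
 * pattern as the rule ioctls: the first call reports how many pool
 * addresses a rule has, then each entry is fetched by setting nr.
 * A hypothetical sketch (ticket, r_action and r_num are assumed to
 * come from a preceding DIOCGETRULES/DIOCGETRULE exchange):
 *
 *	struct pfioc_pooladdr pp;
 *	memset(&pp, 0, sizeof(pp));
 *	pp.ticket = ticket;
 *	pp.r_action = PF_RDR;
 *	pp.r_num = 0;
 *	ioctl(pfd, DIOCGETADDRS, &pp);		/* pp.nr <- entry count */
 *	u_int32_t count = pp.nr;
 *	for (u_int32_t i = 0; i < count; i++) {
 *		pp.nr = i;			/* nr becomes the index */
 *		ioctl(pfd, DIOCGETADDR, &pp);	/* pp.addr <- i-th entry */
 *	}
 */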
pca->r_action, + pca->r_num, pca->r_last, 1, 1); + if (pool == NULL) { + error = EBUSY; + break; + } + if (pca->action != PF_CHANGE_REMOVE) { + newpa = pool_get(&pf_pooladdr_pl, PR_NOWAIT); + if (newpa == NULL) { + error = ENOMEM; + break; + } + bcopy(&pca->addr, newpa, sizeof(struct pf_pooladdr)); +#ifndef INET + if (pca->af == AF_INET) { + pool_put(&pf_pooladdr_pl, newpa); + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#ifndef INET6 + if (pca->af == AF_INET6) { + pool_put(&pf_pooladdr_pl, newpa); + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + if (newpa->ifname[0]) { + newpa->kif = pfi_kif_get(newpa->ifname); + if (newpa->kif == NULL) { + pool_put(&pf_pooladdr_pl, newpa); + error = EINVAL; + break; + } + pfi_kif_ref(newpa->kif, PFI_KIF_REF_RULE); + } else + newpa->kif = NULL; + if (pfi_dynaddr_setup(&newpa->addr, pca->af) || + pf_tbladdr_setup(ruleset, &newpa->addr)) { + pfi_dynaddr_remove(&newpa->addr); + pfi_kif_unref(newpa->kif, PFI_KIF_REF_RULE); + pool_put(&pf_pooladdr_pl, newpa); + error = EINVAL; + break; + } + } + + if (pca->action == PF_CHANGE_ADD_HEAD) + oldpa = TAILQ_FIRST(&pool->list); + else if (pca->action == PF_CHANGE_ADD_TAIL) + oldpa = TAILQ_LAST(&pool->list, pf_palist); + else { + int i = 0; + + oldpa = TAILQ_FIRST(&pool->list); + while ((oldpa != NULL) && (i < pca->nr)) { + oldpa = TAILQ_NEXT(oldpa, entries); + i++; + } + if (oldpa == NULL) { + error = EINVAL; + break; + } + } + + if (pca->action == PF_CHANGE_REMOVE) { + TAILQ_REMOVE(&pool->list, oldpa, entries); + pfi_dynaddr_remove(&oldpa->addr); + pf_tbladdr_remove(&oldpa->addr); + pfi_kif_unref(oldpa->kif, PFI_KIF_REF_RULE); + pool_put(&pf_pooladdr_pl, oldpa); + } else { + if (oldpa == NULL) + TAILQ_INSERT_TAIL(&pool->list, newpa, entries); + else if (pca->action == PF_CHANGE_ADD_HEAD || + pca->action == PF_CHANGE_ADD_BEFORE) + TAILQ_INSERT_BEFORE(oldpa, newpa, entries); + else + TAILQ_INSERT_AFTER(&pool->list, oldpa, + newpa, entries); + } + + pool->cur = TAILQ_FIRST(&pool->list); + PF_ACPY(&pool->counter, &pool->cur->addr.v.a.addr, + pca->af); + break; + } + + case DIOCGETRULESETS: { + struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr; + struct pf_ruleset *ruleset; + struct pf_anchor *anchor; + + pr->path[sizeof(pr->path) - 1] = 0; + if ((ruleset = pf_find_ruleset(pr->path)) == NULL) { + error = EINVAL; + break; + } + pr->nr = 0; + if (ruleset->anchor == NULL) { + /* XXX kludge for pf_main_ruleset */ + RB_FOREACH(anchor, pf_anchor_global, &pf_anchors) + if (anchor->parent == NULL) + pr->nr++; + } else { + RB_FOREACH(anchor, pf_anchor_node, + &ruleset->anchor->children) + pr->nr++; + } + break; + } + + case DIOCGETRULESET: { + struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr; + struct pf_ruleset *ruleset; + struct pf_anchor *anchor; + u_int32_t nr = 0; + + pr->path[sizeof(pr->path) - 1] = 0; + if ((ruleset = pf_find_ruleset(pr->path)) == NULL) { + error = EINVAL; + break; + } + pr->name[0] = 0; + if (ruleset->anchor == NULL) { + /* XXX kludge for pf_main_ruleset */ + RB_FOREACH(anchor, pf_anchor_global, &pf_anchors) + if (anchor->parent == NULL && nr++ == pr->nr) { + strlcpy(pr->name, anchor->name, + sizeof(pr->name)); + break; + } + } else { + RB_FOREACH(anchor, pf_anchor_node, + &ruleset->anchor->children) + if (nr++ == pr->nr) { + strlcpy(pr->name, anchor->name, + sizeof(pr->name)); + break; + } + } + if (!pr->name[0]) + error = EBUSY; + break; + } + + case DIOCRCLRTABLES: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != 0) { + error = ENODEV; 
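/*
 * The two DIOCGETRULESET* cases above enumerate anchors the same way:
 * DIOCGETRULESETS counts the children under a path, DIOCGETRULESET
 * resolves the nr-th child name. A hypothetical sketch listing the
 * top-level anchors (an empty path selects the main ruleset):
 *
 *	struct pfioc_ruleset prs;
 *	memset(&prs, 0, sizeof(prs));
 *	ioctl(pfd, DIOCGETRULESETS, &prs);
 *	u_int32_t n = prs.nr;
 *	for (u_int32_t i = 0; i < n; i++) {
 *		prs.nr = i;
 *		ioctl(pfd, DIOCGETRULESET, &prs);
 *		printf("%s\n", prs.name);
 *	}
 */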
+ break; + } + error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel, + io->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRADDTABLES: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + error = pfr_add_tables(io->pfrio_buffer, io->pfrio_size, + &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRDELTABLES: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + error = pfr_del_tables(io->pfrio_buffer, io->pfrio_size, + &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRGETTABLES: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + error = pfr_get_tables(&io->pfrio_table, io->pfrio_buffer, + &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRGETTSTATS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_tstats)) { + error = ENODEV; + break; + } + error = pfr_get_tstats(&io->pfrio_table, io->pfrio_buffer, + &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRCLRTSTATS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + error = pfr_clr_tstats(io->pfrio_buffer, io->pfrio_size, + &io->pfrio_nzero, io->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRSETTFLAGS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + error = pfr_set_tflags(io->pfrio_buffer, io->pfrio_size, + io->pfrio_setflag, io->pfrio_clrflag, &io->pfrio_nchange, + &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRCLRADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != 0) { + error = ENODEV; + break; + } + error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel, + io->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRADDADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + error = pfr_add_addrs(&io->pfrio_table, io->pfrio_buffer, + io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags | + PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRDELADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + error = pfr_del_addrs(&io->pfrio_table, io->pfrio_buffer, + io->pfrio_size, &io->pfrio_ndel, io->pfrio_flags | + PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRSETADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + error = pfr_set_addrs(&io->pfrio_table, io->pfrio_buffer, + io->pfrio_size, &io->pfrio_size2, &io->pfrio_nadd, + &io->pfrio_ndel, &io->pfrio_nchange, io->pfrio_flags | + PFR_FLAG_USERIOCTL, 0); + break; + } + + case DIOCRGETADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + error = pfr_get_addrs(&io->pfrio_table, io->pfrio_buffer, + &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + } + + case 
DIOCRGETASTATS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_astats)) { + error = ENODEV; + break; + } + error = pfr_get_astats(&io->pfrio_table, io->pfrio_buffer, + &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRCLRASTATS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + error = pfr_clr_astats(&io->pfrio_table, io->pfrio_buffer, + io->pfrio_size, &io->pfrio_nzero, io->pfrio_flags | + PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRTSTADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + error = pfr_tst_addrs(&io->pfrio_table, io->pfrio_buffer, + io->pfrio_size, &io->pfrio_nmatch, io->pfrio_flags | + PFR_FLAG_USERIOCTL); + break; + } + + case DIOCRINADEFINE: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + error = pfr_ina_define(&io->pfrio_table, io->pfrio_buffer, + io->pfrio_size, &io->pfrio_nadd, &io->pfrio_naddr, + io->pfrio_ticket, io->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + } + + case DIOCOSFPADD: { + struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr; + error = pf_osfp_add(io); + break; + } + + case DIOCOSFPGET: { + struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr; + error = pf_osfp_get(io); + break; + } + + case DIOCXBEGIN: { + struct pfioc_trans *io = (struct pfioc_trans *)addr; + struct pfioc_trans_e *ioe; + struct pfr_table *table; + int i; + + if (io->esize != sizeof(*ioe)) { + error = ENODEV; + goto fail; + } +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + ioe = (struct pfioc_trans_e *)malloc(sizeof(*ioe), + M_TEMP, M_WAITOK); + table = (struct pfr_table *)malloc(sizeof(*table), + M_TEMP, M_WAITOK); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + for (i = 0; i < io->size; i++) { +#ifdef __FreeBSD__ + PF_COPYIN(io->array+i, ioe, sizeof(*ioe), error); + if (error) { +#else + if (copyin(io->array+i, ioe, sizeof(*ioe))) { +#endif + free(table, M_TEMP); + free(ioe, M_TEMP); + error = EFAULT; + goto fail; + } + switch (ioe->rs_num) { +#ifdef ALTQ + case PF_RULESET_ALTQ: + if (ioe->anchor[0]) { + free(table, M_TEMP); + free(ioe, M_TEMP); + error = EINVAL; + goto fail; + } + if ((error = pf_begin_altq(&ioe->ticket))) { + free(table, M_TEMP); + free(ioe, M_TEMP); + goto fail; + } + break; +#endif /* ALTQ */ + case PF_RULESET_TABLE: + bzero(table, sizeof(*table)); + strlcpy(table->pfrt_anchor, ioe->anchor, + sizeof(table->pfrt_anchor)); + if ((error = pfr_ina_begin(table, + &ioe->ticket, NULL, 0))) { + free(table, M_TEMP); + free(ioe, M_TEMP); + goto fail; + } + break; + default: + if ((error = pf_begin_rules(&ioe->ticket, + ioe->rs_num, ioe->anchor))) { + free(table, M_TEMP); + free(ioe, M_TEMP); + goto fail; + } + break; + } +#ifdef __FreeBSD__ + PF_COPYOUT(ioe, io->array+i, sizeof(io->array[i]), + error); + if (error) { +#else + if (copyout(ioe, io->array+i, sizeof(io->array[i]))) { +#endif + free(table, M_TEMP); + free(ioe, M_TEMP); + error = EFAULT; + goto fail; + } + } + free(table, M_TEMP); + free(ioe, M_TEMP); + break; + } + + case DIOCXROLLBACK: { + struct pfioc_trans *io = (struct pfioc_trans *)addr; + struct pfioc_trans_e *ioe; + struct pfr_table *table; + int i; + + if (io->esize != sizeof(*ioe)) { + error = ENODEV; + goto fail; + } +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + ioe = (struct 
pfioc_trans_e *)malloc(sizeof(*ioe), + M_TEMP, M_WAITOK); + table = (struct pfr_table *)malloc(sizeof(*table), + M_TEMP, M_WAITOK); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + for (i = 0; i < io->size; i++) { +#ifdef __FreeBSD__ + PF_COPYIN(io->array+i, ioe, sizeof(*ioe), error); + if (error) { +#else + if (copyin(io->array+i, ioe, sizeof(*ioe))) { +#endif + free(table, M_TEMP); + free(ioe, M_TEMP); + error = EFAULT; + goto fail; + } + switch (ioe->rs_num) { +#ifdef ALTQ + case PF_RULESET_ALTQ: + if (ioe->anchor[0]) { + free(table, M_TEMP); + free(ioe, M_TEMP); + error = EINVAL; + goto fail; + } + if ((error = pf_rollback_altq(ioe->ticket))) { + free(table, M_TEMP); + free(ioe, M_TEMP); + goto fail; /* really bad */ + } + break; +#endif /* ALTQ */ + case PF_RULESET_TABLE: + bzero(table, sizeof(*table)); + strlcpy(table->pfrt_anchor, ioe->anchor, + sizeof(table->pfrt_anchor)); + if ((error = pfr_ina_rollback(table, + ioe->ticket, NULL, 0))) { + free(table, M_TEMP); + free(ioe, M_TEMP); + goto fail; /* really bad */ + } + break; + default: + if ((error = pf_rollback_rules(ioe->ticket, + ioe->rs_num, ioe->anchor))) { + free(table, M_TEMP); + free(ioe, M_TEMP); + goto fail; /* really bad */ + } + break; + } + } + free(table, M_TEMP); + free(ioe, M_TEMP); + break; + } + + case DIOCXCOMMIT: { + struct pfioc_trans *io = (struct pfioc_trans *)addr; + struct pfioc_trans_e *ioe; + struct pfr_table *table; + struct pf_ruleset *rs; + int i; + + if (io->esize != sizeof(*ioe)) { + error = ENODEV; + goto fail; + } +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + ioe = (struct pfioc_trans_e *)malloc(sizeof(*ioe), + M_TEMP, M_WAITOK); + table = (struct pfr_table *)malloc(sizeof(*table), + M_TEMP, M_WAITOK); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + /* first makes sure everything will succeed */ + for (i = 0; i < io->size; i++) { +#ifdef __FreeBSD__ + PF_COPYIN(io->array+i, ioe, sizeof(*ioe), error); + if (error) { +#else + if (copyin(io->array+i, ioe, sizeof(*ioe))) { +#endif + free(table, M_TEMP); + free(ioe, M_TEMP); + error = EFAULT; + goto fail; + } + switch (ioe->rs_num) { +#ifdef ALTQ + case PF_RULESET_ALTQ: + if (ioe->anchor[0]) { + free(table, M_TEMP); + free(ioe, M_TEMP); + error = EINVAL; + goto fail; + } + if (!altqs_inactive_open || ioe->ticket != + ticket_altqs_inactive) { + free(table, M_TEMP); + free(ioe, M_TEMP); + error = EBUSY; + goto fail; + } + break; +#endif /* ALTQ */ + case PF_RULESET_TABLE: + rs = pf_find_ruleset(ioe->anchor); + if (rs == NULL || !rs->topen || ioe->ticket != + rs->tticket) { + free(table, M_TEMP); + free(ioe, M_TEMP); + error = EBUSY; + goto fail; + } + break; + default: + if (ioe->rs_num < 0 || ioe->rs_num >= + PF_RULESET_MAX) { + free(table, M_TEMP); + free(ioe, M_TEMP); + error = EINVAL; + goto fail; + } + rs = pf_find_ruleset(ioe->anchor); + if (rs == NULL || + !rs->rules[ioe->rs_num].inactive.open || + rs->rules[ioe->rs_num].inactive.ticket != + ioe->ticket) { + free(table, M_TEMP); + free(ioe, M_TEMP); + error = EBUSY; + goto fail; + } + break; + } + } + /* now do the commit - no errors should happen here */ + for (i = 0; i < io->size; i++) { +#ifdef __FreeBSD__ + PF_COPYIN(io->array+i, ioe, sizeof(*ioe), error); + if (error) { +#else + if (copyin(io->array+i, ioe, sizeof(*ioe))) { +#endif + free(table, M_TEMP); + free(ioe, M_TEMP); + error = EFAULT; + goto fail; + } + switch (ioe->rs_num) { +#ifdef ALTQ + case PF_RULESET_ALTQ: + if ((error = pf_commit_altq(ioe->ticket))) { + free(table, M_TEMP); + free(ioe, M_TEMP); + goto fail; /* really bad */ + } + break; 
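/*
 * Seen from userland, DIOCXBEGIN/DIOCXCOMMIT bracket an atomic ruleset
 * swap: begin hands out one ticket per ruleset touched, rules are
 * loaded against those tickets, and commit checks every ticket before
 * anything is swapped in (the "first makes sure" pass above), so a
 * concurrent loader cannot corrupt the result. A hypothetical sketch
 * replacing the main filter rules:
 *
 *	struct pfioc_trans_e te;
 *	struct pfioc_trans t;
 *	memset(&te, 0, sizeof(te));
 *	memset(&t, 0, sizeof(t));
 *	te.rs_num = PF_RULESET_FILTER;	/* empty anchor = main ruleset */
 *	t.size = 1;
 *	t.esize = sizeof(te);
 *	t.array = &te;
 *	ioctl(pfd, DIOCXBEGIN, &t);	/* te.ticket is now valid */
 *	/* ... DIOCADDRULE calls carrying te.ticket ... */
 *	ioctl(pfd, DIOCXCOMMIT, &t);	/* or DIOCXROLLBACK on failure */
 */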
+#endif /* ALTQ */ + case PF_RULESET_TABLE: + bzero(table, sizeof(*table)); + strlcpy(table->pfrt_anchor, ioe->anchor, + sizeof(table->pfrt_anchor)); + if ((error = pfr_ina_commit(table, ioe->ticket, + NULL, NULL, 0))) { + free(table, M_TEMP); + free(ioe, M_TEMP); + goto fail; /* really bad */ + } + break; + default: + if ((error = pf_commit_rules(ioe->ticket, + ioe->rs_num, ioe->anchor))) { + free(table, M_TEMP); + free(ioe, M_TEMP); + goto fail; /* really bad */ + } + break; + } + } + free(table, M_TEMP); + free(ioe, M_TEMP); + break; + } + + case DIOCGETSRCNODES: { + struct pfioc_src_nodes *psn = (struct pfioc_src_nodes *)addr; + struct pf_src_node *n, *p, *pstore; + u_int32_t nr = 0; + int space = psn->psn_len; + + if (space == 0) { + RB_FOREACH(n, pf_src_tree, &tree_src_tracking) + nr++; + psn->psn_len = sizeof(struct pf_src_node) * nr; + break; + } + +#ifdef __FreeBSD__ + PF_UNLOCK(); +#endif + pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK); +#ifdef __FreeBSD__ + PF_LOCK(); +#endif + + p = psn->psn_src_nodes; + RB_FOREACH(n, pf_src_tree, &tree_src_tracking) { + int secs = time_second, diff; + + if ((nr + 1) * sizeof(*p) > (unsigned)psn->psn_len) + break; + + bcopy(n, pstore, sizeof(*pstore)); + if (n->rule.ptr != NULL) + pstore->rule.nr = n->rule.ptr->nr; + pstore->creation = secs - pstore->creation; + if (pstore->expire > secs) + pstore->expire -= secs; + else + pstore->expire = 0; + + /* adjust the connection rate estimate */ + diff = secs - n->conn_rate.last; + if (diff >= n->conn_rate.seconds) + pstore->conn_rate.count = 0; + else + pstore->conn_rate.count -= + n->conn_rate.count * diff / + n->conn_rate.seconds; + +#ifdef __FreeBSD__ + PF_COPYOUT(pstore, p, sizeof(*p), error); +#else + error = copyout(pstore, p, sizeof(*p)); +#endif + if (error) { + free(pstore, M_TEMP); + goto fail; + } + p++; + nr++; + } + psn->psn_len = sizeof(struct pf_src_node) * nr; + + free(pstore, M_TEMP); + break; + } + + case DIOCCLRSRCNODES: { + struct pf_src_node *n; + struct pf_state *state; + + RB_FOREACH(state, pf_state_tree_id, &tree_id) { + state->src_node = NULL; + state->nat_src_node = NULL; + } + RB_FOREACH(n, pf_src_tree, &tree_src_tracking) { + n->expire = 1; + n->states = 0; + } + pf_purge_expired_src_nodes(1); + pf_status.src_nodes = 0; + break; + } + + case DIOCKILLSRCNODES: { + struct pf_src_node *sn; + struct pf_state *s; + struct pfioc_src_node_kill *psnk = \ + (struct pfioc_src_node_kill *) addr; + int killed = 0; + + RB_FOREACH(sn, pf_src_tree, &tree_src_tracking) { + if (PF_MATCHA(psnk->psnk_src.neg, \ + &psnk->psnk_src.addr.v.a.addr, \ + &psnk->psnk_src.addr.v.a.mask, \ + &sn->addr, sn->af) && + PF_MATCHA(psnk->psnk_dst.neg, \ + &psnk->psnk_dst.addr.v.a.addr, \ + &psnk->psnk_dst.addr.v.a.mask, \ + &sn->raddr, sn->af)) { + /* Handle state to src_node linkage */ + if (sn->states != 0) { + RB_FOREACH(s, pf_state_tree_id, + &tree_id) { + if (s->src_node == sn) + s->src_node = NULL; + if (s->nat_src_node == sn) + s->nat_src_node = NULL; + } + sn->states = 0; + } + sn->expire = 1; + killed++; + } + } + + if (killed > 0) + pf_purge_expired_src_nodes(1); + + psnk->psnk_af = killed; + break; + } + + case DIOCSETHOSTID: { + u_int32_t *hostid = (u_int32_t *)addr; + + if (*hostid == 0) + pf_status.hostid = arc4random(); + else + pf_status.hostid = *hostid; + break; + } + + case DIOCOSFPFLUSH: + pf_osfp_flush(); + break; + + case DIOCIGETIFACES: { + struct pfioc_iface *io = (struct pfioc_iface *)addr; + + if (io->pfiio_esize != sizeof(struct pfi_kif)) { + error = ENODEV; + break; + } + 
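/*
 * The conn_rate adjustment in DIOCGETSRCNODES above ages the counter
 * linearly: with diff = now - last, the exported value is
 *
 *	count' = count - count * diff / seconds
 *
 * and 0 once diff >= seconds, i.e. the estimate decays to nothing over
 * one rate window. Reading the nodes uses the same two-call sizing as
 * DIOCGETSTATES; a hypothetical sketch:
 *
 *	struct pfioc_src_nodes psn;
 *	memset(&psn, 0, sizeof(psn));
 *	ioctl(pfd, DIOCGETSRCNODES, &psn);	/* psn.psn_len <- bytes */
 *	psn.psn_buf = malloc(psn.psn_len);
 *	ioctl(pfd, DIOCGETSRCNODES, &psn);
 */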
error = pfi_get_ifaces(io->pfiio_name, io->pfiio_buffer,
+		    &io->pfiio_size);
+		break;
+	}
+
+	case DIOCSETIFFLAG: {
+		struct pfioc_iface *io = (struct pfioc_iface *)addr;
+
+		error = pfi_set_flags(io->pfiio_name, io->pfiio_flags);
+		break;
+	}
+
+	case DIOCCLRIFFLAG: {
+		struct pfioc_iface *io = (struct pfioc_iface *)addr;
+
+		error = pfi_clear_flags(io->pfiio_name, io->pfiio_flags);
+		break;
+	}
+
+	default:
+		error = ENODEV;
+		break;
+	}
+fail:
+#ifdef __FreeBSD__
+	PF_UNLOCK();
+
+	if (flags & FWRITE)
+		sx_xunlock(&pf_consistency_lock);
+	else
+		sx_sunlock(&pf_consistency_lock);
+#else
+	splx(s);
+	/* XXX: Lock order? */
+	if (flags & FWRITE)
+		rw_exit_write(&pf_consistency_lock);
+	else
+		rw_exit_read(&pf_consistency_lock);
+#endif
+	return (error);
+}
+
+#ifdef __FreeBSD__
+/*
+ * XXX - Check for version mismatch!!!
+ */
+static void
+pf_clear_states(void)
+{
+	struct pf_state *state;
+
+	RB_FOREACH(state, pf_state_tree_id, &tree_id) {
+		state->timeout = PFTM_PURGE;
+#if NPFSYNC
+		/* don't send out individual delete messages */
+		state->sync_flags = PFSTATE_NOSYNC;
+#endif
+		pf_unlink_state(state);
+	}
+
+#if 0 /* NPFSYNC */
+/*
+ * XXX This is called on module unload, we do not want to sync that over?
+ */
+	pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
+#endif
+}
+
+static int
+pf_clear_tables(void)
+{
+	struct pfioc_table io;
+	int error;
+
+	bzero(&io, sizeof(io));
+
+	error = pfr_clr_tables(&io.pfrio_table, &io.pfrio_ndel,
+	    io.pfrio_flags);
+
+	return (error);
+}
+
+static void
+pf_clear_srcnodes(void)
+{
+	struct pf_src_node *n;
+	struct pf_state *state;
+
+	RB_FOREACH(state, pf_state_tree_id, &tree_id) {
+		state->src_node = NULL;
+		state->nat_src_node = NULL;
+	}
+	RB_FOREACH(n, pf_src_tree, &tree_src_tracking) {
+		n->expire = 1;
+		n->states = 0;
+	}
+}
+/*
+ * XXX - Check for version mismatch!!!
+ */
+
+/*
+ * Duplicate pfctl -Fa operation to get rid of as much as we can.
+ */
+static int
+shutdown_pf(void)
+{
+	int error = 0;
+	u_int32_t t[5];
+	char nn = '\0';
+
+	pf_status.running = 0;
+	do {
+		if ((error = pf_begin_rules(&t[0], PF_RULESET_SCRUB, &nn))
+		    != 0) {
+			DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: SCRUB\n"));
+			break;
+		}
+		if ((error = pf_begin_rules(&t[1], PF_RULESET_FILTER, &nn))
+		    != 0) {
+			DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: FILTER\n"));
+			break; /* XXX: rollback? */
+		}
+		if ((error = pf_begin_rules(&t[2], PF_RULESET_NAT, &nn))
+		    != 0) {
+			DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: NAT\n"));
+			break; /* XXX: rollback? */
+		}
+		if ((error = pf_begin_rules(&t[3], PF_RULESET_BINAT, &nn))
+		    != 0) {
+			DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: BINAT\n"));
+			break; /* XXX: rollback? */
+		}
+		if ((error = pf_begin_rules(&t[4], PF_RULESET_RDR, &nn))
+		    != 0) {
+			DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: RDR\n"));
+			break; /* XXX: rollback? */
*/ + } + + /* XXX: these should always succeed here */ + pf_commit_rules(t[0], PF_RULESET_SCRUB, &nn); + pf_commit_rules(t[1], PF_RULESET_FILTER, &nn); + pf_commit_rules(t[2], PF_RULESET_NAT, &nn); + pf_commit_rules(t[3], PF_RULESET_BINAT, &nn); + pf_commit_rules(t[4], PF_RULESET_RDR, &nn); + + if ((error = pf_clear_tables()) != 0) + break; + +#ifdef ALTQ + if ((error = pf_begin_altq(&t[0])) != 0) { + DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: ALTQ\n")); + break; + } + pf_commit_altq(t[0]); +#endif + + pf_clear_states(); + + pf_clear_srcnodes(); + + /* status does not use malloced mem so no need to cleanup */ + /* fingerprints and interfaces have thier own cleanup code */ + } while(0); + + return (error); +} + +static int +pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, + struct inpcb *inp) +{ + /* + * XXX Wed Jul 9 22:03:16 2003 UTC + * OpenBSD has changed its byte ordering convention on ip_len/ip_off + * in network stack. OpenBSD's network stack have converted + * ip_len/ip_off to host byte order frist as FreeBSD. + * Now this is not true anymore , so we should convert back to network + * byte order. + */ + struct ip *h = NULL; + int chk; + + if ((*m)->m_pkthdr.len >= (int)sizeof(struct ip)) { + /* if m_pkthdr.len is less than ip header, pf will handle. */ + h = mtod(*m, struct ip *); + HTONS(h->ip_len); + HTONS(h->ip_off); + } + chk = pf_test(PF_IN, ifp, m, NULL, inp); + if (chk && *m) { + m_freem(*m); + *m = NULL; + } + if (*m != NULL) { + /* pf_test can change ip header location */ + h = mtod(*m, struct ip *); + NTOHS(h->ip_len); + NTOHS(h->ip_off); + } + return chk; +} + +static int +pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, + struct inpcb *inp) +{ + /* + * XXX Wed Jul 9 22:03:16 2003 UTC + * OpenBSD has changed its byte ordering convention on ip_len/ip_off + * in network stack. OpenBSD's network stack have converted + * ip_len/ip_off to host byte order frist as FreeBSD. + * Now this is not true anymore , so we should convert back to network + * byte order. + */ + struct ip *h = NULL; + int chk; + + /* We need a proper CSUM befor we start (s. OpenBSD ip_output) */ + if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(*m); + (*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + if ((*m)->m_pkthdr.len >= (int)sizeof(*h)) { + /* if m_pkthdr.len is less than ip header, pf will handle. */ + h = mtod(*m, struct ip *); + HTONS(h->ip_len); + HTONS(h->ip_off); + } + chk = pf_test(PF_OUT, ifp, m, NULL, inp); + if (chk && *m) { + m_freem(*m); + *m = NULL; + } + if (*m != NULL) { + /* pf_test can change ip header location */ + h = mtod(*m, struct ip *); + NTOHS(h->ip_len); + NTOHS(h->ip_off); + } + return chk; +} + +#ifdef INET6 +static int +pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, + struct inpcb *inp) +{ + + /* + * IPv6 is not affected by ip_len/ip_off byte order changes. + */ + int chk; + + /* + * In case of loopback traffic IPv6 uses the real interface in + * order to support scoped addresses. In order to support stateful + * filtering we have change this to lo0 as it is the case in IPv4. + */ + chk = pf_test6(PF_IN, (*m)->m_flags & M_LOOP ? V_loif : ifp, m, + NULL, inp); + if (chk && *m) { + m_freem(*m); + *m = NULL; + } + return chk; +} + +static int +pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, + struct inpcb *inp) +{ + /* + * IPv6 does not affected ip_len/ip_off byte order changes. + */ + int chk; + + /* We need a proper CSUM befor we start (s. 
OpenBSD ip_output) */ + if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(*m); + (*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + chk = pf_test6(PF_OUT, ifp, m, NULL, inp); + if (chk && *m) { + m_freem(*m); + *m = NULL; + } + return chk; +} +#endif /* INET6 */ + +static int +hook_pf(void) +{ + struct pfil_head *pfh_inet; +#ifdef INET6 + struct pfil_head *pfh_inet6; +#endif + + PF_ASSERT(MA_NOTOWNED); + + if (pf_pfil_hooked) + return (0); + + pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); + if (pfh_inet == NULL) + return (ESRCH); /* XXX */ + pfil_add_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet); + pfil_add_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet); +#ifdef INET6 + pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); + if (pfh_inet6 == NULL) { + pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, + pfh_inet); + pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, + pfh_inet); + return (ESRCH); /* XXX */ + } + pfil_add_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet6); + pfil_add_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet6); +#endif + + pf_pfil_hooked = 1; + return (0); +} + +static int +dehook_pf(void) +{ + struct pfil_head *pfh_inet; +#ifdef INET6 + struct pfil_head *pfh_inet6; +#endif + + PF_ASSERT(MA_NOTOWNED); + + if (pf_pfil_hooked == 0) + return (0); + + pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); + if (pfh_inet == NULL) + return (ESRCH); /* XXX */ + pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, + pfh_inet); + pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, + pfh_inet); +#ifdef INET6 + pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); + if (pfh_inet6 == NULL) + return (ESRCH); /* XXX */ + pfil_remove_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK, + pfh_inet6); + pfil_remove_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK, + pfh_inet6); +#endif + + pf_pfil_hooked = 0; + return (0); +} + +static int +pf_load(void) +{ + init_zone_var(); + init_pf_mutex(); + pf_dev = make_dev(&pf_cdevsw, 0, 0, 0, 0600, PF_NAME); + if (pfattach() < 0) { + destroy_dev(pf_dev); + destroy_pf_mutex(); + return (ENOMEM); + } + return (0); +} + +static int +pf_unload(void) +{ + int error = 0; + + PF_LOCK(); + pf_status.running = 0; + PF_UNLOCK(); + error = dehook_pf(); + if (error) { + /* + * Should not happen! + * XXX Due to error code ESRCH, kldunload will show + * a message like 'No such process'. 
+	 */
+		printf("%s : pfil unregistration failed\n", __FUNCTION__);
+		return error;
+	}
+	PF_LOCK();
+	shutdown_pf();
+	pf_end_threads = 1;
+	while (pf_end_threads < 2) {
+		wakeup_one(pf_purge_thread);
+		msleep(pf_purge_thread, &pf_task_mtx, 0, "pftmo", hz);
+	}
+	pfi_cleanup();
+	pf_osfp_flush();
+	pf_osfp_cleanup();
+	cleanup_pf_zone();
+	PF_UNLOCK();
+	destroy_dev(pf_dev);
+	destroy_pf_mutex();
+	return error;
+}
+
+static int
+pf_modevent(module_t mod, int type, void *data)
+{
+	int error = 0;
+
+	switch (type) {
+	case MOD_LOAD:
+		error = pf_load();
+		break;
+
+	case MOD_UNLOAD:
+		error = pf_unload();
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	return error;
+}
+
+static moduledata_t pf_mod = {
+	"pf",
+	pf_modevent,
+	0
+};
+
+DECLARE_MODULE(pf, pf_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST);
+MODULE_VERSION(pf, PF_MODVER);
+#endif /* __FreeBSD__ */
diff --git a/contrib/pf/rtems/freebsd/net/pf_mtag.h b/contrib/pf/rtems/freebsd/net/pf_mtag.h
new file mode 100644
index 00000000..09aeb25c
--- /dev/null
+++ b/contrib/pf/rtems/freebsd/net/pf_mtag.h
@@ -0,0 +1,82 @@
+/* $FreeBSD$ */
+/*
+ * Copyright (c) 2001 Daniel Hartmeier
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *    - Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    - Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifndef _NET_PF_MTAG_HH_ +#define _NET_PF_MTAG_HH_ + +#ifdef _KERNEL + +#define PF_TAG_GENERATED 0x01 +#define PF_TAG_FRAGCACHE 0x02 +#define PF_TAG_TRANSLATE_LOCALHOST 0x04 + +struct pf_mtag { + void *hdr; /* saved hdr pos in mbuf, for ECN */ + u_int rtableid; /* alternate routing table id */ + u_int32_t qid; /* queue id */ + u_int16_t tag; /* tag id */ + u_int8_t flags; + u_int8_t routed; + sa_family_t af; /* for ECN */ +}; + +static __inline struct pf_mtag *pf_find_mtag(struct mbuf *); +static __inline struct pf_mtag *pf_get_mtag(struct mbuf *); + +static __inline struct pf_mtag * +pf_find_mtag(struct mbuf *m) +{ + struct m_tag *mtag; + + if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) == NULL) + return (NULL); + + return ((struct pf_mtag *)(mtag + 1)); +} + +static __inline struct pf_mtag * +pf_get_mtag(struct mbuf *m) +{ + struct m_tag *mtag; + + if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) == NULL) { + mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag), + M_NOWAIT); + if (mtag == NULL) + return (NULL); + bzero(mtag + 1, sizeof(struct pf_mtag)); + m_tag_prepend(m, mtag); + } + + return ((struct pf_mtag *)(mtag + 1)); +} +#endif /* _KERNEL */ +#endif /* _NET_PF_MTAG_HH_ */ diff --git a/contrib/pf/rtems/freebsd/net/pf_norm.c b/contrib/pf/rtems/freebsd/net/pf_norm.c new file mode 100644 index 00000000..22f24506 --- /dev/null +++ b/contrib/pf/rtems/freebsd/net/pf_norm.c @@ -0,0 +1,2062 @@ +#include + +/* $OpenBSD: pf_norm.c,v 1.107 2006/04/16 00:59:52 pascoe Exp $ */ + +/* + * Copyright 2001 Niels Provos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef __FreeBSD__ +#include +#include +#include + +#include +__FBSDID("$FreeBSD$"); + +#ifdef DEV_PFLOG +#define NPFLOG DEV_PFLOG +#else +#define NPFLOG 0 +#endif +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef __FreeBSD__ +#include + +#include +#endif +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef INET6 +#include +#endif /* INET6 */ + +#include + +#ifndef __FreeBSD__ +#include + +struct pf_frent { + LIST_ENTRY(pf_frent) fr_next; + struct ip *fr_ip; + struct mbuf *fr_m; +}; + +struct pf_frcache { + LIST_ENTRY(pf_frcache) fr_next; + uint16_t fr_off; + uint16_t fr_end; +}; +#endif + +#define PFFRAG_SEENLAST 0x0001 /* Seen the last fragment for this */ +#define PFFRAG_NOBUFFER 0x0002 /* Non-buffering fragment cache */ +#define PFFRAG_DROP 0x0004 /* Drop all fragments */ +#define BUFFER_FRAGMENTS(fr) (!((fr)->fr_flags & PFFRAG_NOBUFFER)) + +#ifndef __FreeBSD__ +struct pf_fragment { + RB_ENTRY(pf_fragment) fr_entry; + TAILQ_ENTRY(pf_fragment) frag_next; + struct in_addr fr_src; + struct in_addr fr_dst; + u_int8_t fr_p; /* protocol of this fragment */ + u_int8_t fr_flags; /* status flags */ + u_int16_t fr_id; /* fragment id for reassemble */ + u_int16_t fr_max; /* fragment data max */ + u_int32_t fr_timeout; +#define fr_queue fr_u.fru_queue +#define fr_cache fr_u.fru_cache + union { + LIST_HEAD(pf_fragq, pf_frent) fru_queue; /* buffering */ + LIST_HEAD(pf_cacheq, pf_frcache) fru_cache; /* non-buf */ + } fr_u; +}; +#endif + +TAILQ_HEAD(pf_fragqueue, pf_fragment) pf_fragqueue; +TAILQ_HEAD(pf_cachequeue, pf_fragment) pf_cachequeue; + +#ifndef __FreeBSD__ +static __inline int pf_frag_compare(struct pf_fragment *, + struct pf_fragment *); +#else +static int pf_frag_compare(struct pf_fragment *, + struct pf_fragment *); +#endif +RB_HEAD(pf_frag_tree, pf_fragment) pf_frag_tree, pf_cache_tree; +RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); +RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); + +/* Private prototypes */ +void pf_ip2key(struct pf_fragment *, struct ip *); +void pf_remove_fragment(struct pf_fragment *); +void pf_flush_fragments(void); +void pf_free_fragment(struct pf_fragment *); +struct pf_fragment *pf_find_fragment(struct ip *, struct pf_frag_tree *); +struct mbuf *pf_reassemble(struct mbuf **, struct pf_fragment **, + struct pf_frent *, int); +struct mbuf *pf_fragcache(struct mbuf **, struct ip*, + struct pf_fragment **, int, int, int *); +int pf_normalize_tcpopt(struct pf_rule *, struct mbuf *, + struct tcphdr *, int); + +#define DPFPRINTF(x) do { \ + if (pf_status.debug >= PF_DEBUG_MISC) { \ + printf("%s: ", __func__); \ + printf x ; \ + } \ +} while(0) + +/* Globals */ +#ifdef __FreeBSD__ +uma_zone_t pf_frent_pl, pf_frag_pl, pf_cache_pl, pf_cent_pl; +uma_zone_t pf_state_scrub_pl; +#else +struct pool pf_frent_pl, pf_frag_pl, pf_cache_pl, pf_cent_pl; +struct pool pf_state_scrub_pl; +#endif +int pf_nfrents, pf_ncache; + +void +pf_normalize_init(void) +{ +#ifdef __FreeBSD__ + /* + * XXX + * No high water mark support(It's hint not hard limit). 
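+	 * The uma_zone_set_max() calls below are the UMA stand-in
+	 * for the pool_sethardlimit() calls made in the OpenBSD
+	 * branch further down.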
+ * uma_zone_set_max(pf_frag_pl, PFFRAG_FRAG_HIWAT); + */ + uma_zone_set_max(pf_frent_pl, PFFRAG_FRENT_HIWAT); + uma_zone_set_max(pf_cache_pl, PFFRAG_FRCACHE_HIWAT); + uma_zone_set_max(pf_cent_pl, PFFRAG_FRCENT_HIWAT); +#else + pool_init(&pf_frent_pl, sizeof(struct pf_frent), 0, 0, 0, "pffrent", + NULL); + pool_init(&pf_frag_pl, sizeof(struct pf_fragment), 0, 0, 0, "pffrag", + NULL); + pool_init(&pf_cache_pl, sizeof(struct pf_fragment), 0, 0, 0, + "pffrcache", NULL); + pool_init(&pf_cent_pl, sizeof(struct pf_frcache), 0, 0, 0, "pffrcent", + NULL); + pool_init(&pf_state_scrub_pl, sizeof(struct pf_state_scrub), 0, 0, 0, + "pfstscr", NULL); + + pool_sethiwat(&pf_frag_pl, PFFRAG_FRAG_HIWAT); + pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0); + pool_sethardlimit(&pf_cache_pl, PFFRAG_FRCACHE_HIWAT, NULL, 0); + pool_sethardlimit(&pf_cent_pl, PFFRAG_FRCENT_HIWAT, NULL, 0); +#endif + + TAILQ_INIT(&pf_fragqueue); + TAILQ_INIT(&pf_cachequeue); +} + +#ifdef __FreeBSD__ +static int +#else +static __inline int +#endif +pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) +{ + int diff; + + if ((diff = a->fr_id - b->fr_id)) + return (diff); + else if ((diff = a->fr_p - b->fr_p)) + return (diff); + else if (a->fr_src.s_addr < b->fr_src.s_addr) + return (-1); + else if (a->fr_src.s_addr > b->fr_src.s_addr) + return (1); + else if (a->fr_dst.s_addr < b->fr_dst.s_addr) + return (-1); + else if (a->fr_dst.s_addr > b->fr_dst.s_addr) + return (1); + return (0); +} + +void +pf_purge_expired_fragments(void) +{ + struct pf_fragment *frag; + u_int32_t expire = time_second - + pf_default_rule.timeout[PFTM_FRAG]; + + while ((frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue)) != NULL) { +#ifdef __FreeBSD__ + KASSERT((BUFFER_FRAGMENTS(frag)), + ("BUFFER_FRAGMENTS(frag) == 0: %s", __FUNCTION__)); +#else + KASSERT(BUFFER_FRAGMENTS(frag)); +#endif + if (frag->fr_timeout > expire) + break; + + DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); + pf_free_fragment(frag); + } + + while ((frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue)) != NULL) { +#ifdef __FreeBSD__ + KASSERT((!BUFFER_FRAGMENTS(frag)), + ("BUFFER_FRAGMENTS(frag) != 0: %s", __FUNCTION__)); +#else + KASSERT(!BUFFER_FRAGMENTS(frag)); +#endif + if (frag->fr_timeout > expire) + break; + + DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); + pf_free_fragment(frag); +#ifdef __FreeBSD__ + KASSERT((TAILQ_EMPTY(&pf_cachequeue) || + TAILQ_LAST(&pf_cachequeue, pf_cachequeue) != frag), + ("!(TAILQ_EMPTY() || TAILQ_LAST() == farg): %s", + __FUNCTION__)); +#else + KASSERT(TAILQ_EMPTY(&pf_cachequeue) || + TAILQ_LAST(&pf_cachequeue, pf_cachequeue) != frag); +#endif + } +} + +/* + * Try to flush old fragments to make space for new ones + */ + +void +pf_flush_fragments(void) +{ + struct pf_fragment *frag; + int goal; + + goal = pf_nfrents * 9 / 10; + DPFPRINTF(("trying to free > %d frents\n", + pf_nfrents - goal)); + while (goal < pf_nfrents) { + frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue); + if (frag == NULL) + break; + pf_free_fragment(frag); + } + + + goal = pf_ncache * 9 / 10; + DPFPRINTF(("trying to free > %d cache entries\n", + pf_ncache - goal)); + while (goal < pf_ncache) { + frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue); + if (frag == NULL) + break; + pf_free_fragment(frag); + } +} + +/* Frees the fragments and all associated entries */ + +void +pf_free_fragment(struct pf_fragment *frag) +{ + struct pf_frent *frent; + struct pf_frcache *frcache; + + /* Free all fragments */ + if (BUFFER_FRAGMENTS(frag)) { + for (frent = 
LIST_FIRST(&frag->fr_queue); frent; + frent = LIST_FIRST(&frag->fr_queue)) { + LIST_REMOVE(frent, fr_next); + + m_freem(frent->fr_m); + pool_put(&pf_frent_pl, frent); + pf_nfrents--; + } + } else { + for (frcache = LIST_FIRST(&frag->fr_cache); frcache; + frcache = LIST_FIRST(&frag->fr_cache)) { + LIST_REMOVE(frcache, fr_next); + +#ifdef __FreeBSD__ + KASSERT((LIST_EMPTY(&frag->fr_cache) || + LIST_FIRST(&frag->fr_cache)->fr_off > + frcache->fr_end), + ("! (LIST_EMPTY() || LIST_FIRST()->fr_off >" + " frcache->fr_end): %s", __FUNCTION__)); +#else + KASSERT(LIST_EMPTY(&frag->fr_cache) || + LIST_FIRST(&frag->fr_cache)->fr_off > + frcache->fr_end); +#endif + + pool_put(&pf_cent_pl, frcache); + pf_ncache--; + } + } + + pf_remove_fragment(frag); +} + +void +pf_ip2key(struct pf_fragment *key, struct ip *ip) +{ + key->fr_p = ip->ip_p; + key->fr_id = ip->ip_id; + key->fr_src.s_addr = ip->ip_src.s_addr; + key->fr_dst.s_addr = ip->ip_dst.s_addr; +} + +struct pf_fragment * +pf_find_fragment(struct ip *ip, struct pf_frag_tree *tree) +{ + struct pf_fragment key; + struct pf_fragment *frag; + + pf_ip2key(&key, ip); + + frag = RB_FIND(pf_frag_tree, tree, &key); + if (frag != NULL) { + /* XXX Are we sure we want to update the timeout? */ + frag->fr_timeout = time_second; + if (BUFFER_FRAGMENTS(frag)) { + TAILQ_REMOVE(&pf_fragqueue, frag, frag_next); + TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next); + } else { + TAILQ_REMOVE(&pf_cachequeue, frag, frag_next); + TAILQ_INSERT_HEAD(&pf_cachequeue, frag, frag_next); + } + } + + return (frag); +} + +/* Removes a fragment from the fragment queue and frees the fragment */ + +void +pf_remove_fragment(struct pf_fragment *frag) +{ + if (BUFFER_FRAGMENTS(frag)) { + RB_REMOVE(pf_frag_tree, &pf_frag_tree, frag); + TAILQ_REMOVE(&pf_fragqueue, frag, frag_next); + pool_put(&pf_frag_pl, frag); + } else { + RB_REMOVE(pf_frag_tree, &pf_cache_tree, frag); + TAILQ_REMOVE(&pf_cachequeue, frag, frag_next); + pool_put(&pf_cache_pl, frag); + } +} + +#define FR_IP_OFF(fr) ((ntohs((fr)->fr_ip->ip_off) & IP_OFFMASK) << 3) +struct mbuf * +pf_reassemble(struct mbuf **m0, struct pf_fragment **frag, + struct pf_frent *frent, int mff) +{ + struct mbuf *m = *m0, *m2; + struct pf_frent *frea, *next; + struct pf_frent *frep = NULL; + struct ip *ip = frent->fr_ip; + int hlen = ip->ip_hl << 2; + u_int16_t off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; + u_int16_t ip_len = ntohs(ip->ip_len) - ip->ip_hl * 4; + u_int16_t max = ip_len + off; + +#ifdef __FreeBSD__ + KASSERT((*frag == NULL || BUFFER_FRAGMENTS(*frag)), + ("! 
(*frag == NULL || BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__)); +#else + KASSERT(*frag == NULL || BUFFER_FRAGMENTS(*frag)); +#endif + + /* Strip off ip header */ + m->m_data += hlen; + m->m_len -= hlen; + + /* Create a new reassembly queue for this packet */ + if (*frag == NULL) { + *frag = pool_get(&pf_frag_pl, PR_NOWAIT); + if (*frag == NULL) { + pf_flush_fragments(); + *frag = pool_get(&pf_frag_pl, PR_NOWAIT); + if (*frag == NULL) + goto drop_fragment; + } + + (*frag)->fr_flags = 0; + (*frag)->fr_max = 0; + (*frag)->fr_src = frent->fr_ip->ip_src; + (*frag)->fr_dst = frent->fr_ip->ip_dst; + (*frag)->fr_p = frent->fr_ip->ip_p; + (*frag)->fr_id = frent->fr_ip->ip_id; + (*frag)->fr_timeout = time_second; + LIST_INIT(&(*frag)->fr_queue); + + RB_INSERT(pf_frag_tree, &pf_frag_tree, *frag); + TAILQ_INSERT_HEAD(&pf_fragqueue, *frag, frag_next); + + /* We do not have a previous fragment */ + frep = NULL; + goto insert; + } + + /* + * Find a fragment after the current one: + * - off contains the real shifted offset. + */ + LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) { + if (FR_IP_OFF(frea) > off) + break; + frep = frea; + } + +#ifdef __FreeBSD__ + KASSERT((frep != NULL || frea != NULL), + ("!(frep != NULL || frea != NULL): %s", __FUNCTION__));; +#else + KASSERT(frep != NULL || frea != NULL); +#endif + + if (frep != NULL && + FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * + 4 > off) + { + u_int16_t precut; + + precut = FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - + frep->fr_ip->ip_hl * 4 - off; + if (precut >= ip_len) + goto drop_fragment; + m_adj(frent->fr_m, precut); + DPFPRINTF(("overlap -%d\n", precut)); + /* Enforce 8 byte boundaries */ + ip->ip_off = htons(ntohs(ip->ip_off) + (precut >> 3)); + off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; + ip_len -= precut; + ip->ip_len = htons(ip_len); + } + + for (; frea != NULL && ip_len + off > FR_IP_OFF(frea); + frea = next) + { + u_int16_t aftercut; + + aftercut = ip_len + off - FR_IP_OFF(frea); + DPFPRINTF(("adjust overlap %d\n", aftercut)); + if (aftercut < ntohs(frea->fr_ip->ip_len) - frea->fr_ip->ip_hl + * 4) + { + frea->fr_ip->ip_len = + htons(ntohs(frea->fr_ip->ip_len) - aftercut); + frea->fr_ip->ip_off = htons(ntohs(frea->fr_ip->ip_off) + + (aftercut >> 3)); + m_adj(frea->fr_m, aftercut); + break; + } + + /* This fragment is completely overlapped, lose it */ + next = LIST_NEXT(frea, fr_next); + m_freem(frea->fr_m); + LIST_REMOVE(frea, fr_next); + pool_put(&pf_frent_pl, frea); + pf_nfrents--; + } + + insert: + /* Update maximum data size */ + if ((*frag)->fr_max < max) + (*frag)->fr_max = max; + /* This is the last segment */ + if (!mff) + (*frag)->fr_flags |= PFFRAG_SEENLAST; + + if (frep == NULL) + LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next); + else + LIST_INSERT_AFTER(frep, frent, fr_next); + + /* Check if we are completely reassembled */ + if (!((*frag)->fr_flags & PFFRAG_SEENLAST)) + return (NULL); + + /* Check if we have all the data */ + off = 0; + for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) { + next = LIST_NEXT(frep, fr_next); + + off += ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4; + if (off < (*frag)->fr_max && + (next == NULL || FR_IP_OFF(next) != off)) + { + DPFPRINTF(("missing fragment at %d, next %d, max %d\n", + off, next == NULL ? 
-1 : FR_IP_OFF(next), + (*frag)->fr_max)); + return (NULL); + } + } + DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max)); + if (off < (*frag)->fr_max) + return (NULL); + + /* We have all the data */ + frent = LIST_FIRST(&(*frag)->fr_queue); +#ifdef __FreeBSD__ + KASSERT((frent != NULL), ("frent == NULL: %s", __FUNCTION__)); +#else + KASSERT(frent != NULL); +#endif + if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) { + DPFPRINTF(("drop: too big: %d\n", off)); + pf_free_fragment(*frag); + *frag = NULL; + return (NULL); + } + next = LIST_NEXT(frent, fr_next); + + /* Magic from ip_input */ + ip = frent->fr_ip; + m = frent->fr_m; + m2 = m->m_next; + m->m_next = NULL; + m_cat(m, m2); + pool_put(&pf_frent_pl, frent); + pf_nfrents--; + for (frent = next; frent != NULL; frent = next) { + next = LIST_NEXT(frent, fr_next); + + m2 = frent->fr_m; + pool_put(&pf_frent_pl, frent); + pf_nfrents--; +#ifdef __FreeBSD__ + m->m_pkthdr.csum_flags &= m2->m_pkthdr.csum_flags; + m->m_pkthdr.csum_data += m2->m_pkthdr.csum_data; +#endif + m_cat(m, m2); + } +#ifdef __FreeBSD__ + while (m->m_pkthdr.csum_data & 0xffff0000) + m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + + (m->m_pkthdr.csum_data >> 16); +#endif + + ip->ip_src = (*frag)->fr_src; + ip->ip_dst = (*frag)->fr_dst; + + /* Remove from fragment queue */ + pf_remove_fragment(*frag); + *frag = NULL; + + hlen = ip->ip_hl << 2; + ip->ip_len = htons(off + hlen); + m->m_len += hlen; + m->m_data -= hlen; + + /* some debugging cruft by sklower, below, will go away soon */ + /* XXX this should be done elsewhere */ + if (m->m_flags & M_PKTHDR) { + int plen = 0; + for (m2 = m; m2; m2 = m2->m_next) + plen += m2->m_len; + m->m_pkthdr.len = plen; + } + + DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len))); + return (m); + + drop_fragment: + /* Oops - fail safe - drop packet */ + pool_put(&pf_frent_pl, frent); + pf_nfrents--; + m_freem(m); + return (NULL); +} + +struct mbuf * +pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff, + int drop, int *nomem) +{ + struct mbuf *m = *m0; + struct pf_frcache *frp, *fra, *cur = NULL; + int ip_len = ntohs(h->ip_len) - (h->ip_hl << 2); + u_int16_t off = ntohs(h->ip_off) << 3; + u_int16_t max = ip_len + off; + int hosed = 0; + +#ifdef __FreeBSD__ + KASSERT((*frag == NULL || !BUFFER_FRAGMENTS(*frag)), + ("!(*frag == NULL || !BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__)); +#else + KASSERT(*frag == NULL || !BUFFER_FRAGMENTS(*frag)); +#endif + + /* Create a new range queue for this packet */ + if (*frag == NULL) { + *frag = pool_get(&pf_cache_pl, PR_NOWAIT); + if (*frag == NULL) { + pf_flush_fragments(); + *frag = pool_get(&pf_cache_pl, PR_NOWAIT); + if (*frag == NULL) + goto no_mem; + } + + /* Get an entry for the queue */ + cur = pool_get(&pf_cent_pl, PR_NOWAIT); + if (cur == NULL) { + pool_put(&pf_cache_pl, *frag); + *frag = NULL; + goto no_mem; + } + pf_ncache++; + + (*frag)->fr_flags = PFFRAG_NOBUFFER; + (*frag)->fr_max = 0; + (*frag)->fr_src = h->ip_src; + (*frag)->fr_dst = h->ip_dst; + (*frag)->fr_p = h->ip_p; + (*frag)->fr_id = h->ip_id; + (*frag)->fr_timeout = time_second; + + cur->fr_off = off; + cur->fr_end = max; + LIST_INIT(&(*frag)->fr_cache); + LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next); + + RB_INSERT(pf_frag_tree, &pf_cache_tree, *frag); + TAILQ_INSERT_HEAD(&pf_cachequeue, *frag, frag_next); + + DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off, max)); + + goto pass; + } + + /* + * Find a fragment after the current one: + * - off contains the real shifted offset. 
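+	 * The loop below leaves frp pointing at the last cache entry
+	 * starting at or before off (NULL if none) and fra at the
+	 * first entry past off; the precut/aftercut trimming that
+	 * follows works on that pair.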
+ */ + frp = NULL; + LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) { + if (fra->fr_off > off) + break; + frp = fra; + } + +#ifdef __FreeBSD__ + KASSERT((frp != NULL || fra != NULL), + ("!(frp != NULL || fra != NULL): %s", __FUNCTION__)); +#else + KASSERT(frp != NULL || fra != NULL); +#endif + + if (frp != NULL) { + int precut; + + precut = frp->fr_end - off; + if (precut >= ip_len) { + /* Fragment is entirely a duplicate */ + DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n", + h->ip_id, frp->fr_off, frp->fr_end, off, max)); + goto drop_fragment; + } + if (precut == 0) { + /* They are adjacent. Fixup cache entry */ + DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n", + h->ip_id, frp->fr_off, frp->fr_end, off, max)); + frp->fr_end = max; + } else if (precut > 0) { + /* The first part of this payload overlaps with a + * fragment that has already been passed. + * Need to trim off the first part of the payload. + * But to do so easily, we need to create another + * mbuf to throw the original header into. + */ + + DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n", + h->ip_id, precut, frp->fr_off, frp->fr_end, off, + max)); + + off += precut; + max -= precut; + /* Update the previous frag to encompass this one */ + frp->fr_end = max; + + if (!drop) { + /* XXX Optimization opportunity + * This is a very heavy way to trim the payload. + * we could do it much faster by diddling mbuf + * internals but that would be even less legible + * than this mbuf magic. For my next trick, + * I'll pull a rabbit out of my laptop. + */ +#ifdef __FreeBSD__ + *m0 = m_dup(m, M_DONTWAIT); +#else + *m0 = m_copym2(m, 0, h->ip_hl << 2, M_NOWAIT); +#endif + if (*m0 == NULL) + goto no_mem; +#ifdef __FreeBSD__ + /* From KAME Project : We have missed this! */ + m_adj(*m0, (h->ip_hl << 2) - + (*m0)->m_pkthdr.len); + + KASSERT(((*m0)->m_next == NULL), + ("(*m0)->m_next != NULL: %s", + __FUNCTION__)); +#else + KASSERT((*m0)->m_next == NULL); +#endif + m_adj(m, precut + (h->ip_hl << 2)); + m_cat(*m0, m); + m = *m0; + if (m->m_flags & M_PKTHDR) { + int plen = 0; + struct mbuf *t; + for (t = m; t; t = t->m_next) + plen += t->m_len; + m->m_pkthdr.len = plen; + } + + + h = mtod(m, struct ip *); + +#ifdef __FreeBSD__ + KASSERT(((int)m->m_len == + ntohs(h->ip_len) - precut), + ("m->m_len != ntohs(h->ip_len) - precut: %s", + __FUNCTION__)); +#else + KASSERT((int)m->m_len == + ntohs(h->ip_len) - precut); +#endif + h->ip_off = htons(ntohs(h->ip_off) + + (precut >> 3)); + h->ip_len = htons(ntohs(h->ip_len) - precut); + } else { + hosed++; + } + } else { + /* There is a gap between fragments */ + + DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n", + h->ip_id, -precut, frp->fr_off, frp->fr_end, off, + max)); + + cur = pool_get(&pf_cent_pl, PR_NOWAIT); + if (cur == NULL) + goto no_mem; + pf_ncache++; + + cur->fr_off = off; + cur->fr_end = max; + LIST_INSERT_AFTER(frp, cur, fr_next); + } + } + + if (fra != NULL) { + int aftercut; + int merge = 0; + + aftercut = max - fra->fr_off; + if (aftercut == 0) { + /* Adjacent fragments */ + DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n", + h->ip_id, off, max, fra->fr_off, fra->fr_end)); + fra->fr_off = off; + merge = 1; + } else if (aftercut > 0) { + /* Need to chop off the tail of this fragment */ + DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n", + h->ip_id, aftercut, off, max, fra->fr_off, + fra->fr_end)); + fra->fr_off = off; + max -= aftercut; + + merge = 1; + + if (!drop) { + m_adj(m, -aftercut); + if (m->m_flags & M_PKTHDR) { + int plen = 0; + struct mbuf *t; + for (t = m; t; t 
= t->m_next) + plen += t->m_len; + m->m_pkthdr.len = plen; + } + h = mtod(m, struct ip *); +#ifdef __FreeBSD__ + KASSERT(((int)m->m_len == ntohs(h->ip_len) - aftercut), + ("m->m_len != ntohs(h->ip_len) - aftercut: %s", + __FUNCTION__)); +#else + KASSERT((int)m->m_len == + ntohs(h->ip_len) - aftercut); +#endif + h->ip_len = htons(ntohs(h->ip_len) - aftercut); + } else { + hosed++; + } + } else if (frp == NULL) { + /* There is a gap between fragments */ + DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n", + h->ip_id, -aftercut, off, max, fra->fr_off, + fra->fr_end)); + + cur = pool_get(&pf_cent_pl, PR_NOWAIT); + if (cur == NULL) + goto no_mem; + pf_ncache++; + + cur->fr_off = off; + cur->fr_end = max; + LIST_INSERT_BEFORE(fra, cur, fr_next); + } + + + /* Need to glue together two separate fragment descriptors */ + if (merge) { + if (cur && fra->fr_off <= cur->fr_end) { + /* Need to merge in a previous 'cur' */ + DPFPRINTF(("fragcache[%d]: adjacent(merge " + "%d-%d) %d-%d (%d-%d)\n", + h->ip_id, cur->fr_off, cur->fr_end, off, + max, fra->fr_off, fra->fr_end)); + fra->fr_off = cur->fr_off; + LIST_REMOVE(cur, fr_next); + pool_put(&pf_cent_pl, cur); + pf_ncache--; + cur = NULL; + + } else if (frp && fra->fr_off <= frp->fr_end) { + /* Need to merge in a modified 'frp' */ +#ifdef __FreeBSD__ + KASSERT((cur == NULL), ("cur != NULL: %s", + __FUNCTION__)); +#else + KASSERT(cur == NULL); +#endif + DPFPRINTF(("fragcache[%d]: adjacent(merge " + "%d-%d) %d-%d (%d-%d)\n", + h->ip_id, frp->fr_off, frp->fr_end, off, + max, fra->fr_off, fra->fr_end)); + fra->fr_off = frp->fr_off; + LIST_REMOVE(frp, fr_next); + pool_put(&pf_cent_pl, frp); + pf_ncache--; + frp = NULL; + + } + } + } + + if (hosed) { + /* + * We must keep tracking the overall fragment even when + * we're going to drop it anyway so that we know when to + * free the overall descriptor. Thus we drop the frag late. + */ + goto drop_fragment; + } + + + pass: + /* Update maximum data size */ + if ((*frag)->fr_max < max) + (*frag)->fr_max = max; + + /* This is the last segment */ + if (!mff) + (*frag)->fr_flags |= PFFRAG_SEENLAST; + + /* Check if we are completely reassembled */ + if (((*frag)->fr_flags & PFFRAG_SEENLAST) && + LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 && + LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) { + /* Remove from fragment queue */ + DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id, + (*frag)->fr_max)); + pf_free_fragment(*frag); + *frag = NULL; + } + + return (m); + + no_mem: + *nomem = 1; + + /* Still need to pay attention to !IP_MF */ + if (!mff && *frag != NULL) + (*frag)->fr_flags |= PFFRAG_SEENLAST; + + m_freem(m); + return (NULL); + + drop_fragment: + + /* Still need to pay attention to !IP_MF */ + if (!mff && *frag != NULL) + (*frag)->fr_flags |= PFFRAG_SEENLAST; + + if (drop) { + /* This fragment has been deemed bad. 
Don't reass */ + if (((*frag)->fr_flags & PFFRAG_DROP) == 0) + DPFPRINTF(("fragcache[%d]: dropping overall fragment\n", + h->ip_id)); + (*frag)->fr_flags |= PFFRAG_DROP; + } + + m_freem(m); + return (NULL); +} + +int +pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, + struct pf_pdesc *pd) +{ + struct mbuf *m = *m0; + struct pf_rule *r; + struct pf_frent *frent; + struct pf_fragment *frag = NULL; + struct ip *h = mtod(m, struct ip *); + int mff = (ntohs(h->ip_off) & IP_MF); + int hlen = h->ip_hl << 2; + u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; + u_int16_t max; + int ip_len; + int ip_off; + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != dir) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != AF_INET) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != h->ip_p) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, + (struct pf_addr *)&h->ip_src.s_addr, AF_INET, + r->src.neg, kif)) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, + (struct pf_addr *)&h->ip_dst.s_addr, AF_INET, + r->dst.neg, NULL)) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else + break; + } + + if (r == NULL || r->action == PF_NOSCRUB) + return (PF_PASS); + else { + r->packets[dir == PF_OUT]++; + r->bytes[dir == PF_OUT] += pd->tot_len; + } + + /* Check for illegal packets */ + if (hlen < (int)sizeof(struct ip)) + goto drop; + + if (hlen > ntohs(h->ip_len)) + goto drop; + + /* Clear IP_DF if the rule uses the no-df option */ + if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) { + u_int16_t ip_off = h->ip_off; + + h->ip_off &= htons(~IP_DF); + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); + } + + /* We will need other tests here */ + if (!fragoff && !mff) + goto no_fragment; + + /* We're dealing with a fragment now. Don't allow fragments + * with IP_DF to enter the cache. If the flag was cleared by + * no-df above, fine. Otherwise drop it. 
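+	 * A fragment still carrying IP_DF is self-contradictory (DF
+	 * forbids the very fragmentation that produced it), so it is
+	 * treated as crafted and dropped.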
+ */ + if (h->ip_off & htons(IP_DF)) { + DPFPRINTF(("IP_DF\n")); + goto bad; + } + + ip_len = ntohs(h->ip_len) - hlen; + ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3; + + /* All fragments are 8 byte aligned */ + if (mff && (ip_len & 0x7)) { + DPFPRINTF(("mff and %d\n", ip_len)); + goto bad; + } + + /* Respect maximum length */ + if (fragoff + ip_len > IP_MAXPACKET) { + DPFPRINTF(("max packet %d\n", fragoff + ip_len)); + goto bad; + } + max = fragoff + ip_len; + + if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) { + /* Fully buffer all of the fragments */ + + frag = pf_find_fragment(h, &pf_frag_tree); + + /* Check if we saw the last fragment already */ + if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) && + max > frag->fr_max) + goto bad; + + /* Get an entry for the fragment queue */ + frent = pool_get(&pf_frent_pl, PR_NOWAIT); + if (frent == NULL) { + REASON_SET(reason, PFRES_MEMORY); + return (PF_DROP); + } + pf_nfrents++; + frent->fr_ip = h; + frent->fr_m = m; + + /* Might return a completely reassembled mbuf, or NULL */ + DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max)); + *m0 = m = pf_reassemble(m0, &frag, frent, mff); + + if (m == NULL) + return (PF_DROP); + + /* use mtag from concatenated mbuf chain */ + pd->pf_mtag = pf_find_mtag(m); +#ifdef DIAGNOSTIC + if (pd->pf_mtag == NULL) { + printf("%s: pf_find_mtag returned NULL(1)\n", __func__); + if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) { + m_freem(m); + *m0 = NULL; + goto no_mem; + } + } +#endif + if (frag != NULL && (frag->fr_flags & PFFRAG_DROP)) + goto drop; + + h = mtod(m, struct ip *); + } else { + /* non-buffering fragment cache (drops or masks overlaps) */ + int nomem = 0; + + if (dir == PF_OUT && pd->pf_mtag->flags & PF_TAG_FRAGCACHE) { + /* + * Already passed the fragment cache in the + * input direction. If we continued, it would + * appear to be a dup and would be dropped. + */ + goto fragment_pass; + } + + frag = pf_find_fragment(h, &pf_cache_tree); + + /* Check if we saw the last fragment already */ + if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) && + max > frag->fr_max) { + if (r->rule_flag & PFRULE_FRAGDROP) + frag->fr_flags |= PFFRAG_DROP; + goto bad; + } + + *m0 = m = pf_fragcache(m0, h, &frag, mff, + (r->rule_flag & PFRULE_FRAGDROP) ? 
1 : 0, &nomem); + if (m == NULL) { + if (nomem) + goto no_mem; + goto drop; + } + + /* use mtag from copied and trimmed mbuf chain */ + pd->pf_mtag = pf_find_mtag(m); +#ifdef DIAGNOSTIC + if (pd->pf_mtag == NULL) { + printf("%s: pf_find_mtag returned NULL(2)\n", __func__); + if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) { + m_freem(m); + *m0 = NULL; + goto no_mem; + } + } +#endif + if (dir == PF_IN) + pd->pf_mtag->flags |= PF_TAG_FRAGCACHE; + + if (frag != NULL && (frag->fr_flags & PFFRAG_DROP)) + goto drop; + goto fragment_pass; + } + + no_fragment: + /* At this point, only IP_DF is allowed in ip_off */ + if (h->ip_off & ~htons(IP_DF)) { + u_int16_t ip_off = h->ip_off; + + h->ip_off &= htons(IP_DF); + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); + } + + /* Enforce a minimum ttl, may cause endless packet loops */ + if (r->min_ttl && h->ip_ttl < r->min_ttl) { + u_int16_t ip_ttl = h->ip_ttl; + + h->ip_ttl = r->min_ttl; + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0); + } + + if (r->rule_flag & PFRULE_RANDOMID) { + u_int16_t ip_id = h->ip_id; + + h->ip_id = ip_randomid(); + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0); + } + if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) + pd->flags |= PFDESC_IP_REAS; + + return (PF_PASS); + + fragment_pass: + /* Enforce a minimum ttl, may cause endless packet loops */ + if (r->min_ttl && h->ip_ttl < r->min_ttl) { + u_int16_t ip_ttl = h->ip_ttl; + + h->ip_ttl = r->min_ttl; + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0); + } + if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) + pd->flags |= PFDESC_IP_REAS; + return (PF_PASS); + + no_mem: + REASON_SET(reason, PFRES_MEMORY); + if (r != NULL && r->log) + PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd); + return (PF_DROP); + + drop: + REASON_SET(reason, PFRES_NORM); + if (r != NULL && r->log) + PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd); + return (PF_DROP); + + bad: + DPFPRINTF(("dropping bad fragment\n")); + + /* Free associated fragments */ + if (frag != NULL) + pf_free_fragment(frag); + + REASON_SET(reason, PFRES_FRAG); + if (r != NULL && r->log) + PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd); + + return (PF_DROP); +} + +#ifdef INET6 +int +pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, + u_short *reason, struct pf_pdesc *pd) +{ + struct mbuf *m = *m0; + struct pf_rule *r; + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + int off; + struct ip6_ext ext; + struct ip6_opt opt; + struct ip6_opt_jumbo jumbo; + struct ip6_frag frag; + u_int32_t jumbolen = 0, plen; + u_int16_t fragoff = 0; + int optend; + int ooff; + u_int8_t proto; + int terminal; + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != dir) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != AF_INET6) + r = r->skip[PF_SKIP_AF].ptr; +#if 0 /* header chain! 
*/ + else if (r->proto && r->proto != h->ip6_nxt) + r = r->skip[PF_SKIP_PROTO].ptr; +#endif + else if (PF_MISMATCHAW(&r->src.addr, + (struct pf_addr *)&h->ip6_src, AF_INET6, + r->src.neg, kif)) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, + (struct pf_addr *)&h->ip6_dst, AF_INET6, + r->dst.neg, NULL)) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else + break; + } + + if (r == NULL || r->action == PF_NOSCRUB) + return (PF_PASS); + else { + r->packets[dir == PF_OUT]++; + r->bytes[dir == PF_OUT] += pd->tot_len; + } + + /* Check for illegal packets */ + if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len) + goto drop; + + off = sizeof(struct ip6_hdr); + proto = h->ip6_nxt; + terminal = 0; + do { + switch (proto) { + case IPPROTO_FRAGMENT: + goto fragment; + break; + case IPPROTO_AH: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: + if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, + NULL, AF_INET6)) + goto shortpkt; + if (proto == IPPROTO_AH) + off += (ext.ip6e_len + 2) * 4; + else + off += (ext.ip6e_len + 1) * 8; + proto = ext.ip6e_nxt; + break; + case IPPROTO_HOPOPTS: + if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, + NULL, AF_INET6)) + goto shortpkt; + optend = off + (ext.ip6e_len + 1) * 8; + ooff = off + sizeof(ext); + do { + if (!pf_pull_hdr(m, ooff, &opt.ip6o_type, + sizeof(opt.ip6o_type), NULL, NULL, + AF_INET6)) + goto shortpkt; + if (opt.ip6o_type == IP6OPT_PAD1) { + ooff++; + continue; + } + if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt), + NULL, NULL, AF_INET6)) + goto shortpkt; + if (ooff + sizeof(opt) + opt.ip6o_len > optend) + goto drop; + switch (opt.ip6o_type) { + case IP6OPT_JUMBO: + if (h->ip6_plen != 0) + goto drop; + if (!pf_pull_hdr(m, ooff, &jumbo, + sizeof(jumbo), NULL, NULL, + AF_INET6)) + goto shortpkt; + memcpy(&jumbolen, jumbo.ip6oj_jumbo_len, + sizeof(jumbolen)); + jumbolen = ntohl(jumbolen); + if (jumbolen <= IPV6_MAXPACKET) + goto drop; + if (sizeof(struct ip6_hdr) + jumbolen != + m->m_pkthdr.len) + goto drop; + break; + default: + break; + } + ooff += sizeof(opt) + opt.ip6o_len; + } while (ooff < optend); + + off = optend; + proto = ext.ip6e_nxt; + break; + default: + terminal = 1; + break; + } + } while (!terminal); + + /* jumbo payload option must be present, or plen > 0 */ + if (ntohs(h->ip6_plen) == 0) + plen = jumbolen; + else + plen = ntohs(h->ip6_plen); + if (plen == 0) + goto drop; + if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) + goto shortpkt; + + /* Enforce a minimum ttl, may cause endless packet loops */ + if (r->min_ttl && h->ip6_hlim < r->min_ttl) + h->ip6_hlim = r->min_ttl; + + return (PF_PASS); + + fragment: + if (ntohs(h->ip6_plen) == 0 || jumbolen) + goto drop; + plen = ntohs(h->ip6_plen); + + if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6)) + goto shortpkt; + fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK); + if (fragoff + (plen - off - sizeof(frag)) > IPV6_MAXPACKET) + goto badfrag; + + /* do something about it */ + /* remember to set pd->flags |= PFDESC_IP_REAS */ + return (PF_PASS); + + shortpkt: + REASON_SET(reason, PFRES_SHORT); + if (r != NULL && r->log) + PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd); + return (PF_DROP); + + drop: + REASON_SET(reason, PFRES_NORM); + if (r != NULL && r->log) + PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd); + return (PF_DROP); + + badfrag: + REASON_SET(reason, PFRES_FRAG); + if (r != NULL && r->log) + PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd); + return (PF_DROP); +} +#endif /* 
INET6 */ + +int +pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff, + int off, void *h, struct pf_pdesc *pd) +{ + struct pf_rule *r, *rm = NULL; + struct tcphdr *th = pd->hdr.tcp; + int rewrite = 0; + u_short reason; + u_int8_t flags; + sa_family_t af = pd->af; + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != dir) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, + r->src.neg, kif)) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (r->src.port_op && !pf_match_port(r->src.port_op, + r->src.port[0], r->src.port[1], th->th_sport)) + r = r->skip[PF_SKIP_SRC_PORT].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, + r->dst.neg, NULL)) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->dst.port_op && !pf_match_port(r->dst.port_op, + r->dst.port[0], r->dst.port[1], th->th_dport)) + r = r->skip[PF_SKIP_DST_PORT].ptr; + else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match( + pf_osfp_fingerprint(pd, m, off, th), + r->os_fingerprint)) + r = TAILQ_NEXT(r, entries); + else { + rm = r; + break; + } + } + + if (rm == NULL || rm->action == PF_NOSCRUB) + return (PF_PASS); + else { + r->packets[dir == PF_OUT]++; + r->bytes[dir == PF_OUT] += pd->tot_len; + } + + if (rm->rule_flag & PFRULE_REASSEMBLE_TCP) + pd->flags |= PFDESC_TCP_NORM; + + flags = th->th_flags; + if (flags & TH_SYN) { + /* Illegal packet */ + if (flags & TH_RST) + goto tcp_drop; + + if (flags & TH_FIN) + flags &= ~TH_FIN; + } else { + /* Illegal packet */ + if (!(flags & (TH_ACK|TH_RST))) + goto tcp_drop; + } + + if (!(flags & TH_ACK)) { + /* These flags are only valid if ACK is set */ + if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG)) + goto tcp_drop; + } + + /* Check for illegal header length */ + if (th->th_off < (sizeof(struct tcphdr) >> 2)) + goto tcp_drop; + + /* If flags changed, or reserved data set, then adjust */ + if (flags != th->th_flags || th->th_x2 != 0) { + u_int16_t ov, nv; + + ov = *(u_int16_t *)(&th->th_ack + 1); + th->th_flags = flags; + th->th_x2 = 0; + nv = *(u_int16_t *)(&th->th_ack + 1); + + th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0); + rewrite = 1; + } + + /* Remove urgent pointer, if TH_URG is not set */ + if (!(flags & TH_URG) && th->th_urp) { + th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0); + th->th_urp = 0; + rewrite = 1; + } + + /* Process options */ + if (r->max_mss && pf_normalize_tcpopt(r, m, th, off)) + rewrite = 1; + + /* copy back packet headers if we sanitized */ + if (rewrite) + m_copyback(m, off, sizeof(*th), (caddr_t)th); + + return (PF_PASS); + + tcp_drop: + REASON_SET(&reason, PFRES_NORM); + if (rm != NULL && r->log) + PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, r, NULL, NULL, pd); + return (PF_DROP); +} + +int +pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd, + struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst) +{ + u_int32_t tsval, tsecr; + u_int8_t hdr[60]; + u_int8_t *opt; + +#ifdef __FreeBSD__ + KASSERT((src->scrub == NULL), + ("pf_normalize_tcp_init: src->scrub != NULL")); +#else + KASSERT(src->scrub == NULL); +#endif + + src->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT); + if (src->scrub == NULL) + return (1); + 
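+	/*
+	 * pool_get() above used PR_NOWAIT (packet path, must not
+	 * sleep); start from a zeroed scrub state before recording
+	 * the peer's initial TTL below.
+	 */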
bzero(src->scrub, sizeof(*src->scrub)); + + switch (pd->af) { +#ifdef INET + case AF_INET: { + struct ip *h = mtod(m, struct ip *); + src->scrub->pfss_ttl = h->ip_ttl; + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + src->scrub->pfss_ttl = h->ip6_hlim; + break; + } +#endif /* INET6 */ + } + + + /* + * All normalizations below are only begun if we see the start of + * the connections. They must all set an enabled bit in pfss_flags + */ + if ((th->th_flags & TH_SYN) == 0) + return (0); + + + if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub && + pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { + /* Diddle with TCP options */ + int hlen; + opt = hdr + sizeof(struct tcphdr); + hlen = (th->th_off << 2) - sizeof(struct tcphdr); + while (hlen >= TCPOLEN_TIMESTAMP) { + switch (*opt) { + case TCPOPT_EOL: /* FALLTHROUGH */ + case TCPOPT_NOP: + opt++; + hlen--; + break; + case TCPOPT_TIMESTAMP: + if (opt[1] >= TCPOLEN_TIMESTAMP) { + src->scrub->pfss_flags |= + PFSS_TIMESTAMP; + src->scrub->pfss_ts_mod = + htonl(arc4random()); + + /* note PFSS_PAWS not set yet */ + memcpy(&tsval, &opt[2], + sizeof(u_int32_t)); + memcpy(&tsecr, &opt[6], + sizeof(u_int32_t)); + src->scrub->pfss_tsval0 = ntohl(tsval); + src->scrub->pfss_tsval = ntohl(tsval); + src->scrub->pfss_tsecr = ntohl(tsecr); + getmicrouptime(&src->scrub->pfss_last); + } + /* FALLTHROUGH */ + default: + hlen -= MAX(opt[1], 2); + opt += MAX(opt[1], 2); + break; + } + } + } + + return (0); +} + +void +pf_normalize_tcp_cleanup(struct pf_state *state) +{ + if (state->src.scrub) + pool_put(&pf_state_scrub_pl, state->src.scrub); + if (state->dst.scrub) + pool_put(&pf_state_scrub_pl, state->dst.scrub); + + /* Someday... flush the TCP segment reassembly descriptors. */ +} + +int +pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd, + u_short *reason, struct tcphdr *th, struct pf_state *state, + struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback) +{ + struct timeval uptime; + u_int32_t tsval, tsecr; + u_int tsval_from_last; + u_int8_t hdr[60]; + u_int8_t *opt; + int copyback = 0; + int got_ts = 0; + +#ifdef __FreeBSD__ + KASSERT((src->scrub || dst->scrub), + ("pf_normalize_tcp_statefull: src->scrub && dst->scrub!")); +#else + KASSERT(src->scrub || dst->scrub); +#endif + + /* + * Enforce the minimum TTL seen for this connection. Negate a common + * technique to evade an intrusion detection system and confuse + * firewall state code. 
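+	 * pfss_ttl tracks the highest TTL seen from this peer and
+	 * every later packet is raised to it, so low-TTL segments
+	 * that would expire between the firewall and the destination
+	 * cannot be used to desynchronize our state.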
+ */ + switch (pd->af) { +#ifdef INET + case AF_INET: { + if (src->scrub) { + struct ip *h = mtod(m, struct ip *); + if (h->ip_ttl > src->scrub->pfss_ttl) + src->scrub->pfss_ttl = h->ip_ttl; + h->ip_ttl = src->scrub->pfss_ttl; + } + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + if (src->scrub) { + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + if (h->ip6_hlim > src->scrub->pfss_ttl) + src->scrub->pfss_ttl = h->ip6_hlim; + h->ip6_hlim = src->scrub->pfss_ttl; + } + break; + } +#endif /* INET6 */ + } + + if (th->th_off > (sizeof(struct tcphdr) >> 2) && + ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) || + (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) && + pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { + /* Diddle with TCP options */ + int hlen; + opt = hdr + sizeof(struct tcphdr); + hlen = (th->th_off << 2) - sizeof(struct tcphdr); + while (hlen >= TCPOLEN_TIMESTAMP) { + switch (*opt) { + case TCPOPT_EOL: /* FALLTHROUGH */ + case TCPOPT_NOP: + opt++; + hlen--; + break; + case TCPOPT_TIMESTAMP: + /* Modulate the timestamps. Can be used for + * NAT detection, OS uptime determination or + * reboot detection. + */ + + if (got_ts) { + /* Huh? Multiple timestamps!? */ + if (pf_status.debug >= PF_DEBUG_MISC) { + DPFPRINTF(("multiple TS??")); + pf_print_state(state); + printf("\n"); + } + REASON_SET(reason, PFRES_TS); + return (PF_DROP); + } + if (opt[1] >= TCPOLEN_TIMESTAMP) { + memcpy(&tsval, &opt[2], + sizeof(u_int32_t)); + if (tsval && src->scrub && + (src->scrub->pfss_flags & + PFSS_TIMESTAMP)) { + tsval = ntohl(tsval); + pf_change_a(&opt[2], + &th->th_sum, + htonl(tsval + + src->scrub->pfss_ts_mod), + 0); + copyback = 1; + } + + /* Modulate TS reply iff valid (!0) */ + memcpy(&tsecr, &opt[6], + sizeof(u_int32_t)); + if (tsecr && dst->scrub && + (dst->scrub->pfss_flags & + PFSS_TIMESTAMP)) { + tsecr = ntohl(tsecr) + - dst->scrub->pfss_ts_mod; + pf_change_a(&opt[6], + &th->th_sum, htonl(tsecr), + 0); + copyback = 1; + } + got_ts = 1; + } + /* FALLTHROUGH */ + default: + hlen -= MAX(opt[1], 2); + opt += MAX(opt[1], 2); + break; + } + } + if (copyback) { + /* Copyback the options, caller copys back header */ + *writeback = 1; + m_copyback(m, off + sizeof(struct tcphdr), + (th->th_off << 2) - sizeof(struct tcphdr), hdr + + sizeof(struct tcphdr)); + } + } + + + /* + * Must invalidate PAWS checks on connections idle for too long. + * The fastest allowed timestamp clock is 1ms. That turns out to + * be about 24 days before it wraps. 
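+	 * (2^32 ms comes to ~49.7 days for the full space; the SEQ_*
+	 * comparisons treat timestamps as signed 32bit sequence
+	 * numbers, so only half of that, ~24.8 days, is usable.)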
XXX Right now our lowerbound
+	 * TS echo check only works for the first 12 days of a connection
+	 * when the TS has exhausted half its 32bit space.
+	 */
+#define TS_MAX_IDLE	(24*24*60*60)
+#define TS_MAX_CONN	(12*24*60*60)	/* XXX remove when better tsecr check */
+
+	getmicrouptime(&uptime);
+	if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
+	    (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
+	    time_second - state->creation > TS_MAX_CONN)) {
+		if (pf_status.debug >= PF_DEBUG_MISC) {
+			DPFPRINTF(("src idled out of PAWS\n"));
+			pf_print_state(state);
+			printf("\n");
+		}
+		src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
+		    | PFSS_PAWS_IDLED;
+	}
+	if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
+	    uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
+		if (pf_status.debug >= PF_DEBUG_MISC) {
+			DPFPRINTF(("dst idled out of PAWS\n"));
+			pf_print_state(state);
+			printf("\n");
+		}
+		dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
+		    | PFSS_PAWS_IDLED;
+	}
+
+	if (got_ts && src->scrub && dst->scrub &&
+	    (src->scrub->pfss_flags & PFSS_PAWS) &&
+	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
+		/* Validate that the timestamps are "in-window".
+		 * RFC1323 describes TCP Timestamp options that allow
+		 * measurement of RTT (round trip time) and PAWS
+		 * (protection against wrapped sequence numbers).  PAWS
+		 * gives us a set of rules for rejecting packets on
+		 * long fat pipes (packets that were somehow delayed
+		 * in transit longer than the time it took to send the
+		 * full TCP sequence space of 4Gb).  We can use these
+		 * rules and infer a few others that will let us treat
+		 * the 32bit timestamp and the 32bit echoed timestamp
+		 * as sequence numbers to prevent a blind attacker from
+		 * inserting packets into a connection.
+		 *
+		 * RFC1323 tells us:
+		 *  - The timestamp on this packet must be greater than
+		 *    or equal to the last value echoed by the other
+		 *    endpoint.  The RFC says those will be discarded
+		 *    since it is a dup that has already been acked.
+		 *    This gives us a lowerbound on the timestamp.
+		 *        timestamp >= other last echoed timestamp
+		 *  - The timestamp will be less than or equal to
+		 *    the last timestamp plus the time between the
+		 *    last packet and now.  The RFC defines the max
+		 *    clock rate as 1ms.  We will allow clocks to be
+		 *    up to 10% fast and will allow a total difference
+		 *    of 30 seconds due to a route change.  And this
+		 *    gives us an upperbound on the timestamp.
+		 *        timestamp <= last timestamp + max ticks
+		 *    We have to be careful here.  Windows will send an
+		 *    initial timestamp of zero and then initialize it
+		 *    to a random value after the 3whs; presumably to
+		 *    avoid a DoS by having to call an expensive RNG
+		 *    during a SYN flood.  Proof MS has at least one
+		 *    good security geek.
+		 *
+		 *  - The TCP timestamp option must also echo the other
+		 *    endpoint's timestamp.  The timestamp echoed is the
+		 *    one carried on the earliest unacknowledged segment
+		 *    on the left edge of the sequence window.  The RFC
+		 *    states that the host will reject any echoed
+		 *    timestamps that were larger than any ever sent.
+		 *    This gives us an upperbound on the TS echo.
+		 *        tsecr <= largest_tsval
+		 *  - The lowerbound on the TS echo is a little more
+		 *    tricky to determine.  The other endpoint's echoed
+		 *    values will not decrease.  But there may be
+		 *    network conditions that re-order packets and
+		 *    cause our view of them to decrease.
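+		 *    (plain re-ordering alone can make the echoed
+		 *    values we observe step backwards even from a
+		 *    perfectly well-behaved peer)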
For now the
+		 *    only lowerbound we can safely determine is that
+		 *    the TS echo will never be less than the original
+		 *    TS.  XXX There is probably a better lowerbound.
+		 *    Remove TS_MAX_CONN with better lowerbound check.
+		 *        tsecr >= other original TS
+		 *
+		 * It is also important to note that the fastest
+		 * timestamp clock of 1ms will wrap its 32bit space in
+		 * 24 days.  So we just disable TS checking after 24
+		 * days of idle time.  We actually must use a 12d
+		 * connection limit until we can come up with a better
+		 * lowerbound to the TS echo check.
+		 */
+		struct timeval delta_ts;
+		int ts_fudge;
+
+
+		/*
+		 * PFTM_TS_DIFF is how many seconds of leeway to allow
+		 * a host's timestamp.  This can happen if the previous
+		 * packet got delayed in transit for much longer than
+		 * this packet.
+		 */
+		if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
+			ts_fudge = pf_default_rule.timeout[PFTM_TS_DIFF];
+
+
+		/* Calculate max ticks since the last timestamp */
+#define TS_MAXFREQ	1100		/* RFC max TS freq of 1Khz + 10% skew */
+#define TS_MICROSECS	1000000		/* microseconds per second */
+#ifdef __FreeBSD__
+#ifndef timersub
+#define timersub(tvp, uvp, vvp)						\
+	do {								\
+		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
+		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
+		if ((vvp)->tv_usec < 0) {				\
+			(vvp)->tv_sec--;				\
+			(vvp)->tv_usec += 1000000;			\
+		}							\
+	} while (0)
+#endif
+#endif
+		timersub(&uptime, &src->scrub->pfss_last, &delta_ts);
+		tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
+		tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);
+
+
+		if ((src->state >= TCPS_ESTABLISHED &&
+		    dst->state >= TCPS_ESTABLISHED) &&
+		    (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
+		    SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
+		    (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
+		    SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
+			/* Bad RFC1323 implementation or an insertion attack.
+			 *
+			 * - Solaris 2.6 and 2.7 are known to send another ACK
+			 *   after the FIN,FIN|ACK,ACK closing that carries
+			 *   an old timestamp.
+			 */
+
+			DPFPRINTF(("Timestamp failed %c%c%c%c\n",
+			    SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
+			    SEQ_GT(tsval, src->scrub->pfss_tsval +
+			    tsval_from_last) ? '1' : ' ',
+			    SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
+			    SEQ_LT(tsecr, dst->scrub->pfss_tsval0) ? 
'3' : ' ')); +#ifdef __FreeBSD__ + DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u " + "idle: %jus %lums\n", + tsval, tsecr, tsval_from_last, + (uintmax_t)delta_ts.tv_sec, + delta_ts.tv_usec / 1000)); + DPFPRINTF((" src->tsval: %u tsecr: %u\n", + src->scrub->pfss_tsval, src->scrub->pfss_tsecr)); + DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u" + "\n", dst->scrub->pfss_tsval, + dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0)); +#else + DPFPRINTF((" tsval: %lu tsecr: %lu +ticks: %lu " + "idle: %lus %lums\n", + tsval, tsecr, tsval_from_last, delta_ts.tv_sec, + delta_ts.tv_usec / 1000)); + DPFPRINTF((" src->tsval: %lu tsecr: %lu\n", + src->scrub->pfss_tsval, src->scrub->pfss_tsecr)); + DPFPRINTF((" dst->tsval: %lu tsecr: %lu tsval0: %lu" + "\n", dst->scrub->pfss_tsval, + dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0)); +#endif + if (pf_status.debug >= PF_DEBUG_MISC) { + pf_print_state(state); + pf_print_flags(th->th_flags); + printf("\n"); + } + REASON_SET(reason, PFRES_TS); + return (PF_DROP); + } + + /* XXX I'd really like to require tsecr but it's optional */ + + } else if (!got_ts && (th->th_flags & TH_RST) == 0 && + ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED) + || pd->p_len > 0 || (th->th_flags & TH_SYN)) && + src->scrub && dst->scrub && + (src->scrub->pfss_flags & PFSS_PAWS) && + (dst->scrub->pfss_flags & PFSS_PAWS)) { + /* Didn't send a timestamp. Timestamps aren't really useful + * when: + * - connection opening or closing (often not even sent). + * but we must not let an attacker to put a FIN on a + * data packet to sneak it through our ESTABLISHED check. + * - on a TCP reset. RFC suggests not even looking at TS. + * - on an empty ACK. The TS will not be echoed so it will + * probably not help keep the RTT calculation in sync and + * there isn't as much danger when the sequence numbers + * got wrapped. So some stacks don't include TS on empty + * ACKs :-( + * + * To minimize the disruption to mostly RFC1323 conformant + * stacks, we will only require timestamps on data packets. + * + * And what do ya know, we cannot require timestamps on data + * packets. There appear to be devices that do legitimate + * TCP connection hijacking. There are HTTP devices that allow + * a 3whs (with timestamps) and then buffer the HTTP request. + * If the intermediate device has the HTTP response cache, it + * will spoof the response but not bother timestamping its + * packets. So we can look for the presence of a timestamp in + * the first data packet and if there, require it in all future + * packets. + */ + + if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) { + /* + * Hey! Someone tried to sneak a packet in. Or the + * stack changed its RFC1323 behavior?!?! + */ + if (pf_status.debug >= PF_DEBUG_MISC) { + DPFPRINTF(("Did not receive expected RFC1323 " + "timestamp\n")); + pf_print_state(state); + pf_print_flags(th->th_flags); + printf("\n"); + } + REASON_SET(reason, PFRES_TS); + return (PF_DROP); + } + } + + + /* + * We will note if a host sends his data packets with or without + * timestamps. And require all data packets to contain a timestamp + * if the first does. PAWS implicitly requires that all data packets be + * timestamped. But I think there are middle-man devices that hijack + * TCP streams immediately after the 3whs and don't timestamp their + * packets (seen in a WWW accelerator or cache). 
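+	 * Hence the PFSS_DATA_TS/PFSS_DATA_NOTS latch below: the
+	 * first data packet decides whether timestamps are demanded
+	 * on the rest of the connection.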
+ */ + if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags & + (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) { + if (got_ts) + src->scrub->pfss_flags |= PFSS_DATA_TS; + else { + src->scrub->pfss_flags |= PFSS_DATA_NOTS; + if (pf_status.debug >= PF_DEBUG_MISC && dst->scrub && + (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { + /* Don't warn if other host rejected RFC1323 */ + DPFPRINTF(("Broken RFC1323 stack did not " + "timestamp data packet. Disabled PAWS " + "security.\n")); + pf_print_state(state); + pf_print_flags(th->th_flags); + printf("\n"); + } + } + } + + + /* + * Update PAWS values + */ + if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags & + (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) { + getmicrouptime(&src->scrub->pfss_last); + if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) || + (src->scrub->pfss_flags & PFSS_PAWS) == 0) + src->scrub->pfss_tsval = tsval; + + if (tsecr) { + if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) || + (src->scrub->pfss_flags & PFSS_PAWS) == 0) + src->scrub->pfss_tsecr = tsecr; + + if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 && + (SEQ_LT(tsval, src->scrub->pfss_tsval0) || + src->scrub->pfss_tsval0 == 0)) { + /* tsval0 MUST be the lowest timestamp */ + src->scrub->pfss_tsval0 = tsval; + } + + /* Only fully initialized after a TS gets echoed */ + if ((src->scrub->pfss_flags & PFSS_PAWS) == 0) + src->scrub->pfss_flags |= PFSS_PAWS; + } + } + + /* I have a dream.... TCP segment reassembly.... */ + return (0); +} + +int +pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th, + int off) +{ + u_int16_t *mss; + int thoff; + int opt, cnt, optlen = 0; + int rewrite = 0; + u_char *optp; + + thoff = th->th_off << 2; + cnt = thoff - sizeof(struct tcphdr); + optp = mtod(m, caddr_t) + off + sizeof(struct tcphdr); + + for (; cnt > 0; cnt -= optlen, optp += optlen) { + opt = optp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + if (cnt < 2) + break; + optlen = optp[1]; + if (optlen < 2 || optlen > cnt) + break; + } + switch (opt) { + case TCPOPT_MAXSEG: + mss = (u_int16_t *)(optp + 2); + if ((ntohs(*mss)) > r->max_mss) { + th->th_sum = pf_cksum_fixup(th->th_sum, + *mss, htons(r->max_mss), 0); + *mss = htons(r->max_mss); + rewrite = 1; + } + break; + default: + break; + } + } + + return (rewrite); +} diff --git a/contrib/pf/rtems/freebsd/net/pf_osfp.c b/contrib/pf/rtems/freebsd/net/pf_osfp.c new file mode 100644 index 00000000..e1d7d647 --- /dev/null +++ b/contrib/pf/rtems/freebsd/net/pf_osfp.c @@ -0,0 +1,640 @@ +#include + +/* $OpenBSD: pf_osfp.c,v 1.12 2006/12/13 18:14:10 itojun Exp $ */ + +/* + * Copyright (c) 2003 Mike Frantzen + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ * + */ + +#ifdef __FreeBSD__ +#include +__FBSDID("$FreeBSD$"); +#endif + +#include +#include +#ifdef _KERNEL +# include +#endif /* _KERNEL */ +#include + +#include +#include +#include +#include + +#include +#include + +#include +#ifdef _KERNEL +#include +#endif + +#ifdef _KERNEL +# define DPFPRINTF(format, x...) \ + if (pf_status.debug >= PF_DEBUG_NOISY) \ + printf(format , ##x) +#ifdef __FreeBSD__ +typedef uma_zone_t pool_t; +#else +typedef struct pool pool_t; +#endif + +#else +/* Userland equivalents so we can lend code to tcpdump et al. */ + +# include +# include +# include +# include +# include +# include +# define pool_t int +# define pool_get(pool, flags) malloc(*(pool)) +# define pool_put(pool, item) free(item) +# define pool_init(pool, size, a, ao, f, m, p) (*(pool)) = (size) + +# ifdef __FreeBSD__ +# define NTOHS(x) (x) = ntohs((u_int16_t)(x)) +# endif + +# ifdef PFDEBUG +# include +# define DPFPRINTF(format, x...) fprintf(stderr, format , ##x) +# else +# define DPFPRINTF(format, x...) ((void)0) +# endif /* PFDEBUG */ +#endif /* _KERNEL */ + + +SLIST_HEAD(pf_osfp_list, pf_os_fingerprint) pf_osfp_list; +pool_t pf_osfp_entry_pl; +pool_t pf_osfp_pl; + +struct pf_os_fingerprint *pf_osfp_find(struct pf_osfp_list *, + struct pf_os_fingerprint *, u_int8_t); +struct pf_os_fingerprint *pf_osfp_find_exact(struct pf_osfp_list *, + struct pf_os_fingerprint *); +void pf_osfp_insert(struct pf_osfp_list *, + struct pf_os_fingerprint *); + + +#ifdef _KERNEL +/* + * Passively fingerprint the OS of the host (IPv4 TCP SYN packets only) + * Returns the list of possible OSes. + */ +struct pf_osfp_enlist * +pf_osfp_fingerprint(struct pf_pdesc *pd, struct mbuf *m, int off, + const struct tcphdr *tcp) +{ + struct ip *ip; + struct ip6_hdr *ip6; + char hdr[60]; + + if ((pd->af != PF_INET && pd->af != PF_INET6) || + pd->proto != IPPROTO_TCP || (tcp->th_off << 2) < sizeof(*tcp)) + return (NULL); + + if (pd->af == PF_INET) { + ip = mtod(m, struct ip *); + ip6 = (struct ip6_hdr *)NULL; + } else { + ip = (struct ip *)NULL; + ip6 = mtod(m, struct ip6_hdr *); + } + if (!pf_pull_hdr(m, off, hdr, tcp->th_off << 2, NULL, NULL, + pd->af)) return (NULL); + + return (pf_osfp_fingerprint_hdr(ip, ip6, (struct tcphdr *)hdr)); +} +#endif /* _KERNEL */ + +struct pf_osfp_enlist * +pf_osfp_fingerprint_hdr(const struct ip *ip, const struct ip6_hdr *ip6, const struct tcphdr *tcp) +{ + struct pf_os_fingerprint fp, *fpresult; + int cnt, optlen = 0; + const u_int8_t *optp; +#ifdef _KERNEL + char srcname[128]; +#else + char srcname[NI_MAXHOST]; +#endif +#ifdef __rtems__ +#ifdef INET6 +char ip6buf[INET6_ADDRSTRLEN]; +#endif //INET6 +#endif //__rtems__ + + if ((tcp->th_flags & (TH_SYN|TH_ACK)) != TH_SYN) + return (NULL); + if (ip) { + if ((ip->ip_off & htons(IP_OFFMASK)) != 0) + return (NULL); + } + + memset(&fp, 0, sizeof(fp)); + + if (ip) { +#ifndef _KERNEL + struct sockaddr_in sin; +#endif + + fp.fp_psize = ntohs(ip->ip_len); + fp.fp_ttl = ip->ip_ttl; + if (ip->ip_off & htons(IP_DF)) + fp.fp_flags |= PF_OSFP_DF; +#ifdef _KERNEL + strlcpy(srcname, inet_ntoa(ip->ip_src), sizeof(srcname)); +#else + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_addr = ip->ip_src; + (void)getnameinfo((struct sockaddr *)&sin, + sizeof(struct sockaddr_in), srcname, sizeof(srcname), + NULL, 0, NI_NUMERICHOST); +#endif + } +#ifdef INET6 + else if (ip6) { +#ifndef _KERNEL + struct sockaddr_in6 sin6; +#endif + + /* jumbo payload? 
*/ + fp.fp_psize = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); + fp.fp_ttl = ip6->ip6_hlim; + fp.fp_flags |= PF_OSFP_DF; + fp.fp_flags |= PF_OSFP_INET6; +#ifdef _KERNEL +#ifndef __rtems__ + strlcpy(srcname, ip6_sprintf((struct in6_addr *)&ip6->ip6_src), + sizeof(srcname)); +#else + strlcpy(srcname, ip6_sprintf(&ip6buf, (struct in6_addr *)&ip6->ip6_src), + sizeof(srcname)); +#endif +#else + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_addr = ip6->ip6_src; + (void)getnameinfo((struct sockaddr *)&sin6, + sizeof(struct sockaddr_in6), srcname, sizeof(srcname), + NULL, 0, NI_NUMERICHOST); +#endif + } +#endif + else + return (NULL); + fp.fp_wsize = ntohs(tcp->th_win); + + + cnt = (tcp->th_off << 2) - sizeof(*tcp); + optp = (const u_int8_t *)((const char *)tcp + sizeof(*tcp)); + for (; cnt > 0; cnt -= optlen, optp += optlen) { + if (*optp == TCPOPT_EOL) + break; + + fp.fp_optcnt++; + if (*optp == TCPOPT_NOP) { + fp.fp_tcpopts = (fp.fp_tcpopts << PF_OSFP_TCPOPT_BITS) | + PF_OSFP_TCPOPT_NOP; + optlen = 1; + } else { + if (cnt < 2) + return (NULL); + optlen = optp[1]; + if (optlen > cnt || optlen < 2) + return (NULL); + switch (*optp) { + case TCPOPT_MAXSEG: + if (optlen >= TCPOLEN_MAXSEG) + memcpy(&fp.fp_mss, &optp[2], + sizeof(fp.fp_mss)); + fp.fp_tcpopts = (fp.fp_tcpopts << + PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_MSS; + NTOHS(fp.fp_mss); + break; + case TCPOPT_WINDOW: + if (optlen >= TCPOLEN_WINDOW) + memcpy(&fp.fp_wscale, &optp[2], + sizeof(fp.fp_wscale)); + NTOHS(fp.fp_wscale); + fp.fp_tcpopts = (fp.fp_tcpopts << + PF_OSFP_TCPOPT_BITS) | + PF_OSFP_TCPOPT_WSCALE; + break; + case TCPOPT_SACK_PERMITTED: + fp.fp_tcpopts = (fp.fp_tcpopts << + PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_SACK; + break; + case TCPOPT_TIMESTAMP: + if (optlen >= TCPOLEN_TIMESTAMP) { + u_int32_t ts; + memcpy(&ts, &optp[2], sizeof(ts)); + if (ts == 0) + fp.fp_flags |= PF_OSFP_TS0; + + } + fp.fp_tcpopts = (fp.fp_tcpopts << + PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_TS; + break; + default: + return (NULL); + } + } + optlen = MAX(optlen, 1); /* paranoia */ + } + + DPFPRINTF("fingerprinted %s:%d %d:%d:%d:%d:%llx (%d) " + "(TS=%s,M=%s%d,W=%s%d)\n", + srcname, ntohs(tcp->th_sport), + fp.fp_wsize, fp.fp_ttl, (fp.fp_flags & PF_OSFP_DF) != 0, + fp.fp_psize, (long long int)fp.fp_tcpopts, fp.fp_optcnt, + (fp.fp_flags & PF_OSFP_TS0) ? "0" : "", + (fp.fp_flags & PF_OSFP_MSS_MOD) ? "%" : + (fp.fp_flags & PF_OSFP_MSS_DC) ? "*" : "", + fp.fp_mss, + (fp.fp_flags & PF_OSFP_WSCALE_MOD) ? "%" : + (fp.fp_flags & PF_OSFP_WSCALE_DC) ? 
"*" : "", + fp.fp_wscale); + + if ((fpresult = pf_osfp_find(&pf_osfp_list, &fp, + PF_OSFP_MAXTTL_OFFSET))) + return (&fpresult->fp_oses); + return (NULL); +} + +/* Match a fingerprint ID against a list of OSes */ +int +pf_osfp_match(struct pf_osfp_enlist *list, pf_osfp_t os) +{ + struct pf_osfp_entry *entry; + int os_class, os_version, os_subtype; + int en_class, en_version, en_subtype; + + if (os == PF_OSFP_ANY) + return (1); + if (list == NULL) { + DPFPRINTF("osfp no match against %x\n", os); + return (os == PF_OSFP_UNKNOWN); + } + PF_OSFP_UNPACK(os, os_class, os_version, os_subtype); + SLIST_FOREACH(entry, list, fp_entry) { + PF_OSFP_UNPACK(entry->fp_os, en_class, en_version, en_subtype); + if ((os_class == PF_OSFP_ANY || en_class == os_class) && + (os_version == PF_OSFP_ANY || en_version == os_version) && + (os_subtype == PF_OSFP_ANY || en_subtype == os_subtype)) { + DPFPRINTF("osfp matched %s %s %s %x==%x\n", + entry->fp_class_nm, entry->fp_version_nm, + entry->fp_subtype_nm, os, entry->fp_os); + return (1); + } + } + DPFPRINTF("fingerprint 0x%x didn't match\n", os); + return (0); +} + +/* Initialize the OS fingerprint system */ +#ifdef __FreeBSD__ +int +#else +void +#endif +pf_osfp_initialize(void) +{ +#if defined(__FreeBSD__) && defined(_KERNEL) + int error = ENOMEM; + + do { + pf_osfp_entry_pl = pf_osfp_pl = NULL; + UMA_CREATE(pf_osfp_entry_pl, struct pf_osfp_entry, "pfospfen"); + UMA_CREATE(pf_osfp_pl, struct pf_os_fingerprint, "pfosfp"); + error = 0; + } while(0); +#else + pool_init(&pf_osfp_entry_pl, sizeof(struct pf_osfp_entry), 0, 0, 0, + "pfosfpen", &pool_allocator_nointr); + pool_init(&pf_osfp_pl, sizeof(struct pf_os_fingerprint), 0, 0, 0, + "pfosfp", &pool_allocator_nointr); +#endif + SLIST_INIT(&pf_osfp_list); +#ifdef __FreeBSD__ +#ifdef _KERNEL + return (error); +#else + return (0); +#endif +#endif +} + +#if defined(__FreeBSD__) && (_KERNEL) +void +pf_osfp_cleanup(void) +{ + UMA_DESTROY(pf_osfp_entry_pl); + UMA_DESTROY(pf_osfp_pl); +} +#endif + +/* Flush the fingerprint list */ +void +pf_osfp_flush(void) +{ + struct pf_os_fingerprint *fp; + struct pf_osfp_entry *entry; + + while ((fp = SLIST_FIRST(&pf_osfp_list))) { + SLIST_REMOVE_HEAD(&pf_osfp_list, fp_next); + while ((entry = SLIST_FIRST(&fp->fp_oses))) { + SLIST_REMOVE_HEAD(&fp->fp_oses, fp_entry); + pool_put(&pf_osfp_entry_pl, entry); + } + pool_put(&pf_osfp_pl, fp); + } +} + + +/* Add a fingerprint */ +int +pf_osfp_add(struct pf_osfp_ioctl *fpioc) +{ + struct pf_os_fingerprint *fp, fpadd; + struct pf_osfp_entry *entry; + + memset(&fpadd, 0, sizeof(fpadd)); + fpadd.fp_tcpopts = fpioc->fp_tcpopts; + fpadd.fp_wsize = fpioc->fp_wsize; + fpadd.fp_psize = fpioc->fp_psize; + fpadd.fp_mss = fpioc->fp_mss; + fpadd.fp_flags = fpioc->fp_flags; + fpadd.fp_optcnt = fpioc->fp_optcnt; + fpadd.fp_wscale = fpioc->fp_wscale; + fpadd.fp_ttl = fpioc->fp_ttl; + + DPFPRINTF("adding osfp %s %s %s = %s%d:%d:%d:%s%d:0x%llx %d " + "(TS=%s,M=%s%d,W=%s%d) %x\n", + fpioc->fp_os.fp_class_nm, fpioc->fp_os.fp_version_nm, + fpioc->fp_os.fp_subtype_nm, + (fpadd.fp_flags & PF_OSFP_WSIZE_MOD) ? "%" : + (fpadd.fp_flags & PF_OSFP_WSIZE_MSS) ? "S" : + (fpadd.fp_flags & PF_OSFP_WSIZE_MTU) ? "T" : + (fpadd.fp_flags & PF_OSFP_WSIZE_DC) ? "*" : "", + fpadd.fp_wsize, + fpadd.fp_ttl, + (fpadd.fp_flags & PF_OSFP_DF) ? 1 : 0, + (fpadd.fp_flags & PF_OSFP_PSIZE_MOD) ? "%" : + (fpadd.fp_flags & PF_OSFP_PSIZE_DC) ? "*" : "", + fpadd.fp_psize, + (long long int)fpadd.fp_tcpopts, fpadd.fp_optcnt, + (fpadd.fp_flags & PF_OSFP_TS0) ? 
"0" : "", + (fpadd.fp_flags & PF_OSFP_MSS_MOD) ? "%" : + (fpadd.fp_flags & PF_OSFP_MSS_DC) ? "*" : "", + fpadd.fp_mss, + (fpadd.fp_flags & PF_OSFP_WSCALE_MOD) ? "%" : + (fpadd.fp_flags & PF_OSFP_WSCALE_DC) ? "*" : "", + fpadd.fp_wscale, + fpioc->fp_os.fp_os); + + + if ((fp = pf_osfp_find_exact(&pf_osfp_list, &fpadd))) { + SLIST_FOREACH(entry, &fp->fp_oses, fp_entry) { + if (PF_OSFP_ENTRY_EQ(entry, &fpioc->fp_os)) + return (EEXIST); + } + if ((entry = pool_get(&pf_osfp_entry_pl, PR_NOWAIT)) == NULL) + return (ENOMEM); + } else { + if ((fp = pool_get(&pf_osfp_pl, PR_NOWAIT)) == NULL) + return (ENOMEM); + memset(fp, 0, sizeof(*fp)); + fp->fp_tcpopts = fpioc->fp_tcpopts; + fp->fp_wsize = fpioc->fp_wsize; + fp->fp_psize = fpioc->fp_psize; + fp->fp_mss = fpioc->fp_mss; + fp->fp_flags = fpioc->fp_flags; + fp->fp_optcnt = fpioc->fp_optcnt; + fp->fp_wscale = fpioc->fp_wscale; + fp->fp_ttl = fpioc->fp_ttl; + SLIST_INIT(&fp->fp_oses); + if ((entry = pool_get(&pf_osfp_entry_pl, PR_NOWAIT)) == NULL) { + pool_put(&pf_osfp_pl, fp); + return (ENOMEM); + } + pf_osfp_insert(&pf_osfp_list, fp); + } + memcpy(entry, &fpioc->fp_os, sizeof(*entry)); + + /* Make sure the strings are NUL terminated */ + entry->fp_class_nm[sizeof(entry->fp_class_nm)-1] = '\0'; + entry->fp_version_nm[sizeof(entry->fp_version_nm)-1] = '\0'; + entry->fp_subtype_nm[sizeof(entry->fp_subtype_nm)-1] = '\0'; + + SLIST_INSERT_HEAD(&fp->fp_oses, entry, fp_entry); + +#ifdef PFDEBUG + if ((fp = pf_osfp_validate())) + printf("Invalid fingerprint list\n"); +#endif /* PFDEBUG */ + return (0); +} + + +/* Find a fingerprint in the list */ +struct pf_os_fingerprint * +pf_osfp_find(struct pf_osfp_list *list, struct pf_os_fingerprint *find, + u_int8_t ttldiff) +{ + struct pf_os_fingerprint *f; + +#define MATCH_INT(_MOD, _DC, _field) \ + if ((f->fp_flags & _DC) == 0) { \ + if ((f->fp_flags & _MOD) == 0) { \ + if (f->_field != find->_field) \ + continue; \ + } else { \ + if (f->_field == 0 || find->_field % f->_field) \ + continue; \ + } \ + } + + SLIST_FOREACH(f, list, fp_next) { + if (f->fp_tcpopts != find->fp_tcpopts || + f->fp_optcnt != find->fp_optcnt || + f->fp_ttl < find->fp_ttl || + f->fp_ttl - find->fp_ttl > ttldiff || + (f->fp_flags & (PF_OSFP_DF|PF_OSFP_TS0)) != + (find->fp_flags & (PF_OSFP_DF|PF_OSFP_TS0))) + continue; + + MATCH_INT(PF_OSFP_PSIZE_MOD, PF_OSFP_PSIZE_DC, fp_psize) + MATCH_INT(PF_OSFP_MSS_MOD, PF_OSFP_MSS_DC, fp_mss) + MATCH_INT(PF_OSFP_WSCALE_MOD, PF_OSFP_WSCALE_DC, fp_wscale) + if ((f->fp_flags & PF_OSFP_WSIZE_DC) == 0) { + if (f->fp_flags & PF_OSFP_WSIZE_MSS) { + if (find->fp_mss == 0) + continue; + +/* Some "smart" NAT devices and DSL routers will tweak the MSS size and + * will set it to whatever is suitable for the link type. 
+ */ +#define SMART_MSS 1460 + if ((find->fp_wsize % find->fp_mss || + find->fp_wsize / find->fp_mss != + f->fp_wsize) && + (find->fp_wsize % SMART_MSS || + find->fp_wsize / SMART_MSS != + f->fp_wsize)) + continue; + } else if (f->fp_flags & PF_OSFP_WSIZE_MTU) { + if (find->fp_mss == 0) + continue; + +#define MTUOFF (sizeof(struct ip) + sizeof(struct tcphdr)) +#define SMART_MTU (SMART_MSS + MTUOFF) + if ((find->fp_wsize % (find->fp_mss + MTUOFF) || + find->fp_wsize / (find->fp_mss + MTUOFF) != + f->fp_wsize) && + (find->fp_wsize % SMART_MTU || + find->fp_wsize / SMART_MTU != + f->fp_wsize)) + continue; + } else if (f->fp_flags & PF_OSFP_WSIZE_MOD) { + if (f->fp_wsize == 0 || find->fp_wsize % + f->fp_wsize) + continue; + } else { + if (f->fp_wsize != find->fp_wsize) + continue; + } + } + return (f); + } + + return (NULL); +} + +/* Find an exact fingerprint in the list */ +struct pf_os_fingerprint * +pf_osfp_find_exact(struct pf_osfp_list *list, struct pf_os_fingerprint *find) +{ + struct pf_os_fingerprint *f; + + SLIST_FOREACH(f, list, fp_next) { + if (f->fp_tcpopts == find->fp_tcpopts && + f->fp_wsize == find->fp_wsize && + f->fp_psize == find->fp_psize && + f->fp_mss == find->fp_mss && + f->fp_flags == find->fp_flags && + f->fp_optcnt == find->fp_optcnt && + f->fp_wscale == find->fp_wscale && + f->fp_ttl == find->fp_ttl) + return (f); + } + + return (NULL); +} + +/* Insert a fingerprint into the list */ +void +pf_osfp_insert(struct pf_osfp_list *list, struct pf_os_fingerprint *ins) +{ + struct pf_os_fingerprint *f, *prev = NULL; + + /* XXX need to go semi tree based. can key on tcp options */ + + SLIST_FOREACH(f, list, fp_next) + prev = f; + if (prev) + SLIST_INSERT_AFTER(prev, ins, fp_next); + else + SLIST_INSERT_HEAD(list, ins, fp_next); +} + +/* Fill a fingerprint by its number (from an ioctl) */ +int +pf_osfp_get(struct pf_osfp_ioctl *fpioc) +{ + struct pf_os_fingerprint *fp; + struct pf_osfp_entry *entry; + int num = fpioc->fp_getnum; + int i = 0; + + + memset(fpioc, 0, sizeof(*fpioc)); + SLIST_FOREACH(fp, &pf_osfp_list, fp_next) { + SLIST_FOREACH(entry, &fp->fp_oses, fp_entry) { + if (i++ == num) { + fpioc->fp_mss = fp->fp_mss; + fpioc->fp_wsize = fp->fp_wsize; + fpioc->fp_flags = fp->fp_flags; + fpioc->fp_psize = fp->fp_psize; + fpioc->fp_ttl = fp->fp_ttl; + fpioc->fp_wscale = fp->fp_wscale; + fpioc->fp_getnum = num; + memcpy(&fpioc->fp_os, entry, + sizeof(fpioc->fp_os)); + return (0); + } + } + } + + return (EBUSY); +} + + +/* Validate that each signature is reachable */ +struct pf_os_fingerprint * +pf_osfp_validate(void) +{ + struct pf_os_fingerprint *f, *f2, find; + + SLIST_FOREACH(f, &pf_osfp_list, fp_next) { + memcpy(&find, f, sizeof(find)); + + /* We do a few MSS/th_win percolations to make things unique */ + if (find.fp_mss == 0) + find.fp_mss = 128; + if (f->fp_flags & PF_OSFP_WSIZE_MSS) + find.fp_wsize *= find.fp_mss, 1; + else if (f->fp_flags & PF_OSFP_WSIZE_MTU) + find.fp_wsize *= (find.fp_mss + 40); + else if (f->fp_flags & PF_OSFP_WSIZE_MOD) + find.fp_wsize *= 2; + if (f != (f2 = pf_osfp_find(&pf_osfp_list, &find, 0))) { + if (f2) + printf("Found \"%s %s %s\" instead of " + "\"%s %s %s\"\n", + SLIST_FIRST(&f2->fp_oses)->fp_class_nm, + SLIST_FIRST(&f2->fp_oses)->fp_version_nm, + SLIST_FIRST(&f2->fp_oses)->fp_subtype_nm, + SLIST_FIRST(&f->fp_oses)->fp_class_nm, + SLIST_FIRST(&f->fp_oses)->fp_version_nm, + SLIST_FIRST(&f->fp_oses)->fp_subtype_nm); + else + printf("Couldn't find \"%s %s %s\"\n", + SLIST_FIRST(&f->fp_oses)->fp_class_nm, + 
SLIST_FIRST(&f->fp_oses)->fp_version_nm, + SLIST_FIRST(&f->fp_oses)->fp_subtype_nm); + return (f); + } + } + return (NULL); +} diff --git a/contrib/pf/rtems/freebsd/net/pf_ruleset.c b/contrib/pf/rtems/freebsd/net/pf_ruleset.c new file mode 100644 index 00000000..147bc8cc --- /dev/null +++ b/contrib/pf/rtems/freebsd/net/pf_ruleset.c @@ -0,0 +1,433 @@ +#include + +/* $OpenBSD: pf_ruleset.c,v 1.1 2006/10/27 13:56:51 mcbride Exp $ */ + +/* + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002,2003 Henning Brauer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. + * + */ + +#ifdef __FreeBSD__ +#include +__FBSDID("$FreeBSD$"); +#endif + +#include +#include +#ifdef _KERNEL +# include +#endif /* _KERNEL */ +#include + +#include +#include +#include +#include + +#include +#include + +#ifdef INET6 +#include +#endif /* INET6 */ + + +#ifdef _KERNEL +# define DPFPRINTF(format, x...) \ + if (pf_status.debug >= PF_DEBUG_NOISY) \ + printf(format , ##x) +#ifdef __FreeBSD__ +#define rs_malloc(x) malloc(x, M_TEMP, M_NOWAIT) +#else +#define rs_malloc(x) malloc(x, M_TEMP, M_WAITOK) +#endif +#define rs_free(x) free(x, M_TEMP) + +#else +/* Userland equivalents so we can lend code to pfctl et al. */ + +# include +# include +# include +# include +# include +# define rs_malloc(x) malloc(x) +# define rs_free(x) free(x) + +# ifdef PFDEBUG +# include +# define DPFPRINTF(format, x...) fprintf(stderr, format , ##x) +# else +# define DPFPRINTF(format, x...) ((void)0) +# endif /* PFDEBUG */ +#endif /* _KERNEL */ + + +struct pf_anchor_global pf_anchors; +struct pf_anchor pf_main_anchor; + +#ifndef __FreeBSD__ +/* XXX: hum? 
*/ +int pf_get_ruleset_number(u_int8_t); +void pf_init_ruleset(struct pf_ruleset *); +int pf_anchor_setup(struct pf_rule *, + const struct pf_ruleset *, const char *); +int pf_anchor_copyout(const struct pf_ruleset *, + const struct pf_rule *, struct pfioc_rule *); +void pf_anchor_remove(struct pf_rule *); +#endif + +static __inline int pf_anchor_compare(struct pf_anchor *, struct pf_anchor *); + +RB_GENERATE(pf_anchor_global, pf_anchor, entry_global, pf_anchor_compare); +RB_GENERATE(pf_anchor_node, pf_anchor, entry_node, pf_anchor_compare); + +static __inline int +pf_anchor_compare(struct pf_anchor *a, struct pf_anchor *b) +{ + int c = strcmp(a->path, b->path); + + return (c ? (c < 0 ? -1 : 1) : 0); +} + +int +pf_get_ruleset_number(u_int8_t action) +{ + switch (action) { + case PF_SCRUB: + case PF_NOSCRUB: + return (PF_RULESET_SCRUB); + break; + case PF_PASS: + case PF_DROP: + return (PF_RULESET_FILTER); + break; + case PF_NAT: + case PF_NONAT: + return (PF_RULESET_NAT); + break; + case PF_BINAT: + case PF_NOBINAT: + return (PF_RULESET_BINAT); + break; + case PF_RDR: + case PF_NORDR: + return (PF_RULESET_RDR); + break; + default: + return (PF_RULESET_MAX); + break; + } +} + +void +pf_init_ruleset(struct pf_ruleset *ruleset) +{ + int i; + + memset(ruleset, 0, sizeof(struct pf_ruleset)); + for (i = 0; i < PF_RULESET_MAX; i++) { + TAILQ_INIT(&ruleset->rules[i].queues[0]); + TAILQ_INIT(&ruleset->rules[i].queues[1]); + ruleset->rules[i].active.ptr = &ruleset->rules[i].queues[0]; + ruleset->rules[i].inactive.ptr = &ruleset->rules[i].queues[1]; + } +} + +struct pf_anchor * +pf_find_anchor(const char *path) +{ + struct pf_anchor *key, *found; + + key = (struct pf_anchor *)rs_malloc(sizeof(*key)); + memset(key, 0, sizeof(*key)); + strlcpy(key->path, path, sizeof(key->path)); + found = RB_FIND(pf_anchor_global, &pf_anchors, key); + rs_free(key); + return (found); +} + +struct pf_ruleset * +pf_find_ruleset(const char *path) +{ + struct pf_anchor *anchor; + + while (*path == '/') + path++; + if (!*path) + return (&pf_main_ruleset); + anchor = pf_find_anchor(path); + if (anchor == NULL) + return (NULL); + else + return (&anchor->ruleset); +} + +struct pf_ruleset * +pf_find_or_create_ruleset(const char *path) +{ + char *p, *q, *r; + struct pf_ruleset *ruleset; +#ifdef __FreeBSD__ + struct pf_anchor *anchor = NULL, *dup, *parent = NULL; +#else + struct pf_anchor *anchor, *dup, *parent = NULL; +#endif + + if (path[0] == 0) + return (&pf_main_ruleset); + while (*path == '/') + path++; + ruleset = pf_find_ruleset(path); + if (ruleset != NULL) + return (ruleset); + p = (char *)rs_malloc(MAXPATHLEN); + bzero(p, MAXPATHLEN); + strlcpy(p, path, MAXPATHLEN); + while (parent == NULL && (q = strrchr(p, '/')) != NULL) { + *q = 0; + if ((ruleset = pf_find_ruleset(p)) != NULL) { + parent = ruleset->anchor; + break; + } + } + if (q == NULL) + q = p; + else + q++; + strlcpy(p, path, MAXPATHLEN); + if (!*q) { + rs_free(p); + return (NULL); + } + while ((r = strchr(q, '/')) != NULL || *q) { + if (r != NULL) + *r = 0; + if (!*q || strlen(q) >= PF_ANCHOR_NAME_SIZE || + (parent != NULL && strlen(parent->path) >= + MAXPATHLEN - PF_ANCHOR_NAME_SIZE - 1)) { + rs_free(p); + return (NULL); + } + anchor = (struct pf_anchor *)rs_malloc(sizeof(*anchor)); + if (anchor == NULL) { + rs_free(p); + return (NULL); + } + memset(anchor, 0, sizeof(*anchor)); + RB_INIT(&anchor->children); + strlcpy(anchor->name, q, sizeof(anchor->name)); + if (parent != NULL) { + strlcpy(anchor->path, parent->path, + sizeof(anchor->path)); + 
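The surrounding loop creates one anchor per '/'-separated component of the requested path. Its tokenization, reduced to a standalone sketch; walk_anchor_path is an illustrative name, and a printf stands in for the anchor allocation and red-black-tree insertion done here:

    #include <stdio.h>
    #include <string.h>

    /* Visit each '/'-separated component of an anchor path in order,
     * the way the enclosing loop does with strchr().  Modifies buf. */
    static void
    walk_anchor_path(char *buf)
    {
        char *q = buf, *r;

        while ((r = strchr(q, '/')) != NULL || *q) {
            if (r != NULL)
                *r = '\0';
            printf("anchor component: %s\n", q);  /* create/lookup here */
            if (r == NULL)
                break;
            q = r + 1;
        }
    }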
strlcat(anchor->path, "/", sizeof(anchor->path)); + } + strlcat(anchor->path, anchor->name, sizeof(anchor->path)); + if ((dup = RB_INSERT(pf_anchor_global, &pf_anchors, anchor)) != + NULL) { + printf("pf_find_or_create_ruleset: RB_INSERT1 " + "'%s' '%s' collides with '%s' '%s'\n", + anchor->path, anchor->name, dup->path, dup->name); + rs_free(anchor); + rs_free(p); + return (NULL); + } + if (parent != NULL) { + anchor->parent = parent; + if ((dup = RB_INSERT(pf_anchor_node, &parent->children, + anchor)) != NULL) { + printf("pf_find_or_create_ruleset: " + "RB_INSERT2 '%s' '%s' collides with " + "'%s' '%s'\n", anchor->path, anchor->name, + dup->path, dup->name); + RB_REMOVE(pf_anchor_global, &pf_anchors, + anchor); + rs_free(anchor); + rs_free(p); + return (NULL); + } + } + pf_init_ruleset(&anchor->ruleset); + anchor->ruleset.anchor = anchor; + parent = anchor; + if (r != NULL) + q = r + 1; + else + *q = 0; + } + rs_free(p); + return (&anchor->ruleset); +} + +void +pf_remove_if_empty_ruleset(struct pf_ruleset *ruleset) +{ + struct pf_anchor *parent; + int i; + + while (ruleset != NULL) { + if (ruleset == &pf_main_ruleset || ruleset->anchor == NULL || + !RB_EMPTY(&ruleset->anchor->children) || + ruleset->anchor->refcnt > 0 || ruleset->tables > 0 || + ruleset->topen) + return; + for (i = 0; i < PF_RULESET_MAX; ++i) + if (!TAILQ_EMPTY(ruleset->rules[i].active.ptr) || + !TAILQ_EMPTY(ruleset->rules[i].inactive.ptr) || + ruleset->rules[i].inactive.open) + return; + RB_REMOVE(pf_anchor_global, &pf_anchors, ruleset->anchor); + if ((parent = ruleset->anchor->parent) != NULL) + RB_REMOVE(pf_anchor_node, &parent->children, + ruleset->anchor); + rs_free(ruleset->anchor); + if (parent == NULL) + return; + ruleset = &parent->ruleset; + } +} + +int +pf_anchor_setup(struct pf_rule *r, const struct pf_ruleset *s, + const char *name) +{ + char *p, *path; + struct pf_ruleset *ruleset; + + r->anchor = NULL; + r->anchor_relative = 0; + r->anchor_wildcard = 0; + if (!name[0]) + return (0); + path = (char *)rs_malloc(MAXPATHLEN); + bzero(path, MAXPATHLEN); + if (name[0] == '/') + strlcpy(path, name + 1, MAXPATHLEN); + else { + /* relative path */ + r->anchor_relative = 1; + if (s->anchor == NULL || !s->anchor->path[0]) + path[0] = 0; + else + strlcpy(path, s->anchor->path, MAXPATHLEN); + while (name[0] == '.' && name[1] == '.' && name[2] == '/') { + if (!path[0]) { + printf("pf_anchor_setup: .. 
beyond root\n"); + rs_free(path); + return (1); + } + if ((p = strrchr(path, '/')) != NULL) + *p = 0; + else + path[0] = 0; + r->anchor_relative++; + name += 3; + } + if (path[0]) + strlcat(path, "/", MAXPATHLEN); + strlcat(path, name, MAXPATHLEN); + } + if ((p = strrchr(path, '/')) != NULL && !strcmp(p, "/*")) { + r->anchor_wildcard = 1; + *p = 0; + } + ruleset = pf_find_or_create_ruleset(path); + rs_free(path); + if (ruleset == NULL || ruleset->anchor == NULL) { + printf("pf_anchor_setup: ruleset\n"); + return (1); + } + r->anchor = ruleset->anchor; + r->anchor->refcnt++; + return (0); +} + +int +pf_anchor_copyout(const struct pf_ruleset *rs, const struct pf_rule *r, + struct pfioc_rule *pr) +{ + pr->anchor_call[0] = 0; + if (r->anchor == NULL) + return (0); + if (!r->anchor_relative) { + strlcpy(pr->anchor_call, "/", sizeof(pr->anchor_call)); + strlcat(pr->anchor_call, r->anchor->path, + sizeof(pr->anchor_call)); + } else { + char *a, *p; + int i; + + a = (char *)rs_malloc(MAXPATHLEN); + bzero(a, MAXPATHLEN); + if (rs->anchor == NULL) + a[0] = 0; + else + strlcpy(a, rs->anchor->path, MAXPATHLEN); + for (i = 1; i < r->anchor_relative; ++i) { + if ((p = strrchr(a, '/')) == NULL) + p = a; + *p = 0; + strlcat(pr->anchor_call, "../", + sizeof(pr->anchor_call)); + } + if (strncmp(a, r->anchor->path, strlen(a))) { + printf("pf_anchor_copyout: '%s' '%s'\n", a, + r->anchor->path); + rs_free(a); + return (1); + } + if (strlen(r->anchor->path) > strlen(a)) + strlcat(pr->anchor_call, r->anchor->path + (a[0] ? + strlen(a) + 1 : 0), sizeof(pr->anchor_call)); + rs_free(a); + } + if (r->anchor_wildcard) + strlcat(pr->anchor_call, pr->anchor_call[0] ? "/*" : "*", + sizeof(pr->anchor_call)); + return (0); +} + +void +pf_anchor_remove(struct pf_rule *r) +{ + if (r->anchor == NULL) + return; + if (r->anchor->refcnt <= 0) { + printf("pf_anchor_remove: broken refcount\n"); + r->anchor = NULL; + return; + } + if (!--r->anchor->refcnt) + pf_remove_if_empty_ruleset(&r->anchor->ruleset); + r->anchor = NULL; +} diff --git a/contrib/pf/rtems/freebsd/net/pf_subr.c b/contrib/pf/rtems/freebsd/net/pf_subr.c new file mode 100644 index 00000000..5da77484 --- /dev/null +++ b/contrib/pf/rtems/freebsd/net/pf_subr.c @@ -0,0 +1,170 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +#include + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Following is where TCP initial sequence number generation occurs. + * + * There are two places where we must use initial sequence numbers: + * 1. In SYN-ACK packets. + * 2. In SYN packets. + * + * All ISNs for SYN-ACK packets are generated by the syncache. See + * tcp_syncache.c for details. + * + * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling + * depends on this property. In addition, these ISNs should be + * unguessable so as to prevent connection hijacking. To satisfy + * the requirements of this situation, the algorithm outlined in + * RFC 1948 is used, with only small modifications. + * + * Implementation details: + * + * Time is based off the system timer, and is corrected so that it + * increases by one megabyte per second. This allows for proper + * recycling on high speed LANs while still leaving over an hour + * before rollover. + * + * As reading the *exact* system time is too expensive to be done + * whenever setting up a TCP connection, we increment the time + * offset in two ways. First, a small random positive increment + * is added to isn_offset for each connection that is set up. + * Second, the function tcp_isn_tick fires once per clock tick + * and increments isn_offset as necessary so that sequence numbers + * are incremented at approximately ISN_BYTES_PER_SECOND. The + * random positive increments serve only to ensure that the same + * exact sequence number is never sent out twice (as could otherwise + * happen when a port is recycled in less than the system tick + * interval.) + * + * net.inet.tcp.isn_reseed_interval controls the number of seconds + * between seeding of isn_secret. This is normally set to zero, + * as reseeding should not be necessary. + * + * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, + * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In + * general, this means holding an exclusive (write) lock. + */ + +#define ISN_BYTES_PER_SECOND 1048576 +#define ISN_STATIC_INCREMENT 4096 +#define ISN_RANDOM_INCREMENT (4096 - 1) + +static u_char pf_isn_secret[32]; +static int pf_isn_last_reseed; +static u_int32_t pf_isn_offset; + +u_int32_t +pf_new_isn(struct pf_state *s) +{ + MD5_CTX isn_ctx; + u_int32_t md5_buffer[4]; + u_int32_t new_isn; + struct pf_state_host *src, *dst; + + /* Seed if this is the first use, reseed if requested. 
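Reduced to its core, the scheme described above is ISN = offset(t) + H(connection identifiers, secret). A condensed sketch of the hash half, assuming the libmd-style MD5 API that pf_new_isn() itself uses below; isn_hash and its parameters are illustrative:

    #include <stddef.h>
    #include <stdint.h>
    #include <md5.h>    /* MD5Init/MD5Update/MD5Final, as used below */

    /* Hash the connection's ports, addresses, and a secret into 32
     * bits; the caller adds the monotonic offset (RFC 1948). */
    static uint32_t
    isn_hash(const void *ports, size_t plen,
        const void *addrs, size_t alen, const uint8_t secret[32])
    {
        MD5_CTX ctx;
        uint32_t digest[4];

        MD5Init(&ctx);
        MD5Update(&ctx, ports, plen);
        MD5Update(&ctx, addrs, alen);
        MD5Update(&ctx, secret, 32);
        MD5Final((uint8_t *)digest, &ctx);
        return (digest[0]);
    }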
*/ + if (pf_isn_last_reseed == 0) { + read_random(&pf_isn_secret, sizeof(pf_isn_secret)); + pf_isn_last_reseed = ticks; + } + + if (s->direction == PF_IN) { + src = &s->ext; + dst = &s->gwy; + } else { + src = &s->lan; + dst = &s->ext; + } + + /* Compute the md5 hash and return the ISN. */ + MD5Init(&isn_ctx); + MD5Update(&isn_ctx, (u_char *) &dst->port, sizeof(u_short)); + MD5Update(&isn_ctx, (u_char *) &src->port, sizeof(u_short)); +#ifdef INET6 + if (s->af == AF_INET6) { + MD5Update(&isn_ctx, (u_char *) &dst->addr, + sizeof(struct in6_addr)); + MD5Update(&isn_ctx, (u_char *) &src->addr, + sizeof(struct in6_addr)); + } else +#endif + { + MD5Update(&isn_ctx, (u_char *) &dst->addr, + sizeof(struct in_addr)); + MD5Update(&isn_ctx, (u_char *) &src->addr, + sizeof(struct in_addr)); + } + MD5Update(&isn_ctx, (u_char *) &pf_isn_secret, sizeof(pf_isn_secret)); + MD5Final((u_char *) &md5_buffer, &isn_ctx); + new_isn = (tcp_seq) md5_buffer[0]; + pf_isn_offset += ISN_STATIC_INCREMENT + + (arc4random() & ISN_RANDOM_INCREMENT); + new_isn += pf_isn_offset; + return (new_isn); +} diff --git a/contrib/pf/rtems/freebsd/net/pf_table.c b/contrib/pf/rtems/freebsd/net/pf_table.c new file mode 100644 index 00000000..391077df --- /dev/null +++ b/contrib/pf/rtems/freebsd/net/pf_table.c @@ -0,0 +1,2363 @@ +#include + +/* $OpenBSD: pf_table.c,v 1.68 2006/05/02 10:08:45 dhartmei Exp $ */ + +/* + * Copyright (c) 2002 Cedric Berger + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifdef __FreeBSD__ +#include +#include + +#include +__FBSDID("$FreeBSD$"); +#endif + +#include +#include +#include +#include +#include +#include +#include +#ifdef __FreeBSD__ +#include +#endif + +#include +#include +#include +#ifndef __FreeBSD__ +#include +#endif + +#include + +#define ACCEPT_FLAGS(oklist) \ + do { \ + if ((flags & ~(oklist)) & \ + PFR_FLAG_ALLMASK) \ + return (EINVAL); \ + } while (0) + +#ifdef __FreeBSD__ +static inline int +_copyin(const void *uaddr, void *kaddr, size_t len) +{ + int r; + + PF_UNLOCK(); + r = copyin(uaddr, kaddr, len); + PF_LOCK(); + + return (r); +} + +static inline int +_copyout(const void *uaddr, void *kaddr, size_t len) +{ + int r; + + PF_UNLOCK(); + r = copyout(uaddr, kaddr, len); + PF_LOCK(); + + return (r); +} + +#define COPYIN(from, to, size) \ + ((flags & PFR_FLAG_USERIOCTL) ? \ + _copyin((from), (to), (size)) : \ + (bcopy((from), (to), (size)), 0)) + +#define COPYOUT(from, to, size) \ + ((flags & PFR_FLAG_USERIOCTL) ? \ + _copyout((from), (to), (size)) : \ + (bcopy((from), (to), (size)), 0)) + +#else + +#define COPYIN(from, to, size) \ + ((flags & PFR_FLAG_USERIOCTL) ? \ + copyin((from), (to), (size)) : \ + (bcopy((from), (to), (size)), 0)) + +#define COPYOUT(from, to, size) \ + ((flags & PFR_FLAG_USERIOCTL) ? \ + copyout((from), (to), (size)) : \ + (bcopy((from), (to), (size)), 0)) + +#endif + +#define FILLIN_SIN(sin, addr) \ + do { \ + (sin).sin_len = sizeof(sin); \ + (sin).sin_family = AF_INET; \ + (sin).sin_addr = (addr); \ + } while (0) + +#define FILLIN_SIN6(sin6, addr) \ + do { \ + (sin6).sin6_len = sizeof(sin6); \ + (sin6).sin6_family = AF_INET6; \ + (sin6).sin6_addr = (addr); \ + } while (0) + +#define SWAP(type, a1, a2) \ + do { \ + type tmp = a1; \ + a1 = a2; \ + a2 = tmp; \ + } while (0) + +#define SUNION2PF(su, af) (((af)==AF_INET) ? 
\ + (struct pf_addr *)&(su)->sin.sin_addr : \ + (struct pf_addr *)&(su)->sin6.sin6_addr) + +#define AF_BITS(af) (((af)==AF_INET)?32:128) +#define ADDR_NETWORK(ad) ((ad)->pfra_net < AF_BITS((ad)->pfra_af)) +#define KENTRY_NETWORK(ke) ((ke)->pfrke_net < AF_BITS((ke)->pfrke_af)) +#define KENTRY_RNF_ROOT(ke) \ + ((((struct radix_node *)(ke))->rn_flags & RNF_ROOT) != 0) + +#define NO_ADDRESSES (-1) +#define ENQUEUE_UNMARKED_ONLY (1) +#define INVERT_NEG_FLAG (1) + +struct pfr_walktree { + enum pfrw_op { + PFRW_MARK, + PFRW_SWEEP, + PFRW_ENQUEUE, + PFRW_GET_ADDRS, + PFRW_GET_ASTATS, + PFRW_POOL_GET, + PFRW_DYNADDR_UPDATE + } pfrw_op; + union { + struct pfr_addr *pfrw1_addr; + struct pfr_astats *pfrw1_astats; + struct pfr_kentryworkq *pfrw1_workq; + struct pfr_kentry *pfrw1_kentry; + struct pfi_dynaddr *pfrw1_dyn; + } pfrw_1; + int pfrw_free; + int pfrw_flags; +}; +#define pfrw_addr pfrw_1.pfrw1_addr +#define pfrw_astats pfrw_1.pfrw1_astats +#define pfrw_workq pfrw_1.pfrw1_workq +#define pfrw_kentry pfrw_1.pfrw1_kentry +#define pfrw_dyn pfrw_1.pfrw1_dyn +#define pfrw_cnt pfrw_free + +#define senderr(e) do { rv = (e); goto _bad; } while (0) + +#ifdef __FreeBSD__ +uma_zone_t pfr_ktable_pl; +uma_zone_t pfr_kentry_pl; +uma_zone_t pfr_kentry_pl2; +#else +struct pool pfr_ktable_pl; +struct pool pfr_kentry_pl; +struct pool pfr_kentry_pl2; +#endif +struct sockaddr_in pfr_sin; +struct sockaddr_in6 pfr_sin6; +union sockaddr_union pfr_mask; +struct pf_addr pfr_ffaddr; + +void pfr_copyout_addr(struct pfr_addr *, + struct pfr_kentry *ke); +int pfr_validate_addr(struct pfr_addr *); +void pfr_enqueue_addrs(struct pfr_ktable *, + struct pfr_kentryworkq *, int *, int); +void pfr_mark_addrs(struct pfr_ktable *); +struct pfr_kentry *pfr_lookup_addr(struct pfr_ktable *, + struct pfr_addr *, int); +struct pfr_kentry *pfr_create_kentry(struct pfr_addr *, int); +void pfr_destroy_kentries(struct pfr_kentryworkq *); +void pfr_destroy_kentry(struct pfr_kentry *); +void pfr_insert_kentries(struct pfr_ktable *, + struct pfr_kentryworkq *, long); +void pfr_remove_kentries(struct pfr_ktable *, + struct pfr_kentryworkq *); +void pfr_clstats_kentries(struct pfr_kentryworkq *, long, + int); +void pfr_reset_feedback(struct pfr_addr *, int, int); +void pfr_prepare_network(union sockaddr_union *, int, int); +int pfr_route_kentry(struct pfr_ktable *, + struct pfr_kentry *); +int pfr_unroute_kentry(struct pfr_ktable *, + struct pfr_kentry *); +int pfr_walktree(struct radix_node *, void *); +int pfr_validate_table(struct pfr_table *, int, int); +int pfr_fix_anchor(char *); +void pfr_commit_ktable(struct pfr_ktable *, long); +void pfr_insert_ktables(struct pfr_ktableworkq *); +void pfr_insert_ktable(struct pfr_ktable *); +void pfr_setflags_ktables(struct pfr_ktableworkq *); +void pfr_setflags_ktable(struct pfr_ktable *, int); +void pfr_clstats_ktables(struct pfr_ktableworkq *, long, + int); +void pfr_clstats_ktable(struct pfr_ktable *, long, int); +struct pfr_ktable *pfr_create_ktable(struct pfr_table *, long, int); +void pfr_destroy_ktables(struct pfr_ktableworkq *, int); +void pfr_destroy_ktable(struct pfr_ktable *, int); +int pfr_ktable_compare(struct pfr_ktable *, + struct pfr_ktable *); +struct pfr_ktable *pfr_lookup_table(struct pfr_table *); +void pfr_clean_node_mask(struct pfr_ktable *, + struct pfr_kentryworkq *); +int pfr_table_count(struct pfr_table *, int); +int pfr_skip_table(struct pfr_table *, + struct pfr_ktable *, int); +struct pfr_kentry *pfr_kentry_byidx(struct pfr_ktable *, int, int); + +RB_PROTOTYPE(pfr_ktablehead, 
pfr_ktable, pfrkt_tree, pfr_ktable_compare); +RB_GENERATE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare); + +struct pfr_ktablehead pfr_ktables; +struct pfr_table pfr_nulltable; +int pfr_ktable_cnt; + +void +pfr_initialize(void) +{ +#ifndef __FreeBSD__ + pool_init(&pfr_ktable_pl, sizeof(struct pfr_ktable), 0, 0, 0, + "pfrktable", &pool_allocator_oldnointr); + pool_init(&pfr_kentry_pl, sizeof(struct pfr_kentry), 0, 0, 0, + "pfrkentry", &pool_allocator_oldnointr); + pool_init(&pfr_kentry_pl2, sizeof(struct pfr_kentry), 0, 0, 0, + "pfrkentry2", NULL); +#endif + + pfr_sin.sin_len = sizeof(pfr_sin); + pfr_sin.sin_family = AF_INET; + pfr_sin6.sin6_len = sizeof(pfr_sin6); + pfr_sin6.sin6_family = AF_INET6; + + memset(&pfr_ffaddr, 0xff, sizeof(pfr_ffaddr)); +} + +int +pfr_clr_addrs(struct pfr_table *tbl, int *ndel, int flags) +{ + struct pfr_ktable *kt; + struct pfr_kentryworkq workq; + int s; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC+PFR_FLAG_DUMMY); + if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_flags & PFR_TFLAG_CONST) + return (EPERM); + pfr_enqueue_addrs(kt, &workq, ndel, 0); + + if (!(flags & PFR_FLAG_DUMMY)) { + s = 0; + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); + pfr_remove_kentries(kt, &workq); + if (flags & PFR_FLAG_ATOMIC) + splx(s); + if (kt->pfrkt_cnt) { + printf("pfr_clr_addrs: corruption detected (%d).\n", + kt->pfrkt_cnt); + kt->pfrkt_cnt = 0; + } + } + return (0); +} + +int +pfr_add_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *nadd, int flags) +{ + struct pfr_ktable *kt, *tmpkt; + struct pfr_kentryworkq workq; + struct pfr_kentry *p, *q; + struct pfr_addr ad; + int i, rv, s = 0, xadd = 0; + long tzero = time_second; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC+PFR_FLAG_DUMMY+PFR_FLAG_FEEDBACK); + if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_flags & PFR_TFLAG_CONST) + return (EPERM); + tmpkt = pfr_create_ktable(&pfr_nulltable, 0, 0); + if (tmpkt == NULL) + return (ENOMEM); + SLIST_INIT(&workq); + for (i = 0; i < size; i++) { + if (COPYIN(addr+i, &ad, sizeof(ad))) + senderr(EFAULT); + if (pfr_validate_addr(&ad)) + senderr(EINVAL); + p = pfr_lookup_addr(kt, &ad, 1); + q = pfr_lookup_addr(tmpkt, &ad, 1); + if (flags & PFR_FLAG_FEEDBACK) { + if (q != NULL) + ad.pfra_fback = PFR_FB_DUPLICATE; + else if (p == NULL) + ad.pfra_fback = PFR_FB_ADDED; + else if (p->pfrke_not != ad.pfra_not) + ad.pfra_fback = PFR_FB_CONFLICT; + else + ad.pfra_fback = PFR_FB_NONE; + } + if (p == NULL && q == NULL) { + p = pfr_create_kentry(&ad, 0); + if (p == NULL) + senderr(ENOMEM); + if (pfr_route_kentry(tmpkt, p)) { + pfr_destroy_kentry(p); + ad.pfra_fback = PFR_FB_NONE; + } else { + SLIST_INSERT_HEAD(&workq, p, pfrke_workq); + xadd++; + } + } + if (flags & PFR_FLAG_FEEDBACK) { + if (COPYOUT(&ad, addr+i, sizeof(ad))) + senderr(EFAULT); + } + } + pfr_clean_node_mask(tmpkt, &workq); + if (!(flags & PFR_FLAG_DUMMY)) { + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); + pfr_insert_kentries(kt, &workq, tzero); + if (flags & PFR_FLAG_ATOMIC) + splx(s); + } else + pfr_destroy_kentries(&workq); + if (nadd != NULL) + *nadd = xadd; + pfr_destroy_ktable(tmpkt, 0); + return (0); +_bad: + pfr_clean_node_mask(tmpkt, &workq); + pfr_destroy_kentries(&workq); + if (flags & PFR_FLAG_FEEDBACK) + 
pfr_reset_feedback(addr, size, flags); + pfr_destroy_ktable(tmpkt, 0); + return (rv); +} + +int +pfr_del_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *ndel, int flags) +{ + struct pfr_ktable *kt; + struct pfr_kentryworkq workq; + struct pfr_kentry *p; + struct pfr_addr ad; + int i, rv, s = 0, xdel = 0, log = 1; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC+PFR_FLAG_DUMMY+PFR_FLAG_FEEDBACK); + if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_flags & PFR_TFLAG_CONST) + return (EPERM); + /* + * there are two algorithms to choose from here. + * with: + * n: number of addresses to delete + * N: number of addresses in the table + * + * one is O(N) and is better for large 'n' + * one is O(n*LOG(N)) and is better for small 'n' + * + * following code try to decide which one is best. + */ + for (i = kt->pfrkt_cnt; i > 0; i >>= 1) + log++; + if (size > kt->pfrkt_cnt/log) { + /* full table scan */ + pfr_mark_addrs(kt); + } else { + /* iterate over addresses to delete */ + for (i = 0; i < size; i++) { + if (COPYIN(addr+i, &ad, sizeof(ad))) + return (EFAULT); + if (pfr_validate_addr(&ad)) + return (EINVAL); + p = pfr_lookup_addr(kt, &ad, 1); + if (p != NULL) + p->pfrke_mark = 0; + } + } + SLIST_INIT(&workq); + for (i = 0; i < size; i++) { + if (COPYIN(addr+i, &ad, sizeof(ad))) + senderr(EFAULT); + if (pfr_validate_addr(&ad)) + senderr(EINVAL); + p = pfr_lookup_addr(kt, &ad, 1); + if (flags & PFR_FLAG_FEEDBACK) { + if (p == NULL) + ad.pfra_fback = PFR_FB_NONE; + else if (p->pfrke_not != ad.pfra_not) + ad.pfra_fback = PFR_FB_CONFLICT; + else if (p->pfrke_mark) + ad.pfra_fback = PFR_FB_DUPLICATE; + else + ad.pfra_fback = PFR_FB_DELETED; + } + if (p != NULL && p->pfrke_not == ad.pfra_not && + !p->pfrke_mark) { + p->pfrke_mark = 1; + SLIST_INSERT_HEAD(&workq, p, pfrke_workq); + xdel++; + } + if (flags & PFR_FLAG_FEEDBACK) + if (COPYOUT(&ad, addr+i, sizeof(ad))) + senderr(EFAULT); + } + if (!(flags & PFR_FLAG_DUMMY)) { + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); + pfr_remove_kentries(kt, &workq); + if (flags & PFR_FLAG_ATOMIC) + splx(s); + } + if (ndel != NULL) + *ndel = xdel; + return (0); +_bad: + if (flags & PFR_FLAG_FEEDBACK) + pfr_reset_feedback(addr, size, flags); + return (rv); +} + +int +pfr_set_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *size2, int *nadd, int *ndel, int *nchange, int flags, + u_int32_t ignore_pfrt_flags) +{ + struct pfr_ktable *kt, *tmpkt; + struct pfr_kentryworkq addq, delq, changeq; + struct pfr_kentry *p, *q; + struct pfr_addr ad; + int i, rv, s = 0, xadd = 0, xdel = 0, xchange = 0; + long tzero = time_second; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC+PFR_FLAG_DUMMY+PFR_FLAG_FEEDBACK); + if (pfr_validate_table(tbl, ignore_pfrt_flags, flags & + PFR_FLAG_USERIOCTL)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_flags & PFR_TFLAG_CONST) + return (EPERM); + tmpkt = pfr_create_ktable(&pfr_nulltable, 0, 0); + if (tmpkt == NULL) + return (ENOMEM); + pfr_mark_addrs(kt); + SLIST_INIT(&addq); + SLIST_INIT(&delq); + SLIST_INIT(&changeq); + for (i = 0; i < size; i++) { + if (COPYIN(addr+i, &ad, sizeof(ad))) + senderr(EFAULT); + if (pfr_validate_addr(&ad)) + senderr(EINVAL); + ad.pfra_fback = PFR_FB_NONE; + p = pfr_lookup_addr(kt, &ad, 1); + if (p != NULL) { + if (p->pfrke_mark) { + ad.pfra_fback = PFR_FB_DUPLICATE; 
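The two-algorithm comment in pfr_del_addrs() above reduces to a cheap integer log2: a full mark-and-sweep pays off only once the number of requested deletions exceeds N/log2(N). As a standalone sketch, with use_full_scan as an illustrative name:

    /* Decide between the O(N) mark-and-sweep and the O(n log N)
     * per-address lookups, as pfr_del_addrs() does above. */
    static int
    use_full_scan(int n_requested, int n_table)
    {
        int lg = 1;
        int i;

        for (i = n_table; i > 0; i >>= 1)  /* ~log2(N), as above */
            lg++;
        return (n_requested > n_table / lg);
    }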
+ goto _skip; + } + p->pfrke_mark = 1; + if (p->pfrke_not != ad.pfra_not) { + SLIST_INSERT_HEAD(&changeq, p, pfrke_workq); + ad.pfra_fback = PFR_FB_CHANGED; + xchange++; + } + } else { + q = pfr_lookup_addr(tmpkt, &ad, 1); + if (q != NULL) { + ad.pfra_fback = PFR_FB_DUPLICATE; + goto _skip; + } + p = pfr_create_kentry(&ad, 0); + if (p == NULL) + senderr(ENOMEM); + if (pfr_route_kentry(tmpkt, p)) { + pfr_destroy_kentry(p); + ad.pfra_fback = PFR_FB_NONE; + } else { + SLIST_INSERT_HEAD(&addq, p, pfrke_workq); + ad.pfra_fback = PFR_FB_ADDED; + xadd++; + } + } +_skip: + if (flags & PFR_FLAG_FEEDBACK) + if (COPYOUT(&ad, addr+i, sizeof(ad))) + senderr(EFAULT); + } + pfr_enqueue_addrs(kt, &delq, &xdel, ENQUEUE_UNMARKED_ONLY); + if ((flags & PFR_FLAG_FEEDBACK) && *size2) { + if (*size2 < size+xdel) { + *size2 = size+xdel; + senderr(0); + } + i = 0; + SLIST_FOREACH(p, &delq, pfrke_workq) { + pfr_copyout_addr(&ad, p); + ad.pfra_fback = PFR_FB_DELETED; + if (COPYOUT(&ad, addr+size+i, sizeof(ad))) + senderr(EFAULT); + i++; + } + } + pfr_clean_node_mask(tmpkt, &addq); + if (!(flags & PFR_FLAG_DUMMY)) { + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); + pfr_insert_kentries(kt, &addq, tzero); + pfr_remove_kentries(kt, &delq); + pfr_clstats_kentries(&changeq, tzero, INVERT_NEG_FLAG); + if (flags & PFR_FLAG_ATOMIC) + splx(s); + } else + pfr_destroy_kentries(&addq); + if (nadd != NULL) + *nadd = xadd; + if (ndel != NULL) + *ndel = xdel; + if (nchange != NULL) + *nchange = xchange; + if ((flags & PFR_FLAG_FEEDBACK) && size2) + *size2 = size+xdel; + pfr_destroy_ktable(tmpkt, 0); + return (0); +_bad: + pfr_clean_node_mask(tmpkt, &addq); + pfr_destroy_kentries(&addq); + if (flags & PFR_FLAG_FEEDBACK) + pfr_reset_feedback(addr, size, flags); + pfr_destroy_ktable(tmpkt, 0); + return (rv); +} + +int +pfr_tst_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *nmatch, int flags) +{ + struct pfr_ktable *kt; + struct pfr_kentry *p; + struct pfr_addr ad; + int i, xmatch = 0; + + ACCEPT_FLAGS(PFR_FLAG_REPLACE); + if (pfr_validate_table(tbl, 0, 0)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + + for (i = 0; i < size; i++) { + if (COPYIN(addr+i, &ad, sizeof(ad))) + return (EFAULT); + if (pfr_validate_addr(&ad)) + return (EINVAL); + if (ADDR_NETWORK(&ad)) + return (EINVAL); + p = pfr_lookup_addr(kt, &ad, 0); + if (flags & PFR_FLAG_REPLACE) + pfr_copyout_addr(&ad, p); + ad.pfra_fback = (p == NULL) ? PFR_FB_NONE : + (p->pfrke_not ? 
PFR_FB_NOTMATCH : PFR_FB_MATCH); + if (p != NULL && !p->pfrke_not) + xmatch++; + if (COPYOUT(&ad, addr+i, sizeof(ad))) + return (EFAULT); + } + if (nmatch != NULL) + *nmatch = xmatch; + return (0); +} + +int +pfr_get_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int *size, + int flags) +{ + struct pfr_ktable *kt; + struct pfr_walktree w; + int rv; + + ACCEPT_FLAGS(0); + if (pfr_validate_table(tbl, 0, 0)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_cnt > *size) { + *size = kt->pfrkt_cnt; + return (0); + } + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_GET_ADDRS; + w.pfrw_addr = addr; + w.pfrw_free = kt->pfrkt_cnt; + w.pfrw_flags = flags; +#ifdef __FreeBSD__ + rv = kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w); +#else + rv = rn_walktree(kt->pfrkt_ip4, pfr_walktree, &w); +#endif + if (!rv) +#ifdef __FreeBSD__ + rv = kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, + &w); +#else + rv = rn_walktree(kt->pfrkt_ip6, pfr_walktree, &w); +#endif + if (rv) + return (rv); + + if (w.pfrw_free) { + printf("pfr_get_addrs: corruption detected (%d).\n", + w.pfrw_free); + return (ENOTTY); + } + *size = kt->pfrkt_cnt; + return (0); +} + +int +pfr_get_astats(struct pfr_table *tbl, struct pfr_astats *addr, int *size, + int flags) +{ + struct pfr_ktable *kt; + struct pfr_walktree w; + struct pfr_kentryworkq workq; + int rv, s = 0; + long tzero = time_second; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC); /* XXX PFR_FLAG_CLSTATS disabled */ + if (pfr_validate_table(tbl, 0, 0)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_cnt > *size) { + *size = kt->pfrkt_cnt; + return (0); + } + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_GET_ASTATS; + w.pfrw_astats = addr; + w.pfrw_free = kt->pfrkt_cnt; + w.pfrw_flags = flags; + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); +#ifdef __FreeBSD__ + rv = kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w); +#else + rv = rn_walktree(kt->pfrkt_ip4, pfr_walktree, &w); +#endif + if (!rv) +#ifdef __FreeBSD__ + rv = kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, + &w); +#else + rv = rn_walktree(kt->pfrkt_ip6, pfr_walktree, &w); +#endif + if (!rv && (flags & PFR_FLAG_CLSTATS)) { + pfr_enqueue_addrs(kt, &workq, NULL, 0); + pfr_clstats_kentries(&workq, tzero, 0); + } + if (flags & PFR_FLAG_ATOMIC) + splx(s); + if (rv) + return (rv); + + if (w.pfrw_free) { + printf("pfr_get_astats: corruption detected (%d).\n", + w.pfrw_free); + return (ENOTTY); + } + *size = kt->pfrkt_cnt; + return (0); +} + +int +pfr_clr_astats(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *nzero, int flags) +{ + struct pfr_ktable *kt; + struct pfr_kentryworkq workq; + struct pfr_kentry *p; + struct pfr_addr ad; + int i, rv, s = 0, xzero = 0; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC+PFR_FLAG_DUMMY+PFR_FLAG_FEEDBACK); + if (pfr_validate_table(tbl, 0, 0)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + SLIST_INIT(&workq); + for (i = 0; i < size; i++) { + if (COPYIN(addr+i, &ad, sizeof(ad))) + senderr(EFAULT); + if (pfr_validate_addr(&ad)) + senderr(EINVAL); + p = pfr_lookup_addr(kt, &ad, 1); + if (flags & PFR_FLAG_FEEDBACK) { + ad.pfra_fback = (p != NULL) ? 
+ PFR_FB_CLEARED : PFR_FB_NONE; + if (COPYOUT(&ad, addr+i, sizeof(ad))) + senderr(EFAULT); + } + if (p != NULL) { + SLIST_INSERT_HEAD(&workq, p, pfrke_workq); + xzero++; + } + } + + if (!(flags & PFR_FLAG_DUMMY)) { + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); + pfr_clstats_kentries(&workq, 0, 0); + if (flags & PFR_FLAG_ATOMIC) + splx(s); + } + if (nzero != NULL) + *nzero = xzero; + return (0); +_bad: + if (flags & PFR_FLAG_FEEDBACK) + pfr_reset_feedback(addr, size, flags); + return (rv); +} + +int +pfr_validate_addr(struct pfr_addr *ad) +{ + int i; + + switch (ad->pfra_af) { +#ifdef INET + case AF_INET: + if (ad->pfra_net > 32) + return (-1); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (ad->pfra_net > 128) + return (-1); + break; +#endif /* INET6 */ + default: + return (-1); + } + if (ad->pfra_net < 128 && + (((caddr_t)ad)[ad->pfra_net/8] & (0xFF >> (ad->pfra_net%8)))) + return (-1); + for (i = (ad->pfra_net+7)/8; i < sizeof(ad->pfra_u); i++) + if (((caddr_t)ad)[i]) + return (-1); + if (ad->pfra_not && ad->pfra_not != 1) + return (-1); + if (ad->pfra_fback) + return (-1); + return (0); +} + +void +pfr_enqueue_addrs(struct pfr_ktable *kt, struct pfr_kentryworkq *workq, + int *naddr, int sweep) +{ + struct pfr_walktree w; + + SLIST_INIT(workq); + bzero(&w, sizeof(w)); + w.pfrw_op = sweep ? PFRW_SWEEP : PFRW_ENQUEUE; + w.pfrw_workq = workq; + if (kt->pfrkt_ip4 != NULL) +#ifdef __FreeBSD__ + if (kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, + &w)) +#else + if (rn_walktree(kt->pfrkt_ip4, pfr_walktree, &w)) +#endif + printf("pfr_enqueue_addrs: IPv4 walktree failed.\n"); + if (kt->pfrkt_ip6 != NULL) +#ifdef __FreeBSD__ + if (kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, + &w)) +#else + if (rn_walktree(kt->pfrkt_ip6, pfr_walktree, &w)) +#endif + printf("pfr_enqueue_addrs: IPv6 walktree failed.\n"); + if (naddr != NULL) + *naddr = w.pfrw_cnt; +} + +void +pfr_mark_addrs(struct pfr_ktable *kt) +{ + struct pfr_walktree w; + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_MARK; +#ifdef __FreeBSD__ + if (kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w)) +#else + if (rn_walktree(kt->pfrkt_ip4, pfr_walktree, &w)) +#endif + printf("pfr_mark_addrs: IPv4 walktree failed.\n"); +#ifdef __FreeBSD__ + if (kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, &w)) +#else + if (rn_walktree(kt->pfrkt_ip6, pfr_walktree, &w)) +#endif + printf("pfr_mark_addrs: IPv6 walktree failed.\n"); +} + + +struct pfr_kentry * +pfr_lookup_addr(struct pfr_ktable *kt, struct pfr_addr *ad, int exact) +{ + union sockaddr_union sa, mask; + struct radix_node_head *head = NULL; /* make the compiler happy */ + struct pfr_kentry *ke; + int s; + + bzero(&sa, sizeof(sa)); + if (ad->pfra_af == AF_INET) { + FILLIN_SIN(sa.sin, ad->pfra_ip4addr); + head = kt->pfrkt_ip4; + } else if ( ad->pfra_af == AF_INET6 ) { + FILLIN_SIN6(sa.sin6, ad->pfra_ip6addr); + head = kt->pfrkt_ip6; + } + if (ADDR_NETWORK(ad)) { + pfr_prepare_network(&mask, ad->pfra_af, ad->pfra_net); + s = splsoftnet(); /* rn_lookup makes use of globals */ +#ifdef __FreeBSD__ + PF_ASSERT(MA_OWNED); +#endif + ke = (struct pfr_kentry *)rn_lookup(&sa, &mask, head); + splx(s); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + } else { + ke = (struct pfr_kentry *)rn_match(&sa, head); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + if (exact && ke && KENTRY_NETWORK(ke)) + ke = NULL; + } + return (ke); +} + +struct pfr_kentry * +pfr_create_kentry(struct pfr_addr *ad, int intr) +{ + struct pfr_kentry *ke; + + if (intr) + ke = 
pool_get(&pfr_kentry_pl2, PR_NOWAIT); + else + ke = pool_get(&pfr_kentry_pl, PR_NOWAIT); + if (ke == NULL) + return (NULL); + bzero(ke, sizeof(*ke)); + + if (ad->pfra_af == AF_INET) + FILLIN_SIN(ke->pfrke_sa.sin, ad->pfra_ip4addr); + else if (ad->pfra_af == AF_INET6) + FILLIN_SIN6(ke->pfrke_sa.sin6, ad->pfra_ip6addr); + ke->pfrke_af = ad->pfra_af; + ke->pfrke_net = ad->pfra_net; + ke->pfrke_not = ad->pfra_not; + ke->pfrke_intrpool = intr; + return (ke); +} + +void +pfr_destroy_kentries(struct pfr_kentryworkq *workq) +{ + struct pfr_kentry *p, *q; + + for (p = SLIST_FIRST(workq); p != NULL; p = q) { + q = SLIST_NEXT(p, pfrke_workq); + pfr_destroy_kentry(p); + } +} + +void +pfr_destroy_kentry(struct pfr_kentry *ke) +{ + if (ke->pfrke_intrpool) + pool_put(&pfr_kentry_pl2, ke); + else + pool_put(&pfr_kentry_pl, ke); +} + +void +pfr_insert_kentries(struct pfr_ktable *kt, + struct pfr_kentryworkq *workq, long tzero) +{ + struct pfr_kentry *p; + int rv, n = 0; + + SLIST_FOREACH(p, workq, pfrke_workq) { + rv = pfr_route_kentry(kt, p); + if (rv) { + printf("pfr_insert_kentries: cannot route entry " + "(code=%d).\n", rv); + break; + } + p->pfrke_tzero = tzero; + n++; + } + kt->pfrkt_cnt += n; +} + +int +pfr_insert_kentry(struct pfr_ktable *kt, struct pfr_addr *ad, long tzero) +{ + struct pfr_kentry *p; + int rv; + + p = pfr_lookup_addr(kt, ad, 1); + if (p != NULL) + return (0); + p = pfr_create_kentry(ad, 1); + if (p == NULL) + return (EINVAL); + + rv = pfr_route_kentry(kt, p); + if (rv) + return (rv); + + p->pfrke_tzero = tzero; + kt->pfrkt_cnt++; + + return (0); +} + +void +pfr_remove_kentries(struct pfr_ktable *kt, + struct pfr_kentryworkq *workq) +{ + struct pfr_kentry *p; + int n = 0; + + SLIST_FOREACH(p, workq, pfrke_workq) { + pfr_unroute_kentry(kt, p); + n++; + } + kt->pfrkt_cnt -= n; + pfr_destroy_kentries(workq); +} + +void +pfr_clean_node_mask(struct pfr_ktable *kt, + struct pfr_kentryworkq *workq) +{ + struct pfr_kentry *p; + + SLIST_FOREACH(p, workq, pfrke_workq) + pfr_unroute_kentry(kt, p); +} + +void +pfr_clstats_kentries(struct pfr_kentryworkq *workq, long tzero, int negchange) +{ + struct pfr_kentry *p; + int s; + + SLIST_FOREACH(p, workq, pfrke_workq) { + s = splsoftnet(); + if (negchange) + p->pfrke_not = !p->pfrke_not; + bzero(p->pfrke_packets, sizeof(p->pfrke_packets)); + bzero(p->pfrke_bytes, sizeof(p->pfrke_bytes)); + splx(s); + p->pfrke_tzero = tzero; + } +} + +void +pfr_reset_feedback(struct pfr_addr *addr, int size, int flags) +{ + struct pfr_addr ad; + int i; + + for (i = 0; i < size; i++) { + if (COPYIN(addr+i, &ad, sizeof(ad))) + break; + ad.pfra_fback = PFR_FB_NONE; + if (COPYOUT(&ad, addr+i, sizeof(ad))) + break; + } +} + +void +pfr_prepare_network(union sockaddr_union *sa, int af, int net) +{ + int i; + + bzero(sa, sizeof(*sa)); + if (af == AF_INET) { + sa->sin.sin_len = sizeof(sa->sin); + sa->sin.sin_family = AF_INET; + sa->sin.sin_addr.s_addr = net ? htonl(-1 << (32-net)) : 0; + } else if (af == AF_INET6) { + sa->sin6.sin6_len = sizeof(sa->sin6); + sa->sin6.sin6_family = AF_INET6; + for (i = 0; i < 4; i++) { + if (net <= 32) { + sa->sin6.sin6_addr.s6_addr32[i] = + net ? 
htonl(-1 << (32-net)) : 0; + break; + } + sa->sin6.sin6_addr.s6_addr32[i] = 0xFFFFFFFF; + net -= 32; + } + } +} + +int +pfr_route_kentry(struct pfr_ktable *kt, struct pfr_kentry *ke) +{ + union sockaddr_union mask; + struct radix_node *rn; + struct radix_node_head *head = NULL; /* make the compiler happy */ + int s; + + bzero(ke->pfrke_node, sizeof(ke->pfrke_node)); + if (ke->pfrke_af == AF_INET) + head = kt->pfrkt_ip4; + else if (ke->pfrke_af == AF_INET6) + head = kt->pfrkt_ip6; + + s = splsoftnet(); +#ifdef __FreeBSD__ + PF_ASSERT(MA_OWNED); +#endif + if (KENTRY_NETWORK(ke)) { + pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net); + rn = rn_addroute(&ke->pfrke_sa, &mask, head, ke->pfrke_node); + } else + rn = rn_addroute(&ke->pfrke_sa, NULL, head, ke->pfrke_node); + splx(s); + + return (rn == NULL ? -1 : 0); +} + +int +pfr_unroute_kentry(struct pfr_ktable *kt, struct pfr_kentry *ke) +{ + union sockaddr_union mask; + struct radix_node *rn; + struct radix_node_head *head = NULL; /* make the compiler happy */ + int s; + + if (ke->pfrke_af == AF_INET) + head = kt->pfrkt_ip4; + else if (ke->pfrke_af == AF_INET6) + head = kt->pfrkt_ip6; + + s = splsoftnet(); +#ifdef __FreeBSD__ + PF_ASSERT(MA_OWNED); +#endif + if (KENTRY_NETWORK(ke)) { + pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net); +#ifdef __FreeBSD__ + rn = rn_delete(&ke->pfrke_sa, &mask, head); +#else + rn = rn_delete(&ke->pfrke_sa, &mask, head, NULL); +#endif + } else +#ifdef __FreeBSD__ + rn = rn_delete(&ke->pfrke_sa, NULL, head); +#else + rn = rn_delete(&ke->pfrke_sa, NULL, head, NULL); +#endif + splx(s); + + if (rn == NULL) { + printf("pfr_unroute_kentry: delete failed.\n"); + return (-1); + } + return (0); +} + +void +pfr_copyout_addr(struct pfr_addr *ad, struct pfr_kentry *ke) +{ + bzero(ad, sizeof(*ad)); + if (ke == NULL) + return; + ad->pfra_af = ke->pfrke_af; + ad->pfra_net = ke->pfrke_net; + ad->pfra_not = ke->pfrke_not; + if (ad->pfra_af == AF_INET) + ad->pfra_ip4addr = ke->pfrke_sa.sin.sin_addr; + else if (ad->pfra_af == AF_INET6) + ad->pfra_ip6addr = ke->pfrke_sa.sin6.sin6_addr; +} + +int +pfr_walktree(struct radix_node *rn, void *arg) +{ + struct pfr_kentry *ke = (struct pfr_kentry *)rn; + struct pfr_walktree *w = arg; + int s, flags = w->pfrw_flags; + + switch (w->pfrw_op) { + case PFRW_MARK: + ke->pfrke_mark = 0; + break; + case PFRW_SWEEP: + if (ke->pfrke_mark) + break; + /* FALLTHROUGH */ + case PFRW_ENQUEUE: + SLIST_INSERT_HEAD(w->pfrw_workq, ke, pfrke_workq); + w->pfrw_cnt++; + break; + case PFRW_GET_ADDRS: + if (w->pfrw_free-- > 0) { + struct pfr_addr ad; + + pfr_copyout_addr(&ad, ke); + if (COPYOUT(&ad, w->pfrw_addr, sizeof(ad))) + return (EFAULT); + w->pfrw_addr++; + } + break; + case PFRW_GET_ASTATS: + if (w->pfrw_free-- > 0) { + struct pfr_astats as; + + pfr_copyout_addr(&as.pfras_a, ke); + + s = splsoftnet(); + bcopy(ke->pfrke_packets, as.pfras_packets, + sizeof(as.pfras_packets)); + bcopy(ke->pfrke_bytes, as.pfras_bytes, + sizeof(as.pfras_bytes)); + splx(s); + as.pfras_tzero = ke->pfrke_tzero; + + if (COPYOUT(&as, w->pfrw_astats, sizeof(as))) + return (EFAULT); + w->pfrw_astats++; + } + break; + case PFRW_POOL_GET: + if (ke->pfrke_not) + break; /* negative entries are ignored */ + if (!w->pfrw_cnt--) { + w->pfrw_kentry = ke; + return (1); /* finish search */ + } + break; + case PFRW_DYNADDR_UPDATE: + if (ke->pfrke_af == AF_INET) { + if (w->pfrw_dyn->pfid_acnt4++ > 0) + break; + pfr_prepare_network(&pfr_mask, AF_INET, ke->pfrke_net); + w->pfrw_dyn->pfid_addr4 = *SUNION2PF( + &ke->pfrke_sa, AF_INET); 
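/*
 * [Editor's note: illustrative arithmetic, not part of the original
 * patch.] The pfr_prepare_network() call above turns a CIDR prefix
 * length into a sockaddr netmask. For AF_INET, net = 24 gives
 * sin_addr = htonl(-1 << (32-24)) = htonl(0xffffff00), and net = 0
 * gives the all-zero mask. For AF_INET6 the loop fills whole 32-bit
 * words with 0xffffffff before finishing the remainder the same way,
 * so net = 40 gives s6_addr32[] = { 0xffffffff, htonl(0xff000000),
 * 0, 0 }.
 */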
+ w->pfrw_dyn->pfid_mask4 = *SUNION2PF( + &pfr_mask, AF_INET); + } else if (ke->pfrke_af == AF_INET6){ + if (w->pfrw_dyn->pfid_acnt6++ > 0) + break; + pfr_prepare_network(&pfr_mask, AF_INET6, ke->pfrke_net); + w->pfrw_dyn->pfid_addr6 = *SUNION2PF( + &ke->pfrke_sa, AF_INET6); + w->pfrw_dyn->pfid_mask6 = *SUNION2PF( + &pfr_mask, AF_INET6); + } + break; + } + return (0); +} + +int +pfr_clr_tables(struct pfr_table *filter, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p; + int s = 0, xdel = 0; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC+PFR_FLAG_DUMMY+PFR_FLAG_ALLRSETS); + if (pfr_fix_anchor(filter->pfrt_anchor)) + return (EINVAL); + if (pfr_table_count(filter, flags) < 0) + return (ENOENT); + + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (pfr_skip_table(filter, p, flags)) + continue; + if (!strcmp(p->pfrkt_anchor, PF_RESERVED_ANCHOR)) + continue; + if (!(p->pfrkt_flags & PFR_TFLAG_ACTIVE)) + continue; + p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_ACTIVE; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xdel++; + } + if (!(flags & PFR_FLAG_DUMMY)) { + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); + pfr_setflags_ktables(&workq); + if (flags & PFR_FLAG_ATOMIC) + splx(s); + } + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_add_tables(struct pfr_table *tbl, int size, int *nadd, int flags) +{ + struct pfr_ktableworkq addq, changeq; + struct pfr_ktable *p, *q, *r, key; + int i, rv, s = 0, xadd = 0; + long tzero = time_second; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC+PFR_FLAG_DUMMY); + SLIST_INIT(&addq); + SLIST_INIT(&changeq); + for (i = 0; i < size; i++) { + if (COPYIN(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t))) + senderr(EFAULT); + if (pfr_validate_table(&key.pfrkt_t, PFR_TFLAG_USRMASK, + flags & PFR_FLAG_USERIOCTL)) + senderr(EINVAL); + key.pfrkt_flags |= PFR_TFLAG_ACTIVE; + p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (p == NULL) { + p = pfr_create_ktable(&key.pfrkt_t, tzero, 1); + if (p == NULL) + senderr(ENOMEM); + SLIST_FOREACH(q, &addq, pfrkt_workq) { + if (!pfr_ktable_compare(p, q)) + goto _skip; + } + SLIST_INSERT_HEAD(&addq, p, pfrkt_workq); + xadd++; + if (!key.pfrkt_anchor[0]) + goto _skip; + + /* find or create root table */ + bzero(key.pfrkt_anchor, sizeof(key.pfrkt_anchor)); + r = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (r != NULL) { + p->pfrkt_root = r; + goto _skip; + } + SLIST_FOREACH(q, &addq, pfrkt_workq) { + if (!pfr_ktable_compare(&key, q)) { + p->pfrkt_root = q; + goto _skip; + } + } + key.pfrkt_flags = 0; + r = pfr_create_ktable(&key.pfrkt_t, 0, 1); + if (r == NULL) + senderr(ENOMEM); + SLIST_INSERT_HEAD(&addq, r, pfrkt_workq); + p->pfrkt_root = r; + } else if (!(p->pfrkt_flags & PFR_TFLAG_ACTIVE)) { + SLIST_FOREACH(q, &changeq, pfrkt_workq) + if (!pfr_ktable_compare(&key, q)) + goto _skip; + p->pfrkt_nflags = (p->pfrkt_flags & + ~PFR_TFLAG_USRMASK) | key.pfrkt_flags; + SLIST_INSERT_HEAD(&changeq, p, pfrkt_workq); + xadd++; + } +_skip: + ; + } + if (!(flags & PFR_FLAG_DUMMY)) { + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); + pfr_insert_ktables(&addq); + pfr_setflags_ktables(&changeq); + if (flags & PFR_FLAG_ATOMIC) + splx(s); + } else + pfr_destroy_ktables(&addq, 0); + if (nadd != NULL) + *nadd = xadd; + return (0); +_bad: + pfr_destroy_ktables(&addq, 0); + return (rv); +} + +int +pfr_del_tables(struct pfr_table *tbl, int size, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p, *q, key; + int i, s = 0, xdel = 0; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC+PFR_FLAG_DUMMY); 
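/*
 * [Editor's note: annotation, not part of the original patch.]
 * pfr_clr_tables(), pfr_add_tables(), pfr_del_tables() and
 * pfr_set_tflags() all share one shape: validate the ioctl flags with
 * ACCEPT_FLAGS(), collect the affected ktables on an SLIST work queue
 * while staging the new flag word in pfrkt_nflags, then commit the
 * whole batch through pfr_setflags_ktables() (or pfr_insert_ktables())
 * unless PFR_FLAG_DUMMY requested a dry run. With PFR_FLAG_ATOMIC the
 * commit is additionally bracketed by splsoftnet()/splx() so the
 * packet path never observes a half-updated table set.
 */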
+ SLIST_INIT(&workq); + for (i = 0; i < size; i++) { + if (COPYIN(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t))) + return (EFAULT); + if (pfr_validate_table(&key.pfrkt_t, 0, + flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (p != NULL && (p->pfrkt_flags & PFR_TFLAG_ACTIVE)) { + SLIST_FOREACH(q, &workq, pfrkt_workq) + if (!pfr_ktable_compare(p, q)) + goto _skip; + p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_ACTIVE; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xdel++; + } +_skip: + ; + } + + if (!(flags & PFR_FLAG_DUMMY)) { + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); + pfr_setflags_ktables(&workq); + if (flags & PFR_FLAG_ATOMIC) + splx(s); + } + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_get_tables(struct pfr_table *filter, struct pfr_table *tbl, int *size, + int flags) +{ + struct pfr_ktable *p; + int n, nn; + + ACCEPT_FLAGS(PFR_FLAG_ALLRSETS); + if (pfr_fix_anchor(filter->pfrt_anchor)) + return (EINVAL); + n = nn = pfr_table_count(filter, flags); + if (n < 0) + return (ENOENT); + if (n > *size) { + *size = n; + return (0); + } + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (pfr_skip_table(filter, p, flags)) + continue; + if (n-- <= 0) + continue; + if (COPYOUT(&p->pfrkt_t, tbl++, sizeof(*tbl))) + return (EFAULT); + } + if (n) { + printf("pfr_get_tables: corruption detected (%d).\n", n); + return (ENOTTY); + } + *size = nn; + return (0); +} + +int +pfr_get_tstats(struct pfr_table *filter, struct pfr_tstats *tbl, int *size, + int flags) +{ + struct pfr_ktable *p; + struct pfr_ktableworkq workq; + int s = 0, n, nn; + long tzero = time_second; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC|PFR_FLAG_ALLRSETS); + /* XXX PFR_FLAG_CLSTATS disabled */ + if (pfr_fix_anchor(filter->pfrt_anchor)) + return (EINVAL); + n = nn = pfr_table_count(filter, flags); + if (n < 0) + return (ENOENT); + if (n > *size) { + *size = n; + return (0); + } + SLIST_INIT(&workq); + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (pfr_skip_table(filter, p, flags)) + continue; + if (n-- <= 0) + continue; + if (!(flags & PFR_FLAG_ATOMIC)) + s = splsoftnet(); + if (COPYOUT(&p->pfrkt_ts, tbl++, sizeof(*tbl))) { + if (!(flags & PFR_FLAG_ATOMIC)) + splx(s); + return (EFAULT); + } + if (!(flags & PFR_FLAG_ATOMIC)) + splx(s); + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + } + if (flags & PFR_FLAG_CLSTATS) + pfr_clstats_ktables(&workq, tzero, + flags & PFR_FLAG_ADDRSTOO); + if (flags & PFR_FLAG_ATOMIC) + splx(s); + if (n) { + printf("pfr_get_tstats: corruption detected (%d).\n", n); + return (ENOTTY); + } + *size = nn; + return (0); +} + +int +pfr_clr_tstats(struct pfr_table *tbl, int size, int *nzero, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p, key; + int i, s = 0, xzero = 0; + long tzero = time_second; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC+PFR_FLAG_DUMMY+PFR_FLAG_ADDRSTOO); + SLIST_INIT(&workq); + for (i = 0; i < size; i++) { + if (COPYIN(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t))) + return (EFAULT); + if (pfr_validate_table(&key.pfrkt_t, 0, 0)) + return (EINVAL); + p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (p != NULL) { + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xzero++; + } + } + if (!(flags & PFR_FLAG_DUMMY)) { + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); + pfr_clstats_ktables(&workq, tzero, flags & PFR_FLAG_ADDRSTOO); + if (flags & PFR_FLAG_ATOMIC) + splx(s); + } + if (nzero != NULL) + *nzero = xzero; + return (0); +} + +int +pfr_set_tflags(struct pfr_table 
*tbl, int size, int setflag, int clrflag, + int *nchange, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p, *q, key; + int i, s = 0, xchange = 0, xdel = 0; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC+PFR_FLAG_DUMMY); + if ((setflag & ~PFR_TFLAG_USRMASK) || + (clrflag & ~PFR_TFLAG_USRMASK) || + (setflag & clrflag)) + return (EINVAL); + SLIST_INIT(&workq); + for (i = 0; i < size; i++) { + if (COPYIN(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t))) + return (EFAULT); + if (pfr_validate_table(&key.pfrkt_t, 0, + flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (p != NULL && (p->pfrkt_flags & PFR_TFLAG_ACTIVE)) { + p->pfrkt_nflags = (p->pfrkt_flags | setflag) & + ~clrflag; + if (p->pfrkt_nflags == p->pfrkt_flags) + goto _skip; + SLIST_FOREACH(q, &workq, pfrkt_workq) + if (!pfr_ktable_compare(p, q)) + goto _skip; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + if ((p->pfrkt_flags & PFR_TFLAG_PERSIST) && + (clrflag & PFR_TFLAG_PERSIST) && + !(p->pfrkt_flags & PFR_TFLAG_REFERENCED)) + xdel++; + else + xchange++; + } +_skip: + ; + } + if (!(flags & PFR_FLAG_DUMMY)) { + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); + pfr_setflags_ktables(&workq); + if (flags & PFR_FLAG_ATOMIC) + splx(s); + } + if (nchange != NULL) + *nchange = xchange; + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_ina_begin(struct pfr_table *trs, u_int32_t *ticket, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p; + struct pf_ruleset *rs; + int xdel = 0; + + ACCEPT_FLAGS(PFR_FLAG_DUMMY); + rs = pf_find_or_create_ruleset(trs->pfrt_anchor); + if (rs == NULL) + return (ENOMEM); + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) || + pfr_skip_table(trs, p, 0)) + continue; + p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_INACTIVE; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xdel++; + } + if (!(flags & PFR_FLAG_DUMMY)) { + pfr_setflags_ktables(&workq); + if (ticket != NULL) + *ticket = ++rs->tticket; + rs->topen = 1; + } else + pf_remove_if_empty_ruleset(rs); + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_ina_define(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *nadd, int *naddr, u_int32_t ticket, int flags) +{ + struct pfr_ktableworkq tableq; + struct pfr_kentryworkq addrq; + struct pfr_ktable *kt, *rt, *shadow, key; + struct pfr_kentry *p; + struct pfr_addr ad; + struct pf_ruleset *rs; + int i, rv, xadd = 0, xaddr = 0; + + ACCEPT_FLAGS(PFR_FLAG_DUMMY|PFR_FLAG_ADDRSTOO); + if (size && !(flags & PFR_FLAG_ADDRSTOO)) + return (EINVAL); + if (pfr_validate_table(tbl, PFR_TFLAG_USRMASK, + flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + rs = pf_find_ruleset(tbl->pfrt_anchor); + if (rs == NULL || !rs->topen || ticket != rs->tticket) + return (EBUSY); + tbl->pfrt_flags |= PFR_TFLAG_INACTIVE; + SLIST_INIT(&tableq); + kt = RB_FIND(pfr_ktablehead, &pfr_ktables, (struct pfr_ktable *)tbl); + if (kt == NULL) { + kt = pfr_create_ktable(tbl, 0, 1); + if (kt == NULL) + return (ENOMEM); + SLIST_INSERT_HEAD(&tableq, kt, pfrkt_workq); + xadd++; + if (!tbl->pfrt_anchor[0]) + goto _skip; + + /* find or create root table */ + bzero(&key, sizeof(key)); + strlcpy(key.pfrkt_name, tbl->pfrt_name, sizeof(key.pfrkt_name)); + rt = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (rt != NULL) { + kt->pfrkt_root = rt; + goto _skip; + } + rt = pfr_create_ktable(&key.pfrkt_t, 0, 1); + if (rt == NULL) { + pfr_destroy_ktables(&tableq, 0); + return 
(ENOMEM); + } + SLIST_INSERT_HEAD(&tableq, rt, pfrkt_workq); + kt->pfrkt_root = rt; + } else if (!(kt->pfrkt_flags & PFR_TFLAG_INACTIVE)) + xadd++; +_skip: + shadow = pfr_create_ktable(tbl, 0, 0); + if (shadow == NULL) { + pfr_destroy_ktables(&tableq, 0); + return (ENOMEM); + } + SLIST_INIT(&addrq); + for (i = 0; i < size; i++) { + if (COPYIN(addr+i, &ad, sizeof(ad))) + senderr(EFAULT); + if (pfr_validate_addr(&ad)) + senderr(EINVAL); + if (pfr_lookup_addr(shadow, &ad, 1) != NULL) + continue; + p = pfr_create_kentry(&ad, 0); + if (p == NULL) + senderr(ENOMEM); + if (pfr_route_kentry(shadow, p)) { + pfr_destroy_kentry(p); + continue; + } + SLIST_INSERT_HEAD(&addrq, p, pfrke_workq); + xaddr++; + } + if (!(flags & PFR_FLAG_DUMMY)) { + if (kt->pfrkt_shadow != NULL) + pfr_destroy_ktable(kt->pfrkt_shadow, 1); + kt->pfrkt_flags |= PFR_TFLAG_INACTIVE; + pfr_insert_ktables(&tableq); + shadow->pfrkt_cnt = (flags & PFR_FLAG_ADDRSTOO) ? + xaddr : NO_ADDRESSES; + kt->pfrkt_shadow = shadow; + } else { + pfr_clean_node_mask(shadow, &addrq); + pfr_destroy_ktable(shadow, 0); + pfr_destroy_ktables(&tableq, 0); + pfr_destroy_kentries(&addrq); + } + if (nadd != NULL) + *nadd = xadd; + if (naddr != NULL) + *naddr = xaddr; + return (0); +_bad: + pfr_destroy_ktable(shadow, 0); + pfr_destroy_ktables(&tableq, 0); + pfr_destroy_kentries(&addrq); + return (rv); +} + +int +pfr_ina_rollback(struct pfr_table *trs, u_int32_t ticket, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p; + struct pf_ruleset *rs; + int xdel = 0; + + ACCEPT_FLAGS(PFR_FLAG_DUMMY); + rs = pf_find_ruleset(trs->pfrt_anchor); + if (rs == NULL || !rs->topen || ticket != rs->tticket) + return (0); + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) || + pfr_skip_table(trs, p, 0)) + continue; + p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_INACTIVE; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xdel++; + } + if (!(flags & PFR_FLAG_DUMMY)) { + pfr_setflags_ktables(&workq); + rs->topen = 0; + pf_remove_if_empty_ruleset(rs); + } + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_ina_commit(struct pfr_table *trs, u_int32_t ticket, int *nadd, + int *nchange, int flags) +{ + struct pfr_ktable *p, *q; + struct pfr_ktableworkq workq; + struct pf_ruleset *rs; + int s = 0, xadd = 0, xchange = 0; + long tzero = time_second; + + ACCEPT_FLAGS(PFR_FLAG_ATOMIC+PFR_FLAG_DUMMY); + rs = pf_find_ruleset(trs->pfrt_anchor); + if (rs == NULL || !rs->topen || ticket != rs->tticket) + return (EBUSY); + + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) || + pfr_skip_table(trs, p, 0)) + continue; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + if (p->pfrkt_flags & PFR_TFLAG_ACTIVE) + xchange++; + else + xadd++; + } + + if (!(flags & PFR_FLAG_DUMMY)) { + if (flags & PFR_FLAG_ATOMIC) + s = splsoftnet(); + for (p = SLIST_FIRST(&workq); p != NULL; p = q) { + q = SLIST_NEXT(p, pfrkt_workq); + pfr_commit_ktable(p, tzero); + } + if (flags & PFR_FLAG_ATOMIC) + splx(s); + rs->topen = 0; + pf_remove_if_empty_ruleset(rs); + } + if (nadd != NULL) + *nadd = xadd; + if (nchange != NULL) + *nchange = xchange; + + return (0); +} + +void +pfr_commit_ktable(struct pfr_ktable *kt, long tzero) +{ + struct pfr_ktable *shadow = kt->pfrkt_shadow; + int nflags; + + if (shadow->pfrkt_cnt == NO_ADDRESSES) { + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + pfr_clstats_ktable(kt, tzero, 1); + } else if (kt->pfrkt_flags & 
PFR_TFLAG_ACTIVE) { + /* kt might contain addresses */ + struct pfr_kentryworkq addrq, addq, changeq, delq, garbageq; + struct pfr_kentry *p, *q, *next; + struct pfr_addr ad; + + pfr_enqueue_addrs(shadow, &addrq, NULL, 0); + pfr_mark_addrs(kt); + SLIST_INIT(&addq); + SLIST_INIT(&changeq); + SLIST_INIT(&delq); + SLIST_INIT(&garbageq); + pfr_clean_node_mask(shadow, &addrq); + for (p = SLIST_FIRST(&addrq); p != NULL; p = next) { + next = SLIST_NEXT(p, pfrke_workq); /* XXX */ + pfr_copyout_addr(&ad, p); + q = pfr_lookup_addr(kt, &ad, 1); + if (q != NULL) { + if (q->pfrke_not != p->pfrke_not) + SLIST_INSERT_HEAD(&changeq, q, + pfrke_workq); + q->pfrke_mark = 1; + SLIST_INSERT_HEAD(&garbageq, p, pfrke_workq); + } else { + p->pfrke_tzero = tzero; + SLIST_INSERT_HEAD(&addq, p, pfrke_workq); + } + } + pfr_enqueue_addrs(kt, &delq, NULL, ENQUEUE_UNMARKED_ONLY); + pfr_insert_kentries(kt, &addq, tzero); + pfr_remove_kentries(kt, &delq); + pfr_clstats_kentries(&changeq, tzero, INVERT_NEG_FLAG); + pfr_destroy_kentries(&garbageq); + } else { + /* kt cannot contain addresses */ + SWAP(struct radix_node_head *, kt->pfrkt_ip4, + shadow->pfrkt_ip4); + SWAP(struct radix_node_head *, kt->pfrkt_ip6, + shadow->pfrkt_ip6); + SWAP(int, kt->pfrkt_cnt, shadow->pfrkt_cnt); + pfr_clstats_ktable(kt, tzero, 1); + } + nflags = ((shadow->pfrkt_flags & PFR_TFLAG_USRMASK) | + (kt->pfrkt_flags & PFR_TFLAG_SETMASK) | PFR_TFLAG_ACTIVE) + & ~PFR_TFLAG_INACTIVE; + pfr_destroy_ktable(shadow, 0); + kt->pfrkt_shadow = NULL; + pfr_setflags_ktable(kt, nflags); +} + +int +pfr_validate_table(struct pfr_table *tbl, int allowedflags, int no_reserved) +{ + int i; + + if (!tbl->pfrt_name[0]) + return (-1); + if (no_reserved && !strcmp(tbl->pfrt_anchor, PF_RESERVED_ANCHOR)) + return (-1); + if (tbl->pfrt_name[PF_TABLE_NAME_SIZE-1]) + return (-1); + for (i = strlen(tbl->pfrt_name); i < PF_TABLE_NAME_SIZE; i++) + if (tbl->pfrt_name[i]) + return (-1); + if (pfr_fix_anchor(tbl->pfrt_anchor)) + return (-1); + if (tbl->pfrt_flags & ~allowedflags) + return (-1); + return (0); +} + +/* + * Rewrite anchors referenced by tables to remove slashes + * and check for validity. + */ +int +pfr_fix_anchor(char *anchor) +{ + size_t siz = MAXPATHLEN; + int i; + + if (anchor[0] == '/') { + char *path; + int off; + + path = anchor; + off = 1; + while (*++path == '/') + off++; + bcopy(path, anchor, siz - off); + memset(anchor + siz - off, 0, off); + } + if (anchor[siz - 1]) + return (-1); + for (i = strlen(anchor); i < siz; i++) + if (anchor[i]) + return (-1); + return (0); +} + +int +pfr_table_count(struct pfr_table *filter, int flags) +{ + struct pf_ruleset *rs; + + if (flags & PFR_FLAG_ALLRSETS) + return (pfr_ktable_cnt); + if (filter->pfrt_anchor[0]) { + rs = pf_find_ruleset(filter->pfrt_anchor); + return ((rs != NULL) ? 
rs->tables : -1); + } + return (pf_main_ruleset.tables); +} + +int +pfr_skip_table(struct pfr_table *filter, struct pfr_ktable *kt, int flags) +{ + if (flags & PFR_FLAG_ALLRSETS) + return (0); + if (strcmp(filter->pfrt_anchor, kt->pfrkt_anchor)) + return (1); + return (0); +} + +void +pfr_insert_ktables(struct pfr_ktableworkq *workq) +{ + struct pfr_ktable *p; + + SLIST_FOREACH(p, workq, pfrkt_workq) + pfr_insert_ktable(p); +} + +void +pfr_insert_ktable(struct pfr_ktable *kt) +{ + RB_INSERT(pfr_ktablehead, &pfr_ktables, kt); + pfr_ktable_cnt++; + if (kt->pfrkt_root != NULL) + if (!kt->pfrkt_root->pfrkt_refcnt[PFR_REFCNT_ANCHOR]++) + pfr_setflags_ktable(kt->pfrkt_root, + kt->pfrkt_root->pfrkt_flags|PFR_TFLAG_REFDANCHOR); +} + +void +pfr_setflags_ktables(struct pfr_ktableworkq *workq) +{ + struct pfr_ktable *p, *q; + + for (p = SLIST_FIRST(workq); p; p = q) { + q = SLIST_NEXT(p, pfrkt_workq); + pfr_setflags_ktable(p, p->pfrkt_nflags); + } +} + +void +pfr_setflags_ktable(struct pfr_ktable *kt, int newf) +{ + struct pfr_kentryworkq addrq; + + if (!(newf & PFR_TFLAG_REFERENCED) && + !(newf & PFR_TFLAG_PERSIST)) + newf &= ~PFR_TFLAG_ACTIVE; + if (!(newf & PFR_TFLAG_ACTIVE)) + newf &= ~PFR_TFLAG_USRMASK; + if (!(newf & PFR_TFLAG_SETMASK)) { + RB_REMOVE(pfr_ktablehead, &pfr_ktables, kt); + if (kt->pfrkt_root != NULL) + if (!--kt->pfrkt_root->pfrkt_refcnt[PFR_REFCNT_ANCHOR]) + pfr_setflags_ktable(kt->pfrkt_root, + kt->pfrkt_root->pfrkt_flags & + ~PFR_TFLAG_REFDANCHOR); + pfr_destroy_ktable(kt, 1); + pfr_ktable_cnt--; + return; + } + if (!(newf & PFR_TFLAG_ACTIVE) && kt->pfrkt_cnt) { + pfr_enqueue_addrs(kt, &addrq, NULL, 0); + pfr_remove_kentries(kt, &addrq); + } + if (!(newf & PFR_TFLAG_INACTIVE) && kt->pfrkt_shadow != NULL) { + pfr_destroy_ktable(kt->pfrkt_shadow, 1); + kt->pfrkt_shadow = NULL; + } + kt->pfrkt_flags = newf; +} + +void +pfr_clstats_ktables(struct pfr_ktableworkq *workq, long tzero, int recurse) +{ + struct pfr_ktable *p; + + SLIST_FOREACH(p, workq, pfrkt_workq) + pfr_clstats_ktable(p, tzero, recurse); +} + +void +pfr_clstats_ktable(struct pfr_ktable *kt, long tzero, int recurse) +{ + struct pfr_kentryworkq addrq; + int s; + + if (recurse) { + pfr_enqueue_addrs(kt, &addrq, NULL, 0); + pfr_clstats_kentries(&addrq, tzero, 0); + } + s = splsoftnet(); + bzero(kt->pfrkt_packets, sizeof(kt->pfrkt_packets)); + bzero(kt->pfrkt_bytes, sizeof(kt->pfrkt_bytes)); + kt->pfrkt_match = kt->pfrkt_nomatch = 0; + splx(s); + kt->pfrkt_tzero = tzero; +} + +struct pfr_ktable * +pfr_create_ktable(struct pfr_table *tbl, long tzero, int attachruleset) +{ + struct pfr_ktable *kt; + struct pf_ruleset *rs; + + kt = pool_get(&pfr_ktable_pl, PR_NOWAIT); + if (kt == NULL) + return (NULL); + bzero(kt, sizeof(*kt)); + kt->pfrkt_t = *tbl; + + if (attachruleset) { + rs = pf_find_or_create_ruleset(tbl->pfrt_anchor); + if (!rs) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + kt->pfrkt_rs = rs; + rs->tables++; + } + + if (!rn_inithead((void **)&kt->pfrkt_ip4, + offsetof(struct sockaddr_in, sin_addr) * 8) || + !rn_inithead((void **)&kt->pfrkt_ip6, + offsetof(struct sockaddr_in6, sin6_addr) * 8)) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + kt->pfrkt_tzero = tzero; + + return (kt); +} + +void +pfr_destroy_ktables(struct pfr_ktableworkq *workq, int flushaddr) +{ + struct pfr_ktable *p, *q; + + for (p = SLIST_FIRST(workq); p; p = q) { + q = SLIST_NEXT(p, pfrkt_workq); + pfr_destroy_ktable(p, flushaddr); + } +} + +void +pfr_destroy_ktable(struct pfr_ktable *kt, int flushaddr) +{ + struct pfr_kentryworkq 
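/*
 * [Editor's note: annotation, not part of the original patch.]
 * pfr_destroy_ktable() tears a table down in dependency order: flush
 * the kentries if flushaddr says so, free both radix heads (newer
 * FreeBSD also destroys the head lock via RADIX_NODE_HEAD_DESTROY()),
 * recurse into any shadow table left over from an aborted transaction,
 * drop the ruleset reference, and only then return the ktable itself
 * to its pool.
 */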
addrq; + + if (flushaddr) { + pfr_enqueue_addrs(kt, &addrq, NULL, 0); + pfr_clean_node_mask(kt, &addrq); + pfr_destroy_kentries(&addrq); + } +#if defined(__FreeBSD__) && (__FreeBSD_version >= 500100) + if (kt->pfrkt_ip4 != NULL) { + RADIX_NODE_HEAD_DESTROY(kt->pfrkt_ip4); + free((caddr_t)kt->pfrkt_ip4, M_RTABLE); + } + if (kt->pfrkt_ip6 != NULL) { + RADIX_NODE_HEAD_DESTROY(kt->pfrkt_ip6); + free((caddr_t)kt->pfrkt_ip6, M_RTABLE); + } +#else + if (kt->pfrkt_ip4 != NULL) + free((caddr_t)kt->pfrkt_ip4, M_RTABLE); + if (kt->pfrkt_ip6 != NULL) + free((caddr_t)kt->pfrkt_ip6, M_RTABLE); +#endif + if (kt->pfrkt_shadow != NULL) + pfr_destroy_ktable(kt->pfrkt_shadow, flushaddr); + if (kt->pfrkt_rs != NULL) { + kt->pfrkt_rs->tables--; + pf_remove_if_empty_ruleset(kt->pfrkt_rs); + } + pool_put(&pfr_ktable_pl, kt); +} + +int +pfr_ktable_compare(struct pfr_ktable *p, struct pfr_ktable *q) +{ + int d; + + if ((d = strncmp(p->pfrkt_name, q->pfrkt_name, PF_TABLE_NAME_SIZE))) + return (d); + return (strcmp(p->pfrkt_anchor, q->pfrkt_anchor)); +} + +struct pfr_ktable * +pfr_lookup_table(struct pfr_table *tbl) +{ + /* struct pfr_ktable start like a struct pfr_table */ + return (RB_FIND(pfr_ktablehead, &pfr_ktables, + (struct pfr_ktable *)tbl)); +} + +int +pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af) +{ + struct pfr_kentry *ke = NULL; + int match; + + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (0); + + switch (af) { +#ifdef INET + case AF_INET: + pfr_sin.sin_addr.s_addr = a->addr32[0]; + ke = (struct pfr_kentry *)rn_match(&pfr_sin, kt->pfrkt_ip4); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + bcopy(a, &pfr_sin6.sin6_addr, sizeof(pfr_sin6.sin6_addr)); + ke = (struct pfr_kentry *)rn_match(&pfr_sin6, kt->pfrkt_ip6); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + break; +#endif /* INET6 */ + } + match = (ke && !ke->pfrke_not); + if (match) + kt->pfrkt_match++; + else + kt->pfrkt_nomatch++; + return (match); +} + +void +pfr_update_stats(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af, + u_int64_t len, int dir_out, int op_pass, int notrule) +{ + struct pfr_kentry *ke = NULL; + + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return; + + switch (af) { +#ifdef INET + case AF_INET: + pfr_sin.sin_addr.s_addr = a->addr32[0]; + ke = (struct pfr_kentry *)rn_match(&pfr_sin, kt->pfrkt_ip4); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + bcopy(a, &pfr_sin6.sin6_addr, sizeof(pfr_sin6.sin6_addr)); + ke = (struct pfr_kentry *)rn_match(&pfr_sin6, kt->pfrkt_ip6); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + break; +#endif /* INET6 */ + default: + ; + } + if ((ke == NULL || ke->pfrke_not) != notrule) { + if (op_pass != PFR_OP_PASS) + printf("pfr_update_stats: assertion failed.\n"); + op_pass = PFR_OP_XPASS; + } + kt->pfrkt_packets[dir_out][op_pass]++; + kt->pfrkt_bytes[dir_out][op_pass] += len; + if (ke != NULL && op_pass != PFR_OP_XPASS) { + ke->pfrke_packets[dir_out][op_pass]++; + ke->pfrke_bytes[dir_out][op_pass] += len; + } +} + +struct pfr_ktable * +pfr_attach_table(struct pf_ruleset *rs, char *name) +{ + struct pfr_ktable *kt, *rt; + struct pfr_table tbl; + struct pf_anchor *ac = rs->anchor; + + bzero(&tbl, sizeof(tbl)); + strlcpy(tbl.pfrt_name, name, sizeof(tbl.pfrt_name)); + 
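/*
 * [Editor's note: annotation, not part of the original patch.] A rule
 * that references a table inside an anchor gets a ktable keyed by both
 * the table name and the anchor path; pfr_attach_table() also ensures
 * an anchor-less "root" ktable of the same name exists and links it
 * via pfrkt_root. The PFR_REFCNT_RULE counter then drives the
 * PFR_TFLAG_REFERENCED flag: the first attach sets it, the last
 * pfr_detach_table() clears it, which may let pfr_setflags_ktable()
 * garbage-collect the table.
 */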
if (ac != NULL) + strlcpy(tbl.pfrt_anchor, ac->path, sizeof(tbl.pfrt_anchor)); + kt = pfr_lookup_table(&tbl); + if (kt == NULL) { + kt = pfr_create_ktable(&tbl, time_second, 1); + if (kt == NULL) + return (NULL); + if (ac != NULL) { + bzero(tbl.pfrt_anchor, sizeof(tbl.pfrt_anchor)); + rt = pfr_lookup_table(&tbl); + if (rt == NULL) { + rt = pfr_create_ktable(&tbl, 0, 1); + if (rt == NULL) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + pfr_insert_ktable(rt); + } + kt->pfrkt_root = rt; + } + pfr_insert_ktable(kt); + } + if (!kt->pfrkt_refcnt[PFR_REFCNT_RULE]++) + pfr_setflags_ktable(kt, kt->pfrkt_flags|PFR_TFLAG_REFERENCED); + return (kt); +} + +void +pfr_detach_table(struct pfr_ktable *kt) +{ + if (kt->pfrkt_refcnt[PFR_REFCNT_RULE] <= 0) + printf("pfr_detach_table: refcount = %d.\n", + kt->pfrkt_refcnt[PFR_REFCNT_RULE]); + else if (!--kt->pfrkt_refcnt[PFR_REFCNT_RULE]) + pfr_setflags_ktable(kt, kt->pfrkt_flags&~PFR_TFLAG_REFERENCED); +} + + +int +pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter, + struct pf_addr **raddr, struct pf_addr **rmask, sa_family_t af) +{ + struct pfr_kentry *ke, *ke2 = NULL; + struct pf_addr *addr = NULL; + union sockaddr_union mask; + int idx = -1, use_counter = 0; + + if (af == AF_INET) + addr = (struct pf_addr *)&pfr_sin.sin_addr; + else if (af == AF_INET6) + addr = (struct pf_addr *)&pfr_sin6.sin6_addr; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (-1); + + if (pidx != NULL) + idx = *pidx; + if (counter != NULL && idx >= 0) + use_counter = 1; + if (idx < 0) + idx = 0; + +_next_block: + ke = pfr_kentry_byidx(kt, idx, af); + if (ke == NULL) + return (1); + pfr_prepare_network(&pfr_mask, af, ke->pfrke_net); + *raddr = SUNION2PF(&ke->pfrke_sa, af); + *rmask = SUNION2PF(&pfr_mask, af); + + if (use_counter) { + /* is supplied address within block? 
*/ + if (!PF_MATCHA(0, *raddr, *rmask, counter, af)) { + /* no, go to next block in table */ + idx++; + use_counter = 0; + goto _next_block; + } + PF_ACPY(addr, counter, af); + } else { + /* use first address of block */ + PF_ACPY(addr, *raddr, af); + } + + if (!KENTRY_NETWORK(ke)) { + /* this is a single IP address - no possible nested block */ + PF_ACPY(counter, addr, af); + *pidx = idx; + return (0); + } + for (;;) { + /* we don't want to use a nested block */ + if (af == AF_INET) + ke2 = (struct pfr_kentry *)rn_match(&pfr_sin, + kt->pfrkt_ip4); + else if (af == AF_INET6) + ke2 = (struct pfr_kentry *)rn_match(&pfr_sin6, + kt->pfrkt_ip6); + /* no need to check KENTRY_RNF_ROOT() here */ + if (ke2 == ke) { + /* lookup return the same block - perfect */ + PF_ACPY(counter, addr, af); + *pidx = idx; + return (0); + } + + /* we need to increase the counter past the nested block */ + pfr_prepare_network(&mask, AF_INET, ke2->pfrke_net); + PF_POOLMASK(addr, addr, SUNION2PF(&mask, af), &pfr_ffaddr, af); + PF_AINC(addr, af); + if (!PF_MATCHA(0, *raddr, *rmask, addr, af)) { + /* ok, we reached the end of our main block */ + /* go to next block in table */ + idx++; + use_counter = 0; + goto _next_block; + } + } +} + +struct pfr_kentry * +pfr_kentry_byidx(struct pfr_ktable *kt, int idx, int af) +{ + struct pfr_walktree w; + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_POOL_GET; + w.pfrw_cnt = idx; + + switch (af) { +#ifdef INET + case AF_INET: +#ifdef __FreeBSD__ + kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w); +#else + rn_walktree(kt->pfrkt_ip4, pfr_walktree, &w); +#endif + return (w.pfrw_kentry); +#endif /* INET */ +#ifdef INET6 + case AF_INET6: +#ifdef __FreeBSD__ + kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, &w); +#else + rn_walktree(kt->pfrkt_ip6, pfr_walktree, &w); +#endif + return (w.pfrw_kentry); +#endif /* INET6 */ + default: + return (NULL); + } +} + +void +pfr_dynaddr_update(struct pfr_ktable *kt, struct pfi_dynaddr *dyn) +{ + struct pfr_walktree w; + int s; + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_DYNADDR_UPDATE; + w.pfrw_dyn = dyn; + + s = splsoftnet(); + dyn->pfid_acnt4 = 0; + dyn->pfid_acnt6 = 0; + if (!dyn->pfid_af || dyn->pfid_af == AF_INET) +#ifdef __FreeBSD__ + kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w); +#else + rn_walktree(kt->pfrkt_ip4, pfr_walktree, &w); +#endif + if (!dyn->pfid_af || dyn->pfid_af == AF_INET6) +#ifdef __FreeBSD__ + kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, &w); +#else + rn_walktree(kt->pfrkt_ip6, pfr_walktree, &w); +#endif + splx(s); +} diff --git a/contrib/pf/rtems/freebsd/net/pfvar.h b/contrib/pf/rtems/freebsd/net/pfvar.h new file mode 100644 index 00000000..d0c0ced0 --- /dev/null +++ b/contrib/pf/rtems/freebsd/net/pfvar.h @@ -0,0 +1,1866 @@ +/* $FreeBSD$ */ +/* $OpenBSD: pfvar.h,v 1.244 2007/02/23 21:31:51 deraadt Exp $ */ + +/* + * Copyright (c) 2001 Daniel Hartmeier + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _NET_PFVAR_HH_ +#define _NET_PFVAR_HH_ + +#include +#include +#include +#include +#ifdef __FreeBSD__ +#include +#include +#else +#include +#endif + +#include +#include +#ifdef __FreeBSD__ +#include +#include +#include +#else +#include +#endif + +#ifdef __FreeBSD__ +#include +#endif + +#include + +struct ip; +struct ip6_hdr; +#ifdef __FreeBSD__ +struct inpcb; +#endif + +#define PF_TCPS_PROXY_SRC ((TCP_NSTATES)+0) +#define PF_TCPS_PROXY_DST ((TCP_NSTATES)+1) + +#define PF_MD5_DIGEST_LENGTH 16 +#ifdef MD5_DIGEST_LENGTH +#if PF_MD5_DIGEST_LENGTH != MD5_DIGEST_LENGTH +#error +#endif +#endif + +enum { PF_INOUT, PF_IN, PF_OUT }; +enum { PF_LAN_EXT, PF_EXT_GWY, PF_ID }; +enum { PF_PASS, PF_DROP, PF_SCRUB, PF_NOSCRUB, PF_NAT, PF_NONAT, + PF_BINAT, PF_NOBINAT, PF_RDR, PF_NORDR, PF_SYNPROXY_DROP }; +enum { PF_RULESET_SCRUB, PF_RULESET_FILTER, PF_RULESET_NAT, + PF_RULESET_BINAT, PF_RULESET_RDR, PF_RULESET_MAX }; +enum { PF_OP_NONE, PF_OP_IRG, PF_OP_EQ, PF_OP_NE, PF_OP_LT, + PF_OP_LE, PF_OP_GT, PF_OP_GE, PF_OP_XRG, PF_OP_RRG }; +enum { PF_DEBUG_NONE, PF_DEBUG_URGENT, PF_DEBUG_MISC, PF_DEBUG_NOISY }; +enum { PF_CHANGE_NONE, PF_CHANGE_ADD_HEAD, PF_CHANGE_ADD_TAIL, + PF_CHANGE_ADD_BEFORE, PF_CHANGE_ADD_AFTER, + PF_CHANGE_REMOVE, PF_CHANGE_GET_TICKET }; +enum { PF_GET_NONE, PF_GET_CLR_CNTR }; + +/* + * Note about PFTM_*: real indices into pf_rule.timeout[] come before + * PFTM_MAX, special cases afterwards. See pf_state_expires(). 
+ */ +enum { PFTM_TCP_FIRST_PACKET, PFTM_TCP_OPENING, PFTM_TCP_ESTABLISHED, + PFTM_TCP_CLOSING, PFTM_TCP_FIN_WAIT, PFTM_TCP_CLOSED, + PFTM_UDP_FIRST_PACKET, PFTM_UDP_SINGLE, PFTM_UDP_MULTIPLE, + PFTM_ICMP_FIRST_PACKET, PFTM_ICMP_ERROR_REPLY, + PFTM_OTHER_FIRST_PACKET, PFTM_OTHER_SINGLE, + PFTM_OTHER_MULTIPLE, PFTM_FRAG, PFTM_INTERVAL, + PFTM_ADAPTIVE_START, PFTM_ADAPTIVE_END, PFTM_SRC_NODE, + PFTM_TS_DIFF, PFTM_MAX, PFTM_PURGE, PFTM_UNLINKED, + PFTM_UNTIL_PACKET }; + +/* PFTM default values */ +#define PFTM_TCP_FIRST_PACKET_VAL 120 /* First TCP packet */ +#define PFTM_TCP_OPENING_VAL 30 /* No response yet */ +#define PFTM_TCP_ESTABLISHED_VAL 24*60*60/* Established */ +#define PFTM_TCP_CLOSING_VAL 15 * 60 /* Half closed */ +#define PFTM_TCP_FIN_WAIT_VAL 45 /* Got both FINs */ +#define PFTM_TCP_CLOSED_VAL 90 /* Got a RST */ +#define PFTM_UDP_FIRST_PACKET_VAL 60 /* First UDP packet */ +#define PFTM_UDP_SINGLE_VAL 30 /* Unidirectional */ +#define PFTM_UDP_MULTIPLE_VAL 60 /* Bidirectional */ +#define PFTM_ICMP_FIRST_PACKET_VAL 20 /* First ICMP packet */ +#define PFTM_ICMP_ERROR_REPLY_VAL 10 /* Got error response */ +#define PFTM_OTHER_FIRST_PACKET_VAL 60 /* First packet */ +#define PFTM_OTHER_SINGLE_VAL 30 /* Unidirectional */ +#define PFTM_OTHER_MULTIPLE_VAL 60 /* Bidirectional */ +#define PFTM_FRAG_VAL 30 /* Fragment expire */ +#define PFTM_INTERVAL_VAL 10 /* Expire interval */ +#define PFTM_SRC_NODE_VAL 0 /* Source tracking */ +#define PFTM_TS_DIFF_VAL 30 /* Allowed TS diff */ + +enum { PF_NOPFROUTE, PF_FASTROUTE, PF_ROUTETO, PF_DUPTO, PF_REPLYTO }; +enum { PF_LIMIT_STATES, PF_LIMIT_SRC_NODES, PF_LIMIT_FRAGS, + PF_LIMIT_TABLES, PF_LIMIT_TABLE_ENTRIES, PF_LIMIT_MAX }; +#define PF_POOL_IDMASK 0x0f +enum { PF_POOL_NONE, PF_POOL_BITMASK, PF_POOL_RANDOM, + PF_POOL_SRCHASH, PF_POOL_ROUNDROBIN }; +enum { PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, PF_ADDR_DYNIFTL, + PF_ADDR_TABLE, PF_ADDR_RTLABEL, PF_ADDR_URPFFAILED }; +#define PF_POOL_TYPEMASK 0x0f +#define PF_POOL_STICKYADDR 0x20 +#define PF_WSCALE_FLAG 0x80 +#define PF_WSCALE_MASK 0x0f + +#define PF_LOG 0x01 +#define PF_LOG_ALL 0x02 +#define PF_LOG_SOCKET_LOOKUP 0x04 + +struct pf_addr { + union { + struct in_addr v4; + struct in6_addr v6; + u_int8_t addr8[16]; + u_int16_t addr16[8]; + u_int32_t addr32[4]; + } pfa; /* 128-bit address */ +#define v4 pfa.v4 +#define v6 pfa.v6 +#define addr8 pfa.addr8 +#define addr16 pfa.addr16 +#define addr32 pfa.addr32 +}; + +#define PF_TABLE_NAME_SIZE 32 + +#define PFI_AFLAG_NETWORK 0x01 +#define PFI_AFLAG_BROADCAST 0x02 +#define PFI_AFLAG_PEER 0x04 +#define PFI_AFLAG_MODEMASK 0x07 +#define PFI_AFLAG_NOALIAS 0x08 + +struct pf_addr_wrap { + union { + struct { + struct pf_addr addr; + struct pf_addr mask; + } a; + char ifname[IFNAMSIZ]; + char tblname[PF_TABLE_NAME_SIZE]; +#ifdef __FreeBSD__ +#define RTLABEL_LEN 32 +#endif + char rtlabelname[RTLABEL_LEN]; + u_int32_t rtlabel; + } v; + union { + struct pfi_dynaddr *dyn; + struct pfr_ktable *tbl; + int dyncnt; + int tblcnt; + } p; + u_int8_t type; /* PF_ADDR_* */ + u_int8_t iflags; /* PFI_AFLAG_* */ +}; + +#ifdef _KERNEL + +struct pfi_dynaddr { + TAILQ_ENTRY(pfi_dynaddr) entry; + struct pf_addr pfid_addr4; + struct pf_addr pfid_mask4; + struct pf_addr pfid_addr6; + struct pf_addr pfid_mask6; + struct pfr_ktable *pfid_kt; + struct pfi_kif *pfid_kif; + void *pfid_hook_cookie; + int pfid_net; /* mask or 128 */ + int pfid_acnt4; /* address count IPv4 */ + int pfid_acnt6; /* address count IPv6 */ + sa_family_t pfid_af; /* rule af */ + u_int8_t pfid_iflags; /* PFI_AFLAG_* */ +}; + 
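/*
 * [Editor's note: illustrative sketch, not part of the original
 * patch.] struct pf_addr above overlays an in_addr, an in6_addr and
 * raw 8/16/32-bit views of the same 128 bits; an IPv4 address only
 * occupies addr32[0]. That layout is what lets the PF_AEQ/PF_ANEQ/
 * PF_AZERO macros defined below compare addresses as at most four
 * 32-bit words:
 *
 *	struct pf_addr a, b;
 *
 *	if (PF_AEQ(&a, &b, AF_INET))	// compares addr32[0] only
 *		handle_match();
 *	if (PF_AEQ(&a, &b, AF_INET6))	// compares all four words
 *		handle_match();
 *
 * (handle_match() is a placeholder, not a pf function.)
 */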
+/* + * Address manipulation macros + */ + +#ifdef __FreeBSD__ +#define splsoftnet() splnet() + +#define HTONL(x) (x) = htonl((__uint32_t)(x)) +#define HTONS(x) (x) = htons((__uint16_t)(x)) +#define NTOHL(x) (x) = ntohl((__uint32_t)(x)) +#define NTOHS(x) (x) = ntohs((__uint16_t)(x)) + +#define PF_NAME "pf" + +#define PR_NOWAIT M_NOWAIT +#define pool_get(p, f) uma_zalloc(*(p), (f)) +#define pool_put(p, o) uma_zfree(*(p), (o)) + +#define UMA_CREATE(var, type, desc) \ + var = uma_zcreate(desc, sizeof(type), \ + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); \ + if (var == NULL) break +#define UMA_DESTROY(var) \ + if(var) uma_zdestroy(var) + +extern struct mtx pf_task_mtx; + +#define PF_ASSERT(h) mtx_assert(&pf_task_mtx, (h)) + +#define PF_LOCK() do { \ + PF_ASSERT(MA_NOTOWNED); \ + mtx_lock(&pf_task_mtx); \ +} while(0) +#define PF_UNLOCK() do { \ + PF_ASSERT(MA_OWNED); \ + mtx_unlock(&pf_task_mtx); \ +} while(0) + +#define PF_COPYIN(uaddr, kaddr, len, r) do { \ + PF_UNLOCK(); \ + r = copyin((uaddr), (kaddr), (len)); \ + PF_LOCK(); \ +} while(0) + +#define PF_COPYOUT(kaddr, uaddr, len, r) do { \ + PF_UNLOCK(); \ + r = copyout((kaddr), (uaddr), (len)); \ + PF_LOCK(); \ +} while(0) + +extern void init_pf_mutex(void); +extern void destroy_pf_mutex(void); + +#define PF_MODVER 1 +#define PFLOG_MODVER 1 +#define PFSYNC_MODVER 1 + +#define PFLOG_MINVER 1 +#define PFLOG_PREFVER PFLOG_MODVER +#define PFLOG_MAXVER 1 +#define PFSYNC_MINVER 1 +#define PFSYNC_PREFVER PFSYNC_MODVER +#define PFSYNC_MAXVER 1 +#endif /* __FreeBSD__ */ + +#ifdef INET +#ifndef INET6 +#define PF_INET_ONLY +#endif /* ! INET6 */ +#endif /* INET */ + +#ifdef INET6 +#ifndef INET +#define PF_INET6_ONLY +#endif /* ! INET */ +#endif /* INET6 */ + +#ifdef INET +#ifdef INET6 +#define PF_INET_INET6 +#endif /* INET6 */ +#endif /* INET */ + +#else + +#define PF_INET_INET6 + +#endif /* _KERNEL */ + +/* Both IPv4 and IPv6 */ +#ifdef PF_INET_INET6 + +#define PF_AEQ(a, b, c) \ + ((c == AF_INET && (a)->addr32[0] == (b)->addr32[0]) || \ + ((a)->addr32[3] == (b)->addr32[3] && \ + (a)->addr32[2] == (b)->addr32[2] && \ + (a)->addr32[1] == (b)->addr32[1] && \ + (a)->addr32[0] == (b)->addr32[0])) \ + +#define PF_ANEQ(a, b, c) \ + ((c == AF_INET && (a)->addr32[0] != (b)->addr32[0]) || \ + ((a)->addr32[3] != (b)->addr32[3] || \ + (a)->addr32[2] != (b)->addr32[2] || \ + (a)->addr32[1] != (b)->addr32[1] || \ + (a)->addr32[0] != (b)->addr32[0])) \ + +#define PF_AZERO(a, c) \ + ((c == AF_INET && !(a)->addr32[0]) || \ + (!(a)->addr32[0] && !(a)->addr32[1] && \ + !(a)->addr32[2] && !(a)->addr32[3] )) \ + +#define PF_MATCHA(n, a, m, b, f) \ + pf_match_addr(n, a, m, b, f) + +#define PF_ACPY(a, b, f) \ + pf_addrcpy(a, b, f) + +#define PF_AINC(a, f) \ + pf_addr_inc(a, f) + +#define PF_POOLMASK(a, b, c, d, f) \ + pf_poolmask(a, b, c, d, f) + +#else + +/* Just IPv6 */ + +#ifdef PF_INET6_ONLY + +#define PF_AEQ(a, b, c) \ + ((a)->addr32[3] == (b)->addr32[3] && \ + (a)->addr32[2] == (b)->addr32[2] && \ + (a)->addr32[1] == (b)->addr32[1] && \ + (a)->addr32[0] == (b)->addr32[0]) \ + +#define PF_ANEQ(a, b, c) \ + ((a)->addr32[3] != (b)->addr32[3] || \ + (a)->addr32[2] != (b)->addr32[2] || \ + (a)->addr32[1] != (b)->addr32[1] || \ + (a)->addr32[0] != (b)->addr32[0]) \ + +#define PF_AZERO(a, c) \ + (!(a)->addr32[0] && \ + !(a)->addr32[1] && \ + !(a)->addr32[2] && \ + !(a)->addr32[3] ) \ + +#define PF_MATCHA(n, a, m, b, f) \ + pf_match_addr(n, a, m, b, f) + +#define PF_ACPY(a, b, f) \ + pf_addrcpy(a, b, f) + +#define PF_AINC(a, f) \ + pf_addr_inc(a, f) + +#define PF_POOLMASK(a, 
b, c, d, f) \ + pf_poolmask(a, b, c, d, f) + +#else + +/* Just IPv4 */ +#ifdef PF_INET_ONLY + +#define PF_AEQ(a, b, c) \ + ((a)->addr32[0] == (b)->addr32[0]) + +#define PF_ANEQ(a, b, c) \ + ((a)->addr32[0] != (b)->addr32[0]) + +#define PF_AZERO(a, c) \ + (!(a)->addr32[0]) + +#define PF_MATCHA(n, a, m, b, f) \ + pf_match_addr(n, a, m, b, f) + +#define PF_ACPY(a, b, f) \ + (a)->v4.s_addr = (b)->v4.s_addr + +#define PF_AINC(a, f) \ + do { \ + (a)->addr32[0] = htonl(ntohl((a)->addr32[0]) + 1); \ + } while (0) + +#define PF_POOLMASK(a, b, c, d, f) \ + do { \ + (a)->addr32[0] = ((b)->addr32[0] & (c)->addr32[0]) | \ + (((c)->addr32[0] ^ 0xffffffff ) & (d)->addr32[0]); \ + } while (0) + +#endif /* PF_INET_ONLY */ +#endif /* PF_INET6_ONLY */ +#endif /* PF_INET_INET6 */ + +#define PF_MISMATCHAW(aw, x, af, neg, ifp) \ + ( \ + (((aw)->type == PF_ADDR_NOROUTE && \ + pf_routable((x), (af), NULL)) || \ + (((aw)->type == PF_ADDR_URPFFAILED && (ifp) != NULL && \ + pf_routable((x), (af), (ifp))) || \ + ((aw)->type == PF_ADDR_RTLABEL && \ + !pf_rtlabel_match((x), (af), (aw))) || \ + ((aw)->type == PF_ADDR_TABLE && \ + !pfr_match_addr((aw)->p.tbl, (x), (af))) || \ + ((aw)->type == PF_ADDR_DYNIFTL && \ + !pfi_match_addr((aw)->p.dyn, (x), (af))) || \ + ((aw)->type == PF_ADDR_ADDRMASK && \ + !PF_AZERO(&(aw)->v.a.mask, (af)) && \ + !PF_MATCHA(0, &(aw)->v.a.addr, \ + &(aw)->v.a.mask, (x), (af))))) != \ + (neg) \ + ) + + +struct pf_rule_uid { + uid_t uid[2]; + u_int8_t op; +}; + +struct pf_rule_gid { + uid_t gid[2]; + u_int8_t op; +}; + +struct pf_rule_addr { + struct pf_addr_wrap addr; + u_int16_t port[2]; + u_int8_t neg; + u_int8_t port_op; +}; + +struct pf_pooladdr { + struct pf_addr_wrap addr; + TAILQ_ENTRY(pf_pooladdr) entries; + char ifname[IFNAMSIZ]; + struct pfi_kif *kif; +}; + +TAILQ_HEAD(pf_palist, pf_pooladdr); + +struct pf_poolhashkey { + union { + u_int8_t key8[16]; + u_int16_t key16[8]; + u_int32_t key32[4]; + } pfk; /* 128-bit hash key */ +#define key8 pfk.key8 +#define key16 pfk.key16 +#define key32 pfk.key32 +}; + +struct pf_pool { + struct pf_palist list; + struct pf_pooladdr *cur; + struct pf_poolhashkey key; + struct pf_addr counter; + int tblidx; + u_int16_t proxy_port[2]; + u_int8_t port_op; + u_int8_t opts; +}; + + +/* A packed Operating System description for fingerprinting */ +typedef u_int32_t pf_osfp_t; +#define PF_OSFP_ANY ((pf_osfp_t)0) +#define PF_OSFP_UNKNOWN ((pf_osfp_t)-1) +#define PF_OSFP_NOMATCH ((pf_osfp_t)-2) + +struct pf_osfp_entry { + SLIST_ENTRY(pf_osfp_entry) fp_entry; + pf_osfp_t fp_os; + int fp_enflags; +#define PF_OSFP_EXPANDED 0x001 /* expanded entry */ +#define PF_OSFP_GENERIC 0x002 /* generic signature */ +#define PF_OSFP_NODETAIL 0x004 /* no p0f details */ +#define PF_OSFP_LEN 32 + char fp_class_nm[PF_OSFP_LEN]; + char fp_version_nm[PF_OSFP_LEN]; + char fp_subtype_nm[PF_OSFP_LEN]; +}; +#define PF_OSFP_ENTRY_EQ(a, b) \ + ((a)->fp_os == (b)->fp_os && \ + memcmp((a)->fp_class_nm, (b)->fp_class_nm, PF_OSFP_LEN) == 0 && \ + memcmp((a)->fp_version_nm, (b)->fp_version_nm, PF_OSFP_LEN) == 0 && \ + memcmp((a)->fp_subtype_nm, (b)->fp_subtype_nm, PF_OSFP_LEN) == 0) + +/* handle pf_osfp_t packing */ +#define _FP_RESERVED_BIT 1 /* For the special negative #defines */ +#define _FP_UNUSED_BITS 1 +#define _FP_CLASS_BITS 10 /* OS Class (Windows, Linux) */ +#define _FP_VERSION_BITS 10 /* OS version (95, 98, NT, 2.4.54, 3.2) */ +#define _FP_SUBTYPE_BITS 10 /* patch level (NT SP4, SP3, ECN patch) */ +#define PF_OSFP_UNPACK(osfp, class, version, subtype) do { \ + (class) = ((osfp) >> 
(_FP_VERSION_BITS+_FP_SUBTYPE_BITS)) & \ + ((1 << _FP_CLASS_BITS) - 1); \ + (version) = ((osfp) >> _FP_SUBTYPE_BITS) & \ + ((1 << _FP_VERSION_BITS) - 1);\ + (subtype) = (osfp) & ((1 << _FP_SUBTYPE_BITS) - 1); \ +} while(0) +#define PF_OSFP_PACK(osfp, class, version, subtype) do { \ + (osfp) = ((class) & ((1 << _FP_CLASS_BITS) - 1)) << (_FP_VERSION_BITS \ + + _FP_SUBTYPE_BITS); \ + (osfp) |= ((version) & ((1 << _FP_VERSION_BITS) - 1)) << \ + _FP_SUBTYPE_BITS; \ + (osfp) |= (subtype) & ((1 << _FP_SUBTYPE_BITS) - 1); \ +} while(0) + +/* the fingerprint of an OSes TCP SYN packet */ +typedef u_int64_t pf_tcpopts_t; +struct pf_os_fingerprint { + SLIST_HEAD(pf_osfp_enlist, pf_osfp_entry) fp_oses; /* list of matches */ + pf_tcpopts_t fp_tcpopts; /* packed TCP options */ + u_int16_t fp_wsize; /* TCP window size */ + u_int16_t fp_psize; /* ip->ip_len */ + u_int16_t fp_mss; /* TCP MSS */ + u_int16_t fp_flags; +#define PF_OSFP_WSIZE_MOD 0x0001 /* Window modulus */ +#define PF_OSFP_WSIZE_DC 0x0002 /* Window don't care */ +#define PF_OSFP_WSIZE_MSS 0x0004 /* Window multiple of MSS */ +#define PF_OSFP_WSIZE_MTU 0x0008 /* Window multiple of MTU */ +#define PF_OSFP_PSIZE_MOD 0x0010 /* packet size modulus */ +#define PF_OSFP_PSIZE_DC 0x0020 /* packet size don't care */ +#define PF_OSFP_WSCALE 0x0040 /* TCP window scaling */ +#define PF_OSFP_WSCALE_MOD 0x0080 /* TCP window scale modulus */ +#define PF_OSFP_WSCALE_DC 0x0100 /* TCP window scale dont-care */ +#define PF_OSFP_MSS 0x0200 /* TCP MSS */ +#define PF_OSFP_MSS_MOD 0x0400 /* TCP MSS modulus */ +#define PF_OSFP_MSS_DC 0x0800 /* TCP MSS dont-care */ +#define PF_OSFP_DF 0x1000 /* IPv4 don't fragment bit */ +#define PF_OSFP_TS0 0x2000 /* Zero timestamp */ +#define PF_OSFP_INET6 0x4000 /* IPv6 */ + u_int8_t fp_optcnt; /* TCP option count */ + u_int8_t fp_wscale; /* TCP window scaling */ + u_int8_t fp_ttl; /* IPv4 TTL */ +#define PF_OSFP_MAXTTL_OFFSET 40 +/* TCP options packing */ +#define PF_OSFP_TCPOPT_NOP 0x0 /* TCP NOP option */ +#define PF_OSFP_TCPOPT_WSCALE 0x1 /* TCP window scaling option */ +#define PF_OSFP_TCPOPT_MSS 0x2 /* TCP max segment size opt */ +#define PF_OSFP_TCPOPT_SACK 0x3 /* TCP SACK OK option */ +#define PF_OSFP_TCPOPT_TS 0x4 /* TCP timestamp option */ +#define PF_OSFP_TCPOPT_BITS 3 /* bits used by each option */ +#define PF_OSFP_MAX_OPTS \ + (sizeof(((struct pf_os_fingerprint *)0)->fp_tcpopts) * 8) \ + / PF_OSFP_TCPOPT_BITS + + SLIST_ENTRY(pf_os_fingerprint) fp_next; +}; + +struct pf_osfp_ioctl { + struct pf_osfp_entry fp_os; + pf_tcpopts_t fp_tcpopts; /* packed TCP options */ + u_int16_t fp_wsize; /* TCP window size */ + u_int16_t fp_psize; /* ip->ip_len */ + u_int16_t fp_mss; /* TCP MSS */ + u_int16_t fp_flags; + u_int8_t fp_optcnt; /* TCP option count */ + u_int8_t fp_wscale; /* TCP window scaling */ + u_int8_t fp_ttl; /* IPv4 TTL */ + + int fp_getnum; /* DIOCOSFPGET number */ +}; + + +union pf_rule_ptr { + struct pf_rule *ptr; + u_int32_t nr; +}; + +#define PF_ANCHOR_NAME_SIZE 64 + +struct pf_rule { + struct pf_rule_addr src; + struct pf_rule_addr dst; +#define PF_SKIP_IFP 0 +#define PF_SKIP_DIR 1 +#define PF_SKIP_AF 2 +#define PF_SKIP_PROTO 3 +#define PF_SKIP_SRC_ADDR 4 +#define PF_SKIP_SRC_PORT 5 +#define PF_SKIP_DST_ADDR 6 +#define PF_SKIP_DST_PORT 7 +#define PF_SKIP_COUNT 8 + union pf_rule_ptr skip[PF_SKIP_COUNT]; +#define PF_RULE_LABEL_SIZE 64 + char label[PF_RULE_LABEL_SIZE]; +#define PF_QNAME_SIZE 64 + char ifname[IFNAMSIZ]; + char qname[PF_QNAME_SIZE]; + char pqname[PF_QNAME_SIZE]; +#define PF_TAG_NAME_SIZE 64 + char 
tagname[PF_TAG_NAME_SIZE]; + char match_tagname[PF_TAG_NAME_SIZE]; + + char overload_tblname[PF_TABLE_NAME_SIZE]; + + TAILQ_ENTRY(pf_rule) entries; + struct pf_pool rpool; + + u_int64_t evaluations; + u_int64_t packets[2]; + u_int64_t bytes[2]; + + struct pfi_kif *kif; + struct pf_anchor *anchor; + struct pfr_ktable *overload_tbl; + + pf_osfp_t os_fingerprint; + + int rtableid; + u_int32_t timeout[PFTM_MAX]; + u_int32_t states; + u_int32_t max_states; + u_int32_t src_nodes; + u_int32_t max_src_nodes; + u_int32_t max_src_states; + u_int32_t spare1; /* netgraph */ + u_int32_t max_src_conn; + struct { + u_int32_t limit; + u_int32_t seconds; + } max_src_conn_rate; + u_int32_t qid; + u_int32_t pqid; + u_int32_t rt_listid; + u_int32_t nr; + u_int32_t prob; + uid_t cuid; + pid_t cpid; + + u_int16_t return_icmp; + u_int16_t return_icmp6; + u_int16_t max_mss; + u_int16_t tag; + u_int16_t match_tag; + u_int16_t spare2; /* netgraph */ + + struct pf_rule_uid uid; + struct pf_rule_gid gid; + + u_int32_t rule_flag; + u_int8_t action; + u_int8_t direction; + u_int8_t log; + u_int8_t logif; + u_int8_t quick; + u_int8_t ifnot; + u_int8_t match_tag_not; + u_int8_t natpass; + +#define PF_STATE_NORMAL 0x1 +#define PF_STATE_MODULATE 0x2 +#define PF_STATE_SYNPROXY 0x3 + u_int8_t keep_state; + sa_family_t af; + u_int8_t proto; + u_int8_t type; + u_int8_t code; + u_int8_t flags; + u_int8_t flagset; + u_int8_t min_ttl; + u_int8_t allow_opts; + u_int8_t rt; + u_int8_t return_ttl; + u_int8_t tos; + u_int8_t anchor_relative; + u_int8_t anchor_wildcard; + +#define PF_FLUSH 0x01 +#define PF_FLUSH_GLOBAL 0x02 + u_int8_t flush; +}; + +/* rule flags */ +#define PFRULE_DROP 0x0000 +#define PFRULE_RETURNRST 0x0001 +#define PFRULE_FRAGMENT 0x0002 +#define PFRULE_RETURNICMP 0x0004 +#define PFRULE_RETURN 0x0008 +#define PFRULE_NOSYNC 0x0010 +#define PFRULE_SRCTRACK 0x0020 /* track source states */ +#define PFRULE_RULESRCTRACK 0x0040 /* per rule */ + +/* scrub flags */ +#define PFRULE_NODF 0x0100 +#define PFRULE_FRAGCROP 0x0200 /* non-buffering frag cache */ +#define PFRULE_FRAGDROP 0x0400 /* drop funny fragments */ +#define PFRULE_RANDOMID 0x0800 +#define PFRULE_REASSEMBLE_TCP 0x1000 + +/* rule flags again */ +#define PFRULE_IFBOUND 0x00010000 /* if-bound */ +#define PFRULE_STATESLOPPY 0x00020000 /* sloppy state tracking */ + +#define PFSTATE_HIWAT 10000 /* default state table size */ +#define PFSTATE_ADAPT_START 6000 /* default adaptive timeout start */ +#define PFSTATE_ADAPT_END 12000 /* default adaptive timeout end */ + + +struct pf_threshold { + u_int32_t limit; +#define PF_THRESHOLD_MULT 1000 +#define PF_THRESHOLD_MAX 0xffffffff / PF_THRESHOLD_MULT + u_int32_t seconds; + u_int32_t count; + u_int32_t last; +}; + +struct pf_src_node { + RB_ENTRY(pf_src_node) entry; + struct pf_addr addr; + struct pf_addr raddr; + union pf_rule_ptr rule; + struct pfi_kif *kif; + u_int64_t bytes[2]; + u_int64_t packets[2]; + u_int32_t states; + u_int32_t conn; + struct pf_threshold conn_rate; + u_int32_t creation; + u_int32_t expire; + sa_family_t af; + u_int8_t ruletype; +}; + +#define PFSNODE_HIWAT 10000 /* default source node table size */ + +struct pf_state_scrub { + struct timeval pfss_last; /* time received last packet */ + u_int32_t pfss_tsecr; /* last echoed timestamp */ + u_int32_t pfss_tsval; /* largest timestamp */ + u_int32_t pfss_tsval0; /* original timestamp */ + u_int16_t pfss_flags; +#define PFSS_TIMESTAMP 0x0001 /* modulate timestamp */ +#define PFSS_PAWS 0x0010 /* stricter PAWS checks */ +#define PFSS_PAWS_IDLED 0x0020 /* 
was idle too long. no PAWS */ +#define PFSS_DATA_TS 0x0040 /* timestamp on data packets */ +#define PFSS_DATA_NOTS 0x0080 /* no timestamp on data packets */ + u_int8_t pfss_ttl; /* stashed TTL */ + u_int8_t pad; + u_int32_t pfss_ts_mod; /* timestamp modulation */ +}; + +struct pf_state_host { + struct pf_addr addr; + u_int16_t port; + u_int16_t pad; +}; + +struct pf_state_peer { + u_int32_t seqlo; /* Max sequence number sent */ + u_int32_t seqhi; /* Max the other end ACKd + win */ + u_int32_t seqdiff; /* Sequence number modulator */ + u_int16_t max_win; /* largest window (pre scaling) */ + u_int8_t state; /* active state level */ + u_int8_t wscale; /* window scaling factor */ + u_int16_t mss; /* Maximum segment size option */ + u_int8_t tcp_est; /* Did we reach TCPS_ESTABLISHED */ + struct pf_state_scrub *scrub; /* state is scrubbed */ + u_int8_t pad[3]; +}; + +TAILQ_HEAD(pf_state_queue, pf_state); + +/* keep synced with struct pf_state, used in RB_FIND */ +struct pf_state_cmp { + u_int64_t id; + u_int32_t creatorid; + struct pf_state_host lan; + struct pf_state_host gwy; + struct pf_state_host ext; + sa_family_t af; + u_int8_t proto; + u_int8_t direction; + u_int8_t pad; +}; + +struct pf_state { + u_int64_t id; + u_int32_t creatorid; + struct pf_state_host lan; + struct pf_state_host gwy; + struct pf_state_host ext; + sa_family_t af; + u_int8_t proto; + u_int8_t direction; +#ifdef __FreeBSD__ + u_int8_t local_flags; +#define PFSTATE_EXPIRING 0x01 +#else + u_int8_t pad; +#endif + u_int8_t log; + u_int8_t state_flags; +#define PFSTATE_ALLOWOPTS 0x01 +#define PFSTATE_SLOPPY 0x02 + u_int8_t timeout; + u_int8_t sync_flags; +#define PFSTATE_NOSYNC 0x01 +#define PFSTATE_FROMSYNC 0x02 +#define PFSTATE_STALE 0x04 + union { + struct { + RB_ENTRY(pf_state) entry_lan_ext; + RB_ENTRY(pf_state) entry_ext_gwy; + RB_ENTRY(pf_state) entry_id; + TAILQ_ENTRY(pf_state) entry_list; + struct pfi_kif *kif; + } s; + char ifname[IFNAMSIZ]; + } u; + struct pf_state_peer src; + struct pf_state_peer dst; + union pf_rule_ptr rule; + union pf_rule_ptr anchor; + union pf_rule_ptr nat_rule; + struct pf_addr rt_addr; + struct pfi_kif *rt_kif; + struct pf_src_node *src_node; + struct pf_src_node *nat_src_node; + u_int64_t packets[2]; + u_int64_t bytes[2]; + u_int32_t creation; + u_int32_t expire; + u_int32_t pfsync_time; + u_int16_t tag; +}; + +TAILQ_HEAD(pf_rulequeue, pf_rule); + +struct pf_anchor; + +struct pf_ruleset { + struct { + struct pf_rulequeue queues[2]; + struct { + struct pf_rulequeue *ptr; + struct pf_rule **ptr_array; + u_int32_t rcount; + u_int32_t ticket; + int open; + } active, inactive; + } rules[PF_RULESET_MAX]; + struct pf_anchor *anchor; + u_int32_t tticket; + int tables; + int topen; +}; + +RB_HEAD(pf_anchor_global, pf_anchor); +RB_HEAD(pf_anchor_node, pf_anchor); +struct pf_anchor { + RB_ENTRY(pf_anchor) entry_global; + RB_ENTRY(pf_anchor) entry_node; + struct pf_anchor *parent; + struct pf_anchor_node children; + char name[PF_ANCHOR_NAME_SIZE]; + char path[MAXPATHLEN]; + struct pf_ruleset ruleset; + int refcnt; /* anchor rules */ + int match; +}; +RB_PROTOTYPE(pf_anchor_global, pf_anchor, entry_global, pf_anchor_compare); +RB_PROTOTYPE(pf_anchor_node, pf_anchor, entry_node, pf_anchor_compare); + +#define PF_RESERVED_ANCHOR "_pf" + +#define PFR_TFLAG_PERSIST 0x00000001 +#define PFR_TFLAG_CONST 0x00000002 +#define PFR_TFLAG_ACTIVE 0x00000004 +#define PFR_TFLAG_INACTIVE 0x00000008 +#define PFR_TFLAG_REFERENCED 0x00000010 +#define PFR_TFLAG_REFDANCHOR 0x00000020 +#define PFR_TFLAG_USRMASK 0x00000003 
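/*
 * [Editor's note: annotation, not part of the original patch.] The
 * mask defined above and the two below simply group the flag bits:
 * USRMASK = PERSIST|CONST (0x03) is what userland may set, SETMASK =
 * ACTIVE|INACTIVE|REFERENCED|REFDANCHOR (0x3c) is what the kernel
 * manages, and ALLMASK (0x3f) is their union. pfr_setflags_ktable()
 * in pf_table.c destroys a ktable once every SETMASK bit is clear.
 */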
+#define PFR_TFLAG_SETMASK 0x0000003C +#define PFR_TFLAG_ALLMASK 0x0000003F + +struct pfr_table { + char pfrt_anchor[MAXPATHLEN]; + char pfrt_name[PF_TABLE_NAME_SIZE]; + u_int32_t pfrt_flags; + u_int8_t pfrt_fback; +}; + +enum { PFR_FB_NONE, PFR_FB_MATCH, PFR_FB_ADDED, PFR_FB_DELETED, + PFR_FB_CHANGED, PFR_FB_CLEARED, PFR_FB_DUPLICATE, + PFR_FB_NOTMATCH, PFR_FB_CONFLICT, PFR_FB_MAX }; + +struct pfr_addr { + union { + struct in_addr _pfra_ip4addr; + struct in6_addr _pfra_ip6addr; + } pfra_u; + u_int8_t pfra_af; + u_int8_t pfra_net; + u_int8_t pfra_not; + u_int8_t pfra_fback; +}; +#define pfra_ip4addr pfra_u._pfra_ip4addr +#define pfra_ip6addr pfra_u._pfra_ip6addr + +enum { PFR_DIR_IN, PFR_DIR_OUT, PFR_DIR_MAX }; +enum { PFR_OP_BLOCK, PFR_OP_PASS, PFR_OP_ADDR_MAX, PFR_OP_TABLE_MAX }; +#define PFR_OP_XPASS PFR_OP_ADDR_MAX + +struct pfr_astats { + struct pfr_addr pfras_a; + u_int64_t pfras_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + u_int64_t pfras_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + long pfras_tzero; +}; + +enum { PFR_REFCNT_RULE, PFR_REFCNT_ANCHOR, PFR_REFCNT_MAX }; + +struct pfr_tstats { + struct pfr_table pfrts_t; + u_int64_t pfrts_packets[PFR_DIR_MAX][PFR_OP_TABLE_MAX]; + u_int64_t pfrts_bytes[PFR_DIR_MAX][PFR_OP_TABLE_MAX]; + u_int64_t pfrts_match; + u_int64_t pfrts_nomatch; + long pfrts_tzero; + int pfrts_cnt; + int pfrts_refcnt[PFR_REFCNT_MAX]; +}; +#define pfrts_name pfrts_t.pfrt_name +#define pfrts_flags pfrts_t.pfrt_flags + +#ifndef _SOCKADDR_UNION_DEFINED +#define _SOCKADDR_UNION_DEFINED +union sockaddr_union { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; +#endif /* _SOCKADDR_UNION_DEFINED */ + +SLIST_HEAD(pfr_kentryworkq, pfr_kentry); +struct pfr_kentry { + struct radix_node pfrke_node[2]; + union sockaddr_union pfrke_sa; + u_int64_t pfrke_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + u_int64_t pfrke_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + SLIST_ENTRY(pfr_kentry) pfrke_workq; + long pfrke_tzero; + u_int8_t pfrke_af; + u_int8_t pfrke_net; + u_int8_t pfrke_not; + u_int8_t pfrke_mark; + u_int8_t pfrke_intrpool; +}; + +SLIST_HEAD(pfr_ktableworkq, pfr_ktable); +RB_HEAD(pfr_ktablehead, pfr_ktable); +struct pfr_ktable { + struct pfr_tstats pfrkt_ts; + RB_ENTRY(pfr_ktable) pfrkt_tree; + SLIST_ENTRY(pfr_ktable) pfrkt_workq; + struct radix_node_head *pfrkt_ip4; + struct radix_node_head *pfrkt_ip6; + struct pfr_ktable *pfrkt_shadow; + struct pfr_ktable *pfrkt_root; + struct pf_ruleset *pfrkt_rs; + long pfrkt_larg; + int pfrkt_nflags; +}; +#define pfrkt_t pfrkt_ts.pfrts_t +#define pfrkt_name pfrkt_t.pfrt_name +#define pfrkt_anchor pfrkt_t.pfrt_anchor +#define pfrkt_ruleset pfrkt_t.pfrt_ruleset +#define pfrkt_flags pfrkt_t.pfrt_flags +#define pfrkt_cnt pfrkt_ts.pfrts_cnt +#define pfrkt_refcnt pfrkt_ts.pfrts_refcnt +#define pfrkt_packets pfrkt_ts.pfrts_packets +#define pfrkt_bytes pfrkt_ts.pfrts_bytes +#define pfrkt_match pfrkt_ts.pfrts_match +#define pfrkt_nomatch pfrkt_ts.pfrts_nomatch +#define pfrkt_tzero pfrkt_ts.pfrts_tzero + +RB_HEAD(pf_state_tree_lan_ext, pf_state); +RB_PROTOTYPE(pf_state_tree_lan_ext, pf_state, + u.s.entry_lan_ext, pf_state_compare_lan_ext); + +RB_HEAD(pf_state_tree_ext_gwy, pf_state); +RB_PROTOTYPE(pf_state_tree_ext_gwy, pf_state, + u.s.entry_ext_gwy, pf_state_compare_ext_gwy); + +TAILQ_HEAD(pfi_statehead, pfi_kif); +RB_HEAD(pfi_ifhead, pfi_kif); + +/* keep synced with pfi_kif, used in RB_FIND */ +struct pfi_kif_cmp { + char pfik_name[IFNAMSIZ]; +}; + +struct pfi_kif { + char pfik_name[IFNAMSIZ]; + RB_ENTRY(pfi_kif) pfik_tree; + u_int64_t 
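/*
 * [Editor's note: annotation, not part of the original patch; the
 * index meaning is inferred from the pf_test()/pf_test6() callers,
 * which are not part of this hunk.] The counters that follow appear
 * to be indexed [af][dir][action], i.e.
 * [0 = IPv4, 1 = IPv6][0 = in, 1 = out][0 = pass, 1 = block].
 */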
pfik_packets[2][2][2]; + u_int64_t pfik_bytes[2][2][2]; + u_int32_t pfik_tzero; + int pfik_flags; + struct pf_state_tree_lan_ext pfik_lan_ext; + struct pf_state_tree_ext_gwy pfik_ext_gwy; + TAILQ_ENTRY(pfi_kif) pfik_w_states; +#ifndef __FreeBSD__ + void *pfik_ah_cookie; +#endif + struct ifnet *pfik_ifp; + struct ifg_group *pfik_group; + int pfik_states; + int pfik_rules; + TAILQ_HEAD(, pfi_dynaddr) pfik_dynaddrs; +}; + +enum pfi_kif_refs { + PFI_KIF_REF_NONE, + PFI_KIF_REF_STATE, + PFI_KIF_REF_RULE +}; + +#define PFI_IFLAG_SKIP 0x0100 /* skip filtering on interface */ +/* XXX: revisist */ +#define PFI_IFLAG_SETABLE_MASK 0x0100 /* setable via DIOC{SET,CLR}IFFLAG */ +#define PFI_IFLAG_PLACEHOLDER 0x8000 /* placeholder group/interface */ + +struct pf_pdesc { + struct { + int done; + uid_t uid; + gid_t gid; + pid_t pid; + } lookup; + u_int64_t tot_len; /* Make Mickey money */ + union { + struct tcphdr *tcp; + struct udphdr *udp; + struct icmp *icmp; +#ifdef INET6 + struct icmp6_hdr *icmp6; +#endif /* INET6 */ + void *any; + } hdr; + struct pf_addr baddr; /* address before translation */ + struct pf_addr naddr; /* address after translation */ + struct pf_rule *nat_rule; /* nat/rdr rule applied to packet */ + struct pf_addr *src; + struct pf_addr *dst; + struct ether_header + *eh; + struct pf_mtag *pf_mtag; + u_int16_t *ip_sum; + u_int32_t p_len; /* total length of payload */ + u_int16_t flags; /* Let SCRUB trigger behavior in + * state code. Easier than tags */ +#define PFDESC_TCP_NORM 0x0001 /* TCP shall be statefully scrubbed */ +#define PFDESC_IP_REAS 0x0002 /* IP frags would've been reassembled */ + sa_family_t af; + u_int8_t proto; + u_int8_t tos; +}; + +/* flags for RDR options */ +#define PF_DPORT_RANGE 0x01 /* Dest port uses range */ +#define PF_RPORT_RANGE 0x02 /* RDR'ed port uses range */ + +/* Reasons code for passing/dropping a packet */ +#define PFRES_MATCH 0 /* Explicit match of a rule */ +#define PFRES_BADOFF 1 /* Bad offset for pull_hdr */ +#define PFRES_FRAG 2 /* Dropping following fragment */ +#define PFRES_SHORT 3 /* Dropping short packet */ +#define PFRES_NORM 4 /* Dropping by normalizer */ +#define PFRES_MEMORY 5 /* Dropped due to lacking mem */ +#define PFRES_TS 6 /* Bad TCP Timestamp (RFC1323) */ +#define PFRES_CONGEST 7 /* Congestion (of ipintrq) */ +#define PFRES_IPOPTIONS 8 /* IP option */ +#define PFRES_PROTCKSUM 9 /* Protocol checksum invalid */ +#define PFRES_BADSTATE 10 /* State mismatch */ +#define PFRES_STATEINS 11 /* State insertion failure */ +#define PFRES_MAXSTATES 12 /* State limit */ +#define PFRES_SRCLIMIT 13 /* Source node/conn limit */ +#define PFRES_SYNPROXY 14 /* SYN proxy */ +#define PFRES_MAX 15 /* total+1 */ + +#define PFRES_NAMES { \ + "match", \ + "bad-offset", \ + "fragment", \ + "short", \ + "normalize", \ + "memory", \ + "bad-timestamp", \ + "congestion", \ + "ip-option", \ + "proto-cksum", \ + "state-mismatch", \ + "state-insert", \ + "state-limit", \ + "src-limit", \ + "synproxy", \ + NULL \ +} + +/* Counters for other things we want to keep track of */ +#define LCNT_STATES 0 /* states */ +#define LCNT_SRCSTATES 1 /* max-src-states */ +#define LCNT_SRCNODES 2 /* max-src-nodes */ +#define LCNT_SRCCONN 3 /* max-src-conn */ +#define LCNT_SRCCONNRATE 4 /* max-src-conn-rate */ +#define LCNT_OVERLOAD_TABLE 5 /* entry added to overload table */ +#define LCNT_OVERLOAD_FLUSH 6 /* state entries flushed */ +#define LCNT_MAX 7 /* total+1 */ + +#define LCNT_NAMES { \ + "max states per rule", \ + "max-src-states", \ + "max-src-nodes", \ + "max-src-conn", \ 
+ "max-src-conn-rate", \ + "overload table insertion", \ + "overload flush states", \ + NULL \ +} + +/* UDP state enumeration */ +#define PFUDPS_NO_TRAFFIC 0 +#define PFUDPS_SINGLE 1 +#define PFUDPS_MULTIPLE 2 + +#define PFUDPS_NSTATES 3 /* number of state levels */ + +#define PFUDPS_NAMES { \ + "NO_TRAFFIC", \ + "SINGLE", \ + "MULTIPLE", \ + NULL \ +} + +/* Other protocol state enumeration */ +#define PFOTHERS_NO_TRAFFIC 0 +#define PFOTHERS_SINGLE 1 +#define PFOTHERS_MULTIPLE 2 + +#define PFOTHERS_NSTATES 3 /* number of state levels */ + +#define PFOTHERS_NAMES { \ + "NO_TRAFFIC", \ + "SINGLE", \ + "MULTIPLE", \ + NULL \ +} + +#define FCNT_STATE_SEARCH 0 +#define FCNT_STATE_INSERT 1 +#define FCNT_STATE_REMOVALS 2 +#define FCNT_MAX 3 + +#define SCNT_SRC_NODE_SEARCH 0 +#define SCNT_SRC_NODE_INSERT 1 +#define SCNT_SRC_NODE_REMOVALS 2 +#define SCNT_MAX 3 + +#define ACTION_SET(a, x) \ + do { \ + if ((a) != NULL) \ + *(a) = (x); \ + } while (0) + +#define REASON_SET(a, x) \ + do { \ + if ((a) != NULL) \ + *(a) = (x); \ + if (x < PFRES_MAX) \ + pf_status.counters[x]++; \ + } while (0) + +struct pf_status { + u_int64_t counters[PFRES_MAX]; + u_int64_t lcounters[LCNT_MAX]; /* limit counters */ + u_int64_t fcounters[FCNT_MAX]; + u_int64_t scounters[SCNT_MAX]; + u_int64_t pcounters[2][2][3]; + u_int64_t bcounters[2][2]; + u_int64_t stateid; + u_int32_t running; + u_int32_t states; + u_int32_t src_nodes; + u_int32_t since; + u_int32_t debug; + u_int32_t hostid; + char ifname[IFNAMSIZ]; + u_int8_t pf_chksum[PF_MD5_DIGEST_LENGTH]; +}; + +struct cbq_opts { + u_int minburst; + u_int maxburst; + u_int pktsize; + u_int maxpktsize; + u_int ns_per_byte; + u_int maxidle; + int minidle; + u_int offtime; + int flags; +}; + +struct priq_opts { + int flags; +}; + +struct hfsc_opts { + /* real-time service curve */ + u_int rtsc_m1; /* slope of the 1st segment in bps */ + u_int rtsc_d; /* the x-projection of m1 in msec */ + u_int rtsc_m2; /* slope of the 2nd segment in bps */ + /* link-sharing service curve */ + u_int lssc_m1; + u_int lssc_d; + u_int lssc_m2; + /* upper-limit service curve */ + u_int ulsc_m1; + u_int ulsc_d; + u_int ulsc_m2; + int flags; +}; + +struct pf_altq { + char ifname[IFNAMSIZ]; + + void *altq_disc; /* discipline-specific state */ + TAILQ_ENTRY(pf_altq) entries; + + /* scheduler spec */ + u_int8_t scheduler; /* scheduler type */ + u_int16_t tbrsize; /* tokenbucket regulator size */ + u_int32_t ifbandwidth; /* interface bandwidth */ + + /* queue spec */ + char qname[PF_QNAME_SIZE]; /* queue name */ + char parent[PF_QNAME_SIZE]; /* parent name */ + u_int32_t parent_qid; /* parent queue id */ + u_int32_t bandwidth; /* queue bandwidth */ + u_int8_t priority; /* priority */ +#ifdef __FreeBSD__ + u_int8_t local_flags; /* dynamic interface */ +#define PFALTQ_FLAG_IF_REMOVED 0x01 +#endif + u_int16_t qlimit; /* queue size limit */ + u_int16_t flags; /* misc flags */ + union { + struct cbq_opts cbq_opts; + struct priq_opts priq_opts; + struct hfsc_opts hfsc_opts; + } pq_u; + + u_int32_t qid; /* return value */ +}; + +#ifndef __FreeBSD__ + +#define PF_TAG_GENERATED 0x01 +#define PF_TAG_FRAGCACHE 0x02 +#define PF_TAG_TRANSLATE_LOCALHOST 0x04 + +struct pf_mtag { + void *hdr; /* saved hdr pos in mbuf, for ECN */ + u_int rtableid; /* alternate routing table id */ + u_int32_t qid; /* queue id */ + u_int16_t tag; /* tag id */ + u_int8_t flags; + u_int8_t routed; + sa_family_t af; /* for ECN */ +}; +#endif + +struct pf_tag { + u_int16_t tag; /* tag id */ +}; + +struct pf_tagname { + TAILQ_ENTRY(pf_tagname) 
entries; + char name[PF_TAG_NAME_SIZE]; + u_int16_t tag; + int ref; +}; + +#define PFFRAG_FRENT_HIWAT 5000 /* Number of fragment entries */ +#define PFFRAG_FRAG_HIWAT 1000 /* Number of fragmented packets */ +#define PFFRAG_FRCENT_HIWAT 50000 /* Number of fragment cache entries */ +#define PFFRAG_FRCACHE_HIWAT 10000 /* Number of fragment descriptors */ + +#define PFR_KTABLE_HIWAT 1000 /* Number of tables */ +#define PFR_KENTRY_HIWAT 200000 /* Number of table entries */ +#define PFR_KENTRY_HIWAT_SMALL 100000 /* Number of table entries (tiny hosts) */ + +/* + * ioctl parameter structures + */ + +struct pfioc_pooladdr { + u_int32_t action; + u_int32_t ticket; + u_int32_t nr; + u_int32_t r_num; + u_int8_t r_action; + u_int8_t r_last; + u_int8_t af; + char anchor[MAXPATHLEN]; + struct pf_pooladdr addr; +}; + +struct pfioc_rule { + u_int32_t action; + u_int32_t ticket; + u_int32_t pool_ticket; + u_int32_t nr; + char anchor[MAXPATHLEN]; + char anchor_call[MAXPATHLEN]; + struct pf_rule rule; +}; + +struct pfioc_natlook { + struct pf_addr saddr; + struct pf_addr daddr; + struct pf_addr rsaddr; + struct pf_addr rdaddr; + u_int16_t sport; + u_int16_t dport; + u_int16_t rsport; + u_int16_t rdport; + sa_family_t af; + u_int8_t proto; + u_int8_t direction; +}; + +struct pfioc_state { + u_int32_t nr; + struct pf_state state; +}; + +struct pfioc_src_node_kill { + /* XXX returns the number of src nodes killed in psnk_af */ + sa_family_t psnk_af; + struct pf_rule_addr psnk_src; + struct pf_rule_addr psnk_dst; +}; + +struct pfioc_state_kill { + /* XXX returns the number of states killed in psk_af */ + sa_family_t psk_af; + int psk_proto; + struct pf_rule_addr psk_src; + struct pf_rule_addr psk_dst; + char psk_ifname[IFNAMSIZ]; +}; + +struct pfioc_states { + int ps_len; + union { + caddr_t psu_buf; + struct pf_state *psu_states; + } ps_u; +#define ps_buf ps_u.psu_buf +#define ps_states ps_u.psu_states +}; + +struct pfioc_src_nodes { + int psn_len; + union { + caddr_t psu_buf; + struct pf_src_node *psu_src_nodes; + } psn_u; +#define psn_buf psn_u.psu_buf +#define psn_src_nodes psn_u.psu_src_nodes +}; + +struct pfioc_if { + char ifname[IFNAMSIZ]; +}; + +struct pfioc_tm { + int timeout; + int seconds; +}; + +struct pfioc_limit { + int index; + unsigned limit; +}; + +struct pfioc_altq { + u_int32_t action; + u_int32_t ticket; + u_int32_t nr; + struct pf_altq altq; +}; + +struct pfioc_qstats { + u_int32_t ticket; + u_int32_t nr; + void *buf; + int nbytes; + u_int8_t scheduler; +}; + +struct pfioc_ruleset { + u_int32_t nr; + char path[MAXPATHLEN]; + char name[PF_ANCHOR_NAME_SIZE]; +}; + +#define PF_RULESET_ALTQ (PF_RULESET_MAX) +#define PF_RULESET_TABLE (PF_RULESET_MAX+1) +struct pfioc_trans { + int size; /* number of elements */ + int esize; /* size of each element in bytes */ + struct pfioc_trans_e { + int rs_num; + char anchor[MAXPATHLEN]; + u_int32_t ticket; + } *array; +}; + +#define PFR_FLAG_ATOMIC 0x00000001 +#define PFR_FLAG_DUMMY 0x00000002 +#define PFR_FLAG_FEEDBACK 0x00000004 +#define PFR_FLAG_CLSTATS 0x00000008 +#define PFR_FLAG_ADDRSTOO 0x00000010 +#define PFR_FLAG_REPLACE 0x00000020 +#define PFR_FLAG_ALLRSETS 0x00000040 +#define PFR_FLAG_ALLMASK 0x0000007F +#ifdef _KERNEL +#define PFR_FLAG_USERIOCTL 0x10000000 +#endif + +struct pfioc_table { + struct pfr_table pfrio_table; + void *pfrio_buffer; + int pfrio_esize; + int pfrio_size; + int pfrio_size2; + int pfrio_nadd; + int pfrio_ndel; + int pfrio_nchange; + int pfrio_flags; + u_int32_t pfrio_ticket; +}; +#define pfrio_exists pfrio_nadd +#define 
pfrio_nzero pfrio_nadd +#define pfrio_nmatch pfrio_nadd +#define pfrio_naddr pfrio_size2 +#define pfrio_setflag pfrio_size2 +#define pfrio_clrflag pfrio_nadd + +struct pfioc_iface { + char pfiio_name[IFNAMSIZ]; + void *pfiio_buffer; + int pfiio_esize; + int pfiio_size; + int pfiio_nzero; + int pfiio_flags; +}; + + +/* + * ioctl operations + */ + +#define DIOCSTART _IO ('D', 1) +#define DIOCSTOP _IO ('D', 2) +#define DIOCADDRULE _IOWR('D', 4, struct pfioc_rule) +#define DIOCGETRULES _IOWR('D', 6, struct pfioc_rule) +#define DIOCGETRULE _IOWR('D', 7, struct pfioc_rule) +/* XXX cut 8 - 17 */ +#define DIOCCLRSTATES _IOWR('D', 18, struct pfioc_state_kill) +#define DIOCGETSTATE _IOWR('D', 19, struct pfioc_state) +#define DIOCSETSTATUSIF _IOWR('D', 20, struct pfioc_if) +#define DIOCGETSTATUS _IOWR('D', 21, struct pf_status) +#define DIOCCLRSTATUS _IO ('D', 22) +#define DIOCNATLOOK _IOWR('D', 23, struct pfioc_natlook) +#define DIOCSETDEBUG _IOWR('D', 24, u_int32_t) +#define DIOCGETSTATES _IOWR('D', 25, struct pfioc_states) +#define DIOCCHANGERULE _IOWR('D', 26, struct pfioc_rule) +/* XXX cut 26 - 28 */ +#define DIOCSETTIMEOUT _IOWR('D', 29, struct pfioc_tm) +#define DIOCGETTIMEOUT _IOWR('D', 30, struct pfioc_tm) +#define DIOCADDSTATE _IOWR('D', 37, struct pfioc_state) +#define DIOCCLRRULECTRS _IO ('D', 38) +#define DIOCGETLIMIT _IOWR('D', 39, struct pfioc_limit) +#define DIOCSETLIMIT _IOWR('D', 40, struct pfioc_limit) +#define DIOCKILLSTATES _IOWR('D', 41, struct pfioc_state_kill) +#define DIOCSTARTALTQ _IO ('D', 42) +#define DIOCSTOPALTQ _IO ('D', 43) +#define DIOCADDALTQ _IOWR('D', 45, struct pfioc_altq) +#define DIOCGETALTQS _IOWR('D', 47, struct pfioc_altq) +#define DIOCGETALTQ _IOWR('D', 48, struct pfioc_altq) +#define DIOCCHANGEALTQ _IOWR('D', 49, struct pfioc_altq) +#define DIOCGETQSTATS _IOWR('D', 50, struct pfioc_qstats) +#define DIOCBEGINADDRS _IOWR('D', 51, struct pfioc_pooladdr) +#define DIOCADDADDR _IOWR('D', 52, struct pfioc_pooladdr) +#define DIOCGETADDRS _IOWR('D', 53, struct pfioc_pooladdr) +#define DIOCGETADDR _IOWR('D', 54, struct pfioc_pooladdr) +#define DIOCCHANGEADDR _IOWR('D', 55, struct pfioc_pooladdr) +/* XXX cut 55 - 57 */ +#define DIOCGETRULESETS _IOWR('D', 58, struct pfioc_ruleset) +#define DIOCGETRULESET _IOWR('D', 59, struct pfioc_ruleset) +#define DIOCRCLRTABLES _IOWR('D', 60, struct pfioc_table) +#define DIOCRADDTABLES _IOWR('D', 61, struct pfioc_table) +#define DIOCRDELTABLES _IOWR('D', 62, struct pfioc_table) +#define DIOCRGETTABLES _IOWR('D', 63, struct pfioc_table) +#define DIOCRGETTSTATS _IOWR('D', 64, struct pfioc_table) +#define DIOCRCLRTSTATS _IOWR('D', 65, struct pfioc_table) +#define DIOCRCLRADDRS _IOWR('D', 66, struct pfioc_table) +#define DIOCRADDADDRS _IOWR('D', 67, struct pfioc_table) +#define DIOCRDELADDRS _IOWR('D', 68, struct pfioc_table) +#define DIOCRSETADDRS _IOWR('D', 69, struct pfioc_table) +#define DIOCRGETADDRS _IOWR('D', 70, struct pfioc_table) +#define DIOCRGETASTATS _IOWR('D', 71, struct pfioc_table) +#define DIOCRCLRASTATS _IOWR('D', 72, struct pfioc_table) +#define DIOCRTSTADDRS _IOWR('D', 73, struct pfioc_table) +#define DIOCRSETTFLAGS _IOWR('D', 74, struct pfioc_table) +#define DIOCRINADEFINE _IOWR('D', 77, struct pfioc_table) +#define DIOCOSFPFLUSH _IO('D', 78) +#define DIOCOSFPADD _IOWR('D', 79, struct pf_osfp_ioctl) +#define DIOCOSFPGET _IOWR('D', 80, struct pf_osfp_ioctl) +#define DIOCXBEGIN _IOWR('D', 81, struct pfioc_trans) +#define DIOCXCOMMIT _IOWR('D', 82, struct pfioc_trans) +#define DIOCXROLLBACK _IOWR('D', 83, struct 
pfioc_trans) +#define DIOCGETSRCNODES _IOWR('D', 84, struct pfioc_src_nodes) +#define DIOCCLRSRCNODES _IO('D', 85) +#define DIOCSETHOSTID _IOWR('D', 86, u_int32_t) +#define DIOCIGETIFACES _IOWR('D', 87, struct pfioc_iface) +#define DIOCSETIFFLAG _IOWR('D', 89, struct pfioc_iface) +#define DIOCCLRIFFLAG _IOWR('D', 90, struct pfioc_iface) +#define DIOCKILLSRCNODES _IOWR('D', 91, struct pfioc_src_node_kill) +#ifdef __FreeBSD__ +struct pf_ifspeed { + char ifname[IFNAMSIZ]; + u_int32_t baudrate; +}; +#define DIOCGIFSPEED _IOWR('D', 92, struct pf_ifspeed) +#endif + +#ifdef _KERNEL +RB_HEAD(pf_src_tree, pf_src_node); +RB_PROTOTYPE(pf_src_tree, pf_src_node, entry, pf_src_compare); +extern struct pf_src_tree tree_src_tracking; + +RB_HEAD(pf_state_tree_id, pf_state); +RB_PROTOTYPE(pf_state_tree_id, pf_state, + entry_id, pf_state_compare_id); +extern struct pf_state_tree_id tree_id; +extern struct pf_state_queue state_list; + +TAILQ_HEAD(pf_poolqueue, pf_pool); +extern struct pf_poolqueue pf_pools[2]; +TAILQ_HEAD(pf_altqqueue, pf_altq); +extern struct pf_altqqueue pf_altqs[2]; +extern struct pf_palist pf_pabuf; + +extern u_int32_t ticket_altqs_active; +extern u_int32_t ticket_altqs_inactive; +extern int altqs_inactive_open; +extern u_int32_t ticket_pabuf; +extern struct pf_altqqueue *pf_altqs_active; +extern struct pf_altqqueue *pf_altqs_inactive; +extern struct pf_poolqueue *pf_pools_active; +extern struct pf_poolqueue *pf_pools_inactive; +extern int pf_tbladdr_setup(struct pf_ruleset *, + struct pf_addr_wrap *); +extern void pf_tbladdr_remove(struct pf_addr_wrap *); +extern void pf_tbladdr_copyout(struct pf_addr_wrap *); +extern void pf_calc_skip_steps(struct pf_rulequeue *); +#ifdef __FreeBSD__ +#ifdef ALTQ +extern void pf_altq_ifnet_event(struct ifnet *, int); +#endif +extern uma_zone_t pf_src_tree_pl, pf_rule_pl; +extern uma_zone_t pf_state_pl, pf_altq_pl, pf_pooladdr_pl; +extern uma_zone_t pfr_ktable_pl, pfr_kentry_pl, pfr_kentry_pl2; +extern uma_zone_t pf_cache_pl, pf_cent_pl; +extern uma_zone_t pf_state_scrub_pl; +extern uma_zone_t pfi_addr_pl; +#else +extern struct pool pf_src_tree_pl, pf_rule_pl; +extern struct pool pf_state_pl, pf_altq_pl, pf_pooladdr_pl; +extern struct pool pf_state_scrub_pl; +#endif +extern void pf_purge_thread(void *); +#ifdef __FreeBSD__ +extern int pf_purge_expired_src_nodes(int); +extern int pf_purge_expired_states(u_int32_t, int); +#else +extern void pf_purge_expired_src_nodes(int); +extern void pf_purge_expired_states(u_int32_t); +#endif +extern void pf_unlink_state(struct pf_state *); +extern void pf_free_state(struct pf_state *); +extern int pf_insert_state(struct pfi_kif *, + struct pf_state *); +extern int pf_insert_src_node(struct pf_src_node **, + struct pf_rule *, struct pf_addr *, + sa_family_t); +void pf_src_tree_remove_state(struct pf_state *); +extern struct pf_state *pf_find_state_byid(struct pf_state_cmp *); +extern struct pf_state *pf_find_state_all(struct pf_state_cmp *key, + u_int8_t tree, int *more); +extern void pf_print_state(struct pf_state *); +extern void pf_print_flags(u_int8_t); +extern u_int16_t pf_cksum_fixup(u_int16_t, u_int16_t, u_int16_t, + u_int8_t); + +extern struct ifnet *sync_ifp; +extern struct pf_rule pf_default_rule; +extern void pf_addrcpy(struct pf_addr *, struct pf_addr *, + u_int8_t); +void pf_rm_rule(struct pf_rulequeue *, + struct pf_rule *); + +#ifdef INET +#ifdef __FreeBSD__ +int pf_test(int, struct ifnet *, struct mbuf **, struct ether_header *, + struct inpcb *); +#else +int pf_test(int, struct ifnet *, struct mbuf **, 
struct ether_header *); +#endif +#endif /* INET */ + +#ifdef INET6 +#ifdef __FreeBSD__ +int pf_test6(int, struct ifnet *, struct mbuf **, struct ether_header *, + struct inpcb *); +#else +int pf_test6(int, struct ifnet *, struct mbuf **, struct ether_header *); +#endif +void pf_poolmask(struct pf_addr *, struct pf_addr*, + struct pf_addr *, struct pf_addr *, u_int8_t); +void pf_addr_inc(struct pf_addr *, sa_family_t); +#endif /* INET6 */ + +#ifdef __FreeBSD__ +u_int32_t pf_new_isn(struct pf_state *); +#endif +void *pf_pull_hdr(struct mbuf *, int, void *, int, u_short *, u_short *, + sa_family_t); +void pf_change_a(void *, u_int16_t *, u_int32_t, u_int8_t); +int pflog_packet(struct pfi_kif *, struct mbuf *, sa_family_t, u_int8_t, + u_int8_t, struct pf_rule *, struct pf_rule *, struct pf_ruleset *, + struct pf_pdesc *); +int pf_match_addr(u_int8_t, struct pf_addr *, struct pf_addr *, + struct pf_addr *, sa_family_t); +int pf_match(u_int8_t, u_int32_t, u_int32_t, u_int32_t); +int pf_match_port(u_int8_t, u_int16_t, u_int16_t, u_int16_t); +int pf_match_uid(u_int8_t, uid_t, uid_t, uid_t); +int pf_match_gid(u_int8_t, gid_t, gid_t, gid_t); + +void pf_normalize_init(void); +int pf_normalize_ip(struct mbuf **, int, struct pfi_kif *, u_short *, + struct pf_pdesc *); +int pf_normalize_ip6(struct mbuf **, int, struct pfi_kif *, u_short *, + struct pf_pdesc *); +int pf_normalize_tcp(int, struct pfi_kif *, struct mbuf *, int, int, void *, + struct pf_pdesc *); +void pf_normalize_tcp_cleanup(struct pf_state *); +int pf_normalize_tcp_init(struct mbuf *, int, struct pf_pdesc *, + struct tcphdr *, struct pf_state_peer *, struct pf_state_peer *); +int pf_normalize_tcp_stateful(struct mbuf *, int, struct pf_pdesc *, + u_short *, struct tcphdr *, struct pf_state *, + struct pf_state_peer *, struct pf_state_peer *, int *); +u_int32_t + pf_state_expires(const struct pf_state *); +void pf_purge_expired_fragments(void); +int pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *); +int pf_rtlabel_match(struct pf_addr *, sa_family_t, struct pf_addr_wrap *); +#ifdef __FreeBSD__ +int pf_socket_lookup(int, struct pf_pdesc *, struct inpcb *); +#else +int pf_socket_lookup(int, struct pf_pdesc *); +#endif +void pfr_initialize(void); +int pfr_match_addr(struct pfr_ktable *, struct pf_addr *, sa_family_t); +void pfr_update_stats(struct pfr_ktable *, struct pf_addr *, sa_family_t, + u_int64_t, int, int, int); +int pfr_pool_get(struct pfr_ktable *, int *, struct pf_addr *, + struct pf_addr **, struct pf_addr **, sa_family_t); +void pfr_dynaddr_update(struct pfr_ktable *, struct pfi_dynaddr *); +struct pfr_ktable * + pfr_attach_table(struct pf_ruleset *, char *); +void pfr_detach_table(struct pfr_ktable *); +int pfr_clr_tables(struct pfr_table *, int *, int); +int pfr_add_tables(struct pfr_table *, int, int *, int); +int pfr_del_tables(struct pfr_table *, int, int *, int); +int pfr_get_tables(struct pfr_table *, struct pfr_table *, int *, int); +int pfr_get_tstats(struct pfr_table *, struct pfr_tstats *, int *, int); +int pfr_clr_tstats(struct pfr_table *, int, int *, int); +int pfr_set_tflags(struct pfr_table *, int, int, int, int *, int *, int); +int pfr_clr_addrs(struct pfr_table *, int *, int); +int pfr_insert_kentry(struct pfr_ktable *, struct pfr_addr *, long); +int pfr_add_addrs(struct pfr_table *, struct pfr_addr *, int, int *, + int); +int pfr_del_addrs(struct pfr_table *, struct pfr_addr *, int, int *, + int); +int pfr_set_addrs(struct pfr_table *, struct pfr_addr *, int, int *, + int *, int *, int *, 
int, u_int32_t); +int pfr_get_addrs(struct pfr_table *, struct pfr_addr *, int *, int); +int pfr_get_astats(struct pfr_table *, struct pfr_astats *, int *, int); +int pfr_clr_astats(struct pfr_table *, struct pfr_addr *, int, int *, + int); +int pfr_tst_addrs(struct pfr_table *, struct pfr_addr *, int, int *, + int); +int pfr_ina_begin(struct pfr_table *, u_int32_t *, int *, int); +int pfr_ina_rollback(struct pfr_table *, u_int32_t, int *, int); +int pfr_ina_commit(struct pfr_table *, u_int32_t, int *, int *, int); +int pfr_ina_define(struct pfr_table *, struct pfr_addr *, int, int *, + int *, u_int32_t, int); + +extern struct pfi_statehead pfi_statehead; +extern struct pfi_kif *pfi_all; + +void pfi_initialize(void); +#ifdef __FreeBSD__ +void pfi_cleanup(void); +#endif +struct pfi_kif *pfi_kif_get(const char *); +void pfi_kif_ref(struct pfi_kif *, enum pfi_kif_refs); +void pfi_kif_unref(struct pfi_kif *, enum pfi_kif_refs); +int pfi_kif_match(struct pfi_kif *, struct pfi_kif *); +void pfi_attach_ifnet(struct ifnet *); +void pfi_detach_ifnet(struct ifnet *); +void pfi_attach_ifgroup(struct ifg_group *); +void pfi_detach_ifgroup(struct ifg_group *); +void pfi_group_change(const char *); +int pfi_match_addr(struct pfi_dynaddr *, struct pf_addr *, + sa_family_t); +int pfi_dynaddr_setup(struct pf_addr_wrap *, sa_family_t); +void pfi_dynaddr_remove(struct pf_addr_wrap *); +void pfi_dynaddr_copyout(struct pf_addr_wrap *); +void pfi_fill_oldstatus(struct pf_status *); +int pfi_clr_istats(const char *); +int pfi_get_ifaces(const char *, struct pfi_kif *, int *); +int pfi_set_flags(const char *, int); +int pfi_clear_flags(const char *, int); + +u_int16_t pf_tagname2tag(char *); +void pf_tag2tagname(u_int16_t, char *); +void pf_tag_ref(u_int16_t); +void pf_tag_unref(u_int16_t); +int pf_tag_packet(struct mbuf *, struct pf_mtag *, int, int); +u_int32_t pf_qname2qid(char *); +void pf_qid2qname(u_int32_t, char *); +void pf_qid_unref(u_int32_t); +#ifndef __FreeBSD__ +struct pf_mtag *pf_find_mtag(struct mbuf *); +struct pf_mtag *pf_get_mtag(struct mbuf *); +#endif + +extern struct pf_status pf_status; + +#ifdef __FreeBSD__ +extern uma_zone_t pf_frent_pl, pf_frag_pl; +extern struct sx pf_consistency_lock; +#else +extern struct pool pf_frent_pl, pf_frag_pl; +extern struct rwlock pf_consistency_lock; +#endif + +struct pf_pool_limit { + void *pp; + unsigned limit; +}; +extern struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX]; + +#ifdef __FreeBSD__ +struct pf_frent { + LIST_ENTRY(pf_frent) fr_next; + struct ip *fr_ip; + struct mbuf *fr_m; +}; + +struct pf_frcache { + LIST_ENTRY(pf_frcache) fr_next; + uint16_t fr_off; + uint16_t fr_end; +}; + +struct pf_fragment { + RB_ENTRY(pf_fragment) fr_entry; + TAILQ_ENTRY(pf_fragment) frag_next; + struct in_addr fr_src; + struct in_addr fr_dst; + u_int8_t fr_p; /* protocol of this fragment */ + u_int8_t fr_flags; /* status flags */ + u_int16_t fr_id; /* fragment id for reassemble */ + u_int16_t fr_max; /* fragment data max */ + u_int32_t fr_timeout; +#define fr_queue fr_u.fru_queue +#define fr_cache fr_u.fru_cache + union { + LIST_HEAD(pf_fragq, pf_frent) fru_queue; /* buffering */ + LIST_HEAD(pf_cacheq, pf_frcache) fru_cache; /* non-buf */ + } fr_u; +}; +#endif /* (__FreeBSD__) */ + +#endif /* _KERNEL */ + +extern struct pf_anchor_global pf_anchors; +extern struct pf_anchor pf_main_anchor; +#define pf_main_ruleset pf_main_anchor.ruleset + +/* these ruleset functions can be linked into userland programs (pfctl) */ +int pf_get_ruleset_number(u_int8_t); +void 
pf_init_ruleset(struct pf_ruleset *); +int pf_anchor_setup(struct pf_rule *, + const struct pf_ruleset *, const char *); +int pf_anchor_copyout(const struct pf_ruleset *, + const struct pf_rule *, struct pfioc_rule *); +void pf_anchor_remove(struct pf_rule *); +void pf_remove_if_empty_ruleset(struct pf_ruleset *); +struct pf_anchor *pf_find_anchor(const char *); +struct pf_ruleset *pf_find_ruleset(const char *); +struct pf_ruleset *pf_find_or_create_ruleset(const char *); +void pf_rs_initialize(void); + +#ifndef __FreeBSD__ +/* ?!? */ +#ifdef _KERNEL +int pf_anchor_copyout(const struct pf_ruleset *, + const struct pf_rule *, struct pfioc_rule *); +void pf_anchor_remove(struct pf_rule *); + +#endif /* _KERNEL */ +#endif + +/* The fingerprint functions can be linked into userland programs (tcpdump) */ +int pf_osfp_add(struct pf_osfp_ioctl *); +#ifdef _KERNEL +struct pf_osfp_enlist * + pf_osfp_fingerprint(struct pf_pdesc *, struct mbuf *, int, + const struct tcphdr *); +#endif /* _KERNEL */ +struct pf_osfp_enlist * + pf_osfp_fingerprint_hdr(const struct ip *, const struct ip6_hdr *, + const struct tcphdr *); +void pf_osfp_flush(void); +int pf_osfp_get(struct pf_osfp_ioctl *); +#ifdef __FreeBSD__ +int pf_osfp_initialize(void); +void pf_osfp_cleanup(void); +#else +void pf_osfp_initialize(void); +#endif +int pf_osfp_match(struct pf_osfp_enlist *, pf_osfp_t); +struct pf_os_fingerprint * + pf_osfp_validate(void); + +#endif /* _NET_PFVAR_HH_ */ diff --git a/contrib/pf/rtems/freebsd/netinet/in4_cksum.c b/contrib/pf/rtems/freebsd/netinet/in4_cksum.c new file mode 100644 index 00000000..bc11aeb9 --- /dev/null +++ b/contrib/pf/rtems/freebsd/netinet/in4_cksum.c @@ -0,0 +1,122 @@ +#include <rtems/freebsd/machine/rtems-bsd-config.h> + +/* $FreeBSD$ */ +/* $OpenBSD: in4_cksum.c,v 1.7 2003/06/02 23:28:13 millert Exp $ */ +/* $KAME: in4_cksum.c,v 1.10 2001/11/30 10:06:15 itojun Exp $ */ +/* $NetBSD: in_cksum.c,v 1.13 1996/10/13 02:03:03 christos Exp $ */ + +/* + * Copyright (C) 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ */ + +/* + * Copyright (c) 1988, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 + */ + +#include <rtems/freebsd/sys/param.h> +#include <rtems/freebsd/sys/mbuf.h> +#include <rtems/freebsd/sys/systm.h> + +#include <rtems/freebsd/netinet/in.h> +#include <rtems/freebsd/netinet/in_systm.h> +#include <rtems/freebsd/netinet/ip.h> +#include <rtems/freebsd/netinet/ip_var.h> + +#include <rtems/freebsd/machine/in_cksum.h> + +#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) +#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} + +int in4_cksum(struct mbuf *, u_int8_t, int, int); + +int +in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len) +{ + union { + struct ipovly ipov; + u_int16_t w[10]; + } u; + union { + u_int16_t s[2]; + u_int32_t l; + } l_util; + + u_int16_t *w; + int psum; + int sum = 0; + + if (nxt != 0) { + /* pseudo header */ + if (off < sizeof(struct ipovly)) + panic("in4_cksum: offset too short"); + if (m->m_len < sizeof(struct ip)) + panic("in4_cksum: bad mbuf chain"); + bzero(&u.ipov, sizeof(u.ipov)); + u.ipov.ih_len = htons(len); + u.ipov.ih_pr = nxt; + u.ipov.ih_src = mtod(m, struct ip *)->ip_src; + u.ipov.ih_dst = mtod(m, struct ip *)->ip_dst; + w = u.w; + /* assumes sizeof(ipov) == 20 */ + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4]; + sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9]; + } + + psum = in_cksum_skip(m, len + off, off); + psum = ~psum & 0xffff; + sum += psum; + REDUCE; + return (~sum & 0xffff); +} -- cgit v1.2.3
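
A note for readers of the pfvar.h hunk above: the many RB_HEAD/RB_PROTOTYPE lines (pf_anchor_global, pf_state_tree_id, pfi_ifhead, and so on) use the BSD <sys/tree.h> red-black-tree macros, and the "keep synced" stub structs such as pfi_kif_cmp exist because RB_FIND takes a lookup key of the node type, so pf passes a prefix-compatible stub that carries only the key fields. The sketch below is a minimal userland illustration of that pattern, not code from this commit; it assumes a BSD-style <sys/tree.h> (native on FreeBSD/OpenBSD/macOS, available on Linux via libbsd), and every name in it is invented for the example.

#include <sys/tree.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
	RB_ENTRY(node) entry;		/* tree linkage, like pfik_tree above */
	char name[16];			/* key field, like pfik_name */
};

/* Comparator over the key field only; RB_FIND keys never touch the rest. */
static int
node_cmp(struct node *a, struct node *b)
{
	return (strcmp(a->name, b->name));
}

RB_HEAD(node_tree, node);
RB_GENERATE(node_tree, node, entry, node_cmp)

int
main(void)
{
	struct node_tree head = RB_INITIALIZER(&head);
	struct node *n, key, *found;

	n = calloc(1, sizeof(*n));
	snprintf(n->name, sizeof(n->name), "em0");
	RB_INSERT(node_tree, &head, n);

	/* Lookup with a stack "cmp" stub, the pfi_kif_cmp trick. */
	snprintf(key.name, sizeof(key.name), "em0");
	found = RB_FIND(node_tree, &head, &key);
	printf("found: %s\n", found != NULL ? found->name : "(none)");

	RB_REMOVE(node_tree, &head, n);
	free(n);
	return (0);
}

RB_PROTOTYPE, as used throughout the header, only declares the generated functions; exactly one .c file instantiates them with RB_GENERATE, which is why pfvar.h carries the prototypes while the pf .c files carry the definitions.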
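
Similarly, for the in4_cksum.c hunk: the function sums a struct ipovly pseudo-header (length, protocol, source, destination) together with the one's-complement sum that in_cksum_skip returns for the payload, and the ADDCARRY/REDUCE macros fold the 32-bit accumulator back into 16 bits. Below is a rough standalone sketch of that word-sum-and-fold step under the assumption of a 12-byte TCP/UDP-style pseudo-header with made-up addresses; it is an illustration of the arithmetic, not the kernel code.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * One's-complement checksum of a buffer taken as 16-bit big-endian
 * words, with the carry fold that ADDCARRY()/REDUCE perform above.
 */
static uint16_t
cksum16(const uint8_t *p, size_t len, uint32_t sum)
{
	while (len > 1) {
		sum += (uint32_t)((p[0] << 8) | p[1]);
		p += 2;
		len -= 2;
	}
	if (len > 0)			/* odd trailing byte, zero-padded */
		sum += (uint32_t)(p[0] << 8);
	while (sum >> 16)		/* the ADDCARRY/REDUCE fold */
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}

int
main(void)
{
	/*
	 * Made-up 12-byte pseudo-header: src 192.0.2.1, dst 192.0.2.2,
	 * zero pad, IPPROTO_UDP (17), UDP length 8.
	 */
	static const uint8_t ph[12] = {
		192, 0, 2, 1,
		192, 0, 2, 2,
		0, 17,
		0, 8,
	};

	printf("checksum: 0x%04x\n", cksum16(ph, sizeof(ph), 0));
	return (0);
}

The kernel version reaches the same result a different way: it adds the ten 16-bit words of struct ipovly directly into sum, adds the complemented payload sum from in_cksum_skip, then lets REDUCE collapse the carries before the final complement.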