author     Sebastian Huber <sebastian.huber@embedded-brains.de>  2016-10-07 15:10:20 +0200
committer  Sebastian Huber <sebastian.huber@embedded-brains.de>  2017-01-10 09:53:31 +0100
commit     c40e45b75eb76d79a05c7fa85c1fa9b5c728a12f (patch)
tree       ad4f2519067709f00ab98b3c591186c26dc3a21f /freebsd/sys/netpfil
parent     userspace-header-gen.py: Simplify program ports (diff)
download   rtems-libbsd-c40e45b75eb76d79a05c7fa85c1fa9b5c728a12f.tar.bz2
Update to FreeBSD head 2016-08-23
Git mirror commit 9fe7c416e6abb28b1398fd3e5687099846800cfd.
Diffstat (limited to 'freebsd/sys/netpfil')
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_aqm.h  167
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_aqm_codel.h  222
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_aqm_pie.h  153
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_heap.c  554
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_heap.h  4
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_sched.h  14
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_sched_fifo.c  122
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_sched_fq_codel.h  167
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h  187
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_sched_prio.c  231
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_sched_qfq.c  866
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_sched_rr.c  309
-rw-r--r--  freebsd/sys/netpfil/ipfw/dn_sched_wf2q.c  375
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_dn_glue.c  848
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_dn_io.c  852
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_dn_private.h  62
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_dummynet.c  2309
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw2.c  553
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_bpf.c  211
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_dynamic.c  1822
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_eaction.c  383
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_iface.c  541
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_log.c  120
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_nat.c  710
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_pfil.c  210
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_private.h  545
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_sockopt.c  3904
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_table.c  3595
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_table.h  234
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_table_algo.c  4112
-rw-r--r--  freebsd/sys/netpfil/ipfw/ip_fw_table_value.c  810
-rw-r--r--  freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.c  131
-rw-r--r--  freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.h  117
-rw-r--r--  freebsd/sys/netpfil/ipfw/nat64/nat64_translate.c  1574
-rw-r--r--  freebsd/sys/netpfil/ipfw/nat64/nat64_translate.h  116
-rw-r--r--  freebsd/sys/netpfil/ipfw/nat64/nat64lsn.c  1772
-rw-r--r--  freebsd/sys/netpfil/ipfw/nat64/nat64lsn.h  351
-rw-r--r--  freebsd/sys/netpfil/ipfw/nat64/nat64lsn_control.c  919
-rw-r--r--  freebsd/sys/netpfil/ipfw/nat64/nat64stl.c  262
-rw-r--r--  freebsd/sys/netpfil/ipfw/nat64/nat64stl.h  58
-rw-r--r--  freebsd/sys/netpfil/ipfw/nat64/nat64stl_control.c  623
-rw-r--r--  freebsd/sys/netpfil/ipfw/nptv6/ip_fw_nptv6.c  101
-rw-r--r--  freebsd/sys/netpfil/ipfw/nptv6/nptv6.c  894
-rw-r--r--  freebsd/sys/netpfil/ipfw/nptv6/nptv6.h  65
-rw-r--r--  freebsd/sys/netpfil/pf/if_pflog.c  320
-rw-r--r--  freebsd/sys/netpfil/pf/if_pfsync.c  2421
-rw-r--r--  freebsd/sys/netpfil/pf/in4_cksum.c  122
-rw-r--r--  freebsd/sys/netpfil/pf/pf.c  6657
-rw-r--r--  freebsd/sys/netpfil/pf/pf.h  203
-rw-r--r--  freebsd/sys/netpfil/pf/pf_altq.h  121
-rw-r--r--  freebsd/sys/netpfil/pf/pf_if.c  924
-rw-r--r--  freebsd/sys/netpfil/pf/pf_ioctl.c  3872
-rw-r--r--  freebsd/sys/netpfil/pf/pf_lb.c  681
-rw-r--r--  freebsd/sys/netpfil/pf/pf_mtag.h  64
-rw-r--r--  freebsd/sys/netpfil/pf/pf_norm.c  1843
-rw-r--r--  freebsd/sys/netpfil/pf/pf_osfp.c  530
-rw-r--r--  freebsd/sys/netpfil/pf/pf_ruleset.c  426
-rw-r--r--  freebsd/sys/netpfil/pf/pf_table.c  2195
58 files changed, 44729 insertions, 7825 deletions
diff --git a/freebsd/sys/netpfil/ipfw/dn_aqm.h b/freebsd/sys/netpfil/ipfw/dn_aqm.h
new file mode 100644
index 00000000..d01e98eb
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/dn_aqm.h
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * API for writing an Active Queue Management algorithm for Dummynet
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_AQM_H
+#define _IP_DN_AQM_H
+
+
+/* NOW is the current time in milliseconds */
+#define NOW ((dn_cfg.curr_time * tick) / 1000)
+
+#define AQM_UNOW (dn_cfg.curr_time * tick)
+#define AQM_TIME_1US ((aqm_time_t)(1))
+#define AQM_TIME_1MS ((aqm_time_t)(1000))
+#define AQM_TIME_1S ((aqm_time_t)(AQM_TIME_1MS * 1000))
+
+/* aqm time can store up to 4294 seconds */
+typedef uint32_t aqm_time_t;
+typedef int32_t aqm_stime_t;
+
+#define DN_AQM_MTAG_TS 55345
+
+/* Macro for variable bounding */
+#define BOUND_VAR(x,l,h) ((x) > (h)? (h) : ((x) > (l)? (x) : (l)))
+
+/* sysctl variable to count number of dropped packets */
+extern unsigned long io_pkt_drop;
+
+/*
+ * Structure for holding data and function pointers that together represent a
+ * AQM algorithm.
+ */
+ struct dn_aqm {
+#define DN_AQM_NAME_MAX 50
+ char name[DN_AQM_NAME_MAX]; /* name of AQM algorithm */
+ uint32_t type; /* AQM type number */
+
+ /* Methods implemented by AQM algorithm:
+ *
+ * enqueue enqueue packet 'm' on queue 'q'.
+ * Return 0 on success, 1 on drop.
+ *
+ * dequeue dequeue a packet from queue 'q'.
+ * Return a packet, NULL if no packet available.
+ *
+ * config configure AQM algorithm
+ * If required, this function should allocate space to store
+ * the configurations and set 'fs->aqmcfg' to point to this space.
+ * 'dn_extra_parms' includes an array of parameters sent
+ * from the ipfw userland command.
+ * Return 0 on success, non-zero otherwise.
+ *
+ * deconfig deconfigure AQM algorithm.
+ * The allocated configuration memory space should be freed here.
+ * Return 0 on success, non-zero otherwise.
+ *
+ * init initialise AQM status variables of queue 'q'
+ * This function is used to allocate space for the AQM status of a
+ * queue and set q->aqm_status to point to this space.
+ * Return 0 on success, non-zero otherwise.
+ *
+ * cleanup cleanup AQM status variables of queue 'q'
+ * The allocated memory space for AQM status should be freed here.
+ * Return 0 on success, non-zero otherwise.
+ *
+ * getconfig retrieve AQM configurations
+ * This function is used to return AQM parameters to userland
+ * command. The function should fill 'dn_extra_parms' struct with
+ * the AQM configurations using 'par' array.
+ *
+ */
+
+ int (*enqueue)(struct dn_queue *, struct mbuf *);
+ struct mbuf * (*dequeue)(struct dn_queue *);
+ int (*config)(struct dn_fsk *, struct dn_extra_parms *ep, int);
+ int (*deconfig)(struct dn_fsk *);
+ int (*init)(struct dn_queue *);
+ int (*cleanup)(struct dn_queue *);
+ int (*getconfig)(struct dn_fsk *, struct dn_extra_parms *);
+
+ int ref_count; /* Number of queue instances in the system */
+ int cfg_ref_count; /* Number of AQM instances in the system */
+ SLIST_ENTRY (dn_aqm) next; /* Next AQM in the list */
+};
+
+/* Helper function to update queue and scheduler statistics.
+ * negative len + drop -> drop
+ * negative len -> dequeue
+ * positive len -> enqueue
+ * positive len + drop -> drop during enqueue
+ */
+__inline static void
+update_stats(struct dn_queue *q, int len, int drop)
+{
+ int inc = 0;
+ struct dn_flow *sni;
+ struct dn_flow *qni;
+
+ sni = &q->_si->ni;
+ qni = &q->ni;
+
+ if (len < 0)
+ inc = -1;
+ else if(len > 0)
+ inc = 1;
+
+ if (drop) {
+ qni->drops++;
+ sni->drops++;
+ io_pkt_drop++;
+ } else {
+ /*update queue stats */
+ qni->length += inc;
+ qni->len_bytes += len;
+
+ /*update scheduler instance stats */
+ sni->length += inc;
+ sni->len_bytes += len;
+ }
+ /* tot_pkts is updated in dn_enqueue function */
+}
+
+
+/* kernel module related function */
+int
+dn_aqm_modevent(module_t mod, int cmd, void *arg);
+
+#define DECLARE_DNAQM_MODULE(name, dnaqm) \
+ static moduledata_t name##_mod = { \
+ #name, dn_aqm_modevent, dnaqm \
+ }; \
+ DECLARE_MODULE(name, name##_mod, \
+ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \
+ MODULE_DEPEND(name, dummynet, 3, 3, 3)
+
+#endif
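
(Illustration, not part of the commit: a minimal user-space sketch of the len/drop convention documented for update_stats() above, plus the BOUND_VAR clamp. The toy_stats/toy_update names are hypothetical.)

#include <stdio.h>

/* Same clamp as BOUND_VAR in dn_aqm.h: force x into the range [l, h]. */
#define BOUND_VAR(x, l, h) ((x) > (h) ? (h) : ((x) > (l) ? (x) : (l)))

/* Toy counters standing in for the dn_flow statistics. */
struct toy_stats { int length; int len_bytes; int drops; };

/*
 * Mirrors the update_stats() convention:
 *   len > 0 -> enqueue, len < 0 -> dequeue, drop != 0 -> only count a drop.
 */
static void
toy_update(struct toy_stats *s, int len, int drop)
{
	int inc = (len < 0) ? -1 : (len > 0) ? 1 : 0;

	if (drop) {
		s->drops++;
	} else {
		s->length += inc;
		s->len_bytes += len;
	}
}

int
main(void)
{
	struct toy_stats s = { 0, 0, 0 };

	toy_update(&s, 1500, 0);	/* enqueue a 1500-byte packet */
	toy_update(&s, -1500, 0);	/* dequeue it again */
	toy_update(&s, 0, 1);		/* a drop */
	printf("length=%d bytes=%d drops=%d\n", s.length, s.len_bytes, s.drops);
	printf("BOUND_VAR(200, 0, 100) = %d\n", BOUND_VAR(200, 0, 100));
	return 0;
}

(Expected output: length=0 bytes=0 drops=1, and a clamped value of 100.)
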
diff --git a/freebsd/sys/netpfil/ipfw/dn_aqm_codel.h b/freebsd/sys/netpfil/ipfw/dn_aqm_codel.h
new file mode 100644
index 00000000..f5618e76
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/dn_aqm_codel.h
@@ -0,0 +1,222 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Copyright (C) 2011-2014 Kathleen Nichols <nichols@pollere.com>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * o Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ *
+ * o Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * o The names of the authors may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General Public
+ * License ("GPL") version 2, in which case the provisions of the GPL
+ * apply INSTEAD OF those given above.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _IP_DN_AQM_CODEL_H
+#define _IP_DN_AQM_CODEL_H
+
+
+// XXX How to choose MTAG?
+#define FIX_POINT_BITS 16
+
+enum {
+ CODEL_ECN_ENABLED = 1
+};
+
+/* Codel parameters */
+struct dn_aqm_codel_parms {
+ aqm_time_t target;
+ aqm_time_t interval;
+ uint32_t flags;
+};
+
+/* codel status variables */
+struct codel_status {
+ uint32_t count; /* number of dropped pkts since entering drop state */
+ uint16_t dropping; /* dropping state */
+ aqm_time_t drop_next_time; /* time for next drop */
+ aqm_time_t first_above_time; /* time for first ts over target we observed */
+ uint16_t isqrt; /* last isqrt for control low */
+ uint16_t maxpkt_size; /* max packet size seen so far */
+};
+
+struct mbuf *codel_extract_head(struct dn_queue *, aqm_time_t *);
+aqm_time_t control_law(struct codel_status *,
+ struct dn_aqm_codel_parms *, aqm_time_t );
+
+__inline static struct mbuf *
+codel_dodequeue(struct dn_queue *q, aqm_time_t now, uint16_t *ok_to_drop)
+{
+ struct mbuf * m;
+ struct dn_aqm_codel_parms *cprms;
+ struct codel_status *cst;
+ aqm_time_t pkt_ts, sojourn_time;
+
+ *ok_to_drop = 0;
+ m = codel_extract_head(q, &pkt_ts);
+
+ cst = q->aqm_status;
+
+ if (m == NULL) {
+ /* queue is empty - we can't be above target */
+ cst->first_above_time= 0;
+ return m;
+ }
+
+ cprms = q->fs->aqmcfg;
+
+ /* To span a large range of bandwidths, CoDel runs two
+ * different AQMs in parallel. One is sojourn-time-based
+ * and takes effect when the time to send an MTU-sized
+ * packet is less than target. The 1st term of the "if"
+ * below does this. The other is backlog-based and takes
+ * effect when the time to send an MTU-sized packet is >=
+ * target. The goal here is to keep the output link
+ * utilization high by never allowing the queue to get
+ * smaller than the amount that arrives in a typical
+ * interarrival time (MTU-sized packets arriving spaced
+ * by the amount of time it takes to send such a packet on
+ * the bottleneck). The 2nd term of the "if" does this.
+ */
+ sojourn_time = now - pkt_ts;
+ if (sojourn_time < cprms->target || q->ni.len_bytes <= cst->maxpkt_size) {
+ /* went below - stay below for at least interval */
+ cst->first_above_time = 0;
+ } else {
+ if (cst->first_above_time == 0) {
+ /* just went above from below. if still above at
+ * first_above_time, will say it's ok to drop. */
+ cst->first_above_time = now + cprms->interval;
+ } else if (now >= cst->first_above_time) {
+ *ok_to_drop = 1;
+ }
+ }
+ return m;
+}
+
+/*
+ * Dequeue a packet from queue 'q'
+ */
+__inline static struct mbuf *
+codel_dequeue(struct dn_queue *q)
+{
+ struct mbuf *m;
+ struct dn_aqm_codel_parms *cprms;
+ struct codel_status *cst;
+ aqm_time_t now;
+ uint16_t ok_to_drop;
+
+ cst = q->aqm_status;
+ cprms = q->fs->aqmcfg;
+ now = AQM_UNOW;
+
+ m = codel_dodequeue(q, now, &ok_to_drop);
+ if (cst->dropping) {
+ if (!ok_to_drop) {
+ /* sojourn time below target - leave dropping state */
+ cst->dropping = false;
+ }
+ /*
+ * Time for the next drop. Drop current packet and dequeue
+ * next. If the dequeue doesn't take us out of dropping
+ * state, schedule the next drop. A large backlog might
+ * result in drop rates so high that the next drop should
+ * happen now, hence the 'while' loop.
+ */
+ while (now >= cst->drop_next_time && cst->dropping) {
+
+ /* mark the packet */
+ if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) {
+ cst->count++;
+ /* schedule the next mark. */
+ cst->drop_next_time = control_law(cst, cprms,
+ cst->drop_next_time);
+ return m;
+ }
+
+ /* drop the packet */
+ update_stats(q, 0, 1);
+ FREE_PKT(m);
+ m = codel_dodequeue(q, now, &ok_to_drop);
+
+ if (!ok_to_drop) {
+ /* leave dropping state */
+ cst->dropping = false;
+ } else {
+ cst->count++;
+ /* schedule the next drop. */
+ cst->drop_next_time = control_law(cst, cprms,
+ cst->drop_next_time);
+ }
+ }
+ /* If we get here we're not in dropping state. The 'ok_to_drop'
+ * return from dodequeue means that the sojourn time has been
+ * above 'target' for 'interval' so enter dropping state.
+ */
+ } else if (ok_to_drop) {
+
+ /* if ECN option is disabled or the packet cannot be marked,
+ * drop the packet and extract another.
+ */
+ if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) {
+ update_stats(q, 0, 1);
+ FREE_PKT(m);
+ m = codel_dodequeue(q, now, &ok_to_drop);
+ }
+
+ cst->dropping = true;
+
+ /* If min went above target close to when it last went
+ * below, assume that the drop rate that controlled the
+ * queue on the last cycle is a good starting point to
+ * control it now. ('drop_next' will be at most 'interval'
+ * later than the time of the last drop so 'now - drop_next'
+ * is a good approximation of the time from the last drop
+ * until now.)
+ */
+ cst->count = (cst->count > 2 && ((aqm_stime_t)now -
+ (aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)?
+ cst->count - 2 : 1;
+ /* we don't have to set initial guess for Newton's method isqrt as
+ * we initialize isqrt in the control_law function when count == 1 */
+ cst->drop_next_time = control_law(cst, cprms, now);
+ }
+
+ return m;
+}
+
+#endif
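
(Illustration, not part of the commit: control_law(), declared above, schedules the next drop interval/sqrt(count) into the future. The kernel computes this with a fixed-point Newton isqrt; a floating-point user-space sketch of the same law, with the hypothetical name toy_control_law, looks like this.)

#include <math.h>
#include <stdio.h>

/*
 * CoDel control law: the next drop is scheduled interval/sqrt(count)
 * after t, so the drop rate increases while CoDel stays in dropping state.
 */
static double
toy_control_law(double t, double interval, unsigned count)
{

	return (t + interval / sqrt((double)count));
}

int
main(void)
{
	double interval = 100.0;	/* milliseconds */
	double t = 0.0;
	unsigned count;

	for (count = 1; count <= 5; count++) {
		t = toy_control_law(t, interval, count);
		printf("count=%u next drop at %.1f ms\n", count, t);
	}
	return 0;
}

(Build with -lm; the successive gaps shrink as roughly 100, 70.7, 57.7, 50 and 44.7 ms.)
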
diff --git a/freebsd/sys/netpfil/ipfw/dn_aqm_pie.h b/freebsd/sys/netpfil/ipfw/dn_aqm_pie.h
new file mode 100644
index 00000000..aa2fceba
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/dn_aqm_pie.h
@@ -0,0 +1,153 @@
+/*
+ * PIE - Proportional Integral controller Enhanced AQM algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _IP_DN_AQM_PIE_H
+#define _IP_DN_AQM_PIE_H
+
+#define DN_AQM_PIE 2
+#define PIE_DQ_THRESHOLD_BITS 14
+/* 2^14 =16KB */
+#define PIE_DQ_THRESHOLD (1UL << PIE_DQ_THRESHOLD_BITS)
+#define MEAN_PKTSIZE 800
+
+/* 31-bits because random() generates range from 0->(2**31)-1 */
+#define PIE_PROB_BITS 31
+#define PIE_MAX_PROB ((1ULL<<PIE_PROB_BITS) -1)
+
+/* for 16-bits, we have 3-bits for integer part and 13-bits for fraction */
+#define PIE_FIX_POINT_BITS 13
+#define PIE_SCALE (1UL<<PIE_FIX_POINT_BITS)
+
+
+/* PIE options */
+enum {
+ PIE_ECN_ENABLED =1,
+ PIE_CAPDROP_ENABLED = 2,
+ PIE_ON_OFF_MODE_ENABLED = 4,
+ PIE_DEPRATEEST_ENABLED = 8,
+ PIE_DERAND_ENABLED = 16
+};
+
+/* PIE parameters */
+struct dn_aqm_pie_parms {
+ aqm_time_t qdelay_ref; /* AQM Latency Target (default: 15ms) */
+ aqm_time_t tupdate; /* a period to calculate drop probability (default:15ms) */
+ aqm_time_t max_burst; /* AQM Max Burst Allowance (default: 150ms) */
+ uint16_t max_ecnth; /*AQM Max ECN Marking Threshold (default: 10%) */
+ uint16_t alpha; /* (default: 1/8) */
+ uint16_t beta; /* (default: 1+1/4) */
+ uint32_t flags; /* PIE options */
+};
+
+/* PIE status variables */
+struct pie_status{
+ struct callout aqm_pie_callout;
+ aqm_time_t burst_allowance;
+ uint32_t drop_prob;
+ aqm_time_t current_qdelay;
+ aqm_time_t qdelay_old;
+ uint64_t accu_prob;
+ aqm_time_t measurement_start;
+ aqm_time_t avg_dq_time;
+ uint32_t dq_count;
+ uint32_t sflags;
+ struct dn_aqm_pie_parms *parms; /* pointer to PIE configurations */
+ /* pointer to parent queue of FQ-PIE sub-queues, or queue of owner fs. */
+ struct dn_queue *pq;
+ struct mtx lock_mtx;
+ uint32_t one_third_q_size; /* 1/3 of queue size, for speed optimization */
+};
+
+enum {
+ ENQUE = 1,
+ DROP,
+ MARKECN
+};
+
+/* PIE current state */
+enum {
+ PIE_ACTIVE = 1,
+ PIE_INMEASUREMENT = 2
+};
+
+/*
+ * Check whether enqueue should drop the packet to control delay, based on
+ * the PIE algorithm.
+ * Return DROP if it is time to drop, or ENQUE otherwise.
+ * This function is used by PIE and FQ-PIE.
+ */
+__inline static int
+drop_early(struct pie_status *pst, uint32_t qlen)
+{
+ struct dn_aqm_pie_parms *pprms;
+
+ pprms = pst->parms;
+
+ /* queue is not congested */
+
+ if ((pst->qdelay_old < (pprms->qdelay_ref >> 1)
+ && pst->drop_prob < PIE_MAX_PROB / 5 )
+ || qlen <= 2 * MEAN_PKTSIZE)
+ return ENQUE;
+
+
+ if (pst->drop_prob == 0)
+ pst->accu_prob = 0;
+
+ /* increment accu_prob */
+ if (pprms->flags & PIE_DERAND_ENABLED)
+ pst->accu_prob += pst->drop_prob;
+
+ /* De-randomize option
+ * if accu_prob < 0.85 -> enqueue
+ * if accu_prob>8.5 ->drop
+ * between 0.85 and 8.5 || !De-randomize --> drop on prob
+ *
+ * (0.85 = 17/20 ,8.5 = 17/2)
+ */
+ if (pprms->flags & PIE_DERAND_ENABLED) {
+ if(pst->accu_prob < (uint64_t) (PIE_MAX_PROB * 17 / 20))
+ return ENQUE;
+ if( pst->accu_prob >= (uint64_t) (PIE_MAX_PROB * 17 / 2))
+ return DROP;
+ }
+
+ if (random() < pst->drop_prob) {
+ pst->accu_prob = 0;
+ return DROP;
+ }
+
+ return ENQUE;
+}
+
+#endif
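
(Illustration, not part of the commit: how the 13-bit fixed point and the 31-bit probability defined in this header represent the documented alpha/beta defaults and the drop_early() de-randomization thresholds.)

#include <stdio.h>
#include <stdint.h>

#define PIE_FIX_POINT_BITS	13
#define PIE_SCALE		(1UL << PIE_FIX_POINT_BITS)	/* 8192 == 1.0 */
#define PIE_PROB_BITS		31
#define PIE_MAX_PROB		((1ULL << PIE_PROB_BITS) - 1)

int
main(void)
{
	/* Defaults quoted in the header comments: alpha = 1/8, beta = 1 + 1/4. */
	unsigned long alpha = PIE_SCALE / 8;			/* 1024 */
	unsigned long beta = PIE_SCALE + PIE_SCALE / 4;		/* 10240 */

	/* drop_early() de-randomization: enqueue below 0.85, force drop at 8.5. */
	uint64_t enq_thresh = (uint64_t)(PIE_MAX_PROB * 17 / 20);
	uint64_t drop_thresh = (uint64_t)(PIE_MAX_PROB * 17 / 2);

	printf("alpha=%lu beta=%lu (scale=%lu)\n", alpha, beta, PIE_SCALE);
	printf("accu_prob < %llu -> ENQUE, accu_prob >= %llu -> DROP\n",
	    (unsigned long long)enq_thresh, (unsigned long long)drop_thresh);
	return 0;
}
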
diff --git a/freebsd/sys/netpfil/ipfw/dn_heap.c b/freebsd/sys/netpfil/ipfw/dn_heap.c
deleted file mode 100644
index 15e2870d..00000000
--- a/freebsd/sys/netpfil/ipfw/dn_heap.c
+++ /dev/null
@@ -1,554 +0,0 @@
-#include <machine/rtems-bsd-kernel-space.h>
-
-/*-
- * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
- * All rights reserved
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * Binary heap and hash tables, used in dummynet
- *
- * $FreeBSD$
- */
-
-#include <sys/cdefs.h>
-#include <rtems/bsd/sys/param.h>
-#ifdef _KERNEL
-__FBSDID("$FreeBSD$");
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/kernel.h>
-#include <netpfil/ipfw/dn_heap.h>
-#ifndef log
-#define log(x, arg...)
-#endif
-
-#else /* !_KERNEL */
-
-#include <stdio.h>
-#include <dn_test.h>
-#include <strings.h>
-#include <stdlib.h>
-
-#include "dn_heap.h"
-#define log(x, arg...) fprintf(stderr, ## arg)
-#define panic(x...) fprintf(stderr, ## x), exit(1)
-#define MALLOC_DEFINE(a, b, c)
-static void *my_malloc(int s) { return malloc(s); }
-static void my_free(void *p) { free(p); }
-#define malloc(s, t, w) my_malloc(s)
-#define free(p, t) my_free(p)
-#endif /* !_KERNEL */
-
-MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap");
-
-/*
- * Heap management functions.
- *
- * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
- * Some macros help finding parent/children so we can optimize them.
- *
- * heap_init() is called to expand the heap when needed.
- * Increment size in blocks of 16 entries.
- * Returns 1 on error, 0 on success
- */
-#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
-#define HEAP_LEFT(x) ( (x)+(x) + 1 )
-#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
-#define HEAP_INCREMENT 15
-
-static int
-heap_resize(struct dn_heap *h, unsigned int new_size)
-{
- struct dn_heap_entry *p;
-
- if (h->size >= new_size ) /* have enough room */
- return 0;
-#if 1 /* round to the next power of 2 */
- new_size |= new_size >> 1;
- new_size |= new_size >> 2;
- new_size |= new_size >> 4;
- new_size |= new_size >> 8;
- new_size |= new_size >> 16;
-#else
- new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT;
-#endif
- p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT);
- if (p == NULL) {
- printf("--- %s, resize %d failed\n", __func__, new_size );
- return 1; /* error */
- }
- if (h->size > 0) {
- bcopy(h->p, p, h->size * sizeof(*p) );
- free(h->p, M_DN_HEAP);
- }
- h->p = p;
- h->size = new_size;
- return 0;
-}
-
-int
-heap_init(struct dn_heap *h, int size, int ofs)
-{
- if (heap_resize(h, size))
- return 1;
- h->elements = 0;
- h->ofs = ofs;
- return 0;
-}
-
-/*
- * Insert element in heap. Normally, p != NULL, we insert p in
- * a new position and bubble up. If p == NULL, then the element is
- * already in place, and key is the position where to start the
- * bubble-up.
- * Returns 1 on failure (cannot allocate new heap entry)
- *
- * If ofs > 0 the position (index, int) of the element in the heap is
- * also stored in the element itself at the given offset in bytes.
- */
-#define SET_OFFSET(h, i) do { \
- if (h->ofs > 0) \
- *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \
- } while (0)
-/*
- * RESET_OFFSET is used for sanity checks. It sets ofs
- * to an invalid value.
- */
-#define RESET_OFFSET(h, i) do { \
- if (h->ofs > 0) \
- *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \
- } while (0)
-
-int
-heap_insert(struct dn_heap *h, uint64_t key1, void *p)
-{
- int son = h->elements;
-
- //log("%s key %llu p %p\n", __FUNCTION__, key1, p);
- if (p == NULL) { /* data already there, set starting point */
- son = key1;
- } else { /* insert new element at the end, possibly resize */
- son = h->elements;
- if (son == h->size) /* need resize... */
- // XXX expand by 16 or so
- if (heap_resize(h, h->elements+16) )
- return 1; /* failure... */
- h->p[son].object = p;
- h->p[son].key = key1;
- h->elements++;
- }
- /* make sure that son >= father along the path */
- while (son > 0) {
- int father = HEAP_FATHER(son);
- struct dn_heap_entry tmp;
-
- if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
- break; /* found right position */
- /* son smaller than father, swap and repeat */
- HEAP_SWAP(h->p[son], h->p[father], tmp);
- SET_OFFSET(h, son);
- son = father;
- }
- SET_OFFSET(h, son);
- return 0;
-}
-
-/*
- * remove top element from heap, or obj if obj != NULL
- */
-void
-heap_extract(struct dn_heap *h, void *obj)
-{
- int child, father, max = h->elements - 1;
-
- if (max < 0) {
- printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h);
- return;
- }
- if (obj == NULL)
- father = 0; /* default: move up smallest child */
- else { /* extract specific element, index is at offset */
- if (h->ofs <= 0)
- panic("%s: extract from middle not set on %p\n",
- __FUNCTION__, h);
- father = *((int *)((char *)obj + h->ofs));
- if (father < 0 || father >= h->elements) {
- panic("%s: father %d out of bound 0..%d\n",
- __FUNCTION__, father, h->elements);
- }
- }
- /*
- * below, father is the index of the empty element, which
- * we replace at each step with the smallest child until we
- * reach the bottom level.
- */
- // XXX why removing RESET_OFFSET increases runtime by 10% ?
- RESET_OFFSET(h, father);
- while ( (child = HEAP_LEFT(father)) <= max ) {
- if (child != max &&
- DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
- child++; /* take right child, otherwise left */
- h->p[father] = h->p[child];
- SET_OFFSET(h, father);
- father = child;
- }
- h->elements--;
- if (father != max) {
- /*
- * Fill hole with last entry and bubble up,
- * reusing the insert code
- */
- h->p[father] = h->p[max];
- heap_insert(h, father, NULL);
- }
-}
-
-#if 0
-/*
- * change object position and update references
- * XXX this one is never used!
- */
-static void
-heap_move(struct dn_heap *h, uint64_t new_key, void *object)
-{
- int temp, i, max = h->elements-1;
- struct dn_heap_entry *p, buf;
-
- if (h->ofs <= 0)
- panic("cannot move items on this heap");
- p = h->p; /* shortcut */
-
- i = *((int *)((char *)object + h->ofs));
- if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */
- p[i].key = new_key;
- for (; i>0 &&
- DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key);
- i = temp ) { /* bubble up */
- HEAP_SWAP(p[i], p[temp], buf);
- SET_OFFSET(h, i);
- }
- } else { /* must move down */
- p[i].key = new_key;
- while ( (temp = HEAP_LEFT(i)) <= max ) {
- /* found left child */
- if (temp != max &&
- DN_KEY_LT(p[temp+1].key, p[temp].key))
- temp++; /* select child with min key */
- if (DN_KEY_LT(>p[temp].key, new_key)) {
- /* go down */
- HEAP_SWAP(p[i], p[temp], buf);
- SET_OFFSET(h, i);
- } else
- break;
- i = temp;
- }
- }
- SET_OFFSET(h, i);
-}
-#endif /* heap_move, unused */
-
-/*
- * heapify() will reorganize data inside an array to maintain the
- * heap property. It is needed when we delete a bunch of entries.
- */
-static void
-heapify(struct dn_heap *h)
-{
- int i;
-
- for (i = 0; i < h->elements; i++ )
- heap_insert(h, i , NULL);
-}
-
-int
-heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t),
- uintptr_t arg)
-{
- int i, ret, found;
-
- for (i = found = 0 ; i < h->elements ;) {
- ret = fn(h->p[i].object, arg);
- if (ret & HEAP_SCAN_DEL) {
- h->elements-- ;
- h->p[i] = h->p[h->elements] ;
- found++ ;
- } else
- i++ ;
- if (ret & HEAP_SCAN_END)
- break;
- }
- if (found)
- heapify(h);
- return found;
-}
-
-/*
- * cleanup the heap and free data structure
- */
-void
-heap_free(struct dn_heap *h)
-{
- if (h->size >0 )
- free(h->p, M_DN_HEAP);
- bzero(h, sizeof(*h) );
-}
-
-/*
- * hash table support.
- */
-
-struct dn_ht {
- int buckets; /* how many buckets, really buckets - 1*/
- int entries; /* how many entries */
- int ofs; /* offset of link field */
- uint32_t (*hash)(uintptr_t, int, void *arg);
- int (*match)(void *_el, uintptr_t key, int, void *);
- void *(*newh)(uintptr_t, int, void *);
- void **ht; /* bucket heads */
-};
-/*
- * Initialize, allocating bucket pointers inline.
- * Recycle previous record if possible.
- * If the 'newh' function is not supplied, we assume that the
- * key passed to ht_find is the same object to be stored in.
- */
-struct dn_ht *
-dn_ht_init(struct dn_ht *ht, int buckets, int ofs,
- uint32_t (*h)(uintptr_t, int, void *),
- int (*match)(void *, uintptr_t, int, void *),
- void *(*newh)(uintptr_t, int, void *))
-{
- int l;
-
- /*
- * Notes about rounding bucket size to a power of two.
- * Given the original bucket size, we compute the nearest lower and
- * higher power of two, minus 1 (respectively b_min and b_max) because
- * this value will be used to do an AND with the index returned
- * by hash function.
- * To choice between these two values, the original bucket size is
- * compared with b_min. If the original size is greater than 4/3 b_min,
- * we round the bucket size to b_max, else to b_min.
- * This ratio try to round to the nearest power of two, advantaging
- * the greater size if the different between two power is relatively
- * big.
- * Rounding the bucket size to a power of two avoid the use of
- * module when calculating the correct bucket.
- * The ht->buckets variable store the bucket size - 1 to simply
- * do an AND between the index returned by hash function and ht->bucket
- * instead of a module.
- */
- int b_min; /* min buckets */
- int b_max; /* max buckets */
- int b_ori; /* original buckets */
-
- if (h == NULL || match == NULL) {
- printf("--- missing hash or match function");
- return NULL;
- }
- if (buckets < 1 || buckets > 65536)
- return NULL;
-
- b_ori = buckets;
- /* calculate next power of 2, - 1*/
- buckets |= buckets >> 1;
- buckets |= buckets >> 2;
- buckets |= buckets >> 4;
- buckets |= buckets >> 8;
- buckets |= buckets >> 16;
-
- b_max = buckets; /* Next power */
- b_min = buckets >> 1; /* Previous power */
-
- /* Calculate the 'nearest' bucket size */
- if (b_min * 4000 / 3000 < b_ori)
- buckets = b_max;
- else
- buckets = b_min;
-
- if (ht) { /* see if we can reuse */
- if (buckets <= ht->buckets) {
- ht->buckets = buckets;
- } else {
- /* free pointers if not allocated inline */
- if (ht->ht != (void *)(ht + 1))
- free(ht->ht, M_DN_HEAP);
- free(ht, M_DN_HEAP);
- ht = NULL;
- }
- }
- if (ht == NULL) {
- /* Allocate buckets + 1 entries because buckets is use to
- * do the AND with the index returned by hash function
- */
- l = sizeof(*ht) + (buckets + 1) * sizeof(void **);
- ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO);
- }
- if (ht) {
- ht->ht = (void **)(ht + 1);
- ht->buckets = buckets;
- ht->ofs = ofs;
- ht->hash = h;
- ht->match = match;
- ht->newh = newh;
- }
- return ht;
-}
-
-/* dummy callback for dn_ht_free to unlink all */
-static int
-do_del(void *obj, void *arg)
-{
- return DNHT_SCAN_DEL;
-}
-
-void
-dn_ht_free(struct dn_ht *ht, int flags)
-{
- if (ht == NULL)
- return;
- if (flags & DNHT_REMOVE) {
- (void)dn_ht_scan(ht, do_del, NULL);
- } else {
- if (ht->ht && ht->ht != (void *)(ht + 1))
- free(ht->ht, M_DN_HEAP);
- free(ht, M_DN_HEAP);
- }
-}
-
-int
-dn_ht_entries(struct dn_ht *ht)
-{
- return ht ? ht->entries : 0;
-}
-
-/* lookup and optionally create or delete element */
-void *
-dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg)
-{
- int i;
- void **pp, *p;
-
- if (ht == NULL) /* easy on an empty hash */
- return NULL;
- i = (ht->buckets == 1) ? 0 :
- (ht->hash(key, flags, arg) & ht->buckets);
-
- for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) {
- if (flags & DNHT_MATCH_PTR) {
- if (key == (uintptr_t)p)
- break;
- } else if (ht->match(p, key, flags, arg)) /* found match */
- break;
- }
- if (p) {
- if (flags & DNHT_REMOVE) {
- /* link in the next element */
- *pp = *(void **)((char *)p + ht->ofs);
- *(void **)((char *)p + ht->ofs) = NULL;
- ht->entries--;
- }
- } else if (flags & DNHT_INSERT) {
- // printf("%s before calling new, bucket %d ofs %d\n",
- // __FUNCTION__, i, ht->ofs);
- p = ht->newh ? ht->newh(key, flags, arg) : (void *)key;
- // printf("%s newh returns %p\n", __FUNCTION__, p);
- if (p) {
- ht->entries++;
- *(void **)((char *)p + ht->ofs) = ht->ht[i];
- ht->ht[i] = p;
- }
- }
- return p;
-}
-
-/*
- * do a scan with the option to delete the object. Extract next before
- * running the callback because the element may be destroyed there.
- */
-int
-dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg)
-{
- int i, ret, found = 0;
- void **curp, *cur, *next;
-
- if (ht == NULL || fn == NULL)
- return 0;
- for (i = 0; i <= ht->buckets; i++) {
- curp = &ht->ht[i];
- while ( (cur = *curp) != NULL) {
- next = *(void **)((char *)cur + ht->ofs);
- ret = fn(cur, arg);
- if (ret & DNHT_SCAN_DEL) {
- found++;
- ht->entries--;
- *curp = next;
- } else {
- curp = (void **)((char *)cur + ht->ofs);
- }
- if (ret & DNHT_SCAN_END)
- return found;
- }
- }
- return found;
-}
-
-/*
- * Similar to dn_ht_scan(), except that the scan is performed only
- * in the bucket 'bucket'. The function returns a correct bucket number if
- * the original is invalid.
- * If the callback returns DNHT_SCAN_END, the function move the ht->ht[i]
- * pointer to the last entry processed. Moreover, the bucket number passed
- * by caller is decremented, because usually the caller increment it.
- */
-int
-dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *),
- void *arg)
-{
- int i, ret, found = 0;
- void **curp, *cur, *next;
-
- if (ht == NULL || fn == NULL)
- return 0;
- if (*bucket > ht->buckets)
- *bucket = 0;
- i = *bucket;
-
- curp = &ht->ht[i];
- while ( (cur = *curp) != NULL) {
- next = *(void **)((char *)cur + ht->ofs);
- ret = fn(cur, arg);
- if (ret & DNHT_SCAN_DEL) {
- found++;
- ht->entries--;
- *curp = next;
- } else {
- curp = (void **)((char *)cur + ht->ofs);
- }
- if (ret & DNHT_SCAN_END)
- return found;
- }
- return found;
-}
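
(Illustration, not part of the commit: how the heap API removed above is used. It assumes the declarations from dn_heap.h — struct dn_heap with size/elements/ofs/p, heap_init(), heap_insert(), heap_extract(), heap_free() — and a user-space build of dn_heap.c through its !_KERNEL branch, which expects a local dn_test.h.)

#include <stdio.h>
#include "dn_heap.h"

int
main(void)
{
	struct dn_heap h = { 0 };
	char *late = "late", *early = "early";

	/* ofs == 0: no per-object index, so we only ever extract the top. */
	heap_init(&h, 16, 0);
	heap_insert(&h, 200, late);
	heap_insert(&h, 100, early);	/* smaller key becomes the top */

	while (h.elements > 0) {
		/* element 0 is the top: the smallest key in the heap */
		printf("%llu %s\n", (unsigned long long)h.p[0].key,
		    (char *)h.p[0].object);
		heap_extract(&h, NULL);	/* pop the top entry */
	}
	heap_free(&h);
	return 0;
}

(Prints "100 early" then "200 late": extraction always yields the smallest key first, which is the property dummynet relies on for its heaps.)
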
diff --git a/freebsd/sys/netpfil/ipfw/dn_heap.h b/freebsd/sys/netpfil/ipfw/dn_heap.h
index c95473ad..cb6e03ef 100644
--- a/freebsd/sys/netpfil/ipfw/dn_heap.h
+++ b/freebsd/sys/netpfil/ipfw/dn_heap.h
@@ -83,7 +83,7 @@ enum {
* heap_insert() adds a key-pointer pair to the heap
*
* HEAP_TOP() returns a pointer to the top element of the heap,
- * but makes no checks on its existance (XXX should we change ?)
+ * but makes no checks on its existence (XXX should we change ?)
*
 * heap_extract() removes the entry at the top, returning the pointer.
* (the key should have been read before).
@@ -146,7 +146,7 @@ int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t);
* of the dn_ht_find(), and of the callbacks:
*
* DNHT_KEY_IS_OBJ means the key is the object pointer.
- * It is usally of interest for the hash and match functions.
+ * It is usually of interest for the hash and match functions.
*
* DNHT_MATCH_PTR during a lookup, match pointers instead
* of calling match(). Normally used when removing specific
diff --git a/freebsd/sys/netpfil/ipfw/dn_sched.h b/freebsd/sys/netpfil/ipfw/dn_sched.h
index ab823fe7..ab32771b 100644
--- a/freebsd/sys/netpfil/ipfw/dn_sched.h
+++ b/freebsd/sys/netpfil/ipfw/dn_sched.h
@@ -132,6 +132,10 @@ struct dn_alg {
int (*free_fsk)(struct dn_fsk *f);
int (*new_queue)(struct dn_queue *q);
int (*free_queue)(struct dn_queue *q);
+#ifdef NEW_AQM
+ /* Getting scheduler extra parameters */
+ int (*getconfig)(struct dn_schk *, struct dn_extra_parms *);
+#endif
/* run-time fields */
int ref_count; /* XXX number of instances in the system */
@@ -165,7 +169,13 @@ dn_dequeue(struct dn_queue *q)
struct mbuf *m = q->mq.head;
if (m == NULL)
return NULL;
+#ifdef NEW_AQM
+ /* Call AQM dequeue function */
+ if (q->fs->aqmfp && q->fs->aqmfp->dequeue )
+ return q->fs->aqmfp->dequeue(q);
+#endif
q->mq.head = m->m_nextpkt;
+ q->mq.count--;
/* Update stats for the queue */
q->ni.length--;
@@ -186,6 +196,6 @@ int dn_sched_modevent(module_t mod, int cmd, void *arg);
#name, dn_sched_modevent, dnsched \
}; \
DECLARE_MODULE(name, name##_mod, \
- SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \
- MODULE_DEPEND(name, dummynet, 3, 3, 3);
+ SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY); \
+ MODULE_DEPEND(name, dummynet, 3, 3, 3)
#endif /* _DN_SCHED_H */
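
(For reference: after this change, a scheduler registered with DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc) — as the FIFO scheduler removed below does — expands to roughly the following, now attaching at SI_SUB_PROTO_FIREWALL instead of SI_SUB_PROTO_IFATTACHDOMAIN. The macro head is assumed to match the DECLARE_DNAQM_MODULE pattern shown in dn_aqm.h above.)

static moduledata_t dn_fifo_mod = {
	"dn_fifo", dn_sched_modevent, &fifo_desc
};
DECLARE_MODULE(dn_fifo, dn_fifo_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
MODULE_DEPEND(dn_fifo, dummynet, 3, 3, 3);
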
diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_fifo.c b/freebsd/sys/netpfil/ipfw/dn_sched_fifo.c
deleted file mode 100644
index 154a7ac6..00000000
--- a/freebsd/sys/netpfil/ipfw/dn_sched_fifo.c
+++ /dev/null
@@ -1,122 +0,0 @@
-#include <machine/rtems-bsd-kernel-space.h>
-
-/*
- * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
- * All rights reserved
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * $FreeBSD$
- */
-
-#ifdef _KERNEL
-#include <sys/malloc.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/kernel.h>
-#include <sys/mbuf.h>
-#include <sys/module.h>
-#include <net/if.h> /* IFNAMSIZ */
-#include <netinet/in.h>
-#include <netinet/ip_var.h> /* ipfw_rule_ref */
-#include <netinet/ip_fw.h> /* flow_id */
-#include <netinet/ip_dummynet.h>
-#include <netpfil/ipfw/dn_heap.h>
-#include <netpfil/ipfw/ip_dn_private.h>
-#include <netpfil/ipfw/dn_sched.h>
-#else
-#include <dn_test.h>
-#endif
-
-/*
- * This file implements a FIFO scheduler for a single queue.
- * The queue is allocated as part of the scheduler instance,
- * and there is a single flowset is in the template which stores
- * queue size and policy.
- * Enqueue and dequeue use the default library functions.
- */
-static int
-fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m)
-{
- /* XXX if called with q != NULL and m=NULL, this is a
- * re-enqueue from an existing scheduler, which we should
- * handle.
- */
- return dn_enqueue((struct dn_queue *)(si+1), m, 0);
-}
-
-static struct mbuf *
-fifo_dequeue(struct dn_sch_inst *si)
-{
- return dn_dequeue((struct dn_queue *)(si + 1));
-}
-
-static int
-fifo_new_sched(struct dn_sch_inst *si)
-{
- /* This scheduler instance contains the queue */
- struct dn_queue *q = (struct dn_queue *)(si + 1);
-
- set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
- q->_si = si;
- q->fs = si->sched->fs;
- return 0;
-}
-
-static int
-fifo_free_sched(struct dn_sch_inst *si)
-{
- struct dn_queue *q = (struct dn_queue *)(si + 1);
- dn_free_pkts(q->mq.head);
- bzero(q, sizeof(*q));
- return 0;
-}
-
-/*
- * FIFO scheduler descriptor
- * contains the type of the scheduler, the name, the size of extra
- * data structures, and function pointers.
- */
-static struct dn_alg fifo_desc = {
- _SI( .type = ) DN_SCHED_FIFO,
- _SI( .name = ) "FIFO",
- _SI( .flags = ) 0,
-
- _SI( .schk_datalen = ) 0,
- _SI( .si_datalen = ) sizeof(struct dn_queue),
- _SI( .q_datalen = ) 0,
-
- _SI( .enqueue = ) fifo_enqueue,
- _SI( .dequeue = ) fifo_dequeue,
- _SI( .config = ) NULL,
- _SI( .destroy = ) NULL,
- _SI( .new_sched = ) fifo_new_sched,
- _SI( .free_sched = ) fifo_free_sched,
- _SI( .new_fsk = ) NULL,
- _SI( .free_fsk = ) NULL,
- _SI( .new_queue = ) NULL,
- _SI( .free_queue = ) NULL,
-};
-
-DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc);
diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_fq_codel.h b/freebsd/sys/netpfil/ipfw/dn_sched_fq_codel.h
new file mode 100644
index 00000000..4b65781e
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/dn_sched_fq_codel.h
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * FQ_Codel Structures and helper functions
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_SCHED_FQ_CODEL_H
+#define _IP_DN_SCHED_FQ_CODEL_H
+
+/* list of queues */
+STAILQ_HEAD(fq_codel_list, fq_codel_flow) ;
+
+/* fq_codel parameters including codel */
+struct dn_sch_fq_codel_parms {
+ struct dn_aqm_codel_parms ccfg; /* CoDel Parameters */
+ /* FQ_CODEL Parameters */
+ uint32_t flows_cnt; /* number of flows */
+ uint32_t limit; /* hard limit of fq_codel queue size*/
+ uint32_t quantum;
+}; /* defaults */
+
+/* flow (sub-queue) stats */
+struct flow_stats {
+ uint64_t tot_pkts; /* statistics counters */
+ uint64_t tot_bytes;
+ uint32_t length; /* Queue length, in packets */
+ uint32_t len_bytes; /* Queue length, in bytes */
+ uint32_t drops;
+};
+
+/* A flow of packets (sub-queue).*/
+struct fq_codel_flow {
+ struct mq mq; /* list of packets */
+ struct flow_stats stats; /* statistics */
+ int deficit;
+ int active; /* 1: flow is active (in a list) */
+ struct codel_status cst;
+ STAILQ_ENTRY(fq_codel_flow) flowchain;
+};
+
+/* extra fq_codel scheduler configurations */
+struct fq_codel_schk {
+ struct dn_sch_fq_codel_parms cfg;
+};
+
+/* fq_codel scheduler instance */
+struct fq_codel_si {
+ struct dn_sch_inst _si; /* standard scheduler instance */
+ struct dn_queue main_q; /* main queue is after si directly */
+
+ struct fq_codel_flow *flows; /* array of flows (queues) */
+ uint32_t perturbation; /* random value */
+ struct fq_codel_list newflows; /* list of new queues */
+ struct fq_codel_list oldflows; /* list of old queues */
+};
+
+/* Helper function to update sub-queue, main-queue and scheduler statistics.
+ * negative len + drop -> drop
+ * negative len -> dequeue
+ * positive len -> enqueue
+ * positive len + drop -> drop during enqueue
+ */
+__inline static void
+fq_update_stats(struct fq_codel_flow *q, struct fq_codel_si *si, int len,
+ int drop)
+{
+ int inc = 0;
+
+ if (len < 0)
+ inc = -1;
+ else if (len > 0)
+ inc = 1;
+
+ if (drop) {
+ si->main_q.ni.drops ++;
+ q->stats.drops ++;
+ si->_si.ni.drops ++;
+ io_pkt_drop ++;
+ }
+
+ if (!drop || (drop && len < 0)) {
+ /* Update stats for the main queue */
+ si->main_q.ni.length += inc;
+ si->main_q.ni.len_bytes += len;
+
+ /*update sub-queue stats */
+ q->stats.length += inc;
+ q->stats.len_bytes += len;
+
+ /*update scheduler instance stats */
+ si->_si.ni.length += inc;
+ si->_si.ni.len_bytes += len;
+ }
+
+ if (inc > 0) {
+ si->main_q.ni.tot_bytes += len;
+ si->main_q.ni.tot_pkts ++;
+
+ q->stats.tot_bytes +=len;
+ q->stats.tot_pkts++;
+
+ si->_si.ni.tot_bytes +=len;
+ si->_si.ni.tot_pkts ++;
+ }
+
+}
+
+/* extract the head of fq_codel sub-queue */
+__inline static struct mbuf *
+fq_codel_extract_head(struct fq_codel_flow *q, aqm_time_t *pkt_ts, struct fq_codel_si *si)
+{
+ struct mbuf *m = q->mq.head;
+
+ if (m == NULL)
+ return m;
+ q->mq.head = m->m_nextpkt;
+
+ fq_update_stats(q, si, -m->m_pkthdr.len, 0);
+
+ if (si->main_q.ni.length == 0) /* queue is now idle */
+ si->main_q.q_time = dn_cfg.curr_time;
+
+ /* extract packet timestamp*/
+ struct m_tag *mtag;
+ mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+ if (mtag == NULL){
+ D("timestamp tag is not found!");
+ *pkt_ts = 0;
+ } else {
+ *pkt_ts = *(aqm_time_t *)(mtag + 1);
+ m_tag_delete(m,mtag);
+ }
+
+ return m;
+}
+
+
+#endif
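
(Illustration, not part of the commit: the quantum and deficit fields above drive a deficit-round-robin visit of the sub-queues. A stand-alone sketch of that accounting follows — toy_flow and visit() are hypothetical names, not the dummynet implementation.)

#include <stdio.h>

/*
 * One deficit-round-robin visit: the flow earns 'quantum' bytes of credit,
 * then sends head packets for as long as the credit covers them.
 */
struct toy_flow {
	int	deficit;
	int	pkts[4];	/* packet sizes, in arrival order */
	int	head;
	int	npkts;
};

static void
visit(struct toy_flow *f, int quantum)
{

	f->deficit += quantum;
	while (f->head < f->npkts && f->deficit >= f->pkts[f->head]) {
		f->deficit -= f->pkts[f->head];
		printf("sent %d bytes, deficit now %d\n",
		    f->pkts[f->head], f->deficit);
		f->head++;
	}
}

int
main(void)
{
	struct toy_flow f = { 0, { 1500, 300, 200, 0 }, 0, 3 };

	visit(&f, 1514);	/* first round: only the 1500-byte packet fits */
	visit(&f, 1514);	/* second round: the two small packets go out */
	return 0;
}
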
diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h b/freebsd/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h
new file mode 100644
index 00000000..da663dc8
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h
@@ -0,0 +1,187 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Copyright (C) 2011-2014 Kathleen Nichols <nichols@pollere.com>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * o Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ *
+ * o Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * o The names of the authors may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General Public
+ * License ("GPL") version 2, in which case the provisions of the GPL
+ * apply INSTEAD OF those given above.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _IP_DN_SCHED_FQ_CODEL_HELPER_H
+#define _IP_DN_SCHED_FQ_CODEL_HELPER_H
+
+__inline static struct mbuf *
+fqc_dodequeue(struct fq_codel_flow *q, aqm_time_t now, uint16_t *ok_to_drop,
+ struct fq_codel_si *si)
+{
+ struct mbuf * m;
+ struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1);
+ aqm_time_t pkt_ts, sojourn_time;
+
+ *ok_to_drop = 0;
+ m = fq_codel_extract_head(q, &pkt_ts, si);
+
+ if (m == NULL) {
+ /*queue is empty - we can't be above target*/
+ q->cst.first_above_time= 0;
+ return m;
+ }
+
+ /* To span a large range of bandwidths, CoDel runs two
+ * different AQMs in parallel. One is sojourn-time-based
+ * and takes effect when the time to send an MTU-sized
+ * packet is less than target. The 1st term of the "if"
+ * below does this. The other is backlog-based and takes
+ * effect when the time to send an MTU-sized packet is >=
+ * target. The goal here is to keep the output link
+ * utilization high by never allowing the queue to get
+ * smaller than the amount that arrives in a typical
+ * interarrival time (MTU-sized packets arriving spaced
+ * by the amount of time it takes to send such a packet on
+ * the bottleneck). The 2nd term of the "if" does this.
+ */
+ sojourn_time = now - pkt_ts;
+ if (sojourn_time < schk->cfg.ccfg.target || q->stats.len_bytes <= q->cst.maxpkt_size) {
+ /* went below - stay below for at least interval */
+ q->cst.first_above_time = 0;
+ } else {
+ if (q->cst.first_above_time == 0) {
+ /* just went above from below. if still above at
+ * first_above_time, will say it's ok to drop. */
+ q->cst.first_above_time = now + schk->cfg.ccfg.interval;
+ } else if (now >= q->cst.first_above_time) {
+ *ok_to_drop = 1;
+ }
+ }
+ return m;
+}
+
+/* Codel dequeue function */
+__inline static struct mbuf *
+fqc_codel_dequeue(struct fq_codel_flow *q, struct fq_codel_si *si)
+{
+ struct mbuf *m;
+ struct dn_aqm_codel_parms *cprms;
+ struct codel_status *cst;
+ aqm_time_t now;
+ uint16_t ok_to_drop;
+ struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1);
+
+ cst = &q->cst;
+ cprms = &schk->cfg.ccfg;
+
+ now = AQM_UNOW;
+ m = fqc_dodequeue(q, now, &ok_to_drop, si);
+
+ if (cst->dropping) {
+ if (!ok_to_drop) {
+ /* sojourn time below target - leave dropping state */
+ cst->dropping = false;
+ }
+
+ /* Time for the next drop. Drop current packet and dequeue
+ * next. If the dequeue doesn't take us out of dropping
+ * state, schedule the next drop. A large backlog might
+ * result in drop rates so high that the next drop should
+ * happen now, hence the 'while' loop.
+ */
+ while (now >= cst->drop_next_time && cst->dropping) {
+
+ /* mark the packet */
+ if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) {
+ cst->count++;
+ /* schedule the next mark. */
+ cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time);
+ return m;
+ }
+
+ /* drop the packet */
+ fq_update_stats(q, si, 0, 1);
+ m_freem(m);
+ m = fqc_dodequeue(q, now, &ok_to_drop, si);
+
+ if (!ok_to_drop) {
+ /* leave dropping state */
+ cst->dropping = false;
+ } else {
+ cst->count++;
+ /* schedule the next drop. */
+ cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time);
+ }
+ }
+ /* If we get here we're not in dropping state. The 'ok_to_drop'
+ * return from dodequeue means that the sojourn time has been
+ * above 'target' for 'interval' so enter dropping state.
+ */
+ } else if (ok_to_drop) {
+
+ /* if ECN option is disabled or the packet cannot be marked,
+ * drop the packet and extract another.
+ */
+ if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) {
+ fq_update_stats(q, si, 0, 1);
+ m_freem(m);
+ m = fqc_dodequeue(q, now, &ok_to_drop,si);
+ }
+
+ cst->dropping = true;
+
+ /* If min went above target close to when it last went
+ * below, assume that the drop rate that controlled the
+ * queue on the last cycle is a good starting point to
+ * control it now. ('drop_next' will be at most 'interval'
+ * later than the time of the last drop so 'now - drop_next'
+ * is a good approximation of the time from the last drop
+ * until now.)
+ */
+ cst->count = (cst->count > 2 && ((aqm_stime_t)now -
+ (aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)? cst->count - 2 : 1;
+
+ /* we don't have to set initial guess for Newton's method isqrt as
+ * we initialize isqrt in the control_law function when count == 1 */
+ cst->drop_next_time = control_law(cst, cprms, now);
+ }
+
+ return m;
+}
+
+#endif
diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_prio.c b/freebsd/sys/netpfil/ipfw/dn_sched_prio.c
deleted file mode 100644
index 0679db9d..00000000
--- a/freebsd/sys/netpfil/ipfw/dn_sched_prio.c
+++ /dev/null
@@ -1,231 +0,0 @@
-#include <machine/rtems-bsd-kernel-space.h>
-
-/*
- * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
- * All rights reserved
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * $FreeBSD$
- */
-#ifdef _KERNEL
-#include <sys/malloc.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/kernel.h>
-#include <sys/mbuf.h>
-#include <sys/module.h>
-#include <net/if.h> /* IFNAMSIZ */
-#include <netinet/in.h>
-#include <netinet/ip_var.h> /* ipfw_rule_ref */
-#include <netinet/ip_fw.h> /* flow_id */
-#include <netinet/ip_dummynet.h>
-#include <netpfil/ipfw/dn_heap.h>
-#include <netpfil/ipfw/ip_dn_private.h>
-#include <netpfil/ipfw/dn_sched.h>
-#else
-#include <dn_test.h>
-#endif
-
-#define DN_SCHED_PRIO 5 //XXX
-
-#if !defined(_KERNEL) || !defined(__linux__)
-#define test_bit(ix, pData) ((*pData) & (1<<(ix)))
-#define __set_bit(ix, pData) (*pData) |= (1<<(ix))
-#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
-#endif
-
-#ifdef __MIPSEL__
-#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
-#endif
-
-/* Size of the array of queue pointers. */
-#define BITMAP_T unsigned long
-#define MAXPRIO (sizeof(BITMAP_T) * 8)
-
-/*
- * The scheduler instance contains an array of pointers to queues,
- * one for each priority, and a bitmap listing backlogged queues.
- */
-struct prio_si {
- BITMAP_T bitmap; /* array bitmap */
- struct dn_queue *q_array[MAXPRIO]; /* Array of queue pointers */
-};
-
-/*
- * If a queue with the same priority is already backlogged, use
- * that one instead of the queue passed as argument.
- */
-static int
-prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
-{
- struct prio_si *si = (struct prio_si *)(_si + 1);
- int prio = q->fs->fs.par[0];
-
- if (test_bit(prio, &si->bitmap) == 0) {
- /* No queue with this priority, insert */
- __set_bit(prio, &si->bitmap);
- si->q_array[prio] = q;
- } else { /* use the existing queue */
- q = si->q_array[prio];
- }
- if (dn_enqueue(q, m, 0))
- return 1;
- return 0;
-}
-
-/*
- * Packets are dequeued only from the highest priority queue.
- * The function ffs() returns the lowest set bit in the bitmap; that bit
- * number minus 1 is the array index which contains the pointer to the
- * highest priority queue.
- * After the dequeue, if this queue becomes empty, its index is removed
- * from the bitmap.
- * The scheduler is idle if the bitmap is empty.
- *
- * NOTE: highest priority is 0, lowest is sched->max_prio_q
- */
-static struct mbuf *
-prio_dequeue(struct dn_sch_inst *_si)
-{
- struct prio_si *si = (struct prio_si *)(_si + 1);
- struct mbuf *m;
- struct dn_queue *q;
- int prio;
-
- if (si->bitmap == 0) /* scheduler idle */
- return NULL;
-
- prio = ffs(si->bitmap) - 1;
-
- /* Take the highest priority queue in the scheduler */
- q = si->q_array[prio];
- // assert(q)
-
- m = dn_dequeue(q);
- if (q->mq.head == NULL) {
- /* Queue is now empty, remove from scheduler
- * and mark it
- */
- si->q_array[prio] = NULL;
- __clear_bit(prio, &si->bitmap);
- }
- return m;
-}
-
-static int
-prio_new_sched(struct dn_sch_inst *_si)
-{
- struct prio_si *si = (struct prio_si *)(_si + 1);
-
- bzero(si->q_array, sizeof(si->q_array));
- si->bitmap = 0;
-
- return 0;
-}
-
-static int
-prio_new_fsk(struct dn_fsk *fs)
-{
- /* Check if the priority is between 0 and MAXPRIO-1 */
- ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority");
- return 0;
-}
-
-static int
-prio_new_queue(struct dn_queue *q)
-{
- struct prio_si *si = (struct prio_si *)(q->_si + 1);
- int prio = q->fs->fs.par[0];
- struct dn_queue *oldq;
-
- q->ni.oid.subtype = DN_SCHED_PRIO;
-
- if (q->mq.head == NULL)
- return 0;
-
- /* Queue already has packets, so insert it in the scheduler or append
- * its mbufs to the existing queue. This partly duplicates prio_enqueue.
- */
- if (test_bit(prio, &si->bitmap) == 0) {
- /* No queue with this priority, insert */
- __set_bit(prio, &si->bitmap);
- si->q_array[prio] = q;
- } else if ( (oldq = si->q_array[prio]) != q) {
- /* must append to the existing queue.
- * can simply append q->mq.head to q2->...
- * and add the counters to those of q2
- */
- oldq->mq.tail->m_nextpkt = q->mq.head;
- oldq->mq.tail = q->mq.tail;
- oldq->ni.length += q->ni.length;
- q->ni.length = 0;
- oldq->ni.len_bytes += q->ni.len_bytes;
- q->ni.len_bytes = 0;
- q->mq.tail = q->mq.head = NULL;
- }
- return 0;
-}
-
-static int
-prio_free_queue(struct dn_queue *q)
-{
- int prio = q->fs->fs.par[0];
- struct prio_si *si = (struct prio_si *)(q->_si + 1);
-
- if (si->q_array[prio] == q) {
- si->q_array[prio] = NULL;
- __clear_bit(prio, &si->bitmap);
- }
- return 0;
-}
-
-
-static struct dn_alg prio_desc = {
- _SI( .type = ) DN_SCHED_PRIO,
- _SI( .name = ) "PRIO",
- _SI( .flags = ) DN_MULTIQUEUE,
-
- /* we need extra space in the si and the queue */
- _SI( .schk_datalen = ) 0,
- _SI( .si_datalen = ) sizeof(struct prio_si),
- _SI( .q_datalen = ) 0,
-
- _SI( .enqueue = ) prio_enqueue,
- _SI( .dequeue = ) prio_dequeue,
-
- _SI( .config = ) NULL,
- _SI( .destroy = ) NULL,
- _SI( .new_sched = ) prio_new_sched,
- _SI( .free_sched = ) NULL,
-
- _SI( .new_fsk = ) prio_new_fsk,
- _SI( .free_fsk = ) NULL,
-
- _SI( .new_queue = ) prio_new_queue,
- _SI( .free_queue = ) prio_free_queue,
-};
-
-
-DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc);
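
The PRIO scheduler deleted above keeps one queue pointer per priority plus a bitmap of backlogged priorities, so selecting the next queue is a single ffs() on the bitmap. A self-contained sketch of that lookup follows; the toy_prio names are hypothetical and not part of dummynet.

#include <stddef.h>
#include <strings.h>	/* ffs() */

#define TOY_MAXPRIO	(sizeof(unsigned int) * 8)

struct toy_prio {
	unsigned int bitmap;		/* bit p set => priority p is backlogged */
	void *q_array[TOY_MAXPRIO];	/* one queue pointer per priority */
};

/* Return the backlogged queue with the highest priority (lowest index),
 * or NULL when the scheduler is idle. */
static void *
toy_prio_pick(const struct toy_prio *si)
{
	if (si->bitmap == 0)
		return NULL;
	return si->q_array[ffs((int)si->bitmap) - 1];	/* ffs() is 1-based */
}

Enqueue sets the bit and stores the pointer; dequeue clears the bit once the chosen queue drains, mirroring prio_enqueue()/prio_dequeue() above.
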
diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_qfq.c b/freebsd/sys/netpfil/ipfw/dn_sched_qfq.c
deleted file mode 100644
index 461c40a5..00000000
--- a/freebsd/sys/netpfil/ipfw/dn_sched_qfq.c
+++ /dev/null
@@ -1,866 +0,0 @@
-#include <machine/rtems-bsd-kernel-space.h>
-
-/*
- * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
- * All rights reserved
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * $FreeBSD$
- */
-
-#ifdef _KERNEL
-#include <sys/malloc.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/kernel.h>
-#include <sys/mbuf.h>
-#include <sys/module.h>
-#include <net/if.h> /* IFNAMSIZ */
-#include <netinet/in.h>
-#include <netinet/ip_var.h> /* ipfw_rule_ref */
-#include <netinet/ip_fw.h> /* flow_id */
-#include <netinet/ip_dummynet.h>
-#include <netpfil/ipfw/dn_heap.h>
-#include <netpfil/ipfw/ip_dn_private.h>
-#include <netpfil/ipfw/dn_sched.h>
-#else
-#include <dn_test.h>
-#endif
-
-#ifdef QFQ_DEBUG
-struct qfq_sched;
-static void dump_sched(struct qfq_sched *q, const char *msg);
-#define NO(x) x
-#else
-#define NO(x)
-#endif
-#define DN_SCHED_QFQ 4 // XXX Where?
-typedef unsigned long bitmap;
-
-/*
- * bitmap ops are critical. Some Linux versions have __fls
- * and the bitmap ops. Some machines have ffs.
- */
-#if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
-int fls(unsigned int n)
-{
- int i = 0;
- for (i = 0; n > 0; n >>= 1, i++)
- ;
- return i;
-}
-#endif
-
-#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
-static inline unsigned long __fls(unsigned long word)
-{
- return fls(word) - 1;
-}
-#endif
-
-#if !defined(_KERNEL) || !defined(__linux__)
-#ifdef QFQ_DEBUG
-int test_bit(int ix, bitmap *p)
-{
- if (ix < 0 || ix > 31)
- D("bad index %d", ix);
- return *p & (1<<ix);
-}
-void __set_bit(int ix, bitmap *p)
-{
- if (ix < 0 || ix > 31)
- D("bad index %d", ix);
- *p |= (1<<ix);
-}
-void __clear_bit(int ix, bitmap *p)
-{
- if (ix < 0 || ix > 31)
- D("bad index %d", ix);
- *p &= ~(1<<ix);
-}
-#else /* !QFQ_DEBUG */
-/* XXX do we have fast version, or leave it to the compiler ? */
-#define test_bit(ix, pData) ((*pData) & (1<<(ix)))
-#define __set_bit(ix, pData) (*pData) |= (1<<(ix))
-#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
-#endif /* !QFQ_DEBUG */
-#endif /* !__linux__ */
-
-#ifdef __MIPSEL__
-#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
-#endif
-
-/*-------------------------------------------*/
-/*
-
-Virtual time computations.
-
-S, F and V are all computed in fixed point arithmetic with
-FRAC_BITS decimal bits.
-
- QFQ_MAX_INDEX is the maximum index allowed for a group. We need
- one bit per index.
- QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
- The layout of the bits is as below:
-
- [ MTU_SHIFT ][ FRAC_BITS ]
- [ MAX_INDEX ][ MIN_SLOT_SHIFT ]
- ^.__grp->index = 0
- *.__grp->slot_shift
-
- where MIN_SLOT_SHIFT is derived by difference from the others.
-
-The max group index corresponds to Lmax/w_min, where
-Lmax=1<<MTU_SHIFT, w_min = 1 .
-From this, and knowing how many groups (MAX_INDEX) we want,
-we can derive the shift corresponding to each group.
-
-Because we often need to compute
- F = S + len/w_i and V = V + len/wsum
-instead of storing w_i we store the value
- inv_w = (1<<FRAC_BITS)/w_i
-so we can do F = S + len * inv_w * wsum.
-We use W_TOT in the formulas so we can easily move between
-static and adaptive weight sum.
-
-The per-scheduler-instance data contain all the data structures
-for the scheduler: bitmaps and bucket lists.
-
- */
-/*
- * Maximum number of consecutive slots occupied by backlogged classes
- * inside a group. This is approx lmax/lmin + 5.
- * XXX check because it poses constraints on MAX_INDEX
- */
-#define QFQ_MAX_SLOTS 32
-/*
- * Shifts used for class<->group mapping. Class weights are
- * in the range [1, QFQ_MAX_WEIGHT]; we map each class i to the
- * group with the smallest index that can support the L_i / r_i
- * configured for the class.
- *
- * grp->index is the index of the group; and grp->slot_shift
- * is the shift for the corresponding (scaled) sigma_i.
- *
- * When computing the group index, we do (len<<FP_SHIFT)/weight,
- * then compute an FLS (which is like a log2()), and if the result
- * is below the MAX_INDEX region we use 0 (which is the same as
- * using a larger len).
- */
-#define QFQ_MAX_INDEX 19
-#define QFQ_MAX_WSHIFT 16 /* log2(max_weight) */
-
-#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT)
-#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT)
-//#define IWSUM (q->i_wsum)
-#define IWSUM ((1<<FRAC_BITS)/QFQ_MAX_WSUM)
-
-#define FRAC_BITS 30 /* fixed point arithmetic */
-#define ONE_FP (1UL << FRAC_BITS)
-
-#define QFQ_MTU_SHIFT 11 /* log2(max_len) */
-#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
-
-/*
- * Possible group states, also indexes for the bitmaps array in
- * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3
- */
-enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
-
-struct qfq_group;
-/*
- * additional queue info. Some of this info should come from
- * the flowset; we copy it here for faster processing.
- * This is an overlay of the struct dn_queue
- */
-struct qfq_class {
- struct dn_queue _q;
- uint64_t S, F; /* flow timestamps (exact) */
- struct qfq_class *next; /* Link for the slot list. */
-
- /* group we belong to. In principle we would need the index,
- * which is log_2(lmax/weight), but we never reference it
- * directly, only the group.
- */
- struct qfq_group *grp;
-
- /* these are copied from the flowset. */
- uint32_t inv_w; /* ONE_FP/weight */
- uint32_t lmax; /* Max packet size for this flow. */
-};
-
-/* Group descriptor, see the paper for details.
- * Basically this contains the bucket lists
- */
-struct qfq_group {
- uint64_t S, F; /* group timestamps (approx). */
- unsigned int slot_shift; /* Slot shift. */
- unsigned int index; /* Group index. */
- unsigned int front; /* Index of the front slot. */
- bitmap full_slots; /* non-empty slots */
-
- /* Array of lists of active classes. */
- struct qfq_class *slots[QFQ_MAX_SLOTS];
-};
-
-/* scheduler instance descriptor. */
-struct qfq_sched {
- uint64_t V; /* Precise virtual time. */
- uint32_t wsum; /* weight sum */
- NO(uint32_t i_wsum; /* ONE_FP/w_sum */
- uint32_t _queued; /* debugging */
- uint32_t loops; /* debugging */)
- bitmap bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
- struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
-};
-
-/*---- support functions ----------------------------*/
-
-/* Generic comparison function, handling wraparound. */
-static inline int qfq_gt(uint64_t a, uint64_t b)
-{
- return (int64_t)(a - b) > 0;
-}
-
-/* Round a precise timestamp to its slotted value. */
-static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift)
-{
- return ts & ~((1ULL << shift) - 1);
-}
-
-/* return the pointer to the group with lowest index in the bitmap */
-static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
- unsigned long bitmap)
-{
- int index = ffs(bitmap) - 1; // zero-based
- return &q->groups[index];
-}
-
-/*
- * Calculate a flow index, given its weight and maximum packet length.
- * index = log_2(maxlen/weight) but we need to apply the scaling.
- * This is used only once at flow creation.
- */
-static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen)
-{
- uint64_t slot_size = (uint64_t)maxlen *inv_w;
- unsigned long size_map;
- int index = 0;
-
- size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT);
- if (!size_map)
- goto out;
-
- index = __fls(size_map) + 1; // basically a log_2()
- index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
-
- if (index < 0)
- index = 0;
-
-out:
- ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index);
- return index;
-}
-/*---- end support functions ----*/
-
-/*-------- API calls --------------------------------*/
-/*
- * Validate and copy parameters from flowset.
- */
-static int
-qfq_new_queue(struct dn_queue *_q)
-{
- struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
- struct qfq_class *cl = (struct qfq_class *)_q;
- int i;
- uint32_t w; /* approximated weight */
-
- /* import parameters from the flowset. They should be correct
- * already.
- */
- w = _q->fs->fs.par[0];
- cl->lmax = _q->fs->fs.par[1];
- if (!w || w > QFQ_MAX_WEIGHT) {
- w = 1;
- D("rounding weight to 1");
- }
- cl->inv_w = ONE_FP/w;
- w = ONE_FP/cl->inv_w;
- if (q->wsum + w > QFQ_MAX_WSUM)
- return EINVAL;
-
- i = qfq_calc_index(cl->inv_w, cl->lmax);
- cl->grp = &q->groups[i];
- q->wsum += w;
- // XXX cl->S = q->V; ?
- // XXX compute q->i_wsum
- return 0;
-}
-
-/* remove an empty queue */
-static int
-qfq_free_queue(struct dn_queue *_q)
-{
- struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
- struct qfq_class *cl = (struct qfq_class *)_q;
- if (cl->inv_w) {
- q->wsum -= ONE_FP/cl->inv_w;
- cl->inv_w = 0; /* reset weight to avoid run twice */
- }
- return 0;
-}
-
-/* Calculate a mask to mimic what would be ffs_from(). */
-static inline unsigned long
-mask_from(unsigned long bitmap, int from)
-{
- return bitmap & ~((1UL << from) - 1);
-}
-
-/*
- * The state computation relies on ER=0, IR=1, EB=2, IB=3
- * First compute eligibility comparing grp->S, q->V,
- * then check if someone is blocking us and possibly add EB
- */
-static inline unsigned int
-qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
-{
- /* if S > V we are not eligible */
- unsigned int state = qfq_gt(grp->S, q->V);
- unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
- struct qfq_group *next;
-
- if (mask) {
- next = qfq_ffs(q, mask);
- if (qfq_gt(grp->F, next->F))
- state |= EB;
- }
-
- return state;
-}
-
-/*
- * In principle
- * q->bitmaps[dst] |= q->bitmaps[src] & mask;
- * q->bitmaps[src] &= ~mask;
- * but we should make sure that src != dst
- */
-static inline void
-qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
-{
- q->bitmaps[dst] |= q->bitmaps[src] & mask;
- q->bitmaps[src] &= ~mask;
-}
-
-static inline void
-qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
-{
- unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
- struct qfq_group *next;
-
- if (mask) {
- next = qfq_ffs(q, mask);
- if (!qfq_gt(next->F, old_finish))
- return;
- }
-
- mask = (1UL << index) - 1;
- qfq_move_groups(q, mask, EB, ER);
- qfq_move_groups(q, mask, IB, IR);
-}
-
-/*
- * perhaps
- *
- old_V ^= q->V;
- old_V >>= QFQ_MIN_SLOT_SHIFT;
- if (old_V) {
- ...
- }
- *
- */
-static inline void
-qfq_make_eligible(struct qfq_sched *q, uint64_t old_V)
-{
- unsigned long mask, vslot, old_vslot;
-
- vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
- old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
-
- if (vslot != old_vslot) {
- mask = (2UL << (__fls(vslot ^ old_vslot))) - 1;
- qfq_move_groups(q, mask, IR, ER);
- qfq_move_groups(q, mask, IB, EB);
- }
-}
-
-/*
- * XXX we should make sure that slot becomes less than 32.
- * This is guaranteed by the input values.
- * roundedS is always cl->S rounded on grp->slot_shift bits.
- */
-static inline void
-qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS)
-{
- uint64_t slot = (roundedS - grp->S) >> grp->slot_shift;
- unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;
-
- cl->next = grp->slots[i];
- grp->slots[i] = cl;
- __set_bit(slot, &grp->full_slots);
-}
-
-/*
- * remove the entry from the slot
- */
-static inline void
-qfq_front_slot_remove(struct qfq_group *grp)
-{
- struct qfq_class **h = &grp->slots[grp->front];
-
- *h = (*h)->next;
- if (!*h)
- __clear_bit(0, &grp->full_slots);
-}
-
-/*
- * Returns the first full queue in a group. As a side effect,
- * adjust the bucket list so the first non-empty bucket is at
- * position 0 in full_slots.
- */
-static inline struct qfq_class *
-qfq_slot_scan(struct qfq_group *grp)
-{
- int i;
-
- ND("grp %d full %x", grp->index, grp->full_slots);
- if (!grp->full_slots)
- return NULL;
-
- i = ffs(grp->full_slots) - 1; // zero-based
- if (i > 0) {
- grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
- grp->full_slots >>= i;
- }
-
- return grp->slots[grp->front];
-}
-
-/*
- * adjust the bucket list. When the start time of a group decreases,
- * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
- * move the objects. The mask of occupied slots must be shifted
- * because we use ffs() to find the first non-empty slot.
- * This covers decreases in the group's start time, but what about
- * increases of the start time ?
- * Here too we should make sure that i is less than 32
- */
-static inline void
-qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
-{
- unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
-
- grp->full_slots <<= i;
- grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
-}
-
-
-static inline void
-qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
-{
- bitmap ineligible;
-
- ineligible = q->bitmaps[IR] | q->bitmaps[IB];
- if (ineligible) {
- if (!q->bitmaps[ER]) {
- struct qfq_group *grp;
- grp = qfq_ffs(q, ineligible);
- if (qfq_gt(grp->S, q->V))
- q->V = grp->S;
- }
- qfq_make_eligible(q, old_V);
- }
-}
-
-/*
- * Updates the class and returns true if the group also needs to be updated.
- */
-static inline int
-qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
- struct qfq_class *cl)
-{
-
- cl->S = cl->F;
- if (cl->_q.mq.head == NULL) {
- qfq_front_slot_remove(grp);
- } else {
- unsigned int len;
- uint64_t roundedS;
-
- len = cl->_q.mq.head->m_pkthdr.len;
- cl->F = cl->S + (uint64_t)len * cl->inv_w;
- roundedS = qfq_round_down(cl->S, grp->slot_shift);
- if (roundedS == grp->S)
- return 0;
-
- qfq_front_slot_remove(grp);
- qfq_slot_insert(grp, cl, roundedS);
- }
- return 1;
-}
-
-static struct mbuf *
-qfq_dequeue(struct dn_sch_inst *si)
-{
- struct qfq_sched *q = (struct qfq_sched *)(si + 1);
- struct qfq_group *grp;
- struct qfq_class *cl;
- struct mbuf *m;
- uint64_t old_V;
-
- NO(q->loops++;)
- if (!q->bitmaps[ER]) {
- NO(if (q->queued)
- dump_sched(q, "start dequeue");)
- return NULL;
- }
-
- grp = qfq_ffs(q, q->bitmaps[ER]);
-
- cl = grp->slots[grp->front];
- /* extract from the first bucket in the bucket list */
- m = dn_dequeue(&cl->_q);
-
- if (!m) {
- D("BUG/* non-workconserving leaf */");
- return NULL;
- }
- NO(q->queued--;)
- old_V = q->V;
- q->V += (uint64_t)m->m_pkthdr.len * IWSUM;
- ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);
-
- if (qfq_update_class(q, grp, cl)) {
- uint64_t old_F = grp->F;
- cl = qfq_slot_scan(grp);
- if (!cl) { /* group gone, remove from ER */
- __clear_bit(grp->index, &q->bitmaps[ER]);
- // grp->S = grp->F + 1; // XXX debugging only
- } else {
- uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
- unsigned int s;
-
- if (grp->S == roundedS)
- goto skip_unblock;
- grp->S = roundedS;
- grp->F = roundedS + (2ULL << grp->slot_shift);
- /* remove from ER and put in the new set */
- __clear_bit(grp->index, &q->bitmaps[ER]);
- s = qfq_calc_state(q, grp);
- __set_bit(grp->index, &q->bitmaps[s]);
- }
- /* we need to unblock even if the group has gone away */
- qfq_unblock_groups(q, grp->index, old_F);
- }
-
-skip_unblock:
- qfq_update_eligible(q, old_V);
- NO(if (!q->bitmaps[ER] && q->queued)
- dump_sched(q, "end dequeue");)
-
- return m;
-}
-
-/*
- * Assign a reasonable start time for a new flow k in group i.
- * Admissible values for \hat(F) are multiples of \sigma_i
- * no greater than V+\sigma_i . Larger values mean that
- * we had a wraparound so we consider the timestamp to be stale.
- *
- * If F is not stale and F >= V then we set S = F.
- * Otherwise we should assign S = V, but this may violate
- * the ordering in ER. So, if we have groups in ER, set S to
- * the F_j of the first group j which would be blocking us.
- * We are guaranteed not to move S backward because
- * otherwise our group i would still be blocked.
- */
-static inline void
-qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
-{
- unsigned long mask;
- uint32_t limit, roundedF;
- int slot_shift = cl->grp->slot_shift;
-
- roundedF = qfq_round_down(cl->F, slot_shift);
- limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift);
-
- if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
- /* timestamp was stale */
- mask = mask_from(q->bitmaps[ER], cl->grp->index);
- if (mask) {
- struct qfq_group *next = qfq_ffs(q, mask);
- if (qfq_gt(roundedF, next->F)) {
- cl->S = next->F;
- return;
- }
- }
- cl->S = q->V;
- } else { /* timestamp is not stale */
- cl->S = cl->F;
- }
-}
-
-static int
-qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m)
-{
- struct qfq_sched *q = (struct qfq_sched *)(si + 1);
- struct qfq_group *grp;
- struct qfq_class *cl = (struct qfq_class *)_q;
- uint64_t roundedS;
- int s;
-
- NO(q->loops++;)
- DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len,
- _q, cl->inv_w, cl->grp->index);
- /* XXX verify that the packet obeys the parameters */
- if (m != _q->mq.head) {
- if (dn_enqueue(_q, m, 0)) /* packet was dropped */
- return 1;
- NO(q->queued++;)
- if (m != _q->mq.head)
- return 0;
- }
- /* If we reach this point, queue q was idle */
- grp = cl->grp;
- qfq_update_start(q, cl); /* adjust start time */
- /* compute new finish time and rounded start. */
- cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w;
- roundedS = qfq_round_down(cl->S, grp->slot_shift);
-
- /*
- * insert cl in the correct bucket.
- * If cl->S >= grp->S we don't need to adjust the
- * bucket list and simply go to the insertion phase.
- * Otherwise grp->S is decreasing, we must make room
- * in the bucket list, and also recompute the group state.
- * Finally, if there were no flows in this group and nobody
- * was in ER make sure to adjust V.
- */
- if (grp->full_slots) {
- if (!qfq_gt(grp->S, cl->S))
- goto skip_update;
- /* create a slot for this cl->S */
- qfq_slot_rotate(q, grp, roundedS);
- /* group was surely ineligible, remove */
- __clear_bit(grp->index, &q->bitmaps[IR]);
- __clear_bit(grp->index, &q->bitmaps[IB]);
- } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
- q->V = roundedS;
-
- grp->S = roundedS;
- grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i
- s = qfq_calc_state(q, grp);
- __set_bit(grp->index, &q->bitmaps[s]);
- ND("new state %d 0x%x", s, q->bitmaps[s]);
- ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
-skip_update:
- qfq_slot_insert(grp, cl, roundedS);
-
- return 0;
-}
-
-
-#if 0
-static inline void
-qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
- struct qfq_class *cl, struct qfq_class **pprev)
-{
- unsigned int i, offset;
- uint64_t roundedS;
-
- roundedS = qfq_round_down(cl->S, grp->slot_shift);
- offset = (roundedS - grp->S) >> grp->slot_shift;
- i = (grp->front + offset) % QFQ_MAX_SLOTS;
-
-#ifdef notyet
- if (!pprev) {
- pprev = &grp->slots[i];
- while (*pprev && *pprev != cl)
- pprev = &(*pprev)->next;
- }
-#endif
-
- *pprev = cl->next;
- if (!grp->slots[i])
- __clear_bit(offset, &grp->full_slots);
-}
-
-/*
- * called to forcibly destroy a queue.
- * If the queue is not in the front bucket, or if it has
- * other queues in the front bucket, we can simply remove
- * the queue with no other side effects.
- * Otherwise we must propagate the event up.
- * XXX description to be completed.
- */
-static void
-qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
- struct qfq_class **pprev)
-{
- struct qfq_group *grp = &q->groups[cl->index];
- unsigned long mask;
- uint64_t roundedS;
- int s;
-
- cl->F = cl->S; // not needed if the class goes away.
- qfq_slot_remove(q, grp, cl, pprev);
-
- if (!grp->full_slots) {
- /* nothing left in the group, remove from all sets.
- * Do ER last because if we were blocking other groups
- * we must unblock them.
- */
- __clear_bit(grp->index, &q->bitmaps[IR]);
- __clear_bit(grp->index, &q->bitmaps[EB]);
- __clear_bit(grp->index, &q->bitmaps[IB]);
-
- if (test_bit(grp->index, &q->bitmaps[ER]) &&
- !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
- mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
- if (mask)
- mask = ~((1UL << __fls(mask)) - 1);
- else
- mask = ~0UL;
- qfq_move_groups(q, mask, EB, ER);
- qfq_move_groups(q, mask, IB, IR);
- }
- __clear_bit(grp->index, &q->bitmaps[ER]);
- } else if (!grp->slots[grp->front]) {
- cl = qfq_slot_scan(grp);
- roundedS = qfq_round_down(cl->S, grp->slot_shift);
- if (grp->S != roundedS) {
- __clear_bit(grp->index, &q->bitmaps[ER]);
- __clear_bit(grp->index, &q->bitmaps[IR]);
- __clear_bit(grp->index, &q->bitmaps[EB]);
- __clear_bit(grp->index, &q->bitmaps[IB]);
- grp->S = roundedS;
- grp->F = roundedS + (2ULL << grp->slot_shift);
- s = qfq_calc_state(q, grp);
- __set_bit(grp->index, &q->bitmaps[s]);
- }
- }
- qfq_update_eligible(q, q->V);
-}
-#endif
-
-static int
-qfq_new_fsk(struct dn_fsk *f)
-{
- ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
- ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen");
- ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
- return 0;
-}
-
-/*
- * initialize a new scheduler instance
- */
-static int
-qfq_new_sched(struct dn_sch_inst *si)
-{
- struct qfq_sched *q = (struct qfq_sched *)(si + 1);
- struct qfq_group *grp;
- int i;
-
- for (i = 0; i <= QFQ_MAX_INDEX; i++) {
- grp = &q->groups[i];
- grp->index = i;
- grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS -
- (QFQ_MAX_INDEX - i);
- }
- return 0;
-}
-
-/*
- * QFQ scheduler descriptor
- */
-static struct dn_alg qfq_desc = {
- _SI( .type = ) DN_SCHED_QFQ,
- _SI( .name = ) "QFQ",
- _SI( .flags = ) DN_MULTIQUEUE,
-
- _SI( .schk_datalen = ) 0,
- _SI( .si_datalen = ) sizeof(struct qfq_sched),
- _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue),
-
- _SI( .enqueue = ) qfq_enqueue,
- _SI( .dequeue = ) qfq_dequeue,
-
- _SI( .config = ) NULL,
- _SI( .destroy = ) NULL,
- _SI( .new_sched = ) qfq_new_sched,
- _SI( .free_sched = ) NULL,
- _SI( .new_fsk = ) qfq_new_fsk,
- _SI( .free_fsk = ) NULL,
- _SI( .new_queue = ) qfq_new_queue,
- _SI( .free_queue = ) qfq_free_queue,
-};
-
-DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);
-
-#ifdef QFQ_DEBUG
-static void
-dump_groups(struct qfq_sched *q, uint32_t mask)
-{
- int i, j;
-
- for (i = 0; i < QFQ_MAX_INDEX + 1; i++) {
- struct qfq_group *g = &q->groups[i];
-
- if (0 == (mask & (1<<i)))
- continue;
- for (j = 0; j < QFQ_MAX_SLOTS; j++) {
- if (g->slots[j])
- D(" bucket %d %p", j, g->slots[j]);
- }
- D("full_slots 0x%x", g->full_slots);
- D(" %2d S 0x%20llx F 0x%llx %c", i,
- g->S, g->F,
- mask & (1<<i) ? '1' : '0');
- }
-}
-
-static void
-dump_sched(struct qfq_sched *q, const char *msg)
-{
- D("--- in %s: ---", msg);
- ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V);
- D(" ER 0x%08x", q->bitmaps[ER]);
- D(" EB 0x%08x", q->bitmaps[EB]);
- D(" IR 0x%08x", q->bitmaps[IR]);
- D(" IB 0x%08x", q->bitmaps[IB]);
- dump_groups(q, 0xffffffff);
-};
-#endif /* QFQ_DEBUG */
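
As the virtual-time comment in the deleted file explains, QFQ works in fixed point with FRAC_BITS fractional bits and stores inv_w = ONE_FP/w rather than the weight itself, so advancing a finish timestamp is a multiplication instead of a division. A small sketch of that update with the same constants follows; toy_qfq_finish is an illustrative name only.

#include <stdint.h>

#define TOY_FRAC_BITS	30			/* same as FRAC_BITS above */
#define TOY_ONE_FP	(1ULL << TOY_FRAC_BITS)

/* F = S + len * inv_w, with inv_w = TOY_ONE_FP / w. For w = 4 and
 * len = 1000 the increment is 1000 * TOY_ONE_FP / 4, i.e. 250 bytes
 * of virtual time once the TOY_FRAC_BITS scaling is divided out. */
static uint64_t
toy_qfq_finish(uint64_t S, uint32_t len, uint32_t w)
{
	uint64_t inv_w = TOY_ONE_FP / w;

	return S + (uint64_t)len * inv_w;
}
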
diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_rr.c b/freebsd/sys/netpfil/ipfw/dn_sched_rr.c
deleted file mode 100644
index c1862ab0..00000000
--- a/freebsd/sys/netpfil/ipfw/dn_sched_rr.c
+++ /dev/null
@@ -1,309 +0,0 @@
-#include <machine/rtems-bsd-kernel-space.h>
-
-/*
- * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
- * All rights reserved
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * $FreeBSD$
- */
-
-#ifdef _KERNEL
-#include <sys/malloc.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/kernel.h>
-#include <sys/mbuf.h>
-#include <sys/module.h>
-#include <net/if.h> /* IFNAMSIZ */
-#include <netinet/in.h>
-#include <netinet/ip_var.h> /* ipfw_rule_ref */
-#include <netinet/ip_fw.h> /* flow_id */
-#include <netinet/ip_dummynet.h>
-#include <netpfil/ipfw/dn_heap.h>
-#include <netpfil/ipfw/ip_dn_private.h>
-#include <netpfil/ipfw/dn_sched.h>
-#else
-#include <dn_test.h>
-#endif
-
-#define DN_SCHED_RR 3 // XXX Where?
-
-struct rr_queue {
- struct dn_queue q; /* Standard queue */
- int status; /* 1: queue is in the list */
- int credit; /* Number of bytes to transmit */
- int quantum; /* quantum * C */
- struct rr_queue *qnext; /* next queue in the circular list */
-};
-
-/* struct rr_schk contains global config parameters
- * and is right after dn_schk
- */
-struct rr_schk {
- int min_q; /* Min quantum */
- int max_q; /* Max quantum */
- int q_bytes; /* Bytes per quantum */
-};
-
-/* per-instance round robin list, right after dn_sch_inst */
-struct rr_si {
- struct rr_queue *head, *tail; /* Pointer to current queue */
-};
-
-/* Append a queue to the rr list */
-static inline void
-rr_append(struct rr_queue *q, struct rr_si *si)
-{
- q->status = 1; /* mark as in-rr_list */
- q->credit = q->quantum; /* initialize credit */
-
- /* append to the tail */
- if (si->head == NULL)
- si->head = q;
- else
- si->tail->qnext = q;
- si->tail = q; /* advance the tail pointer */
- q->qnext = si->head; /* make it circular */
-}
-
-/* Remove the head queue from circular list. */
-static inline void
-rr_remove_head(struct rr_si *si)
-{
- if (si->head == NULL)
- return; /* empty queue */
- si->head->status = 0;
-
- if (si->head == si->tail) {
- si->head = si->tail = NULL;
- return;
- }
-
- si->head = si->head->qnext;
- si->tail->qnext = si->head;
-}
-
-/* Remove a queue from circular list.
- * XXX see if it can be merged with remove_queue()
- */
-static inline void
-remove_queue_q(struct rr_queue *q, struct rr_si *si)
-{
- struct rr_queue *prev;
-
- if (q->status != 1)
- return;
- if (q == si->head) {
- rr_remove_head(si);
- return;
- }
-
- for (prev = si->head; prev; prev = prev->qnext) {
- if (prev->qnext != q)
- continue;
- prev->qnext = q->qnext;
- if (q == si->tail)
- si->tail = prev;
- q->status = 0;
- break;
- }
-}
-
-
-static inline void
-next_pointer(struct rr_si *si)
-{
- if (si->head == NULL)
- return; /* empty queue */
-
- si->head = si->head->qnext;
- si->tail = si->tail->qnext;
-}
-
-static int
-rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
-{
- struct rr_si *si;
- struct rr_queue *rrq;
-
- if (m != q->mq.head) {
- if (dn_enqueue(q, m, 0)) /* packet was dropped */
- return 1;
- if (m != q->mq.head)
- return 0;
- }
-
- /* If we reach this point, queue q was idle */
- si = (struct rr_si *)(_si + 1);
- rrq = (struct rr_queue *)q;
-
- if (rrq->status == 1) /* Queue is already in the queue list */
- return 0;
-
- /* Insert the queue in the queue list */
- rr_append(rrq, si);
-
- return 0;
-}
-
-static struct mbuf *
-rr_dequeue(struct dn_sch_inst *_si)
-{
- /* Access scheduler instance private data */
- struct rr_si *si = (struct rr_si *)(_si + 1);
- struct rr_queue *rrq;
- uint64_t len;
-
- while ( (rrq = si->head) ) {
- struct mbuf *m = rrq->q.mq.head;
- if ( m == NULL) {
- /* empty queue, remove from list */
- rr_remove_head(si);
- continue;
- }
- len = m->m_pkthdr.len;
-
- if (len > rrq->credit) {
- /* Packet too big */
- rrq->credit += rrq->quantum;
- /* Try next queue */
- next_pointer(si);
- } else {
- rrq->credit -= len;
- return dn_dequeue(&rrq->q);
- }
- }
-
- /* no packet to dequeue */
- return NULL;
-}
-
-static int
-rr_config(struct dn_schk *_schk)
-{
- struct rr_schk *schk = (struct rr_schk *)(_schk + 1);
- ND("called");
-
- /* use reasonable quantums (64..2k bytes, default 1500) */
- schk->min_q = 64;
- schk->max_q = 2048;
- schk->q_bytes = 1500; /* quantum */
-
- return 0;
-}
-
-static int
-rr_new_sched(struct dn_sch_inst *_si)
-{
- struct rr_si *si = (struct rr_si *)(_si + 1);
-
- ND("called");
- si->head = si->tail = NULL;
-
- return 0;
-}
-
-static int
-rr_free_sched(struct dn_sch_inst *_si)
-{
- ND("called");
- /* Nothing to do? */
- return 0;
-}
-
-static int
-rr_new_fsk(struct dn_fsk *fs)
-{
- struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1);
- /* par[0] is the weight, par[1] is the quantum step */
- ipdn_bound_var(&fs->fs.par[0], 1,
- 1, 65536, "RR weight");
- ipdn_bound_var(&fs->fs.par[1], schk->q_bytes,
- schk->min_q, schk->max_q, "RR quantum");
- return 0;
-}
-
-static int
-rr_new_queue(struct dn_queue *_q)
-{
- struct rr_queue *q = (struct rr_queue *)_q;
-
- _q->ni.oid.subtype = DN_SCHED_RR;
-
- q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1];
- ND("called, q->quantum %d", q->quantum);
- q->credit = q->quantum;
- q->status = 0;
-
- if (_q->mq.head != NULL) {
- /* Queue NOT empty, insert in the queue list */
- rr_append(q, (struct rr_si *)(_q->_si + 1));
- }
- return 0;
-}
-
-static int
-rr_free_queue(struct dn_queue *_q)
-{
- struct rr_queue *q = (struct rr_queue *)_q;
-
- ND("called");
- if (q->status == 1) {
- struct rr_si *si = (struct rr_si *)(_q->_si + 1);
- remove_queue_q(q, si);
- }
- return 0;
-}
-
-/*
- * RR scheduler descriptor
- * contains the type of the scheduler, the name, the size of the
- * structures and function pointers.
- */
-static struct dn_alg rr_desc = {
- _SI( .type = ) DN_SCHED_RR,
- _SI( .name = ) "RR",
- _SI( .flags = ) DN_MULTIQUEUE,
-
- _SI( .schk_datalen = ) 0,
- _SI( .si_datalen = ) sizeof(struct rr_si),
- _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue),
-
- _SI( .enqueue = ) rr_enqueue,
- _SI( .dequeue = ) rr_dequeue,
-
- _SI( .config = ) rr_config,
- _SI( .destroy = ) NULL,
- _SI( .new_sched = ) rr_new_sched,
- _SI( .free_sched = ) rr_free_sched,
- _SI( .new_fsk = ) rr_new_fsk,
- _SI( .free_fsk = ) NULL,
- _SI( .new_queue = ) rr_new_queue,
- _SI( .free_queue = ) rr_free_queue,
-};
-
-
-DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc);
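
The rr_dequeue() removed above is a deficit-round-robin loop: a queue may transmit while its credit covers the head packet, and when the packet does not fit the queue gains one quantum and the scheduler moves on. A stand-alone sketch of the per-queue accounting follows; the toy_drr names are hypothetical.

#include <stdbool.h>
#include <stdint.h>

struct toy_drr_queue {
	int64_t credit;		/* bytes this queue may still send */
	int64_t quantum;	/* bytes added whenever its turn is skipped */
};

/* Decide whether a head packet of 'len' bytes can go out now. Mirrors the
 * credit handling in rr_dequeue(): too-big packets top up the credit and
 * defer the queue to its next turn. */
static bool
toy_drr_can_send(struct toy_drr_queue *q, int64_t len)
{
	if (len > q->credit) {
		q->credit += q->quantum;
		return false;
	}
	q->credit -= len;
	return true;
}
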
diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_wf2q.c b/freebsd/sys/netpfil/ipfw/dn_sched_wf2q.c
deleted file mode 100644
index 77c4bbad..00000000
--- a/freebsd/sys/netpfil/ipfw/dn_sched_wf2q.c
+++ /dev/null
@@ -1,375 +0,0 @@
-#include <machine/rtems-bsd-kernel-space.h>
-
-/*
- * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
- * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
- * All rights reserved
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * $FreeBSD$
- */
-
-#ifdef _KERNEL
-#include <sys/malloc.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/kernel.h>
-#include <sys/mbuf.h>
-#include <sys/module.h>
-#include <net/if.h> /* IFNAMSIZ */
-#include <netinet/in.h>
-#include <netinet/ip_var.h> /* ipfw_rule_ref */
-#include <netinet/ip_fw.h> /* flow_id */
-#include <netinet/ip_dummynet.h>
-#include <netpfil/ipfw/dn_heap.h>
-#include <netpfil/ipfw/ip_dn_private.h>
-#include <netpfil/ipfw/dn_sched.h>
-#else
-#include <dn_test.h>
-#endif
-
-#ifndef MAX64
-#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
-#endif
-
-/*
- * timestamps are computed on 64 bit using fixed point arithmetic.
- * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len
- * and sum of weights, respectively. FRAC_BITS is the number of
- * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large
- * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w
- * using an unsigned 32-bit division, and to avoid wraparounds we need
- * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64
- * As an example
- * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19
- */
-#ifndef FRAC_BITS
-#define FRAC_BITS 28 /* shift for fixed point arithmetic */
-#define ONE_FP (1UL << FRAC_BITS)
-#endif
-
-/*
- * Private information for the scheduler instance:
- * sch_heap (key is Finish time) returns the next queue to serve
- * ne_heap (key is Start time) stores not-eligible queues
- * idle_heap (key=start/finish time) stores idle flows. It must
- * support extract-from-middle.
- * A flow is only in 1 of the three heaps.
- * XXX todo: use a more efficient data structure, e.g. a tree sorted
- * by F with min_subtree(S) in each node
- */
-struct wf2qp_si {
- struct dn_heap sch_heap; /* top extract - key Finish time */
- struct dn_heap ne_heap; /* top extract - key Start time */
- struct dn_heap idle_heap; /* random extract - key Start=Finish time */
- uint64_t V; /* virtual time */
- uint32_t inv_wsum; /* inverse of sum of weights */
- uint32_t wsum; /* sum of weights */
-};
-
-struct wf2qp_queue {
- struct dn_queue _q;
- uint64_t S, F; /* start time, finish time */
- uint32_t inv_w; /* ONE_FP / weight */
- int32_t heap_pos; /* position (index) of struct in heap */
-};
-
-/*
- * This file implements a WF2Q+ scheduler as it has been in dummynet
- * since 2000.
- * The scheduler supports per-flow queues and has O(log N) complexity.
- *
- * WF2Q+ needs to drain entries from the idle heap so that we
- * can keep the sum of weights up to date. We can do it whenever
- * we get a chance, or periodically, or following some other
- * strategy. The function idle_check() drains at most N elements
- * from the idle heap.
- */
-static void
-idle_check(struct wf2qp_si *si, int n, int force)
-{
- struct dn_heap *h = &si->idle_heap;
- while (n-- > 0 && h->elements > 0 &&
- (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
- struct dn_queue *q = HEAP_TOP(h)->object;
- struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
-
- heap_extract(h, NULL);
- /* XXX to let the flowset delete the queue we should
- * mark it as 'unused' by the scheduler.
- */
- alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */
- si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */
- if (si->wsum > 0)
- si->inv_wsum = ONE_FP/si->wsum;
- }
-}
-
-static int
-wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
-{
- struct dn_fsk *fs = q->fs;
- struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
- struct wf2qp_queue *alg_fq;
- uint64_t len = m->m_pkthdr.len;
-
- if (m != q->mq.head) {
- if (dn_enqueue(q, m, 0)) /* packet was dropped */
- return 1;
- if (m != q->mq.head) /* queue was already busy */
- return 0;
- }
-
- /* If we reach this point, queue q was idle */
- alg_fq = (struct wf2qp_queue *)q;
-
- if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
- /* F<S means timestamps are invalid -> brand new queue. */
- alg_fq->S = si->V; /* init start time */
- si->wsum += fs->fs.par[0]; /* add weight of new queue. */
- si->inv_wsum = ONE_FP/si->wsum;
- } else { /* if it was idle then it was in the idle heap */
- heap_extract(&si->idle_heap, q);
- alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */
- }
- alg_fq->F = alg_fq->S + len * alg_fq->inv_w;
-
- /* if nothing is backlogged, make sure this flow is eligible */
- if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
- si->V = MAX64(alg_fq->S, si->V);
-
- /*
- * Look at eligibility. A flow is not eligible if S>V (when
- * this happens, it means that there is some other flow already
- * scheduled for the same pipe, so the sch_heap cannot be
- * empty). If the flow is not eligible we just store it in the
- * ne_heap. Otherwise, we store in the sch_heap.
- * Note that for all flows in sch_heap (SCH), S_i <= V,
- * and for all flows in ne_heap (NEH), S_i > V.
- * So when we need to compute max(V, min(S_i)) forall i in
- * SCH+NEH, we only need to look into NEH.
- */
- if (DN_KEY_LT(si->V, alg_fq->S)) {
- /* S>V means flow Not eligible. */
- if (si->sch_heap.elements == 0)
- D("++ ouch! not eligible but empty scheduler!");
- heap_insert(&si->ne_heap, alg_fq->S, q);
- } else {
- heap_insert(&si->sch_heap, alg_fq->F, q);
- }
- return 0;
-}
-
-/* XXX invariant: sch > 0 || V >= min(S in neh) */
-static struct mbuf *
-wf2qp_dequeue(struct dn_sch_inst *_si)
-{
- /* Access scheduler instance private data */
- struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
- struct mbuf *m;
- struct dn_queue *q;
- struct dn_heap *sch = &si->sch_heap;
- struct dn_heap *neh = &si->ne_heap;
- struct wf2qp_queue *alg_fq;
-
- if (sch->elements == 0 && neh->elements == 0) {
- /* we have nothing to do. We could kill the idle heap
- * altogether and reset V
- */
- idle_check(si, 0x7fffffff, 1);
- si->V = 0;
- si->wsum = 0; /* should be set already */
- return NULL; /* quick return if nothing to do */
- }
- idle_check(si, 1, 0); /* drain something from the idle heap */
-
- /* make sure at least one element is eligible, bumping V
- * and moving entries that have become eligible.
- * We need to repeat the first part twice, before and
- * after extracting the candidate, or enqueue() will
- * find the data structure in a wrong state.
- */
- m = NULL;
- for(;;) {
- /*
- * Compute V = max(V, min(S_i)). Remember that all elements
- * in sch have by definition S_i <= V so if sch is not empty,
- * V is surely the max and we must not update it. Conversely,
- * if sch is empty we only need to look at neh.
- * We don't need to move the queues, as it will be done at the
- * next enqueue
- */
- if (sch->elements == 0 && neh->elements > 0) {
- si->V = MAX64(si->V, HEAP_TOP(neh)->key);
- }
- while (neh->elements > 0 &&
- DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
- q = HEAP_TOP(neh)->object;
- alg_fq = (struct wf2qp_queue *)q;
- heap_extract(neh, NULL);
- heap_insert(sch, alg_fq->F, q);
- }
- if (m) /* pkt found in previous iteration */
- break;
- /* ok we have at least one eligible pkt */
- q = HEAP_TOP(sch)->object;
- alg_fq = (struct wf2qp_queue *)q;
- m = dn_dequeue(q);
- heap_extract(sch, NULL); /* Remove queue from heap. */
- si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
- alg_fq->S = alg_fq->F; /* Update start time. */
- if (q->mq.head == 0) { /* not backlogged any more. */
- heap_insert(&si->idle_heap, alg_fq->F, q);
- } else { /* Still backlogged. */
- /* Update F, store in neh or sch */
- uint64_t len = q->mq.head->m_pkthdr.len;
- alg_fq->F += len * alg_fq->inv_w;
- if (DN_KEY_LEQ(alg_fq->S, si->V)) {
- heap_insert(sch, alg_fq->F, q);
- } else {
- heap_insert(neh, alg_fq->S, q);
- }
- }
- }
- return m;
-}
-
-static int
-wf2qp_new_sched(struct dn_sch_inst *_si)
-{
- struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
- int ofs = offsetof(struct wf2qp_queue, heap_pos);
-
- /* all heaps support extract from middle */
- if (heap_init(&si->idle_heap, 16, ofs) ||
- heap_init(&si->sch_heap, 16, ofs) ||
- heap_init(&si->ne_heap, 16, ofs)) {
- heap_free(&si->ne_heap);
- heap_free(&si->sch_heap);
- heap_free(&si->idle_heap);
- return ENOMEM;
- }
- return 0;
-}
-
-static int
-wf2qp_free_sched(struct dn_sch_inst *_si)
-{
- struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
-
- heap_free(&si->sch_heap);
- heap_free(&si->ne_heap);
- heap_free(&si->idle_heap);
-
- return 0;
-}
-
-static int
-wf2qp_new_fsk(struct dn_fsk *fs)
-{
- ipdn_bound_var(&fs->fs.par[0], 1,
- 1, 100, "WF2Q+ weight");
- return 0;
-}
-
-static int
-wf2qp_new_queue(struct dn_queue *_q)
-{
- struct wf2qp_queue *q = (struct wf2qp_queue *)_q;
-
- _q->ni.oid.subtype = DN_SCHED_WF2QP;
- q->F = 0; /* not strictly necessary */
- q->S = q->F + 1; /* mark timestamp as invalid. */
- q->inv_w = ONE_FP / _q->fs->fs.par[0];
- if (_q->mq.head != NULL) {
- wf2qp_enqueue(_q->_si, _q, _q->mq.head);
- }
- return 0;
-}
-
-/*
- * Called when the infrastructure removes a queue (e.g. flowset
- * is reconfigured). Nothing to do if we did not 'own' the queue,
- * otherwise remove it from the right heap and adjust the sum
- * of weights.
- */
-static int
-wf2qp_free_queue(struct dn_queue *q)
-{
- struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
- struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);
-
- if (alg_fq->S >= alg_fq->F + 1)
- return 0; /* nothing to do, not in any heap */
- si->wsum -= q->fs->fs.par[0];
- if (si->wsum > 0)
- si->inv_wsum = ONE_FP/si->wsum;
-
- /* extract from the heap. XXX TODO we may need to adjust V
- * to make sure the invariants hold.
- */
- if (q->mq.head == NULL) {
- heap_extract(&si->idle_heap, q);
- } else if (DN_KEY_LT(si->V, alg_fq->S)) {
- heap_extract(&si->ne_heap, q);
- } else {
- heap_extract(&si->sch_heap, q);
- }
- return 0;
-}
-
-/*
- * WF2Q+ scheduler descriptor
- * contains the type of the scheduler, the name, the size of the
- * structures and function pointers.
- */
-static struct dn_alg wf2qp_desc = {
- _SI( .type = ) DN_SCHED_WF2QP,
- _SI( .name = ) "WF2Q+",
- _SI( .flags = ) DN_MULTIQUEUE,
-
- /* we need extra space in the si and the queue */
- _SI( .schk_datalen = ) 0,
- _SI( .si_datalen = ) sizeof(struct wf2qp_si),
- _SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
- sizeof(struct dn_queue),
-
- _SI( .enqueue = ) wf2qp_enqueue,
- _SI( .dequeue = ) wf2qp_dequeue,
-
- _SI( .config = ) NULL,
- _SI( .destroy = ) NULL,
- _SI( .new_sched = ) wf2qp_new_sched,
- _SI( .free_sched = ) wf2qp_free_sched,
-
- _SI( .new_fsk = ) wf2qp_new_fsk,
- _SI( .free_fsk = ) NULL,
-
- _SI( .new_queue = ) wf2qp_new_queue,
- _SI( .free_queue = ) wf2qp_free_queue,
-};
-
-
-DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);
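
In the WF2Q+ code deleted above, dequeue advances the system virtual time by len * inv_wsum and a flow is served only while its start time does not exceed V; flows with S > V wait in the not-eligible heap. Below is a minimal sketch of those two rules with the same ONE_FP scaling, ignoring the wraparound-safe DN_KEY_* comparisons the real code uses; the toy_wf2q names are illustrative only.

#include <stdbool.h>
#include <stdint.h>

#define TOY_FRAC_BITS	28			/* same as FRAC_BITS above */
#define TOY_ONE_FP	(1ULL << TOY_FRAC_BITS)

/* Advance virtual time after sending 'len' bytes, where
 * inv_wsum = TOY_ONE_FP / (sum of the active weights). */
static uint64_t
toy_wf2q_advance_v(uint64_t V, uint32_t len, uint32_t inv_wsum)
{
	return V + (uint64_t)len * inv_wsum;
}

/* A flow whose start time S has been reached by V is eligible for service. */
static bool
toy_wf2q_eligible(uint64_t S, uint64_t V)
{
	return S <= V;
}
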
diff --git a/freebsd/sys/netpfil/ipfw/ip_dn_glue.c b/freebsd/sys/netpfil/ipfw/ip_dn_glue.c
deleted file mode 100644
index 8e0cc36d..00000000
--- a/freebsd/sys/netpfil/ipfw/ip_dn_glue.c
+++ /dev/null
@@ -1,848 +0,0 @@
-#include <machine/rtems-bsd-kernel-space.h>
-
-/*-
- * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
- * All rights reserved
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * $FreeBSD$
- *
- * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8
- */
-
-#include <rtems/bsd/local/opt_inet6.h>
-
-#include <rtems/bsd/sys/param.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/kernel.h>
-#include <rtems/bsd/sys/lock.h>
-#include <sys/module.h>
-#include <sys/priv.h>
-#include <sys/proc.h>
-#include <sys/rwlock.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/time.h>
-#include <sys/taskqueue.h>
-#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
-#include <netinet/in.h>
-#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
-#include <netinet/ip_fw.h>
-#include <netinet/ip_dummynet.h>
-
-#include <netpfil/ipfw/ip_fw_private.h>
-#include <netpfil/ipfw/dn_heap.h>
-#include <netpfil/ipfw/ip_dn_private.h>
-#include <netpfil/ipfw/dn_sched.h>
-
-/* FREEBSD7.2 ip_dummynet.h r191715*/
-
-struct dn_heap_entry7 {
- int64_t key; /* sorting key. Topmost element is smallest one */
- void *object; /* object pointer */
-};
-
-struct dn_heap7 {
- int size;
- int elements;
- int offset; /* XXX if > 0 this is the offset of direct ptr to obj */
- struct dn_heap_entry7 *p; /* really an array of "size" entries */
-};
-
-/* Common to 7.2 and 8 */
-struct dn_flow_set {
- SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */
-
- u_short fs_nr ; /* flow_set number */
- u_short flags_fs;
-#define DNOLD_HAVE_FLOW_MASK 0x0001
-#define DNOLD_IS_RED 0x0002
-#define DNOLD_IS_GENTLE_RED 0x0004
-#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */
-#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */
-#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */
-#define DNOLD_IS_PIPE 0x4000
-#define DNOLD_IS_QUEUE 0x8000
-
- struct dn_pipe7 *pipe ; /* pointer to parent pipe */
- u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */
-
- int weight ; /* WFQ queue weight */
- int qsize ; /* queue size in slots or bytes */
- int plr ; /* pkt loss rate (2^31-1 means 100%) */
-
- struct ipfw_flow_id flow_mask ;
-
- /* hash table of queues onto this flow_set */
- int rq_size ; /* number of slots */
- int rq_elements ; /* active elements */
- struct dn_flow_queue7 **rq; /* array of rq_size entries */
-
- u_int32_t last_expired ; /* do not expire too frequently */
- int backlogged ; /* #active queues for this flowset */
-
- /* RED parameters */
-#define SCALE_RED 16
-#define SCALE(x) ( (x) << SCALE_RED )
-#define SCALE_VAL(x) ( (x) >> SCALE_RED )
-#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED )
- int w_q ; /* queue weight (scaled) */
- int max_th ; /* maximum threshold for queue (scaled) */
- int min_th ; /* minimum threshold for queue (scaled) */
- int max_p ; /* maximum value for p_b (scaled) */
- u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
- u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
- u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
- u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
- u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
- u_int lookup_depth ; /* depth of lookup table */
- int lookup_step ; /* granularity inside the lookup table */
- int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
- int avg_pkt_size ; /* average packet size */
- int max_pkt_size ; /* max packet size */
-};
-SLIST_HEAD(dn_flow_set_head, dn_flow_set);
-
-#define DN_IS_PIPE 0x4000
-#define DN_IS_QUEUE 0x8000
-struct dn_flow_queue7 {
- struct dn_flow_queue7 *next ;
- struct ipfw_flow_id id ;
-
- struct mbuf *head, *tail ; /* queue of packets */
- u_int len ;
- u_int len_bytes ;
-
- u_long numbytes;
-
- u_int64_t tot_pkts ; /* statistics counters */
- u_int64_t tot_bytes ;
- u_int32_t drops ;
-
- int hash_slot ; /* debugging/diagnostic */
-
- /* RED parameters */
- int avg ; /* average queue length est. (scaled) */
- int count ; /* arrivals since last RED drop */
- int random ; /* random value (scaled) */
- u_int32_t q_time; /* start of queue idle time */
-
- /* WF2Q+ support */
- struct dn_flow_set *fs ; /* parent flow set */
- int heap_pos ; /* position (index) of struct in heap */
- int64_t sched_time ; /* current time when queue enters ready_heap */
-
- int64_t S,F ; /* start time, finish time */
-};
-
-struct dn_pipe7 { /* a pipe */
- SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */
-
- int pipe_nr ; /* number */
- int bandwidth; /* really, bytes/tick. */
- int delay ; /* really, ticks */
-
- struct mbuf *head, *tail ; /* packets in delay line */
-
- /* WF2Q+ */
- struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
- struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
- struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
-
- int64_t V ; /* virtual time */
- int sum; /* sum of weights of all active sessions */
-
- int numbytes;
-
- int64_t sched_time ; /* time pipe was scheduled in ready_heap */
-
- /*
- * When the tx clock comes from an interface (if_name[0] != '\0'), its name
- * is stored below, whereas the ifp is filled when the rule is configured.
- */
- char if_name[IFNAMSIZ];
- struct ifnet *ifp ;
- int ready ; /* set if ifp != NULL and we got a signal from it */
-
- struct dn_flow_set fs ; /* used with fixed-rate flows */
-};
-SLIST_HEAD(dn_pipe_head7, dn_pipe7);
-
-
-/* FREEBSD8 ip_dummynet.h r196045 */
-struct dn_flow_queue8 {
- struct dn_flow_queue8 *next ;
- struct ipfw_flow_id id ;
-
- struct mbuf *head, *tail ; /* queue of packets */
- u_int len ;
- u_int len_bytes ;
-
- uint64_t numbytes ; /* credit for transmission (dynamic queues) */
- int64_t extra_bits; /* extra bits simulating unavailable channel */
-
- u_int64_t tot_pkts ; /* statistics counters */
- u_int64_t tot_bytes ;
- u_int32_t drops ;
-
- int hash_slot ; /* debugging/diagnostic */
-
- /* RED parameters */
- int avg ; /* average queue length est. (scaled) */
- int count ; /* arrivals since last RED drop */
- int random ; /* random value (scaled) */
- int64_t idle_time; /* start of queue idle time */
-
- /* WF2Q+ support */
- struct dn_flow_set *fs ; /* parent flow set */
- int heap_pos ; /* position (index) of struct in heap */
- int64_t sched_time ; /* current time when queue enters ready_heap */
-
- int64_t S,F ; /* start time, finish time */
-};
-
-struct dn_pipe8 { /* a pipe */
- SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */
-
- int pipe_nr ; /* number */
- int bandwidth; /* really, bytes/tick. */
- int delay ; /* really, ticks */
-
- struct mbuf *head, *tail ; /* packets in delay line */
-
- /* WF2Q+ */
- struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
- struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
- struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
-
- int64_t V ; /* virtual time */
- int sum; /* sum of weights of all active sessions */
-
- /* Same as in dn_flow_queue, numbytes can become large */
- int64_t numbytes; /* bits I can transmit (more or less). */
- uint64_t burst; /* burst size, scaled: bits * hz */
-
- int64_t sched_time ; /* time pipe was scheduled in ready_heap */
- int64_t idle_time; /* start of pipe idle time */
-
- char if_name[IFNAMSIZ];
- struct ifnet *ifp ;
- int ready ; /* set if ifp != NULL and we got a signal from it */
-
- struct dn_flow_set fs ; /* used with fixed-rate flows */
-
- /* fields to simulate a delay profile */
-#define ED_MAX_NAME_LEN 32
- char name[ED_MAX_NAME_LEN];
- int loss_level;
- int samples_no;
- int *samples;
-};
-
-#define ED_MAX_SAMPLES_NO 1024
-struct dn_pipe_max8 {
- struct dn_pipe8 pipe;
- int samples[ED_MAX_SAMPLES_NO];
-};
-SLIST_HEAD(dn_pipe_head8, dn_pipe8);
-
-/*
- * Changes from 7.2 to 8:
- * dn_pipe:
- * numbytes from int to int64_t
- * add burst (int64_t)
- * add idle_time (int64_t)
- * add profile
- * add struct dn_pipe_max
- * add flag DN_HAS_PROFILE
- *
- * dn_flow_queue
- * numbytes from u_long to int64_t
- * add extra_bits (int64_t)
- *	q_time from u_int32_t to int64_t and renamed to idle_time
- *
- * dn_flow_set unchanged
- *
- */
-
-/* NOTE:XXX copied from dummynet.c */
-#define O_NEXT(p, len) ((void *)((char *)p + len))
-static void
-oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
-{
- oid->len = len;
- oid->type = type;
- oid->subtype = 0;
- oid->id = id;
-}
-/* make room in the buffer and move the pointer forward */
-static void *
-o_next(struct dn_id **o, int len, int type)
-{
- struct dn_id *ret = *o;
- oid_fill(ret, len, type, 0);
- *o = O_NEXT(*o, len);
- return ret;
-}
-
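oid_fill() and o_next() above are the building blocks for the variable-length messages later passed to do_config(): every record starts with a dn_id header carrying its own length, and o_next() writes that header at the cursor and advances the cursor past the record. A hedged standalone sketch of the same pattern, using a reduced copy of the dn_id header and made-up record types and sizes:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* reduced copy of the dn_id header used by the compat code */
struct dn_id {
	uint16_t len;		/* total record length, including this header */
	uint16_t type;
	uint16_t subtype;
	uint32_t id;
};

#define O_NEXT(p, l)	((void *)((char *)(p) + (l)))

static void
oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
{
	oid->len = len;
	oid->type = type;
	oid->subtype = 0;
	oid->id = id;
}

/* write a header at the cursor, then advance the cursor by 'len' bytes */
static void *
o_next(struct dn_id **o, int len, int type)
{
	struct dn_id *ret = *o;

	oid_fill(ret, len, type, 0);
	*o = O_NEXT(*o, len);
	return ret;
}

int
main(void)
{
	struct dn_id *base = calloc(1, 256), *buf = base;

	/* command header followed by two made-up records of 64 and 48 bytes */
	o_next(&buf, sizeof(struct dn_id), 1);
	o_next(&buf, 64, 2);
	o_next(&buf, 48, 3);

	printf("message is %ld bytes long\n", (long)((char *)buf - (char *)base));
	free(base);
	return 0;
}
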
-
-static size_t pipesize7 = sizeof(struct dn_pipe7);
-static size_t pipesize8 = sizeof(struct dn_pipe8);
-static size_t pipesizemax8 = sizeof(struct dn_pipe_max8);
-
-/* Indicate the 'ipfw' version
- *	1: from FreeBSD 7.2
- *	0: from FreeBSD 8
- *	-1: unknown (for now unused)
- *
- * It is updated when an IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives
- * NOTE: if an IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown,
- * it is assumed to be the FreeBSD 8 version.
- */
-static int is7 = 0;
-
-static int
-convertflags2new(int src)
-{
- int dst = 0;
-
- if (src & DNOLD_HAVE_FLOW_MASK)
- dst |= DN_HAVE_MASK;
- if (src & DNOLD_QSIZE_IS_BYTES)
- dst |= DN_QSIZE_BYTES;
- if (src & DNOLD_NOERROR)
- dst |= DN_NOERROR;
- if (src & DNOLD_IS_RED)
- dst |= DN_IS_RED;
- if (src & DNOLD_IS_GENTLE_RED)
- dst |= DN_IS_GENTLE_RED;
- if (src & DNOLD_HAS_PROFILE)
- dst |= DN_HAS_PROFILE;
-
- return dst;
-}
-
-static int
-convertflags2old(int src)
-{
- int dst = 0;
-
- if (src & DN_HAVE_MASK)
- dst |= DNOLD_HAVE_FLOW_MASK;
- if (src & DN_IS_RED)
- dst |= DNOLD_IS_RED;
- if (src & DN_IS_GENTLE_RED)
- dst |= DNOLD_IS_GENTLE_RED;
- if (src & DN_NOERROR)
- dst |= DNOLD_NOERROR;
- if (src & DN_HAS_PROFILE)
- dst |= DNOLD_HAS_PROFILE;
- if (src & DN_QSIZE_BYTES)
- dst |= DNOLD_QSIZE_IS_BYTES;
-
- return dst;
-}
-
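convertflags2new() and convertflags2old() translate the flowset flags bit by bit between the old DNOLD_ namespace and the new DN_ namespace. A compact round-trip check of that idea, with placeholder bit values standing in for the real constants:

#include <assert.h>
#include <stdio.h>

/* placeholder bit values; the real DNOLD_ and DN_ constants live elsewhere */
#define OLD_HAVE_FLOW_MASK	0x0001
#define OLD_IS_RED		0x0002
#define NEW_HAVE_MASK		0x0010
#define NEW_IS_RED		0x0020

static int
to_new(int src)
{
	int dst = 0;

	if (src & OLD_HAVE_FLOW_MASK)
		dst |= NEW_HAVE_MASK;
	if (src & OLD_IS_RED)
		dst |= NEW_IS_RED;
	return dst;
}

static int
to_old(int src)
{
	int dst = 0;

	if (src & NEW_HAVE_MASK)
		dst |= OLD_HAVE_FLOW_MASK;
	if (src & NEW_IS_RED)
		dst |= OLD_IS_RED;
	return dst;
}

int
main(void)
{
	int old = OLD_HAVE_FLOW_MASK | OLD_IS_RED;

	/* translating old -> new -> old preserves every bit that has a mapping */
	assert(to_old(to_new(old)) == old);
	printf("round trip ok: 0x%x\n", old);
	return 0;
}
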
-static int
-dn_compat_del(void *v)
-{
- struct dn_pipe7 *p = (struct dn_pipe7 *) v;
- struct dn_pipe8 *p8 = (struct dn_pipe8 *) v;
- struct {
- struct dn_id oid;
- uintptr_t a[1]; /* add more if we want a list */
- } cmd;
-
- /* XXX DN_API_VERSION ??? */
- oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
-
- if (is7) {
- if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
- return EINVAL;
- if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
- return EINVAL;
- } else {
- if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0)
- return EINVAL;
- if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0)
- return EINVAL;
- }
-
- if (p->pipe_nr != 0) { /* pipe x delete */
- cmd.a[0] = p->pipe_nr;
- cmd.oid.subtype = DN_LINK;
- } else { /* queue x delete */
- cmd.oid.subtype = DN_FS;
- cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr;
- }
-
- return do_config(&cmd, cmd.oid.len);
-}
-
-static int
-dn_compat_config_queue(struct dn_fs *fs, void* v)
-{
- struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
- struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
- struct dn_flow_set *f;
-
- if (is7)
- f = &p7->fs;
- else
- f = &p8->fs;
-
- fs->fs_nr = f->fs_nr;
- fs->sched_nr = f->parent_nr;
- fs->flow_mask = f->flow_mask;
- fs->buckets = f->rq_size;
- fs->qsize = f->qsize;
- fs->plr = f->plr;
- fs->par[0] = f->weight;
- fs->flags = convertflags2new(f->flags_fs);
- if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) {
- fs->w_q = f->w_q;
- fs->max_th = f->max_th;
- fs->min_th = f->min_th;
- fs->max_p = f->max_p;
- }
-
- return 0;
-}
-
-static int
-dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p,
- struct dn_fs *fs, void* v)
-{
- struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
- struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
- int i = p7->pipe_nr;
-
- sch->sched_nr = i;
- sch->oid.subtype = 0;
- p->link_nr = i;
- fs->fs_nr = i + 2*DN_MAX_ID;
- fs->sched_nr = i + DN_MAX_ID;
-
- /* Common to 7 and 8 */
- p->bandwidth = p7->bandwidth;
- p->delay = p7->delay;
- if (!is7) {
- /* FreeBSD 8 has burst */
- p->burst = p8->burst;
- }
-
- /* fill the fifo flowset */
- dn_compat_config_queue(fs, v);
- fs->fs_nr = i + 2*DN_MAX_ID;
- fs->sched_nr = i + DN_MAX_ID;
-
-	/* Move scheduler-related parameters from fs to sch */
- sch->buckets = fs->buckets; /*XXX*/
- fs->buckets = 0;
- if (fs->flags & DN_HAVE_MASK) {
- sch->flags |= DN_HAVE_MASK;
- fs->flags &= ~DN_HAVE_MASK;
- sch->sched_mask = fs->flow_mask;
- bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id));
- }
-
- return 0;
-}
-
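dn_compat_config_pipe() spreads one old-style 'pipe' over the three new objects by offsetting the identifiers with DN_MAX_ID: the scheduler and link keep the pipe number, while the attached FIFO flowset gets pipe_nr + 2*DN_MAX_ID and points at scheduler pipe_nr + DN_MAX_ID. A tiny sketch of that mapping only; the DN_MAX_ID value below is assumed for illustration (the real definition lives in ip_dummynet.h):

#include <stdio.h>

#define DN_MAX_ID	0x10000		/* assumed value, for illustration only */

int
main(void)
{
	int pipe_nr = 7;		/* old-style pipe number */

	printf("sched_nr=%d link_nr=%d fs_nr=%d fs->sched_nr=%d\n",
	    pipe_nr,			/* sch->sched_nr */
	    pipe_nr,			/* p->link_nr */
	    pipe_nr + 2 * DN_MAX_ID,	/* number of the FIFO flowset */
	    pipe_nr + DN_MAX_ID);	/* scheduler the flowset attaches to */
	return 0;
}
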
-static int
-dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p,
- void *v)
-{
- struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
-
- p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]);
-
- pf->link_nr = p->link_nr;
- pf->loss_level = p8->loss_level;
-// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant?
- pf->samples_no = p8->samples_no;
- strncpy(pf->name, p8->name,sizeof(pf->name));
- bcopy(p8->samples, pf->samples, sizeof(pf->samples));
-
- return 0;
-}
-
-/*
- * If p->pipe_nr != 0 the command is 'pipe x config', so we need to create
- * the three main structs; otherwise only a flowset is created.
- */
-static int
-dn_compat_configure(void *v)
-{
- struct dn_id *buf = NULL, *base;
- struct dn_sch *sch = NULL;
- struct dn_link *p = NULL;
- struct dn_fs *fs = NULL;
- struct dn_profile *pf = NULL;
- int lmax;
- int error;
-
- struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
- struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
-
- int i; /* number of object to configure */
-
- lmax = sizeof(struct dn_id); /* command header */
- lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
- sizeof(struct dn_fs) + sizeof(struct dn_profile);
-
- base = buf = malloc(lmax, M_DUMMYNET, M_WAIT|M_ZERO);
- o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
- base->id = DN_API_VERSION;
-
- /* pipe_nr is the same in p7 and p8 */
- i = p7->pipe_nr;
- if (i != 0) { /* pipe config */
- sch = o_next(&buf, sizeof(*sch), DN_SCH);
- p = o_next(&buf, sizeof(*p), DN_LINK);
- fs = o_next(&buf, sizeof(*fs), DN_FS);
-
- error = dn_compat_config_pipe(sch, p, fs, v);
- if (error) {
- free(buf, M_DUMMYNET);
- return error;
- }
- if (!is7 && p8->samples_no > 0) {
- /* Add profiles*/
- pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
- error = dn_compat_config_profile(pf, p, v);
- if (error) {
- free(buf, M_DUMMYNET);
- return error;
- }
- }
- } else { /* queue config */
- fs = o_next(&buf, sizeof(*fs), DN_FS);
- error = dn_compat_config_queue(fs, v);
- if (error) {
- free(buf, M_DUMMYNET);
- return error;
- }
- }
- error = do_config(base, (char *)buf - (char *)base);
-
- if (buf)
- free(buf, M_DUMMYNET);
- return error;
-}
-
-int
-dn_compat_calc_size(void)
-{
- int need = 0;
- /* XXX use FreeBSD 8 struct size */
- /* NOTE:
- * - half scheduler: schk_count/2
- * - all flowset: fsk_count
- * - all flowset queues: queue_count
- * - all pipe queue: si_count
- */
- need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2;
- need += dn_cfg.fsk_count * sizeof(struct dn_flow_set);
- need += dn_cfg.si_count * sizeof(struct dn_flow_queue8);
- need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8);
-
- return need;
-}
-
-int
-dn_c_copy_q (void *_ni, void *arg)
-{
- struct copy_args *a = arg;
- struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start;
- struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start;
- struct dn_flow *ni = (struct dn_flow *)_ni;
- int size = 0;
-
- /* XXX hash slot not set */
- /* No difference between 7.2/8 */
- fq7->len = ni->length;
- fq7->len_bytes = ni->len_bytes;
- fq7->id = ni->fid;
-
- if (is7) {
- size = sizeof(struct dn_flow_queue7);
- fq7->tot_pkts = ni->tot_pkts;
- fq7->tot_bytes = ni->tot_bytes;
- fq7->drops = ni->drops;
- } else {
- size = sizeof(struct dn_flow_queue8);
- fq8->tot_pkts = ni->tot_pkts;
- fq8->tot_bytes = ni->tot_bytes;
- fq8->drops = ni->drops;
- }
-
- *a->start += size;
- return 0;
-}
-
-int
-dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq)
-{
- struct dn_link *l = &s->link;
- struct dn_fsk *f = s->fs;
-
- struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start;
- struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start;
- struct dn_flow_set *fs;
- int size = 0;
-
- if (is7) {
- fs = &pipe7->fs;
- size = sizeof(struct dn_pipe7);
- } else {
- fs = &pipe8->fs;
- size = sizeof(struct dn_pipe8);
- }
-
-	/* These 4 fields are the same in pipe7 and pipe8 */
- pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE;
- pipe7->bandwidth = l->bandwidth;
- pipe7->delay = l->delay * 1000 / hz;
- pipe7->pipe_nr = l->link_nr - DN_MAX_ID;
-
- if (!is7) {
- if (s->profile) {
- struct dn_profile *pf = s->profile;
- strncpy(pipe8->name, pf->name, sizeof(pf->name));
- pipe8->loss_level = pf->loss_level;
- pipe8->samples_no = pf->samples_no;
- }
- pipe8->burst = div64(l->burst , 8 * hz);
- }
-
- fs->flow_mask = s->sch.sched_mask;
- fs->rq_size = s->sch.buckets ? s->sch.buckets : 1;
-
- fs->parent_nr = l->link_nr - DN_MAX_ID;
- fs->qsize = f->fs.qsize;
- fs->plr = f->fs.plr;
- fs->w_q = f->fs.w_q;
- fs->max_th = f->max_th;
- fs->min_th = f->min_th;
- fs->max_p = f->fs.max_p;
- fs->rq_elements = nq;
-
- fs->flags_fs = convertflags2old(f->fs.flags);
-
- *a->start += size;
- return 0;
-}
-
-
-int
-dn_compat_copy_pipe(struct copy_args *a, void *_o)
-{
- int have = a->end - *a->start;
- int need = 0;
- int pipe_size = sizeof(struct dn_pipe8);
- int queue_size = sizeof(struct dn_flow_queue8);
- int n_queue = 0; /* number of queues */
-
- struct dn_schk *s = (struct dn_schk *)_o;
- /* calculate needed space:
- * - struct dn_pipe
- * - if there are instances, dn_queue * n_instances
- */
- n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) :
- (s->siht ? 1 : 0));
- need = pipe_size + queue_size * n_queue;
- if (have < need) {
- D("have %d < need %d", have, need);
- return 1;
- }
- /* copy pipe */
- dn_c_copy_pipe(s, a, n_queue);
-
- /* copy queues */
- if (s->sch.flags & DN_HAVE_MASK)
- dn_ht_scan(s->siht, dn_c_copy_q, a);
- else if (s->siht)
- dn_c_copy_q(s->siht, a);
- return 0;
-}
-
-int
-dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq)
-{
- struct dn_flow_set *fs = (struct dn_flow_set *)*a->start;
-
- fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
- fs->fs_nr = f->fs.fs_nr;
- fs->qsize = f->fs.qsize;
- fs->plr = f->fs.plr;
- fs->w_q = f->fs.w_q;
- fs->max_th = f->max_th;
- fs->min_th = f->min_th;
- fs->max_p = f->fs.max_p;
- fs->flow_mask = f->fs.flow_mask;
- fs->rq_elements = nq;
- fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1);
- fs->parent_nr = f->fs.sched_nr;
- fs->weight = f->fs.par[0];
-
- fs->flags_fs = convertflags2old(f->fs.flags);
- *a->start += sizeof(struct dn_flow_set);
- return 0;
-}
-
-int
-dn_compat_copy_queue(struct copy_args *a, void *_o)
-{
- int have = a->end - *a->start;
- int need = 0;
- int fs_size = sizeof(struct dn_flow_set);
- int queue_size = sizeof(struct dn_flow_queue8);
-
- struct dn_fsk *fs = (struct dn_fsk *)_o;
- int n_queue = 0; /* number of queues */
-
- n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) :
- (fs->qht ? 1 : 0));
-
- need = fs_size + queue_size * n_queue;
- if (have < need) {
- D("have < need");
- return 1;
- }
-
- /* copy flowset */
- dn_c_copy_fs(fs, a, n_queue);
-
- /* copy queues */
- if (fs->fs.flags & DN_HAVE_MASK)
- dn_ht_scan(fs->qht, dn_c_copy_q, a);
- else if (fs->qht)
- dn_c_copy_q(fs->qht, a);
-
- return 0;
-}
-
-int
-copy_data_helper_compat(void *_o, void *_arg)
-{
- struct copy_args *a = _arg;
-
- if (a->type == DN_COMPAT_PIPE) {
- struct dn_schk *s = _o;
- if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) {
- return 0; /* not old type */
- }
-		/* copy pipe parameters, and if an instance exists, copy
-		 * the other parameters and possibly the queues.
-		 */
- if(dn_compat_copy_pipe(a, _o))
- return DNHT_SCAN_END;
- } else if (a->type == DN_COMPAT_QUEUE) {
- struct dn_fsk *fs = _o;
- if (fs->fs.fs_nr >= DN_MAX_ID)
- return 0;
- if (dn_compat_copy_queue(a, _o))
- return DNHT_SCAN_END;
- }
- return 0;
-}
-
-/* Main function to manage old requests */
-int
-ip_dummynet_compat(struct sockopt *sopt)
-{
- int error=0;
- void *v = NULL;
- struct dn_id oid;
-
-	/* Length of data, used to find the ipfw version... */
- int len = sopt->sopt_valsize;
-
- /* len can be 0 if command was dummynet_flush */
- if (len == pipesize7) {
- D("setting compatibility with FreeBSD 7.2");
- is7 = 1;
- }
- else if (len == pipesize8 || len == pipesizemax8) {
- D("setting compatibility with FreeBSD 8");
- is7 = 0;
- }
-
- switch (sopt->sopt_name) {
- default:
- printf("dummynet: -- unknown option %d", sopt->sopt_name);
- error = EINVAL;
- break;
-
- case IP_DUMMYNET_FLUSH:
- oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
- do_config(&oid, oid.len);
- break;
-
- case IP_DUMMYNET_DEL:
- v = malloc(len, M_TEMP, M_WAITOK);
- error = sooptcopyin(sopt, v, len, len);
- if (error)
- break;
- error = dn_compat_del(v);
- free(v, M_TEMP);
- break;
-
- case IP_DUMMYNET_CONFIGURE:
- v = malloc(len, M_TEMP, M_WAITOK);
- error = sooptcopyin(sopt, v, len, len);
- if (error)
- break;
- error = dn_compat_configure(v);
- free(v, M_TEMP);
- break;
-
- case IP_DUMMYNET_GET: {
- void *buf;
- int ret;
- int original_size = sopt->sopt_valsize;
- int size;
-
- ret = dummynet_get(sopt, &buf);
- if (ret)
- return 0;//XXX ?
- size = sopt->sopt_valsize;
- sopt->sopt_valsize = original_size;
- D("size=%d, buf=%p", size, buf);
- ret = sooptcopyout(sopt, buf, size);
- if (ret)
- printf(" %s ERROR sooptcopyout\n", __FUNCTION__);
- if (buf)
- free(buf, M_DUMMYNET);
- }
- }
-
- return error;
-}
-
-
diff --git a/freebsd/sys/netpfil/ipfw/ip_dn_io.c b/freebsd/sys/netpfil/ipfw/ip_dn_io.c
deleted file mode 100644
index 23392a55..00000000
--- a/freebsd/sys/netpfil/ipfw/ip_dn_io.c
+++ /dev/null
@@ -1,852 +0,0 @@
-#include <machine/rtems-bsd-kernel-space.h>
-
-/*-
- * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
- * All rights reserved
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * Dummynet portions related to packet handling.
- */
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <rtems/bsd/local/opt_inet6.h>
-
-#include <rtems/bsd/sys/param.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/kernel.h>
-#include <rtems/bsd/sys/lock.h>
-#include <sys/module.h>
-#include <sys/priv.h>
-#include <sys/proc.h>
-#include <sys/rwlock.h>
-#include <sys/socket.h>
-#include <sys/time.h>
-#include <sys/sysctl.h>
-
-#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
-#include <net/netisr.h>
-#include <net/vnet.h>
-
-#include <netinet/in.h>
-#include <netinet/ip.h> /* ip_len, ip_off */
-#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
-#include <netinet/ip_fw.h>
-#include <netinet/ip_dummynet.h>
-#include <netinet/if_ether.h> /* various ether_* routines */
-#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
-#include <netinet6/ip6_var.h>
-
-#include <netpfil/ipfw/ip_fw_private.h>
-#include <netpfil/ipfw/dn_heap.h>
-#include <netpfil/ipfw/ip_dn_private.h>
-#include <netpfil/ipfw/dn_sched.h>
-
-/*
- * We keep a private variable for the simulation time, but we could
- * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
- * instead of dn_cfg.curr_time
- */
-
-struct dn_parms dn_cfg;
-//VNET_DEFINE(struct dn_parms, _base_dn_cfg);
-
-static long tick_last; /* Last tick duration (usec). */
-static long tick_delta; /* Last vs standard tick diff (usec). */
-static long tick_delta_sum; /* Accumulated tick difference (usec).*/
-static long tick_adjustment; /* Tick adjustments done. */
-static long tick_lost; /* Lost(coalesced) ticks number. */
-/* Adjusted vs non-adjusted curr_time difference (ticks). */
-static long tick_diff;
-
-static unsigned long io_pkt;
-static unsigned long io_pkt_fast;
-static unsigned long io_pkt_drop;
-
-/*
- * We use a heap to store entities for which we have pending timer events.
- * The heap is checked at every tick and all entities with expired events
- * are extracted.
- */
-
-MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
-
-extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
-
-#ifdef SYSCTL_NODE
-
-SYSBEGIN(f4)
-
-SYSCTL_DECL(_net_inet);
-SYSCTL_DECL(_net_inet_ip);
-static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
-
-/* wrapper to pass dn_cfg fields to SYSCTL_* */
-//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x))
-#define DC(x) (&(dn_cfg.x))
-/* parameters */
-
-static int
-sysctl_hash_size(SYSCTL_HANDLER_ARGS)
-{
- int error, value;
-
- value = dn_cfg.hash_size;
- error = sysctl_handle_int(oidp, &value, 0, req);
- if (error != 0 || req->newptr == NULL)
- return (error);
- if (value < 16 || value > 65536)
- return (EINVAL);
- dn_cfg.hash_size = value;
- return (0);
-}
-
-SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, hash_size,
- CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_hash_size,
- "I", "Default hash table size");
-
-static int
-sysctl_limits(SYSCTL_HANDLER_ARGS)
-{
- int error;
- long value;
-
- if (arg2 != 0)
- value = dn_cfg.slot_limit;
- else
- value = dn_cfg.byte_limit;
- error = sysctl_handle_long(oidp, &value, 0, req);
-
- if (error != 0 || req->newptr == NULL)
- return (error);
- if (arg2 != 0) {
- if (value < 1)
- return (EINVAL);
- dn_cfg.slot_limit = value;
- } else {
- if (value < 1500)
- return (EINVAL);
- dn_cfg.byte_limit = value;
- }
- return (0);
-}
-
-SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
- CTLTYPE_LONG | CTLFLAG_RW, 0, 1, sysctl_limits,
- "L", "Upper limit in slots for pipe queue.");
-SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
- CTLTYPE_LONG | CTLFLAG_RW, 0, 0, sysctl_limits,
- "L", "Upper limit in bytes for pipe queue.");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
- CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io.");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
- CTLFLAG_RW, DC(debug), 0, "Dummynet debug level");
-
-/* RED parameters */
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
- CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
- CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
- CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size");
-
-/* time adjustment */
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
- CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
- CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
- CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
- CTLFLAG_RD, &tick_diff, 0,
- "Adjusted vs non-adjusted curr_time difference (ticks).");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
- CTLFLAG_RD, &tick_lost, 0,
- "Number of ticks coalesced by dummynet taskqueue.");
-
-/* Drain parameters */
-SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire,
- CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes");
-SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
- CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes");
-
-/* statistics */
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
- CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
- CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
- CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
- CTLFLAG_RD, DC(queue_count), 0, "Number of queues");
-SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
- CTLFLAG_RD, &io_pkt, 0,
- "Number of packets passed to dummynet.");
-SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
- CTLFLAG_RD, &io_pkt_fast, 0,
- "Number of packets bypassed dummynet scheduler.");
-SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
- CTLFLAG_RD, &io_pkt_drop, 0,
- "Number of packets dropped by dummynet.");
-#undef DC
-SYSEND
-
-#endif
-
-static void dummynet_send(struct mbuf *);
-
-/*
- * Packets processed by dummynet have an mbuf tag associated with
- * them that carries their dummynet state.
- * Outside dummynet, only the 'rule' field is relevant, and it must
- * be at the beginning of the structure.
- */
-struct dn_pkt_tag {
- struct ipfw_rule_ref rule; /* matching rule */
-
- /* second part, dummynet specific */
- int dn_dir; /* action when packet comes out.*/
- /* see ip_fw_private.h */
- uint64_t output_time; /* when the pkt is due for delivery*/
- struct ifnet *ifp; /* interface, for ip_output */
- struct _ip6dn_args ip6opt; /* XXX ipv6 options */
-};
-
-/*
- * Return the mbuf tag holding the dummynet state (it should
- * be the first one on the list).
- */
-static struct dn_pkt_tag *
-dn_tag_get(struct mbuf *m)
-{
- struct m_tag *mtag = m_tag_first(m);
- KASSERT(mtag != NULL &&
- mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
- mtag->m_tag_id == PACKET_TAG_DUMMYNET,
- ("packet on dummynet queue w/o dummynet tag!"));
- return (struct dn_pkt_tag *)(mtag+1);
-}
-
-static inline void
-mq_append(struct mq *q, struct mbuf *m)
-{
- if (q->head == NULL)
- q->head = m;
- else
- q->tail->m_nextpkt = m;
- q->tail = m;
- m->m_nextpkt = NULL;
-}
-
-/*
- * Dispose of a list of packets. Use a function so that, if we need to do
- * more work, this is a central point to do it.
- */
-void dn_free_pkts(struct mbuf *mnext)
-{
- struct mbuf *m;
-
- while ((m = mnext) != NULL) {
- mnext = m->m_nextpkt;
- FREE_PKT(m);
- }
-}
-
-static int
-red_drops (struct dn_queue *q, int len)
-{
- /*
- * RED algorithm
- *
- * RED calculates the average queue size (avg) using a low-pass filter
- * with an exponential weighted (w_q) moving average:
- * avg <- (1-w_q) * avg + w_q * q_size
-	 * where q_size is the queue length (measured in bytes or packets).
- *
- * If q_size == 0, we compute the idle time for the link, and set
- * avg = (1 - w_q)^(idle/s)
- * where s is the time needed for transmitting a medium-sized packet.
- *
- * Now, if avg < min_th the packet is enqueued.
- * If avg > max_th the packet is dropped. Otherwise, the packet is
- * dropped with probability P function of avg.
- */
-
- struct dn_fsk *fs = q->fs;
- int64_t p_b = 0;
-
- /* Queue in bytes or packets? */
- uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
- q->ni.len_bytes : q->ni.length;
-
- /* Average queue size estimation. */
- if (q_size != 0) {
- /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
- int diff = SCALE(q_size) - q->avg;
- int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
-
- q->avg += (int)v;
- } else {
- /*
- * Queue is empty, find for how long the queue has been
- * empty and use a lookup table for computing
-		 * (1 - w_q)^(idle_time/s) where s is the time to send a
- * (small) packet.
- * XXX check wraps...
- */
- if (q->avg) {
- u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step);
-
- q->avg = (t < fs->lookup_depth) ?
- SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
- }
- }
-
-	/* Should I drop? */
- if (q->avg < fs->min_th) {
- q->count = -1;
- return (0); /* accept packet */
- }
- if (q->avg >= fs->max_th) { /* average queue >= max threshold */
- if (fs->fs.flags & DN_IS_GENTLE_RED) {
- /*
- * According to Gentle-RED, if avg is greater than
- * max_th the packet is dropped with a probability
- * p_b = c_3 * avg - c_4
- * where c_3 = (1 - max_p) / max_th
- * c_4 = 1 - 2 * max_p
- */
- p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
- fs->c_4;
- } else {
- q->count = -1;
- return (1);
- }
- } else if (q->avg > fs->min_th) {
- /*
- * We compute p_b using the linear dropping function
- * p_b = c_1 * avg - c_2
- * where c_1 = max_p / (max_th - min_th)
- * c_2 = max_p * min_th / (max_th - min_th)
- */
- p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
- }
-
- if (fs->fs.flags & DN_QSIZE_BYTES)
- p_b = div64((p_b * len) , fs->max_pkt_size);
- if (++q->count == 0)
- q->random = random() & 0xffff;
- else {
- /*
- * q->count counts packets arrived since last drop, so a greater
- * value of q->count means a greater packet drop probability.
- */
- if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
- q->count = 0;
- /* After a drop we calculate a new random value. */
- q->random = random() & 0xffff;
- return (1); /* drop */
- }
- }
- /* End of RED algorithm. */
-
- return (0); /* accept */
-
-}
-
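The tail of red_drops() is the classic RED decision: below min_th always accept, at or above max_th drop (or apply the gentle slope), and in between drop with a linear probability p_b that is amplified by the number of packets accepted since the last drop. A hedged standalone sketch of just that decision step, with example thresholds (min_th = 5, max_th = 15 packets, max_p = 0.1) plugged into the same scaled arithmetic; these numbers are illustrative, not a recommended configuration:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define SCALE_RED	16
#define SCALE(x)	((int64_t)(x) << SCALE_RED)
#define SCALE_MUL(x, y)	(((x) * (y)) >> SCALE_RED)

/* one RED drop decision; avg, thresholds and coefficients are scaled */
static int
red_should_drop(int64_t avg, int64_t min_th, int64_t max_th,
    int64_t c_1, int64_t c_2, int *count, int *rand_val)
{
	int64_t p_b;

	if (avg < min_th) {
		*count = -1;
		return (0);			/* accept */
	}
	if (avg >= max_th) {
		*count = -1;
		return (1);			/* drop (plain, non-gentle RED) */
	}
	p_b = SCALE_MUL(c_1, avg) - c_2;	/* linear region: p_b = c_1*avg - c_2 */
	if (++(*count) == 0)
		*rand_val = random() & 0xffff;
	else if (SCALE_MUL(p_b, SCALE(*count)) > *rand_val) {
		*count = 0;
		*rand_val = random() & 0xffff;
		return (1);			/* drop */
	}
	return (0);				/* accept */
}

int
main(void)
{
	int64_t min_th = SCALE(5), max_th = SCALE(15);
	int64_t c_1 = SCALE(1) / 100;		/* max_p/(max_th-min_th) = 0.1/10 */
	int64_t c_2 = SCALE_MUL(c_1, min_th);	/* max_p*min_th/(max_th-min_th) */
	int count = -1, rand_val = 0, drops = 0;

	for (int i = 0; i < 1000; i++)
		drops += red_should_drop(SCALE(12), min_th, max_th,
		    c_1, c_2, &count, &rand_val);
	printf("dropped %d of 1000 packets at avg = 12\n", drops);
	return (0);
}
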
-/*
- * Enqueue a packet in q, subject to space and queue management policy
- * (whose parameters are in q->fs).
- * Update stats for the queue and the scheduler.
- * Return 0 on success, 1 on drop. The packet is consumed anyways.
- */
-int
-dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
-{
- struct dn_fs *f;
- struct dn_flow *ni; /* stats for scheduler instance */
- uint64_t len;
-
- if (q->fs == NULL || q->_si == NULL) {
- printf("%s fs %p si %p, dropping\n",
- __FUNCTION__, q->fs, q->_si);
- FREE_PKT(m);
- return 1;
- }
- f = &(q->fs->fs);
- ni = &q->_si->ni;
- len = m->m_pkthdr.len;
- /* Update statistics, then check reasons to drop pkt. */
- q->ni.tot_bytes += len;
- q->ni.tot_pkts++;
- ni->tot_bytes += len;
- ni->tot_pkts++;
- if (drop)
- goto drop;
- if (f->plr && random() < f->plr)
- goto drop;
- if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len))
- goto drop;
- if (f->flags & DN_QSIZE_BYTES) {
- if (q->ni.len_bytes > f->qsize)
- goto drop;
- } else if (q->ni.length >= f->qsize) {
- goto drop;
- }
- mq_append(&q->mq, m);
- q->ni.length++;
- q->ni.len_bytes += len;
- ni->length++;
- ni->len_bytes += len;
- return 0;
-
-drop:
- io_pkt_drop++;
- q->ni.drops++;
- ni->drops++;
- FREE_PKT(m);
- return 1;
-}
-
-/*
- * Fetch packets from the delay line which are due now. If there are
- * leftover packets, reinsert the delay line in the heap.
- * Runs under scheduler lock.
- */
-static void
-transmit_event(struct mq *q, struct delay_line *dline, uint64_t now)
-{
- struct mbuf *m;
- struct dn_pkt_tag *pkt = NULL;
-
- dline->oid.subtype = 0; /* not in heap */
- while ((m = dline->mq.head) != NULL) {
- pkt = dn_tag_get(m);
- if (!DN_KEY_LEQ(pkt->output_time, now))
- break;
- dline->mq.head = m->m_nextpkt;
- mq_append(q, m);
- }
- if (m != NULL) {
- dline->oid.subtype = 1; /* in heap */
- heap_insert(&dn_cfg.evheap, pkt->output_time, dline);
- }
-}
-
-/*
- * Convert the additional MAC overheads/delays into an equivalent
- * number of bits for the given data rate. The samples are
- * in milliseconds so we need to divide by 1000.
- */
-static uint64_t
-extra_bits(struct mbuf *m, struct dn_schk *s)
-{
- int index;
- uint64_t bits;
- struct dn_profile *pf = s->profile;
-
- if (!pf || pf->samples_no == 0)
- return 0;
- index = random() % pf->samples_no;
- bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000);
- if (index >= pf->loss_level) {
- struct dn_pkt_tag *dt = dn_tag_get(m);
- if (dt)
- dt->dn_dir = DIR_DROP;
- }
- return bits;
-}
-
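extra_bits() turns a per-packet extra delay, drawn from the configured profile in milliseconds, into an equivalent number of bits at the link rate, so the scheduler can simply charge it against the transmission credit. The conversion is just the formula below, shown with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t bandwidth = 1000000;	/* link rate, bits per second */
	uint64_t sample_ms = 3;		/* profile sample: 3 ms of extra delay */

	/* same arithmetic as extra_bits(): samples are in ms, hence / 1000 */
	uint64_t bits = sample_ms * bandwidth / 1000;

	printf("3 ms at 1 Mbit/s costs %llu extra bits of credit\n",
	    (unsigned long long)bits);
	return 0;
}
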
-/*
- * Send traffic from a scheduler instance due by 'now'.
- * Return a pointer to the head of the queue.
- */
-static struct mbuf *
-serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
-{
- struct mq def_q;
- struct dn_schk *s = si->sched;
- struct mbuf *m = NULL;
- int delay_line_idle = (si->dline.mq.head == NULL);
- int done, bw;
-
- if (q == NULL) {
- q = &def_q;
- q->head = NULL;
- }
-
- bw = s->link.bandwidth;
- si->kflags &= ~DN_ACTIVE;
-
- if (bw > 0)
- si->credit += (now - si->sched_time) * bw;
- else
- si->credit = 0;
- si->sched_time = now;
- done = 0;
- while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
- uint64_t len_scaled;
-
- done++;
- len_scaled = (bw == 0) ? 0 : hz *
- (m->m_pkthdr.len * 8 + extra_bits(m, s));
- si->credit -= len_scaled;
- /* Move packet in the delay line */
- dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay ;
- mq_append(&si->dline.mq, m);
- }
-
- /*
- * If credit >= 0 the instance is idle, mark time.
- * Otherwise put back in the heap, and adjust the output
- * time of the last inserted packet, m, which was too early.
- */
- if (si->credit >= 0) {
- si->idle_time = now;
- } else {
- uint64_t t;
- KASSERT (bw > 0, ("bw=0 and credit<0 ?"));
- t = div64(bw - 1 - si->credit, bw);
- if (m)
- dn_tag_get(m)->output_time += t;
- si->kflags |= DN_ACTIVE;
- heap_insert(&dn_cfg.evheap, now + t, si);
- }
- if (delay_line_idle && done)
- transmit_event(q, &si->dline, now);
- return q->head;
-}
-
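serve_sched() is a credit-based shaper: each elapsed tick earns (now - sched_time) * bandwidth units of credit, and each dequeued packet costs hz * (8 * length + extra_bits) units, so the long-run output rate equals the configured bandwidth. A reduced sketch of that accounting, with a plain array standing in for the scheduler queue and an illustrative 1 Mbit/s link:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int hz = 1000;			/* ticks per second */
	int64_t bw = 1000000;		/* link bandwidth, bits per second */
	int64_t credit = 0;
	int64_t now, sched_time = 0;
	int pkt_len[] = { 1500, 1500, 1500, 1500 };	/* packet sizes, bytes */
	int sent = 0, npkt = 4;

	for (now = 1; now <= 40 && sent < npkt; now++) {
		credit += (now - sched_time) * bw;	/* earn credit for elapsed ticks */
		sched_time = now;
		while (credit >= 0 && sent < npkt) {
			/* a packet costs hz * 8 * length, as len_scaled does above */
			credit -= (int64_t)hz * pkt_len[sent] * 8;
			sent++;
		}
	}
	printf("sent %d packets in %lld ticks at 1 Mbit/s\n",
	    sent, (long long)(now - 1));
	return 0;
}
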
-/*
- * The timer handler for dummynet. Time is computed in ticks, but
- * the code is tolerant of the actual rate at which this is called.
- * Once complete, the function reschedules itself for the next tick.
- */
-void
-dummynet_task(void *context, int pending)
-{
- struct timeval t;
- struct mq q = { NULL, NULL }; /* queue to accumulate results */
-
- CURVNET_SET((struct vnet *)context);
-
- DN_BH_WLOCK();
-
- /* Update number of lost(coalesced) ticks. */
- tick_lost += pending - 1;
-
- getmicrouptime(&t);
- /* Last tick duration (usec). */
- tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 +
- (t.tv_usec - dn_cfg.prev_t.tv_usec);
- /* Last tick vs standard tick difference (usec). */
- tick_delta = (tick_last * hz - 1000000) / hz;
- /* Accumulated tick difference (usec). */
- tick_delta_sum += tick_delta;
-
- dn_cfg.prev_t = t;
-
- /*
- * Adjust curr_time if the accumulated tick difference is
- * greater than the 'standard' tick. Since curr_time should
- * be monotonically increasing, we do positive adjustments
- * as required, and throttle curr_time in case of negative
- * adjustment.
- */
- dn_cfg.curr_time++;
- if (tick_delta_sum - tick >= 0) {
- int diff = tick_delta_sum / tick;
-
- dn_cfg.curr_time += diff;
- tick_diff += diff;
- tick_delta_sum %= tick;
- tick_adjustment++;
- } else if (tick_delta_sum + tick <= 0) {
- dn_cfg.curr_time--;
- tick_diff--;
- tick_delta_sum += tick;
- tick_adjustment++;
- }
-
- /* serve pending events, accumulate in q */
- for (;;) {
- struct dn_id *p; /* generic parameter to handler */
-
- if (dn_cfg.evheap.elements == 0 ||
- DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key))
- break;
- p = HEAP_TOP(&dn_cfg.evheap)->object;
- heap_extract(&dn_cfg.evheap, NULL);
-
- if (p->type == DN_SCH_I) {
- serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time);
- } else { /* extracted a delay line */
- transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time);
- }
- }
- if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) {
- dn_cfg.expire_cycle = 0;
- dn_drain_scheduler();
- dn_drain_queue();
- }
-
- DN_BH_WUNLOCK();
- dn_reschedule();
- if (q.head != NULL)
- dummynet_send(q.head);
- CURVNET_RESTORE();
-}
-
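The tick-adjustment block in dummynet_task() keeps curr_time honest when the timer does not fire at exactly hz: the difference between each measured tick and the nominal tick length is accumulated in tick_delta_sum, and once a whole tick of drift has built up curr_time is bumped (or, for negative drift, throttled). A small sketch of that accumulation; 'tick' here is a local stand-in for the kernel's usec-per-tick value and the measurements are made up:

#include <stdio.h>

int
main(void)
{
	int tick = 1000;		/* nominal tick length in usec (hz = 1000) */
	long tick_delta_sum = 0;
	long long curr_time = 0;
	/* measured tick lengths in usec, consistently ~20% long */
	int tick_last[] = { 1200, 1210, 1195, 1205, 1190, 1200, 1210, 1205 };

	for (int i = 0; i < 8; i++) {
		tick_delta_sum += tick_last[i] - tick;
		curr_time++;				/* the usual +1 per tick */
		if (tick_delta_sum - tick >= 0) {	/* a full tick of positive drift */
			curr_time += tick_delta_sum / tick;
			tick_delta_sum %= tick;
		} else if (tick_delta_sum + tick <= 0) {	/* running fast: throttle */
			curr_time--;
			tick_delta_sum += tick;
		}
	}
	printf("curr_time = %lld after 8 measured ticks\n", curr_time);
	return 0;
}
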
-/*
- * forward a chain of packets to the proper destination.
- * This runs outside the dummynet lock.
- */
-static void
-dummynet_send(struct mbuf *m)
-{
- struct mbuf *n;
-
- for (; m != NULL; m = n) {
- struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */
- struct m_tag *tag;
- int dst;
-
- n = m->m_nextpkt;
- m->m_nextpkt = NULL;
- tag = m_tag_first(m);
- if (tag == NULL) { /* should not happen */
- dst = DIR_DROP;
- } else {
- struct dn_pkt_tag *pkt = dn_tag_get(m);
- /* extract the dummynet info, rename the tag
- * to carry reinject info.
- */
- dst = pkt->dn_dir;
- ifp = pkt->ifp;
- tag->m_tag_cookie = MTAG_IPFW_RULE;
- tag->m_tag_id = 0;
- }
-
- switch (dst) {
- case DIR_OUT:
- SET_HOST_IPLEN(mtod(m, struct ip *));
- ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
- break ;
-
- case DIR_IN :
- /* put header in network format for ip_input() */
- //SET_NET_IPLEN(mtod(m, struct ip *));
- netisr_dispatch(NETISR_IP, m);
- break;
-
-#ifdef INET6
- case DIR_IN | PROTO_IPV6:
- netisr_dispatch(NETISR_IPV6, m);
- break;
-
- case DIR_OUT | PROTO_IPV6:
- ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
- break;
-#endif
-
- case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
- if (bridge_dn_p != NULL)
- ((*bridge_dn_p)(m, ifp));
- else
- printf("dummynet: if_bridge not loaded\n");
-
- break;
-
- case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
- /*
- * The Ethernet code assumes the Ethernet header is
- * contiguous in the first mbuf header.
-			 * Ensure this is true.
- */
- if (m->m_len < ETHER_HDR_LEN &&
- (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
- printf("dummynet/ether: pullup failed, "
- "dropping packet\n");
- break;
- }
- ether_demux(m->m_pkthdr.rcvif, m);
- break;
-
- case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */
- ether_output_frame(ifp, m);
- break;
-
- case DIR_DROP:
- /* drop the packet after some time */
- FREE_PKT(m);
- break;
-
- default:
- printf("dummynet: bad switch %d!\n", dst);
- FREE_PKT(m);
- break;
- }
- }
-}
-
-static inline int
-tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa)
-{
- struct dn_pkt_tag *dt;
- struct m_tag *mtag;
-
- mtag = m_tag_get(PACKET_TAG_DUMMYNET,
- sizeof(*dt), M_NOWAIT | M_ZERO);
- if (mtag == NULL)
- return 1; /* Cannot allocate packet header. */
- m_tag_prepend(m, mtag); /* Attach to mbuf chain. */
- dt = (struct dn_pkt_tag *)(mtag + 1);
- dt->rule = fwa->rule;
- dt->rule.info &= IPFW_ONEPASS; /* only keep this info */
- dt->dn_dir = dir;
- dt->ifp = fwa->oif;
-	/* dt->output_time is updated as we move through */
- dt->output_time = dn_cfg.curr_time;
- return 0;
-}
-
-
-/*
- * dummynet hook for packets.
- * We use the argument to locate the flowset fs and the sched_set sch
- * associated with it. Then we apply flow_mask and sched_mask to
- * determine the queue and scheduler instances.
- *
- * dir where shall we send the packet after dummynet.
- * *m0 the mbuf with the packet
- * ifp the 'ifp' parameter from the caller.
- *		NULL in ip_input, destination interface in ip_output.
- */
-int
-dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
-{
- struct mbuf *m = *m0;
- struct dn_fsk *fs = NULL;
- struct dn_sch_inst *si;
- struct dn_queue *q = NULL; /* default */
-
- int fs_id = (fwa->rule.info & IPFW_INFO_MASK) +
- ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0);
- DN_BH_WLOCK();
- io_pkt++;
- /* we could actually tag outside the lock, but who cares... */
- if (tag_mbuf(m, dir, fwa))
- goto dropit;
- if (dn_cfg.busy) {
- /* if the upper half is busy doing something expensive,
-		 * let's queue the packet and move forward
- */
- mq_append(&dn_cfg.pending, m);
- m = *m0 = NULL; /* consumed */
- goto done; /* already active, nothing to do */
- }
- /* XXX locate_flowset could be optimised with a direct ref. */
- fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL);
- if (fs == NULL)
- goto dropit; /* This queue/pipe does not exist! */
- if (fs->sched == NULL) /* should not happen */
- goto dropit;
- /* find scheduler instance, possibly applying sched_mask */
- si = ipdn_si_find(fs->sched, &(fwa->f_id));
- if (si == NULL)
- goto dropit;
- /*
- * If the scheduler supports multiple queues, find the right one
- * (otherwise it will be ignored by enqueue).
- */
- if (fs->sched->fp->flags & DN_MULTIQUEUE) {
- q = ipdn_q_find(fs, si, &(fwa->f_id));
- if (q == NULL)
- goto dropit;
- }
- if (fs->sched->fp->enqueue(si, q, m)) {
- /* packet was dropped by enqueue() */
- m = *m0 = NULL;
- goto dropit;
- }
-
- if (si->kflags & DN_ACTIVE) {
- m = *m0 = NULL; /* consumed */
- goto done; /* already active, nothing to do */
- }
-
- /* compute the initial allowance */
- if (si->idle_time < dn_cfg.curr_time) {
- /* Do this only on the first packet on an idle pipe */
- struct dn_link *p = &fs->sched->link;
-
- si->sched_time = dn_cfg.curr_time;
- si->credit = dn_cfg.io_fast ? p->bandwidth : 0;
- if (p->burst) {
- uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth;
- if (burst > p->burst)
- burst = p->burst;
- si->credit += burst;
- }
- }
- /* pass through scheduler and delay line */
- m = serve_sched(NULL, si, dn_cfg.curr_time);
-
- /* optimization -- pass it back to ipfw for immediate send */
-	/* XXX Don't call dummynet_send() if the scheduler returns the packet
-	 * just enqueued. This avoids a lock order reversal.
-	 */
- if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) {
- /* fast io, rename the tag * to carry reinject info. */
- struct m_tag *tag = m_tag_first(m);
-
- tag->m_tag_cookie = MTAG_IPFW_RULE;
- tag->m_tag_id = 0;
- io_pkt_fast++;
- if (m->m_nextpkt != NULL) {
- printf("dummynet: fast io: pkt chain detected!\n");
- m->m_nextpkt = NULL;
- }
- m = NULL;
- } else {
- *m0 = NULL;
- }
-done:
- DN_BH_WUNLOCK();
- if (m)
- dummynet_send(m);
- return 0;
-
-dropit:
- io_pkt_drop++;
- DN_BH_WUNLOCK();
- if (m)
- FREE_PKT(m);
- *m0 = NULL;
- return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS;
-}
diff --git a/freebsd/sys/netpfil/ipfw/ip_dn_private.h b/freebsd/sys/netpfil/ipfw/ip_dn_private.h
index 159ddc9a..2fce1366 100644
--- a/freebsd/sys/netpfil/ipfw/ip_dn_private.h
+++ b/freebsd/sys/netpfil/ipfw/ip_dn_private.h
@@ -81,8 +81,13 @@ SLIST_HEAD(dn_fsk_head, dn_fsk);
SLIST_HEAD(dn_queue_head, dn_queue);
SLIST_HEAD(dn_alg_head, dn_alg);
+#ifdef NEW_AQM
+SLIST_HEAD(dn_aqm_head, dn_aqm); /* for new AQMs */
+#endif
+
struct mq { /* a basic queue of packets*/
struct mbuf *head, *tail;
+ int count;
};
static inline void
@@ -91,7 +96,7 @@ set_oid(struct dn_id *o, int type, int len)
o->type = type;
o->len = len;
o->subtype = 0;
-};
+}
/*
* configuration and global data for a dummynet instance
@@ -135,6 +140,9 @@ struct dn_parms {
/* list of flowsets without a scheduler -- use sch_chain */
struct dn_fsk_head fsu; /* list of unlinked flowsets */
struct dn_alg_head schedlist; /* list of algorithms */
+#ifdef NEW_AQM
+ struct dn_aqm_head aqmlist; /* list of AQMs */
+#endif
/* Store the fs/sch to scan when draining. The value is the
* bucket number of the hash table. Expire can be disabled
@@ -231,6 +239,10 @@ struct dn_fsk { /* kernel side of a flowset */
int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
int avg_pkt_size ; /* medium packet size */
int max_pkt_size ; /* max packet size */
+#ifdef NEW_AQM
+ struct dn_aqm *aqmfp; /* Pointer to AQM functions */
+ void *aqmcfg; /* configuration parameters for AQM */
+#endif
};
/*
@@ -253,6 +265,9 @@ struct dn_queue {
int count; /* arrivals since last RED drop */
int random; /* random value (scaled) */
uint64_t q_time; /* start of queue idle time */
+#ifdef NEW_AQM
+ void *aqm_status; /* per-queue status variables*/
+#endif
};
@@ -400,4 +415,49 @@ int do_config(void *p, int l);
void dn_drain_scheduler(void);
void dn_drain_queue(void);
+#ifdef NEW_AQM
+int ecn_mark(struct mbuf* m);
+
+/* moved from ip_dn_io.c to here to be available for AQM modules */
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+#ifdef USERSPACE
+ // buffers from netmap need to be copied
+ // XXX note that the routine is not expected to fail
+ ND("append %p to %p", m, q);
+ if (m->m_flags & M_STACK) {
+ struct mbuf *m_new;
+ void *p;
+ int l, ofs;
+
+ ofs = m->m_data - m->__m_extbuf;
+ // XXX allocate
+ MGETHDR(m_new, M_NOWAIT, MT_DATA);
+ ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p",
+ m, m->__m_extbuf, m->__m_extlen, ofs, m_new);
+ p = m_new->__m_extbuf; /* new pointer */
+ l = m_new->__m_extlen; /* new len */
+ if (l <= m->__m_extlen) {
+ panic("extlen too large");
+ }
+
+ *m_new = *m; // copy
+ m_new->m_flags &= ~M_STACK;
+ m_new->__m_extbuf = p; // point to new buffer
+ _pkt_copy(m->__m_extbuf, p, m->__m_extlen);
+ m_new->m_data = p + ofs;
+ m = m_new;
+ }
+#endif /* USERSPACE */
+ if (q->head == NULL)
+ q->head = m;
+ else
+ q->tail->m_nextpkt = m;
+ q->count++;
+ q->tail = m;
+ m->m_nextpkt = NULL;
+}
+#endif /* NEW_AQM */
+
#endif /* _IP_DN_PRIVATE_H */
diff --git a/freebsd/sys/netpfil/ipfw/ip_dummynet.c b/freebsd/sys/netpfil/ipfw/ip_dummynet.c
deleted file mode 100644
index 40c37d80..00000000
--- a/freebsd/sys/netpfil/ipfw/ip_dummynet.c
+++ /dev/null
@@ -1,2309 +0,0 @@
-#include <machine/rtems-bsd-kernel-space.h>
-
-/*-
- * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
- * Portions Copyright (c) 2000 Akamba Corp.
- * All rights reserved
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-/*
- * Configuration and internal object management for dummynet.
- */
-
-#include <rtems/bsd/local/opt_inet6.h>
-
-#include <rtems/bsd/sys/param.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/kernel.h>
-#include <rtems/bsd/sys/lock.h>
-#include <sys/module.h>
-#include <sys/priv.h>
-#include <sys/proc.h>
-#include <sys/rwlock.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/time.h>
-#include <sys/taskqueue.h>
-#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
-#include <netinet/in.h>
-#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
-#include <netinet/ip_fw.h>
-#include <netinet/ip_dummynet.h>
-
-#include <netpfil/ipfw/ip_fw_private.h>
-#include <netpfil/ipfw/dn_heap.h>
-#include <netpfil/ipfw/ip_dn_private.h>
-#include <netpfil/ipfw/dn_sched.h>
-
-/* which objects to copy */
-#define DN_C_LINK 0x01
-#define DN_C_SCH 0x02
-#define DN_C_FLOW 0x04
-#define DN_C_FS 0x08
-#define DN_C_QUEUE 0x10
-
-/* we use this argument in case of a schk_new */
-struct schk_new_arg {
- struct dn_alg *fp;
- struct dn_sch *sch;
-};
-
-/*---- callout hooks. ----*/
-static struct callout dn_timeout;
-static struct task dn_task;
-static struct taskqueue *dn_tq = NULL;
-
-static void
-dummynet(void * __unused unused)
-{
-
- taskqueue_enqueue(dn_tq, &dn_task);
-}
-
-void
-dn_reschedule(void)
-{
- callout_reset(&dn_timeout, 1, dummynet, NULL);
-}
-/*----- end of callout hooks -----*/
-
-/* Return a scheduler descriptor given the type or name. */
-static struct dn_alg *
-find_sched_type(int type, char *name)
-{
- struct dn_alg *d;
-
- SLIST_FOREACH(d, &dn_cfg.schedlist, next) {
- if (d->type == type || (name && !strcasecmp(d->name, name)))
- return d;
- }
- return NULL; /* not found */
-}
-
-int
-ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
-{
- int oldv = *v;
- const char *op = NULL;
- if (dflt < lo)
- dflt = lo;
- if (dflt > hi)
- dflt = hi;
- if (oldv < lo) {
- *v = dflt;
- op = "Bump";
- } else if (oldv > hi) {
- *v = hi;
- op = "Clamp";
- } else
- return *v;
- if (op && msg)
- printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
- return *v;
-}
-
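ipdn_bound_var() clamps a configuration value into [lo, hi], falling back to the (itself clamped) default when the value is below the minimum, and optionally logs the correction. A self-contained sketch of the same clamp semantics, with a hypothetical hash-table-size example:

#include <stdio.h>

/* same clamp semantics as ipdn_bound_var(), without the logging */
static int
bound_var(int *v, int dflt, int lo, int hi)
{
	if (dflt < lo)
		dflt = lo;
	if (dflt > hi)
		dflt = hi;
	if (*v < lo)
		*v = dflt;		/* bump to the default */
	else if (*v > hi)
		*v = hi;		/* clamp to the maximum */
	return *v;
}

int
main(void)
{
	int a = 4, b = 100000, c = 256;

	/* hypothetical use: keep a hash table size within [16, 65536], default 64 */
	printf("%d %d %d\n",
	    bound_var(&a, 64, 16, 65536),	/* too small -> 64 */
	    bound_var(&b, 64, 16, 65536),	/* too large -> 65536 */
	    bound_var(&c, 64, 16, 65536));	/* in range  -> 256 */
	return 0;
}
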
-/*---- flow_id mask, hash and compare functions ---*/
-/*
- * The flow_id includes the 5-tuple, the queue/pipe number
- * which we store in the extra area in host order,
- * and for ipv6 also the flow_id6.
- * XXX see if we want the tos byte (can store in 'flags')
- */
-static struct ipfw_flow_id *
-flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id)
-{
- int is_v6 = IS_IP6_FLOW_ID(id);
-
- id->dst_port &= mask->dst_port;
- id->src_port &= mask->src_port;
- id->proto &= mask->proto;
- id->extra &= mask->extra;
- if (is_v6) {
- APPLY_MASK(&id->dst_ip6, &mask->dst_ip6);
- APPLY_MASK(&id->src_ip6, &mask->src_ip6);
- id->flow_id6 &= mask->flow_id6;
- } else {
- id->dst_ip &= mask->dst_ip;
- id->src_ip &= mask->src_ip;
- }
- return id;
-}
-
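flow_id_mask() ANDs every field of the packet's flow id with the configured mask, so all packets that agree on the unmasked bits land in the same queue (or scheduler instance). A reduced IPv4-only sketch that aggregates traffic per /24 source network; the struct and field names here are simplified stand-ins for ipfw_flow_id:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

struct flow4 {
	uint32_t src_ip, dst_ip;	/* network byte order */
	uint16_t src_port, dst_port;
	uint8_t proto;
};

static void
mask_flow(const struct flow4 *mask, struct flow4 *id)
{
	id->src_ip &= mask->src_ip;
	id->dst_ip &= mask->dst_ip;
	id->src_port &= mask->src_port;
	id->dst_port &= mask->dst_port;
	id->proto &= mask->proto;
}

int
main(void)
{
	/* mask that keeps only the /24 source network, ignoring everything else */
	struct flow4 mask = { htonl(0xffffff00), 0, 0, 0, 0 };
	struct flow4 id = { inet_addr("192.168.1.77"), inet_addr("10.0.0.1"),
	    12345, 80, 6 };

	mask_flow(&mask, &id);
	/* all hosts in 192.168.1.0/24 now map to the same masked flow id */
	printf("masked src 0x%08x\n", (unsigned)ntohl(id.src_ip));
	return 0;
}
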
-/* computes an OR of two masks, result in dst and also returned */
-static struct ipfw_flow_id *
-flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst)
-{
- int is_v6 = IS_IP6_FLOW_ID(dst);
-
- dst->dst_port |= src->dst_port;
- dst->src_port |= src->src_port;
- dst->proto |= src->proto;
- dst->extra |= src->extra;
- if (is_v6) {
-#define OR_MASK(_d, _s) \
- (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \
- (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \
- (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \
- (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3];
- OR_MASK(&dst->dst_ip6, &src->dst_ip6);
- OR_MASK(&dst->src_ip6, &src->src_ip6);
-#undef OR_MASK
- dst->flow_id6 |= src->flow_id6;
- } else {
- dst->dst_ip |= src->dst_ip;
- dst->src_ip |= src->src_ip;
- }
- return dst;
-}
-
-static int
-nonzero_mask(struct ipfw_flow_id *m)
-{
- if (m->dst_port || m->src_port || m->proto || m->extra)
- return 1;
- if (IS_IP6_FLOW_ID(m)) {
- return
- m->dst_ip6.__u6_addr.__u6_addr32[0] ||
- m->dst_ip6.__u6_addr.__u6_addr32[1] ||
- m->dst_ip6.__u6_addr.__u6_addr32[2] ||
- m->dst_ip6.__u6_addr.__u6_addr32[3] ||
- m->src_ip6.__u6_addr.__u6_addr32[0] ||
- m->src_ip6.__u6_addr.__u6_addr32[1] ||
- m->src_ip6.__u6_addr.__u6_addr32[2] ||
- m->src_ip6.__u6_addr.__u6_addr32[3] ||
- m->flow_id6;
- } else {
- return m->dst_ip || m->src_ip;
- }
-}
-
-/* XXX we may want a better hash function */
-static uint32_t
-flow_id_hash(struct ipfw_flow_id *id)
-{
- uint32_t i;
-
- if (IS_IP6_FLOW_ID(id)) {
- uint32_t *d = (uint32_t *)&id->dst_ip6;
- uint32_t *s = (uint32_t *)&id->src_ip6;
- i = (d[0] ) ^ (d[1]) ^
- (d[2] ) ^ (d[3]) ^
- (d[0] >> 15) ^ (d[1] >> 15) ^
- (d[2] >> 15) ^ (d[3] >> 15) ^
- (s[0] << 1) ^ (s[1] << 1) ^
- (s[2] << 1) ^ (s[3] << 1) ^
- (s[0] << 16) ^ (s[1] << 16) ^
- (s[2] << 16) ^ (s[3] << 16) ^
- (id->dst_port << 1) ^ (id->src_port) ^
- (id->extra) ^
- (id->proto ) ^ (id->flow_id6);
- } else {
- i = (id->dst_ip) ^ (id->dst_ip >> 15) ^
- (id->src_ip << 1) ^ (id->src_ip >> 16) ^
- (id->extra) ^
- (id->dst_port << 1) ^ (id->src_port) ^ (id->proto);
- }
- return i;
-}
-
-/* Like bcmp, returns 0 if ids match, 1 otherwise. */
-static int
-flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2)
-{
- int is_v6 = IS_IP6_FLOW_ID(id1);
-
- if (!is_v6) {
- if (IS_IP6_FLOW_ID(id2))
- return 1; /* different address families */
-
- return (id1->dst_ip == id2->dst_ip &&
- id1->src_ip == id2->src_ip &&
- id1->dst_port == id2->dst_port &&
- id1->src_port == id2->src_port &&
- id1->proto == id2->proto &&
- id1->extra == id2->extra) ? 0 : 1;
- }
- /* the ipv6 case */
- return (
- !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) &&
- !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) &&
- id1->dst_port == id2->dst_port &&
- id1->src_port == id2->src_port &&
- id1->proto == id2->proto &&
- id1->extra == id2->extra &&
- id1->flow_id6 == id2->flow_id6) ? 0 : 1;
-}
-/*--------- end of flow-id mask, hash and compare ---------*/
-
-/*--- support functions for the qht hashtable ----
- * Entries are hashed by flow-id
- */
-static uint32_t
-q_hash(uintptr_t key, int flags, void *arg)
-{
- /* compute the hash slot from the flow id */
- struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
- &((struct dn_queue *)key)->ni.fid :
- (struct ipfw_flow_id *)key;
-
- return flow_id_hash(id);
-}
-
-static int
-q_match(void *obj, uintptr_t key, int flags, void *arg)
-{
- struct dn_queue *o = (struct dn_queue *)obj;
- struct ipfw_flow_id *id2;
-
- if (flags & DNHT_KEY_IS_OBJ) {
- /* compare pointers */
- id2 = &((struct dn_queue *)key)->ni.fid;
- } else {
- id2 = (struct ipfw_flow_id *)key;
- }
- return (0 == flow_id_cmp(&o->ni.fid, id2));
-}
-
-/*
- * create a new queue instance for the given 'key'.
- */
-static void *
-q_new(uintptr_t key, int flags, void *arg)
-{
- struct dn_queue *q, *template = arg;
- struct dn_fsk *fs = template->fs;
- int size = sizeof(*q) + fs->sched->fp->q_datalen;
-
- q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (q == NULL) {
- D("no memory for new queue");
- return NULL;
- }
-
- set_oid(&q->ni.oid, DN_QUEUE, size);
- if (fs->fs.flags & DN_QHT_HASH)
- q->ni.fid = *(struct ipfw_flow_id *)key;
- q->fs = fs;
- q->_si = template->_si;
- q->_si->q_count++;
-
- if (fs->sched->fp->new_queue)
- fs->sched->fp->new_queue(q);
- dn_cfg.queue_count++;
- return q;
-}
-
-/*
- * Notify schedulers that a queue is going away.
- * If (flags & DN_DESTROY), also free the packets.
- * The version for callbacks is called q_delete_cb().
- */
-static void
-dn_delete_queue(struct dn_queue *q, int flags)
-{
- struct dn_fsk *fs = q->fs;
-
- // D("fs %p si %p\n", fs, q->_si);
- /* notify the parent scheduler that the queue is going away */
- if (fs && fs->sched->fp->free_queue)
- fs->sched->fp->free_queue(q);
- q->_si->q_count--;
- q->_si = NULL;
- if (flags & DN_DESTROY) {
- if (q->mq.head)
- dn_free_pkts(q->mq.head);
- bzero(q, sizeof(*q)); // safety
- free(q, M_DUMMYNET);
- dn_cfg.queue_count--;
- }
-}
-
-static int
-q_delete_cb(void *q, void *arg)
-{
- int flags = (int)(uintptr_t)arg;
- dn_delete_queue(q, flags);
- return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0;
-}
-
-/*
- * calls dn_delete_queue/q_delete_cb on all queues,
- * which notifies the parent scheduler and possibly drains packets.
- * flags & DN_DESTROY: drains queues and destroys the qht;
- */
-static void
-qht_delete(struct dn_fsk *fs, int flags)
-{
- ND("fs %d start flags %d qht %p",
- fs->fs.fs_nr, flags, fs->qht);
- if (!fs->qht)
- return;
- if (fs->fs.flags & DN_QHT_HASH) {
- dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags);
- if (flags & DN_DESTROY) {
- dn_ht_free(fs->qht, 0);
- fs->qht = NULL;
- }
- } else {
- dn_delete_queue((struct dn_queue *)(fs->qht), flags);
- if (flags & DN_DESTROY)
- fs->qht = NULL;
- }
-}
-
-/*
- * Find and possibly create the queue for a MULTIQUEUE scheduler.
- * We never call it for !MULTIQUEUE (the queue is in the sch_inst).
- */
-struct dn_queue *
-ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si,
- struct ipfw_flow_id *id)
-{
- struct dn_queue template;
-
- template._si = si;
- template.fs = fs;
-
- if (fs->fs.flags & DN_QHT_HASH) {
- struct ipfw_flow_id masked_id;
- if (fs->qht == NULL) {
- fs->qht = dn_ht_init(NULL, fs->fs.buckets,
- offsetof(struct dn_queue, q_next),
- q_hash, q_match, q_new);
- if (fs->qht == NULL)
- return NULL;
- }
- masked_id = *id;
- flow_id_mask(&fs->fsk_mask, &masked_id);
- return dn_ht_find(fs->qht, (uintptr_t)&masked_id,
- DNHT_INSERT, &template);
- } else {
- if (fs->qht == NULL)
- fs->qht = q_new(0, 0, &template);
- return (struct dn_queue *)fs->qht;
- }
-}
-/*--- end of queue hash table ---*/
-
-/*--- support functions for the sch_inst hashtable ----
- *
- * These are hashed by flow-id
- */
-static uint32_t
-si_hash(uintptr_t key, int flags, void *arg)
-{
- /* compute the hash slot from the flow id */
- struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
- &((struct dn_sch_inst *)key)->ni.fid :
- (struct ipfw_flow_id *)key;
-
- return flow_id_hash(id);
-}
-
-static int
-si_match(void *obj, uintptr_t key, int flags, void *arg)
-{
- struct dn_sch_inst *o = obj;
- struct ipfw_flow_id *id2;
-
- id2 = (flags & DNHT_KEY_IS_OBJ) ?
- &((struct dn_sch_inst *)key)->ni.fid :
- (struct ipfw_flow_id *)key;
- return flow_id_cmp(&o->ni.fid, id2) == 0;
-}
-
-/*
- * Create a new instance for the given 'key'.
- * Allocate memory for the instance, delay line and scheduler private data.
- */
-static void *
-si_new(uintptr_t key, int flags, void *arg)
-{
- struct dn_schk *s = arg;
- struct dn_sch_inst *si;
- int l = sizeof(*si) + s->fp->si_datalen;
-
- si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (si == NULL)
- goto error;
-
- /* Set length only for the part passed up to userland. */
- set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow));
- set_oid(&(si->dline.oid), DN_DELAY_LINE,
- sizeof(struct delay_line));
- /* mark si and dline as outside the event queue */
- si->ni.oid.id = si->dline.oid.id = -1;
-
- si->sched = s;
- si->dline.si = si;
-
- if (s->fp->new_sched && s->fp->new_sched(si)) {
- D("new_sched error");
- goto error;
- }
- if (s->sch.flags & DN_HAVE_MASK)
- si->ni.fid = *(struct ipfw_flow_id *)key;
-
- dn_cfg.si_count++;
- return si;
-
-error:
- if (si) {
- bzero(si, sizeof(*si)); // safety
- free(si, M_DUMMYNET);
- }
- return NULL;
-}
-
-/*
- * Callback from siht to delete all scheduler instances. Remove
- * si and delay line from the system heap, destroy all queues.
- * We assume that all flowsets have been notified and do not
- * point to us anymore.
- */
-static int
-si_destroy(void *_si, void *arg)
-{
- struct dn_sch_inst *si = _si;
- struct dn_schk *s = si->sched;
- struct delay_line *dl = &si->dline;
-
- if (dl->oid.subtype) /* remove delay line from event heap */
- heap_extract(&dn_cfg.evheap, dl);
- dn_free_pkts(dl->mq.head); /* drain delay line */
- if (si->kflags & DN_ACTIVE) /* remove si from event heap */
- heap_extract(&dn_cfg.evheap, si);
- if (s->fp->free_sched)
- s->fp->free_sched(si);
- bzero(si, sizeof(*si)); /* safety */
- free(si, M_DUMMYNET);
- dn_cfg.si_count--;
- return DNHT_SCAN_DEL;
-}
-
-/*
- * Find the scheduler instance for this packet. If we need to apply
- * a mask, do it on a local copy of the flow_id to preserve the original.
- * Assume siht is always initialized if we have a mask.
- */
-struct dn_sch_inst *
-ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id)
-{
-
- if (s->sch.flags & DN_HAVE_MASK) {
- struct ipfw_flow_id id_t = *id;
- flow_id_mask(&s->sch.sched_mask, &id_t);
- return dn_ht_find(s->siht, (uintptr_t)&id_t,
- DNHT_INSERT, s);
- }
- if (!s->siht)
- s->siht = si_new(0, 0, s);
- return (struct dn_sch_inst *)s->siht;
-}
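/*
 * A hypothetical example of the mask handling above (addresses invented
 * for illustration): with a sched_mask whose dst_ip is 255.255.255.0,
 * packets sent to 10.0.0.1 and 10.0.0.200 produce the same masked
 * flow_id and therefore share one scheduler instance, while a packet to
 * 10.0.1.1 finds (or, via DNHT_INSERT, creates) a separate instance.
 */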
-
-/* callback to flush credit for the scheduler instance */
-static int
-si_reset_credit(void *_si, void *arg)
-{
- struct dn_sch_inst *si = _si;
- struct dn_link *p = &si->sched->link;
-
- si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0);
- return 0;
-}
-
-static void
-schk_reset_credit(struct dn_schk *s)
-{
- if (s->sch.flags & DN_HAVE_MASK)
- dn_ht_scan(s->siht, si_reset_credit, NULL);
- else if (s->siht)
- si_reset_credit(s->siht, NULL);
-}
-/*---- end of sch_inst hashtable ---------------------*/
-
-/*-------------------------------------------------------
- * flowset hash (fshash) support. Entries are hashed by fs_nr.
- * New allocations are put in the fsunlinked list, from which
- * they are removed when they point to a specific scheduler.
- */
-static uint32_t
-fsk_hash(uintptr_t key, int flags, void *arg)
-{
- uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
- ((struct dn_fsk *)key)->fs.fs_nr;
-
- return ( (i>>8)^(i>>4)^i );
-}
-
-static int
-fsk_match(void *obj, uintptr_t key, int flags, void *arg)
-{
- struct dn_fsk *fs = obj;
- int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
- ((struct dn_fsk *)key)->fs.fs_nr;
-
- return (fs->fs.fs_nr == i);
-}
-
-static void *
-fsk_new(uintptr_t key, int flags, void *arg)
-{
- struct dn_fsk *fs;
-
- fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (fs) {
- set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs));
- dn_cfg.fsk_count++;
- fs->drain_bucket = 0;
- SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
- }
- return fs;
-}
-
-/*
- * detach flowset from its current scheduler. Flags as follows:
- * DN_DETACH removes from the fsk_list
- * DN_DESTROY deletes individual queues
- * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked).
- */
-static void
-fsk_detach(struct dn_fsk *fs, int flags)
-{
- if (flags & DN_DELETE_FS)
- flags |= DN_DESTROY;
- ND("fs %d from sched %d flags %s %s %s",
- fs->fs.fs_nr, fs->fs.sched_nr,
- (flags & DN_DELETE_FS) ? "DEL_FS":"",
- (flags & DN_DESTROY) ? "DEL":"",
- (flags & DN_DETACH) ? "DET":"");
- if (flags & DN_DETACH) { /* detach from the list */
- struct dn_fsk_head *h;
- h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu;
- SLIST_REMOVE(h, fs, dn_fsk, sch_chain);
- }
- /* Free the RED parameters, they will be recomputed on
- * subsequent attach if needed.
- */
- if (fs->w_q_lookup)
- free(fs->w_q_lookup, M_DUMMYNET);
- fs->w_q_lookup = NULL;
- qht_delete(fs, flags);
- if (fs->sched && fs->sched->fp->free_fsk)
- fs->sched->fp->free_fsk(fs);
- fs->sched = NULL;
- if (flags & DN_DELETE_FS) {
- bzero(fs, sizeof(*fs)); /* safety */
- free(fs, M_DUMMYNET);
- dn_cfg.fsk_count--;
- } else {
- SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
- }
-}
-
-/*
- * Detach or destroy all flowsets in a list.
- * flags specifies what to do:
- * DN_DESTROY: flush all queues
- * DN_DELETE_FS: DN_DESTROY + destroy flowset
- * DN_DELETE_FS implies DN_DESTROY
- */
-static void
-fsk_detach_list(struct dn_fsk_head *h, int flags)
-{
- struct dn_fsk *fs;
- int n = 0; /* only for stats */
-
- ND("head %p flags %x", h, flags);
- while ((fs = SLIST_FIRST(h))) {
- SLIST_REMOVE_HEAD(h, sch_chain);
- n++;
- fsk_detach(fs, flags);
- }
- ND("done %d flowsets", n);
-}
-
-/*
- * called on 'queue X delete' -- removes the flowset from fshash,
- * deletes all queues for the flowset, and removes the flowset.
- */
-static int
-delete_fs(int i, int locked)
-{
- struct dn_fsk *fs;
- int err = 0;
-
- if (!locked)
- DN_BH_WLOCK();
- fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL);
- ND("fs %d found %p", i, fs);
- if (fs) {
- fsk_detach(fs, DN_DETACH | DN_DELETE_FS);
- err = 0;
- } else
- err = EINVAL;
- if (!locked)
- DN_BH_WUNLOCK();
- return err;
-}
-
-/*----- end of flowset hashtable support -------------*/
-
-/*------------------------------------------------------------
- * Scheduler hash. When searching by index we pass sched_nr,
- * otherwise we pass struct dn_sch * which is the first field in
- * struct dn_schk so we can cast between the two. We use this trick
- * during the create phase (but it should be fixed).
- */
-static uint32_t
-schk_hash(uintptr_t key, int flags, void *_arg)
-{
- uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
- ((struct dn_schk *)key)->sch.sched_nr;
- return ( (i>>8)^(i>>4)^i );
-}
-
-static int
-schk_match(void *obj, uintptr_t key, int flags, void *_arg)
-{
- struct dn_schk *s = (struct dn_schk *)obj;
- int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
- ((struct dn_schk *)key)->sch.sched_nr;
- return (s->sch.sched_nr == i);
-}
-
-/*
- * Create the entry and initialize its instance hash if needed.
- * Leave s->fp unset so we can tell whether a dn_ht_find() returns
- * a new object or a previously existing one.
- */
-static void *
-schk_new(uintptr_t key, int flags, void *arg)
-{
- struct schk_new_arg *a = arg;
- struct dn_schk *s;
- int l = sizeof(*s) +a->fp->schk_datalen;
-
- s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (s == NULL)
- return NULL;
- set_oid(&s->link.oid, DN_LINK, sizeof(s->link));
- s->sch = *a->sch; // copy initial values
- s->link.link_nr = s->sch.sched_nr;
- SLIST_INIT(&s->fsk_list);
- /* initialize the hash table or create the single instance */
- s->fp = a->fp; /* si_new needs this */
- s->drain_bucket = 0;
- if (s->sch.flags & DN_HAVE_MASK) {
- s->siht = dn_ht_init(NULL, s->sch.buckets,
- offsetof(struct dn_sch_inst, si_next),
- si_hash, si_match, si_new);
- if (s->siht == NULL) {
- free(s, M_DUMMYNET);
- return NULL;
- }
- }
- s->fp = NULL; /* mark as a new scheduler */
- dn_cfg.schk_count++;
- return s;
-}
-
-/*
- * Callback for sched delete. Notify all attached flowsets to
- * detach from the scheduler, destroy the internal flowset, and
- * all instances. The scheduler goes away too.
- * arg is 0 (only detach flowsets and destroy instances)
- * DN_DESTROY (detach & delete queues, delete schk)
- * or DN_DELETE_FS (delete queues and flowsets, delete schk)
- */
-static int
-schk_delete_cb(void *obj, void *arg)
-{
- struct dn_schk *s = obj;
-#if 0
- int a = (int)arg;
- ND("sched %d arg %s%s",
- s->sch.sched_nr,
- a&DN_DESTROY ? "DEL ":"",
- a&DN_DELETE_FS ? "DEL_FS":"");
-#endif
- fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0);
- /* no more flowset pointing to us now */
- if (s->sch.flags & DN_HAVE_MASK) {
- dn_ht_scan(s->siht, si_destroy, NULL);
- dn_ht_free(s->siht, 0);
- } else if (s->siht)
- si_destroy(s->siht, NULL);
- if (s->profile) {
- free(s->profile, M_DUMMYNET);
- s->profile = NULL;
- }
- s->siht = NULL;
- if (s->fp->destroy)
- s->fp->destroy(s);
- bzero(s, sizeof(*s)); // safety
- free(obj, M_DUMMYNET);
- dn_cfg.schk_count--;
- return DNHT_SCAN_DEL;
-}
-
-/*
- * called on a 'sched X delete' command. Deletes a single scheduler.
- * This is done by removing from the schedhash, unlinking all
- * flowsets and deleting their traffic.
- */
-static int
-delete_schk(int i)
-{
- struct dn_schk *s;
-
- s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
- ND("%d %p", i, s);
- if (!s)
- return EINVAL;
- delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */
- /* then detach flowsets, delete traffic */
- schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY);
- return 0;
-}
-/*--- end of schk hashtable support ---*/
-
-static int
-copy_obj(char **start, char *end, void *_o, const char *msg, int i)
-{
- struct dn_id *o = _o;
- int have = end - *start;
-
- if (have < o->len || o->len == 0 || o->type == 0) {
- D("(WARN) type %d %s %d have %d need %d",
- o->type, msg, i, have, o->len);
- return 1;
- }
- ND("type %d %s %d len %d", o->type, msg, i, o->len);
- bcopy(_o, *start, o->len);
- if (o->type == DN_LINK) {
- /* Adjust burst parameter for link */
- struct dn_link *l = (struct dn_link *)*start;
- l->burst = div64(l->burst, 8 * hz);
- l->delay = l->delay * 1000 / hz;
- } else if (o->type == DN_SCH) {
- /* Set id->id to the number of instances */
- struct dn_schk *s = _o;
- struct dn_id *id = (struct dn_id *)(*start);
- id->id = (s->sch.flags & DN_HAVE_MASK) ?
- dn_ht_entries(s->siht) : (s->siht ? 1 : 0);
- }
- *start += o->len;
- return 0;
-}
-
-/* Specific function to copy a queue.
- * Copies only the user-visible part of a queue (which is in
- * a struct dn_flow), and sets len accordingly.
- */
-static int
-copy_obj_q(char **start, char *end, void *_o, const char *msg, int i)
-{
- struct dn_id *o = _o;
- int have = end - *start;
- int len = sizeof(struct dn_flow); /* see above comment */
-
- if (have < len || o->len == 0 || o->type != DN_QUEUE) {
- D("ERROR type %d %s %d have %d need %d",
- o->type, msg, i, have, len);
- return 1;
- }
- ND("type %d %s %d len %d", o->type, msg, i, len);
- bcopy(_o, *start, len);
- ((struct dn_id*)(*start))->len = len;
- *start += len;
- return 0;
-}
-
-static int
-copy_q_cb(void *obj, void *arg)
-{
- struct dn_queue *q = obj;
- struct copy_args *a = arg;
- struct dn_flow *ni = (struct dn_flow *)(*a->start);
- if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1))
- return DNHT_SCAN_END;
- ni->oid.type = DN_FLOW; /* override the DN_QUEUE */
- ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL);
- return 0;
-}
-
-static int
-copy_q(struct copy_args *a, struct dn_fsk *fs, int flags)
-{
- if (!fs->qht)
- return 0;
- if (fs->fs.flags & DN_QHT_HASH)
- dn_ht_scan(fs->qht, copy_q_cb, a);
- else
- copy_q_cb(fs->qht, a);
- return 0;
-}
-
-/*
- * This routine only copies the initial part of a profile (the samples are omitted). XXX
- */
-static int
-copy_profile(struct copy_args *a, struct dn_profile *p)
-{
- int have = a->end - *a->start;
- /* XXX here we check for max length */
- int profile_len = sizeof(struct dn_profile) -
- ED_MAX_SAMPLES_NO*sizeof(int);
-
- if (p == NULL)
- return 0;
- if (have < profile_len) {
- D("error have %d need %d", have, profile_len);
- return 1;
- }
- bcopy(p, *a->start, profile_len);
- ((struct dn_id *)(*a->start))->len = profile_len;
- *a->start += profile_len;
- return 0;
-}
-
-static int
-copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags)
-{
- struct dn_fs *ufs = (struct dn_fs *)(*a->start);
- if (!fs)
- return 0;
- ND("flowset %d", fs->fs.fs_nr);
- if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr))
- return DNHT_SCAN_END;
- ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ?
- dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0);
- if (flags) { /* copy queues */
- copy_q(a, fs, 0);
- }
- return 0;
-}
-
-static int
-copy_si_cb(void *obj, void *arg)
-{
- struct dn_sch_inst *si = obj;
- struct copy_args *a = arg;
- struct dn_flow *ni = (struct dn_flow *)(*a->start);
- if (copy_obj(a->start, a->end, &si->ni, "inst",
- si->sched->sch.sched_nr))
- return DNHT_SCAN_END;
- ni->oid.type = DN_FLOW; /* override the DN_SCH_I */
- ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL);
- return 0;
-}
-
-static int
-copy_si(struct copy_args *a, struct dn_schk *s, int flags)
-{
- if (s->sch.flags & DN_HAVE_MASK)
- dn_ht_scan(s->siht, copy_si_cb, a);
- else if (s->siht)
- copy_si_cb(s->siht, a);
- return 0;
-}
-
-/*
- * compute a list of children of a scheduler and copy up
- */
-static int
-copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags)
-{
- struct dn_fsk *fs;
- struct dn_id *o;
- uint32_t *p;
-
- int n = 0, space = sizeof(*o);
- SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
- if (fs->fs.fs_nr < DN_MAX_ID)
- n++;
- }
- space += n * sizeof(uint32_t);
- DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n);
- if (a->end - *(a->start) < space)
- return DNHT_SCAN_END;
- o = (struct dn_id *)(*(a->start));
- o->len = space;
- *a->start += o->len;
- o->type = DN_TEXT;
- p = (uint32_t *)(o+1);
- SLIST_FOREACH(fs, &s->fsk_list, sch_chain)
- if (fs->fs.fs_nr < DN_MAX_ID)
- *p++ = fs->fs.fs_nr;
- return 0;
-}
-
-static int
-copy_data_helper(void *_o, void *_arg)
-{
- struct copy_args *a = _arg;
- uint32_t *r = a->extra->r; /* start of first range */
- uint32_t *lim; /* first invalid pointer */
- int n;
-
- lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len);
-
- if (a->type == DN_LINK || a->type == DN_SCH) {
- /* pipe|sched show, we receive a dn_schk */
- struct dn_schk *s = _o;
-
- n = s->sch.sched_nr;
- if (a->type == DN_SCH && n >= DN_MAX_ID)
- return 0; /* not a scheduler */
- if (a->type == DN_LINK && n <= DN_MAX_ID)
- return 0; /* not a pipe */
-
- /* see if the object is within one of our ranges */
- for (;r < lim; r += 2) {
- if (n < r[0] || n > r[1])
- continue;
- /* Found a valid entry, copy and we are done */
- if (a->flags & DN_C_LINK) {
- if (copy_obj(a->start, a->end,
- &s->link, "link", n))
- return DNHT_SCAN_END;
- if (copy_profile(a, s->profile))
- return DNHT_SCAN_END;
- if (copy_flowset(a, s->fs, 0))
- return DNHT_SCAN_END;
- }
- if (a->flags & DN_C_SCH) {
- if (copy_obj(a->start, a->end,
- &s->sch, "sched", n))
- return DNHT_SCAN_END;
- /* list all attached flowsets */
- if (copy_fsk_list(a, s, 0))
- return DNHT_SCAN_END;
- }
- if (a->flags & DN_C_FLOW)
- copy_si(a, s, 0);
- break;
- }
- } else if (a->type == DN_FS) {
- /* queue show, skip internal flowsets */
- struct dn_fsk *fs = _o;
-
- n = fs->fs.fs_nr;
- if (n >= DN_MAX_ID)
- return 0;
- /* see if the object is within one of our ranges */
- for (;r < lim; r += 2) {
- if (n < r[0] || n > r[1])
- continue;
- if (copy_flowset(a, fs, 0))
- return DNHT_SCAN_END;
- copy_q(a, fs, 0);
- break; /* we are done */
- }
- }
- return 0;
-}
-
-static inline struct dn_schk *
-locate_scheduler(int i)
-{
- return dn_ht_find(dn_cfg.schedhash, i, 0, NULL);
-}
-
-/*
- * RED parameters are in fixed-point arithmetic.
- */
-static int
-config_red(struct dn_fsk *fs)
-{
- int64_t s, idle, weight, w0;
- int t, i;
-
- fs->w_q = fs->fs.w_q;
- fs->max_p = fs->fs.max_p;
- ND("called");
- /* Doing stuff that was in userland */
- i = fs->sched->link.bandwidth;
- s = (i <= 0) ? 0 :
- hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i;
-
- idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */
- fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth);
- /* fs->lookup_step not scaled, */
- if (!fs->lookup_step)
- fs->lookup_step = 1;
- w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled
-
- for (t = fs->lookup_step; t > 1; --t)
- weight = SCALE_MUL(weight, w0);
- fs->lookup_weight = (int)(weight); // scaled
-
- /* Now doing stuff that was in kerneland */
- fs->min_th = SCALE(fs->fs.min_th);
- fs->max_th = SCALE(fs->fs.max_th);
-
- fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th);
- fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th));
-
- if (fs->fs.flags & DN_IS_GENTLE_RED) {
- fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th;
- fs->c_4 = SCALE(1) - 2 * fs->max_p;
- }
-
-	/* If the lookup table already exists, free it and create it again. */
- if (fs->w_q_lookup) {
- free(fs->w_q_lookup, M_DUMMYNET);
- fs->w_q_lookup = NULL;
- }
- if (dn_cfg.red_lookup_depth == 0) {
-		printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
-		    " must be > 0\n");
- fs->fs.flags &= ~DN_IS_RED;
- fs->fs.flags &= ~DN_IS_GENTLE_RED;
- return (EINVAL);
- }
- fs->lookup_depth = dn_cfg.red_lookup_depth;
- fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int),
- M_DUMMYNET, M_NOWAIT);
- if (fs->w_q_lookup == NULL) {
- printf("dummynet: sorry, cannot allocate red lookup table\n");
- fs->fs.flags &= ~DN_IS_RED;
- fs->fs.flags &= ~DN_IS_GENTLE_RED;
- return(ENOSPC);
- }
-
- /* Fill the lookup table with (1 - w_q)^x */
- fs->w_q_lookup[0] = SCALE(1) - fs->w_q;
-
- for (i = 1; i < fs->lookup_depth; i++)
- fs->w_q_lookup[i] =
- SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight);
-
- if (dn_cfg.red_avg_pkt_size < 1)
- dn_cfg.red_avg_pkt_size = 512;
- fs->avg_pkt_size = dn_cfg.red_avg_pkt_size;
- if (dn_cfg.red_max_pkt_size < 1)
- dn_cfg.red_max_pkt_size = 1500;
- fs->max_pkt_size = dn_cfg.red_max_pkt_size;
- ND("exit");
- return 0;
-}
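/*
 * A reading of the table computed above, derived from the loop rather
 * than stated explicitly in the code: w_q_lookup[0] holds (1 - w_q) and
 * every later entry is the previous one times
 * lookup_weight = (1 - w_q)^lookup_step, so entry i holds
 * (1 - w_q)^(1 + i*lookup_step) in the same fixed-point scale, i.e. a
 * precomputed decay factor for an idle period of about i*lookup_step steps.
 */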
-
-/* Scan all flowsets attached to this scheduler and update RED */
-static void
-update_red(struct dn_schk *s)
-{
- struct dn_fsk *fs;
- SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
- if (fs && (fs->fs.flags & DN_IS_RED))
- config_red(fs);
- }
-}
-
-/* attach flowset to scheduler s, possibly requeue */
-static void
-fsk_attach(struct dn_fsk *fs, struct dn_schk *s)
-{
- ND("remove fs %d from fsunlinked, link to sched %d",
- fs->fs.fs_nr, s->sch.sched_nr);
- SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain);
- fs->sched = s;
- SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain);
- if (s->fp->new_fsk)
- s->fp->new_fsk(fs);
- /* XXX compute fsk_mask */
- fs->fsk_mask = fs->fs.flow_mask;
- if (fs->sched->sch.flags & DN_HAVE_MASK)
- flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask);
- if (fs->qht) {
- /*
- * we must drain qht according to the old
- * type, and reinsert according to the new one.
- * The requeue is complex -- in general we need to
- * reclassify every single packet.
- * For the time being, let's hope qht is never set
- * when we reach this point.
- */
- D("XXX TODO requeue from fs %d to sch %d",
- fs->fs.fs_nr, s->sch.sched_nr);
- fs->qht = NULL;
- }
- /* set the new type for qht */
- if (nonzero_mask(&fs->fsk_mask))
- fs->fs.flags |= DN_QHT_HASH;
- else
- fs->fs.flags &= ~DN_QHT_HASH;
-
- /* XXX config_red() can fail... */
- if (fs->fs.flags & DN_IS_RED)
- config_red(fs);
-}
-
-/* update all flowsets which may refer to this scheduler */
-static void
-update_fs(struct dn_schk *s)
-{
- struct dn_fsk *fs, *tmp;
-
- SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) {
- if (s->sch.sched_nr != fs->fs.sched_nr) {
- D("fs %d for sch %d not %d still unlinked",
- fs->fs.fs_nr, fs->fs.sched_nr,
- s->sch.sched_nr);
- continue;
- }
- fsk_attach(fs, s);
- }
-}
-
-/*
- * Configuration -- to preserve backward compatibility we use
- * the following scheme (N is 65536)
- * NUMBER SCHED LINK FLOWSET
- * 1 .. N-1 (1)WFQ (2)WFQ (3)queue
- * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1
- * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1
- *
- * "pipe i config" configures #1, #2 and #3
- * "sched i config" configures #1 and possibly #6
- * "queue i config" configures #3
- * #1 is configured with 'pipe i config' or 'sched i config'
- * #2 is configured with 'pipe i config', and created if not
- * existing with 'sched i config'
- * #3 is configured with 'queue i config'
- * #4 is automatically configured after #1, can only be FIFO
- * #5 is automatically configured after #2
- * #6 is automatically created when #1 is !MULTIQUEUE,
- * and can be updated.
- * #7 is automatically configured after #2
- */
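/*
 * A worked example of the numbering scheme above (hypothetical numbers,
 * N = 65536 as stated): "pipe 5 config" configures scheduler #5 and
 * link #5 (cases 1 and 2); the FIFO variants created after them are
 * scheduler and link 65541 (cases 4 and 5); flowset 65541 is the internal
 * flowset for scheduler 5, created only when that scheduler is
 * !MULTIQUEUE (case 6), and flowset 131077 is the internal flowset of the
 * FIFO scheduler 65541 (case 7); "queue 5 config" configures plain
 * flowset 5 (case 3).
 */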
-
-/*
- * configure a link (and its FIFO instance)
- */
-static int
-config_link(struct dn_link *p, struct dn_id *arg)
-{
- int i;
-
- if (p->oid.len != sizeof(*p)) {
- D("invalid pipe len %d", p->oid.len);
- return EINVAL;
- }
- i = p->link_nr;
- if (i <= 0 || i >= DN_MAX_ID)
- return EINVAL;
- /*
- * The config program passes parameters as follows:
- * bw = bits/second (0 means no limits),
- * delay = ms, must be translated into ticks.
- * qsize = slots/bytes
- * burst ???
- */
- p->delay = (p->delay * hz) / 1000;
- /* Scale burst size: bytes -> bits * hz */
- p->burst *= 8 * hz;
-
- DN_BH_WLOCK();
- /* do it twice, base link and FIFO link */
- for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) {
- struct dn_schk *s = locate_scheduler(i);
- if (s == NULL) {
- DN_BH_WUNLOCK();
- D("sched %d not found", i);
- return EINVAL;
- }
- /* remove profile if exists */
- if (s->profile) {
- free(s->profile, M_DUMMYNET);
- s->profile = NULL;
- }
- /* copy all parameters */
- s->link.oid = p->oid;
- s->link.link_nr = i;
- s->link.delay = p->delay;
- if (s->link.bandwidth != p->bandwidth) {
- /* XXX bandwidth changes, need to update red params */
- s->link.bandwidth = p->bandwidth;
- update_red(s);
- }
- s->link.burst = p->burst;
- schk_reset_credit(s);
- }
- dn_cfg.id++;
- DN_BH_WUNLOCK();
- return 0;
-}
-
-/*
- * Configure a flowset. Can be called from inside with locked=1.
- */
-static struct dn_fsk *
-config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
-{
- int i;
- struct dn_fsk *fs;
-
- if (nfs->oid.len != sizeof(*nfs)) {
- D("invalid flowset len %d", nfs->oid.len);
- return NULL;
- }
- i = nfs->fs_nr;
- if (i <= 0 || i >= 3*DN_MAX_ID)
- return NULL;
- ND("flowset %d", i);
- /* XXX other sanity checks */
- if (nfs->flags & DN_QSIZE_BYTES) {
- ipdn_bound_var(&nfs->qsize, 16384,
- 1500, dn_cfg.byte_limit, NULL); // "queue byte size");
- } else {
- ipdn_bound_var(&nfs->qsize, 50,
- 1, dn_cfg.slot_limit, NULL); // "queue slot size");
- }
- if (nfs->flags & DN_HAVE_MASK) {
- /* make sure we have some buckets */
- ipdn_bound_var(&nfs->buckets, dn_cfg.hash_size,
- 1, dn_cfg.max_hash_size, "flowset buckets");
- } else {
- nfs->buckets = 1; /* we only need 1 */
- }
- if (!locked)
- DN_BH_WLOCK();
- do { /* exit with break when done */
- struct dn_schk *s;
- int flags = nfs->sched_nr ? DNHT_INSERT : 0;
- int j;
- int oldc = dn_cfg.fsk_count;
- fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL);
- if (fs == NULL) {
- D("missing sched for flowset %d", i);
- break;
- }
- /* grab some defaults from the existing one */
- if (nfs->sched_nr == 0) /* reuse */
- nfs->sched_nr = fs->fs.sched_nr;
- for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) {
- if (nfs->par[j] == -1) /* reuse */
- nfs->par[j] = fs->fs.par[j];
- }
- if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) {
- ND("flowset %d unchanged", i);
- break; /* no change, nothing to do */
- }
- if (oldc != dn_cfg.fsk_count) /* new item */
- dn_cfg.id++;
- s = locate_scheduler(nfs->sched_nr);
- /* detach from old scheduler if needed, preserving
- * queues if we need to reattach. Then update the
- * configuration, and possibly attach to the new sched.
- */
- DX(2, "fs %d changed sched %d@%p to %d@%p",
- fs->fs.fs_nr,
- fs->fs.sched_nr, fs->sched, nfs->sched_nr, s);
- if (fs->sched) {
- int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY);
- flags |= DN_DESTROY; /* XXX temporary */
- fsk_detach(fs, flags);
- }
- fs->fs = *nfs; /* copy configuration */
- if (s != NULL)
- fsk_attach(fs, s);
- } while (0);
- if (!locked)
- DN_BH_WUNLOCK();
- return fs;
-}
-
-/*
- * config/reconfig a scheduler and its FIFO variant.
- * For !MULTIQUEUE schedulers, also set up the flowset.
- *
- * On reconfigurations (detected because s->fp is set),
- * detach existing flowsets preserving traffic, preserve link,
- * and delete the old scheduler creating a new one.
- */
-static int
-config_sched(struct dn_sch *_nsch, struct dn_id *arg)
-{
- struct dn_schk *s;
- struct schk_new_arg a; /* argument for schk_new */
- int i;
- struct dn_link p; /* copy of oldlink */
- struct dn_profile *pf = NULL; /* copy of old link profile */
-	/* Used to preserve the mask parameter */
- struct ipfw_flow_id new_mask;
- int new_buckets = 0;
- int new_flags = 0;
- int pipe_cmd;
- int err = ENOMEM;
-
- a.sch = _nsch;
- if (a.sch->oid.len != sizeof(*a.sch)) {
- D("bad sched len %d", a.sch->oid.len);
- return EINVAL;
- }
- i = a.sch->sched_nr;
- if (i <= 0 || i >= DN_MAX_ID)
- return EINVAL;
- /* make sure we have some buckets */
- if (a.sch->flags & DN_HAVE_MASK)
- ipdn_bound_var(&a.sch->buckets, dn_cfg.hash_size,
- 1, dn_cfg.max_hash_size, "sched buckets");
- /* XXX other sanity checks */
- bzero(&p, sizeof(p));
-
- pipe_cmd = a.sch->flags & DN_PIPE_CMD;
- a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set?
- if (pipe_cmd) {
- /* Copy mask parameter */
- new_mask = a.sch->sched_mask;
- new_buckets = a.sch->buckets;
- new_flags = a.sch->flags;
- }
- DN_BH_WLOCK();
-again: /* run twice, for wfq and fifo */
- /*
- * lookup the type. If not supplied, use the previous one
- * or default to WF2Q+. Otherwise, return an error.
- */
- dn_cfg.id++;
- a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name);
- if (a.fp != NULL) {
- /* found. Lookup or create entry */
- s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a);
- } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) {
- /* No type. search existing s* or retry with WF2Q+ */
- s = dn_ht_find(dn_cfg.schedhash, i, 0, &a);
- if (s != NULL) {
- a.fp = s->fp;
- /* Scheduler exists, skip to FIFO scheduler
- * if command was pipe config...
- */
- if (pipe_cmd)
- goto next;
- } else {
- /* New scheduler, create a wf2q+ with no mask
- * if command was pipe config...
- */
- if (pipe_cmd) {
- /* clear mask parameter */
- bzero(&a.sch->sched_mask, sizeof(new_mask));
- a.sch->buckets = 0;
- a.sch->flags &= ~DN_HAVE_MASK;
- }
- a.sch->oid.subtype = DN_SCHED_WF2QP;
- goto again;
- }
- } else {
- D("invalid scheduler type %d %s",
- a.sch->oid.subtype, a.sch->name);
- err = EINVAL;
- goto error;
- }
- /* normalize name and subtype */
- a.sch->oid.subtype = a.fp->type;
- bzero(a.sch->name, sizeof(a.sch->name));
- strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name));
- if (s == NULL) {
- D("cannot allocate scheduler %d", i);
- goto error;
- }
- /* restore existing link if any */
- if (p.link_nr) {
- s->link = p;
- if (!pf || pf->link_nr != p.link_nr) { /* no saved value */
- s->profile = NULL; /* XXX maybe not needed */
- } else {
- s->profile = malloc(sizeof(struct dn_profile),
- M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (s->profile == NULL) {
- D("cannot allocate profile");
- goto error; //XXX
- }
- bcopy(pf, s->profile, sizeof(*pf));
- }
- }
- p.link_nr = 0;
- if (s->fp == NULL) {
- DX(2, "sched %d new type %s", i, a.fp->name);
- } else if (s->fp != a.fp ||
- bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) {
- /* already existing. */
- DX(2, "sched %d type changed from %s to %s",
- i, s->fp->name, a.fp->name);
- DX(4, " type/sub %d/%d -> %d/%d",
- s->sch.oid.type, s->sch.oid.subtype,
- a.sch->oid.type, a.sch->oid.subtype);
- if (s->link.link_nr == 0)
- D("XXX WARNING link 0 for sched %d", i);
- p = s->link; /* preserve link */
- if (s->profile) {/* preserve profile */
- if (!pf)
- pf = malloc(sizeof(*pf),
- M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (pf) /* XXX should issue a warning otherwise */
- bcopy(s->profile, pf, sizeof(*pf));
- }
- /* remove from the hash */
- dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
- /* Detach flowsets, preserve queues. */
- // schk_delete_cb(s, NULL);
- // XXX temporarily, kill queues
- schk_delete_cb(s, (void *)DN_DESTROY);
- goto again;
- } else {
- DX(4, "sched %d unchanged type %s", i, a.fp->name);
- }
- /* complete initialization */
- s->sch = *a.sch;
- s->fp = a.fp;
- s->cfg = arg;
- // XXX schk_reset_credit(s);
- /* create the internal flowset if needed,
- * trying to reuse existing ones if available
- */
- if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) {
- s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL);
- if (!s->fs) {
- struct dn_fs fs;
- bzero(&fs, sizeof(fs));
- set_oid(&fs.oid, DN_FS, sizeof(fs));
- fs.fs_nr = i + DN_MAX_ID;
- fs.sched_nr = i;
- s->fs = config_fs(&fs, NULL, 1 /* locked */);
- }
- if (!s->fs) {
- schk_delete_cb(s, (void *)DN_DESTROY);
- D("error creating internal fs for %d", i);
- goto error;
- }
- }
- /* call init function after the flowset is created */
- if (s->fp->config)
- s->fp->config(s);
- update_fs(s);
-next:
- if (i < DN_MAX_ID) { /* now configure the FIFO instance */
- i += DN_MAX_ID;
- if (pipe_cmd) {
- /* Restore mask parameter for FIFO */
- a.sch->sched_mask = new_mask;
- a.sch->buckets = new_buckets;
- a.sch->flags = new_flags;
- } else {
- /* sched config shouldn't modify the FIFO scheduler */
- if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) {
- /* FIFO already exist, don't touch it */
- err = 0; /* and this is not an error */
- goto error;
- }
- }
- a.sch->sched_nr = i;
- a.sch->oid.subtype = DN_SCHED_FIFO;
- bzero(a.sch->name, sizeof(a.sch->name));
- goto again;
- }
- err = 0;
-error:
- DN_BH_WUNLOCK();
- if (pf)
- free(pf, M_DUMMYNET);
- return err;
-}
-
-/*
- * attach a profile to a link
- */
-static int
-config_profile(struct dn_profile *pf, struct dn_id *arg)
-{
- struct dn_schk *s;
- int i, olen, err = 0;
-
- if (pf->oid.len < sizeof(*pf)) {
- D("short profile len %d", pf->oid.len);
- return EINVAL;
- }
- i = pf->link_nr;
- if (i <= 0 || i >= DN_MAX_ID)
- return EINVAL;
- /* XXX other sanity checks */
- DN_BH_WLOCK();
- for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) {
- s = locate_scheduler(i);
-
- if (s == NULL) {
- err = EINVAL;
- break;
- }
- dn_cfg.id++;
- /*
- * If we had a profile and the new one does not fit,
- * or it is deleted, then we need to free memory.
- */
- if (s->profile && (pf->samples_no == 0 ||
- s->profile->oid.len < pf->oid.len)) {
- free(s->profile, M_DUMMYNET);
- s->profile = NULL;
- }
- if (pf->samples_no == 0)
- continue;
- /*
- * new profile, possibly allocate memory
- * and copy data.
- */
- if (s->profile == NULL)
- s->profile = malloc(pf->oid.len,
- M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (s->profile == NULL) {
- D("no memory for profile %d", i);
- err = ENOMEM;
- break;
- }
- /* preserve larger length XXX double check */
- olen = s->profile->oid.len;
- if (olen < pf->oid.len)
- olen = pf->oid.len;
- bcopy(pf, s->profile, pf->oid.len);
- s->profile->oid.len = olen;
- }
- DN_BH_WUNLOCK();
- return err;
-}
-
-/*
- * Delete all objects:
- */
-static void
-dummynet_flush(void)
-{
-
- /* delete all schedulers and related links/queues/flowsets */
- dn_ht_scan(dn_cfg.schedhash, schk_delete_cb,
- (void *)(uintptr_t)DN_DELETE_FS);
- /* delete all remaining (unlinked) flowsets */
- DX(4, "still %d unlinked fs", dn_cfg.fsk_count);
- dn_ht_free(dn_cfg.fshash, DNHT_REMOVE);
- fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS);
- /* Reinitialize system heap... */
- heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
-}
-
-/*
- * Main handler for configuration. We are guaranteed to be called
- * with an oid which is at least a dn_id.
- * - the first object is the command (config, delete, flush, ...)
- * - config_link must be issued after the corresponding config_sched
- * - parameters (DN_TEXT) for an object must precede the object
- * processed on a config_sched.
- */
-int
-do_config(void *p, int l)
-{
- struct dn_id *next, *o;
- int err = 0, err2 = 0;
- struct dn_id *arg = NULL;
- uintptr_t *a;
-
- o = p;
- if (o->id != DN_API_VERSION) {
- D("invalid api version got %d need %d",
- o->id, DN_API_VERSION);
- return EINVAL;
- }
- for (; l >= sizeof(*o); o = next) {
- struct dn_id *prev = arg;
- if (o->len < sizeof(*o) || l < o->len) {
- D("bad len o->len %d len %d", o->len, l);
- err = EINVAL;
- break;
- }
- l -= o->len;
- next = (struct dn_id *)((char *)o + o->len);
- err = 0;
- switch (o->type) {
- default:
- D("cmd %d not implemented", o->type);
- break;
-
-#ifdef EMULATE_SYSCTL
- /* sysctl emulation.
- * if we recognize the command, jump to the correct
- * handler and return
- */
- case DN_SYSCTL_SET:
- err = kesysctl_emu_set(p, l);
- return err;
-#endif
-
- case DN_CMD_CONFIG: /* simply a header */
- break;
-
- case DN_CMD_DELETE:
- /* the argument is in the first uintptr_t after o */
- a = (uintptr_t *)(o+1);
- if (o->len < sizeof(*o) + sizeof(*a)) {
- err = EINVAL;
- break;
- }
- switch (o->subtype) {
- case DN_LINK:
- /* delete base and derived schedulers */
- DN_BH_WLOCK();
- err = delete_schk(*a);
- err2 = delete_schk(*a + DN_MAX_ID);
- DN_BH_WUNLOCK();
- if (!err)
- err = err2;
- break;
-
- default:
- D("invalid delete type %d",
- o->subtype);
- err = EINVAL;
- break;
-
- case DN_FS:
- err = (*a <1 || *a >= DN_MAX_ID) ?
- EINVAL : delete_fs(*a, 0) ;
- break;
- }
- break;
-
- case DN_CMD_FLUSH:
- DN_BH_WLOCK();
- dummynet_flush();
- DN_BH_WUNLOCK();
- break;
-		case DN_TEXT: /* store argument for the next block */
- prev = NULL;
- arg = o;
- break;
- case DN_LINK:
- err = config_link((struct dn_link *)o, arg);
- break;
- case DN_PROFILE:
- err = config_profile((struct dn_profile *)o, arg);
- break;
- case DN_SCH:
- err = config_sched((struct dn_sch *)o, arg);
- break;
- case DN_FS:
- err = (NULL==config_fs((struct dn_fs *)o, arg, 0));
- break;
- }
- if (prev)
- arg = NULL;
- if (err != 0)
- break;
- }
- return err;
-}
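/*
 * A minimal sketch, not part of the original code, of a buffer that
 * do_config() above would accept: a DN_CMD_CONFIG header carrying
 * DN_API_VERSION followed by one dn_sch object handled by config_sched().
 * The scheduler number is hypothetical; error handling is omitted.
 */
	struct {
		struct dn_id hdr;	/* command header, first object */
		struct dn_sch sch;	/* scheduler template */
	} req;

	bzero(&req, sizeof(req));
	req.hdr.len = sizeof(req.hdr);
	req.hdr.type = DN_CMD_CONFIG;
	req.hdr.id = DN_API_VERSION;		/* verified on the first object */
	req.sch.oid.len = sizeof(req.sch);	/* config_sched() checks this */
	req.sch.oid.type = DN_SCH;
	req.sch.sched_nr = 5;			/* hypothetical scheduler number */
	/* err = do_config(&req, sizeof(req)); */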
-
-static int
-compute_space(struct dn_id *cmd, struct copy_args *a)
-{
- int x = 0, need = 0;
- int profile_size = sizeof(struct dn_profile) -
- ED_MAX_SAMPLES_NO*sizeof(int);
-
- /* NOTE about compute space:
- * NP = dn_cfg.schk_count
- * NSI = dn_cfg.si_count
- * NF = dn_cfg.fsk_count
- * NQ = dn_cfg.queue_count
- * - ipfw pipe show
- * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler
- * link, scheduler template, flowset
- * integrated in scheduler and header
- * for flowset list
- * (NSI)*(dn_flow) all scheduler instance (includes
- * the queue instance)
- * - ipfw sched show
- * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler
- * link, scheduler template, flowset
- * integrated in scheduler and header
- * for flowset list
- * (NSI * dn_flow) all scheduler instances
- * (NF * sizeof(uint_32)) space for flowset list linked to scheduler
- * (NQ * dn_queue) all queue [XXXfor now not listed]
- * - ipfw queue show
- * (NF * dn_fs) all flowset
- * (NQ * dn_queue) all queues
- */
- switch (cmd->subtype) {
- default:
- return -1;
- /* XXX where do LINK and SCH differ ? */
-	/* 'ipfw sched show' could list all queues associated with
-	 * a scheduler. This feature is disabled for now.
-	 */
- case DN_LINK: /* pipe show */
- x = DN_C_LINK | DN_C_SCH | DN_C_FLOW;
- need += dn_cfg.schk_count *
- (sizeof(struct dn_fs) + profile_size) / 2;
- need += dn_cfg.fsk_count * sizeof(uint32_t);
- break;
- case DN_SCH: /* sched show */
- need += dn_cfg.schk_count *
- (sizeof(struct dn_fs) + profile_size) / 2;
- need += dn_cfg.fsk_count * sizeof(uint32_t);
- x = DN_C_SCH | DN_C_LINK | DN_C_FLOW;
- break;
- case DN_FS: /* queue show */
- x = DN_C_FS | DN_C_QUEUE;
- break;
- case DN_GET_COMPAT: /* compatibility mode */
- need = dn_compat_calc_size();
- break;
- }
- a->flags = x;
- if (x & DN_C_SCH) {
- need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2;
-		/* NOTE also, each fs might be attached to a sched */
- need += dn_cfg.schk_count * sizeof(struct dn_id) / 2;
- }
- if (x & DN_C_FS)
- need += dn_cfg.fsk_count * sizeof(struct dn_fs);
- if (x & DN_C_LINK) {
- need += dn_cfg.schk_count * sizeof(struct dn_link) / 2;
- }
- /*
- * When exporting a queue to userland, only pass up the
- * struct dn_flow, which is the only visible part.
- */
-
- if (x & DN_C_QUEUE)
- need += dn_cfg.queue_count * sizeof(struct dn_flow);
- if (x & DN_C_FLOW)
- need += dn_cfg.si_count * (sizeof(struct dn_flow));
- return need;
-}
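/*
 * A rough worked example of the accounting above (hypothetical counts):
 * a single pipe gives dn_cfg.schk_count == 2 (the pipe plus its FIFO
 * shadow), so "ipfw pipe show" budgets one dn_link + dn_sch + dn_id +
 * dn_fs + truncated profile for the visible half, one uint32_t per
 * attached flowset (dn_cfg.fsk_count) and one dn_flow per scheduler
 * instance (dn_cfg.si_count).
 */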
-
-/*
- * If compat != NULL dummynet_get is called in compatibility mode.
- * *compat will be the pointer to the buffer to pass to ipfw
- */
-int
-dummynet_get(struct sockopt *sopt, void **compat)
-{
- int have, i, need, error;
- char *start = NULL, *buf;
- size_t sopt_valsize;
- struct dn_id *cmd;
- struct copy_args a;
- struct copy_range r;
- int l = sizeof(struct dn_id);
-
- bzero(&a, sizeof(a));
- bzero(&r, sizeof(r));
-
- /* save and restore original sopt_valsize around copyin */
- sopt_valsize = sopt->sopt_valsize;
-
- cmd = &r.o;
-
- if (!compat) {
- /* copy at least an oid, and possibly a full object */
- error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd));
- sopt->sopt_valsize = sopt_valsize;
- if (error)
- goto done;
- l = cmd->len;
-#ifdef EMULATE_SYSCTL
- /* sysctl emulation. */
- if (cmd->type == DN_SYSCTL_GET)
- return kesysctl_emu_get(sopt);
-#endif
- if (l > sizeof(r)) {
- /* request larger than default, allocate buffer */
- cmd = malloc(l, M_DUMMYNET, M_WAITOK);
- error = sooptcopyin(sopt, cmd, l, l);
- sopt->sopt_valsize = sopt_valsize;
- if (error)
- goto done;
- }
- } else { /* compatibility */
- error = 0;
- cmd->type = DN_CMD_GET;
- cmd->len = sizeof(struct dn_id);
- cmd->subtype = DN_GET_COMPAT;
- // cmd->id = sopt_valsize;
- D("compatibility mode");
- }
- a.extra = (struct copy_range *)cmd;
- if (cmd->len == sizeof(*cmd)) { /* no range, create a default */
- uint32_t *rp = (uint32_t *)(cmd + 1);
- cmd->len += 2* sizeof(uint32_t);
- rp[0] = 1;
- rp[1] = DN_MAX_ID - 1;
- if (cmd->subtype == DN_LINK) {
- rp[0] += DN_MAX_ID;
- rp[1] += DN_MAX_ID;
- }
- }
- /* Count space (under lock) and allocate (outside lock).
- * Exit with lock held if we manage to get enough buffer.
- * Try a few times then give up.
- */
- for (have = 0, i = 0; i < 10; i++) {
- DN_BH_WLOCK();
- need = compute_space(cmd, &a);
-
- /* if there is a range, ignore value from compute_space() */
- if (l > sizeof(*cmd))
- need = sopt_valsize - sizeof(*cmd);
-
- if (need < 0) {
- DN_BH_WUNLOCK();
- error = EINVAL;
- goto done;
- }
- need += sizeof(*cmd);
- cmd->id = need;
- if (have >= need)
- break;
-
- DN_BH_WUNLOCK();
- if (start)
- free(start, M_DUMMYNET);
- start = NULL;
- if (need > sopt_valsize)
- break;
-
- have = need;
- start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO);
- }
-
- if (start == NULL) {
- if (compat) {
- *compat = NULL;
- error = 1; // XXX
- } else {
- error = sooptcopyout(sopt, cmd, sizeof(*cmd));
- }
- goto done;
- }
- ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, "
- "%d:%d si %d, %d:%d queues %d",
- dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH,
- dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK,
- dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS,
- dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I,
- dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE);
- sopt->sopt_valsize = sopt_valsize;
- a.type = cmd->subtype;
-
- if (compat == NULL) {
- bcopy(cmd, start, sizeof(*cmd));
- ((struct dn_id*)(start))->len = sizeof(struct dn_id);
- buf = start + sizeof(*cmd);
- } else
- buf = start;
- a.start = &buf;
- a.end = start + have;
- /* start copying other objects */
- if (compat) {
- a.type = DN_COMPAT_PIPE;
- dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a);
- a.type = DN_COMPAT_QUEUE;
- dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a);
- } else if (a.type == DN_FS) {
- dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a);
- } else {
- dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a);
- }
- DN_BH_WUNLOCK();
-
- if (compat) {
- *compat = start;
- sopt->sopt_valsize = buf - start;
- /* free() is done by ip_dummynet_compat() */
- start = NULL; //XXX hack
- } else {
- error = sooptcopyout(sopt, start, buf - start);
- }
-done:
- if (cmd && cmd != &r.o)
- free(cmd, M_DUMMYNET);
- if (start)
- free(start, M_DUMMYNET);
- return error;
-}
-
-/* Callback called on scheduler instance to delete it if idle */
-static int
-drain_scheduler_cb(void *_si, void *arg)
-{
- struct dn_sch_inst *si = _si;
-
- if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL)
- return 0;
-
- if (si->sched->fp->flags & DN_MULTIQUEUE) {
- if (si->q_count == 0)
- return si_destroy(si, NULL);
- else
- return 0;
- } else { /* !DN_MULTIQUEUE */
- if ((si+1)->ni.length == 0)
- return si_destroy(si, NULL);
- else
- return 0;
- }
- return 0; /* unreachable */
-}
-
-/* Callback called on scheduler to check if it has instances */
-static int
-drain_scheduler_sch_cb(void *_s, void *arg)
-{
- struct dn_schk *s = _s;
-
- if (s->sch.flags & DN_HAVE_MASK) {
- dn_ht_scan_bucket(s->siht, &s->drain_bucket,
- drain_scheduler_cb, NULL);
- s->drain_bucket++;
- } else {
- if (s->siht) {
- if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL)
- s->siht = NULL;
- }
- }
- return 0;
-}
-
-/* Called every tick, try to delete a 'bucket' of idle scheduler instances */
-void
-dn_drain_scheduler(void)
-{
- dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch,
- drain_scheduler_sch_cb, NULL);
- dn_cfg.drain_sch++;
-}
-
-/* Callback called on queue to delete if it is idle */
-static int
-drain_queue_cb(void *_q, void *arg)
-{
- struct dn_queue *q = _q;
-
- if (q->ni.length == 0) {
- dn_delete_queue(q, DN_DESTROY);
- return DNHT_SCAN_DEL; /* queue is deleted */
- }
-
- return 0; /* queue isn't deleted */
-}
-
-/* Callback called on flowset used to check if it has queues */
-static int
-drain_queue_fs_cb(void *_fs, void *arg)
-{
- struct dn_fsk *fs = _fs;
-
- if (fs->fs.flags & DN_QHT_HASH) {
- /* Flowset has a hash table for queues */
- dn_ht_scan_bucket(fs->qht, &fs->drain_bucket,
- drain_queue_cb, NULL);
- fs->drain_bucket++;
- } else {
- /* No hash table for this flowset, null the pointer
- * if the queue is deleted
- */
- if (fs->qht) {
- if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL)
- fs->qht = NULL;
- }
- }
- return 0;
-}
-
-/* Called every tick, try to delete a 'bucket' of idle queues */
-void
-dn_drain_queue(void)
-{
- /* scan a bucket of flowset */
- dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs,
- drain_queue_fs_cb, NULL);
- dn_cfg.drain_fs++;
-}
-
-/*
- * Handler for the various dummynet socket options
- */
-static int
-ip_dn_ctl(struct sockopt *sopt)
-{
- void *p = NULL;
- int error, l;
-
- error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
- if (error)
- return (error);
-
- /* Disallow sets in really-really secure mode. */
- if (sopt->sopt_dir == SOPT_SET) {
- error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
- if (error)
- return (error);
- }
-
- switch (sopt->sopt_name) {
- default :
- D("dummynet: unknown option %d", sopt->sopt_name);
- error = EINVAL;
- break;
-
- case IP_DUMMYNET_FLUSH:
- case IP_DUMMYNET_CONFIGURE:
- case IP_DUMMYNET_DEL: /* remove a pipe or queue */
- case IP_DUMMYNET_GET:
- D("dummynet: compat option %d", sopt->sopt_name);
- error = ip_dummynet_compat(sopt);
- break;
-
- case IP_DUMMYNET3 :
- if (sopt->sopt_dir == SOPT_GET) {
- error = dummynet_get(sopt, NULL);
- break;
- }
- l = sopt->sopt_valsize;
- if (l < sizeof(struct dn_id) || l > 12000) {
- D("argument len %d invalid", l);
- break;
- }
- p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ?
- error = sooptcopyin(sopt, p, l, l);
- if (error)
- break ;
- error = do_config(p, l);
- break;
- }
-
- if (p != NULL)
- free(p, M_TEMP);
-
- return error ;
-}
-
-
-static void
-ip_dn_init(void)
-{
- if (dn_cfg.init_done)
- return;
- printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet);
- dn_cfg.init_done = 1;
- /* Set defaults here. MSVC does not accept initializers,
- * and this is also useful for vimages
- */
- /* queue limits */
- dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */
- dn_cfg.byte_limit = 1024 * 1024;
- dn_cfg.expire = 1;
-
- /* RED parameters */
- dn_cfg.red_lookup_depth = 256; /* default lookup table depth */
- dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */
- dn_cfg.red_max_pkt_size = 1500; /* default max packet size */
-
- /* hash tables */
- dn_cfg.max_hash_size = 65536; /* max in the hash tables */
- dn_cfg.hash_size = 64; /* default hash size */
-
- /* create hash tables for schedulers and flowsets.
- * In both we search by key and by pointer.
- */
- dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size,
- offsetof(struct dn_schk, schk_next),
- schk_hash, schk_match, schk_new);
- dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size,
- offsetof(struct dn_fsk, fsk_next),
- fsk_hash, fsk_match, fsk_new);
-
- /* bucket index to drain object */
- dn_cfg.drain_fs = 0;
- dn_cfg.drain_sch = 0;
-
- heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
- SLIST_INIT(&dn_cfg.fsu);
- SLIST_INIT(&dn_cfg.schedlist);
-
- DN_LOCK_INIT();
-
- TASK_INIT(&dn_task, 0, dummynet_task, curvnet);
- dn_tq = taskqueue_create("dummynet", M_WAITOK,
- taskqueue_thread_enqueue, &dn_tq);
- taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");
-
- callout_init(&dn_timeout, CALLOUT_MPSAFE);
- callout_reset(&dn_timeout, 1, dummynet, NULL);
-
- /* Initialize curr_time adjustment mechanics. */
- getmicrouptime(&dn_cfg.prev_t);
-}
-
-static void
-ip_dn_destroy(int last)
-{
- callout_drain(&dn_timeout);
-
- DN_BH_WLOCK();
- if (last) {
- ND("removing last instance\n");
- ip_dn_ctl_ptr = NULL;
- ip_dn_io_ptr = NULL;
- }
-
- dummynet_flush();
- DN_BH_WUNLOCK();
- taskqueue_drain(dn_tq, &dn_task);
- taskqueue_free(dn_tq);
-
- dn_ht_free(dn_cfg.schedhash, 0);
- dn_ht_free(dn_cfg.fshash, 0);
- heap_free(&dn_cfg.evheap);
-
- DN_LOCK_DESTROY();
-}
-
-static int
-dummynet_modevent(module_t mod, int type, void *data)
-{
-
- if (type == MOD_LOAD) {
- if (ip_dn_io_ptr) {
- printf("DUMMYNET already loaded\n");
- return EEXIST ;
- }
- ip_dn_init();
- ip_dn_ctl_ptr = ip_dn_ctl;
- ip_dn_io_ptr = dummynet_io;
- return 0;
- } else if (type == MOD_UNLOAD) {
- ip_dn_destroy(1 /* last */);
- return 0;
- } else
- return EOPNOTSUPP;
-}
-
-/* modevent helpers for the modules */
-static int
-load_dn_sched(struct dn_alg *d)
-{
- struct dn_alg *s;
-
- if (d == NULL)
- return 1; /* error */
- ip_dn_init(); /* just in case, we need the lock */
-
- /* Check that mandatory funcs exists */
- if (d->enqueue == NULL || d->dequeue == NULL) {
- D("missing enqueue or dequeue for %s", d->name);
- return 1;
- }
-
- /* Search if scheduler already exists */
- DN_BH_WLOCK();
- SLIST_FOREACH(s, &dn_cfg.schedlist, next) {
- if (strcmp(s->name, d->name) == 0) {
- D("%s already loaded", d->name);
- break; /* scheduler already exists */
- }
- }
- if (s == NULL)
- SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next);
- DN_BH_WUNLOCK();
- D("dn_sched %s %sloaded", d->name, s ? "not ":"");
- return s ? 1 : 0;
-}
-
-static int
-unload_dn_sched(struct dn_alg *s)
-{
- struct dn_alg *tmp, *r;
- int err = EINVAL;
-
- ND("called for %s", s->name);
-
- DN_BH_WLOCK();
- SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) {
- if (strcmp(s->name, r->name) != 0)
- continue;
- ND("ref_count = %d", r->ref_count);
- err = (r->ref_count != 0) ? EBUSY : 0;
- if (err == 0)
- SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next);
- break;
- }
- DN_BH_WUNLOCK();
- D("dn_sched %s %sunloaded", s->name, err ? "not ":"");
- return err;
-}
-
-int
-dn_sched_modevent(module_t mod, int cmd, void *arg)
-{
- struct dn_alg *sch = arg;
-
- if (cmd == MOD_LOAD)
- return load_dn_sched(sch);
- else if (cmd == MOD_UNLOAD)
- return unload_dn_sched(sch);
- else
- return EINVAL;
-}
-
-static moduledata_t dummynet_mod = {
- "dummynet", dummynet_modevent, NULL
-};
-
-#define DN_SI_SUB SI_SUB_PROTO_IFATTACHDOMAIN
-#define DN_MODEV_ORD (SI_ORDER_ANY - 128) /* after ipfw */
-DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD);
-MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
-MODULE_VERSION(dummynet, 3);
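/*
 * A hypothetical sketch, modelled on the dn_sched_*.c modules, of how a
 * scheduler plugs into dn_sched_modevent() above.  "mysched" and its
 * callbacks are placeholders; a real descriptor also fills the type,
 * flags, *_datalen and remaining callback fields of struct dn_alg.
 */
static struct dn_alg mysched_desc = {
	.name = "MYSCHED",
	.enqueue = mysched_enqueue,	/* mandatory, checked by load_dn_sched() */
	.dequeue = mysched_dequeue,	/* mandatory */
};
static moduledata_t mysched_mod = {
	"dn_mysched", dn_sched_modevent, &mysched_desc
};
DECLARE_MODULE(dn_mysched, mysched_mod, DN_SI_SUB, DN_MODEV_ORD + 1);
MODULE_DEPEND(dn_mysched, dummynet, 3, 3, 3);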
-
-/*
- * Starting up. Done in order after dummynet_modevent() has been called.
- * VNET_SYSINIT is also called for each existing vnet and each new vnet.
- */
-//VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL);
-
-/*
- * Shutdown handlers close up shop. These are done in REVERSE ORDER, but still
- * after dummynet_modevent() has been called. Not called on reboot.
- * VNET_SYSUNINIT is also called for each exiting vnet as it exits,
- * or when the module is unloaded.
- */
-//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL);
-
-/* end of file */
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw2.c b/freebsd/sys/netpfil/ipfw/ip_fw2.c
index 224ba937..a3a11819 100644
--- a/freebsd/sys/netpfil/ipfw/ip_fw2.c
+++ b/freebsd/sys/netpfil/ipfw/ip_fw2.c
@@ -36,7 +36,7 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_ipdivert.h>
#include <rtems/bsd/local/opt_inet.h>
#ifndef INET
-#error IPFIREWALL requires INET.
+#error "IPFIREWALL requires INET"
#endif /* INET */
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_ipsec.h>
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
+#include <sys/counter.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
@@ -54,6 +55,7 @@ __FBSDID("$FreeBSD$");
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
+#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
@@ -61,11 +63,13 @@ __FBSDID("$FreeBSD$");
#include <sys/ucred.h>
#include <net/ethernet.h> /* for ETHERTYPE_IP */
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
-#include <net/pf_mtag.h>
#include <net/pfil.h>
#include <net/vnet.h>
+#include <netpfil/pf/pf_mtag.h>
+
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
@@ -82,7 +86,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
+#include <netinet/in_fib.h>
#ifdef INET6
+#include <netinet6/in6_fib.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/scope6_var.h>
#include <netinet6/ip6_var.h>
@@ -101,10 +107,6 @@ __FBSDID("$FreeBSD$");
* All ipfw global variables are here.
*/
-/* ipfw_vnet_ready controls when we are open for business */
-static VNET_DEFINE(int, ipfw_vnet_ready) = 0;
-#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready)
-
static VNET_DEFINE(int, fw_deny_unknown_exthdrs);
#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs)
@@ -121,9 +123,20 @@ VNET_DEFINE(int, autoinc_step);
VNET_DEFINE(int, fw_one_pass) = 1;
VNET_DEFINE(unsigned int, fw_tables_max);
+VNET_DEFINE(unsigned int, fw_tables_sets) = 0; /* Don't use set-aware tables */
/* Use 128 tables by default */
static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT;
+#ifndef LINEAR_SKIPTO
+static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num,
+ int tablearg, int jump_backwards);
+#define JUMP(ch, f, num, targ, back) jump_fast(ch, f, num, targ, back)
+#else
+static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num,
+ int tablearg, int jump_backwards);
+#define JUMP(ch, f, num, targ, back) jump_linear(ch, f, num, targ, back)
+#endif
+
/*
* Each rule belongs to one of 32 different sets (0..31).
* The variable set_disable contains one bit per set.
@@ -144,6 +157,9 @@ VNET_DEFINE(int, verbose_limit);
/* layer3_chain contains the list of rules for layer 3 */
VNET_DEFINE(struct ip_fw_chain, layer3_chain);
+/* ipfw_vnet_ready controls when we are open for business */
+VNET_DEFINE(int, ipfw_vnet_ready) = 0;
+
VNET_DEFINE(int, ipfw_nat_ready) = 0;
ipfw_nat_t *ipfw_nat_ptr = NULL;
@@ -156,45 +172,51 @@ ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
#ifdef SYSCTL_NODE
uint32_t dummy_def = IPFW_DEFAULT_RULE;
static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS);
+static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS);
SYSBEGIN(f3)
SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
- CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
+ CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
"Only do a single pass through ipfw when using dummynet(4)");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
- CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
"Rule number auto-increment step");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
- CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose,
+ CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
"Log matches to ipfw rules");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
- CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
"Set upper limit of matches of ipfw rules logged");
SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
&dummy_def, 0,
"The default/max possible rule number.");
-SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_max,
- CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU",
- "Maximum number of tables");
+SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max,
+ CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU",
+ "Maximum number of concurrently used tables");
+SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets,
+ CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
+ 0, 0, sysctl_ipfw_tables_sets, "IU",
+ "Use per-set namespace for tables");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
&default_to_accept, 0,
"Make the default rule accept all packets.");
-TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept);
-TUNABLE_INT("net.inet.ip.fw.tables_max", &default_fw_tables);
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count,
- CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
+TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables);
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count,
+ CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
"Number of static rules");
#ifdef INET6
SYSCTL_DECL(_net_inet6_ip6);
SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
-SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
- CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0,
+SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
+ CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
+ &VNET_NAME(fw_deny_unknown_exthdrs), 0,
"Deny packets with unknown IPv6 Extension Headers");
-SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6,
- CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0,
+SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6,
+ CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
+ &VNET_NAME(fw_permit_single_frag6), 0,
"Permit single packet IPv6 fragments");
#endif /* INET6 */
@@ -352,15 +374,18 @@ tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
}
static int
-iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg)
+iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain,
+ uint32_t *tablearg)
{
+
if (ifp == NULL) /* no iface with this packet, match fails */
- return 0;
+ return (0);
+
/* Check by name or by IP address */
if (cmd->name[0] != '\0') { /* match by name */
if (cmd->name[0] == '\1') /* use tablearg to match */
- return ipfw_lookup_table_extended(chain, cmd->p.glob,
- ifp->if_xname, tablearg, IPFW_TABLE_INTERFACE);
+ return ipfw_lookup_table_extended(chain, cmd->p.kidx, 0,
+ &ifp->if_index, tablearg);
/* Check name */
if (cmd->p.glob) {
if (fnmatch(cmd->name, ifp->if_xname, 0) == 0)
@@ -370,7 +395,7 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uin
return(1);
}
} else {
-#ifdef __FreeBSD__ /* and OSX too ? */
+#if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? */
struct ifaddr *ia;
if_addr_rlock(ifp);
@@ -413,50 +438,33 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uin
static int
verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
{
-#ifndef __FreeBSD__
+#if defined(USERSPACE) || !defined(__FreeBSD__)
return 0;
#else
- struct route ro;
- struct sockaddr_in *dst;
-
- bzero(&ro, sizeof(ro));
-
- dst = (struct sockaddr_in *)&(ro.ro_dst);
- dst->sin_family = AF_INET;
- dst->sin_len = sizeof(*dst);
- dst->sin_addr = src;
- in_rtalloc_ign(&ro, 0, fib);
+ struct nhop4_basic nh4;
- if (ro.ro_rt == NULL)
- return 0;
+ if (fib4_lookup_nh_basic(fib, src, NHR_IFAIF, 0, &nh4) != 0)
+ return (0);
/*
* If ifp is provided, check for equality with rtentry.
* We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
* in order to pass packets injected back by if_simloop():
- * if useloopback == 1 routing entry (via lo0) for our own address
+ * routing entry (via lo0) for our own address
- * may exist, so we need to handle routing asymmetry.
*/
- if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
- RTFREE(ro.ro_rt);
- return 0;
- }
+ if (ifp != NULL && ifp != nh4.nh_ifp)
+ return (0);
/* if no ifp provided, check if rtentry is not default route */
- if (ifp == NULL &&
- satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) {
- RTFREE(ro.ro_rt);
- return 0;
- }
+ if (ifp == NULL && (nh4.nh_flags & NHF_DEFAULT) != 0)
+ return (0);
/* or if this is a blackhole/reject route */
- if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
- RTFREE(ro.ro_rt);
- return 0;
- }
+ if (ifp == NULL && (nh4.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0)
+ return (0);
/* found valid route */
- RTFREE(ro.ro_rt);
return 1;
#endif /* __FreeBSD__ */
}
@@ -482,79 +490,62 @@ flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
}
/* support for IP6_*_ME opcodes */
+static const struct in6_addr lla_mask = {{{
+ 0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+}}};
+
static int
-search_ip6_addr_net (struct in6_addr * ip6_addr)
+ipfw_localip6(struct in6_addr *in6)
{
- struct ifnet *mdc;
- struct ifaddr *mdc2;
- struct in6_ifaddr *fdm;
- struct in6_addr copia;
-
- TAILQ_FOREACH(mdc, &V_ifnet, if_link) {
- if_addr_rlock(mdc);
- TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) {
- if (mdc2->ifa_addr->sa_family == AF_INET6) {
- fdm = (struct in6_ifaddr *)mdc2;
- copia = fdm->ia_addr.sin6_addr;
- /* need for leaving scope_id in the sock_addr */
- in6_clearscope(&copia);
- if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) {
- if_addr_runlock(mdc);
- return 1;
- }
- }
+ struct rm_priotracker in6_ifa_tracker;
+ struct in6_ifaddr *ia;
+
+ if (IN6_IS_ADDR_MULTICAST(in6))
+ return (0);
+
+ if (!IN6_IS_ADDR_LINKLOCAL(in6))
+ return (in6_localip(in6));
+
+ IN6_IFADDR_RLOCK(&in6_ifa_tracker);
+ TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
+ if (!IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr))
+ continue;
+ if (IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
+ in6, &lla_mask)) {
+ IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
+ return (1);
}
- if_addr_runlock(mdc);
}
- return 0;
+ IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
+ return (0);
}
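The lla_mask above zeroes bytes 2 and 3 because, in the kernel's in6_ifaddr list, a link-local address carries its zone (scope) id embedded in exactly those bytes; IN6_ARE_MASKED_ADDR_EQUAL() then compares the addresses while ignoring them. A minimal user-level equivalent of that masked comparison, for illustration only:

#include <netinet/in.h>

/* Sketch: byte-wise version of a masked in6_addr comparison. */
static int
in6_masked_equal_sketch(const struct in6_addr *a, const struct in6_addr *b,
    const struct in6_addr *mask)
{
	int i;

	for (i = 0; i < 16; i++)
		if ((a->s6_addr[i] ^ b->s6_addr[i]) & mask->s6_addr[i])
			return (0);
	return (1);	/* equal everywhere the mask has bits set */
}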
static int
verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib)
{
- struct route_in6 ro;
- struct sockaddr_in6 *dst;
+ struct nhop6_basic nh6;
- bzero(&ro, sizeof(ro));
-
- dst = (struct sockaddr_in6 * )&(ro.ro_dst);
- dst->sin6_family = AF_INET6;
- dst->sin6_len = sizeof(*dst);
- dst->sin6_addr = *src;
+ if (IN6_IS_SCOPE_LINKLOCAL(src))
+ return (1);
- in6_rtalloc_ign(&ro, 0, fib);
- if (ro.ro_rt == NULL)
- return 0;
+ if (fib6_lookup_nh_basic(fib, src, 0, NHR_IFAIF, 0, &nh6) != 0)
+ return (0);
- /*
- * if ifp is provided, check for equality with rtentry
- * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
- * to support the case of sending packets to an address of our own.
- * (where the former interface is the first argument of if_simloop()
- * (=ifp), the latter is lo0)
- */
- if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
- RTFREE(ro.ro_rt);
- return 0;
- }
+ /* If ifp is provided, check for equality with route table. */
+ if (ifp != NULL && ifp != nh6.nh_ifp)
+ return (0);
/* if no ifp provided, check if rtentry is not default route */
- if (ifp == NULL &&
- IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) {
- RTFREE(ro.ro_rt);
- return 0;
- }
+ if (ifp == NULL && (nh6.nh_flags & NHF_DEFAULT) != 0)
+ return (0);
/* or if this is a blackhole/reject route */
- if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
- RTFREE(ro.ro_rt);
- return 0;
- }
+ if (ifp == NULL && (nh6.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0)
+ return (0);
/* found valid route */
- RTFREE(ro.ro_rt);
return 1;
-
}
static int
@@ -632,8 +623,6 @@ send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
m_adj(m, args->L3offset);
#endif
if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
- /* We need the IP header in host order for icmp_error(). */
- SET_HOST_IPLEN(ip);
icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
} else if (args->f_id.proto == IPPROTO_TCP) {
struct tcphdr *const tcp =
@@ -666,6 +655,9 @@ static int
check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp,
struct ucred **uc)
{
+#if defined(USERSPACE)
+ return 0; // not supported in userspace
+#else
#ifndef __FreeBSD__
/* XXX */
return cred_check(insn, proto, oif,
@@ -776,6 +768,7 @@ check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp,
#endif /* __rtems__ */
return (match);
#endif /* __FreeBSD__ */
+#endif /* not supported in userspace */
}
/*
@@ -793,9 +786,10 @@ set_match(struct ip_fw_args *args, int slot,
args->rule.rulenum = chain->map[slot]->rulenum;
}
+#ifndef LINEAR_SKIPTO
/*
* Helper function to enable cached rule lookups using
- * x_next and next_rule fields in ipfw rule.
+ * cached_id and cached_pos fields in ipfw rule.
*/
static int
jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num,
@@ -803,28 +797,51 @@ jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num,
{
int f_pos;
- /* If possible use cached f_pos (in f->next_rule),
- * whose version is written in f->next_rule
+ /* If possible use cached f_pos (in f->cached_pos),
+ * whose version is written in f->cached_id
* (horrible hacks to avoid changing the ABI).
*/
- if (num != IP_FW_TABLEARG && (uintptr_t)f->x_next == chain->id)
- f_pos = (uintptr_t)f->next_rule;
+ if (num != IP_FW_TARG && f->cached_id == chain->id)
+ f_pos = f->cached_pos;
else {
- int i = IP_FW_ARG_TABLEARG(num);
+ int i = IP_FW_ARG_TABLEARG(chain, num, skipto);
/* make sure we do not jump backward */
if (jump_backwards == 0 && i <= f->rulenum)
i = f->rulenum + 1;
- f_pos = ipfw_find_rule(chain, i, 0);
+ if (chain->idxmap != NULL)
+ f_pos = chain->idxmap[i];
+ else
+ f_pos = ipfw_find_rule(chain, i, 0);
/* update the cache */
- if (num != IP_FW_TABLEARG) {
- f->next_rule = (void *)(uintptr_t)f_pos;
- f->x_next = (void *)(uintptr_t)chain->id;
+ if (num != IP_FW_TARG) {
+ f->cached_id = chain->id;
+ f->cached_pos = f_pos;
}
}
return (f_pos);
}
+#else
+/*
+ * Helper function to enable real fast rule lookups.
+ */
+static int
+jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num,
+ int tablearg, int jump_backwards)
+{
+ int f_pos;
+
+ num = IP_FW_ARG_TABLEARG(chain, num, skipto);
+ /* make sure we do not jump backward */
+ if (jump_backwards == 0 && num <= f->rulenum)
+ num = f->rulenum + 1;
+ f_pos = chain->idxmap[num];
+
+ return (f_pos);
+}
+#endif
+#define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f)
/*
* The main check routine for the firewall.
*
@@ -929,7 +946,7 @@ ipfw_chk(struct ip_fw_args *args)
* offset == 0 means that (if this is an IPv4 packet)
* this is the first or only fragment.
* For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header
- * or there is a single packet fragement (fragement header added
+ * or there is a single packet fragment (fragment header added
* without being needed). We will treat a single packet fragment as if
* there was no fragment header (or log/block depending on the
* V_fw_permit_single_frag6 sysctl setting).
@@ -964,6 +981,7 @@ ipfw_chk(struct ip_fw_args *args)
* MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
*/
int dyn_dir = MATCH_UNKNOWN;
+ uint16_t dyn_name = 0;
ipfw_dyn_rule *q = NULL;
struct ip_fw_chain *chain = &V_layer3_chain;
@@ -984,6 +1002,7 @@ ipfw_chk(struct ip_fw_args *args)
int is_ipv4 = 0;
int done = 0; /* flag to exit the outer loop */
+ IPFW_RLOCK_TRACKER;
if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
return (IP_FW_PASS); /* accept */
@@ -1249,9 +1268,9 @@ do { \
args->f_id.dst_port = dst_port = ntohs(dst_port);
}
- IPFW_RLOCK(chain);
+ IPFW_PF_RLOCK(chain);
if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
- IPFW_RUNLOCK(chain);
+ IPFW_PF_RUNLOCK(chain);
return (IP_FW_PASS); /* accept */
}
if (args->rule.slot) {
@@ -1471,9 +1490,10 @@ do { \
proto != IPPROTO_UDP)
break;
else if (v == 2)
- key = htonl(dst_port);
+ key = dst_port;
else if (v == 3)
- key = htonl(src_port);
+ key = src_port;
+#ifndef USERSPACE
else if (v == 4 || v == 5) {
check_uidgid(
(ipfw_insn_u32 *)cmd,
@@ -1499,8 +1519,9 @@ do { \
else if (v == 5 /* O_JAIL */)
key = ucred_cache.xid;
#endif /* !__FreeBSD__ */
- key = htonl(key);
- } else
+ }
+#endif /* !USERSPACE */
+ else
break;
}
match = ipfw_lookup_table(chain,
@@ -1517,8 +1538,9 @@ do { \
void *pkey = (cmd->opcode == O_IP_DST_LOOKUP) ?
&args->f_id.dst_ip6: &args->f_id.src_ip6;
match = ipfw_lookup_table_extended(chain,
- cmd->arg1, pkey, &v,
- IPFW_TABLE_CIDR);
+ cmd->arg1,
+ sizeof(struct in6_addr),
+ pkey, &v);
if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
match = ((ipfw_insn_u32 *)cmd)->d[0] == v;
if (match)
@@ -1526,6 +1548,17 @@ do { \
}
break;
+ case O_IP_FLOW_LOOKUP:
+ {
+ uint32_t v = 0;
+ match = ipfw_lookup_table_extended(chain,
+ cmd->arg1, 0, &args->f_id, &v);
+ if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
+ match = ((ipfw_insn_u32 *)cmd)->d[0] == v;
+ if (match)
+ tablearg = v;
+ }
+ break;
case O_IP_SRC_MASK:
case O_IP_DST_MASK:
if (is_ipv4) {
@@ -1551,7 +1584,7 @@ do { \
#ifdef INET6
/* FALLTHROUGH */
case O_IP6_SRC_ME:
- match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
+ match= is_ipv6 && ipfw_localip6(&args->f_id.src_ip6);
#endif
break;
@@ -1590,7 +1623,7 @@ do { \
#ifdef INET6
/* FALLTHROUGH */
case O_IP6_DST_ME:
- match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
+ match= is_ipv6 && ipfw_localip6(&args->f_id.dst_ip6);
#endif
break;
@@ -1697,7 +1730,7 @@ do { \
break;
/* DSCP bitmask is stored as low_u32 high_u32 */
- if (x > 32)
+ if (x >= 32)
match = *(p + 1) & (1 << (x - 32));
else
match = *p & (1 << x);
@@ -1732,9 +1765,11 @@ do { \
break;
case O_TCPOPTS:
- PULLUP_LEN(hlen, ulp, (TCP(ulp)->th_off << 2));
- match = (proto == IPPROTO_TCP && offset == 0 &&
- tcpopts_match(TCP(ulp), cmd));
+ if (proto == IPPROTO_TCP && offset == 0 && ulp){
+ PULLUP_LEN(hlen, ulp,
+ (TCP(ulp)->th_off << 2));
+ match = tcpopts_match(TCP(ulp), cmd);
+ }
break;
case O_TCPSEQ:
@@ -1778,27 +1813,37 @@ do { \
case O_ALTQ: {
struct pf_mtag *at;
+ struct m_tag *mtag;
ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+ /*
+ * ALTQ uses mbuf tags from another
+ * packet filtering system - pf(4).
+ * We allocate a tag in its format
+ * and fill it in, pretending to be pf(4).
+ */
match = 1;
at = pf_find_mtag(m);
if (at != NULL && at->qid != 0)
break;
- at = pf_get_mtag(m);
- if (at == NULL) {
+ mtag = m_tag_get(PACKET_TAG_PF,
+ sizeof(struct pf_mtag), M_NOWAIT | M_ZERO);
+ if (mtag == NULL) {
/*
* Let the packet fall back to the
* default ALTQ.
*/
break;
}
+ m_tag_prepend(m, mtag);
+ at = (struct pf_mtag *)(mtag + 1);
at->qid = altq->qid;
at->hdr = ip;
break;
}
case O_LOG:
- ipfw_log(f, hlen, args, m,
+ ipfw_log(chain, f, hlen, args, m,
oif, offset | ip6f_mf, tablearg, ip);
match = 1;
break;
@@ -1920,7 +1965,7 @@ do { \
case O_TAG: {
struct m_tag *mtag;
- uint32_t tag = IP_FW_ARG_TABLEARG(cmd->arg1);
+ uint32_t tag = TARG(cmd->arg1, tag);
/* Packet is already tagged with this tag? */
mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
@@ -1954,6 +1999,7 @@ do { \
break;
case O_SOCKARG: {
+#ifndef USERSPACE /* not supported in userspace */
struct inpcb *inp = args->inp;
struct inpcbinfo *pi;
@@ -1972,7 +2018,7 @@ do { \
* certainly be inp_user_cookie?
*/
- /* For incomming packet, lookup up the
+ /* For incoming packet, look up the
inpcb using the src/dest ip/port tuple */
if (inp == NULL) {
inp = in_pcblookup(pi,
@@ -1994,12 +2040,13 @@ do { \
match = 1;
}
}
+#endif /* !USERSPACE */
break;
}
case O_TAGGED: {
struct m_tag *mtag;
- uint32_t tag = IP_FW_ARG_TABLEARG(cmd->arg1);
+ uint32_t tag = TARG(cmd->arg1, tag);
if (cmdlen == 1) {
match = m_tag_locate(m, MTAG_IPFW,
@@ -2070,7 +2117,7 @@ do { \
*/
case O_LIMIT:
case O_KEEP_STATE:
- if (ipfw_install_state(f,
+ if (ipfw_install_state(chain, f,
(ipfw_insn_limit *)cmd, args, tablearg)) {
/* error or limit violation */
retval = IP_FW_DENY;
@@ -2085,17 +2132,35 @@ do { \
/*
* dynamic rules are checked at the first
* keep-state or check-state occurrence,
- * with the result being stored in dyn_dir.
+ * with the result being stored in dyn_dir
+ * and dyn_name.
* The compiler introduces a PROBE_STATE
* instruction for us when we have a
* KEEP_STATE (because PROBE_STATE needs
* to be run first).
+ *
+ * (dyn_dir == MATCH_UNKNOWN) means this is
+ * first lookup for such f_id. Do lookup.
+ *
+ * (dyn_dir != MATCH_UNKNOWN &&
+ * dyn_name != 0 && dyn_name != cmd->arg1)
+ * means previous lookup didn't find dynamic
+ * rule for specific state name and current
+ * lookup will search rule with another state
+ * name. Redo lookup.
+ *
+ * (dyn_dir != MATCH_UNKNOWN && dyn_name == 0)
+ * means previous lookup was for `any' name
+ * and it didn't find rule. No need to do
+ * lookup again.
*/
- if (dyn_dir == MATCH_UNKNOWN &&
+ if ((dyn_dir == MATCH_UNKNOWN ||
+ (dyn_name != 0 &&
+ dyn_name != cmd->arg1)) &&
(q = ipfw_lookup_dyn_rule(&args->f_id,
&dyn_dir, proto == IPPROTO_TCP ?
- TCP(ulp) : NULL))
- != NULL) {
+ TCP(ulp): NULL,
+ (dyn_name = cmd->arg1))) != NULL) {
/*
* Found dynamic entry, update stats
* and jump to the 'action' part of
@@ -2137,7 +2202,7 @@ do { \
case O_PIPE:
case O_QUEUE:
set_match(args, f_pos, chain);
- args->rule.info = IP_FW_ARG_TABLEARG(cmd->arg1);
+ args->rule.info = TARG(cmd->arg1, pipe);
if (cmd->opcode == O_PIPE)
args->rule.info |= IPFW_IS_PIPE;
if (V_fw_one_pass)
@@ -2157,7 +2222,7 @@ do { \
retval = (cmd->opcode == O_DIVERT) ?
IP_FW_DIVERT : IP_FW_TEE;
set_match(args, f_pos, chain);
- args->rule.info = IP_FW_ARG_TABLEARG(cmd->arg1);
+ args->rule.info = TARG(cmd->arg1, divert);
break;
case O_COUNT:
@@ -2167,7 +2232,7 @@ do { \
case O_SKIPTO:
IPFW_INC_RULE_COUNTER(f, pktlen);
- f_pos = jump_fast(chain, f, cmd->arg1, tablearg, 0);
+ f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0);
/*
* Skip disabled rules, and re-enter
* the inner loop with the correct
@@ -2256,7 +2321,7 @@ do { \
if (IS_CALL) {
stack[mtag->m_tag_id] = f->rulenum;
mtag->m_tag_id++;
- f_pos = jump_fast(chain, f, cmd->arg1,
+ f_pos = JUMP(chain, f, cmd->arg1,
tablearg, 1);
} else { /* `return' action */
mtag->m_tag_id--;
@@ -2328,13 +2393,48 @@ do { \
if (q == NULL || q->rule != f ||
dyn_dir == MATCH_FORWARD) {
struct sockaddr_in *sa;
+
sa = &(((ipfw_insn_sa *)cmd)->sa);
if (sa->sin_addr.s_addr == INADDR_ANY) {
- bcopy(sa, &args->hopstore,
- sizeof(*sa));
- args->hopstore.sin_addr.s_addr =
- htonl(tablearg);
- args->next_hop = &args->hopstore;
+#ifdef INET6
+ /*
+ * We use O_FORWARD_IP opcode for
+ * fwd rule with tablearg, but tables
+ * now support IPv6 addresses. And
+ * when we are inspecting IPv6 packet,
+ * we can use nh6 field from
+ * table_value as next_hop6 address.
+ */
+ if (is_ipv6) {
+ struct sockaddr_in6 *sa6;
+
+ sa6 = args->next_hop6 =
+ &args->hopstore6;
+ sa6->sin6_family = AF_INET6;
+ sa6->sin6_len = sizeof(*sa6);
+ sa6->sin6_addr = TARG_VAL(
+ chain, tablearg, nh6);
+ /*
+ * Set sin6_scope_id only for
+ * link-local unicast addresses.
+ */
+ if (IN6_IS_ADDR_LINKLOCAL(
+ &sa6->sin6_addr))
+ sa6->sin6_scope_id =
+ TARG_VAL(chain,
+ tablearg,
+ zoneid);
+ } else
+#endif
+ {
+ sa = args->next_hop =
+ &args->hopstore;
+ sa->sin_family = AF_INET;
+ sa->sin_len = sizeof(*sa);
+ sa->sin_addr.s_addr = htonl(
+ TARG_VAL(chain, tablearg,
+ nh4));
+ }
} else {
args->next_hop = sa;
}
@@ -2364,7 +2464,7 @@ do { \
case O_NETGRAPH:
case O_NGTEE:
set_match(args, f_pos, chain);
- args->rule.info = IP_FW_ARG_TABLEARG(cmd->arg1);
+ args->rule.info = TARG(cmd->arg1, netgraph);
if (V_fw_one_pass)
args->rule.info |= IPFW_ONEPASS;
retval = (cmd->opcode == O_NETGRAPH) ?
@@ -2377,7 +2477,7 @@ do { \
uint32_t fib;
IPFW_INC_RULE_COUNTER(f, pktlen);
- fib = IP_FW_ARG_TABLEARG(cmd->arg1);
+ fib = TARG(cmd->arg1, fib) & 0x7FFF;
if (fib >= rt_numfibs)
fib = 0;
M_SETFIB(m, fib);
@@ -2389,15 +2489,16 @@ do { \
case O_SETDSCP: {
uint16_t code;
- code = IP_FW_ARG_TABLEARG(cmd->arg1) & 0x3F;
+ code = TARG(cmd->arg1, dscp) & 0x3F;
l = 0; /* exit inner loop */
if (is_ipv4) {
- uint16_t a;
+ uint16_t old;
- a = ip->ip_tos;
- ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03);
- a += ntohs(ip->ip_sum) - ip->ip_tos;
- ip->ip_sum = htons(a);
+ old = *(uint16_t *)ip;
+ ip->ip_tos = (code << 2) |
+ (ip->ip_tos & 0x03);
+ ip->ip_sum = cksum_adjust(ip->ip_sum,
+ old, *(uint16_t *)ip);
} else if (is_ipv6) {
uint8_t *v;
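The O_SETDSCP branch above rewrites the TOS byte and then patches ip_sum incrementally over the first 16-bit word of the IPv4 header instead of recomputing the whole header checksum. A standalone sketch of that RFC 1624 style update, equivalent in spirit to the cksum_adjust() call (the real helper is defined elsewhere in the tree):

#include <stdint.h>

/* Sketch: incremental one's-complement update, HC' = ~(~HC + ~m + m'). */
static uint16_t
cksum_adjust_sketch(uint16_t cksum, uint16_t old_word, uint16_t new_word)
{
	uint32_t sum;

	sum = (uint16_t)~cksum + (uint16_t)~old_word + new_word;
	sum = (sum & 0xffff) + (sum >> 16);	/* fold the carries back in */
	sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}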
@@ -2425,20 +2526,20 @@ do { \
set_match(args, f_pos, chain);
/* Check if this is 'global' nat rule */
- if (cmd->arg1 == 0) {
+ if (cmd->arg1 == IP_FW_NAT44_GLOBAL) {
retval = ipfw_nat_ptr(args, NULL, m);
break;
}
t = ((ipfw_insn_nat *)cmd)->nat;
if (t == NULL) {
- nat_id = IP_FW_ARG_TABLEARG(cmd->arg1);
+ nat_id = TARG(cmd->arg1, nat);
t = (*lookup_nat_ptr)(&chain->nat, nat_id);
if (t == NULL) {
retval = IP_FW_DENY;
break;
}
- if (cmd->arg1 != IP_FW_TABLEARG)
+ if (cmd->arg1 != IP_FW_TARG)
((ipfw_insn_nat *)cmd)->nat = t;
}
retval = ipfw_nat_ptr(args, t, m);
@@ -2454,11 +2555,6 @@ do { \
/* if not fragmented, go to next rule */
if ((ip_off & (IP_MF | IP_OFFMASK)) == 0)
break;
- /*
- * ip_reass() expects len & off in host
- * byte order.
- */
- SET_HOST_IPLEN(ip);
args->m = m = ip_reass(m);
@@ -2472,7 +2568,6 @@ do { \
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
- SET_NET_IPLEN(ip);
ip->ip_sum = 0;
if (hlen == sizeof(struct ip))
ip->ip_sum = in_cksum_hdr(ip);
@@ -2484,6 +2579,11 @@ do { \
done = 1; /* exit outer loop */
break;
}
+ case O_EXTERNAL_ACTION:
+ l = 0; /* in any case exit inner loop */
+ retval = ipfw_run_eaction(chain, args,
+ cmd, &done);
+ break;
default:
panic("-- unknown opcode %d\n", cmd->opcode);
@@ -2521,7 +2621,7 @@ do { \
retval = IP_FW_DENY;
printf("ipfw: ouch!, skip past end of rules, denying packet\n");
}
- IPFW_RUNLOCK(chain);
+ IPFW_PF_RUNLOCK(chain);
#ifdef __FreeBSD__
if (ucred_cache != NULL)
crfree(ucred_cache);
@@ -2553,7 +2653,27 @@ sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS)
return (ipfw_resize_tables(&V_layer3_chain, ntables));
}
+
+/*
+ * Switches table namespace between global and per-set.
+ */
+static int
+sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ unsigned int sets;
+
+ sets = V_fw_tables_sets;
+
+ error = sysctl_handle_int(oidp, &sets, 0, req);
+ /* Read operation or some error */
+ if ((error != 0) || (req->newptr == NULL))
+ return (error);
+
+ return (ipfw_switch_tables_namespace(&V_layer3_chain, sets));
+}
#endif
+
/*
* Module and VNET glue
*/
@@ -2607,7 +2727,8 @@ ipfw_init(void)
if (default_fw_tables > IPFW_TABLES_MAX)
default_fw_tables = IPFW_TABLES_MAX;
- ipfw_log_bpf(1); /* init */
+ ipfw_init_sopt_handler();
+ ipfw_iface_init();
return (error);
}
@@ -2619,7 +2740,8 @@ static void
ipfw_destroy(void)
{
- ipfw_log_bpf(0); /* uninit */
+ ipfw_iface_destroy();
+ ipfw_destroy_sopt_handler();
printf("IP firewall unloaded\n");
}
#endif /* __rtems__ */
@@ -2631,12 +2753,14 @@ ipfw_destroy(void)
static int
vnet_ipfw_init(const void *unused)
{
- int error;
+ int error, first;
struct ip_fw *rule = NULL;
struct ip_fw_chain *chain;
chain = &V_layer3_chain;
+ first = IS_DEFAULT_VNET(curvnet) ? 1 : 0;
+
/* First set up some values that are compile time options */
V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
V_fw_deny_unknown_exthdrs = 1;
@@ -2650,16 +2774,19 @@ vnet_ipfw_init(const void *unused)
LIST_INIT(&chain->nat);
#endif
+ /* Init shared services hash table */
+ ipfw_init_srv(chain);
+
+ ipfw_init_obj_rewriter();
+ ipfw_init_counters();
/* insert the default rule and create the initial map */
chain->n_rules = 1;
- chain->static_len = sizeof(struct ip_fw);
chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_WAITOK | M_ZERO);
- if (chain->map)
- rule = malloc(chain->static_len, M_IPFW, M_WAITOK | M_ZERO);
+ rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw));
/* Set initial number of tables */
V_fw_tables_max = default_fw_tables;
- error = ipfw_init_tables(chain);
+ error = ipfw_init_tables(chain, first);
if (error) {
printf("ipfw2: setting up tables failed\n");
free(chain->map, M_IPFW);
@@ -2676,18 +2803,24 @@ vnet_ipfw_init(const void *unused)
rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
chain->default_rule = chain->map[0] = rule;
chain->id = rule->id = 1;
+ /* Pre-calculate rules length for legacy dump format */
+ chain->static_len = sizeof(struct ip_fw_rule0);
IPFW_LOCK_INIT(chain);
ipfw_dyn_init(chain);
+ ipfw_eaction_init(chain, first);
+#ifdef LINEAR_SKIPTO
+ ipfw_init_skipto_cache(chain);
+#endif
+ ipfw_bpf_init(first);
/* First set up some values that are compile time options */
V_ipfw_vnet_ready = 1; /* Open for business */
/*
- * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr)
- * and pfil hooks for ipv4 and ipv6. Even if the latter two fail
- * we still keep the module alive because the sockopt and
- * layer2 paths are still useful.
+ * Hook the sockopt handler and pfil hooks for ipv4 and ipv6.
+ * Even if the latter two fail we still keep the module alive
+ * because the sockopt and layer2 paths are still useful.
* ipfw[6]_hook return 0 on success, ENOENT on failure,
* so we can ignore the exact return value and just set a flag.
*
@@ -2697,8 +2830,7 @@ vnet_ipfw_init(const void *unused)
* In layer2 we have the same behaviour, except that V_ether_ipfw
* is checked on each packet because there are no pfil hooks.
*/
- V_ip_fw_ctl_ptr = ipfw_ctl;
- V_ip_fw_chk_ptr = ipfw_chk;
+ V_ip_fw_ctl_ptr = ipfw_ctl3;
error = ipfw_attach_hooks(1);
return (error);
}
@@ -2710,9 +2842,9 @@ vnet_ipfw_init(const void *unused)
static int
vnet_ipfw_uninit(const void *unused)
{
- struct ip_fw *reap, *rule;
+ struct ip_fw *reap;
struct ip_fw_chain *chain = &V_layer3_chain;
- int i;
+ int i, last;
V_ipfw_vnet_ready = 0; /* tell new callers to go away */
/*
@@ -2721,33 +2853,39 @@ vnet_ipfw_uninit(const void *unused)
* sure the update is propagated and nobody will be in.
*/
(void)ipfw_attach_hooks(0 /* detach */);
- V_ip_fw_chk_ptr = NULL;
V_ip_fw_ctl_ptr = NULL;
+
+ last = IS_DEFAULT_VNET(curvnet) ? 1 : 0;
+
IPFW_UH_WLOCK(chain);
IPFW_UH_WUNLOCK(chain);
- IPFW_UH_WLOCK(chain);
- IPFW_WLOCK(chain);
ipfw_dyn_uninit(0); /* run the callout_drain */
- IPFW_WUNLOCK(chain);
- ipfw_destroy_tables(chain);
+ IPFW_UH_WLOCK(chain);
+
reap = NULL;
IPFW_WLOCK(chain);
- for (i = 0; i < chain->n_rules; i++) {
- rule = chain->map[i];
- rule->x_next = reap;
- reap = rule;
- }
- if (chain->map)
- free(chain->map, M_IPFW);
+ for (i = 0; i < chain->n_rules; i++)
+ ipfw_reap_add(chain, &reap, chain->map[i]);
+ free(chain->map, M_IPFW);
+#ifdef LINEAR_SKIPTO
+ ipfw_destroy_skipto_cache(chain);
+#endif
IPFW_WUNLOCK(chain);
IPFW_UH_WUNLOCK(chain);
+ ipfw_destroy_tables(chain, last);
+ ipfw_eaction_uninit(chain, last);
if (reap != NULL)
ipfw_reap_rules(reap);
+ vnet_ipfw_iface_destroy(chain);
+ ipfw_destroy_srv(chain);
IPFW_LOCK_DESTROY(chain);
ipfw_dyn_uninit(1); /* free the remaining parts */
- return 0;
+ ipfw_destroy_counters();
+ ipfw_destroy_obj_rewriter();
+ ipfw_bpf_uninit(last);
+ return (0);
}
#endif /* __rtems__ */
@@ -2793,13 +2931,14 @@ static moduledata_t ipfwmod = {
};
/* Define startup order. */
-#define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN
+#define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL
#define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */
#define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */
#define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */
DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER);
-MODULE_VERSION(ipfw, 2);
+FEATURE(ipfw_ctl3, "ipfw new sockopt calls");
+MODULE_VERSION(ipfw, 3);
/* should declare some dependencies here */
/*
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_bpf.c b/freebsd/sys/netpfil/ipfw/ip_fw_bpf.c
new file mode 100644
index 00000000..3127809b
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_bpf.c
@@ -0,0 +1,211 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2016 Yandex LLC
+ * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_pflog.h>
+#include <net/if_var.h>
+#include <net/if_clone.h>
+#include <net/if_types.h>
+#include <net/vnet.h>
+#include <net/bpf.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip_var.h>
+#include <netpfil/ipfw/ip_fw_private.h>
+
+static VNET_DEFINE(struct ifnet *, log_if);
+static VNET_DEFINE(struct ifnet *, pflog_if);
+static VNET_DEFINE(struct if_clone *, ipfw_cloner);
+static VNET_DEFINE(struct if_clone *, ipfwlog_cloner);
+#define V_ipfw_cloner VNET(ipfw_cloner)
+#define V_ipfwlog_cloner VNET(ipfwlog_cloner)
+#define V_log_if VNET(log_if)
+#define V_pflog_if VNET(pflog_if)
+
+static struct rmlock log_if_lock;
+#define LOGIF_LOCK_INIT(x) rm_init(&log_if_lock, "ipfw log_if lock")
+#define LOGIF_LOCK_DESTROY(x) rm_destroy(&log_if_lock)
+#define LOGIF_RLOCK_TRACKER struct rm_priotracker _log_tracker
+#define LOGIF_RLOCK(x) rm_rlock(&log_if_lock, &_log_tracker)
+#define LOGIF_RUNLOCK(x) rm_runlock(&log_if_lock, &_log_tracker)
+#define LOGIF_WLOCK(x) rm_wlock(&log_if_lock)
+#define LOGIF_WUNLOCK(x) rm_wunlock(&log_if_lock)
+
+static const char ipfwname[] = "ipfw";
+static const char ipfwlogname[] = "ipfwlog";
+
+static int
+ipfw_bpf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
+{
+
+ return (EINVAL);
+}
+
+static int
+ipfw_bpf_output(struct ifnet *ifp, struct mbuf *m,
+ const struct sockaddr *dst, struct route *ro)
+{
+
+ if (m != NULL)
+ FREE_PKT(m);
+ return (0);
+}
+
+static void
+ipfw_clone_destroy(struct ifnet *ifp)
+{
+
+ LOGIF_WLOCK();
+ if (ifp->if_hdrlen == ETHER_HDR_LEN)
+ V_log_if = NULL;
+ else
+ V_pflog_if = NULL;
+ LOGIF_WUNLOCK();
+
+ bpfdetach(ifp);
+ if_detach(ifp);
+ if_free(ifp);
+}
+
+static int
+ipfw_clone_create(struct if_clone *ifc, int unit, caddr_t params)
+{
+ struct ifnet *ifp;
+
+ ifp = if_alloc(IFT_PFLOG);
+ if (ifp == NULL)
+ return (ENOSPC);
+ if_initname(ifp, ipfwname, unit);
+ ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+ ifp->if_mtu = 65536;
+ ifp->if_ioctl = ipfw_bpf_ioctl;
+ ifp->if_output = ipfw_bpf_output;
+ ifp->if_hdrlen = ETHER_HDR_LEN;
+ if_attach(ifp);
+ bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
+ LOGIF_WLOCK();
+ if (V_log_if != NULL) {
+ LOGIF_WUNLOCK();
+ bpfdetach(ifp);
+ if_detach(ifp);
+ if_free(ifp);
+ return (EEXIST);
+ }
+ V_log_if = ifp;
+ LOGIF_WUNLOCK();
+ return (0);
+}
+
+static int
+ipfwlog_clone_create(struct if_clone *ifc, int unit, caddr_t params)
+{
+ struct ifnet *ifp;
+
+ ifp = if_alloc(IFT_PFLOG);
+ if (ifp == NULL)
+ return (ENOSPC);
+ if_initname(ifp, ipfwlogname, unit);
+ ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+ ifp->if_mtu = 65536;
+ ifp->if_ioctl = ipfw_bpf_ioctl;
+ ifp->if_output = ipfw_bpf_output;
+ ifp->if_hdrlen = PFLOG_HDRLEN;
+ if_attach(ifp);
+ bpfattach(ifp, DLT_PFLOG, PFLOG_HDRLEN);
+ LOGIF_WLOCK();
+ if (V_pflog_if != NULL) {
+ LOGIF_WUNLOCK();
+ bpfdetach(ifp);
+ if_detach(ifp);
+ if_free(ifp);
+ return (EEXIST);
+ }
+ V_pflog_if = ifp;
+ LOGIF_WUNLOCK();
+ return (0);
+}
+
+void
+ipfw_bpf_mtap2(void *data, u_int dlen, struct mbuf *m)
+{
+ LOGIF_RLOCK_TRACKER;
+
+ LOGIF_RLOCK();
+ if (dlen == ETHER_HDR_LEN) {
+ if (V_log_if == NULL) {
+ LOGIF_RUNLOCK();
+ return;
+ }
+ BPF_MTAP2(V_log_if, data, dlen, m);
+ } else if (dlen == PFLOG_HDRLEN) {
+ if (V_pflog_if == NULL) {
+ LOGIF_RUNLOCK();
+ return;
+ }
+ BPF_MTAP2(V_pflog_if, data, dlen, m);
+ }
+ LOGIF_RUNLOCK();
+}
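In practice the data fed through ipfw_bpf_mtap2() is read from the two cloned interfaces registered below; assuming the default unit naming, that means creating one with ifconfig (e.g. "ifconfig ipfw0 create") and attaching an ordinary BPF reader such as "tcpdump -i ipfw0" for the Ethernet-framed log stream, or ipfwlog0 for the pflog-framed one.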
+
+void
+ipfw_bpf_init(int first)
+{
+
+ if (first) {
+ LOGIF_LOCK_INIT();
+ V_log_if = NULL;
+ V_pflog_if = NULL;
+ }
+ V_ipfw_cloner = if_clone_simple(ipfwname, ipfw_clone_create,
+ ipfw_clone_destroy, 0);
+ V_ipfwlog_cloner = if_clone_simple(ipfwlogname, ipfwlog_clone_create,
+ ipfw_clone_destroy, 0);
+}
+
+void
+ipfw_bpf_uninit(int last)
+{
+
+ if_clone_detach(V_ipfw_cloner);
+ if_clone_detach(V_ipfwlog_cloner);
+ if (last)
+ LOGIF_LOCK_DESTROY();
+}
+
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_dynamic.c b/freebsd/sys/netpfil/ipfw/ip_fw_dynamic.c
new file mode 100644
index 00000000..4696faac
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_dynamic.c
@@ -0,0 +1,1822 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define DEB(x)
+#define DDB(x) x
+
+/*
+ * Dynamic rule support for ipfw
+ */
+
+#include <rtems/bsd/local/opt_ipfw.h>
+#include <rtems/bsd/local/opt_inet.h>
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#include <rtems/bsd/local/opt_inet6.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h> /* ip_defttl */
+#include <netinet/ip_fw.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+
+#include <netinet/ip6.h> /* IN6_ARE_ADDR_EQUAL */
+#ifdef INET6
+#include <netinet6/in6_var.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <netpfil/ipfw/ip_fw_private.h>
+
+#include <machine/in_cksum.h> /* XXX for in_cksum */
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * Description of dynamic rules.
+ *
+ * Dynamic rules are stored in lists accessed through a hash table
+ * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
+ * be modified through the sysctl variable dyn_buckets which is
+ * updated when the table becomes empty.
+ *
+ * XXX currently there is only one list, ipfw_dyn.
+ *
+ * When a packet is received, its address fields are first masked
+ * with the mask defined for the rule, then hashed, then matched
+ * against the entries in the corresponding list.
+ * Dynamic rules can be used for different purposes:
+ * + stateful rules;
+ * + enforcing limits on the number of sessions;
+ * + in-kernel NAT (not implemented yet)
+ *
+ * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
+ * measured in seconds and depending on the flags.
+ *
+ * The total number of dynamic rules is equal to UMA zone items count.
+ * The max number of dynamic rules is dyn_max. When we reach
+ * the maximum number of rules, we do not create any more. This is
+ * done to avoid consuming too much memory, but also too much
+ * time when searching on each packet (ideally, we should try instead
+ * to put a limit on the length of the list on each bucket...).
+ *
+ * Each dynamic rule holds a pointer to the parent ipfw rule so
+ * we know what action to perform. Dynamic rules are removed when
+ * the parent rule is deleted. This can be changed by dyn_keep_states
+ * sysctl.
+ *
+ * There are some limitations with dynamic rules -- we do not
+ * obey the 'randomized match', and we do not do multiple
+ * passes through the firewall. XXX check the latter!!!
+ */
+
+struct ipfw_dyn_bucket {
+ struct mtx mtx; /* Bucket protecting lock */
+ ipfw_dyn_rule *head; /* Pointer to first rule */
+};
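Read operationally, the description above reduces every lookup to: hash the flow id, take that bucket's mutex, walk one singly linked list. A hedged sketch of that path, using names defined further down in this file; flow_ids_match() is a hypothetical stand-in for the inline address/port comparison done in the real code.

	/* Sketch only; the real version is lookup_dyn_rule_locked() below. */
	i = hash_packet(&args->f_id, V_curr_dyn_buckets);
	IPFW_BUCK_LOCK(i);
	for (q = V_ipfw_dyn_v[i].head; q != NULL; q = q->next)
		if (flow_ids_match(&q->id, &args->f_id))	/* hypothetical */
			break;
	/* ... use q, then ... */
	IPFW_BUCK_UNLOCK(i);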
+
+/*
+ * Static variables followed by global ones
+ */
+static VNET_DEFINE(struct ipfw_dyn_bucket *, ipfw_dyn_v);
+static VNET_DEFINE(u_int32_t, dyn_buckets_max);
+static VNET_DEFINE(u_int32_t, curr_dyn_buckets);
+static VNET_DEFINE(struct callout, ipfw_timeout);
+#define V_ipfw_dyn_v VNET(ipfw_dyn_v)
+#define V_dyn_buckets_max VNET(dyn_buckets_max)
+#define V_curr_dyn_buckets VNET(curr_dyn_buckets)
+#define V_ipfw_timeout VNET(ipfw_timeout)
+
+static VNET_DEFINE(uma_zone_t, ipfw_dyn_rule_zone);
+#define V_ipfw_dyn_rule_zone VNET(ipfw_dyn_rule_zone)
+
+#define IPFW_BUCK_LOCK_INIT(b) \
+ mtx_init(&(b)->mtx, "IPFW dynamic bucket", NULL, MTX_DEF)
+#define IPFW_BUCK_LOCK_DESTROY(b) \
+ mtx_destroy(&(b)->mtx)
+#define IPFW_BUCK_LOCK(i) mtx_lock(&V_ipfw_dyn_v[(i)].mtx)
+#define IPFW_BUCK_UNLOCK(i) mtx_unlock(&V_ipfw_dyn_v[(i)].mtx)
+#define IPFW_BUCK_ASSERT(i) mtx_assert(&V_ipfw_dyn_v[(i)].mtx, MA_OWNED)
+
+
+static VNET_DEFINE(int, dyn_keep_states);
+#define V_dyn_keep_states VNET(dyn_keep_states)
+
+/*
+ * Timeouts for various events in handling dynamic rules.
+ */
+static VNET_DEFINE(u_int32_t, dyn_ack_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_syn_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_fin_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_rst_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_udp_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_short_lifetime);
+
+#define V_dyn_ack_lifetime VNET(dyn_ack_lifetime)
+#define V_dyn_syn_lifetime VNET(dyn_syn_lifetime)
+#define V_dyn_fin_lifetime VNET(dyn_fin_lifetime)
+#define V_dyn_rst_lifetime VNET(dyn_rst_lifetime)
+#define V_dyn_udp_lifetime VNET(dyn_udp_lifetime)
+#define V_dyn_short_lifetime VNET(dyn_short_lifetime)
+
+/*
+ * Keepalives are sent if dyn_keepalive is set. They are sent every
+ * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
+ * seconds of lifetime of a rule.
+ * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
+ * than dyn_keepalive_period.
+ */
+
+static VNET_DEFINE(u_int32_t, dyn_keepalive_interval);
+static VNET_DEFINE(u_int32_t, dyn_keepalive_period);
+static VNET_DEFINE(u_int32_t, dyn_keepalive);
+static VNET_DEFINE(time_t, dyn_keepalive_last);
+
+#define V_dyn_keepalive_interval VNET(dyn_keepalive_interval)
+#define V_dyn_keepalive_period VNET(dyn_keepalive_period)
+#define V_dyn_keepalive VNET(dyn_keepalive)
+#define V_dyn_keepalive_last VNET(dyn_keepalive_last)
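A hedged restatement of the timing described above, as a predicate; the real scan lives in the periodic tick further down (outside this excerpt), so treat this purely as documentation of the intended window and pacing.

/*
 * Sketch only: when a state whose expiry is 'expire' gets a keepalive.
 * In the real code the period check is done once per pass (per vnet),
 * not per state.
 */
static int
keepalive_due_sketch(time_t now, time_t expire)
{
	if (!V_dyn_keepalive)
		return (0);
	if (now < V_dyn_keepalive_last + V_dyn_keepalive_period)
		return (0);	/* passes are paced to once per period */
	/* send only in the last dyn_keepalive_interval seconds of lifetime */
	return (now + V_dyn_keepalive_interval >= expire);
}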
+
+static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */
+
+#define DYN_COUNT uma_zone_get_cur(V_ipfw_dyn_rule_zone)
+#define V_dyn_max VNET(dyn_max)
+
+/* for userspace, we emulate the uma_zone_counter with ipfw_dyn_count */
+static int ipfw_dyn_count; /* number of objects */
+
+#ifdef USERSPACE /* emulation of UMA object counters for userspace */
+#define uma_zone_get_cur(x) ipfw_dyn_count
+#endif /* USERSPACE */
+
+static int last_log; /* Log ratelimiting */
+
+static void ipfw_dyn_tick(void *vnetx);
+static void check_dyn_rules(struct ip_fw_chain *, ipfw_range_tlv *, int, int);
+#ifdef SYSCTL_NODE
+
+static int sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS);
+static int sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS);
+
+SYSBEGIN(f2)
+
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_buckets_max), 0,
+ "Max number of dyn. buckets");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
+ CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
+ "Current Number of dyn. buckets");
+SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
+ CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RD, 0, 0, sysctl_ipfw_dyn_count, "IU",
+ "Number of dyn. rules");
+SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
+ CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_dyn_max, "IU",
+ "Max number of dyn. rules");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
+ "Lifetime of dyn. rules for acks");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
+ "Lifetime of dyn. rules for syn");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
+ "Lifetime of dyn. rules for fin");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
+ "Lifetime of dyn. rules for rst");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
+ "Lifetime of dyn. rules for UDP");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
+ "Lifetime of dyn. rules for other situations");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
+ "Enable keepalives for dyn. rules");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keep_states,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0,
+ "Do not flush dynamic states on rule deletion");
+
+SYSEND
+
+#endif /* SYSCTL_NODE */
+
+
+#ifdef INET6
+static __inline int
+hash_packet6(struct ipfw_flow_id *id)
+{
+ u_int32_t i;
+ i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
+ (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
+ (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
+ (id->src_ip6.__u6_addr.__u6_addr32[3]) ^
+ (id->dst_port) ^ (id->src_port);
+ return i;
+}
+#endif
+
+/*
+ * IMPORTANT: the hash function for dynamic rules must be commutative
+ * in source and destination (ip,port), because rules are bidirectional
+ * and we want to find both in the same bucket.
+ */
+static __inline int
+hash_packet(struct ipfw_flow_id *id, int buckets)
+{
+ u_int32_t i;
+
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(id))
+ i = hash_packet6(id);
+ else
+#endif /* INET6 */
+ i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
+ i &= (buckets - 1);
+ return i;
+}
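The commutativity requirement stated above holds because XOR is commutative, so swapping (src, sport) with (dst, dport) cannot change the bucket. A tiny standalone illustration, IPv4-only and simplified from hash_packet():

#include <stdint.h>

/* Sketch: symmetric flow hash; 'buckets' must be a power of two. */
static unsigned
flow_hash_sketch(uint32_t src, uint32_t dst, uint16_t sport, uint16_t dport,
    unsigned buckets)
{
	return ((src ^ dst ^ sport ^ dport) & (buckets - 1));
}

/*
 * flow_hash_sketch(a, b, p, q, n) == flow_hash_sketch(b, a, q, p, n)
 * for all inputs, which is exactly why both directions of a connection
 * land in the same bucket.
 */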
+
+#if 0
+#define DYN_DEBUG(fmt, ...) do { \
+ printf("%s: " fmt "\n", __func__, __VA_ARGS__); \
+} while (0)
+#else
+#define DYN_DEBUG(fmt, ...)
+#endif
+
+static char *default_state_name = "default";
+struct dyn_state_obj {
+ struct named_object no;
+ char name[64];
+};
+
+#define DYN_STATE_OBJ(ch, cmd) \
+ ((struct dyn_state_obj *)SRV_OBJECT(ch, (cmd)->arg1))
+/*
+ * Classifier callback.
+ * Return 0 if opcode contains object that should be referenced
+ * or rewritten.
+ */
+static int
+dyn_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
+{
+
+ DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1);
+ /* Don't rewrite "check-state any" */
+ if (cmd->arg1 == 0 &&
+ cmd->opcode == O_CHECK_STATE)
+ return (1);
+
+ *puidx = cmd->arg1;
+ *ptype = 0;
+ return (0);
+}
+
+static void
+dyn_update(ipfw_insn *cmd, uint16_t idx)
+{
+
+ cmd->arg1 = idx;
+ DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1);
+}
+
+static int
+dyn_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct named_object **pno)
+{
+ ipfw_obj_ntlv *ntlv;
+ const char *name;
+
+ DYN_DEBUG("uidx %d", ti->uidx);
+ if (ti->uidx != 0) {
+ if (ti->tlvs == NULL)
+ return (EINVAL);
+ /* Search ntlv in the buffer provided by user */
+ ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
+ IPFW_TLV_STATE_NAME);
+ if (ntlv == NULL)
+ return (EINVAL);
+ name = ntlv->name;
+ } else
+ name = default_state_name;
+ /*
+ * Search named object with corresponding name.
+ * Since state objects are global, ignore the set value
+ * and use zero instead.
+ */
+ *pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0,
+ IPFW_TLV_STATE_NAME, name);
+ /*
+ * We always return success here.
+ * The caller will check *pno and mark object as unresolved,
+ * then it will automatically create "default" object.
+ */
+ return (0);
+}
+
+static struct named_object *
+dyn_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
+{
+
+ DYN_DEBUG("kidx %d", idx);
+ return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx));
+}
+
+static int
+dyn_create(struct ip_fw_chain *ch, struct tid_info *ti,
+ uint16_t *pkidx)
+{
+ struct namedobj_instance *ni;
+ struct dyn_state_obj *obj;
+ struct named_object *no;
+ ipfw_obj_ntlv *ntlv;
+ char *name;
+
+ DYN_DEBUG("uidx %d", ti->uidx);
+ if (ti->uidx != 0) {
+ if (ti->tlvs == NULL)
+ return (EINVAL);
+ ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
+ IPFW_TLV_STATE_NAME);
+ if (ntlv == NULL)
+ return (EINVAL);
+ name = ntlv->name;
+ } else
+ name = default_state_name;
+
+ ni = CHAIN_TO_SRV(ch);
+ obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO);
+ obj->no.name = obj->name;
+ obj->no.etlv = IPFW_TLV_STATE_NAME;
+ strlcpy(obj->name, name, sizeof(obj->name));
+
+ IPFW_UH_WLOCK(ch);
+ no = ipfw_objhash_lookup_name_type(ni, 0,
+ IPFW_TLV_STATE_NAME, name);
+ if (no != NULL) {
+ /*
+ * Object is already created.
+ * Just return its kidx and bump refcount.
+ */
+ *pkidx = no->kidx;
+ no->refcnt++;
+ IPFW_UH_WUNLOCK(ch);
+ free(obj, M_IPFW);
+ DYN_DEBUG("\tfound kidx %d", *pkidx);
+ return (0);
+ }
+ if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) {
+ DYN_DEBUG("\talloc_idx failed for %s", name);
+ IPFW_UH_WUNLOCK(ch);
+ free(obj, M_IPFW);
+ return (ENOSPC);
+ }
+ ipfw_objhash_add(ni, &obj->no);
+ IPFW_WLOCK(ch);
+ SRV_OBJECT(ch, obj->no.kidx) = obj;
+ IPFW_WUNLOCK(ch);
+ obj->no.refcnt++;
+ *pkidx = obj->no.kidx;
+ IPFW_UH_WUNLOCK(ch);
+ DYN_DEBUG("\tcreated kidx %d", *pkidx);
+ return (0);
+}
+
+static void
+dyn_destroy(struct ip_fw_chain *ch, struct named_object *no)
+{
+ struct dyn_state_obj *obj;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ KASSERT(no->refcnt == 1,
+ ("Destroying object '%s' (type %u, idx %u) with refcnt %u",
+ no->name, no->etlv, no->kidx, no->refcnt));
+
+ DYN_DEBUG("kidx %d", no->kidx);
+ IPFW_WLOCK(ch);
+ obj = SRV_OBJECT(ch, no->kidx);
+ SRV_OBJECT(ch, no->kidx) = NULL;
+ IPFW_WUNLOCK(ch);
+ ipfw_objhash_del(CHAIN_TO_SRV(ch), no);
+ ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), no->kidx);
+
+ free(obj, M_IPFW);
+}
+
+static struct opcode_obj_rewrite dyn_opcodes[] = {
+ {
+ O_KEEP_STATE, IPFW_TLV_STATE_NAME,
+ dyn_classify, dyn_update,
+ dyn_findbyname, dyn_findbykidx,
+ dyn_create, dyn_destroy
+ },
+ {
+ O_CHECK_STATE, IPFW_TLV_STATE_NAME,
+ dyn_classify, dyn_update,
+ dyn_findbyname, dyn_findbykidx,
+ dyn_create, dyn_destroy
+ },
+ {
+ O_PROBE_STATE, IPFW_TLV_STATE_NAME,
+ dyn_classify, dyn_update,
+ dyn_findbyname, dyn_findbykidx,
+ dyn_create, dyn_destroy
+ },
+ {
+ O_LIMIT, IPFW_TLV_STATE_NAME,
+ dyn_classify, dyn_update,
+ dyn_findbyname, dyn_findbykidx,
+ dyn_create, dyn_destroy
+ },
+};
+/**
+ * Print customizable flow id description via log(9) facility.
+ */
+static void
+print_dyn_rule_flags(struct ipfw_flow_id *id, int dyn_type, int log_flags,
+ char *prefix, char *postfix)
+{
+ struct in_addr da;
+#ifdef INET6
+ char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
+#else
+ char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(id)) {
+ ip6_sprintf(src, &id->src_ip6);
+ ip6_sprintf(dst, &id->dst_ip6);
+ } else
+#endif
+ {
+ da.s_addr = htonl(id->src_ip);
+ inet_ntop(AF_INET, &da, src, sizeof(src));
+ da.s_addr = htonl(id->dst_ip);
+ inet_ntop(AF_INET, &da, dst, sizeof(dst));
+ }
+ log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n",
+ prefix, dyn_type, src, id->src_port, dst,
+ id->dst_port, DYN_COUNT, postfix);
+}
+
+#define print_dyn_rule(id, dtype, prefix, postfix) \
+ print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix)
+
+#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0)
+#define TIME_LE(a,b) ((int)((a)-(b)) < 0)
+
+static void
+dyn_update_proto_state(ipfw_dyn_rule *q, const struct ipfw_flow_id *id,
+ const struct tcphdr *tcp, int dir)
+{
+ uint32_t ack;
+ u_char flags;
+
+ if (id->proto == IPPROTO_TCP) {
+ flags = id->_flags & (TH_FIN | TH_SYN | TH_RST);
+#define BOTH_SYN (TH_SYN | (TH_SYN << 8))
+#define BOTH_FIN (TH_FIN | (TH_FIN << 8))
+#define TCP_FLAGS (TH_FLAGS | (TH_FLAGS << 8))
+#define ACK_FWD 0x10000 /* fwd ack seen */
+#define ACK_REV 0x20000 /* rev ack seen */
+
+ q->state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
+ switch (q->state & TCP_FLAGS) {
+ case TH_SYN: /* opening */
+ q->expire = time_uptime + V_dyn_syn_lifetime;
+ break;
+
+ case BOTH_SYN: /* move to established */
+ case BOTH_SYN | TH_FIN: /* one side tries to close */
+ case BOTH_SYN | (TH_FIN << 8):
+#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
+ if (tcp == NULL)
+ break;
+
+ ack = ntohl(tcp->th_ack);
+ if (dir == MATCH_FORWARD) {
+ if (q->ack_fwd == 0 ||
+ _SEQ_GE(ack, q->ack_fwd)) {
+ q->ack_fwd = ack;
+ q->state |= ACK_FWD;
+ }
+ } else {
+ if (q->ack_rev == 0 ||
+ _SEQ_GE(ack, q->ack_rev)) {
+ q->ack_rev = ack;
+ q->state |= ACK_REV;
+ }
+ }
+ if ((q->state & (ACK_FWD | ACK_REV)) ==
+ (ACK_FWD | ACK_REV)) {
+ q->expire = time_uptime + V_dyn_ack_lifetime;
+ q->state &= ~(ACK_FWD | ACK_REV);
+ }
+ break;
+
+ case BOTH_SYN | BOTH_FIN: /* both sides closed */
+ if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
+ V_dyn_fin_lifetime =
+ V_dyn_keepalive_period - 1;
+ q->expire = time_uptime + V_dyn_fin_lifetime;
+ break;
+
+ default:
+#if 0
+ /*
+ * reset or some invalid combination, but can also
+ * occur if we use keep-state the wrong way.
+ */
+ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
+ printf("invalid state: 0x%x\n", q->state);
+#endif
+ if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
+ V_dyn_rst_lifetime =
+ V_dyn_keepalive_period - 1;
+ q->expire = time_uptime + V_dyn_rst_lifetime;
+ break;
+ }
+ } else if (id->proto == IPPROTO_UDP) {
+ q->expire = time_uptime + V_dyn_udp_lifetime;
+ } else {
+ /* other protocols */
+ q->expire = time_uptime + V_dyn_short_lifetime;
+ }
+}
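Since the encoding of q->state is only implied by the defines above, a short descriptive summary may help when reading the switch:

/*
 * Layout of q->state as used above:
 *   bits 0..7   TCP flags seen in the forward direction
 *   bits 8..15  TCP flags seen in the reverse direction (<< 8)
 *   ACK_FWD     an ACK advanced q->ack_fwd
 *   ACK_REV     an ACK advanced q->ack_rev
 * E.g. after a SYN in each direction (q->state & TCP_FLAGS) == BOTH_SYN,
 * and once both ACK_* bits have been seen the expiry is pushed out to
 * dyn_ack_lifetime and the two bits are cleared again.
 */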
+
+/*
+ * Lookup a dynamic rule, locked version.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int i, int *match_direction,
+ struct tcphdr *tcp, uint16_t kidx)
+{
+ /*
+ * Stateful ipfw extensions.
+ * Lookup into dynamic session queue.
+ */
+ ipfw_dyn_rule *prev, *q = NULL;
+ int dir;
+
+ IPFW_BUCK_ASSERT(i);
+
+ dir = MATCH_NONE;
+ for (prev = NULL, q = V_ipfw_dyn_v[i].head; q; prev = q, q = q->next) {
+ if (q->dyn_type == O_LIMIT_PARENT)
+ continue;
+
+ if (pkt->proto != q->id.proto)
+ continue;
+
+ if (kidx != 0 && kidx != q->kidx)
+ continue;
+
+ if (IS_IP6_FLOW_ID(pkt)) {
+ if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.src_ip6) &&
+ IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.dst_ip6) &&
+ pkt->src_port == q->id.src_port &&
+ pkt->dst_port == q->id.dst_port) {
+ dir = MATCH_FORWARD;
+ break;
+ }
+ if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.dst_ip6) &&
+ IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.src_ip6) &&
+ pkt->src_port == q->id.dst_port &&
+ pkt->dst_port == q->id.src_port) {
+ dir = MATCH_REVERSE;
+ break;
+ }
+ } else {
+ if (pkt->src_ip == q->id.src_ip &&
+ pkt->dst_ip == q->id.dst_ip &&
+ pkt->src_port == q->id.src_port &&
+ pkt->dst_port == q->id.dst_port) {
+ dir = MATCH_FORWARD;
+ break;
+ }
+ if (pkt->src_ip == q->id.dst_ip &&
+ pkt->dst_ip == q->id.src_ip &&
+ pkt->src_port == q->id.dst_port &&
+ pkt->dst_port == q->id.src_port) {
+ dir = MATCH_REVERSE;
+ break;
+ }
+ }
+ }
+ if (q == NULL)
+ goto done; /* q = NULL, not found */
+
+ if (prev != NULL) { /* found and not in front */
+ prev->next = q->next;
+ q->next = V_ipfw_dyn_v[i].head;
+ V_ipfw_dyn_v[i].head = q;
+ }
+
+ /* update state according to flags */
+ dyn_update_proto_state(q, pkt, tcp, dir);
+done:
+ if (match_direction != NULL)
+ *match_direction = dir;
+ return (q);
+}
+
+ipfw_dyn_rule *
+ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
+ struct tcphdr *tcp, uint16_t kidx)
+{
+ ipfw_dyn_rule *q;
+ int i;
+
+ i = hash_packet(pkt, V_curr_dyn_buckets);
+
+ IPFW_BUCK_LOCK(i);
+ q = lookup_dyn_rule_locked(pkt, i, match_direction, tcp, kidx);
+ if (q == NULL)
+ IPFW_BUCK_UNLOCK(i);
+ /* NB: return table locked when q is not NULL */
+ return q;
+}
+
+/*
+ * Unlock bucket mtx
+ * @q - pointer to dynamic rule
+ */
+void
+ipfw_dyn_unlock(ipfw_dyn_rule *q)
+{
+
+ IPFW_BUCK_UNLOCK(q->bucket);
+}
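Because ipfw_lookup_dyn_rule() deliberately returns with the bucket mutex still held on a hit (the NB above), callers must always pair it with ipfw_dyn_unlock(). A hedged usage sketch; the flow id and direction variables are illustrative, and a kidx of 0 means "any state name" per the lookup loop above.

	/* Sketch of the expected calling pattern, not an actual call site. */
	ipfw_dyn_rule *q;
	int dir;

	q = ipfw_lookup_dyn_rule(&f_id, &dir, NULL /* no TCP header */, 0);
	if (q != NULL) {
		/* ... inspect or update q while its bucket stays locked ... */
		ipfw_dyn_unlock(q);
	}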
+
+static int
+resize_dynamic_table(struct ip_fw_chain *chain, int nbuckets)
+{
+ int i, k, nbuckets_old;
+ ipfw_dyn_rule *q;
+ struct ipfw_dyn_bucket *dyn_v, *dyn_v_old;
+
+ /* Check if the given number is a power of 2 and at most 64k */
+ if ((nbuckets > 65536) || (!powerof2(nbuckets)))
+ return 1;
+
+ CTR3(KTR_NET, "%s: resize dynamic hash: %d -> %d", __func__,
+ V_curr_dyn_buckets, nbuckets);
+
+ /* Allocate and initialize new hash */
+ dyn_v = malloc(nbuckets * sizeof(*dyn_v), M_IPFW,
+ M_WAITOK | M_ZERO);
+
+ for (i = 0 ; i < nbuckets; i++)
+ IPFW_BUCK_LOCK_INIT(&dyn_v[i]);
+
+ /*
+ * Take the upper half lock, as get_map() does, to ease
+ * read-only access to dynamic rules hash from sysctl
+ */
+ IPFW_UH_WLOCK(chain);
+
+ /*
+ * Acquire chain write lock to permit hash access
+ * for main traffic path without additional locks
+ */
+ IPFW_WLOCK(chain);
+
+ /* Save old values */
+ nbuckets_old = V_curr_dyn_buckets;
+ dyn_v_old = V_ipfw_dyn_v;
+
+ /* Skip relinking if array is not set up */
+ if (V_ipfw_dyn_v == NULL)
+ V_curr_dyn_buckets = 0;
+
+ /* Re-link all dynamic states */
+ for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+ while (V_ipfw_dyn_v[i].head != NULL) {
+ /* Remove from current chain */
+ q = V_ipfw_dyn_v[i].head;
+ V_ipfw_dyn_v[i].head = q->next;
+
+ /* Get new hash value */
+ k = hash_packet(&q->id, nbuckets);
+ q->bucket = k;
+ /* Add to the new head */
+ q->next = dyn_v[k].head;
+ dyn_v[k].head = q;
+ }
+ }
+
+ /* Update current pointers/buckets values */
+ V_curr_dyn_buckets = nbuckets;
+ V_ipfw_dyn_v = dyn_v;
+
+ IPFW_WUNLOCK(chain);
+
+ IPFW_UH_WUNLOCK(chain);
+
+ /* Start periodic callout on initial creation */
+ if (dyn_v_old == NULL) {
+ callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, curvnet, 0);
+ return (0);
+ }
+
+ /* Destroy all mutexes */
+ for (i = 0 ; i < nbuckets_old ; i++)
+ IPFW_BUCK_LOCK_DESTROY(&dyn_v_old[i]);
+
+ /* Free old hash */
+ free(dyn_v_old, M_IPFW);
+
+ return 0;
+}
+
+/**
+ * Install state of type 'type' for a dynamic session.
+ * The hash table contains three types of rules:
+ * - regular rules (O_KEEP_STATE)
+ * - rules for sessions with a limited number of sessions per user
+ * (O_LIMIT). When one is created, the parent's count is
+ * increased by 1 and decreased on delete. In this case,
+ * the third parameter is the parent rule and not the chain.
+ * - "parent" rules for the above (O_LIMIT_PARENT).
+ */
+static ipfw_dyn_rule *
+add_dyn_rule(struct ipfw_flow_id *id, int i, uint8_t dyn_type,
+ struct ip_fw *rule, uint16_t kidx)
+{
+ ipfw_dyn_rule *r;
+
+ IPFW_BUCK_ASSERT(i);
+
+ r = uma_zalloc(V_ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
+ if (r == NULL) {
+ if (last_log != time_uptime) {
+ last_log = time_uptime;
+ log(LOG_DEBUG,
+ "ipfw: Cannot allocate dynamic state, "
+ "consider increasing net.inet.ip.fw.dyn_max\n");
+ }
+ return NULL;
+ }
+ ipfw_dyn_count++;
+
+ /*
+ * refcount on parent is already incremented, so
+ * it is safe to use parent unlocked.
+ */
+ if (dyn_type == O_LIMIT) {
+ ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
+ if ( parent->dyn_type != O_LIMIT_PARENT)
+ panic("invalid parent");
+ r->parent = parent;
+ rule = parent->rule;
+ }
+
+ r->id = *id;
+ r->expire = time_uptime + V_dyn_syn_lifetime;
+ r->rule = rule;
+ r->dyn_type = dyn_type;
+ IPFW_ZERO_DYN_COUNTER(r);
+ r->count = 0;
+ r->kidx = kidx;
+ r->bucket = i;
+ r->next = V_ipfw_dyn_v[i].head;
+ V_ipfw_dyn_v[i].head = r;
+ DEB(print_dyn_rule(id, dyn_type, "add dyn entry", "total");)
+ return r;
+}
+
+/**
+ * lookup dynamic parent rule using pkt and rule as search keys.
+ * If the lookup fails, then install one.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_parent(struct ipfw_flow_id *pkt, int *pindex, struct ip_fw *rule,
+ uint16_t kidx)
+{
+ ipfw_dyn_rule *q;
+ int i, is_v6;
+
+ is_v6 = IS_IP6_FLOW_ID(pkt);
+ i = hash_packet( pkt, V_curr_dyn_buckets );
+ *pindex = i;
+ IPFW_BUCK_LOCK(i);
+ for (q = V_ipfw_dyn_v[i].head ; q != NULL ; q=q->next)
+ if (q->dyn_type == O_LIMIT_PARENT &&
+ kidx == q->kidx &&
+ rule == q->rule &&
+ pkt->proto == q->id.proto &&
+ pkt->src_port == q->id.src_port &&
+ pkt->dst_port == q->id.dst_port &&
+ (
+ (is_v6 &&
+ IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+ &(q->id.src_ip6)) &&
+ IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+ &(q->id.dst_ip6))) ||
+ (!is_v6 &&
+ pkt->src_ip == q->id.src_ip &&
+ pkt->dst_ip == q->id.dst_ip)
+ )
+ ) {
+ q->expire = time_uptime + V_dyn_short_lifetime;
+ DEB(print_dyn_rule(pkt, q->dyn_type,
+ "lookup_dyn_parent found", "");)
+ return q;
+ }
+
+ /* Add virtual limiting rule */
+ return add_dyn_rule(pkt, i, O_LIMIT_PARENT, rule, kidx);
+}
+
+/**
+ * Install dynamic state for rule type cmd->o.opcode
+ *
+ * Returns 1 (failure) if state is not installed because of errors or because
+ * session limitations are enforced.
+ */
+int
+ipfw_install_state(struct ip_fw_chain *chain, struct ip_fw *rule,
+ ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg)
+{
+ ipfw_dyn_rule *q;
+ int i;
+
+ DEB(print_dyn_rule(&args->f_id, cmd->o.opcode, "install_state",
+ (cmd->o.arg1 == 0 ? "": DYN_STATE_OBJ(chain, &cmd->o)->name));)
+
+ i = hash_packet(&args->f_id, V_curr_dyn_buckets);
+
+ IPFW_BUCK_LOCK(i);
+
+ q = lookup_dyn_rule_locked(&args->f_id, i, NULL, NULL, cmd->o.arg1);
+ if (q != NULL) { /* should never occur */
+ DEB(
+ if (last_log != time_uptime) {
+ last_log = time_uptime;
+ printf("ipfw: %s: entry already present, done\n",
+ __func__);
+ })
+ IPFW_BUCK_UNLOCK(i);
+ return (0);
+ }
+
+ /*
+ * State limiting is done via uma(9) zone limiting.
+ * Save pointer to newly-installed rule and reject
+ * packet if add_dyn_rule() returned NULL.
+ * Note q is currently set to NULL.
+ */
+
+ switch (cmd->o.opcode) {
+ case O_KEEP_STATE: /* bidir rule */
+ q = add_dyn_rule(&args->f_id, i, O_KEEP_STATE, rule,
+ cmd->o.arg1);
+ break;
+
+ case O_LIMIT: { /* limit number of sessions */
+ struct ipfw_flow_id id;
+ ipfw_dyn_rule *parent;
+ uint32_t conn_limit;
+ uint16_t limit_mask = cmd->limit_mask;
+ int pindex;
+
+ conn_limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit);
+
+ DEB(
+ if (cmd->conn_limit == IP_FW_TARG)
+ printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
+ "(tablearg)\n", __func__, conn_limit);
+ else
+ printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
+ __func__, conn_limit);
+ )
+
+ id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0;
+ id.proto = args->f_id.proto;
+ id.addr_type = args->f_id.addr_type;
+ id.fib = M_GETFIB(args->m);
+
+ if (IS_IP6_FLOW_ID (&(args->f_id))) {
+ bzero(&id.src_ip6, sizeof(id.src_ip6));
+ bzero(&id.dst_ip6, sizeof(id.dst_ip6));
+
+ if (limit_mask & DYN_SRC_ADDR)
+ id.src_ip6 = args->f_id.src_ip6;
+ if (limit_mask & DYN_DST_ADDR)
+ id.dst_ip6 = args->f_id.dst_ip6;
+ } else {
+ if (limit_mask & DYN_SRC_ADDR)
+ id.src_ip = args->f_id.src_ip;
+ if (limit_mask & DYN_DST_ADDR)
+ id.dst_ip = args->f_id.dst_ip;
+ }
+ if (limit_mask & DYN_SRC_PORT)
+ id.src_port = args->f_id.src_port;
+ if (limit_mask & DYN_DST_PORT)
+ id.dst_port = args->f_id.dst_port;
+
+ /*
+ * We have to release lock for previous bucket to
+ * avoid possible deadlock
+ */
+ IPFW_BUCK_UNLOCK(i);
+
+ parent = lookup_dyn_parent(&id, &pindex, rule, cmd->o.arg1);
+ if (parent == NULL) {
+ printf("ipfw: %s: add parent failed\n", __func__);
+ IPFW_BUCK_UNLOCK(pindex);
+ return (1);
+ }
+
+ if (parent->count >= conn_limit) {
+ if (V_fw_verbose && last_log != time_uptime) {
+ char sbuf[24];
+ last_log = time_uptime;
+ snprintf(sbuf, sizeof(sbuf),
+ "%d drop session",
+ parent->rule->rulenum);
+ print_dyn_rule_flags(&args->f_id,
+ cmd->o.opcode,
+ LOG_SECURITY | LOG_DEBUG,
+ sbuf, "too many entries");
+ }
+ IPFW_BUCK_UNLOCK(pindex);
+ return (1);
+ }
+ /* Increment counter on parent */
+ parent->count++;
+ IPFW_BUCK_UNLOCK(pindex);
+
+ IPFW_BUCK_LOCK(i);
+ q = add_dyn_rule(&args->f_id, i, O_LIMIT,
+ (struct ip_fw *)parent, cmd->o.arg1);
+ if (q == NULL) {
+ /* Decrement index and notify caller */
+ IPFW_BUCK_UNLOCK(i);
+ IPFW_BUCK_LOCK(pindex);
+ parent->count--;
+ IPFW_BUCK_UNLOCK(pindex);
+ return (1);
+ }
+ break;
+ }
+ default:
+ printf("ipfw: %s: unknown dynamic rule type %u\n",
+ __func__, cmd->o.opcode);
+ }
+
+ if (q == NULL) {
+ IPFW_BUCK_UNLOCK(i);
+ return (1); /* Notify caller about failure */
+ }
+
+ dyn_update_proto_state(q, &args->f_id, NULL, MATCH_FORWARD);
+ IPFW_BUCK_UNLOCK(i);
+ return (0);
+}
+
+/*
+ * Generate a TCP packet, containing either a RST or a keepalive.
+ * When flags & TH_RST, we are sending a RST packet, because a
+ * "reset" action matched the packet.
+ * Otherwise we are sending a keepalive, with only TH_ACK set in flags.
+ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
+ * so that MAC can label the reply appropriately.
+ */
+struct mbuf *
+ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
+ u_int32_t ack, int flags)
+{
+ struct mbuf *m = NULL; /* stupid compiler */
+ int len, dir;
+ struct ip *h = NULL; /* stupid compiler */
+#ifdef INET6
+ struct ip6_hdr *h6 = NULL;
+#endif
+ struct tcphdr *th = NULL;
+
+ MGETHDR(m, M_NOWAIT, MT_DATA);
+ if (m == NULL)
+ return (NULL);
+
+ M_SETFIB(m, id->fib);
+#ifdef MAC
+ if (replyto != NULL)
+ mac_netinet_firewall_reply(replyto, m);
+ else
+ mac_netinet_firewall_send(m);
+#else
+ (void)replyto; /* don't warn about unused arg */
+#endif
+
+ switch (id->addr_type) {
+ case 4:
+ len = sizeof(struct ip) + sizeof(struct tcphdr);
+ break;
+#ifdef INET6
+ case 6:
+ len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ break;
+#endif
+ default:
+ /* XXX: log me?!? */
+ FREE_PKT(m);
+ return (NULL);
+ }
+ dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);
+
+ m->m_data += max_linkhdr;
+ m->m_flags |= M_SKIP_FIREWALL;
+ m->m_pkthdr.len = m->m_len = len;
+ m->m_pkthdr.rcvif = NULL;
+ bzero(m->m_data, len);
+
+ switch (id->addr_type) {
+ case 4:
+ h = mtod(m, struct ip *);
+
+ /* prepare for checksum */
+ h->ip_p = IPPROTO_TCP;
+ h->ip_len = htons(sizeof(struct tcphdr));
+ if (dir) {
+ h->ip_src.s_addr = htonl(id->src_ip);
+ h->ip_dst.s_addr = htonl(id->dst_ip);
+ } else {
+ h->ip_src.s_addr = htonl(id->dst_ip);
+ h->ip_dst.s_addr = htonl(id->src_ip);
+ }
+
+ th = (struct tcphdr *)(h + 1);
+ break;
+#ifdef INET6
+ case 6:
+ h6 = mtod(m, struct ip6_hdr *);
+
+ /* prepare for checksum */
+ h6->ip6_nxt = IPPROTO_TCP;
+ h6->ip6_plen = htons(sizeof(struct tcphdr));
+ if (dir) {
+ h6->ip6_src = id->src_ip6;
+ h6->ip6_dst = id->dst_ip6;
+ } else {
+ h6->ip6_src = id->dst_ip6;
+ h6->ip6_dst = id->src_ip6;
+ }
+
+ th = (struct tcphdr *)(h6 + 1);
+ break;
+#endif
+ }
+
+ if (dir) {
+ th->th_sport = htons(id->src_port);
+ th->th_dport = htons(id->dst_port);
+ } else {
+ th->th_sport = htons(id->dst_port);
+ th->th_dport = htons(id->src_port);
+ }
+ th->th_off = sizeof(struct tcphdr) >> 2;
+
+ if (flags & TH_RST) {
+ if (flags & TH_ACK) {
+ th->th_seq = htonl(ack);
+ th->th_flags = TH_RST;
+ } else {
+ if (flags & TH_SYN)
+ seq++;
+ th->th_ack = htonl(seq);
+ th->th_flags = TH_RST | TH_ACK;
+ }
+ } else {
+ /*
+ * Keepalive - use caller provided sequence numbers
+ */
+ th->th_seq = htonl(seq);
+ th->th_ack = htonl(ack);
+ th->th_flags = TH_ACK;
+ }
+
+ switch (id->addr_type) {
+ case 4:
+ th->th_sum = in_cksum(m, len);
+
+ /* finish the ip header */
+ h->ip_v = 4;
+ h->ip_hl = sizeof(*h) >> 2;
+ h->ip_tos = IPTOS_LOWDELAY;
+ h->ip_off = htons(0);
+ h->ip_len = htons(len);
+ h->ip_ttl = V_ip_defttl;
+ h->ip_sum = 0;
+ break;
+#ifdef INET6
+ case 6:
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
+ sizeof(struct tcphdr));
+
+ /* finish the ip6 header */
+ h6->ip6_vfc |= IPV6_VERSION;
+ h6->ip6_hlim = IPV6_DEFHLIM;
+ break;
+#endif
+ }
+
+ return (m);
+}
+
+/*
+ * Queue keepalive packets for given dynamic rule
+ */
+static struct mbuf **
+ipfw_dyn_send_ka(struct mbuf **mtailp, ipfw_dyn_rule *q)
+{
+ struct mbuf *m_rev, *m_fwd;
+
+ m_rev = (q->state & ACK_REV) ? NULL :
+ ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN);
+ m_fwd = (q->state & ACK_FWD) ? NULL :
+ ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1, q->ack_rev, 0);
+
+ if (m_rev != NULL) {
+ *mtailp = m_rev;
+ mtailp = &(*mtailp)->m_nextpkt;
+ }
+ if (m_fwd != NULL) {
+ *mtailp = m_fwd;
+ mtailp = &(*mtailp)->m_nextpkt;
+ }
+
+ return (mtailp);
+}
+
+/*
+ * This procedure is used to perform various maintenance
+ * on the dynamic hash list. Currently it is called every second.
+ */
+static void
+ipfw_dyn_tick(void * vnetx)
+{
+ struct ip_fw_chain *chain;
+ int check_ka = 0;
+#ifdef VIMAGE
+ struct vnet *vp = vnetx;
+#endif
+
+ CURVNET_SET(vp);
+
+ chain = &V_layer3_chain;
+
+ /* Run keepalive checks every keepalive_period iff ka is enabled */
+ if ((V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) &&
+ (V_dyn_keepalive != 0)) {
+ V_dyn_keepalive_last = time_uptime;
+ check_ka = 1;
+ }
+
+ check_dyn_rules(chain, NULL, check_ka, 1);
+
+ callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, vnetx, 0);
+
+ CURVNET_RESTORE();
+}
+
+
+/*
+ * Walk through all dynamic states doing generic maintenance:
+ * 1) free expired states
+ * 2) free all states based on deleted rule / set
+ * 3) send keepalives for states if needed
+ *
+ * @chain - pointer to current ipfw rules chain
+ * @rt - delete all states matching the given range if != NULL
+ * @check_ka - perform checking/sending keepalives
+ * @timer - indicate call from timer routine.
+ *
+ * Timer routine must call this function unlocked to permit
+ * sending keepalives/resizing table.
+ *
+ * Other callers have to hold IPFW_UH_WLOCK.
+ * Additionally, the function assumes that the dynamic rule/set has
+ * ALREADY been deleted, so no new states can be generated by
+ * 'deleted' rules.
+ *
+ * The write lock is needed to ensure that unused parent rules
+ * are not freed by another instance (see stages 2 and 3).
+ */
+static void
+check_dyn_rules(struct ip_fw_chain *chain, ipfw_range_tlv *rt,
+ int check_ka, int timer)
+{
+ struct mbuf *m0, *m, *mnext, **mtailp;
+ struct ip *h;
+ int i, dyn_count, new_buckets = 0, max_buckets;
+ int expired = 0, expired_limits = 0, parents = 0, total = 0;
+ ipfw_dyn_rule *q, *q_prev, *q_next;
+ ipfw_dyn_rule *exp_head, **exptailp;
+ ipfw_dyn_rule *exp_lhead, **expltailp;
+
+ KASSERT(V_ipfw_dyn_v != NULL, ("%s: dynamic table not allocated",
+ __func__));
+
+ /* Avoid possible LOR */
+ KASSERT(!check_ka || timer, ("%s: keepalive check with lock held",
+ __func__));
+
+ /*
+ * Do not perform any checks if we currently have no dynamic states
+ */
+ if (DYN_COUNT == 0)
+ return;
+
+ /* Expired states */
+ exp_head = NULL;
+ exptailp = &exp_head;
+
+ /* Expired limit states */
+ exp_lhead = NULL;
+ expltailp = &exp_lhead;
+
+ /*
+ * We make a chain of packets to go out here -- not deferring
+ * until after we drop the IPFW dynamic rule lock would result
+ * in a lock order reversal with the normal packet input -> ipfw
+ * call stack.
+ */
+ m0 = NULL;
+ mtailp = &m0;
+
+ /* Protect from hash resizing */
+ if (timer != 0)
+ IPFW_UH_WLOCK(chain);
+ else
+ IPFW_UH_WLOCK_ASSERT(chain);
+
+#define NEXT_RULE() { q_prev = q; q = q->next ; continue; }
+
+ /* Stage 1: perform requested deletion */
+ for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+ IPFW_BUCK_LOCK(i);
+ for (q = V_ipfw_dyn_v[i].head, q_prev = q; q ; ) {
+ /* account every rule */
+ total++;
+
+ /* Always skip parent rules here */
+ if (q->dyn_type == O_LIMIT_PARENT) {
+ parents++;
+ NEXT_RULE();
+ }
+
+ /*
+ * Remove rules which are:
+ * 1) expired
+ * 2) matching the deletion range
+ */
+ if ((TIME_LEQ(q->expire, time_uptime)) ||
+ (rt != NULL && ipfw_match_range(q->rule, rt))) {
+ if (TIME_LE(time_uptime, q->expire) &&
+ q->dyn_type == O_KEEP_STATE &&
+ V_dyn_keep_states != 0) {
+ /*
+ * Do not delete state if
+ * it is not expired and
+ * dyn_keep_states is ON.
+ * However we need to re-link it
+ * to any other stable rule
+ */
+ q->rule = chain->default_rule;
+ NEXT_RULE();
+ }
+
+ /* Unlink q from current list */
+ q_next = q->next;
+ if (q == V_ipfw_dyn_v[i].head)
+ V_ipfw_dyn_v[i].head = q_next;
+ else
+ q_prev->next = q_next;
+
+ q->next = NULL;
+
+ /* queue q to expire list */
+ if (q->dyn_type != O_LIMIT) {
+ *exptailp = q;
+ exptailp = &(*exptailp)->next;
+ DEB(print_dyn_rule(&q->id, q->dyn_type,
+ "unlink entry", "left");
+ )
+ } else {
+ /* Separate list for limit rules */
+ *expltailp = q;
+ expltailp = &(*expltailp)->next;
+ expired_limits++;
+ DEB(print_dyn_rule(&q->id, q->dyn_type,
+ "unlink limit entry", "left");
+ )
+ }
+
+ q = q_next;
+ expired++;
+ continue;
+ }
+
+ /*
+ * Check if we need to send keepalive:
+ * we need to ensure it is time to do KA,
+ * this is an established TCP session, and
+ * the expire time is within the keepalive interval
+ */
+ if ((check_ka != 0) && (q->id.proto == IPPROTO_TCP) &&
+ ((q->state & BOTH_SYN) == BOTH_SYN) &&
+ (TIME_LEQ(q->expire, time_uptime +
+ V_dyn_keepalive_interval)))
+ mtailp = ipfw_dyn_send_ka(mtailp, q);
+
+ NEXT_RULE();
+ }
+ IPFW_BUCK_UNLOCK(i);
+ }
+
+ /* Stage 2: decrement counters from O_LIMIT parents */
+ if (expired_limits != 0) {
+ /*
+ * XXX: Note that deleting a set with more than one
+ * heavily-used LIMIT rule can result in overwhelming
+ * locking due to lack of per-hash value sorting.
+ *
+ * We should probably think about:
+ * 1) pre-allocating hash of size, say,
+ * MAX(16, V_curr_dyn_buckets / 1024)
+ * 2) checking if expired_limits is large enough
+ * 3) If yes, init hash (or its part), re-link
+ * current list and start decrementing procedure in
+ * each bucket separately
+ */
+
+ /*
+ * Small optimization: do not unlock the bucket until
+ * we see that the next item resides in a different bucket
+ */
+ if (exp_lhead != NULL) {
+ i = exp_lhead->parent->bucket;
+ IPFW_BUCK_LOCK(i);
+ }
+ for (q = exp_lhead; q != NULL; q = q->next) {
+ if (i != q->parent->bucket) {
+ IPFW_BUCK_UNLOCK(i);
+ i = q->parent->bucket;
+ IPFW_BUCK_LOCK(i);
+ }
+
+ /* Decrease parent refcount */
+ q->parent->count--;
+ }
+ if (exp_lhead != NULL)
+ IPFW_BUCK_UNLOCK(i);
+ }
+
+ /*
+ * We protect ourselves from unused parent deletion
+ * (from the timer function) by holding UH write lock.
+ */
+
+ /* Stage 3: remove unused parent rules */
+ if ((parents != 0) && (expired != 0)) {
+ for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+ IPFW_BUCK_LOCK(i);
+ for (q = V_ipfw_dyn_v[i].head, q_prev = q ; q ; ) {
+ if (q->dyn_type != O_LIMIT_PARENT)
+ NEXT_RULE();
+
+ if (q->count != 0)
+ NEXT_RULE();
+
+ /* Parent rule without consumers */
+
+ /* Unlink q from current list */
+ q_next = q->next;
+ if (q == V_ipfw_dyn_v[i].head)
+ V_ipfw_dyn_v[i].head = q_next;
+ else
+ q_prev->next = q_next;
+
+ q->next = NULL;
+
+ /* Add to expired list */
+ *exptailp = q;
+ exptailp = &(*exptailp)->next;
+
+ DEB(print_dyn_rule(&q->id, q->dyn_type,
+ "unlink parent entry", "left");
+ )
+
+ expired++;
+
+ q = q_next;
+ }
+ IPFW_BUCK_UNLOCK(i);
+ }
+ }
+
+#undef NEXT_RULE
+
+ if (timer != 0) {
+ /*
+ * Check if we need to resize the hash:
+ * if the current number of states exceeds twice the number of
+ * buckets, grow the hash size to the minimum power of 2 which is
+ * bigger than the current state count. Limit the hash size to 64k.
+ */
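+ /*
+ * For illustration (hypothetical numbers): with 256 buckets and
+ * roughly 3000 states, new_buckets below grows 512 -> 1024 ->
+ * 2048 -> 4096, so the table would be resized to 4096 buckets.
+ */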
+ max_buckets = (V_dyn_buckets_max > 65536) ?
+ 65536 : V_dyn_buckets_max;
+
+ dyn_count = DYN_COUNT;
+
+ if ((dyn_count > V_curr_dyn_buckets * 2) &&
+ (dyn_count < max_buckets)) {
+ new_buckets = V_curr_dyn_buckets;
+ while (new_buckets < dyn_count) {
+ new_buckets *= 2;
+
+ if (new_buckets >= max_buckets)
+ break;
+ }
+ }
+
+ IPFW_UH_WUNLOCK(chain);
+ }
+
+ /* Finally, delete old states and limits, if any */
+ for (q = exp_head; q != NULL; q = q_next) {
+ q_next = q->next;
+ uma_zfree(V_ipfw_dyn_rule_zone, q);
+ ipfw_dyn_count--;
+ }
+
+ for (q = exp_lhead; q != NULL; q = q_next) {
+ q_next = q->next;
+ uma_zfree(V_ipfw_dyn_rule_zone, q);
+ ipfw_dyn_count--;
+ }
+
+ /*
+ * The rest of this code MUST be called from the timer routine
+ * only, without holding any locks
+ */
+ if (timer == 0)
+ return;
+
+ /* Send keepalive packets if any */
+ for (m = m0; m != NULL; m = mnext) {
+ mnext = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ h = mtod(m, struct ip *);
+ if (h->ip_v == 4)
+ ip_output(m, NULL, NULL, 0, NULL, NULL);
+#ifdef INET6
+ else
+ ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
+#endif
+ }
+
+ /* Run table resize without holding any locks */
+ if (new_buckets != 0)
+ resize_dynamic_table(chain, new_buckets);
+}
+
+/*
+ * Deletes all dynamic states originated by rules in the given range.
+ *
+ * @chain - pointer to current ipfw rules chain
+ * @rt - delete all states originated by rules in the matched range.
+ *
+ * Function has to be called with IPFW_UH_WLOCK held.
+ * Additionally, the function assumes that the dynamic rule/set has
+ * ALREADY been deleted, so no new states can be generated by
+ * 'deleted' rules.
+ */
+void
+ipfw_expire_dyn_rules(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
+{
+
+ check_dyn_rules(chain, rt, 0, 0);
+}
+
+/*
+ * Check if rule contains at least one dynamic opcode.
+ *
+ * Returns 1 if such opcode is found, 0 otherwise.
+ */
+int
+ipfw_is_dyn_rule(struct ip_fw *rule)
+{
+ int cmdlen, l;
+ ipfw_insn *cmd;
+
+ l = rule->cmd_len;
+ cmd = rule->cmd;
+ cmdlen = 0;
+ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+
+ switch (cmd->opcode) {
+ case O_LIMIT:
+ case O_KEEP_STATE:
+ case O_PROBE_STATE:
+ case O_CHECK_STATE:
+ return (1);
+ }
+ }
+
+ return (0);
+}
+
+void
+ipfw_dyn_init(struct ip_fw_chain *chain)
+{
+
+ V_ipfw_dyn_v = NULL;
+ V_dyn_buckets_max = 256; /* must be power of 2 */
+ V_curr_dyn_buckets = 256; /* must be power of 2 */
+
+ V_dyn_ack_lifetime = 300;
+ V_dyn_syn_lifetime = 20;
+ V_dyn_fin_lifetime = 1;
+ V_dyn_rst_lifetime = 1;
+ V_dyn_udp_lifetime = 10;
+ V_dyn_short_lifetime = 5;
+
+ V_dyn_keepalive_interval = 20;
+ V_dyn_keepalive_period = 5;
+ V_dyn_keepalive = 1; /* do send keepalives */
+ V_dyn_keepalive_last = time_uptime;
+
+ V_dyn_max = 16384; /* max # of dynamic rules */
+
+ V_ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
+ sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+
+ /* Enforce limit on dynamic rules */
+ uma_zone_set_max(V_ipfw_dyn_rule_zone, V_dyn_max);
+
+ callout_init(&V_ipfw_timeout, 1);
+
+ /*
+ * This can potentially be done on first dynamic rule
+ * being added to chain.
+ */
+ resize_dynamic_table(chain, V_curr_dyn_buckets);
+ IPFW_ADD_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes);
+}
+
+void
+ipfw_dyn_uninit(int pass)
+{
+ int i;
+
+ if (pass == 0) {
+ callout_drain(&V_ipfw_timeout);
+ return;
+ }
+ IPFW_DEL_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes);
+
+ if (V_ipfw_dyn_v != NULL) {
+ /*
+ * Skip deleting all dynamic states -
+ * uma_zdestroy() does this more efficiently;
+ */
+
+ /* Destroy all mutexes */
+ for (i = 0 ; i < V_curr_dyn_buckets ; i++)
+ IPFW_BUCK_LOCK_DESTROY(&V_ipfw_dyn_v[i]);
+ free(V_ipfw_dyn_v, M_IPFW);
+ V_ipfw_dyn_v = NULL;
+ }
+
+ uma_zdestroy(V_ipfw_dyn_rule_zone);
+}
+
+#ifdef SYSCTL_NODE
+/*
+ * Get/set maximum number of dynamic states in given VNET instance.
+ */
+static int
+sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ unsigned int nstates;
+
+ nstates = V_dyn_max;
+
+ error = sysctl_handle_int(oidp, &nstates, 0, req);
+ /* Read operation or some error */
+ if ((error != 0) || (req->newptr == NULL))
+ return (error);
+
+ V_dyn_max = nstates;
+ uma_zone_set_max(V_ipfw_dyn_rule_zone, V_dyn_max);
+
+ return (0);
+}
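+
+/*
+ * For illustration only: from userland this handler is typically
+ * reached through the net.inet.ip.fw.dyn_max sysctl, e.g.
+ * "sysctl net.inet.ip.fw.dyn_max=32768" (the value is an example).
+ */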
+
+/*
+ * Get current number of dynamic states in given VNET instance.
+ */
+static int
+sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ unsigned int nstates;
+
+ nstates = DYN_COUNT;
+
+ error = sysctl_handle_int(oidp, &nstates, 0, req);
+
+ return (error);
+}
+#endif
+
+/*
+ * Returns size of dynamic states in legacy format
+ */
+int
+ipfw_dyn_len(void)
+{
+
+ return (V_ipfw_dyn_v == NULL) ? 0 :
+ (DYN_COUNT * sizeof(ipfw_dyn_rule));
+}
+
+/*
+ * Returns number of dynamic states.
+ * Used by dump format v1 (current).
+ */
+int
+ipfw_dyn_get_count(void)
+{
+
+ return (V_ipfw_dyn_v == NULL) ? 0 : DYN_COUNT;
+}
+
+static void
+export_dyn_rule(ipfw_dyn_rule *src, ipfw_dyn_rule *dst)
+{
+
+ memcpy(dst, src, sizeof(*src));
+ memcpy(&(dst->rule), &(src->rule->rulenum), sizeof(src->rule->rulenum));
+ /*
+ * store set number into high word of
+ * dst->rule pointer.
+ */
+ memcpy((char *)&dst->rule + sizeof(src->rule->rulenum),
+ &(src->rule->set), sizeof(src->rule->set));
+ /*
+ * store a non-null value in "next".
+ * The userland code will interpret a
+ * NULL here as a marker
+ * for the last dynamic rule.
+ */
+ memcpy(&dst->next, &dst, sizeof(dst));
+ dst->expire =
+ TIME_LEQ(dst->expire, time_uptime) ? 0 : dst->expire - time_uptime;
+}
+
+/*
+ * Fills in the buffer given by @sd with dynamic states.
+ * Used by dump format v1 (current).
+ *
+ * Returns 0 on success.
+ */
+int
+ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd)
+{
+ ipfw_dyn_rule *p;
+ ipfw_obj_dyntlv *dst, *last;
+ ipfw_obj_ctlv *ctlv;
+ int i;
+ size_t sz;
+
+ if (V_ipfw_dyn_v == NULL)
+ return (0);
+
+ IPFW_UH_RLOCK_ASSERT(chain);
+
+ ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv));
+ if (ctlv == NULL)
+ return (ENOMEM);
+ sz = sizeof(ipfw_obj_dyntlv);
+ ctlv->head.type = IPFW_TLV_DYNSTATE_LIST;
+ ctlv->objsize = sz;
+ last = NULL;
+
+ for (i = 0 ; i < V_curr_dyn_buckets; i++) {
+ IPFW_BUCK_LOCK(i);
+ for (p = V_ipfw_dyn_v[i].head ; p != NULL; p = p->next) {
+ dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd, sz);
+ if (dst == NULL) {
+ IPFW_BUCK_UNLOCK(i);
+ return (ENOMEM);
+ }
+
+ export_dyn_rule(p, &dst->state);
+ dst->head.length = sz;
+ dst->head.type = IPFW_TLV_DYN_ENT;
+ last = dst;
+ }
+ IPFW_BUCK_UNLOCK(i);
+ }
+
+ if (last != NULL) /* mark last dynamic rule */
+ last->head.flags = IPFW_DF_LAST;
+
+ return (0);
+}
+
+/*
+ * Fill given buffer with dynamic states (legacy format).
+ * IPFW_UH_RLOCK has to be held while calling.
+ */
+void
+ipfw_get_dynamic(struct ip_fw_chain *chain, char **pbp, const char *ep)
+{
+ ipfw_dyn_rule *p, *last = NULL;
+ char *bp;
+ int i;
+
+ if (V_ipfw_dyn_v == NULL)
+ return;
+ bp = *pbp;
+
+ IPFW_UH_RLOCK_ASSERT(chain);
+
+ for (i = 0 ; i < V_curr_dyn_buckets; i++) {
+ IPFW_BUCK_LOCK(i);
+ for (p = V_ipfw_dyn_v[i].head ; p != NULL; p = p->next) {
+ if (bp + sizeof *p <= ep) {
+ ipfw_dyn_rule *dst =
+ (ipfw_dyn_rule *)bp;
+
+ export_dyn_rule(p, dst);
+ last = dst;
+ bp += sizeof(ipfw_dyn_rule);
+ }
+ }
+ IPFW_BUCK_UNLOCK(i);
+ }
+
+ if (last != NULL) /* mark last dynamic rule */
+ bzero(&last->next, sizeof(last));
+ *pbp = bp;
+}
+/* end of file */
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_eaction.c b/freebsd/sys/netpfil/ipfw/ip_fw_eaction.c
new file mode 100644
index 00000000..2c6ba8b9
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_eaction.c
@@ -0,0 +1,383 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2016 Yandex LLC
+ * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/hash.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/queue.h>
+#include <net/pfil.h>
+
+#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* struct ipfw_rule_ref */
+#include <netinet/ip_fw.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+
+#include <rtems/bsd/local/opt_ipfw.h>
+
+/*
+ * External actions support for ipfw.
+ *
+ * This code provides a KPI for loadable modules that implement
+ * handlers for external action opcodes in ipfw rules.
+ * A module should implement an opcode handler of type ipfw_eaction_t.
+ * This handler is called by ipfw_chk() when the O_EXTERNAL_ACTION
+ * opcode is matched. The handler must return a value that is used
+ * as the return value of ipfw_chk(), i.e. IP_FW_PASS or IP_FW_DENY
+ * (see ip_fw_private.h).
+ * The handler must also set its last argument: if it is zero, the
+ * search continues with the next rule; if it is non-zero, the
+ * search terminates.
+ *
+ * A module that implements an external action should register its
+ * handler and name with the ipfw_add_eaction() function, which
+ * returns an eaction_id for later use by the module.
+ *
+ * Additional information can be passed to an external action
+ * handler via the O_EXTERNAL_INSTANCE opcode. This opcode
+ * immediately follows the O_EXTERNAL_ACTION opcode, and its
+ * cmd->arg1 contains the index of the named object related to the
+ * instance of the external action.
+ *
+ * If an eaction module uses named instances, it should register
+ * opcode rewriting routines for the O_EXTERNAL_INSTANCE opcode. The
+ * classifier callback can look back into the O_EXTERNAL_ACTION opcode
+ * (it must be at (ipfw_insn *)(cmd - 1)). From arg1 of O_EXTERNAL_ACTION
+ * it can determine the eaction_id and compare it with its own.
+ * The macro IPFW_TLV_EACTION_NAME(eaction_id) can be used to determine
+ * the type of named_object related to an external action instance.
+ *
+ * On module unload the handler should be deregistered with the
+ * ipfw_del_eaction() function using the known eaction_id.
+ */
+
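+/*
+ * A minimal registration sketch, illustrative only: the handler, its
+ * behaviour and the name "sample" are hypothetical, while the
+ * ipfw_eaction_t signature and the ipfw_add_eaction()/ipfw_del_eaction()
+ * calls are the KPI described above.
+ *
+ *	static uint16_t sample_id;
+ *
+ *	static int
+ *	sample_handler(struct ip_fw_chain *ch, struct ip_fw_args *args,
+ *	    ipfw_insn *cmd, int *done)
+ *	{
+ *
+ *		*done = 1;		// terminate the rule search
+ *		return (IP_FW_PASS);	// accept the packet
+ *	}
+ *
+ *	// on module load:
+ *	sample_id = ipfw_add_eaction(ch, sample_handler, "sample");
+ *	// on module unload:
+ *	ipfw_del_eaction(ch, sample_id);
+ */
+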
+struct eaction_obj {
+ struct named_object no;
+ ipfw_eaction_t *handler;
+ char name[64];
+};
+
+#define EACTION_OBJ(ch, cmd) \
+ ((struct eaction_obj *)SRV_OBJECT((ch), (cmd)->arg1))
+
+#if 0
+#define EACTION_DEBUG(fmt, ...) do { \
+ printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \
+} while (0)
+#else
+#define EACTION_DEBUG(fmt, ...)
+#endif
+
+const char *default_eaction_typename = "drop";
+static int
+default_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args,
+ ipfw_insn *cmd, int *done)
+{
+
+ *done = 1; /* terminate the search */
+ return (IP_FW_DENY);
+}
+
+/*
+ * Opcode rewriting callbacks.
+ */
+static int
+eaction_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
+{
+
+ EACTION_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1);
+ *puidx = cmd->arg1;
+ *ptype = 0;
+ return (0);
+}
+
+static void
+eaction_update(ipfw_insn *cmd, uint16_t idx)
+{
+
+ cmd->arg1 = idx;
+ EACTION_DEBUG("opcode %d, arg1 -> %d", cmd->opcode, cmd->arg1);
+}
+
+static int
+eaction_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct named_object **pno)
+{
+ ipfw_obj_ntlv *ntlv;
+
+ if (ti->tlvs == NULL)
+ return (EINVAL);
+
+ /* Search ntlv in the buffer provided by user */
+ ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
+ IPFW_TLV_EACTION);
+ if (ntlv == NULL)
+ return (EINVAL);
+ EACTION_DEBUG("name %s, uidx %u, type %u", ntlv->name,
+ ti->uidx, ti->type);
+ /*
+ * Search named object with corresponding name.
+ * Since eaction objects are global - ignore the set value
+ * and use zero instead.
+ */
+ *pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch),
+ 0, IPFW_TLV_EACTION, ntlv->name);
+ if (*pno == NULL)
+ return (ESRCH);
+ return (0);
+}
+
+static struct named_object *
+eaction_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
+{
+
+ EACTION_DEBUG("kidx %u", idx);
+ return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx));
+}
+
+static struct opcode_obj_rewrite eaction_opcodes[] = {
+ {
+ .opcode = O_EXTERNAL_ACTION,
+ .etlv = IPFW_TLV_EACTION,
+ .classifier = eaction_classify,
+ .update = eaction_update,
+ .find_byname = eaction_findbyname,
+ .find_bykidx = eaction_findbykidx,
+ },
+};
+
+static int
+create_eaction_obj(struct ip_fw_chain *ch, ipfw_eaction_t handler,
+ const char *name, uint16_t *eaction_id)
+{
+ struct namedobj_instance *ni;
+ struct eaction_obj *obj;
+
+ IPFW_UH_UNLOCK_ASSERT(ch);
+
+ ni = CHAIN_TO_SRV(ch);
+ obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO);
+ obj->no.name = obj->name;
+ obj->no.etlv = IPFW_TLV_EACTION;
+ obj->handler = handler;
+ strlcpy(obj->name, name, sizeof(obj->name));
+
+ IPFW_UH_WLOCK(ch);
+ if (ipfw_objhash_lookup_name_type(ni, 0, IPFW_TLV_EACTION,
+ name) != NULL) {
+ /*
+ * Object is already created.
+ * We don't allow eactions with the same name.
+ */
+ IPFW_UH_WUNLOCK(ch);
+ free(obj, M_IPFW);
+ EACTION_DEBUG("External action with typename "
+ "'%s' already exists", name);
+ return (EEXIST);
+ }
+ if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ free(obj, M_IPFW);
+ EACTION_DEBUG("alloc_idx failed");
+ return (ENOSPC);
+ }
+ ipfw_objhash_add(ni, &obj->no);
+ IPFW_WLOCK(ch);
+ SRV_OBJECT(ch, obj->no.kidx) = obj;
+ IPFW_WUNLOCK(ch);
+ obj->no.refcnt++;
+ IPFW_UH_WUNLOCK(ch);
+
+ if (eaction_id != NULL)
+ *eaction_id = obj->no.kidx;
+ return (0);
+}
+
+static void
+destroy_eaction_obj(struct ip_fw_chain *ch, struct named_object *no)
+{
+ struct namedobj_instance *ni;
+ struct eaction_obj *obj;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ ni = CHAIN_TO_SRV(ch);
+ IPFW_WLOCK(ch);
+ obj = SRV_OBJECT(ch, no->kidx);
+ SRV_OBJECT(ch, no->kidx) = NULL;
+ IPFW_WUNLOCK(ch);
+ ipfw_objhash_del(ni, no);
+ ipfw_objhash_free_idx(ni, no->kidx);
+ free(obj, M_IPFW);
+}
+
+/*
+ * Resets all eaction opcodes to default handlers.
+ */
+static void
+reset_eaction_obj(struct ip_fw_chain *ch, uint16_t eaction_id)
+{
+ struct named_object *no;
+ struct ip_fw *rule;
+ ipfw_insn *cmd;
+ int i;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ no = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0,
+ IPFW_TLV_EACTION, default_eaction_typename);
+ if (no == NULL)
+ panic("Default external action handler is not found");
+ if (eaction_id == no->kidx)
+ panic("Wrong eaction_id");
+ EACTION_DEBUG("replace id %u with %u", eaction_id, no->kidx);
+ IPFW_WLOCK(ch);
+ for (i = 0; i < ch->n_rules; i++) {
+ rule = ch->map[i];
+ cmd = ACTION_PTR(rule);
+ if (cmd->opcode != O_EXTERNAL_ACTION)
+ continue;
+ if (cmd->arg1 != eaction_id)
+ continue;
+ cmd->arg1 = no->kidx; /* Set to default id */
+ /*
+ * XXX: we only bump refcount on default_eaction.
+ * Refcount on the original object will be just
+ * ignored on destroy. But on default_eaction it
+ * will be decremented on rule deletion.
+ */
+ no->refcnt++;
+ /*
+ * Since named_object related to this instance will be
+ * also destroyed, truncate the chain of opcodes to
+ * remove O_EXTERNAL_INSTANCE opcode.
+ */
+ if (rule->act_ofs < rule->cmd_len - 1) {
+ EACTION_DEBUG("truncate rule %d", rule->rulenum);
+ rule->cmd_len--;
+ }
+ }
+ IPFW_WUNLOCK(ch);
+}
+
+/*
+ * Initialize external actions framework.
+ * Create object with default eaction handler "drop".
+ */
+int
+ipfw_eaction_init(struct ip_fw_chain *ch, int first)
+{
+ int error;
+
+ error = create_eaction_obj(ch, default_eaction,
+ default_eaction_typename, NULL);
+ if (error != 0)
+ return (error);
+ IPFW_ADD_OBJ_REWRITER(first, eaction_opcodes);
+ EACTION_DEBUG("External actions support initialized");
+ return (0);
+}
+
+void
+ipfw_eaction_uninit(struct ip_fw_chain *ch, int last)
+{
+ struct namedobj_instance *ni;
+ struct named_object *no;
+
+ ni = CHAIN_TO_SRV(ch);
+
+ IPFW_UH_WLOCK(ch);
+ no = ipfw_objhash_lookup_name_type(ni, 0, IPFW_TLV_EACTION,
+ default_eaction_typename);
+ if (no != NULL)
+ destroy_eaction_obj(ch, no);
+ IPFW_UH_WUNLOCK(ch);
+ IPFW_DEL_OBJ_REWRITER(last, eaction_opcodes);
+ EACTION_DEBUG("External actions support uninitialized");
+}
+
+/*
+ * Registers an external action handler in the global array.
+ * On success it returns the eaction id, otherwise zero.
+ */
+uint16_t
+ipfw_add_eaction(struct ip_fw_chain *ch, ipfw_eaction_t handler,
+ const char *name)
+{
+ uint16_t eaction_id;
+
+ eaction_id = 0;
+ if (ipfw_check_object_name_generic(name) == 0) {
+ create_eaction_obj(ch, handler, name, &eaction_id);
+ EACTION_DEBUG("Registered external action '%s' with id %u",
+ name, eaction_id);
+ }
+ return (eaction_id);
+}
+
+/*
+ * Deregisters external action handler with id eaction_id.
+ */
+int
+ipfw_del_eaction(struct ip_fw_chain *ch, uint16_t eaction_id)
+{
+ struct named_object *no;
+
+ IPFW_UH_WLOCK(ch);
+ no = ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), eaction_id);
+ if (no == NULL || no->etlv != IPFW_TLV_EACTION) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EINVAL);
+ }
+ if (no->refcnt > 1)
+ reset_eaction_obj(ch, eaction_id);
+ EACTION_DEBUG("External action '%s' with id %u unregistered",
+ no->name, eaction_id);
+ destroy_eaction_obj(ch, no);
+ IPFW_UH_WUNLOCK(ch);
+ return (0);
+}
+
+int
+ipfw_run_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args,
+ ipfw_insn *cmd, int *done)
+{
+
+ return (EACTION_OBJ(ch, cmd)->handler(ch, args, cmd, done));
+}
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_iface.c b/freebsd/sys/netpfil/ipfw/ip_fw_iface.c
new file mode 100644
index 00000000..f8973a91
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_iface.c
@@ -0,0 +1,541 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2014 Yandex LLC.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Kernel interface tracking API.
+ *
+ */
+
+#include <rtems/bsd/local/opt_ipfw.h>
+#include <rtems/bsd/local/opt_inet.h>
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#include <rtems/bsd/local/opt_inet6.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/queue.h>
+#include <sys/eventhandler.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* struct ipfw_rule_ref */
+#include <netinet/ip_fw.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+
+#define CHAIN_TO_II(ch) ((struct namedobj_instance *)ch->ifcfg)
+
+#define DEFAULT_IFACES 128
+
+static void handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif,
+ uint16_t ifindex);
+static void handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif,
+ uint16_t ifindex);
+static int list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd);
+
+static struct ipfw_sopt_handler scodes[] = {
+ { IP_FW_XIFLIST, 0, HDIR_GET, list_ifaces },
+};
+
+/*
+ * FreeBSD Kernel interface.
+ */
+static void ipfw_kifhandler(void *arg, struct ifnet *ifp);
+static int ipfw_kiflookup(char *name);
+static void iface_khandler_register(void);
+static void iface_khandler_deregister(void);
+
+static eventhandler_tag ipfw_ifdetach_event, ipfw_ifattach_event;
+static int num_vnets = 0;
+static struct mtx vnet_mtx;
+
+/*
+ * Checks if kernel interface is contained in our tracked
+ * interface list and calls attach/detach handler.
+ */
+static void
+ipfw_kifhandler(void *arg, struct ifnet *ifp)
+{
+ struct ip_fw_chain *ch;
+ struct ipfw_iface *iif;
+ struct namedobj_instance *ii;
+ uintptr_t htype;
+
+ if (V_ipfw_vnet_ready == 0)
+ return;
+
+ ch = &V_layer3_chain;
+ htype = (uintptr_t)arg;
+
+ IPFW_UH_WLOCK(ch);
+ ii = CHAIN_TO_II(ch);
+ if (ii == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return;
+ }
+ iif = (struct ipfw_iface*)ipfw_objhash_lookup_name(ii, 0,
+ if_name(ifp));
+ if (iif != NULL) {
+ if (htype == 1)
+ handle_ifattach(ch, iif, ifp->if_index);
+ else
+ handle_ifdetach(ch, iif, ifp->if_index);
+ }
+ IPFW_UH_WUNLOCK(ch);
+}
+
+/*
+ * Reference current VNET as iface tracking API user.
+ * Registers interface tracking handlers for first VNET.
+ */
+static void
+iface_khandler_register()
+{
+ int create;
+
+ create = 0;
+
+ mtx_lock(&vnet_mtx);
+ if (num_vnets == 0)
+ create = 1;
+ num_vnets++;
+ mtx_unlock(&vnet_mtx);
+
+ if (create == 0)
+ return;
+
+ printf("IPFW: starting up interface tracker\n");
+
+ ipfw_ifdetach_event = EVENTHANDLER_REGISTER(
+ ifnet_departure_event, ipfw_kifhandler, NULL,
+ EVENTHANDLER_PRI_ANY);
+ ipfw_ifattach_event = EVENTHANDLER_REGISTER(
+ ifnet_arrival_event, ipfw_kifhandler, (void*)((uintptr_t)1),
+ EVENTHANDLER_PRI_ANY);
+}
+
+/*
+ *
+ * Detach interface event handlers on last VNET instance
+ * detach.
+ */
+static void
+iface_khandler_deregister()
+{
+ int destroy;
+
+ destroy = 0;
+ mtx_lock(&vnet_mtx);
+ if (num_vnets == 1)
+ destroy = 1;
+ num_vnets--;
+ mtx_unlock(&vnet_mtx);
+
+ if (destroy == 0)
+ return;
+
+ EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
+ ipfw_ifattach_event);
+ EVENTHANDLER_DEREGISTER(ifnet_departure_event,
+ ipfw_ifdetach_event);
+}
+
+/*
+ * Retrieves ifindex for given @name.
+ *
+ * Returns ifindex or 0.
+ */
+static int
+ipfw_kiflookup(char *name)
+{
+ struct ifnet *ifp;
+ int ifindex;
+
+ ifindex = 0;
+
+ if ((ifp = ifunit_ref(name)) != NULL) {
+ ifindex = ifp->if_index;
+ if_rele(ifp);
+ }
+
+ return (ifindex);
+}
+
+/*
+ * Global ipfw startup hook.
+ * Since we perform lazy initialization, do nothing except
+ * mutex init.
+ */
+int
+ipfw_iface_init()
+{
+
+ mtx_init(&vnet_mtx, "IPFW ifhandler mtx", NULL, MTX_DEF);
+ IPFW_ADD_SOPT_HANDLER(1, scodes);
+ return (0);
+}
+
+/*
+ * Global ipfw destroy hook.
+ * Unregister khandlers iff init has been done.
+ */
+void
+ipfw_iface_destroy()
+{
+
+ IPFW_DEL_SOPT_HANDLER(1, scodes);
+ mtx_destroy(&vnet_mtx);
+}
+
+/*
+ * Perform actual init on internal request.
+ * Inits both namehash and global khandler.
+ */
+static void
+vnet_ipfw_iface_init(struct ip_fw_chain *ch)
+{
+ struct namedobj_instance *ii;
+
+ ii = ipfw_objhash_create(DEFAULT_IFACES);
+ IPFW_UH_WLOCK(ch);
+ if (ch->ifcfg == NULL) {
+ ch->ifcfg = ii;
+ ii = NULL;
+ }
+ IPFW_UH_WUNLOCK(ch);
+
+ if (ii != NULL) {
+ /* Already initialized. Free namehash. */
+ ipfw_objhash_destroy(ii);
+ } else {
+ /* We're the first ones. Init kernel hooks. */
+ iface_khandler_register();
+ }
+}
+
+static int
+destroy_iface(struct namedobj_instance *ii, struct named_object *no,
+ void *arg)
+{
+
+ /* Assume all consumers have been already detached */
+ free(no, M_IPFW);
+ return (0);
+}
+
+/*
+ * Per-VNET ipfw detach hook.
+ *
+ */
+void
+vnet_ipfw_iface_destroy(struct ip_fw_chain *ch)
+{
+ struct namedobj_instance *ii;
+
+ IPFW_UH_WLOCK(ch);
+ ii = CHAIN_TO_II(ch);
+ ch->ifcfg = NULL;
+ IPFW_UH_WUNLOCK(ch);
+
+ if (ii != NULL) {
+ ipfw_objhash_foreach(ii, destroy_iface, ch);
+ ipfw_objhash_destroy(ii);
+ iface_khandler_deregister();
+ }
+}
+
+/*
+ * Notify the subsystem that we are interested in tracking
+ * interface @name. This function has to be called without
+ * holding any locks to permit allocating the necessary states
+ * for proper interface tracking.
+ *
+ * Returns 0 on success.
+ */
+int
+ipfw_iface_ref(struct ip_fw_chain *ch, char *name,
+ struct ipfw_ifc *ic)
+{
+ struct namedobj_instance *ii;
+ struct ipfw_iface *iif, *tmp;
+
+ if (strlen(name) >= sizeof(iif->ifname))
+ return (EINVAL);
+
+ IPFW_UH_WLOCK(ch);
+
+ ii = CHAIN_TO_II(ch);
+ if (ii == NULL) {
+
+ /*
+ * First request to subsystem.
+ * Let's perform init.
+ */
+ IPFW_UH_WUNLOCK(ch);
+ vnet_ipfw_iface_init(ch);
+ IPFW_UH_WLOCK(ch);
+ ii = CHAIN_TO_II(ch);
+ }
+
+ iif = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name);
+
+ if (iif != NULL) {
+ iif->no.refcnt++;
+ ic->iface = iif;
+ IPFW_UH_WUNLOCK(ch);
+ return (0);
+ }
+
+ IPFW_UH_WUNLOCK(ch);
+
+ /* Not found. Let's create one */
+ iif = malloc(sizeof(struct ipfw_iface), M_IPFW, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&iif->consumers);
+ iif->no.name = iif->ifname;
+ strlcpy(iif->ifname, name, sizeof(iif->ifname));
+
+ /*
+ * Ref & link to the list.
+ *
+ * We assume ifnet_arrival_event / ifnet_departure_event
+ * are not holding any locks.
+ */
+ iif->no.refcnt = 1;
+ IPFW_UH_WLOCK(ch);
+
+ tmp = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name);
+ if (tmp != NULL) {
+ /* Interface has been created since unlock. Ref and return */
+ tmp->no.refcnt++;
+ ic->iface = tmp;
+ IPFW_UH_WUNLOCK(ch);
+ free(iif, M_IPFW);
+ return (0);
+ }
+
+ iif->ifindex = ipfw_kiflookup(name);
+ if (iif->ifindex != 0)
+ iif->resolved = 1;
+
+ ipfw_objhash_add(ii, &iif->no);
+ ic->iface = iif;
+
+ IPFW_UH_WUNLOCK(ch);
+
+ return (0);
+}
+
+/*
+ * Adds @ic to the list of iif interface consumers.
+ * Must be called while holding both the UH and regular write locks.
+ * Callback may be immediately called (if interface exists).
+ */
+void
+ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic)
+{
+ struct ipfw_iface *iif;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+ IPFW_WLOCK_ASSERT(ch);
+
+ iif = ic->iface;
+
+ TAILQ_INSERT_TAIL(&iif->consumers, ic, next);
+ if (iif->resolved != 0)
+ ic->cb(ch, ic->cbdata, iif->ifindex);
+}
+
+/*
+ * Unlinks interface tracker object @ic from interface.
+ * Must be called while holding UH lock.
+ */
+void
+ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic)
+{
+ struct ipfw_iface *iif;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ iif = ic->iface;
+ TAILQ_REMOVE(&iif->consumers, ic, next);
+}
+
+/*
+ * Unreference interface specified by @ic.
+ * Must be called while holding UH lock.
+ */
+void
+ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic)
+{
+ struct ipfw_iface *iif;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ iif = ic->iface;
+ ic->iface = NULL;
+
+ iif->no.refcnt--;
+ /* TODO: check for references & delete */
+}
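+
+/*
+ * A minimal consumer-side sketch, illustrative only: the callback,
+ * its cbdata and the interface name "em0" are hypothetical, while the
+ * ipfw_iface_*() calls and their locking requirements are the ones
+ * documented above.
+ *
+ *	struct ipfw_ifc ic;
+ *
+ *	ic.cb = my_ifindex_cb;	// void (cb)(chain, cbdata, ifindex)
+ *	ic.cbdata = my_obj;
+ *
+ *	// no locks held: may allocate tracking state
+ *	ipfw_iface_ref(ch, "em0", &ic);
+ *
+ *	IPFW_UH_WLOCK(ch);
+ *	IPFW_WLOCK(ch);
+ *	ipfw_iface_add_notify(ch, &ic);	// may invoke cb immediately
+ *	IPFW_WUNLOCK(ch);
+ *	IPFW_UH_WUNLOCK(ch);
+ *
+ *	// teardown, with the UH lock held
+ *	IPFW_UH_WLOCK(ch);
+ *	ipfw_iface_del_notify(ch, &ic);
+ *	ipfw_iface_unref(ch, &ic);
+ *	IPFW_UH_WUNLOCK(ch);
+ */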
+
+/*
+ * Interface arrival handler.
+ */
+static void
+handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif,
+ uint16_t ifindex)
+{
+ struct ipfw_ifc *ic;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ iif->gencnt++;
+ iif->resolved = 1;
+ iif->ifindex = ifindex;
+
+ IPFW_WLOCK(ch);
+ TAILQ_FOREACH(ic, &iif->consumers, next)
+ ic->cb(ch, ic->cbdata, iif->ifindex);
+ IPFW_WUNLOCK(ch);
+}
+
+/*
+ * Interface departure handler.
+ */
+static void
+handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif,
+ uint16_t ifindex)
+{
+ struct ipfw_ifc *ic;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ IPFW_WLOCK(ch);
+ TAILQ_FOREACH(ic, &iif->consumers, next)
+ ic->cb(ch, ic->cbdata, 0);
+ IPFW_WUNLOCK(ch);
+
+ iif->gencnt++;
+ iif->resolved = 0;
+ iif->ifindex = 0;
+}
+
+struct dump_iface_args {
+ struct ip_fw_chain *ch;
+ struct sockopt_data *sd;
+};
+
+static int
+export_iface_internal(struct namedobj_instance *ii, struct named_object *no,
+ void *arg)
+{
+ ipfw_iface_info *i;
+ struct dump_iface_args *da;
+ struct ipfw_iface *iif;
+
+ da = (struct dump_iface_args *)arg;
+
+ i = (ipfw_iface_info *)ipfw_get_sopt_space(da->sd, sizeof(*i));
+ KASSERT(i != NULL, ("previously checked buffer is not enough"));
+
+ iif = (struct ipfw_iface *)no;
+
+ strlcpy(i->ifname, iif->ifname, sizeof(i->ifname));
+ if (iif->resolved)
+ i->flags |= IPFW_IFFLAG_RESOLVED;
+ i->ifindex = iif->ifindex;
+ i->refcnt = iif->no.refcnt;
+ i->gencnt = iif->gencnt;
+ return (0);
+}
+
+/*
+ * Lists all interfaces currently tracked by ipfw.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
+ * Reply: [ ipfw_obj_lheader ipfw_iface_info x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ struct namedobj_instance *ii;
+ struct _ipfw_obj_lheader *olh;
+ struct dump_iface_args da;
+ uint32_t count, size;
+
+ olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
+ if (olh == NULL)
+ return (EINVAL);
+ if (sd->valsize < olh->size)
+ return (EINVAL);
+
+ IPFW_UH_RLOCK(ch);
+ ii = CHAIN_TO_II(ch);
+ if (ii != NULL)
+ count = ipfw_objhash_count(ii);
+ else
+ count = 0;
+ size = count * sizeof(ipfw_iface_info) + sizeof(ipfw_obj_lheader);
+
+ /* Fill in header regardless of buffer size */
+ olh->count = count;
+ olh->objsize = sizeof(ipfw_iface_info);
+
+ if (size > olh->size) {
+ olh->size = size;
+ IPFW_UH_RUNLOCK(ch);
+ return (ENOMEM);
+ }
+ olh->size = size;
+
+ da.ch = ch;
+ da.sd = sd;
+
+ if (ii != NULL)
+ ipfw_objhash_foreach(ii, export_iface_internal, &da);
+ IPFW_UH_RUNLOCK(ch);
+
+ return (0);
+}
+
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_log.c b/freebsd/sys/netpfil/ipfw/ip_fw_log.c
index 60b0df7d..658e1256 100644
--- a/freebsd/sys/netpfil/ipfw/ip_fw_log.c
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_log.c
@@ -41,16 +41,15 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
-#include <sys/mbuf.h>
#include <sys/kernel.h>
+#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <net/ethernet.h> /* for ETHERTYPE_IP */
#include <net/if.h>
+#include <net/if_var.h>
#include <net/vnet.h>
-#include <net/if_types.h> /* for IFT_ETHER */
-#include <net/bpf.h> /* for BPF */
#include <netinet/in.h>
#include <netinet/ip.h>
@@ -83,111 +82,48 @@ __FBSDID("$FreeBSD$");
#define ICMP(p) ((struct icmphdr *)(p))
#define ICMP6(p) ((struct icmp6_hdr *)(p))
+#ifdef __APPLE__
+#undef snprintf
+#define snprintf sprintf
+#define SNPARGS(buf, len) buf + len
+#define SNP(buf) buf
+#else /* !__APPLE__ */
#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
#define SNP(buf) buf, sizeof(buf)
+#endif /* !__APPLE__ */
-#ifdef WITHOUT_BPF
-void
-ipfw_log_bpf(int onoff)
-{
-}
-#else /* !WITHOUT_BPF */
-static struct ifnet *log_if; /* hook to attach to bpf */
-
-/* we use this dummy function for all ifnet callbacks */
-static int
-log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
-{
- return EINVAL;
-}
-
-static int
-ipfw_log_output(struct ifnet *ifp, struct mbuf *m,
- struct sockaddr *dst, struct route *ro)
-{
- if (m != NULL)
- m_freem(m);
- return EINVAL;
-}
-
-static void
-ipfw_log_start(struct ifnet* ifp)
-{
- panic("ipfw_log_start() must not be called");
-}
-
-static const u_char ipfwbroadcastaddr[6] =
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
-
-void
-ipfw_log_bpf(int onoff)
-{
- struct ifnet *ifp;
-
- if (onoff) {
- if (log_if)
- return;
- ifp = if_alloc(IFT_ETHER);
- if (ifp == NULL)
- return;
- if_initname(ifp, "ipfw", 0);
- ifp->if_mtu = 65536;
- ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
- ifp->if_init = (void *)log_dummy;
- ifp->if_ioctl = log_dummy;
- ifp->if_start = ipfw_log_start;
- ifp->if_output = ipfw_log_output;
- ifp->if_addrlen = 6;
- ifp->if_hdrlen = 14;
- if_attach(ifp);
- ifp->if_broadcastaddr = ipfwbroadcastaddr;
- ifp->if_baudrate = IF_Mbps(10);
- bpfattach(ifp, DLT_EN10MB, 14);
- log_if = ifp;
- } else {
- if (log_if) {
- ether_ifdetach(log_if);
- if_free(log_if);
- }
- log_if = NULL;
- }
-}
-#endif /* !WITHOUT_BPF */
-
+#define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f)
/*
* We enter here when we have a rule with O_LOG.
* XXX this function alone takes about 2Kbytes of code!
*/
void
-ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
- struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
- struct ip *ip)
+ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen,
+ struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif,
+ u_short offset, uint32_t tablearg, struct ip *ip)
{
char *action;
int limit_reached = 0;
char action2[92], proto[128], fragment[32];
if (V_fw_verbose == 0) {
-#ifndef WITHOUT_BPF
-
- if (log_if == NULL || log_if->if_bpf == NULL)
- return;
-
if (args->eh) /* layer2, use orig hdr */
- BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m);
+ ipfw_bpf_mtap2(args->eh, ETHER_HDR_LEN, m);
else {
/* Add fake header. Later we will store
* more info in the header.
*/
if (ip->ip_v == 4)
- BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m);
- else if (ip->ip_v == 6)
- BPF_MTAP2(log_if, "DDDDDDSSSSSS\x86\xdd", ETHER_HDR_LEN, m);
+ ipfw_bpf_mtap2("DDDDDDSSSSSS\x08\x00",
+ ETHER_HDR_LEN, m);
+ else if (ip->ip_v == 6)
+ ipfw_bpf_mtap2("DDDDDDSSSSSS\x86\xdd",
+ ETHER_HDR_LEN, m);
else
/* Obviously bogus EtherType. */
- BPF_MTAP2(log_if, "DDDDDDSSSSSS\xff\xff", ETHER_HDR_LEN, m);
+ ipfw_bpf_mtap2("DDDDDDSSSSSS\xff\xff",
+ ETHER_HDR_LEN, m);
}
-#endif /* !WITHOUT_BPF */
return;
}
/* the old 'log' function */
@@ -254,27 +190,27 @@ ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
break;
case O_DIVERT:
snprintf(SNPARGS(action2, 0), "Divert %d",
- cmd->arg1);
+ TARG(cmd->arg1, divert));
break;
case O_TEE:
snprintf(SNPARGS(action2, 0), "Tee %d",
- cmd->arg1);
+ TARG(cmd->arg1, divert));
break;
case O_SETFIB:
snprintf(SNPARGS(action2, 0), "SetFib %d",
- IP_FW_ARG_TABLEARG(cmd->arg1));
+ TARG(cmd->arg1, fib) & 0x7FFF);
break;
case O_SKIPTO:
snprintf(SNPARGS(action2, 0), "SkipTo %d",
- IP_FW_ARG_TABLEARG(cmd->arg1));
+ TARG(cmd->arg1, skipto));
break;
case O_PIPE:
snprintf(SNPARGS(action2, 0), "Pipe %d",
- IP_FW_ARG_TABLEARG(cmd->arg1));
+ TARG(cmd->arg1, pipe));
break;
case O_QUEUE:
snprintf(SNPARGS(action2, 0), "Queue %d",
- IP_FW_ARG_TABLEARG(cmd->arg1));
+ TARG(cmd->arg1, pipe));
break;
case O_FORWARD_IP: {
ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
@@ -435,7 +371,7 @@ ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
#ifdef INET6
if (IS_IP6_FLOW_ID(&(args->f_id))) {
- if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
+ if (offset || ip6f_mf)
snprintf(SNPARGS(fragment, 0),
" (frag %08x:%d@%d%s)",
args->f_id.extra,
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_nat.c b/freebsd/sys/netpfil/ipfw/ip_fw_nat.c
index 5d4dcc9f..58bc1f3c 100644
--- a/freebsd/sys/netpfil/ipfw/ip_fw_nat.c
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_nat.c
@@ -33,17 +33,18 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
+#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <rtems/bsd/sys/lock.h>
#include <sys/module.h>
#include <sys/rwlock.h>
-
-#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */
+#include <sys/rmlock.h>
#include <netinet/libalias/alias.h>
#include <netinet/libalias/alias_local.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
@@ -55,6 +56,45 @@ __FBSDID("$FreeBSD$");
#include <machine/in_cksum.h> /* XXX for in_cksum */
+struct cfg_spool {
+ LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */
+ struct in_addr addr;
+ uint16_t port;
+};
+
+/* Nat redirect configuration. */
+struct cfg_redir {
+ LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */
+ uint16_t mode; /* type of redirect mode */
+ uint16_t proto; /* protocol: tcp/udp */
+ struct in_addr laddr; /* local ip address */
+ struct in_addr paddr; /* public ip address */
+ struct in_addr raddr; /* remote ip address */
+ uint16_t lport; /* local port */
+ uint16_t pport; /* public port */
+ uint16_t rport; /* remote port */
+ uint16_t pport_cnt; /* number of public ports */
+ uint16_t rport_cnt; /* number of remote ports */
+ struct alias_link **alink;
+ u_int16_t spool_cnt; /* number of entries in spool chain */
+ /* chain of spool instances */
+ LIST_HEAD(spool_chain, cfg_spool) spool_chain;
+};
+
+/* Nat configuration data struct. */
+struct cfg_nat {
+ /* chain of nat instances */
+ LIST_ENTRY(cfg_nat) _next;
+ int id; /* nat id */
+ struct in_addr ip; /* nat ip address */
+ struct libalias *lib; /* libalias instance */
+ int mode; /* aliasing mode */
+ int redir_cnt; /* number of entries in redir chain */
+ /* chain of redir instances */
+ LIST_HEAD(redir_chain, cfg_redir) redir_chain;
+ char if_name[IF_NAMESIZE]; /* interface name */
+};
+
static eventhandler_tag ifaddr_event_tag;
static void
@@ -66,8 +106,12 @@ ifaddr_change(void *arg __unused, struct ifnet *ifp)
KASSERT(curvnet == ifp->if_vnet,
("curvnet(%p) differs from iface vnet(%p)", curvnet, ifp->if_vnet));
+
+ if (V_ipfw_vnet_ready == 0 || V_ipfw_nat_ready == 0)
+ return;
+
chain = &V_layer3_chain;
- IPFW_WLOCK(chain);
+ IPFW_UH_WLOCK(chain);
/* Check every nat entry... */
LIST_FOREACH(ptr, &chain->nat, _next) {
/* ...using nic 'ifp->if_xname' as dynamic alias address. */
@@ -79,13 +123,15 @@ ifaddr_change(void *arg __unused, struct ifnet *ifp)
continue;
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
+ IPFW_WLOCK(chain);
ptr->ip = ((struct sockaddr_in *)
(ifa->ifa_addr))->sin_addr;
LibAliasSetAddress(ptr->lib, ptr->ip);
+ IPFW_WUNLOCK(chain);
}
if_addr_runlock(ifp);
}
- IPFW_WUNLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
}
/*
@@ -117,11 +163,11 @@ del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
LIST_FOREACH_SAFE(r, head, _next, tmp_r) {
num = 1; /* Number of alias_link to delete. */
switch (r->mode) {
- case REDIR_PORT:
+ case NAT44_REDIR_PORT:
num = r->pport_cnt;
/* FALLTHROUGH */
- case REDIR_ADDR:
- case REDIR_PROTO:
+ case NAT44_REDIR_ADDR:
+ case NAT44_REDIR_PROTO:
/* Delete all libalias redirect entry. */
for (i = 0; i < num; i++)
LibAliasRedirectDelete(n->lib, r->alink[i]);
@@ -142,27 +188,41 @@ del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
}
}
-static void
+static int
add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
{
- struct cfg_redir *r, *ser_r;
- struct cfg_spool *s, *ser_s;
+ struct cfg_redir *r;
+ struct cfg_spool *s;
+ struct nat44_cfg_redir *ser_r;
+ struct nat44_cfg_spool *ser_s;
+
int cnt, off, i;
for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
- ser_r = (struct cfg_redir *)&buf[off];
- r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
- memcpy(r, ser_r, SOF_REDIR);
+ ser_r = (struct nat44_cfg_redir *)&buf[off];
+ r = malloc(sizeof(*r), M_IPFW, M_WAITOK | M_ZERO);
+ r->mode = ser_r->mode;
+ r->laddr = ser_r->laddr;
+ r->paddr = ser_r->paddr;
+ r->raddr = ser_r->raddr;
+ r->lport = ser_r->lport;
+ r->pport = ser_r->pport;
+ r->rport = ser_r->rport;
+ r->pport_cnt = ser_r->pport_cnt;
+ r->rport_cnt = ser_r->rport_cnt;
+ r->proto = ser_r->proto;
+ r->spool_cnt = ser_r->spool_cnt;
+ //memcpy(r, ser_r, SOF_REDIR);
LIST_INIT(&r->spool_chain);
- off += SOF_REDIR;
+ off += sizeof(struct nat44_cfg_redir);
r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt,
M_IPFW, M_WAITOK | M_ZERO);
switch (r->mode) {
- case REDIR_ADDR:
+ case NAT44_REDIR_ADDR:
r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr,
r->paddr);
break;
- case REDIR_PORT:
+ case NAT44_REDIR_PORT:
for (i = 0 ; i < r->pport_cnt; i++) {
/* If remotePort is all ports, set it to 0. */
u_short remotePortCopy = r->rport + i;
@@ -178,7 +238,7 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
}
}
break;
- case REDIR_PROTO:
+ case NAT44_REDIR_PROTO:
r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr,
r->raddr, r->paddr, r->proto);
break;
@@ -186,25 +246,41 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
printf("unknown redirect mode: %u\n", r->mode);
break;
}
- /* XXX perhaps return an error instead of panic ? */
- if (r->alink[0] == NULL)
- panic("LibAliasRedirect* returned NULL");
+ if (r->alink[0] == NULL) {
+ printf("LibAliasRedirect* returned NULL\n");
+ free(r->alink, M_IPFW);
+ free(r, M_IPFW);
+ return (EINVAL);
+ }
/* LSNAT handling. */
for (i = 0; i < r->spool_cnt; i++) {
- ser_s = (struct cfg_spool *)&buf[off];
- s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
- memcpy(s, ser_s, SOF_SPOOL);
+ ser_s = (struct nat44_cfg_spool *)&buf[off];
+ s = malloc(sizeof(*s), M_IPFW, M_WAITOK | M_ZERO);
+ s->addr = ser_s->addr;
+ s->port = ser_s->port;
LibAliasAddServer(ptr->lib, r->alink[0],
s->addr, htons(s->port));
- off += SOF_SPOOL;
+ off += sizeof(struct nat44_cfg_spool);
/* Hook spool entry. */
LIST_INSERT_HEAD(&r->spool_chain, s, _next);
}
/* And finally hook this redir entry. */
LIST_INSERT_HEAD(&ptr->redir_chain, r, _next);
}
+
+ return (0);
+}
+
+static void
+free_nat_instance(struct cfg_nat *ptr)
+{
+
+ del_redir_spool_cfg(ptr, &ptr->redir_chain);
+ LibAliasUninit(ptr->lib);
+ free(ptr, M_IPFW);
}
+
/*
* ipfw_nat - perform mbuf header translation.
*
@@ -345,11 +421,11 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
if (ldt) {
struct tcphdr *th;
struct udphdr *uh;
- u_short cksum;
+ uint16_t ip_len, cksum;
- ip->ip_len = ntohs(ip->ip_len);
+ ip_len = ntohs(ip->ip_len);
cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
- htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2)));
+ htons(ip->ip_p + ip_len - (ip->ip_hl << 2)));
switch (ip->ip_p) {
case IPPROTO_TCP:
@@ -375,7 +451,6 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
in_delayed_cksum(mcl);
mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
- ip->ip_len = htons(ip->ip_len);
}
args->m = mcl;
return (IP_FW_NAT);
@@ -393,60 +468,68 @@ lookup_nat(struct nat_list *l, int nat_id)
return res;
}
-static int
-ipfw_nat_cfg(struct sockopt *sopt)
+static struct cfg_nat *
+lookup_nat_name(struct nat_list *l, char *name)
{
- struct cfg_nat *cfg, *ptr;
- char *buf;
- struct ip_fw_chain *chain = &V_layer3_chain;
- size_t len;
- int gencnt, error = 0;
+ struct cfg_nat *res;
+ int id;
+ char *errptr;
- len = sopt->sopt_valsize;
- buf = malloc(len, M_TEMP, M_WAITOK | M_ZERO);
- if ((error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat))) != 0)
- goto out;
+ id = strtol(name, &errptr, 10);
+ if (id == 0 || *errptr != '\0')
+ return (NULL);
- cfg = (struct cfg_nat *)buf;
- if (cfg->id < 0) {
- error = EINVAL;
- goto out;
+ LIST_FOREACH(res, l, _next) {
+ if (res->id == id)
+ break;
}
+ return (res);
+}
+
+/* IP_FW3 configuration routines */
+
+static void
+nat44_config(struct ip_fw_chain *chain, struct nat44_cfg_nat *ucfg)
+{
+ struct cfg_nat *ptr, *tcfg;
+ int gencnt;
/*
* Find/create nat rule.
*/
- IPFW_WLOCK(chain);
+ IPFW_UH_WLOCK(chain);
gencnt = chain->gencnt;
- ptr = lookup_nat(&chain->nat, cfg->id);
+ ptr = lookup_nat_name(&chain->nat, ucfg->name);
if (ptr == NULL) {
- IPFW_WUNLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
/* New rule: allocate and init new instance. */
ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_WAITOK | M_ZERO);
ptr->lib = LibAliasInit(NULL);
LIST_INIT(&ptr->redir_chain);
} else {
/* Entry already present: temporarily unhook it. */
+ IPFW_WLOCK(chain);
LIST_REMOVE(ptr, _next);
- flush_nat_ptrs(chain, cfg->id);
+ flush_nat_ptrs(chain, ptr->id);
IPFW_WUNLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
}
/*
- * Basic nat configuration.
+ * Basic nat (re)configuration.
*/
- ptr->id = cfg->id;
+ ptr->id = strtol(ucfg->name, NULL, 10);
/*
* XXX - what if this rule doesn't nat any ip and just
* redirect?
* do we set aliasaddress to 0.0.0.0?
*/
- ptr->ip = cfg->ip;
- ptr->redir_cnt = cfg->redir_cnt;
- ptr->mode = cfg->mode;
- LibAliasSetMode(ptr->lib, cfg->mode, ~0);
+ ptr->ip = ucfg->ip;
+ ptr->redir_cnt = ucfg->redir_cnt;
+ ptr->mode = ucfg->mode;
+ strlcpy(ptr->if_name, ucfg->if_name, sizeof(ptr->if_name));
+ LibAliasSetMode(ptr->lib, ptr->mode, ~0);
LibAliasSetAddress(ptr->lib, ptr->ip);
- memcpy(ptr->if_name, cfg->if_name, IF_NAMESIZE);
/*
* Redir and LSNAT configuration.
@@ -454,16 +537,453 @@ ipfw_nat_cfg(struct sockopt *sopt)
/* Delete old cfgs. */
del_redir_spool_cfg(ptr, &ptr->redir_chain);
/* Add new entries. */
- add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr);
+ add_redir_spool_cfg((char *)(ucfg + 1), ptr);
+ IPFW_UH_WLOCK(chain);
- IPFW_WLOCK(chain);
/* Extra check to avoid race with another ipfw_nat_cfg() */
- if (gencnt != chain->gencnt &&
- ((cfg = lookup_nat(&chain->nat, ptr->id)) != NULL))
- LIST_REMOVE(cfg, _next);
+ tcfg = NULL;
+ if (gencnt != chain->gencnt)
+ tcfg = lookup_nat_name(&chain->nat, ucfg->name);
+ IPFW_WLOCK(chain);
+ if (tcfg != NULL)
+ LIST_REMOVE(tcfg, _next);
LIST_INSERT_HEAD(&chain->nat, ptr, _next);
+ IPFW_WUNLOCK(chain);
chain->gencnt++;
+
+ IPFW_UH_WUNLOCK(chain);
+
+ if (tcfg != NULL)
+ free_nat_instance(ptr);
+}
+
+/*
+ * Creates/configures a nat44 instance.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header nat44_cfg_nat .. ]
+ *
+ * Returns 0 on success
+ */
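+/*
+ * Illustrative expansion of the request layout, derived from
+ * add_redir_spool_cfg(): the area following nat44_cfg_nat holds
+ * redir_cnt nat44_cfg_redir entries, each immediately followed by
+ * its own spool_cnt nat44_cfg_spool entries.
+ */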
+static int
+nat44_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_header *oh;
+ struct nat44_cfg_nat *ucfg;
+ int id;
+ size_t read;
+ char *errptr;
+
+ /* Check minimum header size */
+ if (sd->valsize < (sizeof(*oh) + sizeof(*ucfg)))
+ return (EINVAL);
+
+ oh = (ipfw_obj_header *)sd->kbuf;
+
+ /* Basic length checks for TLVs */
+ if (oh->ntlv.head.length != sizeof(oh->ntlv))
+ return (EINVAL);
+
+ ucfg = (struct nat44_cfg_nat *)(oh + 1);
+
+ /* Check if name is properly terminated and looks like number */
+ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name))
+ return (EINVAL);
+ id = strtol(ucfg->name, &errptr, 10);
+ if (id == 0 || *errptr != '\0')
+ return (EINVAL);
+
+ read = sizeof(*oh) + sizeof(*ucfg);
+ /* Check number of redirs */
+ if (sd->valsize < read + ucfg->redir_cnt*sizeof(struct nat44_cfg_redir))
+ return (EINVAL);
+
+ nat44_config(chain, ucfg);
+ return (0);
+}
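/*
 * A minimal sketch of the buffer size nat44_cfg() checks for, assuming
 * the request layout described above: one ipfw_obj_header, one
 * nat44_cfg_nat, then redir_cnt nat44_cfg_redir entries (LSNAT spool
 * entries would follow and are not counted here).  The helper name
 * nat44_request_size() is hypothetical.
 */
static size_t
nat44_request_size(const struct nat44_cfg_nat *ucfg)
{

	return (sizeof(ipfw_obj_header) + sizeof(struct nat44_cfg_nat) +
	    ucfg->redir_cnt * sizeof(struct nat44_cfg_redir));
}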
+
+/*
+ * Destroys the given nat instance.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat44_destroy(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_header *oh;
+ struct cfg_nat *ptr;
+ ipfw_obj_ntlv *ntlv;
+
+ /* Check minimum header size */
+ if (sd->valsize < sizeof(*oh))
+ return (EINVAL);
+
+ oh = (ipfw_obj_header *)sd->kbuf;
+
+ /* Basic length checks for TLVs */
+ if (oh->ntlv.head.length != sizeof(oh->ntlv))
+ return (EINVAL);
+
+ ntlv = &oh->ntlv;
+ /* Check if name is properly terminated */
+ if (strnlen(ntlv->name, sizeof(ntlv->name)) == sizeof(ntlv->name))
+ return (EINVAL);
+
+ IPFW_UH_WLOCK(chain);
+ ptr = lookup_nat_name(&chain->nat, ntlv->name);
+ if (ptr == NULL) {
+ IPFW_UH_WUNLOCK(chain);
+ return (ESRCH);
+ }
+ IPFW_WLOCK(chain);
+ LIST_REMOVE(ptr, _next);
+ flush_nat_ptrs(chain, ptr->id);
IPFW_WUNLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
+
+ free_nat_instance(ptr);
+
+ return (0);
+}
+
+static void
+export_nat_cfg(struct cfg_nat *ptr, struct nat44_cfg_nat *ucfg)
+{
+
+ snprintf(ucfg->name, sizeof(ucfg->name), "%d", ptr->id);
+ ucfg->ip = ptr->ip;
+ ucfg->redir_cnt = ptr->redir_cnt;
+ ucfg->mode = ptr->mode;
+ strlcpy(ucfg->if_name, ptr->if_name, sizeof(ucfg->if_name));
+}
+
+/*
+ * Gets config for given nat instance
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header nat44_cfg_nat .. ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat44_get_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_header *oh;
+ struct nat44_cfg_nat *ucfg;
+ struct cfg_nat *ptr;
+ struct cfg_redir *r;
+ struct cfg_spool *s;
+ struct nat44_cfg_redir *ser_r;
+ struct nat44_cfg_spool *ser_s;
+ size_t sz;
+
+ sz = sizeof(*oh) + sizeof(*ucfg);
+ /* Check minimum header size */
+ if (sd->valsize < sz)
+ return (EINVAL);
+
+ oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
+
+ /* Basic length checks for TLVs */
+ if (oh->ntlv.head.length != sizeof(oh->ntlv))
+ return (EINVAL);
+
+ ucfg = (struct nat44_cfg_nat *)(oh + 1);
+
+ /* Check if name is properly terminated */
+ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name))
+ return (EINVAL);
+
+ IPFW_UH_RLOCK(chain);
+ ptr = lookup_nat_name(&chain->nat, ucfg->name);
+ if (ptr == NULL) {
+ IPFW_UH_RUNLOCK(chain);
+ return (ESRCH);
+ }
+
+ export_nat_cfg(ptr, ucfg);
+
+ /* Estimate memory amount */
+ sz = sizeof(ipfw_obj_header) + sizeof(struct nat44_cfg_nat);
+ LIST_FOREACH(r, &ptr->redir_chain, _next) {
+ sz += sizeof(struct nat44_cfg_redir);
+ LIST_FOREACH(s, &r->spool_chain, _next)
+ sz += sizeof(struct nat44_cfg_spool);
+ }
+
+ ucfg->size = sz;
+ if (sd->valsize < sz) {
+
+ /*
+ * Submitted buffer size is not enough.
+ * We've already filled in the @ucfg structure with
+ * relevant info including size, so we
+ * can return. Buffer will be flushed automatically.
+ */
+ IPFW_UH_RUNLOCK(chain);
+ return (ENOMEM);
+ }
+
+ /* Size OK, let's copy data */
+ LIST_FOREACH(r, &ptr->redir_chain, _next) {
+ ser_r = (struct nat44_cfg_redir *)ipfw_get_sopt_space(sd,
+ sizeof(*ser_r));
+ ser_r->mode = r->mode;
+ ser_r->laddr = r->laddr;
+ ser_r->paddr = r->paddr;
+ ser_r->raddr = r->raddr;
+ ser_r->lport = r->lport;
+ ser_r->pport = r->pport;
+ ser_r->rport = r->rport;
+ ser_r->pport_cnt = r->pport_cnt;
+ ser_r->rport_cnt = r->rport_cnt;
+ ser_r->proto = r->proto;
+ ser_r->spool_cnt = r->spool_cnt;
+
+ LIST_FOREACH(s, &r->spool_chain, _next) {
+ ser_s = (struct nat44_cfg_spool *)ipfw_get_sopt_space(
+ sd, sizeof(*ser_s));
+
+ ser_s->addr = s->addr;
+ ser_s->port = s->port;
+ }
+ }
+
+ IPFW_UH_RUNLOCK(chain);
+
+ return (0);
+}
+
+/*
+ * Lists all nat44 instances currently available in kernel.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ]
+ * Reply: [ ipfw_obj_lheader nat44_cfg_nat x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat44_list_nat(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_lheader *olh;
+ struct nat44_cfg_nat *ucfg;
+ struct cfg_nat *ptr;
+ int nat_count;
+
+ /* Check minimum header size */
+ if (sd->valsize < sizeof(ipfw_obj_lheader))
+ return (EINVAL);
+
+ olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh));
+ IPFW_UH_RLOCK(chain);
+ nat_count = 0;
+ LIST_FOREACH(ptr, &chain->nat, _next)
+ nat_count++;
+
+ olh->count = nat_count;
+ olh->objsize = sizeof(struct nat44_cfg_nat);
+ olh->size = sizeof(*olh) + olh->count * olh->objsize;
+
+ if (sd->valsize < olh->size) {
+ IPFW_UH_RUNLOCK(chain);
+ return (ENOMEM);
+ }
+
+ LIST_FOREACH(ptr, &chain->nat, _next) {
+ ucfg = (struct nat44_cfg_nat *)ipfw_get_sopt_space(sd,
+ sizeof(*ucfg));
+ export_nat_cfg(ptr, ucfg);
+ }
+
+ IPFW_UH_RUNLOCK(chain);
+
+ return (0);
+}
+
+/*
+ * Gets log for given nat instance
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header nat44_cfg_nat ]
+ * Reply: [ ipfw_obj_header nat44_cfg_nat LOGBUFFER ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat44_get_log(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_header *oh;
+ struct nat44_cfg_nat *ucfg;
+ struct cfg_nat *ptr;
+ void *pbuf;
+ size_t sz;
+
+ sz = sizeof(*oh) + sizeof(*ucfg);
+ /* Check minimum header size */
+ if (sd->valsize < sz)
+ return (EINVAL);
+
+ oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
+
+ /* Basic length checks for TLVs */
+ if (oh->ntlv.head.length != sizeof(oh->ntlv))
+ return (EINVAL);
+
+ ucfg = (struct nat44_cfg_nat *)(oh + 1);
+
+ /* Check if name is properly terminated */
+ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name))
+ return (EINVAL);
+
+ IPFW_UH_RLOCK(chain);
+ ptr = lookup_nat_name(&chain->nat, ucfg->name);
+ if (ptr == NULL) {
+ IPFW_UH_RUNLOCK(chain);
+ return (ESRCH);
+ }
+
+ if (ptr->lib->logDesc == NULL) {
+ IPFW_UH_RUNLOCK(chain);
+ return (ENOENT);
+ }
+
+ export_nat_cfg(ptr, ucfg);
+
+ /* Estimate memory amount */
+ ucfg->size = sizeof(struct nat44_cfg_nat) + LIBALIAS_BUF_SIZE;
+ if (sd->valsize < sz + sizeof(*oh)) {
+
+ /*
+ * Submitted buffer size is not enough.
+ * We've already filled in the @ucfg structure with
+ * relevant info including size, so we
+ * can return. Buffer will be flushed automatically.
+ */
+ IPFW_UH_RUNLOCK(chain);
+ return (ENOMEM);
+ }
+
+ pbuf = (void *)ipfw_get_sopt_space(sd, LIBALIAS_BUF_SIZE);
+ memcpy(pbuf, ptr->lib->logDesc, LIBALIAS_BUF_SIZE);
+
+ IPFW_UH_RUNLOCK(chain);
+
+ return (0);
+}
+
+static struct ipfw_sopt_handler scodes[] = {
+ { IP_FW_NAT44_XCONFIG, 0, HDIR_SET, nat44_cfg },
+ { IP_FW_NAT44_DESTROY, 0, HDIR_SET, nat44_destroy },
+ { IP_FW_NAT44_XGETCONFIG, 0, HDIR_GET, nat44_get_cfg },
+ { IP_FW_NAT44_LIST_NAT, 0, HDIR_GET, nat44_list_nat },
+ { IP_FW_NAT44_XGETLOG, 0, HDIR_GET, nat44_get_log },
+};
+
+
+/*
+ * Legacy configuration routines
+ */
+
+struct cfg_spool_legacy {
+ LIST_ENTRY(cfg_spool_legacy) _next;
+ struct in_addr addr;
+ u_short port;
+};
+
+struct cfg_redir_legacy {
+ LIST_ENTRY(cfg_redir) _next;
+ u_int16_t mode;
+ struct in_addr laddr;
+ struct in_addr paddr;
+ struct in_addr raddr;
+ u_short lport;
+ u_short pport;
+ u_short rport;
+ u_short pport_cnt;
+ u_short rport_cnt;
+ int proto;
+ struct alias_link **alink;
+ u_int16_t spool_cnt;
+ LIST_HEAD(, cfg_spool_legacy) spool_chain;
+};
+
+struct cfg_nat_legacy {
+ LIST_ENTRY(cfg_nat_legacy) _next;
+ int id;
+ struct in_addr ip;
+ char if_name[IF_NAMESIZE];
+ int mode;
+ struct libalias *lib;
+ int redir_cnt;
+ LIST_HEAD(, cfg_redir_legacy) redir_chain;
+};
+
+static int
+ipfw_nat_cfg(struct sockopt *sopt)
+{
+ struct cfg_nat_legacy *cfg;
+ struct nat44_cfg_nat *ucfg;
+ struct cfg_redir_legacy *rdir;
+ struct nat44_cfg_redir *urdir;
+ char *buf;
+ size_t len, len2;
+ int error, i;
+
+ len = sopt->sopt_valsize;
+ len2 = len + 128;
+
+ /*
+ * Allocate a 2x buffer to store the converted structures.
+ * The new redir_cfg has shrunk, so we're sure that
+ * the new buffer size is enough.
+ */
+ buf = malloc(roundup2(len, 8) + len2, M_TEMP, M_WAITOK | M_ZERO);
+ error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat_legacy));
+ if (error != 0)
+ goto out;
+
+ cfg = (struct cfg_nat_legacy *)buf;
+ if (cfg->id < 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ ucfg = (struct nat44_cfg_nat *)&buf[roundup2(len, 8)];
+ snprintf(ucfg->name, sizeof(ucfg->name), "%d", cfg->id);
+ strlcpy(ucfg->if_name, cfg->if_name, sizeof(ucfg->if_name));
+ ucfg->ip = cfg->ip;
+ ucfg->mode = cfg->mode;
+ ucfg->redir_cnt = cfg->redir_cnt;
+
+ if (len < sizeof(*cfg) + cfg->redir_cnt * sizeof(*rdir)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ urdir = (struct nat44_cfg_redir *)(ucfg + 1);
+ rdir = (struct cfg_redir_legacy *)(cfg + 1);
+ for (i = 0; i < cfg->redir_cnt; i++) {
+ urdir->mode = rdir->mode;
+ urdir->laddr = rdir->laddr;
+ urdir->paddr = rdir->paddr;
+ urdir->raddr = rdir->raddr;
+ urdir->lport = rdir->lport;
+ urdir->pport = rdir->pport;
+ urdir->rport = rdir->rport;
+ urdir->pport_cnt = rdir->pport_cnt;
+ urdir->rport_cnt = rdir->rport_cnt;
+ urdir->proto = rdir->proto;
+ urdir->spool_cnt = rdir->spool_cnt;
+
+ urdir++;
+ rdir++;
+ }
+
+ nat44_config(&V_layer3_chain, ucfg);
out:
free(buf, M_TEMP);
@@ -479,18 +999,18 @@ ipfw_nat_del(struct sockopt *sopt)
sooptcopyin(sopt, &i, sizeof i, sizeof i);
/* XXX validate i */
- IPFW_WLOCK(chain);
+ IPFW_UH_WLOCK(chain);
ptr = lookup_nat(&chain->nat, i);
if (ptr == NULL) {
- IPFW_WUNLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
return (EINVAL);
}
+ IPFW_WLOCK(chain);
LIST_REMOVE(ptr, _next);
flush_nat_ptrs(chain, i);
IPFW_WUNLOCK(chain);
- del_redir_spool_cfg(ptr, &ptr->redir_chain);
- LibAliasUninit(ptr->lib);
- free(ptr, M_IPFW);
+ IPFW_UH_WUNLOCK(chain);
+ free_nat_instance(ptr);
return (0);
}
@@ -499,28 +1019,31 @@ ipfw_nat_get_cfg(struct sockopt *sopt)
{
struct ip_fw_chain *chain = &V_layer3_chain;
struct cfg_nat *n;
+ struct cfg_nat_legacy *ucfg;
struct cfg_redir *r;
struct cfg_spool *s;
+ struct cfg_redir_legacy *ser_r;
+ struct cfg_spool_legacy *ser_s;
char *data;
int gencnt, nat_cnt, len, error;
nat_cnt = 0;
len = sizeof(nat_cnt);
- IPFW_RLOCK(chain);
+ IPFW_UH_RLOCK(chain);
retry:
gencnt = chain->gencnt;
/* Estimate memory amount */
LIST_FOREACH(n, &chain->nat, _next) {
nat_cnt++;
- len += sizeof(struct cfg_nat);
+ len += sizeof(struct cfg_nat_legacy);
LIST_FOREACH(r, &n->redir_chain, _next) {
- len += sizeof(struct cfg_redir);
+ len += sizeof(struct cfg_redir_legacy);
LIST_FOREACH(s, &r->spool_chain, _next)
- len += sizeof(struct cfg_spool);
+ len += sizeof(struct cfg_spool_legacy);
}
}
- IPFW_RUNLOCK(chain);
+ IPFW_UH_RUNLOCK(chain);
data = malloc(len, M_TEMP, M_WAITOK | M_ZERO);
bcopy(&nat_cnt, data, sizeof(nat_cnt));
@@ -528,25 +1051,43 @@ retry:
nat_cnt = 0;
len = sizeof(nat_cnt);
- IPFW_RLOCK(chain);
+ IPFW_UH_RLOCK(chain);
if (gencnt != chain->gencnt) {
free(data, M_TEMP);
goto retry;
}
/* Serialize all the data. */
LIST_FOREACH(n, &chain->nat, _next) {
- bcopy(n, &data[len], sizeof(struct cfg_nat));
- len += sizeof(struct cfg_nat);
+ ucfg = (struct cfg_nat_legacy *)&data[len];
+ ucfg->id = n->id;
+ ucfg->ip = n->ip;
+ ucfg->redir_cnt = n->redir_cnt;
+ ucfg->mode = n->mode;
+ strlcpy(ucfg->if_name, n->if_name, sizeof(ucfg->if_name));
+ len += sizeof(struct cfg_nat_legacy);
LIST_FOREACH(r, &n->redir_chain, _next) {
- bcopy(r, &data[len], sizeof(struct cfg_redir));
- len += sizeof(struct cfg_redir);
+ ser_r = (struct cfg_redir_legacy *)&data[len];
+ ser_r->mode = r->mode;
+ ser_r->laddr = r->laddr;
+ ser_r->paddr = r->paddr;
+ ser_r->raddr = r->raddr;
+ ser_r->lport = r->lport;
+ ser_r->pport = r->pport;
+ ser_r->rport = r->rport;
+ ser_r->pport_cnt = r->pport_cnt;
+ ser_r->rport_cnt = r->rport_cnt;
+ ser_r->proto = r->proto;
+ ser_r->spool_cnt = r->spool_cnt;
+ len += sizeof(struct cfg_redir_legacy);
LIST_FOREACH(s, &r->spool_chain, _next) {
- bcopy(s, &data[len], sizeof(struct cfg_spool));
- len += sizeof(struct cfg_spool);
+ ser_s = (struct cfg_spool_legacy *)&data[len];
+ ser_s->addr = s->addr;
+ ser_s->port = s->port;
+ len += sizeof(struct cfg_spool_legacy);
}
}
}
- IPFW_RUNLOCK(chain);
+ IPFW_UH_RUNLOCK(chain);
error = sooptcopyout(sopt, data, len);
free(data, M_TEMP);
@@ -561,6 +1102,7 @@ ipfw_nat_get_log(struct sockopt *sopt)
struct cfg_nat *ptr;
int i, size;
struct ip_fw_chain *chain;
+ IPFW_RLOCK_TRACKER;
chain = &V_layer3_chain;
@@ -609,14 +1151,12 @@ vnet_ipfw_nat_uninit(const void *arg __unused)
chain = &V_layer3_chain;
IPFW_WLOCK(chain);
+ V_ipfw_nat_ready = 0;
LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) {
LIST_REMOVE(ptr, _next);
- del_redir_spool_cfg(ptr, &ptr->redir_chain);
- LibAliasUninit(ptr->lib);
- free(ptr, M_IPFW);
+ free_nat_instance(ptr);
}
flush_nat_ptrs(chain, -1 /* flush all */);
- V_ipfw_nat_ready = 0;
IPFW_WUNLOCK(chain);
return (0);
}
@@ -632,6 +1172,7 @@ ipfw_nat_init(void)
ipfw_nat_del_ptr = ipfw_nat_del;
ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg;
ipfw_nat_get_log_ptr = ipfw_nat_get_log;
+ IPFW_ADD_SOPT_HANDLER(1, scodes);
ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change,
NULL, EVENTHANDLER_PRI_ANY);
@@ -643,6 +1184,7 @@ ipfw_nat_destroy(void)
EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag);
/* deregister ipfw_nat */
+ IPFW_DEL_SOPT_HANDLER(1, scodes);
ipfw_nat_ptr = NULL;
lookup_nat_ptr = NULL;
ipfw_nat_cfg_ptr = NULL;
@@ -677,14 +1219,14 @@ static moduledata_t ipfw_nat_mod = {
};
/* Define startup order. */
-#define IPFW_NAT_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN
-#define IPFW_NAT_MODEVENT_ORDER (SI_ORDER_ANY - 128)
+#define IPFW_NAT_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL
+#define IPFW_NAT_MODEVENT_ORDER (SI_ORDER_ANY - 128) /* after ipfw */
#define IPFW_NAT_MODULE_ORDER (IPFW_NAT_MODEVENT_ORDER + 1)
#define IPFW_NAT_VNET_ORDER (IPFW_NAT_MODEVENT_ORDER + 2)
DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, IPFW_NAT_SI_SUB_FIREWALL, SI_ORDER_ANY);
MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1);
-MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2);
+MODULE_DEPEND(ipfw_nat, ipfw, 3, 3, 3);
MODULE_VERSION(ipfw_nat, 1);
SYSINIT(ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER,
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_pfil.c b/freebsd/sys/netpfil/ipfw/ip_fw_pfil.c
index d2e1b448..59c13aa5 100644
--- a/freebsd/sys/netpfil/ipfw/ip_fw_pfil.c
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_pfil.c
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
#include <net/if.h>
#include <net/route.h>
+#include <net/ethernet.h>
#include <net/pfil.h>
#include <net/vnet.h>
@@ -60,6 +61,7 @@ __FBSDID("$FreeBSD$");
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
+#include <netinet6/scope6_var.h>
#endif
#include <netgraph/ng_ipfw.h>
@@ -76,26 +78,39 @@ static VNET_DEFINE(int, fw6_enable) = 1;
#define V_fw6_enable VNET(fw6_enable)
#endif
+static VNET_DEFINE(int, fwlink_enable) = 0;
+#define V_fwlink_enable VNET(fwlink_enable)
+
int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
/* Forward declarations. */
static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int);
+int ipfw_check_packet(void *, struct mbuf **, struct ifnet *, int,
+ struct inpcb *);
+int ipfw_check_frame(void *, struct mbuf **, struct ifnet *, int,
+ struct inpcb *);
#ifdef SYSCTL_NODE
SYSBEGIN(f1)
SYSCTL_DECL(_net_inet_ip_fw);
-SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
- ipfw_chg_hook, "I", "Enable ipfw");
+SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3,
+ &VNET_NAME(fw_enable), 0, ipfw_chg_hook, "I", "Enable ipfw");
#ifdef INET6
SYSCTL_DECL(_net_inet6_ip6_fw);
-SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
- ipfw_chg_hook, "I", "Enable ipfw+6");
+SYSCTL_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3,
+ &VNET_NAME(fw6_enable), 0, ipfw_chg_hook, "I", "Enable ipfw+6");
#endif /* INET6 */
+SYSCTL_DECL(_net_link_ether);
+SYSCTL_PROC(_net_link_ether, OID_AUTO, ipfw,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3,
+ &VNET_NAME(fwlink_enable), 0, ipfw_chg_hook, "I",
+ "Pass ether pkts through firewall");
+
SYSEND
#endif /* SYSCTL_NODE */
@@ -106,7 +121,7 @@ SYSEND
* The packet may be consumed.
*/
int
-ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+ipfw_check_packet(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
struct inpcb *inp)
{
struct ip_fw_args args;
@@ -114,10 +129,6 @@ ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
int ipfw;
int ret;
- /* all the processing now uses ip_len in net format */
- if (mtod(*m0, struct ip *)->ip_v == 4)
- SET_NET_IPLEN(mtod(*m0, struct ip *));
-
/* convert dir to IPFW values */
dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT;
bzero(&args, sizeof(args));
@@ -131,11 +142,8 @@ again:
if (tag != NULL) {
args.rule = *((struct ipfw_rule_ref *)(tag+1));
m_tag_delete(*m0, tag);
- if (args.rule.info & IPFW_ONEPASS) {
- if (mtod(*m0, struct ip *)->ip_v == 4)
- SET_HOST_IPLEN(mtod(*m0, struct ip *));
+ if (args.rule.info & IPFW_ONEPASS)
return (0);
- }
}
args.m = *m0;
@@ -192,8 +200,20 @@ again:
}
#ifdef INET6
if (args.next_hop6 != NULL) {
- bcopy(args.next_hop6, (fwd_tag+1), len);
- if (in6_localip(&args.next_hop6->sin6_addr))
+ struct sockaddr_in6 *sa6;
+
+ sa6 = (struct sockaddr_in6 *)(fwd_tag + 1);
+ bcopy(args.next_hop6, sa6, len);
+ /*
+ * If nh6 address is link-local we should convert
+ * it to kernel internal form before doing any
+ * comparisons.
+ */
+ if (sa6_embedscope(sa6, V_ip6_use_defzone) != 0) {
+ ret = EACCES;
+ break;
+ }
+ if (in6_localip(&sa6->sin6_addr))
(*m0)->m_flags |= M_FASTFWD_OURS;
(*m0)->m_flags |= M_IP6_NEXTHOP;
}
@@ -279,8 +299,112 @@ again:
FREE_PKT(*m0);
*m0 = NULL;
}
- if (*m0 && mtod(*m0, struct ip *)->ip_v == 4)
- SET_HOST_IPLEN(mtod(*m0, struct ip *));
+
+ return ret;
+}
+
+/*
+ * ipfw processing for ethernet packets (in and out).
+ * Interface is NULL from ether_demux, and ifp from
+ * ether_output_frame.
+ */
+int
+ipfw_check_frame(void *arg, struct mbuf **m0, struct ifnet *dst, int dir,
+ struct inpcb *inp)
+{
+ struct ether_header *eh;
+ struct ether_header save_eh;
+ struct mbuf *m;
+ int i, ret;
+ struct ip_fw_args args;
+ struct m_tag *mtag;
+
+ /* fetch start point from rule, if any */
+ mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL);
+ if (mtag == NULL) {
+ args.rule.slot = 0;
+ } else {
+ /* dummynet packet, already partially processed */
+ struct ipfw_rule_ref *r;
+
+ /* XXX can we free it after use ? */
+ mtag->m_tag_id = PACKET_TAG_NONE;
+ r = (struct ipfw_rule_ref *)(mtag + 1);
+ if (r->info & IPFW_ONEPASS)
+ return (0);
+ args.rule = *r;
+ }
+
+ /* I need some amount of data to be contiguous */
+ m = *m0;
+ i = min(m->m_pkthdr.len, max_protohdr);
+ if (m->m_len < i) {
+ m = m_pullup(m, i);
+ if (m == NULL) {
+ *m0 = m;
+ return (0);
+ }
+ }
+ eh = mtod(m, struct ether_header *);
+ save_eh = *eh; /* save copy for restore below */
+ m_adj(m, ETHER_HDR_LEN); /* strip ethernet header */
+
+ args.m = m; /* the packet we are looking at */
+ args.oif = dir == PFIL_OUT ? dst: NULL; /* destination, if any */
+ args.next_hop = NULL; /* we do not support forward yet */
+ args.next_hop6 = NULL; /* we do not support forward yet */
+ args.eh = &save_eh; /* MAC header for bridged/MAC packets */
+ args.inp = NULL; /* used by ipfw uid/gid/jail rules */
+ i = ipfw_chk(&args);
+ m = args.m;
+ if (m != NULL) {
+ /*
+ * Restore Ethernet header, as needed, in case the
+ * mbuf chain was replaced by ipfw.
+ */
+ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
+ if (m == NULL) {
+ *m0 = NULL;
+ return (0);
+ }
+ if (eh != mtod(m, struct ether_header *))
+ bcopy(&save_eh, mtod(m, struct ether_header *),
+ ETHER_HDR_LEN);
+ }
+ *m0 = m;
+
+ ret = 0;
+ /* Check result of ipfw_chk() */
+ switch (i) {
+ case IP_FW_PASS:
+ break;
+
+ case IP_FW_DENY:
+ ret = EACCES;
+ break; /* i.e. drop */
+
+ case IP_FW_DUMMYNET:
+ ret = EACCES;
+ int dir;
+
+ if (ip_dn_io_ptr == NULL)
+ break; /* i.e. drop */
+
+ *m0 = NULL;
+ dir = PROTO_LAYER2 | (dst ? DIR_OUT : DIR_IN);
+ ip_dn_io_ptr(&m, dir, &args);
+ return 0;
+
+ default:
+ KASSERT(0, ("%s: unknown retval", __func__));
+ }
+
+ if (ret != 0) {
+ if (*m0)
+ FREE_PKT(*m0);
+ *m0 = NULL;
+ }
+
return ret;
}
@@ -303,7 +427,7 @@ ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule,
clone = *m0; /* use the original mbuf */
*m0 = NULL;
} else {
- clone = m_dup(*m0, M_DONTWAIT);
+ clone = m_dup(*m0, M_NOWAIT);
/* If we cannot duplicate the mbuf, we sacrifice the divert
* chain and continue with the tee-ed packet.
*/
@@ -325,7 +449,6 @@ ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule,
int hlen;
struct mbuf *reass;
- SET_HOST_IPLEN(ip); /* ip_reass wants host order */
reass = ip_reass(clone); /* Reassemble packet. */
if (reass == NULL)
return 0; /* not an error */
@@ -336,7 +459,6 @@ ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule,
*/
ip = mtod(reass, struct ip *);
hlen = ip->ip_hl << 2;
- SET_NET_IPLEN(ip);
ip->ip_sum = 0;
if (hlen == sizeof(struct ip))
ip->ip_sum = in_cksum_hdr(ip);
@@ -385,13 +507,16 @@ static int
ipfw_hook(int onoff, int pf)
{
struct pfil_head *pfh;
+ pfil_func_t hook_func;
pfh = pfil_head_get(PFIL_TYPE_AF, pf);
if (pfh == NULL)
return ENOENT;
+ hook_func = (pf == AF_LINK) ? ipfw_check_frame : ipfw_check_packet;
+
(void) (onoff ? pfil_add_hook : pfil_remove_hook)
- (ipfw_check_hook, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh);
+ (hook_func, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh);
return 0;
}
@@ -415,51 +540,50 @@ ipfw_attach_hooks(int arg)
printf("ipfw6_hook() error\n");
}
#endif
+ if (arg == 0) /* detach */
+ ipfw_hook(0, AF_LINK);
+ else if (V_fwlink_enable && ipfw_hook(1, AF_LINK) != 0) {
+ error = ENOENT;
+ printf("ipfw_link_hook() error\n");
+ }
return error;
}
int
ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
{
- int enable;
- int oldenable;
+ int newval;
int error;
int af;
- if (arg1 == &VNET_NAME(fw_enable)) {
- enable = V_fw_enable;
+ if (arg1 == &V_fw_enable)
af = AF_INET;
- }
#ifdef INET6
- else if (arg1 == &VNET_NAME(fw6_enable)) {
- enable = V_fw6_enable;
+ else if (arg1 == &V_fw6_enable)
af = AF_INET6;
- }
#endif
+ else if (arg1 == &V_fwlink_enable)
+ af = AF_LINK;
else
return (EINVAL);
- oldenable = enable;
-
- error = sysctl_handle_int(oidp, &enable, 0, req);
+ newval = *(int *)arg1;
+ /* Handle sysctl change */
+ error = sysctl_handle_int(oidp, &newval, 0, req);
if (error)
return (error);
- enable = (enable) ? 1 : 0;
+ /* Formalize new value */
+ newval = (newval) ? 1 : 0;
- if (enable == oldenable)
+ if (*(int *)arg1 == newval)
return (0);
- error = ipfw_hook(enable, af);
+ error = ipfw_hook(newval, af);
if (error)
return (error);
- if (af == AF_INET)
- V_fw_enable = enable;
-#ifdef INET6
- else if (af == AF_INET6)
- V_fw6_enable = enable;
-#endif
+ *(int *)arg1 = newval;
return (0);
}
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_private.h b/freebsd/sys/netpfil/ipfw/ip_fw_private.h
index ceabf88d..3b483625 100644
--- a/freebsd/sys/netpfil/ipfw/ip_fw_private.h
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_private.h
@@ -66,14 +66,12 @@ enum {
*/
struct _ip6dn_args {
struct ip6_pktopts *opt_or;
- struct route_in6 ro_or;
int flags_or;
struct ip6_moptions *im6o_or;
struct ifnet *origifp_or;
struct ifnet *ifp_or;
struct sockaddr_in6 dst_or;
u_long mtu_or;
- struct route_in6 ro_pmtu_or;
};
@@ -104,7 +102,10 @@ struct ip_fw_args {
struct inpcb *inp;
struct _ip6dn_args dummypar; /* dummynet->ip6_output */
- struct sockaddr_in hopstore; /* store here if cannot use a pointer */
+ union { /* store here if cannot use a pointer */
+ struct sockaddr_in hopstore;
+ struct sockaddr_in6 hopstore6;
+ };
};
MALLOC_DECLARE(M_IPFW);
@@ -152,10 +153,13 @@ void ipfw_nat_destroy(void);
/* In ip_fw_log.c */
struct ip;
-void ipfw_log_bpf(int);
-void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
- struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
- struct ip *ip);
+struct ip_fw_chain;
+void ipfw_bpf_init(int);
+void ipfw_bpf_uninit(int);
+void ipfw_bpf_mtap2(void *, u_int, struct mbuf *);
+void ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen,
+ struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif,
+ u_short offset, uint32_t tablearg, struct ip *ip);
VNET_DECLARE(u_int64_t, norule_counter);
#define V_norule_counter VNET(norule_counter)
VNET_DECLARE(int, verbose_limit);
@@ -176,22 +180,26 @@ enum { /* result for matching dynamic rules */
* Eventually we may implement it with a callback on the function.
*/
struct ip_fw_chain;
-void ipfw_expire_dyn_rules(struct ip_fw_chain *, struct ip_fw *, int);
+struct sockopt_data;
+int ipfw_is_dyn_rule(struct ip_fw *rule);
+void ipfw_expire_dyn_rules(struct ip_fw_chain *, ipfw_range_tlv *);
void ipfw_dyn_unlock(ipfw_dyn_rule *q);
struct tcphdr;
struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *,
u_int32_t, u_int32_t, int);
-int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
- struct ip_fw_args *args, uint32_t tablearg);
+int ipfw_install_state(struct ip_fw_chain *chain, struct ip_fw *rule,
+ ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg);
ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt,
- int *match_direction, struct tcphdr *tcp);
+ int *match_direction, struct tcphdr *tcp, uint16_t kidx);
void ipfw_remove_dyn_children(struct ip_fw *rule);
void ipfw_get_dynamic(struct ip_fw_chain *chain, char **bp, const char *ep);
+int ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd);
void ipfw_dyn_init(struct ip_fw_chain *); /* per-vnet initialization */
void ipfw_dyn_uninit(int); /* per-vnet deinitialization */
int ipfw_dyn_len(void);
+int ipfw_dyn_get_count(void);
/* common variables */
VNET_DECLARE(int, fw_one_pass);
@@ -203,6 +211,9 @@ VNET_DECLARE(int, fw_verbose);
VNET_DECLARE(struct ip_fw_chain, layer3_chain);
#define V_layer3_chain VNET(layer3_chain)
+VNET_DECLARE(int, ipfw_vnet_ready);
+#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready)
+
VNET_DECLARE(u_int32_t, set_disable);
#define V_set_disable VNET(set_disable)
@@ -212,23 +223,66 @@ VNET_DECLARE(int, autoinc_step);
VNET_DECLARE(unsigned int, fw_tables_max);
#define V_fw_tables_max VNET(fw_tables_max)
+VNET_DECLARE(unsigned int, fw_tables_sets);
+#define V_fw_tables_sets VNET(fw_tables_sets)
+
+struct tables_config;
+
+#ifdef _KERNEL
+/*
+ * Here we have the structure representing an ipfw rule.
+ *
+ * It starts with a general area
+ * followed by an array of one or more instructions, which the code
+ * accesses as an array of 32-bit values.
+ *
+ * Given a rule pointer r:
+ *
+ * r->cmd is the start of the first instruction.
+ * ACTION_PTR(r) is the start of the first action (things to do
+ * once a rule matched).
+ */
+
+struct ip_fw {
+ uint16_t act_ofs; /* offset of action in 32-bit units */
+ uint16_t cmd_len; /* # of 32-bit words in cmd */
+ uint16_t rulenum; /* rule number */
+ uint8_t set; /* rule set (0..31) */
+ uint8_t flags; /* currently unused */
+ counter_u64_t cntr; /* Pointer to rule counters */
+ uint32_t timestamp; /* tv_sec of last match */
+ uint32_t id; /* rule id */
+ uint32_t cached_id; /* used by jump_fast */
+ uint32_t cached_pos; /* used by jump_fast */
+
+ ipfw_insn cmd[1]; /* storage for commands */
+};
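/*
 * A short usage sketch, following the comment above: iterate over a
 * rule's opcode array.  F_LEN() and ACTION_PTR() are existing ipfw
 * macros defined outside this hunk; walk_rule_opcodes() itself is
 * hypothetical.
 */
static void
walk_rule_opcodes(struct ip_fw *r)
{
	ipfw_insn *cmd;
	int l;

	for (l = r->cmd_len, cmd = r->cmd; l > 0;
	    l -= F_LEN(cmd), cmd += F_LEN(cmd)) {
		/* Match opcodes here; actions begin at ACTION_PTR(r). */
	}
}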
+
+#define IPFW_RULE_CNTR_SIZE (2 * sizeof(uint64_t))
+
+#endif
+
struct ip_fw_chain {
struct ip_fw **map; /* array of rule ptrs to ease lookup */
uint32_t id; /* ruleset id */
int n_rules; /* number of static rules */
- LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */
- struct radix_node_head **tables; /* IPv4 tables */
- struct radix_node_head **xtables; /* extended tables */
- uint8_t *tabletype; /* Array of table types */
+ void *tablestate; /* runtime table info */
+ void *valuestate; /* runtime table value info */
+ int *idxmap; /* skipto array of rules */
+ void **srvstate; /* runtime service mappings */
#if defined( __linux__ ) || defined( _WIN32 )
spinlock_t rwmtx;
#else
- struct rwlock rwmtx;
+ struct rmlock rwmtx;
#endif
- int static_len; /* total len of static rules */
+ int static_len; /* total len of static rules (v0) */
uint32_t gencnt; /* NAT generation count */
- struct ip_fw *reap; /* list of rules to reap */
+ LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */
struct ip_fw *default_rule;
+ struct tables_config *tblcfg; /* tables module data */
+ void *ifcfg; /* interface module data */
+ int *idxmap_back; /* standby skipto array of rules */
+ struct namedobj_instance *srvmap; /* cfg name->number mappings */
#if defined( __linux__ ) || defined( _WIN32 )
spinlock_t uh_lock;
#else
@@ -236,13 +290,81 @@ struct ip_fw_chain {
#endif
};
+/* 64-byte structure representing multi-field table value */
+struct table_value {
+ uint32_t tag; /* O_TAG/O_TAGGED */
+ uint32_t pipe; /* O_PIPE/O_QUEUE */
+ uint16_t divert; /* O_DIVERT/O_TEE */
+ uint16_t skipto; /* skipto, CALLRET */
+ uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */
+ uint32_t fib; /* O_SETFIB */
+ uint32_t nat; /* O_NAT */
+ uint32_t nh4;
+ uint8_t dscp;
+ uint8_t spare0;
+ uint16_t spare1;
+ /* -- 32 bytes -- */
+ struct in6_addr nh6;
+ uint32_t limit; /* O_LIMIT */
+ uint32_t zoneid; /* scope zone id for nh6 */
+ uint64_t refcnt; /* Number of references */
+};
+
+
+struct named_object {
+ TAILQ_ENTRY(named_object) nn_next; /* namehash */
+ TAILQ_ENTRY(named_object) nv_next; /* valuehash */
+ char *name; /* object name */
+ uint16_t etlv; /* Export TLV id */
+ uint8_t subtype;/* object subtype within class */
+ uint8_t set; /* set object belongs to */
+ uint16_t kidx; /* object kernel index */
+ uint16_t spare;
+ uint32_t ocnt; /* object counter for internal use */
+ uint32_t refcnt; /* number of references */
+};
+TAILQ_HEAD(namedobjects_head, named_object);
+
struct sockopt; /* used by tcp_var.h */
+struct sockopt_data {
+ caddr_t kbuf; /* allocated buffer */
+ size_t ksize; /* given buffer size */
+ size_t koff; /* data already used */
+ size_t kavail; /* number of bytes available */
+ size_t ktotal; /* total bytes pushed */
+ struct sockopt *sopt; /* socket data */
+ caddr_t sopt_val; /* sopt user buffer */
+ size_t valsize; /* original data size */
+};
+
+struct ipfw_ifc;
+
+typedef void (ipfw_ifc_cb)(struct ip_fw_chain *ch, void *cbdata,
+ uint16_t ifindex);
+
+struct ipfw_iface {
+ struct named_object no;
+ char ifname[64];
+ int resolved;
+ uint16_t ifindex;
+ uint16_t spare;
+ uint64_t gencnt;
+ TAILQ_HEAD(, ipfw_ifc) consumers;
+};
+
+struct ipfw_ifc {
+ TAILQ_ENTRY(ipfw_ifc) next;
+ struct ipfw_iface *iface;
+ ipfw_ifc_cb *cb;
+ void *cbdata;
+};
/* Macro for working with various counters */
#define IPFW_INC_RULE_COUNTER(_cntr, _bytes) do { \
- (_cntr)->pcnt++; \
- (_cntr)->bcnt += _bytes; \
- (_cntr)->timestamp = time_uptime; \
+ counter_u64_add((_cntr)->cntr, 1); \
+ counter_u64_add((_cntr)->cntr + 1, _bytes); \
+ if ((_cntr)->timestamp != time_uptime) \
+ (_cntr)->timestamp = time_uptime; \
} while (0)
#define IPFW_INC_DYN_COUNTER(_cntr, _bytes) do { \
@@ -251,8 +373,8 @@ struct sockopt; /* used by tcp_var.h */
} while (0)
#define IPFW_ZERO_RULE_COUNTER(_cntr) do { \
- (_cntr)->pcnt = 0; \
- (_cntr)->bcnt = 0; \
+ counter_u64_zero((_cntr)->cntr); \
+ counter_u64_zero((_cntr)->cntr + 1); \
(_cntr)->timestamp = 0; \
} while (0)
@@ -261,12 +383,15 @@ struct sockopt; /* used by tcp_var.h */
(_cntr)->bcnt = 0; \
} while (0)
-#define IP_FW_ARG_TABLEARG(a) ((a) == IP_FW_TABLEARG) ? tablearg : (a)
+#define TARG_VAL(ch, k, f) ((struct table_value *)((ch)->valuestate))[k].f
+#define IP_FW_ARG_TABLEARG(ch, a, f) \
+ (((a) == IP_FW_TARG) ? TARG_VAL(ch, tablearg, f) : (a))
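/*
 * Worked expansion sketch (an assumption about how ip_fw2.c consumes
 * this macro; the lookup side is not part of this hunk): for an action
 * such as O_SETFIB,
 *
 *	fib = IP_FW_ARG_TABLEARG(chain, cmd->arg1, fib);
 *
 * resolves to valuestate[tablearg].fib when cmd->arg1 == IP_FW_TARG,
 * and to cmd->arg1 itself otherwise.
 */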
/*
* The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c
* so the variable and the macros must be here.
*/
+#if defined( __linux__ ) || defined( _WIN32 )
#define IPFW_LOCK_INIT(_chain) do { \
rw_init(&(_chain)->rwmtx, "IPFW static rules"); \
rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \
@@ -280,49 +405,354 @@ struct sockopt; /* used by tcp_var.h */
#define IPFW_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_RLOCKED)
#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
-#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
-#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
-#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
-#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
+#define IPFW_RLOCK_TRACKER
+#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
+#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
+#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
+#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
+#define IPFW_PF_RLOCK(p) IPFW_RLOCK(p)
+#define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p)
+#else /* FreeBSD */
+#define IPFW_LOCK_INIT(_chain) do { \
+ rm_init(&(_chain)->rwmtx, "IPFW static rules"); \
+ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \
+ } while (0)
+
+#define IPFW_LOCK_DESTROY(_chain) do { \
+ rm_destroy(&(_chain)->rwmtx); \
+ rw_destroy(&(_chain)->uh_lock); \
+ } while (0)
+
+#define IPFW_RLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_RLOCKED)
+#define IPFW_WLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_WLOCKED)
+
+#define IPFW_RLOCK_TRACKER struct rm_priotracker _tracker
+#define IPFW_RLOCK(p) rm_rlock(&(p)->rwmtx, &_tracker)
+#define IPFW_RUNLOCK(p) rm_runlock(&(p)->rwmtx, &_tracker)
+#define IPFW_WLOCK(p) rm_wlock(&(p)->rwmtx)
+#define IPFW_WUNLOCK(p) rm_wunlock(&(p)->rwmtx)
+#define IPFW_PF_RLOCK(p) IPFW_RLOCK(p)
+#define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p)
+#endif
#define IPFW_UH_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_RLOCKED)
#define IPFW_UH_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_WLOCKED)
+#define IPFW_UH_UNLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_UNLOCKED)
#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock)
#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock)
#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock)
#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock)
+struct obj_idx {
+ uint16_t uidx; /* internal index supplied by userland */
+ uint16_t kidx; /* kernel object index */
+ uint16_t off; /* tlv offset from rule end in 4-byte words */
+ uint8_t spare;
+ uint8_t type; /* object type within its category */
+};
+
+struct rule_check_info {
+ uint16_t flags; /* rule-specific check flags */
+ uint16_t object_opcodes; /* num of opcodes referencing objects */
+ uint16_t urule_numoff; /* offset of rulenum in bytes */
+ uint8_t version; /* rule version */
+ uint8_t spare;
+ ipfw_obj_ctlv *ctlv; /* name TLV container */
+ struct ip_fw *krule; /* resulting rule pointer */
+ caddr_t urule; /* original rule pointer */
+ struct obj_idx obuf[8]; /* table references storage */
+};
+
+/* Legacy interface support */
+/*
+ * FreeBSD 8 export rule format
+ */
+struct ip_fw_rule0 {
+ struct ip_fw *x_next; /* linked list of rules */
+ struct ip_fw *next_rule; /* ptr to next [skipto] rule */
+ /* 'next_rule' is used to pass up 'set_disable' status */
+
+ uint16_t act_ofs; /* offset of action in 32-bit units */
+ uint16_t cmd_len; /* # of 32-bit words in cmd */
+ uint16_t rulenum; /* rule number */
+ uint8_t set; /* rule set (0..31) */
+ uint8_t _pad; /* padding */
+ uint32_t id; /* rule id */
+
+ /* These fields are present in all rules. */
+ uint64_t pcnt; /* Packet counter */
+ uint64_t bcnt; /* Byte counter */
+ uint32_t timestamp; /* tv_sec of last match */
+
+ ipfw_insn cmd[1]; /* storage for commands */
+};
+
+struct ip_fw_bcounter0 {
+ uint64_t pcnt; /* Packet counter */
+ uint64_t bcnt; /* Byte counter */
+ uint32_t timestamp; /* tv_sec of last match */
+};
+
+/* Kernel rule length */
+/*
+ * RULE _K_ SIZE _V_ ->
+ * get kernel size from userland rule version _V_.
+ * RULE _U_ SIZE _V_ ->
+ * get user size version _V_ from kernel rule
+ * RULESIZE _V_ ->
+ * get user size rule length
+ */
+/* FreeBSD8 <> current kernel format */
+#define RULEUSIZE0(r) (sizeof(struct ip_fw_rule0) + (r)->cmd_len * 4 - 4)
+#define RULEKSIZE0(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8)
+/* FreeBSD11 <> current kernel format */
+#define RULEUSIZE1(r) (roundup2(sizeof(struct ip_fw_rule) + \
+ (r)->cmd_len * 4 - 4, 8))
+#define RULEKSIZE1(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8)
+
+/*
+ * Tables/Objects index rewriting code
+ */
+
+/* Default and maximum number of ipfw tables/objects. */
+#define IPFW_TABLES_MAX 65536
+#define IPFW_TABLES_DEFAULT 128
+#define IPFW_OBJECTS_MAX 65536
+#define IPFW_OBJECTS_DEFAULT 1024
+
+#define CHAIN_TO_SRV(ch) ((ch)->srvmap)
+#define SRV_OBJECT(ch, idx) ((ch)->srvstate[(idx)])
+
+struct tid_info {
+ uint32_t set; /* table set */
+ uint16_t uidx; /* table index */
+ uint8_t type; /* table type */
+ uint8_t atype;
+ uint8_t spare;
+ int tlen; /* Total TLV size block */
+ void *tlvs; /* Pointer to first TLV */
+};
+
+/*
+ * Classifier callback. Checks if @cmd opcode contains kernel object reference.
+ * If true, returns its index and type.
+ * Returns 0 if a match is found, 1 otherwise.
+ */
+typedef int (ipfw_obj_rw_cl)(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype);
+/*
+ * Updater callback. Sets kernel object reference index to @puidx
+ */
+typedef void (ipfw_obj_rw_upd)(ipfw_insn *cmd, uint16_t puidx);
+/*
+ * Finder callback. Tries to find named object by name (specified via @ti).
+ * Stores found named object pointer in @pno.
+ * If object was not found, NULL is stored.
+ *
+ * Return 0 if input data was valid.
+ */
+typedef int (ipfw_obj_fname_cb)(struct ip_fw_chain *ch,
+ struct tid_info *ti, struct named_object **pno);
+/*
+ * Another finder callback. Tries to find a named object by kernel index.
+ *
+ * Returns pointer to named object or NULL.
+ */
+typedef struct named_object *(ipfw_obj_fidx_cb)(struct ip_fw_chain *ch,
+ uint16_t kidx);
+/*
+ * Object creator callback. Tries to create object specified by @ti.
+ * Stores newly-allocated object index in @pkidx.
+ *
+ * Returns 0 on success.
+ */
+typedef int (ipfw_obj_create_cb)(struct ip_fw_chain *ch, struct tid_info *ti,
+ uint16_t *pkidx);
+/*
+ * Object destroy callback. Intended to free resources allocated by
+ * create_object callback.
+ */
+typedef void (ipfw_obj_destroy_cb)(struct ip_fw_chain *ch,
+ struct named_object *no);
+/*
+ * Sets handler callback. Handles moving and swapping sets of named objects.
+ * SWAP_ALL moves all named objects from set `set' to `new_set' and vice versa;
+ * TEST_ALL checks that there aren't any named object with conflicting names;
+ * MOVE_ALL moves all named objects from set `set' to `new_set';
+ * COUNT_ONE used to count number of references used by object with kidx `set';
+ * TEST_ONE checks that named object with kidx `set' can be moved to `new_set`;
+ * MOVE_ONE moves named object with kidx `set' to set `new_set'.
+ */
+enum ipfw_sets_cmd {
+ SWAP_ALL = 0, TEST_ALL, MOVE_ALL, COUNT_ONE, TEST_ONE, MOVE_ONE
+};
+typedef int (ipfw_obj_sets_cb)(struct ip_fw_chain *ch,
+ uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd);
+
+
+struct opcode_obj_rewrite {
+ uint32_t opcode; /* Opcode to act upon */
+ uint32_t etlv; /* Relevant export TLV id */
+ ipfw_obj_rw_cl *classifier; /* Check if rewrite is needed */
+ ipfw_obj_rw_upd *update; /* update cmd with new value */
+ ipfw_obj_fname_cb *find_byname; /* Find named object by name */
+ ipfw_obj_fidx_cb *find_bykidx; /* Find named object by kidx */
+ ipfw_obj_create_cb *create_object; /* Create named object */
+ ipfw_obj_destroy_cb *destroy_object;/* Destroy named object */
+ ipfw_obj_sets_cb *manage_sets; /* Swap or move sets */
+};
+
+#define IPFW_ADD_OBJ_REWRITER(f, c) do { \
+ if ((f) != 0) \
+ ipfw_add_obj_rewriter(c, \
+ sizeof(c) / sizeof(c[0])); \
+ } while(0)
+#define IPFW_DEL_OBJ_REWRITER(l, c) do { \
+ if ((l) != 0) \
+ ipfw_del_obj_rewriter(c, \
+ sizeof(c) / sizeof(c[0])); \
+ } while(0)
+
+/* In ip_fw_iface.c */
+int ipfw_iface_init(void);
+void ipfw_iface_destroy(void);
+void vnet_ipfw_iface_destroy(struct ip_fw_chain *ch);
+int ipfw_iface_ref(struct ip_fw_chain *ch, char *name,
+ struct ipfw_ifc *ic);
+void ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic);
+void ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic);
+void ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic);
+
/* In ip_fw_sockopt.c */
+void ipfw_init_skipto_cache(struct ip_fw_chain *chain);
+void ipfw_destroy_skipto_cache(struct ip_fw_chain *chain);
int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
-int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
-int ipfw_ctl(struct sockopt *sopt);
+int ipfw_ctl3(struct sockopt *sopt);
int ipfw_chk(struct ip_fw_args *args);
+void ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head,
+ struct ip_fw *rule);
void ipfw_reap_rules(struct ip_fw *head);
-
-/* In ip_fw_pfil */
-int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
- struct inpcb *inp);
+void ipfw_init_counters(void);
+void ipfw_destroy_counters(void);
+struct ip_fw *ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize);
+int ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt);
+
+typedef int (sopt_handler_f)(struct ip_fw_chain *ch,
+ ip_fw3_opheader *op3, struct sockopt_data *sd);
+struct ipfw_sopt_handler {
+ uint16_t opcode;
+ uint8_t version;
+ uint8_t dir;
+ sopt_handler_f *handler;
+ uint64_t refcnt;
+};
+#define HDIR_SET 0x01 /* Handler is used to set some data */
+#define HDIR_GET 0x02 /* Handler is used to retrieve data */
+#define HDIR_BOTH HDIR_GET|HDIR_SET
+
+void ipfw_init_sopt_handler(void);
+void ipfw_destroy_sopt_handler(void);
+void ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count);
+int ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count);
+caddr_t ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed);
+caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed);
+#define IPFW_ADD_SOPT_HANDLER(f, c) do { \
+ if ((f) != 0) \
+ ipfw_add_sopt_handler(c, \
+ sizeof(c) / sizeof(c[0])); \
+ } while(0)
+#define IPFW_DEL_SOPT_HANDLER(l, c) do { \
+ if ((l) != 0) \
+ ipfw_del_sopt_handler(c, \
+ sizeof(c) / sizeof(c[0])); \
+ } while(0)
+
+struct namedobj_instance;
+typedef int (objhash_cb_t)(struct namedobj_instance *ni, struct named_object *,
+ void *arg);
+typedef uint32_t (objhash_hash_f)(struct namedobj_instance *ni, const void *key,
+ uint32_t kopt);
+typedef int (objhash_cmp_f)(struct named_object *no, const void *key,
+ uint32_t kopt);
+struct namedobj_instance *ipfw_objhash_create(uint32_t items);
+void ipfw_objhash_destroy(struct namedobj_instance *);
+void ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks);
+void ipfw_objhash_bitmap_merge(struct namedobj_instance *ni,
+ void **idx, int *blocks);
+void ipfw_objhash_bitmap_swap(struct namedobj_instance *ni,
+ void **idx, int *blocks);
+void ipfw_objhash_bitmap_free(void *idx, int blocks);
+void ipfw_objhash_set_hashf(struct namedobj_instance *ni, objhash_hash_f *f);
+struct named_object *ipfw_objhash_lookup_name(struct namedobj_instance *ni,
+ uint32_t set, char *name);
+struct named_object *ipfw_objhash_lookup_name_type(struct namedobj_instance *ni,
+ uint32_t set, uint32_t type, const char *name);
+struct named_object *ipfw_objhash_lookup_kidx(struct namedobj_instance *ni,
+ uint16_t idx);
+int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a,
+ struct named_object *b);
+void ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no);
+void ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no);
+uint32_t ipfw_objhash_count(struct namedobj_instance *ni);
+uint32_t ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type);
+int ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f,
+ void *arg);
+int ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f,
+ void *arg, uint16_t type);
+int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx);
+int ipfw_objhash_alloc_idx(void *n, uint16_t *pidx);
+void ipfw_objhash_set_funcs(struct namedobj_instance *ni,
+ objhash_hash_f *hash_f, objhash_cmp_f *cmp_f);
+int ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti,
+ uint32_t etlv, struct named_object **pno);
+void ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv);
+ipfw_obj_ntlv *ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx,
+ uint32_t etlv);
+void ipfw_init_obj_rewriter(void);
+void ipfw_destroy_obj_rewriter(void);
+void ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count);
+int ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count);
+
+int create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd,
+ struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti);
+void update_opcode_kidx(ipfw_insn *cmd, uint16_t idx);
+int classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx);
+void ipfw_init_srv(struct ip_fw_chain *ch);
+void ipfw_destroy_srv(struct ip_fw_chain *ch);
+int ipfw_check_object_name_generic(const char *name);
+int ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type,
+ uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd);
+
+/* In ip_fw_eaction.c */
+typedef int (ipfw_eaction_t)(struct ip_fw_chain *ch, struct ip_fw_args *args,
+ ipfw_insn *cmd, int *done);
+int ipfw_eaction_init(struct ip_fw_chain *ch, int first);
+void ipfw_eaction_uninit(struct ip_fw_chain *ch, int last);
+
+uint16_t ipfw_add_eaction(struct ip_fw_chain *ch, ipfw_eaction_t handler,
+ const char *name);
+int ipfw_del_eaction(struct ip_fw_chain *ch, uint16_t eaction_id);
+int ipfw_run_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args,
+ ipfw_insn *cmd, int *done);
/* In ip_fw_table.c */
-struct radix_node;
+struct table_info;
+
+typedef int (table_lookup_t)(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val);
+
int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
uint32_t *val);
-int ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
- uint32_t *val, int type);
-int ipfw_init_tables(struct ip_fw_chain *ch);
-void ipfw_destroy_tables(struct ip_fw_chain *ch);
-int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl);
-int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
- uint8_t plen, uint8_t mlen, uint8_t type, uint32_t value);
-int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
- uint8_t plen, uint8_t mlen, uint8_t type);
-int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt);
-int ipfw_dump_table_entry(struct radix_node *rn, void *arg);
-int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl);
-int ipfw_count_xtable(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt);
-int ipfw_dump_xtable(struct ip_fw_chain *ch, ipfw_xtable *tbl);
+int ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl,
+ uint16_t plen, void *paddr, uint32_t *val);
+struct named_object *ipfw_objhash_lookup_table_kidx(struct ip_fw_chain *ch,
+ uint16_t kidx);
+int ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint16_t *kidx);
+void ipfw_unref_table(struct ip_fw_chain *ch, uint16_t kidx);
+int ipfw_init_tables(struct ip_fw_chain *ch, int first);
int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables);
+int ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int nsets);
+void ipfw_destroy_tables(struct ip_fw_chain *ch, int last);
/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */
@@ -341,5 +771,22 @@ extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+/* Helper functions for IP checksum adjustment */
+static __inline uint16_t
+cksum_add(uint16_t sum, uint16_t a)
+{
+ uint16_t res;
+
+ res = sum + a;
+ return (res + (res < a));
+}
+
+static __inline uint16_t
+cksum_adjust(uint16_t oldsum, uint16_t old, uint16_t new)
+{
+
+ return (~cksum_add(cksum_add(~oldsum, ~old), new));
+}
+
#endif /* _KERNEL */
#endif /* _IPFW2_PRIVATE_H */
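/*
 * Usage sketch for the checksum helpers added to ip_fw_private.h above:
 * incrementally patch an IP header checksum after rewriting a single
 * 16-bit field, instead of recomputing the sum over the whole header.
 * The helper rewrite_ip_id() is hypothetical.
 */
static void
rewrite_ip_id(struct ip *ip, uint16_t new_id)
{
	uint16_t old_id;

	old_id = ip->ip_id;
	ip->ip_id = new_id;
	ip->ip_sum = cksum_adjust(ip->ip_sum, old_id, new_id);
}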
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_sockopt.c b/freebsd/sys/netpfil/ipfw/ip_fw_sockopt.c
index 95cd8c81..468e4ad4 100644
--- a/freebsd/sys/netpfil/ipfw/ip_fw_sockopt.c
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_sockopt.c
@@ -2,6 +2,8 @@
/*-
* Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ * Copyright (c) 2014 Yandex LLC
+ * Copyright (c) 2014 Alexander V. Chernikov
*
* Supported by: Valeria Paoli
*
@@ -31,8 +33,8 @@
__FBSDID("$FreeBSD$");
/*
- * Sockopt support for ipfw. The routines here implement
- * the upper half of the ipfw code.
+ * Control socket and rule management routines for ipfw.
+ * Control is currently implemented via IP_FW3 setsockopt() code.
*/
#include <rtems/bsd/local/opt_ipfw.h>
@@ -51,30 +53,174 @@ __FBSDID("$FreeBSD$");
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
+#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
+#include <sys/fnv_hash.h>
#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
#include <netinet/in.h>
#include <netinet/ip_var.h> /* hooks */
#include <netinet/ip_fw.h>
#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/ip_fw_table.h>
#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
+static int ipfw_ctl(struct sockopt *sopt);
+static int check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len,
+ struct rule_check_info *ci);
+static int check_ipfw_rule1(struct ip_fw_rule *rule, int size,
+ struct rule_check_info *ci);
+static int check_ipfw_rule0(struct ip_fw_rule0 *rule, int size,
+ struct rule_check_info *ci);
+static int rewrite_rule_uidx(struct ip_fw_chain *chain,
+ struct rule_check_info *ci);
+
+#define NAMEDOBJ_HASH_SIZE 32
+
+struct namedobj_instance {
+ struct namedobjects_head *names;
+ struct namedobjects_head *values;
+ uint32_t nn_size; /* names hash size */
+ uint32_t nv_size; /* number hash size */
+ u_long *idx_mask; /* used items bitmask */
+ uint32_t max_blocks; /* number of "long" blocks in bitmask */
+ uint32_t count; /* number of items */
+ uint16_t free_off[IPFW_MAX_SETS]; /* first possible free offset */
+ objhash_hash_f *hash_f;
+ objhash_cmp_f *cmp_f;
+};
+#define BLOCK_ITEMS (8 * sizeof(u_long)) /* Number of items for ffsl() */
+
+static uint32_t objhash_hash_name(struct namedobj_instance *ni,
+ const void *key, uint32_t kopt);
+static uint32_t objhash_hash_idx(struct namedobj_instance *ni, uint32_t val);
+static int objhash_cmp_name(struct named_object *no, const void *name,
+ uint32_t set);
+
MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
+static int dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd);
+static int add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd);
+static int del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd);
+static int clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd);
+static int move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd);
+static int manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd);
+static int dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd);
+static int dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd);
+
+/* ctl3 handler data */
+struct mtx ctl3_lock;
+#define CTL3_LOCK_INIT() mtx_init(&ctl3_lock, "ctl3_lock", NULL, MTX_DEF)
+#define CTL3_LOCK_DESTROY() mtx_destroy(&ctl3_lock)
+#define CTL3_LOCK() mtx_lock(&ctl3_lock)
+#define CTL3_UNLOCK() mtx_unlock(&ctl3_lock)
+
+static struct ipfw_sopt_handler *ctl3_handlers;
+static size_t ctl3_hsize;
+static uint64_t ctl3_refct, ctl3_gencnt;
+#define CTL3_SMALLBUF 4096 /* small page-size write buffer */
+#define CTL3_LARGEBUF 16 * 1024 * 1024 /* handle large rulesets */
+
+static int ipfw_flush_sopt_data(struct sockopt_data *sd);
+
+static struct ipfw_sopt_handler scodes[] = {
+ { IP_FW_XGET, 0, HDIR_GET, dump_config },
+ { IP_FW_XADD, 0, HDIR_BOTH, add_rules },
+ { IP_FW_XDEL, 0, HDIR_BOTH, del_rules },
+ { IP_FW_XZERO, 0, HDIR_SET, clear_rules },
+ { IP_FW_XRESETLOG, 0, HDIR_SET, clear_rules },
+ { IP_FW_XMOVE, 0, HDIR_SET, move_rules },
+ { IP_FW_SET_SWAP, 0, HDIR_SET, manage_sets },
+ { IP_FW_SET_MOVE, 0, HDIR_SET, manage_sets },
+ { IP_FW_SET_ENABLE, 0, HDIR_SET, manage_sets },
+ { IP_FW_DUMP_SOPTCODES, 0, HDIR_GET, dump_soptcodes },
+ { IP_FW_DUMP_SRVOBJECTS,0, HDIR_GET, dump_srvobjects },
+};
+
+static int
+set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule);
+static struct opcode_obj_rewrite *find_op_rw(ipfw_insn *cmd,
+ uint16_t *puidx, uint8_t *ptype);
+static int mark_object_kidx(struct ip_fw_chain *ch, struct ip_fw *rule,
+ uint32_t *bmask);
+static int ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule,
+ struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti);
+static int ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd,
+ struct tid_info *ti, struct obj_idx *pidx, int *unresolved);
+static void unref_rule_objects(struct ip_fw_chain *chain, struct ip_fw *rule);
+static void unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd,
+ struct obj_idx *oib, struct obj_idx *end);
+static int export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx,
+ struct sockopt_data *sd);
+
+/*
+ * Opcode object rewriter variables
+ */
+struct opcode_obj_rewrite *ctl3_rewriters;
+static size_t ctl3_rsize;
+
/*
- * static variables followed by global ones (none in this file)
+ * static variables followed by global ones
*/
+static VNET_DEFINE(uma_zone_t, ipfw_cntr_zone);
+#define V_ipfw_cntr_zone VNET(ipfw_cntr_zone)
+
+void
+ipfw_init_counters()
+{
+
+ V_ipfw_cntr_zone = uma_zcreate("IPFW counters",
+ IPFW_RULE_CNTR_SIZE, NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, UMA_ZONE_PCPU);
+}
+
+void
+ipfw_destroy_counters()
+{
+
+ uma_zdestroy(V_ipfw_cntr_zone);
+}
+
+struct ip_fw *
+ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize)
+{
+ struct ip_fw *rule;
+
+ rule = malloc(rulesize, M_IPFW, M_WAITOK | M_ZERO);
+ rule->cntr = uma_zalloc(V_ipfw_cntr_zone, M_WAITOK | M_ZERO);
+
+ return (rule);
+}
+
+static void
+free_rule(struct ip_fw *rule)
+{
+
+ uma_zfree(V_ipfw_cntr_zone, rule->cntr);
+ free(rule, M_IPFW);
+}
+
+
/*
* Find the smallest rule >= key, id.
* We could use bsearch but it is so simple that we code it directly
@@ -96,11 +242,109 @@ ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
lo = i + 1; /* continue from the next one */
else /* r->id >= id */
hi = i; /* this might be good */
- };
+ }
return hi;
}
/*
+ * Builds skipto cache on rule set @map.
+ */
+static void
+update_skipto_cache(struct ip_fw_chain *chain, struct ip_fw **map)
+{
+ int *smap, rulenum;
+ int i, mi;
+
+ IPFW_UH_WLOCK_ASSERT(chain);
+
+ mi = 0;
+ rulenum = map[mi]->rulenum;
+ smap = chain->idxmap_back;
+
+ if (smap == NULL)
+ return;
+
+ for (i = 0; i < 65536; i++) {
+ smap[i] = mi;
+ /* Use the same rule index until i < rulenum */
+ if (i != rulenum || i == 65535)
+ continue;
+ /* Find next rule with num > i */
+ rulenum = map[++mi]->rulenum;
+ while (rulenum == i)
+ rulenum = map[++mi]->rulenum;
+ }
+}
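/*
 * Consumer-side sketch (an assumption; the actual lookup lives in
 * ip_fw2.c, not in this hunk): once the cache is built, idxmap[N] is
 * the map index of the first rule whose rulenum is >= N, so a skipto
 * to rule number N resolves in O(1).  skipto_index() is hypothetical.
 */
static __inline int
skipto_index(struct ip_fw_chain *chain, uint16_t rulenum)
{

	return (chain->idxmap[rulenum]);
}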
+
+/*
+ * Swaps prepared (backup) index with current one.
+ */
+static void
+swap_skipto_cache(struct ip_fw_chain *chain)
+{
+ int *map;
+
+ IPFW_UH_WLOCK_ASSERT(chain);
+ IPFW_WLOCK_ASSERT(chain);
+
+ map = chain->idxmap;
+ chain->idxmap = chain->idxmap_back;
+ chain->idxmap_back = map;
+}
+
+/*
+ * Allocate and initialize skipto cache.
+ */
+void
+ipfw_init_skipto_cache(struct ip_fw_chain *chain)
+{
+ int *idxmap, *idxmap_back;
+
+ idxmap = malloc(65536 * sizeof(uint32_t *), M_IPFW,
+ M_WAITOK | M_ZERO);
+ idxmap_back = malloc(65536 * sizeof(uint32_t *), M_IPFW,
+ M_WAITOK | M_ZERO);
+
+ /*
+ * Note we may be called at any time after initialization,
+ * for example, on first skipto rule, so we need to
+ * provide valid chain->idxmap on return
+ */
+
+ IPFW_UH_WLOCK(chain);
+ if (chain->idxmap != NULL) {
+ IPFW_UH_WUNLOCK(chain);
+ free(idxmap, M_IPFW);
+ free(idxmap_back, M_IPFW);
+ return;
+ }
+
+ /* Set backup pointer first to permit building cache */
+ chain->idxmap_back = idxmap_back;
+ update_skipto_cache(chain, chain->map);
+ IPFW_WLOCK(chain);
+ /* It is now safe to set chain->idxmap ptr */
+ chain->idxmap = idxmap;
+ swap_skipto_cache(chain);
+ IPFW_WUNLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
+}
+
+/*
+ * Destroys skipto cache.
+ */
+void
+ipfw_destroy_skipto_cache(struct ip_fw_chain *chain)
+{
+
+ if (chain->idxmap != NULL)
+ free(chain->idxmap, M_IPFW);
+	if (chain->idxmap_back != NULL)
+ free(chain->idxmap_back, M_IPFW);
+}
+
+
+/*
* allocate a new map, returns the chain locked. extra is the number
* of entries to add or delete.
*/
@@ -110,11 +354,12 @@ get_map(struct ip_fw_chain *chain, int extra, int locked)
for (;;) {
struct ip_fw **map;
- int i;
+ int i, mflags;
+
+ mflags = M_ZERO | ((locked != 0) ? M_NOWAIT : M_WAITOK);
i = chain->n_rules + extra;
- map = malloc(i * sizeof(struct ip_fw *), M_IPFW,
- locked ? M_NOWAIT : M_WAITOK);
+ map = malloc(i * sizeof(struct ip_fw *), M_IPFW, mflags);
if (map == NULL) {
printf("%s: cannot allocate map\n", __FUNCTION__);
return NULL;
@@ -143,69 +388,403 @@ swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
chain->n_rules = new_len;
old_map = chain->map;
chain->map = new_map;
+ swap_skipto_cache(chain);
IPFW_WUNLOCK(chain);
return old_map;
}
+
+static void
+export_cntr1_base(struct ip_fw *krule, struct ip_fw_bcounter *cntr)
+{
+ struct timeval boottime;
+
+ cntr->size = sizeof(*cntr);
+
+ if (krule->cntr != NULL) {
+ cntr->pcnt = counter_u64_fetch(krule->cntr);
+ cntr->bcnt = counter_u64_fetch(krule->cntr + 1);
+ cntr->timestamp = krule->timestamp;
+ }
+ if (cntr->timestamp > 0) {
+ getboottime(&boottime);
+ cntr->timestamp += boottime.tv_sec;
+ }
+}
+
+static void
+export_cntr0_base(struct ip_fw *krule, struct ip_fw_bcounter0 *cntr)
+{
+ struct timeval boottime;
+
+ if (krule->cntr != NULL) {
+ cntr->pcnt = counter_u64_fetch(krule->cntr);
+ cntr->bcnt = counter_u64_fetch(krule->cntr + 1);
+ cntr->timestamp = krule->timestamp;
+ }
+ if (cntr->timestamp > 0) {
+ getboottime(&boottime);
+ cntr->timestamp += boottime.tv_sec;
+ }
+}
+
+/*
+ * Copies rule @urule from v1 userland format (current)
+ * to kernel @krule.
+ * Assume @krule is zeroed.
+ */
+static void
+import_rule1(struct rule_check_info *ci)
+{
+ struct ip_fw_rule *urule;
+ struct ip_fw *krule;
+
+ urule = (struct ip_fw_rule *)ci->urule;
+ krule = (struct ip_fw *)ci->krule;
+
+ /* copy header */
+ krule->act_ofs = urule->act_ofs;
+ krule->cmd_len = urule->cmd_len;
+ krule->rulenum = urule->rulenum;
+ krule->set = urule->set;
+ krule->flags = urule->flags;
+
+ /* Save rulenum offset */
+ ci->urule_numoff = offsetof(struct ip_fw_rule, rulenum);
+
+ /* Copy opcodes */
+ memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t));
+}
+
+/*
+ * Export rule into v1 format (Current).
+ * Layout:
+ * [ ipfw_obj_tlv(IPFW_TLV_RULE_ENT)
+ * [ ip_fw_rule ] OR
+ * [ ip_fw_bcounter ip_fw_rule] (depends on rcntrs).
+ * ]
+ * Assume @data is zeroed.
+ */
+static void
+export_rule1(struct ip_fw *krule, caddr_t data, int len, int rcntrs)
+{
+ struct ip_fw_bcounter *cntr;
+ struct ip_fw_rule *urule;
+ ipfw_obj_tlv *tlv;
+
+ /* Fill in TLV header */
+ tlv = (ipfw_obj_tlv *)data;
+ tlv->type = IPFW_TLV_RULE_ENT;
+ tlv->length = len;
+
+ if (rcntrs != 0) {
+ /* Copy counters */
+ cntr = (struct ip_fw_bcounter *)(tlv + 1);
+ urule = (struct ip_fw_rule *)(cntr + 1);
+ export_cntr1_base(krule, cntr);
+ } else
+ urule = (struct ip_fw_rule *)(tlv + 1);
+
+ /* copy header */
+ urule->act_ofs = krule->act_ofs;
+ urule->cmd_len = krule->cmd_len;
+ urule->rulenum = krule->rulenum;
+ urule->set = krule->set;
+ urule->flags = krule->flags;
+ urule->id = krule->id;
+
+ /* Copy opcodes */
+ memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t));
+}
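
For orientation, a consumer of the layout described above walks the buffer as TLV header, optional counter block, then the rule. The sketch below uses simplified stand-in structs and a placeholder type constant, since the real definitions live in ip_fw.h and are not part of this hunk:

#include <stddef.h>
#include <stdint.h>

/* Simplified stand-ins for the structures used by export_rule1() above. */
struct tlv_hdr { uint16_t type; uint16_t flags; uint32_t length; };
struct bcounter { uint16_t size; uint8_t flags; uint8_t spare;
	uint32_t timestamp; uint64_t pcnt; uint64_t bcnt; };

#define TLV_RULE_ENT	1	/* placeholder, not the real constant */

/*
 * Return a pointer to the rule body of one exported entry, or NULL
 * if the entry is not a rule TLV.  @have_counters mirrors "rcntrs".
 */
static const void *
parse_rule_tlv(const uint8_t *buf, int have_counters)
{
	const struct tlv_hdr *tlv = (const struct tlv_hdr *)buf;
	const uint8_t *p = buf + sizeof(*tlv);

	if (tlv->type != TLV_RULE_ENT)
		return (NULL);
	if (have_counters) {
		const struct bcounter *cntr = (const struct bcounter *)p;

		p += cntr->size;	/* counters carry their own size */
	}
	return (p);
}
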
+
+
+/*
+ * Copies rule @urule from FreeBSD8 userland format (v0)
+ * to kernel @krule.
+ * Assume @krule is zeroed.
+ */
+static void
+import_rule0(struct rule_check_info *ci)
+{
+ struct ip_fw_rule0 *urule;
+ struct ip_fw *krule;
+ int cmdlen, l;
+ ipfw_insn *cmd;
+ ipfw_insn_limit *lcmd;
+ ipfw_insn_if *cmdif;
+
+ urule = (struct ip_fw_rule0 *)ci->urule;
+ krule = (struct ip_fw *)ci->krule;
+
+ /* copy header */
+ krule->act_ofs = urule->act_ofs;
+ krule->cmd_len = urule->cmd_len;
+ krule->rulenum = urule->rulenum;
+ krule->set = urule->set;
+ if ((urule->_pad & 1) != 0)
+ krule->flags |= IPFW_RULE_NOOPT;
+
+ /* Save rulenum offset */
+ ci->urule_numoff = offsetof(struct ip_fw_rule0, rulenum);
+
+ /* Copy opcodes */
+ memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t));
+
+ /*
+ * Alter opcodes:
+ * 1) convert tablearg value from 65535 to 0
+ * 2) Add high bit to O_SETFIB/O_SETDSCP values (to make room
+ * for targ).
+ * 3) convert table number in iface opcodes to u16
+ * 4) convert old `nat global` into new 65535
+ */
+ l = krule->cmd_len;
+ cmd = krule->cmd;
+ cmdlen = 0;
+
+ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+
+ switch (cmd->opcode) {
+ /* Opcodes supporting tablearg */
+ case O_TAG:
+ case O_TAGGED:
+ case O_PIPE:
+ case O_QUEUE:
+ case O_DIVERT:
+ case O_TEE:
+ case O_SKIPTO:
+ case O_CALLRETURN:
+ case O_NETGRAPH:
+ case O_NGTEE:
+ case O_NAT:
+ if (cmd->arg1 == IP_FW_TABLEARG)
+ cmd->arg1 = IP_FW_TARG;
+ else if (cmd->arg1 == 0)
+ cmd->arg1 = IP_FW_NAT44_GLOBAL;
+ break;
+ case O_SETFIB:
+ case O_SETDSCP:
+ if (cmd->arg1 == IP_FW_TABLEARG)
+ cmd->arg1 = IP_FW_TARG;
+ else
+ cmd->arg1 |= 0x8000;
+ break;
+ case O_LIMIT:
+ lcmd = (ipfw_insn_limit *)cmd;
+ if (lcmd->conn_limit == IP_FW_TABLEARG)
+ lcmd->conn_limit = IP_FW_TARG;
+ break;
+ /* Interface tables */
+ case O_XMIT:
+ case O_RECV:
+ case O_VIA:
+ /* Interface table, possibly */
+ cmdif = (ipfw_insn_if *)cmd;
+ if (cmdif->name[0] != '\1')
+ break;
+
+ cmdif->p.kidx = (uint16_t)cmdif->p.glob;
+ break;
+ }
+ }
+}
+
+/*
+ * Copies rule @krule from kernel to FreeBSD8 userland format (v0)
+ */
+static void
+export_rule0(struct ip_fw *krule, struct ip_fw_rule0 *urule, int len)
+{
+ int cmdlen, l;
+ ipfw_insn *cmd;
+ ipfw_insn_limit *lcmd;
+ ipfw_insn_if *cmdif;
+
+ /* copy header */
+ memset(urule, 0, len);
+ urule->act_ofs = krule->act_ofs;
+ urule->cmd_len = krule->cmd_len;
+ urule->rulenum = krule->rulenum;
+ urule->set = krule->set;
+ if ((krule->flags & IPFW_RULE_NOOPT) != 0)
+ urule->_pad |= 1;
+
+ /* Copy opcodes */
+ memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t));
+
+ /* Export counters */
+ export_cntr0_base(krule, (struct ip_fw_bcounter0 *)&urule->pcnt);
+
+ /*
+ * Alter opcodes:
+ * 1) convert tablearg value from 0 to 65535
+ * 2) Remove highest bit from O_SETFIB/O_SETDSCP values.
+ * 3) convert table number in iface opcodes to int
+ */
+ l = urule->cmd_len;
+ cmd = urule->cmd;
+ cmdlen = 0;
+
+ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+
+ switch (cmd->opcode) {
+ /* Opcodes supporting tablearg */
+ case O_TAG:
+ case O_TAGGED:
+ case O_PIPE:
+ case O_QUEUE:
+ case O_DIVERT:
+ case O_TEE:
+ case O_SKIPTO:
+ case O_CALLRETURN:
+ case O_NETGRAPH:
+ case O_NGTEE:
+ case O_NAT:
+ if (cmd->arg1 == IP_FW_TARG)
+ cmd->arg1 = IP_FW_TABLEARG;
+ else if (cmd->arg1 == IP_FW_NAT44_GLOBAL)
+ cmd->arg1 = 0;
+ break;
+ case O_SETFIB:
+ case O_SETDSCP:
+ if (cmd->arg1 == IP_FW_TARG)
+ cmd->arg1 = IP_FW_TABLEARG;
+ else
+ cmd->arg1 &= ~0x8000;
+ break;
+ case O_LIMIT:
+ lcmd = (ipfw_insn_limit *)cmd;
+ if (lcmd->conn_limit == IP_FW_TARG)
+ lcmd->conn_limit = IP_FW_TABLEARG;
+ break;
+ /* Interface tables */
+ case O_XMIT:
+ case O_RECV:
+ case O_VIA:
+ /* Interface table, possibly */
+ cmdif = (ipfw_insn_if *)cmd;
+ if (cmdif->name[0] != '\1')
+ break;
+
+ cmdif->p.glob = cmdif->p.kidx;
+ break;
+ }
+ }
+}
+
/*
- * Add a new rule to the list. Copy the rule into a malloc'ed area, then
- * possibly create a rule number and add the rule to the list.
+ * Add new rule(s) to the list, possibly creating a rule number for each.
* Update the rule_number in the input struct so the caller knows it as well.
- * XXX DO NOT USE FOR THE DEFAULT RULE.
* Must be called without IPFW_UH held
*/
-int
-ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
+static int
+commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, int count)
{
- struct ip_fw *rule;
- int i, l, insert_before;
+ int error, i, insert_before, tcount;
+ uint16_t rulenum, *pnum;
+ struct rule_check_info *ci;
+ struct ip_fw *krule;
struct ip_fw **map; /* the new array of pointers */
- if (chain->map == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE - 1)
- return (EINVAL);
+ /* Check if we need to do table/obj index remap */
+ tcount = 0;
+ for (ci = rci, i = 0; i < count; ci++, i++) {
+ if (ci->object_opcodes == 0)
+ continue;
+
+ /*
+ * Rule has some object opcodes.
+ * We need to find (and create non-existing)
+ * kernel objects, and reference existing ones.
+ */
+ error = rewrite_rule_uidx(chain, ci);
+ if (error != 0) {
+
+ /*
+ * rewrite failed, state for current rule
+ * has been reverted. Check if we need to
+ * revert more.
+ */
+ if (tcount > 0) {
+
+ /*
+ * We have some more table rules
+ * we need to rollback.
+ */
+
+ IPFW_UH_WLOCK(chain);
+ while (ci != rci) {
+ ci--;
+ if (ci->object_opcodes == 0)
+ continue;
+					unref_rule_objects(chain, ci->krule);
+				}
+ IPFW_UH_WUNLOCK(chain);
+
+ }
+
+ return (error);
+ }
+
+ tcount++;
+ }
- l = RULESIZE(input_rule);
- rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
- if (rule == NULL)
- return (ENOSPC);
/* get_map returns with IPFW_UH_WLOCK if successful */
- map = get_map(chain, 1, 0 /* not locked */);
+ map = get_map(chain, count, 0 /* not locked */);
if (map == NULL) {
- free(rule, M_IPFW);
- return ENOSPC;
- }
+ if (tcount > 0) {
+ /* Unbind tables */
+ IPFW_UH_WLOCK(chain);
+ for (ci = rci, i = 0; i < count; ci++, i++) {
+ if (ci->object_opcodes == 0)
+ continue;
+
+ unref_rule_objects(chain, ci->krule);
+ }
+ IPFW_UH_WUNLOCK(chain);
+ }
- bcopy(input_rule, rule, l);
- /* clear fields not settable from userland */
- rule->x_next = NULL;
- rule->next_rule = NULL;
- IPFW_ZERO_RULE_COUNTER(rule);
+ return (ENOSPC);
+ }
if (V_autoinc_step < 1)
V_autoinc_step = 1;
else if (V_autoinc_step > 1000)
V_autoinc_step = 1000;
+
+ /* FIXME: Handle count > 1 */
+ ci = rci;
+ krule = ci->krule;
+ rulenum = krule->rulenum;
+
/* find the insertion point, we will insert before */
- insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE;
+ insert_before = rulenum ? rulenum + 1 : IPFW_DEFAULT_RULE;
i = ipfw_find_rule(chain, insert_before, 0);
/* duplicate first part */
if (i > 0)
bcopy(chain->map, map, i * sizeof(struct ip_fw *));
- map[i] = rule;
+ map[i] = krule;
/* duplicate remaining part, we always have the default rule */
bcopy(chain->map + i, map + i + 1,
sizeof(struct ip_fw *) *(chain->n_rules - i));
- if (rule->rulenum == 0) {
- /* write back the number */
- rule->rulenum = i > 0 ? map[i-1]->rulenum : 0;
- if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
- rule->rulenum += V_autoinc_step;
- input_rule->rulenum = rule->rulenum;
+ if (rulenum == 0) {
+ /* Compute rule number and write it back */
+ rulenum = i > 0 ? map[i-1]->rulenum : 0;
+ if (rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
+ rulenum += V_autoinc_step;
+ krule->rulenum = rulenum;
+ /* Save number to userland rule */
+ pnum = (uint16_t *)((caddr_t)ci->urule + ci->urule_numoff);
+ *pnum = rulenum;
}
- rule->id = chain->id + 1;
+ krule->id = chain->id + 1;
+ update_skipto_cache(chain, map);
map = swap_map(chain, map, chain->n_rules + 1);
- chain->static_len += l;
+ chain->static_len += RULEUSIZE0(krule);
IPFW_UH_WUNLOCK(chain);
if (map)
free(map, M_IPFW);
@@ -213,6 +792,23 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
}
/*
+ * Adds @rule to the list of rules to reap
+ */
+void
+ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head,
+ struct ip_fw *rule)
+{
+
+ IPFW_UH_WLOCK_ASSERT(chain);
+
+ /* Unlink rule from everywhere */
+ unref_rule_objects(chain, rule);
+
+ *((struct ip_fw **)rule) = *head;
+ *head = rule;
+}
+
+/*
* Reclaim storage associated with a list of rules. This is
* typically the list created using remove_rule.
* A NULL pointer on input is handled correctly.
@@ -223,22 +819,12 @@ ipfw_reap_rules(struct ip_fw *head)
struct ip_fw *rule;
while ((rule = head) != NULL) {
- head = head->x_next;
- free(rule, M_IPFW);
+ head = *((struct ip_fw **)head);
+ free_rule(rule);
}
}
/*
- * Used by del_entry() to check if a rule should be kept.
- * Returns 1 if the rule must be kept, 0 otherwise.
- *
- * Called with cmd = {0,1,5}.
- * cmd == 0 matches on rule numbers, excludes rules in RESVD_SET if n == 0 ;
- * cmd == 1 matches on set numbers only, rule numbers are ignored;
- * cmd == 5 matches on rule and set numbers.
- *
- * n == 0 is a wildcard for rule numbers, there is no wildcard for sets.
- *
* Rules to keep are
* (default || reserved || !match_set || !match_number)
* where
@@ -255,14 +841,608 @@ ipfw_reap_rules(struct ip_fw *head)
* // number is ignored for cmd == 1 or n == 0
*
*/
+int
+ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt)
+{
+
+ /* Don't match default rule for modification queries */
+ if (rule->rulenum == IPFW_DEFAULT_RULE &&
+ (rt->flags & IPFW_RCFLAG_DEFAULT) == 0)
+ return (0);
+
+ /* Don't match rules in reserved set for flush requests */
+ if ((rt->flags & IPFW_RCFLAG_ALL) != 0 && rule->set == RESVD_SET)
+ return (0);
+
+ /* If we're filtering by set, don't match other sets */
+ if ((rt->flags & IPFW_RCFLAG_SET) != 0 && rule->set != rt->set)
+ return (0);
+
+ if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 &&
+ (rule->rulenum < rt->start_rule || rule->rulenum > rt->end_rule))
+ return (0);
+
+ return (1);
+}
+
+struct manage_sets_args {
+ uint16_t set;
+ uint8_t new_set;
+};
+
+static int
+swap_sets_cb(struct namedobj_instance *ni, struct named_object *no,
+ void *arg)
+{
+ struct manage_sets_args *args;
+
+ args = (struct manage_sets_args *)arg;
+ if (no->set == (uint8_t)args->set)
+ no->set = args->new_set;
+ else if (no->set == args->new_set)
+ no->set = (uint8_t)args->set;
+ return (0);
+}
+
+static int
+move_sets_cb(struct namedobj_instance *ni, struct named_object *no,
+ void *arg)
+{
+ struct manage_sets_args *args;
+
+ args = (struct manage_sets_args *)arg;
+ if (no->set == (uint8_t)args->set)
+ no->set = args->new_set;
+ return (0);
+}
+
+static int
+test_sets_cb(struct namedobj_instance *ni, struct named_object *no,
+ void *arg)
+{
+ struct manage_sets_args *args;
+
+ args = (struct manage_sets_args *)arg;
+ if (no->set != (uint8_t)args->set)
+ return (0);
+ if (ipfw_objhash_lookup_name_type(ni, args->new_set,
+ no->etlv, no->name) != NULL)
+ return (EEXIST);
+ return (0);
+}
+
+/*
+ * Generic function to handle moving and swapping sets.
+ */
+int
+ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type,
+ uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd)
+{
+ struct manage_sets_args args;
+ struct named_object *no;
+
+ args.set = set;
+ args.new_set = new_set;
+ switch (cmd) {
+ case SWAP_ALL:
+ return (ipfw_objhash_foreach_type(ni, swap_sets_cb,
+ &args, type));
+ case TEST_ALL:
+ return (ipfw_objhash_foreach_type(ni, test_sets_cb,
+ &args, type));
+ case MOVE_ALL:
+ return (ipfw_objhash_foreach_type(ni, move_sets_cb,
+ &args, type));
+ case COUNT_ONE:
+ /*
+ * @set used to pass kidx.
+ * When @new_set is zero - reset object counter,
+ * otherwise increment it.
+ */
+ no = ipfw_objhash_lookup_kidx(ni, set);
+ if (new_set != 0)
+ no->ocnt++;
+ else
+ no->ocnt = 0;
+ return (0);
+ case TEST_ONE:
+ /* @set used to pass kidx */
+ no = ipfw_objhash_lookup_kidx(ni, set);
+ /*
+ * First check number of references:
+		 * when it differs, this means other rules are holding
+		 * a reference to the given object, so it is not possible to
+		 * change its set. Note that refcnt may account for references
+		 * from rules that are about to be added. Since we don't know
+		 * their numbers (or even whether they will be added) it is
+		 * perfectly OK to return an error here.
+ */
+ if (no->ocnt != no->refcnt)
+ return (EBUSY);
+ if (ipfw_objhash_lookup_name_type(ni, new_set, type,
+ no->name) != NULL)
+ return (EEXIST);
+ return (0);
+ case MOVE_ONE:
+ /* @set used to pass kidx */
+ no = ipfw_objhash_lookup_kidx(ni, set);
+ no->set = new_set;
+ return (0);
+ }
+ return (EINVAL);
+}
+
+/*
+ * Delete rules matching range @rt.
+ * Saves number of deleted rules in @ndel.
+ *
+ * Returns 0 on success.
+ */
+static int
+delete_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int *ndel)
+{
+ struct ip_fw *reap, *rule, **map;
+ int end, start;
+ int i, n, ndyn, ofs;
+
+ reap = NULL;
+ IPFW_UH_WLOCK(chain); /* arbitrate writers */
+
+ /*
+ * Stage 1: Determine range to inspect.
+ * Range is half-inclusive, e.g [start, end).
+ */
+ start = 0;
+ end = chain->n_rules - 1;
+
+ if ((rt->flags & IPFW_RCFLAG_RANGE) != 0) {
+ start = ipfw_find_rule(chain, rt->start_rule, 0);
+
+ end = ipfw_find_rule(chain, rt->end_rule, 0);
+ if (rt->end_rule != IPFW_DEFAULT_RULE)
+ while (chain->map[end]->rulenum == rt->end_rule)
+ end++;
+ }
+
+ /* Allocate new map of the same size */
+ map = get_map(chain, 0, 1 /* locked */);
+ if (map == NULL) {
+ IPFW_UH_WUNLOCK(chain);
+ return (ENOMEM);
+ }
+
+ n = 0;
+ ndyn = 0;
+ ofs = start;
+ /* 1. bcopy the initial part of the map */
+ if (start > 0)
+ bcopy(chain->map, map, start * sizeof(struct ip_fw *));
+ /* 2. copy active rules between start and end */
+ for (i = start; i < end; i++) {
+ rule = chain->map[i];
+ if (ipfw_match_range(rule, rt) == 0) {
+ map[ofs++] = rule;
+ continue;
+ }
+
+ n++;
+ if (ipfw_is_dyn_rule(rule) != 0)
+ ndyn++;
+ }
+ /* 3. copy the final part of the map */
+ bcopy(chain->map + end, map + ofs,
+ (chain->n_rules - end) * sizeof(struct ip_fw *));
+ /* 4. recalculate skipto cache */
+ update_skipto_cache(chain, map);
+ /* 5. swap the maps (under UH_WLOCK + WHLOCK) */
+ map = swap_map(chain, map, chain->n_rules - n);
+ /* 6. Remove all dynamic states originated by deleted rules */
+ if (ndyn > 0)
+ ipfw_expire_dyn_rules(chain, rt);
+ /* 7. now remove the rules deleted from the old map */
+ for (i = start; i < end; i++) {
+ rule = map[i];
+ if (ipfw_match_range(rule, rt) == 0)
+ continue;
+ chain->static_len -= RULEUSIZE0(rule);
+ ipfw_reap_add(chain, &reap, rule);
+ }
+ IPFW_UH_WUNLOCK(chain);
+
+ ipfw_reap_rules(reap);
+ if (map != NULL)
+ free(map, M_IPFW);
+ *ndel = n;
+ return (0);
+}
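
The delete path above follows a copy, swap, reap pattern: survivors are copied into a fresh map, the map pointer is swapped under the write locks, and the removed rules are freed only after the locks are dropped. A condensed standalone sketch of the same shape, with locking elided and hypothetical names (not part of the patch):

#include <stdlib.h>

struct table { void **items; size_t count; };

/* Keep only entries for which match() returns 0; returns -1 on ENOMEM. */
static int
filter_items(struct table *t, int (*match)(void *))
{
	void **nmap, **omap;
	size_t i, kept = 0;

	nmap = calloc(t->count, sizeof(*nmap));
	if (nmap == NULL)
		return (-1);
	for (i = 0; i < t->count; i++) {
		if (match(t->items[i]))
			continue;		/* drop matching entries */
		nmap[kept++] = t->items[i];	/* copy survivors */
	}
	omap = t->items;
	t->items = nmap;			/* swap (done under locks above) */
	t->count = kept;
	free(omap);				/* reclaim the old map outside the lock */
	return (0);
}
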
+
+static int
+move_objects(struct ip_fw_chain *ch, ipfw_range_tlv *rt)
+{
+ struct opcode_obj_rewrite *rw;
+ struct ip_fw *rule;
+ ipfw_insn *cmd;
+ int cmdlen, i, l, c;
+ uint16_t kidx;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ /* Stage 1: count number of references by given rules */
+ for (c = 0, i = 0; i < ch->n_rules - 1; i++) {
+ rule = ch->map[i];
+ if (ipfw_match_range(rule, rt) == 0)
+ continue;
+ if (rule->set == rt->new_set) /* nothing to do */
+ continue;
+ /* Search opcodes with named objects */
+ for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd;
+ l > 0; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+ rw = find_op_rw(cmd, &kidx, NULL);
+ if (rw == NULL || rw->manage_sets == NULL)
+ continue;
+ /*
+			 * When manage_sets() returns a non-zero value for the
+			 * COUNT_ONE command, treat this as an object that
+			 * doesn't support sets (e.g. disabled with sysctl).
+ * So, skip checks for this object.
+ */
+ if (rw->manage_sets(ch, kidx, 1, COUNT_ONE) != 0)
+ continue;
+ c++;
+ }
+ }
+ if (c == 0) /* No objects found */
+ return (0);
+ /* Stage 2: verify "ownership" */
+ for (c = 0, i = 0; (i < ch->n_rules - 1) && c == 0; i++) {
+ rule = ch->map[i];
+ if (ipfw_match_range(rule, rt) == 0)
+ continue;
+ if (rule->set == rt->new_set) /* nothing to do */
+ continue;
+ /* Search opcodes with named objects */
+ for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd;
+ l > 0 && c == 0; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+ rw = find_op_rw(cmd, &kidx, NULL);
+ if (rw == NULL || rw->manage_sets == NULL)
+ continue;
+ /* Test for ownership and conflicting names */
+ c = rw->manage_sets(ch, kidx,
+ (uint8_t)rt->new_set, TEST_ONE);
+ }
+ }
+ /* Stage 3: change set and cleanup */
+ for (i = 0; i < ch->n_rules - 1; i++) {
+ rule = ch->map[i];
+ if (ipfw_match_range(rule, rt) == 0)
+ continue;
+ if (rule->set == rt->new_set) /* nothing to do */
+ continue;
+ /* Search opcodes with named objects */
+ for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd;
+ l > 0; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+ rw = find_op_rw(cmd, &kidx, NULL);
+ if (rw == NULL || rw->manage_sets == NULL)
+ continue;
+ /* cleanup object counter */
+ rw->manage_sets(ch, kidx,
+ 0 /* reset counter */, COUNT_ONE);
+ if (c != 0)
+ continue;
+ /* change set */
+ rw->manage_sets(ch, kidx,
+ (uint8_t)rt->new_set, MOVE_ONE);
+ }
+ }
+ return (c);
+}
+
+/*
+ * Moves rules matching range @rt to a new set.
+ *
+ * Returns 0 on success.
+ */
+static int
+move_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
+{
+ struct ip_fw *rule;
+ int i;
+
+ IPFW_UH_WLOCK(chain);
+
+ /*
+	 * Move rules with matching parameters to a new set.
+ * This one is much more complex. We have to ensure
+ * that all referenced tables (if any) are referenced
+ * by given rule subset only. Otherwise, we can't move
+ * them to new set and have to return error.
+ */
+ if ((i = move_objects(chain, rt)) != 0) {
+ IPFW_UH_WUNLOCK(chain);
+ return (i);
+ }
+
+ /* XXX: We have to do swap holding WLOCK */
+ for (i = 0; i < chain->n_rules; i++) {
+ rule = chain->map[i];
+ if (ipfw_match_range(rule, rt) == 0)
+ continue;
+ rule->set = rt->new_set;
+ }
+
+ IPFW_UH_WUNLOCK(chain);
+
+ return (0);
+}
+
+/*
+ * Clear counters for a specific rule.
+ * Normally run under IPFW_UH_RLOCK, but these are idempotent ops
+ * so we only care that rules do not disappear.
+ */
+static void
+clear_counters(struct ip_fw *rule, int log_only)
+{
+ ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
+
+ if (log_only == 0)
+ IPFW_ZERO_RULE_COUNTER(rule);
+ if (l->o.opcode == O_LOG)
+ l->log_left = l->max_log;
+}
+
+/*
+ * Flushes rule counters and/or log values on matching range.
+ *
+ * Returns number of items cleared.
+ */
+static int
+clear_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int log_only)
+{
+ struct ip_fw *rule;
+ int num;
+ int i;
+
+ num = 0;
+ rt->flags |= IPFW_RCFLAG_DEFAULT;
+
+ IPFW_UH_WLOCK(chain); /* arbitrate writers */
+ for (i = 0; i < chain->n_rules; i++) {
+ rule = chain->map[i];
+ if (ipfw_match_range(rule, rt) == 0)
+ continue;
+ clear_counters(rule, log_only);
+ num++;
+ }
+ IPFW_UH_WUNLOCK(chain);
+
+ return (num);
+}
+
+static int
+check_range_tlv(ipfw_range_tlv *rt)
+{
+
+ if (rt->head.length != sizeof(*rt))
+ return (1);
+ if (rt->start_rule > rt->end_rule)
+ return (1);
+ if (rt->set >= IPFW_MAX_SETS || rt->new_set >= IPFW_MAX_SETS)
+ return (1);
+
+ if ((rt->flags & IPFW_RCFLAG_USER) != rt->flags)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Delete rules matching specified parameters
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_range_tlv ]
+ * Reply: [ ipfw_obj_header ipfw_range_tlv ]
+ *
+ * Saves number of deleted rules in ipfw_range_tlv->new_set.
+ *
+ * Returns 0 on success.
+ */
+static int
+del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_range_header *rh;
+ int error, ndel;
+
+ if (sd->valsize != sizeof(*rh))
+ return (EINVAL);
+
+ rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize);
+
+ if (check_range_tlv(&rh->range) != 0)
+ return (EINVAL);
+
+ ndel = 0;
+ if ((error = delete_range(chain, &rh->range, &ndel)) != 0)
+ return (error);
+
+ /* Save number of rules deleted */
+ rh->range.new_set = ndel;
+ return (0);
+}
+
+/*
+ * Move rules/sets matching specified parameters
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_range_tlv ]
+ *
+ * Returns 0 on success.
+ */
+static int
+move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_range_header *rh;
+
+ if (sd->valsize != sizeof(*rh))
+ return (EINVAL);
+
+ rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize);
+
+ if (check_range_tlv(&rh->range) != 0)
+ return (EINVAL);
+
+ return (move_range(chain, &rh->range));
+}
+
+/*
+ * Clear rule accounting data matching specified parameters
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_range_tlv ]
+ * Reply: [ ipfw_obj_header ipfw_range_tlv ]
+ *
+ * Saves number of cleared rules in ipfw_range_tlv->new_set.
+ *
+ * Returns 0 on success.
+ */
static int
-keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n)
+clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
{
- return
- (rule->rulenum == IPFW_DEFAULT_RULE) ||
- (cmd == 0 && n == 0 && rule->set == RESVD_SET) ||
- !(cmd == 0 || rule->set == set) ||
- !(cmd == 1 || n == 0 || n == rule->rulenum);
+ ipfw_range_header *rh;
+ int log_only, num;
+ char *msg;
+
+ if (sd->valsize != sizeof(*rh))
+ return (EINVAL);
+
+ rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize);
+
+ if (check_range_tlv(&rh->range) != 0)
+ return (EINVAL);
+
+ log_only = (op3->opcode == IP_FW_XRESETLOG);
+
+ num = clear_range(chain, &rh->range, log_only);
+
+ if (rh->range.flags & IPFW_RCFLAG_ALL)
+ msg = log_only ? "All logging counts reset" :
+ "Accounting cleared";
+ else
+ msg = log_only ? "logging count reset" : "cleared";
+
+ if (V_fw_verbose) {
+ int lev = LOG_SECURITY | LOG_NOTICE;
+ log(lev, "ipfw: %s.\n", msg);
+ }
+
+ /* Save number of rules cleared */
+ rh->range.new_set = num;
+ return (0);
+}
+
+static void
+enable_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
+{
+ uint32_t v_set;
+
+ IPFW_UH_WLOCK_ASSERT(chain);
+
+ /* Change enabled/disabled sets mask */
+ v_set = (V_set_disable | rt->set) & ~rt->new_set;
+ v_set &= ~(1 << RESVD_SET); /* set RESVD_SET always enabled */
+ IPFW_WLOCK(chain);
+ V_set_disable = v_set;
+ IPFW_WUNLOCK(chain);
+}
+
+static int
+swap_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int mv)
+{
+ struct opcode_obj_rewrite *rw;
+ struct ip_fw *rule;
+ int i;
+
+ IPFW_UH_WLOCK_ASSERT(chain);
+
+ if (rt->set == rt->new_set) /* nothing to do */
+ return (0);
+
+ if (mv != 0) {
+ /*
+		 * Before moving the rules we need to check that
+ * there aren't any conflicting named objects.
+ */
+ for (rw = ctl3_rewriters;
+ rw < ctl3_rewriters + ctl3_rsize; rw++) {
+ if (rw->manage_sets == NULL)
+ continue;
+ i = rw->manage_sets(chain, (uint8_t)rt->set,
+ (uint8_t)rt->new_set, TEST_ALL);
+ if (i != 0)
+ return (EEXIST);
+ }
+ }
+ /* Swap or move two sets */
+ for (i = 0; i < chain->n_rules - 1; i++) {
+ rule = chain->map[i];
+ if (rule->set == (uint8_t)rt->set)
+ rule->set = (uint8_t)rt->new_set;
+ else if (rule->set == (uint8_t)rt->new_set && mv == 0)
+ rule->set = (uint8_t)rt->set;
+ }
+ for (rw = ctl3_rewriters; rw < ctl3_rewriters + ctl3_rsize; rw++) {
+ if (rw->manage_sets == NULL)
+ continue;
+ rw->manage_sets(chain, (uint8_t)rt->set,
+ (uint8_t)rt->new_set, mv != 0 ? MOVE_ALL: SWAP_ALL);
+ }
+ return (0);
+}
+
+/*
+ * Swaps or moves set
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_range_tlv ]
+ *
+ * Returns 0 on success.
+ */
+static int
+manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_range_header *rh;
+ int ret;
+
+ if (sd->valsize != sizeof(*rh))
+ return (EINVAL);
+
+ rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize);
+
+ if (rh->range.head.length != sizeof(ipfw_range_tlv))
+ return (1);
+ /* enable_sets() expects bitmasks. */
+ if (op3->opcode != IP_FW_SET_ENABLE &&
+ (rh->range.set >= IPFW_MAX_SETS ||
+ rh->range.new_set >= IPFW_MAX_SETS))
+ return (EINVAL);
+
+ ret = 0;
+ IPFW_UH_WLOCK(chain);
+ switch (op3->opcode) {
+ case IP_FW_SET_SWAP:
+ case IP_FW_SET_MOVE:
+ ret = swap_sets(chain, &rh->range,
+ op3->opcode == IP_FW_SET_MOVE);
+ break;
+ case IP_FW_SET_ENABLE:
+ enable_sets(chain, &rh->range);
+ break;
+ }
+ IPFW_UH_WUNLOCK(chain);
+
+ return (ret);
}
/**
@@ -282,12 +1462,11 @@ keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n)
static int
del_entry(struct ip_fw_chain *chain, uint32_t arg)
{
- struct ip_fw *rule;
uint32_t num; /* rule number or old_set */
uint8_t cmd, new_set;
- int start, end, i, ofs, n;
- struct ip_fw **map = NULL;
+ int do_del, ndel;
int error = 0;
+ ipfw_range_tlv rt;
num = arg & 0xffff;
cmd = (arg >> 24) & 0xff;
@@ -303,149 +1482,60 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
return EINVAL;
}
- IPFW_UH_WLOCK(chain); /* arbitrate writers */
- chain->reap = NULL; /* prepare for deletions */
+ /* Convert old requests into new representation */
+ memset(&rt, 0, sizeof(rt));
+ rt.start_rule = num;
+ rt.end_rule = num;
+ rt.set = num;
+ rt.new_set = new_set;
+ do_del = 0;
switch (cmd) {
- case 0: /* delete rules "num" (num == 0 matches all) */
- case 1: /* delete all rules in set N */
- case 5: /* delete rules with number N and set "new_set". */
-
- /*
- * Locate first rule to delete (start), the rule after
- * the last one to delete (end), and count how many
- * rules to delete (n). Always use keep_rule() to
- * determine which rules to keep.
- */
- n = 0;
- if (cmd == 1) {
- /* look for a specific set including RESVD_SET.
- * Must scan the entire range, ignore num.
- */
- new_set = num;
- for (start = -1, end = i = 0; i < chain->n_rules; i++) {
- if (keep_rule(chain->map[i], cmd, new_set, 0))
- continue;
- if (start < 0)
- start = i;
- end = i;
- n++;
- }
- end++; /* first non-matching */
- } else {
- /* Optimized search on rule numbers */
- start = ipfw_find_rule(chain, num, 0);
- for (end = start; end < chain->n_rules; end++) {
- rule = chain->map[end];
- if (num > 0 && rule->rulenum != num)
- break;
- if (!keep_rule(rule, cmd, new_set, num))
- n++;
- }
- }
-
- if (n == 0) {
- /* A flush request (arg == 0 or cmd == 1) on empty
- * ruleset returns with no error. On the contrary,
- * if there is no match on a specific request,
- * we return EINVAL.
- */
- if (arg != 0 && cmd != 1)
- error = EINVAL;
- break;
- }
-
- /* We have something to delete. Allocate the new map */
- map = get_map(chain, -n, 1 /* locked */);
- if (map == NULL) {
- error = EINVAL;
- break;
- }
-
- /* 1. bcopy the initial part of the map */
- if (start > 0)
- bcopy(chain->map, map, start * sizeof(struct ip_fw *));
- /* 2. copy active rules between start and end */
- for (i = ofs = start; i < end; i++) {
- rule = chain->map[i];
- if (keep_rule(rule, cmd, new_set, num))
- map[ofs++] = rule;
- }
- /* 3. copy the final part of the map */
- bcopy(chain->map + end, map + ofs,
- (chain->n_rules - end) * sizeof(struct ip_fw *));
- /* 4. swap the maps (under BH_LOCK) */
- map = swap_map(chain, map, chain->n_rules - n);
- /* 5. now remove the rules deleted from the old map */
- if (cmd == 1)
- ipfw_expire_dyn_rules(chain, NULL, new_set);
- for (i = start; i < end; i++) {
- rule = map[i];
- if (keep_rule(rule, cmd, new_set, num))
- continue;
- chain->static_len -= RULESIZE(rule);
- if (cmd != 1)
- ipfw_expire_dyn_rules(chain, rule, RESVD_SET);
- rule->x_next = chain->reap;
- chain->reap = rule;
- }
+ case 0: /* delete rules numbered "rulenum" */
+ if (num == 0)
+ rt.flags |= IPFW_RCFLAG_ALL;
+ else
+ rt.flags |= IPFW_RCFLAG_RANGE;
+ do_del = 1;
break;
-
- /*
- * In the next 3 cases the loop stops at (n_rules - 1)
- * because the default rule is never eligible..
- */
-
- case 2: /* move rules with given RULE number to new set */
- for (i = 0; i < chain->n_rules - 1; i++) {
- rule = chain->map[i];
- if (rule->rulenum == num)
- rule->set = new_set;
- }
+ case 1: /* delete rules in set "rulenum" */
+ rt.flags |= IPFW_RCFLAG_SET;
+ do_del = 1;
break;
-
- case 3: /* move rules with given SET number to new set */
- for (i = 0; i < chain->n_rules - 1; i++) {
- rule = chain->map[i];
- if (rule->set == num)
- rule->set = new_set;
- }
+ case 5: /* delete rules "rulenum" and set "new_set" */
+ rt.flags |= IPFW_RCFLAG_RANGE | IPFW_RCFLAG_SET;
+ rt.set = new_set;
+ rt.new_set = 0;
+ do_del = 1;
break;
-
- case 4: /* swap two sets */
- for (i = 0; i < chain->n_rules - 1; i++) {
- rule = chain->map[i];
- if (rule->set == num)
- rule->set = new_set;
- else if (rule->set == new_set)
- rule->set = num;
- }
+ case 2: /* move rules "rulenum" to set "new_set" */
+ rt.flags |= IPFW_RCFLAG_RANGE;
break;
+ case 3: /* move rules from set "rulenum" to set "new_set" */
+ IPFW_UH_WLOCK(chain);
+ error = swap_sets(chain, &rt, 1);
+ IPFW_UH_WUNLOCK(chain);
+ return (error);
+ case 4: /* swap sets "rulenum" and "new_set" */
+ IPFW_UH_WLOCK(chain);
+ error = swap_sets(chain, &rt, 0);
+ IPFW_UH_WUNLOCK(chain);
+ return (error);
+ default:
+ return (ENOTSUP);
}
- rule = chain->reap;
- chain->reap = NULL;
- IPFW_UH_WUNLOCK(chain);
- ipfw_reap_rules(rule);
- if (map)
- free(map, M_IPFW);
- return error;
-}
+ if (do_del != 0) {
+ if ((error = delete_range(chain, &rt, &ndel)) != 0)
+ return (error);
-/*
- * Clear counters for a specific rule.
- * Normally run under IPFW_UH_RLOCK, but these are idempotent ops
- * so we only care that rules do not disappear.
- */
-static void
-clear_counters(struct ip_fw *rule, int log_only)
-{
- ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
+ if (ndel == 0 && (cmd != 1 && num != 0))
+ return (EINVAL);
- if (log_only == 0)
- IPFW_ZERO_RULE_COUNTER(rule);
- if (l->o.opcode == O_LOG)
- l->log_left = l->max_log;
+ return (0);
+ }
+
+ return (move_range(chain, &rt));
}
/**
@@ -516,23 +1606,57 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
return (0);
}
+
/*
- * Check validity of the structure before insert.
- * Rules are simple, so this mostly need to check rule sizes.
+ * Check rule head in FreeBSD11 format
+ *
*/
static int
-check_ipfw_struct(struct ip_fw *rule, int size)
+check_ipfw_rule1(struct ip_fw_rule *rule, int size,
+ struct rule_check_info *ci)
{
- int l, cmdlen = 0;
- int have_action=0;
- ipfw_insn *cmd;
+ int l;
+
+ if (size < sizeof(*rule)) {
+ printf("ipfw: rule too short\n");
+ return (EINVAL);
+ }
+
+ /* Check for valid cmd_len */
+ l = roundup2(RULESIZE(rule), sizeof(uint64_t));
+ if (l != size) {
+ printf("ipfw: size mismatch (have %d want %d)\n", size, l);
+ return (EINVAL);
+ }
+ if (rule->act_ofs >= rule->cmd_len) {
+ printf("ipfw: bogus action offset (%u > %u)\n",
+ rule->act_ofs, rule->cmd_len - 1);
+ return (EINVAL);
+ }
+
+ if (rule->rulenum > IPFW_DEFAULT_RULE - 1)
+ return (EINVAL);
+
+ return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci));
+}
+
+/*
+ * Check rule head in FreeBSD8 format
+ *
+ */
+static int
+check_ipfw_rule0(struct ip_fw_rule0 *rule, int size,
+ struct rule_check_info *ci)
+{
+ int l;
if (size < sizeof(*rule)) {
printf("ipfw: rule too short\n");
return (EINVAL);
}
- /* first, check for valid size */
- l = RULESIZE(rule);
+
+ /* Check for valid cmd_len */
+ l = sizeof(*rule) + rule->cmd_len * 4 - 4;
if (l != size) {
printf("ipfw: size mismatch (have %d want %d)\n", size, l);
return (EINVAL);
@@ -542,12 +1666,26 @@ check_ipfw_struct(struct ip_fw *rule, int size)
rule->act_ofs, rule->cmd_len - 1);
return (EINVAL);
}
+
+ if (rule->rulenum > IPFW_DEFAULT_RULE - 1)
+ return (EINVAL);
+
+ return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci));
+}
+
+static int
+check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci)
+{
+ int cmdlen, l;
+ int have_action;
+
+ have_action = 0;
+
/*
* Now go for the individual checks. Very simple ones, basically only
* instruction sizes.
*/
- for (l = rule->cmd_len, cmd = rule->cmd ;
- l > 0 ; l -= cmdlen, cmd += cmdlen) {
+ for (l = cmd_len; l > 0 ; l -= cmdlen, cmd += cmdlen) {
cmdlen = F_LEN(cmd);
if (cmdlen > l) {
printf("ipfw: opcode %d size truncated\n",
@@ -557,6 +1695,10 @@ check_ipfw_struct(struct ip_fw *rule, int size)
switch (cmd->opcode) {
case O_PROBE_STATE:
case O_KEEP_STATE:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ ci->object_opcodes++;
+ break;
case O_PROTO:
case O_IP_SRC_ME:
case O_IP_DST_ME:
@@ -588,6 +1730,35 @@ check_ipfw_struct(struct ip_fw *rule, int size)
goto bad_size;
break;
+ case O_EXTERNAL_ACTION:
+ if (cmd->arg1 == 0 ||
+ cmdlen != F_INSN_SIZE(ipfw_insn)) {
+ printf("ipfw: invalid external "
+ "action opcode\n");
+ return (EINVAL);
+ }
+ ci->object_opcodes++;
+ /* Do we have O_EXTERNAL_INSTANCE opcode? */
+ if (l != cmdlen) {
+ l -= cmdlen;
+ cmd += cmdlen;
+ cmdlen = F_LEN(cmd);
+ if (cmd->opcode != O_EXTERNAL_INSTANCE) {
+ printf("ipfw: invalid opcode "
+ "next to external action %u\n",
+ cmd->opcode);
+ return (EINVAL);
+ }
+ if (cmd->arg1 == 0 ||
+ cmdlen != F_INSN_SIZE(ipfw_insn)) {
+ printf("ipfw: invalid external "
+ "action instance opcode\n");
+ return (EINVAL);
+ }
+ ci->object_opcodes++;
+ }
+ goto check_action;
+
case O_FIB:
if (cmdlen != F_INSN_SIZE(ipfw_insn))
goto bad_size;
@@ -601,10 +1772,10 @@ check_ipfw_struct(struct ip_fw *rule, int size)
case O_SETFIB:
if (cmdlen != F_INSN_SIZE(ipfw_insn))
goto bad_size;
- if ((cmd->arg1 != IP_FW_TABLEARG) &&
- (cmd->arg1 >= rt_numfibs)) {
+ if ((cmd->arg1 != IP_FW_TARG) &&
+ ((cmd->arg1 & 0x7FFF) >= rt_numfibs)) {
printf("ipfw: invalid fib number %d\n",
- cmd->arg1);
+ cmd->arg1 & 0x7FFF);
return EINVAL;
}
goto check_action;
@@ -625,6 +1796,7 @@ check_ipfw_struct(struct ip_fw *rule, int size)
case O_LIMIT:
if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
goto bad_size;
+ ci->object_opcodes++;
break;
case O_LOG:
@@ -639,7 +1811,7 @@ check_ipfw_struct(struct ip_fw *rule, int size)
case O_IP_SRC_MASK:
case O_IP_DST_MASK:
/* only odd command lengths */
- if ( !(cmdlen & 1) || cmdlen > 31)
+ if ((cmdlen & 1) == 0)
goto bad_size;
break;
@@ -666,6 +1838,18 @@ check_ipfw_struct(struct ip_fw *rule, int size)
cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 &&
cmdlen != F_INSN_SIZE(ipfw_insn_u32))
goto bad_size;
+ ci->object_opcodes++;
+ break;
+ case O_IP_FLOW_LOOKUP:
+ if (cmd->arg1 >= V_fw_tables_max) {
+ printf("ipfw: invalid table number %d\n",
+ cmd->arg1);
+ return (EINVAL);
+ }
+ if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
+ cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+ goto bad_size;
+ ci->object_opcodes++;
break;
case O_MACADDR2:
if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
@@ -700,6 +1884,7 @@ check_ipfw_struct(struct ip_fw *rule, int size)
case O_VIA:
if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
goto bad_size;
+ ci->object_opcodes++;
break;
case O_ALTQ:
@@ -742,8 +1927,10 @@ check_ipfw_struct(struct ip_fw *rule, int size)
if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
goto bad_size;
goto check_action;
- case O_FORWARD_MAC: /* XXX not implemented yet */
case O_CHECK_STATE:
+ ci->object_opcodes++;
+ /* FALLTHROUGH */
+ case O_FORWARD_MAC: /* XXX not implemented yet */
case O_COUNT:
case O_ACCEPT:
case O_DENY:
@@ -763,14 +1950,14 @@ check_action:
printf("ipfw: opcode %d, multiple actions"
" not allowed\n",
cmd->opcode);
- return EINVAL;
+ return (EINVAL);
}
have_action = 1;
if (l != cmdlen) {
printf("ipfw: opcode %d, action must be"
" last opcode\n",
cmd->opcode);
- return EINVAL;
+ return (EINVAL);
}
break;
#ifdef INET6
@@ -813,25 +2000,25 @@ check_action:
case O_IP6_DST_MASK:
case O_ICMP6TYPE:
printf("ipfw: no IPv6 support in kernel\n");
- return EPROTONOSUPPORT;
+ return (EPROTONOSUPPORT);
#endif
default:
printf("ipfw: opcode %d, unknown opcode\n",
cmd->opcode);
- return EINVAL;
+ return (EINVAL);
}
}
}
if (have_action == 0) {
printf("ipfw: missing action\n");
- return EINVAL;
+ return (EINVAL);
}
return 0;
bad_size:
printf("ipfw: opcode %d size %d wrong\n",
cmd->opcode, cmdlen);
- return EINVAL;
+ return (EINVAL);
}
@@ -863,8 +2050,8 @@ struct ip_fw7 {
ipfw_insn cmd[1]; /* storage for commands */
};
- int convert_rule_to_7(struct ip_fw *rule);
-int convert_rule_to_8(struct ip_fw *rule);
+static int convert_rule_to_7(struct ip_fw_rule0 *rule);
+static int convert_rule_to_8(struct ip_fw_rule0 *rule);
#ifndef RULESIZE7
#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \
@@ -882,10 +2069,15 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
{
char *bp = buf;
char *ep = bp + space;
- struct ip_fw *rule, *dst;
- int l, i;
+ struct ip_fw *rule;
+ struct ip_fw_rule0 *dst;
+ struct timeval boottime;
+ int error, i, l, warnflag;
time_t boot_seconds;
+ warnflag = 0;
+
+ getboottime(&boottime);
boot_seconds = boottime.tv_sec;
for (i = 0; i < chain->n_rules; i++) {
rule = chain->map[i];
@@ -894,9 +2086,12 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
/* Convert rule to FreeBSd 7.2 format */
l = RULESIZE7(rule);
if (bp + l + sizeof(uint32_t) <= ep) {
- int error;
bcopy(rule, bp, l + sizeof(uint32_t));
- error = convert_rule_to_7((struct ip_fw *) bp);
+ error = set_legacy_obj_kidx(chain,
+ (struct ip_fw_rule0 *)bp);
+ if (error != 0)
+ return (0);
+ error = convert_rule_to_7((struct ip_fw_rule0 *) bp);
if (error)
return 0; /*XXX correct? */
/*
@@ -914,76 +2109,1631 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
continue; /* go to next rule */
}
- /* normal mode, don't touch rules */
- l = RULESIZE(rule);
+ l = RULEUSIZE0(rule);
if (bp + l > ep) { /* should not happen */
printf("overflow dumping static rules\n");
break;
}
- dst = (struct ip_fw *)bp;
- bcopy(rule, dst, l);
+ dst = (struct ip_fw_rule0 *)bp;
+ export_rule0(rule, dst, l);
+ error = set_legacy_obj_kidx(chain, dst);
+
/*
* XXX HACK. Store the disable mask in the "next"
* pointer in a wild attempt to keep the ABI the same.
* Why do we do this on EVERY rule?
+ *
+ * XXX: "ipfw set show" (ab)uses IP_FW_GET to read disabled mask
+ * so we need to fail _after_ saving at least one mask.
*/
bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
if (dst->timestamp)
dst->timestamp += boot_seconds;
bp += l;
+
+ if (error != 0) {
+ if (error == 2) {
+ /* Non-fatal table rewrite error. */
+ warnflag = 1;
+ continue;
+ }
+ printf("Stop on rule %d. Fail to convert table\n",
+ rule->rulenum);
+ break;
+ }
}
+ if (warnflag != 0)
+ printf("ipfw: process %s is using legacy interfaces,"
+ " consider rebuilding\n", "");
ipfw_get_dynamic(chain, &bp, ep); /* protected by the dynamic lock */
return (bp - (char *)buf);
}
-#define IP_FW3_OPLENGTH(x) ((x)->sopt_valsize - sizeof(ip_fw3_opheader))
-/**
- * {set|get}sockopt parser.
+struct dump_args {
+ uint32_t b; /* start rule */
+ uint32_t e; /* end rule */
+ uint32_t rcount; /* number of rules */
+ uint32_t rsize; /* rules size */
+ uint32_t tcount; /* number of tables */
+ int rcounters; /* counters */
+};
+
+void
+ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv)
+{
+
+ ntlv->head.type = no->etlv;
+ ntlv->head.length = sizeof(*ntlv);
+ ntlv->idx = no->kidx;
+ strlcpy(ntlv->name, no->name, sizeof(ntlv->name));
+}
+
+/*
+ * Export named object info in instance @ni, identified by @kidx
+ * to ipfw_obj_ntlv. TLV is allocated from @sd space.
+ *
+ * Returns 0 on success.
+ */
+static int
+export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx,
+ struct sockopt_data *sd)
+{
+ struct named_object *no;
+ ipfw_obj_ntlv *ntlv;
+
+ no = ipfw_objhash_lookup_kidx(ni, kidx);
+ KASSERT(no != NULL, ("invalid object kernel index passed"));
+
+ ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv));
+ if (ntlv == NULL)
+ return (ENOMEM);
+
+ ipfw_export_obj_ntlv(no, ntlv);
+ return (0);
+}
+
+/*
+ * Dumps static rules with table TLVs in buffer @sd.
+ *
+ * Returns 0 on success.
+ */
+static int
+dump_static_rules(struct ip_fw_chain *chain, struct dump_args *da,
+ uint32_t *bmask, struct sockopt_data *sd)
+{
+ int error;
+ int i, l;
+ uint32_t tcount;
+ ipfw_obj_ctlv *ctlv;
+ struct ip_fw *krule;
+ struct namedobj_instance *ni;
+ caddr_t dst;
+
+ /* Dump table names first (if any) */
+ if (da->tcount > 0) {
+ /* Header first */
+ ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv));
+ if (ctlv == NULL)
+ return (ENOMEM);
+ ctlv->head.type = IPFW_TLV_TBLNAME_LIST;
+ ctlv->head.length = da->tcount * sizeof(ipfw_obj_ntlv) +
+ sizeof(*ctlv);
+ ctlv->count = da->tcount;
+ ctlv->objsize = sizeof(ipfw_obj_ntlv);
+ }
+
+ i = 0;
+ tcount = da->tcount;
+ ni = ipfw_get_table_objhash(chain);
+ while (tcount > 0) {
+ if ((bmask[i / 32] & (1 << (i % 32))) == 0) {
+ i++;
+ continue;
+ }
+
+ /* Jump to shared named object bitmask */
+ if (i >= IPFW_TABLES_MAX) {
+ ni = CHAIN_TO_SRV(chain);
+ i -= IPFW_TABLES_MAX;
+ bmask += IPFW_TABLES_MAX / 32;
+ }
+
+ if ((error = export_objhash_ntlv(ni, i, sd)) != 0)
+ return (error);
+
+ i++;
+ tcount--;
+ }
+
+ /* Dump rules */
+ ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv));
+ if (ctlv == NULL)
+ return (ENOMEM);
+ ctlv->head.type = IPFW_TLV_RULE_LIST;
+ ctlv->head.length = da->rsize + sizeof(*ctlv);
+ ctlv->count = da->rcount;
+
+ for (i = da->b; i < da->e; i++) {
+ krule = chain->map[i];
+
+ l = RULEUSIZE1(krule) + sizeof(ipfw_obj_tlv);
+ if (da->rcounters != 0)
+ l += sizeof(struct ip_fw_bcounter);
+ dst = (caddr_t)ipfw_get_sopt_space(sd, l);
+ if (dst == NULL)
+ return (ENOMEM);
+
+ export_rule1(krule, dst, l, da->rcounters);
+ }
+
+ return (0);
+}
+
+/*
+ * Marks every object index used in @rule with a bit in @bmask.
+ * Used to generate bitmask of referenced tables/objects for given ruleset
+ * or its part.
+ *
+ * Returns number of newly-referenced objects.
+ */
+static int
+mark_object_kidx(struct ip_fw_chain *ch, struct ip_fw *rule,
+ uint32_t *bmask)
+{
+ struct opcode_obj_rewrite *rw;
+ ipfw_insn *cmd;
+ int bidx, cmdlen, l, count;
+ uint16_t kidx;
+ uint8_t subtype;
+
+ l = rule->cmd_len;
+ cmd = rule->cmd;
+ cmdlen = 0;
+ count = 0;
+ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+
+ rw = find_op_rw(cmd, &kidx, &subtype);
+ if (rw == NULL)
+ continue;
+
+ bidx = kidx / 32;
+ /*
+ * Maintain separate bitmasks for table and
+ * non-table objects.
+ */
+ if (rw->etlv != IPFW_TLV_TBL_NAME)
+ bidx += IPFW_TABLES_MAX / 32;
+
+ if ((bmask[bidx] & (1 << (kidx % 32))) == 0)
+ count++;
+
+ bmask[bidx] |= 1 << (kidx % 32);
+ }
+
+ return (count);
+}
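
The bookkeeping above is a plain bitmap with 32 indices per word (tables in the first half of the mask, other named objects in the second). A minimal sketch of the two helpers it open-codes (hypothetical names, not part of the patch):

#include <stdint.h>

static inline void
bitmask_set(uint32_t *bmask, unsigned int idx)
{
	bmask[idx / 32] |= 1u << (idx % 32);
}

static inline int
bitmask_test(const uint32_t *bmask, unsigned int idx)
{
	return ((bmask[idx / 32] & (1u << (idx % 32))) != 0);
}
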
+
+/*
+ * Dumps requested objects data
+ * Data layout (version 0)(current):
+ * Request: [ ipfw_cfg_lheader ] + IPFW_CFG_GET_* flags
+ * size = ipfw_cfg_lheader.size
+ * Reply: [ ipfw_cfg_lheader
+ * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional)
+ * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST)
+ * ipfw_obj_tlv(IPFW_TLV_RULE_ENT) [ ip_fw_bcounter (optional) ip_fw_rule ]
+ * ] (optional)
+ * [ ipfw_obj_ctlv(IPFW_TLV_STATE_LIST) ipfw_obj_dyntlv x N ] (optional)
+ * ]
+ * * NOTE IPFW_TLV_STATE_LIST has a single valid field: objsize.
+ * The rest (size, count) are set to zero and need to be ignored.
+ *
+ * Returns 0 on success.
*/
+static int
+dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_cfg_lheader *hdr;
+ struct ip_fw *rule;
+ size_t sz, rnum;
+ uint32_t hdr_flags;
+ int error, i;
+ struct dump_args da;
+ uint32_t *bmask;
+
+ hdr = (ipfw_cfg_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr));
+ if (hdr == NULL)
+ return (EINVAL);
+
+ error = 0;
+ bmask = NULL;
+	/* Allocate needed state. Note the mask is 2x size (tables + srv) */
+ if (hdr->flags & IPFW_CFG_GET_STATIC)
+ bmask = malloc(IPFW_TABLES_MAX / 4, M_TEMP, M_WAITOK | M_ZERO);
+
+ IPFW_UH_RLOCK(chain);
+
+ /*
+ * STAGE 1: Determine size/count for objects in range.
+ * Prepare used tables bitmask.
+ */
+ sz = sizeof(ipfw_cfg_lheader);
+ memset(&da, 0, sizeof(da));
+
+ da.b = 0;
+ da.e = chain->n_rules;
+
+ if (hdr->end_rule != 0) {
+ /* Handle custom range */
+ if ((rnum = hdr->start_rule) > IPFW_DEFAULT_RULE)
+ rnum = IPFW_DEFAULT_RULE;
+ da.b = ipfw_find_rule(chain, rnum, 0);
+ rnum = hdr->end_rule;
+ rnum = (rnum < IPFW_DEFAULT_RULE) ? rnum+1 : IPFW_DEFAULT_RULE;
+ da.e = ipfw_find_rule(chain, rnum, 0) + 1;
+ }
+
+ if (hdr->flags & IPFW_CFG_GET_STATIC) {
+ for (i = da.b; i < da.e; i++) {
+ rule = chain->map[i];
+ da.rsize += RULEUSIZE1(rule) + sizeof(ipfw_obj_tlv);
+ da.rcount++;
+ /* Update bitmask of used objects for given range */
+ da.tcount += mark_object_kidx(chain, rule, bmask);
+ }
+ /* Add counters if requested */
+ if (hdr->flags & IPFW_CFG_GET_COUNTERS) {
+ da.rsize += sizeof(struct ip_fw_bcounter) * da.rcount;
+ da.rcounters = 1;
+ }
+
+ if (da.tcount > 0)
+ sz += da.tcount * sizeof(ipfw_obj_ntlv) +
+ sizeof(ipfw_obj_ctlv);
+ sz += da.rsize + sizeof(ipfw_obj_ctlv);
+ }
+
+ if (hdr->flags & IPFW_CFG_GET_STATES)
+ sz += ipfw_dyn_get_count() * sizeof(ipfw_obj_dyntlv) +
+ sizeof(ipfw_obj_ctlv);
+
+
+ /*
+ * Fill header anyway.
+ * Note we have to save header fields to stable storage
+	 * Note we have to save header fields to stable storage
+	 * since the buffer inside @sd can be flushed after dumping rules.
+ hdr->size = sz;
+ hdr->set_mask = ~V_set_disable;
+ hdr_flags = hdr->flags;
+ hdr = NULL;
+
+ if (sd->valsize < sz) {
+ error = ENOMEM;
+ goto cleanup;
+ }
+
+ /* STAGE2: Store actual data */
+ if (hdr_flags & IPFW_CFG_GET_STATIC) {
+ error = dump_static_rules(chain, &da, bmask, sd);
+ if (error != 0)
+ goto cleanup;
+ }
+
+ if (hdr_flags & IPFW_CFG_GET_STATES)
+ error = ipfw_dump_states(chain, sd);
+
+cleanup:
+ IPFW_UH_RUNLOCK(chain);
+
+ if (bmask != NULL)
+ free(bmask, M_TEMP);
+
+ return (error);
+}
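
dump_config() above is a two-pass export: under the read lock it first sizes the reply (STAGE 1), then fills it only if the caller supplied enough space (STAGE 2). The bare shape of that pattern, with a hypothetical writer interface (not the real sockopt API):

#include <stddef.h>

struct writer { char *buf; size_t len; size_t off; };

/*
 * Two-pass export: size_cb() computes the total reply size,
 * fill_cb() emits the data only when the buffer is big enough.
 */
static int
export_two_pass(struct writer *w, size_t (*size_cb)(void *),
    int (*fill_cb)(struct writer *, void *), void *arg)
{
	size_t need;

	need = size_cb(arg);		/* pass 1: compute size */
	if (need > w->len)
		return (-1);		/* caller retries with a larger buffer */
	return (fill_cb(w, arg));	/* pass 2: store actual data */
}
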
+
int
-ipfw_ctl(struct sockopt *sopt)
+ipfw_check_object_name_generic(const char *name)
+{
+ int nsize;
+
+ nsize = sizeof(((ipfw_obj_ntlv *)0)->name);
+ if (strnlen(name, nsize) == nsize)
+ return (EINVAL);
+ if (name[0] == '\0')
+ return (EINVAL);
+ return (0);
+}
+
+/*
+ * Creates non-existent objects referenced by rule.
+ *
+ * Return 0 on success.
+ */
+int
+create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd,
+ struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti)
+{
+ struct opcode_obj_rewrite *rw;
+ struct obj_idx *p;
+ uint16_t kidx;
+ int error;
+
+ /*
+ * Compatibility stuff: do actual creation for non-existing,
+ * but referenced objects.
+ */
+ for (p = oib; p < pidx; p++) {
+ if (p->kidx != 0)
+ continue;
+
+ ti->uidx = p->uidx;
+ ti->type = p->type;
+ ti->atype = 0;
+
+ rw = find_op_rw(cmd + p->off, NULL, NULL);
+ KASSERT(rw != NULL, ("Unable to find handler for op %d",
+ (cmd + p->off)->opcode));
+
+ if (rw->create_object == NULL)
+ error = EOPNOTSUPP;
+ else
+ error = rw->create_object(ch, ti, &kidx);
+ if (error == 0) {
+ p->kidx = kidx;
+ continue;
+ }
+
+ /*
+ * Error happened. We have to rollback everything.
+ * Drop all already acquired references.
+ */
+ IPFW_UH_WLOCK(ch);
+ unref_oib_objects(ch, cmd, oib, pidx);
+ IPFW_UH_WUNLOCK(ch);
+
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * Compatibility function for old ipfw(8) binaries.
+ * Rewrites table/nat kernel indices with userland ones.
+ * Convert tables matching '/^\d+$/' to their atoi() value.
+ * Use number 65535 for other tables.
+ *
+ * Returns 0 on success.
+ */
+static int
+set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule)
+{
+ struct opcode_obj_rewrite *rw;
+ struct named_object *no;
+ ipfw_insn *cmd;
+ char *end;
+ long val;
+ int cmdlen, error, l;
+ uint16_t kidx, uidx;
+ uint8_t subtype;
+
+ error = 0;
+
+ l = rule->cmd_len;
+ cmd = rule->cmd;
+ cmdlen = 0;
+ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+
+		/* Check if there is an index in the given opcode */
+ rw = find_op_rw(cmd, &kidx, &subtype);
+ if (rw == NULL)
+ continue;
+
+ /* Try to find referenced kernel object */
+ no = rw->find_bykidx(ch, kidx);
+ if (no == NULL)
+ continue;
+
+ val = strtol(no->name, &end, 10);
+ if (*end == '\0' && val < 65535) {
+ uidx = val;
+ } else {
+
+ /*
+ * We are called via legacy opcode.
+ * Save error and show table as fake number
+			 * Save the error and show the table as a fake number
+			 * so as not to make ipfw(8) hang.
+ uidx = 65535;
+ error = 2;
+ }
+
+ rw->update(cmd, uidx);
+ }
+
+ return (error);
+}
+
+
+/*
+ * Unreferences all already-referenced objects in given @cmd rule,
+ * using information in @oib.
+ *
+ * Used to rollback partially converted rule on error.
+ */
+static void
+unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib,
+ struct obj_idx *end)
+{
+ struct opcode_obj_rewrite *rw;
+ struct named_object *no;
+ struct obj_idx *p;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ for (p = oib; p < end; p++) {
+ if (p->kidx == 0)
+ continue;
+
+ rw = find_op_rw(cmd + p->off, NULL, NULL);
+ KASSERT(rw != NULL, ("Unable to find handler for op %d",
+ (cmd + p->off)->opcode));
+
+ /* Find & unref by existing idx */
+ no = rw->find_bykidx(ch, p->kidx);
+ KASSERT(no != NULL, ("Ref'd object %d disappeared", p->kidx));
+ no->refcnt--;
+ }
+}
+
+/*
+ * Remove references from every object used in @rule.
+ * Used at rule removal code.
+ */
+static void
+unref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule)
+{
+ struct opcode_obj_rewrite *rw;
+ struct named_object *no;
+ ipfw_insn *cmd;
+ int cmdlen, l;
+ uint16_t kidx;
+ uint8_t subtype;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ l = rule->cmd_len;
+ cmd = rule->cmd;
+ cmdlen = 0;
+ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+
+ rw = find_op_rw(cmd, &kidx, &subtype);
+ if (rw == NULL)
+ continue;
+ no = rw->find_bykidx(ch, kidx);
+
+ KASSERT(no != NULL, ("table id %d not found", kidx));
+ KASSERT(no->subtype == subtype,
+ ("wrong type %d (%d) for table id %d",
+ no->subtype, subtype, kidx));
+ KASSERT(no->refcnt > 0, ("refcount for table %d is %d",
+ kidx, no->refcnt));
+
+ if (no->refcnt == 1 && rw->destroy_object != NULL)
+ rw->destroy_object(ch, no);
+ else
+ no->refcnt--;
+ }
+}
+
+
+/*
+ * Find and reference object (if any) stored in instruction @cmd.
+ *
+ * Saves object info in @pidx, sets
+ * - @unresolved to 1 if the object should exist but was not found
+ *
+ * Returns non-zero value in case of error.
+ */
+static int
+ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, struct tid_info *ti,
+ struct obj_idx *pidx, int *unresolved)
+{
+ struct named_object *no;
+ struct opcode_obj_rewrite *rw;
+ int error;
+
+ /* Check if this opcode is candidate for rewrite */
+ rw = find_op_rw(cmd, &ti->uidx, &ti->type);
+ if (rw == NULL)
+ return (0);
+
+ /* Need to rewrite. Save necessary fields */
+ pidx->uidx = ti->uidx;
+ pidx->type = ti->type;
+
+ /* Try to find referenced kernel object */
+ error = rw->find_byname(ch, ti, &no);
+ if (error != 0)
+ return (error);
+ if (no == NULL) {
+ /*
+		 * Report the unresolved object for automatic
+		 * creation.
+ */
+ *unresolved = 1;
+ return (0);
+ }
+
+ /* Found. Bump refcount and update kidx. */
+ no->refcnt++;
+ rw->update(cmd, no->kidx);
+ return (0);
+}
+
+/*
+ * Finds and bumps refcount for objects referenced by given @rule.
+ * Auto-creates non-existing tables.
+ * Fills in @oib array with userland/kernel indexes.
+ *
+ * Returns 0 on success.
+ */
+static int
+ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule,
+ struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti)
+{
+ struct obj_idx *pidx;
+ ipfw_insn *cmd;
+ int cmdlen, error, l, unresolved;
+
+ pidx = oib;
+ l = rule->cmd_len;
+ cmd = rule->cmd;
+ cmdlen = 0;
+ error = 0;
+
+ IPFW_UH_WLOCK(ch);
+
+ /* Increase refcount on each existing referenced table. */
+ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+ unresolved = 0;
+
+ error = ref_opcode_object(ch, cmd, ti, pidx, &unresolved);
+ if (error != 0)
+ break;
+ /*
+ * Compatibility stuff for old clients:
+		 * prepare to automatically create non-existing objects.
+ */
+ if (unresolved != 0) {
+ pidx->off = rule->cmd_len - l;
+ pidx++;
+ }
+ }
+
+ if (error != 0) {
+ /* Unref everything we have already done */
+ unref_oib_objects(ch, rule->cmd, oib, pidx);
+ IPFW_UH_WUNLOCK(ch);
+ return (error);
+ }
+ IPFW_UH_WUNLOCK(ch);
+
+ /* Perform auto-creation for non-existing objects */
+ if (pidx != oib)
+ error = create_objects_compat(ch, rule->cmd, oib, pidx, ti);
+
+ /* Calculate real number of dynamic objects */
+ ci->object_opcodes = (uint16_t)(pidx - oib);
+
+ return (error);
+}
+
+/*
+ * Checks if the opcode references a table of the appropriate type.
+ * Adds a reference to the table if one is found.
+ * Rewrites user-supplied opcode values with kernel ones.
+ *
+ * Returns 0 on success and appropriate error code otherwise.
+ */
+static int
+rewrite_rule_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci)
+{
+ int error;
+ ipfw_insn *cmd;
+ uint8_t type;
+ struct obj_idx *p, *pidx_first, *pidx_last;
+ struct tid_info ti;
+
+ /*
+ * Prepare an array for storing opcode indices.
+ * Use stack allocation by default.
+ */
+ if (ci->object_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) {
+ /* Stack */
+ pidx_first = ci->obuf;
+ } else
+ pidx_first = malloc(
+ ci->object_opcodes * sizeof(struct obj_idx),
+ M_IPFW, M_WAITOK | M_ZERO);
+
+ error = 0;
+ type = 0;
+ memset(&ti, 0, sizeof(ti));
+
+ /* Use set rule is assigned to. */
+ ti.set = ci->krule->set;
+ if (ci->ctlv != NULL) {
+ ti.tlvs = (void *)(ci->ctlv + 1);
+ ti.tlen = ci->ctlv->head.length - sizeof(ipfw_obj_ctlv);
+ }
+
+ /* Reference all used tables and other objects */
+ error = ref_rule_objects(chain, ci->krule, ci, pidx_first, &ti);
+ if (error != 0)
+ goto free;
+ /*
+ * Note that ref_rule_objects() might have updated ci->object_opcodes
+ * to reflect actual number of object opcodes.
+ */
+
+ /* Perform rewrite of remaining opcodes */
+ p = pidx_first;
+ pidx_last = pidx_first + ci->object_opcodes;
+ for (p = pidx_first; p < pidx_last; p++) {
+ cmd = ci->krule->cmd + p->off;
+ update_opcode_kidx(cmd, p->kidx);
+ }
+
+free:
+ if (pidx_first != ci->obuf)
+ free(pidx_first, M_IPFW);
+
+ return (error);
+}
+
+/*
+ * Adds one or more rules to ipfw @chain.
+ * Data layout (version 0)(current):
+ * Request:
+ * [
+ * ip_fw3_opheader
+ * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1)
+ * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] (*2) (*3)
+ * ]
+ * Reply:
+ * [
+ * ip_fw3_opheader
+ * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional)
+ * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ]
+ * ]
+ *
+ * Rules in reply are modified to store their actual ruleset number.
+ *
+ * (*1) TLVs inside IPFW_TLV_TBL_LIST need to be sorted in ascending order
+ *   by their idx field and must contain no duplicates.
+ * (*2) Numbered rules inside IPFW_TLV_RULE_LIST need to be sorted in
+ *   ascending order.
+ * (*3) Each ip_fw structure needs to be aligned to a u64 boundary.
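+ *
+ * For illustration, a minimal request adding a single rule with no named
+ * objects would therefore be laid out as:
+ *   [ ip_fw3_opheader | ipfw_obj_ctlv(IPFW_TLV_RULE_LIST, count=1) |
+ *     ip_fw (padded to a u64 boundary) ]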
+ *
+ * Returns 0 on success.
+ */
+static int
+add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_ctlv *ctlv, *rtlv, *tstate;
+ ipfw_obj_ntlv *ntlv;
+ int clen, error, idx;
+ uint32_t count, read;
+ struct ip_fw_rule *r;
+ struct rule_check_info rci, *ci, *cbuf;
+ int i, rsize;
+
+ op3 = (ip_fw3_opheader *)ipfw_get_sopt_space(sd, sd->valsize);
+ ctlv = (ipfw_obj_ctlv *)(op3 + 1);
+
+ read = sizeof(ip_fw3_opheader);
+ rtlv = NULL;
+ tstate = NULL;
+ cbuf = NULL;
+ memset(&rci, 0, sizeof(struct rule_check_info));
+
+ if (read + sizeof(*ctlv) > sd->valsize)
+ return (EINVAL);
+
+ if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) {
+ clen = ctlv->head.length;
+ /* Check size and alignment */
+ if (clen > sd->valsize || clen < sizeof(*ctlv))
+ return (EINVAL);
+ if ((clen % sizeof(uint64_t)) != 0)
+ return (EINVAL);
+
+ /*
+ * Some table names or other named objects.
+ * Check them for validity.
+ */
+ count = (ctlv->head.length - sizeof(*ctlv)) / sizeof(*ntlv);
+ if (ctlv->count != count || ctlv->objsize != sizeof(*ntlv))
+ return (EINVAL);
+
+ /*
+ * Check each TLV.
+ * Ensure TLVs are sorted ascending and
+ * there are no duplicates.
+ */
+ idx = -1;
+ ntlv = (ipfw_obj_ntlv *)(ctlv + 1);
+ while (count > 0) {
+ if (ntlv->head.length != sizeof(ipfw_obj_ntlv))
+ return (EINVAL);
+
+ error = ipfw_check_object_name_generic(ntlv->name);
+ if (error != 0)
+ return (error);
+
+ if (ntlv->idx <= idx)
+ return (EINVAL);
+
+ idx = ntlv->idx;
+ count--;
+ ntlv++;
+ }
+
+ tstate = ctlv;
+ read += ctlv->head.length;
+ ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length);
+ }
+
+ if (read + sizeof(*ctlv) > sd->valsize)
+ return (EINVAL);
+
+ if (ctlv->head.type == IPFW_TLV_RULE_LIST) {
+ clen = ctlv->head.length;
+ if (clen + read > sd->valsize || clen < sizeof(*ctlv))
+ return (EINVAL);
+ if ((clen % sizeof(uint64_t)) != 0)
+ return (EINVAL);
+
+ /*
+ * TODO: Permit adding multiple rules at once
+ */
+ if (ctlv->count != 1)
+ return (ENOTSUP);
+
+ clen -= sizeof(*ctlv);
+
+ if (ctlv->count > clen / sizeof(struct ip_fw_rule))
+ return (EINVAL);
+
+ /* Allocate state for each rule or use stack */
+ if (ctlv->count == 1) {
+ memset(&rci, 0, sizeof(struct rule_check_info));
+ cbuf = &rci;
+ } else
+ cbuf = malloc(ctlv->count * sizeof(*ci), M_TEMP,
+ M_WAITOK | M_ZERO);
+ ci = cbuf;
+
+ /*
+ * Check each rule for validity.
+ * Ensure numbered rules are sorted in ascending order
+ * and properly aligned.
+ */
+ idx = 0;
+ r = (struct ip_fw_rule *)(ctlv + 1);
+ count = 0;
+ error = 0;
+ while (clen > 0) {
+ rsize = roundup2(RULESIZE(r), sizeof(uint64_t));
+ if (rsize > clen || ctlv->count <= count) {
+ error = EINVAL;
+ break;
+ }
+
+ ci->ctlv = tstate;
+ error = check_ipfw_rule1(r, rsize, ci);
+ if (error != 0)
+ break;
+
+ /* Check sorting */
+ if (r->rulenum != 0 && r->rulenum < idx) {
+ printf("rulenum %d idx %d\n", r->rulenum, idx);
+ error = EINVAL;
+ break;
+ }
+ idx = r->rulenum;
+
+ ci->urule = (caddr_t)r;
+
+ rsize = roundup2(rsize, sizeof(uint64_t));
+ clen -= rsize;
+ r = (struct ip_fw_rule *)((caddr_t)r + rsize);
+ count++;
+ ci++;
+ }
+
+ if (ctlv->count != count || error != 0) {
+ if (cbuf != &rci)
+ free(cbuf, M_TEMP);
+ return (EINVAL);
+ }
+
+ rtlv = ctlv;
+ read += ctlv->head.length;
+ ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length);
+ }
+
+ if (read != sd->valsize || rtlv == NULL || rtlv->count == 0) {
+ if (cbuf != NULL && cbuf != &rci)
+ free(cbuf, M_TEMP);
+ return (EINVAL);
+ }
+
+ /*
+ * The passed rules seem to be valid.
+ * Allocate storage and try to add them to the chain.
+ */
+ for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) {
+ clen = RULEKSIZE1((struct ip_fw_rule *)ci->urule);
+ ci->krule = ipfw_alloc_rule(chain, clen);
+ import_rule1(ci);
+ }
+
+ if ((error = commit_rules(chain, cbuf, rtlv->count)) != 0) {
+ /* Free allocated krules */
+ for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++)
+ free_rule(ci->krule);
+ }
+
+ if (cbuf != NULL && cbuf != &rci)
+ free(cbuf, M_TEMP);
+
+ return (error);
+}
+
+/*
+ * Lists all sopts currently registered.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
+ * Reply: [ ipfw_obj_lheader ipfw_sopt_info x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ struct _ipfw_obj_lheader *olh;
+ ipfw_sopt_info *i;
+ struct ipfw_sopt_handler *sh;
+ uint32_t count, n, size;
+
+ olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
+ if (olh == NULL)
+ return (EINVAL);
+ if (sd->valsize < olh->size)
+ return (EINVAL);
+
+ CTL3_LOCK();
+ count = ctl3_hsize;
+ size = count * sizeof(ipfw_sopt_info) + sizeof(ipfw_obj_lheader);
+
+ /* Fill in header regardless of buffer size */
+ olh->count = count;
+ olh->objsize = sizeof(ipfw_sopt_info);
+
+ if (size > olh->size) {
+ olh->size = size;
+ CTL3_UNLOCK();
+ return (ENOMEM);
+ }
+ olh->size = size;
+
+ for (n = 1; n <= count; n++) {
+ i = (ipfw_sopt_info *)ipfw_get_sopt_space(sd, sizeof(*i));
+ KASSERT(i != NULL, ("previously checked buffer is not enough"));
+ sh = &ctl3_handlers[n];
+ i->opcode = sh->opcode;
+ i->version = sh->version;
+ i->refcnt = sh->refcnt;
+ }
+ CTL3_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Compares two opcodes.
+ * Used both in qsort() and bsearch().
+ *
+ * Returns 0 if the opcodes match, -1 or 1 otherwise to establish ordering.
+ */
+static int
+compare_opcodes(const void *_a, const void *_b)
+{
+ const struct opcode_obj_rewrite *a, *b;
+
+ a = (const struct opcode_obj_rewrite *)_a;
+ b = (const struct opcode_obj_rewrite *)_b;
+
+ if (a->opcode < b->opcode)
+ return (-1);
+ else if (a->opcode > b->opcode)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Finds the range [@plo, @phi] of object rewriters registered for
+ * opcode @op; several rewriters may share an opcode.
+ *
+ * XXX: Rewrite bsearch()
+ */
+static int
+find_op_rw_range(uint16_t op, struct opcode_obj_rewrite **plo,
+ struct opcode_obj_rewrite **phi)
+{
+ struct opcode_obj_rewrite *ctl3_max, *lo, *hi, h, *rw;
+
+ memset(&h, 0, sizeof(h));
+ h.opcode = op;
+
+ rw = (struct opcode_obj_rewrite *)bsearch(&h, ctl3_rewriters,
+ ctl3_rsize, sizeof(h), compare_opcodes);
+ if (rw == NULL)
+ return (1);
+
+ /* Find the first element matching the same opcode */
+ lo = rw;
+ for ( ; lo > ctl3_rewriters && (lo - 1)->opcode == op; lo--)
+ ;
+
+ /* Find the last element matching the same opcode */
+ hi = rw;
+ ctl3_max = ctl3_rewriters + ctl3_rsize;
+ for ( ; (hi + 1) < ctl3_max && (hi + 1)->opcode == op; hi++)
+ ;
+
+ *plo = lo;
+ *phi = hi;
+
+ return (0);
+}
+
+/*
+ * Finds opcode object rewriter based on @code.
+ *
+ * Returns pointer to handler or NULL.
+ */
+static struct opcode_obj_rewrite *
+find_op_rw(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
+{
+ struct opcode_obj_rewrite *rw, *lo, *hi;
+ uint16_t uidx;
+ uint8_t subtype;
+
+ if (find_op_rw_range(cmd->opcode, &lo, &hi) != 0)
+ return (NULL);
+
+ for (rw = lo; rw <= hi; rw++) {
+ if (rw->classifier(cmd, &uidx, &subtype) == 0) {
+ if (puidx != NULL)
+ *puidx = uidx;
+ if (ptype != NULL)
+ *ptype = subtype;
+ return (rw);
+ }
+ }
+
+ return (NULL);
+}
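+
+/*
+ * Checks if @cmd references a named object and, if so, stores its
+ * userland index in @puidx.
+ *
+ * Returns 0 if a matching rewriter was found, 1 otherwise.
+ */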
+int
+classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx)
+{
+
+ if (find_op_rw(cmd, puidx, NULL) == 0)
+ return (1);
+ return (0);
+}
+
+void
+update_opcode_kidx(ipfw_insn *cmd, uint16_t idx)
+{
+ struct opcode_obj_rewrite *rw;
+
+ rw = find_op_rw(cmd, NULL, NULL);
+ KASSERT(rw != NULL, ("No handler to update opcode %d", cmd->opcode));
+ rw->update(cmd, idx);
+}
+
+void
+ipfw_init_obj_rewriter()
+{
+
+ ctl3_rewriters = NULL;
+ ctl3_rsize = 0;
+}
+
+void
+ipfw_destroy_obj_rewriter()
+{
+
+ if (ctl3_rewriters != NULL)
+ free(ctl3_rewriters, M_IPFW);
+ ctl3_rewriters = NULL;
+ ctl3_rsize = 0;
+}
+
+/*
+ * Adds one or more opcode object rewrite handlers to the global array.
+ * Function may sleep.
+ */
+void
+ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count)
+{
+ size_t sz;
+ struct opcode_obj_rewrite *tmp;
+
+ CTL3_LOCK();
+
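+ /*
+ * malloc() may sleep, so drop the lock while allocating and re-check
+ * the required size afterwards; retry if a concurrent registration
+ * grew the array in the meantime.
+ */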
+ for (;;) {
+ sz = ctl3_rsize + count;
+ CTL3_UNLOCK();
+ tmp = malloc(sizeof(*rw) * sz, M_IPFW, M_WAITOK | M_ZERO);
+ CTL3_LOCK();
+ if (ctl3_rsize + count <= sz)
+ break;
+
+ /* Retry */
+ free(tmp, M_IPFW);
+ }
+
+ /* Merge old & new arrays */
+ sz = ctl3_rsize + count;
+ memcpy(tmp, ctl3_rewriters, ctl3_rsize * sizeof(*rw));
+ memcpy(&tmp[ctl3_rsize], rw, count * sizeof(*rw));
+ qsort(tmp, sz, sizeof(*rw), compare_opcodes);
+ /* Switch new and free old */
+ if (ctl3_rewriters != NULL)
+ free(ctl3_rewriters, M_IPFW);
+ ctl3_rewriters = tmp;
+ ctl3_rsize = sz;
+
+ CTL3_UNLOCK();
+}
+
+/*
+ * Removes one or more object rewrite handlers from the global array.
+ */
+int
+ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count)
+{
+ size_t sz;
+ struct opcode_obj_rewrite *ctl3_max, *ktmp, *lo, *hi;
+ int i;
+
+ CTL3_LOCK();
+
+ for (i = 0; i < count; i++) {
+ if (find_op_rw_range(rw[i].opcode, &lo, &hi) != 0)
+ continue;
+
+ for (ktmp = lo; ktmp <= hi; ktmp++) {
+ if (ktmp->classifier != rw[i].classifier)
+ continue;
+
+ ctl3_max = ctl3_rewriters + ctl3_rsize;
+ sz = (ctl3_max - (ktmp + 1)) * sizeof(*ktmp);
+ memmove(ktmp, ktmp + 1, sz);
+ ctl3_rsize--;
+ break;
+ }
+
+ }
+
+ if (ctl3_rsize == 0) {
+ if (ctl3_rewriters != NULL)
+ free(ctl3_rewriters, M_IPFW);
+ ctl3_rewriters = NULL;
+ }
+
+ CTL3_UNLOCK();
+
+ return (0);
+}
+
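+/*
+ * ipfw_objhash_foreach() callback: exports one named object as an
+ * ipfw_obj_ntlv into the sockopt buffer passed via @arg.
+ */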
+static int
+export_objhash_ntlv_internal(struct namedobj_instance *ni,
+ struct named_object *no, void *arg)
+{
+ struct sockopt_data *sd;
+ ipfw_obj_ntlv *ntlv;
+
+ sd = (struct sockopt_data *)arg;
+ ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv));
+ if (ntlv == NULL)
+ return (ENOMEM);
+ ipfw_export_obj_ntlv(no, ntlv);
+ return (0);
+}
+
+/*
+ * Lists all service objects.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ] size = ipfw_obj_lheader.size
+ * Reply: [ ipfw_obj_lheader [ ipfw_obj_ntlv x N ] (optional) ]
+ * Returns 0 on success
+ */
+static int
+dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_lheader *hdr;
+ int count;
+
+ hdr = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr));
+ if (hdr == NULL)
+ return (EINVAL);
+
+ IPFW_UH_RLOCK(chain);
+ count = ipfw_objhash_count(CHAIN_TO_SRV(chain));
+ hdr->size = sizeof(ipfw_obj_lheader) + count * sizeof(ipfw_obj_ntlv);
+ if (sd->valsize < hdr->size) {
+ IPFW_UH_RUNLOCK(chain);
+ return (ENOMEM);
+ }
+ hdr->count = count;
+ hdr->objsize = sizeof(ipfw_obj_ntlv);
+ if (count > 0)
+ ipfw_objhash_foreach(CHAIN_TO_SRV(chain),
+ export_objhash_ntlv_internal, sd);
+ IPFW_UH_RUNLOCK(chain);
+ return (0);
+}
+
+/*
+ * Compares two sopt handlers (code, version and handler ptr).
+ * Used both in qsort() and bsearch().
+ * The handler pointer is not compared when the search key's handler is
+ * NULL (the bsearch() lookup case).
+ *
+ * Returns 0 if a match is found, -1 or 1 otherwise to establish ordering.
+ */
+static int
+compare_sh(const void *_a, const void *_b)
+{
+ const struct ipfw_sopt_handler *a, *b;
+
+ a = (const struct ipfw_sopt_handler *)_a;
+ b = (const struct ipfw_sopt_handler *)_b;
+
+ if (a->opcode < b->opcode)
+ return (-1);
+ else if (a->opcode > b->opcode)
+ return (1);
+
+ if (a->version < b->version)
+ return (-1);
+ else if (a->version > b->version)
+ return (1);
+
+ /* bsearch helper */
+ if (a->handler == NULL)
+ return (0);
+
+ if ((uintptr_t)a->handler < (uintptr_t)b->handler)
+ return (-1);
+ else if ((uintptr_t)a->handler > (uintptr_t)b->handler)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Finds sopt handler based on @code and @version.
+ *
+ * Returns pointer to handler or NULL.
+ */
+static struct ipfw_sopt_handler *
+find_sh(uint16_t code, uint8_t version, sopt_handler_f *handler)
+{
+ struct ipfw_sopt_handler *sh, h;
+
+ memset(&h, 0, sizeof(h));
+ h.opcode = code;
+ h.version = version;
+ h.handler = handler;
+
+ sh = (struct ipfw_sopt_handler *)bsearch(&h, ctl3_handlers,
+ ctl3_hsize, sizeof(h), compare_sh);
+
+ return (sh);
+}
+
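+/*
+ * Looks up the sopt handler for @opcode/@version, bumps its reference
+ * count and copies the handler data into @psh.
+ *
+ * Returns 0 on success, EINVAL if no such handler is registered.
+ */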
+static int
+find_ref_sh(uint16_t opcode, uint8_t version, struct ipfw_sopt_handler *psh)
+{
+ struct ipfw_sopt_handler *sh;
+
+ CTL3_LOCK();
+ if ((sh = find_sh(opcode, version, NULL)) == NULL) {
+ CTL3_UNLOCK();
+ printf("ipfw: ipfw_ctl3 invalid option %d""v""%d\n",
+ opcode, version);
+ return (EINVAL);
+ }
+ sh->refcnt++;
+ ctl3_refct++;
+ /* Copy handler data to requested buffer */
+ *psh = *sh;
+ CTL3_UNLOCK();
+
+ return (0);
+}
+
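+/*
+ * Drops the reference taken by find_ref_sh() on the handler described
+ * by @psh.
+ */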
+static void
+find_unref_sh(struct ipfw_sopt_handler *psh)
+{
+ struct ipfw_sopt_handler *sh;
+
+ CTL3_LOCK();
+ sh = find_sh(psh->opcode, psh->version, NULL);
+ KASSERT(sh != NULL, ("ctl3 handler disappeared"));
+ sh->refcnt--;
+ ctl3_refct--;
+ CTL3_UNLOCK();
+}
+
+void
+ipfw_init_sopt_handler()
+{
+
+ CTL3_LOCK_INIT();
+ IPFW_ADD_SOPT_HANDLER(1, scodes);
+}
+
+void
+ipfw_destroy_sopt_handler()
+{
+
+ IPFW_DEL_SOPT_HANDLER(1, scodes);
+ CTL3_LOCK_DESTROY();
+}
+
+/*
+ * Adds one or more sockopt handlers to the global array.
+ * Function may sleep.
+ */
+void
+ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count)
+{
+ size_t sz;
+ struct ipfw_sopt_handler *tmp;
+
+ CTL3_LOCK();
+
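+ /*
+ * malloc() may sleep, so drop the lock while allocating and re-check
+ * the required size afterwards; retry if a concurrent registration
+ * grew the array in the meantime.
+ */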
+ for (;;) {
+ sz = ctl3_hsize + count;
+ CTL3_UNLOCK();
+ tmp = malloc(sizeof(*sh) * sz, M_IPFW, M_WAITOK | M_ZERO);
+ CTL3_LOCK();
+ if (ctl3_hsize + count <= sz)
+ break;
+
+ /* Retry */
+ free(tmp, M_IPFW);
+ }
+
+ /* Merge old & new arrays */
+ sz = ctl3_hsize + count;
+ memcpy(tmp, ctl3_handlers, ctl3_hsize * sizeof(*sh));
+ memcpy(&tmp[ctl3_hsize], sh, count * sizeof(*sh));
+ qsort(tmp, sz, sizeof(*sh), compare_sh);
+ /* Switch new and free old */
+ if (ctl3_handlers != NULL)
+ free(ctl3_handlers, M_IPFW);
+ ctl3_handlers = tmp;
+ ctl3_hsize = sz;
+ ctl3_gencnt++;
+
+ CTL3_UNLOCK();
+}
+
+/*
+ * Removes one or more sockopt handlers from the global array.
+ */
+int
+ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count)
+{
+ size_t sz;
+ struct ipfw_sopt_handler *tmp, *h;
+ int i;
+
+ CTL3_LOCK();
+
+ for (i = 0; i < count; i++) {
+ tmp = &sh[i];
+ h = find_sh(tmp->opcode, tmp->version, tmp->handler);
+ if (h == NULL)
+ continue;
+
+ sz = (ctl3_handlers + ctl3_hsize - (h + 1)) * sizeof(*h);
+ memmove(h, h + 1, sz);
+ ctl3_hsize--;
+ }
+
+ if (ctl3_hsize == 0) {
+ if (ctl3_handlers != NULL)
+ free(ctl3_handlers, M_IPFW);
+ ctl3_handlers = NULL;
+ }
+
+ ctl3_gencnt++;
+
+ CTL3_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Writes data accumulated in @sd to sockopt buffer.
+ * Zeroes internal @sd buffer.
+ */
+static int
+ipfw_flush_sopt_data(struct sockopt_data *sd)
+{
+ struct sockopt *sopt;
+ int error;
+ size_t sz;
+
+ sz = sd->koff;
+ if (sz == 0)
+ return (0);
+
+ sopt = sd->sopt;
+
+ if (sopt->sopt_dir == SOPT_GET) {
+ error = copyout(sd->kbuf, sopt->sopt_val, sz);
+ if (error != 0)
+ return (error);
+ }
+
+ memset(sd->kbuf, 0, sd->ksize);
+ sd->ktotal += sz;
+ sd->koff = 0;
+ if (sd->ktotal + sd->ksize < sd->valsize)
+ sd->kavail = sd->ksize;
+ else
+ sd->kavail = sd->valsize - sd->ktotal;
+
+ /* Update sopt buffer data */
+ sopt->sopt_valsize = sd->ktotal;
+ sopt->sopt_val = sd->sopt_val + sd->ktotal;
+
+ return (0);
+}
+
+/*
+ * Ensures that the @sd buffer has @needed contiguous bytes
+ * available.
+ *
+ * Returns pointer to requested space or NULL.
+ */
+caddr_t
+ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed)
{
-#define RULE_MAXSIZE (256*sizeof(u_int32_t))
int error;
- size_t size, len, valsize;
- struct ip_fw *buf, *rule;
+ caddr_t addr;
+
+ if (sd->kavail < needed) {
+ /*
+ * Flush data and try another time.
+ */
+ error = ipfw_flush_sopt_data(sd);
+
+ if (sd->kavail < needed || error != 0)
+ return (NULL);
+ }
+
+ addr = sd->kbuf + sd->koff;
+ sd->koff += needed;
+ sd->kavail -= needed;
+ return (addr);
+}
+
+/*
+ * Requests @needed contiguous bytes from the @sd buffer.
+ * Used to notify the subsystem that we are only
+ * interested in the first @needed bytes (the request header)
+ * and that the rest of the buffer can be safely zeroed.
+ *
+ * Returns pointer to requested space or NULL.
+ */
+caddr_t
+ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed)
+{
+ caddr_t addr;
+
+ if ((addr = ipfw_get_sopt_space(sd, needed)) == NULL)
+ return (NULL);
+
+ if (sd->kavail > 0)
+ memset(sd->kbuf + sd->koff, 0, sd->kavail);
+
+ return (addr);
+}
+
+/*
+ * New (IP_FW3) sockopt handler: reads the ip_fw3_opheader, finds and
+ * references the registered handler for the requested opcode/version,
+ * prepares a kernel buffer (on-stack, contiguous or sliding-window)
+ * and dispatches the request to that handler.
+ */
+int
+ipfw_ctl3(struct sockopt *sopt)
+{
+ int error, locked;
+ size_t size, valsize;
struct ip_fw_chain *chain;
- u_int32_t rulenum[2];
- uint32_t opt;
- char xbuf[128];
+ char xbuf[256];
+ struct sockopt_data sdata;
+ struct ipfw_sopt_handler h;
ip_fw3_opheader *op3 = NULL;
error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
- if (error)
+ if (error != 0)
+ return (error);
+
+ if (sopt->sopt_name != IP_FW3)
+ return (ipfw_ctl(sopt));
+
+ chain = &V_layer3_chain;
+ error = 0;
+
+ /* Save original valsize before it is altered via sooptcopyin() */
+ valsize = sopt->sopt_valsize;
+ memset(&sdata, 0, sizeof(sdata));
+ /* Read op3 header first to determine actual operation */
+ op3 = (ip_fw3_opheader *)xbuf;
+ error = sooptcopyin(sopt, op3, sizeof(*op3), sizeof(*op3));
+ if (error != 0)
+ return (error);
+ sopt->sopt_valsize = valsize;
+
+ /*
+ * Find and reference command.
+ */
+ error = find_ref_sh(op3->opcode, op3->version, &h);
+ if (error != 0)
return (error);
/*
* Disallow modifications in really-really secure mode, but still allow
* the logging counters to be reset.
*/
- if (sopt->sopt_name == IP_FW_ADD ||
- (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
+ if ((h.dir & HDIR_SET) != 0 && h.opcode != IP_FW_XRESETLOG) {
error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
- if (error)
+ if (error != 0) {
+ find_unref_sh(&h);
return (error);
+ }
}
+ /*
+ * Fill in sockopt_data structure that may be useful for
+ * IP_FW3 get requests.
+ */
+ locked = 0;
+ if (valsize <= sizeof(xbuf)) {
+ /* use on-stack buffer */
+ sdata.kbuf = xbuf;
+ sdata.ksize = sizeof(xbuf);
+ sdata.kavail = valsize;
+ } else {
+
+ /*
+ * Determine opcode type/buffer size:
+ * allocate sliding-window buf for data export or
+ * contiguous buffer for special ops.
+ */
+ if ((h.dir & HDIR_SET) != 0) {
+ /* Set request. Allocate contiguous buffer. */
+ if (valsize > CTL3_LARGEBUF) {
+ find_unref_sh(&h);
+ return (EFBIG);
+ }
+
+ size = valsize;
+ } else {
+ /* Get request. Allocate sliding window buffer */
+ size = (valsize<CTL3_SMALLBUF) ? valsize:CTL3_SMALLBUF;
+
+ if (size < valsize) {
+ /* We have to wire user buffer */
+ error = vslock(sopt->sopt_val, valsize);
+ if (error != 0)
+ return (error);
+ locked = 1;
+ }
+ }
+
+ sdata.kbuf = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
+ sdata.ksize = size;
+ sdata.kavail = size;
+ }
+
+ sdata.sopt = sopt;
+ sdata.sopt_val = sopt->sopt_val;
+ sdata.valsize = valsize;
+
+ /*
+ * Copy either the whole request (if it fits into the kernel buffer)
+ * or its first part, so that most consumers get all the data
+ * they need up front.
+ * In any case, copy not less than sizeof(ip_fw3_opheader).
+ */
+ if ((error = sooptcopyin(sopt, sdata.kbuf, sdata.ksize,
+ sizeof(ip_fw3_opheader))) != 0)
+ return (error);
+ op3 = (ip_fw3_opheader *)sdata.kbuf;
+
+ /* Finally, run handler */
+ error = h.handler(chain, op3, &sdata);
+ find_unref_sh(&h);
+
+ /* Flush state and free buffers */
+ if (error == 0)
+ error = ipfw_flush_sopt_data(&sdata);
+ else
+ ipfw_flush_sopt_data(&sdata);
+
+ if (locked != 0)
+ vsunlock(sdata.sopt_val, valsize);
+
+ /* Restore original pointer and set number of bytes written */
+ sopt->sopt_val = sdata.sopt_val;
+ sopt->sopt_valsize = sdata.ktotal;
+ if (sdata.kbuf != xbuf)
+ free(sdata.kbuf, M_TEMP);
+
+ return (error);
+}
+
+/**
+ * {set|get}sockopt parser.
+ */
+int
+ipfw_ctl(struct sockopt *sopt)
+{
+#define RULE_MAXSIZE (512*sizeof(u_int32_t))
+ int error;
+ size_t size, valsize;
+ struct ip_fw *buf;
+ struct ip_fw_rule0 *rule;
+ struct ip_fw_chain *chain;
+ u_int32_t rulenum[2];
+ uint32_t opt;
+ struct rule_check_info ci;
+ IPFW_RLOCK_TRACKER;
+
chain = &V_layer3_chain;
error = 0;
/* Save original valsize before it is altered via sooptcopyin() */
valsize = sopt->sopt_valsize;
- if ((opt = sopt->sopt_name) == IP_FW3) {
- /*
- * Copy not less than sizeof(ip_fw3_opheader).
- * We hope any IP_FW3 command will fit into 128-byte buffer.
- */
- if ((error = sooptcopyin(sopt, xbuf, sizeof(xbuf),
- sizeof(ip_fw3_opheader))) != 0)
+ opt = sopt->sopt_name;
+
+ /*
+ * Disallow modifications in really-really secure mode, but still allow
+ * the logging counters to be reset.
+ */
+ if (opt == IP_FW_ADD ||
+ (sopt->sopt_dir == SOPT_SET && opt != IP_FW_RESETLOG)) {
+ error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+ if (error != 0)
return (error);
- op3 = (ip_fw3_opheader *)xbuf;
- opt = op3->opcode;
}
switch (opt) {
@@ -1006,9 +3756,7 @@ ipfw_ctl(struct sockopt *sopt)
size += ipfw_dyn_len();
if (size >= sopt->sopt_valsize)
break;
- buf = malloc(size, M_TEMP, M_WAITOK);
- if (buf == NULL)
- break;
+ buf = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
IPFW_UH_RLOCK(chain);
/* check again how much space we need */
want = chain->static_len + ipfw_dyn_len();
@@ -1033,6 +3781,8 @@ ipfw_ctl(struct sockopt *sopt)
error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
sizeof(struct ip_fw7) );
+ memset(&ci, 0, sizeof(struct rule_check_info));
+
/*
* If the size of commands equals RULESIZE7 then we assume
* a FreeBSD7.2 binary is talking to us (set is7=1).
@@ -1042,25 +3792,30 @@ ipfw_ctl(struct sockopt *sopt)
* the first ipfw command is 'ipfw [pipe] list')
* the ipfw binary may crash or loop infinitly...
*/
- if (sopt->sopt_valsize == RULESIZE7(rule)) {
+ size = sopt->sopt_valsize;
+ if (size == RULESIZE7(rule)) {
is7 = 1;
error = convert_rule_to_8(rule);
if (error) {
free(rule, M_TEMP);
return error;
}
- if (error == 0)
- error = check_ipfw_struct(rule, RULESIZE(rule));
- } else {
+ size = RULESIZE(rule);
+ } else
is7 = 0;
if (error == 0)
- error = check_ipfw_struct(rule, sopt->sopt_valsize);
- }
+ error = check_ipfw_rule0(rule, size, &ci);
if (error == 0) {
- /* locking is done within ipfw_add_rule() */
- error = ipfw_add_rule(chain, rule);
- size = RULESIZE(rule);
- if (!error && sopt->sopt_dir == SOPT_GET) {
+ /* locking is done within add_rule() */
+ struct ip_fw *krule;
+ krule = ipfw_alloc_rule(chain, RULEKSIZE0(rule));
+ ci.urule = (caddr_t)rule;
+ ci.krule = krule;
+ import_rule0(&ci);
+ error = commit_rules(chain, &ci, 1);
+ if (error != 0)
+ free_rule(ci.krule);
+ else if (sopt->sopt_dir == SOPT_GET) {
if (is7) {
error = convert_rule_to_7(rule);
size = RULESIZE7(rule);
@@ -1119,82 +3874,64 @@ ipfw_ctl(struct sockopt *sopt)
sopt->sopt_name == IP_FW_RESETLOG);
break;
- /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/
+ /*--- TABLE opcodes ---*/
case IP_FW_TABLE_ADD:
- {
- ipfw_table_entry ent;
-
- error = sooptcopyin(sopt, &ent,
- sizeof(ent), sizeof(ent));
- if (error)
- break;
- error = ipfw_add_table_entry(chain, ent.tbl,
- &ent.addr, sizeof(ent.addr), ent.masklen,
- IPFW_TABLE_CIDR, ent.value);
- }
- break;
-
case IP_FW_TABLE_DEL:
{
ipfw_table_entry ent;
+ struct tentry_info tei;
+ struct tid_info ti;
+ struct table_value v;
error = sooptcopyin(sopt, &ent,
sizeof(ent), sizeof(ent));
if (error)
break;
- error = ipfw_del_table_entry(chain, ent.tbl,
- &ent.addr, sizeof(ent.addr), ent.masklen, IPFW_TABLE_CIDR);
- }
- break;
-
- case IP_FW_TABLE_XADD: /* IP_FW3 */
- case IP_FW_TABLE_XDEL: /* IP_FW3 */
- {
- ipfw_table_xentry *xent = (ipfw_table_xentry *)(op3 + 1);
-
- /* Check minimum header size */
- if (IP_FW3_OPLENGTH(sopt) < offsetof(ipfw_table_xentry, k)) {
- error = EINVAL;
- break;
- }
- /* Check if len field is valid */
- if (xent->len > sizeof(ipfw_table_xentry)) {
- error = EINVAL;
- break;
- }
-
- len = xent->len - offsetof(ipfw_table_xentry, k);
-
- error = (opt == IP_FW_TABLE_XADD) ?
- ipfw_add_table_entry(chain, xent->tbl, &xent->k,
- len, xent->masklen, xent->type, xent->value) :
- ipfw_del_table_entry(chain, xent->tbl, &xent->k,
- len, xent->masklen, xent->type);
+ memset(&tei, 0, sizeof(tei));
+ tei.paddr = &ent.addr;
+ tei.subtype = AF_INET;
+ tei.masklen = ent.masklen;
+ ipfw_import_table_value_legacy(ent.value, &v);
+ tei.pvalue = &v;
+ memset(&ti, 0, sizeof(ti));
+ ti.uidx = ent.tbl;
+ ti.type = IPFW_TABLE_CIDR;
+
+ error = (opt == IP_FW_TABLE_ADD) ?
+ add_table_entry(chain, &ti, &tei, 0, 1) :
+ del_table_entry(chain, &ti, &tei, 0, 1);
}
break;
+
case IP_FW_TABLE_FLUSH:
{
u_int16_t tbl;
+ struct tid_info ti;
error = sooptcopyin(sopt, &tbl,
sizeof(tbl), sizeof(tbl));
if (error)
break;
- error = ipfw_flush_table(chain, tbl);
+ memset(&ti, 0, sizeof(ti));
+ ti.uidx = tbl;
+ error = flush_table(chain, &ti);
}
break;
case IP_FW_TABLE_GETSIZE:
{
u_int32_t tbl, cnt;
+ struct tid_info ti;
if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
sizeof(tbl))))
break;
+ memset(&ti, 0, sizeof(ti));
+ ti.uidx = tbl;
IPFW_RLOCK(chain);
- error = ipfw_count_table(chain, tbl, &cnt);
+ error = ipfw_count_table(chain, &ti, &cnt);
IPFW_RUNLOCK(chain);
if (error)
break;
@@ -1205,6 +3942,7 @@ ipfw_ctl(struct sockopt *sopt)
case IP_FW_TABLE_LIST:
{
ipfw_table *tbl;
+ struct tid_info ti;
if (sopt->sopt_valsize < sizeof(*tbl)) {
error = EINVAL;
@@ -1219,8 +3957,10 @@ ipfw_ctl(struct sockopt *sopt)
}
tbl->size = (size - sizeof(*tbl)) /
sizeof(ipfw_table_entry);
+ memset(&ti, 0, sizeof(ti));
+ ti.uidx = tbl->tbl;
IPFW_RLOCK(chain);
- error = ipfw_dump_table(chain, tbl);
+ error = ipfw_dump_table_legacy(chain, &ti, tbl);
IPFW_RUNLOCK(chain);
if (error) {
free(tbl, M_TEMP);
@@ -1231,62 +3971,6 @@ ipfw_ctl(struct sockopt *sopt)
}
break;
- case IP_FW_TABLE_XGETSIZE: /* IP_FW3 */
- {
- uint32_t *tbl;
-
- if (IP_FW3_OPLENGTH(sopt) < sizeof(uint32_t)) {
- error = EINVAL;
- break;
- }
-
- tbl = (uint32_t *)(op3 + 1);
-
- IPFW_RLOCK(chain);
- error = ipfw_count_xtable(chain, *tbl, tbl);
- IPFW_RUNLOCK(chain);
- if (error)
- break;
- error = sooptcopyout(sopt, op3, sopt->sopt_valsize);
- }
- break;
-
- case IP_FW_TABLE_XLIST: /* IP_FW3 */
- {
- ipfw_xtable *tbl;
-
- if ((size = valsize) < sizeof(ipfw_xtable)) {
- error = EINVAL;
- break;
- }
-
- tbl = malloc(size, M_TEMP, M_ZERO | M_WAITOK);
- memcpy(tbl, op3, sizeof(ipfw_xtable));
-
- /* Get maximum number of entries we can store */
- tbl->size = (size - sizeof(ipfw_xtable)) /
- sizeof(ipfw_table_xentry);
- IPFW_RLOCK(chain);
- error = ipfw_dump_xtable(chain, tbl);
- IPFW_RUNLOCK(chain);
- if (error) {
- free(tbl, M_TEMP);
- break;
- }
-
- /* Revert size field back to bytes */
- tbl->size = tbl->size * sizeof(ipfw_table_xentry) +
- sizeof(ipfw_table);
- /*
- * Since we call sooptcopyin() with small buffer, sopt_valsize is
- * decreased to reflect supplied buffer size. Set it back to original value
- */
- sopt->sopt_valsize = valsize;
- error = sooptcopyout(sopt, tbl, size);
- free(tbl, M_TEMP);
- }
- break;
-
/*--- NAT operations are protected by the IPFW_LOCK ---*/
case IP_FW_NAT_CFG:
if (IPFW_NAT_LOADED)
@@ -1336,18 +4020,16 @@ ipfw_ctl(struct sockopt *sopt)
return (error);
#undef RULE_MAXSIZE
}
-
-
#define RULE_MAXSIZE (256*sizeof(u_int32_t))
/* Functions to convert rules 7.2 <==> 8.0 */
-int
-convert_rule_to_7(struct ip_fw *rule)
+static int
+convert_rule_to_7(struct ip_fw_rule0 *rule)
{
/* Used to modify original rule */
struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
/* copy of original rule, version 8 */
- struct ip_fw *tmp;
+ struct ip_fw_rule0 *tmp;
/* Used to copy commands */
ipfw_insn *ccmd, *dst;
@@ -1360,13 +4042,12 @@ convert_rule_to_7(struct ip_fw *rule)
bcopy(rule, tmp, RULE_MAXSIZE);
/* Copy fields */
- rule7->_pad = tmp->_pad;
+ //rule7->_pad = tmp->_pad;
rule7->set = tmp->set;
rule7->rulenum = tmp->rulenum;
rule7->cmd_len = tmp->cmd_len;
rule7->act_ofs = tmp->act_ofs;
rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
- rule7->next = (struct ip_fw7 *)tmp->x_next;
rule7->cmd_len = tmp->cmd_len;
rule7->pcnt = tmp->pcnt;
rule7->bcnt = tmp->bcnt;
@@ -1396,8 +4077,8 @@ convert_rule_to_7(struct ip_fw *rule)
return 0;
}
-int
-convert_rule_to_8(struct ip_fw *rule)
+static int
+convert_rule_to_8(struct ip_fw_rule0 *rule)
{
/* Used to modify original rule */
struct ip_fw7 *rule7 = (struct ip_fw7 *) rule;
@@ -1439,7 +4120,6 @@ convert_rule_to_8(struct ip_fw *rule)
rule->cmd_len = tmp->cmd_len;
rule->act_ofs = tmp->act_ofs;
rule->next_rule = (struct ip_fw *)tmp->next_rule;
- rule->x_next = (struct ip_fw *)tmp->next;
rule->cmd_len = tmp->cmd_len;
rule->id = 0; /* XXX see if is ok = 0 */
rule->pcnt = tmp->pcnt;
@@ -1450,4 +4130,486 @@ convert_rule_to_8(struct ip_fw *rule)
return 0;
}
+/*
+ * Named object API.
+ */
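+
+/*
+ * Illustrative usage sketch for the object hash API below (not compiled;
+ * assumes @no is a struct named_object embedded in a caller-owned config
+ * structure and that @set/@name are supplied by the caller):
+ *
+ *   struct namedobj_instance *ni;
+ *   uint16_t kidx;
+ *
+ *   ni = ipfw_objhash_create(IPFW_OBJECTS_DEFAULT);
+ *   if (ipfw_objhash_alloc_idx(ni, &kidx) == 0) {
+ *           no->kidx = kidx;
+ *           ipfw_objhash_add(ni, no);
+ *   }
+ *   no = ipfw_objhash_lookup_name(ni, set, name);
+ *   ...
+ *   ipfw_objhash_del(ni, no);
+ *   ipfw_objhash_free_idx(ni, no->kidx);
+ *   ipfw_objhash_destroy(ni);
+ */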
+
+void
+ipfw_init_srv(struct ip_fw_chain *ch)
+{
+
+ ch->srvmap = ipfw_objhash_create(IPFW_OBJECTS_DEFAULT);
+ ch->srvstate = malloc(sizeof(void *) * IPFW_OBJECTS_DEFAULT,
+ M_IPFW, M_WAITOK | M_ZERO);
+}
+
+void
+ipfw_destroy_srv(struct ip_fw_chain *ch)
+{
+
+ free(ch->srvstate, M_IPFW);
+ ipfw_objhash_destroy(ch->srvmap);
+}
+
+/*
+ * Allocate new bitmask which can be used to enlarge/shrink
+ * named instance index.
+ */
+void
+ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks)
+{
+ size_t size;
+ int max_blocks;
+ u_long *idx_mask;
+
+ KASSERT((items % BLOCK_ITEMS) == 0,
+ ("bitmask size needs to power of 2 and greater or equal to %zu",
+ BLOCK_ITEMS));
+
+ max_blocks = items / BLOCK_ITEMS;
+ size = items / 8;
+ idx_mask = malloc(size * IPFW_MAX_SETS, M_IPFW, M_WAITOK);
+ /* Mark all as free */
+ memset(idx_mask, 0xFF, size * IPFW_MAX_SETS);
+ *idx_mask &= ~(u_long)1; /* Skip index 0 */
+
+ *idx = idx_mask;
+ *pblocks = max_blocks;
+}
+
+/*
+ * Copy current bitmask index to new one.
+ */
+void
+ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks)
+{
+ int old_blocks, new_blocks;
+ u_long *old_idx, *new_idx;
+ int i;
+
+ old_idx = ni->idx_mask;
+ old_blocks = ni->max_blocks;
+ new_idx = *idx;
+ new_blocks = *blocks;
+
+ for (i = 0; i < IPFW_MAX_SETS; i++) {
+ memcpy(&new_idx[new_blocks * i], &old_idx[old_blocks * i],
+ old_blocks * sizeof(u_long));
+ }
+}
+
+/*
+ * Swaps current @ni index with new one.
+ */
+void
+ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks)
+{
+ int old_blocks;
+ u_long *old_idx;
+
+ old_idx = ni->idx_mask;
+ old_blocks = ni->max_blocks;
+
+ ni->idx_mask = *idx;
+ ni->max_blocks = *blocks;
+
+ /* Save old values */
+ *idx = old_idx;
+ *blocks = old_blocks;
+}
+
+void
+ipfw_objhash_bitmap_free(void *idx, int blocks)
+{
+
+ free(idx, M_IPFW);
+}
+
+/*
+ * Creates named hash instance.
+ * Must be called without holding any locks.
+ * Return pointer to new instance.
+ */
+struct namedobj_instance *
+ipfw_objhash_create(uint32_t items)
+{
+ struct namedobj_instance *ni;
+ int i;
+ size_t size;
+
+ size = sizeof(struct namedobj_instance) +
+ sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE +
+ sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE;
+
+ ni = malloc(size, M_IPFW, M_WAITOK | M_ZERO);
+ ni->nn_size = NAMEDOBJ_HASH_SIZE;
+ ni->nv_size = NAMEDOBJ_HASH_SIZE;
+
+ ni->names = (struct namedobjects_head *)(ni +1);
+ ni->values = &ni->names[ni->nn_size];
+
+ for (i = 0; i < ni->nn_size; i++)
+ TAILQ_INIT(&ni->names[i]);
+
+ for (i = 0; i < ni->nv_size; i++)
+ TAILQ_INIT(&ni->values[i]);
+
+ /* Set default hashing/comparison functions */
+ ni->hash_f = objhash_hash_name;
+ ni->cmp_f = objhash_cmp_name;
+
+ /* Allocate bitmask separately due to possible resize */
+ ipfw_objhash_bitmap_alloc(items, (void*)&ni->idx_mask, &ni->max_blocks);
+
+ return (ni);
+}
+
+void
+ipfw_objhash_destroy(struct namedobj_instance *ni)
+{
+
+ free(ni->idx_mask, M_IPFW);
+ free(ni, M_IPFW);
+}
+
+void
+ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f,
+ objhash_cmp_f *cmp_f)
+{
+
+ ni->hash_f = hash_f;
+ ni->cmp_f = cmp_f;
+}
+
+static uint32_t
+objhash_hash_name(struct namedobj_instance *ni, const void *name, uint32_t set)
+{
+
+ return (fnv_32_str((const char *)name, FNV1_32_INIT));
+}
+
+static int
+objhash_cmp_name(struct named_object *no, const void *name, uint32_t set)
+{
+
+ if ((strcmp(no->name, (const char *)name) == 0) && (no->set == set))
+ return (0);
+
+ return (1);
+}
+
+static uint32_t
+objhash_hash_idx(struct namedobj_instance *ni, uint32_t val)
+{
+ uint32_t v;
+
+ v = val % (ni->nv_size - 1);
+
+ return (v);
+}
+
+struct named_object *
+ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name)
+{
+ struct named_object *no;
+ uint32_t hash;
+
+ hash = ni->hash_f(ni, name, set) % ni->nn_size;
+
+ TAILQ_FOREACH(no, &ni->names[hash], nn_next) {
+ if (ni->cmp_f(no, name, set) == 0)
+ return (no);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Find named object TLV by @uidx.
+ * Check @tlvs for valid data inside.
+ *
+ * Returns pointer to found TLV or NULL.
+ */
+ipfw_obj_ntlv *
+ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx, uint32_t etlv)
+{
+ ipfw_obj_ntlv *ntlv;
+ uintptr_t pa, pe;
+ int l;
+
+ pa = (uintptr_t)tlvs;
+ pe = pa + len;
+ l = 0;
+ for (; pa < pe; pa += l) {
+ ntlv = (ipfw_obj_ntlv *)pa;
+ l = ntlv->head.length;
+
+ if (l != sizeof(*ntlv))
+ return (NULL);
+
+ if (ntlv->idx != uidx)
+ continue;
+ /*
+ * When userland has specified a zero TLV type, do
+ * not compare it with @etlv. In some cases userland
+ * doesn't know what type it should have. Use only
+ * uidx and name to search for the named_object.
+ */
+ if (ntlv->head.type != 0 &&
+ ntlv->head.type != (uint16_t)etlv)
+ continue;
+
+ if (ipfw_check_object_name_generic(ntlv->name) != 0)
+ return (NULL);
+
+ return (ntlv);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Finds object config based on either legacy index
+ * or name in ntlv.
+ * Note @ti structure contains unchecked data from userland.
+ *
+ * Returns 0 on success and fills in @pno with the found config.
+ */
+int
+ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti,
+ uint32_t etlv, struct named_object **pno)
+{
+ char *name;
+ ipfw_obj_ntlv *ntlv;
+ uint32_t set;
+
+ if (ti->tlvs == NULL)
+ return (EINVAL);
+
+ ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, etlv);
+ if (ntlv == NULL)
+ return (EINVAL);
+ name = ntlv->name;
+
+ /*
+ * Use the set provided by @ti instead of the @ntlv one.
+ * This is needed due to the different set-handling behavior
+ * controlled by V_fw_tables_sets.
+ */
+ set = ti->set;
+ *pno = ipfw_objhash_lookup_name(ni, set, name);
+ if (*pno == NULL)
+ return (ESRCH);
+ return (0);
+}
+
+/*
+ * Find named object by name, considering also its TLV type.
+ */
+struct named_object *
+ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, uint32_t set,
+ uint32_t type, const char *name)
+{
+ struct named_object *no;
+ uint32_t hash;
+
+ hash = ni->hash_f(ni, name, set) % ni->nn_size;
+
+ TAILQ_FOREACH(no, &ni->names[hash], nn_next) {
+ if (ni->cmp_f(no, name, set) == 0 &&
+ no->etlv == (uint16_t)type)
+ return (no);
+ }
+
+ return (NULL);
+}
+
+struct named_object *
+ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t kidx)
+{
+ struct named_object *no;
+ uint32_t hash;
+
+ hash = objhash_hash_idx(ni, kidx);
+
+ TAILQ_FOREACH(no, &ni->values[hash], nv_next) {
+ if (no->kidx == kidx)
+ return (no);
+ }
+
+ return (NULL);
+}
+
+int
+ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a,
+ struct named_object *b)
+{
+
+ if ((strcmp(a->name, b->name) == 0) && a->set == b->set)
+ return (1);
+
+ return (0);
+}
+
+void
+ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no)
+{
+ uint32_t hash;
+
+ hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size;
+ TAILQ_INSERT_HEAD(&ni->names[hash], no, nn_next);
+
+ hash = objhash_hash_idx(ni, no->kidx);
+ TAILQ_INSERT_HEAD(&ni->values[hash], no, nv_next);
+
+ ni->count++;
+}
+
+void
+ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no)
+{
+ uint32_t hash;
+
+ hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size;
+ TAILQ_REMOVE(&ni->names[hash], no, nn_next);
+
+ hash = objhash_hash_idx(ni, no->kidx);
+ TAILQ_REMOVE(&ni->values[hash], no, nv_next);
+
+ ni->count--;
+}
+
+uint32_t
+ipfw_objhash_count(struct namedobj_instance *ni)
+{
+
+ return (ni->count);
+}
+
+uint32_t
+ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type)
+{
+ struct named_object *no;
+ uint32_t count;
+ int i;
+
+ count = 0;
+ for (i = 0; i < ni->nn_size; i++) {
+ TAILQ_FOREACH(no, &ni->names[i], nn_next) {
+ if (no->etlv == type)
+ count++;
+ }
+ }
+ return (count);
+}
+
+/*
+ * Runs @f for each named object found.
+ * It is safe to delete objects from the callback.
+ */
+int
+ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg)
+{
+ struct named_object *no, *no_tmp;
+ int i, ret;
+
+ for (i = 0; i < ni->nn_size; i++) {
+ TAILQ_FOREACH_SAFE(no, &ni->names[i], nn_next, no_tmp) {
+ ret = f(ni, no, arg);
+ if (ret != 0)
+ return (ret);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Runs @f for each named object found with type @type.
+ * It is safe to delete objects from the callback.
+ */
+int
+ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f,
+ void *arg, uint16_t type)
+{
+ struct named_object *no, *no_tmp;
+ int i, ret;
+
+ for (i = 0; i < ni->nn_size; i++) {
+ TAILQ_FOREACH_SAFE(no, &ni->names[i], nn_next, no_tmp) {
+ if (no->etlv != type)
+ continue;
+ ret = f(ni, no, arg);
+ if (ret != 0)
+ return (ret);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Removes index from given set.
+ * Returns 0 on success.
+ */
+int
+ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx)
+{
+ u_long *mask;
+ int i, v;
+
+ i = idx / BLOCK_ITEMS;
+ v = idx % BLOCK_ITEMS;
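+ /*
+ * Word (idx / BLOCK_ITEMS) of the bitmask holds bit (idx % BLOCK_ITEMS);
+ * a set bit marks the index as free (see ipfw_objhash_bitmap_alloc()).
+ */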
+
+ if (i >= ni->max_blocks)
+ return (1);
+
+ mask = &ni->idx_mask[i];
+
+ if ((*mask & ((u_long)1 << v)) != 0)
+ return (1);
+
+ /* Mark as free */
+ *mask |= (u_long)1 << v;
+
+ /* Update free offset */
+ if (ni->free_off[0] > i)
+ ni->free_off[0] = i;
+
+ return (0);
+}
+
+/*
+ * Allocates a new index in the given instance and stores it in @pidx.
+ * Returns 0 on success.
+ */
+int
+ipfw_objhash_alloc_idx(void *n, uint16_t *pidx)
+{
+ struct namedobj_instance *ni;
+ u_long *mask;
+ int i, off, v;
+
+ ni = (struct namedobj_instance *)n;
+
+ off = ni->free_off[0];
+ mask = &ni->idx_mask[off];
+
+ for (i = off; i < ni->max_blocks; i++, mask++) {
+ if ((v = ffsl(*mask)) == 0)
+ continue;
+
+ /* Mark as busy */
+ *mask &= ~ ((u_long)1 << (v - 1));
+
+ ni->free_off[0] = i;
+
+ v = BLOCK_ITEMS * i + v - 1;
+
+ *pidx = v;
+ return (0);
+ }
+
+ return (1);
+}
+
/* end of file */
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_table.c b/freebsd/sys/netpfil/ipfw/ip_fw_table.c
index 71579795..9d2baad2 100644
--- a/freebsd/sys/netpfil/ipfw/ip_fw_table.c
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_table.c
@@ -2,6 +2,8 @@
/*-
* Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
+ * Copyright (c) 2014 Yandex LLC
+ * Copyright (c) 2014 Alexander V. Chernikov
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -29,24 +31,18 @@
__FBSDID("$FreeBSD$");
/*
- * Lookup table support for ipfw
+ * Lookup table support for ipfw.
*
- * Lookup tables are implemented (at the moment) using the radix
- * tree used for routing tables. Tables store key-value entries, where
- * keys are network prefixes (addr/masklen), and values are integers.
- * As a degenerate case we can interpret keys as 32-bit integers
- * (with a /32 mask).
+ * This file contains handlers for all generic tables' operations:
+ * add/del/flush entries, list/dump tables, etc.
*
- * The table is protected by the IPFW lock even for manipulation coming
- * from userland, because operations are typically fast.
+ * Table data modification is protected by both UH and runtime lock
+ * while reading configuration/data is protected by UH lock.
+ *
+ * Lookup algorithms for all table types are located in ip_fw_table_algo.c
*/
#include <rtems/bsd/local/opt_ipfw.h>
-#include <rtems/bsd/local/opt_inet.h>
-#ifndef INET
-#error IPFIREWALL requires INET.
-#endif /* INET */
-#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
@@ -54,713 +50,3296 @@ __FBSDID("$FreeBSD$");
#include <sys/kernel.h>
#include <rtems/bsd/sys/lock.h>
#include <sys/rwlock.h>
+#include <sys/rmlock.h>
#include <sys/socket.h>
+#include <sys/socketvar.h>
#include <sys/queue.h>
#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
-#include <net/radix.h>
-#include <net/route.h>
-#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/ip_var.h> /* struct ipfw_rule_ref */
#include <netinet/ip_fw.h>
#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/ip_fw_table.h>
-#ifdef MAC
-#include <security/mac/mac_framework.h>
-#endif
+ /*
+ * Table has the following `type` concepts:
+ *
+ * `no.type` represents lookup key type (addr, ifp, uid, etc.)
+ * vmask represents bitmask of table values which are present at the moment.
+ * Special IPFW_VTYPE_LEGACY ( (uint32_t)-1 ) represents old
+ * single-value-for-all approach.
+ */
+struct table_config {
+ struct named_object no;
+ uint8_t tflags; /* type flags */
+ uint8_t locked; /* 1 if locked from changes */
+ uint8_t linked; /* 1 if already linked */
+ uint8_t ochanged; /* used by set swapping */
+ uint8_t vshared; /* 1 if using shared value array */
+ uint8_t spare[3];
+ uint32_t count; /* Number of records */
+ uint32_t limit; /* Max number of records */
+ uint32_t vmask; /* bitmask with supported values */
+ uint32_t ocount; /* used by set swapping */
+ uint64_t gencnt; /* generation count */
+ char tablename[64]; /* table name */
+ struct table_algo *ta; /* Callbacks for given algo */
+ void *astate; /* algorithm state */
+ struct table_info ti_copy; /* data to put to table_info */
+ struct namedobj_instance *vi;
+};
-MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
+static int find_table_err(struct namedobj_instance *ni, struct tid_info *ti,
+ struct table_config **tc);
+static struct table_config *find_table(struct namedobj_instance *ni,
+ struct tid_info *ti);
+static struct table_config *alloc_table_config(struct ip_fw_chain *ch,
+ struct tid_info *ti, struct table_algo *ta, char *adata, uint8_t tflags);
+static void free_table_config(struct namedobj_instance *ni,
+ struct table_config *tc);
+static int create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti,
+ char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int ref);
+static void link_table(struct ip_fw_chain *ch, struct table_config *tc);
+static void unlink_table(struct ip_fw_chain *ch, struct table_config *tc);
+static int find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct tentry_info *tei, uint32_t count, int op, struct table_config **ptc);
+#define OP_ADD 1
+#define OP_DEL 0
+static int export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh,
+ struct sockopt_data *sd);
+static void export_table_info(struct ip_fw_chain *ch, struct table_config *tc,
+ ipfw_xtable_info *i);
+static int dump_table_tentry(void *e, void *arg);
+static int dump_table_xentry(void *e, void *arg);
+
+static int swap_tables(struct ip_fw_chain *ch, struct tid_info *a,
+ struct tid_info *b);
+
+static int check_table_name(const char *name);
+static int check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts,
+ struct table_config *tc, struct table_info *ti, uint32_t count);
+static int destroy_table(struct ip_fw_chain *ch, struct tid_info *ti);
+
+static struct table_algo *find_table_algo(struct tables_config *tableconf,
+ struct tid_info *ti, char *name);
+
+static void objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti);
+static void ntlv_to_ti(struct _ipfw_obj_ntlv *ntlv, struct tid_info *ti);
+
+#define CHAIN_TO_NI(chain) (CHAIN_TO_TCFG(chain)->namehash)
+#define KIDX_TO_TI(ch, k) (&(((struct table_info *)(ch)->tablestate)[k]))
+
+#define TA_BUF_SZ 128 /* On-stack buffer for add/delete state */
-struct table_entry {
- struct radix_node rn[2];
- struct sockaddr_in addr, mask;
- u_int32_t value;
-};
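+/*
+ * Notifies every registered table operation state about a change to
+ * @object (a chain or a table config) by invoking its rollback callback.
+ */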
+void
+rollback_toperation_state(struct ip_fw_chain *ch, void *object)
+{
+ struct tables_config *tcfg;
+ struct op_state *os;
-struct xaddr_iface {
- uint8_t if_len; /* length of this struct */
- uint8_t pad[7]; /* Align name */
- char ifname[IF_NAMESIZE]; /* Interface name */
-};
+ tcfg = CHAIN_TO_TCFG(ch);
+ TAILQ_FOREACH(os, &tcfg->state_list, next)
+ os->func(object, os);
+}
+
+void
+add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts)
+{
+ struct tables_config *tcfg;
+
+ tcfg = CHAIN_TO_TCFG(ch);
+ TAILQ_INSERT_HEAD(&tcfg->state_list, &ts->opstate, next);
+}
+
+void
+del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts)
+{
+ struct tables_config *tcfg;
+
+ tcfg = CHAIN_TO_TCFG(ch);
+ TAILQ_REMOVE(&tcfg->state_list, &ts->opstate, next);
+}
+
+void
+tc_ref(struct table_config *tc)
+{
+
+ tc->no.refcnt++;
+}
+
+void
+tc_unref(struct table_config *tc)
+{
+
+ tc->no.refcnt--;
+}
+
+static struct table_value *
+get_table_value(struct ip_fw_chain *ch, struct table_config *tc, uint32_t kidx)
+{
+ struct table_value *pval;
+
+ pval = (struct table_value *)ch->valuestate;
+
+ return (&pval[kidx]);
+}
-struct table_xentry {
- struct radix_node rn[2];
- union {
-#ifdef INET6
- struct sockaddr_in6 addr6;
-#endif
- struct xaddr_iface iface;
- } a;
- union {
-#ifdef INET6
- struct sockaddr_in6 mask6;
-#endif
- struct xaddr_iface ifmask;
- } m;
- u_int32_t value;
-};
/*
- * The radix code expects addr and mask to be array of bytes,
- * with the first byte being the length of the array. rn_inithead
- * is called with the offset in bits of the lookup key within the
- * array. If we use a sockaddr_in as the underlying type,
- * sin_len is conveniently located at offset 0, sin_addr is at
- * offset 4 and normally aligned.
- * But for portability, let's avoid assumption and make the code explicit
+ * Checks if we're able to insert/update entry @tei into table
+ * w.r.t @tc limits.
+ * May alter @tei to indicate insertion error / insert
+ * options.
+ *
+ * Returns 0 if the operation can be performed.
*/
-#define KEY_LEN(v) *((uint8_t *)&(v))
-#define KEY_OFS (8*offsetof(struct sockaddr_in, sin_addr))
+static int
+check_table_limit(struct table_config *tc, struct tentry_info *tei)
+{
+
+ if (tc->limit == 0 || tc->count < tc->limit)
+ return (0);
+
+ if ((tei->flags & TEI_FLAGS_UPDATE) == 0) {
+ /* Notify userland on error cause */
+ tei->flags |= TEI_FLAGS_LIMIT;
+ return (EFBIG);
+ }
+
+ /*
+ * We have UPDATE flag set.
+ * Permit updating record (if found),
+ * but restrict adding new one since we've
+ * already hit the limit.
+ */
+ tei->flags |= TEI_FLAGS_DONTADD;
+
+ return (0);
+}
+
/*
- * Do not require radix to compare more than actual IPv4/IPv6 address
+ * Convert algorithm callback return code into
+ * one of pre-defined states known by userland.
*/
-#define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t))
-#define KEY_LEN_INET6 (offsetof(struct sockaddr_in6, sin6_addr) + sizeof(struct in6_addr))
-#define KEY_LEN_IFACE (offsetof(struct xaddr_iface, ifname))
+static void
+store_tei_result(struct tentry_info *tei, int op, int error, uint32_t num)
+{
+ int flag;
-#define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr))
-#define OFF_LEN_INET6 (8 * offsetof(struct sockaddr_in6, sin6_addr))
-#define OFF_LEN_IFACE (8 * offsetof(struct xaddr_iface, ifname))
+ flag = 0;
+ switch (error) {
+ case 0:
+ if (op == OP_ADD && num != 0)
+ flag = TEI_FLAGS_ADDED;
+ if (op == OP_DEL)
+ flag = TEI_FLAGS_DELETED;
+ break;
+ case ENOENT:
+ flag = TEI_FLAGS_NOTFOUND;
+ break;
+ case EEXIST:
+ flag = TEI_FLAGS_EXISTS;
+ break;
+ default:
+ flag = TEI_FLAGS_ERROR;
+ }
-#ifdef INET6
-static inline void
-ipv6_writemask(struct in6_addr *addr6, uint8_t mask)
+ tei->flags |= flag;
+}
+
+/*
+ * Creates and references table with default parameters.
+ * Saves table config, algo and allocated kidx into @ptc, @pta and
+ * @pkidx if non-zero.
+ * Used for table auto-creation to support old binaries.
+ *
+ * Returns 0 on success.
+ */
+static int
+create_table_compat(struct ip_fw_chain *ch, struct tid_info *ti,
+ uint16_t *pkidx)
{
- uint32_t *cp;
+ ipfw_xtable_info xi;
+ int error;
+
+ memset(&xi, 0, sizeof(xi));
+ /* Set default value mask for legacy clients */
+ xi.vmask = IPFW_VTYPE_LEGACY;
+
+ error = create_table_internal(ch, ti, NULL, &xi, pkidx, 1);
+ if (error != 0)
+ return (error);
- for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32)
- *cp++ = 0xFFFFFFFF;
- *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0);
+ return (0);
+}
+
+/*
+ * Find and reference existing table optionally
+ * creating new one.
+ *
+ * Saves found table config into @ptc.
+ * Note function may drop/acquire UH_WLOCK.
+ * Returns 0 if the table was found/created and referenced,
+ * or a non-zero error code otherwise.
+ */
+static int
+find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct tentry_info *tei, uint32_t count, int op,
+ struct table_config **ptc)
+{
+ struct namedobj_instance *ni;
+ struct table_config *tc;
+ uint16_t kidx;
+ int error;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ ni = CHAIN_TO_NI(ch);
+ tc = NULL;
+ if ((tc = find_table(ni, ti)) != NULL) {
+ /* check table type */
+ if (tc->no.subtype != ti->type)
+ return (EINVAL);
+
+ if (tc->locked != 0)
+ return (EACCES);
+
+ /* Try to exit early on limit hit */
+ if (op == OP_ADD && count == 1 &&
+ check_table_limit(tc, tei) != 0)
+ return (EFBIG);
+
+ /* Reference and return */
+ tc->no.refcnt++;
+ *ptc = tc;
+ return (0);
+ }
+
+ if (op == OP_DEL)
+ return (ESRCH);
+
+ /* Compatibility mode: create new table for old clients */
+ if ((tei->flags & TEI_FLAGS_COMPAT) == 0)
+ return (ESRCH);
+
+ IPFW_UH_WUNLOCK(ch);
+ error = create_table_compat(ch, ti, &kidx);
+ IPFW_UH_WLOCK(ch);
+
+ if (error != 0)
+ return (error);
+
+ tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx);
+ KASSERT(tc != NULL, ("create_table_compat returned bad idx %d", kidx));
+
+ /* OK, now we've got referenced table. */
+ *ptc = tc;
+ return (0);
+}
+
+/*
+ * Rolls back the @added entries already inserted into @tc, using state
+ * array @ta_buf_m. Assume the following layout:
+ * 1) ADD state (ta_buf_m[0] ... ta_buf_m[added - 1]) for handling update cases
+ * 2) DEL state (ta_buf_m[count] ... ta_buf_m[count + added - 1])
+ * for storing deleted state
+ */
+static void
+rollback_added_entries(struct ip_fw_chain *ch, struct table_config *tc,
+ struct table_info *tinfo, struct tentry_info *tei, caddr_t ta_buf_m,
+ uint32_t count, uint32_t added)
+{
+ struct table_algo *ta;
+ struct tentry_info *ptei;
+ caddr_t v, vv;
+ size_t ta_buf_sz;
+ int error, i;
+ uint32_t num;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ ta = tc->ta;
+ ta_buf_sz = ta->ta_buf_size;
+ v = ta_buf_m;
+ vv = v + count * ta_buf_sz;
+ for (i = 0; i < added; i++, v += ta_buf_sz, vv += ta_buf_sz) {
+ ptei = &tei[i];
+ if ((ptei->flags & TEI_FLAGS_UPDATED) != 0) {
+
+ /*
+ * We have old value stored by previous
+ * call in @ptei->value. Do add once again
+ * to restore it.
+ */
+ error = ta->add(tc->astate, tinfo, ptei, v, &num);
+ KASSERT(error == 0, ("rollback UPDATE fail"));
+ KASSERT(num == 0, ("rollback UPDATE fail2"));
+ continue;
+ }
+
+ error = ta->prepare_del(ch, ptei, vv);
+ KASSERT(error == 0, ("pre-rollback INSERT failed"));
+ error = ta->del(tc->astate, tinfo, ptei, vv, &num);
+ KASSERT(error == 0, ("rollback INSERT failed"));
+ tc->count -= num;
+ }
+}
+
+/*
+ * Prepares add/del state for all @count entries in @tei.
+ * Uses either stack buffer (@ta_buf) or allocates a new one.
+ * Stores pointer to allocated buffer back to @ta_buf.
+ *
+ * Returns 0 on success.
+ */
+static int
+prepare_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta,
+ struct tentry_info *tei, uint32_t count, int op, caddr_t *ta_buf)
+{
+ caddr_t ta_buf_m, v;
+ size_t ta_buf_sz, sz;
+ struct tentry_info *ptei;
+ int error, i;
+
+ error = 0;
+ ta_buf_sz = ta->ta_buf_size;
+ if (count == 1) {
+ /* Single add/delete, use on-stack buffer */
+ memset(*ta_buf, 0, TA_BUF_SZ);
+ ta_buf_m = *ta_buf;
+ } else {
+
+ /*
+ * Multiple adds/deletes, allocate larger buffer
+ *
+ * Note we need 2xcount buffer for add case:
+ * we have hold both ADD state
+ * and DELETE state (this may be needed
+ * if we need to rollback all changes)
+ */
+ sz = count * ta_buf_sz;
+ ta_buf_m = malloc((op == OP_ADD) ? sz * 2 : sz, M_TEMP,
+ M_WAITOK | M_ZERO);
+ }
+
+ v = ta_buf_m;
+ for (i = 0; i < count; i++, v += ta_buf_sz) {
+ ptei = &tei[i];
+ error = (op == OP_ADD) ?
+ ta->prepare_add(ch, ptei, v) : ta->prepare_del(ch, ptei, v);
+
+ /*
+ * Some syntax error (incorrect mask, or address, or
+ * anything). Return error regardless of atomicity
+ * settings.
+ */
+ if (error != 0)
+ break;
+ }
+
+ *ta_buf = ta_buf_m;
+ return (error);
}
-#endif
+/*
+ * Flushes allocated state for each @count entries in @tei.
+ * Frees @ta_buf_m if differs from stack buffer @ta_buf.
+ */
+static void
+flush_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta,
+ struct tentry_info *tei, uint32_t count, int rollback,
+ caddr_t ta_buf_m, caddr_t ta_buf)
+{
+ caddr_t v;
+ struct tentry_info *ptei;
+ size_t ta_buf_sz;
+ int i;
+
+ ta_buf_sz = ta->ta_buf_size;
+
+ /* Run cleaning callback anyway */
+ v = ta_buf_m;
+ for (i = 0; i < count; i++, v += ta_buf_sz) {
+ ptei = &tei[i];
+ ta->flush_entry(ch, ptei, v);
+ if (ptei->ptv != NULL) {
+ free(ptei->ptv, M_IPFW);
+ ptei->ptv = NULL;
+ }
+ }
+
+ /* Clean up "deleted" state in case of rollback */
+ if (rollback != 0) {
+ v = ta_buf_m + count * ta_buf_sz;
+ for (i = 0; i < count; i++, v += ta_buf_sz)
+ ta->flush_entry(ch, &tei[i], v);
+ }
+
+ if (ta_buf_m != ta_buf)
+ free(ta_buf_m, M_TEMP);
+}
+
+
+static void
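+/*
+ * Rollback callback registered via add_toperation_state(): invoked (under
+ * UH_WLOCK) by functions that modify value storage or swap tables, so that
+ * an in-flight add_table_entry() rolls back its value state and restarts
+ * (see the comment below).
+ */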
+rollback_add_entry(void *object, struct op_state *_state)
+{
+ struct ip_fw_chain *ch;
+ struct tableop_state *ts;
+
+ ts = (struct tableop_state *)_state;
+
+ if (ts->tc != object && ts->ch != object)
+ return;
+
+ ch = ts->ch;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ /* Call specified unlockers */
+ rollback_table_values(ts);
+
+ /* Indicate we've called */
+ ts->modified = 1;
+}
+
+/*
+ * Adds/updates one or more entries in table @ti.
+ *
+ * Function may drop/reacquire UH wlock multiple times due to
+ * items alloc, algorithm callbacks (check_space), value linkage
+ * (new values, value storage realloc), etc..
+ * Other processes like other adds (which may involve storage resize),
+ * table swaps (which changes table data and may change algo type),
+ * table modify (which may change value mask) may be executed
+ * simultaneously so we need to deal with it.
+ *
+ * The following approach was implemented:
+ * we keep a per-chain linked list, protected by the UH lock.
+ * add_table_entry prepares a special on-stack structure which is passed
+ * to its descendants. Users add this structure to the list before unlocking.
+ * After performing needed operations and acquiring UH lock back, each user
+ * checks if structure has changed. If true, it rolls local state back and
+ * returns without error to the caller.
+ * add_table_entry() on its own checks if structure has changed and restarts
+ * its operation from the beginning (goto restart).
+ *
+ * Functions which are modifying fields of interest (currently
+ * resize_shared_value_storage() and swap_tables() )
+ * traverse the given list while holding the UH lock immediately before
+ * performing their operations, calling the function provided by each list
+ * entry (currently rollback_add_entry), which performs a rollback of all
+ * necessary state and sets appropriate values in the structure indicating
+ * that a rollback has happened.
+ *
+ * Algo interaction:
+ * Function references @ti first to ensure table won't
+ * disappear or change its type.
+ * After that, prepare_add callback is called for each @tei entry.
+ * Next, we try to add each entry under UH+WLOCK
+ * using add() callback.
+ * Finally, we free all state by calling flush_entry callback
+ * for each @tei.
+ *
+ * Returns 0 on success.
+ */
int
-ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
- uint8_t plen, uint8_t mlen, uint8_t type, uint32_t value)
-{
- struct radix_node_head *rnh, **rnh_ptr;
- struct table_entry *ent;
- struct table_xentry *xent;
- struct radix_node *rn;
- in_addr_t addr;
- int offset;
- void *ent_ptr;
- struct sockaddr *addr_ptr, *mask_ptr;
- char c;
-
- if (tbl >= V_fw_tables_max)
- return (EINVAL);
+add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct tentry_info *tei, uint8_t flags, uint32_t count)
+{
+ struct table_config *tc;
+ struct table_algo *ta;
+ uint16_t kidx;
+ int error, first_error, i, rollback;
+ uint32_t num, numadd;
+ struct tentry_info *ptei;
+ struct tableop_state ts;
+ char ta_buf[TA_BUF_SZ];
+ caddr_t ta_buf_m, v;
+
+ memset(&ts, 0, sizeof(ts));
+ ta = NULL;
+ IPFW_UH_WLOCK(ch);
- switch (type) {
- case IPFW_TABLE_CIDR:
- if (plen == sizeof(in_addr_t)) {
-#ifdef INET
- /* IPv4 case */
- if (mlen > 32)
- return (EINVAL);
- ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO);
- ent->value = value;
- /* Set 'total' structure length */
- KEY_LEN(ent->addr) = KEY_LEN_INET;
- KEY_LEN(ent->mask) = KEY_LEN_INET;
- /* Set offset of IPv4 address in bits */
- offset = OFF_LEN_INET;
- ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
- addr = *((in_addr_t *)paddr);
- ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
- /* Set pointers */
- rnh_ptr = &ch->tables[tbl];
- ent_ptr = ent;
- addr_ptr = (struct sockaddr *)&ent->addr;
- mask_ptr = (struct sockaddr *)&ent->mask;
-#endif
-#ifdef INET6
- } else if (plen == sizeof(struct in6_addr)) {
- /* IPv6 case */
- if (mlen > 128)
- return (EINVAL);
- xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO);
- xent->value = value;
- /* Set 'total' structure length */
- KEY_LEN(xent->a.addr6) = KEY_LEN_INET6;
- KEY_LEN(xent->m.mask6) = KEY_LEN_INET6;
- /* Set offset of IPv6 address in bits */
- offset = OFF_LEN_INET6;
- ipv6_writemask(&xent->m.mask6.sin6_addr, mlen);
- memcpy(&xent->a.addr6.sin6_addr, paddr, sizeof(struct in6_addr));
- APPLY_MASK(&xent->a.addr6.sin6_addr, &xent->m.mask6.sin6_addr);
- /* Set pointers */
- rnh_ptr = &ch->xtables[tbl];
- ent_ptr = xent;
- addr_ptr = (struct sockaddr *)&xent->a.addr6;
- mask_ptr = (struct sockaddr *)&xent->m.mask6;
-#endif
- } else {
- /* Unknown CIDR type */
- return (EINVAL);
+ /*
+ * Find and reference existing table.
+ */
+restart:
+ if (ts.modified != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ flush_batch_buffer(ch, ta, tei, count, rollback,
+ ta_buf_m, ta_buf);
+ memset(&ts, 0, sizeof(ts));
+ ta = NULL;
+ IPFW_UH_WLOCK(ch);
+ }
+
+ error = find_ref_table(ch, ti, tei, count, OP_ADD, &tc);
+ if (error != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ return (error);
+ }
+ ta = tc->ta;
+
+ /* Fill in tablestate */
+ ts.ch = ch;
+ ts.opstate.func = rollback_add_entry;
+ ts.tc = tc;
+ ts.vshared = tc->vshared;
+ ts.vmask = tc->vmask;
+ ts.ta = ta;
+ ts.tei = tei;
+ ts.count = count;
+ rollback = 0;
+ add_toperation_state(ch, &ts);
+ IPFW_UH_WUNLOCK(ch);
+
+ /* Allocate memory and prepare record(s) */
+ /* Pass stack buffer by default */
+ ta_buf_m = ta_buf;
+ error = prepare_batch_buffer(ch, ta, tei, count, OP_ADD, &ta_buf_m);
+
+ IPFW_UH_WLOCK(ch);
+ del_toperation_state(ch, &ts);
+ /* Drop reference we've used in first search */
+ tc->no.refcnt--;
+
+ /* Check prepare_batch_buffer() error */
+ if (error != 0)
+ goto cleanup;
+
+ /*
+	 * Check if a table swap has happened
+	 * (so the table algo might have changed).
+	 * Restart the operation to achieve consistent behavior.
+ */
+ if (ts.modified != 0)
+ goto restart;
+
+ /*
+	 * Link all values to the shared/per-table value array.
+ *
+ * May release/reacquire UH_WLOCK.
+ */
+ error = ipfw_link_table_values(ch, &ts);
+ if (error != 0)
+ goto cleanup;
+ if (ts.modified != 0)
+ goto restart;
+
+ /*
+ * Ensure we are able to add all entries without additional
+ * memory allocations. May release/reacquire UH_WLOCK.
+ */
+ kidx = tc->no.kidx;
+ error = check_table_space(ch, &ts, tc, KIDX_TO_TI(ch, kidx), count);
+ if (error != 0)
+ goto cleanup;
+ if (ts.modified != 0)
+ goto restart;
+
+ /* We've got valid table in @tc. Let's try to add data */
+ kidx = tc->no.kidx;
+ ta = tc->ta;
+ numadd = 0;
+ first_error = 0;
+
+ IPFW_WLOCK(ch);
+
+ v = ta_buf_m;
+ for (i = 0; i < count; i++, v += ta->ta_buf_size) {
+ ptei = &tei[i];
+ num = 0;
+ /* check limit before adding */
+ if ((error = check_table_limit(tc, ptei)) == 0) {
+ error = ta->add(tc->astate, KIDX_TO_TI(ch, kidx),
+ ptei, v, &num);
+ /* Set status flag to inform userland */
+ store_tei_result(ptei, OP_ADD, error, num);
}
+ if (error == 0) {
+ /* Update number of records to ease limit checking */
+ tc->count += num;
+ numadd += num;
+ continue;
+ }
+
+ if (first_error == 0)
+ first_error = error;
+
+ /*
+		 * Some error has happened. Check our atomicity
+		 * settings: continue if atomicity is not required,
+		 * roll back changes otherwise.
+ */
+ if ((flags & IPFW_CTF_ATOMIC) == 0)
+ continue;
+
+ rollback_added_entries(ch, tc, KIDX_TO_TI(ch, kidx),
+ tei, ta_buf_m, count, i);
+
+ rollback = 1;
break;
+ }
+
+ IPFW_WUNLOCK(ch);
+
+ ipfw_garbage_table_values(ch, tc, tei, count, rollback);
+
+ /* Permit post-add algorithm grow/rehash. */
+ if (numadd != 0)
+ check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0);
+
+ /* Return first error to user, if any */
+ error = first_error;
+
+cleanup:
+ IPFW_UH_WUNLOCK(ch);
+
+ flush_batch_buffer(ch, ta, tei, count, rollback, ta_buf_m, ta_buf);
- case IPFW_TABLE_INTERFACE:
- /* Check if string is terminated */
- c = ((char *)paddr)[IF_NAMESIZE - 1];
- ((char *)paddr)[IF_NAMESIZE - 1] = '\0';
- if (((mlen = strlen((char *)paddr)) == IF_NAMESIZE - 1) && (c != '\0'))
- return (EINVAL);
+ return (error);
+}
- /* Include last \0 into comparison */
- mlen++;
-
- xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO);
- xent->value = value;
- /* Set 'total' structure length */
- KEY_LEN(xent->a.iface) = KEY_LEN_IFACE + mlen;
- KEY_LEN(xent->m.ifmask) = KEY_LEN_IFACE + mlen;
- /* Set offset of interface name in bits */
- offset = OFF_LEN_IFACE;
- memcpy(xent->a.iface.ifname, paddr, mlen);
- /* Assume direct match */
- /* TODO: Add interface pattern matching */
-#if 0
- memset(xent->m.ifmask.ifname, 0xFF, IF_NAMESIZE);
- mask_ptr = (struct sockaddr *)&xent->m.ifmask;
-#endif
- /* Set pointers */
- rnh_ptr = &ch->xtables[tbl];
- ent_ptr = xent;
- addr_ptr = (struct sockaddr *)&xent->a.iface;
- mask_ptr = NULL;
- break;
+/*
+ * Deletes one or more entries in table @ti.
+ *
+ * Returns 0 on success.
+ */
+int
+del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct tentry_info *tei, uint8_t flags, uint32_t count)
+{
+ struct table_config *tc;
+ struct table_algo *ta;
+ struct tentry_info *ptei;
+ uint16_t kidx;
+ int error, first_error, i;
+ uint32_t num, numdel;
+ char ta_buf[TA_BUF_SZ];
+ caddr_t ta_buf_m, v;
- default:
- return (EINVAL);
+ /*
+ * Find and reference existing table.
+ */
+ IPFW_UH_WLOCK(ch);
+ error = find_ref_table(ch, ti, tei, count, OP_DEL, &tc);
+ if (error != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ return (error);
+ }
+ ta = tc->ta;
+ IPFW_UH_WUNLOCK(ch);
+
+ /* Allocate memory and prepare record(s) */
+ /* Pass stack buffer by default */
+ ta_buf_m = ta_buf;
+ error = prepare_batch_buffer(ch, ta, tei, count, OP_DEL, &ta_buf_m);
+ if (error != 0)
+ goto cleanup;
+
+ IPFW_UH_WLOCK(ch);
+
+ /* Drop reference we've used in first search */
+ tc->no.refcnt--;
+
+ /*
+	 * Check if the table algo is still the same
+	 * (a changed ta may be the result of a table swap).
+ */
+ if (ta != tc->ta) {
+ IPFW_UH_WUNLOCK(ch);
+ error = EINVAL;
+ goto cleanup;
}
+ kidx = tc->no.kidx;
+ numdel = 0;
+ first_error = 0;
+
IPFW_WLOCK(ch);
+ v = ta_buf_m;
+ for (i = 0; i < count; i++, v += ta->ta_buf_size) {
+ ptei = &tei[i];
+ num = 0;
+ error = ta->del(tc->astate, KIDX_TO_TI(ch, kidx), ptei, v,
+ &num);
+ /* Save state for userland */
+ store_tei_result(ptei, OP_DEL, error, num);
+ if (error != 0 && first_error == 0)
+ first_error = error;
+ tc->count -= num;
+ numdel += num;
+ }
+ IPFW_WUNLOCK(ch);
- /* Check if tabletype is valid */
- if ((ch->tabletype[tbl] != 0) && (ch->tabletype[tbl] != type)) {
- IPFW_WUNLOCK(ch);
- free(ent_ptr, M_IPFW_TBL);
- return (EINVAL);
+	/* Unlink unused values */
+ ipfw_garbage_table_values(ch, tc, tei, count, 0);
+
+ if (numdel != 0) {
+ /* Run post-del hook to permit shrinking */
+ check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0);
}
- /* Check if radix tree exists */
- if ((rnh = *rnh_ptr) == NULL) {
- IPFW_WUNLOCK(ch);
- /* Create radix for a new table */
- if (!rn_inithead((void **)&rnh, offset)) {
- free(ent_ptr, M_IPFW_TBL);
- return (ENOMEM);
+ IPFW_UH_WUNLOCK(ch);
+
+ /* Return first error to user, if any */
+ error = first_error;
+
+cleanup:
+ flush_batch_buffer(ch, ta, tei, count, 0, ta_buf_m, ta_buf);
+
+ return (error);
+}
+
+/*
+ * Ensure that table @tc has enough space to add @count entries without
+ * need for reallocation.
+ *
+ * Callbacks order:
+ * 0) need_modify() (UH_WLOCK) - checks if @count items can be added w/o resize.
+ *
+ * 1) prepare_mod (no locks, M_WAITOK) - alloc new state based on @pflags.
+ * 2) fill_mod (UH_WLOCK) - copy old data into new storage
+ * 3) modify (UH_WLOCK + WLOCK) - switch pointers
+ * 4) flush_mod (UH_WLOCK) - free state, if needed
+ *
+ * Returns 0 on success.
+ */
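+/*
+ * Condensed sketch of the resize cycle implemented below (illustration
+ * only; the callbacks are the struct table_algo members used in this
+ * function):
+ *
+ *	while (ta->need_modify(astate, ti, count, &pflags) != 0) {
+ *		IPFW_UH_WUNLOCK(ch);
+ *		ta->prepare_mod(ta_buf, &pflags);        may sleep
+ *		IPFW_UH_WLOCK(ch);
+ *		ta->fill_mod(astate, ti, ta_buf, &pflags);
+ *		IPFW_WLOCK(ch);
+ *		ta->modify(astate, ti, ta_buf, pflags);  switch pointers
+ *		IPFW_WUNLOCK(ch);
+ *		ta->flush_mod(ta_buf);                   free old state
+ *	}
+ */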
+static int
+check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts,
+ struct table_config *tc, struct table_info *ti, uint32_t count)
+{
+ struct table_algo *ta;
+ uint64_t pflags;
+ char ta_buf[TA_BUF_SZ];
+ int error;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ error = 0;
+ ta = tc->ta;
+ if (ta->need_modify == NULL)
+ return (0);
+
+	/* Acquire a reference so we do not lose @tc between locks/unlocks */
+ tc->no.refcnt++;
+
+ /*
+ * TODO: think about avoiding race between large add/large delete
+ * operation on algorithm which implements shrinking along with
+ * growing.
+ */
+ while (true) {
+ pflags = 0;
+ if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) {
+ error = 0;
+ break;
}
- IPFW_WLOCK(ch);
- if (*rnh_ptr != NULL) {
- /* Tree is already attached by other thread */
- rn_detachhead((void **)&rnh);
- rnh = *rnh_ptr;
- /* Check table type another time */
- if (ch->tabletype[tbl] != type) {
- IPFW_WUNLOCK(ch);
- free(ent_ptr, M_IPFW_TBL);
- return (EINVAL);
- }
- } else {
- *rnh_ptr = rnh;
- /*
- * Set table type. It can be set already
- * (if we have IPv6-only table) but setting
- * it another time does not hurt
+ /* We have to shrink/grow table */
+ if (ts != NULL)
+ add_toperation_state(ch, ts);
+ IPFW_UH_WUNLOCK(ch);
+
+ memset(&ta_buf, 0, sizeof(ta_buf));
+ error = ta->prepare_mod(ta_buf, &pflags);
+
+ IPFW_UH_WLOCK(ch);
+ if (ts != NULL)
+ del_toperation_state(ch, ts);
+
+ if (error != 0)
+ break;
+
+ if (ts != NULL && ts->modified != 0) {
+
+ /*
+ * Swap operation has happened
+ * so we're currently operating on other
+ * table data. Stop doing this.
+ */
+ ta->flush_mod(ta_buf);
+ break;
+ }
+
+ /* Check if we still need to alter table */
+ ti = KIDX_TO_TI(ch, tc->no.kidx);
+ if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) {
+
+ /*
+ * Other thread has already performed resize.
+ * Flush our state and return.
*/
- ch->tabletype[tbl] = type;
+ ta->flush_mod(ta_buf);
+ break;
+ }
+
+ error = ta->fill_mod(tc->astate, ti, ta_buf, &pflags);
+ if (error == 0) {
+ /* Do actual modification */
+ IPFW_WLOCK(ch);
+ ta->modify(tc->astate, ti, ta_buf, pflags);
+ IPFW_WUNLOCK(ch);
}
+
+ /* Anyway, flush data and retry */
+ ta->flush_mod(ta_buf);
}
- rn = rnh->rnh_addaddr(addr_ptr, mask_ptr, rnh, ent_ptr);
- IPFW_WUNLOCK(ch);
+ tc->no.refcnt--;
+ return (error);
+}
- if (rn == NULL) {
- free(ent_ptr, M_IPFW_TBL);
- return (EEXIST);
+/*
+ * Adds or deletes record in table.
+ * Data layout (v0):
+ * Request: [ ip_fw3_opheader ipfw_table_xentry ]
+ *
+ * Returns 0 on success
+ */
+static int
+manage_table_ent_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_table_xentry *xent;
+ struct tentry_info tei;
+ struct tid_info ti;
+ struct table_value v;
+ int error, hdrlen, read;
+
+ hdrlen = offsetof(ipfw_table_xentry, k);
+
+ /* Check minimum header size */
+ if (sd->valsize < (sizeof(*op3) + hdrlen))
+ return (EINVAL);
+
+ read = sizeof(ip_fw3_opheader);
+
+ /* Check if xentry len field is valid */
+ xent = (ipfw_table_xentry *)(op3 + 1);
+ if (xent->len < hdrlen || xent->len + read > sd->valsize)
+ return (EINVAL);
+
+ memset(&tei, 0, sizeof(tei));
+ tei.paddr = &xent->k;
+ tei.masklen = xent->masklen;
+ ipfw_import_table_value_legacy(xent->value, &v);
+ tei.pvalue = &v;
+ /* Old requests compatibility */
+ tei.flags = TEI_FLAGS_COMPAT;
+ if (xent->type == IPFW_TABLE_ADDR) {
+ if (xent->len - hdrlen == sizeof(in_addr_t))
+ tei.subtype = AF_INET;
+ else
+ tei.subtype = AF_INET6;
}
- return (0);
+
+ memset(&ti, 0, sizeof(ti));
+ ti.uidx = xent->tbl;
+ ti.type = xent->type;
+
+ error = (op3->opcode == IP_FW_TABLE_XADD) ?
+ add_table_entry(ch, &ti, &tei, 0, 1) :
+ del_table_entry(ch, &ti, &tei, 0, 1);
+
+ return (error);
}
-int
-ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
- uint8_t plen, uint8_t mlen, uint8_t type)
+/*
+ * Adds or deletes record in table.
+ * Data layout (v1)(current):
+ * Request: [ ipfw_obj_header
+ * ipfw_obj_ctlv(IPFW_TLV_TBLENT_LIST) [ ipfw_obj_tentry x N ]
+ * ]
+ *
+ * Returns 0 on success
+ */
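+/*
+ * For reference, the total buffer size validated below for N entries is
+ * expected to be exactly:
+ *
+ *	sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) +
+ *	    N * sizeof(ipfw_obj_tentry)
+ *
+ * where N == ctlv->count.
+ */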
+static int
+manage_table_ent_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
{
- struct radix_node_head *rnh, **rnh_ptr;
- struct table_entry *ent;
- in_addr_t addr;
- struct sockaddr_in sa, mask;
- struct sockaddr *sa_ptr, *mask_ptr;
- char c;
+ ipfw_obj_tentry *tent, *ptent;
+ ipfw_obj_ctlv *ctlv;
+ ipfw_obj_header *oh;
+ struct tentry_info *ptei, tei, *tei_buf;
+ struct tid_info ti;
+ int error, i, kidx, read;
+
+ /* Check minimum header size */
+ if (sd->valsize < (sizeof(*oh) + sizeof(*ctlv)))
+ return (EINVAL);
- if (tbl >= V_fw_tables_max)
+ /* Check if passed data is too long */
+ if (sd->valsize != sd->kavail)
return (EINVAL);
- switch (type) {
- case IPFW_TABLE_CIDR:
- if (plen == sizeof(in_addr_t)) {
- /* Set 'total' structure length */
- KEY_LEN(sa) = KEY_LEN_INET;
- KEY_LEN(mask) = KEY_LEN_INET;
- mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
- addr = *((in_addr_t *)paddr);
- sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
- rnh_ptr = &ch->tables[tbl];
- sa_ptr = (struct sockaddr *)&sa;
- mask_ptr = (struct sockaddr *)&mask;
-#ifdef INET6
- } else if (plen == sizeof(struct in6_addr)) {
- /* IPv6 case */
- if (mlen > 128)
- return (EINVAL);
- struct sockaddr_in6 sa6, mask6;
- memset(&sa6, 0, sizeof(struct sockaddr_in6));
- memset(&mask6, 0, sizeof(struct sockaddr_in6));
- /* Set 'total' structure length */
- KEY_LEN(sa6) = KEY_LEN_INET6;
- KEY_LEN(mask6) = KEY_LEN_INET6;
- ipv6_writemask(&mask6.sin6_addr, mlen);
- memcpy(&sa6.sin6_addr, paddr, sizeof(struct in6_addr));
- APPLY_MASK(&sa6.sin6_addr, &mask6.sin6_addr);
- rnh_ptr = &ch->xtables[tbl];
- sa_ptr = (struct sockaddr *)&sa6;
- mask_ptr = (struct sockaddr *)&mask6;
-#endif
- } else {
- /* Unknown CIDR type */
- return (EINVAL);
- }
- break;
+ oh = (ipfw_obj_header *)sd->kbuf;
- case IPFW_TABLE_INTERFACE:
- /* Check if string is terminated */
- c = ((char *)paddr)[IF_NAMESIZE - 1];
- ((char *)paddr)[IF_NAMESIZE - 1] = '\0';
- if (((mlen = strlen((char *)paddr)) == IF_NAMESIZE - 1) && (c != '\0'))
- return (EINVAL);
+ /* Basic length checks for TLVs */
+ if (oh->ntlv.head.length != sizeof(oh->ntlv))
+ return (EINVAL);
- struct xaddr_iface ifname, ifmask;
- memset(&ifname, 0, sizeof(ifname));
-
- /* Include last \0 into comparison */
- mlen++;
-
- /* Set 'total' structure length */
- KEY_LEN(ifname) = KEY_LEN_IFACE + mlen;
- KEY_LEN(ifmask) = KEY_LEN_IFACE + mlen;
- /* Assume direct match */
- /* FIXME: Add interface pattern matching */
-#if 0
- memset(ifmask.ifname, 0xFF, IF_NAMESIZE);
- mask_ptr = (struct sockaddr *)&ifmask;
-#endif
- mask_ptr = NULL;
- memcpy(ifname.ifname, paddr, mlen);
- /* Set pointers */
- rnh_ptr = &ch->xtables[tbl];
- sa_ptr = (struct sockaddr *)&ifname;
+ read = sizeof(*oh);
- break;
+ ctlv = (ipfw_obj_ctlv *)(oh + 1);
+ if (ctlv->head.length + read != sd->valsize)
+ return (EINVAL);
- default:
+ read += sizeof(*ctlv);
+ tent = (ipfw_obj_tentry *)(ctlv + 1);
+ if (ctlv->count * sizeof(*tent) + read != sd->valsize)
return (EINVAL);
+
+ if (ctlv->count == 0)
+ return (0);
+
+ /*
+ * Mark entire buffer as "read".
+	 * This instructs the sopt API to write it back
+	 * after the function returns.
+ */
+ ipfw_get_sopt_header(sd, sd->valsize);
+
+ /* Perform basic checks for each entry */
+ ptent = tent;
+ kidx = tent->idx;
+ for (i = 0; i < ctlv->count; i++, ptent++) {
+ if (ptent->head.length != sizeof(*ptent))
+ return (EINVAL);
+ if (ptent->idx != kidx)
+ return (ENOTSUP);
}
- IPFW_WLOCK(ch);
- if ((rnh = *rnh_ptr) == NULL) {
- IPFW_WUNLOCK(ch);
+ /* Convert data into kernel request objects */
+ objheader_to_ti(oh, &ti);
+ ti.type = oh->ntlv.type;
+ ti.uidx = kidx;
+
+ /* Use on-stack buffer for single add/del */
+ if (ctlv->count == 1) {
+ memset(&tei, 0, sizeof(tei));
+ tei_buf = &tei;
+ } else
+ tei_buf = malloc(ctlv->count * sizeof(tei), M_TEMP,
+ M_WAITOK | M_ZERO);
+
+ ptei = tei_buf;
+ ptent = tent;
+ for (i = 0; i < ctlv->count; i++, ptent++, ptei++) {
+ ptei->paddr = &ptent->k;
+ ptei->subtype = ptent->subtype;
+ ptei->masklen = ptent->masklen;
+ if (ptent->head.flags & IPFW_TF_UPDATE)
+ ptei->flags |= TEI_FLAGS_UPDATE;
+
+ ipfw_import_table_value_v1(&ptent->v.value);
+ ptei->pvalue = (struct table_value *)&ptent->v.value;
+ }
+
+ error = (oh->opheader.opcode == IP_FW_TABLE_XADD) ?
+ add_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count) :
+ del_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count);
+
+ /* Translate result back to userland */
+ ptei = tei_buf;
+ ptent = tent;
+ for (i = 0; i < ctlv->count; i++, ptent++, ptei++) {
+ if (ptei->flags & TEI_FLAGS_ADDED)
+ ptent->result = IPFW_TR_ADDED;
+ else if (ptei->flags & TEI_FLAGS_DELETED)
+ ptent->result = IPFW_TR_DELETED;
+ else if (ptei->flags & TEI_FLAGS_UPDATED)
+ ptent->result = IPFW_TR_UPDATED;
+ else if (ptei->flags & TEI_FLAGS_LIMIT)
+ ptent->result = IPFW_TR_LIMIT;
+ else if (ptei->flags & TEI_FLAGS_ERROR)
+ ptent->result = IPFW_TR_ERROR;
+ else if (ptei->flags & TEI_FLAGS_NOTFOUND)
+ ptent->result = IPFW_TR_NOTFOUND;
+ else if (ptei->flags & TEI_FLAGS_EXISTS)
+ ptent->result = IPFW_TR_EXISTS;
+ ipfw_export_table_value_v1(ptei->pvalue, &ptent->v.value);
+ }
+
+ if (tei_buf != &tei)
+ free(tei_buf, M_TEMP);
+
+ return (error);
+}
+
+/*
+ * Looks up an entry in given table.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_obj_tentry ]
+ * Reply: [ ipfw_obj_header ipfw_obj_tentry ]
+ *
+ * Returns 0 on success
+ */
+static int
+find_table_entry(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_tentry *tent;
+ ipfw_obj_header *oh;
+ struct tid_info ti;
+ struct table_config *tc;
+ struct table_algo *ta;
+ struct table_info *kti;
+ struct namedobj_instance *ni;
+ int error;
+ size_t sz;
+
+ /* Check minimum header size */
+ sz = sizeof(*oh) + sizeof(*tent);
+ if (sd->valsize != sz)
+ return (EINVAL);
+
+ oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
+ tent = (ipfw_obj_tentry *)(oh + 1);
+
+ /* Basic length checks for TLVs */
+ if (oh->ntlv.head.length != sizeof(oh->ntlv))
+ return (EINVAL);
+
+ objheader_to_ti(oh, &ti);
+ ti.type = oh->ntlv.type;
+ ti.uidx = tent->idx;
+
+ IPFW_UH_RLOCK(ch);
+ ni = CHAIN_TO_NI(ch);
+
+ /*
+	 * Find existing table and check its type.
+ */
+ ta = NULL;
+ if ((tc = find_table(ni, &ti)) == NULL) {
+ IPFW_UH_RUNLOCK(ch);
return (ESRCH);
}
- if (ch->tabletype[tbl] != type) {
- IPFW_WUNLOCK(ch);
+ /* check table type */
+ if (tc->no.subtype != ti.type) {
+ IPFW_UH_RUNLOCK(ch);
return (EINVAL);
}
- ent = (struct table_entry *)rnh->rnh_deladdr(sa_ptr, mask_ptr, rnh);
- IPFW_WUNLOCK(ch);
+ kti = KIDX_TO_TI(ch, tc->no.kidx);
+ ta = tc->ta;
- if (ent == NULL)
- return (ESRCH);
+	if (ta->find_tentry == NULL) {
+		IPFW_UH_RUNLOCK(ch);
+		return (ENOTSUP);
+	}
- free(ent, M_IPFW_TBL);
- return (0);
+ error = ta->find_tentry(tc->astate, kti, tent);
+
+ IPFW_UH_RUNLOCK(ch);
+
+ return (error);
}
+/*
+ * Flushes all entries or destroys given table.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ]
+ *
+ * Returns 0 on success
+ */
static int
-flush_table_entry(struct radix_node *rn, void *arg)
+flush_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
{
- struct radix_node_head * const rnh = arg;
- struct table_entry *ent;
+ int error;
+ struct _ipfw_obj_header *oh;
+ struct tid_info ti;
- ent = (struct table_entry *)
- rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
- if (ent != NULL)
- free(ent, M_IPFW_TBL);
- return (0);
+ if (sd->valsize != sizeof(*oh))
+ return (EINVAL);
+
+ oh = (struct _ipfw_obj_header *)op3;
+ objheader_to_ti(oh, &ti);
+
+ if (op3->opcode == IP_FW_TABLE_XDESTROY)
+ error = destroy_table(ch, &ti);
+ else if (op3->opcode == IP_FW_TABLE_XFLUSH)
+ error = flush_table(ch, &ti);
+ else
+ return (ENOTSUP);
+
+ return (error);
}
+static void
+restart_flush(void *object, struct op_state *_state)
+{
+ struct tableop_state *ts;
+
+ ts = (struct tableop_state *)_state;
+
+ if (ts->tc != object)
+ return;
+
+ /* Indicate we've called */
+ ts->modified = 1;
+}
+
+/*
+ * Flushes given table.
+ *
+ * The function creates a new table instance with the same
+ * parameters, swaps it with the old one and
+ * flushes the old state without holding the runtime WLOCK.
+ *
+ * Returns 0 on success.
+ */
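+/*
+ * Simplified outline of the stages implemented below (illustration only):
+ *
+ *	ta->init(ch, &astate_new, &ti_new, pstate, tflags);   new instance
+ *	IPFW_WLOCK(ch);
+ *	tablestate[kidx] = ti_new;                            swap runtime state
+ *	IPFW_WUNLOCK(ch);
+ *	ipfw_unref_table_values(ch, tc, ta, astate_old, &ti_old);
+ *	ta->destroy(astate_old, &ti_old);                     no WLOCK held here
+ */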
int
-ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl)
+flush_table(struct ip_fw_chain *ch, struct tid_info *ti)
{
- struct radix_node_head *rnh, *xrnh;
+ struct namedobj_instance *ni;
+ struct table_config *tc;
+ struct table_algo *ta;
+ struct table_info ti_old, ti_new, *tablestate;
+ void *astate_old, *astate_new;
+ char algostate[64], *pstate;
+ struct tableop_state ts;
+ int error, need_gc;
+ uint16_t kidx;
+ uint8_t tflags;
- if (tbl >= V_fw_tables_max)
- return (EINVAL);
+ /*
+ * Stage 1: save table algorithm.
+ * Reference found table to ensure it won't disappear.
+ */
+ IPFW_UH_WLOCK(ch);
+ ni = CHAIN_TO_NI(ch);
+ if ((tc = find_table(ni, ti)) == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (ESRCH);
+ }
+ need_gc = 0;
+ astate_new = NULL;
+ memset(&ti_new, 0, sizeof(ti_new));
+restart:
+ /* Set up swap handler */
+ memset(&ts, 0, sizeof(ts));
+ ts.opstate.func = restart_flush;
+ ts.tc = tc;
+
+ ta = tc->ta;
+ /* Do not flush readonly tables */
+ if ((ta->flags & TA_FLAG_READONLY) != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EACCES);
+ }
+ /* Save startup algo parameters */
+ if (ta->print_config != NULL) {
+ ta->print_config(tc->astate, KIDX_TO_TI(ch, tc->no.kidx),
+ algostate, sizeof(algostate));
+ pstate = algostate;
+ } else
+ pstate = NULL;
+ tflags = tc->tflags;
+ tc->no.refcnt++;
+ add_toperation_state(ch, &ts);
+ IPFW_UH_WUNLOCK(ch);
+
+ /*
+ * Stage 1.5: if this is not the first attempt, destroy previous state
+ */
+ if (need_gc != 0) {
+ ta->destroy(astate_new, &ti_new);
+ need_gc = 0;
+ }
/*
- * We free both (IPv4 and extended) radix trees and
- * clear table type here to permit table to be reused
- * for different type without module reload
+ * Stage 2: allocate new table instance using same algo.
*/
+ memset(&ti_new, 0, sizeof(struct table_info));
+ error = ta->init(ch, &astate_new, &ti_new, pstate, tflags);
+
+ /*
+ * Stage 3: swap old state pointers with newly-allocated ones.
+ * Decrease refcount.
+ */
+ IPFW_UH_WLOCK(ch);
+ tc->no.refcnt--;
+ del_toperation_state(ch, &ts);
+
+ if (error != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ return (error);
+ }
+
+ /*
+	 * Restart the operation if a table swap has happened:
+	 * even if the algo is the same, its init parameters
+	 * may have changed. Restart instead of doing
+	 * complex checks.
+ */
+ if (ts.modified != 0) {
+ /* Delay destroying data since we're holding UH lock */
+ need_gc = 1;
+ goto restart;
+ }
+
+ ni = CHAIN_TO_NI(ch);
+ kidx = tc->no.kidx;
+ tablestate = (struct table_info *)ch->tablestate;
IPFW_WLOCK(ch);
- /* Set IPv4 table pointer to zero */
- if ((rnh = ch->tables[tbl]) != NULL)
- ch->tables[tbl] = NULL;
- /* Set extended table pointer to zero */
- if ((xrnh = ch->xtables[tbl]) != NULL)
- ch->xtables[tbl] = NULL;
- /* Zero table type */
- ch->tabletype[tbl] = 0;
+ ti_old = tablestate[kidx];
+ tablestate[kidx] = ti_new;
IPFW_WUNLOCK(ch);
- if (rnh != NULL) {
- rnh->rnh_walktree(rnh, flush_table_entry, rnh);
- rn_detachhead((void **)&rnh);
+ astate_old = tc->astate;
+ tc->astate = astate_new;
+ tc->ti_copy = ti_new;
+ tc->count = 0;
+
+ /* Notify algo on real @ti address */
+ if (ta->change_ti != NULL)
+ ta->change_ti(tc->astate, &tablestate[kidx]);
+
+ /*
+ * Stage 4: unref values.
+ */
+ ipfw_unref_table_values(ch, tc, ta, astate_old, &ti_old);
+ IPFW_UH_WUNLOCK(ch);
+
+ /*
+ * Stage 5: perform real flush/destroy.
+ */
+ ta->destroy(astate_old, &ti_old);
+
+ return (0);
+}
+
+/*
+ * Swaps two tables.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_obj_ntlv ]
+ *
+ * Returns 0 on success
+ */
+static int
+swap_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ int error;
+ struct _ipfw_obj_header *oh;
+ struct tid_info ti_a, ti_b;
+
+ if (sd->valsize != sizeof(*oh) + sizeof(ipfw_obj_ntlv))
+ return (EINVAL);
+
+ oh = (struct _ipfw_obj_header *)op3;
+ ntlv_to_ti(&oh->ntlv, &ti_a);
+ ntlv_to_ti((ipfw_obj_ntlv *)(oh + 1), &ti_b);
+
+ error = swap_tables(ch, &ti_a, &ti_b);
+
+ return (error);
+}
+
+/*
+ * Swaps two tables of the same type/valtype.
+ *
+ * Checks that the tables are compatible and that their limits
+ * permit the swap, then actually performs the swap.
+ *
+ * Each table consists of 2 different parts:
+ * config:
+ * @tc (with name, set, kidx) and rule bindings, which is "stable".
+ * number of items
+ * table algo
+ * runtime:
+ * runtime data @ti (ch->tablestate)
+ * runtime cache in @tc
+ * algo-specific data (@tc->astate)
+ *
+ * So we switch:
+ * all runtime data
+ * number of items
+ * table algo
+ *
+ * After that we call @ti change handler for each table.
+ *
+ * Note that referencing @tc won't protect tc->ta from change.
+ * XXX: Do we need to restrict swap between locked tables?
+ * XXX: Do we need to exchange ftype?
+ *
+ * Returns 0 on success.
+ */
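+/*
+ * Illustration of the state exchanged under IPFW_WLOCK below (not part of
+ * this change): for tables a and b the following fields are swapped:
+ *
+ *	tablestate[a.kidx] <-> tablestate[b.kidx]     runtime data
+ *	tc_a->ta           <-> tc_b->ta               table algo
+ *	tc_a->astate       <-> tc_b->astate           algo-specific state
+ *	tc_a->count        <-> tc_b->count            number of items
+ */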
+static int
+swap_tables(struct ip_fw_chain *ch, struct tid_info *a,
+ struct tid_info *b)
+{
+ struct namedobj_instance *ni;
+ struct table_config *tc_a, *tc_b;
+ struct table_algo *ta;
+ struct table_info ti, *tablestate;
+ void *astate;
+ uint32_t count;
+
+ /*
+ * Stage 1: find both tables and ensure they are of
+ * the same type.
+ */
+ IPFW_UH_WLOCK(ch);
+ ni = CHAIN_TO_NI(ch);
+ if ((tc_a = find_table(ni, a)) == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (ESRCH);
+ }
+ if ((tc_b = find_table(ni, b)) == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (ESRCH);
+ }
+
+	/* Swapping a table with itself is trivially a no-op */
+ if (tc_a == tc_b) {
+ IPFW_UH_WUNLOCK(ch);
+ return (0);
+ }
+
+ /* Check type and value are the same */
+ if (tc_a->no.subtype!=tc_b->no.subtype || tc_a->tflags!=tc_b->tflags) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EINVAL);
}
- if (xrnh != NULL) {
- xrnh->rnh_walktree(xrnh, flush_table_entry, xrnh);
- rn_detachhead((void **)&xrnh);
+ /* Check limits before swap */
+ if ((tc_a->limit != 0 && tc_b->count > tc_a->limit) ||
+ (tc_b->limit != 0 && tc_a->count > tc_b->limit)) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EFBIG);
}
+ /* Check if one of the tables is readonly */
+ if (((tc_a->ta->flags | tc_b->ta->flags) & TA_FLAG_READONLY) != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EACCES);
+ }
+
+ /* Notify we're going to swap */
+ rollback_toperation_state(ch, tc_a);
+ rollback_toperation_state(ch, tc_b);
+
+ /* Everything is fine, prepare to swap */
+ tablestate = (struct table_info *)ch->tablestate;
+ ti = tablestate[tc_a->no.kidx];
+ ta = tc_a->ta;
+ astate = tc_a->astate;
+ count = tc_a->count;
+
+ IPFW_WLOCK(ch);
+ /* a <- b */
+ tablestate[tc_a->no.kidx] = tablestate[tc_b->no.kidx];
+ tc_a->ta = tc_b->ta;
+ tc_a->astate = tc_b->astate;
+ tc_a->count = tc_b->count;
+ /* b <- a */
+ tablestate[tc_b->no.kidx] = ti;
+ tc_b->ta = ta;
+ tc_b->astate = astate;
+ tc_b->count = count;
+ IPFW_WUNLOCK(ch);
+
+ /* Ensure tc.ti copies are in sync */
+ tc_a->ti_copy = tablestate[tc_a->no.kidx];
+ tc_b->ti_copy = tablestate[tc_b->no.kidx];
+
+ /* Notify both tables on @ti change */
+ if (tc_a->ta->change_ti != NULL)
+ tc_a->ta->change_ti(tc_a->astate, &tablestate[tc_a->no.kidx]);
+ if (tc_b->ta->change_ti != NULL)
+ tc_b->ta->change_ti(tc_b->astate, &tablestate[tc_b->no.kidx]);
+
+ IPFW_UH_WUNLOCK(ch);
+
return (0);
}
-void
-ipfw_destroy_tables(struct ip_fw_chain *ch)
+/*
+ * Destroys table specified by @ti.
+ * Data layout (v0)(current):
+ * Request: [ ip_fw3_opheader ]
+ *
+ * Returns 0 on success
+ */
+static int
+destroy_table(struct ip_fw_chain *ch, struct tid_info *ti)
{
- uint16_t tbl;
+ struct namedobj_instance *ni;
+ struct table_config *tc;
- /* Flush all tables */
- for (tbl = 0; tbl < V_fw_tables_max; tbl++)
- ipfw_flush_table(ch, tbl);
+ IPFW_UH_WLOCK(ch);
- /* Free pointers itself */
- free(ch->tables, M_IPFW);
- free(ch->xtables, M_IPFW);
- free(ch->tabletype, M_IPFW);
+ ni = CHAIN_TO_NI(ch);
+ if ((tc = find_table(ni, ti)) == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (ESRCH);
+ }
+
+ /* Do not permit destroying referenced tables */
+ if (tc->no.refcnt > 0) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EBUSY);
+ }
+
+ IPFW_WLOCK(ch);
+ unlink_table(ch, tc);
+ IPFW_WUNLOCK(ch);
+
+ /* Free obj index */
+ if (ipfw_objhash_free_idx(ni, tc->no.kidx) != 0)
+ printf("Error unlinking kidx %d from table %s\n",
+ tc->no.kidx, tc->tablename);
+
+ /* Unref values used in tables while holding UH lock */
+ ipfw_unref_table_values(ch, tc, tc->ta, tc->astate, &tc->ti_copy);
+ IPFW_UH_WUNLOCK(ch);
+
+ free_table_config(ni, tc);
+
+ return (0);
}
-int
-ipfw_init_tables(struct ip_fw_chain *ch)
+static uint32_t
+roundup2p(uint32_t v)
{
- /* Allocate pointers */
- ch->tables = malloc(V_fw_tables_max * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO);
- ch->xtables = malloc(V_fw_tables_max * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO);
- ch->tabletype = malloc(V_fw_tables_max * sizeof(uint8_t), M_IPFW, M_WAITOK | M_ZERO);
- return (0);
+
+ v--;
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ v++;
+
+ return (v);
}
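+
+/*
+ * Worked example (illustrative): roundup2p(100) - 100 is 0x64; after the
+ * decrement and the shifted ORs every bit below the highest set bit is
+ * set, giving 0x7f (127), and the final increment yields 128.  Exact
+ * powers of two are returned unchanged thanks to the initial decrement.
+ */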
+/*
+ * Grow tables index.
+ *
+ * Returns 0 on success.
+ */
int
ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables)
{
- struct radix_node_head **tables, **xtables, *rnh;
- struct radix_node_head **tables_old, **xtables_old;
- uint8_t *tabletype, *tabletype_old;
unsigned int ntables_old, tbl;
+ struct namedobj_instance *ni;
+ void *new_idx, *old_tablestate, *tablestate;
+ struct table_info *ti;
+ struct table_config *tc;
+ int i, new_blocks;
/* Check new value for validity */
+ if (ntables == 0)
+ return (EINVAL);
if (ntables > IPFW_TABLES_MAX)
ntables = IPFW_TABLES_MAX;
+	/* Round up to the nearest power of 2 */
+ ntables = (unsigned int)roundup2p(ntables);
/* Allocate new pointers */
- tables = malloc(ntables * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO);
- xtables = malloc(ntables * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO);
- tabletype = malloc(ntables * sizeof(uint8_t), M_IPFW, M_WAITOK | M_ZERO);
+ tablestate = malloc(ntables * sizeof(struct table_info),
+ M_IPFW, M_WAITOK | M_ZERO);
- IPFW_WLOCK(ch);
+ ipfw_objhash_bitmap_alloc(ntables, (void *)&new_idx, &new_blocks);
+
+ IPFW_UH_WLOCK(ch);
tbl = (ntables >= V_fw_tables_max) ? V_fw_tables_max : ntables;
+ ni = CHAIN_TO_NI(ch);
- /* Copy old table pointers */
- memcpy(tables, ch->tables, sizeof(void *) * tbl);
- memcpy(xtables, ch->xtables, sizeof(void *) * tbl);
- memcpy(tabletype, ch->tabletype, sizeof(uint8_t) * tbl);
+	/* Temporarily restrict decreasing max_tables */
+ if (ntables < V_fw_tables_max) {
- /* Change pointers and number of tables */
- tables_old = ch->tables;
- xtables_old = ch->xtables;
- tabletype_old = ch->tabletype;
- ch->tables = tables;
- ch->xtables = xtables;
- ch->tabletype = tabletype;
+ /*
+ * FIXME: Check if we really can shrink
+ */
+ IPFW_UH_WUNLOCK(ch);
+ return (EINVAL);
+ }
+
+ /* Copy table info/indices */
+ memcpy(tablestate, ch->tablestate, sizeof(struct table_info) * tbl);
+ ipfw_objhash_bitmap_merge(ni, &new_idx, &new_blocks);
+
+ IPFW_WLOCK(ch);
+
+ /* Change pointers */
+ old_tablestate = ch->tablestate;
+ ch->tablestate = tablestate;
+ ipfw_objhash_bitmap_swap(ni, &new_idx, &new_blocks);
ntables_old = V_fw_tables_max;
V_fw_tables_max = ntables;
IPFW_WUNLOCK(ch);
- /* Check if we need to destroy radix trees */
- if (ntables < ntables_old) {
- for (tbl = ntables; tbl < ntables_old; tbl++) {
- if ((rnh = tables_old[tbl]) != NULL) {
- rnh->rnh_walktree(rnh, flush_table_entry, rnh);
- rn_detachhead((void **)&rnh);
- }
+ /* Notify all consumers that their @ti pointer has changed */
+ ti = (struct table_info *)ch->tablestate;
+ for (i = 0; i < tbl; i++, ti++) {
+ if (ti->lookup == NULL)
+ continue;
+ tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, i);
+ if (tc == NULL || tc->ta->change_ti == NULL)
+ continue;
- if ((rnh = xtables_old[tbl]) != NULL) {
- rnh->rnh_walktree(rnh, flush_table_entry, rnh);
- rn_detachhead((void **)&rnh);
- }
- }
+ tc->ta->change_ti(tc->astate, ti);
}
+ IPFW_UH_WUNLOCK(ch);
+
/* Free old pointers */
- free(tables_old, M_IPFW);
- free(xtables_old, M_IPFW);
- free(tabletype_old, M_IPFW);
+ free(old_tablestate, M_IPFW);
+ ipfw_objhash_bitmap_free(new_idx, new_blocks);
+
+ return (0);
+}
+
+/*
+ * Lookup table's named object by its @kidx.
+ */
+struct named_object *
+ipfw_objhash_lookup_table_kidx(struct ip_fw_chain *ch, uint16_t kidx)
+{
+
+ return (ipfw_objhash_lookup_kidx(CHAIN_TO_NI(ch), kidx));
+}
+
+/*
+ * Take reference to table specified in @ntlv.
+ * On success return its @kidx.
+ */
+int
+ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint16_t *kidx)
+{
+ struct tid_info ti;
+ struct table_config *tc;
+ int error;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ ntlv_to_ti(ntlv, &ti);
+ error = find_table_err(CHAIN_TO_NI(ch), &ti, &tc);
+ if (error != 0)
+ return (error);
+
+ if (tc == NULL)
+ return (ESRCH);
+
+ tc_ref(tc);
+ *kidx = tc->no.kidx;
return (0);
}
+void
+ipfw_unref_table(struct ip_fw_chain *ch, uint16_t kidx)
+{
+
+ struct namedobj_instance *ni;
+ struct named_object *no;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+ ni = CHAIN_TO_NI(ch);
+ no = ipfw_objhash_lookup_kidx(ni, kidx);
+ KASSERT(no != NULL, ("Table with index %d not found", kidx));
+ no->refcnt--;
+}
+
+/*
+ * Lookup an IP @addr in table @tbl.
+ * Stores found value in @val.
+ *
+ * Returns 1 if @addr was found.
+ */
int
ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
uint32_t *val)
{
- struct radix_node_head *rnh;
- struct table_entry *ent;
- struct sockaddr_in sa;
+ struct table_info *ti;
- if (tbl >= V_fw_tables_max)
- return (0);
- if ((rnh = ch->tables[tbl]) == NULL)
- return (0);
- KEY_LEN(sa) = KEY_LEN_INET;
- sa.sin_addr.s_addr = addr;
- ent = (struct table_entry *)(rnh->rnh_matchaddr(&sa, rnh));
- if (ent != NULL) {
- *val = ent->value;
- return (1);
+ ti = KIDX_TO_TI(ch, tbl);
+
+ return (ti->lookup(ti, &addr, sizeof(in_addr_t), val));
+}
+
+/*
+ * Lookup an arbitrary key @paddr of length @plen in table @tbl.
+ * Stores found value in @val.
+ *
+ * Returns 1 if key was found.
+ */
+int
+ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen,
+ void *paddr, uint32_t *val)
+{
+ struct table_info *ti;
+
+ ti = KIDX_TO_TI(ch, tbl);
+
+ return (ti->lookup(ti, paddr, plen, val));
+}
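+
+/*
+ * Minimal usage sketch for the two lookup helpers above (hypothetical
+ * caller, for illustration only):
+ *
+ *	uint32_t val;
+ *
+ *	if (ipfw_lookup_table(ch, tbl, addr, &val) != 0)
+ *		"addr" is in the table and its value is now in "val"
+ */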
+
+/*
+ * Info/List/dump support for tables.
+ *
+ */
+
+/*
+ * High-level 'get' cmds sysctl handlers
+ */
+
+/*
+ * Lists all tables currently available in kernel.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
+ * Reply: [ ipfw_obj_lheader ipfw_xtable_info x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+list_tables(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ struct _ipfw_obj_lheader *olh;
+ int error;
+
+ olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
+ if (olh == NULL)
+ return (EINVAL);
+ if (sd->valsize < olh->size)
+ return (EINVAL);
+
+ IPFW_UH_RLOCK(ch);
+ error = export_tables(ch, olh, sd);
+ IPFW_UH_RUNLOCK(ch);
+
+ return (error);
+}
+
+/*
+ * Store table info to buffer provided by @sd.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_xtable_info(empty)]
+ * Reply: [ ipfw_obj_header ipfw_xtable_info ]
+ *
+ * Returns 0 on success.
+ */
+static int
+describe_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ struct _ipfw_obj_header *oh;
+ struct table_config *tc;
+ struct tid_info ti;
+ size_t sz;
+
+ sz = sizeof(*oh) + sizeof(ipfw_xtable_info);
+ oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
+ if (oh == NULL)
+ return (EINVAL);
+
+ objheader_to_ti(oh, &ti);
+
+ IPFW_UH_RLOCK(ch);
+ if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
+ IPFW_UH_RUNLOCK(ch);
+ return (ESRCH);
}
+
+ export_table_info(ch, tc, (ipfw_xtable_info *)(oh + 1));
+ IPFW_UH_RUNLOCK(ch);
+
return (0);
}
-int
-ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
- uint32_t *val, int type)
+/*
+ * Modifies existing table.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_xtable_info ]
+ *
+ * Returns 0 on success
+ */
+static int
+modify_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
{
- struct radix_node_head *rnh;
- struct table_xentry *xent;
- struct sockaddr_in6 sa6;
- struct xaddr_iface iface;
+ struct _ipfw_obj_header *oh;
+ ipfw_xtable_info *i;
+ char *tname;
+ struct tid_info ti;
+ struct namedobj_instance *ni;
+ struct table_config *tc;
+
+ if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info))
+ return (EINVAL);
- if (tbl >= V_fw_tables_max)
- return (0);
- if ((rnh = ch->xtables[tbl]) == NULL)
- return (0);
+ oh = (struct _ipfw_obj_header *)sd->kbuf;
+ i = (ipfw_xtable_info *)(oh + 1);
- switch (type) {
- case IPFW_TABLE_CIDR:
- KEY_LEN(sa6) = KEY_LEN_INET6;
- memcpy(&sa6.sin6_addr, paddr, sizeof(struct in6_addr));
- xent = (struct table_xentry *)(rnh->rnh_matchaddr(&sa6, rnh));
- break;
+ /*
+ * Verify user-supplied strings.
+	 * Check for null-terminated/zero-length strings.
+ */
+ tname = oh->ntlv.name;
+ if (check_table_name(tname) != 0)
+ return (EINVAL);
- case IPFW_TABLE_INTERFACE:
- KEY_LEN(iface) = KEY_LEN_IFACE +
- strlcpy(iface.ifname, (char *)paddr, IF_NAMESIZE) + 1;
- /* Assume direct match */
- /* FIXME: Add interface pattern matching */
- xent = (struct table_xentry *)(rnh->rnh_matchaddr(&iface, rnh));
- break;
+ objheader_to_ti(oh, &ti);
+ ti.type = i->type;
- default:
- return (0);
+ IPFW_UH_WLOCK(ch);
+ ni = CHAIN_TO_NI(ch);
+ if ((tc = find_table(ni, &ti)) == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (ESRCH);
}
- if (xent != NULL) {
- *val = xent->value;
- return (1);
+ /* Do not support any modifications for readonly tables */
+ if ((tc->ta->flags & TA_FLAG_READONLY) != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EACCES);
}
+
+ if ((i->mflags & IPFW_TMFLAGS_LIMIT) != 0)
+ tc->limit = i->limit;
+ if ((i->mflags & IPFW_TMFLAGS_LOCK) != 0)
+ tc->locked = ((i->flags & IPFW_TGFLAGS_LOCKED) != 0);
+ IPFW_UH_WUNLOCK(ch);
+
return (0);
}
+/*
+ * Creates new table.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_xtable_info ]
+ *
+ * Returns 0 on success
+ */
static int
-count_table_entry(struct radix_node *rn, void *arg)
+create_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
{
- u_int32_t * const cnt = arg;
+ struct _ipfw_obj_header *oh;
+ ipfw_xtable_info *i;
+ char *tname, *aname;
+ struct tid_info ti;
+ struct namedobj_instance *ni;
+
+ if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info))
+ return (EINVAL);
+
+ oh = (struct _ipfw_obj_header *)sd->kbuf;
+ i = (ipfw_xtable_info *)(oh + 1);
+
+ /*
+ * Verify user-supplied strings.
+	 * Check for null-terminated/zero-length strings.
+ */
+ tname = oh->ntlv.name;
+ aname = i->algoname;
+ if (check_table_name(tname) != 0 ||
+ strnlen(aname, sizeof(i->algoname)) == sizeof(i->algoname))
+ return (EINVAL);
+
+ if (aname[0] == '\0') {
+ /* Use default algorithm */
+ aname = NULL;
+ }
+
+ objheader_to_ti(oh, &ti);
+ ti.type = i->type;
+
+ ni = CHAIN_TO_NI(ch);
+
+ IPFW_UH_RLOCK(ch);
+ if (find_table(ni, &ti) != NULL) {
+ IPFW_UH_RUNLOCK(ch);
+ return (EEXIST);
+ }
+ IPFW_UH_RUNLOCK(ch);
+
+ return (create_table_internal(ch, &ti, aname, i, NULL, 0));
+}
+
+/*
+ * Creates new table based on @ti and @aname.
+ *
+ * Assumes @aname has already been checked and is valid.
+ * Stores allocated table kidx inside @pkidx (if non-NULL).
+ * Reference created table if @compat is non-zero.
+ *
+ * Returns 0 on success.
+ */
+static int
+create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti,
+ char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int compat)
+{
+ struct namedobj_instance *ni;
+ struct table_config *tc, *tc_new, *tmp;
+ struct table_algo *ta;
+ uint16_t kidx;
+
+ ni = CHAIN_TO_NI(ch);
+
+ ta = find_table_algo(CHAIN_TO_TCFG(ch), ti, aname);
+ if (ta == NULL)
+ return (ENOTSUP);
+
+ tc = alloc_table_config(ch, ti, ta, aname, i->tflags);
+ if (tc == NULL)
+ return (ENOMEM);
+
+ tc->vmask = i->vmask;
+ tc->limit = i->limit;
+ if (ta->flags & TA_FLAG_READONLY)
+ tc->locked = 1;
+ else
+ tc->locked = (i->flags & IPFW_TGFLAGS_LOCKED) != 0;
+
+ IPFW_UH_WLOCK(ch);
+
+	/* Check if the table has already been created */
+ tc_new = find_table(ni, ti);
+ if (tc_new != NULL) {
+
+ /*
+ * Compat: do not fail if we're
+ * requesting to create existing table
+ * which has the same type
+ */
+ if (compat == 0 || tc_new->no.subtype != tc->no.subtype) {
+ IPFW_UH_WUNLOCK(ch);
+ free_table_config(ni, tc);
+ return (EEXIST);
+ }
+
+ /* Exchange tc and tc_new for proper refcounting & freeing */
+ tmp = tc;
+ tc = tc_new;
+ tc_new = tmp;
+ } else {
+ /* New table */
+ if (ipfw_objhash_alloc_idx(ni, &kidx) != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ printf("Unable to allocate table index."
+ " Consider increasing net.inet.ip.fw.tables_max");
+ free_table_config(ni, tc);
+ return (EBUSY);
+ }
+ tc->no.kidx = kidx;
+ tc->no.etlv = IPFW_TLV_TBL_NAME;
+
+ IPFW_WLOCK(ch);
+ link_table(ch, tc);
+ IPFW_WUNLOCK(ch);
+ }
+
+ if (compat != 0)
+ tc->no.refcnt++;
+ if (pkidx != NULL)
+ *pkidx = tc->no.kidx;
+
+ IPFW_UH_WUNLOCK(ch);
+
+ if (tc_new != NULL)
+ free_table_config(ni, tc_new);
- (*cnt)++;
return (0);
}
+static void
+ntlv_to_ti(ipfw_obj_ntlv *ntlv, struct tid_info *ti)
+{
+
+ memset(ti, 0, sizeof(struct tid_info));
+ ti->set = ntlv->set;
+ ti->uidx = ntlv->idx;
+ ti->tlvs = ntlv;
+ ti->tlen = ntlv->head.length;
+}
+
+static void
+objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti)
+{
+
+ ntlv_to_ti(&oh->ntlv, ti);
+}
+
+struct namedobj_instance *
+ipfw_get_table_objhash(struct ip_fw_chain *ch)
+{
+
+ return (CHAIN_TO_NI(ch));
+}
+
+/*
+ * Exports basic table info as name TLV.
+ * Used inside dump_static_rules() to provide info
+ * about all tables referenced by current ruleset.
+ *
+ * Returns 0 on success.
+ */
int
-ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
+ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx,
+ struct sockopt_data *sd)
+{
+ struct namedobj_instance *ni;
+ struct named_object *no;
+ ipfw_obj_ntlv *ntlv;
+
+ ni = CHAIN_TO_NI(ch);
+
+ no = ipfw_objhash_lookup_kidx(ni, kidx);
+ KASSERT(no != NULL, ("invalid table kidx passed"));
+
+ ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv));
+ if (ntlv == NULL)
+ return (ENOMEM);
+
+ ntlv->head.type = IPFW_TLV_TBL_NAME;
+ ntlv->head.length = sizeof(*ntlv);
+ ntlv->idx = no->kidx;
+ strlcpy(ntlv->name, no->name, sizeof(ntlv->name));
+
+ return (0);
+}
+
+struct dump_args {
+ struct ip_fw_chain *ch;
+ struct table_info *ti;
+ struct table_config *tc;
+ struct sockopt_data *sd;
+ uint32_t cnt;
+ uint16_t uidx;
+ int error;
+ uint32_t size;
+ ipfw_table_entry *ent;
+ ta_foreach_f *f;
+ void *farg;
+ ipfw_obj_tentry tent;
+};
+
+static int
+count_ext_entries(void *e, void *arg)
{
- struct radix_node_head *rnh;
+ struct dump_args *da;
- if (tbl >= V_fw_tables_max)
+ da = (struct dump_args *)arg;
+ da->cnt++;
+
+ return (0);
+}
+
+/*
+ * Gets number of items from table either using
+ * internal counter or calling algo callback for
+ * externally-managed tables.
+ *
+ * Returns number of records.
+ */
+static uint32_t
+table_get_count(struct ip_fw_chain *ch, struct table_config *tc)
+{
+ struct table_info *ti;
+ struct table_algo *ta;
+ struct dump_args da;
+
+ ti = KIDX_TO_TI(ch, tc->no.kidx);
+ ta = tc->ta;
+
+ /* Use internal counter for self-managed tables */
+ if ((ta->flags & TA_FLAG_READONLY) == 0)
+ return (tc->count);
+
+ /* Use callback to quickly get number of items */
+ if ((ta->flags & TA_FLAG_EXTCOUNTER) != 0)
+ return (ta->get_count(tc->astate, ti));
+
+	/* Count the number of items ourselves */
+ memset(&da, 0, sizeof(da));
+ ta->foreach(tc->astate, ti, count_ext_entries, &da);
+
+ return (da.cnt);
+}
+
+/*
+ * Exports table @tc info into standard ipfw_xtable_info format.
+ */
+static void
+export_table_info(struct ip_fw_chain *ch, struct table_config *tc,
+ ipfw_xtable_info *i)
+{
+ struct table_info *ti;
+ struct table_algo *ta;
+
+ i->type = tc->no.subtype;
+ i->tflags = tc->tflags;
+ i->vmask = tc->vmask;
+ i->set = tc->no.set;
+ i->kidx = tc->no.kidx;
+ i->refcnt = tc->no.refcnt;
+ i->count = table_get_count(ch, tc);
+ i->limit = tc->limit;
+ i->flags |= (tc->locked != 0) ? IPFW_TGFLAGS_LOCKED : 0;
+ i->size = i->count * sizeof(ipfw_obj_tentry);
+ i->size += sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info);
+ strlcpy(i->tablename, tc->tablename, sizeof(i->tablename));
+ ti = KIDX_TO_TI(ch, tc->no.kidx);
+ ta = tc->ta;
+ if (ta->print_config != NULL) {
+ /* Use algo function to print table config to string */
+ ta->print_config(tc->astate, ti, i->algoname,
+ sizeof(i->algoname));
+ } else
+ strlcpy(i->algoname, ta->name, sizeof(i->algoname));
+ /* Dump algo-specific data, if possible */
+ if (ta->dump_tinfo != NULL) {
+ ta->dump_tinfo(tc->astate, ti, &i->ta_info);
+ i->ta_info.flags |= IPFW_TATFLAGS_DATA;
+ }
+}
+
+struct dump_table_args {
+ struct ip_fw_chain *ch;
+ struct sockopt_data *sd;
+};
+
+static int
+export_table_internal(struct namedobj_instance *ni, struct named_object *no,
+ void *arg)
+{
+ ipfw_xtable_info *i;
+ struct dump_table_args *dta;
+
+ dta = (struct dump_table_args *)arg;
+
+ i = (ipfw_xtable_info *)ipfw_get_sopt_space(dta->sd, sizeof(*i));
+ KASSERT(i != NULL, ("previously checked buffer is not enough"));
+
+ export_table_info(dta->ch, (struct table_config *)no, i);
+ return (0);
+}
+
+/*
+ * Export all tables as ipfw_xtable_info structures to
+ * storage provided by @sd.
+ *
+ * If supplied buffer is too small, fills in required size
+ * and returns ENOMEM.
+ * Returns 0 on success.
+ */
+static int
+export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh,
+ struct sockopt_data *sd)
+{
+ uint32_t size;
+ uint32_t count;
+ struct dump_table_args dta;
+
+ count = ipfw_objhash_count(CHAIN_TO_NI(ch));
+ size = count * sizeof(ipfw_xtable_info) + sizeof(ipfw_obj_lheader);
+
+	/* Fill in header regardless of buffer size */
+ olh->count = count;
+ olh->objsize = sizeof(ipfw_xtable_info);
+
+ if (size > olh->size) {
+ olh->size = size;
+ return (ENOMEM);
+ }
+
+ olh->size = size;
+
+ dta.ch = ch;
+ dta.sd = sd;
+
+ ipfw_objhash_foreach(CHAIN_TO_NI(ch), export_table_internal, &dta);
+
+ return (0);
+}
+
+/*
+ * Dumps all table data
+ * Data layout (v1)(current):
+ * Request: [ ipfw_obj_header ], size = ipfw_xtable_info.size
+ * Reply: [ ipfw_obj_header ipfw_xtable_info ipfw_obj_tentry x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+dump_table_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ struct _ipfw_obj_header *oh;
+ ipfw_xtable_info *i;
+ struct tid_info ti;
+ struct table_config *tc;
+ struct table_algo *ta;
+ struct dump_args da;
+ uint32_t sz;
+
+ sz = sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info);
+ oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
+ if (oh == NULL)
+ return (EINVAL);
+
+ i = (ipfw_xtable_info *)(oh + 1);
+ objheader_to_ti(oh, &ti);
+
+ IPFW_UH_RLOCK(ch);
+ if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
+ IPFW_UH_RUNLOCK(ch);
+ return (ESRCH);
+ }
+ export_table_info(ch, tc, i);
+
+ if (sd->valsize < i->size) {
+
+ /*
+ * Submitted buffer size is not enough.
+	 * We've already filled in the @i structure with
+ * relevant table info including size, so we
+ * can return. Buffer will be flushed automatically.
+ */
+ IPFW_UH_RUNLOCK(ch);
+ return (ENOMEM);
+ }
+
+ /*
+ * Do the actual dump in eXtended format
+ */
+ memset(&da, 0, sizeof(da));
+ da.ch = ch;
+ da.ti = KIDX_TO_TI(ch, tc->no.kidx);
+ da.tc = tc;
+ da.sd = sd;
+
+ ta = tc->ta;
+
+ ta->foreach(tc->astate, da.ti, dump_table_tentry, &da);
+ IPFW_UH_RUNLOCK(ch);
+
+ return (da.error);
+}
+
+/*
+ * Dumps all table data
+ * Data layout (version 0)(legacy):
+ * Request: [ ipfw_xtable ], size = IP_FW_TABLE_XGETSIZE()
+ * Reply: [ ipfw_xtable ipfw_table_xentry x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+dump_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_xtable *xtbl;
+ struct tid_info ti;
+ struct table_config *tc;
+ struct table_algo *ta;
+ struct dump_args da;
+ size_t sz, count;
+
+ xtbl = (ipfw_xtable *)ipfw_get_sopt_header(sd, sizeof(ipfw_xtable));
+ if (xtbl == NULL)
return (EINVAL);
- *cnt = 0;
- if ((rnh = ch->tables[tbl]) == NULL)
+
+ memset(&ti, 0, sizeof(ti));
+ ti.uidx = xtbl->tbl;
+
+ IPFW_UH_RLOCK(ch);
+ if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
+ IPFW_UH_RUNLOCK(ch);
return (0);
- rnh->rnh_walktree(rnh, count_table_entry, cnt);
+ }
+ count = table_get_count(ch, tc);
+ sz = count * sizeof(ipfw_table_xentry) + sizeof(ipfw_xtable);
+
+ xtbl->cnt = count;
+ xtbl->size = sz;
+ xtbl->type = tc->no.subtype;
+ xtbl->tbl = ti.uidx;
+
+ if (sd->valsize < sz) {
+
+ /*
+ * Submitted buffer size is not enough.
+	 * We've already filled in the @xtbl structure with
+ * relevant table info including size, so we
+ * can return. Buffer will be flushed automatically.
+ */
+ IPFW_UH_RUNLOCK(ch);
+ return (ENOMEM);
+ }
+
+ /* Do the actual dump in eXtended format */
+ memset(&da, 0, sizeof(da));
+ da.ch = ch;
+ da.ti = KIDX_TO_TI(ch, tc->no.kidx);
+ da.tc = tc;
+ da.sd = sd;
+
+ ta = tc->ta;
+
+ ta->foreach(tc->astate, da.ti, dump_table_xentry, &da);
+ IPFW_UH_RUNLOCK(ch);
+
+ return (0);
+}
+
+/*
+ * Legacy function to retrieve the buffer size needed to dump a table.
+ */
+static int
+get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ uint32_t *tbl;
+ struct tid_info ti;
+ size_t sz;
+ int error;
+
+ sz = sizeof(*op3) + sizeof(uint32_t);
+ op3 = (ip_fw3_opheader *)ipfw_get_sopt_header(sd, sz);
+ if (op3 == NULL)
+ return (EINVAL);
+
+ tbl = (uint32_t *)(op3 + 1);
+ memset(&ti, 0, sizeof(ti));
+ ti.uidx = *tbl;
+ IPFW_UH_RLOCK(ch);
+ error = ipfw_count_xtable(ch, &ti, tbl);
+ IPFW_UH_RUNLOCK(ch);
+ return (error);
+}
+
+/*
+ * Legacy IP_FW_TABLE_GETSIZE handler
+ */
+int
+ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt)
+{
+ struct table_config *tc;
+
+ if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL)
+ return (ESRCH);
+ *cnt = table_get_count(ch, tc);
+ return (0);
+}
+
+/*
+ * Legacy IP_FW_TABLE_XGETSIZE handler
+ */
+int
+ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt)
+{
+ struct table_config *tc;
+ uint32_t count;
+
+ if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) {
+ *cnt = 0;
+ return (0); /* 'table all list' requires success */
+ }
+
+ count = table_get_count(ch, tc);
+ *cnt = count * sizeof(ipfw_table_xentry);
+ if (count > 0)
+ *cnt += sizeof(ipfw_xtable);
return (0);
}
static int
-dump_table_entry(struct radix_node *rn, void *arg)
+dump_table_entry(void *e, void *arg)
{
- struct table_entry * const n = (struct table_entry *)rn;
- ipfw_table * const tbl = arg;
+ struct dump_args *da;
+ struct table_config *tc;
+ struct table_algo *ta;
ipfw_table_entry *ent;
+ struct table_value *pval;
+ int error;
+
+ da = (struct dump_args *)arg;
+
+ tc = da->tc;
+ ta = tc->ta;
- if (tbl->cnt == tbl->size)
+	/* Out of room in the output array, stop */
+ if (da->cnt == da->size)
return (1);
- ent = &tbl->ent[tbl->cnt];
- ent->tbl = tbl->tbl;
- if (in_nullhost(n->mask.sin_addr))
- ent->masklen = 0;
- else
- ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr));
- ent->addr = n->addr.sin_addr.s_addr;
- ent->value = n->value;
- tbl->cnt++;
+ ent = da->ent++;
+ ent->tbl = da->uidx;
+ da->cnt++;
+
+ error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent);
+ if (error != 0)
+ return (error);
+
+ ent->addr = da->tent.k.addr.s_addr;
+ ent->masklen = da->tent.masklen;
+ pval = get_table_value(da->ch, da->tc, da->tent.v.kidx);
+ ent->value = ipfw_export_table_value_legacy(pval);
+
return (0);
}
+/*
+ * Dumps table in pre-8.1 legacy format.
+ */
int
-ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
+ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti,
+ ipfw_table *tbl)
{
- struct radix_node_head *rnh;
+ struct table_config *tc;
+ struct table_algo *ta;
+ struct dump_args da;
- if (tbl->tbl >= V_fw_tables_max)
- return (EINVAL);
tbl->cnt = 0;
- if ((rnh = ch->tables[tbl->tbl]) == NULL)
+
+ if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL)
+ return (0); /* XXX: We should return ESRCH */
+
+ ta = tc->ta;
+
+ /* This dump format supports IPv4 only */
+ if (tc->no.subtype != IPFW_TABLE_ADDR)
return (0);
- rnh->rnh_walktree(rnh, dump_table_entry, tbl);
+
+ memset(&da, 0, sizeof(da));
+ da.ch = ch;
+ da.ti = KIDX_TO_TI(ch, tc->no.kidx);
+ da.tc = tc;
+ da.ent = &tbl->ent[0];
+ da.size = tbl->size;
+
+ tbl->cnt = 0;
+ ta->foreach(tc->astate, da.ti, dump_table_entry, &da);
+ tbl->cnt = da.cnt;
+
+ return (0);
+}
+
+/*
+ * Dumps table entry in eXtended format (v1)(current).
+ */
+static int
+dump_table_tentry(void *e, void *arg)
+{
+ struct dump_args *da;
+ struct table_config *tc;
+ struct table_algo *ta;
+ struct table_value *pval;
+ ipfw_obj_tentry *tent;
+ int error;
+
+ da = (struct dump_args *)arg;
+
+ tc = da->tc;
+ ta = tc->ta;
+
+ tent = (ipfw_obj_tentry *)ipfw_get_sopt_space(da->sd, sizeof(*tent));
+ /* Out of memory, returning */
+ if (tent == NULL) {
+ da->error = ENOMEM;
+ return (1);
+ }
+ tent->head.length = sizeof(ipfw_obj_tentry);
+ tent->idx = da->uidx;
+
+ error = ta->dump_tentry(tc->astate, da->ti, e, tent);
+ if (error != 0)
+ return (error);
+
+ pval = get_table_value(da->ch, da->tc, tent->v.kidx);
+ ipfw_export_table_value_v1(pval, &tent->v.value);
+
+ return (0);
+}
+
+/*
+ * Dumps table entry in eXtended format (v0).
+ */
+static int
+dump_table_xentry(void *e, void *arg)
+{
+ struct dump_args *da;
+ struct table_config *tc;
+ struct table_algo *ta;
+ ipfw_table_xentry *xent;
+ ipfw_obj_tentry *tent;
+ struct table_value *pval;
+ int error;
+
+ da = (struct dump_args *)arg;
+
+ tc = da->tc;
+ ta = tc->ta;
+
+ xent = (ipfw_table_xentry *)ipfw_get_sopt_space(da->sd, sizeof(*xent));
+ /* Out of memory, returning */
+ if (xent == NULL)
+ return (1);
+ xent->len = sizeof(ipfw_table_xentry);
+ xent->tbl = da->uidx;
+
+ memset(&da->tent, 0, sizeof(da->tent));
+ tent = &da->tent;
+ error = ta->dump_tentry(tc->astate, da->ti, e, tent);
+ if (error != 0)
+ return (error);
+
+ /* Convert current format to previous one */
+ xent->masklen = tent->masklen;
+ pval = get_table_value(da->ch, da->tc, da->tent.v.kidx);
+ xent->value = ipfw_export_table_value_legacy(pval);
+ /* Apply some hacks */
+ if (tc->no.subtype == IPFW_TABLE_ADDR && tent->subtype == AF_INET) {
+ xent->k.addr6.s6_addr32[3] = tent->k.addr.s_addr;
+ xent->flags = IPFW_TCF_INET;
+ } else
+ memcpy(&xent->k, &tent->k, sizeof(xent->k));
+
return (0);
}
+/*
+ * Helper function to export table algo data
+ * to tentry format before calling user function.
+ *
+ * Returns 0 on success.
+ */
static int
-count_table_xentry(struct radix_node *rn, void *arg)
+prepare_table_tentry(void *e, void *arg)
{
- uint32_t * const cnt = arg;
+ struct dump_args *da;
+ struct table_config *tc;
+ struct table_algo *ta;
+ int error;
+
+ da = (struct dump_args *)arg;
+
+ tc = da->tc;
+ ta = tc->ta;
+
+ error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent);
+ if (error != 0)
+ return (error);
+
+ da->f(&da->tent, da->farg);
- (*cnt) += sizeof(ipfw_table_xentry);
return (0);
}
+/*
+ * Allow external consumers to read table entries in standard format.
+ */
int
-ipfw_count_xtable(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
+ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx,
+ ta_foreach_f *f, void *arg)
+{
+ struct namedobj_instance *ni;
+ struct table_config *tc;
+ struct table_algo *ta;
+ struct dump_args da;
+
+ ni = CHAIN_TO_NI(ch);
+
+ tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx);
+ if (tc == NULL)
+ return (ESRCH);
+
+ ta = tc->ta;
+
+ memset(&da, 0, sizeof(da));
+ da.ch = ch;
+ da.ti = KIDX_TO_TI(ch, tc->no.kidx);
+ da.tc = tc;
+ da.f = f;
+ da.farg = arg;
+
+ ta->foreach(tc->astate, da.ti, prepare_table_tentry, &da);
+
+ return (0);
+}
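+
+/*
+ * Usage sketch: a hypothetical external consumer could walk a table via
+ * the interface above and count its IPv4 entries.  count_inet() and the
+ * count variable are made-up names for this illustration only:
+ *
+ *	static int
+ *	count_inet(void *tentry, void *arg)
+ *	{
+ *		ipfw_obj_tentry *tent = tentry;
+ *
+ *		if (tent->subtype == AF_INET)
+ *			(*(uint32_t *)arg)++;
+ *		return (0);
+ *	}
+ *
+ *	uint32_t count = 0;
+ *	ipfw_foreach_table_tentry(ch, kidx, count_inet, &count);
+ */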
+
+/*
+ * Table algorithms
+ */
+
+/*
+ * Finds algorithm by index, table type or supplied name.
+ *
+ * Returns pointer to algo or NULL.
+ */
+static struct table_algo *
+find_table_algo(struct tables_config *tcfg, struct tid_info *ti, char *name)
{
- struct radix_node_head *rnh;
+ int i, l;
+ struct table_algo *ta;
+
+ if (ti->type > IPFW_TABLE_MAXTYPE)
+ return (NULL);
+
+ /* Search by index */
+ if (ti->atype != 0) {
+ if (ti->atype > tcfg->algo_count)
+ return (NULL);
+ return (tcfg->algo[ti->atype]);
+ }
+
+ if (name == NULL) {
+ /* Return default algorithm for given type if set */
+ return (tcfg->def_algo[ti->type]);
+ }
+
+ /* Search by name */
+ /* TODO: better search */
+ for (i = 1; i <= tcfg->algo_count; i++) {
+ ta = tcfg->algo[i];
+
+ /*
+ * One can supply additional algorithm
+ * parameters so we compare only the first word
+ * of supplied name:
+ * 'addr:chash hsize=32'
+ * '^^^^^^^^^'
+ *
+ */
+ l = strlen(ta->name);
+ if (strncmp(name, ta->name, l) != 0)
+ continue;
+ if (name[l] != '\0' && name[l] != ' ')
+ continue;
+ /* Check if we're requesting proper table type */
+ if (ti->type != 0 && ti->type != ta->type)
+ return (NULL);
+ return (ta);
+ }
- if (tbl >= V_fw_tables_max)
+ return (NULL);
+}
+
+/*
+ * Register new table algo @ta.
+ * Stores algo id inside @idx.
+ *
+ * Returns 0 on success.
+ */
+int
+ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, size_t size,
+ int *idx)
+{
+ struct tables_config *tcfg;
+ struct table_algo *ta_new;
+ size_t sz;
+
+ if (size > sizeof(struct table_algo))
return (EINVAL);
- *cnt = 0;
- if ((rnh = ch->tables[tbl]) != NULL)
- rnh->rnh_walktree(rnh, count_table_xentry, cnt);
- if ((rnh = ch->xtables[tbl]) != NULL)
- rnh->rnh_walktree(rnh, count_table_xentry, cnt);
- /* Return zero if table is empty */
- if (*cnt > 0)
- (*cnt) += sizeof(ipfw_xtable);
+
+ /* Check for the required on-stack size for add/del */
+ sz = roundup2(ta->ta_buf_size, sizeof(void *));
+ if (sz > TA_BUF_SZ)
+ return (EINVAL);
+
+ KASSERT(ta->type <= IPFW_TABLE_MAXTYPE,("Increase IPFW_TABLE_MAXTYPE"));
+
+ /* Copy algorithm data to stable storage. */
+ ta_new = malloc(sizeof(struct table_algo), M_IPFW, M_WAITOK | M_ZERO);
+ memcpy(ta_new, ta, size);
+
+ tcfg = CHAIN_TO_TCFG(ch);
+
+ KASSERT(tcfg->algo_count < 255, ("Increase algo array size"));
+
+ tcfg->algo[++tcfg->algo_count] = ta_new;
+ ta_new->idx = tcfg->algo_count;
+
+ /* Set algorithm as default one for given type */
+ if ((ta_new->flags & TA_FLAG_DEFAULT) != 0 &&
+ tcfg->def_algo[ta_new->type] == NULL)
+ tcfg->def_algo[ta_new->type] = ta_new;
+
+ *idx = ta_new->idx;
+
return (0);
}
+/*
+ * Unregisters table algo using @idx as id.
+ * XXX: It is NOT safe to call this function in any place
+ * other than ipfw instance destroy handler.
+ */
+void
+ipfw_del_table_algo(struct ip_fw_chain *ch, int idx)
+{
+ struct tables_config *tcfg;
+ struct table_algo *ta;
+
+ tcfg = CHAIN_TO_TCFG(ch);
+
+ KASSERT(idx <= tcfg->algo_count, ("algo idx %d out of range 1..%d",
+ idx, tcfg->algo_count));
+ ta = tcfg->algo[idx];
+ KASSERT(ta != NULL, ("algo idx %d is NULL", idx));
+
+ if (tcfg->def_algo[ta->type] == ta)
+ tcfg->def_algo[ta->type] = NULL;
+
+ free(ta, M_IPFW);
+}
+
+/*
+ * Lists all table algorithms currently available.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
+ * Reply: [ ipfw_obj_lheader ipfw_ta_info x N ]
+ *
+ * Returns 0 on success
+ */
static int
-dump_table_xentry_base(struct radix_node *rn, void *arg)
+list_table_algo(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
{
- struct table_entry * const n = (struct table_entry *)rn;
- ipfw_xtable * const tbl = arg;
- ipfw_table_xentry *xent;
+ struct _ipfw_obj_lheader *olh;
+ struct tables_config *tcfg;
+ ipfw_ta_info *i;
+ struct table_algo *ta;
+ uint32_t count, n, size;
+
+ olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
+ if (olh == NULL)
+ return (EINVAL);
+ if (sd->valsize < olh->size)
+ return (EINVAL);
+
+ IPFW_UH_RLOCK(ch);
+ tcfg = CHAIN_TO_TCFG(ch);
+ count = tcfg->algo_count;
+ size = count * sizeof(ipfw_ta_info) + sizeof(ipfw_obj_lheader);
+
+	/* Fill in the header regardless of buffer size */
+ olh->count = count;
+ olh->objsize = sizeof(ipfw_ta_info);
+
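+	/* Buffer too small: report the required size back to userland */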
+ if (size > olh->size) {
+ olh->size = size;
+ IPFW_UH_RUNLOCK(ch);
+ return (ENOMEM);
+ }
+ olh->size = size;
+
+ for (n = 1; n <= count; n++) {
+ i = (ipfw_ta_info *)ipfw_get_sopt_space(sd, sizeof(*i));
+ KASSERT(i != NULL, ("previously checked buffer is not enough"));
+ ta = tcfg->algo[n];
+ strlcpy(i->algoname, ta->name, sizeof(i->algoname));
+ i->type = ta->type;
+ i->refcnt = ta->refcnt;
+ }
+
+ IPFW_UH_RUNLOCK(ch);
- /* Out of memory, returning */
- if (tbl->cnt == tbl->size)
- return (1);
- xent = &tbl->xent[tbl->cnt];
- xent->len = sizeof(ipfw_table_xentry);
- xent->tbl = tbl->tbl;
- if (in_nullhost(n->mask.sin_addr))
- xent->masklen = 0;
- else
- xent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr));
- /* Save IPv4 address as deprecated IPv6 compatible */
- xent->k.addr6.s6_addr32[3] = n->addr.sin_addr.s_addr;
- xent->value = n->value;
- tbl->cnt++;
return (0);
}
static int
-dump_table_xentry_extended(struct radix_node *rn, void *arg)
+classify_srcdst(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
{
- struct table_xentry * const n = (struct table_xentry *)rn;
- ipfw_xtable * const tbl = arg;
- ipfw_table_xentry *xent;
-#ifdef INET6
- int i;
- uint32_t *v;
-#endif
- /* Out of memory, returning */
- if (tbl->cnt == tbl->size)
+	int v;
+
+	/* Basic IPv4/IPv6 or u32 lookups */
+	*puidx = cmd->arg1;
+	/* Assume ADDR by default */
+	*ptype = IPFW_TABLE_ADDR;
+
+ if (F_LEN(cmd) > F_INSN_SIZE(ipfw_insn_u32)) {
+ /*
+ * generic lookup. The key must be
+ * in 32bit big-endian format.
+ */
+ v = ((ipfw_insn_u32 *)cmd)->d[1];
+ switch (v) {
+ case 0:
+ case 1:
+ /* IPv4 src/dst */
+ break;
+ case 2:
+ case 3:
+ /* src/dst port */
+ *ptype = IPFW_TABLE_NUMBER;
+ break;
+ case 4:
+ /* uid/gid */
+ *ptype = IPFW_TABLE_NUMBER;
+ break;
+ case 5:
+ /* jid */
+ *ptype = IPFW_TABLE_NUMBER;
+ break;
+ case 6:
+ /* dscp */
+ *ptype = IPFW_TABLE_NUMBER;
+ break;
+ }
+ }
+
+ return (0);
+}
+
+static int
+classify_via(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
+{
+ ipfw_insn_if *cmdif;
+
+ /* Interface table, possibly */
+ cmdif = (ipfw_insn_if *)cmd;
+ if (cmdif->name[0] != '\1')
return (1);
- xent = &tbl->xent[tbl->cnt];
- xent->len = sizeof(ipfw_table_xentry);
- xent->tbl = tbl->tbl;
-
- switch (tbl->type) {
-#ifdef INET6
- case IPFW_TABLE_CIDR:
- /* Count IPv6 mask */
- v = (uint32_t *)&n->m.mask6.sin6_addr;
- for (i = 0; i < sizeof(struct in6_addr) / 4; i++, v++)
- xent->masklen += bitcount32(*v);
- memcpy(&xent->k, &n->a.addr6.sin6_addr, sizeof(struct in6_addr));
- break;
-#endif
- case IPFW_TABLE_INTERFACE:
- /* Assume exact mask */
- xent->masklen = 8 * IF_NAMESIZE;
- memcpy(&xent->k, &n->a.iface.ifname, IF_NAMESIZE);
+
+ *ptype = IPFW_TABLE_INTERFACE;
+ *puidx = cmdif->p.kidx;
+
+ return (0);
+}
+
+static int
+classify_flow(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
+{
+
+ *puidx = cmd->arg1;
+ *ptype = IPFW_TABLE_FLOW;
+
+ return (0);
+}
+
+static void
+update_arg1(ipfw_insn *cmd, uint16_t idx)
+{
+
+ cmd->arg1 = idx;
+}
+
+static void
+update_via(ipfw_insn *cmd, uint16_t idx)
+{
+ ipfw_insn_if *cmdif;
+
+ cmdif = (ipfw_insn_if *)cmd;
+ cmdif->p.kidx = idx;
+}
+
+static int
+table_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct named_object **pno)
+{
+ struct table_config *tc;
+ int error;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ error = find_table_err(CHAIN_TO_NI(ch), ti, &tc);
+ if (error != 0)
+ return (error);
+
+ *pno = &tc->no;
+ return (0);
+}
+
+/* XXX: sets-sets! */
+static struct named_object *
+table_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
+{
+ struct namedobj_instance *ni;
+ struct table_config *tc;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+ ni = CHAIN_TO_NI(ch);
+ tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, idx);
+ KASSERT(tc != NULL, ("Table with index %d not found", idx));
+
+ return (&tc->no);
+}
+
+static int
+table_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set,
+ enum ipfw_sets_cmd cmd)
+{
+
+ switch (cmd) {
+ case SWAP_ALL:
+ case TEST_ALL:
+ /*
+ * Return success for TEST_ALL, since nothing prevents
+		 * moving rules from one set to another. All tables are
+		 * accessible from all sets when the per-set tables sysctl
+ * is disabled.
+ */
+ case MOVE_ALL:
+ case TEST_ONE:
+ case MOVE_ONE:
+ /*
+ * NOTE: we need to use ipfw_objhash_del/ipfw_objhash_add
+		 * if the set number is used in the hash function. Currently
+		 * we can just use the generic handler that replaces the set value.
+ */
+ if (V_fw_tables_sets == 0)
+ return (0);
break;
-
- default:
- /* unknown, skip entry */
+ case COUNT_ONE:
+ /*
+ * Return EOPNOTSUPP for COUNT_ONE when per-set sysctl is
+		 * disabled. This allows table opcodes to be skipped in the
+		 * additional checks when rules are moved to another set.
+ */
+ if (V_fw_tables_sets == 0)
+ return (EOPNOTSUPP);
+ }
+ /* Use generic sets handler when per-set sysctl is enabled. */
+ return (ipfw_obj_manage_sets(CHAIN_TO_NI(ch), IPFW_TLV_TBL_NAME,
+ set, new_set, cmd));
+}
+
+static struct opcode_obj_rewrite opcodes[] = {
+ {
+ .opcode = O_IP_SRC_LOOKUP,
+ .etlv = IPFW_TLV_TBL_NAME,
+ .classifier = classify_srcdst,
+ .update = update_arg1,
+ .find_byname = table_findbyname,
+ .find_bykidx = table_findbykidx,
+ .create_object = create_table_compat,
+ .manage_sets = table_manage_sets,
+ },
+ {
+ .opcode = O_IP_DST_LOOKUP,
+ .etlv = IPFW_TLV_TBL_NAME,
+ .classifier = classify_srcdst,
+ .update = update_arg1,
+ .find_byname = table_findbyname,
+ .find_bykidx = table_findbykidx,
+ .create_object = create_table_compat,
+ .manage_sets = table_manage_sets,
+ },
+ {
+ .opcode = O_IP_FLOW_LOOKUP,
+ .etlv = IPFW_TLV_TBL_NAME,
+ .classifier = classify_flow,
+ .update = update_arg1,
+ .find_byname = table_findbyname,
+ .find_bykidx = table_findbykidx,
+ .create_object = create_table_compat,
+ .manage_sets = table_manage_sets,
+ },
+ {
+ .opcode = O_XMIT,
+ .etlv = IPFW_TLV_TBL_NAME,
+ .classifier = classify_via,
+ .update = update_via,
+ .find_byname = table_findbyname,
+ .find_bykidx = table_findbykidx,
+ .create_object = create_table_compat,
+ .manage_sets = table_manage_sets,
+ },
+ {
+ .opcode = O_RECV,
+ .etlv = IPFW_TLV_TBL_NAME,
+ .classifier = classify_via,
+ .update = update_via,
+ .find_byname = table_findbyname,
+ .find_bykidx = table_findbykidx,
+ .create_object = create_table_compat,
+ .manage_sets = table_manage_sets,
+ },
+ {
+ .opcode = O_VIA,
+ .etlv = IPFW_TLV_TBL_NAME,
+ .classifier = classify_via,
+ .update = update_via,
+ .find_byname = table_findbyname,
+ .find_bykidx = table_findbykidx,
+ .create_object = create_table_compat,
+ .manage_sets = table_manage_sets,
+ },
+};
+
+static int
+test_sets_cb(struct namedobj_instance *ni __unused, struct named_object *no,
+ void *arg __unused)
+{
+
+	/* Check that there aren't any tables in a non-default set */
+ if (no->set != 0)
+ return (EBUSY);
+ return (0);
+}
+
+/*
+ * Switches between "set 0" and "rule's set" table binding.
+ * Checks all ruleset bindings and permits the change only if
+ * each binding has both its rule AND table in the default set (set 0).
+ *
+ * Returns 0 on success.
+ */
+int
+ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int sets)
+{
+ struct opcode_obj_rewrite *rw;
+ struct namedobj_instance *ni;
+ struct named_object *no;
+ struct ip_fw *rule;
+ ipfw_insn *cmd;
+ int cmdlen, i, l;
+ uint16_t kidx;
+ uint8_t subtype;
+
+ IPFW_UH_WLOCK(ch);
+
+ if (V_fw_tables_sets == sets) {
+ IPFW_UH_WUNLOCK(ch);
return (0);
}
+ ni = CHAIN_TO_NI(ch);
+ if (sets == 0) {
+ /*
+ * Prevent disabling sets support if we have some tables
+		 * in non-default sets.
+ */
+ if (ipfw_objhash_foreach_type(ni, test_sets_cb,
+ NULL, IPFW_TLV_TBL_NAME) != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EBUSY);
+ }
+ }
+ /*
+ * Scan all rules and examine tables opcodes.
+ */
+ for (i = 0; i < ch->n_rules; i++) {
+ rule = ch->map[i];
+
+ l = rule->cmd_len;
+ cmd = rule->cmd;
+ cmdlen = 0;
+ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+ /* Check only tables opcodes */
+ for (kidx = 0, rw = opcodes;
+ rw < opcodes + nitems(opcodes); rw++) {
+ if (rw->opcode != cmd->opcode)
+ continue;
+ if (rw->classifier(cmd, &kidx, &subtype) == 0)
+ break;
+ }
+ if (kidx == 0)
+ continue;
+ no = ipfw_objhash_lookup_kidx(ni, kidx);
+			/* Check that both table object and rule are in set 0 */
+ if (no->set != 0 || rule->set != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EBUSY);
+ }
+
+ }
+ }
+ V_fw_tables_sets = sets;
+ IPFW_UH_WUNLOCK(ch);
+ return (0);
+}
+
+/*
+ * Checks table name for validity.
+ * Enforces basic length checks; the rest
+ * should be done in userland.
+ *
+ * Returns 0 if name is considered valid.
+ */
+static int
+check_table_name(const char *name)
+{
+
+ /*
+ * TODO: do some more complicated checks
+ */
+ return (ipfw_check_object_name_generic(name));
+}
+
+/*
+ * Finds table config based on either legacy index
+ * or name in ntlv.
+ * Note @ti structure contains unchecked data from userland.
+ *
+ * Returns 0 on success and fills in @tc with the found config.
+ */
+static int
+find_table_err(struct namedobj_instance *ni, struct tid_info *ti,
+ struct table_config **tc)
+{
+ char *name, bname[16];
+ struct named_object *no;
+ ipfw_obj_ntlv *ntlv;
+ uint32_t set;
+
+ if (ti->tlvs != NULL) {
+ ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
+ IPFW_TLV_TBL_NAME);
+ if (ntlv == NULL)
+ return (EINVAL);
+ name = ntlv->name;
+
+ /*
+ * Use set provided by @ti instead of @ntlv one.
+ * This is needed due to different sets behavior
+ * controlled by V_fw_tables_sets.
+ */
+ set = (V_fw_tables_sets != 0) ? ti->set : 0;
+ } else {
+ snprintf(bname, sizeof(bname), "%d", ti->uidx);
+ name = bname;
+ set = 0;
+ }
+
+ no = ipfw_objhash_lookup_name(ni, set, name);
+ *tc = (struct table_config *)no;
+
+ return (0);
+}
+
+/*
+ * Finds table config based on either legacy index
+ * or name in ntlv.
+ * Note @ti structure contains unchecked data from userland.
+ *
+ * Returns pointer to table_config or NULL.
+ */
+static struct table_config *
+find_table(struct namedobj_instance *ni, struct tid_info *ti)
+{
+ struct table_config *tc;
+
+ if (find_table_err(ni, ti, &tc) != 0)
+ return (NULL);
+
+ return (tc);
+}
+
+/*
+ * Allocate new table config structure using
+ * specified @algo and @aname.
+ *
+ * Returns pointer to config or NULL.
+ */
+static struct table_config *
+alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct table_algo *ta, char *aname, uint8_t tflags)
+{
+ char *name, bname[16];
+ struct table_config *tc;
+ int error;
+ ipfw_obj_ntlv *ntlv;
+ uint32_t set;
+
+ if (ti->tlvs != NULL) {
+ ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
+ IPFW_TLV_TBL_NAME);
+ if (ntlv == NULL)
+ return (NULL);
+ name = ntlv->name;
+ set = ntlv->set;
+ } else {
+ /* Compat part: convert number to string representation */
+ snprintf(bname, sizeof(bname), "%d", ti->uidx);
+ name = bname;
+ set = 0;
+ }
+
+ tc = malloc(sizeof(struct table_config), M_IPFW, M_WAITOK | M_ZERO);
+ tc->no.name = tc->tablename;
+ tc->no.subtype = ta->type;
+ tc->no.set = set;
+ tc->tflags = tflags;
+ tc->ta = ta;
+ strlcpy(tc->tablename, name, sizeof(tc->tablename));
+ /* Set "shared" value type by default */
+ tc->vshared = 1;
+
+ /* Preallocate data structures for new tables */
+ error = ta->init(ch, &tc->astate, &tc->ti_copy, aname, tflags);
+ if (error != 0) {
+ free(tc, M_IPFW);
+ return (NULL);
+ }
+
+ return (tc);
+}
+
+/*
+ * Destroys table state and config.
+ */
+static void
+free_table_config(struct namedobj_instance *ni, struct table_config *tc)
+{
+
+ KASSERT(tc->linked == 0, ("free() on linked config"));
+ /* UH lock MUST NOT be held */
+
+ /*
+ * We're using ta without any locking/referencing.
+ * TODO: fix this if we're going to use unloadable algos.
+ */
+ tc->ta->destroy(tc->astate, &tc->ti_copy);
+ free(tc, M_IPFW);
+}
+
+/*
+ * Links @tc to @chain table named instance.
+ * Sets appropriate type/states in @chain table info.
+ */
+static void
+link_table(struct ip_fw_chain *ch, struct table_config *tc)
+{
+ struct namedobj_instance *ni;
+ struct table_info *ti;
+ uint16_t kidx;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+ IPFW_WLOCK_ASSERT(ch);
+
+ ni = CHAIN_TO_NI(ch);
+ kidx = tc->no.kidx;
+
+ ipfw_objhash_add(ni, &tc->no);
+
+ ti = KIDX_TO_TI(ch, kidx);
+ *ti = tc->ti_copy;
+
+ /* Notify algo on real @ti address */
+ if (tc->ta->change_ti != NULL)
+ tc->ta->change_ti(tc->astate, ti);
+
+ tc->linked = 1;
+ tc->ta->refcnt++;
+}
+
+/*
+ * Unlinks @tc from @chain table named instance.
+ * Zeroes states in @chain and stores them in @tc.
+ */
+static void
+unlink_table(struct ip_fw_chain *ch, struct table_config *tc)
+{
+ struct namedobj_instance *ni;
+ struct table_info *ti;
+ uint16_t kidx;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+ IPFW_WLOCK_ASSERT(ch);
+
+ ni = CHAIN_TO_NI(ch);
+ kidx = tc->no.kidx;
+
+ /* Clear state. @ti copy is already saved inside @tc */
+ ipfw_objhash_del(ni, &tc->no);
+ ti = KIDX_TO_TI(ch, kidx);
+ memset(ti, 0, sizeof(struct table_info));
+ tc->linked = 0;
+ tc->ta->refcnt--;
+
+ /* Notify algo on real @ti address */
+ if (tc->ta->change_ti != NULL)
+ tc->ta->change_ti(tc->astate, NULL);
+}
+
+static struct ipfw_sopt_handler scodes[] = {
+ { IP_FW_TABLE_XCREATE, 0, HDIR_SET, create_table },
+ { IP_FW_TABLE_XDESTROY, 0, HDIR_SET, flush_table_v0 },
+ { IP_FW_TABLE_XFLUSH, 0, HDIR_SET, flush_table_v0 },
+ { IP_FW_TABLE_XMODIFY, 0, HDIR_BOTH, modify_table },
+ { IP_FW_TABLE_XINFO, 0, HDIR_GET, describe_table },
+ { IP_FW_TABLES_XLIST, 0, HDIR_GET, list_tables },
+ { IP_FW_TABLE_XLIST, 0, HDIR_GET, dump_table_v0 },
+ { IP_FW_TABLE_XLIST, 1, HDIR_GET, dump_table_v1 },
+ { IP_FW_TABLE_XADD, 0, HDIR_BOTH, manage_table_ent_v0 },
+ { IP_FW_TABLE_XADD, 1, HDIR_BOTH, manage_table_ent_v1 },
+ { IP_FW_TABLE_XDEL, 0, HDIR_BOTH, manage_table_ent_v0 },
+ { IP_FW_TABLE_XDEL, 1, HDIR_BOTH, manage_table_ent_v1 },
+ { IP_FW_TABLE_XFIND, 0, HDIR_GET, find_table_entry },
+ { IP_FW_TABLE_XSWAP, 0, HDIR_SET, swap_table },
+ { IP_FW_TABLES_ALIST, 0, HDIR_GET, list_table_algo },
+ { IP_FW_TABLE_XGETSIZE, 0, HDIR_GET, get_table_size },
+};
- xent->value = n->value;
- tbl->cnt++;
+static int
+destroy_table_locked(struct namedobj_instance *ni, struct named_object *no,
+ void *arg)
+{
+
+ unlink_table((struct ip_fw_chain *)arg, (struct table_config *)no);
+ if (ipfw_objhash_free_idx(ni, no->kidx) != 0)
+ printf("Error unlinking kidx %d from table %s\n",
+ no->kidx, no->name);
+ free_table_config(ni, (struct table_config *)no);
return (0);
}
+/*
+ * Shuts tables module down.
+ */
+void
+ipfw_destroy_tables(struct ip_fw_chain *ch, int last)
+{
+
+ IPFW_DEL_SOPT_HANDLER(last, scodes);
+ IPFW_DEL_OBJ_REWRITER(last, opcodes);
+
+ /* Remove all tables from working set */
+ IPFW_UH_WLOCK(ch);
+ IPFW_WLOCK(ch);
+ ipfw_objhash_foreach(CHAIN_TO_NI(ch), destroy_table_locked, ch);
+ IPFW_WUNLOCK(ch);
+ IPFW_UH_WUNLOCK(ch);
+
+	/* Free the tablestate storage itself */
+ free(ch->tablestate, M_IPFW);
+
+ ipfw_table_value_destroy(ch, last);
+ ipfw_table_algo_destroy(ch);
+
+ ipfw_objhash_destroy(CHAIN_TO_NI(ch));
+ free(CHAIN_TO_TCFG(ch), M_IPFW);
+}
+
+/*
+ * Starts tables module.
+ */
int
-ipfw_dump_xtable(struct ip_fw_chain *ch, ipfw_xtable *tbl)
+ipfw_init_tables(struct ip_fw_chain *ch, int first)
{
- struct radix_node_head *rnh;
+ struct tables_config *tcfg;
- if (tbl->tbl >= V_fw_tables_max)
- return (EINVAL);
- tbl->cnt = 0;
- tbl->type = ch->tabletype[tbl->tbl];
- if ((rnh = ch->tables[tbl->tbl]) != NULL)
- rnh->rnh_walktree(rnh, dump_table_xentry_base, tbl);
- if ((rnh = ch->xtables[tbl->tbl]) != NULL)
- rnh->rnh_walktree(rnh, dump_table_xentry_extended, tbl);
+ /* Allocate pointers */
+ ch->tablestate = malloc(V_fw_tables_max * sizeof(struct table_info),
+ M_IPFW, M_WAITOK | M_ZERO);
+
+ tcfg = malloc(sizeof(struct tables_config), M_IPFW, M_WAITOK | M_ZERO);
+ tcfg->namehash = ipfw_objhash_create(V_fw_tables_max);
+ ch->tblcfg = tcfg;
+
+ ipfw_table_value_init(ch, first);
+ ipfw_table_algo_init(ch);
+
+ IPFW_ADD_OBJ_REWRITER(first, opcodes);
+ IPFW_ADD_SOPT_HANDLER(first, scodes);
return (0);
}
-/* end of file */
+
+
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_table.h b/freebsd/sys/netpfil/ipfw/ip_fw_table.h
new file mode 100644
index 00000000..d6578482
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_table.h
@@ -0,0 +1,234 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IPFW2_TABLE_H
+#define _IPFW2_TABLE_H
+
+/*
+ * Internal constants and data structures used by ipfw tables
+ * not meant to be exported outside the kernel.
+ */
+#ifdef _KERNEL
+
+struct table_algo;
+struct tables_config {
+ struct namedobj_instance *namehash;
+ struct namedobj_instance *valhash;
+ uint32_t val_size;
+ uint32_t algo_count;
+ struct table_algo *algo[256];
+ struct table_algo *def_algo[IPFW_TABLE_MAXTYPE + 1];
+ TAILQ_HEAD(op_state_l,op_state) state_list;
+};
+#define CHAIN_TO_TCFG(chain) ((struct tables_config *)(chain)->tblcfg)
+
+struct table_info {
+ table_lookup_t *lookup; /* Lookup function */
+ void *state; /* Lookup radix/other structure */
+ void *xstate; /* eXtended state */
+ u_long data; /* Hints for given func */
+};
+
+struct table_value;
+struct tentry_info {
+ void *paddr;
+ struct table_value *pvalue;
+ void *ptv; /* Temporary field to hold obj */
+ uint8_t masklen; /* mask length */
+ uint8_t subtype;
+ uint16_t flags; /* record flags */
+ uint32_t value; /* value index */
+};
+#define TEI_FLAGS_UPDATE 0x0001 /* Add or update rec if exists */
+#define TEI_FLAGS_UPDATED 0x0002 /* Entry has been updated */
+#define TEI_FLAGS_COMPAT 0x0004 /* Called from old ABI */
+#define TEI_FLAGS_DONTADD 0x0008 /* Do not create new rec */
+#define TEI_FLAGS_ADDED 0x0010 /* Entry was added */
+#define TEI_FLAGS_DELETED 0x0020 /* Entry was deleted */
+#define TEI_FLAGS_LIMIT 0x0040 /* Limit was hit */
+#define TEI_FLAGS_ERROR 0x0080 /* Unknown request error */
+#define TEI_FLAGS_NOTFOUND 0x0100 /* Entry was not found */
+#define TEI_FLAGS_EXISTS 0x0200 /* Entry already exists */
+
+typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state,
+ struct table_info *ti, char *data, uint8_t tflags);
+typedef void (ta_destroy)(void *ta_state, struct table_info *ti);
+typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+typedef int (ta_add)(void *ta_state, struct table_info *ti,
+ struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+typedef int (ta_del)(void *ta_state, struct table_info *ti,
+ struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+typedef void (ta_flush_entry)(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+
+typedef int (ta_need_modify)(void *ta_state, struct table_info *ti,
+ uint32_t count, uint64_t *pflags);
+typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags);
+typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti,
+ void *ta_buf, uint64_t *pflags);
+typedef void (ta_modify)(void *ta_state, struct table_info *ti,
+ void *ta_buf, uint64_t pflags);
+typedef void (ta_flush_mod)(void *ta_buf);
+
+typedef void (ta_change_ti)(void *ta_state, struct table_info *ti);
+typedef void (ta_print_config)(void *ta_state, struct table_info *ti, char *buf,
+ size_t bufsize);
+
+typedef int ta_foreach_f(void *node, void *arg);
+typedef void ta_foreach(void *ta_state, struct table_info *ti, ta_foreach_f *f,
+ void *arg);
+typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e,
+ ipfw_obj_tentry *tent);
+typedef int ta_find_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent);
+typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti,
+ ipfw_ta_tinfo *tinfo);
+typedef uint32_t ta_get_count(void *ta_state, struct table_info *ti);
+
+struct table_algo {
+ char name[16];
+ uint32_t idx;
+ uint32_t type;
+ uint32_t refcnt;
+ uint32_t flags;
+ uint32_t vlimit;
+ size_t ta_buf_size;
+ ta_init *init;
+ ta_destroy *destroy;
+ ta_prepare_add *prepare_add;
+ ta_prepare_del *prepare_del;
+ ta_add *add;
+ ta_del *del;
+ ta_flush_entry *flush_entry;
+ ta_find_tentry *find_tentry;
+ ta_need_modify *need_modify;
+ ta_prepare_mod *prepare_mod;
+ ta_fill_mod *fill_mod;
+ ta_modify *modify;
+ ta_flush_mod *flush_mod;
+ ta_change_ti *change_ti;
+ ta_foreach *foreach;
+ ta_dump_tentry *dump_tentry;
+ ta_print_config *print_config;
+ ta_dump_tinfo *dump_tinfo;
+ ta_get_count *get_count;
+};
+#define TA_FLAG_DEFAULT 0x01 /* Algo is default for given type */
+#define TA_FLAG_READONLY 0x02 /* Algo does not support modifications*/
+#define TA_FLAG_EXTCOUNTER 0x04 /* Algo has external counter available*/
+
+int ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta,
+ size_t size, int *idx);
+void ipfw_del_table_algo(struct ip_fw_chain *ch, int idx);
+
+void ipfw_table_algo_init(struct ip_fw_chain *chain);
+void ipfw_table_algo_destroy(struct ip_fw_chain *chain);
+
+MALLOC_DECLARE(M_IPFW_TBL);
+/* Exported to support legacy opcodes */
+int add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct tentry_info *tei, uint8_t flags, uint32_t count);
+int del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct tentry_info *tei, uint8_t flags, uint32_t count);
+int flush_table(struct ip_fw_chain *ch, struct tid_info *ti);
+void ipfw_import_table_value_legacy(uint32_t value, struct table_value *v);
+uint32_t ipfw_export_table_value_legacy(struct table_value *v);
+int ipfw_get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd);
+
+/* ipfw_table_value.c functions */
+struct table_config;
+struct tableop_state;
+void ipfw_table_value_init(struct ip_fw_chain *ch, int first);
+void ipfw_table_value_destroy(struct ip_fw_chain *ch, int last);
+int ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts);
+void ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc,
+ struct tentry_info *tei, uint32_t count, int rollback);
+void ipfw_import_table_value_v1(ipfw_table_value *iv);
+void ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *iv);
+void ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc,
+ struct table_algo *ta, void *astate, struct table_info *ti);
+void rollback_table_values(struct tableop_state *ts);
+
+int ipfw_rewrite_table_uidx(struct ip_fw_chain *chain,
+ struct rule_check_info *ci);
+int ipfw_mark_table_kidx(struct ip_fw_chain *chain, struct ip_fw *rule,
+ uint32_t *bmask);
+int ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx,
+ struct sockopt_data *sd);
+void ipfw_unref_rule_tables(struct ip_fw_chain *chain, struct ip_fw *rule);
+struct namedobj_instance *ipfw_get_table_objhash(struct ip_fw_chain *ch);
+
+/* utility functions */
+int ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt,
+ uint32_t new_set);
+void ipfw_swap_tables_sets(struct ip_fw_chain *ch, uint32_t old_set,
+ uint32_t new_set, int mv);
+int ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx,
+ ta_foreach_f f, void *arg);
+
+/* internal functions */
+void tc_ref(struct table_config *tc);
+void tc_unref(struct table_config *tc);
+
+struct op_state;
+typedef void (op_rollback_f)(void *object, struct op_state *state);
+struct op_state {
+ TAILQ_ENTRY(op_state) next; /* chain link */
+ op_rollback_f *func;
+};
+
+struct tableop_state {
+ struct op_state opstate;
+ struct ip_fw_chain *ch;
+ struct table_config *tc;
+ struct table_algo *ta;
+ struct tentry_info *tei;
+ uint32_t count;
+ uint32_t vmask;
+ int vshared;
+ int modified;
+};
+
+void add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts);
+void del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts);
+void rollback_toperation_state(struct ip_fw_chain *ch, void *object);
+
+/* Legacy interfaces */
+int ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti,
+ uint32_t *cnt);
+int ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti,
+ uint32_t *cnt);
+int ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti,
+ ipfw_table *tbl);
+
+
+#endif /* _KERNEL */
+#endif /* _IPFW2_TABLE_H */
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_table_algo.c b/freebsd/sys/netpfil/ipfw/ip_fw_table_algo.c
new file mode 100644
index 00000000..e4c82131
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_table_algo.c
@@ -0,0 +1,4112 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2014 Yandex LLC
+ * Copyright (c) 2014 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Lookup table algorithms.
+ *
+ */
+
+#include <rtems/bsd/local/opt_ipfw.h>
+#include <rtems/bsd/local/opt_inet.h>
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#include <rtems/bsd/local/opt_inet6.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/queue.h>
+#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
+#include <net/radix.h>
+#include <net/route.h>
+#include <net/route_var.h>
+
+#include <netinet/in.h>
+#include <netinet/in_fib.h>
+#include <netinet/ip_var.h> /* struct ipfw_rule_ref */
+#include <netinet/ip_fw.h>
+#include <netinet6/in6_fib.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/ip_fw_table.h>
+
+
+/*
+ * IPFW table lookup algorithms.
+ *
+ * What is needed to add another table algo?
+ *
+ * Algo init:
+ * * struct table_algo has to be filled with:
+ * name: "type:algoname" format, e.g. "addr:radix". Currently
+ * there are the following types: "addr", "iface", "number" and "flow".
+ * type: one of IPFW_TABLE_* types
+ *  flags: one or more TA_FLAG_*
+ * ta_buf_size: size of structure used to store add/del item state.
+ *    Must not exceed TA_BUF_SZ.
+ * callbacks: see below for description.
+ * * ipfw_add_table_algo / ipfw_del_table_algo has to be called
+ *
+ * Callbacks description:
+ *
+ * -init: request to initialize new table instance.
+ * typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state,
+ * struct table_info *ti, char *data, uint8_t tflags);
+ * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success.
+ *
+ * Allocate all structures needed for normal operations.
+ * * Caller may want to parse @data for some algo-specific
+ * options provided by userland.
+ * * Caller may want to save configuration state pointer to @ta_state
+ * * Caller needs to save desired runtime structure pointer(s)
+ * inside @ti fields. Note that it is not correct to save
+ * @ti pointer at this moment. Use -change_ti hook for that.
+ * * Caller has to fill in ti->lookup to appropriate function
+ * pointer.
+ *
+ *
+ *
+ * -destroy: request to destroy table instance.
+ * typedef void (ta_destroy)(void *ta_state, struct table_info *ti);
+ * MANDATORY, unlocked. (M_WAITOK).
+ *
+ * Frees all table entries and all tables structures allocated by -init.
+ *
+ *
+ *
+ * -prepare_add: request to allocate state for adding new entry.
+ * typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei,
+ * void *ta_buf);
+ * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success.
+ *
+ * Allocates state and fills it in with all necessary data (EXCEPT value)
+ * from @tei to minimize operations needed to be done under WLOCK.
+ * "value" field has to be copied to new entry in @add callback.
+ *  Buffer ta_buf of size ta->ta_buf_size may be used to store
+ * allocated state.
+ *
+ *
+ *
+ * -prepare_del: request to set state for deleting existing entry.
+ * typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei,
+ * void *ta_buf);
+ * MANDATORY, locked, UH. (M_NOWAIT). Returns 0 on success.
+ *
+ *  Buffer ta_buf of size ta->ta_buf_size may be used to store
+ * allocated state. Caller should use on-stack ta_buf allocation
+ * instead of doing malloc().
+ *
+ *
+ *
+ * -add: request to insert new entry into runtime/config structures.
+ * typedef int (ta_add)(void *ta_state, struct table_info *ti,
+ * struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+ * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success.
+ *
+ * Insert new entry using previously-allocated state in @ta_buf.
+ * * @tei may have the following flags:
+ * TEI_FLAGS_UPDATE: request to add or update entry.
+ * TEI_FLAGS_DONTADD: request to update (but not add) entry.
+ * * Caller is required to do the following:
+ * copy real entry value from @tei
+ *  entry added: return 0, store 1 in @pnum
+ *  entry updated: return 0, store 0 in @pnum, store the old value in @tei,
+ * add TEI_FLAGS_UPDATED flag to @tei.
+ * entry exists: return EEXIST
+ * entry not found: return ENOENT
+ * other error: return non-zero error code.
+ *
+ *
+ *
+ * -del: request to delete existing entry from runtime/config structures.
+ * typedef int (ta_del)(void *ta_state, struct table_info *ti,
+ * struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+ * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success.
+ *
+ * Delete the entry using the state previously set up in @ta_buf.
+ * * Caller is required to do the following:
+ *  entry deleted: return 0, store 1 in @pnum, store the old value in @tei.
+ * entry not found: return ENOENT
+ * other error: return non-zero error code.
+ *
+ *
+ *
+ * -flush_entry: flush entry state created by -prepare_add / -del / others
+ * typedef void (ta_flush_entry)(struct ip_fw_chain *ch,
+ * struct tentry_info *tei, void *ta_buf);
+ * MANDATORY, may be locked. (M_NOWAIT).
+ *
+ * Delete state allocated by:
+ * -prepare_add (-add returned EEXIST|UPDATED)
+ * -prepare_del (if any)
+ * -del
+ * * Caller is required to handle empty @ta_buf correctly.
+ *
+ *
+ * -find_tentry: finds entry specified by key @tei
+ * typedef int ta_find_tentry(void *ta_state, struct table_info *ti,
+ * ipfw_obj_tentry *tent);
+ * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 on success.
+ *
+ * Finds entry specified by given key.
+ * * Caller is required to do the following:
+ * entry found: returns 0, export entry to @tent
+ * entry not found: returns ENOENT
+ *
+ *
+ * -need_modify: checks if @ti has enough space to hold another @count items.
+ * typedef int (ta_need_modify)(void *ta_state, struct table_info *ti,
+ * uint32_t count, uint64_t *pflags);
+ * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 if it does.
+ *
+ * Checks if given table has enough space to add @count items without
+ * resize. Caller may use @pflags to store desired modification data.
+ *
+ *
+ *
+ * -prepare_mod: allocate structures for table modification.
+ * typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags);
+ * OPTIONAL(need_modify), unlocked. (M_WAITOK). Returns 0 on success.
+ *
+ * Allocate all needed state for table modification. Caller
+ * should use `struct mod_item` to store new state in @ta_buf.
+ * Up to TA_BUF_SZ (128 bytes) can be stored in @ta_buf.
+ *
+ *
+ *
+ * -fill_mod: copy some data to the new state.
+ * typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti,
+ * void *ta_buf, uint64_t *pflags);
+ * OPTIONAL(need_modify), locked (UH). (M_NOWAIT). Returns 0 on success.
+ *
+ * Copy as much data as we can to minimize changes under WLOCK.
+ *  For example, arrays can be merged inside this callback.
+ *
+ *
+ *
+ * -modify: perform final modification.
+ * typedef void (ta_modify)(void *ta_state, struct table_info *ti,
+ * void *ta_buf, uint64_t pflags);
+ * OPTIONAL(need_modify), locked (UH+WLOCK). (M_NOWAIT).
+ *
+ * Performs all changes necessary to switch to new structures.
+ * * Caller should save old pointers to @ta_buf storage.
+ *
+ *
+ *
+ * -flush_mod: flush table modification state.
+ * typedef void (ta_flush_mod)(void *ta_buf);
+ * OPTIONAL(need_modify), unlocked. (M_WAITOK).
+ *
+ * Performs flush for the following:
+ * - prepare_mod (modification was not necessary)
+ * - modify (for the old state)
+ *
+ *
+ *
+ * -change_ti: monitor table info pointer changes
+ * typedef void (ta_change_ti)(void *ta_state, struct table_info *ti);
+ * OPTIONAL, locked (UH). (M_NOWAIT).
+ *
+ *  Called when the @ti pointer changes. Called immediately after -init
+ * to set initial state.
+ *
+ *
+ *
+ * -foreach: calls @f for each table entry
+ * typedef void ta_foreach(void *ta_state, struct table_info *ti,
+ * ta_foreach_f *f, void *arg);
+ * MANDATORY, locked(UH). (M_NOWAIT).
+ *
+ *  Runs the callback with the specified argument for each table entry.
+ * Typically used for dumping table entries.
+ *
+ *
+ *
+ * -dump_tentry: dump table entry in current @tentry format.
+ * typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e,
+ * ipfw_obj_tentry *tent);
+ * MANDATORY, locked(UH). (M_NOWAIT). Returns 0 on success.
+ *
+ * Dumps entry @e to @tent.
+ *
+ *
+ * -print_config: prints custom algorithm options into buffer.
+ * typedef void (ta_print_config)(void *ta_state, struct table_info *ti,
+ * char *buf, size_t bufsize);
+ * OPTIONAL. locked(UH). (M_NOWAIT).
+ *
+ *  Prints custom algorithm options in a format suitable to pass
+ * back to -init callback.
+ *
+ *
+ *
+ * -dump_tinfo: dumps algo-specific info.
+ * typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti,
+ * ipfw_ta_tinfo *tinfo);
+ * OPTIONAL. locked(UH). (M_NOWAIT).
+ *
+ * Dumps options like items size/hash size, etc.
+ */
+
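+/*
+ * Registration sketch: a hypothetical algorithm module following the
+ * description above would fill a struct table_algo with the mandatory
+ * callbacks and register it from its init path.  All "example" names
+ * below are invented for illustration:
+ *
+ *	struct table_algo addr_example = {
+ *		.name		= "addr:example",
+ *		.type		= IPFW_TABLE_ADDR,
+ *		.ta_buf_size	= sizeof(struct ta_buf_example),
+ *		.init		= ta_init_example,
+ *		.destroy	= ta_destroy_example,
+ *		.prepare_add	= ta_prepare_add_example,
+ *		.prepare_del	= ta_prepare_del_example,
+ *		.add		= ta_add_example,
+ *		.del		= ta_del_example,
+ *		.flush_entry	= ta_flush_example_entry,
+ *		.foreach	= ta_foreach_example,
+ *		.dump_tentry	= ta_dump_example_tentry,
+ *	};
+ *
+ *	ipfw_add_table_algo(ch, &addr_example, sizeof(addr_example), &idx);
+ *	...
+ *	ipfw_del_table_algo(ch, idx);
+ */
+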
+MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
+
+/*
+ * Utility structures/functions common to more than one algo
+ */
+
+struct mod_item {
+ void *main_ptr;
+ size_t size;
+ void *main_ptr6;
+ size_t size6;
+};
+
+static int badd(const void *key, void *item, void *base, size_t nmemb,
+ size_t size, int (*compar) (const void *, const void *));
+static int bdel(const void *key, void *base, size_t nmemb, size_t size,
+ int (*compar) (const void *, const void *));
+
+
+/*
+ * ADDR implementation using radix
+ *
+ */
+
+/*
+ * The radix code expects addr and mask to be arrays of bytes,
+ * with the first byte being the length of the array. rn_inithead
+ * is called with the offset in bits of the lookup key within the
+ * array. If we use a sockaddr_in as the underlying type,
+ * sin_len is conveniently located at offset 0, sin_addr is at
+ * offset 4 and normally aligned.
+ * But for portability, let's avoid assumptions and make the code explicit.
+ */
+#define KEY_LEN(v) *((uint8_t *)&(v))
+/*
+ * Do not require radix to compare more than actual IPv4/IPv6 address
+ */
+#define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t))
+#define KEY_LEN_INET6 (offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr))
+
+#define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr))
+#define OFF_LEN_INET6 (8 * offsetof(struct sa_in6, sin6_addr))
+
+struct radix_addr_entry {
+ struct radix_node rn[2];
+ struct sockaddr_in addr;
+ uint32_t value;
+ uint8_t masklen;
+};
+
+struct sa_in6 {
+ uint8_t sin6_len;
+ uint8_t sin6_family;
+ uint8_t pad[2];
+ struct in6_addr sin6_addr;
+};
+
+struct radix_addr_xentry {
+ struct radix_node rn[2];
+ struct sa_in6 addr6;
+ uint32_t value;
+ uint8_t masklen;
+};
+
+struct radix_cfg {
+ struct radix_node_head *head4;
+ struct radix_node_head *head6;
+ size_t count4;
+ size_t count6;
+};
+
+struct ta_buf_radix
+{
+ void *ent_ptr;
+ struct sockaddr *addr_ptr;
+ struct sockaddr *mask_ptr;
+ union {
+ struct {
+ struct sockaddr_in sa;
+ struct sockaddr_in ma;
+ } a4;
+ struct {
+ struct sa_in6 sa;
+ struct sa_in6 ma;
+ } a6;
+ } addr;
+};
+
+static int ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val);
+static int ta_init_radix(struct ip_fw_chain *ch, void **ta_state,
+ struct table_info *ti, char *data, uint8_t tflags);
+static int flush_radix_entry(struct radix_node *rn, void *arg);
+static void ta_destroy_radix(void *ta_state, struct table_info *ti);
+static void ta_dump_radix_tinfo(void *ta_state, struct table_info *ti,
+ ipfw_ta_tinfo *tinfo);
+static int ta_dump_radix_tentry(void *ta_state, struct table_info *ti,
+ void *e, ipfw_obj_tentry *tent);
+static int ta_find_radix_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent);
+static void ta_foreach_radix(void *ta_state, struct table_info *ti,
+ ta_foreach_f *f, void *arg);
+static void tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa,
+ struct sockaddr *ma, int *set_mask);
+static int ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+static int ta_add_radix(void *ta_state, struct table_info *ti,
+ struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+static int ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+static int ta_del_radix(void *ta_state, struct table_info *ti,
+ struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+static int ta_need_modify_radix(void *ta_state, struct table_info *ti,
+ uint32_t count, uint64_t *pflags);
+
+static int
+ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val)
+{
+ struct radix_node_head *rnh;
+
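+	/* keylen selects the family: sizeof(in_addr_t) is IPv4, else IPv6 */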
+ if (keylen == sizeof(in_addr_t)) {
+ struct radix_addr_entry *ent;
+ struct sockaddr_in sa;
+ KEY_LEN(sa) = KEY_LEN_INET;
+ sa.sin_addr.s_addr = *((in_addr_t *)key);
+ rnh = (struct radix_node_head *)ti->state;
+ ent = (struct radix_addr_entry *)(rnh->rnh_matchaddr(&sa, &rnh->rh));
+ if (ent != NULL) {
+ *val = ent->value;
+ return (1);
+ }
+ } else {
+ struct radix_addr_xentry *xent;
+ struct sa_in6 sa6;
+ KEY_LEN(sa6) = KEY_LEN_INET6;
+ memcpy(&sa6.sin6_addr, key, sizeof(struct in6_addr));
+ rnh = (struct radix_node_head *)ti->xstate;
+ xent = (struct radix_addr_xentry *)(rnh->rnh_matchaddr(&sa6, &rnh->rh));
+ if (xent != NULL) {
+ *val = xent->value;
+ return (1);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * New table
+ */
+static int
+ta_init_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
+ char *data, uint8_t tflags)
+{
+ struct radix_cfg *cfg;
+
+ if (!rn_inithead(&ti->state, OFF_LEN_INET))
+ return (ENOMEM);
+ if (!rn_inithead(&ti->xstate, OFF_LEN_INET6)) {
+ rn_detachhead(&ti->state);
+ return (ENOMEM);
+ }
+
+ cfg = malloc(sizeof(struct radix_cfg), M_IPFW, M_WAITOK | M_ZERO);
+
+ *ta_state = cfg;
+ ti->lookup = ta_lookup_radix;
+
+ return (0);
+}
+
+static int
+flush_radix_entry(struct radix_node *rn, void *arg)
+{
+ struct radix_node_head * const rnh = arg;
+ struct radix_addr_entry *ent;
+
+ ent = (struct radix_addr_entry *)
+ rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, &rnh->rh);
+ if (ent != NULL)
+ free(ent, M_IPFW_TBL);
+ return (0);
+}
+
+static void
+ta_destroy_radix(void *ta_state, struct table_info *ti)
+{
+ struct radix_cfg *cfg;
+ struct radix_node_head *rnh;
+
+ cfg = (struct radix_cfg *)ta_state;
+
+ rnh = (struct radix_node_head *)(ti->state);
+ rnh->rnh_walktree(&rnh->rh, flush_radix_entry, rnh);
+ rn_detachhead(&ti->state);
+
+ rnh = (struct radix_node_head *)(ti->xstate);
+ rnh->rnh_walktree(&rnh->rh, flush_radix_entry, rnh);
+ rn_detachhead(&ti->xstate);
+
+ free(cfg, M_IPFW);
+}
+
+/*
+ * Provide algo-specific table info
+ */
+static void
+ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
+{
+ struct radix_cfg *cfg;
+
+ cfg = (struct radix_cfg *)ta_state;
+
+ tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM;
+ tinfo->taclass4 = IPFW_TACLASS_RADIX;
+ tinfo->count4 = cfg->count4;
+ tinfo->itemsize4 = sizeof(struct radix_addr_entry);
+ tinfo->taclass6 = IPFW_TACLASS_RADIX;
+ tinfo->count6 = cfg->count6;
+ tinfo->itemsize6 = sizeof(struct radix_addr_xentry);
+}
+
+static int
+ta_dump_radix_tentry(void *ta_state, struct table_info *ti, void *e,
+ ipfw_obj_tentry *tent)
+{
+ struct radix_addr_entry *n;
+#ifdef INET6
+ struct radix_addr_xentry *xn;
+#endif
+
+ n = (struct radix_addr_entry *)e;
+
+ /* Guess IPv4/IPv6 radix by sockaddr family */
+ if (n->addr.sin_family == AF_INET) {
+ tent->k.addr.s_addr = n->addr.sin_addr.s_addr;
+ tent->masklen = n->masklen;
+ tent->subtype = AF_INET;
+ tent->v.kidx = n->value;
+#ifdef INET6
+ } else {
+ xn = (struct radix_addr_xentry *)e;
+ memcpy(&tent->k, &xn->addr6.sin6_addr, sizeof(struct in6_addr));
+ tent->masklen = xn->masklen;
+ tent->subtype = AF_INET6;
+ tent->v.kidx = xn->value;
+#endif
+ }
+
+ return (0);
+}
+
+static int
+ta_find_radix_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent)
+{
+ struct radix_node_head *rnh;
+ void *e;
+
+ e = NULL;
+ if (tent->subtype == AF_INET) {
+ struct sockaddr_in sa;
+ KEY_LEN(sa) = KEY_LEN_INET;
+ sa.sin_addr.s_addr = tent->k.addr.s_addr;
+ rnh = (struct radix_node_head *)ti->state;
+ e = rnh->rnh_matchaddr(&sa, &rnh->rh);
+ } else {
+ struct sa_in6 sa6;
+ KEY_LEN(sa6) = KEY_LEN_INET6;
+ memcpy(&sa6.sin6_addr, &tent->k.addr6, sizeof(struct in6_addr));
+ rnh = (struct radix_node_head *)ti->xstate;
+ e = rnh->rnh_matchaddr(&sa6, &rnh->rh);
+ }
+
+ if (e != NULL) {
+ ta_dump_radix_tentry(ta_state, ti, e, tent);
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+static void
+ta_foreach_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f,
+ void *arg)
+{
+ struct radix_node_head *rnh;
+
+ rnh = (struct radix_node_head *)(ti->state);
+ rnh->rnh_walktree(&rnh->rh, (walktree_f_t *)f, arg);
+
+ rnh = (struct radix_node_head *)(ti->xstate);
+ rnh->rnh_walktree(&rnh->rh, (walktree_f_t *)f, arg);
+}
+
+
+#ifdef INET6
+static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask);
+
+static inline void
+ipv6_writemask(struct in6_addr *addr6, uint8_t mask)
+{
+ uint32_t *cp;
+
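+	/*
+	 * Fill whole 32-bit words first, then the partial word, e.g.
+	 * mask = 68 yields ff:ff:ff:ff:ff:ff:ff:ff:f0:00:00:00 in the
+	 * first twelve bytes.
+	 */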
+ for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32)
+ *cp++ = 0xFFFFFFFF;
+ if (mask > 0)
+ *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0);
+}
+#endif
+
+static void
+tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa,
+ struct sockaddr *ma, int *set_mask)
+{
+ int mlen;
+#ifdef INET
+ struct sockaddr_in *addr, *mask;
+#endif
+#ifdef INET6
+ struct sa_in6 *addr6, *mask6;
+#endif
+ in_addr_t a4;
+
+ mlen = tei->masklen;
+
+ if (tei->subtype == AF_INET) {
+#ifdef INET
+ addr = (struct sockaddr_in *)sa;
+ mask = (struct sockaddr_in *)ma;
+ /* Set 'total' structure length */
+ KEY_LEN(*addr) = KEY_LEN_INET;
+ KEY_LEN(*mask) = KEY_LEN_INET;
+ addr->sin_family = AF_INET;
+ mask->sin_addr.s_addr =
+ htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
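+		/* e.g. mlen = 24 yields mask 255.255.255.0 in network order */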
+ a4 = *((in_addr_t *)tei->paddr);
+ addr->sin_addr.s_addr = a4 & mask->sin_addr.s_addr;
+ if (mlen != 32)
+ *set_mask = 1;
+ else
+ *set_mask = 0;
+#endif
+#ifdef INET6
+ } else if (tei->subtype == AF_INET6) {
+ /* IPv6 case */
+ addr6 = (struct sa_in6 *)sa;
+ mask6 = (struct sa_in6 *)ma;
+ /* Set 'total' structure length */
+ KEY_LEN(*addr6) = KEY_LEN_INET6;
+ KEY_LEN(*mask6) = KEY_LEN_INET6;
+ addr6->sin6_family = AF_INET6;
+ ipv6_writemask(&mask6->sin6_addr, mlen);
+ memcpy(&addr6->sin6_addr, tei->paddr, sizeof(struct in6_addr));
+ APPLY_MASK(&addr6->sin6_addr, &mask6->sin6_addr);
+ if (mlen != 128)
+ *set_mask = 1;
+ else
+ *set_mask = 0;
+#endif
+ }
+}
+
+static int
+ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_radix *tb;
+ struct radix_addr_entry *ent;
+#ifdef INET6
+ struct radix_addr_xentry *xent;
+#endif
+ struct sockaddr *addr, *mask;
+ int mlen, set_mask;
+
+ tb = (struct ta_buf_radix *)ta_buf;
+
+ mlen = tei->masklen;
+ set_mask = 0;
+
+ if (tei->subtype == AF_INET) {
+#ifdef INET
+ if (mlen > 32)
+ return (EINVAL);
+ ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO);
+ ent->masklen = mlen;
+
+ addr = (struct sockaddr *)&ent->addr;
+ mask = (struct sockaddr *)&tb->addr.a4.ma;
+ tb->ent_ptr = ent;
+#endif
+#ifdef INET6
+ } else if (tei->subtype == AF_INET6) {
+ /* IPv6 case */
+ if (mlen > 128)
+ return (EINVAL);
+ xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO);
+ xent->masklen = mlen;
+
+ addr = (struct sockaddr *)&xent->addr6;
+ mask = (struct sockaddr *)&tb->addr.a6.ma;
+ tb->ent_ptr = xent;
+#endif
+ } else {
+ /* Unknown CIDR type */
+ return (EINVAL);
+ }
+
+ tei_to_sockaddr_ent(tei, addr, mask, &set_mask);
+ /* Set pointers */
+ tb->addr_ptr = addr;
+ if (set_mask != 0)
+ tb->mask_ptr = mask;
+
+ return (0);
+}
+
+static int
+ta_add_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei,
+ void *ta_buf, uint32_t *pnum)
+{
+ struct radix_cfg *cfg;
+ struct radix_node_head *rnh;
+ struct radix_node *rn;
+ struct ta_buf_radix *tb;
+ uint32_t *old_value, value;
+
+ cfg = (struct radix_cfg *)ta_state;
+ tb = (struct ta_buf_radix *)ta_buf;
+
+ /* Save current entry value from @tei */
+ if (tei->subtype == AF_INET) {
+ rnh = ti->state;
+ ((struct radix_addr_entry *)tb->ent_ptr)->value = tei->value;
+ } else {
+ rnh = ti->xstate;
+ ((struct radix_addr_xentry *)tb->ent_ptr)->value = tei->value;
+ }
+
+ /* Search for an entry first */
+ rn = rnh->rnh_lookup(tb->addr_ptr, tb->mask_ptr, &rnh->rh);
+ if (rn != NULL) {
+ if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
+ return (EEXIST);
+ /* Record already exists. Update value if we're asked to */
+ if (tei->subtype == AF_INET)
+ old_value = &((struct radix_addr_entry *)rn)->value;
+ else
+ old_value = &((struct radix_addr_xentry *)rn)->value;
+
+ value = *old_value;
+ *old_value = tei->value;
+ tei->value = value;
+
+ /* Indicate that update has happened instead of addition */
+ tei->flags |= TEI_FLAGS_UPDATED;
+ *pnum = 0;
+
+ return (0);
+ }
+
+ if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
+ return (EFBIG);
+
+ rn = rnh->rnh_addaddr(tb->addr_ptr, tb->mask_ptr, &rnh->rh,tb->ent_ptr);
+ if (rn == NULL) {
+ /* Unknown error */
+ return (EINVAL);
+ }
+
+ if (tei->subtype == AF_INET)
+ cfg->count4++;
+ else
+ cfg->count6++;
+ tb->ent_ptr = NULL;
+ *pnum = 1;
+
+ return (0);
+}
+
+static int
+ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_radix *tb;
+ struct sockaddr *addr, *mask;
+ int mlen, set_mask;
+
+ tb = (struct ta_buf_radix *)ta_buf;
+
+ mlen = tei->masklen;
+ set_mask = 0;
+
+ if (tei->subtype == AF_INET) {
+ if (mlen > 32)
+ return (EINVAL);
+
+ addr = (struct sockaddr *)&tb->addr.a4.sa;
+ mask = (struct sockaddr *)&tb->addr.a4.ma;
+#ifdef INET6
+ } else if (tei->subtype == AF_INET6) {
+ if (mlen > 128)
+ return (EINVAL);
+
+ addr = (struct sockaddr *)&tb->addr.a6.sa;
+ mask = (struct sockaddr *)&tb->addr.a6.ma;
+#endif
+ } else
+ return (EINVAL);
+
+ tei_to_sockaddr_ent(tei, addr, mask, &set_mask);
+ tb->addr_ptr = addr;
+ if (set_mask != 0)
+ tb->mask_ptr = mask;
+
+ return (0);
+}
+
+static int
+ta_del_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei,
+ void *ta_buf, uint32_t *pnum)
+{
+ struct radix_cfg *cfg;
+ struct radix_node_head *rnh;
+ struct radix_node *rn;
+ struct ta_buf_radix *tb;
+
+ cfg = (struct radix_cfg *)ta_state;
+ tb = (struct ta_buf_radix *)ta_buf;
+
+ if (tei->subtype == AF_INET)
+ rnh = ti->state;
+ else
+ rnh = ti->xstate;
+
+ rn = rnh->rnh_deladdr(tb->addr_ptr, tb->mask_ptr, &rnh->rh);
+
+ if (rn == NULL)
+ return (ENOENT);
+
+ /* Save entry value to @tei */
+ if (tei->subtype == AF_INET)
+ tei->value = ((struct radix_addr_entry *)rn)->value;
+ else
+ tei->value = ((struct radix_addr_xentry *)rn)->value;
+
+ tb->ent_ptr = rn;
+
+ if (tei->subtype == AF_INET)
+ cfg->count4--;
+ else
+ cfg->count6--;
+ *pnum = 1;
+
+ return (0);
+}
+
+static void
+ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_radix *tb;
+
+ tb = (struct ta_buf_radix *)ta_buf;
+
+ if (tb->ent_ptr != NULL)
+ free(tb->ent_ptr, M_IPFW_TBL);
+}
+
+static int
+ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count,
+ uint64_t *pflags)
+{
+
+ /*
+	 * The radix code does not require additional memory allocations
+	 * other than the nodes themselves. Adding new masks to the tree does,
+	 * but we have no API to call for that (and we don't know which
+	 * sizes we would need).
+ */
+ return (0);
+}
+
+struct table_algo addr_radix = {
+ .name = "addr:radix",
+ .type = IPFW_TABLE_ADDR,
+ .flags = TA_FLAG_DEFAULT,
+ .ta_buf_size = sizeof(struct ta_buf_radix),
+ .init = ta_init_radix,
+ .destroy = ta_destroy_radix,
+ .prepare_add = ta_prepare_add_radix,
+ .prepare_del = ta_prepare_del_radix,
+ .add = ta_add_radix,
+ .del = ta_del_radix,
+ .flush_entry = ta_flush_radix_entry,
+ .foreach = ta_foreach_radix,
+ .dump_tentry = ta_dump_radix_tentry,
+ .find_tentry = ta_find_radix_tentry,
+ .dump_tinfo = ta_dump_radix_tinfo,
+ .need_modify = ta_need_modify_radix,
+};
+
+
+/*
+ * addr:hash cmds
+ *
+ *
+ * ti->data:
+ * [inv.mask4][inv.mask6][log2hsize4][log2hsize6]
+ *	[         8][         8][          8][          8]
+ *
+ * inv.mask4: 32 - mask
+ * inv.mask6:
+ * 1) _slow lookup: mask
+ * 2) _aligned: (128 - mask) / 8
+ * 3) _64: 8
+ *
+ *
+ * pflags:
+ * [v4=1/v6=0][hsize]
+ * [ 32][ 32]
+ */
+
+struct chashentry;
+
+SLIST_HEAD(chashbhead, chashentry);
+
+struct chash_cfg {
+ struct chashbhead *head4;
+ struct chashbhead *head6;
+ size_t size4;
+ size_t size6;
+ size_t items4;
+ size_t items6;
+ uint8_t mask4;
+ uint8_t mask6;
+};
+
+struct chashentry {
+ SLIST_ENTRY(chashentry) next;
+ uint32_t value;
+ uint32_t type;
+ union {
+ uint32_t a4; /* Host format */
+ struct in6_addr a6; /* Network format */
+ } a;
+};
+
+struct ta_buf_chash
+{
+ void *ent_ptr;
+ struct chashentry ent;
+};
+
+#ifdef INET
+static __inline uint32_t hash_ip(uint32_t addr, int hsize);
+#endif
+#ifdef INET6
+static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize);
+static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize);
+static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key,
+ int mask, int hsize);
+static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key, int mask,
+ int hsize);
+#endif
+static int ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val);
+static int ta_lookup_chash_aligned(struct table_info *ti, void *key,
+ uint32_t keylen, uint32_t *val);
+static int ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val);
+static int chash_parse_opts(struct chash_cfg *cfg, char *data);
+static void ta_print_chash_config(void *ta_state, struct table_info *ti,
+ char *buf, size_t bufsize);
+static int ta_log2(uint32_t v);
+static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state,
+ struct table_info *ti, char *data, uint8_t tflags);
+static void ta_destroy_chash(void *ta_state, struct table_info *ti);
+static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti,
+ ipfw_ta_tinfo *tinfo);
+static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti,
+ void *e, ipfw_obj_tentry *tent);
+static uint32_t hash_ent(struct chashentry *ent, int af, int mlen,
+ uint32_t size);
+static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent);
+static int ta_find_chash_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent);
+static void ta_foreach_chash(void *ta_state, struct table_info *ti,
+ ta_foreach_f *f, void *arg);
+static int ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+static int ta_add_chash(void *ta_state, struct table_info *ti,
+ struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+static int ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+static int ta_del_chash(void *ta_state, struct table_info *ti,
+ struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+static void ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+static int ta_need_modify_chash(void *ta_state, struct table_info *ti,
+ uint32_t count, uint64_t *pflags);
+static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags);
+static int ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf,
+ uint64_t *pflags);
+static void ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf,
+ uint64_t pflags);
+static void ta_flush_mod_chash(void *ta_buf);
+
+
+#ifdef INET
+static __inline uint32_t
+hash_ip(uint32_t addr, int hsize)
+{
+
+ return (addr % (hsize - 1));
+}
+#endif
+
+#ifdef INET6
+static __inline uint32_t
+hash_ip6(struct in6_addr *addr6, int hsize)
+{
+ uint32_t i;
+
+ i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1] ^
+ addr6->s6_addr32[2] ^ addr6->s6_addr32[3];
+
+ return (i % (hsize - 1));
+}
+
+
+static __inline uint16_t
+hash_ip64(struct in6_addr *addr6, int hsize)
+{
+ uint32_t i;
+
+ i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1];
+
+ return (i % (hsize - 1));
+}
+
+
+static __inline uint32_t
+hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize)
+{
+ struct in6_addr mask6;
+
+ ipv6_writemask(&mask6, mask);
+ memcpy(addr6, key, sizeof(struct in6_addr));
+ APPLY_MASK(addr6, &mask6);
+ return (hash_ip6(addr6, hsize));
+}
+
+static __inline uint32_t
+hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize)
+{
+ uint64_t *paddr;
+
+ paddr = (uint64_t *)addr6;
+ *paddr = 0;
+ *(paddr + 1) = 0;
+ memcpy(addr6, key, mask);
+ return (hash_ip6(addr6, hsize));
+}
+#endif
+
+static int
+ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val)
+{
+ struct chashbhead *head;
+ struct chashentry *ent;
+ uint16_t hash, hsize;
+ uint8_t imask;
+
+ if (keylen == sizeof(in_addr_t)) {
+#ifdef INET
+ head = (struct chashbhead *)ti->state;
+ imask = ti->data >> 24;
+ hsize = 1 << ((ti->data & 0xFFFF) >> 8);
+ uint32_t a;
+ a = ntohl(*((in_addr_t *)key));
+ a = a >> imask;
+ hash = hash_ip(a, hsize);
+ SLIST_FOREACH(ent, &head[hash], next) {
+ if (ent->a.a4 == a) {
+ *val = ent->value;
+ return (1);
+ }
+ }
+#endif
+ } else {
+#ifdef INET6
+ /* IPv6: worst scenario: non-round mask */
+ struct in6_addr addr6;
+ head = (struct chashbhead *)ti->xstate;
+ imask = (ti->data & 0xFF0000) >> 16;
+ hsize = 1 << (ti->data & 0xFF);
+ hash = hash_ip6_slow(&addr6, key, imask, hsize);
+ SLIST_FOREACH(ent, &head[hash], next) {
+ if (memcmp(&ent->a.a6, &addr6, 16) == 0) {
+ *val = ent->value;
+ return (1);
+ }
+ }
+#endif
+ }
+
+ return (0);
+}
+
+static int
+ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val)
+{
+ struct chashbhead *head;
+ struct chashentry *ent;
+ uint16_t hash, hsize;
+ uint8_t imask;
+
+ if (keylen == sizeof(in_addr_t)) {
+#ifdef INET
+ head = (struct chashbhead *)ti->state;
+ imask = ti->data >> 24;
+ hsize = 1 << ((ti->data & 0xFFFF) >> 8);
+ uint32_t a;
+ a = ntohl(*((in_addr_t *)key));
+ a = a >> imask;
+ hash = hash_ip(a, hsize);
+ SLIST_FOREACH(ent, &head[hash], next) {
+ if (ent->a.a4 == a) {
+ *val = ent->value;
+ return (1);
+ }
+ }
+#endif
+ } else {
+#ifdef INET6
+ /* IPv6: aligned to 8bit mask */
+ struct in6_addr addr6;
+ uint64_t *paddr, *ptmp;
+ head = (struct chashbhead *)ti->xstate;
+ imask = (ti->data & 0xFF0000) >> 16;
+ hsize = 1 << (ti->data & 0xFF);
+
+ hash = hash_ip6_al(&addr6, key, imask, hsize);
+ paddr = (uint64_t *)&addr6;
+ SLIST_FOREACH(ent, &head[hash], next) {
+ ptmp = (uint64_t *)&ent->a.a6;
+ if (paddr[0] == ptmp[0] && paddr[1] == ptmp[1]) {
+ *val = ent->value;
+ return (1);
+ }
+ }
+#endif
+ }
+
+ return (0);
+}
+
+static int
+ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val)
+{
+ struct chashbhead *head;
+ struct chashentry *ent;
+ uint16_t hash, hsize;
+ uint8_t imask;
+
+ if (keylen == sizeof(in_addr_t)) {
+#ifdef INET
+ head = (struct chashbhead *)ti->state;
+ imask = ti->data >> 24;
+ hsize = 1 << ((ti->data & 0xFFFF) >> 8);
+ uint32_t a;
+ a = ntohl(*((in_addr_t *)key));
+ a = a >> imask;
+ hash = hash_ip(a, hsize);
+ SLIST_FOREACH(ent, &head[hash], next) {
+ if (ent->a.a4 == a) {
+ *val = ent->value;
+ return (1);
+ }
+ }
+#endif
+ } else {
+#ifdef INET6
+ /* IPv6: /64 */
+ uint64_t a6, *paddr;
+ head = (struct chashbhead *)ti->xstate;
+ paddr = (uint64_t *)key;
+ hsize = 1 << (ti->data & 0xFF);
+ a6 = *paddr;
+ hash = hash_ip64((struct in6_addr *)key, hsize);
+ SLIST_FOREACH(ent, &head[hash], next) {
+ paddr = (uint64_t *)&ent->a.a6;
+ if (a6 == *paddr) {
+ *val = ent->value;
+ return (1);
+ }
+ }
+#endif
+ }
+
+ return (0);
+}
+
+static int
+chash_parse_opts(struct chash_cfg *cfg, char *data)
+{
+ char *pdel, *pend, *s;
+ int mask4, mask6;
+
+ mask4 = cfg->mask4;
+ mask6 = cfg->mask6;
+
+ if (data == NULL)
+ return (0);
+ if ((pdel = strchr(data, ' ')) == NULL)
+ return (0);
+ while (*pdel == ' ')
+ pdel++;
+ if (strncmp(pdel, "masks=", 6) != 0)
+ return (EINVAL);
+ if ((s = strchr(pdel, ' ')) != NULL)
+ *s++ = '\0';
+
+ pdel += 6;
+ /* Need /XX[,/YY] */
+ if (*pdel++ != '/')
+ return (EINVAL);
+ mask4 = strtol(pdel, &pend, 10);
+ if (*pend == ',') {
+ /* ,/YY */
+ pdel = pend + 1;
+ if (*pdel++ != '/')
+ return (EINVAL);
+ mask6 = strtol(pdel, &pend, 10);
+ if (*pend != '\0')
+ return (EINVAL);
+ } else if (*pend != '\0')
+ return (EINVAL);
+
+ if (mask4 < 0 || mask4 > 32 || mask6 < 0 || mask6 > 128)
+ return (EINVAL);
+
+ cfg->mask4 = mask4;
+ cfg->mask6 = mask6;
+
+ return (0);
+}
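+
+/*
+ * For example, data of the form "addr:hash masks=/24,/64" makes the
+ * parser above set cfg->mask4 = 24 and cfg->mask6 = 64, while
+ * "addr:hash masks=/24" changes only cfg->mask4 and leaves cfg->mask6
+ * at its previous value.
+ */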
+
+static void
+ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf,
+ size_t bufsize)
+{
+ struct chash_cfg *cfg;
+
+ cfg = (struct chash_cfg *)ta_state;
+
+ if (cfg->mask4 != 32 || cfg->mask6 != 128)
+ snprintf(buf, bufsize, "%s masks=/%d,/%d", "addr:hash",
+ cfg->mask4, cfg->mask6);
+ else
+ snprintf(buf, bufsize, "%s", "addr:hash");
+}
+
+static int
+ta_log2(uint32_t v)
+{
+ uint32_t r;
+
+ r = 0;
+ while (v >>= 1)
+ r++;
+
+ return (r);
+}
+
+/*
+ * New table.
+ * We assume 'data' to be either NULL or the following format:
+ * 'addr:hash [masks=/32[,/128]]'
+ */
+static int
+ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
+ char *data, uint8_t tflags)
+{
+ int error, i;
+ uint32_t hsize;
+ struct chash_cfg *cfg;
+
+ cfg = malloc(sizeof(struct chash_cfg), M_IPFW, M_WAITOK | M_ZERO);
+
+ cfg->mask4 = 32;
+ cfg->mask6 = 128;
+
+ if ((error = chash_parse_opts(cfg, data)) != 0) {
+ free(cfg, M_IPFW);
+ return (error);
+ }
+
+ cfg->size4 = 128;
+ cfg->size6 = 128;
+
+ cfg->head4 = malloc(sizeof(struct chashbhead) * cfg->size4, M_IPFW,
+ M_WAITOK | M_ZERO);
+ cfg->head6 = malloc(sizeof(struct chashbhead) * cfg->size6, M_IPFW,
+ M_WAITOK | M_ZERO);
+ for (i = 0; i < cfg->size4; i++)
+ SLIST_INIT(&cfg->head4[i]);
+ for (i = 0; i < cfg->size6; i++)
+ SLIST_INIT(&cfg->head6[i]);
+
+
+ *ta_state = cfg;
+ ti->state = cfg->head4;
+ ti->xstate = cfg->head6;
+
+ /* Store data depending on v6 mask length */
+ hsize = ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6);
+ if (cfg->mask6 == 64) {
+ ti->data = (32 - cfg->mask4) << 24 | (128 - cfg->mask6) << 16|
+ hsize;
+ ti->lookup = ta_lookup_chash_64;
+ } else if ((cfg->mask6 % 8) == 0) {
+ ti->data = (32 - cfg->mask4) << 24 |
+ cfg->mask6 << 13 | hsize;
+ ti->lookup = ta_lookup_chash_aligned;
+ } else {
+ /* don't do that! */
+ ti->data = (32 - cfg->mask4) << 24 |
+ cfg->mask6 << 16 | hsize;
+ ti->lookup = ta_lookup_chash_slow;
+ }
+
+ return (0);
+}
+
+static void
+ta_destroy_chash(void *ta_state, struct table_info *ti)
+{
+ struct chash_cfg *cfg;
+ struct chashentry *ent, *ent_next;
+ int i;
+
+ cfg = (struct chash_cfg *)ta_state;
+
+ for (i = 0; i < cfg->size4; i++)
+ SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next)
+ free(ent, M_IPFW_TBL);
+
+ for (i = 0; i < cfg->size6; i++)
+ SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next)
+ free(ent, M_IPFW_TBL);
+
+ free(cfg->head4, M_IPFW);
+ free(cfg->head6, M_IPFW);
+
+ free(cfg, M_IPFW);
+}
+
+static void
+ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
+{
+ struct chash_cfg *cfg;
+
+ cfg = (struct chash_cfg *)ta_state;
+
+ tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM;
+ tinfo->taclass4 = IPFW_TACLASS_HASH;
+ tinfo->size4 = cfg->size4;
+ tinfo->count4 = cfg->items4;
+ tinfo->itemsize4 = sizeof(struct chashentry);
+ tinfo->taclass6 = IPFW_TACLASS_HASH;
+ tinfo->size6 = cfg->size6;
+ tinfo->count6 = cfg->items6;
+ tinfo->itemsize6 = sizeof(struct chashentry);
+}
+
+static int
+ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e,
+ ipfw_obj_tentry *tent)
+{
+ struct chash_cfg *cfg;
+ struct chashentry *ent;
+
+ cfg = (struct chash_cfg *)ta_state;
+ ent = (struct chashentry *)e;
+
+ if (ent->type == AF_INET) {
+ tent->k.addr.s_addr = htonl(ent->a.a4 << (32 - cfg->mask4));
+ tent->masklen = cfg->mask4;
+ tent->subtype = AF_INET;
+ tent->v.kidx = ent->value;
+#ifdef INET6
+ } else {
+ memcpy(&tent->k, &ent->a.a6, sizeof(struct in6_addr));
+ tent->masklen = cfg->mask6;
+ tent->subtype = AF_INET6;
+ tent->v.kidx = ent->value;
+#endif
+ }
+
+ return (0);
+}
+
+static uint32_t
+hash_ent(struct chashentry *ent, int af, int mlen, uint32_t size)
+{
+ uint32_t hash;
+
+ hash = 0;
+
+ if (af == AF_INET) {
+#ifdef INET
+ hash = hash_ip(ent->a.a4, size);
+#endif
+ } else {
+#ifdef INET6
+ if (mlen == 64)
+ hash = hash_ip64(&ent->a.a6, size);
+ else
+ hash = hash_ip6(&ent->a.a6, size);
+#endif
+ }
+
+ return (hash);
+}
+
+static int
+tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent)
+{
+ int mlen;
+#ifdef INET6
+ struct in6_addr mask6;
+#endif
+
+
+ mlen = tei->masklen;
+
+ if (tei->subtype == AF_INET) {
+#ifdef INET
+ if (mlen > 32)
+ return (EINVAL);
+ ent->type = AF_INET;
+
+ /* Calculate masked address */
+ ent->a.a4 = ntohl(*((in_addr_t *)tei->paddr)) >> (32 - mlen);
+#endif
+#ifdef INET6
+ } else if (tei->subtype == AF_INET6) {
+ /* IPv6 case */
+ if (mlen > 128)
+ return (EINVAL);
+ ent->type = AF_INET6;
+
+ ipv6_writemask(&mask6, mlen);
+ memcpy(&ent->a.a6, tei->paddr, sizeof(struct in6_addr));
+ APPLY_MASK(&ent->a.a6, &mask6);
+#endif
+ } else {
+ /* Unknown CIDR type */
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static int
+ta_find_chash_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent)
+{
+ struct chash_cfg *cfg;
+ struct chashbhead *head;
+ struct chashentry ent, *tmp;
+ struct tentry_info tei;
+ int error;
+ uint32_t hash;
+
+ cfg = (struct chash_cfg *)ta_state;
+
+ memset(&ent, 0, sizeof(ent));
+ memset(&tei, 0, sizeof(tei));
+
+ if (tent->subtype == AF_INET) {
+ tei.paddr = &tent->k.addr;
+ tei.masklen = cfg->mask4;
+ tei.subtype = AF_INET;
+
+ if ((error = tei_to_chash_ent(&tei, &ent)) != 0)
+ return (error);
+
+ head = cfg->head4;
+ hash = hash_ent(&ent, AF_INET, cfg->mask4, cfg->size4);
+ /* Check for existence */
+ SLIST_FOREACH(tmp, &head[hash], next) {
+ if (tmp->a.a4 != ent.a.a4)
+ continue;
+
+ ta_dump_chash_tentry(ta_state, ti, tmp, tent);
+ return (0);
+ }
+ } else {
+ tei.paddr = &tent->k.addr6;
+ tei.masklen = cfg->mask6;
+ tei.subtype = AF_INET6;
+
+ if ((error = tei_to_chash_ent(&tei, &ent)) != 0)
+ return (error);
+
+ head = cfg->head6;
+ hash = hash_ent(&ent, AF_INET6, cfg->mask6, cfg->size6);
+ /* Check for existence */
+ SLIST_FOREACH(tmp, &head[hash], next) {
+ if (memcmp(&tmp->a.a6, &ent.a.a6, 16) != 0)
+ continue;
+ ta_dump_chash_tentry(ta_state, ti, tmp, tent);
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+static void
+ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f,
+ void *arg)
+{
+ struct chash_cfg *cfg;
+ struct chashentry *ent, *ent_next;
+ int i;
+
+ cfg = (struct chash_cfg *)ta_state;
+
+ for (i = 0; i < cfg->size4; i++)
+ SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next)
+ f(ent, arg);
+
+ for (i = 0; i < cfg->size6; i++)
+ SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next)
+ f(ent, arg);
+}
+
+static int
+ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_chash *tb;
+ struct chashentry *ent;
+ int error;
+
+ tb = (struct ta_buf_chash *)ta_buf;
+
+ ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO);
+
+ error = tei_to_chash_ent(tei, ent);
+ if (error != 0) {
+ free(ent, M_IPFW_TBL);
+ return (error);
+ }
+ tb->ent_ptr = ent;
+
+ return (0);
+}
+
+static int
+ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
+ void *ta_buf, uint32_t *pnum)
+{
+ struct chash_cfg *cfg;
+ struct chashbhead *head;
+ struct chashentry *ent, *tmp;
+ struct ta_buf_chash *tb;
+ int exists;
+ uint32_t hash, value;
+
+ cfg = (struct chash_cfg *)ta_state;
+ tb = (struct ta_buf_chash *)ta_buf;
+ ent = (struct chashentry *)tb->ent_ptr;
+ hash = 0;
+ exists = 0;
+
+ /* Read current value from @tei */
+ ent->value = tei->value;
+
+ /* Select the appropriate hash and check for an existing entry */
+ if (tei->subtype == AF_INET) {
+ if (tei->masklen != cfg->mask4)
+ return (EINVAL);
+ head = cfg->head4;
+ hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4);
+
+ /* Check for existence */
+ SLIST_FOREACH(tmp, &head[hash], next) {
+ if (tmp->a.a4 == ent->a.a4) {
+ exists = 1;
+ break;
+ }
+ }
+ } else {
+ if (tei->masklen != cfg->mask6)
+ return (EINVAL);
+ head = cfg->head6;
+ hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6);
+ /* Check for existence */
+ SLIST_FOREACH(tmp, &head[hash], next) {
+ if (memcmp(&tmp->a.a6, &ent->a.a6, 16) == 0) {
+ exists = 1;
+ break;
+ }
+ }
+ }
+
+ if (exists == 1) {
+ if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
+ return (EEXIST);
+ /* Record already exists. Update value if we're asked to */
+ value = tmp->value;
+ tmp->value = tei->value;
+ tei->value = value;
+ /* Indicate that update has happened instead of addition */
+ tei->flags |= TEI_FLAGS_UPDATED;
+ *pnum = 0;
+ } else {
+ if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
+ return (EFBIG);
+ SLIST_INSERT_HEAD(&head[hash], ent, next);
+ tb->ent_ptr = NULL;
+ *pnum = 1;
+
+ /* Update counters */
+ if (tei->subtype == AF_INET)
+ cfg->items4++;
+ else
+ cfg->items6++;
+ }
+
+ return (0);
+}
+
+static int
+ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_chash *tb;
+
+ tb = (struct ta_buf_chash *)ta_buf;
+
+ return (tei_to_chash_ent(tei, &tb->ent));
+}
+
+static int
+ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
+ void *ta_buf, uint32_t *pnum)
+{
+ struct chash_cfg *cfg;
+ struct chashbhead *head;
+ struct chashentry *tmp, *tmp_next, *ent;
+ struct ta_buf_chash *tb;
+ uint32_t hash;
+
+ cfg = (struct chash_cfg *)ta_state;
+ tb = (struct ta_buf_chash *)ta_buf;
+ ent = &tb->ent;
+
+ if (tei->subtype == AF_INET) {
+ if (tei->masklen != cfg->mask4)
+ return (EINVAL);
+ head = cfg->head4;
+ hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4);
+
+ SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) {
+ if (tmp->a.a4 != ent->a.a4)
+ continue;
+
+ SLIST_REMOVE(&head[hash], tmp, chashentry, next);
+ cfg->items4--;
+ tb->ent_ptr = tmp;
+ tei->value = tmp->value;
+ *pnum = 1;
+ return (0);
+ }
+ } else {
+ if (tei->masklen != cfg->mask6)
+ return (EINVAL);
+ head = cfg->head6;
+ hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6);
+ SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) {
+ if (memcmp(&tmp->a.a6, &ent->a.a6, 16) != 0)
+ continue;
+
+ SLIST_REMOVE(&head[hash], tmp, chashentry, next);
+ cfg->items6--;
+ tb->ent_ptr = tmp;
+ tei->value = tmp->value;
+ *pnum = 1;
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+static void
+ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_chash *tb;
+
+ tb = (struct ta_buf_chash *)ta_buf;
+
+ if (tb->ent_ptr != NULL)
+ free(tb->ent_ptr, M_IPFW_TBL);
+}
+
+/*
+ * Hash growing callbacks.
+ */
+
+static int
+ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count,
+ uint64_t *pflags)
+{
+ struct chash_cfg *cfg;
+ uint64_t data;
+
+ /*
+ * Since we don't know the exact number of IPv4/IPv6 records in @count,
+ * ignore any non-zero @count value entirely. Check the current hash
+ * sizes and return the appropriate data.
+ */
+
+ cfg = (struct chash_cfg *)ta_state;
+
+ data = 0;
+ if (cfg->items4 > cfg->size4 && cfg->size4 < 65536)
+ data |= (cfg->size4 * 2) << 16;
+ if (cfg->items6 > cfg->size6 && cfg->size6 < 65536)
+ data |= cfg->size6 * 2;
+
+ if (data != 0) {
+ *pflags = data;
+ return (1);
+ }
+
+ return (0);
+}
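+
+/*
+ * For example (illustrative numbers), with 200 IPv4 items in a
+ * 128-bucket hash and an uncrowded IPv6 hash the check above yields
+ * *pflags = (128 * 2) << 16, i.e. hsize4 = 256 and hsize6 = 0
+ * (leave the IPv6 hash as is), so ta_prepare_mod_chash() allocates
+ * only the new IPv4 bucket array.
+ */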
+
+/*
+ * Allocate new, larger chash.
+ */
+static int
+ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags)
+{
+ struct mod_item *mi;
+ struct chashbhead *head;
+ int i;
+
+ mi = (struct mod_item *)ta_buf;
+
+ memset(mi, 0, sizeof(struct mod_item));
+ mi->size = (*pflags >> 16) & 0xFFFF;
+ mi->size6 = *pflags & 0xFFFF;
+ if (mi->size > 0) {
+ head = malloc(sizeof(struct chashbhead) * mi->size,
+ M_IPFW, M_WAITOK | M_ZERO);
+ for (i = 0; i < mi->size; i++)
+ SLIST_INIT(&head[i]);
+ mi->main_ptr = head;
+ }
+
+ if (mi->size6 > 0) {
+ head = malloc(sizeof(struct chashbhead) * mi->size6,
+ M_IPFW, M_WAITOK | M_ZERO);
+ for (i = 0; i < mi->size6; i++)
+ SLIST_INIT(&head[i]);
+ mi->main_ptr6 = head;
+ }
+
+ return (0);
+}
+
+/*
+ * Copy data from old runtime array to new one.
+ */
+static int
+ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf,
+ uint64_t *pflags)
+{
+
+ /* It is not possible to rehash unless we're holding the WLOCK. */
+ return (0);
+}
+
+/*
+ * Switch old & new arrays.
+ */
+static void
+ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf,
+ uint64_t pflags)
+{
+ struct mod_item *mi;
+ struct chash_cfg *cfg;
+ struct chashbhead *old_head, *new_head;
+ struct chashentry *ent, *ent_next;
+ int af, i, mlen;
+ uint32_t nhash;
+ size_t old_size, new_size;
+
+ mi = (struct mod_item *)ta_buf;
+ cfg = (struct chash_cfg *)ta_state;
+
+ /* Check which hash we need to grow and whether we still need to */
+ if (mi->size > 0 && cfg->size4 < mi->size) {
+ new_head = (struct chashbhead *)mi->main_ptr;
+ new_size = mi->size;
+ old_size = cfg->size4;
+ old_head = ti->state;
+ mlen = cfg->mask4;
+ af = AF_INET;
+
+ for (i = 0; i < old_size; i++) {
+ SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) {
+ nhash = hash_ent(ent, af, mlen, new_size);
+ SLIST_INSERT_HEAD(&new_head[nhash], ent, next);
+ }
+ }
+
+ ti->state = new_head;
+ cfg->head4 = new_head;
+ cfg->size4 = mi->size;
+ mi->main_ptr = old_head;
+ }
+
+ if (mi->size6 > 0 && cfg->size6 < mi->size6) {
+ new_head = (struct chashbhead *)mi->main_ptr6;
+ new_size = mi->size6;
+ old_size = cfg->size6;
+ old_head = ti->xstate;
+ mlen = cfg->mask6;
+ af = AF_INET6;
+
+ for (i = 0; i < old_size; i++) {
+ SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) {
+ nhash = hash_ent(ent, af, mlen, new_size);
+ SLIST_INSERT_HEAD(&new_head[nhash], ent, next);
+ }
+ }
+
+ ti->xstate = new_head;
+ cfg->head6 = new_head;
+ cfg->size6 = mi->size6;
+ mi->main_ptr6 = old_head;
+ }
+
+ /* Update lower 32 bits with new values */
+ ti->data &= 0xFFFFFFFF00000000;
+ ti->data |= ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6);
+}
+
+/*
+ * Free unneeded array.
+ */
+static void
+ta_flush_mod_chash(void *ta_buf)
+{
+ struct mod_item *mi;
+
+ mi = (struct mod_item *)ta_buf;
+ if (mi->main_ptr != NULL)
+ free(mi->main_ptr, M_IPFW);
+ if (mi->main_ptr6 != NULL)
+ free(mi->main_ptr6, M_IPFW);
+}
+
+struct table_algo addr_hash = {
+ .name = "addr:hash",
+ .type = IPFW_TABLE_ADDR,
+ .ta_buf_size = sizeof(struct ta_buf_chash),
+ .init = ta_init_chash,
+ .destroy = ta_destroy_chash,
+ .prepare_add = ta_prepare_add_chash,
+ .prepare_del = ta_prepare_del_chash,
+ .add = ta_add_chash,
+ .del = ta_del_chash,
+ .flush_entry = ta_flush_chash_entry,
+ .foreach = ta_foreach_chash,
+ .dump_tentry = ta_dump_chash_tentry,
+ .find_tentry = ta_find_chash_tentry,
+ .print_config = ta_print_chash_config,
+ .dump_tinfo = ta_dump_chash_tinfo,
+ .need_modify = ta_need_modify_chash,
+ .prepare_mod = ta_prepare_mod_chash,
+ .fill_mod = ta_fill_mod_chash,
+ .modify = ta_modify_chash,
+ .flush_mod = ta_flush_mod_chash,
+};
+
+
+/*
+ * Iface table cmds.
+ *
+ * Implementation:
+ *
+ * Runtime part:
+ * - sorted array of "struct ifidx" pointed to by ti->state.
+ * Array is allocated with rounding up to IFIDX_CHUNK. Only existing
+ * interfaces are stored in the array; however, its allocated size is
+ * sufficient to hold all table records if needed.
+ * - current array size is stored in ti->data
+ *
+ * Table data:
+ * - "struct iftable_cfg" is allocated to store table state (ta_state).
+ * - All table records are stored inside namedobj instance.
+ *
+ */
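+
+/*
+ * Runtime lookup sketch (illustrative): for a lookup key holding
+ * interface index 3, ta_lookup_ifidx() binary-searches the sorted
+ * ifidx array (ti->state, ti->data entries) for kidx == 3 and, if that
+ * interface has been added to the table and currently exists, returns
+ * its 32-bit value.
+ */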
+
+struct ifidx {
+ uint16_t kidx;
+ uint16_t spare;
+ uint32_t value;
+};
+#define DEFAULT_IFIDX_SIZE 64
+
+struct iftable_cfg;
+
+struct ifentry {
+ struct named_object no;
+ struct ipfw_ifc ic;
+ struct iftable_cfg *icfg;
+ uint32_t value;
+ int linked;
+};
+
+struct iftable_cfg {
+ struct namedobj_instance *ii;
+ struct ip_fw_chain *ch;
+ struct table_info *ti;
+ void *main_ptr;
+ size_t size; /* Number of items allocated in array */
+ size_t count; /* Number of all items */
+ size_t used; /* Number of items _active_ now */
+};
+
+struct ta_buf_ifidx
+{
+ struct ifentry *ife;
+ uint32_t value;
+};
+
+int compare_ifidx(const void *k, const void *v);
+static struct ifidx * ifidx_find(struct table_info *ti, void *key);
+static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val);
+static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state,
+ struct table_info *ti, char *data, uint8_t tflags);
+static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti);
+static int destroy_ifidx_locked(struct namedobj_instance *ii,
+ struct named_object *no, void *arg);
+static void ta_destroy_ifidx(void *ta_state, struct table_info *ti);
+static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti,
+ ipfw_ta_tinfo *tinfo);
+static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+static int ta_add_ifidx(void *ta_state, struct table_info *ti,
+ struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+static int ta_del_ifidx(void *ta_state, struct table_info *ti,
+ struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+static void ta_flush_ifidx_entry(struct ip_fw_chain *ch,
+ struct tentry_info *tei, void *ta_buf);
+static void if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex);
+static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti,
+ uint32_t count, uint64_t *pflags);
+static int ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags);
+static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti,
+ void *ta_buf, uint64_t *pflags);
+static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf,
+ uint64_t pflags);
+static void ta_flush_mod_ifidx(void *ta_buf);
+static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e,
+ ipfw_obj_tentry *tent);
+static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent);
+static int foreach_ifidx(struct namedobj_instance *ii, struct named_object *no,
+ void *arg);
+static void ta_foreach_ifidx(void *ta_state, struct table_info *ti,
+ ta_foreach_f *f, void *arg);
+
+int
+compare_ifidx(const void *k, const void *v)
+{
+ const struct ifidx *ifidx;
+ uint16_t key;
+
+ key = *((const uint16_t *)k);
+ ifidx = (const struct ifidx *)v;
+
+ if (key < ifidx->kidx)
+ return (-1);
+ else if (key > ifidx->kidx)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Adds item @item with key @key into ascending-sorted array @base.
+ * Assumes @base has enough additional storage.
+ *
+ * Returns 1 on success, 0 on duplicate key.
+ */
+static int
+badd(const void *key, void *item, void *base, size_t nmemb,
+ size_t size, int (*compar) (const void *, const void *))
+{
+ int min, max, mid, shift, res;
+ caddr_t paddr;
+
+ if (nmemb == 0) {
+ memcpy(base, item, size);
+ return (1);
+ }
+
+ /* Binary search */
+ min = 0;
+ max = nmemb - 1;
+ mid = 0;
+ while (min <= max) {
+ mid = (min + max) / 2;
+ res = compar(key, (const void *)((caddr_t)base + mid * size));
+ if (res == 0)
+ return (0);
+
+ if (res > 0)
+ min = mid + 1;
+ else
+ max = mid - 1;
+ }
+
+ /* Item not found. */
+ res = compar(key, (const void *)((caddr_t)base + mid * size));
+ if (res > 0)
+ shift = mid + 1;
+ else
+ shift = mid;
+
+ paddr = (caddr_t)base + shift * size;
+ if (nmemb > shift)
+ memmove(paddr + size, paddr, (nmemb - shift) * size);
+
+ memcpy(paddr, item, size);
+
+ return (1);
+}
+
+/*
+ * Deletes item with key @key from ascending-sorted array @base.
+ *
+ * Returns 1 on success, 0 for non-existent key.
+ */
+static int
+bdel(const void *key, void *base, size_t nmemb, size_t size,
+ int (*compar) (const void *, const void *))
+{
+ caddr_t item;
+ size_t sz;
+
+ item = (caddr_t)bsearch(key, base, nmemb, size, compar);
+
+ if (item == NULL)
+ return (0);
+
+ sz = (caddr_t)base + nmemb * size - item;
+
+ if (sz > 0)
+ memmove(item, item + size, sz);
+
+ return (1);
+}
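+
+/*
+ * Usage sketch (hypothetical values): given a sorted struct ifidx
+ * array holding kidx 2 and 5 and a uint16_t key of 3, badd() shifts
+ * the kidx=5 slot right, stores the new item in the middle and
+ * returns 1 (the caller must have room for one more element); a later
+ * bdel() with key 3 compacts the array back to two entries and
+ * returns 1, while an unknown key makes bdel() return 0.
+ */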
+
+static struct ifidx *
+ifidx_find(struct table_info *ti, void *key)
+{
+ struct ifidx *ifi;
+
+ ifi = bsearch(key, ti->state, ti->data, sizeof(struct ifidx),
+ compare_ifidx);
+
+ return (ifi);
+}
+
+static int
+ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val)
+{
+ struct ifidx *ifi;
+
+ ifi = ifidx_find(ti, key);
+
+ if (ifi != NULL) {
+ *val = ifi->value;
+ return (1);
+ }
+
+ return (0);
+}
+
+static int
+ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
+ char *data, uint8_t tflags)
+{
+ struct iftable_cfg *icfg;
+
+ icfg = malloc(sizeof(struct iftable_cfg), M_IPFW, M_WAITOK | M_ZERO);
+
+ icfg->ii = ipfw_objhash_create(DEFAULT_IFIDX_SIZE);
+ icfg->size = DEFAULT_IFIDX_SIZE;
+ icfg->main_ptr = malloc(sizeof(struct ifidx) * icfg->size, M_IPFW,
+ M_WAITOK | M_ZERO);
+ icfg->ch = ch;
+
+ *ta_state = icfg;
+ ti->state = icfg->main_ptr;
+ ti->lookup = ta_lookup_ifidx;
+
+ return (0);
+}
+
+/*
+ * Handle tableinfo @ti pointer change (on table array resize).
+ */
+static void
+ta_change_ti_ifidx(void *ta_state, struct table_info *ti)
+{
+ struct iftable_cfg *icfg;
+
+ icfg = (struct iftable_cfg *)ta_state;
+ icfg->ti = ti;
+}
+
+static int
+destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no,
+ void *arg)
+{
+ struct ifentry *ife;
+ struct ip_fw_chain *ch;
+
+ ch = (struct ip_fw_chain *)arg;
+ ife = (struct ifentry *)no;
+
+ ipfw_iface_del_notify(ch, &ife->ic);
+ ipfw_iface_unref(ch, &ife->ic);
+ free(ife, M_IPFW_TBL);
+ return (0);
+}
+
+
+/*
+ * Destroys table @ti
+ */
+static void
+ta_destroy_ifidx(void *ta_state, struct table_info *ti)
+{
+ struct iftable_cfg *icfg;
+ struct ip_fw_chain *ch;
+
+ icfg = (struct iftable_cfg *)ta_state;
+ ch = icfg->ch;
+
+ if (icfg->main_ptr != NULL)
+ free(icfg->main_ptr, M_IPFW);
+
+ IPFW_UH_WLOCK(ch);
+ ipfw_objhash_foreach(icfg->ii, destroy_ifidx_locked, ch);
+ IPFW_UH_WUNLOCK(ch);
+
+ ipfw_objhash_destroy(icfg->ii);
+
+ free(icfg, M_IPFW);
+}
+
+/*
+ * Provide algo-specific table info
+ */
+static void
+ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
+{
+ struct iftable_cfg *cfg;
+
+ cfg = (struct iftable_cfg *)ta_state;
+
+ tinfo->taclass4 = IPFW_TACLASS_ARRAY;
+ tinfo->size4 = cfg->size;
+ tinfo->count4 = cfg->used;
+ tinfo->itemsize4 = sizeof(struct ifidx);
+}
+
+/*
+ * Prepare state to add to the table:
+ * allocate ifentry and reference needed interface.
+ */
+static int
+ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_ifidx *tb;
+ char *ifname;
+ struct ifentry *ife;
+
+ tb = (struct ta_buf_ifidx *)ta_buf;
+
+ /* Check if string is terminated */
+ ifname = (char *)tei->paddr;
+ if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE)
+ return (EINVAL);
+
+ ife = malloc(sizeof(struct ifentry), M_IPFW_TBL, M_WAITOK | M_ZERO);
+ ife->ic.cb = if_notifier;
+ ife->ic.cbdata = ife;
+
+ if (ipfw_iface_ref(ch, ifname, &ife->ic) != 0) {
+ free(ife, M_IPFW_TBL);
+ return (EINVAL);
+ }
+
+ /* Use ipfw_iface 'ifname' field as stable storage */
+ ife->no.name = ife->ic.iface->ifname;
+
+ tb->ife = ife;
+
+ return (0);
+}
+
+static int
+ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei,
+ void *ta_buf, uint32_t *pnum)
+{
+ struct iftable_cfg *icfg;
+ struct ifentry *ife, *tmp;
+ struct ta_buf_ifidx *tb;
+ struct ipfw_iface *iif;
+ struct ifidx *ifi;
+ char *ifname;
+ uint32_t value;
+
+ tb = (struct ta_buf_ifidx *)ta_buf;
+ ifname = (char *)tei->paddr;
+ icfg = (struct iftable_cfg *)ta_state;
+ ife = tb->ife;
+
+ ife->icfg = icfg;
+ ife->value = tei->value;
+
+ tmp = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname);
+
+ if (tmp != NULL) {
+ if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
+ return (EEXIST);
+
+ /* Exchange values in @tmp and @tei */
+ value = tmp->value;
+ tmp->value = tei->value;
+ tei->value = value;
+
+ iif = tmp->ic.iface;
+ if (iif->resolved != 0) {
+ /* We have to update runtime value, too */
+ ifi = ifidx_find(ti, &iif->ifindex);
+ ifi->value = ife->value;
+ }
+
+ /* Indicate that update has happened instead of addition */
+ tei->flags |= TEI_FLAGS_UPDATED;
+ *pnum = 0;
+ return (0);
+ }
+
+ if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
+ return (EFBIG);
+
+ /* Link to internal list */
+ ipfw_objhash_add(icfg->ii, &ife->no);
+
+ /* Link notifier (possibly running its callback) */
+ ipfw_iface_add_notify(icfg->ch, &ife->ic);
+ icfg->count++;
+
+ tb->ife = NULL;
+ *pnum = 1;
+
+ return (0);
+}
+
+/*
+ * Prepare to delete key from table.
+ * Do basic interface name checks.
+ */
+static int
+ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_ifidx *tb;
+ char *ifname;
+
+ tb = (struct ta_buf_ifidx *)ta_buf;
+
+ /* Check if string is terminated */
+ ifname = (char *)tei->paddr;
+ if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE)
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * Remove key from both the configuration list and the
+ * runtime array. Remove the interface notification.
+ */
+static int
+ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei,
+ void *ta_buf, uint32_t *pnum)
+{
+ struct iftable_cfg *icfg;
+ struct ifentry *ife;
+ struct ta_buf_ifidx *tb;
+ char *ifname;
+ uint16_t ifindex;
+ int res;
+
+ tb = (struct ta_buf_ifidx *)ta_buf;
+ ifname = (char *)tei->paddr;
+ icfg = (struct iftable_cfg *)ta_state;
+ ife = tb->ife;
+
+ ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname);
+
+ if (ife == NULL)
+ return (ENOENT);
+
+ if (ife->linked != 0) {
+ /* We have to remove item from runtime */
+ ifindex = ife->ic.iface->ifindex;
+
+ res = bdel(&ifindex, icfg->main_ptr, icfg->used,
+ sizeof(struct ifidx), compare_ifidx);
+
+ KASSERT(res == 1, ("index %d does not exist", ifindex));
+ icfg->used--;
+ ti->data = icfg->used;
+ ife->linked = 0;
+ }
+
+ /* Unlink from local list */
+ ipfw_objhash_del(icfg->ii, &ife->no);
+ /* Unlink notifier and deref */
+ ipfw_iface_del_notify(icfg->ch, &ife->ic);
+ ipfw_iface_unref(icfg->ch, &ife->ic);
+
+ icfg->count--;
+ tei->value = ife->value;
+
+ tb->ife = ife;
+ *pnum = 1;
+
+ return (0);
+}
+
+/*
+ * Flush deleted entry.
+ * Drops interface reference and frees entry.
+ */
+static void
+ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_ifidx *tb;
+
+ tb = (struct ta_buf_ifidx *)ta_buf;
+
+ if (tb->ife != NULL)
+ free(tb->ife, M_IPFW_TBL);
+}
+
+
+/*
+ * Handle interface announce/withdrawal for particular table.
+ * Every real runtime array modification happens here.
+ */
+static void
+if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex)
+{
+ struct ifentry *ife;
+ struct ifidx ifi;
+ struct iftable_cfg *icfg;
+ struct table_info *ti;
+ int res;
+
+ ife = (struct ifentry *)cbdata;
+ icfg = ife->icfg;
+ ti = icfg->ti;
+
+ KASSERT(ti != NULL, ("ti=NULL, check change_ti handler"));
+
+ if (ife->linked == 0 && ifindex != 0) {
+ /* Interface announce */
+ ifi.kidx = ifindex;
+ ifi.spare = 0;
+ ifi.value = ife->value;
+ res = badd(&ifindex, &ifi, icfg->main_ptr, icfg->used,
+ sizeof(struct ifidx), compare_ifidx);
+ KASSERT(res == 1, ("index %d already exists", ifindex));
+ icfg->used++;
+ ti->data = icfg->used;
+ ife->linked = 1;
+ } else if (ife->linked != 0 && ifindex == 0) {
+ /* Interface withdrawal */
+ ifindex = ife->ic.iface->ifindex;
+
+ res = bdel(&ifindex, icfg->main_ptr, icfg->used,
+ sizeof(struct ifidx), compare_ifidx);
+
+ KASSERT(res == 1, ("index %d does not exist", ifindex));
+ icfg->used--;
+ ti->data = icfg->used;
+ ife->linked = 0;
+ }
+}
+
+
+/*
+ * Table growing callbacks.
+ */
+
+static int
+ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count,
+ uint64_t *pflags)
+{
+ struct iftable_cfg *cfg;
+ uint32_t size;
+
+ cfg = (struct iftable_cfg *)ta_state;
+
+ size = cfg->size;
+ while (size < cfg->count + count)
+ size *= 2;
+
+ if (size != cfg->size) {
+ *pflags = size;
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * Allocate new, larger runtime ifidx array.
+ */
+static int
+ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags)
+{
+ struct mod_item *mi;
+
+ mi = (struct mod_item *)ta_buf;
+
+ memset(mi, 0, sizeof(struct mod_item));
+ mi->size = *pflags;
+ mi->main_ptr = malloc(sizeof(struct ifidx) * mi->size, M_IPFW,
+ M_WAITOK | M_ZERO);
+
+ return (0);
+}
+
+/*
+ * Copy data from old runtime array to new one.
+ */
+static int
+ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf,
+ uint64_t *pflags)
+{
+ struct mod_item *mi;
+ struct iftable_cfg *icfg;
+
+ mi = (struct mod_item *)ta_buf;
+ icfg = (struct iftable_cfg *)ta_state;
+
+ /* Check if we still need to grow array */
+ if (icfg->size >= mi->size) {
+ *pflags = 0;
+ return (0);
+ }
+
+ memcpy(mi->main_ptr, icfg->main_ptr, icfg->used * sizeof(struct ifidx));
+
+ return (0);
+}
+
+/*
+ * Switch old & new arrays.
+ */
+static void
+ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf,
+ uint64_t pflags)
+{
+ struct mod_item *mi;
+ struct iftable_cfg *icfg;
+ void *old_ptr;
+
+ mi = (struct mod_item *)ta_buf;
+ icfg = (struct iftable_cfg *)ta_state;
+
+ old_ptr = icfg->main_ptr;
+ icfg->main_ptr = mi->main_ptr;
+ icfg->size = mi->size;
+ ti->state = icfg->main_ptr;
+
+ mi->main_ptr = old_ptr;
+}
+
+/*
+ * Free unneeded array.
+ */
+static void
+ta_flush_mod_ifidx(void *ta_buf)
+{
+ struct mod_item *mi;
+
+ mi = (struct mod_item *)ta_buf;
+ if (mi->main_ptr != NULL)
+ free(mi->main_ptr, M_IPFW);
+}
+
+static int
+ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e,
+ ipfw_obj_tentry *tent)
+{
+ struct ifentry *ife;
+
+ ife = (struct ifentry *)e;
+
+ tent->masklen = 8 * IF_NAMESIZE;
+ memcpy(&tent->k, ife->no.name, IF_NAMESIZE);
+ tent->v.kidx = ife->value;
+
+ return (0);
+}
+
+static int
+ta_find_ifidx_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent)
+{
+ struct iftable_cfg *icfg;
+ struct ifentry *ife;
+ char *ifname;
+
+ icfg = (struct iftable_cfg *)ta_state;
+ ifname = tent->k.iface;
+
+ if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE)
+ return (EINVAL);
+
+ ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname);
+
+ if (ife != NULL) {
+ ta_dump_ifidx_tentry(ta_state, ti, ife, tent);
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+struct wa_ifidx {
+ ta_foreach_f *f;
+ void *arg;
+};
+
+static int
+foreach_ifidx(struct namedobj_instance *ii, struct named_object *no,
+ void *arg)
+{
+ struct ifentry *ife;
+ struct wa_ifidx *wa;
+
+ ife = (struct ifentry *)no;
+ wa = (struct wa_ifidx *)arg;
+
+ wa->f(ife, wa->arg);
+ return (0);
+}
+
+static void
+ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f,
+ void *arg)
+{
+ struct iftable_cfg *icfg;
+ struct wa_ifidx wa;
+
+ icfg = (struct iftable_cfg *)ta_state;
+
+ wa.f = f;
+ wa.arg = arg;
+
+ ipfw_objhash_foreach(icfg->ii, foreach_ifidx, &wa);
+}
+
+struct table_algo iface_idx = {
+ .name = "iface:array",
+ .type = IPFW_TABLE_INTERFACE,
+ .flags = TA_FLAG_DEFAULT,
+ .ta_buf_size = sizeof(struct ta_buf_ifidx),
+ .init = ta_init_ifidx,
+ .destroy = ta_destroy_ifidx,
+ .prepare_add = ta_prepare_add_ifidx,
+ .prepare_del = ta_prepare_del_ifidx,
+ .add = ta_add_ifidx,
+ .del = ta_del_ifidx,
+ .flush_entry = ta_flush_ifidx_entry,
+ .foreach = ta_foreach_ifidx,
+ .dump_tentry = ta_dump_ifidx_tentry,
+ .find_tentry = ta_find_ifidx_tentry,
+ .dump_tinfo = ta_dump_ifidx_tinfo,
+ .need_modify = ta_need_modify_ifidx,
+ .prepare_mod = ta_prepare_mod_ifidx,
+ .fill_mod = ta_fill_mod_ifidx,
+ .modify = ta_modify_ifidx,
+ .flush_mod = ta_flush_mod_ifidx,
+ .change_ti = ta_change_ti_ifidx,
+};
+
+/*
+ * Number array cmds.
+ *
+ * Implementation:
+ *
+ * Runtime part:
+ * - sorted array of "struct numarray" pointed to by ti->state.
+ * Array is allocated with rounding up to NUMARRAY_CHUNK.
+ * - current array size is stored in ti->data
+ *
+ */
+
+struct numarray {
+ uint32_t number;
+ uint32_t value;
+};
+
+struct numarray_cfg {
+ void *main_ptr;
+ size_t size; /* Number of items allocated in array */
+ size_t used; /* Number of items _active_ now */
+};
+
+struct ta_buf_numarray
+{
+ struct numarray na;
+};
+
+int compare_numarray(const void *k, const void *v);
+static struct numarray *numarray_find(struct table_info *ti, void *key);
+static int ta_lookup_numarray(struct table_info *ti, void *key,
+ uint32_t keylen, uint32_t *val);
+static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state,
+ struct table_info *ti, char *data, uint8_t tflags);
+static void ta_destroy_numarray(void *ta_state, struct table_info *ti);
+static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti,
+ ipfw_ta_tinfo *tinfo);
+static int ta_prepare_add_numarray(struct ip_fw_chain *ch,
+ struct tentry_info *tei, void *ta_buf);
+static int ta_add_numarray(void *ta_state, struct table_info *ti,
+ struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+static int ta_del_numarray(void *ta_state, struct table_info *ti,
+ struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+static void ta_flush_numarray_entry(struct ip_fw_chain *ch,
+ struct tentry_info *tei, void *ta_buf);
+static int ta_need_modify_numarray(void *ta_state, struct table_info *ti,
+ uint32_t count, uint64_t *pflags);
+static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags);
+static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti,
+ void *ta_buf, uint64_t *pflags);
+static void ta_modify_numarray(void *ta_state, struct table_info *ti,
+ void *ta_buf, uint64_t pflags);
+static void ta_flush_mod_numarray(void *ta_buf);
+static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti,
+ void *e, ipfw_obj_tentry *tent);
+static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent);
+static void ta_foreach_numarray(void *ta_state, struct table_info *ti,
+ ta_foreach_f *f, void *arg);
+
+int
+compare_numarray(const void *k, const void *v)
+{
+ const struct numarray *na;
+ uint32_t key;
+
+ key = *((const uint32_t *)k);
+ na = (const struct numarray *)v;
+
+ if (key < na->number)
+ return (-1);
+ else if (key > na->number)
+ return (1);
+
+ return (0);
+}
+
+static struct numarray *
+numarray_find(struct table_info *ti, void *key)
+{
+ struct numarray *ri;
+
+ ri = bsearch(key, ti->state, ti->data, sizeof(struct numarray),
+ compare_numarray);
+
+ return (ri);
+}
+
+static int
+ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val)
+{
+ struct numarray *ri;
+
+ ri = numarray_find(ti, key);
+
+ if (ri != NULL) {
+ *val = ri->value;
+ return (1);
+ }
+
+ return (0);
+}
+
+static int
+ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
+ char *data, uint8_t tflags)
+{
+ struct numarray_cfg *cfg;
+
+ cfg = malloc(sizeof(*cfg), M_IPFW, M_WAITOK | M_ZERO);
+
+ cfg->size = 16;
+ cfg->main_ptr = malloc(sizeof(struct numarray) * cfg->size, M_IPFW,
+ M_WAITOK | M_ZERO);
+
+ *ta_state = cfg;
+ ti->state = cfg->main_ptr;
+ ti->lookup = ta_lookup_numarray;
+
+ return (0);
+}
+
+/*
+ * Destroys table @ti
+ */
+static void
+ta_destroy_numarray(void *ta_state, struct table_info *ti)
+{
+ struct numarray_cfg *cfg;
+
+ cfg = (struct numarray_cfg *)ta_state;
+
+ if (cfg->main_ptr != NULL)
+ free(cfg->main_ptr, M_IPFW);
+
+ free(cfg, M_IPFW);
+}
+
+/*
+ * Provide algo-specific table info
+ */
+static void
+ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
+{
+ struct numarray_cfg *cfg;
+
+ cfg = (struct numarray_cfg *)ta_state;
+
+ tinfo->taclass4 = IPFW_TACLASS_ARRAY;
+ tinfo->size4 = cfg->size;
+ tinfo->count4 = cfg->used;
+ tinfo->itemsize4 = sizeof(struct numarray);
+}
+
+/*
+ * Prepare for addition to / deletion from the array.
+ */
+static int
+ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_numarray *tb;
+
+ tb = (struct ta_buf_numarray *)ta_buf;
+
+ tb->na.number = *((uint32_t *)tei->paddr);
+
+ return (0);
+}
+
+static int
+ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei,
+ void *ta_buf, uint32_t *pnum)
+{
+ struct numarray_cfg *cfg;
+ struct ta_buf_numarray *tb;
+ struct numarray *ri;
+ int res;
+ uint32_t value;
+
+ tb = (struct ta_buf_numarray *)ta_buf;
+ cfg = (struct numarray_cfg *)ta_state;
+
+ /* Read current value from @tei */
+ tb->na.value = tei->value;
+
+ ri = numarray_find(ti, &tb->na.number);
+
+ if (ri != NULL) {
+ if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
+ return (EEXIST);
+
+ /* Exchange values between ri and @tei */
+ value = ri->value;
+ ri->value = tei->value;
+ tei->value = value;
+ /* Indicate that update has happened instead of addition */
+ tei->flags |= TEI_FLAGS_UPDATED;
+ *pnum = 0;
+ return (0);
+ }
+
+ if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
+ return (EFBIG);
+
+ res = badd(&tb->na.number, &tb->na, cfg->main_ptr, cfg->used,
+ sizeof(struct numarray), compare_numarray);
+
+ KASSERT(res == 1, ("number %d already exists", tb->na.number));
+ cfg->used++;
+ ti->data = cfg->used;
+ *pnum = 1;
+
+ return (0);
+}
+
+/*
+ * Remove the key from the runtime array.
+ */
+static int
+ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei,
+ void *ta_buf, uint32_t *pnum)
+{
+ struct numarray_cfg *cfg;
+ struct ta_buf_numarray *tb;
+ struct numarray *ri;
+ int res;
+
+ tb = (struct ta_buf_numarray *)ta_buf;
+ cfg = (struct numarray_cfg *)ta_state;
+
+ ri = numarray_find(ti, &tb->na.number);
+ if (ri == NULL)
+ return (ENOENT);
+
+ tei->value = ri->value;
+
+ res = bdel(&tb->na.number, cfg->main_ptr, cfg->used,
+ sizeof(struct numarray), compare_numarray);
+
+ KASSERT(res == 1, ("number %u does not exist", tb->na.number));
+ cfg->used--;
+ ti->data = cfg->used;
+ *pnum = 1;
+
+ return (0);
+}
+
+static void
+ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+
+ /* We don't have any state, do nothing */
+}
+
+
+/*
+ * Table growing callbacks.
+ */
+
+static int
+ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count,
+ uint64_t *pflags)
+{
+ struct numarray_cfg *cfg;
+ size_t size;
+
+ cfg = (struct numarray_cfg *)ta_state;
+
+ size = cfg->size;
+ while (size < cfg->used + count)
+ size *= 2;
+
+ if (size != cfg->size) {
+ *pflags = size;
+ return (1);
+ }
+
+ return (0);
+}
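+
+/*
+ * For example, with cfg->size = 16, cfg->used = 16 and count = 1 the
+ * loop above doubles size to 32, so *pflags is set to 32 and the table
+ * layer is expected to run prepare_mod/fill_mod/modify/flush_mod to
+ * swap in the larger array.
+ */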
+
+/*
+ * Allocate new, larger runtime array.
+ */
+static int
+ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags)
+{
+ struct mod_item *mi;
+
+ mi = (struct mod_item *)ta_buf;
+
+ memset(mi, 0, sizeof(struct mod_item));
+ mi->size = *pflags;
+ mi->main_ptr = malloc(sizeof(struct numarray) * mi->size, M_IPFW,
+ M_WAITOK | M_ZERO);
+
+ return (0);
+}
+
+/*
+ * Copy data from old runtime array to new one.
+ */
+static int
+ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf,
+ uint64_t *pflags)
+{
+ struct mod_item *mi;
+ struct numarray_cfg *cfg;
+
+ mi = (struct mod_item *)ta_buf;
+ cfg = (struct numarray_cfg *)ta_state;
+
+ /* Check if we still need to grow array */
+ if (cfg->size >= mi->size) {
+ *pflags = 0;
+ return (0);
+ }
+
+ memcpy(mi->main_ptr, cfg->main_ptr, cfg->used * sizeof(struct numarray));
+
+ return (0);
+}
+
+/*
+ * Switch old & new arrays.
+ */
+static void
+ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf,
+ uint64_t pflags)
+{
+ struct mod_item *mi;
+ struct numarray_cfg *cfg;
+ void *old_ptr;
+
+ mi = (struct mod_item *)ta_buf;
+ cfg = (struct numarray_cfg *)ta_state;
+
+ old_ptr = cfg->main_ptr;
+ cfg->main_ptr = mi->main_ptr;
+ cfg->size = mi->size;
+ ti->state = cfg->main_ptr;
+
+ mi->main_ptr = old_ptr;
+}
+
+/*
+ * Free unneeded array.
+ */
+static void
+ta_flush_mod_numarray(void *ta_buf)
+{
+ struct mod_item *mi;
+
+ mi = (struct mod_item *)ta_buf;
+ if (mi->main_ptr != NULL)
+ free(mi->main_ptr, M_IPFW);
+}
+
+static int
+ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e,
+ ipfw_obj_tentry *tent)
+{
+ struct numarray *na;
+
+ na = (struct numarray *)e;
+
+ tent->k.key = na->number;
+ tent->v.kidx = na->value;
+
+ return (0);
+}
+
+static int
+ta_find_numarray_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent)
+{
+ struct numarray_cfg *cfg;
+ struct numarray *ri;
+
+ cfg = (struct numarray_cfg *)ta_state;
+
+ ri = numarray_find(ti, &tent->k.key);
+
+ if (ri != NULL) {
+ ta_dump_numarray_tentry(ta_state, ti, ri, tent);
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+static void
+ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f,
+ void *arg)
+{
+ struct numarray_cfg *cfg;
+ struct numarray *array;
+ int i;
+
+ cfg = (struct numarray_cfg *)ta_state;
+ array = cfg->main_ptr;
+
+ for (i = 0; i < cfg->used; i++)
+ f(&array[i], arg);
+}
+
+struct table_algo number_array = {
+ .name = "number:array",
+ .type = IPFW_TABLE_NUMBER,
+ .ta_buf_size = sizeof(struct ta_buf_numarray),
+ .init = ta_init_numarray,
+ .destroy = ta_destroy_numarray,
+ .prepare_add = ta_prepare_add_numarray,
+ .prepare_del = ta_prepare_add_numarray,
+ .add = ta_add_numarray,
+ .del = ta_del_numarray,
+ .flush_entry = ta_flush_numarray_entry,
+ .foreach = ta_foreach_numarray,
+ .dump_tentry = ta_dump_numarray_tentry,
+ .find_tentry = ta_find_numarray_tentry,
+ .dump_tinfo = ta_dump_numarray_tinfo,
+ .need_modify = ta_need_modify_numarray,
+ .prepare_mod = ta_prepare_mod_numarray,
+ .fill_mod = ta_fill_mod_numarray,
+ .modify = ta_modify_numarray,
+ .flush_mod = ta_flush_mod_numarray,
+};
+
+/*
+ * flow:hash cmds
+ *
+ *
+ * ti->data:
+ * [inv.mask4][inv.mask6][log2hsize4][log2hsize6]
+ * [ 8][ 8][ 8][ 8]
+ *
+ * inv.mask4: 32 - mask
+ * inv.mask6:
+ * 1) _slow lookup: mask
+ * 2) _aligned: (128 - mask) / 8
+ * 3) _64: 8
+ *
+ *
+ * pflags:
+ * [hsize4][hsize6]
+ * [ 16][ 16]
+ */
+
+struct fhashentry;
+
+SLIST_HEAD(fhashbhead, fhashentry);
+
+struct fhashentry {
+ SLIST_ENTRY(fhashentry) next;
+ uint8_t af;
+ uint8_t proto;
+ uint16_t spare0;
+ uint16_t dport;
+ uint16_t sport;
+ uint32_t value;
+ uint32_t spare1;
+};
+
+struct fhashentry4 {
+ struct fhashentry e;
+ struct in_addr dip;
+ struct in_addr sip;
+};
+
+struct fhashentry6 {
+ struct fhashentry e;
+ struct in6_addr dip6;
+ struct in6_addr sip6;
+};
+
+struct fhash_cfg {
+ struct fhashbhead *head;
+ size_t size;
+ size_t items;
+ struct fhashentry4 fe4;
+ struct fhashentry6 fe6;
+};
+
+struct ta_buf_fhash {
+ void *ent_ptr;
+ struct fhashentry6 fe6;
+};
+
+static __inline int cmp_flow_ent(struct fhashentry *a,
+ struct fhashentry *b, size_t sz);
+static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize);
+static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize);
+static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size);
+static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val);
+static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state,
+ struct table_info *ti, char *data, uint8_t tflags);
+static void ta_destroy_fhash(void *ta_state, struct table_info *ti);
+static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti,
+ ipfw_ta_tinfo *tinfo);
+static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti,
+ void *e, ipfw_obj_tentry *tent);
+static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent);
+static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent);
+static void ta_foreach_fhash(void *ta_state, struct table_info *ti,
+ ta_foreach_f *f, void *arg);
+static int ta_prepare_add_fhash(struct ip_fw_chain *ch,
+ struct tentry_info *tei, void *ta_buf);
+static int ta_add_fhash(void *ta_state, struct table_info *ti,
+ struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+static int ta_del_fhash(void *ta_state, struct table_info *ti,
+ struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf);
+static int ta_need_modify_fhash(void *ta_state, struct table_info *ti,
+ uint32_t count, uint64_t *pflags);
+static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags);
+static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti,
+ void *ta_buf, uint64_t *pflags);
+static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf,
+ uint64_t pflags);
+static void ta_flush_mod_fhash(void *ta_buf);
+
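+/*
+ * Note on the comparison below: the 8 bytes that follow the SLIST
+ * pointer cover af, proto, spare0, dport and sport and are compared as
+ * a single uint64_t; the addresses stored after the base struct
+ * fhashentry (struct fhashentry4/6) are compared with memcmp() over
+ * @sz bytes. The function returns 1 when both parts match.
+ */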
+static __inline int
+cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz)
+{
+ uint64_t *ka, *kb;
+
+ ka = (uint64_t *)(&a->next + 1);
+ kb = (uint64_t *)(&b->next + 1);
+
+ if (*ka == *kb && (memcmp(a + 1, b + 1, sz) == 0))
+ return (1);
+
+ return (0);
+}
+
+static __inline uint32_t
+hash_flow4(struct fhashentry4 *f, int hsize)
+{
+ uint32_t i;
+
+ i = (f->dip.s_addr) ^ (f->sip.s_addr) ^ (f->e.dport) ^ (f->e.sport);
+
+ return (i % (hsize - 1));
+}
+
+static __inline uint32_t
+hash_flow6(struct fhashentry6 *f, int hsize)
+{
+ uint32_t i;
+
+ i = (f->dip6.__u6_addr.__u6_addr32[2]) ^
+ (f->dip6.__u6_addr.__u6_addr32[3]) ^
+ (f->sip6.__u6_addr.__u6_addr32[2]) ^
+ (f->sip6.__u6_addr.__u6_addr32[3]) ^
+ (f->e.dport) ^ (f->e.sport);
+
+ return (i % (hsize - 1));
+}
+
+static uint32_t
+hash_flow_ent(struct fhashentry *ent, uint32_t size)
+{
+ uint32_t hash;
+
+ if (ent->af == AF_INET) {
+ hash = hash_flow4((struct fhashentry4 *)ent, size);
+ } else {
+ hash = hash_flow6((struct fhashentry6 *)ent, size);
+ }
+
+ return (hash);
+}
+
+static int
+ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val)
+{
+ struct fhashbhead *head;
+ struct fhashentry *ent;
+ struct fhashentry4 *m4;
+ struct ipfw_flow_id *id;
+ uint16_t hash, hsize;
+
+ id = (struct ipfw_flow_id *)key;
+ head = (struct fhashbhead *)ti->state;
+ hsize = ti->data;
+ m4 = (struct fhashentry4 *)ti->xstate;
+
+ if (id->addr_type == 4) {
+ struct fhashentry4 f;
+
+ /* Copy hash mask */
+ f = *m4;
+
+ f.dip.s_addr &= id->dst_ip;
+ f.sip.s_addr &= id->src_ip;
+ f.e.dport &= id->dst_port;
+ f.e.sport &= id->src_port;
+ f.e.proto &= id->proto;
+ hash = hash_flow4(&f, hsize);
+ SLIST_FOREACH(ent, &head[hash], next) {
+ if (cmp_flow_ent(ent, &f.e, 2 * 4) != 0) {
+ *val = ent->value;
+ return (1);
+ }
+ }
+ } else if (id->addr_type == 6) {
+ struct fhashentry6 f;
+ uint64_t *fp, *idp;
+
+ /* Copy hash mask */
+ f = *((struct fhashentry6 *)(m4 + 1));
+
+ /* Handle lack of __u6_addr.__u6_addr64 */
+ fp = (uint64_t *)&f.dip6;
+ idp = (uint64_t *)&id->dst_ip6;
+ /* src IPv6 is stored after dst IPv6 */
+ *fp++ &= *idp++;
+ *fp++ &= *idp++;
+ *fp++ &= *idp++;
+ *fp &= *idp;
+ f.e.dport &= id->dst_port;
+ f.e.sport &= id->src_port;
+ f.e.proto &= id->proto;
+ hash = hash_flow6(&f, hsize);
+ SLIST_FOREACH(ent, &head[hash], next) {
+ if (cmp_flow_ent(ent, &f.e, 2 * 16) != 0) {
+ *val = ent->value;
+ return (1);
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * New table.
+ */
+static int
+ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
+ char *data, uint8_t tflags)
+{
+ int i;
+ struct fhash_cfg *cfg;
+ struct fhashentry4 *fe4;
+ struct fhashentry6 *fe6;
+
+ cfg = malloc(sizeof(struct fhash_cfg), M_IPFW, M_WAITOK | M_ZERO);
+
+ cfg->size = 512;
+
+ cfg->head = malloc(sizeof(struct fhashbhead) * cfg->size, M_IPFW,
+ M_WAITOK | M_ZERO);
+ for (i = 0; i < cfg->size; i++)
+ SLIST_INIT(&cfg->head[i]);
+
+ /* Fill in fe masks based on @tflags */
+ fe4 = &cfg->fe4;
+ fe6 = &cfg->fe6;
+ if (tflags & IPFW_TFFLAG_SRCIP) {
+ memset(&fe4->sip, 0xFF, sizeof(fe4->sip));
+ memset(&fe6->sip6, 0xFF, sizeof(fe6->sip6));
+ }
+ if (tflags & IPFW_TFFLAG_DSTIP) {
+ memset(&fe4->dip, 0xFF, sizeof(fe4->dip));
+ memset(&fe6->dip6, 0xFF, sizeof(fe6->dip6));
+ }
+ if (tflags & IPFW_TFFLAG_SRCPORT) {
+ memset(&fe4->e.sport, 0xFF, sizeof(fe4->e.sport));
+ memset(&fe6->e.sport, 0xFF, sizeof(fe6->e.sport));
+ }
+ if (tflags & IPFW_TFFLAG_DSTPORT) {
+ memset(&fe4->e.dport, 0xFF, sizeof(fe4->e.dport));
+ memset(&fe6->e.dport, 0xFF, sizeof(fe6->e.dport));
+ }
+ if (tflags & IPFW_TFFLAG_PROTO) {
+ memset(&fe4->e.proto, 0xFF, sizeof(fe4->e.proto));
+ memset(&fe6->e.proto, 0xFF, sizeof(fe6->e.proto));
+ }
+
+ fe4->e.af = AF_INET;
+ fe6->e.af = AF_INET6;
+
+ *ta_state = cfg;
+ ti->state = cfg->head;
+ ti->xstate = &cfg->fe4;
+ ti->data = cfg->size;
+ ti->lookup = ta_lookup_fhash;
+
+ return (0);
+}
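+
+/*
+ * For example, a flow table created with only IPFW_TFFLAG_SRCIP,
+ * IPFW_TFFLAG_PROTO and IPFW_TFFLAG_DSTPORT set fills just the
+ * sip/sip6, proto and dport mask fields with 0xFF above;
+ * ta_lookup_fhash() then ANDs each packet's flow id with this template
+ * before hashing and comparing, so the remaining packet fields do not
+ * influence the lookup key.
+ */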
+
+static void
+ta_destroy_fhash(void *ta_state, struct table_info *ti)
+{
+ struct fhash_cfg *cfg;
+ struct fhashentry *ent, *ent_next;
+ int i;
+
+ cfg = (struct fhash_cfg *)ta_state;
+
+ for (i = 0; i < cfg->size; i++)
+ SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next)
+ free(ent, M_IPFW_TBL);
+
+ free(cfg->head, M_IPFW);
+ free(cfg, M_IPFW);
+}
+
+/*
+ * Provide algo-specific table info
+ */
+static void
+ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
+{
+ struct fhash_cfg *cfg;
+
+ cfg = (struct fhash_cfg *)ta_state;
+
+ tinfo->flags = IPFW_TATFLAGS_AFITEM;
+ tinfo->taclass4 = IPFW_TACLASS_HASH;
+ tinfo->size4 = cfg->size;
+ tinfo->count4 = cfg->items;
+ tinfo->itemsize4 = sizeof(struct fhashentry4);
+ tinfo->itemsize6 = sizeof(struct fhashentry6);
+}
+
+static int
+ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e,
+ ipfw_obj_tentry *tent)
+{
+ struct fhash_cfg *cfg;
+ struct fhashentry *ent;
+ struct fhashentry4 *fe4;
+#ifdef INET6
+ struct fhashentry6 *fe6;
+#endif
+ struct tflow_entry *tfe;
+
+ cfg = (struct fhash_cfg *)ta_state;
+ ent = (struct fhashentry *)e;
+ tfe = &tent->k.flow;
+
+ tfe->af = ent->af;
+ tfe->proto = ent->proto;
+ tfe->dport = htons(ent->dport);
+ tfe->sport = htons(ent->sport);
+ tent->v.kidx = ent->value;
+ tent->subtype = ent->af;
+
+ if (ent->af == AF_INET) {
+ fe4 = (struct fhashentry4 *)ent;
+ tfe->a.a4.sip.s_addr = htonl(fe4->sip.s_addr);
+ tfe->a.a4.dip.s_addr = htonl(fe4->dip.s_addr);
+ tent->masklen = 32;
+#ifdef INET6
+ } else {
+ fe6 = (struct fhashentry6 *)ent;
+ tfe->a.a6.sip6 = fe6->sip6;
+ tfe->a.a6.dip6 = fe6->dip6;
+ tent->masklen = 128;
+#endif
+ }
+
+ return (0);
+}
+
+static int
+tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent)
+{
+#ifdef INET
+ struct fhashentry4 *fe4;
+#endif
+#ifdef INET6
+ struct fhashentry6 *fe6;
+#endif
+ struct tflow_entry *tfe;
+
+ tfe = (struct tflow_entry *)tei->paddr;
+
+ ent->af = tei->subtype;
+ ent->proto = tfe->proto;
+ ent->dport = ntohs(tfe->dport);
+ ent->sport = ntohs(tfe->sport);
+
+ if (tei->subtype == AF_INET) {
+#ifdef INET
+ fe4 = (struct fhashentry4 *)ent;
+ fe4->sip.s_addr = ntohl(tfe->a.a4.sip.s_addr);
+ fe4->dip.s_addr = ntohl(tfe->a.a4.dip.s_addr);
+#endif
+#ifdef INET6
+ } else if (tei->subtype == AF_INET6) {
+ fe6 = (struct fhashentry6 *)ent;
+ fe6->sip6 = tfe->a.a6.sip6;
+ fe6->dip6 = tfe->a.a6.dip6;
+#endif
+ } else {
+ /* Unknown address family */
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+
+static int
+ta_find_fhash_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent)
+{
+ struct fhash_cfg *cfg;
+ struct fhashbhead *head;
+ struct fhashentry *ent, *tmp;
+ struct fhashentry6 fe6;
+ struct tentry_info tei;
+ int error;
+ uint32_t hash;
+ size_t sz;
+
+ cfg = (struct fhash_cfg *)ta_state;
+
+ ent = &fe6.e;
+
+ memset(&fe6, 0, sizeof(fe6));
+ memset(&tei, 0, sizeof(tei));
+
+ tei.paddr = &tent->k.flow;
+ tei.subtype = tent->subtype;
+
+ if ((error = tei_to_fhash_ent(&tei, ent)) != 0)
+ return (error);
+
+ head = cfg->head;
+ hash = hash_flow_ent(ent, cfg->size);
+
+ if (tei.subtype == AF_INET)
+ sz = 2 * sizeof(struct in_addr);
+ else
+ sz = 2 * sizeof(struct in6_addr);
+
+ /* Check for existence */
+ SLIST_FOREACH(tmp, &head[hash], next) {
+ if (cmp_flow_ent(tmp, ent, sz) != 0) {
+ ta_dump_fhash_tentry(ta_state, ti, tmp, tent);
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+static void
+ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f,
+ void *arg)
+{
+ struct fhash_cfg *cfg;
+ struct fhashentry *ent, *ent_next;
+ int i;
+
+ cfg = (struct fhash_cfg *)ta_state;
+
+ for (i = 0; i < cfg->size; i++)
+ SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next)
+ f(ent, arg);
+}
+
+static int
+ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_fhash *tb;
+ struct fhashentry *ent;
+ size_t sz;
+ int error;
+
+ tb = (struct ta_buf_fhash *)ta_buf;
+
+ if (tei->subtype == AF_INET)
+ sz = sizeof(struct fhashentry4);
+ else if (tei->subtype == AF_INET6)
+ sz = sizeof(struct fhashentry6);
+ else
+ return (EINVAL);
+
+ ent = malloc(sz, M_IPFW_TBL, M_WAITOK | M_ZERO);
+
+ error = tei_to_fhash_ent(tei, ent);
+ if (error != 0) {
+ free(ent, M_IPFW_TBL);
+ return (error);
+ }
+ tb->ent_ptr = ent;
+
+ return (0);
+}
+
+static int
+ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
+ void *ta_buf, uint32_t *pnum)
+{
+ struct fhash_cfg *cfg;
+ struct fhashbhead *head;
+ struct fhashentry *ent, *tmp;
+ struct ta_buf_fhash *tb;
+ int exists;
+ uint32_t hash, value;
+ size_t sz;
+
+ cfg = (struct fhash_cfg *)ta_state;
+ tb = (struct ta_buf_fhash *)ta_buf;
+ ent = (struct fhashentry *)tb->ent_ptr;
+ exists = 0;
+
+ /* Read current value from @tei */
+ ent->value = tei->value;
+
+ head = cfg->head;
+ hash = hash_flow_ent(ent, cfg->size);
+
+ if (tei->subtype == AF_INET)
+ sz = 2 * sizeof(struct in_addr);
+ else
+ sz = 2 * sizeof(struct in6_addr);
+
+ /* Check for existence */
+ SLIST_FOREACH(tmp, &head[hash], next) {
+ if (cmp_flow_ent(tmp, ent, sz) != 0) {
+ exists = 1;
+ break;
+ }
+ }
+
+ if (exists == 1) {
+ if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
+ return (EEXIST);
+ /* Record already exists. Update value if we're asked to */
+ /* Exchange values between tmp and @tei */
+ value = tmp->value;
+ tmp->value = tei->value;
+ tei->value = value;
+ /* Indicate that update has happened instead of addition */
+ tei->flags |= TEI_FLAGS_UPDATED;
+ *pnum = 0;
+ } else {
+ if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
+ return (EFBIG);
+
+ SLIST_INSERT_HEAD(&head[hash], ent, next);
+ tb->ent_ptr = NULL;
+ *pnum = 1;
+
+ /* Update counters and check if we need to grow hash */
+ cfg->items++;
+ }
+
+ return (0);
+}
+
+static int
+ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_fhash *tb;
+
+ tb = (struct ta_buf_fhash *)ta_buf;
+
+ return (tei_to_fhash_ent(tei, &tb->fe6.e));
+}
+
+static int
+ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
+ void *ta_buf, uint32_t *pnum)
+{
+ struct fhash_cfg *cfg;
+ struct fhashbhead *head;
+ struct fhashentry *ent, *tmp;
+ struct ta_buf_fhash *tb;
+ uint32_t hash;
+ size_t sz;
+
+ cfg = (struct fhash_cfg *)ta_state;
+ tb = (struct ta_buf_fhash *)ta_buf;
+ ent = &tb->fe6.e;
+
+ head = cfg->head;
+ hash = hash_flow_ent(ent, cfg->size);
+
+ if (tei->subtype == AF_INET)
+ sz = 2 * sizeof(struct in_addr);
+ else
+ sz = 2 * sizeof(struct in6_addr);
+
+ /* Check for existence */
+ SLIST_FOREACH(tmp, &head[hash], next) {
+ if (cmp_flow_ent(tmp, ent, sz) == 0)
+ continue;
+
+ SLIST_REMOVE(&head[hash], tmp, fhashentry, next);
+ tei->value = tmp->value;
+ *pnum = 1;
+ cfg->items--;
+ tb->ent_ptr = tmp;
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+static void
+ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
+ void *ta_buf)
+{
+ struct ta_buf_fhash *tb;
+
+ tb = (struct ta_buf_fhash *)ta_buf;
+
+ if (tb->ent_ptr != NULL)
+ free(tb->ent_ptr, M_IPFW_TBL);
+}
+
+/*
+ * Hash growing callbacks.
+ */
+
+static int
+ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count,
+ uint64_t *pflags)
+{
+ struct fhash_cfg *cfg;
+
+ cfg = (struct fhash_cfg *)ta_state;
+
+ if (cfg->items > cfg->size && cfg->size < 65536) {
+ *pflags = cfg->size * 2;
+ return (1);
+ }
+
+ return (0);
+}
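+
+/*
+ * Illustrative note (not part of the original source): with the 512-bucket
+ * default set up in ta_init_fhash(), the condition above requests a
+ * doubling of the bucket array as soon as the number of items exceeds the
+ * number of buckets, and stops growing once the table reaches 65536
+ * buckets.
+ */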
+
+/*
+ * Allocate new, larger fhash.
+ */
+static int
+ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags)
+{
+ struct mod_item *mi;
+ struct fhashbhead *head;
+ int i;
+
+ mi = (struct mod_item *)ta_buf;
+
+ memset(mi, 0, sizeof(struct mod_item));
+ mi->size = *pflags;
+ head = malloc(sizeof(struct fhashbhead) * mi->size, M_IPFW,
+ M_WAITOK | M_ZERO);
+ for (i = 0; i < mi->size; i++)
+ SLIST_INIT(&head[i]);
+
+ mi->main_ptr = head;
+
+ return (0);
+}
+
+/*
+ * Copy data from old runtime array to new one.
+ */
+static int
+ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf,
+ uint64_t *pflags)
+{
+
+ /* It is not possible to do a rehash if we're not holding the WLOCK. */
+ return (0);
+}
+
+/*
+ * Switch old & new arrays.
+ */
+static void
+ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf,
+ uint64_t pflags)
+{
+ struct mod_item *mi;
+ struct fhash_cfg *cfg;
+ struct fhashbhead *old_head, *new_head;
+ struct fhashentry *ent, *ent_next;
+ int i;
+ uint32_t nhash;
+ size_t old_size;
+
+ mi = (struct mod_item *)ta_buf;
+ cfg = (struct fhash_cfg *)ta_state;
+
+ old_size = cfg->size;
+ old_head = ti->state;
+
+ new_head = (struct fhashbhead *)mi->main_ptr;
+ for (i = 0; i < old_size; i++) {
+ SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) {
+ nhash = hash_flow_ent(ent, mi->size);
+ SLIST_INSERT_HEAD(&new_head[nhash], ent, next);
+ }
+ }
+
+ ti->state = new_head;
+ ti->data = mi->size;
+ cfg->head = new_head;
+ cfg->size = mi->size;
+
+ mi->main_ptr = old_head;
+}
+
+/*
+ * Free unneeded array.
+ */
+static void
+ta_flush_mod_fhash(void *ta_buf)
+{
+ struct mod_item *mi;
+
+ mi = (struct mod_item *)ta_buf;
+ if (mi->main_ptr != NULL)
+ free(mi->main_ptr, M_IPFW);
+}
+
+struct table_algo flow_hash = {
+ .name = "flow:hash",
+ .type = IPFW_TABLE_FLOW,
+ .flags = TA_FLAG_DEFAULT,
+ .ta_buf_size = sizeof(struct ta_buf_fhash),
+ .init = ta_init_fhash,
+ .destroy = ta_destroy_fhash,
+ .prepare_add = ta_prepare_add_fhash,
+ .prepare_del = ta_prepare_del_fhash,
+ .add = ta_add_fhash,
+ .del = ta_del_fhash,
+ .flush_entry = ta_flush_fhash_entry,
+ .foreach = ta_foreach_fhash,
+ .dump_tentry = ta_dump_fhash_tentry,
+ .find_tentry = ta_find_fhash_tentry,
+ .dump_tinfo = ta_dump_fhash_tinfo,
+ .need_modify = ta_need_modify_fhash,
+ .prepare_mod = ta_prepare_mod_fhash,
+ .fill_mod = ta_fill_mod_fhash,
+ .modify = ta_modify_fhash,
+ .flush_mod = ta_flush_mod_fhash,
+};
+
+/*
+ * Kernel fibs bindings.
+ *
+ * Implementation:
+ *
+ * Runtime part:
+ * - fully relies on route API
+ * - fib number is stored in ti->data
+ *
+ */
+
+static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val);
+static int kfib_parse_opts(int *pfib, char *data);
+static void ta_print_kfib_config(void *ta_state, struct table_info *ti,
+ char *buf, size_t bufsize);
+static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state,
+ struct table_info *ti, char *data, uint8_t tflags);
+static void ta_destroy_kfib(void *ta_state, struct table_info *ti);
+static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti,
+ ipfw_ta_tinfo *tinfo);
+static int contigmask(uint8_t *p, int len);
+static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e,
+ ipfw_obj_tentry *tent);
+static int ta_dump_kfib_tentry_int(struct sockaddr *paddr,
+ struct sockaddr *pmask, ipfw_obj_tentry *tent);
+static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent);
+static void ta_foreach_kfib(void *ta_state, struct table_info *ti,
+ ta_foreach_f *f, void *arg);
+
+
+static int
+ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen,
+ uint32_t *val)
+{
+#ifdef INET
+ struct nhop4_basic nh4;
+ struct in_addr in;
+#endif
+#ifdef INET6
+ struct nhop6_basic nh6;
+#endif
+ int error;
+
+ error = ENOENT;
+#ifdef INET
+ if (keylen == 4) {
+ in.s_addr = *(in_addr_t *)key;
+ error = fib4_lookup_nh_basic(ti->data,
+ in, 0, 0, &nh4);
+ }
+#endif
+#ifdef INET6
+ if (keylen == 6)
+ error = fib6_lookup_nh_basic(ti->data,
+ (struct in6_addr *)key, 0, 0, 0, &nh6);
+#endif
+
+ if (error != 0)
+ return (0);
+
+ *val = 0;
+
+ return (1);
+}
+
+/* Parse 'fib=%d' */
+static int
+kfib_parse_opts(int *pfib, char *data)
+{
+ char *pdel, *pend, *s;
+ int fibnum;
+
+ if (data == NULL)
+ return (0);
+ if ((pdel = strchr(data, ' ')) == NULL)
+ return (0);
+ while (*pdel == ' ')
+ pdel++;
+ if (strncmp(pdel, "fib=", 4) != 0)
+ return (EINVAL);
+ if ((s = strchr(pdel, ' ')) != NULL)
+ *s++ = '\0';
+
+ pdel += 4;
+ /* Need \d+ */
+ fibnum = strtol(pdel, &pend, 10);
+ if (*pend != '\0')
+ return (EINVAL);
+
+ *pfib = fibnum;
+
+ return (0);
+}
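+
+/*
+ * Illustrative example (not part of the original source): for a table
+ * created with a config string such as "addr:kfib fib=2", the parser above
+ * skips the algorithm name, matches the "fib=" keyword and stores 2 in
+ * *pfib; a plain "addr:kfib" (no space) returns 0 and leaves the default
+ * fib number untouched.
+ */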
+
+static void
+ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf,
+ size_t bufsize)
+{
+
+ if (ti->data != 0)
+ snprintf(buf, bufsize, "%s fib=%lu", "addr:kfib", ti->data);
+ else
+ snprintf(buf, bufsize, "%s", "addr:kfib");
+}
+
+static int
+ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
+ char *data, uint8_t tflags)
+{
+ int error, fibnum;
+
+ fibnum = 0;
+ if ((error = kfib_parse_opts(&fibnum, data)) != 0)
+ return (error);
+
+ if (fibnum >= rt_numfibs)
+ return (E2BIG);
+
+ ti->data = fibnum;
+ ti->lookup = ta_lookup_kfib;
+
+ return (0);
+}
+
+/*
+ * Destroys table @ti
+ */
+static void
+ta_destroy_kfib(void *ta_state, struct table_info *ti)
+{
+
+}
+
+/*
+ * Provide algo-specific table info
+ */
+static void
+ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
+{
+
+ tinfo->flags = IPFW_TATFLAGS_AFDATA;
+ tinfo->taclass4 = IPFW_TACLASS_RADIX;
+ tinfo->count4 = 0;
+ tinfo->itemsize4 = sizeof(struct rtentry);
+ tinfo->taclass6 = IPFW_TACLASS_RADIX;
+ tinfo->count6 = 0;
+ tinfo->itemsize6 = sizeof(struct rtentry);
+}
+
+static int
+contigmask(uint8_t *p, int len)
+{
+ int i, n;
+
+ for (i = 0; i < len ; i++)
+ if ( (p[i/8] & (1 << (7 - (i%8)))) == 0) /* first bit unset */
+ break;
+ for (n= i + 1; n < len; n++)
+ if ( (p[n/8] & (1 << (7 - (n % 8)))) != 0)
+ return (-1); /* mask not contiguous */
+ return (i);
+}
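+
+/*
+ * Illustrative example (not part of the original source): for an IPv4
+ * netmask of 255.255.255.0 contigmask() returns 24, while a non-contiguous
+ * mask such as 255.0.255.0 yields -1, which the callers below convert to a
+ * prefix length of 0.
+ */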
+
+
+static int
+ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e,
+ ipfw_obj_tentry *tent)
+{
+ struct rtentry *rte;
+
+ rte = (struct rtentry *)e;
+
+ return ta_dump_kfib_tentry_int(rt_key(rte), rt_mask(rte), tent);
+}
+
+static int
+ta_dump_kfib_tentry_int(struct sockaddr *paddr, struct sockaddr *pmask,
+ ipfw_obj_tentry *tent)
+{
+#ifdef INET
+ struct sockaddr_in *addr, *mask;
+#endif
+#ifdef INET6
+ struct sockaddr_in6 *addr6, *mask6;
+#endif
+ int len;
+
+ len = 0;
+
+ /* Guess IPv4/IPv6 radix by sockaddr family */
+#ifdef INET
+ if (paddr->sa_family == AF_INET) {
+ addr = (struct sockaddr_in *)paddr;
+ mask = (struct sockaddr_in *)pmask;
+ tent->k.addr.s_addr = addr->sin_addr.s_addr;
+ len = 32;
+ if (mask != NULL)
+ len = contigmask((uint8_t *)&mask->sin_addr, 32);
+ if (len == -1)
+ len = 0;
+ tent->masklen = len;
+ tent->subtype = AF_INET;
+ tent->v.kidx = 0; /* Do we need to put GW here? */
+ }
+#endif
+#ifdef INET6
+ if (paddr->sa_family == AF_INET6) {
+ addr6 = (struct sockaddr_in6 *)paddr;
+ mask6 = (struct sockaddr_in6 *)pmask;
+ memcpy(&tent->k, &addr6->sin6_addr, sizeof(struct in6_addr));
+ len = 128;
+ if (mask6 != NULL)
+ len = contigmask((uint8_t *)&mask6->sin6_addr, 128);
+ if (len == -1)
+ len = 0;
+ tent->masklen = len;
+ tent->subtype = AF_INET6;
+ tent->v.kidx = 0;
+ }
+#endif
+
+ return (0);
+}
+
+static int
+ta_find_kfib_tentry(void *ta_state, struct table_info *ti,
+ ipfw_obj_tentry *tent)
+{
+ struct rt_addrinfo info;
+ struct sockaddr_in6 key6, dst6, mask6;
+ struct sockaddr *dst, *key, *mask;
+
+ /* Prepare sockaddr for prefix/mask and info */
+ bzero(&dst6, sizeof(dst6));
+ dst6.sin6_len = sizeof(dst6);
+ dst = (struct sockaddr *)&dst6;
+ bzero(&mask6, sizeof(mask6));
+ mask6.sin6_len = sizeof(mask6);
+ mask = (struct sockaddr *)&mask6;
+
+ bzero(&info, sizeof(info));
+ info.rti_info[RTAX_DST] = dst;
+ info.rti_info[RTAX_NETMASK] = mask;
+
+ /* Prepare the lookup key */
+ bzero(&key6, sizeof(key6));
+ key6.sin6_family = tent->subtype;
+ key = (struct sockaddr *)&key6;
+
+ if (tent->subtype == AF_INET) {
+ ((struct sockaddr_in *)&key6)->sin_addr = tent->k.addr;
+ key6.sin6_len = sizeof(struct sockaddr_in);
+ } else {
+ key6.sin6_addr = tent->k.addr6;
+ key6.sin6_len = sizeof(struct sockaddr_in6);
+ }
+
+ if (rib_lookup_info(ti->data, key, 0, 0, &info) != 0)
+ return (ENOENT);
+ if ((info.rti_addrs & RTA_NETMASK) == 0)
+ mask = NULL;
+
+ ta_dump_kfib_tentry_int(dst, mask, tent);
+
+ return (0);
+}
+
+static void
+ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f,
+ void *arg)
+{
+ struct rib_head *rh;
+ int error;
+
+ rh = rt_tables_get_rnh(ti->data, AF_INET);
+ if (rh != NULL) {
+ RIB_RLOCK(rh);
+ error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg);
+ RIB_RUNLOCK(rh);
+ }
+
+ rh = rt_tables_get_rnh(ti->data, AF_INET6);
+ if (rh != NULL) {
+ RIB_RLOCK(rh);
+ error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg);
+ RIB_RUNLOCK(rh);
+ }
+}
+
+struct table_algo addr_kfib = {
+ .name = "addr:kfib",
+ .type = IPFW_TABLE_ADDR,
+ .flags = TA_FLAG_READONLY,
+ .ta_buf_size = 0,
+ .init = ta_init_kfib,
+ .destroy = ta_destroy_kfib,
+ .foreach = ta_foreach_kfib,
+ .dump_tentry = ta_dump_kfib_tentry,
+ .find_tentry = ta_find_kfib_tentry,
+ .dump_tinfo = ta_dump_kfib_tinfo,
+ .print_config = ta_print_kfib_config,
+};
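+
+/*
+ * Note (not part of the original source): addr_kfib is registered with
+ * TA_FLAG_READONLY and provides no add/del callbacks, so its entries
+ * cannot be modified through the table API; lookups and dumps simply
+ * reflect the contents of the selected kernel FIB.
+ */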
+
+void
+ipfw_table_algo_init(struct ip_fw_chain *ch)
+{
+ size_t sz;
+
+ /*
+ * Register all algorithms presented here.
+ */
+ sz = sizeof(struct table_algo);
+ ipfw_add_table_algo(ch, &addr_radix, sz, &addr_radix.idx);
+ ipfw_add_table_algo(ch, &addr_hash, sz, &addr_hash.idx);
+ ipfw_add_table_algo(ch, &iface_idx, sz, &iface_idx.idx);
+ ipfw_add_table_algo(ch, &number_array, sz, &number_array.idx);
+ ipfw_add_table_algo(ch, &flow_hash, sz, &flow_hash.idx);
+ ipfw_add_table_algo(ch, &addr_kfib, sz, &addr_kfib.idx);
+}
+
+void
+ipfw_table_algo_destroy(struct ip_fw_chain *ch)
+{
+
+ ipfw_del_table_algo(ch, addr_radix.idx);
+ ipfw_del_table_algo(ch, addr_hash.idx);
+ ipfw_del_table_algo(ch, iface_idx.idx);
+ ipfw_del_table_algo(ch, number_array.idx);
+ ipfw_del_table_algo(ch, flow_hash.idx);
+ ipfw_del_table_algo(ch, addr_kfib.idx);
+}
+
+
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_table_value.c b/freebsd/sys/netpfil/ipfw/ip_fw_table_value.c
new file mode 100644
index 00000000..ef42e401
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_table_value.c
@@ -0,0 +1,810 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2014 Yandex LLC
+ * Copyright (c) 2014 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Multi-field value support for ipfw tables.
+ *
+ * This file contains necessary functions to convert
+ * large multi-field values into u32 indices suitable to be fed
+ * to various table algorithms. Other machinery, such as proper refcounting
+ * and internal structure resizing, is also kept here.
+ */
+
+#include <rtems/bsd/local/opt_ipfw.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/hash.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/queue.h>
+#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
+
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* struct ipfw_rule_ref */
+#include <netinet/ip_fw.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/ip_fw_table.h>
+
+static uint32_t hash_table_value(struct namedobj_instance *ni, const void *key,
+ uint32_t kopt);
+static int cmp_table_value(struct named_object *no, const void *key,
+ uint32_t kopt);
+
+static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd);
+
+static struct ipfw_sopt_handler scodes[] = {
+ { IP_FW_TABLE_VLIST, 0, HDIR_GET, list_table_values },
+};
+
+#define CHAIN_TO_VI(chain) (CHAIN_TO_TCFG(chain)->valhash)
+
+struct table_val_link
+{
+ struct named_object no;
+ struct table_value *pval; /* Pointer to real table value */
+};
+#define VALDATA_START_SIZE 64 /* Allocate 64-items array by default */
+
+struct vdump_args {
+ struct ip_fw_chain *ch;
+ struct sockopt_data *sd;
+ struct table_value *pval;
+ int error;
+};
+
+
+static uint32_t
+hash_table_value(struct namedobj_instance *ni, const void *key, uint32_t kopt)
+{
+
+ return (hash32_buf(key, 56, 0));
+}
+
+static int
+cmp_table_value(struct named_object *no, const void *key, uint32_t kopt)
+{
+
+ return (memcmp(((struct table_val_link *)no)->pval, key, 56));
+}
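+
+/*
+ * Note (not part of the original source): both callbacks above cover only
+ * the first 56 bytes of struct table_value, presumably the value fields
+ * themselves, so that trailing bookkeeping such as the reference count
+ * does not influence value lookups.
+ */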
+
+static void
+mask_table_value(struct table_value *src, struct table_value *dst,
+ uint32_t mask)
+{
+#define _MCPY(f, b) if ((mask & (b)) != 0) { dst->f = src->f; }
+
+ memset(dst, 0, sizeof(*dst));
+ _MCPY(tag, IPFW_VTYPE_TAG);
+ _MCPY(pipe, IPFW_VTYPE_PIPE);
+ _MCPY(divert, IPFW_VTYPE_DIVERT);
+ _MCPY(skipto, IPFW_VTYPE_SKIPTO);
+ _MCPY(netgraph, IPFW_VTYPE_NETGRAPH);
+ _MCPY(fib, IPFW_VTYPE_FIB);
+ _MCPY(nat, IPFW_VTYPE_NAT);
+ _MCPY(dscp, IPFW_VTYPE_DSCP);
+ _MCPY(nh4, IPFW_VTYPE_NH4);
+ _MCPY(nh6, IPFW_VTYPE_NH6);
+ _MCPY(zoneid, IPFW_VTYPE_NH6);
+#undef _MCPY
+}
+
+static void
+get_value_ptrs(struct ip_fw_chain *ch, struct table_config *tc, int vshared,
+ struct table_value **ptv, struct namedobj_instance **pvi)
+{
+ struct table_value *pval;
+ struct namedobj_instance *vi;
+
+ if (vshared != 0) {
+ pval = (struct table_value *)ch->valuestate;
+ vi = CHAIN_TO_VI(ch);
+ } else {
+ pval = NULL;
+ vi = NULL;
+ //pval = (struct table_value *)&tc->ti.data;
+ }
+
+ if (ptv != NULL)
+ *ptv = pval;
+ if (pvi != NULL)
+ *pvi = vi;
+}
+
+/*
+ * Update pointers to real values after @pval change.
+ */
+static int
+update_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
+{
+ struct vdump_args *da;
+ struct table_val_link *ptv;
+ struct table_value *pval;
+
+ da = (struct vdump_args *)arg;
+ ptv = (struct table_val_link *)no;
+
+ pval = da->pval;
+ ptv->pval = &pval[ptv->no.kidx];
+ ptv->no.name = (char *)&pval[ptv->no.kidx];
+ return (0);
+}
+
+/*
+ * Grows value storage shared among all tables.
+ * Drops/reacquires UH locks.
+ * Notifies other running adds on @ch shared storage resize.
+ * Note that the function does not guarantee that free space
+ * will be available after invocation, so the caller needs
+ * to retry the cycle itself.
+ *
+ * Returns 0 in case of no errors.
+ */
+static int
+resize_shared_value_storage(struct ip_fw_chain *ch)
+{
+ struct tables_config *tcfg;
+ struct namedobj_instance *vi;
+ struct table_value *pval, *valuestate, *old_valuestate;
+ void *new_idx;
+ struct vdump_args da;
+ int new_blocks;
+ int val_size, val_size_old;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ valuestate = NULL;
+ new_idx = NULL;
+
+ pval = (struct table_value *)ch->valuestate;
+ vi = CHAIN_TO_VI(ch);
+ tcfg = CHAIN_TO_TCFG(ch);
+
+ val_size = tcfg->val_size * 2;
+
+ if (val_size == (1 << 30))
+ return (ENOSPC);
+
+ IPFW_UH_WUNLOCK(ch);
+
+ valuestate = malloc(sizeof(struct table_value) * val_size, M_IPFW,
+ M_WAITOK | M_ZERO);
+ ipfw_objhash_bitmap_alloc(val_size, (void *)&new_idx,
+ &new_blocks);
+
+ IPFW_UH_WLOCK(ch);
+
+ /*
+ * Check if we still need to resize
+ */
+ if (tcfg->val_size >= val_size)
+ goto done;
+
+ /* Update pointers and notify everyone we're changing @ch */
+ pval = (struct table_value *)ch->valuestate;
+ rollback_toperation_state(ch, ch);
+
+ /* Good. Let's merge */
+ memcpy(valuestate, pval, sizeof(struct table_value) * tcfg->val_size);
+ ipfw_objhash_bitmap_merge(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
+
+ IPFW_WLOCK(ch);
+ /* Change pointers */
+ old_valuestate = ch->valuestate;
+ ch->valuestate = valuestate;
+ valuestate = old_valuestate;
+ ipfw_objhash_bitmap_swap(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
+
+ val_size_old = tcfg->val_size;
+ tcfg->val_size = val_size;
+ val_size = val_size_old;
+ IPFW_WUNLOCK(ch);
+ /* Update pointers to reflect resize */
+ memset(&da, 0, sizeof(da));
+ da.pval = (struct table_value *)ch->valuestate;
+ ipfw_objhash_foreach(vi, update_tvalue, &da);
+
+done:
+ free(valuestate, M_IPFW);
+ ipfw_objhash_bitmap_free(new_idx, new_blocks);
+
+ return (0);
+}
+
+/*
+ * Drops reference for table value with index @kidx, stored in @pval and
+ * @vi. Frees value if it has no references.
+ */
+static void
+unref_table_value(struct namedobj_instance *vi, struct table_value *pval,
+ uint32_t kidx)
+{
+ struct table_val_link *ptvl;
+
+ KASSERT(pval[kidx].refcnt > 0, ("Refcount is 0 on kidx %d", kidx));
+ if (--pval[kidx].refcnt > 0)
+ return;
+
+ /* Last reference, delete item */
+ ptvl = (struct table_val_link *)ipfw_objhash_lookup_kidx(vi, kidx);
+ KASSERT(ptvl != NULL, ("lookup on value kidx %d failed", kidx));
+ ipfw_objhash_del(vi, &ptvl->no);
+ ipfw_objhash_free_idx(vi, kidx);
+ free(ptvl, M_IPFW);
+}
+
+struct flush_args {
+ struct ip_fw_chain *ch;
+ struct table_algo *ta;
+ struct table_info *ti;
+ void *astate;
+ ipfw_obj_tentry tent;
+};
+
+static int
+unref_table_value_cb(void *e, void *arg)
+{
+ struct flush_args *fa;
+ struct ip_fw_chain *ch;
+ struct table_algo *ta;
+ ipfw_obj_tentry *tent;
+ int error;
+
+ fa = (struct flush_args *)arg;
+
+ ta = fa->ta;
+ memset(&fa->tent, 0, sizeof(fa->tent));
+ tent = &fa->tent;
+ error = ta->dump_tentry(fa->astate, fa->ti, e, tent);
+ if (error != 0)
+ return (error);
+
+ ch = fa->ch;
+
+ unref_table_value(CHAIN_TO_VI(ch),
+ (struct table_value *)ch->valuestate, tent->v.kidx);
+
+ return (0);
+}
+
+/*
+ * Drop references for each value used in @tc.
+ */
+void
+ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc,
+ struct table_algo *ta, void *astate, struct table_info *ti)
+{
+ struct flush_args fa;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ memset(&fa, 0, sizeof(fa));
+ fa.ch = ch;
+ fa.ta = ta;
+ fa.astate = astate;
+ fa.ti = ti;
+
+ ta->foreach(astate, ti, unref_table_value_cb, &fa);
+}
+
+/*
+ * Table operation state handler.
+ * Called when we are going to change something in @tc which
+ * may lead to inconsistencies in an ongoing table data addition.
+ *
+ * Here we roll back all already committed state (currently, table values)
+ * and set the "modified" field to a non-zero value to indicate
+ * that we need to restart the original operation.
+ */
+void
+rollback_table_values(struct tableop_state *ts)
+{
+ struct ip_fw_chain *ch;
+ struct table_value *pval;
+ struct tentry_info *ptei;
+ struct namedobj_instance *vi;
+ int i;
+
+ ch = ts->ch;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ /* Get current table value pointer */
+ get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
+
+ for (i = 0; i < ts->count; i++) {
+ ptei = &ts->tei[i];
+
+ if (ptei->value == 0)
+ continue;
+
+ unref_table_value(vi, pval, ptei->value);
+ }
+}
+
+/*
+ * Allocate new value index in either shared or per-table array.
+ * Function may drop/reacquire UH lock.
+ *
+ * Returns 0 on success.
+ */
+static int
+alloc_table_vidx(struct ip_fw_chain *ch, struct tableop_state *ts,
+ struct namedobj_instance *vi, uint16_t *pvidx)
+{
+ int error, vlimit;
+ uint16_t vidx;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ error = ipfw_objhash_alloc_idx(vi, &vidx);
+ if (error != 0) {
+
+ /*
+ * We need to resize array. This involves
+ * lock/unlock, so we need to check "modified"
+ * state.
+ */
+ ts->opstate.func(ts->tc, &ts->opstate);
+ error = resize_shared_value_storage(ch);
+ return (error); /* ts->modified should be set, we will restart */
+ }
+
+ vlimit = ts->ta->vlimit;
+ if (vlimit != 0 && vidx >= vlimit) {
+
+ /*
+ * Algorithm is not able to store given index.
+ * We have to rollback state, start using
+ * per-table value array or return error
+ * if we're already using it.
+ *
+ * TODO: do not rollback state if
+ * atomicity is not required.
+ */
+ if (ts->vshared != 0) {
+ /* shared -> per-table */
+ return (ENOSPC); /* TODO: proper error */
+ }
+
+ /* per-table. Fail for now. */
+ return (ENOSPC); /* TODO: proper error */
+ }
+
+ *pvidx = vidx;
+ return (0);
+}
+
+/*
+ * Drops value reference for unused values (updates, deletes, partially
+ * successful adds or rollbacks).
+ */
+void
+ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc,
+ struct tentry_info *tei, uint32_t count, int rollback)
+{
+ int i;
+ struct tentry_info *ptei;
+ struct table_value *pval;
+ struct namedobj_instance *vi;
+
+ /*
+ * We have two slightly different ADD cases here:
+ *
+ * (1) we are successful or partially successful; in that case we need
+ *     * to ignore the values of ADDED entries
+ *     * to roll back every other value (either UPDATED, since the old
+ *       value has been stored there, or some failure such as EXISTS,
+ *       LIMIT, or the simply "ignored" case).
+ *
+ * (2) atomic rollback of a partially successful operation;
+ *     in that case we simply need to unref all entries.
+ *
+ * The DELETE case is simpler: there is no atomic support there, so
+ * we simply unref all non-zero values.
+ */
+
+ /*
+ * Get current table value pointers.
+ * XXX: Properly read vshared
+ */
+ get_value_ptrs(ch, tc, 1, &pval, &vi);
+
+ for (i = 0; i < count; i++) {
+ ptei = &tei[i];
+
+ if (ptei->value == 0) {
+
+ /*
+ * We may be deleting non-existing record.
+ * Skip.
+ */
+ continue;
+ }
+
+ if ((ptei->flags & TEI_FLAGS_ADDED) != 0 && rollback == 0) {
+ ptei->value = 0;
+ continue;
+ }
+
+ unref_table_value(vi, pval, ptei->value);
+ ptei->value = 0;
+ }
+}
+
+/*
+ * Main function used to link values of entries going to be added
+ * to the index. Since we may perform many UH lock drops/acquires,
+ * handle changes by checking the tablestate "modified" field.
+ *
+ * Success: return 0.
+ */
+int
+ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts)
+{
+ int error, i, found;
+ struct namedobj_instance *vi;
+ struct table_config *tc;
+ struct tentry_info *tei, *ptei;
+ uint32_t count, vlimit;
+ uint16_t vidx;
+ struct table_val_link *ptv;
+ struct table_value tval, *pval;
+
+ /*
+ * Stage 1: reference all existing values and
+ * save their indices.
+ */
+ IPFW_UH_WLOCK_ASSERT(ch);
+ get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
+
+ error = 0;
+ found = 0;
+ vlimit = ts->ta->vlimit;
+ vidx = 0;
+ tc = ts->tc;
+ tei = ts->tei;
+ count = ts->count;
+ for (i = 0; i < count; i++) {
+ ptei = &tei[i];
+ ptei->value = 0; /* Ensure value is always 0 in the beginning */
+ mask_table_value(ptei->pvalue, &tval, ts->vmask);
+ ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
+ (char *)&tval);
+ if (ptv == NULL)
+ continue;
+ /* Deal with vlimit later */
+ if (vlimit > 0 && vlimit <= ptv->no.kidx)
+ continue;
+
+ /* Value found. Bump refcount */
+ ptv->pval->refcnt++;
+ ptei->value = ptv->no.kidx;
+ found++;
+ }
+
+ if (ts->count == found) {
+ /* We've found all values, no need to create new ones */
+ return (0);
+ }
+
+ /*
+ * We have added some state here; let's attach the operation
+ * state to the list to be able to roll back if necessary.
+ */
+ add_toperation_state(ch, ts);
+ /* Ensure table won't disappear */
+ tc_ref(tc);
+ IPFW_UH_WUNLOCK(ch);
+
+ /*
+ * Stage 2: allocate objects for non-existing values.
+ */
+ for (i = 0; i < count; i++) {
+ ptei = &tei[i];
+ if (ptei->value != 0)
+ continue;
+ if (ptei->ptv != NULL)
+ continue;
+ ptei->ptv = malloc(sizeof(struct table_val_link), M_IPFW,
+ M_WAITOK | M_ZERO);
+ }
+
+ /*
+ * Stage 3: allocate index numbers for new values
+ * and link them to index.
+ */
+ IPFW_UH_WLOCK(ch);
+ tc_unref(tc);
+ del_toperation_state(ch, ts);
+ if (ts->modified != 0) {
+
+ /*
+ * In general, we should free all state/indexes here
+ * and return. However, we keep allocated state instead
+ * to ensure we achieve some progress on each restart.
+ */
+ return (0);
+ }
+
+ KASSERT(pval == ch->valuestate, ("resize_storage() notify failure"));
+
+ /* Let's try to link values */
+ for (i = 0; i < count; i++) {
+ ptei = &tei[i];
+
+ /* Check if record has appeared */
+ mask_table_value(ptei->pvalue, &tval, ts->vmask);
+ ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
+ (char *)&tval);
+ if (ptv != NULL) {
+ ptv->pval->refcnt++;
+ ptei->value = ptv->no.kidx;
+ continue;
+ }
+
+ /* May perform UH unlock/lock */
+ error = alloc_table_vidx(ch, ts, vi, &vidx);
+ if (error != 0) {
+ ts->opstate.func(ts->tc, &ts->opstate);
+ return (error);
+ }
+ /* value storage resize has happened, return */
+ if (ts->modified != 0)
+ return (0);
+
+ /* Finally, we have allocated valid index, let's add entry */
+ ptei->value = vidx;
+ ptv = (struct table_val_link *)ptei->ptv;
+ ptei->ptv = NULL;
+
+ ptv->no.kidx = vidx;
+ ptv->no.name = (char *)&pval[vidx];
+ ptv->pval = &pval[vidx];
+ memcpy(ptv->pval, &tval, sizeof(struct table_value));
+ pval[vidx].refcnt = 1;
+ ipfw_objhash_add(vi, &ptv->no);
+ }
+
+ return (0);
+}
+
+/*
+ * Compatibility function used to import data from old
+ * IP_FW_TABLE_ADD / IP_FW_TABLE_XADD opcodes.
+ */
+void
+ipfw_import_table_value_legacy(uint32_t value, struct table_value *v)
+{
+
+ memset(v, 0, sizeof(*v));
+ v->tag = value;
+ v->pipe = value;
+ v->divert = value;
+ v->skipto = value;
+ v->netgraph = value;
+ v->fib = value;
+ v->nat = value;
+ v->nh4 = value; /* host format */
+ v->dscp = value;
+ v->limit = value;
+}
+
+/*
+ * Export data to legacy table dumps opcodes.
+ */
+uint32_t
+ipfw_export_table_value_legacy(struct table_value *v)
+{
+
+ /*
+ * TODO: provide more compatibility depending on
+ * vmask value.
+ */
+ return (v->tag);
+}
+
+/*
+ * Imports table value from current userland format.
+ * Saves value in kernel format to the same place.
+ */
+void
+ipfw_import_table_value_v1(ipfw_table_value *iv)
+{
+ struct table_value v;
+
+ memset(&v, 0, sizeof(v));
+ v.tag = iv->tag;
+ v.pipe = iv->pipe;
+ v.divert = iv->divert;
+ v.skipto = iv->skipto;
+ v.netgraph = iv->netgraph;
+ v.fib = iv->fib;
+ v.nat = iv->nat;
+ v.dscp = iv->dscp;
+ v.nh4 = iv->nh4;
+ v.nh6 = iv->nh6;
+ v.limit = iv->limit;
+ v.zoneid = iv->zoneid;
+
+ memcpy(iv, &v, sizeof(ipfw_table_value));
+}
+
+/*
+ * Export real table value @v to current userland format.
+ * Note that @v and @piv may point to the same memory.
+ */
+void
+ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *piv)
+{
+ ipfw_table_value iv;
+
+ memset(&iv, 0, sizeof(iv));
+ iv.tag = v->tag;
+ iv.pipe = v->pipe;
+ iv.divert = v->divert;
+ iv.skipto = v->skipto;
+ iv.netgraph = v->netgraph;
+ iv.fib = v->fib;
+ iv.nat = v->nat;
+ iv.dscp = v->dscp;
+ iv.limit = v->limit;
+ iv.nh4 = v->nh4;
+ iv.nh6 = v->nh6;
+ iv.zoneid = v->zoneid;
+
+ memcpy(piv, &iv, sizeof(iv));
+}
+
+/*
+ * Exports real value data into ipfw_table_value structure.
+ * Utilizes "spare1" field to store kernel index.
+ */
+static int
+dump_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
+{
+ struct vdump_args *da;
+ struct table_val_link *ptv;
+ struct table_value *v;
+
+ da = (struct vdump_args *)arg;
+ ptv = (struct table_val_link *)no;
+
+ v = (struct table_value *)ipfw_get_sopt_space(da->sd, sizeof(*v));
+ /* Out of memory, returning */
+ if (v == NULL) {
+ da->error = ENOMEM;
+ return (ENOMEM);
+ }
+
+ memcpy(v, ptv->pval, sizeof(*v));
+ v->spare1 = ptv->no.kidx;
+ return (0);
+}
+
+/*
+ * Dumps all shared/table value data
+ * Data layout (v1)(current):
+ * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
+ * Reply: [ ipfw_obj_lheader ipfw_table_value x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ struct _ipfw_obj_lheader *olh;
+ struct namedobj_instance *vi;
+ struct vdump_args da;
+ uint32_t count, size;
+
+ olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
+ if (olh == NULL)
+ return (EINVAL);
+ if (sd->valsize < olh->size)
+ return (EINVAL);
+
+ IPFW_UH_RLOCK(ch);
+ vi = CHAIN_TO_VI(ch);
+
+ count = ipfw_objhash_count(vi);
+ size = count * sizeof(ipfw_table_value) + sizeof(ipfw_obj_lheader);
+
+ /* Fill in header regardless of buffer size */
+ olh->count = count;
+ olh->objsize = sizeof(ipfw_table_value);
+
+ if (size > olh->size) {
+ olh->size = size;
+ IPFW_UH_RUNLOCK(ch);
+ return (ENOMEM);
+ }
+ olh->size = size;
+
+ /*
+ * Do the actual value dump
+ */
+ memset(&da, 0, sizeof(da));
+ da.ch = ch;
+ da.sd = sd;
+ ipfw_objhash_foreach(vi, dump_tvalue, &da);
+
+ IPFW_UH_RUNLOCK(ch);
+
+ return (0);
+}
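+
+/*
+ * Illustrative note (not part of the original source): a userland consumer
+ * would typically issue this request twice: a first call with a minimal
+ * buffer only learns the required space via olh->size (the ENOMEM path
+ * above), and a second call with a buffer of at least that size receives
+ * the actual ipfw_table_value array.
+ */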
+
+void
+ipfw_table_value_init(struct ip_fw_chain *ch, int first)
+{
+ struct tables_config *tcfg;
+
+ ch->valuestate = malloc(VALDATA_START_SIZE * sizeof(struct table_value),
+ M_IPFW, M_WAITOK | M_ZERO);
+
+ tcfg = ch->tblcfg;
+
+ tcfg->val_size = VALDATA_START_SIZE;
+ tcfg->valhash = ipfw_objhash_create(tcfg->val_size);
+ ipfw_objhash_set_funcs(tcfg->valhash, hash_table_value,
+ cmp_table_value);
+
+ IPFW_ADD_SOPT_HANDLER(first, scodes);
+}
+
+static int
+destroy_value(struct namedobj_instance *ni, struct named_object *no,
+ void *arg)
+{
+
+ free(no, M_IPFW);
+ return (0);
+}
+
+void
+ipfw_table_value_destroy(struct ip_fw_chain *ch, int last)
+{
+
+ IPFW_DEL_SOPT_HANDLER(last, scodes);
+
+ free(ch->valuestate, M_IPFW);
+ ipfw_objhash_foreach(CHAIN_TO_VI(ch), destroy_value, ch);
+ ipfw_objhash_destroy(CHAIN_TO_VI(ch));
+}
+
diff --git a/freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.c b/freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.c
new file mode 100644
index 00000000..03ca9599
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.c
@@ -0,0 +1,131 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2015-2016 Yandex LLC
+ * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/nat64/ip_fw_nat64.h>
+#include <netpfil/ipfw/nat64/nat64_translate.h>
+
+
+int nat64_debug = 0;
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, nat64_debug, CTLFLAG_RW,
+ &nat64_debug, 0, "Debug level for NAT64 module");
+
+int nat64_allow_private = 0;
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, nat64_allow_private, CTLFLAG_RW,
+ &nat64_allow_private, 0,
+ "Allow use of non-global IPv4 addresses with NAT64");
+
+static int
+vnet_ipfw_nat64_init(const void *arg __unused)
+{
+ struct ip_fw_chain *ch;
+ int first, error;
+
+ ch = &V_layer3_chain;
+ first = IS_DEFAULT_VNET(curvnet) ? 1: 0;
+ error = nat64stl_init(ch, first);
+ if (error != 0)
+ return (error);
+ error = nat64lsn_init(ch, first);
+ if (error != 0) {
+ nat64stl_uninit(ch, first);
+ return (error);
+ }
+ return (0);
+}
+
+static int
+vnet_ipfw_nat64_uninit(const void *arg __unused)
+{
+ struct ip_fw_chain *ch;
+ int last;
+
+ ch = &V_layer3_chain;
+ last = IS_DEFAULT_VNET(curvnet) ? 1: 0;
+ nat64stl_uninit(ch, last);
+ nat64lsn_uninit(ch, last);
+ return (0);
+}
+
+static int
+ipfw_nat64_modevent(module_t mod, int type, void *unused)
+{
+
+ switch (type) {
+ case MOD_LOAD:
+ case MOD_UNLOAD:
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+ return (0);
+}
+
+static moduledata_t ipfw_nat64_mod = {
+ "ipfw_nat64",
+ ipfw_nat64_modevent,
+ 0
+};
+
+/* Define startup order. */
+#define IPFW_NAT64_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN
+#define IPFW_NAT64_MODEVENT_ORDER (SI_ORDER_ANY - 128) /* after ipfw */
+#define IPFW_NAT64_MODULE_ORDER (IPFW_NAT64_MODEVENT_ORDER + 1)
+#define IPFW_NAT64_VNET_ORDER (IPFW_NAT64_MODEVENT_ORDER + 2)
+
+DECLARE_MODULE(ipfw_nat64, ipfw_nat64_mod, IPFW_NAT64_SI_SUB_FIREWALL,
+ SI_ORDER_ANY);
+MODULE_DEPEND(ipfw_nat64, ipfw, 3, 3, 3);
+MODULE_VERSION(ipfw_nat64, 1);
+
+VNET_SYSINIT(vnet_ipfw_nat64_init, IPFW_NAT64_SI_SUB_FIREWALL,
+ IPFW_NAT64_VNET_ORDER, vnet_ipfw_nat64_init, NULL);
+VNET_SYSUNINIT(vnet_ipfw_nat64_uninit, IPFW_NAT64_SI_SUB_FIREWALL,
+ IPFW_NAT64_VNET_ORDER, vnet_ipfw_nat64_uninit, NULL);
diff --git a/freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.h b/freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.h
new file mode 100644
index 00000000..1d2bb774
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.h
@@ -0,0 +1,117 @@
+/*-
+ * Copyright (c) 2015-2016 Yandex LLC
+ * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_FW_NAT64_H_
+#define _IP_FW_NAT64_H_
+
+#define DPRINTF(mask, fmt, ...) \
+ if (nat64_debug & (mask)) \
+ printf("NAT64: %s: " fmt "\n", __func__, ## __VA_ARGS__)
+#define DP_GENERIC 0x0001
+#define DP_OBJ 0x0002
+#define DP_JQUEUE 0x0004
+#define DP_STATE 0x0008
+#define DP_DROPS 0x0010
+#define DP_ALL 0xFFFF
+extern int nat64_debug;
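+
+/*
+ * Illustrative note (not part of the original source): DPRINTF() expands to
+ * a bare "if" statement guarded by the net.inet.ip.fw.nat64_debug sysctl,
+ * e.g. DPRINTF(DP_DROPS, "dropped %u packets", n) prints only when the
+ * DP_DROPS bit is set. Since the macro is not wrapped in do { } while (0),
+ * callers should avoid using it as the sole body of an if/else branch.
+ */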
+
+#if 0
+#define NAT64NOINLINE __noinline
+#else
+#define NAT64NOINLINE
+#endif
+
+int nat64stl_init(struct ip_fw_chain *ch, int first);
+void nat64stl_uninit(struct ip_fw_chain *ch, int last);
+int nat64lsn_init(struct ip_fw_chain *ch, int first);
+void nat64lsn_uninit(struct ip_fw_chain *ch, int last);
+
+struct ip_fw_nat64_stats {
+ counter_u64_t opcnt64; /* number of packets translated 6to4 */
+ counter_u64_t opcnt46; /* number of packets translated 4to6 */
+ counter_u64_t ofrags; /* number of fragments generated */
+ counter_u64_t ifrags; /* number of fragments received */
+ counter_u64_t oerrors; /* number of output errors */
+ counter_u64_t noroute4;
+ counter_u64_t noroute6;
+ counter_u64_t nomatch4; /* No addr/port match */
+ counter_u64_t noproto; /* Protocol not supported */
+ counter_u64_t nomem; /* mbufs allocation failed */
+ counter_u64_t dropped; /* number of packets silently
+ * dropped due to some errors/
+ * unsupported/etc.
+ */
+
+ counter_u64_t jrequests; /* number of jobs requests queued */
+ counter_u64_t jcalls; /* number of jobs handler calls */
+ counter_u64_t jhostsreq; /* number of hosts requests */
+ counter_u64_t jportreq;
+ counter_u64_t jhostfails;
+ counter_u64_t jportfails;
+ counter_u64_t jmaxlen;
+ counter_u64_t jnomem;
+ counter_u64_t jreinjected;
+
+ counter_u64_t screated;
+ counter_u64_t sdeleted;
+ counter_u64_t spgcreated;
+ counter_u64_t spgdeleted;
+};
+
+#define IPFW_NAT64_VERSION 1
+#define NAT64STATS (sizeof(struct ip_fw_nat64_stats) / sizeof(uint64_t))
+typedef struct _nat64_stats_block {
+ counter_u64_t stats[NAT64STATS];
+} nat64_stats_block;
+#define NAT64STAT_ADD(s, f, v) \
+ counter_u64_add((s)->stats[ \
+ offsetof(struct ip_fw_nat64_stats, f) / sizeof(uint64_t)], (v))
+#define NAT64STAT_INC(s, f) NAT64STAT_ADD(s, f, 1)
+#define NAT64STAT_FETCH(s, f) \
+ counter_u64_fetch((s)->stats[ \
+ offsetof(struct ip_fw_nat64_stats, f) / sizeof(uint64_t)])
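+
+/*
+ * Illustrative note (not part of the original source): these macros map a
+ * field name of struct ip_fw_nat64_stats to a slot in the counter(9)
+ * array, e.g. NAT64STAT_INC(s, dropped) increments the per-CPU counter
+ * corresponding to the "dropped" field.
+ */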
+
+#define L3HDR(_ip, _t) ((_t)((u_int32_t *)(_ip) + (_ip)->ip_hl))
+#define TCP(p) ((struct tcphdr *)(p))
+#define UDP(p) ((struct udphdr *)(p))
+#define ICMP(p) ((struct icmphdr *)(p))
+#define ICMP6(p) ((struct icmp6_hdr *)(p))
+
+#define NAT64SKIP 0
+#define NAT64RETURN 1
+#define NAT64MFREE -1
+
+/* Well-known prefix 64:ff9b::/96 */
+#define IPV6_ADDR_INT32_WKPFX htonl(0x64ff9b)
+#define IN6_IS_ADDR_WKPFX(a) \
+ ((a)->s6_addr32[0] == IPV6_ADDR_INT32_WKPFX && \
+ (a)->s6_addr32[1] == 0 && (a)->s6_addr32[2] == 0)
+
+#endif
+
diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64_translate.c b/freebsd/sys/netpfil/ipfw/nat64/nat64_translate.c
new file mode 100644
index 00000000..d2507674
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nat64/nat64_translate.c
@@ -0,0 +1,1574 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2015-2016 Yandex LLC
+ * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rtems/bsd/local/opt_ipfw.h>
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <rtems/bsd/sys/errno.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/queue.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_pflog.h>
+#include <net/pfil.h>
+#include <net/netisr.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <netinet6/in6_var.h>
+#include <netinet6/ip6_var.h>
+
+#include <netpfil/pf/pf.h>
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/nat64/ip_fw_nat64.h>
+#include <netpfil/ipfw/nat64/nat64_translate.h>
+#include <machine/in_cksum.h>
+
+static void
+nat64_log(struct pfloghdr *logdata, struct mbuf *m, sa_family_t family)
+{
+
+ logdata->dir = PF_OUT;
+ logdata->af = family;
+ ipfw_bpf_mtap2(logdata, PFLOG_HDRLEN, m);
+}
+#ifdef IPFIREWALL_NAT64_DIRECT_OUTPUT
+static NAT64NOINLINE struct sockaddr* nat64_find_route4(struct route *ro,
+ in_addr_t dest, struct mbuf *m);
+static NAT64NOINLINE struct sockaddr* nat64_find_route6(struct route_in6 *ro,
+ struct in6_addr *dest, struct mbuf *m);
+
+static NAT64NOINLINE int
+nat64_output(struct ifnet *ifp, struct mbuf *m,
+ struct sockaddr *dst, struct route *ro, nat64_stats_block *stats,
+ void *logdata)
+{
+ int error;
+
+ if (logdata != NULL)
+ nat64_log(logdata, m, dst->sa_family);
+ error = (*ifp->if_output)(ifp, m, dst, ro);
+ if (error != 0)
+ NAT64STAT_INC(stats, oerrors);
+ return (error);
+}
+
+static NAT64NOINLINE int
+nat64_output_one(struct mbuf *m, nat64_stats_block *stats, void *logdata)
+{
+ struct route_in6 ro6;
+ struct route ro4, *ro;
+ struct sockaddr *dst;
+ struct ifnet *ifp;
+ struct ip6_hdr *ip6;
+ struct ip *ip4;
+ int error;
+
+ ip4 = mtod(m, struct ip *);
+ switch (ip4->ip_v) {
+ case IPVERSION:
+ ro = &ro4;
+ dst = nat64_find_route4(&ro4, ip4->ip_dst.s_addr, m);
+ if (dst == NULL)
+ NAT64STAT_INC(stats, noroute4);
+ break;
+ case (IPV6_VERSION >> 4):
+ ip6 = (struct ip6_hdr *)ip4;
+ ro = (struct route *)&ro6;
+ dst = nat64_find_route6(&ro6, &ip6->ip6_dst, m);
+ if (dst == NULL)
+ NAT64STAT_INC(stats, noroute6);
+ break;
+ default:
+ m_freem(m);
+ NAT64STAT_INC(stats, dropped);
+ DPRINTF(DP_DROPS, "dropped due to unknown IP version");
+ return (EAFNOSUPPORT);
+ }
+ if (dst == NULL) {
+ FREE_ROUTE(ro);
+ m_freem(m);
+ return (EHOSTUNREACH);
+ }
+ if (logdata != NULL)
+ nat64_log(logdata, m, dst->sa_family);
+ ifp = ro->ro_rt->rt_ifp;
+ error = (*ifp->if_output)(ifp, m, dst, ro);
+ if (error != 0)
+ NAT64STAT_INC(stats, oerrors);
+ FREE_ROUTE(ro);
+ return (error);
+}
+#else /* !IPFIREWALL_NAT64_DIRECT_OUTPUT */
+static NAT64NOINLINE int
+nat64_output(struct ifnet *ifp, struct mbuf *m,
+ struct sockaddr *dst, struct route *ro, nat64_stats_block *stats,
+ void *logdata)
+{
+ struct ip *ip4;
+ int ret, af;
+
+ ip4 = mtod(m, struct ip *);
+ switch (ip4->ip_v) {
+ case IPVERSION:
+ af = AF_INET;
+ ret = NETISR_IP;
+ break;
+ case (IPV6_VERSION >> 4):
+ af = AF_INET6;
+ ret = NETISR_IPV6;
+ break;
+ default:
+ m_freem(m);
+ NAT64STAT_INC(stats, dropped);
+ DPRINTF(DP_DROPS, "unknown IP version");
+ return (EAFNOSUPPORT);
+ }
+ if (logdata != NULL)
+ nat64_log(logdata, m, af);
+ ret = netisr_queue(ret, m);
+ if (ret != 0)
+ NAT64STAT_INC(stats, oerrors);
+ return (ret);
+}
+
+static NAT64NOINLINE int
+nat64_output_one(struct mbuf *m, nat64_stats_block *stats, void *logdata)
+{
+
+ return (nat64_output(NULL, m, NULL, NULL, stats, logdata));
+}
+#endif /* !IPFIREWALL_NAT64_DIRECT_OUTPUT */
+
+
+#if 0
+void print_ipv6_header(struct ip6_hdr *ip6, char *buf, size_t bufsize);
+
+void
+print_ipv6_header(struct ip6_hdr *ip6, char *buf, size_t bufsize)
+{
+ char sbuf[INET6_ADDRSTRLEN], dbuf[INET6_ADDRSTRLEN];
+
+ inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
+ inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
+ snprintf(buf, bufsize, "%s -> %s %d", sbuf, dbuf, ip6->ip6_nxt);
+}
+
+
+static NAT64NOINLINE int
+nat64_embed_ip4(struct nat64_cfg *cfg, in_addr_t ia, struct in6_addr *ip6)
+{
+
+ /* assume the prefix is properly filled with zeros */
+ bcopy(&cfg->prefix, ip6, sizeof(*ip6));
+ switch (cfg->plen) {
+ case 32:
+ case 96:
+ ip6->s6_addr32[cfg->plen / 32] = ia;
+ break;
+ case 40:
+ case 48:
+ case 56:
+#if BYTE_ORDER == BIG_ENDIAN
+ ip6->s6_addr32[1] = cfg->prefix.s6_addr32[1] |
+ (ia >> (cfg->plen % 32));
+ ip6->s6_addr32[2] = ia << (24 - cfg->plen % 32);
+#elif BYTE_ORDER == LITTLE_ENDIAN
+ ip6->s6_addr32[1] = cfg->prefix.s6_addr32[1] |
+ (ia << (cfg->plen % 32));
+ ip6->s6_addr32[2] = ia >> (24 - cfg->plen % 32);
+#endif
+ break;
+ case 64:
+#if BYTE_ORDER == BIG_ENDIAN
+ ip6->s6_addr32[2] = ia >> 8;
+ ip6->s6_addr32[3] = ia << 24;
+#elif BYTE_ORDER == LITTLE_ENDIAN
+ ip6->s6_addr32[2] = ia << 8;
+ ip6->s6_addr32[3] = ia >> 24;
+#endif
+ break;
+ default:
+ return (0);
+ };
+ ip6->s6_addr8[8] = 0;
+ return (1);
+}
+
+static NAT64NOINLINE in_addr_t
+nat64_extract_ip4(struct in6_addr *ip6, int plen)
+{
+ in_addr_t ia;
+
+ /*
+ * According to RFC 6052 p2.2:
+ * IPv4-embedded IPv6 addresses are composed of a variable-length
+ * prefix, the embedded IPv4 address, and a variable length suffix.
+ * The suffix bits are reserved for future extensions and SHOULD
+ * be set to zero.
+ */
+ switch (plen) {
+ case 32:
+ if (ip6->s6_addr32[3] != 0 || ip6->s6_addr32[2] != 0)
+ goto badip6;
+ break;
+ case 40:
+ if (ip6->s6_addr32[3] != 0 ||
+ (ip6->s6_addr32[2] & htonl(0xff00ffff)) != 0)
+ goto badip6;
+ break;
+ case 48:
+ if (ip6->s6_addr32[3] != 0 ||
+ (ip6->s6_addr32[2] & htonl(0xff0000ff)) != 0)
+ goto badip6;
+ break;
+ case 56:
+ if (ip6->s6_addr32[3] != 0 || ip6->s6_addr8[8] != 0)
+ goto badip6;
+ break;
+ case 64:
+ if (ip6->s6_addr8[8] != 0 ||
+ (ip6->s6_addr32[3] & htonl(0x00ffffff)) != 0)
+ goto badip6;
+ };
+ switch (plen) {
+ case 32:
+ case 96:
+ ia = ip6->s6_addr32[plen / 32];
+ break;
+ case 40:
+ case 48:
+ case 56:
+#if BYTE_ORDER == BIG_ENDIAN
+ ia = (ip6->s6_addr32[1] << (plen % 32)) |
+ (ip6->s6_addr32[2] >> (24 - plen % 32));
+#elif BYTE_ORDER == LITTLE_ENDIAN
+ ia = (ip6->s6_addr32[1] >> (plen % 32)) |
+ (ip6->s6_addr32[2] << (24 - plen % 32));
+#endif
+ break;
+ case 64:
+#if BYTE_ORDER == BIG_ENDIAN
+ ia = (ip6->s6_addr32[2] << 8) | (ip6->s6_addr32[3] >> 24);
+#elif BYTE_ORDER == LITTLE_ENDIAN
+ ia = (ip6->s6_addr32[2] >> 8) | (ip6->s6_addr32[3] << 24);
+#endif
+ break;
+ default:
+ return (0);
+ };
+ if (nat64_check_ip4(ia) != 0 ||
+ nat64_check_private_ip4(ia) != 0)
+ goto badip4;
+
+ return (ia);
+badip4:
+ DPRINTF(DP_GENERIC, "invalid destination address: %08x", ia);
+ return (0);
+badip6:
+ DPRINTF(DP_GENERIC, "invalid IPv4-embedded IPv6 address");
+ return (0);
+}
+#endif
+
+/*
+ * According to RFC 1624 the equation for incremental checksum update is:
+ * HC' = ~(~HC + ~m + m') -- [Eqn. 3]
+ * HC' = HC - ~m - m' -- [Eqn. 4]
+ * So, when we are replacing IPv4 addresses with IPv6 ones, we
+ * can assume that the new bytes previously were zeros, and vice versa:
+ * when we replace IPv6 addresses with IPv4 ones, the now unused bytes become
+ * zeros. The payload length in the pseudo header has a bigger size, but one
+ * half of it should be zero. Using equation 4 we get:
+ * HC' = HC - (~m0 + m0') -- m0 is first changed word
+ * HC' = (HC - (~m0 + m0')) - (~m1 + m1') -- m1 is second changed word
+ * HC' = HC - ~m0 - m0' - ~m1 - m1' - ... =
+ * = HC - sum(~m[i] + m'[i])
+ *
+ * The function result should be used as follows:
+ * IPv6 to IPv4: HC' = cksum_add(HC, result)
+ * IPv4 to IPv6: HC' = cksum_add(HC, ~result)
+ */
+static NAT64NOINLINE uint16_t
+nat64_cksum_convert(struct ip6_hdr *ip6, struct ip *ip)
+{
+ uint32_t sum;
+ uint16_t *p;
+
+ sum = ~ip->ip_src.s_addr >> 16;
+ sum += ~ip->ip_src.s_addr & 0xffff;
+ sum += ~ip->ip_dst.s_addr >> 16;
+ sum += ~ip->ip_dst.s_addr & 0xffff;
+
+ for (p = (uint16_t *)&ip6->ip6_src;
+ p < (uint16_t *)(&ip6->ip6_src + 2); p++)
+ sum += *p;
+
+ while (sum >> 16)
+ sum = (sum & 0xffff) + (sum >> 16);
+ return (sum);
+}
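+
+/*
+ * Hypothetical caller sketch (not part of the original source), assuming
+ * the cksum_add() helper declared in nat64_translate.h: when translating a
+ * TCP segment, the transport checksum could be patched incrementally
+ * instead of being recomputed from scratch:
+ *
+ *   uint16_t csum = nat64_cksum_convert(ip6, ip);
+ *
+ *   th->th_sum = cksum_add(th->th_sum, csum);   (IPv6 to IPv4)
+ *   th->th_sum = cksum_add(th->th_sum, ~csum);  (IPv4 to IPv6)
+ */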
+
+#if __FreeBSD_version < 1100000
+#define ip_fillid(ip) (ip)->ip_id = ip_newid()
+#endif
+static NAT64NOINLINE void
+nat64_init_ip4hdr(const struct ip6_hdr *ip6, const struct ip6_frag *frag,
+ uint16_t plen, uint8_t proto, struct ip *ip)
+{
+
+ /* assume addresses are already initialized */
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = sizeof(*ip) >> 2;
+ ip->ip_tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ ip->ip_len = htons(sizeof(*ip) + plen);
+#ifdef IPFIREWALL_NAT64_DIRECT_OUTPUT
+ ip->ip_ttl = ip6->ip6_hlim - IPV6_HLIMDEC;
+#else
+ /* Forwarding code will decrement TTL. */
+ ip->ip_ttl = ip6->ip6_hlim;
+#endif
+ ip->ip_sum = 0;
+ ip->ip_p = (proto == IPPROTO_ICMPV6) ? IPPROTO_ICMP: proto;
+ ip_fillid(ip);
+ if (frag != NULL) {
+ ip->ip_off = htons(ntohs(frag->ip6f_offlg) >> 3);
+ if (frag->ip6f_offlg & IP6F_MORE_FRAG)
+ ip->ip_off |= htons(IP_MF);
+ } else {
+ ip->ip_off = htons(IP_DF);
+ }
+ ip->ip_sum = in_cksum_hdr(ip);
+}
+
+#define FRAGSZ(mtu) ((mtu) - sizeof(struct ip6_hdr) - sizeof(struct ip6_frag))
+static NAT64NOINLINE int
+nat64_fragment6(nat64_stats_block *stats, struct ip6_hdr *ip6, struct mbufq *mq,
+ struct mbuf *m, uint32_t mtu, uint16_t ip_id, uint16_t ip_off)
+{
+ struct ip6_frag ip6f;
+ struct mbuf *n;
+ uint16_t hlen, len, offset;
+ int plen;
+
+ plen = ntohs(ip6->ip6_plen);
+ hlen = sizeof(struct ip6_hdr);
+
+ /* Fragmentation isn't needed */
+ if (ip_off == 0 && plen <= mtu - hlen) {
+ M_PREPEND(m, hlen, M_NOWAIT);
+ if (m == NULL) {
+ NAT64STAT_INC(stats, nomem);
+ return (ENOMEM);
+ }
+ bcopy(ip6, mtod(m, void *), hlen);
+ if (mbufq_enqueue(mq, m) != 0) {
+ m_freem(m);
+ NAT64STAT_INC(stats, dropped);
+ DPRINTF(DP_DROPS, "dropped due to mbufq overflow");
+ return (ENOBUFS);
+ }
+ return (0);
+ }
+
+ hlen += sizeof(struct ip6_frag);
+ ip6f.ip6f_reserved = 0;
+ ip6f.ip6f_nxt = ip6->ip6_nxt;
+ ip6->ip6_nxt = IPPROTO_FRAGMENT;
+ if (ip_off != 0) {
+ /*
+ * We have got an IPv4 fragment.
+ * Use offset value and ip_id from original fragment.
+ */
+ ip6f.ip6f_ident = htonl(ntohs(ip_id));
+ offset = (ntohs(ip_off) & IP_OFFMASK) << 3;
+ NAT64STAT_INC(stats, ifrags);
+ } else {
+ /* The packet size exceeds interface MTU */
+ ip6f.ip6f_ident = htonl(ip6_randomid());
+ offset = 0; /* First fragment */
+ }
+ while (plen > 0 && m != NULL) {
+ n = NULL;
+ len = FRAGSZ(mtu) & ~7;
+ if (len > plen)
+ len = plen;
+ ip6->ip6_plen = htons(len + sizeof(ip6f));
+ ip6f.ip6f_offlg = ntohs(offset);
+ if (len < plen || (ip_off & htons(IP_MF)) != 0)
+ ip6f.ip6f_offlg |= IP6F_MORE_FRAG;
+ offset += len;
+ plen -= len;
+ if (plen > 0) {
+ n = m_split(m, len, M_NOWAIT);
+ if (n == NULL)
+ goto fail;
+ }
+ M_PREPEND(m, hlen, M_NOWAIT);
+ if (m == NULL)
+ goto fail;
+ bcopy(ip6, mtod(m, void *), sizeof(struct ip6_hdr));
+ bcopy(&ip6f, mtodo(m, sizeof(struct ip6_hdr)),
+ sizeof(struct ip6_frag));
+ if (mbufq_enqueue(mq, m) != 0)
+ goto fail;
+ m = n;
+ }
+ NAT64STAT_ADD(stats, ofrags, mbufq_len(mq));
+ return (0);
+fail:
+ if (m != NULL)
+ m_freem(m);
+ if (n != NULL)
+ m_freem(n);
+ mbufq_drain(mq);
+ NAT64STAT_INC(stats, nomem);
+ return (ENOMEM);
+}
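The loop above slices the IPv6 payload into chunks of FRAGSZ(mtu) bytes rounded down to a multiple of 8 octets, as fragment offsets require. A minimal userland sketch of the same size arithmetic (the MTU and payload length are illustrative numbers, not taken from the patch):

    #include <stdio.h>

    #define IP6_HDR_LEN   40              /* sizeof(struct ip6_hdr) */
    #define IP6_FRAG_LEN  8               /* sizeof(struct ip6_frag) */
    #define FRAGSZ(mtu)   ((mtu) - IP6_HDR_LEN - IP6_FRAG_LEN)

    int
    main(void)
    {
        int mtu = 1500, plen = 3000;      /* illustrative values */
        int offset = 0, len;

        while (plen > 0) {
            len = FRAGSZ(mtu) & ~7;       /* multiple of 8 octets */
            if (len > plen)
                len = plen;               /* last (possibly short) fragment */
            printf("fragment: offset %d, payload %d, MF %d\n",
                offset, len, plen - len > 0);
            offset += len;
            plen -= len;
        }
        return (0);
    }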
+
+#if __FreeBSD_version < 1100000
+#define rt_expire rt_rmx.rmx_expire
+#define rt_mtu rt_rmx.rmx_mtu
+#endif
+static NAT64NOINLINE struct sockaddr*
+nat64_find_route6(struct route_in6 *ro, struct in6_addr *dest, struct mbuf *m)
+{
+ struct sockaddr_in6 *dst;
+ struct rtentry *rt;
+
+ bzero(ro, sizeof(*ro));
+ dst = (struct sockaddr_in6 *)&ro->ro_dst;
+ dst->sin6_family = AF_INET6;
+ dst->sin6_len = sizeof(*dst);
+ dst->sin6_addr = *dest;
+ IN6_LOOKUP_ROUTE(ro, M_GETFIB(m));
+ rt = ro->ro_rt;
+ if (rt && (rt->rt_flags & RTF_UP) &&
+ (rt->rt_ifp->if_flags & IFF_UP) &&
+ (rt->rt_ifp->if_drv_flags & IFF_DRV_RUNNING)) {
+ if (rt->rt_flags & RTF_GATEWAY)
+ dst = (struct sockaddr_in6 *)rt->rt_gateway;
+ } else
+ return (NULL);
+ if (((rt->rt_flags & RTF_REJECT) &&
+ (rt->rt_expire == 0 ||
+ time_uptime < rt->rt_expire)) ||
+ rt->rt_ifp->if_link_state == LINK_STATE_DOWN)
+ return (NULL);
+ return ((struct sockaddr *)dst);
+}
+
+#define NAT64_ICMP6_PLEN 64
+static NAT64NOINLINE void
+nat64_icmp6_reflect(struct mbuf *m, uint8_t type, uint8_t code, uint32_t mtu,
+ nat64_stats_block *stats, void *logdata)
+{
+ struct icmp6_hdr *icmp6;
+ struct ip6_hdr *ip6, *oip6;
+ struct mbuf *n;
+ int len, plen;
+
+ len = 0;
+ plen = nat64_getlasthdr(m, &len);
+ if (plen < 0) {
+ DPRINTF(DP_DROPS, "mbuf isn't contigious");
+ goto freeit;
+ }
+ /*
+ * Do not send ICMPv6 in reply to ICMPv6 errors.
+ */
+ if (plen == IPPROTO_ICMPV6) {
+ if (m->m_len < len + sizeof(*icmp6)) {
+ DPRINTF(DP_DROPS, "mbuf isn't contigious");
+ goto freeit;
+ }
+ icmp6 = mtodo(m, len);
+ if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST ||
+ icmp6->icmp6_type == ND_REDIRECT) {
+ DPRINTF(DP_DROPS, "do not send ICMPv6 in reply to "
+ "ICMPv6 errors");
+ goto freeit;
+ }
+ }
+ /*
+ if (icmp6_ratelimit(&ip6->ip6_src, type, code))
+ goto freeit;
+ */
+ ip6 = mtod(m, struct ip6_hdr *);
+ switch (type) {
+ case ICMP6_DST_UNREACH:
+ case ICMP6_PACKET_TOO_BIG:
+ case ICMP6_TIME_EXCEEDED:
+ case ICMP6_PARAM_PROB:
+ break;
+ default:
+ goto freeit;
+ }
+ /* Calculate length of ICMPv6 payload */
+ len = (m->m_pkthdr.len > NAT64_ICMP6_PLEN) ? NAT64_ICMP6_PLEN:
+ m->m_pkthdr.len;
+
+ /* Create new ICMPv6 datagram */
+ plen = len + sizeof(struct icmp6_hdr);
+ n = m_get2(sizeof(struct ip6_hdr) + plen + max_hdr, M_NOWAIT,
+ MT_HEADER, M_PKTHDR);
+ if (n == NULL) {
+ NAT64STAT_INC(stats, nomem);
+ m_freem(m);
+ return;
+ }
+ /*
+ * Move pkthdr from the original mbuf. We should have initialized some
+ * fields, because we can reinject this mbuf to netisr and it will
+ * go through the input path (it requires at least rcvif to be set).
+ * Also do M_ALIGN() to reduce the chance of needing to allocate a new
+ * mbuf in the chain when we do M_PREPEND() or some type of
+ * tunneling.
+ */
+ m_move_pkthdr(n, m);
+ M_ALIGN(n, sizeof(struct ip6_hdr) + plen + max_hdr);
+
+ n->m_len = n->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
+ oip6 = mtod(n, struct ip6_hdr *);
+ oip6->ip6_src = ip6->ip6_dst;
+ oip6->ip6_dst = ip6->ip6_src;
+ oip6->ip6_nxt = IPPROTO_ICMPV6;
+ oip6->ip6_flow = 0;
+ oip6->ip6_vfc |= IPV6_VERSION;
+ oip6->ip6_hlim = V_ip6_defhlim;
+ oip6->ip6_plen = htons(plen);
+
+ icmp6 = mtodo(n, sizeof(struct ip6_hdr));
+ icmp6->icmp6_cksum = 0;
+ icmp6->icmp6_type = type;
+ icmp6->icmp6_code = code;
+ icmp6->icmp6_mtu = htonl(mtu);
+
+ m_copydata(m, 0, len, mtodo(n, sizeof(struct ip6_hdr) +
+ sizeof(struct icmp6_hdr)));
+ icmp6->icmp6_cksum = in6_cksum(n, IPPROTO_ICMPV6,
+ sizeof(struct ip6_hdr), plen);
+ m_freem(m);
+ nat64_output_one(n, stats, logdata);
+ return;
+freeit:
+ NAT64STAT_INC(stats, dropped);
+ m_freem(m);
+}
+
+static NAT64NOINLINE struct sockaddr*
+nat64_find_route4(struct route *ro, in_addr_t dest, struct mbuf *m)
+{
+ struct sockaddr_in *dst;
+ struct rtentry *rt;
+
+ bzero(ro, sizeof(*ro));
+ dst = (struct sockaddr_in *)&ro->ro_dst;
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr.s_addr = dest;
+ IN_LOOKUP_ROUTE(ro, M_GETFIB(m));
+ rt = ro->ro_rt;
+ if (rt && (rt->rt_flags & RTF_UP) &&
+ (rt->rt_ifp->if_flags & IFF_UP) &&
+ (rt->rt_ifp->if_drv_flags & IFF_DRV_RUNNING)) {
+ if (rt->rt_flags & RTF_GATEWAY)
+ dst = (struct sockaddr_in *)rt->rt_gateway;
+ } else
+ return (NULL);
+ if (((rt->rt_flags & RTF_REJECT) &&
+ (rt->rt_expire == 0 ||
+ time_uptime < rt->rt_expire)) ||
+ rt->rt_ifp->if_link_state == LINK_STATE_DOWN)
+ return (NULL);
+ return ((struct sockaddr *)dst);
+}
+
+#define NAT64_ICMP_PLEN 64
+static NAT64NOINLINE void
+nat64_icmp_reflect(struct mbuf *m, uint8_t type,
+ uint8_t code, uint16_t mtu, nat64_stats_block *stats, void *logdata)
+{
+ struct icmp *icmp;
+ struct ip *ip, *oip;
+ struct mbuf *n;
+ int len, plen;
+
+ ip = mtod(m, struct ip *);
+ /* Do not send ICMP error if packet is not the first fragment */
+ if (ip->ip_off & ~ntohs(IP_MF|IP_DF)) {
+ DPRINTF(DP_DROPS, "not first fragment");
+ goto freeit;
+ }
+ /* Do not send ICMP in reply to ICMP errors */
+ if (ip->ip_p == IPPROTO_ICMP) {
+ if (m->m_len < (ip->ip_hl << 2)) {
+ DPRINTF(DP_DROPS, "mbuf isn't contigious");
+ goto freeit;
+ }
+ icmp = mtodo(m, ip->ip_hl << 2);
+ if (!ICMP_INFOTYPE(icmp->icmp_type)) {
+ DPRINTF(DP_DROPS, "do not send ICMP in reply to "
+ "ICMP errors");
+ goto freeit;
+ }
+ }
+ switch (type) {
+ case ICMP_UNREACH:
+ case ICMP_TIMXCEED:
+ case ICMP_PARAMPROB:
+ break;
+ default:
+ goto freeit;
+ }
+ /* Calculate length of ICMP payload */
+ len = (m->m_pkthdr.len > NAT64_ICMP_PLEN) ? (ip->ip_hl << 2) + 8:
+ m->m_pkthdr.len;
+
+ /* Create new ICMPv4 datagram */
+ plen = len + sizeof(struct icmphdr) + sizeof(uint32_t);
+ n = m_get2(sizeof(struct ip) + plen + max_hdr, M_NOWAIT,
+ MT_HEADER, M_PKTHDR);
+ if (n == NULL) {
+ NAT64STAT_INC(stats, nomem);
+ m_freem(m);
+ return;
+ }
+ m_move_pkthdr(n, m);
+ M_ALIGN(n, sizeof(struct ip) + plen + max_hdr);
+
+ n->m_len = n->m_pkthdr.len = sizeof(struct ip) + plen;
+ oip = mtod(n, struct ip *);
+ oip->ip_v = IPVERSION;
+ oip->ip_hl = sizeof(struct ip) >> 2;
+ oip->ip_tos = 0;
+ oip->ip_len = htons(n->m_pkthdr.len);
+ oip->ip_ttl = V_ip_defttl;
+ oip->ip_p = IPPROTO_ICMP;
+ ip_fillid(oip);
+ oip->ip_off = htons(IP_DF);
+ oip->ip_src = ip->ip_dst;
+ oip->ip_dst = ip->ip_src;
+ oip->ip_sum = 0;
+ oip->ip_sum = in_cksum_hdr(oip);
+
+ icmp = mtodo(n, sizeof(struct ip));
+ icmp->icmp_type = type;
+ icmp->icmp_code = code;
+ icmp->icmp_cksum = 0;
+ icmp->icmp_pmvoid = 0;
+ icmp->icmp_nextmtu = htons(mtu);
+ m_copydata(m, 0, len, mtodo(n, sizeof(struct ip) +
+ sizeof(struct icmphdr) + sizeof(uint32_t)));
+ icmp->icmp_cksum = in_cksum_skip(n, sizeof(struct ip) + plen,
+ sizeof(struct ip));
+ m_freem(m);
+ nat64_output_one(n, stats, logdata);
+ return;
+freeit:
+ NAT64STAT_INC(stats, dropped);
+ m_freem(m);
+}
+
+/* Translate ICMP echo request/reply into ICMPv6 */
+static void
+nat64_icmp_handle_echo(struct ip6_hdr *ip6, struct icmp6_hdr *icmp6,
+ uint16_t id, uint8_t type)
+{
+ uint16_t old;
+
+ old = *(uint16_t *)icmp6; /* save type+code in one word */
+ icmp6->icmp6_type = type;
+ /* Reflect ICMPv6 -> ICMPv4 type translation in the cksum */
+ icmp6->icmp6_cksum = cksum_adjust(icmp6->icmp6_cksum,
+ old, *(uint16_t *)icmp6);
+ if (id != 0) {
+ old = icmp6->icmp6_id;
+ icmp6->icmp6_id = id;
+ /* Reflect ICMP id translation in the cksum */
+ icmp6->icmp6_cksum = cksum_adjust(icmp6->icmp6_cksum,
+ old, id);
+ }
+ /* Reflect IPv6 pseudo header in the cksum */
+ icmp6->icmp6_cksum = ~in6_cksum_pseudo(ip6, ntohs(ip6->ip6_plen),
+ IPPROTO_ICMPV6, ~icmp6->icmp6_cksum);
+}
+
+static NAT64NOINLINE struct mbuf *
+nat64_icmp_translate(struct mbuf *m, struct ip6_hdr *ip6, uint16_t icmpid,
+ int offset, nat64_stats_block *stats)
+{
+ struct ip ip;
+ struct icmp *icmp;
+ struct tcphdr *tcp;
+ struct udphdr *udp;
+ struct ip6_hdr *eip6;
+ struct mbuf *n;
+ uint32_t mtu;
+ int len, hlen, plen;
+ uint8_t type, code;
+
+ if (m->m_len < offset + ICMP_MINLEN)
+ m = m_pullup(m, offset + ICMP_MINLEN);
+ if (m == NULL) {
+ NAT64STAT_INC(stats, nomem);
+ return (m);
+ }
+ mtu = 0;
+ icmp = mtodo(m, offset);
+ /* RFC 7915 p4.2 */
+ switch (icmp->icmp_type) {
+ case ICMP_ECHOREPLY:
+ type = ICMP6_ECHO_REPLY;
+ code = 0;
+ break;
+ case ICMP_UNREACH:
+ type = ICMP6_DST_UNREACH;
+ switch (icmp->icmp_code) {
+ case ICMP_UNREACH_NET:
+ case ICMP_UNREACH_HOST:
+ case ICMP_UNREACH_SRCFAIL:
+ case ICMP_UNREACH_NET_UNKNOWN:
+ case ICMP_UNREACH_HOST_UNKNOWN:
+ case ICMP_UNREACH_TOSNET:
+ case ICMP_UNREACH_TOSHOST:
+ code = ICMP6_DST_UNREACH_NOROUTE;
+ break;
+ case ICMP_UNREACH_PROTOCOL:
+ type = ICMP6_PARAM_PROB;
+ code = ICMP6_PARAMPROB_NEXTHEADER;
+ break;
+ case ICMP_UNREACH_PORT:
+ code = ICMP6_DST_UNREACH_NOPORT;
+ break;
+ case ICMP_UNREACH_NEEDFRAG:
+ type = ICMP6_PACKET_TOO_BIG;
+ code = 0;
+ /* XXX: needs an additional look */
+ mtu = max(IPV6_MMTU, ntohs(icmp->icmp_nextmtu) + 20);
+ break;
+ case ICMP_UNREACH_NET_PROHIB:
+ case ICMP_UNREACH_HOST_PROHIB:
+ case ICMP_UNREACH_FILTER_PROHIB:
+ case ICMP_UNREACH_PRECEDENCE_CUTOFF:
+ code = ICMP6_DST_UNREACH_ADMIN;
+ break;
+ default:
+ DPRINTF(DP_DROPS, "Unsupported ICMP type %d, code %d",
+ icmp->icmp_type, icmp->icmp_code);
+ goto freeit;
+ }
+ break;
+ case ICMP_TIMXCEED:
+ type = ICMP6_TIME_EXCEEDED;
+ code = icmp->icmp_code;
+ break;
+ case ICMP_ECHO:
+ type = ICMP6_ECHO_REQUEST;
+ code = 0;
+ break;
+ case ICMP_PARAMPROB:
+ type = ICMP6_PARAM_PROB;
+ switch (icmp->icmp_code) {
+ case ICMP_PARAMPROB_ERRATPTR:
+ case ICMP_PARAMPROB_LENGTH:
+ code = ICMP6_PARAMPROB_HEADER;
+ switch (icmp->icmp_pptr) {
+ case 0: /* Version/IHL */
+ case 1: /* Type Of Service */
+ mtu = icmp->icmp_pptr;
+ break;
+ case 2: /* Total Length */
+ case 3: mtu = 4; /* Payload Length */
+ break;
+ case 8: /* Time to Live */
+ mtu = 7; /* Hop Limit */
+ break;
+ case 9: /* Protocol */
+ mtu = 6; /* Next Header */
+ break;
+ case 12: /* Source address */
+ case 13:
+ case 14:
+ case 15:
+ mtu = 8;
+ break;
+ case 16: /* Destination address */
+ case 17:
+ case 18:
+ case 19:
+ mtu = 24;
+ break;
+ default: /* Silently drop */
+ DPRINTF(DP_DROPS, "Unsupported ICMP type %d,"
+ " code %d, pptr %d", icmp->icmp_type,
+ icmp->icmp_code, icmp->icmp_pptr);
+ goto freeit;
+ }
+ break;
+ default:
+ DPRINTF(DP_DROPS, "Unsupported ICMP type %d,"
+ " code %d, pptr %d", icmp->icmp_type,
+ icmp->icmp_code, icmp->icmp_pptr);
+ goto freeit;
+ }
+ break;
+ default:
+ DPRINTF(DP_DROPS, "Unsupported ICMP type %d, code %d",
+ icmp->icmp_type, icmp->icmp_code);
+ goto freeit;
+ }
+ /*
+ * For echo request/reply we can use the original payload,
+ * but we need to adjust icmp_cksum, because the ICMPv6 cksum covers
+ * the IPv6 pseudo header and ICMPv6 types differ from ICMPv4.
+ */
+ if (type == ICMP6_ECHO_REQUEST || type == ICMP6_ECHO_REPLY) {
+ nat64_icmp_handle_echo(ip6, ICMP6(icmp), icmpid, type);
+ return (m);
+ }
+ /*
+ * For other types of ICMP messages we need to translate the inner
+ * IPv4 header to an IPv6 header.
+ * Assume the ICMP src is the same as the payload dst.
+ * E.g. we have ( GWsrc1 , NATIP1 ) in the outer header
+ * and ( NATIP1, Hostdst1 ) in the ICMP copy header.
+ * In that case, we already have a mapping for NATIP1 and GWsrc1.
+ * The only thing we need is to copy the IPv6 map prefix to
+ * Hostdst1.
+ */
+ hlen = offset + ICMP_MINLEN;
+ if (m->m_pkthdr.len < hlen + sizeof(struct ip) + ICMP_MINLEN) {
+ DPRINTF(DP_DROPS, "Message is too short %d",
+ m->m_pkthdr.len);
+ goto freeit;
+ }
+ m_copydata(m, hlen, sizeof(struct ip), (char *)&ip);
+ if (ip.ip_v != IPVERSION) {
+ DPRINTF(DP_DROPS, "Wrong IP version %d", ip.ip_v);
+ goto freeit;
+ }
+ hlen += ip.ip_hl << 2; /* Skip inner IP header */
+ if (nat64_check_ip4(ip.ip_src.s_addr) != 0 ||
+ nat64_check_ip4(ip.ip_dst.s_addr) != 0 ||
+ nat64_check_private_ip4(ip.ip_src.s_addr) != 0 ||
+ nat64_check_private_ip4(ip.ip_dst.s_addr) != 0) {
+ DPRINTF(DP_DROPS, "IP addresses checks failed %04x -> %04x",
+ ntohl(ip.ip_src.s_addr), ntohl(ip.ip_dst.s_addr));
+ goto freeit;
+ }
+ if (m->m_pkthdr.len < hlen + ICMP_MINLEN) {
+ DPRINTF(DP_DROPS, "Message is too short %d",
+ m->m_pkthdr.len);
+ goto freeit;
+ }
+#if 0
+ /*
+ * Check that inner source matches the outer destination.
+ * XXX: We need some method to convert IPv4 into IPv6 address here,
+ * and compare IPv6 addresses.
+ */
+ if (ip.ip_src.s_addr != nat64_get_ip4(&ip6->ip6_dst)) {
+ DPRINTF(DP_GENERIC, "Inner source doesn't match destination ",
+ "%04x vs %04x", ip.ip_src.s_addr,
+ nat64_get_ip4(&ip6->ip6_dst));
+ goto freeit;
+ }
+#endif
+ /*
+ * Create new mbuf for ICMPv6 datagram.
+ * NOTE: len is data length just after inner IP header.
+ */
+ len = m->m_pkthdr.len - hlen;
+ if (sizeof(struct ip6_hdr) +
+ sizeof(struct icmp6_hdr) + len > NAT64_ICMP6_PLEN)
+ len = NAT64_ICMP6_PLEN - sizeof(struct icmp6_hdr) -
+ sizeof(struct ip6_hdr);
+ plen = sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr) + len;
+ n = m_get2(offset + plen + max_hdr, M_NOWAIT, MT_HEADER, M_PKTHDR);
+ if (n == NULL) {
+ NAT64STAT_INC(stats, nomem);
+ m_freem(m);
+ return (NULL);
+ }
+ m_move_pkthdr(n, m);
+ M_ALIGN(n, offset + plen + max_hdr);
+ n->m_len = n->m_pkthdr.len = offset + plen;
+ /* Adjust ip6_plen in outer header */
+ ip6->ip6_plen = htons(plen);
+ /* Construct new inner IPv6 header */
+ eip6 = mtodo(n, offset + sizeof(struct icmp6_hdr));
+ eip6->ip6_src = ip6->ip6_dst;
+ /* Use the fact that we have single /96 prefix for IPv4 map */
+ eip6->ip6_dst = ip6->ip6_src;
+ nat64_set_ip4(&eip6->ip6_dst, ip.ip_dst.s_addr);
+
+ eip6->ip6_flow = htonl(ip.ip_tos << 20);
+ eip6->ip6_vfc |= IPV6_VERSION;
+ eip6->ip6_hlim = ip.ip_ttl;
+ eip6->ip6_plen = htons(ntohs(ip.ip_len) - (ip.ip_hl << 2));
+ eip6->ip6_nxt = (ip.ip_p == IPPROTO_ICMP) ? IPPROTO_ICMPV6: ip.ip_p;
+ m_copydata(m, hlen, len, (char *)(eip6 + 1));
+ /*
+ * We need to translate source port in the inner ULP header,
+ * and adjust ULP checksum.
+ */
+ switch (ip.ip_p) {
+ case IPPROTO_TCP:
+ if (len < offsetof(struct tcphdr, th_sum))
+ break;
+ tcp = TCP(eip6 + 1);
+ if (icmpid != 0) {
+ tcp->th_sum = cksum_adjust(tcp->th_sum,
+ tcp->th_sport, icmpid);
+ tcp->th_sport = icmpid;
+ }
+ tcp->th_sum = cksum_add(tcp->th_sum,
+ ~nat64_cksum_convert(eip6, &ip));
+ break;
+ case IPPROTO_UDP:
+ if (len < offsetof(struct udphdr, uh_sum))
+ break;
+ udp = UDP(eip6 + 1);
+ if (icmpid != 0) {
+ udp->uh_sum = cksum_adjust(udp->uh_sum,
+ udp->uh_sport, icmpid);
+ udp->uh_sport = icmpid;
+ }
+ udp->uh_sum = cksum_add(udp->uh_sum,
+ ~nat64_cksum_convert(eip6, &ip));
+ break;
+ case IPPROTO_ICMP:
+ /*
+ * Check if this is an ICMP error message for echo request
+ * that we sent. I.e. ULP in the data containing invoking
+ * packet is IPPROTO_ICMP and its type is ICMP_ECHO.
+ */
+ icmp = (struct icmp *)(eip6 + 1);
+ if (icmp->icmp_type != ICMP_ECHO) {
+ m_freem(n);
+ goto freeit;
+ }
+ /*
+ * For our client this original datagram should look
+ * like an ICMPv6 datagram with type ICMP6_ECHO_REQUEST.
+ * Thus we need to adjust icmp_cksum and convert the type from
+ * ICMP_ECHO to ICMP6_ECHO_REQUEST.
+ */
+ nat64_icmp_handle_echo(eip6, ICMP6(icmp), icmpid,
+ ICMP6_ECHO_REQUEST);
+ }
+ m_freem(m);
+ /* Convert ICMPv4 into ICMPv6 header */
+ icmp = mtodo(n, offset);
+ ICMP6(icmp)->icmp6_type = type;
+ ICMP6(icmp)->icmp6_code = code;
+ ICMP6(icmp)->icmp6_mtu = htonl(mtu);
+ ICMP6(icmp)->icmp6_cksum = 0;
+ ICMP6(icmp)->icmp6_cksum = cksum_add(
+ ~in6_cksum_pseudo(ip6, plen, IPPROTO_ICMPV6, 0),
+ in_cksum_skip(n, n->m_pkthdr.len, offset));
+ return (n);
+freeit:
+ m_freem(m);
+ NAT64STAT_INC(stats, dropped);
+ return (NULL);
+}
+
+int
+nat64_getlasthdr(struct mbuf *m, int *offset)
+{
+ struct ip6_hdr *ip6;
+ struct ip6_hbh *hbh;
+ int proto, hlen;
+
+ if (offset != NULL)
+ hlen = *offset;
+ else
+ hlen = 0;
+
+ if (m->m_len < hlen + sizeof(*ip6))
+ return (-1);
+
+ ip6 = mtodo(m, hlen);
+ hlen += sizeof(*ip6);
+ proto = ip6->ip6_nxt;
+ /* Skip extension headers */
+ while (proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING ||
+ proto == IPPROTO_DSTOPTS) {
+ hbh = mtodo(m, hlen);
+ /*
+ * We expect the mbuf to have contiguous data up to
+ * the upper level header.
+ */
+ if (m->m_len < hlen)
+ return (-1);
+ /*
+ * We don't support the Jumbo Payload option,
+ * so return an error.
+ */
+ if (proto == IPPROTO_HOPOPTS && ip6->ip6_plen == 0)
+ return (-1);
+ proto = hbh->ip6h_nxt;
+ hlen += (hbh->ip6h_len + 1) << 3; /* ip6h_len excludes the first 8 octets */
+ }
+ if (offset != NULL)
+ *offset = hlen;
+ return (proto);
+}
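The extension header walk relies on the IPv6 rule that the Hdr Ext Len field is expressed in 8-octet units and does not count the first 8 octets, hence the "+ 1" when advancing past each header. A tiny flat-buffer illustration of that rule (hypothetical packet layout, userland only, not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    #define PROTO_HOPOPTS 0
    #define PROTO_TCP     6

    int
    main(void)
    {
        /* 40-byte IPv6 header, one 8-byte Hop-by-Hop header, then TCP. */
        uint8_t pkt[64] = { 0 };
        int hlen = 40, proto;

        pkt[6] = PROTO_HOPOPTS;   /* ip6_nxt */
        pkt[40] = PROTO_TCP;      /* hop-by-hop: next header */
        pkt[41] = 0;              /* hop-by-hop: Hdr Ext Len = 0 -> 8 octets total */

        proto = pkt[6];
        while (proto == PROTO_HOPOPTS) {
            proto = pkt[hlen];
            hlen += (pkt[hlen + 1] + 1) << 3;   /* length excludes first 8 octets */
        }
        printf("upper layer protocol %d starts at offset %d\n", proto, hlen);
        return (0);
    }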
+
+int
+nat64_do_handle_ip4(struct mbuf *m, struct in6_addr *saddr,
+ struct in6_addr *daddr, uint16_t lport, nat64_stats_block *stats,
+ void *logdata)
+{
+ struct route_in6 ro;
+ struct ip6_hdr ip6;
+ struct ifnet *ifp;
+ struct ip *ip;
+ struct mbufq mq;
+ struct sockaddr *dst;
+ uint32_t mtu;
+ uint16_t ip_id, ip_off;
+ uint16_t *csum;
+ int plen, hlen;
+ uint8_t proto;
+
+ ip = mtod(m, struct ip*);
+
+ if (ip->ip_ttl <= IPTTLDEC) {
+ nat64_icmp_reflect(m, ICMP_TIMXCEED,
+ ICMP_TIMXCEED_INTRANS, 0, stats, logdata);
+ return (NAT64RETURN);
+ }
+
+ ip6.ip6_dst = *daddr;
+ ip6.ip6_src = *saddr;
+
+ hlen = ip->ip_hl << 2;
+ plen = ntohs(ip->ip_len) - hlen;
+ proto = ip->ip_p;
+
+ /* Save ip_id and ip_off, both are in network byte order */
+ ip_id = ip->ip_id;
+ ip_off = ip->ip_off & htons(IP_OFFMASK | IP_MF);
+
+ /* Fragment length must be multiple of 8 octets */
+ if ((ip->ip_off & htons(IP_MF)) != 0 && (plen & 0x7) != 0) {
+ nat64_icmp_reflect(m, ICMP_PARAMPROB,
+ ICMP_PARAMPROB_LENGTH, 0, stats, logdata);
+ return (NAT64RETURN);
+ }
+ /* Fragmented ICMP is unsupported */
+ if (proto == IPPROTO_ICMP && ip_off != 0) {
+ DPRINTF(DP_DROPS, "dropped due to fragmented ICMP");
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+
+ dst = nat64_find_route6(&ro, &ip6.ip6_dst, m);
+ if (dst == NULL) {
+ FREE_ROUTE(&ro);
+ NAT64STAT_INC(stats, noroute6);
+ nat64_icmp_reflect(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0,
+ stats, logdata);
+ return (NAT64RETURN);
+ }
+ ifp = ro.ro_rt->rt_ifp;
+ if (ro.ro_rt->rt_mtu != 0)
+ mtu = min(ro.ro_rt->rt_mtu, ifp->if_mtu);
+ else
+ mtu = ifp->if_mtu;
+ if (mtu < plen + sizeof(ip6) && (ip->ip_off & htons(IP_DF)) != 0) {
+ FREE_ROUTE(&ro);
+ nat64_icmp_reflect(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
+ FRAGSZ(mtu) + sizeof(struct ip), stats, logdata);
+ return (NAT64RETURN);
+ }
+
+ ip6.ip6_flow = htonl(ip->ip_tos << 20);
+ ip6.ip6_vfc |= IPV6_VERSION;
+#ifdef IPFIREWALL_NAT64_DIRECT_OUTPUT
+ ip6.ip6_hlim = ip->ip_ttl - IPTTLDEC;
+#else
+ /* Forwarding code will decrement HLIM. */
+ ip6.ip6_hlim = ip->ip_ttl;
+#endif
+ ip6.ip6_plen = htons(plen);
+ ip6.ip6_nxt = (proto == IPPROTO_ICMP) ? IPPROTO_ICMPV6: proto;
+ /* Convert checksums. */
+ switch (proto) {
+ case IPPROTO_TCP:
+ csum = &TCP(mtodo(m, hlen))->th_sum;
+ if (lport != 0) {
+ struct tcphdr *tcp = TCP(mtodo(m, hlen));
+ *csum = cksum_adjust(*csum, tcp->th_dport, lport);
+ tcp->th_dport = lport;
+ }
+ *csum = cksum_add(*csum, ~nat64_cksum_convert(&ip6, ip));
+ break;
+ case IPPROTO_UDP:
+ csum = &UDP(mtodo(m, hlen))->uh_sum;
+ if (lport != 0) {
+ struct udphdr *udp = UDP(mtodo(m, hlen));
+ *csum = cksum_adjust(*csum, udp->uh_dport, lport);
+ udp->uh_dport = lport;
+ }
+ *csum = cksum_add(*csum, ~nat64_cksum_convert(&ip6, ip));
+ break;
+ case IPPROTO_ICMP:
+ m = nat64_icmp_translate(m, &ip6, lport, hlen, stats);
+ if (m == NULL) {
+ FREE_ROUTE(&ro);
+ /* stats already accounted */
+ return (NAT64RETURN);
+ }
+ }
+
+ m_adj(m, hlen);
+ mbufq_init(&mq, 255);
+ nat64_fragment6(stats, &ip6, &mq, m, mtu, ip_id, ip_off);
+ while ((m = mbufq_dequeue(&mq)) != NULL) {
+ if (nat64_output(ifp, m, dst, (struct route *)&ro, stats,
+ logdata) != 0)
+ break;
+ NAT64STAT_INC(stats, opcnt46);
+ }
+ mbufq_drain(&mq);
+ FREE_ROUTE(&ro);
+ return (NAT64RETURN);
+}
+
+int
+nat64_handle_icmp6(struct mbuf *m, int hlen, uint32_t aaddr, uint16_t aport,
+ nat64_stats_block *stats, void *logdata)
+{
+ struct ip ip;
+ struct icmp6_hdr *icmp6;
+ struct ip6_frag *ip6f;
+ struct ip6_hdr *ip6, *ip6i;
+ uint32_t mtu;
+ int plen, proto;
+ uint8_t type, code;
+
+ if (hlen == 0) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ if (nat64_check_ip6(&ip6->ip6_src) != 0 ||
+ nat64_check_ip6(&ip6->ip6_dst) != 0)
+ return (NAT64SKIP);
+
+ proto = nat64_getlasthdr(m, &hlen);
+ if (proto != IPPROTO_ICMPV6) {
+ DPRINTF(DP_DROPS,
+ "dropped due to mbuf isn't contigious");
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ }
+
+ /*
+ * Translate ICMPv6 type and code to ICMPv4 (RFC7915).
+ * NOTE: ICMPv6 echo handled by nat64_do_handle_ip6().
+ */
+ icmp6 = mtodo(m, hlen);
+ mtu = 0;
+ switch (icmp6->icmp6_type) {
+ case ICMP6_DST_UNREACH:
+ type = ICMP_UNREACH;
+ switch (icmp6->icmp6_code) {
+ case ICMP6_DST_UNREACH_NOROUTE:
+ case ICMP6_DST_UNREACH_BEYONDSCOPE:
+ case ICMP6_DST_UNREACH_ADDR:
+ code = ICMP_UNREACH_HOST;
+ break;
+ case ICMP6_DST_UNREACH_ADMIN:
+ code = ICMP_UNREACH_HOST_PROHIB;
+ break;
+ case ICMP6_DST_UNREACH_NOPORT:
+ code = ICMP_UNREACH_PORT;
+ break;
+ default:
+ DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d,"
+ " code %d", icmp6->icmp6_type,
+ icmp6->icmp6_code);
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ break;
+ case ICMP6_PACKET_TOO_BIG:
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_NEEDFRAG;
+ mtu = ntohl(icmp6->icmp6_mtu);
+ if (mtu < IPV6_MMTU) {
+ DPRINTF(DP_DROPS, "Wrong MTU %d in ICMPv6 type %d,"
+ " code %d", mtu, icmp6->icmp6_type,
+ icmp6->icmp6_code);
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ /*
+ * Adjust MTU to reflect difference between
+ * IPv6 and IPv4 headers.
+ */
+ mtu -= sizeof(struct ip6_hdr) - sizeof(struct ip);
+ break;
+ case ICMP6_TIME_EXCEED_TRANSIT:
+ type = ICMP_TIMXCEED;
+ code = ICMP_TIMXCEED_INTRANS;
+ break;
+ case ICMP6_PARAM_PROB:
+ switch (icmp6->icmp6_code) {
+ case ICMP6_PARAMPROB_HEADER:
+ type = ICMP_PARAMPROB;
+ code = ICMP_PARAMPROB_ERRATPTR;
+ mtu = ntohl(icmp6->icmp6_pptr);
+ switch (mtu) {
+ case 0: /* Version/Traffic Class */
+ case 1: /* Traffic Class/Flow Label */
+ break;
+ case 4: /* Payload Length */
+ case 5:
+ mtu = 2;
+ break;
+ case 6: /* Next Header */
+ mtu = 9;
+ break;
+ case 7: /* Hop Limit */
+ mtu = 8;
+ break;
+ default:
+ if (mtu >= 8 && mtu <= 23) {
+ mtu = 12; /* Source address */
+ break;
+ }
+ if (mtu >= 24 && mtu <= 39) {
+ mtu = 16; /* Destination address */
+ break;
+ }
+ DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d,"
+ " code %d, pptr %d", icmp6->icmp6_type,
+ icmp6->icmp6_code, mtu);
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ break;
+ case ICMP6_PARAMPROB_NEXTHEADER:
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_PROTOCOL;
+ break;
+ default:
+ DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d,"
+ " code %d, pptr %d", icmp6->icmp6_type,
+ icmp6->icmp6_code, ntohl(icmp6->icmp6_pptr));
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ break;
+ default:
+ DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d, code %d",
+ icmp6->icmp6_type, icmp6->icmp6_code);
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+
+ hlen += sizeof(struct icmp6_hdr);
+ if (m->m_pkthdr.len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) {
+ NAT64STAT_INC(stats, dropped);
+ DPRINTF(DP_DROPS, "Message is too short %d",
+ m->m_pkthdr.len);
+ return (NAT64MFREE);
+ }
+ /*
+ * We need at least ICMP_MINLEN bytes of the original datagram payload
+ * to generate an ICMP message. It is nice that ICMP_MINLEN is equal
+ * to sizeof(struct ip6_frag). So, if the embedded datagram had a
+ * fragment header, we will not have to do m_pullup() again.
+ *
+ * What we have here:
+ * Outer header: (IPv6iGW, v4mapPRefix+v4exthost)
+ * Inner header: (v4mapPRefix+v4host, IPv6iHost) [sport, dport]
+ * We need to translate it to:
+ *
+ * Outer header: (alias_host, v4exthost)
+ * Inner header: (v4exthost, alias_host) [sport, alias_port]
+ *
+ * Assume the caller has checked that v4mapPRefix+v4host
+ * matches the configured prefix.
+ * The only two things we should be provided with are the mapping between
+ * IPv6iHost <> alias_host and between dport and alias_port.
+ */
+ if (m->m_len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN)
+ m = m_pullup(m, hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN);
+ if (m == NULL) {
+ NAT64STAT_INC(stats, nomem);
+ return (NAT64RETURN);
+ }
+ ip6 = mtod(m, struct ip6_hdr *);
+ ip6i = mtodo(m, hlen);
+ ip6f = NULL;
+ proto = ip6i->ip6_nxt;
+ plen = ntohs(ip6i->ip6_plen);
+ hlen += sizeof(struct ip6_hdr);
+ if (proto == IPPROTO_FRAGMENT) {
+ if (m->m_pkthdr.len < hlen + sizeof(struct ip6_frag) +
+ ICMP_MINLEN)
+ goto fail;
+ ip6f = mtodo(m, hlen);
+ proto = ip6f->ip6f_nxt;
+ plen -= sizeof(struct ip6_frag);
+ hlen += sizeof(struct ip6_frag);
+ /* Adjust MTU to reflect the frag header size */
+ if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG)
+ mtu -= sizeof(struct ip6_frag);
+ }
+ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
+ DPRINTF(DP_DROPS, "Unsupported proto %d in the inner header",
+ proto);
+ goto fail;
+ }
+ if (nat64_check_ip6(&ip6i->ip6_src) != 0 ||
+ nat64_check_ip6(&ip6i->ip6_dst) != 0) {
+ DPRINTF(DP_DROPS, "Inner addresses do not passes the check");
+ goto fail;
+ }
+ /* Check if outer dst is the same as inner src */
+ if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6i->ip6_src)) {
+ DPRINTF(DP_DROPS, "Inner src doesn't match outer dst");
+ goto fail;
+ }
+
+ /* Now we need to make a fake IPv4 packet to generate ICMP message */
+ ip.ip_dst.s_addr = aaddr;
+ ip.ip_src.s_addr = nat64_get_ip4(&ip6i->ip6_src);
+ /* XXX: Make fake ulp header */
+#ifdef IPFIREWALL_NAT64_DIRECT_OUTPUT
+ ip6i->ip6_hlim += IPV6_HLIMDEC; /* init_ip4hdr will decrement it */
+#endif
+ nat64_init_ip4hdr(ip6i, ip6f, plen, proto, &ip);
+ m_adj(m, hlen - sizeof(struct ip));
+ bcopy(&ip, mtod(m, void *), sizeof(ip));
+ nat64_icmp_reflect(m, type, code, (uint16_t)mtu, stats, logdata);
+ return (NAT64RETURN);
+fail:
+ /*
+ * We must call m_freem() here because the mbuf pointer could have
+ * been changed by m_pullup().
+ */
+ m_freem(m);
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64RETURN);
+}
+
+int
+nat64_do_handle_ip6(struct mbuf *m, uint32_t aaddr, uint16_t aport,
+ nat64_stats_block *stats, void *logdata)
+{
+ struct route ro;
+ struct ip ip;
+ struct ifnet *ifp;
+ struct ip6_frag *frag;
+ struct ip6_hdr *ip6;
+ struct icmp6_hdr *icmp6;
+ struct sockaddr *dst;
+ uint16_t *csum;
+ uint32_t mtu;
+ int plen, hlen, proto;
+
+ /*
+ * XXX: we expect that ipfw_chk() did m_pullup() up to the upper level
+ * protocol's headers. Also we skip some checks that ip6_input(),
+ * ip6_forward(), ip6_fastfwd() and ipfw_chk() already did.
+ */
+ ip6 = mtod(m, struct ip6_hdr *);
+ if (nat64_check_ip6(&ip6->ip6_src) != 0 ||
+ nat64_check_ip6(&ip6->ip6_dst) != 0) {
+ return (NAT64SKIP);
+ }
+
+ /* Starting from this point we must not return zero */
+ ip.ip_src.s_addr = aaddr;
+ if (nat64_check_ip4(ip.ip_src.s_addr) != 0) {
+ DPRINTF(DP_GENERIC, "invalid source address: %08x",
+ ip.ip_src.s_addr);
+ /* XXX: stats? */
+ return (NAT64MFREE);
+ }
+
+ ip.ip_dst.s_addr = nat64_get_ip4(&ip6->ip6_dst);
+ if (ip.ip_dst.s_addr == 0) {
+ /* XXX: stats? */
+ return (NAT64MFREE);
+ }
+
+ if (ip6->ip6_hlim <= IPV6_HLIMDEC) {
+ nat64_icmp6_reflect(m, ICMP6_TIME_EXCEEDED,
+ ICMP6_TIME_EXCEED_TRANSIT, 0, stats, logdata);
+ return (NAT64RETURN);
+ }
+
+ hlen = 0;
+ plen = ntohs(ip6->ip6_plen);
+ proto = nat64_getlasthdr(m, &hlen);
+ if (proto < 0) {
+ DPRINTF(DP_DROPS, "dropped due to mbuf isn't contigious");
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ frag = NULL;
+ if (proto == IPPROTO_FRAGMENT) {
+ /* ipfw_chk() should have done m_pullup() up to the frag header */
+ if (m->m_len < hlen + sizeof(*frag)) {
+ DPRINTF(DP_DROPS,
+ "dropped due to mbuf isn't contigious");
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ frag = mtodo(m, hlen);
+ proto = frag->ip6f_nxt;
+ hlen += sizeof(*frag);
+ /* Fragmented ICMPv6 is unsupported */
+ if (proto == IPPROTO_ICMPV6) {
+ DPRINTF(DP_DROPS, "dropped due to fragmented ICMPv6");
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ /* Fragment length must be multiple of 8 octets */
+ if ((frag->ip6f_offlg & IP6F_MORE_FRAG) != 0 &&
+ ((plen + sizeof(struct ip6_hdr) - hlen) & 0x7) != 0) {
+ nat64_icmp6_reflect(m, ICMP6_PARAM_PROB,
+ ICMP6_PARAMPROB_HEADER,
+ offsetof(struct ip6_hdr, ip6_plen), stats,
+ logdata);
+ return (NAT64RETURN);
+ }
+ }
+ plen -= hlen - sizeof(struct ip6_hdr);
+ if (plen < 0 || m->m_pkthdr.len < plen + hlen) {
+ DPRINTF(DP_DROPS, "plen %d, pkthdr.len %d, hlen %d",
+ plen, m->m_pkthdr.len, hlen);
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+
+ icmp6 = NULL; /* Make gcc happy */
+ if (proto == IPPROTO_ICMPV6) {
+ icmp6 = mtodo(m, hlen);
+ if (icmp6->icmp6_type != ICMP6_ECHO_REQUEST &&
+ icmp6->icmp6_type != ICMP6_ECHO_REPLY)
+ return (nat64_handle_icmp6(m, hlen, aaddr, aport,
+ stats, logdata));
+ }
+ dst = nat64_find_route4(&ro, ip.ip_dst.s_addr, m);
+ if (dst == NULL) {
+ FREE_ROUTE(&ro);
+ NAT64STAT_INC(stats, noroute4);
+ nat64_icmp6_reflect(m, ICMP6_DST_UNREACH,
+ ICMP6_DST_UNREACH_NOROUTE, 0, stats, logdata);
+ return (NAT64RETURN);
+ }
+
+ ifp = ro.ro_rt->rt_ifp;
+ if (ro.ro_rt->rt_mtu != 0)
+ mtu = min(ro.ro_rt->rt_mtu, ifp->if_mtu);
+ else
+ mtu = ifp->if_mtu;
+ if (mtu < plen + sizeof(ip)) {
+ FREE_ROUTE(&ro);
+ nat64_icmp6_reflect(m, ICMP6_PACKET_TOO_BIG, 0, mtu, stats,
+ logdata);
+ return (NAT64RETURN);
+ }
+ nat64_init_ip4hdr(ip6, frag, plen, proto, &ip);
+ /* Convert checksums. */
+ switch (proto) {
+ case IPPROTO_TCP:
+ csum = &TCP(mtodo(m, hlen))->th_sum;
+ if (aport != 0) {
+ struct tcphdr *tcp = TCP(mtodo(m, hlen));
+ *csum = cksum_adjust(*csum, tcp->th_sport, aport);
+ tcp->th_sport = aport;
+ }
+ *csum = cksum_add(*csum, nat64_cksum_convert(ip6, &ip));
+ break;
+ case IPPROTO_UDP:
+ csum = &UDP(mtodo(m, hlen))->uh_sum;
+ if (aport != 0) {
+ struct udphdr *udp = UDP(mtodo(m, hlen));
+ *csum = cksum_adjust(*csum, udp->uh_sport, aport);
+ udp->uh_sport = aport;
+ }
+ *csum = cksum_add(*csum, nat64_cksum_convert(ip6, &ip));
+ break;
+ case IPPROTO_ICMPV6:
+ /* Checksum in ICMPv6 covers pseudo header */
+ csum = &icmp6->icmp6_cksum;
+ *csum = cksum_add(*csum, in6_cksum_pseudo(ip6, plen,
+ IPPROTO_ICMPV6, 0));
+ /* Convert ICMPv6 types to ICMP */
+ mtu = *(uint16_t *)icmp6; /* save old word for cksum_adjust */
+ if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST)
+ icmp6->icmp6_type = ICMP_ECHO;
+ else /* ICMP6_ECHO_REPLY */
+ icmp6->icmp6_type = ICMP_ECHOREPLY;
+ *csum = cksum_adjust(*csum, (uint16_t)mtu, *(uint16_t *)icmp6);
+ if (aport != 0) {
+ uint16_t old_id = icmp6->icmp6_id;
+ icmp6->icmp6_id = aport;
+ *csum = cksum_adjust(*csum, old_id, aport);
+ }
+ break;
+ };
+
+ m_adj(m, hlen - sizeof(ip));
+ bcopy(&ip, mtod(m, void *), sizeof(ip));
+ if (nat64_output(ifp, m, dst, &ro, stats, logdata) == 0)
+ NAT64STAT_INC(stats, opcnt64);
+ FREE_ROUTE(&ro);
+ return (NAT64RETURN);
+}
+
diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64_translate.h b/freebsd/sys/netpfil/ipfw/nat64/nat64_translate.h
new file mode 100644
index 00000000..9f653954
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nat64/nat64_translate.h
@@ -0,0 +1,116 @@
+/*-
+ * Copyright (c) 2015-2016 Yandex LLC
+ * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_FW_NAT64_TRANSLATE_H_
+#define _IP_FW_NAT64_TRANSLATE_H_
+
+#ifdef RTALLOC_NOLOCK
+#define IN_LOOKUP_ROUTE(ro, fib) rtalloc_fib_nolock((ro), 0, (fib))
+#define IN6_LOOKUP_ROUTE(ro, fib) in6_rtalloc_nolock((ro), (fib))
+#define FREE_ROUTE(ro)
+#else
+#define IN_LOOKUP_ROUTE(ro, fib) rtalloc_ign_fib((ro), 0, (fib))
+#define IN6_LOOKUP_ROUTE(ro, fib) in6_rtalloc((ro), (fib))
+#define FREE_ROUTE(ro) RO_RTFREE((ro))
+#endif
+
+static inline int
+nat64_check_ip6(struct in6_addr *addr)
+{
+
+ /* XXX: We should really check /8 */
+ if (addr->s6_addr16[0] == 0 || /* 0000::/8 Reserved by IETF */
+ IN6_IS_ADDR_MULTICAST(addr) || IN6_IS_ADDR_LINKLOCAL(addr))
+ return (1);
+ return (0);
+}
+
+extern int nat64_allow_private;
+static inline int
+nat64_check_private_ip4(in_addr_t ia)
+{
+
+ if (nat64_allow_private)
+ return (0);
+ /* WKPFX must not be used to represent non-global IPv4 addresses */
+// if (cfg->flags & NAT64_WKPFX) {
+ /* IN_PRIVATE */
+ if ((ia & htonl(0xff000000)) == htonl(0x0a000000) ||
+ (ia & htonl(0xfff00000)) == htonl(0xac100000) ||
+ (ia & htonl(0xffff0000)) == htonl(0xc0a80000))
+ return (1);
+ /*
+ * RFC 5735:
+ * 192.0.0.0/24 - reserved for IETF protocol assignments
+ * 192.88.99.0/24 - for use as 6to4 relay anycast addresses
+ * 198.18.0.0/15 - for use in benchmark tests
+ * 192.0.2.0/24, 198.51.100.0/24, 203.0.113.0/24 - for use
+ * in documentation and example code
+ */
+ if ((ia & htonl(0xffffff00)) == htonl(0xc0000000) ||
+ (ia & htonl(0xffffff00)) == htonl(0xc0586300) ||
+ (ia & htonl(0xfffffe00)) == htonl(0xc6120000) ||
+ (ia & htonl(0xffffff00)) == htonl(0xc0000200) ||
+ (ia & htonl(0xfffffe00)) == htonl(0xc6336400) ||
+ (ia & htonl(0xffffff00)) == htonl(0xcb007100))
+ return (1);
+// }
+ return (0);
+}
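The comparisons above operate on addresses in network byte order, which is why both the mask and the constant are wrapped in htonl(). A userland sketch of just the RFC 1918 part of the check (the test addresses are illustrative, not from the patch):

    #include <stdio.h>
    #include <arpa/inet.h>

    /* Same masks as nat64_check_private_ip4(): 10/8, 172.16/12, 192.168/16. */
    static int
    is_rfc1918(in_addr_t ia)
    {
        return ((ia & htonl(0xff000000)) == htonl(0x0a000000) ||
            (ia & htonl(0xfff00000)) == htonl(0xac100000) ||
            (ia & htonl(0xffff0000)) == htonl(0xc0a80000));
    }

    int
    main(void)
    {
        const char *tests[] = { "10.1.2.3", "172.31.255.1", "192.168.0.10", "203.0.113.5" };
        struct in_addr a;
        unsigned i;

        for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
            inet_pton(AF_INET, tests[i], &a);
            printf("%-14s rfc1918: %d\n", tests[i], is_rfc1918(a.s_addr));
        }
        return (0);
    }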
+
+static inline int
+nat64_check_ip4(in_addr_t ia)
+{
+
+ /* IN_LOOPBACK */
+ if ((ia & htonl(0xff000000)) == htonl(0x7f000000))
+ return (1);
+ /* IN_LINKLOCAL */
+ if ((ia & htonl(0xffff0000)) == htonl(0xa9fe0000))
+ return (1);
+ /* IN_MULTICAST & IN_EXPERIMENTAL */
+ if ((ia & htonl(0xe0000000)) == htonl(0xe0000000))
+ return (1);
+ return (0);
+}
+
+#define nat64_get_ip4(_ip6) ((_ip6)->s6_addr32[3])
+#define nat64_set_ip4(_ip6, _ip4) (_ip6)->s6_addr32[3] = (_ip4)
+
+int nat64_getlasthdr(struct mbuf *m, int *offset);
+int nat64_do_handle_ip4(struct mbuf *m, struct in6_addr *saddr,
+ struct in6_addr *daddr, uint16_t lport, nat64_stats_block *stats,
+ void *logdata);
+int nat64_do_handle_ip6(struct mbuf *m, uint32_t aaddr, uint16_t aport,
+ nat64_stats_block *stats, void *logdata);
+int nat64_handle_icmp6(struct mbuf *m, int hlen, uint32_t aaddr, uint16_t aport,
+ nat64_stats_block *stats, void *logdata);
+
+#endif
+
diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64lsn.c b/freebsd/sys/netpfil/ipfw/nat64/nat64lsn.c
new file mode 100644
index 00000000..ce666213
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nat64/nat64lsn.c
@@ -0,0 +1,1772 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2015-2016 Yandex LLC
+ * Copyright (c) 2015 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <rtems/bsd/sys/errno.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/queue.h>
+#include <sys/syslog.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_pflog.h>
+#include <net/pfil.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <netinet6/in6_var.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/ip_fw_nat64.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/nat64/ip_fw_nat64.h>
+#include <netpfil/ipfw/nat64/nat64lsn.h>
+#include <netpfil/ipfw/nat64/nat64_translate.h>
+#include <netpfil/pf/pf.h>
+
+MALLOC_DEFINE(M_NAT64LSN, "NAT64LSN", "NAT64LSN");
+
+static void nat64lsn_periodic(void *data);
+#define PERIODIC_DELAY 4
+static uint8_t nat64lsn_proto_map[256];
+uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO];
+
+#define NAT64_FLAG_FIN 0x01 /* FIN was seen */
+#define NAT64_FLAG_SYN 0x02 /* First syn in->out */
+#define NAT64_FLAG_ESTAB 0x04 /* Packet with Ack */
+#define NAT64_FLAGS_TCP (NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN)
+
+#define NAT64_FLAG_RDR 0x80 /* Port redirect */
+#define NAT64_LOOKUP(chain, cmd) \
+ (struct nat64lsn_cfg *)SRV_OBJECT((chain), (cmd)->arg1)
+/*
+ * Delayed job queue, used to create new hosts
+ * and new portgroups
+ */
+enum nat64lsn_jtype {
+ JTYPE_NEWHOST = 1,
+ JTYPE_NEWPORTGROUP,
+ JTYPE_DELPORTGROUP,
+};
+
+struct nat64lsn_job_item {
+ TAILQ_ENTRY(nat64lsn_job_item) next;
+ enum nat64lsn_jtype jtype;
+ struct nat64lsn_host *nh;
+ struct nat64lsn_portgroup *pg;
+ void *spare_idx;
+ struct in6_addr haddr;
+ uint8_t nat_proto;
+ uint8_t done;
+ int needs_idx;
+ int delcount;
+ unsigned int fhash; /* Flow hash */
+ uint32_t aaddr; /* Last used address (net) */
+ struct mbuf *m;
+ struct ipfw_flow_id f_id;
+ uint64_t delmask[NAT64LSN_PGPTRNMASK];
+};
+
+static struct mtx jmtx;
+#define JQUEUE_LOCK_INIT() mtx_init(&jmtx, "qlock", NULL, MTX_DEF)
+#define JQUEUE_LOCK_DESTROY() mtx_destroy(&jmtx)
+#define JQUEUE_LOCK() mtx_lock(&jmtx)
+#define JQUEUE_UNLOCK() mtx_unlock(&jmtx)
+
+static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg,
+ struct nat64lsn_job_item *ji);
+static void nat64lsn_enqueue_jobs(struct nat64lsn_cfg *cfg,
+ struct nat64lsn_job_head *jhead, int jlen);
+
+static struct nat64lsn_job_item *nat64lsn_create_job(struct nat64lsn_cfg *cfg,
+ const struct ipfw_flow_id *f_id, int jtype);
+static int nat64lsn_request_portgroup(struct nat64lsn_cfg *cfg,
+ const struct ipfw_flow_id *f_id, struct mbuf **pm, uint32_t aaddr,
+ int needs_idx);
+static int nat64lsn_request_host(struct nat64lsn_cfg *cfg,
+ const struct ipfw_flow_id *f_id, struct mbuf **pm);
+static int nat64lsn_translate4(struct nat64lsn_cfg *cfg,
+ const struct ipfw_flow_id *f_id, struct mbuf **pm);
+static int nat64lsn_translate6(struct nat64lsn_cfg *cfg,
+ struct ipfw_flow_id *f_id, struct mbuf **pm);
+
+static int alloc_portgroup(struct nat64lsn_job_item *ji);
+static void destroy_portgroup(struct nat64lsn_portgroup *pg);
+static void destroy_host6(struct nat64lsn_host *nh);
+static int alloc_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji);
+
+static int attach_portgroup(struct nat64lsn_cfg *cfg,
+ struct nat64lsn_job_item *ji);
+static int attach_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji);
+
+
+/* XXX tmp */
+static uma_zone_t nat64lsn_host_zone;
+static uma_zone_t nat64lsn_pg_zone;
+static uma_zone_t nat64lsn_pgidx_zone;
+
+static unsigned int nat64lsn_periodic_chkstates(struct nat64lsn_cfg *cfg,
+ struct nat64lsn_host *nh);
+
+#define I6_hash(x) (djb_hash((const unsigned char *)(x), 16))
+#define I6_first(_ph, h) (_ph)[h]
+#define I6_next(x) (x)->next
+#define I6_val(x) (&(x)->addr)
+#define I6_cmp(a, b) IN6_ARE_ADDR_EQUAL(a, b)
+#define I6_lock(a, b)
+#define I6_unlock(a, b)
+
+#define I6HASH_FIND(_cfg, _res, _a) \
+ CHT_FIND(_cfg->ih, _cfg->ihsize, I6_, _res, _a)
+#define I6HASH_INSERT(_cfg, _i) \
+ CHT_INSERT_HEAD(_cfg->ih, _cfg->ihsize, I6_, _i)
+#define I6HASH_REMOVE(_cfg, _res, _tmp, _a) \
+ CHT_REMOVE(_cfg->ih, _cfg->ihsize, I6_, _res, _tmp, _a)
+
+#define I6HASH_FOREACH_SAFE(_cfg, _x, _tmp, _cb, _arg) \
+ CHT_FOREACH_SAFE(_cfg->ih, _cfg->ihsize, I6_, _x, _tmp, _cb, _arg)
+
+#define HASH_IN4(x) djb_hash((const unsigned char *)(x), 8)
+
+static unsigned
+djb_hash(const unsigned char *h, const int len)
+{
+ unsigned int result = 0;
+ int i;
+
+ for (i = 0; i < len; i++)
+ result = 33 * result ^ h[i];
+
+ return (result);
+}
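djb_hash() above is the classic Bernstein string hash (h = 33 * h ^ byte), applied to the 16 bytes of an IPv6 address or to an 8-byte IPv4 flow key; the result is presumably then reduced to a slot of the host hash (e.g. via cfg->ihsize) by the CHT_* macros. A standalone sketch (the key bytes and table size below are illustrative only):

    #include <stdio.h>

    /* Same update rule as djb_hash() above. */
    static unsigned
    djb(const unsigned char *h, int len)
    {
        unsigned result = 0;
        int i;

        for (i = 0; i < len; i++)
            result = 33 * result ^ h[i];
        return (result);
    }

    int
    main(void)
    {
        /* First bytes of 2001:db8::1, rest zero -- illustrative key only. */
        unsigned char key[16] = { 0x20, 0x01, 0x0d, 0xb8 };
        unsigned nslots = 1024;    /* made-up table size */

        key[15] = 0x01;
        printf("hash %#x -> slot %u\n", djb(key, 16), djb(key, 16) % nslots);
        return (0);
    }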
+
+/*
+static size_t
+bitmask_size(size_t num, int *level)
+{
+ size_t x;
+ int c;
+
+ for (c = 0, x = num; num > 1; num /= 64, c++)
+ ;
+
+ return (x);
+}
+
+static void
+bitmask_prepare(uint64_t *pmask, size_t bufsize, int level)
+{
+ size_t x, z;
+
+ memset(pmask, 0xFF, bufsize);
+ for (x = 0, z = 1; level > 1; x += z, z *= 64, level--)
+ ;
+ pmask[x] ~= 0x01;
+}
+*/
+
+static void
+nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family,
+ uint32_t n, uint32_t sn)
+{
+
+ memset(plog, 0, sizeof(*plog)); /* zero the whole header, not the pointer */
+ plog->length = PFLOG_REAL_HDRLEN;
+ plog->af = family;
+ plog->action = PF_NAT;
+ plog->dir = PF_IN;
+ plog->rulenr = htonl(n);
+ plog->subrulenr = htonl(sn);
+ plog->ruleset[0] = '\0';
+ strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname));
+ ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m);
+}
+/*
+ * Inspect ICMP packets to see if the message contains a different
+ * packet header, in which case we need to alter @addr and @port.
+ */
+static int
+inspect_icmp_mbuf(struct mbuf **m, uint8_t *nat_proto, uint32_t *addr,
+ uint16_t *port)
+{
+ struct ip *ip;
+ struct tcphdr *tcp;
+ struct udphdr *udp;
+ struct icmphdr *icmp;
+ int off;
+ uint8_t proto;
+
+ ip = mtod(*m, struct ip *); /* Outer IP header */
+ off = (ip->ip_hl << 2) + ICMP_MINLEN;
+ if ((*m)->m_len < off)
+ *m = m_pullup(*m, off);
+ if (*m == NULL)
+ return (ENOMEM);
+
+ ip = mtod(*m, struct ip *); /* Outer IP header */
+ icmp = L3HDR(ip, struct icmphdr *);
+ switch (icmp->icmp_type) {
+ case ICMP_ECHO:
+ case ICMP_ECHOREPLY:
+ /* Use icmp ID as distinguisher */
+ *port = ntohs(*((uint16_t *)(icmp + 1)));
+ return (0);
+ case ICMP_UNREACH:
+ case ICMP_TIMXCEED:
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+ /*
+ * ICMP_UNREACH and ICMP_TIMXCEED contain the IP header + 64 bits
+ * of the ULP header.
+ */
+ if ((*m)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN)
+ return (EINVAL);
+ if ((*m)->m_len < off + sizeof(struct ip) + ICMP_MINLEN)
+ *m = m_pullup(*m, off + sizeof(struct ip) + ICMP_MINLEN);
+ if (*m == NULL)
+ return (ENOMEM);
+ ip = mtodo(*m, off); /* Inner IP header */
+ proto = ip->ip_p;
+ off += ip->ip_hl << 2; /* Skip inner IP header */
+ *addr = ntohl(ip->ip_src.s_addr);
+ if ((*m)->m_len < off + ICMP_MINLEN)
+ *m = m_pullup(*m, off + ICMP_MINLEN);
+ if (*m == NULL)
+ return (ENOMEM);
+ switch (proto) {
+ case IPPROTO_TCP:
+ tcp = mtodo(*m, off);
+ *nat_proto = NAT_PROTO_TCP;
+ *port = ntohs(tcp->th_sport);
+ return (0);
+ case IPPROTO_UDP:
+ udp = mtodo(*m, off);
+ *nat_proto = NAT_PROTO_UDP;
+ *port = ntohs(udp->uh_sport);
+ return (0);
+ case IPPROTO_ICMP:
+ /*
+ * We will translate only ICMP errors for our ICMP
+ * echo requests.
+ */
+ icmp = mtodo(*m, off);
+ if (icmp->icmp_type != ICMP_ECHO)
+ return (EOPNOTSUPP);
+ *port = ntohs(*((uint16_t *)(icmp + 1)));
+ return (0);
+ };
+ return (EOPNOTSUPP);
+}
+
+static inline uint8_t
+convert_tcp_flags(uint8_t flags)
+{
+ uint8_t result;
+
+ result = flags & (TH_FIN|TH_SYN);
+ result |= (flags & TH_RST) >> 2; /* Treat RST as FIN */
+ result |= (flags & TH_ACK) >> 2; /* Treat ACK as estab */
+
+ return (result);
+}
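convert_tcp_flags() packs the observed TCP flags into the NAT64_FLAG_* state bits defined earlier in this file: FIN and SYN map directly, while RST is tracked like FIN and ACK like an established connection (both via the right shift by two). A standalone illustration, a sketch only (the TH_* values match <netinet/tcp.h>):

    #include <stdio.h>

    #define TH_FIN  0x01
    #define TH_SYN  0x02
    #define TH_RST  0x04
    #define TH_ACK  0x10

    #define NAT64_FLAG_FIN    0x01
    #define NAT64_FLAG_SYN    0x02
    #define NAT64_FLAG_ESTAB  0x04

    static unsigned char
    convert_tcp_flags(unsigned char flags)
    {
        unsigned char result;

        result = flags & (TH_FIN | TH_SYN);
        result |= (flags & TH_RST) >> 2;    /* RST -> FIN bit */
        result |= (flags & TH_ACK) >> 2;    /* ACK -> ESTAB bit */
        return (result);
    }

    int
    main(void)
    {
        printf("SYN     -> %#x\n", convert_tcp_flags(TH_SYN));          /* 0x2 */
        printf("SYN|ACK -> %#x\n", convert_tcp_flags(TH_SYN | TH_ACK)); /* 0x6 */
        printf("RST     -> %#x\n", convert_tcp_flags(TH_RST));          /* 0x1 */
        return (0);
    }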
+
+static NAT64NOINLINE int
+nat64lsn_translate4(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id,
+ struct mbuf **pm)
+{
+ struct pfloghdr loghdr, *logdata;
+ struct in6_addr src6;
+ struct nat64lsn_portgroup *pg;
+ struct nat64lsn_host *nh;
+ struct nat64lsn_state *st;
+ struct ip *ip;
+ uint32_t addr;
+ uint16_t state_flags, state_ts;
+ uint16_t port, lport;
+ uint8_t nat_proto;
+ int ret;
+
+ addr = f_id->dst_ip;
+ port = f_id->dst_port;
+ if (addr < cfg->prefix4 || addr > cfg->pmask4) {
+ NAT64STAT_INC(&cfg->stats, nomatch4);
+ return (cfg->nomatch_verdict);
+ }
+
+ /* Check if protocol is supported and get its short id */
+ nat_proto = nat64lsn_proto_map[f_id->proto];
+ if (nat_proto == 0) {
+ NAT64STAT_INC(&cfg->stats, noproto);
+ return (cfg->nomatch_verdict);
+ }
+
+ /* We might need to handle icmp differently */
+ if (nat_proto == NAT_PROTO_ICMP) {
+ ret = inspect_icmp_mbuf(pm, &nat_proto, &addr, &port);
+ if (ret != 0) {
+ if (ret == ENOMEM)
+ NAT64STAT_INC(&cfg->stats, nomem);
+ else
+ NAT64STAT_INC(&cfg->stats, noproto);
+ return (cfg->nomatch_verdict);
+ }
+ /* XXX: Check addr for validity */
+ if (addr < cfg->prefix4 || addr > cfg->pmask4) {
+ NAT64STAT_INC(&cfg->stats, nomatch4);
+ return (cfg->nomatch_verdict);
+ }
+ }
+
+ /* Calc portgroup offset w.r.t protocol */
+ pg = GET_PORTGROUP(cfg, addr, nat_proto, port);
+
+ /* Check if this port is occupied by any portgroup */
+ if (pg == NULL) {
+ NAT64STAT_INC(&cfg->stats, nomatch4);
+#if 0
+ DPRINTF(DP_STATE, "NOMATCH %u %d %d (%d)", addr, nat_proto, port,
+ _GET_PORTGROUP_IDX(cfg, addr, nat_proto, port));
+#endif
+ return (cfg->nomatch_verdict);
+ }
+
+ /* TODO: Check flags to see if we need to do some static mapping */
+ nh = pg->host;
+
+ /* Prepare some fields we might need to update */
+ SET_AGE(state_ts);
+ ip = mtod(*pm, struct ip *);
+ if (ip->ip_p == IPPROTO_TCP)
+ state_flags = convert_tcp_flags(
+ L3HDR(ip, struct tcphdr *)->th_flags);
+ else
+ state_flags = 0;
+
+ /* Lock host and get port mapping */
+ NAT64_LOCK(nh);
+
+ st = &pg->states[port & (NAT64_CHUNK_SIZE - 1)];
+ if (st->timestamp != state_ts)
+ st->timestamp = state_ts;
+ if ((st->flags & state_flags) != state_flags)
+ st->flags |= state_flags;
+ lport = htons(st->u.s.lport);
+
+ NAT64_UNLOCK(nh);
+
+ if (cfg->flags & NAT64_LOG) {
+ logdata = &loghdr;
+ nat64lsn_log(logdata, *pm, AF_INET, pg->idx, st->cur.off);
+ } else
+ logdata = NULL;
+
+ src6.s6_addr32[0] = cfg->prefix6.s6_addr32[0];
+ src6.s6_addr32[1] = cfg->prefix6.s6_addr32[1];
+ src6.s6_addr32[2] = cfg->prefix6.s6_addr32[2];
+ src6.s6_addr32[3] = htonl(f_id->src_ip);
+
+ ret = nat64_do_handle_ip4(*pm, &src6, &nh->addr, lport,
+ &cfg->stats, logdata);
+
+ if (ret == NAT64SKIP)
+ return (IP_FW_PASS);
+ if (ret == NAT64MFREE)
+ m_freem(*pm);
+ *pm = NULL;
+
+ return (IP_FW_DENY);
+}
+
+void
+nat64lsn_dump_state(const struct nat64lsn_cfg *cfg,
+ const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st,
+ const char *px, int off)
+{
+ char s[INET6_ADDRSTRLEN], a[INET_ADDRSTRLEN], d[INET_ADDRSTRLEN];
+
+ if ((nat64_debug & DP_STATE) == 0)
+ return;
+ inet_ntop(AF_INET6, &pg->host->addr, s, sizeof(s));
+ inet_ntop(AF_INET, &pg->aaddr, a, sizeof(a));
+ inet_ntop(AF_INET, &st->u.s.faddr, d, sizeof(d));
+
+ DPRINTF(DP_STATE, "%s: PG %d ST [%p|%d]: %s:%d/%d <%s:%d> "
+ "%s:%d AGE %d", px, pg->idx, st, off,
+ s, st->u.s.lport, pg->nat_proto, a, pg->aport + off,
+ d, st->u.s.fport, GET_AGE(st->timestamp));
+}
+
+/*
+ * Check if particular TCP state is stale and should be deleted.
+ * Return 1 if true, 0 otherwise.
+ */
+static int
+nat64lsn_periodic_check_tcp(const struct nat64lsn_cfg *cfg,
+ const struct nat64lsn_state *st, int age)
+{
+ int ttl;
+
+ if (st->flags & NAT64_FLAG_FIN)
+ ttl = cfg->st_close_ttl;
+ else if (st->flags & NAT64_FLAG_ESTAB)
+ ttl = cfg->st_estab_ttl;
+ else if (st->flags & NAT64_FLAG_SYN)
+ ttl = cfg->st_syn_ttl;
+ else
+ ttl = cfg->st_syn_ttl;
+
+ if (age > ttl)
+ return (1);
+ return (0);
+}
+
+/*
+ * Check if nat state @st is stale and should be deleted.
+ * Return 1 if true, 0 otherwise.
+ */
+static NAT64NOINLINE int
+nat64lsn_periodic_chkstate(const struct nat64lsn_cfg *cfg,
+ const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st)
+{
+ int age, delete;
+
+ age = GET_AGE(st->timestamp);
+ delete = 0;
+
+ /* Skip immutable records */
+ if (st->flags & NAT64_FLAG_RDR)
+ return (0);
+
+ switch (pg->nat_proto) {
+ case NAT_PROTO_TCP:
+ delete = nat64lsn_periodic_check_tcp(cfg, st, age);
+ break;
+ case NAT_PROTO_UDP:
+ if (age > cfg->st_udp_ttl)
+ delete = 1;
+ break;
+ case NAT_PROTO_ICMP:
+ if (age > cfg->st_icmp_ttl)
+ delete = 1;
+ break;
+ }
+
+ return (delete);
+}
+
+
+/*
+ * The following structures and functions
+ * are used to perform SLIST_FOREACH_SAFE()
+ * analog for states identified by struct st_ptr.
+ */
+
+struct st_idx {
+ struct nat64lsn_portgroup *pg;
+ struct nat64lsn_state *st;
+ struct st_ptr sidx_next;
+};
+
+static struct st_idx *
+st_first(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh,
+ struct st_ptr *sidx, struct st_idx *si)
+{
+ struct nat64lsn_portgroup *pg;
+ struct nat64lsn_state *st;
+
+ if (sidx->idx == 0) {
+ memset(si, 0, sizeof(*si));
+ return (si);
+ }
+
+ pg = PORTGROUP_BYSIDX(cfg, nh, sidx->idx);
+ st = &pg->states[sidx->off];
+
+ si->pg = pg;
+ si->st = st;
+ si->sidx_next = st->next;
+
+ return (si);
+}
+
+static struct st_idx *
+st_next(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh,
+ struct st_idx *si)
+{
+ struct st_ptr sidx;
+ struct nat64lsn_portgroup *pg;
+ struct nat64lsn_state *st;
+
+ sidx = si->sidx_next;
+ if (sidx.idx == 0) {
+ memset(si, 0, sizeof(*si));
+ si->st = NULL;
+ si->pg = NULL;
+ return (si);
+ }
+
+ pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx);
+ st = &pg->states[sidx.off];
+
+ si->pg = pg;
+ si->st = st;
+ si->sidx_next = st->next;
+
+ return (si);
+}
+
+static struct st_idx *
+st_save_cond(struct st_idx *si_dst, struct st_idx *si)
+{
+ if (si->st != NULL)
+ *si_dst = *si;
+
+ return (si_dst);
+}
+
+unsigned int
+nat64lsn_periodic_chkstates(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh)
+{
+ struct st_idx si, si_prev;
+ int i;
+ unsigned int delcount;
+
+ delcount = 0;
+ for (i = 0; i < nh->hsize; i++) {
+ memset(&si_prev, 0, sizeof(si_prev));
+ for (st_first(cfg, nh, &nh->phash[i], &si);
+ si.st != NULL;
+ st_save_cond(&si_prev, &si), st_next(cfg, nh, &si)) {
+ if (nat64lsn_periodic_chkstate(cfg, si.pg, si.st) == 0)
+ continue;
+ nat64lsn_dump_state(cfg, si.pg, si.st, "DELETE STATE",
+ si.st->cur.off);
+ /* Unlink from hash */
+ if (si_prev.st != NULL)
+ si_prev.st->next = si.st->next;
+ else
+ nh->phash[i] = si.st->next;
+ /* Delete state and free its data */
+ PG_MARK_FREE_IDX(si.pg, si.st->cur.off);
+ memset(si.st, 0, sizeof(struct nat64lsn_state));
+ si.st = NULL;
+ delcount++;
+
+ /* Update portgroup timestamp */
+ SET_AGE(si.pg->timestamp);
+ }
+ }
+ NAT64STAT_ADD(&cfg->stats, sdeleted, delcount);
+ return (delcount);
+}
+
+/*
+ * Checks if the portgroup is unused and can be deleted.
+ * Returns 1 if stale, 0 otherwise.
+ */
+static int
+stale_pg(const struct nat64lsn_cfg *cfg, const struct nat64lsn_portgroup *pg)
+{
+
+ if (!PG_IS_EMPTY(pg))
+ return (0);
+ if (GET_AGE(pg->timestamp) < cfg->pg_delete_delay)
+ return (0);
+ return (1);
+}
+
+/*
+ * Checks if the host record is unused and can be deleted.
+ * Returns 1 if stale, 0 otherwise.
+ */
+static int
+stale_nh(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh)
+{
+
+ if (nh->pg_used != 0)
+ return (0);
+ if (GET_AGE(nh->timestamp) < cfg->nh_delete_delay)
+ return (0);
+ return (1);
+}
+
+struct nat64lsn_periodic_data {
+ struct nat64lsn_cfg *cfg;
+ struct nat64lsn_job_head jhead;
+ int jlen;
+};
+
+static NAT64NOINLINE int
+nat64lsn_periodic_chkhost(struct nat64lsn_host *nh,
+ struct nat64lsn_periodic_data *d)
+{
+ char a[INET6_ADDRSTRLEN];
+ struct nat64lsn_portgroup *pg;
+ struct nat64lsn_job_item *ji;
+ uint64_t delmask[NAT64LSN_PGPTRNMASK];
+ int delcount, i;
+
+ delcount = 0;
+ memset(delmask, 0, sizeof(delmask));
+
+ inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
+ DPRINTF(DP_JQUEUE, "Checking %s host %s on cpu %d",
+ stale_nh(d->cfg, nh) ? "stale" : "non-stale", a, curcpu);
+ if (!stale_nh(d->cfg, nh)) {
+ /* Non-stale host. Inspect internals */
+ NAT64_LOCK(nh);
+
+ /* Stage 1: Check&expire states */
+ if (nat64lsn_periodic_chkstates(d->cfg, nh) != 0)
+ SET_AGE(nh->timestamp);
+
+ /* Stage 2: Check if we need to expire */
+ for (i = 0; i < nh->pg_used; i++) {
+ pg = PORTGROUP_BYSIDX(d->cfg, nh, i + 1);
+ if (pg == NULL)
+ continue;
+
+ /* Check if we can delete portgroup */
+ if (stale_pg(d->cfg, pg) == 0)
+ continue;
+
+ DPRINTF(DP_JQUEUE, "Check PG %d", i);
+ delmask[i / 64] |= ((uint64_t)1 << (i % 64));
+ delcount++;
+ }
+
+ NAT64_UNLOCK(nh);
+ if (delcount == 0)
+ return (0);
+ }
+
+ DPRINTF(DP_JQUEUE, "Queueing %d portgroups for deleting", delcount);
+ /* We have something to delete - add it to queue */
+ ji = nat64lsn_create_job(d->cfg, NULL, JTYPE_DELPORTGROUP);
+ if (ji == NULL)
+ return (0);
+
+ ji->haddr = nh->addr;
+ ji->delcount = delcount;
+ memcpy(ji->delmask, delmask, sizeof(ji->delmask));
+
+ TAILQ_INSERT_TAIL(&d->jhead, ji, next);
+ d->jlen++;
+ return (0);
+}
+
+/*
+ * This procedure is used to perform various maintenance
+ * on the dynamic hash list. It is rescheduled every PERIODIC_DELAY seconds.
+ */
+static void
+nat64lsn_periodic(void *data)
+{
+ struct ip_fw_chain *ch;
+ IPFW_RLOCK_TRACKER;
+ struct nat64lsn_cfg *cfg;
+ struct nat64lsn_periodic_data d;
+ struct nat64lsn_host *nh, *tmp;
+
+ cfg = (struct nat64lsn_cfg *) data;
+ ch = cfg->ch;
+ CURVNET_SET(cfg->vp);
+
+ memset(&d, 0, sizeof(d));
+ d.cfg = cfg;
+ TAILQ_INIT(&d.jhead);
+
+ IPFW_RLOCK(ch);
+
+ /* Stage 1: foreach host, check all its portgroups */
+ I6HASH_FOREACH_SAFE(cfg, nh, tmp, nat64lsn_periodic_chkhost, &d);
+
+ /* Enqueue everything we have requested */
+ nat64lsn_enqueue_jobs(cfg, &d.jhead, d.jlen);
+
+ callout_schedule(&cfg->periodic, hz * PERIODIC_DELAY);
+
+ IPFW_RUNLOCK(ch);
+
+ CURVNET_RESTORE();
+}
+
+static NAT64NOINLINE void
+reinject_mbuf(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
+{
+
+ if (ji->m == NULL)
+ return;
+
+ /* Request has failed or packet type is wrong */
+ if (ji->f_id.addr_type != 6 || ji->done == 0) {
+ m_freem(ji->m);
+ ji->m = NULL;
+ NAT64STAT_INC(&cfg->stats, dropped);
+ DPRINTF(DP_DROPS, "mbuf dropped: type %d, done %d",
+ ji->jtype, ji->done);
+ return;
+ }
+
+ /*
+ * XXX: Limit recursion level
+ */
+
+ NAT64STAT_INC(&cfg->stats, jreinjected);
+ DPRINTF(DP_JQUEUE, "Reinject mbuf");
+ nat64lsn_translate6(cfg, &ji->f_id, &ji->m);
+}
+
+static void
+destroy_portgroup(struct nat64lsn_portgroup *pg)
+{
+
+ DPRINTF(DP_OBJ, "DESTROY PORTGROUP %d %p", pg->idx, pg);
+ uma_zfree(nat64lsn_pg_zone, pg);
+}
+
+static NAT64NOINLINE int
+alloc_portgroup(struct nat64lsn_job_item *ji)
+{
+ struct nat64lsn_portgroup *pg;
+
+ pg = uma_zalloc(nat64lsn_pg_zone, M_NOWAIT);
+ if (pg == NULL)
+ return (1);
+
+ if (ji->needs_idx != 0) {
+ ji->spare_idx = uma_zalloc(nat64lsn_pgidx_zone, M_NOWAIT);
+ /* Failed alloc isn't always fatal, so don't check */
+ }
+ memset(&pg->freemask, 0xFF, sizeof(pg->freemask));
+ pg->nat_proto = ji->nat_proto;
+ ji->pg = pg;
+ return (0);
+
+}
+
+static void
+destroy_host6(struct nat64lsn_host *nh)
+{
+ char a[INET6_ADDRSTRLEN];
+ int i;
+
+ inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
+ DPRINTF(DP_OBJ, "DESTROY HOST %s %p (pg used %d)", a, nh,
+ nh->pg_used);
+ NAT64_LOCK_DESTROY(nh);
+ for (i = 0; i < nh->pg_allocated / NAT64LSN_PGIDX_CHUNK; i++)
+ uma_zfree(nat64lsn_pgidx_zone, PORTGROUP_CHUNK(nh, i));
+ uma_zfree(nat64lsn_host_zone, nh);
+}
+
+static NAT64NOINLINE int
+alloc_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
+{
+ struct nat64lsn_host *nh;
+ char a[INET6_ADDRSTRLEN];
+
+ nh = uma_zalloc(nat64lsn_host_zone, M_NOWAIT);
+ if (nh == NULL)
+ return (1);
+ PORTGROUP_CHUNK(nh, 0) = uma_zalloc(nat64lsn_pgidx_zone, M_NOWAIT);
+ if (PORTGROUP_CHUNK(nh, 0) == NULL) {
+ uma_zfree(nat64lsn_host_zone, nh);
+ return (2);
+ }
+ if (alloc_portgroup(ji) != 0) {
+ NAT64STAT_INC(&cfg->stats, jportfails);
+ uma_zfree(nat64lsn_pgidx_zone, PORTGROUP_CHUNK(nh, 0));
+ uma_zfree(nat64lsn_host_zone, nh);
+ return (3);
+ }
+
+ NAT64_LOCK_INIT(nh);
+ nh->addr = ji->haddr;
+ nh->hsize = NAT64LSN_HSIZE; /* XXX: hardcoded size */
+ nh->pg_allocated = NAT64LSN_PGIDX_CHUNK;
+ nh->pg_used = 0;
+ ji->nh = nh;
+
+ inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
+ DPRINTF(DP_OBJ, "ALLOC HOST %s %p", a, ji->nh);
+ return (0);
+}
+
+/*
+ * Finds free @pg index inside @nh
+ */
+static NAT64NOINLINE int
+find_nh_pg_idx(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh, int *idx)
+{
+ int i;
+
+ for (i = 0; i < nh->pg_allocated; i++) {
+ if (PORTGROUP_BYSIDX(cfg, nh, i + 1) == NULL) {
+ *idx = i;
+ return (0);
+ }
+ }
+ return (1);
+}
+
+static NAT64NOINLINE int
+attach_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
+{
+ char a[INET6_ADDRSTRLEN];
+ struct nat64lsn_host *nh;
+
+ I6HASH_FIND(cfg, nh, &ji->haddr);
+ if (nh == NULL) {
+ /* Add new host to list */
+ nh = ji->nh;
+ I6HASH_INSERT(cfg, nh);
+ cfg->ihcount++;
+ ji->nh = NULL;
+
+ inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
+ DPRINTF(DP_OBJ, "ATTACH HOST %s %p", a, nh);
+ /*
+ * Try to add portgroup.
+ * Note it will automatically set
+ * 'done' on ji if successful.
+ */
+ if (attach_portgroup(cfg, ji) != 0) {
+ DPRINTF(DP_DROPS, "%s %p failed to attach PG",
+ a, nh);
+ NAT64STAT_INC(&cfg->stats, jportfails);
+ return (1);
+ }
+ return (0);
+ }
+
+ /*
+	 * nh isn't NULL. This probably means we had several simultaneous
+	 * host requests and a previous request has already attached
+	 * this host. Requeue the attached mbuf and mark the job as done,
+	 * but leave the nh and pg pointers unchanged, so
+	 * nat64lsn_do_request() will release all allocated resources.
+ */
+ inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
+ DPRINTF(DP_OBJ, "%s %p is already attached as %p",
+ a, ji->nh, nh);
+ ji->done = 1;
+ return (0);
+}
+
+static NAT64NOINLINE int
+find_pg_place_addr(const struct nat64lsn_cfg *cfg, int addr_off,
+ int nat_proto, uint16_t *aport, int *ppg_idx)
+{
+ int j, pg_idx;
+
+ pg_idx = addr_off * _ADDR_PG_COUNT +
+ (nat_proto - 1) * _ADDR_PG_PROTO_COUNT;
+
+ for (j = NAT64_MIN_CHUNK; j < _ADDR_PG_PROTO_COUNT; j++) {
+ if (cfg->pg[pg_idx + j] != NULL)
+ continue;
+
+ *aport = j * NAT64_CHUNK_SIZE;
+ *ppg_idx = pg_idx + j;
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * XXX: This function needs to be rewritten to
+ * use free bitmask for faster pg finding,
+ * additionally, it should take into consideration
+ * a) randomization and
+ * b) previous addresses allocated to given nat instance
+ *
+ */
+static NAT64NOINLINE int
+find_portgroup_place(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji,
+ uint32_t *aaddr, uint16_t *aport, int *ppg_idx)
+{
+ int i, nat_proto;
+
+ /*
+ * XXX: Use bitmask index to be able to find/check if IP address
+ * has some spare pg's
+ */
+ nat_proto = ji->nat_proto;
+
+ /* First, try to use same address */
+ if (ji->aaddr != 0) {
+ i = ntohl(ji->aaddr) - cfg->prefix4;
+ if (find_pg_place_addr(cfg, i, nat_proto, aport,
+ ppg_idx) != 0){
+ /* Found! */
+ *aaddr = htonl(cfg->prefix4 + i);
+ return (0);
+ }
+ }
+
+ /* Next, try to use random address based on flow hash */
+ i = ji->fhash % (1 << (32 - cfg->plen4));
+ if (find_pg_place_addr(cfg, i, nat_proto, aport, ppg_idx) != 0) {
+ /* Found! */
+ *aaddr = htonl(cfg->prefix4 + i);
+ return (0);
+ }
+
+
+ /* Last one: simply find ANY available */
+ for (i = 0; i < (1 << (32 - cfg->plen4)); i++) {
+ if (find_pg_place_addr(cfg, i, nat_proto, aport,
+ ppg_idx) != 0){
+ /* Found! */
+ *aaddr = htonl(cfg->prefix4 + i);
+ return (0);
+ }
+ }
+
+ return (1);
+}
+
+static NAT64NOINLINE int
+attach_portgroup(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
+{
+ char a[INET6_ADDRSTRLEN];
+ struct nat64lsn_portgroup *pg;
+ struct nat64lsn_host *nh;
+ uint32_t aaddr;
+ uint16_t aport;
+ int nh_pg_idx, pg_idx;
+
+ pg = ji->pg;
+
+ /*
+ * Find source host and bind: we can't rely on
+ * pg->host
+ */
+ I6HASH_FIND(cfg, nh, &ji->haddr);
+ if (nh == NULL)
+ return (1);
+
+ /* Find spare port chunk */
+ if (find_portgroup_place(cfg, ji, &aaddr, &aport, &pg_idx) != 0) {
+ inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
+ DPRINTF(DP_OBJ | DP_DROPS, "empty PG not found for %s", a);
+ return (2);
+ }
+
+ /* Expand PG indexes if needed */
+ if (nh->pg_allocated < cfg->max_chunks && ji->spare_idx != NULL) {
+ PORTGROUP_CHUNK(nh, nh->pg_allocated / NAT64LSN_PGIDX_CHUNK) =
+ ji->spare_idx;
+ nh->pg_allocated += NAT64LSN_PGIDX_CHUNK;
+ ji->spare_idx = NULL;
+ }
+
+ /* Find empty index to store PG in the @nh */
+ if (find_nh_pg_idx(cfg, nh, &nh_pg_idx) != 0) {
+ inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
+ DPRINTF(DP_OBJ | DP_DROPS, "free PG index not found for %s",
+ a);
+ return (3);
+ }
+
+ cfg->pg[pg_idx] = pg;
+ cfg->protochunks[pg->nat_proto]++;
+ NAT64STAT_INC(&cfg->stats, spgcreated);
+
+ pg->aaddr = aaddr;
+ pg->aport = aport;
+ pg->host = nh;
+ pg->idx = pg_idx;
+ SET_AGE(pg->timestamp);
+
+ PORTGROUP_BYSIDX(cfg, nh, nh_pg_idx + 1) = pg;
+ if (nh->pg_used == nh_pg_idx)
+ nh->pg_used++;
+ SET_AGE(nh->timestamp);
+
+ ji->pg = NULL;
+ ji->done = 1;
+
+ return (0);
+}
+
+static NAT64NOINLINE void
+consider_del_portgroup(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
+{
+ struct nat64lsn_host *nh, *nh_tmp;
+ struct nat64lsn_portgroup *pg, *pg_list[256];
+ int i, pg_lidx, idx;
+
+ /* Find source host */
+ I6HASH_FIND(cfg, nh, &ji->haddr);
+ if (nh == NULL || nh->pg_used == 0)
+ return;
+
+ memset(pg_list, 0, sizeof(pg_list));
+ pg_lidx = 0;
+
+ NAT64_LOCK(nh);
+
+ for (i = nh->pg_used - 1; i >= 0; i--) {
+ if ((ji->delmask[i / 64] & ((uint64_t)1 << (i % 64))) == 0)
+ continue;
+ pg = PORTGROUP_BYSIDX(cfg, nh, i + 1);
+
+ /* Check that PG isn't busy. */
+ if (stale_pg(cfg, pg) == 0)
+ continue;
+
+ /* DO delete */
+ pg_list[pg_lidx++] = pg;
+ PORTGROUP_BYSIDX(cfg, nh, i + 1) = NULL;
+
+ idx = _GET_PORTGROUP_IDX(cfg, ntohl(pg->aaddr), pg->nat_proto,
+ pg->aport);
+ KASSERT(cfg->pg[idx] == pg, ("Non matched pg"));
+ cfg->pg[idx] = NULL;
+ cfg->protochunks[pg->nat_proto]--;
+ NAT64STAT_INC(&cfg->stats, spgdeleted);
+
+ /* Decrease pg_used */
+ while (nh->pg_used > 0 &&
+ PORTGROUP_BYSIDX(cfg, nh, nh->pg_used) == NULL)
+ nh->pg_used--;
+
+ /* Check if on-stack buffer has ended */
+		/* Stop if the on-stack buffer is full */
+ break;
+ }
+
+ NAT64_UNLOCK(nh);
+
+ if (stale_nh(cfg, nh)) {
+ I6HASH_REMOVE(cfg, nh, nh_tmp, &ji->haddr);
+ KASSERT(nh != NULL, ("Unable to find address"));
+ cfg->ihcount--;
+ ji->nh = nh;
+ I6HASH_FIND(cfg, nh, &ji->haddr);
+ KASSERT(nh == NULL, ("Failed to delete address"));
+ }
+
+ /* TODO: Delay freeing portgroups */
+ while (pg_lidx > 0) {
+ pg_lidx--;
+ NAT64STAT_INC(&cfg->stats, spgdeleted);
+ destroy_portgroup(pg_list[pg_lidx]);
+ }
+}
+
+/*
+ * Main request handler.
+ * Responsible for handling the jqueue, e.g.
+ * creating new hosts and adding/deleting portgroups.
+ */
+static NAT64NOINLINE void
+nat64lsn_do_request(void *data)
+{
+ IPFW_RLOCK_TRACKER;
+ struct nat64lsn_job_head jhead;
+ struct nat64lsn_job_item *ji;
+ int jcount, nhsize;
+ struct nat64lsn_cfg *cfg = (struct nat64lsn_cfg *) data;
+ struct ip_fw_chain *ch;
+ int delcount;
+
+ CURVNET_SET(cfg->vp);
+
+ TAILQ_INIT(&jhead);
+
+ /* XXX: We're running unlocked here */
+
+ ch = cfg->ch;
+ delcount = 0;
+ IPFW_RLOCK(ch);
+
+ /* Grab queue */
+ JQUEUE_LOCK();
+ TAILQ_SWAP(&jhead, &cfg->jhead, nat64lsn_job_item, next);
+ jcount = cfg->jlen;
+ cfg->jlen = 0;
+ JQUEUE_UNLOCK();
+
+ /* check if we need to resize hash */
+ nhsize = 0;
+ if (cfg->ihcount > cfg->ihsize && cfg->ihsize < 65536) {
+ nhsize = cfg->ihsize;
+ for ( ; cfg->ihcount > nhsize && nhsize < 65536; nhsize *= 2)
+ ;
+ } else if (cfg->ihcount < cfg->ihsize * 4) {
+ nhsize = cfg->ihsize;
+ for ( ; cfg->ihcount < nhsize * 4 && nhsize > 32; nhsize /= 2)
+ ;
+ }
+
+ IPFW_RUNLOCK(ch);
+
+ if (TAILQ_EMPTY(&jhead)) {
+ CURVNET_RESTORE();
+ return;
+ }
+
+ NAT64STAT_INC(&cfg->stats, jcalls);
+ DPRINTF(DP_JQUEUE, "count=%d", jcount);
+
+ /*
+ * TODO:
+ * What we should do here is to build a hash
+ * to ensure we don't have lots of duplicate requests.
+ * Skip this for now.
+ *
+ * TODO: Limit per-call number of items
+ */
+
+ /* Pre-allocate everything for entire chain */
+ TAILQ_FOREACH(ji, &jhead, next) {
+ switch (ji->jtype) {
+ case JTYPE_NEWHOST:
+ if (alloc_host6(cfg, ji) != 0)
+ NAT64STAT_INC(&cfg->stats, jhostfails);
+ break;
+ case JTYPE_NEWPORTGROUP:
+ if (alloc_portgroup(ji) != 0)
+ NAT64STAT_INC(&cfg->stats, jportfails);
+ break;
+ case JTYPE_DELPORTGROUP:
+ delcount += ji->delcount;
+ break;
+ default:
+ break;
+ }
+ }
+
+ /*
+	 * TODO: Alloc new hash
+ */
+ nhsize = 0;
+ if (nhsize > 0) {
+ /* XXX: */
+ }
+
+ /* Apply all changes in batch */
+ IPFW_UH_WLOCK(ch);
+ IPFW_WLOCK(ch);
+
+ TAILQ_FOREACH(ji, &jhead, next) {
+ switch (ji->jtype) {
+ case JTYPE_NEWHOST:
+ if (ji->nh != NULL)
+ attach_host6(cfg, ji);
+ break;
+ case JTYPE_NEWPORTGROUP:
+ if (ji->pg != NULL &&
+ attach_portgroup(cfg, ji) != 0)
+ NAT64STAT_INC(&cfg->stats, jportfails);
+ break;
+ case JTYPE_DELPORTGROUP:
+ consider_del_portgroup(cfg, ji);
+ break;
+ }
+ }
+
+ if (nhsize > 0) {
+ /* XXX: Move everything to new hash */
+ }
+
+ IPFW_WUNLOCK(ch);
+ IPFW_UH_WUNLOCK(ch);
+
+ /* Flush unused entries */
+ while (!TAILQ_EMPTY(&jhead)) {
+ ji = TAILQ_FIRST(&jhead);
+ TAILQ_REMOVE(&jhead, ji, next);
+ if (ji->nh != NULL)
+ destroy_host6(ji->nh);
+ if (ji->pg != NULL)
+ destroy_portgroup(ji->pg);
+ if (ji->m != NULL)
+ reinject_mbuf(cfg, ji);
+ if (ji->spare_idx != NULL)
+ uma_zfree(nat64lsn_pgidx_zone, ji->spare_idx);
+ free(ji, M_IPFW);
+ }
+ CURVNET_RESTORE();
+}
+
+static NAT64NOINLINE struct nat64lsn_job_item *
+nat64lsn_create_job(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id,
+ int jtype)
+{
+ struct nat64lsn_job_item *ji;
+ struct in6_addr haddr;
+ uint8_t nat_proto;
+
+ /*
+	 * Do not try to lock a possibly contested mutex if we're near the
+	 * limit. Drop the packet instead.
+ */
+ if (cfg->jlen >= cfg->jmaxlen) {
+ NAT64STAT_INC(&cfg->stats, jmaxlen);
+ return (NULL);
+ }
+
+ memset(&haddr, 0, sizeof(haddr));
+ nat_proto = 0;
+ if (f_id != NULL) {
+ haddr = f_id->src_ip6;
+ nat_proto = nat64lsn_proto_map[f_id->proto];
+
+ DPRINTF(DP_JQUEUE, "REQUEST pg nat_proto %d on proto %d",
+ nat_proto, f_id->proto);
+
+ if (nat_proto == 0)
+ return (NULL);
+ }
+
+ ji = malloc(sizeof(struct nat64lsn_job_item), M_IPFW,
+ M_NOWAIT | M_ZERO);
+
+ if (ji == NULL) {
+ NAT64STAT_INC(&cfg->stats, jnomem);
+ return (NULL);
+ }
+
+ ji->jtype = jtype;
+
+ if (f_id != NULL) {
+ ji->f_id = *f_id;
+ ji->haddr = haddr;
+ ji->nat_proto = nat_proto;
+ }
+
+ return (ji);
+}
+
+static NAT64NOINLINE void
+nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
+{
+
+ if (ji == NULL)
+ return;
+
+ JQUEUE_LOCK();
+ TAILQ_INSERT_TAIL(&cfg->jhead, ji, next);
+ cfg->jlen++;
+ NAT64STAT_INC(&cfg->stats, jrequests);
+
+ if (callout_pending(&cfg->jcallout) == 0)
+ callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg);
+ JQUEUE_UNLOCK();
+}
+
+static NAT64NOINLINE void
+nat64lsn_enqueue_jobs(struct nat64lsn_cfg *cfg,
+ struct nat64lsn_job_head *jhead, int jlen)
+{
+
+ if (TAILQ_EMPTY(jhead))
+ return;
+
+ /* Attach current queue to execution one */
+ JQUEUE_LOCK();
+ TAILQ_CONCAT(&cfg->jhead, jhead, next);
+ cfg->jlen += jlen;
+ NAT64STAT_ADD(&cfg->stats, jrequests, jlen);
+
+ if (callout_pending(&cfg->jcallout) == 0)
+ callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg);
+ JQUEUE_UNLOCK();
+}
+
+static unsigned int
+flow6_hash(const struct ipfw_flow_id *f_id)
+{
+ unsigned char hbuf[36];
+
+ memcpy(hbuf, &f_id->dst_ip6, 16);
+ memcpy(&hbuf[16], &f_id->src_ip6, 16);
+ memcpy(&hbuf[32], &f_id->dst_port, 2);
+	memcpy(&hbuf[34], &f_id->src_port, 2);
+
+ return (djb_hash(hbuf, sizeof(hbuf)));
+}
+
+static NAT64NOINLINE int
+nat64lsn_request_host(struct nat64lsn_cfg *cfg,
+ const struct ipfw_flow_id *f_id, struct mbuf **pm)
+{
+ struct nat64lsn_job_item *ji;
+ struct mbuf *m;
+
+ m = *pm;
+ *pm = NULL;
+
+ ji = nat64lsn_create_job(cfg, f_id, JTYPE_NEWHOST);
+ if (ji == NULL) {
+ m_freem(m);
+ NAT64STAT_INC(&cfg->stats, dropped);
+ DPRINTF(DP_DROPS, "failed to create job");
+ } else {
+ ji->m = m;
+ /* Provide pseudo-random value based on flow */
+ ji->fhash = flow6_hash(f_id);
+ nat64lsn_enqueue_job(cfg, ji);
+ NAT64STAT_INC(&cfg->stats, jhostsreq);
+ }
+
+ return (IP_FW_PASS);
+}
+
+static NAT64NOINLINE int
+nat64lsn_request_portgroup(struct nat64lsn_cfg *cfg,
+ const struct ipfw_flow_id *f_id, struct mbuf **pm, uint32_t aaddr,
+ int needs_idx)
+{
+ struct nat64lsn_job_item *ji;
+ struct mbuf *m;
+
+ m = *pm;
+ *pm = NULL;
+
+ ji = nat64lsn_create_job(cfg, f_id, JTYPE_NEWPORTGROUP);
+ if (ji == NULL) {
+ m_freem(m);
+ NAT64STAT_INC(&cfg->stats, dropped);
+ DPRINTF(DP_DROPS, "failed to create job");
+ } else {
+ ji->m = m;
+ /* Provide pseudo-random value based on flow */
+ ji->fhash = flow6_hash(f_id);
+ ji->aaddr = aaddr;
+ ji->needs_idx = needs_idx;
+ nat64lsn_enqueue_job(cfg, ji);
+ NAT64STAT_INC(&cfg->stats, jportreq);
+ }
+
+ return (IP_FW_PASS);
+}
+
+static NAT64NOINLINE struct nat64lsn_state *
+nat64lsn_create_state(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh,
+ int nat_proto, struct nat64lsn_state *kst, uint32_t *aaddr)
+{
+ struct nat64lsn_portgroup *pg;
+ struct nat64lsn_state *st;
+ int i, hval, off;
+
+ /* XXX: create additional bitmask for selecting proper portgroup */
+ for (i = 0; i < nh->pg_used; i++) {
+ pg = PORTGROUP_BYSIDX(cfg, nh, i + 1);
+ if (pg == NULL)
+ continue;
+ if (*aaddr == 0)
+ *aaddr = pg->aaddr;
+ if (pg->nat_proto != nat_proto)
+ continue;
+
+ off = PG_GET_FREE_IDX(pg);
+ if (off != 0) {
+ /* We have found spare state. Use it */
+ off--;
+ PG_MARK_BUSY_IDX(pg, off);
+ st = &pg->states[off];
+
+ /*
+ * Fill in new info. Assume state was zeroed.
+ * Timestamp and flags will be filled by caller.
+ */
+ st->u.s = kst->u.s;
+ st->cur.idx = i + 1;
+ st->cur.off = off;
+
+ /* Insert into host hash table */
+ hval = HASH_IN4(&st->u.hkey) & (nh->hsize - 1);
+ st->next = nh->phash[hval];
+ nh->phash[hval] = st->cur;
+
+ nat64lsn_dump_state(cfg, pg, st, "ALLOC STATE", off);
+
+ NAT64STAT_INC(&cfg->stats, screated);
+
+ return (st);
+ }
+		/* Save last used alias address */
+ *aaddr = pg->aaddr;
+ }
+
+ return (NULL);
+}
+
+static NAT64NOINLINE int
+nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id,
+ struct mbuf **pm)
+{
+ struct pfloghdr loghdr, *logdata;
+ char a[INET6_ADDRSTRLEN];
+ struct nat64lsn_host *nh;
+ struct st_ptr sidx;
+ struct nat64lsn_state *st, kst;
+ struct nat64lsn_portgroup *pg;
+ struct icmp6_hdr *icmp6;
+ uint32_t aaddr;
+ int action, hval, nat_proto, proto;
+ uint16_t aport, state_ts, state_flags;
+
+	/* Check if af/protocol is supported and get its short id */
+ nat_proto = nat64lsn_proto_map[f_id->proto];
+ if (nat_proto == 0) {
+ /*
+		 * Since we can be called from the jobs handler, we need
+		 * to free the mbuf ourselves and not leave this task to
+		 * ipfw_check_packet().
+ */
+ NAT64STAT_INC(&cfg->stats, noproto);
+ m_freem(*pm);
+ *pm = NULL;
+ return (IP_FW_DENY);
+ }
+
+ /* Try to find host first */
+ I6HASH_FIND(cfg, nh, &f_id->src_ip6);
+
+ if (nh == NULL)
+ return (nat64lsn_request_host(cfg, f_id, pm));
+
+ /* Fill-in on-stack state structure */
+ kst.u.s.faddr = f_id->dst_ip6.s6_addr32[3];
+ kst.u.s.fport = f_id->dst_port;
+ kst.u.s.lport = f_id->src_port;
+
+ /* Prepare some fields we might need to update */
+ hval = 0;
+ proto = nat64_getlasthdr(*pm, &hval);
+ if (proto < 0) {
+ NAT64STAT_INC(&cfg->stats, dropped);
+		DPRINTF(DP_DROPS, "dropped because mbuf isn't contiguous");
+ m_freem(*pm);
+ *pm = NULL;
+ return (IP_FW_DENY);
+ }
+
+ SET_AGE(state_ts);
+ if (proto == IPPROTO_TCP)
+ state_flags = convert_tcp_flags(
+ TCP(mtodo(*pm, hval))->th_flags);
+ else
+ state_flags = 0;
+ if (proto == IPPROTO_ICMPV6) {
+ /* Alter local port data */
+ icmp6 = mtodo(*pm, hval);
+ if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST ||
+ icmp6->icmp6_type == ICMP6_ECHO_REPLY)
+ kst.u.s.lport = ntohs(icmp6->icmp6_id);
+ }
+
+ hval = HASH_IN4(&kst.u.hkey) & (nh->hsize - 1);
+ pg = NULL;
+ st = NULL;
+
+ /* OK, let's find state in host hash */
+ NAT64_LOCK(nh);
+ sidx = nh->phash[hval];
+ int k = 0;
+ while (sidx.idx != 0) {
+ pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx);
+ st = &pg->states[sidx.off];
+ //DPRINTF("SISX: %d/%d next: %d/%d", sidx.idx, sidx.off,
+ //st->next.idx, st->next.off);
+ if (st->u.hkey == kst.u.hkey && pg->nat_proto == nat_proto)
+ break;
+ if (k++ > 1000) {
+ DPRINTF(DP_ALL, "XXX: too long %d/%d %d/%d\n",
+ sidx.idx, sidx.off, st->next.idx, st->next.off);
+ inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
+ DPRINTF(DP_GENERIC, "TR host %s %p on cpu %d",
+ a, nh, curcpu);
+ k = 0;
+ }
+ sidx = st->next;
+ }
+
+ if (sidx.idx == 0) {
+ aaddr = 0;
+ st = nat64lsn_create_state(cfg, nh, nat_proto, &kst, &aaddr);
+ if (st == NULL) {
+ /* No free states. Request more if we can */
+ if (nh->pg_used >= cfg->max_chunks) {
+ /* Limit reached */
+ NAT64STAT_INC(&cfg->stats, dropped);
+ inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
+ DPRINTF(DP_DROPS, "PG limit reached "
+ " for host %s (used %u, allocated %u, "
+ "limit %u)", a,
+ nh->pg_used * NAT64_CHUNK_SIZE,
+ nh->pg_allocated * NAT64_CHUNK_SIZE,
+ cfg->max_chunks * NAT64_CHUNK_SIZE);
+ m_freem(*pm);
+ *pm = NULL;
+ NAT64_UNLOCK(nh);
+ return (IP_FW_DENY);
+ }
+ if ((nh->pg_allocated <=
+ nh->pg_used + NAT64LSN_REMAININGPG) &&
+ nh->pg_allocated < cfg->max_chunks)
+ action = 1; /* Request new indexes */
+ else
+ action = 0;
+ NAT64_UNLOCK(nh);
+ //DPRINTF("No state, unlock for %p", nh);
+ return (nat64lsn_request_portgroup(cfg, f_id,
+ pm, aaddr, action));
+ }
+
+ /* We've got new state. */
+ sidx = st->cur;
+ pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx);
+ }
+
+ /* Okay, state found */
+
+	/* Update necessary fields */
+ if (st->timestamp != state_ts)
+ st->timestamp = state_ts;
+	if ((st->flags & state_flags) != state_flags)
+ st->flags |= state_flags;
+
+ /* Copy needed state data */
+ aaddr = pg->aaddr;
+ aport = htons(pg->aport + sidx.off);
+
+ NAT64_UNLOCK(nh);
+
+ if (cfg->flags & NAT64_LOG) {
+ logdata = &loghdr;
+ nat64lsn_log(logdata, *pm, AF_INET6, pg->idx, st->cur.off);
+ } else
+ logdata = NULL;
+
+ action = nat64_do_handle_ip6(*pm, aaddr, aport, &cfg->stats, logdata);
+ if (action == NAT64SKIP)
+ return (IP_FW_PASS);
+ if (action == NAT64MFREE)
+ m_freem(*pm);
+ *pm = NULL; /* mark mbuf as consumed */
+ return (IP_FW_DENY);
+}
+
+/*
+ * Main dataplane entry point.
+ */
+int
+ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args,
+ ipfw_insn *cmd, int *done)
+{
+ ipfw_insn *icmd;
+ struct nat64lsn_cfg *cfg;
+ int ret;
+
+ IPFW_RLOCK_ASSERT(ch);
+
+ *done = 1; /* terminate the search */
+ icmd = cmd + 1;
+ if (cmd->opcode != O_EXTERNAL_ACTION ||
+ cmd->arg1 != V_nat64lsn_eid ||
+ icmd->opcode != O_EXTERNAL_INSTANCE ||
+ (cfg = NAT64_LOOKUP(ch, icmd)) == NULL)
+ return (0);
+
+ switch (args->f_id.addr_type) {
+ case 4:
+ ret = nat64lsn_translate4(cfg, &args->f_id, &args->m);
+ break;
+ case 6:
+ ret = nat64lsn_translate6(cfg, &args->f_id, &args->m);
+ break;
+ default:
+ return (0);
+ }
+ return (ret);
+}
+
+static int
+nat64lsn_ctor_host(void *mem, int size, void *arg, int flags)
+{
+ struct nat64lsn_host *nh;
+
+ nh = (struct nat64lsn_host *)mem;
+ memset(nh->pg_ptr, 0, sizeof(nh->pg_ptr));
+ memset(nh->phash, 0, sizeof(nh->phash));
+ return (0);
+}
+
+static int
+nat64lsn_ctor_pgidx(void *mem, int size, void *arg, int flags)
+{
+
+ memset(mem, 0, size);
+ return (0);
+}
+
+void
+nat64lsn_init_internal(void)
+{
+
+ memset(nat64lsn_proto_map, 0, sizeof(nat64lsn_proto_map));
+ /* Set up supported protocol map */
+ nat64lsn_proto_map[IPPROTO_TCP] = NAT_PROTO_TCP;
+ nat64lsn_proto_map[IPPROTO_UDP] = NAT_PROTO_UDP;
+ nat64lsn_proto_map[IPPROTO_ICMP] = NAT_PROTO_ICMP;
+ nat64lsn_proto_map[IPPROTO_ICMPV6] = NAT_PROTO_ICMP;
+ /* Fill in reverse proto map */
+ memset(nat64lsn_rproto_map, 0, sizeof(nat64lsn_rproto_map));
+ nat64lsn_rproto_map[NAT_PROTO_TCP] = IPPROTO_TCP;
+ nat64lsn_rproto_map[NAT_PROTO_UDP] = IPPROTO_UDP;
+ nat64lsn_rproto_map[NAT_PROTO_ICMP] = IPPROTO_ICMPV6;
+
+ JQUEUE_LOCK_INIT();
+ nat64lsn_host_zone = uma_zcreate("NAT64 hosts zone",
+ sizeof(struct nat64lsn_host), nat64lsn_ctor_host, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+ nat64lsn_pg_zone = uma_zcreate("NAT64 portgroups zone",
+ sizeof(struct nat64lsn_portgroup), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ nat64lsn_pgidx_zone = uma_zcreate("NAT64 portgroup indexes zone",
+ sizeof(struct nat64lsn_portgroup *) * NAT64LSN_PGIDX_CHUNK,
+ nat64lsn_ctor_pgidx, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+
+void
+nat64lsn_uninit_internal(void)
+{
+
+ JQUEUE_LOCK_DESTROY();
+ uma_zdestroy(nat64lsn_host_zone);
+ uma_zdestroy(nat64lsn_pg_zone);
+ uma_zdestroy(nat64lsn_pgidx_zone);
+}
+
+void
+nat64lsn_start_instance(struct nat64lsn_cfg *cfg)
+{
+
+ callout_reset(&cfg->periodic, hz * PERIODIC_DELAY,
+ nat64lsn_periodic, cfg);
+}
+
+struct nat64lsn_cfg *
+nat64lsn_init_instance(struct ip_fw_chain *ch, size_t numaddr)
+{
+ struct nat64lsn_cfg *cfg;
+
+ cfg = malloc(sizeof(struct nat64lsn_cfg), M_IPFW, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&cfg->jhead);
+ cfg->vp = curvnet;
+ cfg->ch = ch;
+ COUNTER_ARRAY_ALLOC(cfg->stats.stats, NAT64STATS, M_WAITOK);
+
+ cfg->ihsize = NAT64LSN_HSIZE;
+ cfg->ih = malloc(sizeof(void *) * cfg->ihsize, M_IPFW,
+ M_WAITOK | M_ZERO);
+
+ cfg->pg = malloc(sizeof(void *) * numaddr * _ADDR_PG_COUNT, M_IPFW,
+ M_WAITOK | M_ZERO);
+
+ callout_init(&cfg->periodic, CALLOUT_MPSAFE);
+ callout_init(&cfg->jcallout, CALLOUT_MPSAFE);
+
+ return (cfg);
+}
+
+/*
+ * Destroy all hosts callback.
+ * Called on module unload when all activity has already finished, so
+ * it can work without any locks.
+ */
+static NAT64NOINLINE int
+nat64lsn_destroy_host(struct nat64lsn_host *nh, struct nat64lsn_cfg *cfg)
+{
+ struct nat64lsn_portgroup *pg;
+ int i;
+
+ for (i = nh->pg_used; i > 0; i--) {
+ pg = PORTGROUP_BYSIDX(cfg, nh, i);
+ if (pg == NULL)
+ continue;
+ cfg->pg[pg->idx] = NULL;
+ destroy_portgroup(pg);
+ nh->pg_used--;
+ }
+ destroy_host6(nh);
+ cfg->ihcount--;
+ return (0);
+}
+
+void
+nat64lsn_destroy_instance(struct nat64lsn_cfg *cfg)
+{
+ struct nat64lsn_host *nh, *tmp;
+
+ JQUEUE_LOCK();
+ callout_drain(&cfg->jcallout);
+ JQUEUE_UNLOCK();
+
+ callout_drain(&cfg->periodic);
+ I6HASH_FOREACH_SAFE(cfg, nh, tmp, nat64lsn_destroy_host, cfg);
+ DPRINTF(DP_OBJ, "instance %s: hosts %d", cfg->name, cfg->ihcount);
+
+ COUNTER_ARRAY_FREE(cfg->stats.stats, NAT64STATS);
+ free(cfg->ih, M_IPFW);
+ free(cfg->pg, M_IPFW);
+ free(cfg, M_IPFW);
+}
+
diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64lsn.h b/freebsd/sys/netpfil/ipfw/nat64/nat64lsn.h
new file mode 100644
index 00000000..e6ceb1dd
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nat64/nat64lsn.h
@@ -0,0 +1,351 @@
+/*-
+ * Copyright (c) 2015 Yandex LLC
+ * Copyright (c) 2015 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_FW_NAT64LSN_H_
+#define _IP_FW_NAT64LSN_H_
+
+#define NAT64_CHUNK_SIZE_BITS 6 /* 64 ports */
+#define NAT64_CHUNK_SIZE (1 << NAT64_CHUNK_SIZE_BITS)
+
+#define NAT64_MIN_PORT 1024
+#define NAT64_MIN_CHUNK (NAT64_MIN_PORT >> NAT64_CHUNK_SIZE_BITS)
+
+struct st_ptr {
+ uint8_t idx; /* index in nh->pg_ptr array.
+ * NOTE: it starts from 1.
+ */
+ uint8_t off;
+};
+#define NAT64LSN_MAXPGPTR ((1 << (sizeof(uint8_t) * NBBY)) - 1)
+#define NAT64LSN_PGPTRMASKBITS (sizeof(uint64_t) * NBBY)
+#define NAT64LSN_PGPTRNMASK (roundup(NAT64LSN_MAXPGPTR, \
+ NAT64LSN_PGPTRMASKBITS) / NAT64LSN_PGPTRMASKBITS)
+
+struct nat64lsn_portgroup;
+/* sizeof(struct nat64lsn_host) = 64 + 64x2 + 8x8 = 256 bytes */
+struct nat64lsn_host {
+ struct rwlock h_lock; /* Host states lock */
+
+ struct in6_addr addr;
+ struct nat64lsn_host *next;
+ uint16_t timestamp; /* Last altered */
+ uint16_t hsize; /* ports hash size */
+ uint16_t pg_used; /* Number of portgroups used */
+#define NAT64LSN_REMAININGPG 8 /* Number of remaining PG before
+ * requesting of new chunk of indexes.
+ */
+ uint16_t pg_allocated; /* Number of portgroups indexes
+ * allocated.
+ */
+#define NAT64LSN_HSIZE 64
+ struct st_ptr phash[NAT64LSN_HSIZE]; /* XXX: hardcoded size */
+ /*
+ * PG indexes are stored in chunks with 32 elements.
+ * The maximum count is limited to 255 because st_ptr->idx is a uint8_t.
+ */
+#define NAT64LSN_PGIDX_CHUNK 32
+#define NAT64LSN_PGNIDX (roundup(NAT64LSN_MAXPGPTR, \
+ NAT64LSN_PGIDX_CHUNK) / NAT64LSN_PGIDX_CHUNK)
+ struct nat64lsn_portgroup **pg_ptr[NAT64LSN_PGNIDX]; /* PG indexes */
+};
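+/*
+ * Illustration (derived from the constants above): with
+ * NAT64LSN_PGIDX_CHUNK == 32 and NAT64LSN_MAXPGPTR == 255, pg_ptr[] holds
+ * NAT64LSN_PGNIDX == 8 chunk pointers.  Each chunk is a separately
+ * allocated array of 32 portgroup pointers, so a host can grow to at most
+ * 255 portgroups without reallocating the host structure itself.
+ */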
+
+#define NAT64_RLOCK_ASSERT(h) rw_assert(&(h)->h_lock, RA_RLOCKED)
+#define NAT64_WLOCK_ASSERT(h) rw_assert(&(h)->h_lock, RA_WLOCKED)
+
+#define NAT64_RLOCK(h) rw_rlock(&(h)->h_lock)
+#define NAT64_RUNLOCK(h) rw_runlock(&(h)->h_lock)
+#define NAT64_WLOCK(h) rw_wlock(&(h)->h_lock)
+#define NAT64_WUNLOCK(h) rw_wunlock(&(h)->h_lock)
+#define NAT64_LOCK(h) NAT64_WLOCK(h)
+#define NAT64_UNLOCK(h) NAT64_WUNLOCK(h)
+#define NAT64_LOCK_INIT(h) do { \
+ rw_init(&(h)->h_lock, "NAT64 host lock"); \
+ } while (0)
+
+#define NAT64_LOCK_DESTROY(h) do { \
+ rw_destroy(&(h)->h_lock); \
+ } while (0)
+
+/* Internal proto index */
+#define NAT_PROTO_TCP 1
+#define NAT_PROTO_UDP 2
+#define NAT_PROTO_ICMP 3
+
+#define NAT_MAX_PROTO 4
+extern uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO];
+
+VNET_DECLARE(uint16_t, nat64lsn_eid);
+#define V_nat64lsn_eid VNET(nat64lsn_eid)
+#define IPFW_TLV_NAT64LSN_NAME IPFW_TLV_EACTION_NAME(V_nat64lsn_eid)
+
+/* Timestamp macro */
+#define _CT ((int)time_uptime % 65536)
+#define SET_AGE(x) (x) = _CT
+#define GET_AGE(x) ((_CT >= (x)) ? _CT - (x) : \
+ (int)65536 + _CT - (x))
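+/*
+ * Example (illustrative values): a state stamped at _CT == 65530 and
+ * checked after the counter wraps to _CT == 5 gives
+ * GET_AGE() == 65536 + 5 - 65530 == 11 seconds, so ages remain correct
+ * across the 16-bit wrap of time_uptime.
+ */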
+
+#ifdef __LP64__
+/* ffsl() is capable of checking 64-bit ints */
+#define _FFS64
+#endif
+
+/* 16 bytes */
+struct nat64lsn_state {
+ union {
+ struct {
+ in_addr_t faddr; /* Remote IPv4 address */
+ uint16_t fport; /* Remote IPv4 port */
+ uint16_t lport; /* Local IPv6 port */
+ }s;
+ uint64_t hkey;
+ } u;
+ uint8_t nat_proto;
+ uint8_t flags;
+ uint16_t timestamp;
+ struct st_ptr cur; /* Index of portgroup in nat64lsn_host */
+ struct st_ptr next; /* Next entry index */
+};
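+/*
+ * Note: the u.s triple (faddr, fport, lport) and u.hkey alias the same
+ * 8 bytes, so a state can be matched against an on-stack key with a single
+ * 64-bit comparison (see the hkey check in nat64lsn_translate6()).
+ */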
+
+/*
+ * 1024+32 bytes per 64 states, used to store state
+ * AND for outside-in state lookup
+ */
+struct nat64lsn_portgroup {
+ struct nat64lsn_host *host; /* IPv6 source host info */
+ in_addr_t aaddr; /* Alias addr, network format */
+ uint16_t aport; /* Base port */
+ uint16_t timestamp;
+ uint8_t nat_proto;
+ uint8_t spare[3];
+ uint32_t idx;
+#ifdef _FFS64
+ uint64_t freemask; /* Mask of free entries */
+#else
+ uint32_t freemask[2]; /* Mask of free entries */
+#endif
+ struct nat64lsn_state states[NAT64_CHUNK_SIZE]; /* State storage */
+};
+#ifdef _FFS64
+#define PG_MARK_BUSY_IDX(_pg, _idx) (_pg)->freemask &= ~((uint64_t)1<<(_idx))
+#define PG_MARK_FREE_IDX(_pg, _idx) (_pg)->freemask |= ((uint64_t)1<<(_idx))
+#define PG_IS_FREE_IDX(_pg, _idx) ((_pg)->freemask & ((uint64_t)1<<(_idx)))
+#define PG_IS_BUSY_IDX(_pg, _idx) (PG_IS_FREE_IDX(_pg, _idx) == 0)
+#define PG_GET_FREE_IDX(_pg) (ffsll((_pg)->freemask))
+#define PG_IS_EMPTY(_pg) (((_pg)->freemask + 1) == 0)
+#else
+#define PG_MARK_BUSY_IDX(_pg, _idx) \
+ (_pg)->freemask[(_idx) / 32] &= ~((u_long)1<<((_idx) % 32))
+#define PG_MARK_FREE_IDX(_pg, _idx) \
+ (_pg)->freemask[(_idx) / 32] |= ((u_long)1<<((_idx) % 32))
+#define PG_IS_FREE_IDX(_pg, _idx) \
+ ((_pg)->freemask[(_idx) / 32] & ((u_long)1<<((_idx) % 32)))
+#define PG_IS_BUSY_IDX(_pg, _idx) (PG_IS_FREE_IDX(_pg, _idx) == 0)
+#define PG_GET_FREE_IDX(_pg) _pg_get_free_idx(_pg)
+#define PG_IS_EMPTY(_pg) \
+ ((((_pg)->freemask[0] + 1) == 0 && ((_pg)->freemask[1] + 1) == 0))
+
+static inline int
+_pg_get_free_idx(const struct nat64lsn_portgroup *pg)
+{
+ int i;
+
+ if ((i = ffsl(pg->freemask[0])) != 0)
+ return (i);
+ if ((i = ffsl(pg->freemask[1])) != 0)
+ return (i + 32);
+ return (0);
+}
+
+#endif
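+/*
+ * Free-slot lookup sketch: a set bit in freemask marks a free state slot.
+ * PG_GET_FREE_IDX() returns the 1-based index of the first free slot, or 0
+ * when the portgroup is full.  E.g. with slots 0-2 busy and slot 3 free it
+ * returns 4; the caller (nat64lsn_create_state()) decrements the value and
+ * then calls PG_MARK_BUSY_IDX(pg, 3).
+ */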
+
+TAILQ_HEAD(nat64lsn_job_head, nat64lsn_job_item);
+
+#define NAT64LSN_FLAGSMASK (NAT64_LOG)
+struct nat64lsn_cfg {
+ struct named_object no;
+ //struct nat64_exthost *ex; /* Pointer to external addr array */
+ struct nat64lsn_portgroup **pg; /* XXX: array of pointers */
+ struct nat64lsn_host **ih; /* Host hash */
+ uint32_t prefix4; /* IPv4 prefix */
+ uint32_t pmask4; /* IPv4 prefix mask */
+ uint32_t ihsize; /* IPv6 host hash size */
+ uint8_t plen4;
+ uint8_t plen6;
+ uint8_t nomatch_verdict;/* What to return to ipfw on no-match */
+ uint8_t nomatch_final; /* Exit outer loop? */
+ struct in6_addr prefix6; /* IPv6 prefix to embed IPv4 hosts */
+
+ uint32_t ihcount; /* Number of items in host hash */
+ int max_chunks; /* Max chunks per client */
+ int agg_prefix_len; /* Prefix length to count */
+ int agg_prefix_max; /* Max hosts per agg prefix */
+ uint32_t jmaxlen; /* Max jobqueue length */
+ uint32_t flags;
+ uint16_t min_chunk; /* Min port group # to use */
+ uint16_t max_chunk; /* Max port group # to use */
+ uint16_t nh_delete_delay; /* Stale host delete delay */
+ uint16_t pg_delete_delay; /* Stale portgroup del delay */
+ uint16_t st_syn_ttl; /* TCP syn expire */
+ uint16_t st_close_ttl; /* TCP fin expire */
+ uint16_t st_estab_ttl; /* TCP established expire */
+ uint16_t st_udp_ttl; /* UDP expire */
+ uint16_t st_icmp_ttl; /* ICMP expire */
+ uint32_t protochunks[NAT_MAX_PROTO];/* Number of chunks used */
+
+ struct callout periodic;
+ struct callout jcallout;
+ struct ip_fw_chain *ch;
+ struct vnet *vp;
+ struct nat64lsn_job_head jhead;
+ int jlen;
+ char name[64]; /* Nat instance name */
+ nat64_stats_block stats;
+};
+
+struct nat64lsn_cfg *nat64lsn_init_instance(struct ip_fw_chain *ch,
+ size_t numaddr);
+void nat64lsn_destroy_instance(struct nat64lsn_cfg *cfg);
+void nat64lsn_start_instance(struct nat64lsn_cfg *cfg);
+void nat64lsn_init_internal(void);
+void nat64lsn_uninit_internal(void);
+int ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args,
+ ipfw_insn *cmd, int *done);
+
+void
+nat64lsn_dump_state(const struct nat64lsn_cfg *cfg,
+ const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st,
+ const char *px, int off);
+/*
+ * Portgroup layout
+ * addr x nat_proto x port_off
+ *
+ */
+
+#define _ADDR_PG_PROTO_COUNT (65536 >> NAT64_CHUNK_SIZE_BITS)
+#define _ADDR_PG_COUNT (_ADDR_PG_PROTO_COUNT * NAT_MAX_PROTO)
+
+#define GET_ADDR_IDX(_cfg, _addr) ((_addr) - ((_cfg)->prefix4))
+#define __GET_PORTGROUP_IDX(_proto, _port) \
+ ((_proto - 1) * _ADDR_PG_PROTO_COUNT + \
+ ((_port) >> NAT64_CHUNK_SIZE_BITS))
+
+#define _GET_PORTGROUP_IDX(_cfg, _addr, _proto, _port) \
+ GET_ADDR_IDX(_cfg, _addr) * _ADDR_PG_COUNT + \
+ __GET_PORTGROUP_IDX(_proto, _port)
+#define GET_PORTGROUP(_cfg, _addr, _proto, _port) \
+ ((_cfg)->pg[_GET_PORTGROUP_IDX(_cfg, _addr, _proto, _port)])
+
+#define PORTGROUP_CHUNK(_nh, _idx) \
+ ((_nh)->pg_ptr[(_idx)])
+#define PORTGROUP_BYSIDX(_cfg, _nh, _idx) \
+ (PORTGROUP_CHUNK(_nh, (_idx - 1) / NAT64LSN_PGIDX_CHUNK) \
+ [((_idx) - 1) % NAT64LSN_PGIDX_CHUNK])
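+/*
+ * Index arithmetic example (hypothetical /24 prefix4, i.e. 256 alias
+ * addresses): _ADDR_PG_PROTO_COUNT == 65536 >> 6 == 1024 and
+ * _ADDR_PG_COUNT == 1024 * NAT_MAX_PROTO == 4096, so cfg->pg holds
+ * 256 * 4096 pointers.  For address offset 2, NAT_PROTO_UDP (2) and
+ * port 2048, _GET_PORTGROUP_IDX() == 2 * 4096 + 1 * 1024 + (2048 >> 6)
+ * == 9248.
+ */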
+
+
+/* Chained hash table */
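+/*
+ * The CHT_* macros below are parameterized by a _PX prefix: the user is
+ * expected to provide _PX##hash(key), _PX##cmp(key, val), _PX##val(entry),
+ * _PX##first(head, bucket), _PX##next(entry), _PX##lock(head, bucket) and
+ * _PX##unlock(head, bucket) as macros or inline functions.
+ */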
+#define CHT_FIND(_ph, _hsize, _PX, _x, _key) do { \
+ unsigned int _buck = _PX##hash(_key) & (_hsize - 1); \
+ _PX##lock(_ph, _buck); \
+ _x = _PX##first(_ph, _buck); \
+ for ( ; _x != NULL; _x = _PX##next(_x)) { \
+ if (_PX##cmp(_key, _PX##val(_x))) \
+ break; \
+ } \
+ if (_x == NULL) \
+ _PX##unlock(_ph, _buck); \
+} while(0)
+
+#define CHT_UNLOCK_BUCK(_ph, _PX, _buck) \
+ _PX##unlock(_ph, _buck);
+
+#define CHT_UNLOCK_KEY(_ph, _hsize, _PX, _key) do { \
+ unsigned int _buck = _PX##hash(_key) & (_hsize - 1); \
+ _PX##unlock(_ph, _buck); \
+} while(0)
+
+#define CHT_INSERT_HEAD(_ph, _hsize, _PX, _i) do { \
+ unsigned int _buck = _PX##hash(_PX##val(_i)) & (_hsize - 1); \
+ _PX##lock(_ph, _buck); \
+ _PX##next(_i) = _PX##first(_ph, _buck); \
+ _PX##first(_ph, _buck) = _i; \
+ _PX##unlock(_ph, _buck); \
+} while(0)
+
+#define CHT_REMOVE(_ph, _hsize, _PX, _x, _tmp, _key) do { \
+ unsigned int _buck = _PX##hash(_key) & (_hsize - 1); \
+ _PX##lock(_ph, _buck); \
+ _x = _PX##first(_ph, _buck); \
+ _tmp = NULL; \
+ for ( ; _x != NULL; _tmp = _x, _x = _PX##next(_x)) { \
+ if (_PX##cmp(_key, _PX##val(_x))) \
+ break; \
+ } \
+ if (_x != NULL) { \
+ if (_tmp == NULL) \
+ _PX##first(_ph, _buck) = _PX##next(_x); \
+ else \
+ _PX##next(_tmp) = _PX##next(_x); \
+ } \
+ _PX##unlock(_ph, _buck); \
+} while(0)
+
+#define CHT_FOREACH_SAFE(_ph, _hsize, _PX, _x, _tmp, _cb, _arg) do { \
+ for (unsigned int _i = 0; _i < _hsize; _i++) { \
+ _PX##lock(_ph, _i); \
+ _x = _PX##first(_ph, _i); \
+ _tmp = NULL; \
+ for (; _x != NULL; _tmp = _x, _x = _PX##next(_x)) { \
+ if (_cb(_x, _arg) == 0) \
+ continue; \
+ if (_tmp == NULL) \
+ _PX##first(_ph, _i) = _PX##next(_x); \
+ else \
+ _tmp = _PX##next(_x); \
+ } \
+ _PX##unlock(_ph, _i); \
+ } \
+} while(0)
+
+#define CHT_RESIZE(_ph, _hsize, _nph, _nhsize, _PX, _x, _y) do { \
+ unsigned int _buck; \
+ for (unsigned int _i = 0; _i < _hsize; _i++) { \
+ _x = _PX##first(_ph, _i); \
+ _y = _x; \
+		while (_y != NULL) {					\
+			_buck = _PX##hash(_PX##val(_x)) & (_nhsize - 1);\
+			_y = _PX##next(_x);				\
+			_PX##next(_x) = _PX##first(_nph, _buck);	\
+			_PX##first(_nph, _buck) = _x;			\
+			_x = _y;					\
+		}							\
+ } \
+} while(0)
+
+#endif /* _IP_FW_NAT64LSN_H_ */
+
diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64lsn_control.c b/freebsd/sys/netpfil/ipfw/nat64/nat64lsn_control.c
new file mode 100644
index 00000000..a20a52ea
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nat64/nat64lsn_control.c
@@ -0,0 +1,919 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2015 Yandex LLC
+ * Copyright (c) 2015 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <rtems/bsd/sys/errno.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sockopt.h>
+#include <sys/queue.h>
+
+#include <net/if.h>
+#include <net/pfil.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/nat64/ip_fw_nat64.h>
+#include <netpfil/ipfw/nat64/nat64lsn.h>
+#include <netinet6/ip_fw_nat64.h>
+
+VNET_DEFINE(uint16_t, nat64lsn_eid) = 0;
+
+static struct nat64lsn_cfg *
+nat64lsn_find(struct namedobj_instance *ni, const char *name, uint8_t set)
+{
+ struct nat64lsn_cfg *cfg;
+
+ cfg = (struct nat64lsn_cfg *)ipfw_objhash_lookup_name_type(ni, set,
+ IPFW_TLV_NAT64LSN_NAME, name);
+
+ return (cfg);
+}
+
+static void
+nat64lsn_default_config(ipfw_nat64lsn_cfg *uc)
+{
+
+ if (uc->max_ports == 0)
+ uc->max_ports = NAT64LSN_MAX_PORTS;
+ else
+ uc->max_ports = roundup(uc->max_ports, NAT64_CHUNK_SIZE);
+ if (uc->max_ports > NAT64_CHUNK_SIZE * NAT64LSN_MAXPGPTR)
+ uc->max_ports = NAT64_CHUNK_SIZE * NAT64LSN_MAXPGPTR;
+ if (uc->jmaxlen == 0)
+ uc->jmaxlen = NAT64LSN_JMAXLEN;
+ if (uc->jmaxlen > 65536)
+ uc->jmaxlen = 65536;
+ if (uc->nh_delete_delay == 0)
+ uc->nh_delete_delay = NAT64LSN_HOST_AGE;
+ if (uc->pg_delete_delay == 0)
+ uc->pg_delete_delay = NAT64LSN_PG_AGE;
+ if (uc->st_syn_ttl == 0)
+ uc->st_syn_ttl = NAT64LSN_TCP_SYN_AGE;
+ if (uc->st_close_ttl == 0)
+ uc->st_close_ttl = NAT64LSN_TCP_FIN_AGE;
+ if (uc->st_estab_ttl == 0)
+ uc->st_estab_ttl = NAT64LSN_TCP_EST_AGE;
+ if (uc->st_udp_ttl == 0)
+ uc->st_udp_ttl = NAT64LSN_UDP_AGE;
+ if (uc->st_icmp_ttl == 0)
+ uc->st_icmp_ttl = NAT64LSN_ICMP_AGE;
+}
+
+/*
+ * Creates new nat64lsn instance.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ipfw_nat64lsn_cfg ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64lsn_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_lheader *olh;
+ ipfw_nat64lsn_cfg *uc;
+ struct nat64lsn_cfg *cfg;
+ struct namedobj_instance *ni;
+ uint32_t addr4, mask4;
+
+ if (sd->valsize != sizeof(*olh) + sizeof(*uc))
+ return (EINVAL);
+
+ olh = (ipfw_obj_lheader *)sd->kbuf;
+ uc = (ipfw_nat64lsn_cfg *)(olh + 1);
+
+ if (ipfw_check_object_name_generic(uc->name) != 0)
+ return (EINVAL);
+
+ if (uc->agg_prefix_len > 127 || uc->set >= IPFW_MAX_SETS)
+ return (EINVAL);
+
+ if (uc->plen4 > 32)
+ return (EINVAL);
+ if (uc->plen6 > 128 || ((uc->plen6 % 8) != 0))
+ return (EINVAL);
+
+ /* XXX: Check prefix4 to be global */
+ addr4 = ntohl(uc->prefix4.s_addr);
+ mask4 = ~((1 << (32 - uc->plen4)) - 1);
+ if ((addr4 & mask4) != addr4)
+ return (EINVAL);
+
+ /* XXX: Check prefix6 */
+ if (uc->min_port == 0)
+ uc->min_port = NAT64_MIN_PORT;
+ if (uc->max_port == 0)
+ uc->max_port = 65535;
+ if (uc->min_port > uc->max_port)
+ return (EINVAL);
+ uc->min_port = roundup(uc->min_port, NAT64_CHUNK_SIZE);
+ uc->max_port = roundup(uc->max_port, NAT64_CHUNK_SIZE);
+
+ nat64lsn_default_config(uc);
+
+ ni = CHAIN_TO_SRV(ch);
+ IPFW_UH_RLOCK(ch);
+ if (nat64lsn_find(ni, uc->name, uc->set) != NULL) {
+ IPFW_UH_RUNLOCK(ch);
+ return (EEXIST);
+ }
+ IPFW_UH_RUNLOCK(ch);
+
+ cfg = nat64lsn_init_instance(ch, 1 << (32 - uc->plen4));
+ strlcpy(cfg->name, uc->name, sizeof(cfg->name));
+ cfg->no.name = cfg->name;
+ cfg->no.etlv = IPFW_TLV_NAT64LSN_NAME;
+ cfg->no.set = uc->set;
+
+ cfg->prefix4 = addr4;
+ cfg->pmask4 = addr4 | ~mask4;
+ /* XXX: Copy 96 bits */
+ cfg->plen6 = 96;
+ memcpy(&cfg->prefix6, &uc->prefix6, cfg->plen6 / 8);
+ cfg->plen4 = uc->plen4;
+ cfg->flags = uc->flags & NAT64LSN_FLAGSMASK;
+ cfg->max_chunks = uc->max_ports / NAT64_CHUNK_SIZE;
+ cfg->agg_prefix_len = uc->agg_prefix_len;
+ cfg->agg_prefix_max = uc->agg_prefix_max;
+
+ cfg->min_chunk = uc->min_port / NAT64_CHUNK_SIZE;
+ cfg->max_chunk = uc->max_port / NAT64_CHUNK_SIZE;
+
+ cfg->jmaxlen = uc->jmaxlen;
+ cfg->nh_delete_delay = uc->nh_delete_delay;
+ cfg->pg_delete_delay = uc->pg_delete_delay;
+ cfg->st_syn_ttl = uc->st_syn_ttl;
+ cfg->st_close_ttl = uc->st_close_ttl;
+ cfg->st_estab_ttl = uc->st_estab_ttl;
+ cfg->st_udp_ttl = uc->st_udp_ttl;
+ cfg->st_icmp_ttl = uc->st_icmp_ttl;
+
+ cfg->nomatch_verdict = IP_FW_DENY;
+ cfg->nomatch_final = 1; /* Exit outer loop by default */
+
+ IPFW_UH_WLOCK(ch);
+
+ if (nat64lsn_find(ni, uc->name, uc->set) != NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ nat64lsn_destroy_instance(cfg);
+ return (EEXIST);
+ }
+
+ if (ipfw_objhash_alloc_idx(CHAIN_TO_SRV(ch), &cfg->no.kidx) != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ nat64lsn_destroy_instance(cfg);
+ return (ENOSPC);
+ }
+ ipfw_objhash_add(CHAIN_TO_SRV(ch), &cfg->no);
+
+ /* Okay, let's link data */
+ IPFW_WLOCK(ch);
+ SRV_OBJECT(ch, cfg->no.kidx) = cfg;
+ IPFW_WUNLOCK(ch);
+
+ nat64lsn_start_instance(cfg);
+
+ IPFW_UH_WUNLOCK(ch);
+ return (0);
+}
+
+static void
+nat64lsn_detach_config(struct ip_fw_chain *ch, struct nat64lsn_cfg *cfg)
+{
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ ipfw_objhash_del(CHAIN_TO_SRV(ch), &cfg->no);
+ ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), cfg->no.kidx);
+}
+
+/*
+ * Destroys nat64 instance.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64lsn_destroy(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ struct nat64lsn_cfg *cfg;
+ ipfw_obj_header *oh;
+
+ if (sd->valsize != sizeof(*oh))
+ return (EINVAL);
+
+ oh = (ipfw_obj_header *)op3;
+
+ IPFW_UH_WLOCK(ch);
+ cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (ESRCH);
+ }
+
+ if (cfg->no.refcnt > 0) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EBUSY);
+ }
+
+ IPFW_WLOCK(ch);
+ SRV_OBJECT(ch, cfg->no.kidx) = NULL;
+ IPFW_WUNLOCK(ch);
+
+ nat64lsn_detach_config(ch, cfg);
+ IPFW_UH_WUNLOCK(ch);
+
+ nat64lsn_destroy_instance(cfg);
+ return (0);
+}
+
+#define __COPY_STAT_FIELD(_cfg, _stats, _field) \
+ (_stats)->_field = NAT64STAT_FETCH(&(_cfg)->stats, _field)
+static void
+export_stats(struct ip_fw_chain *ch, struct nat64lsn_cfg *cfg,
+ struct ipfw_nat64lsn_stats *stats)
+{
+
+ __COPY_STAT_FIELD(cfg, stats, opcnt64);
+ __COPY_STAT_FIELD(cfg, stats, opcnt46);
+ __COPY_STAT_FIELD(cfg, stats, ofrags);
+ __COPY_STAT_FIELD(cfg, stats, ifrags);
+ __COPY_STAT_FIELD(cfg, stats, oerrors);
+ __COPY_STAT_FIELD(cfg, stats, noroute4);
+ __COPY_STAT_FIELD(cfg, stats, noroute6);
+ __COPY_STAT_FIELD(cfg, stats, nomatch4);
+ __COPY_STAT_FIELD(cfg, stats, noproto);
+ __COPY_STAT_FIELD(cfg, stats, nomem);
+ __COPY_STAT_FIELD(cfg, stats, dropped);
+
+ __COPY_STAT_FIELD(cfg, stats, jcalls);
+ __COPY_STAT_FIELD(cfg, stats, jrequests);
+ __COPY_STAT_FIELD(cfg, stats, jhostsreq);
+ __COPY_STAT_FIELD(cfg, stats, jportreq);
+ __COPY_STAT_FIELD(cfg, stats, jhostfails);
+ __COPY_STAT_FIELD(cfg, stats, jportfails);
+ __COPY_STAT_FIELD(cfg, stats, jmaxlen);
+ __COPY_STAT_FIELD(cfg, stats, jnomem);
+ __COPY_STAT_FIELD(cfg, stats, jreinjected);
+ __COPY_STAT_FIELD(cfg, stats, screated);
+ __COPY_STAT_FIELD(cfg, stats, sdeleted);
+ __COPY_STAT_FIELD(cfg, stats, spgcreated);
+ __COPY_STAT_FIELD(cfg, stats, spgdeleted);
+
+ stats->hostcount = cfg->ihcount;
+ stats->tcpchunks = cfg->protochunks[NAT_PROTO_TCP];
+ stats->udpchunks = cfg->protochunks[NAT_PROTO_UDP];
+ stats->icmpchunks = cfg->protochunks[NAT_PROTO_ICMP];
+}
+#undef __COPY_STAT_FIELD
+
+static void
+nat64lsn_export_config(struct ip_fw_chain *ch, struct nat64lsn_cfg *cfg,
+ ipfw_nat64lsn_cfg *uc)
+{
+
+ uc->flags = cfg->flags & NAT64LSN_FLAGSMASK;
+ uc->max_ports = cfg->max_chunks * NAT64_CHUNK_SIZE;
+ uc->agg_prefix_len = cfg->agg_prefix_len;
+ uc->agg_prefix_max = cfg->agg_prefix_max;
+
+ uc->jmaxlen = cfg->jmaxlen;
+ uc->nh_delete_delay = cfg->nh_delete_delay;
+ uc->pg_delete_delay = cfg->pg_delete_delay;
+ uc->st_syn_ttl = cfg->st_syn_ttl;
+ uc->st_close_ttl = cfg->st_close_ttl;
+ uc->st_estab_ttl = cfg->st_estab_ttl;
+ uc->st_udp_ttl = cfg->st_udp_ttl;
+ uc->st_icmp_ttl = cfg->st_icmp_ttl;
+ uc->prefix4.s_addr = htonl(cfg->prefix4);
+ uc->prefix6 = cfg->prefix6;
+ uc->plen4 = cfg->plen4;
+ uc->plen6 = cfg->plen6;
+ uc->set = cfg->no.set;
+ strlcpy(uc->name, cfg->no.name, sizeof(uc->name));
+}
+
+struct nat64_dump_arg {
+ struct ip_fw_chain *ch;
+ struct sockopt_data *sd;
+};
+
+static int
+export_config_cb(struct namedobj_instance *ni, struct named_object *no,
+ void *arg)
+{
+ struct nat64_dump_arg *da = (struct nat64_dump_arg *)arg;
+ ipfw_nat64lsn_cfg *uc;
+
+ uc = (struct _ipfw_nat64lsn_cfg *)ipfw_get_sopt_space(da->sd,
+ sizeof(*uc));
+ nat64lsn_export_config(da->ch, (struct nat64lsn_cfg *)no, uc);
+ return (0);
+}
+
+/*
+ * Lists all nat64 lsn instances currently available in kernel.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ]
+ * Reply: [ ipfw_obj_lheader ipfw_nat64lsn_cfg x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64lsn_list(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_lheader *olh;
+ struct nat64_dump_arg da;
+
+ /* Check minimum header size */
+ if (sd->valsize < sizeof(ipfw_obj_lheader))
+ return (EINVAL);
+
+ olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh));
+
+ IPFW_UH_RLOCK(ch);
+ olh->count = ipfw_objhash_count_type(CHAIN_TO_SRV(ch),
+ IPFW_TLV_NAT64LSN_NAME);
+ olh->objsize = sizeof(ipfw_nat64lsn_cfg);
+ olh->size = sizeof(*olh) + olh->count * olh->objsize;
+
+ if (sd->valsize < olh->size) {
+ IPFW_UH_RUNLOCK(ch);
+ return (ENOMEM);
+ }
+ memset(&da, 0, sizeof(da));
+ da.ch = ch;
+ da.sd = sd;
+ ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), export_config_cb, &da,
+ IPFW_TLV_NAT64LSN_NAME);
+ IPFW_UH_RUNLOCK(ch);
+
+ return (0);
+}
+
+/*
+ * Change existing nat64lsn instance configuration.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_nat64lsn_cfg ]
+ * Reply: [ ipfw_obj_header ipfw_nat64lsn_cfg ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64lsn_config(struct ip_fw_chain *ch, ip_fw3_opheader *op,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_header *oh;
+ ipfw_nat64lsn_cfg *uc;
+ struct nat64lsn_cfg *cfg;
+ struct namedobj_instance *ni;
+
+ if (sd->valsize != sizeof(*oh) + sizeof(*uc))
+ return (EINVAL);
+
+ oh = (ipfw_obj_header *)ipfw_get_sopt_space(sd,
+ sizeof(*oh) + sizeof(*uc));
+ uc = (ipfw_nat64lsn_cfg *)(oh + 1);
+
+ if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 ||
+ oh->ntlv.set >= IPFW_MAX_SETS)
+ return (EINVAL);
+
+ ni = CHAIN_TO_SRV(ch);
+ if (sd->sopt->sopt_dir == SOPT_GET) {
+ IPFW_UH_RLOCK(ch);
+ cfg = nat64lsn_find(ni, oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_RUNLOCK(ch);
+ return (EEXIST);
+ }
+ nat64lsn_export_config(ch, cfg, uc);
+ IPFW_UH_RUNLOCK(ch);
+ return (0);
+ }
+
+ nat64lsn_default_config(uc);
+
+ IPFW_UH_WLOCK(ch);
+ cfg = nat64lsn_find(ni, oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EEXIST);
+ }
+
+ /*
+	 * For now, allow changing only the following values:
+ * jmaxlen, nh_del_age, pg_del_age, tcp_syn_age, tcp_close_age,
+ * tcp_est_age, udp_age, icmp_age, flags, max_ports.
+ */
+
+ cfg->max_chunks = uc->max_ports / NAT64_CHUNK_SIZE;
+ cfg->jmaxlen = uc->jmaxlen;
+ cfg->nh_delete_delay = uc->nh_delete_delay;
+ cfg->pg_delete_delay = uc->pg_delete_delay;
+ cfg->st_syn_ttl = uc->st_syn_ttl;
+ cfg->st_close_ttl = uc->st_close_ttl;
+ cfg->st_estab_ttl = uc->st_estab_ttl;
+ cfg->st_udp_ttl = uc->st_udp_ttl;
+ cfg->st_icmp_ttl = uc->st_icmp_ttl;
+ cfg->flags = uc->flags & NAT64LSN_FLAGSMASK;
+
+ IPFW_UH_WUNLOCK(ch);
+
+ return (0);
+}
+
+/*
+ * Get nat64lsn statistics.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ]
+ * Reply: [ ipfw_obj_header ipfw_counter_tlv ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64lsn_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op,
+ struct sockopt_data *sd)
+{
+ struct ipfw_nat64lsn_stats stats;
+ struct nat64lsn_cfg *cfg;
+ ipfw_obj_header *oh;
+ ipfw_obj_ctlv *ctlv;
+ size_t sz;
+
+ sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) + sizeof(stats);
+ if (sd->valsize % sizeof(uint64_t))
+ return (EINVAL);
+ if (sd->valsize < sz)
+ return (ENOMEM);
+ oh = (ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
+ if (oh == NULL)
+ return (EINVAL);
+ memset(&stats, 0, sizeof(stats));
+
+ IPFW_UH_RLOCK(ch);
+ cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_RUNLOCK(ch);
+ return (ESRCH);
+ }
+
+ export_stats(ch, cfg, &stats);
+ IPFW_UH_RUNLOCK(ch);
+
+ ctlv = (ipfw_obj_ctlv *)(oh + 1);
+ memset(ctlv, 0, sizeof(*ctlv));
+ ctlv->head.type = IPFW_TLV_COUNTERS;
+ ctlv->head.length = sz - sizeof(ipfw_obj_header);
+ ctlv->count = sizeof(stats) / sizeof(uint64_t);
+ ctlv->objsize = sizeof(uint64_t);
+ ctlv->version = IPFW_NAT64_VERSION;
+ memcpy(ctlv + 1, &stats, sizeof(stats));
+ return (0);
+}
+
+/*
+ * Reset nat64lsn statistics.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64lsn_reset_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op,
+ struct sockopt_data *sd)
+{
+ struct nat64lsn_cfg *cfg;
+ ipfw_obj_header *oh;
+
+ if (sd->valsize != sizeof(*oh))
+ return (EINVAL);
+ oh = (ipfw_obj_header *)sd->kbuf;
+ if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 ||
+ oh->ntlv.set >= IPFW_MAX_SETS)
+ return (EINVAL);
+
+ IPFW_UH_WLOCK(ch);
+ cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (ESRCH);
+ }
+ COUNTER_ARRAY_ZERO(cfg->stats.stats, NAT64STATS);
+ IPFW_UH_WUNLOCK(ch);
+ return (0);
+}
+
+/*
+ * Reply: [ ipfw_obj_header ipfw_obj_data [ ipfw_nat64lsn_stg
+ * ipfw_nat64lsn_state x count, ... ] ]
+ */
+static int
+export_pg_states(struct nat64lsn_cfg *cfg, struct nat64lsn_portgroup *pg,
+ ipfw_nat64lsn_stg *stg, struct sockopt_data *sd)
+{
+ ipfw_nat64lsn_state *ste;
+ struct nat64lsn_state *st;
+ int i, count;
+
+ NAT64_LOCK(pg->host);
+ count = 0;
+ for (i = 0; i < 64; i++) {
+ if (PG_IS_BUSY_IDX(pg, i))
+ count++;
+ }
+ DPRINTF(DP_STATE, "EXPORT PG %d, count %d", pg->idx, count);
+
+ if (count == 0) {
+ stg->count = 0;
+ NAT64_UNLOCK(pg->host);
+ return (0);
+ }
+ ste = (ipfw_nat64lsn_state *)ipfw_get_sopt_space(sd,
+ count * sizeof(ipfw_nat64lsn_state));
+ if (ste == NULL) {
+ NAT64_UNLOCK(pg->host);
+ return (1);
+ }
+
+ stg->alias4.s_addr = pg->aaddr;
+ stg->proto = nat64lsn_rproto_map[pg->nat_proto];
+ stg->flags = 0;
+ stg->host6 = pg->host->addr;
+ stg->count = count;
+ for (i = 0; i < 64; i++) {
+ if (PG_IS_FREE_IDX(pg, i))
+ continue;
+ st = &pg->states[i];
+ ste->daddr.s_addr = st->u.s.faddr;
+ ste->dport = st->u.s.fport;
+ ste->aport = pg->aport + i;
+ ste->sport = st->u.s.lport;
+ ste->flags = st->flags; /* XXX filter flags */
+ ste->idle = GET_AGE(st->timestamp);
+ ste++;
+ }
+ NAT64_UNLOCK(pg->host);
+
+ return (0);
+}
+
+static int
+get_next_idx(struct nat64lsn_cfg *cfg, uint32_t *addr, uint8_t *nat_proto,
+ uint16_t *port)
+{
+
+ if (*port < 65536 - NAT64_CHUNK_SIZE) {
+ *port += NAT64_CHUNK_SIZE;
+ return (0);
+ }
+ *port = 0;
+
+ if (*nat_proto < NAT_MAX_PROTO - 1) {
+ *nat_proto += 1;
+ return (0);
+ }
+ *nat_proto = 1;
+
+ if (*addr < cfg->pmask4) {
+ *addr += 1;
+ return (0);
+ }
+
+ /* End of space. */
+ return (1);
+}
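+/*
+ * Iteration order note: the cursor advances through ports in
+ * NAT64_CHUNK_SIZE steps first, then wraps to the next protocol, and
+ * finally to the next alias address, matching the addr x proto x port
+ * layout of the portgroup array.
+ */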
+
+#define PACK_IDX(addr, proto, port) \
+ ((uint64_t)addr << 32) | ((uint32_t)port << 16) | (proto << 8)
+#define UNPACK_IDX(idx, addr, proto, port) \
+ (addr) = (uint32_t)((idx) >> 32); \
+ (port) = (uint16_t)(((idx) >> 16) & 0xFFFF); \
+ (proto) = (uint8_t)(((idx) >> 8) & 0xFF)
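+/*
+ * Cursor encoding example: PACK_IDX(0x0a000001, NAT_PROTO_UDP, 1024)
+ * yields 0x0a00000104000200, and UNPACK_IDX() recovers addr 0x0a000001,
+ * proto 2 and port 1024.  A request index of 0 means "start from the
+ * beginning"; next_idx 0xFF reported back to userland means the end of
+ * the state space was reached.
+ */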
+
+static struct nat64lsn_portgroup *
+get_next_pg(struct nat64lsn_cfg *cfg, uint32_t *addr, uint8_t *nat_proto,
+ uint16_t *port)
+{
+ struct nat64lsn_portgroup *pg;
+ uint64_t pre_pack, post_pack;
+
+ pg = NULL;
+ pre_pack = PACK_IDX(*addr, *nat_proto, *port);
+ for (;;) {
+ if (get_next_idx(cfg, addr, nat_proto, port) != 0) {
+ /* End of states */
+ return (pg);
+ }
+
+ pg = GET_PORTGROUP(cfg, *addr, *nat_proto, *port);
+ if (pg != NULL)
+ break;
+ }
+
+ post_pack = PACK_IDX(*addr, *nat_proto, *port);
+ if (pre_pack == post_pack)
+ DPRINTF(DP_STATE, "XXX: PACK_IDX %u %d %d",
+ *addr, *nat_proto, *port);
+ return (pg);
+}
+
+static NAT64NOINLINE struct nat64lsn_portgroup *
+get_first_pg(struct nat64lsn_cfg *cfg, uint32_t *addr, uint8_t *nat_proto,
+ uint16_t *port)
+{
+ struct nat64lsn_portgroup *pg;
+
+ pg = GET_PORTGROUP(cfg, *addr, *nat_proto, *port);
+ if (pg == NULL)
+ pg = get_next_pg(cfg, addr, nat_proto, port);
+
+ return (pg);
+}
+
+/*
+ * Lists nat64lsn states.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_obj_data [ uint64_t ]]
+ * Reply: [ ipfw_obj_header ipfw_obj_data [
+ * ipfw_nat64lsn_stg ipfw_nat64lsn_state x N] ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64lsn_states(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_header *oh;
+ ipfw_obj_data *od;
+ ipfw_nat64lsn_stg *stg;
+ struct nat64lsn_cfg *cfg;
+ struct nat64lsn_portgroup *pg, *pg_next;
+ uint64_t next_idx;
+ size_t sz;
+ uint32_t addr, states;
+ uint16_t port;
+ uint8_t nat_proto;
+
+ sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_data) +
+ sizeof(uint64_t);
+ /* Check minimum header size */
+ if (sd->valsize < sz)
+ return (EINVAL);
+
+ oh = (ipfw_obj_header *)sd->kbuf;
+ od = (ipfw_obj_data *)(oh + 1);
+ if (od->head.type != IPFW_TLV_OBJDATA ||
+ od->head.length != sz - sizeof(ipfw_obj_header))
+ return (EINVAL);
+
+ next_idx = *(uint64_t *)(od + 1);
+ /* Translate index to the request position to start from */
+ UNPACK_IDX(next_idx, addr, nat_proto, port);
+ if (nat_proto >= NAT_MAX_PROTO)
+ return (EINVAL);
+ if (nat_proto == 0 && addr != 0)
+ return (EINVAL);
+
+ IPFW_UH_RLOCK(ch);
+ cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_RUNLOCK(ch);
+ return (ESRCH);
+ }
+ /* Fill in starting point */
+ if (addr == 0) {
+ addr = cfg->prefix4;
+ nat_proto = 1;
+ port = 0;
+ }
+ if (addr < cfg->prefix4 || addr > cfg->pmask4) {
+ IPFW_UH_RUNLOCK(ch);
+ DPRINTF(DP_GENERIC | DP_STATE, "XXX: %ju %u %u",
+ (uintmax_t)next_idx, addr, cfg->pmask4);
+ return (EINVAL);
+ }
+
+ sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_data) +
+ sizeof(ipfw_nat64lsn_stg);
+ if (sd->valsize < sz)
+ return (ENOMEM);
+ oh = (ipfw_obj_header *)ipfw_get_sopt_space(sd, sz);
+ od = (ipfw_obj_data *)(oh + 1);
+ od->head.type = IPFW_TLV_OBJDATA;
+ od->head.length = sz - sizeof(ipfw_obj_header);
+ stg = (ipfw_nat64lsn_stg *)(od + 1);
+
+ pg = get_first_pg(cfg, &addr, &nat_proto, &port);
+ if (pg == NULL) {
+ /* No states */
+ stg->next_idx = 0xFF;
+ stg->count = 0;
+ IPFW_UH_RUNLOCK(ch);
+ return (0);
+ }
+ states = 0;
+ pg_next = NULL;
+ while (pg != NULL) {
+ pg_next = get_next_pg(cfg, &addr, &nat_proto, &port);
+ if (pg_next == NULL)
+ stg->next_idx = 0xFF;
+ else
+ stg->next_idx = PACK_IDX(addr, nat_proto, port);
+
+ if (export_pg_states(cfg, pg, stg, sd) != 0) {
+ IPFW_UH_RUNLOCK(ch);
+ return (states == 0 ? ENOMEM: 0);
+ }
+ states += stg->count;
+ od->head.length += stg->count * sizeof(ipfw_nat64lsn_state);
+ sz += stg->count * sizeof(ipfw_nat64lsn_state);
+ if (pg_next != NULL) {
+ sz += sizeof(ipfw_nat64lsn_stg);
+ if (sd->valsize < sz)
+ break;
+ stg = (ipfw_nat64lsn_stg *)ipfw_get_sopt_space(sd,
+ sizeof(ipfw_nat64lsn_stg));
+ }
+ pg = pg_next;
+ }
+ IPFW_UH_RUNLOCK(ch);
+ return (0);
+}
+
+static struct ipfw_sopt_handler scodes[] = {
+ { IP_FW_NAT64LSN_CREATE, 0, HDIR_BOTH, nat64lsn_create },
+ { IP_FW_NAT64LSN_DESTROY,0, HDIR_SET, nat64lsn_destroy },
+ { IP_FW_NAT64LSN_CONFIG, 0, HDIR_BOTH, nat64lsn_config },
+ { IP_FW_NAT64LSN_LIST, 0, HDIR_GET, nat64lsn_list },
+ { IP_FW_NAT64LSN_STATS, 0, HDIR_GET, nat64lsn_stats },
+ { IP_FW_NAT64LSN_RESET_STATS,0, HDIR_SET, nat64lsn_reset_stats },
+ { IP_FW_NAT64LSN_LIST_STATES,0, HDIR_GET, nat64lsn_states },
+};
+
+static int
+nat64lsn_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
+{
+ ipfw_insn *icmd;
+
+ icmd = cmd - 1;
+ if (icmd->opcode != O_EXTERNAL_ACTION ||
+ icmd->arg1 != V_nat64lsn_eid)
+ return (1);
+
+ *puidx = cmd->arg1;
+ *ptype = 0;
+ return (0);
+}
+
+static void
+nat64lsn_update_arg1(ipfw_insn *cmd, uint16_t idx)
+{
+
+ cmd->arg1 = idx;
+}
+
+static int
+nat64lsn_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct named_object **pno)
+{
+ int err;
+
+ err = ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti,
+ IPFW_TLV_NAT64LSN_NAME, pno);
+ return (err);
+}
+
+static struct named_object *
+nat64lsn_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
+{
+ struct namedobj_instance *ni;
+ struct named_object *no;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+ ni = CHAIN_TO_SRV(ch);
+ no = ipfw_objhash_lookup_kidx(ni, idx);
+ KASSERT(no != NULL, ("NAT64LSN with index %d not found", idx));
+
+ return (no);
+}
+
+static int
+nat64lsn_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set,
+ enum ipfw_sets_cmd cmd)
+{
+
+ return (ipfw_obj_manage_sets(CHAIN_TO_SRV(ch), IPFW_TLV_NAT64LSN_NAME,
+ set, new_set, cmd));
+}
+
+static struct opcode_obj_rewrite opcodes[] = {
+ {
+ .opcode = O_EXTERNAL_INSTANCE,
+ .etlv = IPFW_TLV_EACTION /* just show it isn't table */,
+ .classifier = nat64lsn_classify,
+ .update = nat64lsn_update_arg1,
+ .find_byname = nat64lsn_findbyname,
+ .find_bykidx = nat64lsn_findbykidx,
+ .manage_sets = nat64lsn_manage_sets,
+ },
+};
+
+static int
+destroy_config_cb(struct namedobj_instance *ni, struct named_object *no,
+ void *arg)
+{
+ struct nat64lsn_cfg *cfg;
+ struct ip_fw_chain *ch;
+
+ ch = (struct ip_fw_chain *)arg;
+ cfg = (struct nat64lsn_cfg *)SRV_OBJECT(ch, no->kidx);
+ SRV_OBJECT(ch, no->kidx) = NULL;
+ nat64lsn_detach_config(ch, cfg);
+ nat64lsn_destroy_instance(cfg);
+ return (0);
+}
+
+int
+nat64lsn_init(struct ip_fw_chain *ch, int first)
+{
+
+ if (first != 0)
+ nat64lsn_init_internal();
+ V_nat64lsn_eid = ipfw_add_eaction(ch, ipfw_nat64lsn, "nat64lsn");
+ if (V_nat64lsn_eid == 0)
+ return (ENXIO);
+ IPFW_ADD_SOPT_HANDLER(first, scodes);
+ IPFW_ADD_OBJ_REWRITER(first, opcodes);
+ return (0);
+}
+
+void
+nat64lsn_uninit(struct ip_fw_chain *ch, int last)
+{
+
+ IPFW_DEL_OBJ_REWRITER(last, opcodes);
+ IPFW_DEL_SOPT_HANDLER(last, scodes);
+ ipfw_del_eaction(ch, V_nat64lsn_eid);
+ /*
+	 * Since we have already deregistered the external action,
+	 * our named objects are no longer accessible via rules, because
+	 * all rules referencing them were truncated by ipfw_del_eaction().
+	 * So we can unlink and destroy our named objects without holding
+	 * IPFW_WLOCK().
+ */
+ IPFW_UH_WLOCK(ch);
+ ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), destroy_config_cb, ch,
+ IPFW_TLV_NAT64LSN_NAME);
+ V_nat64lsn_eid = 0;
+ IPFW_UH_WUNLOCK(ch);
+ if (last != 0)
+ nat64lsn_uninit_internal();
+}
+
diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64stl.c b/freebsd/sys/netpfil/ipfw/nat64/nat64stl.c
new file mode 100644
index 00000000..36e6e268
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nat64/nat64stl.c
@@ -0,0 +1,262 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2015-2016 Yandex LLC
+ * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_pflog.h>
+#include <net/pfil.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/ip_fw_nat64.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/nat64/ip_fw_nat64.h>
+#include <netpfil/ipfw/nat64/nat64_translate.h>
+#include <netpfil/ipfw/nat64/nat64stl.h>
+#include <netpfil/pf/pf.h>
+
+#define NAT64_LOOKUP(chain, cmd) \
+ (struct nat64stl_cfg *)SRV_OBJECT((chain), (cmd)->arg1)
+
+static void
+nat64stl_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family,
+ uint32_t kidx)
+{
+ static uint32_t pktid = 0;
+
+	memset(plog, 0, sizeof(*plog));
+ plog->length = PFLOG_REAL_HDRLEN;
+ plog->af = family;
+ plog->action = PF_NAT;
+ plog->dir = PF_IN;
+ plog->rulenr = htonl(kidx);
+ plog->subrulenr = htonl(++pktid);
+ plog->ruleset[0] = '\0';
+ strlcpy(plog->ifname, "NAT64STL", sizeof(plog->ifname));
+ ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m);
+}
+
+static int
+nat64stl_handle_ip4(struct ip_fw_chain *chain, struct nat64stl_cfg *cfg,
+ struct mbuf *m, uint32_t tablearg)
+{
+ struct pfloghdr loghdr, *logdata;
+ struct in6_addr saddr, daddr;
+ struct ip *ip;
+
+ ip = mtod(m, struct ip*);
+ if (nat64_check_ip4(ip->ip_src.s_addr) != 0 ||
+ nat64_check_ip4(ip->ip_dst.s_addr) != 0 ||
+ nat64_check_private_ip4(ip->ip_src.s_addr) != 0 ||
+ nat64_check_private_ip4(ip->ip_dst.s_addr) != 0)
+ return (NAT64SKIP);
+
+ daddr = TARG_VAL(chain, tablearg, nh6);
+ if (nat64_check_ip6(&daddr) != 0)
+ return (NAT64MFREE);
+ saddr = cfg->prefix6;
+ nat64_set_ip4(&saddr, ip->ip_src.s_addr);
+
+ if (cfg->flags & NAT64_LOG) {
+ logdata = &loghdr;
+ nat64stl_log(logdata, m, AF_INET, cfg->no.kidx);
+ } else
+ logdata = NULL;
+ return (nat64_do_handle_ip4(m, &saddr, &daddr, 0, &cfg->stats,
+ logdata));
+}
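nat64stl_handle_ip4() above builds the IPv6 source address by writing the IPv4 source into a copy of cfg->prefix6 via nat64_set_ip4(); that helper lives in nat64_translate.c and is not part of this hunk. A minimal sketch of the step, assuming the /96 well-known-prefix embedding of RFC 6052 (IPv4 address in the last four bytes):

#include <string.h>
#include <netinet/in.h>

/*
 * Assumed behaviour of nat64_set_ip4() for a /96 prefix: "ip4" is in
 * network byte order (as taken from ip->ip_src) and overwrites the
 * last 32 bits of the already-copied prefix.
 */
static void
embed_ip4_example(struct in6_addr *ip6, in_addr_t ip4)
{

	memcpy(&ip6->s6_addr[12], &ip4, sizeof(ip4));
}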
+
+static int
+nat64stl_handle_ip6(struct ip_fw_chain *chain, struct nat64stl_cfg *cfg,
+ struct mbuf *m, uint32_t tablearg)
+{
+ struct pfloghdr loghdr, *logdata;
+ struct ip6_hdr *ip6;
+ uint32_t aaddr;
+
+ aaddr = htonl(TARG_VAL(chain, tablearg, nh4));
+
+ /*
+	 * NOTE: we expect that ipfw_chk() has done m_pullup() up to the
+	 * upper level protocol's headers. We also skip some checks that
+	 * ip6_input(), ip6_forward(), ip6_fastfwd() and ipfw_chk() have
+	 * already done.
+ */
+ ip6 = mtod(m, struct ip6_hdr *);
+ /* Check ip6_dst matches configured prefix */
+ if (bcmp(&ip6->ip6_dst, &cfg->prefix6, cfg->plen6 / 8) != 0)
+ return (NAT64SKIP);
+
+ if (cfg->flags & NAT64_LOG) {
+ logdata = &loghdr;
+ nat64stl_log(logdata, m, AF_INET6, cfg->no.kidx);
+ } else
+ logdata = NULL;
+ return (nat64_do_handle_ip6(m, aaddr, 0, &cfg->stats, logdata));
+}
+
+static int
+nat64stl_handle_icmp6(struct ip_fw_chain *chain, struct nat64stl_cfg *cfg,
+ struct mbuf *m)
+{
+ struct pfloghdr loghdr, *logdata;
+ nat64_stats_block *stats;
+ struct ip6_hdr *ip6i;
+ struct icmp6_hdr *icmp6;
+ uint32_t tablearg;
+ int hlen, proto;
+
+ hlen = 0;
+ stats = &cfg->stats;
+ proto = nat64_getlasthdr(m, &hlen);
+ if (proto != IPPROTO_ICMPV6) {
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ icmp6 = mtodo(m, hlen);
+ switch (icmp6->icmp6_type) {
+ case ICMP6_DST_UNREACH:
+ case ICMP6_PACKET_TOO_BIG:
+ case ICMP6_TIME_EXCEED_TRANSIT:
+ case ICMP6_PARAM_PROB:
+ break;
+ default:
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ hlen += sizeof(struct icmp6_hdr);
+ if (m->m_pkthdr.len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) {
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ if (m->m_len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN)
+ m = m_pullup(m, hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN);
+ if (m == NULL) {
+ NAT64STAT_INC(stats, nomem);
+ return (NAT64RETURN);
+ }
+ /*
+ * Use destination address from inner IPv6 header to determine
+ * IPv4 mapped address.
+ */
+ ip6i = mtodo(m, hlen);
+ if (ipfw_lookup_table_extended(chain, cfg->map64,
+ sizeof(struct in6_addr), &ip6i->ip6_dst, &tablearg) == 0) {
+ m_freem(m);
+ return (NAT64RETURN);
+ }
+ if (cfg->flags & NAT64_LOG) {
+ logdata = &loghdr;
+ nat64stl_log(logdata, m, AF_INET6, cfg->no.kidx);
+ } else
+ logdata = NULL;
+ return (nat64_handle_icmp6(m, 0,
+ htonl(TARG_VAL(chain, tablearg, nh4)), 0, stats, logdata));
+}
+
+int
+ipfw_nat64stl(struct ip_fw_chain *chain, struct ip_fw_args *args,
+ ipfw_insn *cmd, int *done)
+{
+ ipfw_insn *icmd;
+ struct nat64stl_cfg *cfg;
+ uint32_t tablearg;
+ int ret;
+
+ IPFW_RLOCK_ASSERT(chain);
+
+ *done = 0; /* try next rule if not matched */
+ icmd = cmd + 1;
+ if (cmd->opcode != O_EXTERNAL_ACTION ||
+ cmd->arg1 != V_nat64stl_eid ||
+ icmd->opcode != O_EXTERNAL_INSTANCE ||
+ (cfg = NAT64_LOOKUP(chain, icmd)) == NULL)
+ return (0);
+
+ switch (args->f_id.addr_type) {
+ case 4:
+ ret = ipfw_lookup_table(chain, cfg->map46,
+ htonl(args->f_id.dst_ip), &tablearg);
+ break;
+ case 6:
+ ret = ipfw_lookup_table_extended(chain, cfg->map64,
+ sizeof(struct in6_addr), &args->f_id.src_ip6, &tablearg);
+ break;
+ default:
+ return (0);
+ }
+ if (ret == 0) {
+ /*
+		 * When the packet is an ICMPv6 message from an intermediate
+		 * router, its source address will not match any of the
+		 * addresses in our map64 table.
+ */
+ if (args->f_id.proto != IPPROTO_ICMPV6)
+ return (0);
+
+ ret = nat64stl_handle_icmp6(chain, cfg, args->m);
+ } else {
+ if (args->f_id.addr_type == 4)
+ ret = nat64stl_handle_ip4(chain, cfg, args->m,
+ tablearg);
+ else
+ ret = nat64stl_handle_ip6(chain, cfg, args->m,
+ tablearg);
+ }
+ if (ret == NAT64SKIP)
+ return (0);
+
+ *done = 1; /* terminate the search */
+ if (ret == NAT64MFREE)
+ m_freem(args->m);
+ args->m = NULL;
+ return (IP_FW_DENY);
+}
+
+
diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64stl.h b/freebsd/sys/netpfil/ipfw/nat64/nat64stl.h
new file mode 100644
index 00000000..42ec20ea
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nat64/nat64stl.h
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2015-2016 Yandex LLC
+ * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_FW_NAT64STL_H_
+#define _IP_FW_NAT64STL_H_
+
+struct nat64stl_cfg {
+ struct named_object no;
+
+ uint16_t map64; /* table with 6to4 mapping */
+ uint16_t map46; /* table with 4to6 mapping */
+
+ struct in6_addr prefix6;/* IPv6 prefix */
+ uint8_t plen6; /* prefix length */
+ uint8_t flags; /* flags for internal use */
+#define NAT64STL_KIDX 0x0100
+#define NAT64STL_46T 0x0200
+#define NAT64STL_64T 0x0400
+#define NAT64STL_FLAGSMASK (NAT64_LOG) /* flags to pass to userland */
+ char name[64];
+ nat64_stats_block stats;
+};
+
+VNET_DECLARE(uint16_t, nat64stl_eid);
+#define V_nat64stl_eid VNET(nat64stl_eid)
+#define IPFW_TLV_NAT64STL_NAME IPFW_TLV_EACTION_NAME(V_nat64stl_eid)
+
+int ipfw_nat64stl(struct ip_fw_chain *chain, struct ip_fw_args *args,
+ ipfw_insn *cmd, int *done);
+
+#endif
+
diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64stl_control.c b/freebsd/sys/netpfil/ipfw/nat64/nat64stl_control.c
new file mode 100644
index 00000000..6ee04867
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nat64/nat64stl_control.c
@@ -0,0 +1,623 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2015-2016 Yandex LLC
+ * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * Copyright (c) 2015 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <rtems/bsd/sys/errno.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sockopt.h>
+#include <sys/queue.h>
+#include <sys/syslog.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/pfil.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet6/in6_var.h>
+#include <netinet6/ip6_var.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/nat64/ip_fw_nat64.h>
+#include <netpfil/ipfw/nat64/nat64stl.h>
+#include <netinet6/ip_fw_nat64.h>
+
+VNET_DEFINE(uint16_t, nat64stl_eid) = 0;
+
+static struct nat64stl_cfg *nat64stl_alloc_config(const char *name, uint8_t set);
+static void nat64stl_free_config(struct nat64stl_cfg *cfg);
+static struct nat64stl_cfg *nat64stl_find(struct namedobj_instance *ni,
+ const char *name, uint8_t set);
+
+static struct nat64stl_cfg *
+nat64stl_alloc_config(const char *name, uint8_t set)
+{
+ struct nat64stl_cfg *cfg;
+
+ cfg = malloc(sizeof(struct nat64stl_cfg), M_IPFW, M_WAITOK | M_ZERO);
+ COUNTER_ARRAY_ALLOC(cfg->stats.stats, NAT64STATS, M_WAITOK);
+ cfg->no.name = cfg->name;
+ cfg->no.etlv = IPFW_TLV_NAT64STL_NAME;
+ cfg->no.set = set;
+ strlcpy(cfg->name, name, sizeof(cfg->name));
+ return (cfg);
+}
+
+static void
+nat64stl_free_config(struct nat64stl_cfg *cfg)
+{
+
+ COUNTER_ARRAY_FREE(cfg->stats.stats, NAT64STATS);
+ free(cfg, M_IPFW);
+}
+
+static void
+nat64stl_export_config(struct ip_fw_chain *ch, struct nat64stl_cfg *cfg,
+ ipfw_nat64stl_cfg *uc)
+{
+ struct named_object *no;
+
+ uc->prefix6 = cfg->prefix6;
+ uc->plen6 = cfg->plen6;
+ uc->flags = cfg->flags & NAT64STL_FLAGSMASK;
+ uc->set = cfg->no.set;
+ strlcpy(uc->name, cfg->no.name, sizeof(uc->name));
+
+ no = ipfw_objhash_lookup_table_kidx(ch, cfg->map64);
+ ipfw_export_obj_ntlv(no, &uc->ntlv6);
+ no = ipfw_objhash_lookup_table_kidx(ch, cfg->map46);
+ ipfw_export_obj_ntlv(no, &uc->ntlv4);
+}
+
+struct nat64stl_dump_arg {
+ struct ip_fw_chain *ch;
+ struct sockopt_data *sd;
+};
+
+static int
+export_config_cb(struct namedobj_instance *ni, struct named_object *no,
+ void *arg)
+{
+ struct nat64stl_dump_arg *da = (struct nat64stl_dump_arg *)arg;
+ ipfw_nat64stl_cfg *uc;
+
+ uc = (ipfw_nat64stl_cfg *)ipfw_get_sopt_space(da->sd, sizeof(*uc));
+ nat64stl_export_config(da->ch, (struct nat64stl_cfg *)no, uc);
+ return (0);
+}
+
+static struct nat64stl_cfg *
+nat64stl_find(struct namedobj_instance *ni, const char *name, uint8_t set)
+{
+ struct nat64stl_cfg *cfg;
+
+ cfg = (struct nat64stl_cfg *)ipfw_objhash_lookup_name_type(ni, set,
+ IPFW_TLV_NAT64STL_NAME, name);
+
+ return (cfg);
+}
+
+
+static int
+nat64stl_create_internal(struct ip_fw_chain *ch, struct nat64stl_cfg *cfg,
+ ipfw_nat64stl_cfg *i)
+{
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ if (ipfw_objhash_alloc_idx(CHAIN_TO_SRV(ch), &cfg->no.kidx) != 0)
+ return (ENOSPC);
+ cfg->flags |= NAT64STL_KIDX;
+
+ if (ipfw_ref_table(ch, &i->ntlv4, &cfg->map46) != 0)
+ return (EINVAL);
+ cfg->flags |= NAT64STL_46T;
+
+ if (ipfw_ref_table(ch, &i->ntlv6, &cfg->map64) != 0)
+ return (EINVAL);
+ cfg->flags |= NAT64STL_64T;
+
+ ipfw_objhash_add(CHAIN_TO_SRV(ch), &cfg->no);
+
+ return (0);
+}
+
+/*
+ * Creates new nat64 instance.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ipfw_nat64stl_cfg ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64stl_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_lheader *olh;
+ ipfw_nat64stl_cfg *uc;
+ struct namedobj_instance *ni;
+ struct nat64stl_cfg *cfg;
+ int error;
+
+ if (sd->valsize != sizeof(*olh) + sizeof(*uc))
+ return (EINVAL);
+
+ olh = (ipfw_obj_lheader *)sd->kbuf;
+ uc = (ipfw_nat64stl_cfg *)(olh + 1);
+
+ if (ipfw_check_object_name_generic(uc->name) != 0)
+ return (EINVAL);
+ if (!IN6_IS_ADDR_WKPFX(&uc->prefix6))
+ return (EINVAL);
+ if (uc->plen6 != 96 || uc->set >= IPFW_MAX_SETS)
+ return (EINVAL);
+
+ /* XXX: check types of tables */
+
+ ni = CHAIN_TO_SRV(ch);
+ error = 0;
+
+ IPFW_UH_RLOCK(ch);
+ if (nat64stl_find(ni, uc->name, uc->set) != NULL) {
+ IPFW_UH_RUNLOCK(ch);
+ return (EEXIST);
+ }
+ IPFW_UH_RUNLOCK(ch);
+
+ cfg = nat64stl_alloc_config(uc->name, uc->set);
+ cfg->prefix6 = uc->prefix6;
+ cfg->plen6 = uc->plen6;
+ cfg->flags = uc->flags & NAT64STL_FLAGSMASK;
+
+ IPFW_UH_WLOCK(ch);
+
+ if (nat64stl_find(ni, uc->name, uc->set) != NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ nat64stl_free_config(cfg);
+ return (EEXIST);
+ }
+ error = nat64stl_create_internal(ch, cfg, uc);
+ if (error == 0) {
+ /* Okay, let's link data */
+ IPFW_WLOCK(ch);
+ SRV_OBJECT(ch, cfg->no.kidx) = cfg;
+ IPFW_WUNLOCK(ch);
+
+ IPFW_UH_WUNLOCK(ch);
+ return (0);
+ }
+
+ if (cfg->flags & NAT64STL_KIDX)
+ ipfw_objhash_free_idx(ni, cfg->no.kidx);
+ if (cfg->flags & NAT64STL_46T)
+ ipfw_unref_table(ch, cfg->map46);
+ if (cfg->flags & NAT64STL_64T)
+ ipfw_unref_table(ch, cfg->map64);
+
+ IPFW_UH_WUNLOCK(ch);
+ nat64stl_free_config(cfg);
+ return (error);
+}
+
+/*
+ * Change existing nat64stl instance configuration.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_nat64stl_cfg ]
+ * Reply: [ ipfw_obj_header ipfw_nat64stl_cfg ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64stl_config(struct ip_fw_chain *ch, ip_fw3_opheader *op,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_header *oh;
+ ipfw_nat64stl_cfg *uc;
+ struct nat64stl_cfg *cfg;
+ struct namedobj_instance *ni;
+
+ if (sd->valsize != sizeof(*oh) + sizeof(*uc))
+ return (EINVAL);
+
+ oh = (ipfw_obj_header *)ipfw_get_sopt_space(sd,
+ sizeof(*oh) + sizeof(*uc));
+ uc = (ipfw_nat64stl_cfg *)(oh + 1);
+
+ if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 ||
+ oh->ntlv.set >= IPFW_MAX_SETS)
+ return (EINVAL);
+
+ ni = CHAIN_TO_SRV(ch);
+ if (sd->sopt->sopt_dir == SOPT_GET) {
+ IPFW_UH_RLOCK(ch);
+ cfg = nat64stl_find(ni, oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_RUNLOCK(ch);
+ return (EEXIST);
+ }
+ nat64stl_export_config(ch, cfg, uc);
+ IPFW_UH_RUNLOCK(ch);
+ return (0);
+ }
+
+ IPFW_UH_WLOCK(ch);
+ cfg = nat64stl_find(ni, oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EEXIST);
+ }
+
+ /*
+	 * For now only the following values may be changed:
+	 * flags.
+ */
+
+ cfg->flags = uc->flags & NAT64STL_FLAGSMASK;
+ IPFW_UH_WUNLOCK(ch);
+ return (0);
+}
+
+static void
+nat64stl_detach_config(struct ip_fw_chain *ch, struct nat64stl_cfg *cfg)
+{
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ ipfw_objhash_del(CHAIN_TO_SRV(ch), &cfg->no);
+ ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), cfg->no.kidx);
+ ipfw_unref_table(ch, cfg->map46);
+ ipfw_unref_table(ch, cfg->map64);
+}
+
+/*
+ * Destroys nat64 instance.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64stl_destroy(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_header *oh;
+ struct nat64stl_cfg *cfg;
+
+ if (sd->valsize != sizeof(*oh))
+ return (EINVAL);
+
+ oh = (ipfw_obj_header *)sd->kbuf;
+ if (ipfw_check_object_name_generic(oh->ntlv.name) != 0)
+ return (EINVAL);
+
+ IPFW_UH_WLOCK(ch);
+ cfg = nat64stl_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (ESRCH);
+ }
+ if (cfg->no.refcnt > 0) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EBUSY);
+ }
+
+ IPFW_WLOCK(ch);
+ SRV_OBJECT(ch, cfg->no.kidx) = NULL;
+ IPFW_WUNLOCK(ch);
+
+ nat64stl_detach_config(ch, cfg);
+ IPFW_UH_WUNLOCK(ch);
+
+ nat64stl_free_config(cfg);
+ return (0);
+}
+
+/*
+ * Lists all nat64stl instances currently available in kernel.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ]
+ * Reply: [ ipfw_obj_lheader ipfw_nat64stl_cfg x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64stl_list(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_lheader *olh;
+ struct nat64stl_dump_arg da;
+
+ /* Check minimum header size */
+ if (sd->valsize < sizeof(ipfw_obj_lheader))
+ return (EINVAL);
+
+ olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh));
+
+ IPFW_UH_RLOCK(ch);
+ olh->count = ipfw_objhash_count_type(CHAIN_TO_SRV(ch),
+ IPFW_TLV_NAT64STL_NAME);
+ olh->objsize = sizeof(ipfw_nat64stl_cfg);
+ olh->size = sizeof(*olh) + olh->count * olh->objsize;
+
+ if (sd->valsize < olh->size) {
+ IPFW_UH_RUNLOCK(ch);
+ return (ENOMEM);
+ }
+ memset(&da, 0, sizeof(da));
+ da.ch = ch;
+ da.sd = sd;
+ ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), export_config_cb,
+ &da, IPFW_TLV_NAT64STL_NAME);
+ IPFW_UH_RUNLOCK(ch);
+
+ return (0);
+}
+
+#define __COPY_STAT_FIELD(_cfg, _stats, _field) \
+ (_stats)->_field = NAT64STAT_FETCH(&(_cfg)->stats, _field)
+static void
+export_stats(struct ip_fw_chain *ch, struct nat64stl_cfg *cfg,
+ struct ipfw_nat64stl_stats *stats)
+{
+
+ __COPY_STAT_FIELD(cfg, stats, opcnt64);
+ __COPY_STAT_FIELD(cfg, stats, opcnt46);
+ __COPY_STAT_FIELD(cfg, stats, ofrags);
+ __COPY_STAT_FIELD(cfg, stats, ifrags);
+ __COPY_STAT_FIELD(cfg, stats, oerrors);
+ __COPY_STAT_FIELD(cfg, stats, noroute4);
+ __COPY_STAT_FIELD(cfg, stats, noroute6);
+ __COPY_STAT_FIELD(cfg, stats, noproto);
+ __COPY_STAT_FIELD(cfg, stats, nomem);
+ __COPY_STAT_FIELD(cfg, stats, dropped);
+}
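export_stats() above copies the per-instance counters field by field; the underlying trick (visible in the NPTV6STAT_* macros in nptv6.h later in this patch) is that each uint64_t field of the exported stats structure maps to one slot of a counter array via offsetof(). A plain-array sketch of that indexing, using hypothetical names:

#include <stddef.h>
#include <stdint.h>

/* Illustrative stand-in for the exported stats structure. */
struct stats_example {
	uint64_t opcnt64;
	uint64_t opcnt46;
	uint64_t dropped;
};

#define STAT_SLOT(f)	(offsetof(struct stats_example, f) / sizeof(uint64_t))

static uint64_t
fetch_example(const uint64_t *counters, size_t slot)
{

	return (counters[slot]);
}
/* fetch_example(counters, STAT_SLOT(dropped)) reads slot 2. */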
+
+/*
+ * Get nat64stl statistics.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ]
+ * Reply: [ ipfw_obj_header ipfw_obj_ctlv [ uint64_t x N ]]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64stl_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op,
+ struct sockopt_data *sd)
+{
+ struct ipfw_nat64stl_stats stats;
+ struct nat64stl_cfg *cfg;
+ ipfw_obj_header *oh;
+ ipfw_obj_ctlv *ctlv;
+ size_t sz;
+
+ sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) + sizeof(stats);
+ if (sd->valsize % sizeof(uint64_t))
+ return (EINVAL);
+ if (sd->valsize < sz)
+ return (ENOMEM);
+ oh = (ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
+ if (oh == NULL)
+ return (EINVAL);
+ memset(&stats, 0, sizeof(stats));
+
+ IPFW_UH_RLOCK(ch);
+ cfg = nat64stl_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_RUNLOCK(ch);
+ return (ESRCH);
+ }
+ export_stats(ch, cfg, &stats);
+ IPFW_UH_RUNLOCK(ch);
+
+ ctlv = (ipfw_obj_ctlv *)(oh + 1);
+ memset(ctlv, 0, sizeof(*ctlv));
+ ctlv->head.type = IPFW_TLV_COUNTERS;
+ ctlv->head.length = sz - sizeof(ipfw_obj_header);
+ ctlv->count = sizeof(stats) / sizeof(uint64_t);
+ ctlv->objsize = sizeof(uint64_t);
+ ctlv->version = IPFW_NAT64_VERSION;
+ memcpy(ctlv + 1, &stats, sizeof(stats));
+ return (0);
+}
+
+/*
+ * Reset nat64stl statistics.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat64stl_reset_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op,
+ struct sockopt_data *sd)
+{
+ struct nat64stl_cfg *cfg;
+ ipfw_obj_header *oh;
+
+ if (sd->valsize != sizeof(*oh))
+ return (EINVAL);
+ oh = (ipfw_obj_header *)sd->kbuf;
+ if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 ||
+ oh->ntlv.set >= IPFW_MAX_SETS)
+ return (EINVAL);
+
+ IPFW_UH_WLOCK(ch);
+ cfg = nat64stl_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (ESRCH);
+ }
+ COUNTER_ARRAY_ZERO(cfg->stats.stats, NAT64STATS);
+ IPFW_UH_WUNLOCK(ch);
+ return (0);
+}
+
+static struct ipfw_sopt_handler scodes[] = {
+ { IP_FW_NAT64STL_CREATE, 0, HDIR_SET, nat64stl_create },
+ { IP_FW_NAT64STL_DESTROY,0, HDIR_SET, nat64stl_destroy },
+ { IP_FW_NAT64STL_CONFIG, 0, HDIR_BOTH, nat64stl_config },
+ { IP_FW_NAT64STL_LIST, 0, HDIR_GET, nat64stl_list },
+ { IP_FW_NAT64STL_STATS, 0, HDIR_GET, nat64stl_stats },
+ { IP_FW_NAT64STL_RESET_STATS,0, HDIR_SET, nat64stl_reset_stats },
+};
+
+static int
+nat64stl_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
+{
+ ipfw_insn *icmd;
+
+ icmd = cmd - 1;
+ if (icmd->opcode != O_EXTERNAL_ACTION ||
+ icmd->arg1 != V_nat64stl_eid)
+ return (1);
+
+ *puidx = cmd->arg1;
+ *ptype = 0;
+ return (0);
+}
+
+static void
+nat64stl_update_arg1(ipfw_insn *cmd, uint16_t idx)
+{
+
+ cmd->arg1 = idx;
+}
+
+static int
+nat64stl_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct named_object **pno)
+{
+ int err;
+
+ err = ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti,
+ IPFW_TLV_NAT64STL_NAME, pno);
+ return (err);
+}
+
+static struct named_object *
+nat64stl_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
+{
+ struct namedobj_instance *ni;
+ struct named_object *no;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+ ni = CHAIN_TO_SRV(ch);
+ no = ipfw_objhash_lookup_kidx(ni, idx);
+ KASSERT(no != NULL, ("NAT with index %d not found", idx));
+
+ return (no);
+}
+
+static int
+nat64stl_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set,
+ enum ipfw_sets_cmd cmd)
+{
+
+ return (ipfw_obj_manage_sets(CHAIN_TO_SRV(ch), IPFW_TLV_NAT64STL_NAME,
+ set, new_set, cmd));
+}
+
+static struct opcode_obj_rewrite opcodes[] = {
+ {
+ .opcode = O_EXTERNAL_INSTANCE,
+ .etlv = IPFW_TLV_EACTION /* just show it isn't table */,
+ .classifier = nat64stl_classify,
+ .update = nat64stl_update_arg1,
+ .find_byname = nat64stl_findbyname,
+ .find_bykidx = nat64stl_findbykidx,
+ .manage_sets = nat64stl_manage_sets,
+ },
+};
+
+static int
+destroy_config_cb(struct namedobj_instance *ni, struct named_object *no,
+ void *arg)
+{
+ struct nat64stl_cfg *cfg;
+ struct ip_fw_chain *ch;
+
+ ch = (struct ip_fw_chain *)arg;
+ cfg = (struct nat64stl_cfg *)SRV_OBJECT(ch, no->kidx);
+ SRV_OBJECT(ch, no->kidx) = NULL;
+ nat64stl_detach_config(ch, cfg);
+ nat64stl_free_config(cfg);
+ return (0);
+}
+
+int
+nat64stl_init(struct ip_fw_chain *ch, int first)
+{
+
+ V_nat64stl_eid = ipfw_add_eaction(ch, ipfw_nat64stl, "nat64stl");
+ if (V_nat64stl_eid == 0)
+ return (ENXIO);
+ IPFW_ADD_SOPT_HANDLER(first, scodes);
+ IPFW_ADD_OBJ_REWRITER(first, opcodes);
+ return (0);
+}
+
+void
+nat64stl_uninit(struct ip_fw_chain *ch, int last)
+{
+
+ IPFW_DEL_OBJ_REWRITER(last, opcodes);
+ IPFW_DEL_SOPT_HANDLER(last, scodes);
+ ipfw_del_eaction(ch, V_nat64stl_eid);
+ /*
+	 * Since we have already deregistered the external action,
+	 * our named objects are no longer accessible via rules, because
+	 * all rules referencing them were truncated by ipfw_del_eaction().
+	 * So we can unlink and destroy our named objects without holding
+	 * IPFW_WLOCK().
+ */
+ IPFW_UH_WLOCK(ch);
+ ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), destroy_config_cb, ch,
+ IPFW_TLV_NAT64STL_NAME);
+ V_nat64stl_eid = 0;
+ IPFW_UH_WUNLOCK(ch);
+}
+
diff --git a/freebsd/sys/netpfil/ipfw/nptv6/ip_fw_nptv6.c b/freebsd/sys/netpfil/ipfw/nptv6/ip_fw_nptv6.c
new file mode 100644
index 00000000..92a2c7a3
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nptv6/ip_fw_nptv6.c
@@ -0,0 +1,101 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2016 Yandex LLC
+ * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/nptv6/nptv6.h>
+
+static int
+vnet_ipfw_nptv6_init(const void *arg __unused)
+{
+
+ return (nptv6_init(&V_layer3_chain, IS_DEFAULT_VNET(curvnet)));
+}
+
+static int
+vnet_ipfw_nptv6_uninit(const void *arg __unused)
+{
+
+ nptv6_uninit(&V_layer3_chain, IS_DEFAULT_VNET(curvnet));
+ return (0);
+}
+
+static int
+ipfw_nptv6_modevent(module_t mod, int type, void *unused)
+{
+
+ switch (type) {
+ case MOD_LOAD:
+ case MOD_UNLOAD:
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+ return (0);
+}
+
+static moduledata_t ipfw_nptv6_mod = {
+ "ipfw_nptv6",
+ ipfw_nptv6_modevent,
+ 0
+};
+
+/* Define startup order. */
+#define IPFW_NPTV6_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN
+#define IPFW_NPTV6_MODEVENT_ORDER (SI_ORDER_ANY - 128) /* after ipfw */
+#define IPFW_NPTV6_MODULE_ORDER (IPFW_NPTV6_MODEVENT_ORDER + 1)
+#define IPFW_NPTV6_VNET_ORDER (IPFW_NPTV6_MODEVENT_ORDER + 2)
+
+DECLARE_MODULE(ipfw_nptv6, ipfw_nptv6_mod, IPFW_NPTV6_SI_SUB_FIREWALL,
+ IPFW_NPTV6_MODULE_ORDER);
+MODULE_DEPEND(ipfw_nptv6, ipfw, 3, 3, 3);
+MODULE_VERSION(ipfw_nptv6, 1);
+
+VNET_SYSINIT(vnet_ipfw_nptv6_init, IPFW_NPTV6_SI_SUB_FIREWALL,
+ IPFW_NPTV6_VNET_ORDER, vnet_ipfw_nptv6_init, NULL);
+VNET_SYSUNINIT(vnet_ipfw_nptv6_uninit, IPFW_NPTV6_SI_SUB_FIREWALL,
+ IPFW_NPTV6_VNET_ORDER, vnet_ipfw_nptv6_uninit, NULL);
diff --git a/freebsd/sys/netpfil/ipfw/nptv6/nptv6.c b/freebsd/sys/netpfil/ipfw/nptv6/nptv6.c
new file mode 100644
index 00000000..4256d028
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nptv6/nptv6.c
@@ -0,0 +1,894 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2016 Yandex LLC
+ * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <rtems/bsd/sys/errno.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/queue.h>
+#include <sys/syslog.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/netisr.h>
+#include <net/pfil.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/in6_var.h>
+#include <netinet6/ip6_var.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/nptv6/nptv6.h>
+
+static VNET_DEFINE(uint16_t, nptv6_eid) = 0;
+#define V_nptv6_eid VNET(nptv6_eid)
+#define IPFW_TLV_NPTV6_NAME IPFW_TLV_EACTION_NAME(V_nptv6_eid)
+
+static struct nptv6_cfg *nptv6_alloc_config(const char *name, uint8_t set);
+static void nptv6_free_config(struct nptv6_cfg *cfg);
+static struct nptv6_cfg *nptv6_find(struct namedobj_instance *ni,
+ const char *name, uint8_t set);
+static int nptv6_rewrite_internal(struct nptv6_cfg *cfg, struct mbuf **mp,
+ int offset);
+static int nptv6_rewrite_external(struct nptv6_cfg *cfg, struct mbuf **mp,
+ int offset);
+
+#define NPTV6_LOOKUP(chain, cmd) \
+ (struct nptv6_cfg *)SRV_OBJECT((chain), (cmd)->arg1)
+
+#ifndef IN6_MASK_ADDR
+#define IN6_MASK_ADDR(a, m) do { \
+ (a)->s6_addr32[0] &= (m)->s6_addr32[0]; \
+ (a)->s6_addr32[1] &= (m)->s6_addr32[1]; \
+ (a)->s6_addr32[2] &= (m)->s6_addr32[2]; \
+ (a)->s6_addr32[3] &= (m)->s6_addr32[3]; \
+} while (0)
+#endif
+#ifndef IN6_ARE_MASKED_ADDR_EQUAL
+#define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \
+ (((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \
+ (((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \
+ (((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \
+ (((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 )
+#endif
+
+#if 0
+#define NPTV6_DEBUG(fmt, ...) do { \
+ printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \
+} while (0)
+#define NPTV6_IPDEBUG(fmt, ...) do { \
+ char _s[INET6_ADDRSTRLEN], _d[INET6_ADDRSTRLEN]; \
+ printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \
+} while (0)
+#else
+#define NPTV6_DEBUG(fmt, ...)
+#define NPTV6_IPDEBUG(fmt, ...)
+#endif
+
+static int
+nptv6_getlasthdr(struct nptv6_cfg *cfg, struct mbuf *m, int *offset)
+{
+ struct ip6_hdr *ip6;
+ struct ip6_hbh *hbh;
+ int proto, hlen;
+
+ hlen = (offset == NULL) ? 0: *offset;
+ if (m->m_len < hlen)
+ return (-1);
+ ip6 = mtodo(m, hlen);
+ hlen += sizeof(*ip6);
+ proto = ip6->ip6_nxt;
+ while (proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING ||
+ proto == IPPROTO_DSTOPTS) {
+ hbh = mtodo(m, hlen);
+ if (m->m_len < hlen)
+ return (-1);
+ proto = hbh->ip6h_nxt;
+		hlen += (hbh->ip6h_len + 1) << 3; /* len excludes first 8 octets */
+ }
+ if (offset != NULL)
+ *offset = hlen;
+ return (proto);
+}
+
+static int
+nptv6_translate_icmpv6(struct nptv6_cfg *cfg, struct mbuf **mp, int offset)
+{
+ struct icmp6_hdr *icmp6;
+ struct ip6_hdr *ip6;
+ struct mbuf *m;
+
+ m = *mp;
+ if (offset > m->m_len)
+ return (-1);
+ icmp6 = mtodo(m, offset);
+ NPTV6_DEBUG("ICMPv6 type %d", icmp6->icmp6_type);
+ switch (icmp6->icmp6_type) {
+ case ICMP6_DST_UNREACH:
+ case ICMP6_PACKET_TOO_BIG:
+ case ICMP6_TIME_EXCEEDED:
+ case ICMP6_PARAM_PROB:
+ break;
+ case ICMP6_ECHO_REQUEST:
+ case ICMP6_ECHO_REPLY:
+ /* nothing to translate */
+ return (0);
+ default:
+ /*
+		 * XXX: We could add checks to avoid translating NDP and MLD
+		 * messages. Currently the user must explicitly allow these
+		 * message types, otherwise such packets will be dropped.
+ */
+ return (-1);
+ }
+ offset += sizeof(*icmp6);
+ if (offset + sizeof(*ip6) > m->m_pkthdr.len)
+ return (-1);
+ if (offset + sizeof(*ip6) > m->m_len)
+ *mp = m = m_pullup(m, offset + sizeof(*ip6));
+ if (m == NULL)
+ return (-1);
+ ip6 = mtodo(m, offset);
+ NPTV6_IPDEBUG("offset %d, %s -> %s %d", offset,
+ inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)),
+ inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)),
+ ip6->ip6_nxt);
+ if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_src,
+ &cfg->external, &cfg->mask))
+ return (nptv6_rewrite_external(cfg, mp, offset));
+ else if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_dst,
+ &cfg->internal, &cfg->mask))
+ return (nptv6_rewrite_internal(cfg, mp, offset));
+ /*
+	 * The addresses in the inner IPv6 header do not match
+	 * our prefixes.
+ */
+ return (-1);
+}
+
+static int
+nptv6_search_index(struct nptv6_cfg *cfg, struct in6_addr *a)
+{
+ int idx;
+
+ if (cfg->flags & NPTV6_48PLEN)
+ return (3);
+
+ /* Search suitable word index for adjustment */
+ for (idx = 4; idx < 8; idx++)
+ if (a->s6_addr16[idx] != 0xffff)
+ break;
+ /*
+ * RFC 6296 p3.7: If an NPTv6 Translator discovers a datagram with
+ * an IID of all-zeros while performing address mapping, that
+ * datagram MUST be dropped, and an ICMPv6 Parameter Problem error
+ * SHOULD be generated.
+ */
+ if (idx == 8 ||
+ (a->s6_addr32[2] == 0 && a->s6_addr32[3] == 0))
+ return (-1);
+ return (idx);
+}
+
+static void
+nptv6_copy_addr(struct in6_addr *src, struct in6_addr *dst,
+ struct in6_addr *mask)
+{
+ int i;
+
+ for (i = 0; i < 8 && mask->s6_addr8[i] != 0; i++) {
+ dst->s6_addr8[i] &= ~mask->s6_addr8[i];
+ dst->s6_addr8[i] |= src->s6_addr8[i] & mask->s6_addr8[i];
+ }
+}
+
+static int
+nptv6_rewrite_internal(struct nptv6_cfg *cfg, struct mbuf **mp, int offset)
+{
+ struct in6_addr *addr;
+ struct ip6_hdr *ip6;
+ int idx, proto;
+ uint16_t adj;
+
+ ip6 = mtodo(*mp, offset);
+ NPTV6_IPDEBUG("offset %d, %s -> %s %d", offset,
+ inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)),
+ inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)),
+ ip6->ip6_nxt);
+ if (offset == 0)
+ addr = &ip6->ip6_src;
+ else {
+ /*
+		 * When rewriting the inner IPv6 header, we need to rewrite
+		 * the destination address back to the external prefix. The
+		 * datagram in the ICMPv6 payload should look as if it was
+		 * sent from the external prefix.
+ */
+ addr = &ip6->ip6_dst;
+ }
+ idx = nptv6_search_index(cfg, addr);
+ if (idx < 0) {
+ /*
+ * Do not send ICMPv6 error when offset isn't zero.
+ * This means we are rewriting inner IPv6 header in the
+ * ICMPv6 error message.
+ */
+ if (offset == 0) {
+ icmp6_error2(*mp, ICMP6_DST_UNREACH,
+ ICMP6_DST_UNREACH_ADDR, 0, (*mp)->m_pkthdr.rcvif);
+ *mp = NULL;
+ }
+ return (IP_FW_DENY);
+ }
+ adj = addr->s6_addr16[idx];
+ nptv6_copy_addr(&cfg->external, addr, &cfg->mask);
+ adj = cksum_add(adj, cfg->adjustment);
+ if (adj == 0xffff)
+ adj = 0;
+ addr->s6_addr16[idx] = adj;
+ if (offset == 0) {
+ /*
+ * We may need to translate addresses in the inner IPv6
+ * header for ICMPv6 error messages.
+ */
+ proto = nptv6_getlasthdr(cfg, *mp, &offset);
+ if (proto < 0 || (proto == IPPROTO_ICMPV6 &&
+ nptv6_translate_icmpv6(cfg, mp, offset) != 0))
+ return (IP_FW_DENY);
+ NPTV6STAT_INC(cfg, in2ex);
+ }
+ return (0);
+}
+
+static int
+nptv6_rewrite_external(struct nptv6_cfg *cfg, struct mbuf **mp, int offset)
+{
+ struct in6_addr *addr;
+ struct ip6_hdr *ip6;
+ int idx, proto;
+ uint16_t adj;
+
+ ip6 = mtodo(*mp, offset);
+ NPTV6_IPDEBUG("offset %d, %s -> %s %d", offset,
+ inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)),
+ inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)),
+ ip6->ip6_nxt);
+ if (offset == 0)
+ addr = &ip6->ip6_dst;
+ else {
+ /*
+		 * When rewriting the inner IPv6 header, we need to rewrite
+		 * the source address back to the internal prefix. The
+		 * datagram in the ICMPv6 payload should look as if it was
+		 * sent from the internal prefix.
+ */
+ addr = &ip6->ip6_src;
+ }
+ idx = nptv6_search_index(cfg, addr);
+ if (idx < 0) {
+ /*
+ * Do not send ICMPv6 error when offset isn't zero.
+ * This means we are rewriting inner IPv6 header in the
+ * ICMPv6 error message.
+ */
+ if (offset == 0) {
+ icmp6_error2(*mp, ICMP6_DST_UNREACH,
+ ICMP6_DST_UNREACH_ADDR, 0, (*mp)->m_pkthdr.rcvif);
+ *mp = NULL;
+ }
+ return (IP_FW_DENY);
+ }
+ adj = addr->s6_addr16[idx];
+ nptv6_copy_addr(&cfg->internal, addr, &cfg->mask);
+ adj = cksum_add(adj, ~cfg->adjustment);
+ if (adj == 0xffff)
+ adj = 0;
+ addr->s6_addr16[idx] = adj;
+ if (offset == 0) {
+ /*
+ * We may need to translate addresses in the inner IPv6
+ * header for ICMPv6 error messages.
+ */
+ proto = nptv6_getlasthdr(cfg, *mp, &offset);
+ if (proto < 0 || (proto == IPPROTO_ICMPV6 &&
+ nptv6_translate_icmpv6(cfg, mp, offset) != 0))
+ return (IP_FW_DENY);
+ NPTV6STAT_INC(cfg, ex2in);
+ }
+ return (0);
+}
+
+/*
+ * ipfw external action handler.
+ */
+static int
+ipfw_nptv6(struct ip_fw_chain *chain, struct ip_fw_args *args,
+ ipfw_insn *cmd, int *done)
+{
+ struct ip6_hdr *ip6;
+ struct nptv6_cfg *cfg;
+ ipfw_insn *icmd;
+ int ret;
+
+ *done = 0; /* try next rule if not matched */
+ icmd = cmd + 1;
+ if (cmd->opcode != O_EXTERNAL_ACTION ||
+ cmd->arg1 != V_nptv6_eid ||
+ icmd->opcode != O_EXTERNAL_INSTANCE ||
+ (cfg = NPTV6_LOOKUP(chain, icmd)) == NULL)
+ return (0);
+ /*
+	 * We need to act as a router, so do nothing when forwarding
+	 * is disabled.
+ */
+ if (V_ip6_forwarding == 0 || args->f_id.addr_type != 6)
+ return (0);
+ /*
+	 * NOTE: we expect that ipfw_chk() has done m_pullup() up to the
+	 * upper level protocol's headers. We also skip some checks that
+	 * ip6_input(), ip6_forward(), ip6_fastfwd() and ipfw_chk() have
+	 * already done.
+ */
+ ret = IP_FW_DENY;
+ ip6 = mtod(args->m, struct ip6_hdr *);
+ NPTV6_IPDEBUG("eid %u, oid %u, %s -> %s %d",
+ cmd->arg1, icmd->arg1,
+ inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)),
+ inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)),
+ ip6->ip6_nxt);
+ if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_src,
+ &cfg->internal, &cfg->mask)) {
+ /*
+ * XXX: Do not translate packets when both src and dst
+ * are from internal prefix.
+ */
+ if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_dst,
+ &cfg->internal, &cfg->mask))
+ return (0);
+ ret = nptv6_rewrite_internal(cfg, &args->m, 0);
+ } else if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_dst,
+ &cfg->external, &cfg->mask))
+ ret = nptv6_rewrite_external(cfg, &args->m, 0);
+ else
+ return (0);
+ /*
+	 * If the address was not rewritten, free the mbuf.
+ */
+ if (ret != 0) {
+ if (args->m != NULL) {
+ m_freem(args->m);
+ args->m = NULL; /* mark mbuf as consumed */
+ }
+ NPTV6STAT_INC(cfg, dropped);
+ }
+ /* Terminate the search if one_pass is set */
+ *done = V_fw_one_pass;
+ /* Update args->f_id when one_pass is off */
+ if (*done == 0 && ret == 0) {
+ ip6 = mtod(args->m, struct ip6_hdr *);
+ args->f_id.src_ip6 = ip6->ip6_src;
+ args->f_id.dst_ip6 = ip6->ip6_dst;
+ }
+ return (ret);
+}
+
+static struct nptv6_cfg *
+nptv6_alloc_config(const char *name, uint8_t set)
+{
+ struct nptv6_cfg *cfg;
+
+ cfg = malloc(sizeof(struct nptv6_cfg), M_IPFW, M_WAITOK | M_ZERO);
+ COUNTER_ARRAY_ALLOC(cfg->stats, NPTV6STATS, M_WAITOK);
+ cfg->no.name = cfg->name;
+ cfg->no.etlv = IPFW_TLV_NPTV6_NAME;
+ cfg->no.set = set;
+ strlcpy(cfg->name, name, sizeof(cfg->name));
+ return (cfg);
+}
+
+static void
+nptv6_free_config(struct nptv6_cfg *cfg)
+{
+
+ COUNTER_ARRAY_FREE(cfg->stats, NPTV6STATS);
+ free(cfg, M_IPFW);
+}
+
+static void
+nptv6_export_config(struct ip_fw_chain *ch, struct nptv6_cfg *cfg,
+ ipfw_nptv6_cfg *uc)
+{
+
+ uc->internal = cfg->internal;
+ uc->external = cfg->external;
+ uc->plen = cfg->plen;
+ uc->flags = cfg->flags & NPTV6_FLAGSMASK;
+ uc->set = cfg->no.set;
+ strlcpy(uc->name, cfg->no.name, sizeof(uc->name));
+}
+
+struct nptv6_dump_arg {
+ struct ip_fw_chain *ch;
+ struct sockopt_data *sd;
+};
+
+static int
+export_config_cb(struct namedobj_instance *ni, struct named_object *no,
+ void *arg)
+{
+ struct nptv6_dump_arg *da = (struct nptv6_dump_arg *)arg;
+ ipfw_nptv6_cfg *uc;
+
+ uc = (ipfw_nptv6_cfg *)ipfw_get_sopt_space(da->sd, sizeof(*uc));
+ nptv6_export_config(da->ch, (struct nptv6_cfg *)no, uc);
+ return (0);
+}
+
+static struct nptv6_cfg *
+nptv6_find(struct namedobj_instance *ni, const char *name, uint8_t set)
+{
+ struct nptv6_cfg *cfg;
+
+ cfg = (struct nptv6_cfg *)ipfw_objhash_lookup_name_type(ni, set,
+ IPFW_TLV_NPTV6_NAME, name);
+
+ return (cfg);
+}
+
+static void
+nptv6_calculate_adjustment(struct nptv6_cfg *cfg)
+{
+ uint16_t i, e;
+ uint16_t *p;
+
+ /* Calculate checksum of internal prefix */
+ for (i = 0, p = (uint16_t *)&cfg->internal;
+ p < (uint16_t *)(&cfg->internal + 1); p++)
+ i = cksum_add(i, *p);
+
+ /* Calculate checksum of external prefix */
+ for (e = 0, p = (uint16_t *)&cfg->external;
+ p < (uint16_t *)(&cfg->external + 1); p++)
+ e = cksum_add(e, *p);
+
+ /* Adjustment value for Int->Ext direction */
+ cfg->adjustment = cksum_add(~e, i);
+}
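Because upper-layer checksums (TCP, UDP, ICMPv6) cover the IPv6 addresses through the pseudo-header, replacing the internal prefix (16-bit one's-complement sum i) with the external prefix (sum e) would normally invalidate them. The adjustment computed above, cksum_add(~e, i), is later folded into one 16-bit word of the IID by the rewrite paths, so the total address sum is preserved and no transport checksum has to be touched. A standalone sketch of that arithmetic, with a local ones_add() standing in for the kernel's cksum_add() (assumed to be 16-bit one's-complement addition with the carry folded back in):

#include <assert.h>
#include <stdint.h>

static uint16_t
ones_add(uint16_t a, uint16_t b)
{
	uint32_t sum = (uint32_t)a + b;

	return ((uint16_t)((sum & 0xffff) + (sum >> 16)));
}

int
main(void)
{
	uint16_t i = 0x1234;	/* one's-complement sum of internal prefix */
	uint16_t e = 0xabcd;	/* one's-complement sum of external prefix */
	uint16_t word = 0x0f0f;	/* a 16-bit word of the IID */
	uint16_t adjustment = ones_add((uint16_t)~e, i);

	/*
	 * Old contribution to the address sum: i + word.
	 * New contribution: e + (word + adjustment).
	 * For these values the two sums are bit-for-bit equal; in general
	 * they can differ only in the 0x0000/0xffff encoding of zero, so
	 * checksums covering the address remain valid after the rewrite.
	 */
	assert(ones_add(i, word) == ones_add(e, ones_add(word, adjustment)));
	return (0);
}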
+
+/*
+ * Creates new NPTv6 instance.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ipfw_nptv6_cfg ]
+ *
+ * Returns 0 on success
+ */
+static int
+nptv6_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ struct in6_addr mask;
+ ipfw_obj_lheader *olh;
+ ipfw_nptv6_cfg *uc;
+ struct namedobj_instance *ni;
+ struct nptv6_cfg *cfg;
+
+ if (sd->valsize != sizeof(*olh) + sizeof(*uc))
+ return (EINVAL);
+
+ olh = (ipfw_obj_lheader *)sd->kbuf;
+ uc = (ipfw_nptv6_cfg *)(olh + 1);
+ if (ipfw_check_object_name_generic(uc->name) != 0)
+ return (EINVAL);
+ if (uc->plen < 8 || uc->plen > 64 || uc->set >= IPFW_MAX_SETS)
+ return (EINVAL);
+ if (IN6_IS_ADDR_MULTICAST(&uc->internal) ||
+ IN6_IS_ADDR_MULTICAST(&uc->external) ||
+ IN6_IS_ADDR_UNSPECIFIED(&uc->internal) ||
+ IN6_IS_ADDR_UNSPECIFIED(&uc->external) ||
+ IN6_IS_ADDR_LINKLOCAL(&uc->internal) ||
+ IN6_IS_ADDR_LINKLOCAL(&uc->external))
+ return (EINVAL);
+ in6_prefixlen2mask(&mask, uc->plen);
+ if (IN6_ARE_MASKED_ADDR_EQUAL(&uc->internal, &uc->external, &mask))
+ return (EINVAL);
+
+ ni = CHAIN_TO_SRV(ch);
+ IPFW_UH_RLOCK(ch);
+ if (nptv6_find(ni, uc->name, uc->set) != NULL) {
+ IPFW_UH_RUNLOCK(ch);
+ return (EEXIST);
+ }
+ IPFW_UH_RUNLOCK(ch);
+
+ cfg = nptv6_alloc_config(uc->name, uc->set);
+ cfg->plen = uc->plen;
+ if (cfg->plen <= 48)
+ cfg->flags |= NPTV6_48PLEN;
+ cfg->internal = uc->internal;
+ cfg->external = uc->external;
+ cfg->mask = mask;
+ IN6_MASK_ADDR(&cfg->internal, &mask);
+ IN6_MASK_ADDR(&cfg->external, &mask);
+ nptv6_calculate_adjustment(cfg);
+
+ IPFW_UH_WLOCK(ch);
+ if (ipfw_objhash_alloc_idx(ni, &cfg->no.kidx) != 0) {
+ IPFW_UH_WUNLOCK(ch);
+ nptv6_free_config(cfg);
+ return (ENOSPC);
+ }
+ ipfw_objhash_add(ni, &cfg->no);
+ IPFW_WLOCK(ch);
+ SRV_OBJECT(ch, cfg->no.kidx) = cfg;
+ IPFW_WUNLOCK(ch);
+ IPFW_UH_WUNLOCK(ch);
+ return (0);
+}
+
+/*
+ * Destroys NPTv6 instance.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ]
+ *
+ * Returns 0 on success
+ */
+static int
+nptv6_destroy(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_header *oh;
+ struct nptv6_cfg *cfg;
+
+ if (sd->valsize != sizeof(*oh))
+ return (EINVAL);
+
+ oh = (ipfw_obj_header *)sd->kbuf;
+ if (ipfw_check_object_name_generic(oh->ntlv.name) != 0)
+ return (EINVAL);
+
+ IPFW_UH_WLOCK(ch);
+ cfg = nptv6_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (ESRCH);
+ }
+ if (cfg->no.refcnt > 0) {
+ IPFW_UH_WUNLOCK(ch);
+ return (EBUSY);
+ }
+
+ IPFW_WLOCK(ch);
+ SRV_OBJECT(ch, cfg->no.kidx) = NULL;
+ IPFW_WUNLOCK(ch);
+
+ ipfw_objhash_del(CHAIN_TO_SRV(ch), &cfg->no);
+ ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), cfg->no.kidx);
+ IPFW_UH_WUNLOCK(ch);
+
+ nptv6_free_config(cfg);
+ return (0);
+}
+
+/*
+ * Get or change nptv6 instance config.
+ * Request: [ ipfw_obj_header [ ipfw_nptv6_cfg ] ]
+ */
+static int
+nptv6_config(struct ip_fw_chain *chain, ip_fw3_opheader *op,
+ struct sockopt_data *sd)
+{
+
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Lists all NPTv6 instances currently available in kernel.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ]
+ * Reply: [ ipfw_obj_lheader ipfw_nptv6_cfg x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+nptv6_list(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+ struct sockopt_data *sd)
+{
+ ipfw_obj_lheader *olh;
+ struct nptv6_dump_arg da;
+
+ /* Check minimum header size */
+ if (sd->valsize < sizeof(ipfw_obj_lheader))
+ return (EINVAL);
+
+ olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh));
+
+ IPFW_UH_RLOCK(ch);
+ olh->count = ipfw_objhash_count_type(CHAIN_TO_SRV(ch),
+ IPFW_TLV_NPTV6_NAME);
+ olh->objsize = sizeof(ipfw_nptv6_cfg);
+ olh->size = sizeof(*olh) + olh->count * olh->objsize;
+
+ if (sd->valsize < olh->size) {
+ IPFW_UH_RUNLOCK(ch);
+ return (ENOMEM);
+ }
+ memset(&da, 0, sizeof(da));
+ da.ch = ch;
+ da.sd = sd;
+ ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), export_config_cb,
+ &da, IPFW_TLV_NPTV6_NAME);
+ IPFW_UH_RUNLOCK(ch);
+
+ return (0);
+}
+
+#define __COPY_STAT_FIELD(_cfg, _stats, _field) \
+ (_stats)->_field = NPTV6STAT_FETCH(_cfg, _field)
+static void
+export_stats(struct ip_fw_chain *ch, struct nptv6_cfg *cfg,
+ struct ipfw_nptv6_stats *stats)
+{
+
+ __COPY_STAT_FIELD(cfg, stats, in2ex);
+ __COPY_STAT_FIELD(cfg, stats, ex2in);
+ __COPY_STAT_FIELD(cfg, stats, dropped);
+}
+
+/*
+ * Get NPTv6 statistics.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ]
+ * Reply: [ ipfw_obj_header ipfw_obj_ctlv [ uint64_t x N ]]
+ *
+ * Returns 0 on success
+ */
+static int
+nptv6_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op,
+ struct sockopt_data *sd)
+{
+ struct ipfw_nptv6_stats stats;
+ struct nptv6_cfg *cfg;
+ ipfw_obj_header *oh;
+ ipfw_obj_ctlv *ctlv;
+ size_t sz;
+
+ sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) + sizeof(stats);
+ if (sd->valsize % sizeof(uint64_t))
+ return (EINVAL);
+ if (sd->valsize < sz)
+ return (ENOMEM);
+ oh = (ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
+ if (oh == NULL)
+ return (EINVAL);
+ if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 ||
+ oh->ntlv.set >= IPFW_MAX_SETS)
+ return (EINVAL);
+ memset(&stats, 0, sizeof(stats));
+
+ IPFW_UH_RLOCK(ch);
+ cfg = nptv6_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_RUNLOCK(ch);
+ return (ESRCH);
+ }
+ export_stats(ch, cfg, &stats);
+ IPFW_UH_RUNLOCK(ch);
+
+ ctlv = (ipfw_obj_ctlv *)(oh + 1);
+ memset(ctlv, 0, sizeof(*ctlv));
+ ctlv->head.type = IPFW_TLV_COUNTERS;
+ ctlv->head.length = sz - sizeof(ipfw_obj_header);
+ ctlv->count = sizeof(stats) / sizeof(uint64_t);
+ ctlv->objsize = sizeof(uint64_t);
+ ctlv->version = 1;
+ memcpy(ctlv + 1, &stats, sizeof(stats));
+ return (0);
+}
+
+/*
+ * Reset NPTv6 statistics.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ]
+ *
+ * Returns 0 on success
+ */
+static int
+nptv6_reset_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op,
+ struct sockopt_data *sd)
+{
+ struct nptv6_cfg *cfg;
+ ipfw_obj_header *oh;
+
+ if (sd->valsize != sizeof(*oh))
+ return (EINVAL);
+ oh = (ipfw_obj_header *)sd->kbuf;
+ if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 ||
+ oh->ntlv.set >= IPFW_MAX_SETS)
+ return (EINVAL);
+
+ IPFW_UH_WLOCK(ch);
+ cfg = nptv6_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set);
+ if (cfg == NULL) {
+ IPFW_UH_WUNLOCK(ch);
+ return (ESRCH);
+ }
+ COUNTER_ARRAY_ZERO(cfg->stats, NPTV6STATS);
+ IPFW_UH_WUNLOCK(ch);
+ return (0);
+}
+
+static struct ipfw_sopt_handler scodes[] = {
+ { IP_FW_NPTV6_CREATE, 0, HDIR_SET, nptv6_create },
+ { IP_FW_NPTV6_DESTROY,0, HDIR_SET, nptv6_destroy },
+ { IP_FW_NPTV6_CONFIG, 0, HDIR_BOTH, nptv6_config },
+ { IP_FW_NPTV6_LIST, 0, HDIR_GET, nptv6_list },
+ { IP_FW_NPTV6_STATS, 0, HDIR_GET, nptv6_stats },
+ { IP_FW_NPTV6_RESET_STATS,0, HDIR_SET, nptv6_reset_stats },
+};
+
+static int
+nptv6_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
+{
+ ipfw_insn *icmd;
+
+ icmd = cmd - 1;
+ NPTV6_DEBUG("opcode %d, arg1 %d, opcode0 %d, arg1 %d",
+ cmd->opcode, cmd->arg1, icmd->opcode, icmd->arg1);
+ if (icmd->opcode != O_EXTERNAL_ACTION ||
+ icmd->arg1 != V_nptv6_eid)
+ return (1);
+
+ *puidx = cmd->arg1;
+ *ptype = 0;
+ return (0);
+}
+
+static void
+nptv6_update_arg1(ipfw_insn *cmd, uint16_t idx)
+{
+
+ cmd->arg1 = idx;
+ NPTV6_DEBUG("opcode %d, arg1 -> %d", cmd->opcode, cmd->arg1);
+}
+
+static int
+nptv6_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
+ struct named_object **pno)
+{
+ int err;
+
+ err = ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti,
+ IPFW_TLV_NPTV6_NAME, pno);
+ NPTV6_DEBUG("uidx %u, type %u, err %d", ti->uidx, ti->type, err);
+ return (err);
+}
+
+static struct named_object *
+nptv6_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
+{
+ struct namedobj_instance *ni;
+ struct named_object *no;
+
+ IPFW_UH_WLOCK_ASSERT(ch);
+ ni = CHAIN_TO_SRV(ch);
+ no = ipfw_objhash_lookup_kidx(ni, idx);
+ KASSERT(no != NULL, ("NPT with index %d not found", idx));
+
+ NPTV6_DEBUG("kidx %u -> %s", idx, no->name);
+ return (no);
+}
+
+static int
+nptv6_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set,
+ enum ipfw_sets_cmd cmd)
+{
+
+ return (ipfw_obj_manage_sets(CHAIN_TO_SRV(ch), IPFW_TLV_NPTV6_NAME,
+ set, new_set, cmd));
+}
+
+static struct opcode_obj_rewrite opcodes[] = {
+ {
+ .opcode = O_EXTERNAL_INSTANCE,
+ .etlv = IPFW_TLV_EACTION /* just show it isn't table */,
+ .classifier = nptv6_classify,
+ .update = nptv6_update_arg1,
+ .find_byname = nptv6_findbyname,
+ .find_bykidx = nptv6_findbykidx,
+ .manage_sets = nptv6_manage_sets,
+ },
+};
+
+static int
+destroy_config_cb(struct namedobj_instance *ni, struct named_object *no,
+ void *arg)
+{
+ struct nptv6_cfg *cfg;
+ struct ip_fw_chain *ch;
+
+ ch = (struct ip_fw_chain *)arg;
+ IPFW_UH_WLOCK_ASSERT(ch);
+
+ cfg = (struct nptv6_cfg *)SRV_OBJECT(ch, no->kidx);
+ SRV_OBJECT(ch, no->kidx) = NULL;
+ ipfw_objhash_del(ni, &cfg->no);
+ ipfw_objhash_free_idx(ni, cfg->no.kidx);
+ nptv6_free_config(cfg);
+ return (0);
+}
+
+int
+nptv6_init(struct ip_fw_chain *ch, int first)
+{
+
+ V_nptv6_eid = ipfw_add_eaction(ch, ipfw_nptv6, "nptv6");
+ if (V_nptv6_eid == 0)
+ return (ENXIO);
+ IPFW_ADD_SOPT_HANDLER(first, scodes);
+ IPFW_ADD_OBJ_REWRITER(first, opcodes);
+ return (0);
+}
+
+void
+nptv6_uninit(struct ip_fw_chain *ch, int last)
+{
+
+ IPFW_DEL_OBJ_REWRITER(last, opcodes);
+ IPFW_DEL_SOPT_HANDLER(last, scodes);
+ ipfw_del_eaction(ch, V_nptv6_eid);
+ /*
+	 * Since we have already deregistered the external action,
+	 * our named objects become inaccessible via rules, because
+	 * all rules were truncated by ipfw_del_eaction().
+	 * So we can unlink and destroy our named objects without holding
+ * IPFW_WLOCK().
+ */
+ IPFW_UH_WLOCK(ch);
+ ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), destroy_config_cb, ch,
+ IPFW_TLV_NPTV6_NAME);
+ V_nptv6_eid = 0;
+ IPFW_UH_WUNLOCK(ch);
+}
+
diff --git a/freebsd/sys/netpfil/ipfw/nptv6/nptv6.h b/freebsd/sys/netpfil/ipfw/nptv6/nptv6.h
new file mode 100644
index 00000000..95b04bfe
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nptv6/nptv6.h
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2016 Yandex LLC
+ * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_FW_NPTV6_H_
+#define _IP_FW_NPTV6_H_
+
+#include <netinet6/ip_fw_nptv6.h>
+
+#ifdef _KERNEL
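+/*
+ * Per-instance statistics are an array of counter(9) counters, one for
+ * each uint64_t field of struct ipfw_nptv6_stats; e.g. a field at byte
+ * offset 16 maps to stats[2].
+ */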
+#define NPTV6STATS (sizeof(struct ipfw_nptv6_stats) / sizeof(uint64_t))
+#define NPTV6STAT_ADD(c, f, v) \
+ counter_u64_add((c)->stats[ \
+ offsetof(struct ipfw_nptv6_stats, f) / sizeof(uint64_t)], (v))
+#define NPTV6STAT_INC(c, f) NPTV6STAT_ADD(c, f, 1)
+#define NPTV6STAT_FETCH(c, f) \
+ counter_u64_fetch((c)->stats[ \
+ offsetof(struct ipfw_nptv6_stats, f) / sizeof(uint64_t)])
+
+struct nptv6_cfg {
+ struct named_object no;
+
+ struct in6_addr internal; /* Internal IPv6 prefix */
+ struct in6_addr external; /* External IPv6 prefix */
+ struct in6_addr mask; /* IPv6 prefix mask */
+ uint16_t adjustment; /* Checksum adjustment value */
+ uint8_t plen; /* Prefix length */
+ uint8_t flags; /* Flags for internal use */
+#define NPTV6_48PLEN 0x0001
+ char name[64]; /* Instance name */
+ counter_u64_t stats[NPTV6STATS]; /* Statistics counters */
+};
+#define NPTV6_FLAGSMASK 0
+
+int nptv6_init(struct ip_fw_chain *ch, int first);
+void nptv6_uninit(struct ip_fw_chain *ch, int last);
+#endif /* _KERNEL */
+
+#endif /* _IP_FW_NPTV6_H_ */
+
diff --git a/freebsd/sys/netpfil/pf/if_pflog.c b/freebsd/sys/netpfil/pf/if_pflog.c
new file mode 100644
index 00000000..3a364abc
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/if_pflog.c
@@ -0,0 +1,320 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * The authors of this code are John Ioannidis (ji@tla.org),
+ * Angelos D. Keromytis (kermit@csd.uch.gr) and
+ * Niels Provos (provos@physnet.uni-hamburg.de).
+ *
+ * This code was written by John Ioannidis for BSD/OS in Athens, Greece,
+ * in November 1995.
+ *
+ * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
+ * by Angelos D. Keromytis.
+ *
+ * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
+ * and Niels Provos.
+ *
+ * Copyright (C) 1995, 1996, 1997, 1998 by John Ioannidis, Angelos D. Keromytis
+ * and Niels Provos.
+ * Copyright (c) 2001, Angelos D. Keromytis, Niels Provos.
+ *
+ * Permission to use, copy, and modify this software with or without fee
+ * is hereby granted, provided that this entire notice is included in
+ * all copies of any software which is or includes a copy or
+ * modification of this software.
+ * You may use this code under the GNU public license if you so wish. Please
+ * contribute changes back to the authors under this freer than GPL license
+ * so that we may further the use of strong encryption without limitations to
+ * all.
+ *
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
+ * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
+ * PURPOSE.
+ *
+ * $OpenBSD: if_pflog.c,v 1.26 2007/10/18 21:58:18 mpf Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/local/opt_inet.h>
+#include <rtems/bsd/local/opt_inet6.h>
+#include <rtems/bsd/local/opt_bpf.h>
+#include <rtems/bsd/local/opt_pf.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+
+#include <net/bpf.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_clone.h>
+#include <net/if_pflog.h>
+#include <net/if_types.h>
+#include <net/vnet.h>
+#include <net/pfvar.h>
+
+#if defined(INET) || defined(INET6)
+#include <netinet/in.h>
+#endif
+#ifdef INET
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#endif
+
+#ifdef INET6
+#include <netinet6/in6_var.h>
+#include <netinet6/nd6.h>
+#endif /* INET6 */
+
+#ifdef INET
+#include <machine/in_cksum.h>
+#endif /* INET */
+
+#define PFLOGMTU (32768 + MHLEN + MLEN)
+
+#ifdef PFLOGDEBUG
+#define DPRINTF(x) do { if (pflogdebug) printf x ; } while (0)
+#else
+#define DPRINTF(x)
+#endif
+
+static int pflogoutput(struct ifnet *, struct mbuf *,
+ const struct sockaddr *, struct route *);
+static void pflogattach(int);
+static int pflogioctl(struct ifnet *, u_long, caddr_t);
+static void pflogstart(struct ifnet *);
+static int pflog_clone_create(struct if_clone *, int, caddr_t);
+static void pflog_clone_destroy(struct ifnet *);
+
+static const char pflogname[] = "pflog";
+
+static VNET_DEFINE(struct if_clone *, pflog_cloner);
+#define V_pflog_cloner VNET(pflog_cloner)
+
+VNET_DEFINE(struct ifnet *, pflogifs[PFLOGIFS_MAX]); /* for fast access */
+#define V_pflogifs VNET(pflogifs)
+
+static void
+pflogattach(int npflog __unused)
+{
+ int i;
+ for (i = 0; i < PFLOGIFS_MAX; i++)
+ V_pflogifs[i] = NULL;
+ V_pflog_cloner = if_clone_simple(pflogname, pflog_clone_create,
+ pflog_clone_destroy, 1);
+}
+
+static int
+pflog_clone_create(struct if_clone *ifc, int unit, caddr_t param)
+{
+ struct ifnet *ifp;
+
+ if (unit >= PFLOGIFS_MAX)
+ return (EINVAL);
+
+ ifp = if_alloc(IFT_PFLOG);
+ if (ifp == NULL) {
+ return (ENOSPC);
+ }
+ if_initname(ifp, pflogname, unit);
+ ifp->if_mtu = PFLOGMTU;
+ ifp->if_ioctl = pflogioctl;
+ ifp->if_output = pflogoutput;
+ ifp->if_start = pflogstart;
+ ifp->if_snd.ifq_maxlen = ifqmaxlen;
+ ifp->if_hdrlen = PFLOG_HDRLEN;
+ if_attach(ifp);
+
+ bpfattach(ifp, DLT_PFLOG, PFLOG_HDRLEN);
+
+ V_pflogifs[unit] = ifp;
+
+ return (0);
+}
+
+static void
+pflog_clone_destroy(struct ifnet *ifp)
+{
+ int i;
+
+ for (i = 0; i < PFLOGIFS_MAX; i++)
+ if (V_pflogifs[i] == ifp)
+ V_pflogifs[i] = NULL;
+
+ bpfdetach(ifp);
+ if_detach(ifp);
+ if_free(ifp);
+}
+
+/*
+ * Start output on the pflog interface.
+ */
+static void
+pflogstart(struct ifnet *ifp)
+{
+ struct mbuf *m;
+
+ for (;;) {
+ IF_LOCK(&ifp->if_snd);
+ _IF_DEQUEUE(&ifp->if_snd, m);
+ IF_UNLOCK(&ifp->if_snd);
+
+ if (m == NULL)
+ return;
+ else
+ m_freem(m);
+ }
+}
+
+static int
+pflogoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
+ struct route *rt)
+{
+ m_freem(m);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+pflogioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+ switch (cmd) {
+ case SIOCSIFFLAGS:
+ if (ifp->if_flags & IFF_UP)
+ ifp->if_drv_flags |= IFF_DRV_RUNNING;
+ else
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ break;
+ default:
+ return (ENOTTY);
+ }
+
+ return (0);
+}
+
+static int
+pflog_packet(struct pfi_kif *kif, struct mbuf *m, sa_family_t af, u_int8_t dir,
+ u_int8_t reason, struct pf_rule *rm, struct pf_rule *am,
+ struct pf_ruleset *ruleset, struct pf_pdesc *pd, int lookupsafe)
+{
+ struct ifnet *ifn;
+ struct pfloghdr hdr;
+
+ if (kif == NULL || m == NULL || rm == NULL || pd == NULL)
+		return (1);
+
+ if ((ifn = V_pflogifs[rm->logif]) == NULL || !ifn->if_bpf)
+ return (0);
+
+ bzero(&hdr, sizeof(hdr));
+ hdr.length = PFLOG_REAL_HDRLEN;
+ hdr.af = af;
+ hdr.action = rm->action;
+ hdr.reason = reason;
+ memcpy(hdr.ifname, kif->pfik_name, sizeof(hdr.ifname));
+
+ if (am == NULL) {
+ hdr.rulenr = htonl(rm->nr);
+ hdr.subrulenr = 1;
+ } else {
+ hdr.rulenr = htonl(am->nr);
+ hdr.subrulenr = htonl(rm->nr);
+ if (ruleset != NULL && ruleset->anchor != NULL)
+ strlcpy(hdr.ruleset, ruleset->anchor->name,
+ sizeof(hdr.ruleset));
+ }
+ /*
+ * XXXGL: we avoid pf_socket_lookup() when we are holding
+ * state lock, since this leads to unsafe LOR.
+	 * These conditions are very rare, however.
+ */
+ if (rm->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done && lookupsafe)
+ pd->lookup.done = pf_socket_lookup(dir, pd, m);
+ if (pd->lookup.done > 0)
+ hdr.uid = pd->lookup.uid;
+ else
+ hdr.uid = UID_MAX;
+ hdr.pid = NO_PID;
+ hdr.rule_uid = rm->cuid;
+ hdr.rule_pid = rm->cpid;
+ hdr.dir = dir;
+
+#ifdef INET
+ if (af == AF_INET && dir == PF_OUT) {
+ struct ip *ip;
+
+ ip = mtod(m, struct ip *);
+ ip->ip_sum = 0;
+ ip->ip_sum = in_cksum(m, ip->ip_hl << 2);
+ }
+#endif /* INET */
+
+ if_inc_counter(ifn, IFCOUNTER_OPACKETS, 1);
+ if_inc_counter(ifn, IFCOUNTER_OBYTES, m->m_pkthdr.len);
+ BPF_MTAP2(ifn, &hdr, PFLOG_HDRLEN, m);
+
+ return (0);
+}
+
+static void
+vnet_pflog_init(const void *unused __unused)
+{
+
+ pflogattach(1);
+}
+VNET_SYSINIT(vnet_pflog_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY,
+ vnet_pflog_init, NULL);
+
+static void
+vnet_pflog_uninit(const void *unused __unused)
+{
+
+ if_clone_detach(V_pflog_cloner);
+}
+/*
+ * Detach after pf is gone; otherwise we might touch pflog memory
+ * from within pf after freeing pflog.
+ */
+VNET_SYSUNINIT(vnet_pflog_uninit, SI_SUB_INIT_IF, SI_ORDER_SECOND,
+ vnet_pflog_uninit, NULL);
+
+static int
+pflog_modevent(module_t mod, int type, void *data)
+{
+ int error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ PF_RULES_WLOCK();
+ pflog_packet_ptr = pflog_packet;
+ PF_RULES_WUNLOCK();
+ break;
+ case MOD_UNLOAD:
+ PF_RULES_WLOCK();
+ pflog_packet_ptr = NULL;
+ PF_RULES_WUNLOCK();
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return error;
+}
+
+static moduledata_t pflog_mod = { pflogname, pflog_modevent, 0 };
+
+#define PFLOG_MODVER 1
+
+/* Do not run before pf is initialized as we depend on its locks. */
+DECLARE_MODULE(pflog, pflog_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
+MODULE_VERSION(pflog, PFLOG_MODVER);
+MODULE_DEPEND(pflog, pf, PF_MODVER, PF_MODVER, PF_MODVER);
diff --git a/freebsd/sys/netpfil/pf/if_pfsync.c b/freebsd/sys/netpfil/pf/if_pfsync.c
new file mode 100644
index 00000000..d6a0dfc0
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/if_pfsync.c
@@ -0,0 +1,2421 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2002 Michael Shalayeff
+ * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $
+ *
+ * Revisions picked from OpenBSD after revision 1.110 import:
+ * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input()
+ * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
+ * 1.120, 1.175 - use monotonic time_uptime
+ * 1.122 - reduce number of updates for non-TCP sessions
+ * 1.125, 1.127 - rewrite merge or stale processing
+ * 1.128 - cleanups
+ * 1.146 - bzero() mbuf before sparsely filling it with data
+ * 1.170 - SIOCSIFMTU checks
+ * 1.126, 1.142 - deferred packets processing
+ * 1.173 - correct expire time processing
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/local/opt_inet.h>
+#include <rtems/bsd/local/opt_inet6.h>
+#include <rtems/bsd/local/opt_pf.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/bus.h>
+#include <sys/endian.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+
+#include <net/bpf.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_clone.h>
+#include <net/if_types.h>
+#include <net/vnet.h>
+#include <net/pfvar.h>
+#include <net/if_pfsync.h>
+
+#include <netinet/if_ether.h>
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_carp.h>
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+
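+/*
+ * Size of an otherwise empty pfsync packet: IP header, pfsync header
+ * and one subheader; pfsync_sendout() uses that subheader slot for the
+ * trailing EOF record.
+ */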
+#define PFSYNC_MINPKT ( \
+ sizeof(struct ip) + \
+ sizeof(struct pfsync_header) + \
+ sizeof(struct pfsync_subheader) )
+
+struct pfsync_pkt {
+ struct ip *ip;
+ struct in_addr src;
+ u_int8_t flags;
+};
+
+static int pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
+ struct pfsync_state_peer *);
+static int pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);
+
+static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
+ pfsync_in_clr, /* PFSYNC_ACT_CLR */
+ pfsync_in_ins, /* PFSYNC_ACT_INS */
+ pfsync_in_iack, /* PFSYNC_ACT_INS_ACK */
+ pfsync_in_upd, /* PFSYNC_ACT_UPD */
+ pfsync_in_upd_c, /* PFSYNC_ACT_UPD_C */
+ pfsync_in_ureq, /* PFSYNC_ACT_UPD_REQ */
+ pfsync_in_del, /* PFSYNC_ACT_DEL */
+ pfsync_in_del_c, /* PFSYNC_ACT_DEL_C */
+ pfsync_in_error, /* PFSYNC_ACT_INS_F */
+ pfsync_in_error, /* PFSYNC_ACT_DEL_F */
+ pfsync_in_bus, /* PFSYNC_ACT_BUS */
+ pfsync_in_tdb, /* PFSYNC_ACT_TDB */
+ pfsync_in_eof /* PFSYNC_ACT_EOF */
+};
+
+struct pfsync_q {
+ void (*write)(struct pf_state *, void *);
+ size_t len;
+ u_int8_t action;
+};
+
+/* we have one of these for every PFSYNC_S_ */
+static void pfsync_out_state(struct pf_state *, void *);
+static void pfsync_out_iack(struct pf_state *, void *);
+static void pfsync_out_upd_c(struct pf_state *, void *);
+static void pfsync_out_del(struct pf_state *, void *);
+
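+/*
+ * Indexed by PFSYNC_S_* queue id: pfsync_sendout() uses .write to
+ * serialize each queued state, .len to advance the output offset and
+ * .action for the subheader of the resulting message block.
+ */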
+static struct pfsync_q pfsync_qs[] = {
+ { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_INS },
+ { pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
+ { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_UPD },
+ { pfsync_out_upd_c, sizeof(struct pfsync_upd_c), PFSYNC_ACT_UPD_C },
+ { pfsync_out_del, sizeof(struct pfsync_del_c), PFSYNC_ACT_DEL_C }
+};
+
+static void pfsync_q_ins(struct pf_state *, int);
+static void pfsync_q_del(struct pf_state *);
+
+static void pfsync_update_state(struct pf_state *);
+
+struct pfsync_upd_req_item {
+ TAILQ_ENTRY(pfsync_upd_req_item) ur_entry;
+ struct pfsync_upd_req ur_msg;
+};
+
+struct pfsync_deferral {
+ struct pfsync_softc *pd_sc;
+ TAILQ_ENTRY(pfsync_deferral) pd_entry;
+ u_int pd_refs;
+ struct callout pd_tmo;
+
+ struct pf_state *pd_st;
+ struct mbuf *pd_m;
+};
+
+struct pfsync_softc {
+ /* Configuration */
+ struct ifnet *sc_ifp;
+ struct ifnet *sc_sync_if;
+ struct ip_moptions sc_imo;
+ struct in_addr sc_sync_peer;
+ uint32_t sc_flags;
+#define PFSYNCF_OK 0x00000001
+#define PFSYNCF_DEFER 0x00000002
+#define PFSYNCF_PUSH 0x00000004
+ uint8_t sc_maxupdates;
+ struct ip sc_template;
+ struct callout sc_tmo;
+ struct mtx sc_mtx;
+
+ /* Queued data */
+ size_t sc_len;
+ TAILQ_HEAD(, pf_state) sc_qs[PFSYNC_S_COUNT];
+ TAILQ_HEAD(, pfsync_upd_req_item) sc_upd_req_list;
+ TAILQ_HEAD(, pfsync_deferral) sc_deferrals;
+ u_int sc_deferred;
+ void *sc_plus;
+ size_t sc_pluslen;
+
+ /* Bulk update info */
+ struct mtx sc_bulk_mtx;
+ uint32_t sc_ureq_sent;
+ int sc_bulk_tries;
+ uint32_t sc_ureq_received;
+ int sc_bulk_hashid;
+ uint64_t sc_bulk_stateid;
+ uint32_t sc_bulk_creatorid;
+ struct callout sc_bulk_tmo;
+ struct callout sc_bulkfail_tmo;
+};
+
+#define PFSYNC_LOCK(sc) mtx_lock(&(sc)->sc_mtx)
+#define PFSYNC_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx)
+#define PFSYNC_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED)
+
+#define PFSYNC_BLOCK(sc) mtx_lock(&(sc)->sc_bulk_mtx)
+#define PFSYNC_BUNLOCK(sc) mtx_unlock(&(sc)->sc_bulk_mtx)
+#define PFSYNC_BLOCK_ASSERT(sc) mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)
+
+static const char pfsyncname[] = "pfsync";
+static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data");
+static VNET_DEFINE(struct pfsync_softc *, pfsyncif) = NULL;
+#define V_pfsyncif VNET(pfsyncif)
+static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL;
+#define V_pfsync_swi_cookie VNET(pfsync_swi_cookie)
+static VNET_DEFINE(struct pfsyncstats, pfsyncstats);
+#define V_pfsyncstats VNET(pfsyncstats)
+static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW;
+#define V_pfsync_carp_adj VNET(pfsync_carp_adj)
+
+static void pfsync_timeout(void *);
+static void pfsync_push(struct pfsync_softc *);
+static void pfsyncintr(void *);
+static int pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
+ void *);
+static void pfsync_multicast_cleanup(struct pfsync_softc *);
+static void pfsync_pointers_init(void);
+static void pfsync_pointers_uninit(void);
+static int pfsync_init(void);
+static void pfsync_uninit(void);
+
+SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
+SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(pfsyncstats), pfsyncstats,
+ "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
+SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
+ &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");
+
+static int pfsync_clone_create(struct if_clone *, int, caddr_t);
+static void pfsync_clone_destroy(struct ifnet *);
+static int pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
+ struct pf_state_peer *);
+static int pfsyncoutput(struct ifnet *, struct mbuf *,
+ const struct sockaddr *, struct route *);
+static int pfsyncioctl(struct ifnet *, u_long, caddr_t);
+
+static int pfsync_defer(struct pf_state *, struct mbuf *);
+static void pfsync_undefer(struct pfsync_deferral *, int);
+static void pfsync_undefer_state(struct pf_state *, int);
+static void pfsync_defer_tmo(void *);
+
+static void pfsync_request_update(u_int32_t, u_int64_t);
+static void pfsync_update_state_req(struct pf_state *);
+
+static void pfsync_drop(struct pfsync_softc *);
+static void pfsync_sendout(int);
+static void pfsync_send_plus(void *, size_t);
+
+static void pfsync_bulk_start(void);
+static void pfsync_bulk_status(u_int8_t);
+static void pfsync_bulk_update(void *);
+static void pfsync_bulk_fail(void *);
+
+#ifdef IPSEC
+static void pfsync_update_net_tdb(struct pfsync_tdb *);
+#endif
+
+#define PFSYNC_MAX_BULKTRIES 12
+
+VNET_DEFINE(struct if_clone *, pfsync_cloner);
+#define V_pfsync_cloner VNET(pfsync_cloner)
+
+static int
+pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
+{
+ struct pfsync_softc *sc;
+ struct ifnet *ifp;
+ int q;
+
+ if (unit != 0)
+ return (EINVAL);
+
+ sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
+ sc->sc_flags |= PFSYNCF_OK;
+
+ for (q = 0; q < PFSYNC_S_COUNT; q++)
+ TAILQ_INIT(&sc->sc_qs[q]);
+
+ TAILQ_INIT(&sc->sc_upd_req_list);
+ TAILQ_INIT(&sc->sc_deferrals);
+
+ sc->sc_len = PFSYNC_MINPKT;
+ sc->sc_maxupdates = 128;
+
+ ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
+ if (ifp == NULL) {
+ free(sc, M_PFSYNC);
+ return (ENOSPC);
+ }
+ if_initname(ifp, pfsyncname, unit);
+ ifp->if_softc = sc;
+ ifp->if_ioctl = pfsyncioctl;
+ ifp->if_output = pfsyncoutput;
+ ifp->if_type = IFT_PFSYNC;
+ ifp->if_snd.ifq_maxlen = ifqmaxlen;
+ ifp->if_hdrlen = sizeof(struct pfsync_header);
+ ifp->if_mtu = ETHERMTU;
+ mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF);
+ mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
+ callout_init(&sc->sc_tmo, 1);
+ callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
+ callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);
+
+ if_attach(ifp);
+
+ bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
+
+ V_pfsyncif = sc;
+
+ return (0);
+}
+
+static void
+pfsync_clone_destroy(struct ifnet *ifp)
+{
+ struct pfsync_softc *sc = ifp->if_softc;
+
+ /*
+ * At this stage, everything should have already been
+ * cleared by pfsync_uninit(), and we have only to
+ * drain callouts.
+ */
+ while (sc->sc_deferred > 0) {
+ struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);
+
+ TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+ sc->sc_deferred--;
+ if (callout_stop(&pd->pd_tmo) > 0) {
+ pf_release_state(pd->pd_st);
+ m_freem(pd->pd_m);
+ free(pd, M_PFSYNC);
+ } else {
+ pd->pd_refs++;
+ callout_drain(&pd->pd_tmo);
+ free(pd, M_PFSYNC);
+ }
+ }
+
+ callout_drain(&sc->sc_tmo);
+ callout_drain(&sc->sc_bulkfail_tmo);
+ callout_drain(&sc->sc_bulk_tmo);
+
+ if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
+ (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
+ bpfdetach(ifp);
+ if_detach(ifp);
+
+ pfsync_drop(sc);
+
+ if_free(ifp);
+ if (sc->sc_imo.imo_membership)
+ pfsync_multicast_cleanup(sc);
+ mtx_destroy(&sc->sc_mtx);
+ mtx_destroy(&sc->sc_bulk_mtx);
+ free(sc, M_PFSYNC);
+
+ V_pfsyncif = NULL;
+}
+
+static int
+pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
+ struct pf_state_peer *d)
+{
+ if (s->scrub.scrub_flag && d->scrub == NULL) {
+ d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
+ if (d->scrub == NULL)
+ return (ENOMEM);
+ }
+
+ return (0);
+}
+
+
+static int
+pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+#ifndef __NO_STRICT_ALIGNMENT
+ struct pfsync_state_key key[2];
+#endif
+ struct pfsync_state_key *kw, *ks;
+ struct pf_state *st = NULL;
+ struct pf_state_key *skw = NULL, *sks = NULL;
+ struct pf_rule *r = NULL;
+ struct pfi_kif *kif;
+ int error;
+
+ PF_RULES_RASSERT();
+
+ if (sp->creatorid == 0) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("%s: invalid creator id: %08x\n", __func__,
+ ntohl(sp->creatorid));
+ return (EINVAL);
+ }
+
+ if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("%s: unknown interface: %s\n", __func__,
+ sp->ifname);
+ if (flags & PFSYNC_SI_IOCTL)
+ return (EINVAL);
+ return (0); /* skip this state */
+ }
+
+ /*
+ * If the ruleset checksums match or the state is coming from the ioctl,
+ * it's safe to associate the state with the rule of that number.
+ */
+ if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
+ (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
+ pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
+ r = pf_main_ruleset.rules[
+ PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
+ else
+ r = &V_pf_default_rule;
+
+ if ((r->max_states &&
+ counter_u64_fetch(r->states_cur) >= r->max_states))
+ goto cleanup;
+
+ /*
+ * XXXGL: consider M_WAITOK in ioctl path after.
+ */
+ if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
+ goto cleanup;
+
+ if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
+ goto cleanup;
+
+#ifndef __NO_STRICT_ALIGNMENT
+ bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2);
+ kw = &key[PF_SK_WIRE];
+ ks = &key[PF_SK_STACK];
+#else
+ kw = &sp->key[PF_SK_WIRE];
+ ks = &sp->key[PF_SK_STACK];
+#endif
+
+ if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) ||
+ PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) ||
+ kw->port[0] != ks->port[0] ||
+ kw->port[1] != ks->port[1]) {
+ sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
+ if (sks == NULL)
+ goto cleanup;
+ } else
+ sks = skw;
+
+ /* allocate memory for scrub info */
+ if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
+ pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
+ goto cleanup;
+
+ /* Copy to state key(s). */
+ skw->addr[0] = kw->addr[0];
+ skw->addr[1] = kw->addr[1];
+ skw->port[0] = kw->port[0];
+ skw->port[1] = kw->port[1];
+ skw->proto = sp->proto;
+ skw->af = sp->af;
+ if (sks != skw) {
+ sks->addr[0] = ks->addr[0];
+ sks->addr[1] = ks->addr[1];
+ sks->port[0] = ks->port[0];
+ sks->port[1] = ks->port[1];
+ sks->proto = sp->proto;
+ sks->af = sp->af;
+ }
+
+ /* copy to state */
+ bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
+ st->creation = time_uptime - ntohl(sp->creation);
+ st->expire = time_uptime;
+ if (sp->expire) {
+ uint32_t timeout;
+
+ timeout = r->timeout[sp->timeout];
+ if (!timeout)
+ timeout = V_pf_default_rule.timeout[sp->timeout];
+
+ /* sp->expire may have been adaptively scaled by export. */
+ st->expire -= timeout - ntohl(sp->expire);
+ }
+
+ st->direction = sp->direction;
+ st->log = sp->log;
+ st->timeout = sp->timeout;
+ st->state_flags = sp->state_flags;
+
+ st->id = sp->id;
+ st->creatorid = sp->creatorid;
+ pf_state_peer_ntoh(&sp->src, &st->src);
+ pf_state_peer_ntoh(&sp->dst, &st->dst);
+
+ st->rule.ptr = r;
+ st->nat_rule.ptr = NULL;
+ st->anchor.ptr = NULL;
+ st->rt_kif = NULL;
+
+ st->pfsync_time = time_uptime;
+ st->sync_state = PFSYNC_S_NONE;
+
+ if (!(flags & PFSYNC_SI_IOCTL))
+ st->state_flags |= PFSTATE_NOSYNC;
+
+ if ((error = pf_state_insert(kif, skw, sks, st)) != 0)
+ goto cleanup_state;
+
+ /* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
+ counter_u64_add(r->states_cur, 1);
+ counter_u64_add(r->states_tot, 1);
+
+ if (!(flags & PFSYNC_SI_IOCTL)) {
+ st->state_flags &= ~PFSTATE_NOSYNC;
+ if (st->state_flags & PFSTATE_ACK) {
+ pfsync_q_ins(st, PFSYNC_S_IACK);
+ pfsync_push(sc);
+ }
+ }
+ st->state_flags &= ~PFSTATE_ACK;
+ PF_STATE_UNLOCK(st);
+
+ return (0);
+
+cleanup:
+ error = ENOMEM;
+ if (skw == sks)
+ sks = NULL;
+ if (skw != NULL)
+ uma_zfree(V_pf_state_key_z, skw);
+ if (sks != NULL)
+ uma_zfree(V_pf_state_key_z, sks);
+
+cleanup_state: /* pf_state_insert() frees the state keys. */
+ if (st) {
+ if (st->dst.scrub)
+ uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
+ if (st->src.scrub)
+ uma_zfree(V_pf_state_scrub_z, st->src.scrub);
+ uma_zfree(V_pf_state_z, st);
+ }
+ return (error);
+}
+
+static int
+pfsync_input(struct mbuf **mp, int *offp __unused, int proto __unused)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_pkt pkt;
+ struct mbuf *m = *mp;
+ struct ip *ip = mtod(m, struct ip *);
+ struct pfsync_header *ph;
+ struct pfsync_subheader subh;
+
+ int offset, len;
+ int rv;
+ uint16_t count;
+
+ *mp = NULL;
+ V_pfsyncstats.pfsyncs_ipackets++;
+
+ /* Verify that we have a sync interface configured. */
+ if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
+ (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
+ goto done;
+
+ /* verify that the packet came in on the right interface */
+ if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
+ V_pfsyncstats.pfsyncs_badif++;
+ goto done;
+ }
+
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1);
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
+ /* verify that the IP TTL is 255. */
+ if (ip->ip_ttl != PFSYNC_DFLTTL) {
+ V_pfsyncstats.pfsyncs_badttl++;
+ goto done;
+ }
+
+ offset = ip->ip_hl << 2;
+ if (m->m_pkthdr.len < offset + sizeof(*ph)) {
+ V_pfsyncstats.pfsyncs_hdrops++;
+ goto done;
+ }
+
+ if (offset + sizeof(*ph) > m->m_len) {
+ if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
+ V_pfsyncstats.pfsyncs_hdrops++;
+ return (IPPROTO_DONE);
+ }
+ ip = mtod(m, struct ip *);
+ }
+ ph = (struct pfsync_header *)((char *)ip + offset);
+
+ /* verify the version */
+ if (ph->version != PFSYNC_VERSION) {
+ V_pfsyncstats.pfsyncs_badver++;
+ goto done;
+ }
+
+ len = ntohs(ph->len) + offset;
+ if (m->m_pkthdr.len < len) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ goto done;
+ }
+
+ /* Cheaper to grab this now than having to mess with mbufs later */
+ pkt.ip = ip;
+ pkt.src = ip->ip_src;
+ pkt.flags = 0;
+
+ /*
+	 * Trusting pf_chksum during packet processing, as well as searching
+	 * the interface name tree, requires holding PF_RULES_RLOCK().
+ */
+ PF_RULES_RLOCK();
+ if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
+ pkt.flags |= PFSYNC_SI_CKSUM;
+
+ offset += sizeof(*ph);
+ while (offset <= len - sizeof(subh)) {
+ m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
+ offset += sizeof(subh);
+
+ if (subh.action >= PFSYNC_ACT_MAX) {
+ V_pfsyncstats.pfsyncs_badact++;
+ PF_RULES_RUNLOCK();
+ goto done;
+ }
+
+ count = ntohs(subh.count);
+ V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
+ rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
+ if (rv == -1) {
+ PF_RULES_RUNLOCK();
+ return (IPPROTO_DONE);
+ }
+
+ offset += rv;
+ }
+ PF_RULES_RUNLOCK();
+
+done:
+ m_freem(m);
+ return (IPPROTO_DONE);
+}
+
+static int
+pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct pfsync_clr *clr;
+ struct mbuf *mp;
+ int len = sizeof(*clr) * count;
+ int i, offp;
+ u_int32_t creatorid;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ clr = (struct pfsync_clr *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ creatorid = clr[i].creatorid;
+
+ if (clr[i].ifname[0] != '\0' &&
+ pfi_kif_find(clr[i].ifname) == NULL)
+ continue;
+
+ for (int i = 0; i <= pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+ struct pf_state *s;
+relock:
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ if (s->creatorid == creatorid) {
+ s->state_flags |= PFSTATE_NOSYNC;
+ pf_unlink_state(s, PF_ENTER_LOCKED);
+ goto relock;
+ }
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct mbuf *mp;
+ struct pfsync_state *sa, *sp;
+ int len = sizeof(*sp) * count;
+ int i, offp;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ sa = (struct pfsync_state *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ sp = &sa[i];
+
+ /* Check for invalid values. */
+ if (sp->timeout >= PFTM_MAX ||
+ sp->src.state > PF_TCPS_PROXY_DST ||
+ sp->dst.state > PF_TCPS_PROXY_DST ||
+ sp->direction > PF_OUT ||
+ (sp->af != AF_INET && sp->af != AF_INET6)) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("%s: invalid value\n", __func__);
+ V_pfsyncstats.pfsyncs_badval++;
+ continue;
+ }
+
+ if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
+ /* Drop out, but process the rest of the actions. */
+ break;
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct pfsync_ins_ack *ia, *iaa;
+ struct pf_state *st;
+
+ struct mbuf *mp;
+ int len = count * sizeof(*ia);
+ int offp, i;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ ia = &iaa[i];
+
+ st = pf_find_state_byid(ia->id, ia->creatorid);
+ if (st == NULL)
+ continue;
+
+ if (st->state_flags & PFSTATE_ACK) {
+ PFSYNC_LOCK(V_pfsyncif);
+ pfsync_undefer_state(st, 0);
+ PFSYNC_UNLOCK(V_pfsyncif);
+ }
+ PF_STATE_UNLOCK(st);
+ }
+ /*
+ * XXX this is not yet implemented, but we know the size of the
+ * message so we can skip it.
+ */
+
+ return (count * sizeof(struct pfsync_ins_ack));
+}
+
+static int
+pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
+ struct pfsync_state_peer *dst)
+{
+ int sync = 0;
+
+ PF_STATE_LOCK_ASSERT(st);
+
+ /*
+ * The state should never go backwards except
+ * for syn-proxy states. Neither should the
+ * sequence window slide backwards.
+ */
+ if ((st->src.state > src->state &&
+ (st->src.state < PF_TCPS_PROXY_SRC ||
+ src->state >= PF_TCPS_PROXY_SRC)) ||
+
+ (st->src.state == src->state &&
+ SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
+ sync++;
+ else
+ pf_state_peer_ntoh(src, &st->src);
+
+ if ((st->dst.state > dst->state) ||
+
+ (st->dst.state >= TCPS_SYN_SENT &&
+ SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
+ sync++;
+ else
+ pf_state_peer_ntoh(dst, &st->dst);
+
+ return (sync);
+}
+
+static int
+pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_state *sa, *sp;
+ struct pf_state *st;
+ int sync;
+
+ struct mbuf *mp;
+ int len = count * sizeof(*sp);
+ int offp, i;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ sa = (struct pfsync_state *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ sp = &sa[i];
+
+ /* check for invalid values */
+ if (sp->timeout >= PFTM_MAX ||
+ sp->src.state > PF_TCPS_PROXY_DST ||
+ sp->dst.state > PF_TCPS_PROXY_DST) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pfsync_input: PFSYNC_ACT_UPD: "
+ "invalid value\n");
+ }
+ V_pfsyncstats.pfsyncs_badval++;
+ continue;
+ }
+
+ st = pf_find_state_byid(sp->id, sp->creatorid);
+ if (st == NULL) {
+ /* insert the update */
+ if (pfsync_state_import(sp, 0))
+ V_pfsyncstats.pfsyncs_badstate++;
+ continue;
+ }
+
+ if (st->state_flags & PFSTATE_ACK) {
+ PFSYNC_LOCK(sc);
+ pfsync_undefer_state(st, 1);
+ PFSYNC_UNLOCK(sc);
+ }
+
+ if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
+ sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
+ else {
+ sync = 0;
+
+ /*
+			 * Non-TCP protocol state machines always go
+			 * forward.
+ */
+ if (st->src.state > sp->src.state)
+ sync++;
+ else
+ pf_state_peer_ntoh(&sp->src, &st->src);
+ if (st->dst.state > sp->dst.state)
+ sync++;
+ else
+ pf_state_peer_ntoh(&sp->dst, &st->dst);
+ }
+ if (sync < 2) {
+ pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
+ pf_state_peer_ntoh(&sp->dst, &st->dst);
+ st->expire = time_uptime;
+ st->timeout = sp->timeout;
+ }
+ st->pfsync_time = time_uptime;
+
+ if (sync) {
+ V_pfsyncstats.pfsyncs_stale++;
+
+ pfsync_update_state(st);
+ PF_STATE_UNLOCK(st);
+ PFSYNC_LOCK(sc);
+ pfsync_push(sc);
+ PFSYNC_UNLOCK(sc);
+ continue;
+ }
+ PF_STATE_UNLOCK(st);
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_upd_c *ua, *up;
+ struct pf_state *st;
+ int len = count * sizeof(*up);
+ int sync;
+ struct mbuf *mp;
+ int offp, i;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ ua = (struct pfsync_upd_c *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ up = &ua[i];
+
+ /* check for invalid values */
+ if (up->timeout >= PFTM_MAX ||
+ up->src.state > PF_TCPS_PROXY_DST ||
+ up->dst.state > PF_TCPS_PROXY_DST) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pfsync_input: "
+ "PFSYNC_ACT_UPD_C: "
+ "invalid value\n");
+ }
+ V_pfsyncstats.pfsyncs_badval++;
+ continue;
+ }
+
+ st = pf_find_state_byid(up->id, up->creatorid);
+ if (st == NULL) {
+ /* We don't have this state. Ask for it. */
+ PFSYNC_LOCK(sc);
+ pfsync_request_update(up->creatorid, up->id);
+ PFSYNC_UNLOCK(sc);
+ continue;
+ }
+
+ if (st->state_flags & PFSTATE_ACK) {
+ PFSYNC_LOCK(sc);
+ pfsync_undefer_state(st, 1);
+ PFSYNC_UNLOCK(sc);
+ }
+
+ if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
+ sync = pfsync_upd_tcp(st, &up->src, &up->dst);
+ else {
+ sync = 0;
+
+ /*
+			 * Non-TCP protocol state machines always go
+			 * forward.
+ */
+ if (st->src.state > up->src.state)
+ sync++;
+ else
+ pf_state_peer_ntoh(&up->src, &st->src);
+ if (st->dst.state > up->dst.state)
+ sync++;
+ else
+ pf_state_peer_ntoh(&up->dst, &st->dst);
+ }
+ if (sync < 2) {
+ pfsync_alloc_scrub_memory(&up->dst, &st->dst);
+ pf_state_peer_ntoh(&up->dst, &st->dst);
+ st->expire = time_uptime;
+ st->timeout = up->timeout;
+ }
+ st->pfsync_time = time_uptime;
+
+ if (sync) {
+ V_pfsyncstats.pfsyncs_stale++;
+
+ pfsync_update_state(st);
+ PF_STATE_UNLOCK(st);
+ PFSYNC_LOCK(sc);
+ pfsync_push(sc);
+ PFSYNC_UNLOCK(sc);
+ continue;
+ }
+ PF_STATE_UNLOCK(st);
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct pfsync_upd_req *ur, *ura;
+ struct mbuf *mp;
+ int len = count * sizeof(*ur);
+ int i, offp;
+
+ struct pf_state *st;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ ura = (struct pfsync_upd_req *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ ur = &ura[i];
+
+ if (ur->id == 0 && ur->creatorid == 0)
+ pfsync_bulk_start();
+ else {
+ st = pf_find_state_byid(ur->id, ur->creatorid);
+ if (st == NULL) {
+ V_pfsyncstats.pfsyncs_badstate++;
+ continue;
+ }
+ if (st->state_flags & PFSTATE_NOSYNC) {
+ PF_STATE_UNLOCK(st);
+ continue;
+ }
+
+ pfsync_update_state_req(st);
+ PF_STATE_UNLOCK(st);
+ }
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct mbuf *mp;
+ struct pfsync_state *sa, *sp;
+ struct pf_state *st;
+ int len = count * sizeof(*sp);
+ int offp, i;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ sa = (struct pfsync_state *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ sp = &sa[i];
+
+ st = pf_find_state_byid(sp->id, sp->creatorid);
+ if (st == NULL) {
+ V_pfsyncstats.pfsyncs_badstate++;
+ continue;
+ }
+ st->state_flags |= PFSTATE_NOSYNC;
+ pf_unlink_state(st, PF_ENTER_LOCKED);
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct mbuf *mp;
+ struct pfsync_del_c *sa, *sp;
+ struct pf_state *st;
+ int len = count * sizeof(*sp);
+ int offp, i;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ sa = (struct pfsync_del_c *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ sp = &sa[i];
+
+ st = pf_find_state_byid(sp->id, sp->creatorid);
+ if (st == NULL) {
+ V_pfsyncstats.pfsyncs_badstate++;
+ continue;
+ }
+
+ st->state_flags |= PFSTATE_NOSYNC;
+ pf_unlink_state(st, PF_ENTER_LOCKED);
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_bus *bus;
+ struct mbuf *mp;
+ int len = count * sizeof(*bus);
+ int offp;
+
+ PFSYNC_BLOCK(sc);
+
+ /* If we're not waiting for a bulk update, who cares. */
+ if (sc->sc_ureq_sent == 0) {
+ PFSYNC_BUNLOCK(sc);
+ return (len);
+ }
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ PFSYNC_BUNLOCK(sc);
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ bus = (struct pfsync_bus *)(mp->m_data + offp);
+
+ switch (bus->status) {
+ case PFSYNC_BUS_START:
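+		/*
+		 * Arm the bulk-fail timeout: roughly four seconds plus
+		 * one tick per MTU-sized packet needed to carry the full
+		 * state table (state limit divided by states per packet).
+		 */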
+ callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
+ V_pf_limits[PF_LIMIT_STATES].limit /
+ ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
+ sizeof(struct pfsync_state)),
+ pfsync_bulk_fail, sc);
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync: received bulk update start\n");
+ break;
+
+ case PFSYNC_BUS_END:
+ if (time_uptime - ntohl(bus->endtime) >=
+ sc->sc_ureq_sent) {
+ /* that's it, we're happy */
+ sc->sc_ureq_sent = 0;
+ sc->sc_bulk_tries = 0;
+ callout_stop(&sc->sc_bulkfail_tmo);
+ if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
+ (*carp_demote_adj_p)(-V_pfsync_carp_adj,
+ "pfsync bulk done");
+ sc->sc_flags |= PFSYNCF_OK;
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync: received valid "
+ "bulk update end\n");
+ } else {
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync: received invalid "
+ "bulk update end: bad timestamp\n");
+ }
+ break;
+ }
+ PFSYNC_BUNLOCK(sc);
+
+ return (len);
+}
+
+static int
+pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ int len = count * sizeof(struct pfsync_tdb);
+
+#if defined(IPSEC)
+ struct pfsync_tdb *tp;
+ struct mbuf *mp;
+ int offp;
+ int i;
+ int s;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ tp = (struct pfsync_tdb *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++)
+ pfsync_update_net_tdb(&tp[i]);
+#endif
+
+ return (len);
+}
+
+#if defined(IPSEC)
+/* Update an in-kernel tdb. Silently fail if no tdb is found. */
+static void
+pfsync_update_net_tdb(struct pfsync_tdb *pt)
+{
+ struct tdb *tdb;
+ int s;
+
+ /* check for invalid values */
+ if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
+ (pt->dst.sa.sa_family != AF_INET &&
+ pt->dst.sa.sa_family != AF_INET6))
+ goto bad;
+
+ tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
+ if (tdb) {
+ pt->rpl = ntohl(pt->rpl);
+ pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);
+
+ /* Neither replay nor byte counter should ever decrease. */
+ if (pt->rpl < tdb->tdb_rpl ||
+ pt->cur_bytes < tdb->tdb_cur_bytes) {
+ goto bad;
+ }
+
+ tdb->tdb_rpl = pt->rpl;
+ tdb->tdb_cur_bytes = pt->cur_bytes;
+ }
+ return;
+
+bad:
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
+ "invalid value\n");
+ V_pfsyncstats.pfsyncs_badstate++;
+ return;
+}
+#endif
+
+
+static int
+pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ /* check if we are at the right place in the packet */
+ if (offset != m->m_pkthdr.len)
+ V_pfsyncstats.pfsyncs_badlen++;
+
+ /* we're done. free and let the caller return */
+ m_freem(m);
+ return (-1);
+}
+
+static int
+pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ V_pfsyncstats.pfsyncs_badact++;
+
+ m_freem(m);
+ return (-1);
+}
+
+static int
+pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
+ struct route *rt)
+{
+ m_freem(m);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+ struct pfsync_softc *sc = ifp->if_softc;
+ struct ifreq *ifr = (struct ifreq *)data;
+ struct pfsyncreq pfsyncr;
+ int error;
+
+ switch (cmd) {
+ case SIOCSIFFLAGS:
+ PFSYNC_LOCK(sc);
+ if (ifp->if_flags & IFF_UP) {
+ ifp->if_drv_flags |= IFF_DRV_RUNNING;
+ PFSYNC_UNLOCK(sc);
+ pfsync_pointers_init();
+ } else {
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ PFSYNC_UNLOCK(sc);
+ pfsync_pointers_uninit();
+ }
+ break;
+ case SIOCSIFMTU:
+ if (!sc->sc_sync_if ||
+ ifr->ifr_mtu <= PFSYNC_MINPKT ||
+ ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
+ return (EINVAL);
+ if (ifr->ifr_mtu < ifp->if_mtu) {
+ PFSYNC_LOCK(sc);
+ if (sc->sc_len > PFSYNC_MINPKT)
+ pfsync_sendout(1);
+ PFSYNC_UNLOCK(sc);
+ }
+ ifp->if_mtu = ifr->ifr_mtu;
+ break;
+ case SIOCGETPFSYNC:
+ bzero(&pfsyncr, sizeof(pfsyncr));
+ PFSYNC_LOCK(sc);
+ if (sc->sc_sync_if) {
+ strlcpy(pfsyncr.pfsyncr_syncdev,
+ sc->sc_sync_if->if_xname, IFNAMSIZ);
+ }
+ pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
+ pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
+ pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
+ (sc->sc_flags & PFSYNCF_DEFER));
+ PFSYNC_UNLOCK(sc);
+ return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
+
+ case SIOCSETPFSYNC:
+ {
+ struct ip_moptions *imo = &sc->sc_imo;
+ struct ifnet *sifp;
+ struct ip *ip;
+ void *mship = NULL;
+
+ if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
+ return (error);
+ if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
+ return (error);
+
+ if (pfsyncr.pfsyncr_maxupdates > 255)
+ return (EINVAL);
+
+ if (pfsyncr.pfsyncr_syncdev[0] == 0)
+ sifp = NULL;
+ else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
+ return (EINVAL);
+
+ if (sifp != NULL && (
+ pfsyncr.pfsyncr_syncpeer.s_addr == 0 ||
+ pfsyncr.pfsyncr_syncpeer.s_addr ==
+ htonl(INADDR_PFSYNC_GROUP)))
+ mship = malloc((sizeof(struct in_multi *) *
+ IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO);
+
+ PFSYNC_LOCK(sc);
+ if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
+ sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
+ else
+ sc->sc_sync_peer.s_addr =
+ pfsyncr.pfsyncr_syncpeer.s_addr;
+
+ sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
+ if (pfsyncr.pfsyncr_defer) {
+ sc->sc_flags |= PFSYNCF_DEFER;
+ pfsync_defer_ptr = pfsync_defer;
+ } else {
+ sc->sc_flags &= ~PFSYNCF_DEFER;
+ pfsync_defer_ptr = NULL;
+ }
+
+ if (sifp == NULL) {
+ if (sc->sc_sync_if)
+ if_rele(sc->sc_sync_if);
+ sc->sc_sync_if = NULL;
+ if (imo->imo_membership)
+ pfsync_multicast_cleanup(sc);
+ PFSYNC_UNLOCK(sc);
+ break;
+ }
+
+ if (sc->sc_len > PFSYNC_MINPKT &&
+ (sifp->if_mtu < sc->sc_ifp->if_mtu ||
+ (sc->sc_sync_if != NULL &&
+ sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
+ sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
+ pfsync_sendout(1);
+
+ if (imo->imo_membership)
+ pfsync_multicast_cleanup(sc);
+
+ if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
+ error = pfsync_multicast_setup(sc, sifp, mship);
+ if (error) {
+ if_rele(sifp);
+ free(mship, M_PFSYNC);
+ return (error);
+ }
+ }
+ if (sc->sc_sync_if)
+ if_rele(sc->sc_sync_if);
+ sc->sc_sync_if = sifp;
+
+ ip = &sc->sc_template;
+ bzero(ip, sizeof(*ip));
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = sizeof(sc->sc_template) >> 2;
+ ip->ip_tos = IPTOS_LOWDELAY;
+ /* len and id are set later. */
+ ip->ip_off = htons(IP_DF);
+ ip->ip_ttl = PFSYNC_DFLTTL;
+ ip->ip_p = IPPROTO_PFSYNC;
+ ip->ip_src.s_addr = INADDR_ANY;
+ ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
+
+ /* Request a full state table update. */
+ if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
+ (*carp_demote_adj_p)(V_pfsync_carp_adj,
+ "pfsync bulk start");
+ sc->sc_flags &= ~PFSYNCF_OK;
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync: requesting bulk update\n");
+ pfsync_request_update(0, 0);
+ PFSYNC_UNLOCK(sc);
+ PFSYNC_BLOCK(sc);
+ sc->sc_ureq_sent = time_uptime;
+ callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
+ sc);
+ PFSYNC_BUNLOCK(sc);
+
+ break;
+ }
+ default:
+ return (ENOTTY);
+ }
+
+ return (0);
+}
+
+static void
+pfsync_out_state(struct pf_state *st, void *buf)
+{
+ struct pfsync_state *sp = buf;
+
+ pfsync_state_export(sp, st);
+}
+
+static void
+pfsync_out_iack(struct pf_state *st, void *buf)
+{
+ struct pfsync_ins_ack *iack = buf;
+
+ iack->id = st->id;
+ iack->creatorid = st->creatorid;
+}
+
+static void
+pfsync_out_upd_c(struct pf_state *st, void *buf)
+{
+ struct pfsync_upd_c *up = buf;
+
+ bzero(up, sizeof(*up));
+ up->id = st->id;
+ pf_state_peer_hton(&st->src, &up->src);
+ pf_state_peer_hton(&st->dst, &up->dst);
+ up->creatorid = st->creatorid;
+ up->timeout = st->timeout;
+}
+
+static void
+pfsync_out_del(struct pf_state *st, void *buf)
+{
+ struct pfsync_del_c *dp = buf;
+
+ dp->id = st->id;
+ dp->creatorid = st->creatorid;
+ st->state_flags |= PFSTATE_NOSYNC;
+}
+
+static void
+pfsync_drop(struct pfsync_softc *sc)
+{
+ struct pf_state *st, *next;
+ struct pfsync_upd_req_item *ur;
+ int q;
+
+ for (q = 0; q < PFSYNC_S_COUNT; q++) {
+ if (TAILQ_EMPTY(&sc->sc_qs[q]))
+ continue;
+
+ TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
+ KASSERT(st->sync_state == q,
+ ("%s: st->sync_state == q",
+ __func__));
+ st->sync_state = PFSYNC_S_NONE;
+ pf_release_state(st);
+ }
+ TAILQ_INIT(&sc->sc_qs[q]);
+ }
+
+ while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
+ TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
+ free(ur, M_PFSYNC);
+ }
+
+ sc->sc_plus = NULL;
+ sc->sc_len = PFSYNC_MINPKT;
+}
+
+static void
+pfsync_sendout(int schedswi)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct ifnet *ifp = sc->sc_ifp;
+ struct mbuf *m;
+ struct ip *ip;
+ struct pfsync_header *ph;
+ struct pfsync_subheader *subh;
+ struct pf_state *st;
+ struct pfsync_upd_req_item *ur;
+ int offset;
+ int q, count = 0;
+
+ KASSERT(sc != NULL, ("%s: null sc", __func__));
+ KASSERT(sc->sc_len > PFSYNC_MINPKT,
+ ("%s: sc_len %zu", __func__, sc->sc_len));
+ PFSYNC_LOCK_ASSERT(sc);
+
+ if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
+ pfsync_drop(sc);
+ return;
+ }
+
+ m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR);
+ if (m == NULL) {
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
+ V_pfsyncstats.pfsyncs_onomem++;
+ return;
+ }
+ m->m_data += max_linkhdr;
+ m->m_len = m->m_pkthdr.len = sc->sc_len;
+
+ /* build the ip header */
+ ip = (struct ip *)m->m_data;
+ bcopy(&sc->sc_template, ip, sizeof(*ip));
+ offset = sizeof(*ip);
+
+ ip->ip_len = htons(m->m_pkthdr.len);
+ ip_fillid(ip);
+
+ /* build the pfsync header */
+ ph = (struct pfsync_header *)(m->m_data + offset);
+ bzero(ph, sizeof(*ph));
+ offset += sizeof(*ph);
+
+ ph->version = PFSYNC_VERSION;
+ ph->len = htons(sc->sc_len - sizeof(*ip));
+ bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
+
+ /* walk the queues */
+ for (q = 0; q < PFSYNC_S_COUNT; q++) {
+ if (TAILQ_EMPTY(&sc->sc_qs[q]))
+ continue;
+
+ subh = (struct pfsync_subheader *)(m->m_data + offset);
+ offset += sizeof(*subh);
+
+ count = 0;
+ TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) {
+ KASSERT(st->sync_state == q,
+ ("%s: st->sync_state == q",
+ __func__));
+ /*
+			 * XXXGL: some of the write methods do unlocked
+			 * reads of state data :(
+ */
+ pfsync_qs[q].write(st, m->m_data + offset);
+ offset += pfsync_qs[q].len;
+ st->sync_state = PFSYNC_S_NONE;
+ pf_release_state(st);
+ count++;
+ }
+ TAILQ_INIT(&sc->sc_qs[q]);
+
+ bzero(subh, sizeof(*subh));
+ subh->action = pfsync_qs[q].action;
+ subh->count = htons(count);
+ V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
+ }
+
+ if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
+ subh = (struct pfsync_subheader *)(m->m_data + offset);
+ offset += sizeof(*subh);
+
+ count = 0;
+ while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
+ TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
+
+ bcopy(&ur->ur_msg, m->m_data + offset,
+ sizeof(ur->ur_msg));
+ offset += sizeof(ur->ur_msg);
+ free(ur, M_PFSYNC);
+ count++;
+ }
+
+ bzero(subh, sizeof(*subh));
+ subh->action = PFSYNC_ACT_UPD_REQ;
+ subh->count = htons(count);
+ V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
+ }
+
+ /* has someone built a custom region for us to add? */
+ if (sc->sc_plus != NULL) {
+ bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
+ offset += sc->sc_pluslen;
+
+ sc->sc_plus = NULL;
+ }
+
+ subh = (struct pfsync_subheader *)(m->m_data + offset);
+ offset += sizeof(*subh);
+
+ bzero(subh, sizeof(*subh));
+ subh->action = PFSYNC_ACT_EOF;
+ subh->count = htons(1);
+ V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;
+
+ /* we're done, let's put it on the wire */
+ if (ifp->if_bpf) {
+ m->m_data += sizeof(*ip);
+ m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
+ BPF_MTAP(ifp, m);
+ m->m_data -= sizeof(*ip);
+ m->m_len = m->m_pkthdr.len = sc->sc_len;
+ }
+
+ if (sc->sc_sync_if == NULL) {
+ sc->sc_len = PFSYNC_MINPKT;
+ m_freem(m);
+ return;
+ }
+
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
+ sc->sc_len = PFSYNC_MINPKT;
+
+ if (!_IF_QFULL(&sc->sc_ifp->if_snd))
+ _IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
+ else {
+ m_freem(m);
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1);
+ }
+ if (schedswi)
+ swi_sched(V_pfsync_swi_cookie, 0);
+}
+
+static void
+pfsync_insert_state(struct pf_state *st)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+
+ if (st->state_flags & PFSTATE_NOSYNC)
+ return;
+
+ if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
+ st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
+ st->state_flags |= PFSTATE_NOSYNC;
+ return;
+ }
+
+ KASSERT(st->sync_state == PFSYNC_S_NONE,
+ ("%s: st->sync_state %u", __func__, st->sync_state));
+
+ PFSYNC_LOCK(sc);
+ if (sc->sc_len == PFSYNC_MINPKT)
+ callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+
+ pfsync_q_ins(st, PFSYNC_S_INS);
+ PFSYNC_UNLOCK(sc);
+
+ st->sync_updates = 0;
+}
+
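+/*
+ * With deferral enabled, the packet that created a state is held here
+ * until the peer acknowledges the insertion (pfsync_undefer_state() via
+ * the INS_ACK handler) or the 10-tick callout in pfsync_defer_tmo()
+ * fires and transmits it anyway.
+ */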
+static int
+pfsync_defer(struct pf_state *st, struct mbuf *m)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_deferral *pd;
+
+ if (m->m_flags & (M_BCAST|M_MCAST))
+ return (0);
+
+ PFSYNC_LOCK(sc);
+
+ if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) ||
+ !(sc->sc_flags & PFSYNCF_DEFER)) {
+ PFSYNC_UNLOCK(sc);
+ return (0);
+ }
+
+ if (sc->sc_deferred >= 128)
+ pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);
+
+ pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
+ if (pd == NULL)
+ return (0);
+ sc->sc_deferred++;
+
+ m->m_flags |= M_SKIP_FIREWALL;
+ st->state_flags |= PFSTATE_ACK;
+
+ pd->pd_sc = sc;
+ pd->pd_refs = 0;
+ pd->pd_st = st;
+ pf_ref_state(st);
+ pd->pd_m = m;
+
+ TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
+ callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
+ callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
+
+ pfsync_push(sc);
+
+ return (1);
+}
+
+static void
+pfsync_undefer(struct pfsync_deferral *pd, int drop)
+{
+ struct pfsync_softc *sc = pd->pd_sc;
+ struct mbuf *m = pd->pd_m;
+ struct pf_state *st = pd->pd_st;
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+ sc->sc_deferred--;
+ pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */
+ free(pd, M_PFSYNC);
+ pf_release_state(st);
+
+ if (drop)
+ m_freem(m);
+ else {
+ _IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
+ pfsync_push(sc);
+ }
+}
+
+static void
+pfsync_defer_tmo(void *arg)
+{
+ struct pfsync_deferral *pd = arg;
+ struct pfsync_softc *sc = pd->pd_sc;
+ struct mbuf *m = pd->pd_m;
+ struct pf_state *st = pd->pd_st;
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
+
+ TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+ sc->sc_deferred--;
+ pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */
+ if (pd->pd_refs == 0)
+ free(pd, M_PFSYNC);
+ PFSYNC_UNLOCK(sc);
+
+ ip_output(m, NULL, NULL, 0, NULL, NULL);
+
+ pf_release_state(st);
+
+ CURVNET_RESTORE();
+}
+
+static void
+pfsync_undefer_state(struct pf_state *st, int drop)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_deferral *pd;
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
+ if (pd->pd_st == st) {
+ if (callout_stop(&pd->pd_tmo) > 0)
+ pfsync_undefer(pd, drop);
+ return;
+ }
+ }
+
+ panic("%s: unable to find deferred state", __func__);
+}
+
+static void
+pfsync_update_state(struct pf_state *st)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ int sync = 0;
+
+ PF_STATE_LOCK_ASSERT(st);
+ PFSYNC_LOCK(sc);
+
+ if (st->state_flags & PFSTATE_ACK)
+ pfsync_undefer_state(st, 0);
+ if (st->state_flags & PFSTATE_NOSYNC) {
+ if (st->sync_state != PFSYNC_S_NONE)
+ pfsync_q_del(st);
+ PFSYNC_UNLOCK(sc);
+ return;
+ }
+
+ if (sc->sc_len == PFSYNC_MINPKT)
+ callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+
+ switch (st->sync_state) {
+ case PFSYNC_S_UPD_C:
+ case PFSYNC_S_UPD:
+ case PFSYNC_S_INS:
+ /* we're already handling it */
+
+ if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
+ st->sync_updates++;
+ if (st->sync_updates >= sc->sc_maxupdates)
+ sync = 1;
+ }
+ break;
+
+ case PFSYNC_S_IACK:
+ pfsync_q_del(st);
+ case PFSYNC_S_NONE:
+ pfsync_q_ins(st, PFSYNC_S_UPD_C);
+ st->sync_updates = 0;
+ break;
+
+ default:
+ panic("%s: unexpected sync state %d", __func__, st->sync_state);
+ }
+
+ if (sync || (time_uptime - st->pfsync_time) < 2)
+ pfsync_push(sc);
+
+ PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_request_update(u_int32_t creatorid, u_int64_t id)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_upd_req_item *item;
+ size_t nlen = sizeof(struct pfsync_upd_req);
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ /*
+	 * This code tries to prevent multiple update requests for the
+	 * same state from being generated. It searches the current
+	 * subheader queue, but it does not look into the queue of
+	 * already packed datagrams.
+ */
+ TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry)
+ if (item->ur_msg.id == id &&
+ item->ur_msg.creatorid == creatorid)
+ return;
+
+ item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
+ if (item == NULL)
+ return; /* XXX stats */
+
+ item->ur_msg.id = id;
+ item->ur_msg.creatorid = creatorid;
+
+ if (TAILQ_EMPTY(&sc->sc_upd_req_list))
+ nlen += sizeof(struct pfsync_subheader);
+
+ if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
+ pfsync_sendout(1);
+
+ nlen = sizeof(struct pfsync_subheader) +
+ sizeof(struct pfsync_upd_req);
+ }
+
+ TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
+ sc->sc_len += nlen;
+}
+
+static void
+pfsync_update_state_req(struct pf_state *st)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+
+ PF_STATE_LOCK_ASSERT(st);
+ PFSYNC_LOCK(sc);
+
+ if (st->state_flags & PFSTATE_NOSYNC) {
+ if (st->sync_state != PFSYNC_S_NONE)
+ pfsync_q_del(st);
+ PFSYNC_UNLOCK(sc);
+ return;
+ }
+
+ switch (st->sync_state) {
+ case PFSYNC_S_UPD_C:
+ case PFSYNC_S_IACK:
+ pfsync_q_del(st);
+ case PFSYNC_S_NONE:
+ pfsync_q_ins(st, PFSYNC_S_UPD);
+ pfsync_push(sc);
+ break;
+
+ case PFSYNC_S_INS:
+ case PFSYNC_S_UPD:
+ case PFSYNC_S_DEL:
+ /* we're already handling it */
+ break;
+
+ default:
+ panic("%s: unexpected sync state %d", __func__, st->sync_state);
+ }
+
+ PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_delete_state(struct pf_state *st)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+
+ PFSYNC_LOCK(sc);
+ if (st->state_flags & PFSTATE_ACK)
+ pfsync_undefer_state(st, 1);
+ if (st->state_flags & PFSTATE_NOSYNC) {
+ if (st->sync_state != PFSYNC_S_NONE)
+ pfsync_q_del(st);
+ PFSYNC_UNLOCK(sc);
+ return;
+ }
+
+ if (sc->sc_len == PFSYNC_MINPKT)
+ callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+
+ switch (st->sync_state) {
+ case PFSYNC_S_INS:
+ /* We never got to tell the world so just forget about it. */
+ pfsync_q_del(st);
+ break;
+
+ case PFSYNC_S_UPD_C:
+ case PFSYNC_S_UPD:
+ case PFSYNC_S_IACK:
+ pfsync_q_del(st);
+ /* FALLTHROUGH to putting it on the del list */
+
+ case PFSYNC_S_NONE:
+ pfsync_q_ins(st, PFSYNC_S_DEL);
+ break;
+
+ default:
+ panic("%s: unexpected sync state %d", __func__, st->sync_state);
+ }
+ PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_clear_states(u_int32_t creatorid, const char *ifname)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct {
+ struct pfsync_subheader subh;
+ struct pfsync_clr clr;
+ } __packed r;
+
+ bzero(&r, sizeof(r));
+
+ r.subh.action = PFSYNC_ACT_CLR;
+ r.subh.count = htons(1);
+ V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
+
+ strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
+ r.clr.creatorid = creatorid;
+
+ PFSYNC_LOCK(sc);
+ pfsync_send_plus(&r, sizeof(r));
+ PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_q_ins(struct pf_state *st, int q)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ size_t nlen = pfsync_qs[q].len;
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ KASSERT(st->sync_state == PFSYNC_S_NONE,
+ ("%s: st->sync_state %u", __func__, st->sync_state));
+ KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
+ sc->sc_len));
+
+ if (TAILQ_EMPTY(&sc->sc_qs[q]))
+ nlen += sizeof(struct pfsync_subheader);
+
+ if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
+ pfsync_sendout(1);
+
+ nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
+ }
+
+ sc->sc_len += nlen;
+ TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
+ st->sync_state = q;
+ pf_ref_state(st);
+}
+
+static void
+pfsync_q_del(struct pf_state *st)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ int q = st->sync_state;
+
+ PFSYNC_LOCK_ASSERT(sc);
+ KASSERT(st->sync_state != PFSYNC_S_NONE,
+ ("%s: st->sync_state != PFSYNC_S_NONE", __func__));
+
+ sc->sc_len -= pfsync_qs[q].len;
+ TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
+ st->sync_state = PFSYNC_S_NONE;
+ pf_release_state(st);
+
+ if (TAILQ_EMPTY(&sc->sc_qs[q]))
+ sc->sc_len -= sizeof(struct pfsync_subheader);
+}
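
pfsync_q_ins() and pfsync_q_del() above keep the size accounting for the pfsync packet under construction: a per-action subheader is only charged when its queue goes from empty to non-empty (and refunded when it empties again), and a full packet is flushed before a new entry would push sc_len past the interface MTU. The following is a minimal userland sketch of that invariant; the SKETCH_* sizes and names are made up and merely stand in for the real pfsync structures.

#include <assert.h>
#include <stddef.h>

/* Hypothetical sizes standing in for the real pfsync structures. */
#define SKETCH_MINPKT   32      /* fixed per-packet header overhead */
#define SKETCH_SUBHDR    4      /* per-action subheader             */
#define SKETCH_ENTRY    16      /* one queued state record          */
#define SKETCH_MTU     100      /* pretend interface MTU            */

struct sketch_builder {
	size_t	len;		/* bytes the next packet will occupy */
	int	queued;		/* entries in the (single) queue     */
};

static void
sketch_flush(struct sketch_builder *b)
{
	/* "Send" the packet and start a new, empty one. */
	b->len = SKETCH_MINPKT;
	b->queued = 0;
}

static void
sketch_q_ins(struct sketch_builder *b)
{
	size_t nlen = SKETCH_ENTRY;

	/* The first entry of a queue also pays for its subheader. */
	if (b->queued == 0)
		nlen += SKETCH_SUBHDR;

	/* Flush before overflowing the MTU, then recompute the cost. */
	if (b->len + nlen > SKETCH_MTU) {
		sketch_flush(b);
		nlen = SKETCH_SUBHDR + SKETCH_ENTRY;
	}

	b->len += nlen;
	b->queued++;
	assert(b->len <= SKETCH_MTU);
}

int
main(void)
{
	struct sketch_builder b = { SKETCH_MINPKT, 0 };

	for (int i = 0; i < 20; i++)
		sketch_q_ins(&b);
	return (0);
}

The real code keeps one such queue per action type plus the update-request list; the sketch collapses everything into a single queue purely to show the accounting.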
+
+static void
+pfsync_bulk_start(void)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync: received bulk update request\n");
+
+ PFSYNC_BLOCK(sc);
+
+ sc->sc_ureq_received = time_uptime;
+ sc->sc_bulk_hashid = 0;
+ sc->sc_bulk_stateid = 0;
+ pfsync_bulk_status(PFSYNC_BUS_START);
+ callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
+ PFSYNC_BUNLOCK(sc);
+}
+
+static void
+pfsync_bulk_update(void *arg)
+{
+ struct pfsync_softc *sc = arg;
+ struct pf_state *s;
+ int i, sent = 0;
+
+ PFSYNC_BLOCK_ASSERT(sc);
+ CURVNET_SET(sc->sc_ifp->if_vnet);
+
+ /*
+ * Start with last state from previous invocation.
+	 * It may have been purged since then, in which case we
+	 * start from the saved hash slot.
+ */
+ s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);
+
+ if (s != NULL)
+ i = PF_IDHASH(s);
+ else
+ i = sc->sc_bulk_hashid;
+
+ for (; i <= pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+
+ if (s != NULL)
+ PF_HASHROW_ASSERT(ih);
+ else {
+ PF_HASHROW_LOCK(ih);
+ s = LIST_FIRST(&ih->states);
+ }
+
+ for (; s; s = LIST_NEXT(s, entry)) {
+
+ if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
+ sizeof(struct pfsync_state)) {
+ /* We've filled a packet. */
+ sc->sc_bulk_hashid = i;
+ sc->sc_bulk_stateid = s->id;
+ sc->sc_bulk_creatorid = s->creatorid;
+ PF_HASHROW_UNLOCK(ih);
+ callout_reset(&sc->sc_bulk_tmo, 1,
+ pfsync_bulk_update, sc);
+ goto full;
+ }
+
+ if (s->sync_state == PFSYNC_S_NONE &&
+ s->timeout < PFTM_MAX &&
+ s->pfsync_time <= sc->sc_ureq_received) {
+ pfsync_update_state_req(s);
+ sent++;
+ }
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+
+ /* We're done. */
+ pfsync_bulk_status(PFSYNC_BUS_END);
+
+full:
+ CURVNET_RESTORE();
+}
+
+static void
+pfsync_bulk_status(u_int8_t status)
+{
+ struct {
+ struct pfsync_subheader subh;
+ struct pfsync_bus bus;
+ } __packed r;
+
+ struct pfsync_softc *sc = V_pfsyncif;
+
+ bzero(&r, sizeof(r));
+
+ r.subh.action = PFSYNC_ACT_BUS;
+ r.subh.count = htons(1);
+ V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;
+
+ r.bus.creatorid = V_pf_status.hostid;
+ r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
+ r.bus.status = status;
+
+ PFSYNC_LOCK(sc);
+ pfsync_send_plus(&r, sizeof(r));
+ PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_bulk_fail(void *arg)
+{
+ struct pfsync_softc *sc = arg;
+
+ CURVNET_SET(sc->sc_ifp->if_vnet);
+
+ PFSYNC_BLOCK_ASSERT(sc);
+
+ if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
+ /* Try again */
+ callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
+ pfsync_bulk_fail, V_pfsyncif);
+ PFSYNC_LOCK(sc);
+ pfsync_request_update(0, 0);
+ PFSYNC_UNLOCK(sc);
+ } else {
+ /* Pretend like the transfer was ok. */
+ sc->sc_ureq_sent = 0;
+ sc->sc_bulk_tries = 0;
+ PFSYNC_LOCK(sc);
+ if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
+ (*carp_demote_adj_p)(-V_pfsync_carp_adj,
+ "pfsync bulk fail");
+ sc->sc_flags |= PFSYNCF_OK;
+ PFSYNC_UNLOCK(sc);
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync: failed to receive bulk update\n");
+ }
+
+ CURVNET_RESTORE();
+}
+
+static void
+pfsync_send_plus(void *plus, size_t pluslen)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu)
+ pfsync_sendout(1);
+
+ sc->sc_plus = plus;
+ sc->sc_len += (sc->sc_pluslen = pluslen);
+
+ pfsync_sendout(1);
+}
+
+static void
+pfsync_timeout(void *arg)
+{
+ struct pfsync_softc *sc = arg;
+
+ CURVNET_SET(sc->sc_ifp->if_vnet);
+ PFSYNC_LOCK(sc);
+ pfsync_push(sc);
+ PFSYNC_UNLOCK(sc);
+ CURVNET_RESTORE();
+}
+
+static void
+pfsync_push(struct pfsync_softc *sc)
+{
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ sc->sc_flags |= PFSYNCF_PUSH;
+ swi_sched(V_pfsync_swi_cookie, 0);
+}
+
+static void
+pfsyncintr(void *arg)
+{
+ struct pfsync_softc *sc = arg;
+ struct mbuf *m, *n;
+
+ CURVNET_SET(sc->sc_ifp->if_vnet);
+
+ PFSYNC_LOCK(sc);
+ if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) {
+ pfsync_sendout(0);
+ sc->sc_flags &= ~PFSYNCF_PUSH;
+ }
+ _IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m);
+ PFSYNC_UNLOCK(sc);
+
+ for (; m != NULL; m = n) {
+
+ n = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+
+ /*
+		 * We distinguish between a deferral packet and our
+		 * own pfsync packet based on the M_SKIP_FIREWALL
+		 * flag; reusing the flag this way is a hack (XXX).
+ */
+ if (m->m_flags & M_SKIP_FIREWALL)
+ ip_output(m, NULL, NULL, 0, NULL, NULL);
+ else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
+ NULL) == 0)
+ V_pfsyncstats.pfsyncs_opackets++;
+ else
+ V_pfsyncstats.pfsyncs_oerrors++;
+ }
+ CURVNET_RESTORE();
+}
+
+static int
+pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship)
+{
+ struct ip_moptions *imo = &sc->sc_imo;
+ int error;
+
+ if (!(ifp->if_flags & IFF_MULTICAST))
+ return (EADDRNOTAVAIL);
+
+ imo->imo_membership = (struct in_multi **)mship;
+ imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
+ imo->imo_multicast_vif = -1;
+
+ if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL,
+ &imo->imo_membership[0])) != 0) {
+ imo->imo_membership = NULL;
+ return (error);
+ }
+ imo->imo_num_memberships++;
+ imo->imo_multicast_ifp = ifp;
+ imo->imo_multicast_ttl = PFSYNC_DFLTTL;
+ imo->imo_multicast_loop = 0;
+
+ return (0);
+}
+
+static void
+pfsync_multicast_cleanup(struct pfsync_softc *sc)
+{
+ struct ip_moptions *imo = &sc->sc_imo;
+
+ in_leavegroup(imo->imo_membership[0], NULL);
+ free(imo->imo_membership, M_PFSYNC);
+ imo->imo_membership = NULL;
+ imo->imo_multicast_ifp = NULL;
+}
+
+#ifdef INET
+extern struct domain inetdomain;
+static struct protosw in_pfsync_protosw = {
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_PFSYNC,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = pfsync_input,
+ .pr_output = rip_output,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_usrreqs = &rip_usrreqs
+};
+#endif
+
+static void
+pfsync_pointers_init()
+{
+
+ PF_RULES_WLOCK();
+ pfsync_state_import_ptr = pfsync_state_import;
+ pfsync_insert_state_ptr = pfsync_insert_state;
+ pfsync_update_state_ptr = pfsync_update_state;
+ pfsync_delete_state_ptr = pfsync_delete_state;
+ pfsync_clear_states_ptr = pfsync_clear_states;
+ pfsync_defer_ptr = pfsync_defer;
+ PF_RULES_WUNLOCK();
+}
+
+static void
+pfsync_pointers_uninit()
+{
+
+ PF_RULES_WLOCK();
+ pfsync_state_import_ptr = NULL;
+ pfsync_insert_state_ptr = NULL;
+ pfsync_update_state_ptr = NULL;
+ pfsync_delete_state_ptr = NULL;
+ pfsync_clear_states_ptr = NULL;
+ pfsync_defer_ptr = NULL;
+ PF_RULES_WUNLOCK();
+}
+
+static void
+vnet_pfsync_init(const void *unused __unused)
+{
+ int error;
+
+ V_pfsync_cloner = if_clone_simple(pfsyncname,
+ pfsync_clone_create, pfsync_clone_destroy, 1);
+ error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif,
+ SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
+ if (error) {
+ if_clone_detach(V_pfsync_cloner);
+ log(LOG_INFO, "swi_add() failed in %s\n", __func__);
+ }
+}
+VNET_SYSINIT(vnet_pfsync_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY,
+ vnet_pfsync_init, NULL);
+
+static void
+vnet_pfsync_uninit(const void *unused __unused)
+{
+
+ if_clone_detach(V_pfsync_cloner);
+ swi_remove(V_pfsync_swi_cookie);
+}
+/*
+ * Detach after pf is gone; otherwise we might touch pfsync memory
+ * from within pf after freeing pfsync.
+ */
+VNET_SYSUNINIT(vnet_pfsync_uninit, SI_SUB_INIT_IF, SI_ORDER_SECOND,
+ vnet_pfsync_uninit, NULL);
+
+static int
+pfsync_init()
+{
+#ifdef INET
+ int error;
+
+ error = pf_proto_register(PF_INET, &in_pfsync_protosw);
+ if (error)
+ return (error);
+ error = ipproto_register(IPPROTO_PFSYNC);
+ if (error) {
+ pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
+ return (error);
+ }
+#endif
+ pfsync_pointers_init();
+
+ return (0);
+}
+
+static void
+pfsync_uninit()
+{
+
+ pfsync_pointers_uninit();
+
+#ifdef INET
+ ipproto_unregister(IPPROTO_PFSYNC);
+ pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
+#endif
+}
+
+static int
+pfsync_modevent(module_t mod, int type, void *data)
+{
+ int error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = pfsync_init();
+ break;
+ case MOD_QUIESCE:
+ /*
+		 * Refuse to unload the module, since unloading is racy.
+ */
+ error = EBUSY;
+ break;
+ case MOD_UNLOAD:
+ pfsync_uninit();
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+static moduledata_t pfsync_mod = {
+ pfsyncname,
+ pfsync_modevent,
+ 0
+};
+
+#define PFSYNC_MODVER 1
+
+/* Stay on FIREWALL as we depend on pf being initialized and on inetdomain. */
+DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
+MODULE_VERSION(pfsync, PFSYNC_MODVER);
+MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);
diff --git a/freebsd/sys/netpfil/pf/in4_cksum.c b/freebsd/sys/netpfil/pf/in4_cksum.c
new file mode 100644
index 00000000..19cc8ac4
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/in4_cksum.c
@@ -0,0 +1,122 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/* $FreeBSD$ */
+/* $OpenBSD: in4_cksum.c,v 1.7 2003/06/02 23:28:13 millert Exp $ */
+/* $KAME: in4_cksum.c,v 1.10 2001/11/30 10:06:15 itojun Exp $ */
+/* $NetBSD: in_cksum.c,v 1.13 1996/10/13 02:03:03 christos Exp $ */
+
+/*
+ * Copyright (C) 1999 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1988, 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+
+#include <machine/in_cksum.h>
+
+#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
+#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; (void)ADDCARRY(sum);}
+
+int in4_cksum(struct mbuf *, u_int8_t, int, int);
+
+int
+in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
+{
+ union {
+ struct ipovly ipov;
+ u_int16_t w[10];
+ } u;
+ union {
+ u_int16_t s[2];
+ u_int32_t l;
+ } l_util;
+
+ u_int16_t *w;
+ int psum;
+ int sum = 0;
+
+ if (nxt != 0) {
+ /* pseudo header */
+ if (off < sizeof(struct ipovly))
+ panic("in4_cksum: offset too short");
+ if (m->m_len < sizeof(struct ip))
+ panic("in4_cksum: bad mbuf chain");
+ bzero(&u.ipov, sizeof(u.ipov));
+ u.ipov.ih_len = htons(len);
+ u.ipov.ih_pr = nxt;
+ u.ipov.ih_src = mtod(m, struct ip *)->ip_src;
+ u.ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
+ w = u.w;
+ /* assumes sizeof(ipov) == 20 */
+ sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4];
+ sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9];
+ }
+
+ psum = in_cksum_skip(m, len + off, off);
+ psum = ~psum & 0xffff;
+ sum += psum;
+ REDUCE;
+ return (~sum & 0xffff);
+}
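
in4_cksum() sums a synthetic pseudo header (the struct ipovly overlay) into the running Internet checksum of the payload and then folds the carries with the REDUCE/ADDCARRY macros. Below is a self-contained userland sketch of the same ones-complement folding over a flat buffer; it is not the kernel routine (no mbuf chains, no pseudo header), just an illustration of the arithmetic.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Ones-complement sum over a byte buffer, folding carries the same way
 * the REDUCE/ADDCARRY macros do.  Pairs bytes in order, pads an odd
 * trailing byte with zero, and returns the complemented 16-bit result.
 */
static uint16_t
cksum_sketch(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {
		sum += (uint32_t)buf[0] << 8 | buf[1];
		buf += 2;
		len -= 2;
	}
	if (len == 1)			/* odd trailing byte */
		sum += (uint32_t)buf[0] << 8;

	/* Fold the carries back into the low 16 bits: the REDUCE step. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)~sum;
}

int
main(void)
{
	/* RFC 1071 style example data. */
	const uint8_t data[] = { 0x00, 0x01, 0xf2, 0x03, 0xf4, 0xf5, 0xf6, 0xf7 };

	printf("checksum: 0x%04x\n", cksum_sketch(data, sizeof(data)));
	return (0);
}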
diff --git a/freebsd/sys/netpfil/pf/pf.c b/freebsd/sys/netpfil/pf/pf.c
new file mode 100644
index 00000000..7ac181b5
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/pf.c
@@ -0,0 +1,6657 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2008 Henning Brauer
+ * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/local/opt_inet.h>
+#include <rtems/bsd/local/opt_inet6.h>
+#include <rtems/bsd/local/opt_bpf.h>
+#include <rtems/bsd/local/opt_pf.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/bus.h>
+#include <sys/endian.h>
+#include <sys/hash.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/mbuf.h>
+#include <sys/md5.h>
+#include <sys/random.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/ucred.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_types.h>
+#include <net/if_vlan_var.h>
+#include <net/route.h>
+#include <net/radix_mpath.h>
+#include <net/vnet.h>
+
+#include <net/pfvar.h>
+#include <net/if_pflog.h>
+#include <net/if_pfsync.h>
+
+#include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+#include <netinet/ip.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/icmp_var.h>
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+
+#include <netpfil/ipfw/ip_fw_private.h> /* XXX: only for DIR_IN/DIR_OUT */
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/nd6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/in6_fib.h>
+#include <netinet6/scope6_var.h>
+#endif /* INET6 */
+
+#include <machine/in_cksum.h>
+#include <security/mac/mac_framework.h>
+
+#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
+
+/*
+ * Global variables
+ */
+
+/* state tables */
+VNET_DEFINE(struct pf_altqqueue, pf_altqs[2]);
+VNET_DEFINE(struct pf_palist, pf_pabuf);
+VNET_DEFINE(struct pf_altqqueue *, pf_altqs_active);
+VNET_DEFINE(struct pf_altqqueue *, pf_altqs_inactive);
+VNET_DEFINE(struct pf_kstatus, pf_status);
+
+VNET_DEFINE(u_int32_t, ticket_altqs_active);
+VNET_DEFINE(u_int32_t, ticket_altqs_inactive);
+VNET_DEFINE(int, altqs_inactive_open);
+VNET_DEFINE(u_int32_t, ticket_pabuf);
+
+VNET_DEFINE(MD5_CTX, pf_tcp_secret_ctx);
+#define V_pf_tcp_secret_ctx VNET(pf_tcp_secret_ctx)
+VNET_DEFINE(u_char, pf_tcp_secret[16]);
+#define V_pf_tcp_secret VNET(pf_tcp_secret)
+VNET_DEFINE(int, pf_tcp_secret_init);
+#define V_pf_tcp_secret_init VNET(pf_tcp_secret_init)
+VNET_DEFINE(int, pf_tcp_iss_off);
+#define V_pf_tcp_iss_off VNET(pf_tcp_iss_off)
+
+/*
+ * Queue for pf_intr() sends.
+ */
+static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
+struct pf_send_entry {
+ STAILQ_ENTRY(pf_send_entry) pfse_next;
+ struct mbuf *pfse_m;
+ enum {
+ PFSE_IP,
+ PFSE_IP6,
+ PFSE_ICMP,
+ PFSE_ICMP6,
+ } pfse_type;
+ struct {
+ int type;
+ int code;
+ int mtu;
+ } icmpopts;
+};
+
+STAILQ_HEAD(pf_send_head, pf_send_entry);
+static VNET_DEFINE(struct pf_send_head, pf_sendqueue);
+#define V_pf_sendqueue VNET(pf_sendqueue)
+
+static struct mtx pf_sendqueue_mtx;
+MTX_SYSINIT(pf_sendqueue_mtx, &pf_sendqueue_mtx, "pf send queue", MTX_DEF);
+#define PF_SENDQ_LOCK() mtx_lock(&pf_sendqueue_mtx)
+#define PF_SENDQ_UNLOCK() mtx_unlock(&pf_sendqueue_mtx)
+
+/*
+ * Queue for pf_overload_task() tasks.
+ */
+struct pf_overload_entry {
+ SLIST_ENTRY(pf_overload_entry) next;
+ struct pf_addr addr;
+ sa_family_t af;
+ uint8_t dir;
+ struct pf_rule *rule;
+};
+
+SLIST_HEAD(pf_overload_head, pf_overload_entry);
+static VNET_DEFINE(struct pf_overload_head, pf_overloadqueue);
+#define V_pf_overloadqueue VNET(pf_overloadqueue)
+static VNET_DEFINE(struct task, pf_overloadtask);
+#define V_pf_overloadtask VNET(pf_overloadtask)
+
+static struct mtx pf_overloadqueue_mtx;
+MTX_SYSINIT(pf_overloadqueue_mtx, &pf_overloadqueue_mtx,
+ "pf overload/flush queue", MTX_DEF);
+#define PF_OVERLOADQ_LOCK() mtx_lock(&pf_overloadqueue_mtx)
+#define PF_OVERLOADQ_UNLOCK() mtx_unlock(&pf_overloadqueue_mtx)
+
+VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules);
+struct mtx pf_unlnkdrules_mtx;
+MTX_SYSINIT(pf_unlnkdrules_mtx, &pf_unlnkdrules_mtx, "pf unlinked rules",
+ MTX_DEF);
+
+static VNET_DEFINE(uma_zone_t, pf_sources_z);
+#define V_pf_sources_z VNET(pf_sources_z)
+uma_zone_t pf_mtag_z;
+VNET_DEFINE(uma_zone_t, pf_state_z);
+VNET_DEFINE(uma_zone_t, pf_state_key_z);
+
+VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]);
+#define PFID_CPUBITS 8
+#define PFID_CPUSHIFT (sizeof(uint64_t) * NBBY - PFID_CPUBITS)
+#define PFID_CPUMASK ((uint64_t)((1 << PFID_CPUBITS) - 1) << PFID_CPUSHIFT)
+#define PFID_MAXID (~PFID_CPUMASK)
+CTASSERT((1 << PFID_CPUBITS) >= MAXCPU);
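
The PFID_* macros above split the 64-bit state ID into an 8-bit CPU number in the top bits and a 56-bit per-CPU counter below it; pf_state_insert() later fills the fields in and stores the result in big-endian form. A small sketch of the encode/decode arithmetic, using my own SKETCH_* names but the same 8/56 split:

#include <assert.h>
#include <stdint.h>

#define SKETCH_CPUBITS	8
#define SKETCH_CPUSHIFT	(64 - SKETCH_CPUBITS)
#define SKETCH_CPUMASK	((uint64_t)((1 << SKETCH_CPUBITS) - 1) << SKETCH_CPUSHIFT)

/* Pack a per-CPU counter value and the CPU number into one 64-bit ID. */
static uint64_t
stateid_encode(uint64_t counter, unsigned cpu)
{
	assert((counter & SKETCH_CPUMASK) == 0);  /* counter must fit in 56 bits */
	return (counter | ((uint64_t)cpu << SKETCH_CPUSHIFT));
}

int
main(void)
{
	uint64_t id = stateid_encode(0x123456789abcdeULL, 17);

	assert((id >> SKETCH_CPUSHIFT) == 17);			/* CPU field */
	assert((id & ~SKETCH_CPUMASK) == 0x123456789abcdeULL);	/* counter   */
	return (0);
}

Because every CPU owns a disjoint counter range, new IDs can be handed out without a global lock, which is what the per-CPU pf_stateid[MAXCPU] array is for.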
+
+static void pf_src_tree_remove_state(struct pf_state *);
+static void pf_init_threshold(struct pf_threshold *, u_int32_t,
+ u_int32_t);
+static void pf_add_threshold(struct pf_threshold *);
+static int pf_check_threshold(struct pf_threshold *);
+
+static void pf_change_ap(struct mbuf *, struct pf_addr *, u_int16_t *,
+ u_int16_t *, u_int16_t *, struct pf_addr *,
+ u_int16_t, u_int8_t, sa_family_t);
+static int pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
+ struct tcphdr *, struct pf_state_peer *);
+static void pf_change_icmp(struct pf_addr *, u_int16_t *,
+ struct pf_addr *, struct pf_addr *, u_int16_t,
+ u_int16_t *, u_int16_t *, u_int16_t *,
+ u_int16_t *, u_int8_t, sa_family_t);
+static void pf_send_tcp(struct mbuf *,
+ const struct pf_rule *, sa_family_t,
+ const struct pf_addr *, const struct pf_addr *,
+ u_int16_t, u_int16_t, u_int32_t, u_int32_t,
+ u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
+ u_int16_t, struct ifnet *);
+static void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
+ sa_family_t, struct pf_rule *);
+static void pf_detach_state(struct pf_state *);
+static int pf_state_key_attach(struct pf_state_key *,
+ struct pf_state_key *, struct pf_state *);
+static void pf_state_key_detach(struct pf_state *, int);
+static int pf_state_key_ctor(void *, int, void *, int);
+static u_int32_t pf_tcp_iss(struct pf_pdesc *);
+static int pf_test_rule(struct pf_rule **, struct pf_state **,
+ int, struct pfi_kif *, struct mbuf *, int,
+ struct pf_pdesc *, struct pf_rule **,
+ struct pf_ruleset **, struct inpcb *);
+static int pf_create_state(struct pf_rule *, struct pf_rule *,
+ struct pf_rule *, struct pf_pdesc *,
+ struct pf_src_node *, struct pf_state_key *,
+ struct pf_state_key *, struct mbuf *, int,
+ u_int16_t, u_int16_t, int *, struct pfi_kif *,
+ struct pf_state **, int, u_int16_t, u_int16_t,
+ int);
+static int pf_test_fragment(struct pf_rule **, int,
+ struct pfi_kif *, struct mbuf *, void *,
+ struct pf_pdesc *, struct pf_rule **,
+ struct pf_ruleset **);
+static int pf_tcp_track_full(struct pf_state_peer *,
+ struct pf_state_peer *, struct pf_state **,
+ struct pfi_kif *, struct mbuf *, int,
+ struct pf_pdesc *, u_short *, int *);
+static int pf_tcp_track_sloppy(struct pf_state_peer *,
+ struct pf_state_peer *, struct pf_state **,
+ struct pf_pdesc *, u_short *);
+static int pf_test_state_tcp(struct pf_state **, int,
+ struct pfi_kif *, struct mbuf *, int,
+ void *, struct pf_pdesc *, u_short *);
+static int pf_test_state_udp(struct pf_state **, int,
+ struct pfi_kif *, struct mbuf *, int,
+ void *, struct pf_pdesc *);
+static int pf_test_state_icmp(struct pf_state **, int,
+ struct pfi_kif *, struct mbuf *, int,
+ void *, struct pf_pdesc *, u_short *);
+static int pf_test_state_other(struct pf_state **, int,
+ struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
+static u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t,
+ sa_family_t);
+static u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t,
+ sa_family_t);
+static u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t,
+ int, u_int16_t);
+static int pf_check_proto_cksum(struct mbuf *, int, int,
+ u_int8_t, sa_family_t);
+static void pf_print_state_parts(struct pf_state *,
+ struct pf_state_key *, struct pf_state_key *);
+static int pf_addr_wrap_neq(struct pf_addr_wrap *,
+ struct pf_addr_wrap *);
+static struct pf_state *pf_find_state(struct pfi_kif *,
+ struct pf_state_key_cmp *, u_int);
+static int pf_src_connlimit(struct pf_state **);
+static void pf_overload_task(void *v, int pending);
+static int pf_insert_src_node(struct pf_src_node **,
+ struct pf_rule *, struct pf_addr *, sa_family_t);
+static u_int pf_purge_expired_states(u_int, int);
+static void pf_purge_unlinked_rules(void);
+static int pf_mtag_uminit(void *, int, int);
+static void pf_mtag_free(struct m_tag *);
+#ifdef INET
+static void pf_route(struct mbuf **, struct pf_rule *, int,
+ struct ifnet *, struct pf_state *,
+ struct pf_pdesc *);
+#endif /* INET */
+#ifdef INET6
+static void pf_change_a6(struct pf_addr *, u_int16_t *,
+ struct pf_addr *, u_int8_t);
+static void pf_route6(struct mbuf **, struct pf_rule *, int,
+ struct ifnet *, struct pf_state *,
+ struct pf_pdesc *);
+#endif /* INET6 */
+
+int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);
+
+extern int pf_end_threads;
+
+VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);
+
+#define PACKET_LOOPED(pd) ((pd)->pf_mtag && \
+ (pd)->pf_mtag->flags & PF_PACKET_LOOPED)
+
+#define STATE_LOOKUP(i, k, d, s, pd) \
+ do { \
+ (s) = pf_find_state((i), (k), (d)); \
+ if ((s) == NULL) \
+ return (PF_DROP); \
+ if (PACKET_LOOPED(pd)) \
+ return (PF_PASS); \
+ if ((d) == PF_OUT && \
+ (((s)->rule.ptr->rt == PF_ROUTETO && \
+ (s)->rule.ptr->direction == PF_OUT) || \
+ ((s)->rule.ptr->rt == PF_REPLYTO && \
+ (s)->rule.ptr->direction == PF_IN)) && \
+ (s)->rt_kif != NULL && \
+ (s)->rt_kif != (i)) \
+ return (PF_PASS); \
+ } while (0)
+
+#define BOUND_IFACE(r, k) \
+ ((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all
+
+#define STATE_INC_COUNTERS(s) \
+ do { \
+ counter_u64_add(s->rule.ptr->states_cur, 1); \
+ counter_u64_add(s->rule.ptr->states_tot, 1); \
+ if (s->anchor.ptr != NULL) { \
+ counter_u64_add(s->anchor.ptr->states_cur, 1); \
+ counter_u64_add(s->anchor.ptr->states_tot, 1); \
+ } \
+ if (s->nat_rule.ptr != NULL) { \
+ counter_u64_add(s->nat_rule.ptr->states_cur, 1);\
+ counter_u64_add(s->nat_rule.ptr->states_tot, 1);\
+ } \
+ } while (0)
+
+#define STATE_DEC_COUNTERS(s) \
+ do { \
+ if (s->nat_rule.ptr != NULL) \
+ counter_u64_add(s->nat_rule.ptr->states_cur, -1);\
+ if (s->anchor.ptr != NULL) \
+ counter_u64_add(s->anchor.ptr->states_cur, -1); \
+ counter_u64_add(s->rule.ptr->states_cur, -1); \
+ } while (0)
+
+static MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
+VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
+VNET_DEFINE(struct pf_idhash *, pf_idhash);
+VNET_DEFINE(struct pf_srchash *, pf_srchash);
+
+SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)");
+
+u_long pf_hashmask;
+u_long pf_srchashmask;
+static u_long pf_hashsize;
+static u_long pf_srchashsize;
+
+SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN,
+ &pf_hashsize, 0, "Size of pf(4) states hashtable");
+SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN,
+ &pf_srchashsize, 0, "Size of pf(4) source nodes hashtable");
+
+VNET_DEFINE(void *, pf_swi_cookie);
+
+VNET_DEFINE(uint32_t, pf_hashseed);
+#define V_pf_hashseed VNET(pf_hashseed)
+
+int
+pf_addr_cmp(struct pf_addr *a, struct pf_addr *b, sa_family_t af)
+{
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ if (a->addr32[0] > b->addr32[0])
+ return (1);
+ if (a->addr32[0] < b->addr32[0])
+ return (-1);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (a->addr32[3] > b->addr32[3])
+ return (1);
+ if (a->addr32[3] < b->addr32[3])
+ return (-1);
+ if (a->addr32[2] > b->addr32[2])
+ return (1);
+ if (a->addr32[2] < b->addr32[2])
+ return (-1);
+ if (a->addr32[1] > b->addr32[1])
+ return (1);
+ if (a->addr32[1] < b->addr32[1])
+ return (-1);
+ if (a->addr32[0] > b->addr32[0])
+ return (1);
+ if (a->addr32[0] < b->addr32[0])
+ return (-1);
+ break;
+#endif /* INET6 */
+ default:
+ panic("%s: unknown address family %u", __func__, af);
+ }
+ return (0);
+}
+
+static __inline uint32_t
+pf_hashkey(struct pf_state_key *sk)
+{
+ uint32_t h;
+
+ h = murmur3_32_hash32((uint32_t *)sk,
+ sizeof(struct pf_state_key_cmp)/sizeof(uint32_t),
+ V_pf_hashseed);
+
+ return (h & pf_hashmask);
+}
+
+static __inline uint32_t
+pf_hashsrc(struct pf_addr *addr, sa_family_t af)
+{
+ uint32_t h;
+
+ switch (af) {
+ case AF_INET:
+ h = murmur3_32_hash32((uint32_t *)&addr->v4,
+ sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed);
+ break;
+ case AF_INET6:
+ h = murmur3_32_hash32((uint32_t *)&addr->v6,
+ sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed);
+ break;
+ default:
+ panic("%s: unknown address family %u", __func__, af);
+ }
+
+ return (h & pf_srchashmask);
+}
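
Both hash functions finish with h & pf_hashmask (or pf_srchashmask), mapping the 32-bit Murmur3 value onto a table slot. That only works because pf_initialize() forces both table sizes to powers of two, so mask = size - 1 selects exactly the low bits. A tiny sketch of the idiom follows; mix32() is a stand-in mixer of my own, since murmur3_32_hash32() is a kernel helper.

#include <assert.h>
#include <stdint.h>

/* Stand-in 32-bit mixer; the kernel uses murmur3_32_hash32(). */
static uint32_t
mix32(uint32_t x)
{
	x ^= x >> 16;
	x *= 0x7feb352dU;
	x ^= x >> 15;
	x *= 0x846ca68bU;
	x ^= x >> 16;
	return (x);
}

int
main(void)
{
	const uint32_t size = 1024;		/* must be a power of two */
	const uint32_t mask = size - 1;

	assert((size & (size - 1)) == 0);	/* the powerof2() check */

	for (uint32_t key = 0; key < 100000; key++) {
		uint32_t slot = mix32(key) & mask;

		/* Masking equals modulo when the size is a power of two. */
		assert(slot == mix32(key) % size);
		assert(slot < size);
	}
	return (0);
}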
+
+#ifdef ALTQ
+static int
+pf_state_hash(struct pf_state *s)
+{
+ u_int32_t hv = (intptr_t)s / sizeof(*s);
+
+ hv ^= crc32(&s->src, sizeof(s->src));
+ hv ^= crc32(&s->dst, sizeof(s->dst));
+ if (hv == 0)
+ hv = 1;
+ return (hv);
+}
+#endif
+
+#ifdef INET6
+void
+pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ dst->addr32[0] = src->addr32[0];
+ break;
+#endif /* INET */
+ case AF_INET6:
+ dst->addr32[0] = src->addr32[0];
+ dst->addr32[1] = src->addr32[1];
+ dst->addr32[2] = src->addr32[2];
+ dst->addr32[3] = src->addr32[3];
+ break;
+ }
+}
+#endif /* INET6 */
+
+static void
+pf_init_threshold(struct pf_threshold *threshold,
+ u_int32_t limit, u_int32_t seconds)
+{
+ threshold->limit = limit * PF_THRESHOLD_MULT;
+ threshold->seconds = seconds;
+ threshold->count = 0;
+ threshold->last = time_uptime;
+}
+
+static void
+pf_add_threshold(struct pf_threshold *threshold)
+{
+ u_int32_t t = time_uptime, diff = t - threshold->last;
+
+ if (diff >= threshold->seconds)
+ threshold->count = 0;
+ else
+ threshold->count -= threshold->count * diff /
+ threshold->seconds;
+ threshold->count += PF_THRESHOLD_MULT;
+ threshold->last = t;
+}
+
+static int
+pf_check_threshold(struct pf_threshold *threshold)
+{
+ return (threshold->count > threshold->limit);
+}
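
pf_add_threshold() keeps a decaying connection-rate counter in fixed point: the stored count is scaled by PF_THRESHOLD_MULT, decays linearly with the time elapsed since the last event, and gains one scaled unit per event; pf_check_threshold() then compares it against limit * PF_THRESHOLD_MULT. The sketch below reproduces the arithmetic in userland with an assumed scale factor of 1000 and a hand-rolled clock; the real constant is defined in pfvar.h and may differ.

#include <stdint.h>
#include <stdio.h>

#define SKETCH_MULT	1000		/* assumed scale factor */

struct sketch_threshold {
	uint32_t	limit;		/* scaled: allowed events per window */
	uint32_t	seconds;	/* window length                     */
	uint32_t	count;		/* scaled, decaying counter          */
	uint32_t	last;		/* time of last event                */
};

static void
sketch_add(struct sketch_threshold *th, uint32_t now)
{
	uint32_t diff = now - th->last;

	if (diff >= th->seconds)
		th->count = 0;			/* whole window elapsed */
	else
		th->count -= th->count * diff / th->seconds;
	th->count += SKETCH_MULT;
	th->last = now;
}

int
main(void)
{
	/* Allow 3 connections per 10 seconds. */
	struct sketch_threshold th = { 3 * SKETCH_MULT, 10, 0, 0 };
	uint32_t now = 0;

	for (int i = 0; i < 4; i++) {
		sketch_add(&th, now);
		now += 1;
		printf("count=%u over limit=%s\n", th.count,
		    th.count > th.limit ? "yes" : "no");
	}
	return (0);
}

Run as-is, the fourth back-to-back event pushes the counter over the limit, which is the condition pf_check_threshold() reports.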
+
+static int
+pf_src_connlimit(struct pf_state **state)
+{
+ struct pf_overload_entry *pfoe;
+ int bad = 0;
+
+ PF_STATE_LOCK_ASSERT(*state);
+
+ (*state)->src_node->conn++;
+ (*state)->src.tcp_est = 1;
+ pf_add_threshold(&(*state)->src_node->conn_rate);
+
+ if ((*state)->rule.ptr->max_src_conn &&
+ (*state)->rule.ptr->max_src_conn <
+ (*state)->src_node->conn) {
+ counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1);
+ bad++;
+ }
+
+ if ((*state)->rule.ptr->max_src_conn_rate.limit &&
+ pf_check_threshold(&(*state)->src_node->conn_rate)) {
+ counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1);
+ bad++;
+ }
+
+ if (!bad)
+ return (0);
+
+ /* Kill this state. */
+ (*state)->timeout = PFTM_PURGE;
+ (*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
+
+ if ((*state)->rule.ptr->overload_tbl == NULL)
+ return (1);
+
+ /* Schedule overloading and flushing task. */
+ pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT);
+ if (pfoe == NULL)
+ return (1); /* too bad :( */
+
+ bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr));
+ pfoe->af = (*state)->key[PF_SK_WIRE]->af;
+ pfoe->rule = (*state)->rule.ptr;
+ pfoe->dir = (*state)->direction;
+ PF_OVERLOADQ_LOCK();
+ SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next);
+ PF_OVERLOADQ_UNLOCK();
+ taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask);
+
+ return (1);
+}
+
+static void
+pf_overload_task(void *v, int pending)
+{
+ struct pf_overload_head queue;
+ struct pfr_addr p;
+ struct pf_overload_entry *pfoe, *pfoe1;
+ uint32_t killed = 0;
+
+ CURVNET_SET((struct vnet *)v);
+
+ PF_OVERLOADQ_LOCK();
+ queue = V_pf_overloadqueue;
+ SLIST_INIT(&V_pf_overloadqueue);
+ PF_OVERLOADQ_UNLOCK();
+
+ bzero(&p, sizeof(p));
+ SLIST_FOREACH(pfoe, &queue, next) {
+ counter_u64_add(V_pf_status.lcounters[LCNT_OVERLOAD_TABLE], 1);
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("%s: blocking address ", __func__);
+ pf_print_host(&pfoe->addr, 0, pfoe->af);
+ printf("\n");
+ }
+
+ p.pfra_af = pfoe->af;
+ switch (pfoe->af) {
+#ifdef INET
+ case AF_INET:
+ p.pfra_net = 32;
+ p.pfra_ip4addr = pfoe->addr.v4;
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ p.pfra_net = 128;
+ p.pfra_ip6addr = pfoe->addr.v6;
+ break;
+#endif
+ }
+
+ PF_RULES_WLOCK();
+ pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second);
+ PF_RULES_WUNLOCK();
+ }
+
+ /*
+	 * Remove those entries that don't need flushing.
+ */
+ SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
+ if (pfoe->rule->flush == 0) {
+ SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next);
+ free(pfoe, M_PFTEMP);
+ } else
+ counter_u64_add(
+ V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH], 1);
+
+ /* If nothing to flush, return. */
+ if (SLIST_EMPTY(&queue)) {
+ CURVNET_RESTORE();
+ return;
+ }
+
+ for (int i = 0; i <= pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+ struct pf_state_key *sk;
+ struct pf_state *s;
+
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ sk = s->key[PF_SK_WIRE];
+ SLIST_FOREACH(pfoe, &queue, next)
+ if (sk->af == pfoe->af &&
+ ((pfoe->rule->flush & PF_FLUSH_GLOBAL) ||
+ pfoe->rule == s->rule.ptr) &&
+ ((pfoe->dir == PF_OUT &&
+ PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) ||
+ (pfoe->dir == PF_IN &&
+ PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) {
+ s->timeout = PFTM_PURGE;
+ s->src.state = s->dst.state = TCPS_CLOSED;
+ killed++;
+ }
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+ SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
+ free(pfoe, M_PFTEMP);
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+		printf("%s: %u states killed\n", __func__, killed);
+
+ CURVNET_RESTORE();
+}
+
+/*
+ * Can return with the hash row locked on failure (when returnlocked is
+ * set), so that the caller can consistently allocate and insert a new node.
+ */
+struct pf_src_node *
+pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af,
+ int returnlocked)
+{
+ struct pf_srchash *sh;
+ struct pf_src_node *n;
+
+ counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1);
+
+ sh = &V_pf_srchash[pf_hashsrc(src, af)];
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH(n, &sh->nodes, entry)
+ if (n->rule.ptr == rule && n->af == af &&
+ ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) ||
+ (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0)))
+ break;
+ if (n != NULL) {
+ n->states++;
+ PF_HASHROW_UNLOCK(sh);
+ } else if (returnlocked == 0)
+ PF_HASHROW_UNLOCK(sh);
+
+ return (n);
+}
+
+static int
+pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
+ struct pf_addr *src, sa_family_t af)
+{
+
+ KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK ||
+ rule->rpool.opts & PF_POOL_STICKYADDR),
+ ("%s for non-tracking rule %p", __func__, rule));
+
+ if (*sn == NULL)
+ *sn = pf_find_src_node(src, rule, af, 1);
+
+ if (*sn == NULL) {
+ struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)];
+
+ PF_HASHROW_ASSERT(sh);
+
+ if (!rule->max_src_nodes ||
+ counter_u64_fetch(rule->src_nodes) < rule->max_src_nodes)
+ (*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
+ else
+ counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES],
+ 1);
+ if ((*sn) == NULL) {
+ PF_HASHROW_UNLOCK(sh);
+ return (-1);
+ }
+
+ pf_init_threshold(&(*sn)->conn_rate,
+ rule->max_src_conn_rate.limit,
+ rule->max_src_conn_rate.seconds);
+
+ (*sn)->af = af;
+ (*sn)->rule.ptr = rule;
+ PF_ACPY(&(*sn)->addr, src, af);
+ LIST_INSERT_HEAD(&sh->nodes, *sn, entry);
+ (*sn)->creation = time_uptime;
+ (*sn)->ruletype = rule->action;
+ (*sn)->states = 1;
+ if ((*sn)->rule.ptr != NULL)
+ counter_u64_add((*sn)->rule.ptr->src_nodes, 1);
+ PF_HASHROW_UNLOCK(sh);
+ counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1);
+ } else {
+ if (rule->max_src_states &&
+ (*sn)->states >= rule->max_src_states) {
+ counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES],
+ 1);
+ return (-1);
+ }
+ }
+ return (0);
+}
+
+void
+pf_unlink_src_node(struct pf_src_node *src)
+{
+
+ PF_HASHROW_ASSERT(&V_pf_srchash[pf_hashsrc(&src->addr, src->af)]);
+ LIST_REMOVE(src, entry);
+ if (src->rule.ptr)
+ counter_u64_add(src->rule.ptr->src_nodes, -1);
+}
+
+u_int
+pf_free_src_nodes(struct pf_src_node_list *head)
+{
+ struct pf_src_node *sn, *tmp;
+ u_int count = 0;
+
+ LIST_FOREACH_SAFE(sn, head, entry, tmp) {
+ uma_zfree(V_pf_sources_z, sn);
+ count++;
+ }
+
+ counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], count);
+
+ return (count);
+}
+
+void
+pf_mtag_initialize()
+{
+
+ pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
+ sizeof(struct pf_mtag), NULL, NULL, pf_mtag_uminit, NULL,
+ UMA_ALIGN_PTR, 0);
+}
+
+/* Per-vnet data storage structures initialization. */
+void
+pf_initialize()
+{
+ struct pf_keyhash *kh;
+ struct pf_idhash *ih;
+ struct pf_srchash *sh;
+ u_int i;
+
+ if (pf_hashsize == 0 || !powerof2(pf_hashsize))
+ pf_hashsize = PF_HASHSIZ;
+ if (pf_srchashsize == 0 || !powerof2(pf_srchashsize))
+ pf_srchashsize = PF_HASHSIZ / 4;
+
+ V_pf_hashseed = arc4random();
+
+ /* States and state keys storage. */
+ V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
+ uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
+ uma_zone_set_warning(V_pf_state_z, "PF states limit reached");
+
+ V_pf_state_key_z = uma_zcreate("pf state keys",
+ sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ V_pf_keyhash = malloc(pf_hashsize * sizeof(struct pf_keyhash),
+ M_PFHASH, M_WAITOK | M_ZERO);
+ V_pf_idhash = malloc(pf_hashsize * sizeof(struct pf_idhash),
+ M_PFHASH, M_WAITOK | M_ZERO);
+ pf_hashmask = pf_hashsize - 1;
+ for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
+ i++, kh++, ih++) {
+ mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK);
+ mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
+ }
+
+ /* Source nodes. */
+ V_pf_sources_z = uma_zcreate("pf source nodes",
+ sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
+ 0);
+ V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
+ uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
+ uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached");
+ V_pf_srchash = malloc(pf_srchashsize * sizeof(struct pf_srchash),
+ M_PFHASH, M_WAITOK|M_ZERO);
+ pf_srchashmask = pf_srchashsize - 1;
+ for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++)
+ mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);
+
+ /* ALTQ */
+ TAILQ_INIT(&V_pf_altqs[0]);
+ TAILQ_INIT(&V_pf_altqs[1]);
+ TAILQ_INIT(&V_pf_pabuf);
+ V_pf_altqs_active = &V_pf_altqs[0];
+ V_pf_altqs_inactive = &V_pf_altqs[1];
+
+ /* Send & overload+flush queues. */
+ STAILQ_INIT(&V_pf_sendqueue);
+ SLIST_INIT(&V_pf_overloadqueue);
+ TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet);
+
+	/* Rules that are unlinked but may still be referenced. */
+ TAILQ_INIT(&V_pf_unlinked_rules);
+}
+
+void
+pf_mtag_cleanup()
+{
+
+ uma_zdestroy(pf_mtag_z);
+}
+
+void
+pf_cleanup()
+{
+ struct pf_keyhash *kh;
+ struct pf_idhash *ih;
+ struct pf_srchash *sh;
+ struct pf_send_entry *pfse, *next;
+ u_int i;
+
+ for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
+ i++, kh++, ih++) {
+ KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
+ __func__));
+ KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
+ __func__));
+ mtx_destroy(&kh->lock);
+ mtx_destroy(&ih->lock);
+ }
+ free(V_pf_keyhash, M_PFHASH);
+ free(V_pf_idhash, M_PFHASH);
+
+ for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) {
+ KASSERT(LIST_EMPTY(&sh->nodes),
+ ("%s: source node hash not empty", __func__));
+ mtx_destroy(&sh->lock);
+ }
+ free(V_pf_srchash, M_PFHASH);
+
+ STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
+ m_freem(pfse->pfse_m);
+ free(pfse, M_PFTEMP);
+ }
+
+ uma_zdestroy(V_pf_sources_z);
+ uma_zdestroy(V_pf_state_z);
+ uma_zdestroy(V_pf_state_key_z);
+}
+
+static int
+pf_mtag_uminit(void *mem, int size, int how)
+{
+ struct m_tag *t;
+
+ t = (struct m_tag *)mem;
+ t->m_tag_cookie = MTAG_ABI_COMPAT;
+ t->m_tag_id = PACKET_TAG_PF;
+ t->m_tag_len = sizeof(struct pf_mtag);
+ t->m_tag_free = pf_mtag_free;
+
+ return (0);
+}
+
+static void
+pf_mtag_free(struct m_tag *t)
+{
+
+ uma_zfree(pf_mtag_z, t);
+}
+
+struct pf_mtag *
+pf_get_mtag(struct mbuf *m)
+{
+ struct m_tag *mtag;
+
+ if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL)
+ return ((struct pf_mtag *)(mtag + 1));
+
+ mtag = uma_zalloc(pf_mtag_z, M_NOWAIT);
+ if (mtag == NULL)
+ return (NULL);
+ bzero(mtag + 1, sizeof(struct pf_mtag));
+ m_tag_prepend(m, mtag);
+
+ return ((struct pf_mtag *)(mtag + 1));
+}
+
+static int
+pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
+ struct pf_state *s)
+{
+ struct pf_keyhash *khs, *khw, *kh;
+ struct pf_state_key *sk, *cur;
+ struct pf_state *si, *olds = NULL;
+ int idx;
+
+ KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
+ KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
+ KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));
+
+ /*
+ * We need to lock hash slots of both keys. To avoid deadlock
+ * we always lock the slot with lower address first. Unlock order
+ * isn't important.
+ *
+ * We also need to lock ID hash slot before dropping key
+ * locks. On success we return with ID hash slot locked.
+ */
+
+ if (skw == sks) {
+ khs = khw = &V_pf_keyhash[pf_hashkey(skw)];
+ PF_HASHROW_LOCK(khs);
+ } else {
+ khs = &V_pf_keyhash[pf_hashkey(sks)];
+ khw = &V_pf_keyhash[pf_hashkey(skw)];
+ if (khs == khw) {
+ PF_HASHROW_LOCK(khs);
+ } else if (khs < khw) {
+ PF_HASHROW_LOCK(khs);
+ PF_HASHROW_LOCK(khw);
+ } else {
+ PF_HASHROW_LOCK(khw);
+ PF_HASHROW_LOCK(khs);
+ }
+ }
+
+#define KEYS_UNLOCK() do { \
+ if (khs != khw) { \
+ PF_HASHROW_UNLOCK(khs); \
+ PF_HASHROW_UNLOCK(khw); \
+ } else \
+ PF_HASHROW_UNLOCK(khs); \
+} while (0)
+
+ /*
+ * First run: start with wire key.
+ */
+ sk = skw;
+ kh = khw;
+ idx = PF_SK_WIRE;
+
+keyattach:
+ LIST_FOREACH(cur, &kh->keys, entry)
+ if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
+ break;
+
+ if (cur != NULL) {
+		/* Key exists. Attach to it unless a state on the same kif conflicts. */
+ TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
+ struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];
+
+ PF_HASHROW_LOCK(ih);
+ if (si->kif == s->kif &&
+ si->direction == s->direction) {
+ if (sk->proto == IPPROTO_TCP &&
+ si->src.state >= TCPS_FIN_WAIT_2 &&
+ si->dst.state >= TCPS_FIN_WAIT_2) {
+ /*
+ * New state matches an old >FIN_WAIT_2
+ * state. We can't drop key hash locks,
+ * thus we can't unlink it properly.
+ *
+ * As a workaround we drop it into
+ * TCPS_CLOSED state, schedule purge
+ * ASAP and push it into the very end
+ * of the slot TAILQ, so that it won't
+ * conflict with our new state.
+ */
+ si->src.state = si->dst.state =
+ TCPS_CLOSED;
+ si->timeout = PFTM_PURGE;
+ olds = si;
+ } else {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: %s key attach "
+ "failed on %s: ",
+ (idx == PF_SK_WIRE) ?
+ "wire" : "stack",
+ s->kif->pfik_name);
+ pf_print_state_parts(s,
+ (idx == PF_SK_WIRE) ?
+ sk : NULL,
+ (idx == PF_SK_STACK) ?
+ sk : NULL);
+ printf(", existing: ");
+ pf_print_state_parts(si,
+ (idx == PF_SK_WIRE) ?
+ sk : NULL,
+ (idx == PF_SK_STACK) ?
+ sk : NULL);
+ printf("\n");
+ }
+ PF_HASHROW_UNLOCK(ih);
+ KEYS_UNLOCK();
+ uma_zfree(V_pf_state_key_z, sk);
+ if (idx == PF_SK_STACK)
+ pf_detach_state(s);
+ return (EEXIST); /* collision! */
+ }
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+ uma_zfree(V_pf_state_key_z, sk);
+ s->key[idx] = cur;
+ } else {
+ LIST_INSERT_HEAD(&kh->keys, sk, entry);
+ s->key[idx] = sk;
+ }
+
+stateattach:
+ /* List is sorted, if-bound states before floating. */
+ if (s->kif == V_pfi_all)
+ TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
+ else
+ TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);
+
+ if (olds) {
+ TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]);
+ TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds,
+ key_list[idx]);
+ olds = NULL;
+ }
+
+ /*
+	 * Attach done. Now decide whether (and how) to
+	 * attach a second key.
+ */
+ if (sks == skw) {
+ s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
+ idx = PF_SK_STACK;
+ sks = NULL;
+ goto stateattach;
+ } else if (sks != NULL) {
+ /*
+ * Continue attaching with stack key.
+ */
+ sk = sks;
+ kh = khs;
+ idx = PF_SK_STACK;
+ sks = NULL;
+ goto keyattach;
+ }
+
+ PF_STATE_LOCK(s);
+ KEYS_UNLOCK();
+
+ KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
+ ("%s failure", __func__));
+
+ return (0);
+#undef KEYS_UNLOCK
+}
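
The locking comment at the top of pf_state_key_attach() is the classic deadlock-avoidance rule: when two hash rows must be held at once, always take the one at the lower address first, and release in any order. A generic userland illustration with POSIX mutexes (the helper names are mine, not pf's):

#include <pthread.h>

/*
 * Lock two rows in a canonical order (lower address first) so that two
 * threads taking the same pair in opposite "logical" order can never
 * deadlock.  Mirrors the khs/khw handling in pf_state_key_attach().
 */
static void
lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
	} else if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void
unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	/* Unlock order doesn't matter; just avoid a double unlock. */
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int
main(void)
{
	pthread_mutex_t rows[2] = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
	};

	lock_pair(&rows[1], &rows[0]);	/* arguments in "wrong" order is fine */
	unlock_pair(&rows[1], &rows[0]);
	return (0);
}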
+
+static void
+pf_detach_state(struct pf_state *s)
+{
+ struct pf_state_key *sks = s->key[PF_SK_STACK];
+ struct pf_keyhash *kh;
+
+ if (sks != NULL) {
+ kh = &V_pf_keyhash[pf_hashkey(sks)];
+ PF_HASHROW_LOCK(kh);
+ if (s->key[PF_SK_STACK] != NULL)
+ pf_state_key_detach(s, PF_SK_STACK);
+ /*
+		 * If both point to the same key, then we are done.
+ */
+ if (sks == s->key[PF_SK_WIRE]) {
+ pf_state_key_detach(s, PF_SK_WIRE);
+ PF_HASHROW_UNLOCK(kh);
+ return;
+ }
+ PF_HASHROW_UNLOCK(kh);
+ }
+
+ if (s->key[PF_SK_WIRE] != NULL) {
+ kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
+ PF_HASHROW_LOCK(kh);
+ if (s->key[PF_SK_WIRE] != NULL)
+ pf_state_key_detach(s, PF_SK_WIRE);
+ PF_HASHROW_UNLOCK(kh);
+ }
+}
+
+static void
+pf_state_key_detach(struct pf_state *s, int idx)
+{
+ struct pf_state_key *sk = s->key[idx];
+#ifdef INVARIANTS
+ struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];
+
+ PF_HASHROW_ASSERT(kh);
+#endif
+ TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]);
+ s->key[idx] = NULL;
+
+ if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) {
+ LIST_REMOVE(sk, entry);
+ uma_zfree(V_pf_state_key_z, sk);
+ }
+}
+
+static int
+pf_state_key_ctor(void *mem, int size, void *arg, int flags)
+{
+ struct pf_state_key *sk = mem;
+
+ bzero(sk, sizeof(struct pf_state_key_cmp));
+ TAILQ_INIT(&sk->states[PF_SK_WIRE]);
+ TAILQ_INIT(&sk->states[PF_SK_STACK]);
+
+ return (0);
+}
+
+struct pf_state_key *
+pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr,
+ struct pf_addr *daddr, u_int16_t sport, u_int16_t dport)
+{
+ struct pf_state_key *sk;
+
+ sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
+ if (sk == NULL)
+ return (NULL);
+
+ PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af);
+ PF_ACPY(&sk->addr[pd->didx], daddr, pd->af);
+ sk->port[pd->sidx] = sport;
+ sk->port[pd->didx] = dport;
+ sk->proto = pd->proto;
+ sk->af = pd->af;
+
+ return (sk);
+}
+
+struct pf_state_key *
+pf_state_key_clone(struct pf_state_key *orig)
+{
+ struct pf_state_key *sk;
+
+ sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
+ if (sk == NULL)
+ return (NULL);
+
+ bcopy(orig, sk, sizeof(struct pf_state_key_cmp));
+
+ return (sk);
+}
+
+int
+pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
+ struct pf_state_key *sks, struct pf_state *s)
+{
+ struct pf_idhash *ih;
+ struct pf_state *cur;
+ int error;
+
+ KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
+ ("%s: sks not pristine", __func__));
+ KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
+ ("%s: skw not pristine", __func__));
+ KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
+
+ s->kif = kif;
+
+ if (s->id == 0 && s->creatorid == 0) {
+ /* XXX: should be atomic, but probability of collision low */
+ if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID)
+ V_pf_stateid[curcpu] = 1;
+ s->id |= (uint64_t )curcpu << PFID_CPUSHIFT;
+ s->id = htobe64(s->id);
+ s->creatorid = V_pf_status.hostid;
+ }
+
+ /* Returns with ID locked on success. */
+ if ((error = pf_state_key_attach(skw, sks, s)) != 0)
+ return (error);
+
+ ih = &V_pf_idhash[PF_IDHASH(s)];
+ PF_HASHROW_ASSERT(ih);
+ LIST_FOREACH(cur, &ih->states, entry)
+ if (cur->id == s->id && cur->creatorid == s->creatorid)
+ break;
+
+ if (cur != NULL) {
+ PF_HASHROW_UNLOCK(ih);
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: state ID collision: "
+ "id: %016llx creatorid: %08x\n",
+ (unsigned long long)be64toh(s->id),
+ ntohl(s->creatorid));
+ }
+ pf_detach_state(s);
+ return (EEXIST);
+ }
+ LIST_INSERT_HEAD(&ih->states, s, entry);
+ /* One for keys, one for ID hash. */
+ refcount_init(&s->refs, 2);
+
+ counter_u64_add(V_pf_status.fcounters[FCNT_STATE_INSERT], 1);
+ if (pfsync_insert_state_ptr != NULL)
+ pfsync_insert_state_ptr(s);
+
+ /* Returns locked. */
+ return (0);
+}
+
+/*
+ * Find state by ID: returns with locked row on success.
+ */
+struct pf_state *
+pf_find_state_byid(uint64_t id, uint32_t creatorid)
+{
+ struct pf_idhash *ih;
+ struct pf_state *s;
+
+ counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
+
+ ih = &V_pf_idhash[(be64toh(id) % (pf_hashmask + 1))];
+
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry)
+ if (s->id == id && s->creatorid == creatorid)
+ break;
+
+ if (s == NULL)
+ PF_HASHROW_UNLOCK(ih);
+
+ return (s);
+}
+
+/*
+ * Find state by key.
+ * Returns with ID hash slot locked on success.
+ */
+static struct pf_state *
+pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir)
+{
+ struct pf_keyhash *kh;
+ struct pf_state_key *sk;
+ struct pf_state *s;
+ int idx;
+
+ counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
+
+ kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
+
+ PF_HASHROW_LOCK(kh);
+ LIST_FOREACH(sk, &kh->keys, entry)
+ if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
+ break;
+ if (sk == NULL) {
+ PF_HASHROW_UNLOCK(kh);
+ return (NULL);
+ }
+
+ idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK);
+
+ /* List is sorted, if-bound states before floating ones. */
+ TAILQ_FOREACH(s, &sk->states[idx], key_list[idx])
+ if (s->kif == V_pfi_all || s->kif == kif) {
+ PF_STATE_LOCK(s);
+ PF_HASHROW_UNLOCK(kh);
+ if (s->timeout >= PFTM_MAX) {
+ /*
+ * State is either being processed by
+				 * pf_unlink_state() in another thread, or
+ * is scheduled for immediate expiry.
+ */
+ PF_STATE_UNLOCK(s);
+ return (NULL);
+ }
+ return (s);
+ }
+ PF_HASHROW_UNLOCK(kh);
+
+ return (NULL);
+}
+
+struct pf_state *
+pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
+{
+ struct pf_keyhash *kh;
+ struct pf_state_key *sk;
+ struct pf_state *s, *ret = NULL;
+ int idx, inout = 0;
+
+ counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
+
+ kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
+
+ PF_HASHROW_LOCK(kh);
+ LIST_FOREACH(sk, &kh->keys, entry)
+ if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
+ break;
+ if (sk == NULL) {
+ PF_HASHROW_UNLOCK(kh);
+ return (NULL);
+ }
+ switch (dir) {
+ case PF_IN:
+ idx = PF_SK_WIRE;
+ break;
+ case PF_OUT:
+ idx = PF_SK_STACK;
+ break;
+ case PF_INOUT:
+ idx = PF_SK_WIRE;
+ inout = 1;
+ break;
+ default:
+ panic("%s: dir %u", __func__, dir);
+ }
+second_run:
+ TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
+ if (more == NULL) {
+ PF_HASHROW_UNLOCK(kh);
+ return (s);
+ }
+
+ if (ret)
+ (*more)++;
+ else
+ ret = s;
+ }
+ if (inout == 1) {
+ inout = 0;
+ idx = PF_SK_STACK;
+ goto second_run;
+ }
+ PF_HASHROW_UNLOCK(kh);
+
+ return (ret);
+}
+
+/* END state table stuff */
+
+static void
+pf_send(struct pf_send_entry *pfse)
+{
+
+ PF_SENDQ_LOCK();
+ STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
+ PF_SENDQ_UNLOCK();
+ swi_sched(V_pf_swi_cookie, 0);
+}
+
+void
+pf_intr(void *v)
+{
+ struct pf_send_head queue;
+ struct pf_send_entry *pfse, *next;
+
+ CURVNET_SET((struct vnet *)v);
+
+ PF_SENDQ_LOCK();
+ queue = V_pf_sendqueue;
+ STAILQ_INIT(&V_pf_sendqueue);
+ PF_SENDQ_UNLOCK();
+
+ STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
+ switch (pfse->pfse_type) {
+#ifdef INET
+ case PFSE_IP:
+ ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL);
+ break;
+ case PFSE_ICMP:
+ icmp_error(pfse->pfse_m, pfse->icmpopts.type,
+ pfse->icmpopts.code, 0, pfse->icmpopts.mtu);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case PFSE_IP6:
+ ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL,
+ NULL);
+ break;
+ case PFSE_ICMP6:
+ icmp6_error(pfse->pfse_m, pfse->icmpopts.type,
+ pfse->icmpopts.code, pfse->icmpopts.mtu);
+ break;
+#endif /* INET6 */
+ default:
+ panic("%s: unknown type", __func__);
+ }
+ free(pfse, M_PFTEMP);
+ }
+ CURVNET_RESTORE();
+}
+
+void
+pf_purge_thread(void *unused __unused)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+ u_int idx = 0;
+
+ for (;;) {
+ PF_RULES_RLOCK();
+ rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftm", hz / 10);
+ PF_RULES_RUNLOCK();
+
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+
+ if (pf_end_threads) {
+ pf_end_threads++;
+ wakeup(pf_purge_thread);
+ kproc_exit(0);
+ }
+
+ /* Process 1/interval fraction of the state table every run. */
+ idx = pf_purge_expired_states(idx, pf_hashmask /
+ (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));
+
+ /* Purge other expired types every PFTM_INTERVAL seconds. */
+ if (idx == 0) {
+ /*
+ * Order is important:
+ * - states and src nodes reference rules
+ * - states and rules reference kifs
+ */
+ pf_purge_expired_fragments();
+ pf_purge_expired_src_nodes();
+ pf_purge_unlinked_rules();
+ pfi_kif_purge();
+ }
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+ }
+ /* not reached */
+}
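+
+/*
+ * Example of the 1/interval fraction above, with made-up numbers: with
+ * pf_hashmask = 32767 and the default PFTM_INTERVAL of 10 seconds, each
+ * wakeup (every hz / 10, i.e. about ten times a second) checks roughly
+ * 32768 / (10 * 10) ~= 327 ID hash rows, so the whole state table is
+ * swept about once per PFTM_INTERVAL.
+ */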
+
+void
+pf_unload_vnet_purge(void)
+{
+
+ /*
+ * To clean up all kifs and rules we need
+ * two runs: the first one clears the reference
+ * flags, then pf_purge_expired_states() does not
+ * raise them again, and the second run frees.
+ */
+ pf_purge_unlinked_rules();
+ pfi_kif_purge();
+
+ /*
+ * Now purge everything.
+ */
+ pf_purge_expired_states(0, pf_hashmask);
+ pf_purge_expired_fragments();
+ pf_purge_expired_src_nodes();
+
+ /*
+ * Now all kifs & rules should be unreferenced,
+ * thus should be successfully freed.
+ */
+ pf_purge_unlinked_rules();
+ pfi_kif_purge();
+}
+
+
+u_int32_t
+pf_state_expires(const struct pf_state *state)
+{
+ u_int32_t timeout;
+ u_int32_t start;
+ u_int32_t end;
+ u_int32_t states;
+
+ /* handle all PFTM_* > PFTM_MAX here */
+ if (state->timeout == PFTM_PURGE)
+ return (time_uptime);
+ KASSERT(state->timeout != PFTM_UNLINKED,
+ ("pf_state_expires: timeout == PFTM_UNLINKED"));
+ KASSERT((state->timeout < PFTM_MAX),
+ ("pf_state_expires: timeout > PFTM_MAX"));
+ timeout = state->rule.ptr->timeout[state->timeout];
+ if (!timeout)
+ timeout = V_pf_default_rule.timeout[state->timeout];
+ start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
+ if (start) {
+ end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
+ states = counter_u64_fetch(state->rule.ptr->states_cur);
+ } else {
+ start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
+ end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
+ states = V_pf_status.states;
+ }
+ if (end && states > start && start < end) {
+ if (states < end)
+ return (state->expire + timeout * (end - states) /
+ (end - start));
+ else
+ return (time_uptime);
+ }
+ return (state->expire + timeout);
+}
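+
+/*
+ * Worked example of the adaptive scaling above, with hypothetical numbers:
+ * timeout = 60, adaptive.start = 6000, adaptive.end = 12000 and 9000
+ * states currently tracked.  Since start < states < end, the state
+ * expires 60 * (12000 - 9000) / (12000 - 6000) = 30 seconds after
+ * state->expire instead of the full 60; at 12000 states or more it is
+ * treated as already expired.
+ */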
+
+void
+pf_purge_expired_src_nodes()
+{
+ struct pf_src_node_list freelist;
+ struct pf_srchash *sh;
+ struct pf_src_node *cur, *next;
+ int i;
+
+ LIST_INIT(&freelist);
+ for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) {
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
+ if (cur->states == 0 && cur->expire <= time_uptime) {
+ pf_unlink_src_node(cur);
+ LIST_INSERT_HEAD(&freelist, cur, entry);
+ } else if (cur->rule.ptr != NULL)
+ cur->rule.ptr->rule_flag |= PFRULE_REFS;
+ PF_HASHROW_UNLOCK(sh);
+ }
+
+ pf_free_src_nodes(&freelist);
+
+ V_pf_status.src_nodes = uma_zone_get_cur(V_pf_sources_z);
+}
+
+static void
+pf_src_tree_remove_state(struct pf_state *s)
+{
+ struct pf_src_node *sn;
+ struct pf_srchash *sh;
+ uint32_t timeout;
+
+ timeout = s->rule.ptr->timeout[PFTM_SRC_NODE] ?
+ s->rule.ptr->timeout[PFTM_SRC_NODE] :
+ V_pf_default_rule.timeout[PFTM_SRC_NODE];
+
+ if (s->src_node != NULL) {
+ sn = s->src_node;
+ sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)];
+ PF_HASHROW_LOCK(sh);
+ if (s->src.tcp_est)
+ --sn->conn;
+ if (--sn->states == 0)
+ sn->expire = time_uptime + timeout;
+ PF_HASHROW_UNLOCK(sh);
+ }
+ if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
+ sn = s->nat_src_node;
+ sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)];
+ PF_HASHROW_LOCK(sh);
+ if (--sn->states == 0)
+ sn->expire = time_uptime + timeout;
+ PF_HASHROW_UNLOCK(sh);
+ }
+ s->src_node = s->nat_src_node = NULL;
+}
+
+/*
+ * Unlink and potentially free a state. Function may be
+ * called with ID hash row locked, but always returns
+ * unlocked, since it needs to go through key hash locking.
+ */
+int
+pf_unlink_state(struct pf_state *s, u_int flags)
+{
+ struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];
+
+ if ((flags & PF_ENTER_LOCKED) == 0)
+ PF_HASHROW_LOCK(ih);
+ else
+ PF_HASHROW_ASSERT(ih);
+
+ if (s->timeout == PFTM_UNLINKED) {
+ /*
+ * State is being processed
+ * by pf_unlink_state() in
+ * another thread.
+ */
+ PF_HASHROW_UNLOCK(ih);
+ return (0); /* XXXGL: undefined actually */
+ }
+
+ if (s->src.state == PF_TCPS_PROXY_DST) {
+ /* XXX wire key the right one? */
+ pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af,
+ &s->key[PF_SK_WIRE]->addr[1],
+ &s->key[PF_SK_WIRE]->addr[0],
+ s->key[PF_SK_WIRE]->port[1],
+ s->key[PF_SK_WIRE]->port[0],
+ s->src.seqhi, s->src.seqlo + 1,
+ TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL);
+ }
+
+ LIST_REMOVE(s, entry);
+ pf_src_tree_remove_state(s);
+
+ if (pfsync_delete_state_ptr != NULL)
+ pfsync_delete_state_ptr(s);
+
+ STATE_DEC_COUNTERS(s);
+
+ s->timeout = PFTM_UNLINKED;
+
+ PF_HASHROW_UNLOCK(ih);
+
+ pf_detach_state(s);
+ refcount_release(&s->refs);
+
+ return (pf_release_state(s));
+}
+
+void
+pf_free_state(struct pf_state *cur)
+{
+
+ KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
+ KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
+ cur->timeout));
+
+ pf_normalize_tcp_cleanup(cur);
+ uma_zfree(V_pf_state_z, cur);
+ counter_u64_add(V_pf_status.fcounters[FCNT_STATE_REMOVALS], 1);
+}
+
+/*
+ * Called only from pf_purge_thread(), thus serialized.
+ */
+static u_int
+pf_purge_expired_states(u_int i, int maxcheck)
+{
+ struct pf_idhash *ih;
+ struct pf_state *s;
+
+ V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
+
+ /*
+ * Go through hash and unlink states that expire now.
+ */
+ while (maxcheck > 0) {
+
+ ih = &V_pf_idhash[i];
+relock:
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ if (pf_state_expires(s) <= time_uptime) {
+ V_pf_status.states -=
+ pf_unlink_state(s, PF_ENTER_LOCKED);
+ goto relock;
+ }
+ s->rule.ptr->rule_flag |= PFRULE_REFS;
+ if (s->nat_rule.ptr != NULL)
+ s->nat_rule.ptr->rule_flag |= PFRULE_REFS;
+ if (s->anchor.ptr != NULL)
+ s->anchor.ptr->rule_flag |= PFRULE_REFS;
+ s->kif->pfik_flags |= PFI_IFLAG_REFS;
+ if (s->rt_kif)
+ s->rt_kif->pfik_flags |= PFI_IFLAG_REFS;
+ }
+ PF_HASHROW_UNLOCK(ih);
+
+ /* Return when we hit end of hash. */
+ if (++i > pf_hashmask) {
+ V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
+ return (0);
+ }
+
+ maxcheck--;
+ }
+
+ V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
+
+ return (i);
+}
+
+static void
+pf_purge_unlinked_rules()
+{
+ struct pf_rulequeue tmpq;
+ struct pf_rule *r, *r1;
+
+ /*
+ * If we have an overload task pending, then we'd
+ * better skip purging this time. There is a tiny
+ * probability that the overload task references
+ * an already unlinked rule.
+ */
+ PF_OVERLOADQ_LOCK();
+ if (!SLIST_EMPTY(&V_pf_overloadqueue)) {
+ PF_OVERLOADQ_UNLOCK();
+ return;
+ }
+ PF_OVERLOADQ_UNLOCK();
+
+ /*
+ * Do naive mark-and-sweep garbage collecting of old rules.
+ * Reference flag is raised by pf_purge_expired_states()
+ * and pf_purge_expired_src_nodes().
+ *
+ * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK,
+ * use a temporary queue.
+ */
+ TAILQ_INIT(&tmpq);
+ PF_UNLNKDRULES_LOCK();
+ TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
+ if (!(r->rule_flag & PFRULE_REFS)) {
+ TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
+ TAILQ_INSERT_TAIL(&tmpq, r, entries);
+ } else
+ r->rule_flag &= ~PFRULE_REFS;
+ }
+ PF_UNLNKDRULES_UNLOCK();
+
+ if (!TAILQ_EMPTY(&tmpq)) {
+ PF_RULES_WLOCK();
+ TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
+ TAILQ_REMOVE(&tmpq, r, entries);
+ pf_free_rule(r);
+ }
+ PF_RULES_WUNLOCK();
+ }
+}
+
+void
+pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET: {
+ u_int32_t a = ntohl(addr->addr32[0]);
+ printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
+ (a>>8)&255, a&255);
+ if (p) {
+ p = ntohs(p);
+ printf(":%u", p);
+ }
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6: {
+ u_int16_t b;
+ u_int8_t i, curstart, curend, maxstart, maxend;
+ curstart = curend = maxstart = maxend = 255;
+ for (i = 0; i < 8; i++) {
+ if (!addr->addr16[i]) {
+ if (curstart == 255)
+ curstart = i;
+ curend = i;
+ } else {
+ if ((curend - curstart) >
+ (maxend - maxstart)) {
+ maxstart = curstart;
+ maxend = curend;
+ }
+ curstart = curend = 255;
+ }
+ }
+ if ((curend - curstart) >
+ (maxend - maxstart)) {
+ maxstart = curstart;
+ maxend = curend;
+ }
+ for (i = 0; i < 8; i++) {
+ if (i >= maxstart && i <= maxend) {
+ if (i == 0)
+ printf(":");
+ if (i == maxend)
+ printf(":");
+ } else {
+ b = ntohs(addr->addr16[i]);
+ printf("%x", b);
+ if (i < 7)
+ printf(":");
+ }
+ }
+ if (p) {
+ p = ntohs(p);
+ printf("[%u]", p);
+ }
+ break;
+ }
+#endif /* INET6 */
+ }
+}
+
+void
+pf_print_state(struct pf_state *s)
+{
+ pf_print_state_parts(s, NULL, NULL);
+}
+
+static void
+pf_print_state_parts(struct pf_state *s,
+ struct pf_state_key *skwp, struct pf_state_key *sksp)
+{
+ struct pf_state_key *skw, *sks;
+ u_int8_t proto, dir;
+
+ /* Do our best to fill these, but they're skipped if NULL */
+ skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
+ sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
+ proto = skw ? skw->proto : (sks ? sks->proto : 0);
+ dir = s ? s->direction : 0;
+
+ switch (proto) {
+ case IPPROTO_IPV4:
+ printf("IPv4");
+ break;
+ case IPPROTO_IPV6:
+ printf("IPv6");
+ break;
+ case IPPROTO_TCP:
+ printf("TCP");
+ break;
+ case IPPROTO_UDP:
+ printf("UDP");
+ break;
+ case IPPROTO_ICMP:
+ printf("ICMP");
+ break;
+ case IPPROTO_ICMPV6:
+ printf("ICMPv6");
+ break;
+ default:
+ printf("%u", proto);
+ break;
+ }
+ switch (dir) {
+ case PF_IN:
+ printf(" in");
+ break;
+ case PF_OUT:
+ printf(" out");
+ break;
+ }
+ if (skw) {
+ printf(" wire: ");
+ pf_print_host(&skw->addr[0], skw->port[0], skw->af);
+ printf(" ");
+ pf_print_host(&skw->addr[1], skw->port[1], skw->af);
+ }
+ if (sks) {
+ printf(" stack: ");
+ if (sks != skw) {
+ pf_print_host(&sks->addr[0], sks->port[0], sks->af);
+ printf(" ");
+ pf_print_host(&sks->addr[1], sks->port[1], sks->af);
+ } else
+ printf("-");
+ }
+ if (s) {
+ if (proto == IPPROTO_TCP) {
+ printf(" [lo=%u high=%u win=%u modulator=%u",
+ s->src.seqlo, s->src.seqhi,
+ s->src.max_win, s->src.seqdiff);
+ if (s->src.wscale && s->dst.wscale)
+ printf(" wscale=%u",
+ s->src.wscale & PF_WSCALE_MASK);
+ printf("]");
+ printf(" [lo=%u high=%u win=%u modulator=%u",
+ s->dst.seqlo, s->dst.seqhi,
+ s->dst.max_win, s->dst.seqdiff);
+ if (s->src.wscale && s->dst.wscale)
+ printf(" wscale=%u",
+ s->dst.wscale & PF_WSCALE_MASK);
+ printf("]");
+ }
+ printf(" %u:%u", s->src.state, s->dst.state);
+ }
+}
+
+void
+pf_print_flags(u_int8_t f)
+{
+ if (f)
+ printf(" ");
+ if (f & TH_FIN)
+ printf("F");
+ if (f & TH_SYN)
+ printf("S");
+ if (f & TH_RST)
+ printf("R");
+ if (f & TH_PUSH)
+ printf("P");
+ if (f & TH_ACK)
+ printf("A");
+ if (f & TH_URG)
+ printf("U");
+ if (f & TH_ECE)
+ printf("E");
+ if (f & TH_CWR)
+ printf("W");
+}
+
+#define PF_SET_SKIP_STEPS(i) \
+ do { \
+ while (head[i] != cur) { \
+ head[i]->skip[i].ptr = cur; \
+ head[i] = TAILQ_NEXT(head[i], entries); \
+ } \
+ } while (0)
+
+void
+pf_calc_skip_steps(struct pf_rulequeue *rules)
+{
+ struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
+ int i;
+
+ cur = TAILQ_FIRST(rules);
+ prev = cur;
+ for (i = 0; i < PF_SKIP_COUNT; ++i)
+ head[i] = cur;
+ while (cur != NULL) {
+
+ if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
+ PF_SET_SKIP_STEPS(PF_SKIP_IFP);
+ if (cur->direction != prev->direction)
+ PF_SET_SKIP_STEPS(PF_SKIP_DIR);
+ if (cur->af != prev->af)
+ PF_SET_SKIP_STEPS(PF_SKIP_AF);
+ if (cur->proto != prev->proto)
+ PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
+ if (cur->src.neg != prev->src.neg ||
+ pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
+ PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
+ if (cur->src.port[0] != prev->src.port[0] ||
+ cur->src.port[1] != prev->src.port[1] ||
+ cur->src.port_op != prev->src.port_op)
+ PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
+ if (cur->dst.neg != prev->dst.neg ||
+ pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
+ PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
+ if (cur->dst.port[0] != prev->dst.port[0] ||
+ cur->dst.port[1] != prev->dst.port[1] ||
+ cur->dst.port_op != prev->dst.port_op)
+ PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
+
+ prev = cur;
+ cur = TAILQ_NEXT(cur, entries);
+ }
+ for (i = 0; i < PF_SKIP_COUNT; ++i)
+ PF_SET_SKIP_STEPS(i);
+}
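+
+/*
+ * Illustrative example of the skip steps computed above, for a hypothetical
+ * ruleset: if rules 1-50 all apply to the same interface and rule 51 is the
+ * first one on a different interface, PF_SET_SKIP_STEPS(PF_SKIP_IFP) makes
+ * skip[PF_SKIP_IFP].ptr of rules 1-50 point at rule 51.  When a packet from
+ * another interface fails the interface test at rule 1, the evaluation loop
+ * follows that pointer straight to rule 51 instead of re-testing the
+ * interface on every rule in between.
+ */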
+
+static int
+pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
+{
+ if (aw1->type != aw2->type)
+ return (1);
+ switch (aw1->type) {
+ case PF_ADDR_ADDRMASK:
+ case PF_ADDR_RANGE:
+ if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6))
+ return (1);
+ if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6))
+ return (1);
+ return (0);
+ case PF_ADDR_DYNIFTL:
+ return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
+ case PF_ADDR_NOROUTE:
+ case PF_ADDR_URPFFAILED:
+ return (0);
+ case PF_ADDR_TABLE:
+ return (aw1->p.tbl != aw2->p.tbl);
+ default:
+ printf("invalid address type: %d\n", aw1->type);
+ return (1);
+ }
+}
+
+/**
+ * Checksum updates are a little complicated because the checksum in the TCP/UDP
+ * header isn't always a full checksum. In some cases (i.e. output) it's a
+ * pseudo-header checksum, which is a partial checksum over src/dst IP
+ * addresses, protocol number and length.
+ *
+ * That means we have the following cases:
+ * * Input or forwarding: we don't have TSO, the checksum fields are full
+ * checksums, we need to update the checksum whenever we change anything.
+ * * Output (i.e. the checksum is a pseudo-header checksum):
+ * x The field being updated is src/dst address or affects the length of
+ * the packet. We need to update the pseudo-header checksum (note that this
+ * checksum is not ones' complemented).
+ * x Some other field is being modified (e.g. src/dst port numbers): We
+ * don't have to update anything.
+ **/
+u_int16_t
+pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
+{
+ u_int32_t l;
+
+ if (udp && !cksum)
+ return (0x0000);
+ l = cksum + old - new;
+ l = (l >> 16) + (l & 65535);
+ l = l & 65535;
+ if (udp && !l)
+ return (0xFFFF);
+ return (l);
+}
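+
+/*
+ * Usage sketch (hypothetical caller, for illustration only): rewriting a
+ * TCP destination port and patching th_sum incrementally instead of
+ * recomputing the whole checksum:
+ *
+ *	old = th->th_dport;
+ *	th->th_dport = new_port;
+ *	th->th_sum = pf_cksum_fixup(th->th_sum, old, new_port, 0);
+ *
+ * The last argument is nonzero only for UDP, where an all-zeroes checksum
+ * means "no checksum" and therefore must neither be touched nor produced,
+ * which is what the two udp checks above take care of.
+ */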
+
+u_int16_t
+pf_proto_cksum_fixup(struct mbuf *m, u_int16_t cksum, u_int16_t old,
+ u_int16_t new, u_int8_t udp)
+{
+ if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
+ return (cksum);
+
+ return (pf_cksum_fixup(cksum, old, new, udp));
+}
+
+static void
+pf_change_ap(struct mbuf *m, struct pf_addr *a, u_int16_t *p, u_int16_t *ic,
+ u_int16_t *pc, struct pf_addr *an, u_int16_t pn, u_int8_t u,
+ sa_family_t af)
+{
+ struct pf_addr ao;
+ u_int16_t po = *p;
+
+ PF_ACPY(&ao, a, af);
+ PF_ACPY(a, an, af);
+
+ if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
+ *pc = ~*pc;
+
+ *p = pn;
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
+ ao.addr16[0], an->addr16[0], 0),
+ ao.addr16[1], an->addr16[1], 0);
+ *p = pn;
+
+ *pc = pf_cksum_fixup(pf_cksum_fixup(*pc,
+ ao.addr16[0], an->addr16[0], u),
+ ao.addr16[1], an->addr16[1], u);
+
+ *pc = pf_proto_cksum_fixup(m, *pc, po, pn, u);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(*pc,
+ ao.addr16[0], an->addr16[0], u),
+ ao.addr16[1], an->addr16[1], u),
+ ao.addr16[2], an->addr16[2], u),
+ ao.addr16[3], an->addr16[3], u),
+ ao.addr16[4], an->addr16[4], u),
+ ao.addr16[5], an->addr16[5], u),
+ ao.addr16[6], an->addr16[6], u),
+ ao.addr16[7], an->addr16[7], u);
+
+ *pc = pf_proto_cksum_fixup(m, *pc, po, pn, u);
+ break;
+#endif /* INET6 */
+ }
+
+ if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA |
+ CSUM_DELAY_DATA_IPV6)) {
+ *pc = ~*pc;
+ if (! *pc)
+ *pc = 0xffff;
+ }
+}
+
+/* Changes a u_int32_t. Uses a void * so there are no alignment restrictions */
+void
+pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
+{
+ u_int32_t ao;
+
+ memcpy(&ao, a, sizeof(ao));
+ memcpy(a, &an, sizeof(u_int32_t));
+ *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
+ ao % 65536, an % 65536, u);
+}
+
+void
+pf_change_proto_a(struct mbuf *m, void *a, u_int16_t *c, u_int32_t an, u_int8_t udp)
+{
+ u_int32_t ao;
+
+ memcpy(&ao, a, sizeof(ao));
+ memcpy(a, &an, sizeof(u_int32_t));
+
+ *c = pf_proto_cksum_fixup(m,
+ pf_proto_cksum_fixup(m, *c, ao / 65536, an / 65536, udp),
+ ao % 65536, an % 65536, udp);
+}
+
+#ifdef INET6
+static void
+pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
+{
+ struct pf_addr ao;
+
+ PF_ACPY(&ao, a, AF_INET6);
+ PF_ACPY(a, an, AF_INET6);
+
+ *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(*c,
+ ao.addr16[0], an->addr16[0], u),
+ ao.addr16[1], an->addr16[1], u),
+ ao.addr16[2], an->addr16[2], u),
+ ao.addr16[3], an->addr16[3], u),
+ ao.addr16[4], an->addr16[4], u),
+ ao.addr16[5], an->addr16[5], u),
+ ao.addr16[6], an->addr16[6], u),
+ ao.addr16[7], an->addr16[7], u);
+}
+#endif /* INET6 */
+
+static void
+pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
+ struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
+ u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
+{
+ struct pf_addr oia, ooa;
+
+ PF_ACPY(&oia, ia, af);
+ if (oa)
+ PF_ACPY(&ooa, oa, af);
+
+ /* Change inner protocol port, fix inner protocol checksum. */
+ if (ip != NULL) {
+ u_int16_t oip = *ip;
+ u_int32_t opc;
+
+ if (pc != NULL)
+ opc = *pc;
+ *ip = np;
+ if (pc != NULL)
+ *pc = pf_cksum_fixup(*pc, oip, *ip, u);
+ *ic = pf_cksum_fixup(*ic, oip, *ip, 0);
+ if (pc != NULL)
+ *ic = pf_cksum_fixup(*ic, opc, *pc, 0);
+ }
+ /* Change inner ip address, fix inner ip and icmp checksums. */
+ PF_ACPY(ia, na, af);
+ switch (af) {
+#ifdef INET
+ case AF_INET: {
+ u_int32_t oh2c = *h2c;
+
+ *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
+ oia.addr16[0], ia->addr16[0], 0),
+ oia.addr16[1], ia->addr16[1], 0);
+ *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
+ oia.addr16[0], ia->addr16[0], 0),
+ oia.addr16[1], ia->addr16[1], 0);
+ *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(*ic,
+ oia.addr16[0], ia->addr16[0], u),
+ oia.addr16[1], ia->addr16[1], u),
+ oia.addr16[2], ia->addr16[2], u),
+ oia.addr16[3], ia->addr16[3], u),
+ oia.addr16[4], ia->addr16[4], u),
+ oia.addr16[5], ia->addr16[5], u),
+ oia.addr16[6], ia->addr16[6], u),
+ oia.addr16[7], ia->addr16[7], u);
+ break;
+#endif /* INET6 */
+ }
+ /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
+ if (oa) {
+ PF_ACPY(oa, na, af);
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ *hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
+ ooa.addr16[0], oa->addr16[0], 0),
+ ooa.addr16[1], oa->addr16[1], 0);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(*ic,
+ ooa.addr16[0], oa->addr16[0], u),
+ ooa.addr16[1], oa->addr16[1], u),
+ ooa.addr16[2], oa->addr16[2], u),
+ ooa.addr16[3], oa->addr16[3], u),
+ ooa.addr16[4], oa->addr16[4], u),
+ ooa.addr16[5], oa->addr16[5], u),
+ ooa.addr16[6], oa->addr16[6], u),
+ ooa.addr16[7], oa->addr16[7], u);
+ break;
+#endif /* INET6 */
+ }
+ }
+}
+
+
+/*
+ * Need to modulate the sequence numbers in the TCP SACK option
+ * (credits to Krzysztof Pfaff for report and patch)
+ */
+static int
+pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
+ struct tcphdr *th, struct pf_state_peer *dst)
+{
+ int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
+ u_int8_t opts[TCP_MAXOLEN], *opt = opts;
+ int copyback = 0, i, olen;
+ struct sackblk sack;
+
+#define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2)
+ if (hlen < TCPOLEN_SACKLEN ||
+ !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
+ return 0;
+
+ while (hlen >= TCPOLEN_SACKLEN) {
+ olen = opt[1];
+ switch (*opt) {
+ case TCPOPT_EOL: /* FALLTHROUGH */
+ case TCPOPT_NOP:
+ opt++;
+ hlen--;
+ break;
+ case TCPOPT_SACK:
+ if (olen > hlen)
+ olen = hlen;
+ if (olen >= TCPOLEN_SACKLEN) {
+ for (i = 2; i + TCPOLEN_SACK <= olen;
+ i += TCPOLEN_SACK) {
+ memcpy(&sack, &opt[i], sizeof(sack));
+ pf_change_proto_a(m, &sack.start, &th->th_sum,
+ htonl(ntohl(sack.start) - dst->seqdiff), 0);
+ pf_change_proto_a(m, &sack.end, &th->th_sum,
+ htonl(ntohl(sack.end) - dst->seqdiff), 0);
+ memcpy(&opt[i], &sack, sizeof(sack));
+ }
+ copyback = 1;
+ }
+ /* FALLTHROUGH */
+ default:
+ if (olen < 2)
+ olen = 2;
+ hlen -= olen;
+ opt += olen;
+ }
+ }
+
+ if (copyback)
+ m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
+ return (copyback);
+}
+
+static void
+pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af,
+ const struct pf_addr *saddr, const struct pf_addr *daddr,
+ u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
+ u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
+ u_int16_t rtag, struct ifnet *ifp)
+{
+ struct pf_send_entry *pfse;
+ struct mbuf *m;
+ int len, tlen;
+#ifdef INET
+ struct ip *h = NULL;
+#endif /* INET */
+#ifdef INET6
+ struct ip6_hdr *h6 = NULL;
+#endif /* INET6 */
+ struct tcphdr *th;
+ char *opt;
+ struct pf_mtag *pf_mtag;
+
+ len = 0;
+ th = NULL;
+
+ /* maximum segment size tcp option */
+ tlen = sizeof(struct tcphdr);
+ if (mss)
+ tlen += 4;
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ len = sizeof(struct ip) + tlen;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ len = sizeof(struct ip6_hdr) + tlen;
+ break;
+#endif /* INET6 */
+ default:
+ panic("%s: unsupported af %d", __func__, af);
+ }
+
+ /* Allocate outgoing queue entry, mbuf and mbuf tag. */
+ pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
+ if (pfse == NULL)
+ return;
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m == NULL) {
+ free(pfse, M_PFTEMP);
+ return;
+ }
+#ifdef MAC
+ mac_netinet_firewall_send(m);
+#endif
+ if ((pf_mtag = pf_get_mtag(m)) == NULL) {
+ free(pfse, M_PFTEMP);
+ m_freem(m);
+ return;
+ }
+ if (tag)
+ m->m_flags |= M_SKIP_FIREWALL;
+ pf_mtag->tag = rtag;
+
+ if (r != NULL && r->rtableid >= 0)
+ M_SETFIB(m, r->rtableid);
+
+#ifdef ALTQ
+ if (r != NULL && r->qid) {
+ pf_mtag->qid = r->qid;
+
+ /* add hints for ecn */
+ pf_mtag->hdr = mtod(m, struct ip *);
+ }
+#endif /* ALTQ */
+ m->m_data += max_linkhdr;
+ m->m_pkthdr.len = m->m_len = len;
+ m->m_pkthdr.rcvif = NULL;
+ bzero(m->m_data, len);
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ h = mtod(m, struct ip *);
+
+ /* IP header fields included in the TCP checksum */
+ h->ip_p = IPPROTO_TCP;
+ h->ip_len = htons(tlen);
+ h->ip_src.s_addr = saddr->v4.s_addr;
+ h->ip_dst.s_addr = daddr->v4.s_addr;
+
+ th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ h6 = mtod(m, struct ip6_hdr *);
+
+ /* IP header fields included in the TCP checksum */
+ h6->ip6_nxt = IPPROTO_TCP;
+ h6->ip6_plen = htons(tlen);
+ memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
+ memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
+
+ th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
+ break;
+#endif /* INET6 */
+ }
+
+ /* TCP header */
+ th->th_sport = sport;
+ th->th_dport = dport;
+ th->th_seq = htonl(seq);
+ th->th_ack = htonl(ack);
+ th->th_off = tlen >> 2;
+ th->th_flags = flags;
+ th->th_win = htons(win);
+
+ if (mss) {
+ opt = (char *)(th + 1);
+ opt[0] = TCPOPT_MAXSEG;
+ opt[1] = 4;
+ HTONS(mss);
+ bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
+ }
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ /* TCP checksum */
+ th->th_sum = in_cksum(m, len);
+
+ /* Finish the IP header */
+ h->ip_v = 4;
+ h->ip_hl = sizeof(*h) >> 2;
+ h->ip_tos = IPTOS_LOWDELAY;
+ h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0);
+ h->ip_len = htons(len);
+ h->ip_ttl = ttl ? ttl : V_ip_defttl;
+ h->ip_sum = 0;
+
+ pfse->pfse_type = PFSE_IP;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ /* TCP checksum */
+ th->th_sum = in6_cksum(m, IPPROTO_TCP,
+ sizeof(struct ip6_hdr), tlen);
+
+ h6->ip6_vfc |= IPV6_VERSION;
+ h6->ip6_hlim = IPV6_DEFHLIM;
+
+ pfse->pfse_type = PFSE_IP6;
+ break;
+#endif /* INET6 */
+ }
+ pfse->pfse_m = m;
+ pf_send(pfse);
+}
+
+static int
+pf_ieee8021q_setpcp(struct mbuf *m, u_int8_t prio)
+{
+ struct m_tag *mtag;
+
+ KASSERT(prio <= PF_PRIO_MAX,
+ ("%s with invalid pcp", __func__));
+
+ mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL);
+ if (mtag == NULL) {
+ mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_OUT,
+ sizeof(uint8_t), M_NOWAIT);
+ if (mtag == NULL)
+ return (ENOMEM);
+ m_tag_prepend(m, mtag);
+ }
+
+ *(uint8_t *)(mtag + 1) = prio;
+ return (0);
+}
+
+static int
+pf_match_ieee8021q_pcp(u_int8_t prio, struct mbuf *m)
+{
+ struct m_tag *mtag;
+ u_int8_t mpcp;
+
+ mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL);
+ if (mtag == NULL)
+ return (0);
+
+ if (prio == PF_PRIO_ZERO)
+ prio = 0;
+
+ mpcp = *(uint8_t *)(mtag + 1);
+
+ return (mpcp == prio);
+}
+
+static void
+pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
+ struct pf_rule *r)
+{
+ struct pf_send_entry *pfse;
+ struct mbuf *m0;
+ struct pf_mtag *pf_mtag;
+
+ /* Allocate outgoing queue entry, mbuf and mbuf tag. */
+ pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
+ if (pfse == NULL)
+ return;
+
+ if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
+ free(pfse, M_PFTEMP);
+ return;
+ }
+
+ if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
+ free(pfse, M_PFTEMP);
+ return;
+ }
+ /* XXX: revisit */
+ m0->m_flags |= M_SKIP_FIREWALL;
+
+ if (r->rtableid >= 0)
+ M_SETFIB(m0, r->rtableid);
+
+#ifdef ALTQ
+ if (r->qid) {
+ pf_mtag->qid = r->qid;
+ /* add hints for ecn */
+ pf_mtag->hdr = mtod(m0, struct ip *);
+ }
+#endif /* ALTQ */
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ pfse->pfse_type = PFSE_ICMP;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ pfse->pfse_type = PFSE_ICMP6;
+ break;
+#endif /* INET6 */
+ }
+ pfse->pfse_m = m0;
+ pfse->icmpopts.type = type;
+ pfse->icmpopts.code = code;
+ pf_send(pfse);
+}
+
+/*
+ * Return 1 if the addresses a and b match (with mask m), otherwise return 0.
+ * If n is 0, they match if they are equal. If n is nonzero, they match if
+ * they are different.
+ */
+int
+pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
+ struct pf_addr *b, sa_family_t af)
+{
+ int match = 0;
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ if ((a->addr32[0] & m->addr32[0]) ==
+ (b->addr32[0] & m->addr32[0]))
+ match++;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (((a->addr32[0] & m->addr32[0]) ==
+ (b->addr32[0] & m->addr32[0])) &&
+ ((a->addr32[1] & m->addr32[1]) ==
+ (b->addr32[1] & m->addr32[1])) &&
+ ((a->addr32[2] & m->addr32[2]) ==
+ (b->addr32[2] & m->addr32[2])) &&
+ ((a->addr32[3] & m->addr32[3]) ==
+ (b->addr32[3] & m->addr32[3])))
+ match++;
+ break;
+#endif /* INET6 */
+ }
+ if (match) {
+ if (n)
+ return (0);
+ else
+ return (1);
+ } else {
+ if (n)
+ return (1);
+ else
+ return (0);
+ }
+}
+
+/*
+ * Return 1 if b <= a <= e, otherwise return 0.
+ */
+int
+pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
+ struct pf_addr *a, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) ||
+ (ntohl(a->addr32[0]) > ntohl(e->addr32[0])))
+ return (0);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6: {
+ int i;
+
+ /* check a >= b */
+ for (i = 0; i < 4; ++i)
+ if (ntohl(a->addr32[i]) > ntohl(b->addr32[i]))
+ break;
+ else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i]))
+ return (0);
+ /* check a <= e */
+ for (i = 0; i < 4; ++i)
+ if (ntohl(a->addr32[i]) < ntohl(e->addr32[i]))
+ break;
+ else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i]))
+ return (0);
+ break;
+ }
+#endif /* INET6 */
+ }
+ return (1);
+}
+
+static int
+pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
+{
+ switch (op) {
+ case PF_OP_IRG:
+ return ((p > a1) && (p < a2));
+ case PF_OP_XRG:
+ return ((p < a1) || (p > a2));
+ case PF_OP_RRG:
+ return ((p >= a1) && (p <= a2));
+ case PF_OP_EQ:
+ return (p == a1);
+ case PF_OP_NE:
+ return (p != a1);
+ case PF_OP_LT:
+ return (p < a1);
+ case PF_OP_LE:
+ return (p <= a1);
+ case PF_OP_GT:
+ return (p > a1);
+ case PF_OP_GE:
+ return (p >= a1);
+ }
+ return (0); /* never reached */
+}
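+
+/*
+ * Example of the operators above, with hypothetical bounds a1 = 1024 and
+ * a2 = 2048: PF_OP_RRG matches any p from 1024 through 2048 inclusive,
+ * PF_OP_IRG matches only 1025..2047 (both endpoints excluded) and
+ * PF_OP_XRG matches everything outside 1024..2048.
+ */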
+
+int
+pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
+{
+ NTOHS(a1);
+ NTOHS(a2);
+ NTOHS(p);
+ return (pf_match(op, a1, a2, p));
+}
+
+static int
+pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
+{
+ if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
+ return (0);
+ return (pf_match(op, a1, a2, u));
+}
+
+static int
+pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
+{
+ if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
+ return (0);
+ return (pf_match(op, a1, a2, g));
+}
+
+int
+pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag)
+{
+ if (*tag == -1)
+ *tag = mtag;
+
+ return ((!r->match_tag_not && r->match_tag == *tag) ||
+ (r->match_tag_not && r->match_tag != *tag));
+}
+
+int
+pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag)
+{
+
+ KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
+
+ if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL))
+ return (ENOMEM);
+
+ pd->pf_mtag->tag = tag;
+
+ return (0);
+}
+
+#define PF_ANCHOR_STACKSIZE 32
+struct pf_anchor_stackframe {
+ struct pf_ruleset *rs;
+ struct pf_rule *r; /* XXX: + match bit */
+ struct pf_anchor *child;
+};
+
+/*
+ * XXX: We rely on malloc(9) returning pointer aligned addresses.
+ */
+#define PF_ANCHORSTACK_MATCH 0x00000001
+#define PF_ANCHORSTACK_MASK (PF_ANCHORSTACK_MATCH)
+
+#define PF_ANCHOR_MATCH(f) ((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
+#define PF_ANCHOR_RULE(f) (struct pf_rule *) \
+ ((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
+#define PF_ANCHOR_SET_MATCH(f) do { (f)->r = (void *) \
+ ((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH); \
+} while (0)
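+
+/*
+ * Example of the tagging trick above, for a hypothetical stack frame f:
+ * because struct pf_rule allocations are at least pointer aligned, the low
+ * bit of f->r is normally clear and can carry the "matched" flag.
+ * PF_ANCHOR_SET_MATCH(f) ORs the bit in, PF_ANCHOR_RULE(f) masks it out
+ * again to recover the real rule pointer, and PF_ANCHOR_MATCH(f) tests it;
+ * the rule is only ever dereferenced through the masked pointer.
+ */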
+
+void
+pf_step_into_anchor(struct pf_anchor_stackframe *stack, int *depth,
+ struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
+ int *match)
+{
+ struct pf_anchor_stackframe *f;
+
+ PF_RULES_RASSERT();
+
+ if (match)
+ *match = 0;
+ if (*depth >= PF_ANCHOR_STACKSIZE) {
+ printf("%s: anchor stack overflow on %s\n",
+ __func__, (*r)->anchor->name);
+ *r = TAILQ_NEXT(*r, entries);
+ return;
+ } else if (*depth == 0 && a != NULL)
+ *a = *r;
+ f = stack + (*depth)++;
+ f->rs = *rs;
+ f->r = *r;
+ if ((*r)->anchor_wildcard) {
+ struct pf_anchor_node *parent = &(*r)->anchor->children;
+
+ if ((f->child = RB_MIN(pf_anchor_node, parent)) == NULL) {
+ *r = NULL;
+ return;
+ }
+ *rs = &f->child->ruleset;
+ } else {
+ f->child = NULL;
+ *rs = &(*r)->anchor->ruleset;
+ }
+ *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
+}
+
+int
+pf_step_out_of_anchor(struct pf_anchor_stackframe *stack, int *depth,
+ struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
+ int *match)
+{
+ struct pf_anchor_stackframe *f;
+ struct pf_rule *fr;
+ int quick = 0;
+
+ PF_RULES_RASSERT();
+
+ do {
+ if (*depth <= 0)
+ break;
+ f = stack + *depth - 1;
+ fr = PF_ANCHOR_RULE(f);
+ if (f->child != NULL) {
+ struct pf_anchor_node *parent;
+
+ /*
+ * This block traverses
+ * a wildcard anchor.
+ */
+ parent = &fr->anchor->children;
+ if (match != NULL && *match) {
+ /*
+ * If any of "*" matched, then
+ * "foo/ *" matched, mark frame
+ * appropriately.
+ */
+ PF_ANCHOR_SET_MATCH(f);
+ *match = 0;
+ }
+ f->child = RB_NEXT(pf_anchor_node, parent, f->child);
+ if (f->child != NULL) {
+ *rs = &f->child->ruleset;
+ *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
+ if (*r == NULL)
+ continue;
+ else
+ break;
+ }
+ }
+ (*depth)--;
+ if (*depth == 0 && a != NULL)
+ *a = NULL;
+ *rs = f->rs;
+ if (PF_ANCHOR_MATCH(f) || (match != NULL && *match))
+ quick = fr->quick;
+ *r = TAILQ_NEXT(fr, entries);
+ } while (*r == NULL);
+
+ return (quick);
+}
+
+#ifdef INET6
+void
+pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
+ struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
+ ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
+ break;
+#endif /* INET */
+ case AF_INET6:
+ naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
+ ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
+ naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
+ ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
+ naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
+ ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
+ naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
+ ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
+ break;
+ }
+}
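+
+/*
+ * Worked example with hypothetical addresses, AF_INET case: for
+ * raddr = 10.0.0.0, rmask = 255.255.255.0 and saddr = 192.168.1.57 the
+ * result is naddr = (10.0.0.0 & 255.255.255.0) |
+ * (~255.255.255.0 & 192.168.1.57) = 10.0.0.57, i.e. the pool network
+ * combined with the host bits of the original address.
+ */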
+
+void
+pf_addr_inc(struct pf_addr *addr, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
+ break;
+#endif /* INET */
+ case AF_INET6:
+ if (addr->addr32[3] == 0xffffffff) {
+ addr->addr32[3] = 0;
+ if (addr->addr32[2] == 0xffffffff) {
+ addr->addr32[2] = 0;
+ if (addr->addr32[1] == 0xffffffff) {
+ addr->addr32[1] = 0;
+ addr->addr32[0] =
+ htonl(ntohl(addr->addr32[0]) + 1);
+ } else
+ addr->addr32[1] =
+ htonl(ntohl(addr->addr32[1]) + 1);
+ } else
+ addr->addr32[2] =
+ htonl(ntohl(addr->addr32[2]) + 1);
+ } else
+ addr->addr32[3] =
+ htonl(ntohl(addr->addr32[3]) + 1);
+ break;
+ }
+}
+#endif /* INET6 */
+
+int
+pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m)
+{
+ struct pf_addr *saddr, *daddr;
+ u_int16_t sport, dport;
+ struct inpcbinfo *pi;
+ struct inpcb *inp;
+
+ pd->lookup.uid = UID_MAX;
+ pd->lookup.gid = GID_MAX;
+
+ switch (pd->proto) {
+ case IPPROTO_TCP:
+ if (pd->hdr.tcp == NULL)
+ return (-1);
+ sport = pd->hdr.tcp->th_sport;
+ dport = pd->hdr.tcp->th_dport;
+ pi = &V_tcbinfo;
+ break;
+ case IPPROTO_UDP:
+ if (pd->hdr.udp == NULL)
+ return (-1);
+ sport = pd->hdr.udp->uh_sport;
+ dport = pd->hdr.udp->uh_dport;
+ pi = &V_udbinfo;
+ break;
+ default:
+ return (-1);
+ }
+ if (direction == PF_IN) {
+ saddr = pd->src;
+ daddr = pd->dst;
+ } else {
+ u_int16_t p;
+
+ p = sport;
+ sport = dport;
+ dport = p;
+ saddr = pd->dst;
+ daddr = pd->src;
+ }
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET:
+ inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
+ dport, INPLOOKUP_RLOCKPCB, NULL, m);
+ if (inp == NULL) {
+ inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
+ daddr->v4, dport, INPLOOKUP_WILDCARD |
+ INPLOOKUP_RLOCKPCB, NULL, m);
+ if (inp == NULL)
+ return (-1);
+ }
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
+ dport, INPLOOKUP_RLOCKPCB, NULL, m);
+ if (inp == NULL) {
+ inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
+ &daddr->v6, dport, INPLOOKUP_WILDCARD |
+ INPLOOKUP_RLOCKPCB, NULL, m);
+ if (inp == NULL)
+ return (-1);
+ }
+ break;
+#endif /* INET6 */
+
+ default:
+ return (-1);
+ }
+ INP_RLOCK_ASSERT(inp);
+#ifndef __rtems__
+ pd->lookup.uid = inp->inp_cred->cr_uid;
+ pd->lookup.gid = inp->inp_cred->cr_groups[0];
+#else /* __rtems__ */
+ pd->lookup.uid = BSD_DEFAULT_UID;
+ pd->lookup.gid = BSD_DEFAULT_GID;
+#endif /* __rtems__ */
+ INP_RUNLOCK(inp);
+
+ return (1);
+}
+
+static u_int8_t
+pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
+{
+ int hlen;
+ u_int8_t hdr[60];
+ u_int8_t *opt, optlen;
+ u_int8_t wscale = 0;
+
+ hlen = th_off << 2; /* hlen <= sizeof(hdr) */
+ if (hlen <= sizeof(struct tcphdr))
+ return (0);
+ if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
+ return (0);
+ opt = hdr + sizeof(struct tcphdr);
+ hlen -= sizeof(struct tcphdr);
+ while (hlen >= 3) {
+ switch (*opt) {
+ case TCPOPT_EOL:
+ case TCPOPT_NOP:
+ ++opt;
+ --hlen;
+ break;
+ case TCPOPT_WINDOW:
+ wscale = opt[2];
+ if (wscale > TCP_MAX_WINSHIFT)
+ wscale = TCP_MAX_WINSHIFT;
+ wscale |= PF_WSCALE_FLAG;
+ /* FALLTHROUGH */
+ default:
+ optlen = opt[1];
+ if (optlen < 2)
+ optlen = 2;
+ hlen -= optlen;
+ opt += optlen;
+ break;
+ }
+ }
+ return (wscale);
+}
+
+static u_int16_t
+pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
+{
+ int hlen;
+ u_int8_t hdr[60];
+ u_int8_t *opt, optlen;
+ u_int16_t mss = V_tcp_mssdflt;
+
+ hlen = th_off << 2; /* hlen <= sizeof(hdr) */
+ if (hlen <= sizeof(struct tcphdr))
+ return (0);
+ if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
+ return (0);
+ opt = hdr + sizeof(struct tcphdr);
+ hlen -= sizeof(struct tcphdr);
+ while (hlen >= TCPOLEN_MAXSEG) {
+ switch (*opt) {
+ case TCPOPT_EOL:
+ case TCPOPT_NOP:
+ ++opt;
+ --hlen;
+ break;
+ case TCPOPT_MAXSEG:
+ bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
+ NTOHS(mss);
+ /* FALLTHROUGH */
+ default:
+ optlen = opt[1];
+ if (optlen < 2)
+ optlen = 2;
+ hlen -= optlen;
+ opt += optlen;
+ break;
+ }
+ }
+ return (mss);
+}
+
+static u_int16_t
+pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
+{
+#ifdef INET
+ struct nhop4_basic nh4;
+#endif /* INET */
+#ifdef INET6
+ struct nhop6_basic nh6;
+ struct in6_addr dst6;
+ uint32_t scopeid;
+#endif /* INET6 */
+ int hlen = 0;
+ uint16_t mss = 0;
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ hlen = sizeof(struct ip);
+ if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) == 0)
+ mss = nh4.nh_mtu - hlen - sizeof(struct tcphdr);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ hlen = sizeof(struct ip6_hdr);
+ in6_splitscope(&addr->v6, &dst6, &scopeid);
+ if (fib6_lookup_nh_basic(rtableid, &dst6, scopeid, 0,0,&nh6)==0)
+ mss = nh6.nh_mtu - hlen - sizeof(struct tcphdr);
+ break;
+#endif /* INET6 */
+ }
+
+ mss = max(V_tcp_mssdflt, mss);
+ mss = min(mss, offer);
+ mss = max(mss, 64); /* sanity - at least max opt space */
+ return (mss);
+}
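+
+/*
+ * Example with hypothetical numbers: for an IPv4 route with an MTU of 1500
+ * the lookup yields mss = 1500 - 20 - 20 = 1460, which is then clamped to
+ * at least V_tcp_mssdflt, to at most the peer's offer, and never below 64.
+ */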
+
+static u_int32_t
+pf_tcp_iss(struct pf_pdesc *pd)
+{
+ MD5_CTX ctx;
+ u_int32_t digest[4];
+
+ if (V_pf_tcp_secret_init == 0) {
+ read_random(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
+ MD5Init(&V_pf_tcp_secret_ctx);
+ MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
+ sizeof(V_pf_tcp_secret));
+ V_pf_tcp_secret_init = 1;
+ }
+
+ ctx = V_pf_tcp_secret_ctx;
+
+ MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
+ MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
+ if (pd->af == AF_INET6) {
+ MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
+ MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
+ } else {
+ MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
+ MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
+ }
+ MD5Final((u_char *)digest, &ctx);
+ V_pf_tcp_iss_off += 4096;
+#define ISN_RANDOM_INCREMENT (4096 - 1)
+ return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) +
+ V_pf_tcp_iss_off);
+#undef ISN_RANDOM_INCREMENT
+}
+
+static int
+pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
+ struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd,
+ struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp)
+{
+ struct pf_rule *nr = NULL;
+ struct pf_addr * const saddr = pd->src;
+ struct pf_addr * const daddr = pd->dst;
+ sa_family_t af = pd->af;
+ struct pf_rule *r, *a = NULL;
+ struct pf_ruleset *ruleset = NULL;
+ struct pf_src_node *nsn = NULL;
+ struct tcphdr *th = pd->hdr.tcp;
+ struct pf_state_key *sk = NULL, *nk = NULL;
+ u_short reason;
+ int rewrite = 0, hdrlen = 0;
+ int tag = -1, rtableid = -1;
+ int asd = 0;
+ int match = 0;
+ int state_icmp = 0;
+ u_int16_t sport = 0, dport = 0;
+ u_int16_t bproto_sum = 0, bip_sum = 0;
+ u_int8_t icmptype = 0, icmpcode = 0;
+ struct pf_anchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE];
+
+ PF_RULES_RASSERT();
+
+ if (inp != NULL) {
+ INP_LOCK_ASSERT(inp);
+#ifndef __rtems__
+ pd->lookup.uid = inp->inp_cred->cr_uid;
+ pd->lookup.gid = inp->inp_cred->cr_groups[0];
+#else /* __rtems__ */
+ pd->lookup.uid = BSD_DEFAULT_UID;
+ pd->lookup.gid = BSD_DEFAULT_GID;
+#endif /* __rtems__ */
+ pd->lookup.done = 1;
+ }
+
+ switch (pd->proto) {
+ case IPPROTO_TCP:
+ sport = th->th_sport;
+ dport = th->th_dport;
+ hdrlen = sizeof(*th);
+ break;
+ case IPPROTO_UDP:
+ sport = pd->hdr.udp->uh_sport;
+ dport = pd->hdr.udp->uh_dport;
+ hdrlen = sizeof(*pd->hdr.udp);
+ break;
+#ifdef INET
+ case IPPROTO_ICMP:
+ if (pd->af != AF_INET)
+ break;
+ sport = dport = pd->hdr.icmp->icmp_id;
+ hdrlen = sizeof(*pd->hdr.icmp);
+ icmptype = pd->hdr.icmp->icmp_type;
+ icmpcode = pd->hdr.icmp->icmp_code;
+
+ if (icmptype == ICMP_UNREACH ||
+ icmptype == ICMP_SOURCEQUENCH ||
+ icmptype == ICMP_REDIRECT ||
+ icmptype == ICMP_TIMXCEED ||
+ icmptype == ICMP_PARAMPROB)
+ state_icmp++;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+ if (af != AF_INET6)
+ break;
+ sport = dport = pd->hdr.icmp6->icmp6_id;
+ hdrlen = sizeof(*pd->hdr.icmp6);
+ icmptype = pd->hdr.icmp6->icmp6_type;
+ icmpcode = pd->hdr.icmp6->icmp6_code;
+
+ if (icmptype == ICMP6_DST_UNREACH ||
+ icmptype == ICMP6_PACKET_TOO_BIG ||
+ icmptype == ICMP6_TIME_EXCEEDED ||
+ icmptype == ICMP6_PARAM_PROB)
+ state_icmp++;
+ break;
+#endif /* INET6 */
+ default:
+ sport = dport = hdrlen = 0;
+ break;
+ }
+
+ r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
+
+ /* check packet for BINAT/NAT/RDR */
+ if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk,
+ &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) {
+ KASSERT(sk != NULL, ("%s: null sk", __func__));
+ KASSERT(nk != NULL, ("%s: null nk", __func__));
+
+ if (pd->ip_sum)
+ bip_sum = *pd->ip_sum;
+
+ switch (pd->proto) {
+ case IPPROTO_TCP:
+ bproto_sum = th->th_sum;
+ pd->proto_sum = &th->th_sum;
+
+ if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
+ nk->port[pd->sidx] != sport) {
+ pf_change_ap(m, saddr, &th->th_sport, pd->ip_sum,
+ &th->th_sum, &nk->addr[pd->sidx],
+ nk->port[pd->sidx], 0, af);
+ pd->sport = &th->th_sport;
+ sport = th->th_sport;
+ }
+
+ if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
+ nk->port[pd->didx] != dport) {
+ pf_change_ap(m, daddr, &th->th_dport, pd->ip_sum,
+ &th->th_sum, &nk->addr[pd->didx],
+ nk->port[pd->didx], 0, af);
+ dport = th->th_dport;
+ pd->dport = &th->th_dport;
+ }
+ rewrite++;
+ break;
+ case IPPROTO_UDP:
+ bproto_sum = pd->hdr.udp->uh_sum;
+ pd->proto_sum = &pd->hdr.udp->uh_sum;
+
+ if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
+ nk->port[pd->sidx] != sport) {
+ pf_change_ap(m, saddr, &pd->hdr.udp->uh_sport,
+ pd->ip_sum, &pd->hdr.udp->uh_sum,
+ &nk->addr[pd->sidx],
+ nk->port[pd->sidx], 1, af);
+ sport = pd->hdr.udp->uh_sport;
+ pd->sport = &pd->hdr.udp->uh_sport;
+ }
+
+ if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
+ nk->port[pd->didx] != dport) {
+ pf_change_ap(m, daddr, &pd->hdr.udp->uh_dport,
+ pd->ip_sum, &pd->hdr.udp->uh_sum,
+ &nk->addr[pd->didx],
+ nk->port[pd->didx], 1, af);
+ dport = pd->hdr.udp->uh_dport;
+ pd->dport = &pd->hdr.udp->uh_dport;
+ }
+ rewrite++;
+ break;
+#ifdef INET
+ case IPPROTO_ICMP:
+ nk->port[0] = nk->port[1];
+ if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
+ pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
+ nk->addr[pd->sidx].v4.s_addr, 0);
+
+ if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
+ pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
+ nk->addr[pd->didx].v4.s_addr, 0);
+
+ if (nk->port[1] != pd->hdr.icmp->icmp_id) {
+ pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
+ pd->hdr.icmp->icmp_cksum, sport,
+ nk->port[1], 0);
+ pd->hdr.icmp->icmp_id = nk->port[1];
+ pd->sport = &pd->hdr.icmp->icmp_id;
+ }
+ m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+ nk->port[0] = nk->port[1];
+ if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
+ pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
+ &nk->addr[pd->sidx], 0);
+
+ if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
+ pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
+ &nk->addr[pd->didx], 0);
+ rewrite++;
+ break;
+#endif /* INET6 */
+ default:
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ if (PF_ANEQ(saddr,
+ &nk->addr[pd->sidx], AF_INET))
+ pf_change_a(&saddr->v4.s_addr,
+ pd->ip_sum,
+ nk->addr[pd->sidx].v4.s_addr, 0);
+
+ if (PF_ANEQ(daddr,
+ &nk->addr[pd->didx], AF_INET))
+ pf_change_a(&daddr->v4.s_addr,
+ pd->ip_sum,
+ nk->addr[pd->didx].v4.s_addr, 0);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (PF_ANEQ(saddr,
+ &nk->addr[pd->sidx], AF_INET6))
+ PF_ACPY(saddr, &nk->addr[pd->sidx], af);
+
+ if (PF_ANEQ(daddr,
+ &nk->addr[pd->didx], AF_INET6))
+ PF_ACPY(saddr, &nk->addr[pd->didx], af);
+ break;
+#endif /* INET6 */
+ }
+ break;
+ }
+ if (nr->natpass)
+ r = NULL;
+ pd->nat_rule = nr;
+ }
+
+ while (r != NULL) {
+ r->evaluations++;
+ if (pfi_kif_match(r->kif, kif) == r->ifnot)
+ r = r->skip[PF_SKIP_IFP].ptr;
+ else if (r->direction && r->direction != direction)
+ r = r->skip[PF_SKIP_DIR].ptr;
+ else if (r->af && r->af != af)
+ r = r->skip[PF_SKIP_AF].ptr;
+ else if (r->proto && r->proto != pd->proto)
+ r = r->skip[PF_SKIP_PROTO].ptr;
+ else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
+ r->src.neg, kif, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_SRC_ADDR].ptr;
+ /* tcp/udp only. port_op always 0 in other cases */
+ else if (r->src.port_op && !pf_match_port(r->src.port_op,
+ r->src.port[0], r->src.port[1], sport))
+ r = r->skip[PF_SKIP_SRC_PORT].ptr;
+ else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
+ r->dst.neg, NULL, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_DST_ADDR].ptr;
+ /* tcp/udp only. port_op always 0 in other cases */
+ else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
+ r->dst.port[0], r->dst.port[1], dport))
+ r = r->skip[PF_SKIP_DST_PORT].ptr;
+ /* icmp only. type always 0 in other cases */
+ else if (r->type && r->type != icmptype + 1)
+ r = TAILQ_NEXT(r, entries);
+ /* icmp only. type always 0 in other cases */
+ else if (r->code && r->code != icmpcode + 1)
+ r = TAILQ_NEXT(r, entries);
+ else if (r->tos && !(r->tos == pd->tos))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->rule_flag & PFRULE_FRAGMENT)
+ r = TAILQ_NEXT(r, entries);
+ else if (pd->proto == IPPROTO_TCP &&
+ (r->flagset & th->th_flags) != r->flags)
+ r = TAILQ_NEXT(r, entries);
+ /* tcp/udp only. uid.op always 0 in other cases */
+ else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
+ pf_socket_lookup(direction, pd, m), 1)) &&
+ !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
+ pd->lookup.uid))
+ r = TAILQ_NEXT(r, entries);
+ /* tcp/udp only. gid.op always 0 in other cases */
+ else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
+ pf_socket_lookup(direction, pd, m), 1)) &&
+ !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
+ pd->lookup.gid))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->prio &&
+ !pf_match_ieee8021q_pcp(r->prio, m))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->prob &&
+ r->prob <= arc4random())
+ r = TAILQ_NEXT(r, entries);
+ else if (r->match_tag && !pf_match_tag(m, r, &tag,
+ pd->pf_mtag ? pd->pf_mtag->tag : 0))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->os_fingerprint != PF_OSFP_ANY &&
+ (pd->proto != IPPROTO_TCP || !pf_osfp_match(
+ pf_osfp_fingerprint(pd, m, off, th),
+ r->os_fingerprint)))
+ r = TAILQ_NEXT(r, entries);
+ else {
+ if (r->tag)
+ tag = r->tag;
+ if (r->rtableid >= 0)
+ rtableid = r->rtableid;
+ if (r->anchor == NULL) {
+ match = 1;
+ *rm = r;
+ *am = a;
+ *rsm = ruleset;
+ if ((*rm)->quick)
+ break;
+ r = TAILQ_NEXT(r, entries);
+ } else
+ pf_step_into_anchor(anchor_stack, &asd,
+ &ruleset, PF_RULESET_FILTER, &r, &a,
+ &match);
+ }
+ if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
+ &ruleset, PF_RULESET_FILTER, &r, &a, &match))
+ break;
+ }
+ r = *rm;
+ a = *am;
+ ruleset = *rsm;
+
+ REASON_SET(&reason, PFRES_MATCH);
+
+ if (r->log || (nr != NULL && nr->log)) {
+ if (rewrite)
+ m_copyback(m, off, hdrlen, pd->hdr.any);
+ PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a,
+ ruleset, pd, 1);
+ }
+
+ if ((r->action == PF_DROP) &&
+ ((r->rule_flag & PFRULE_RETURNRST) ||
+ (r->rule_flag & PFRULE_RETURNICMP) ||
+ (r->rule_flag & PFRULE_RETURN))) {
+ /* undo NAT changes, if they have taken place */
+ if (nr != NULL) {
+ PF_ACPY(saddr, &sk->addr[pd->sidx], af);
+ PF_ACPY(daddr, &sk->addr[pd->didx], af);
+ if (pd->sport)
+ *pd->sport = sk->port[pd->sidx];
+ if (pd->dport)
+ *pd->dport = sk->port[pd->didx];
+ if (pd->proto_sum)
+ *pd->proto_sum = bproto_sum;
+ if (pd->ip_sum)
+ *pd->ip_sum = bip_sum;
+ m_copyback(m, off, hdrlen, pd->hdr.any);
+ }
+ if (pd->proto == IPPROTO_TCP &&
+ ((r->rule_flag & PFRULE_RETURNRST) ||
+ (r->rule_flag & PFRULE_RETURN)) &&
+ !(th->th_flags & TH_RST)) {
+ u_int32_t ack = ntohl(th->th_seq) + pd->p_len;
+ int len = 0;
+#ifdef INET
+ struct ip *h4;
+#endif
+#ifdef INET6
+ struct ip6_hdr *h6;
+#endif
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ h4 = mtod(m, struct ip *);
+ len = ntohs(h4->ip_len) - off;
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ h6 = mtod(m, struct ip6_hdr *);
+ len = ntohs(h6->ip6_plen) - (off - sizeof(*h6));
+ break;
+#endif
+ }
+
+ if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
+ REASON_SET(&reason, PFRES_PROTCKSUM);
+ else {
+ if (th->th_flags & TH_SYN)
+ ack++;
+ if (th->th_flags & TH_FIN)
+ ack++;
+ pf_send_tcp(m, r, af, pd->dst,
+ pd->src, th->th_dport, th->th_sport,
+ ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
+ r->return_ttl, 1, 0, kif->pfik_ifp);
+ }
+ } else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
+ r->return_icmp)
+ pf_send_icmp(m, r->return_icmp >> 8,
+ r->return_icmp & 255, af, r);
+ else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
+ r->return_icmp6)
+ pf_send_icmp(m, r->return_icmp6 >> 8,
+ r->return_icmp6 & 255, af, r);
+ }
+
+ if (r->action == PF_DROP)
+ goto cleanup;
+
+ if (tag > 0 && pf_tag_packet(m, pd, tag)) {
+ REASON_SET(&reason, PFRES_MEMORY);
+ goto cleanup;
+ }
+ if (rtableid >= 0)
+ M_SETFIB(m, rtableid);
+
+ if (!state_icmp && (r->keep_state || nr != NULL ||
+ (pd->flags & PFDESC_TCP_NORM))) {
+ int action;
+ action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
+ sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
+ hdrlen);
+ if (action != PF_PASS)
+ return (action);
+ } else {
+ if (sk != NULL)
+ uma_zfree(V_pf_state_key_z, sk);
+ if (nk != NULL)
+ uma_zfree(V_pf_state_key_z, nk);
+ }
+
+ /* copy back packet headers if we performed NAT operations */
+ if (rewrite)
+ m_copyback(m, off, hdrlen, pd->hdr.any);
+
+ if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
+ direction == PF_OUT &&
+ pfsync_defer_ptr != NULL && pfsync_defer_ptr(*sm, m))
+ /*
+ * We want the state created, but we don't
+ * want to send this in case a partner
+ * firewall has to know about it to allow
+ * replies through it.
+ */
+ return (PF_DEFER);
+
+ return (PF_PASS);
+
+cleanup:
+ if (sk != NULL)
+ uma_zfree(V_pf_state_key_z, sk);
+ if (nk != NULL)
+ uma_zfree(V_pf_state_key_z, nk);
+ return (PF_DROP);
+}
+
+static int
+pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
+ struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk,
+ struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
+ u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm,
+ int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen)
+{
+ struct pf_state *s = NULL;
+ struct pf_src_node *sn = NULL;
+ struct tcphdr *th = pd->hdr.tcp;
+ u_int16_t mss = V_tcp_mssdflt;
+ u_short reason;
+
+ /* check maximums */
+ if (r->max_states &&
+ (counter_u64_fetch(r->states_cur) >= r->max_states)) {
+ counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1);
+ REASON_SET(&reason, PFRES_MAXSTATES);
+ return (PF_DROP);
+ }
+ /* src node for filter rule */
+ if ((r->rule_flag & PFRULE_SRCTRACK ||
+ r->rpool.opts & PF_POOL_STICKYADDR) &&
+ pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
+ REASON_SET(&reason, PFRES_SRCLIMIT);
+ goto csfailed;
+ }
+ /* src node for translation rule */
+ if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
+ pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
+ REASON_SET(&reason, PFRES_SRCLIMIT);
+ goto csfailed;
+ }
+ s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO);
+ if (s == NULL) {
+ REASON_SET(&reason, PFRES_MEMORY);
+ goto csfailed;
+ }
+ s->rule.ptr = r;
+ s->nat_rule.ptr = nr;
+ s->anchor.ptr = a;
+ STATE_INC_COUNTERS(s);
+ if (r->allow_opts)
+ s->state_flags |= PFSTATE_ALLOWOPTS;
+ if (r->rule_flag & PFRULE_STATESLOPPY)
+ s->state_flags |= PFSTATE_SLOPPY;
+ s->log = r->log & PF_LOG_ALL;
+ s->sync_state = PFSYNC_S_NONE;
+ if (nr != NULL)
+ s->log |= nr->log & PF_LOG_ALL;
+ switch (pd->proto) {
+ case IPPROTO_TCP:
+ s->src.seqlo = ntohl(th->th_seq);
+ s->src.seqhi = s->src.seqlo + pd->p_len + 1;
+ if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
+ r->keep_state == PF_STATE_MODULATE) {
+ /* Generate sequence number modulator */
+ if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
+ 0)
+ s->src.seqdiff = 1;
+ pf_change_proto_a(m, &th->th_seq, &th->th_sum,
+ htonl(s->src.seqlo + s->src.seqdiff), 0);
+ *rewrite = 1;
+ } else
+ s->src.seqdiff = 0;
+ if (th->th_flags & TH_SYN) {
+ s->src.seqhi++;
+ s->src.wscale = pf_get_wscale(m, off,
+ th->th_off, pd->af);
+ }
+ s->src.max_win = MAX(ntohs(th->th_win), 1);
+ if (s->src.wscale & PF_WSCALE_MASK) {
+ /* Remove scale factor from initial window */
+ int win = s->src.max_win;
+ win += 1 << (s->src.wscale & PF_WSCALE_MASK);
+ s->src.max_win = (win - 1) >>
+ (s->src.wscale & PF_WSCALE_MASK);
+ }
+ if (th->th_flags & TH_FIN)
+ s->src.seqhi++;
+ s->dst.seqhi = 1;
+ s->dst.max_win = 1;
+ s->src.state = TCPS_SYN_SENT;
+ s->dst.state = TCPS_CLOSED;
+ s->timeout = PFTM_TCP_FIRST_PACKET;
+ break;
+ case IPPROTO_UDP:
+ s->src.state = PFUDPS_SINGLE;
+ s->dst.state = PFUDPS_NO_TRAFFIC;
+ s->timeout = PFTM_UDP_FIRST_PACKET;
+ break;
+ case IPPROTO_ICMP:
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+#endif
+ s->timeout = PFTM_ICMP_FIRST_PACKET;
+ break;
+ default:
+ s->src.state = PFOTHERS_SINGLE;
+ s->dst.state = PFOTHERS_NO_TRAFFIC;
+ s->timeout = PFTM_OTHER_FIRST_PACKET;
+ }
+
+ if (r->rt && r->rt != PF_FASTROUTE) {
+ if (pf_map_addr(pd->af, r, pd->src, &s->rt_addr, NULL, &sn)) {
+ REASON_SET(&reason, PFRES_MAPFAILED);
+ pf_src_tree_remove_state(s);
+ STATE_DEC_COUNTERS(s);
+ uma_zfree(V_pf_state_z, s);
+ goto csfailed;
+ }
+ s->rt_kif = r->rpool.cur->kif;
+ }
+
+ s->creation = time_uptime;
+ s->expire = time_uptime;
+
+ if (sn != NULL)
+ s->src_node = sn;
+ if (nsn != NULL) {
+ /* XXX We only modify one side for now. */
+ PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
+ s->nat_src_node = nsn;
+ }
+ if (pd->proto == IPPROTO_TCP) {
+ if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
+ off, pd, th, &s->src, &s->dst)) {
+ REASON_SET(&reason, PFRES_MEMORY);
+ pf_src_tree_remove_state(s);
+ STATE_DEC_COUNTERS(s);
+ uma_zfree(V_pf_state_z, s);
+ return (PF_DROP);
+ }
+ if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
+ pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
+ &s->src, &s->dst, rewrite)) {
+ /* This really shouldn't happen!!! */
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("pf_normalize_tcp_stateful failed on first pkt"));
+ pf_normalize_tcp_cleanup(s);
+ pf_src_tree_remove_state(s);
+ STATE_DEC_COUNTERS(s);
+ uma_zfree(V_pf_state_z, s);
+ return (PF_DROP);
+ }
+ }
+ s->direction = pd->dir;
+
+ /*
+ * sk/nk could have already been set up by pf_get_translation().
+ */
+ if (nr == NULL) {
+ KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p",
+ __func__, nr, sk, nk));
+ sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport);
+ if (sk == NULL)
+ goto csfailed;
+ nk = sk;
+ } else
+ KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p",
+ __func__, nr, sk, nk));
+
+ /* Swap sk/nk for PF_OUT. */
+ if (pf_state_insert(BOUND_IFACE(r, kif),
+ (pd->dir == PF_IN) ? sk : nk,
+ (pd->dir == PF_IN) ? nk : sk, s)) {
+ if (pd->proto == IPPROTO_TCP)
+ pf_normalize_tcp_cleanup(s);
+ REASON_SET(&reason, PFRES_STATEINS);
+ pf_src_tree_remove_state(s);
+ STATE_DEC_COUNTERS(s);
+ uma_zfree(V_pf_state_z, s);
+ return (PF_DROP);
+ } else
+ *sm = s;
+
+ if (tag > 0)
+ s->tag = tag;
+ if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
+ TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
+ s->src.state = PF_TCPS_PROXY_SRC;
+ /* undo NAT changes, if they have taken place */
+ if (nr != NULL) {
+ struct pf_state_key *skt = s->key[PF_SK_WIRE];
+ if (pd->dir == PF_OUT)
+ skt = s->key[PF_SK_STACK];
+ PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
+ PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
+ if (pd->sport)
+ *pd->sport = skt->port[pd->sidx];
+ if (pd->dport)
+ *pd->dport = skt->port[pd->didx];
+ if (pd->proto_sum)
+ *pd->proto_sum = bproto_sum;
+ if (pd->ip_sum)
+ *pd->ip_sum = bip_sum;
+ m_copyback(m, off, hdrlen, pd->hdr.any);
+ }
+ s->src.seqhi = htonl(arc4random());
+ /* Find mss option */
+ int rtid = M_GETFIB(m);
+ mss = pf_get_mss(m, off, th->th_off, pd->af);
+ mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
+ mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
+ s->src.mss = mss;
+ pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport,
+ th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
+ TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL);
+ REASON_SET(&reason, PFRES_SYNPROXY);
+ return (PF_SYNPROXY_DROP);
+ }
+
+ return (PF_PASS);
+
+csfailed:
+ if (sk != NULL)
+ uma_zfree(V_pf_state_key_z, sk);
+ if (nk != NULL)
+ uma_zfree(V_pf_state_key_z, nk);
+
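+	/*
+	 * Release the source node references taken above; free any node
+	 * that is left without states and is not scheduled to expire.
+	 */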
+ if (sn != NULL) {
+ struct pf_srchash *sh;
+
+ sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)];
+ PF_HASHROW_LOCK(sh);
+ if (--sn->states == 0 && sn->expire == 0) {
+ pf_unlink_src_node(sn);
+ uma_zfree(V_pf_sources_z, sn);
+ counter_u64_add(
+ V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
+ }
+ PF_HASHROW_UNLOCK(sh);
+ }
+
+ if (nsn != sn && nsn != NULL) {
+ struct pf_srchash *sh;
+
+ sh = &V_pf_srchash[pf_hashsrc(&nsn->addr, nsn->af)];
+ PF_HASHROW_LOCK(sh);
+ if (--nsn->states == 0 && nsn->expire == 0) {
+ pf_unlink_src_node(nsn);
+ uma_zfree(V_pf_sources_z, nsn);
+ counter_u64_add(
+ V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
+ }
+ PF_HASHROW_UNLOCK(sh);
+ }
+
+ return (PF_DROP);
+}
+
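+/*
+ * Stateless rule evaluation for fragments that could not be reassembled.
+ * Port, TCP flag and ICMP type/code criteria cannot be checked here.
+ */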
+static int
+pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
+ struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
+ struct pf_ruleset **rsm)
+{
+ struct pf_rule *r, *a = NULL;
+ struct pf_ruleset *ruleset = NULL;
+ sa_family_t af = pd->af;
+ u_short reason;
+ int tag = -1;
+ int asd = 0;
+ int match = 0;
+ struct pf_anchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE];
+
+ PF_RULES_RASSERT();
+
+ r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
+ while (r != NULL) {
+ r->evaluations++;
+ if (pfi_kif_match(r->kif, kif) == r->ifnot)
+ r = r->skip[PF_SKIP_IFP].ptr;
+ else if (r->direction && r->direction != direction)
+ r = r->skip[PF_SKIP_DIR].ptr;
+ else if (r->af && r->af != af)
+ r = r->skip[PF_SKIP_AF].ptr;
+ else if (r->proto && r->proto != pd->proto)
+ r = r->skip[PF_SKIP_PROTO].ptr;
+ else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
+ r->src.neg, kif, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_SRC_ADDR].ptr;
+ else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
+ r->dst.neg, NULL, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_DST_ADDR].ptr;
+ else if (r->tos && !(r->tos == pd->tos))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->os_fingerprint != PF_OSFP_ANY)
+ r = TAILQ_NEXT(r, entries);
+ else if (pd->proto == IPPROTO_UDP &&
+ (r->src.port_op || r->dst.port_op))
+ r = TAILQ_NEXT(r, entries);
+ else if (pd->proto == IPPROTO_TCP &&
+ (r->src.port_op || r->dst.port_op || r->flagset))
+ r = TAILQ_NEXT(r, entries);
+ else if ((pd->proto == IPPROTO_ICMP ||
+ pd->proto == IPPROTO_ICMPV6) &&
+ (r->type || r->code))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->prio &&
+ !pf_match_ieee8021q_pcp(r->prio, m))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->prob && r->prob <=
+ (arc4random() % (UINT_MAX - 1) + 1))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->match_tag && !pf_match_tag(m, r, &tag,
+ pd->pf_mtag ? pd->pf_mtag->tag : 0))
+ r = TAILQ_NEXT(r, entries);
+ else {
+ if (r->anchor == NULL) {
+ match = 1;
+ *rm = r;
+ *am = a;
+ *rsm = ruleset;
+ if ((*rm)->quick)
+ break;
+ r = TAILQ_NEXT(r, entries);
+ } else
+ pf_step_into_anchor(anchor_stack, &asd,
+ &ruleset, PF_RULESET_FILTER, &r, &a,
+ &match);
+ }
+ if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
+ &ruleset, PF_RULESET_FILTER, &r, &a, &match))
+ break;
+ }
+ r = *rm;
+ a = *am;
+ ruleset = *rsm;
+
+ REASON_SET(&reason, PFRES_MATCH);
+
+ if (r->log)
+ PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd,
+ 1);
+
+ if (r->action != PF_PASS)
+ return (PF_DROP);
+
+ if (tag > 0 && pf_tag_packet(m, pd, tag)) {
+ REASON_SET(&reason, PFRES_MEMORY);
+ return (PF_DROP);
+ }
+
+ return (PF_PASS);
+}
+
+static int
+pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
+ struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
+ struct pf_pdesc *pd, u_short *reason, int *copyback)
+{
+ struct tcphdr *th = pd->hdr.tcp;
+ u_int16_t win = ntohs(th->th_win);
+ u_int32_t ack, end, seq, orig_seq;
+ u_int8_t sws, dws;
+ int ackskew;
+
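+	/*
+	 * Window scaling applies only if both peers sent the option and
+	 * this segment is not a SYN.
+	 */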
+ if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
+ sws = src->wscale & PF_WSCALE_MASK;
+ dws = dst->wscale & PF_WSCALE_MASK;
+ } else
+ sws = dws = 0;
+
+ /*
+ * Sequence tracking algorithm from Guido van Rooij's paper:
+ * http://www.madison-gurkha.com/publications/tcp_filtering/
+ * tcp_filtering.ps
+ */
+
+ orig_seq = seq = ntohl(th->th_seq);
+ if (src->seqlo == 0) {
+ /* First packet from this end. Set its state */
+
+ if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
+ src->scrub == NULL) {
+ if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
+ REASON_SET(reason, PFRES_MEMORY);
+ return (PF_DROP);
+ }
+ }
+
+ /* Deferred generation of sequence number modulator */
+ if (dst->seqdiff && !src->seqdiff) {
+ /* use random iss for the TCP server */
+ while ((src->seqdiff = arc4random() - seq) == 0)
+ ;
+ ack = ntohl(th->th_ack) - dst->seqdiff;
+ pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq +
+ src->seqdiff), 0);
+ pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0);
+ *copyback = 1;
+ } else {
+ ack = ntohl(th->th_ack);
+ }
+
+ end = seq + pd->p_len;
+ if (th->th_flags & TH_SYN) {
+ end++;
+ if (dst->wscale & PF_WSCALE_FLAG) {
+ src->wscale = pf_get_wscale(m, off, th->th_off,
+ pd->af);
+ if (src->wscale & PF_WSCALE_FLAG) {
+ /* Remove scale factor from initial
+ * window */
+ sws = src->wscale & PF_WSCALE_MASK;
+ win = ((u_int32_t)win + (1 << sws) - 1)
+ >> sws;
+ dws = dst->wscale & PF_WSCALE_MASK;
+ } else {
+ /* fixup other window */
+ dst->max_win <<= dst->wscale &
+ PF_WSCALE_MASK;
+ /* in case of a retrans SYN|ACK */
+ dst->wscale = 0;
+ }
+ }
+ }
+ if (th->th_flags & TH_FIN)
+ end++;
+
+ src->seqlo = seq;
+ if (src->state < TCPS_SYN_SENT)
+ src->state = TCPS_SYN_SENT;
+
+ /*
+ * May need to slide the window (seqhi may have been set by
+ * the crappy stack check or if we picked up the connection
+ * after establishment)
+ */
+ if (src->seqhi == 1 ||
+ SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
+ src->seqhi = end + MAX(1, dst->max_win << dws);
+ if (win > src->max_win)
+ src->max_win = win;
+
+ } else {
+ ack = ntohl(th->th_ack) - dst->seqdiff;
+ if (src->seqdiff) {
+ /* Modulate sequence numbers */
+ pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq +
+ src->seqdiff), 0);
+ pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0);
+ *copyback = 1;
+ }
+ end = seq + pd->p_len;
+ if (th->th_flags & TH_SYN)
+ end++;
+ if (th->th_flags & TH_FIN)
+ end++;
+ }
+
+ if ((th->th_flags & TH_ACK) == 0) {
+ /* Let it pass through the ack skew check */
+ ack = dst->seqlo;
+ } else if ((ack == 0 &&
+ (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
+ /* broken tcp stacks do not set ack */
+ (dst->state < TCPS_SYN_SENT)) {
+ /*
+ * Many stacks (ours included) will set the ACK number in an
+ * FIN|ACK if the SYN times out -- no sequence to ACK.
+ */
+ ack = dst->seqlo;
+ }
+
+ if (seq == end) {
+ /* Ease sequencing restrictions on no data packets */
+ seq = src->seqlo;
+ end = seq;
+ }
+
+ ackskew = dst->seqlo - ack;
+
+
+ /*
+ * Need to demodulate the sequence numbers in any TCP SACK options
+ * (Selective ACK). We could optionally validate the SACK values
+ * against the current ACK window, either forwards or backwards, but
+ * I'm not confident that SACK has been implemented properly
+ * everywhere. It wouldn't surprise me if several stacks accidentally
+ * SACK too far backwards of previously ACKed data. There really aren't
+ * any security implications of bad SACKing unless the target stack
+ * doesn't validate the option length correctly. Someone trying to
+ * spoof into a TCP connection won't bother blindly sending SACK
+ * options anyway.
+ */
+ if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
+ if (pf_modulate_sack(m, off, pd, th, dst))
+ *copyback = 1;
+ }
+
+
+#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
+ if (SEQ_GEQ(src->seqhi, end) &&
+ /* Last octet inside other's window space */
+ SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
+ /* Retrans: not more than one window back */
+ (ackskew >= -MAXACKWINDOW) &&
+ /* Acking not more than one reassembled fragment backwards */
+ (ackskew <= (MAXACKWINDOW << sws)) &&
+ /* Acking not more than one window forward */
+ ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
+ (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
+ (pd->flags & PFDESC_IP_REAS) == 0)) {
+ /* Require an exact/+1 sequence match on resets when possible */
+
+ if (dst->scrub || src->scrub) {
+ if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
+ *state, src, dst, copyback))
+ return (PF_DROP);
+ }
+
+ /* update max window */
+ if (src->max_win < win)
+ src->max_win = win;
+ /* synchronize sequencing */
+ if (SEQ_GT(end, src->seqlo))
+ src->seqlo = end;
+ /* slide the window of what the other end can send */
+ if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
+ dst->seqhi = ack + MAX((win << sws), 1);
+
+
+ /* update states */
+ if (th->th_flags & TH_SYN)
+ if (src->state < TCPS_SYN_SENT)
+ src->state = TCPS_SYN_SENT;
+ if (th->th_flags & TH_FIN)
+ if (src->state < TCPS_CLOSING)
+ src->state = TCPS_CLOSING;
+ if (th->th_flags & TH_ACK) {
+ if (dst->state == TCPS_SYN_SENT) {
+ dst->state = TCPS_ESTABLISHED;
+ if (src->state == TCPS_ESTABLISHED &&
+ (*state)->src_node != NULL &&
+ pf_src_connlimit(state)) {
+ REASON_SET(reason, PFRES_SRCLIMIT);
+ return (PF_DROP);
+ }
+ } else if (dst->state == TCPS_CLOSING)
+ dst->state = TCPS_FIN_WAIT_2;
+ }
+ if (th->th_flags & TH_RST)
+ src->state = dst->state = TCPS_TIME_WAIT;
+
+ /* update expire time */
+ (*state)->expire = time_uptime;
+ if (src->state >= TCPS_FIN_WAIT_2 &&
+ dst->state >= TCPS_FIN_WAIT_2)
+ (*state)->timeout = PFTM_TCP_CLOSED;
+ else if (src->state >= TCPS_CLOSING &&
+ dst->state >= TCPS_CLOSING)
+ (*state)->timeout = PFTM_TCP_FIN_WAIT;
+ else if (src->state < TCPS_ESTABLISHED ||
+ dst->state < TCPS_ESTABLISHED)
+ (*state)->timeout = PFTM_TCP_OPENING;
+ else if (src->state >= TCPS_CLOSING ||
+ dst->state >= TCPS_CLOSING)
+ (*state)->timeout = PFTM_TCP_CLOSING;
+ else
+ (*state)->timeout = PFTM_TCP_ESTABLISHED;
+
+ /* Fall through to PASS packet */
+
+ } else if ((dst->state < TCPS_SYN_SENT ||
+ dst->state >= TCPS_FIN_WAIT_2 ||
+ src->state >= TCPS_FIN_WAIT_2) &&
+ SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
+ /* Within a window forward of the originating packet */
+ SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
+ /* Within a window backward of the originating packet */
+
+ /*
+ * This currently handles three situations:
+ * 1) Stupid stacks will shotgun SYNs before their peer
+ * replies.
+ * 2) When PF catches an already established stream (the
+ * firewall rebooted, the state table was flushed, routes
+ * changed...)
+ * 3) Packets get funky immediately after the connection
+ * closes (this should catch Solaris spurious ACK|FINs
+ * that web servers like to spew after a close)
+ *
+ * This must be a little more careful than the above code
+ * since packet floods will also be caught here. We don't
+ * update the TTL here to mitigate the damage of a packet
+ * flood and so the same code can handle awkward establishment
+ * and a loosened connection close.
+ * In the establishment case, a correct peer response will
+ * validate the connection, go through the normal state code
+ * and keep updating the state TTL.
+ */
+
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: loose state match: ");
+ pf_print_state(*state);
+ pf_print_flags(th->th_flags);
+ printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
+ "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
+ pd->p_len, ackskew, (unsigned long long)(*state)->packets[0],
+ (unsigned long long)(*state)->packets[1],
+ pd->dir == PF_IN ? "in" : "out",
+ pd->dir == (*state)->direction ? "fwd" : "rev");
+ }
+
+ if (dst->scrub || src->scrub) {
+ if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
+ *state, src, dst, copyback))
+ return (PF_DROP);
+ }
+
+ /* update max window */
+ if (src->max_win < win)
+ src->max_win = win;
+ /* synchronize sequencing */
+ if (SEQ_GT(end, src->seqlo))
+ src->seqlo = end;
+ /* slide the window of what the other end can send */
+ if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
+ dst->seqhi = ack + MAX((win << sws), 1);
+
+ /*
+ * Cannot set dst->seqhi here since this could be a shotgunned
+ * SYN and not an already established connection.
+ */
+
+ if (th->th_flags & TH_FIN)
+ if (src->state < TCPS_CLOSING)
+ src->state = TCPS_CLOSING;
+ if (th->th_flags & TH_RST)
+ src->state = dst->state = TCPS_TIME_WAIT;
+
+ /* Fall through to PASS packet */
+
+ } else {
+ if ((*state)->dst.state == TCPS_SYN_SENT &&
+ (*state)->src.state == TCPS_SYN_SENT) {
+ /* Send RST for state mismatches during handshake */
+ if (!(th->th_flags & TH_RST))
+ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
+ pd->dst, pd->src, th->th_dport,
+ th->th_sport, ntohl(th->th_ack), 0,
+ TH_RST, 0, 0,
+ (*state)->rule.ptr->return_ttl, 1, 0,
+ kif->pfik_ifp);
+ src->seqlo = 0;
+ src->seqhi = 1;
+ src->max_win = 1;
+ } else if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: BAD state: ");
+ pf_print_state(*state);
+ pf_print_flags(th->th_flags);
+ printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
+ "pkts=%llu:%llu dir=%s,%s\n",
+ seq, orig_seq, ack, pd->p_len, ackskew,
+ (unsigned long long)(*state)->packets[0],
+ (unsigned long long)(*state)->packets[1],
+ pd->dir == PF_IN ? "in" : "out",
+ pd->dir == (*state)->direction ? "fwd" : "rev");
+ printf("pf: State failure on: %c %c %c %c | %c %c\n",
+ SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
+ SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
+ ' ': '2',
+ (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
+ (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
+ SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
+ SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
+ }
+ REASON_SET(reason, PFRES_BADSTATE);
+ return (PF_DROP);
+ }
+
+ return (PF_PASS);
+}
+
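+/*
+ * Sloppy state tracking: update the peer states from the TCP flags only,
+ * without enforcing sequence number windows.
+ */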
+static int
+pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
+ struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
+{
+ struct tcphdr *th = pd->hdr.tcp;
+
+ if (th->th_flags & TH_SYN)
+ if (src->state < TCPS_SYN_SENT)
+ src->state = TCPS_SYN_SENT;
+ if (th->th_flags & TH_FIN)
+ if (src->state < TCPS_CLOSING)
+ src->state = TCPS_CLOSING;
+ if (th->th_flags & TH_ACK) {
+ if (dst->state == TCPS_SYN_SENT) {
+ dst->state = TCPS_ESTABLISHED;
+ if (src->state == TCPS_ESTABLISHED &&
+ (*state)->src_node != NULL &&
+ pf_src_connlimit(state)) {
+ REASON_SET(reason, PFRES_SRCLIMIT);
+ return (PF_DROP);
+ }
+ } else if (dst->state == TCPS_CLOSING) {
+ dst->state = TCPS_FIN_WAIT_2;
+ } else if (src->state == TCPS_SYN_SENT &&
+ dst->state < TCPS_SYN_SENT) {
+ /*
+ * Handle a special sloppy case where we only see one
+			 * half of the connection. If there is an ACK after
+ * the initial SYN without ever seeing a packet from
+ * the destination, set the connection to established.
+ */
+ dst->state = src->state = TCPS_ESTABLISHED;
+ if ((*state)->src_node != NULL &&
+ pf_src_connlimit(state)) {
+ REASON_SET(reason, PFRES_SRCLIMIT);
+ return (PF_DROP);
+ }
+ } else if (src->state == TCPS_CLOSING &&
+ dst->state == TCPS_ESTABLISHED &&
+ dst->seqlo == 0) {
+ /*
+ * Handle the closing of half connections where we
+ * don't see the full bidirectional FIN/ACK+ACK
+ * handshake.
+ */
+ dst->state = TCPS_CLOSING;
+ }
+ }
+ if (th->th_flags & TH_RST)
+ src->state = dst->state = TCPS_TIME_WAIT;
+
+ /* update expire time */
+ (*state)->expire = time_uptime;
+ if (src->state >= TCPS_FIN_WAIT_2 &&
+ dst->state >= TCPS_FIN_WAIT_2)
+ (*state)->timeout = PFTM_TCP_CLOSED;
+ else if (src->state >= TCPS_CLOSING &&
+ dst->state >= TCPS_CLOSING)
+ (*state)->timeout = PFTM_TCP_FIN_WAIT;
+ else if (src->state < TCPS_ESTABLISHED ||
+ dst->state < TCPS_ESTABLISHED)
+ (*state)->timeout = PFTM_TCP_OPENING;
+ else if (src->state >= TCPS_CLOSING ||
+ dst->state >= TCPS_CLOSING)
+ (*state)->timeout = PFTM_TCP_CLOSING;
+ else
+ (*state)->timeout = PFTM_TCP_ESTABLISHED;
+
+ return (PF_PASS);
+}
+
+static int
+pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
+ struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
+ u_short *reason)
+{
+ struct pf_state_key_cmp key;
+ struct tcphdr *th = pd->hdr.tcp;
+ int copyback = 0;
+ struct pf_state_peer *src, *dst;
+ struct pf_state_key *sk;
+
+ bzero(&key, sizeof(key));
+ key.af = pd->af;
+ key.proto = IPPROTO_TCP;
+ if (direction == PF_IN) { /* wire side, straight */
+ PF_ACPY(&key.addr[0], pd->src, key.af);
+ PF_ACPY(&key.addr[1], pd->dst, key.af);
+ key.port[0] = th->th_sport;
+ key.port[1] = th->th_dport;
+ } else { /* stack side, reverse */
+ PF_ACPY(&key.addr[1], pd->src, key.af);
+ PF_ACPY(&key.addr[0], pd->dst, key.af);
+ key.port[1] = th->th_sport;
+ key.port[0] = th->th_dport;
+ }
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ if (direction == (*state)->direction) {
+ src = &(*state)->src;
+ dst = &(*state)->dst;
+ } else {
+ src = &(*state)->dst;
+ dst = &(*state)->src;
+ }
+
+ sk = (*state)->key[pd->didx];
+
+ if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
+ if (direction != (*state)->direction) {
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_SYNPROXY_DROP);
+ }
+ if (th->th_flags & TH_SYN) {
+ if (ntohl(th->th_seq) != (*state)->src.seqlo) {
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_DROP);
+ }
+ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
+ pd->src, th->th_dport, th->th_sport,
+ (*state)->src.seqhi, ntohl(th->th_seq) + 1,
+ TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL);
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_SYNPROXY_DROP);
+ } else if (!(th->th_flags & TH_ACK) ||
+ (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
+ (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_DROP);
+ } else if ((*state)->src_node != NULL &&
+ pf_src_connlimit(state)) {
+ REASON_SET(reason, PFRES_SRCLIMIT);
+ return (PF_DROP);
+ } else
+ (*state)->src.state = PF_TCPS_PROXY_DST;
+ }
+ if ((*state)->src.state == PF_TCPS_PROXY_DST) {
+ if (direction == (*state)->direction) {
+ if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
+ (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
+ (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_DROP);
+ }
+ (*state)->src.max_win = MAX(ntohs(th->th_win), 1);
+ if ((*state)->dst.seqhi == 1)
+ (*state)->dst.seqhi = htonl(arc4random());
+ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
+ &sk->addr[pd->sidx], &sk->addr[pd->didx],
+ sk->port[pd->sidx], sk->port[pd->didx],
+ (*state)->dst.seqhi, 0, TH_SYN, 0,
+ (*state)->src.mss, 0, 0, (*state)->tag, NULL);
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_SYNPROXY_DROP);
+ } else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
+ (TH_SYN|TH_ACK)) ||
+ (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_DROP);
+ } else {
+ (*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
+ (*state)->dst.seqlo = ntohl(th->th_seq);
+ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
+ pd->src, th->th_dport, th->th_sport,
+ ntohl(th->th_ack), ntohl(th->th_seq) + 1,
+ TH_ACK, (*state)->src.max_win, 0, 0, 0,
+ (*state)->tag, NULL);
+ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
+ &sk->addr[pd->sidx], &sk->addr[pd->didx],
+ sk->port[pd->sidx], sk->port[pd->didx],
+ (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
+ TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL);
+ (*state)->src.seqdiff = (*state)->dst.seqhi -
+ (*state)->src.seqlo;
+ (*state)->dst.seqdiff = (*state)->src.seqhi -
+ (*state)->dst.seqlo;
+ (*state)->src.seqhi = (*state)->src.seqlo +
+ (*state)->dst.max_win;
+ (*state)->dst.seqhi = (*state)->dst.seqlo +
+ (*state)->src.max_win;
+ (*state)->src.wscale = (*state)->dst.wscale = 0;
+ (*state)->src.state = (*state)->dst.state =
+ TCPS_ESTABLISHED;
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_SYNPROXY_DROP);
+ }
+ }
+
+ if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
+ dst->state >= TCPS_FIN_WAIT_2 &&
+ src->state >= TCPS_FIN_WAIT_2) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: state reuse ");
+ pf_print_state(*state);
+ pf_print_flags(th->th_flags);
+ printf("\n");
+ }
+ /* XXX make sure it's the same direction ?? */
+ (*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
+ pf_unlink_state(*state, PF_ENTER_LOCKED);
+ *state = NULL;
+ return (PF_DROP);
+ }
+
+ if ((*state)->state_flags & PFSTATE_SLOPPY) {
+ if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP)
+ return (PF_DROP);
+ } else {
+ if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason,
+ &copyback) == PF_DROP)
+ return (PF_DROP);
+ }
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk = (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
+ nk->port[pd->sidx] != th->th_sport)
+ pf_change_ap(m, pd->src, &th->th_sport,
+ pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx],
+ nk->port[pd->sidx], 0, pd->af);
+
+ if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
+ nk->port[pd->didx] != th->th_dport)
+ pf_change_ap(m, pd->dst, &th->th_dport,
+ pd->ip_sum, &th->th_sum, &nk->addr[pd->didx],
+ nk->port[pd->didx], 0, pd->af);
+ copyback = 1;
+ }
+
+ /* Copyback sequence modulation or stateful scrub changes if needed */
+ if (copyback)
+ m_copyback(m, off, sizeof(*th), (caddr_t)th);
+
+ return (PF_PASS);
+}
+
+static int
+pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
+ struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
+{
+ struct pf_state_peer *src, *dst;
+ struct pf_state_key_cmp key;
+ struct udphdr *uh = pd->hdr.udp;
+
+ bzero(&key, sizeof(key));
+ key.af = pd->af;
+ key.proto = IPPROTO_UDP;
+ if (direction == PF_IN) { /* wire side, straight */
+ PF_ACPY(&key.addr[0], pd->src, key.af);
+ PF_ACPY(&key.addr[1], pd->dst, key.af);
+ key.port[0] = uh->uh_sport;
+ key.port[1] = uh->uh_dport;
+ } else { /* stack side, reverse */
+ PF_ACPY(&key.addr[1], pd->src, key.af);
+ PF_ACPY(&key.addr[0], pd->dst, key.af);
+ key.port[1] = uh->uh_sport;
+ key.port[0] = uh->uh_dport;
+ }
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ if (direction == (*state)->direction) {
+ src = &(*state)->src;
+ dst = &(*state)->dst;
+ } else {
+ src = &(*state)->dst;
+ dst = &(*state)->src;
+ }
+
+ /* update states */
+ if (src->state < PFUDPS_SINGLE)
+ src->state = PFUDPS_SINGLE;
+ if (dst->state == PFUDPS_SINGLE)
+ dst->state = PFUDPS_MULTIPLE;
+
+ /* update expire time */
+ (*state)->expire = time_uptime;
+ if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
+ (*state)->timeout = PFTM_UDP_MULTIPLE;
+ else
+ (*state)->timeout = PFTM_UDP_SINGLE;
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk = (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
+ nk->port[pd->sidx] != uh->uh_sport)
+ pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum,
+ &uh->uh_sum, &nk->addr[pd->sidx],
+ nk->port[pd->sidx], 1, pd->af);
+
+ if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
+ nk->port[pd->didx] != uh->uh_dport)
+ pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum,
+ &uh->uh_sum, &nk->addr[pd->didx],
+ nk->port[pd->didx], 1, pd->af);
+ m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
+ }
+
+ return (PF_PASS);
+}
+
+static int
+pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
+ struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
+{
+ struct pf_addr *saddr = pd->src, *daddr = pd->dst;
+ u_int16_t icmpid = 0, *icmpsum;
+ u_int8_t icmptype;
+ int state_icmp = 0;
+ struct pf_state_key_cmp key;
+
+ bzero(&key, sizeof(key));
+ switch (pd->proto) {
+#ifdef INET
+ case IPPROTO_ICMP:
+ icmptype = pd->hdr.icmp->icmp_type;
+ icmpid = pd->hdr.icmp->icmp_id;
+ icmpsum = &pd->hdr.icmp->icmp_cksum;
+
+ if (icmptype == ICMP_UNREACH ||
+ icmptype == ICMP_SOURCEQUENCH ||
+ icmptype == ICMP_REDIRECT ||
+ icmptype == ICMP_TIMXCEED ||
+ icmptype == ICMP_PARAMPROB)
+ state_icmp++;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+ icmptype = pd->hdr.icmp6->icmp6_type;
+ icmpid = pd->hdr.icmp6->icmp6_id;
+ icmpsum = &pd->hdr.icmp6->icmp6_cksum;
+
+ if (icmptype == ICMP6_DST_UNREACH ||
+ icmptype == ICMP6_PACKET_TOO_BIG ||
+ icmptype == ICMP6_TIME_EXCEEDED ||
+ icmptype == ICMP6_PARAM_PROB)
+ state_icmp++;
+ break;
+#endif /* INET6 */
+ }
+
+ if (!state_icmp) {
+
+ /*
+ * ICMP query/reply message not related to a TCP/UDP packet.
+ * Search for an ICMP state.
+ */
+ key.af = pd->af;
+ key.proto = pd->proto;
+ key.port[0] = key.port[1] = icmpid;
+ if (direction == PF_IN) { /* wire side, straight */
+ PF_ACPY(&key.addr[0], pd->src, key.af);
+ PF_ACPY(&key.addr[1], pd->dst, key.af);
+ } else { /* stack side, reverse */
+ PF_ACPY(&key.addr[1], pd->src, key.af);
+ PF_ACPY(&key.addr[0], pd->dst, key.af);
+ }
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ (*state)->expire = time_uptime;
+ (*state)->timeout = PFTM_ICMP_ERROR_REPLY;
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk = (*state)->key[pd->didx];
+
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET:
+ if (PF_ANEQ(pd->src,
+ &nk->addr[pd->sidx], AF_INET))
+ pf_change_a(&saddr->v4.s_addr,
+ pd->ip_sum,
+ nk->addr[pd->sidx].v4.s_addr, 0);
+
+ if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
+ AF_INET))
+ pf_change_a(&daddr->v4.s_addr,
+ pd->ip_sum,
+ nk->addr[pd->didx].v4.s_addr, 0);
+
+ if (nk->port[0] !=
+ pd->hdr.icmp->icmp_id) {
+ pd->hdr.icmp->icmp_cksum =
+ pf_cksum_fixup(
+ pd->hdr.icmp->icmp_cksum, icmpid,
+ nk->port[pd->sidx], 0);
+ pd->hdr.icmp->icmp_id =
+ nk->port[pd->sidx];
+ }
+
+ m_copyback(m, off, ICMP_MINLEN,
+ (caddr_t )pd->hdr.icmp);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (PF_ANEQ(pd->src,
+ &nk->addr[pd->sidx], AF_INET6))
+ pf_change_a6(saddr,
+ &pd->hdr.icmp6->icmp6_cksum,
+ &nk->addr[pd->sidx], 0);
+
+ if (PF_ANEQ(pd->dst,
+ &nk->addr[pd->didx], AF_INET6))
+ pf_change_a6(daddr,
+ &pd->hdr.icmp6->icmp6_cksum,
+ &nk->addr[pd->didx], 0);
+
+ m_copyback(m, off, sizeof(struct icmp6_hdr),
+ (caddr_t )pd->hdr.icmp6);
+ break;
+#endif /* INET6 */
+ }
+ }
+ return (PF_PASS);
+
+ } else {
+ /*
+ * ICMP error message in response to a TCP/UDP packet.
+ * Extract the inner TCP/UDP header and search for that state.
+ */
+
+ struct pf_pdesc pd2;
+ bzero(&pd2, sizeof pd2);
+#ifdef INET
+ struct ip h2;
+#endif /* INET */
+#ifdef INET6
+ struct ip6_hdr h2_6;
+ int terminal = 0;
+#endif /* INET6 */
+ int ipoff2 = 0;
+ int off2 = 0;
+
+ pd2.af = pd->af;
+ /* Payload packet is from the opposite direction. */
+ pd2.sidx = (direction == PF_IN) ? 1 : 0;
+ pd2.didx = (direction == PF_IN) ? 0 : 1;
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET:
+ /* offset of h2 in mbuf chain */
+ ipoff2 = off + ICMP_MINLEN;
+
+ if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
+ NULL, reason, pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short "
+ "(ip)\n"));
+ return (PF_DROP);
+ }
+ /*
+ * ICMP error messages don't refer to non-first
+ * fragments
+ */
+ if (h2.ip_off & htons(IP_OFFMASK)) {
+ REASON_SET(reason, PFRES_FRAG);
+ return (PF_DROP);
+ }
+
+ /* offset of protocol header that follows h2 */
+ off2 = ipoff2 + (h2.ip_hl << 2);
+
+ pd2.proto = h2.ip_p;
+ pd2.src = (struct pf_addr *)&h2.ip_src;
+ pd2.dst = (struct pf_addr *)&h2.ip_dst;
+ pd2.ip_sum = &h2.ip_sum;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ ipoff2 = off + sizeof(struct icmp6_hdr);
+
+ if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
+ NULL, reason, pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short "
+ "(ip6)\n"));
+ return (PF_DROP);
+ }
+ pd2.proto = h2_6.ip6_nxt;
+ pd2.src = (struct pf_addr *)&h2_6.ip6_src;
+ pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
+ pd2.ip_sum = NULL;
+ off2 = ipoff2 + sizeof(h2_6);
+ do {
+ switch (pd2.proto) {
+ case IPPROTO_FRAGMENT:
+ /*
+ * ICMPv6 error messages for
+ * non-first fragments
+ */
+ REASON_SET(reason, PFRES_FRAG);
+ return (PF_DROP);
+ case IPPROTO_AH:
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_DSTOPTS: {
+ /* get next header and header length */
+ struct ip6_ext opt6;
+
+ if (!pf_pull_hdr(m, off2, &opt6,
+ sizeof(opt6), NULL, reason,
+ pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMPv6 short opt\n"));
+ return (PF_DROP);
+ }
+ if (pd2.proto == IPPROTO_AH)
+ off2 += (opt6.ip6e_len + 2) * 4;
+ else
+ off2 += (opt6.ip6e_len + 1) * 8;
+ pd2.proto = opt6.ip6e_nxt;
+				/* go to the next header */
+ break;
+ }
+ default:
+ terminal++;
+ break;
+ }
+ } while (!terminal);
+ break;
+#endif /* INET6 */
+ }
+
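+		/* Look up the state matching the embedded protocol header. */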
+ switch (pd2.proto) {
+ case IPPROTO_TCP: {
+ struct tcphdr th;
+ u_int32_t seq;
+ struct pf_state_peer *src, *dst;
+ u_int8_t dws;
+ int copyback = 0;
+
+ /*
+			 * Only the first 8 bytes of the TCP header are
+			 * guaranteed to be present. Don't access any TCP
+			 * header fields after th_seq; an ackskew test is
+			 * not possible.
+ */
+ if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
+ pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short "
+ "(tcp)\n"));
+ return (PF_DROP);
+ }
+
+ key.af = pd2.af;
+ key.proto = IPPROTO_TCP;
+ PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+ PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ key.port[pd2.sidx] = th.th_sport;
+ key.port[pd2.didx] = th.th_dport;
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ if (direction == (*state)->direction) {
+ src = &(*state)->dst;
+ dst = &(*state)->src;
+ } else {
+ src = &(*state)->src;
+ dst = &(*state)->dst;
+ }
+
+ if (src->wscale && dst->wscale)
+ dws = dst->wscale & PF_WSCALE_MASK;
+ else
+ dws = 0;
+
+ /* Demodulate sequence number */
+ seq = ntohl(th.th_seq) - src->seqdiff;
+ if (src->seqdiff) {
+ pf_change_a(&th.th_seq, icmpsum,
+ htonl(seq), 0);
+ copyback = 1;
+ }
+
+ if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
+ (!SEQ_GEQ(src->seqhi, seq) ||
+ !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: BAD ICMP %d:%d ",
+ icmptype, pd->hdr.icmp->icmp_code);
+ pf_print_host(pd->src, 0, pd->af);
+ printf(" -> ");
+ pf_print_host(pd->dst, 0, pd->af);
+ printf(" state: ");
+ pf_print_state(*state);
+ printf(" seq=%u\n", seq);
+ }
+ REASON_SET(reason, PFRES_BADSTATE);
+ return (PF_DROP);
+ } else {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: OK ICMP %d:%d ",
+ icmptype, pd->hdr.icmp->icmp_code);
+ pf_print_host(pd->src, 0, pd->af);
+ printf(" -> ");
+ pf_print_host(pd->dst, 0, pd->af);
+ printf(" state: ");
+ pf_print_state(*state);
+ printf(" seq=%u\n", seq);
+ }
+ }
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] !=
+ (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk =
+ (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd2.src,
+ &nk->addr[pd2.sidx], pd2.af) ||
+ nk->port[pd2.sidx] != th.th_sport)
+ pf_change_icmp(pd2.src, &th.th_sport,
+ daddr, &nk->addr[pd2.sidx],
+ nk->port[pd2.sidx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, pd2.af);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af) ||
+ nk->port[pd2.didx] != th.th_dport)
+ pf_change_icmp(pd2.dst, &th.th_dport,
+ saddr, &nk->addr[pd2.didx],
+ nk->port[pd2.didx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, pd2.af);
+ copyback = 1;
+ }
+
+ if (copyback) {
+ switch (pd2.af) {
+#ifdef INET
+ case AF_INET:
+ m_copyback(m, off, ICMP_MINLEN,
+ (caddr_t )pd->hdr.icmp);
+ m_copyback(m, ipoff2, sizeof(h2),
+ (caddr_t )&h2);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ m_copyback(m, off,
+ sizeof(struct icmp6_hdr),
+ (caddr_t )pd->hdr.icmp6);
+ m_copyback(m, ipoff2, sizeof(h2_6),
+ (caddr_t )&h2_6);
+ break;
+#endif /* INET6 */
+ }
+ m_copyback(m, off2, 8, (caddr_t)&th);
+ }
+
+ return (PF_PASS);
+ break;
+ }
+ case IPPROTO_UDP: {
+ struct udphdr uh;
+
+ if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
+ NULL, reason, pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short "
+ "(udp)\n"));
+ return (PF_DROP);
+ }
+
+ key.af = pd2.af;
+ key.proto = IPPROTO_UDP;
+ PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+ PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ key.port[pd2.sidx] = uh.uh_sport;
+ key.port[pd2.didx] = uh.uh_dport;
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] !=
+ (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk =
+ (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd2.src,
+ &nk->addr[pd2.sidx], pd2.af) ||
+ nk->port[pd2.sidx] != uh.uh_sport)
+ pf_change_icmp(pd2.src, &uh.uh_sport,
+ daddr, &nk->addr[pd2.sidx],
+ nk->port[pd2.sidx], &uh.uh_sum,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 1, pd2.af);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af) ||
+ nk->port[pd2.didx] != uh.uh_dport)
+ pf_change_icmp(pd2.dst, &uh.uh_dport,
+ saddr, &nk->addr[pd2.didx],
+ nk->port[pd2.didx], &uh.uh_sum,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 1, pd2.af);
+
+ switch (pd2.af) {
+#ifdef INET
+ case AF_INET:
+ m_copyback(m, off, ICMP_MINLEN,
+ (caddr_t )pd->hdr.icmp);
+ m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ m_copyback(m, off,
+ sizeof(struct icmp6_hdr),
+ (caddr_t )pd->hdr.icmp6);
+ m_copyback(m, ipoff2, sizeof(h2_6),
+ (caddr_t )&h2_6);
+ break;
+#endif /* INET6 */
+ }
+ m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
+ }
+ return (PF_PASS);
+ break;
+ }
+#ifdef INET
+ case IPPROTO_ICMP: {
+ struct icmp iih;
+
+ if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
+ NULL, reason, pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short i"
+ "(icmp)\n"));
+ return (PF_DROP);
+ }
+
+ key.af = pd2.af;
+ key.proto = IPPROTO_ICMP;
+ PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+ PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ key.port[0] = key.port[1] = iih.icmp_id;
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] !=
+ (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk =
+ (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd2.src,
+ &nk->addr[pd2.sidx], pd2.af) ||
+ nk->port[pd2.sidx] != iih.icmp_id)
+ pf_change_icmp(pd2.src, &iih.icmp_id,
+ daddr, &nk->addr[pd2.sidx],
+ nk->port[pd2.sidx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, AF_INET);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af) ||
+ nk->port[pd2.didx] != iih.icmp_id)
+ pf_change_icmp(pd2.dst, &iih.icmp_id,
+ saddr, &nk->addr[pd2.didx],
+ nk->port[pd2.didx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, AF_INET);
+
+ m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
+ m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
+ m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
+ }
+ return (PF_PASS);
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case IPPROTO_ICMPV6: {
+ struct icmp6_hdr iih;
+
+ if (!pf_pull_hdr(m, off2, &iih,
+ sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short "
+ "(icmp6)\n"));
+ return (PF_DROP);
+ }
+
+ key.af = pd2.af;
+ key.proto = IPPROTO_ICMPV6;
+ PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+ PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ key.port[0] = key.port[1] = iih.icmp6_id;
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] !=
+ (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk =
+ (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd2.src,
+ &nk->addr[pd2.sidx], pd2.af) ||
+ nk->port[pd2.sidx] != iih.icmp6_id)
+ pf_change_icmp(pd2.src, &iih.icmp6_id,
+ daddr, &nk->addr[pd2.sidx],
+ nk->port[pd2.sidx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, AF_INET6);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af) ||
+ nk->port[pd2.didx] != iih.icmp6_id)
+ pf_change_icmp(pd2.dst, &iih.icmp6_id,
+ saddr, &nk->addr[pd2.didx],
+ nk->port[pd2.didx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, AF_INET6);
+
+ m_copyback(m, off, sizeof(struct icmp6_hdr),
+ (caddr_t)pd->hdr.icmp6);
+ m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
+ m_copyback(m, off2, sizeof(struct icmp6_hdr),
+ (caddr_t)&iih);
+ }
+ return (PF_PASS);
+ break;
+ }
+#endif /* INET6 */
+ default: {
+ key.af = pd2.af;
+ key.proto = pd2.proto;
+ PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+ PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ key.port[0] = key.port[1] = 0;
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] !=
+ (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk =
+ (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd2.src,
+ &nk->addr[pd2.sidx], pd2.af))
+ pf_change_icmp(pd2.src, NULL, daddr,
+ &nk->addr[pd2.sidx], 0, NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, pd2.af);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af))
+ pf_change_icmp(pd2.dst, NULL, saddr,
+ &nk->addr[pd2.didx], 0, NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, pd2.af);
+
+ switch (pd2.af) {
+#ifdef INET
+ case AF_INET:
+ m_copyback(m, off, ICMP_MINLEN,
+ (caddr_t)pd->hdr.icmp);
+ m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ m_copyback(m, off,
+ sizeof(struct icmp6_hdr),
+ (caddr_t )pd->hdr.icmp6);
+ m_copyback(m, ipoff2, sizeof(h2_6),
+ (caddr_t )&h2_6);
+ break;
+#endif /* INET6 */
+ }
+ }
+ return (PF_PASS);
+ break;
+ }
+ }
+ }
+}
+
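+/*
+ * State tracking for protocols other than TCP, UDP and ICMP.
+ */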
+static int
+pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
+ struct mbuf *m, struct pf_pdesc *pd)
+{
+ struct pf_state_peer *src, *dst;
+ struct pf_state_key_cmp key;
+
+ bzero(&key, sizeof(key));
+ key.af = pd->af;
+ key.proto = pd->proto;
+ if (direction == PF_IN) {
+ PF_ACPY(&key.addr[0], pd->src, key.af);
+ PF_ACPY(&key.addr[1], pd->dst, key.af);
+ key.port[0] = key.port[1] = 0;
+ } else {
+ PF_ACPY(&key.addr[1], pd->src, key.af);
+ PF_ACPY(&key.addr[0], pd->dst, key.af);
+ key.port[1] = key.port[0] = 0;
+ }
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ if (direction == (*state)->direction) {
+ src = &(*state)->src;
+ dst = &(*state)->dst;
+ } else {
+ src = &(*state)->dst;
+ dst = &(*state)->src;
+ }
+
+ /* update states */
+ if (src->state < PFOTHERS_SINGLE)
+ src->state = PFOTHERS_SINGLE;
+ if (dst->state == PFOTHERS_SINGLE)
+ dst->state = PFOTHERS_MULTIPLE;
+
+ /* update expire time */
+ (*state)->expire = time_uptime;
+ if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
+ (*state)->timeout = PFTM_OTHER_MULTIPLE;
+ else
+ (*state)->timeout = PFTM_OTHER_SINGLE;
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk = (*state)->key[pd->didx];
+
+ KASSERT(nk, ("%s: nk is null", __func__));
+ KASSERT(pd, ("%s: pd is null", __func__));
+ KASSERT(pd->src, ("%s: pd->src is null", __func__));
+ KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET:
+ if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
+ pf_change_a(&pd->src->v4.s_addr,
+ pd->ip_sum,
+ nk->addr[pd->sidx].v4.s_addr,
+ 0);
+
+
+ if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
+ pf_change_a(&pd->dst->v4.s_addr,
+ pd->ip_sum,
+ nk->addr[pd->didx].v4.s_addr,
+ 0);
+
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
+ PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
+
+ if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
+ PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
+#endif /* INET6 */
+ }
+ }
+ return (PF_PASS);
+}
+
+/*
+ * ipoff and off are measured from the start of the mbuf chain.
+ * h must be at "ipoff" on the mbuf chain.
+ */
+void *
+pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
+ u_short *actionp, u_short *reasonp, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET: {
+ struct ip *h = mtod(m, struct ip *);
+ u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
+
+ if (fragoff) {
+ if (fragoff >= len)
+ ACTION_SET(actionp, PF_PASS);
+ else {
+ ACTION_SET(actionp, PF_DROP);
+ REASON_SET(reasonp, PFRES_FRAG);
+ }
+ return (NULL);
+ }
+ if (m->m_pkthdr.len < off + len ||
+ ntohs(h->ip_len) < off + len) {
+ ACTION_SET(actionp, PF_DROP);
+ REASON_SET(reasonp, PFRES_SHORT);
+ return (NULL);
+ }
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6: {
+ struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
+
+ if (m->m_pkthdr.len < off + len ||
+ (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
+ (unsigned)(off + len)) {
+ ACTION_SET(actionp, PF_DROP);
+ REASON_SET(reasonp, PFRES_SHORT);
+ return (NULL);
+ }
+ break;
+ }
+#endif /* INET6 */
+ }
+ m_copydata(m, off, len, p);
+ return (p);
+}
+
+#ifdef RADIX_MPATH
+static int
+pf_routable_oldmpath(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
+ int rtableid)
+{
+ struct radix_node_head *rnh;
+ struct sockaddr_in *dst;
+ int ret = 1;
+ int check_mpath;
+#ifdef INET6
+ struct sockaddr_in6 *dst6;
+ struct route_in6 ro;
+#else
+ struct route ro;
+#endif
+ struct radix_node *rn;
+ struct rtentry *rt;
+ struct ifnet *ifp;
+
+ check_mpath = 0;
+ /* XXX: stick to table 0 for now */
+ rnh = rt_tables_get_rnh(0, af);
+ if (rnh != NULL && rn_mpath_capable(rnh))
+ check_mpath = 1;
+ bzero(&ro, sizeof(ro));
+ switch (af) {
+ case AF_INET:
+ dst = satosin(&ro.ro_dst);
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = addr->v4;
+ break;
+#ifdef INET6
+ case AF_INET6:
+ /*
+ * Skip check for addresses with embedded interface scope,
+ * as they would always match anyway.
+ */
+ if (IN6_IS_SCOPE_EMBED(&addr->v6))
+ goto out;
+ dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
+ dst6->sin6_family = AF_INET6;
+ dst6->sin6_len = sizeof(*dst6);
+ dst6->sin6_addr = addr->v6;
+ break;
+#endif /* INET6 */
+ default:
+ return (0);
+ }
+
+ /* Skip checks for ipsec interfaces */
+ if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
+ goto out;
+
+ switch (af) {
+#ifdef INET6
+ case AF_INET6:
+ in6_rtalloc_ign(&ro, 0, rtableid);
+ break;
+#endif
+#ifdef INET
+ case AF_INET:
+ in_rtalloc_ign((struct route *)&ro, 0, rtableid);
+ break;
+#endif
+ }
+
+ if (ro.ro_rt != NULL) {
+ /* No interface given, this is a no-route check */
+ if (kif == NULL)
+ goto out;
+
+ if (kif->pfik_ifp == NULL) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Perform uRPF check if passed input interface */
+ ret = 0;
+ rn = (struct radix_node *)ro.ro_rt;
+ do {
+ rt = (struct rtentry *)rn;
+ ifp = rt->rt_ifp;
+
+ if (kif->pfik_ifp == ifp)
+ ret = 1;
+ rn = rn_mpath_next(rn);
+ } while (check_mpath == 1 && rn != NULL && ret == 0);
+ } else
+ ret = 0;
+out:
+ if (ro.ro_rt != NULL)
+ RTFREE(ro.ro_rt);
+ return (ret);
+}
+#endif
+
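+/*
+ * Check that a route to addr exists.  If kif is given, also require that
+ * the route points back out through that interface (uRPF check).
+ */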
+int
+pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
+ int rtableid)
+{
+#ifdef INET
+ struct nhop4_basic nh4;
+#endif
+#ifdef INET6
+ struct nhop6_basic nh6;
+#endif
+ struct ifnet *ifp;
+#ifdef RADIX_MPATH
+ struct radix_node_head *rnh;
+
+ /* XXX: stick to table 0 for now */
+ rnh = rt_tables_get_rnh(0, af);
+ if (rnh != NULL && rn_mpath_capable(rnh))
+ return (pf_routable_oldmpath(addr, af, kif, rtableid));
+#endif
+ /*
+ * Skip check for addresses with embedded interface scope,
+ * as they would always match anyway.
+ */
+ if (af == AF_INET6 && IN6_IS_SCOPE_EMBED(&addr->v6))
+ return (1);
+
+ if (af != AF_INET && af != AF_INET6)
+ return (0);
+
+ /* Skip checks for ipsec interfaces */
+ if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
+ return (1);
+
+ ifp = NULL;
+
+ switch (af) {
+#ifdef INET6
+ case AF_INET6:
+ if (fib6_lookup_nh_basic(rtableid, &addr->v6, 0, 0, 0, &nh6)!=0)
+ return (0);
+ ifp = nh6.nh_ifp;
+ break;
+#endif
+#ifdef INET
+ case AF_INET:
+ if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) != 0)
+ return (0);
+ ifp = nh4.nh_ifp;
+ break;
+#endif
+ }
+
+ /* No interface given, this is a no-route check */
+ if (kif == NULL)
+ return (1);
+
+ if (kif->pfik_ifp == NULL)
+ return (0);
+
+ /* Perform uRPF check if passed input interface */
+ if (kif->pfik_ifp == ifp)
+ return (1);
+ return (0);
+}
+
+#ifdef INET
+static void
+pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
+ struct pf_state *s, struct pf_pdesc *pd)
+{
+ struct mbuf *m0, *m1;
+ struct sockaddr_in dst;
+ struct ip *ip;
+ struct ifnet *ifp = NULL;
+ struct pf_addr naddr;
+ struct pf_src_node *sn = NULL;
+ int error = 0;
+ uint16_t ip_len, ip_off;
+
+ KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
+ KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
+ __func__));
+
+ if ((pd->pf_mtag == NULL &&
+ ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
+ pd->pf_mtag->routed++ > 3) {
+ m0 = *m;
+ *m = NULL;
+ goto bad_locked;
+ }
+
+ if (r->rt == PF_DUPTO) {
+ if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
+ if (s)
+ PF_STATE_UNLOCK(s);
+ return;
+ }
+ } else {
+ if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
+ if (s)
+ PF_STATE_UNLOCK(s);
+ return;
+ }
+ m0 = *m;
+ }
+
+ ip = mtod(m0, struct ip *);
+
+ bzero(&dst, sizeof(dst));
+ dst.sin_family = AF_INET;
+ dst.sin_len = sizeof(dst);
+ dst.sin_addr = ip->ip_dst;
+
+ if (r->rt == PF_FASTROUTE) {
+ struct nhop4_basic nh4;
+
+ if (s)
+ PF_STATE_UNLOCK(s);
+
+ if (fib4_lookup_nh_basic(M_GETFIB(m0), ip->ip_dst, 0,
+ m0->m_pkthdr.flowid, &nh4) != 0) {
+ KMOD_IPSTAT_INC(ips_noroute);
+ error = EHOSTUNREACH;
+ goto bad;
+ }
+
+ ifp = nh4.nh_ifp;
+ dst.sin_addr = nh4.nh_addr;
+ } else {
+ if (TAILQ_EMPTY(&r->rpool.list)) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
+ goto bad_locked;
+ }
+ if (s == NULL) {
+ pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
+ &naddr, NULL, &sn);
+ if (!PF_AZERO(&naddr, AF_INET))
+ dst.sin_addr.s_addr = naddr.v4.s_addr;
+ ifp = r->rpool.cur->kif ?
+ r->rpool.cur->kif->pfik_ifp : NULL;
+ } else {
+ if (!PF_AZERO(&s->rt_addr, AF_INET))
+ dst.sin_addr.s_addr =
+ s->rt_addr.v4.s_addr;
+ ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
+ PF_STATE_UNLOCK(s);
+ }
+ }
+ if (ifp == NULL)
+ goto bad;
+
+ if (oifp != ifp) {
+ if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS)
+ goto bad;
+ else if (m0 == NULL)
+ goto done;
+ if (m0->m_len < sizeof(struct ip)) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
+ goto bad;
+ }
+ ip = mtod(m0, struct ip *);
+ }
+
+ if (ifp->if_flags & IFF_LOOPBACK)
+ m0->m_flags |= M_SKIP_FIREWALL;
+
+ ip_len = ntohs(ip->ip_len);
+ ip_off = ntohs(ip->ip_off);
+
+ /* Copied from FreeBSD 10.0-CURRENT ip_output. */
+ m0->m_pkthdr.csum_flags |= CSUM_IP;
+ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
+ in_delayed_cksum(m0);
+ m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+#ifdef SCTP
+ if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
+ sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
+ m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
+ }
+#endif
+
+ /*
+ * If small enough for interface, or the interface will take
+ * care of the fragmentation for us, we can just send directly.
+ */
+ if (ip_len <= ifp->if_mtu ||
+ (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
+ ip->ip_sum = 0;
+ if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
+ ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
+ m0->m_pkthdr.csum_flags &= ~CSUM_IP;
+ }
+ m_clrprotoflags(m0); /* Avoid confusing lower layers. */
+ error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
+ goto done;
+ }
+
+ /* Balk when DF bit is set or the interface didn't support TSO. */
+ if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
+ error = EMSGSIZE;
+ KMOD_IPSTAT_INC(ips_cantfrag);
+ if (r->rt != PF_DUPTO) {
+ icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
+ ifp->if_mtu);
+ goto done;
+ } else
+ goto bad;
+ }
+
+ error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist);
+ if (error)
+ goto bad;
+
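+	/* Transmit the fragment chain; fragments are linked via m_nextpkt. */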
+ for (; m0; m0 = m1) {
+ m1 = m0->m_nextpkt;
+ m0->m_nextpkt = NULL;
+ if (error == 0) {
+ m_clrprotoflags(m0);
+ error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
+ } else
+ m_freem(m0);
+ }
+
+ if (error == 0)
+ KMOD_IPSTAT_INC(ips_fragmented);
+
+done:
+ if (r->rt != PF_DUPTO)
+ *m = NULL;
+ return;
+
+bad_locked:
+ if (s)
+ PF_STATE_UNLOCK(s);
+bad:
+ m_freem(m0);
+ goto done;
+}
+#endif /* INET */
+
+#ifdef INET6
+static void
+pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
+ struct pf_state *s, struct pf_pdesc *pd)
+{
+ struct mbuf *m0;
+ struct sockaddr_in6 dst;
+ struct ip6_hdr *ip6;
+ struct ifnet *ifp = NULL;
+ struct pf_addr naddr;
+ struct pf_src_node *sn = NULL;
+
+ KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
+ KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
+ __func__));
+
+ if ((pd->pf_mtag == NULL &&
+ ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
+ pd->pf_mtag->routed++ > 3) {
+ m0 = *m;
+ *m = NULL;
+ goto bad_locked;
+ }
+
+ if (r->rt == PF_DUPTO) {
+ if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
+ if (s)
+ PF_STATE_UNLOCK(s);
+ return;
+ }
+ } else {
+ if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
+ if (s)
+ PF_STATE_UNLOCK(s);
+ return;
+ }
+ m0 = *m;
+ }
+
+ ip6 = mtod(m0, struct ip6_hdr *);
+
+ bzero(&dst, sizeof(dst));
+ dst.sin6_family = AF_INET6;
+ dst.sin6_len = sizeof(dst);
+ dst.sin6_addr = ip6->ip6_dst;
+
+ /* Cheat. XXX why only in the v6 case??? */
+ if (r->rt == PF_FASTROUTE) {
+ if (s)
+ PF_STATE_UNLOCK(s);
+ m0->m_flags |= M_SKIP_FIREWALL;
+ ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
+ *m = NULL;
+ return;
+ }
+
+ if (TAILQ_EMPTY(&r->rpool.list)) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
+ goto bad_locked;
+ }
+ if (s == NULL) {
+ pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
+ &naddr, NULL, &sn);
+ if (!PF_AZERO(&naddr, AF_INET6))
+ PF_ACPY((struct pf_addr *)&dst.sin6_addr,
+ &naddr, AF_INET6);
+ ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
+ } else {
+ if (!PF_AZERO(&s->rt_addr, AF_INET6))
+ PF_ACPY((struct pf_addr *)&dst.sin6_addr,
+ &s->rt_addr, AF_INET6);
+ ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
+ }
+
+ if (s)
+ PF_STATE_UNLOCK(s);
+
+ if (ifp == NULL)
+ goto bad;
+
+ if (oifp != ifp) {
+ if (pf_test6(PF_FWD, ifp, &m0, NULL) != PF_PASS)
+ goto bad;
+ else if (m0 == NULL)
+ goto done;
+ if (m0->m_len < sizeof(struct ip6_hdr)) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
+ __func__));
+ goto bad;
+ }
+ ip6 = mtod(m0, struct ip6_hdr *);
+ }
+
+ if (ifp->if_flags & IFF_LOOPBACK)
+ m0->m_flags |= M_SKIP_FIREWALL;
+
+ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 &
+ ~ifp->if_hwassist) {
+ uint32_t plen = m0->m_pkthdr.len - sizeof(*ip6);
+ in6_delayed_cksum(m0, plen, sizeof(struct ip6_hdr));
+ m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
+ }
+
+ /*
+ * If the packet is too large for the outgoing interface,
+ * send back an icmp6 error.
+ */
+ if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
+ dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
+ if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu)
+ nd6_output_ifp(ifp, ifp, m0, &dst, NULL);
+ else {
+ in6_ifstat_inc(ifp, ifs6_in_toobig);
+ if (r->rt != PF_DUPTO)
+ icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
+ else
+ goto bad;
+ }
+
+done:
+ if (r->rt != PF_DUPTO)
+ *m = NULL;
+ return;
+
+bad_locked:
+ if (s)
+ PF_STATE_UNLOCK(s);
+bad:
+ m_freem(m0);
+ goto done;
+}
+#endif /* INET6 */
+
+/*
+ * FreeBSD supports cksum offloads for the following drivers:
+ * em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4),
+ * ti(4), txp(4), xl(4)
+ *
+ * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
+ *  the network driver performed the cksum including the pseudo header;
+ *  only csum_data needs to be verified
+ * CSUM_DATA_VALID :
+ *  the network driver performed the cksum, but an additional pseudo header
+ *  cksum computation with the partial csum_data is needed (i.e. lack of H/W
+ *  support for the pseudo header, for instance hme(4), sk(4) and possibly
+ *  gem(4))
+ *
+ * After validating the cksum of the packet, set both the CSUM_DATA_VALID and
+ * CSUM_PSEUDO_HDR flags to avoid recomputation of the cksum in the upper
+ * TCP/UDP layer.
+ * Also, set csum_data to 0xffff to force cksum validation.
+ */
+static int
+pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
+{
+ u_int16_t sum = 0;
+ int hw_assist = 0;
+ struct ip *ip;
+
+ if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
+ return (1);
+ if (m->m_pkthdr.len < off + len)
+ return (1);
+
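+	/* Use checksum data already verified by the hardware when available. */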
+ switch (p) {
+ case IPPROTO_TCP:
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
+ sum = m->m_pkthdr.csum_data;
+ } else {
+ ip = mtod(m, struct ip *);
+ sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htonl((u_short)len +
+ m->m_pkthdr.csum_data + IPPROTO_TCP));
+ }
+ sum ^= 0xffff;
+ ++hw_assist;
+ }
+ break;
+ case IPPROTO_UDP:
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
+ sum = m->m_pkthdr.csum_data;
+ } else {
+ ip = mtod(m, struct ip *);
+ sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htonl((u_short)len +
+ m->m_pkthdr.csum_data + IPPROTO_UDP));
+ }
+ sum ^= 0xffff;
+ ++hw_assist;
+ }
+ break;
+ case IPPROTO_ICMP:
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+#endif /* INET6 */
+ break;
+ default:
+ return (1);
+ }
+
+ if (!hw_assist) {
+ switch (af) {
+ case AF_INET:
+ if (p == IPPROTO_ICMP) {
+ if (m->m_len < off)
+ return (1);
+ m->m_data += off;
+ m->m_len -= off;
+ sum = in_cksum(m, len);
+ m->m_data -= off;
+ m->m_len += off;
+ } else {
+ if (m->m_len < sizeof(struct ip))
+ return (1);
+ sum = in4_cksum(m, p, off, len);
+ }
+ break;
+#ifdef INET6
+ case AF_INET6:
+ if (m->m_len < sizeof(struct ip6_hdr))
+ return (1);
+ sum = in6_cksum(m, p, off, len);
+ break;
+#endif /* INET6 */
+ default:
+ return (1);
+ }
+ }
+ if (sum) {
+ switch (p) {
+ case IPPROTO_TCP:
+ {
+ KMOD_TCPSTAT_INC(tcps_rcvbadsum);
+ break;
+ }
+ case IPPROTO_UDP:
+ {
+ KMOD_UDPSTAT_INC(udps_badsum);
+ break;
+ }
+#ifdef INET
+ case IPPROTO_ICMP:
+ {
+ KMOD_ICMPSTAT_INC(icps_checksum);
+ break;
+ }
+#endif
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+ {
+ KMOD_ICMP6STAT_INC(icp6s_checksum);
+ break;
+ }
+#endif /* INET6 */
+ }
+ return (1);
+ } else {
+ if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
+ m->m_pkthdr.csum_flags |=
+ (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+ m->m_pkthdr.csum_data = 0xffff;
+ }
+ }
+ return (0);
+}
+
+
+#ifdef INET
+int
+pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
+{
+ struct pfi_kif *kif;
+ u_short action, reason = 0, log = 0;
+ struct mbuf *m = *m0;
+ struct ip *h = NULL;
+ struct m_tag *ipfwtag;
+ struct pf_rule *a = NULL, *r = &V_pf_default_rule, *tr, *nr;
+ struct pf_state *s = NULL;
+ struct pf_ruleset *ruleset = NULL;
+ struct pf_pdesc pd;
+ int off, dirndx, pqid = 0;
+
+ M_ASSERTPKTHDR(m);
+
+ if (!V_pf_status.running)
+ return (PF_PASS);
+
+ memset(&pd, 0, sizeof(pd));
+
+ kif = (struct pfi_kif *)ifp->if_pf_kif;
+
+ if (kif == NULL) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
+ return (PF_DROP);
+ }
+ if (kif->pfik_flags & PFI_IFLAG_SKIP)
+ return (PF_PASS);
+
+ if (m->m_flags & M_SKIP_FIREWALL)
+ return (PF_PASS);
+
+ pd.pf_mtag = pf_find_mtag(m);
+
+ PF_RULES_RLOCK();
+
+ if (ip_divert_ptr != NULL &&
+ ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) {
+ struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1);
+ if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) {
+ if (pd.pf_mtag == NULL &&
+ ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
+ action = PF_DROP;
+ goto done;
+ }
+ pd.pf_mtag->flags |= PF_PACKET_LOOPED;
+ m_tag_delete(m, ipfwtag);
+ }
+ if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) {
+ m->m_flags |= M_FASTFWD_OURS;
+ pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT;
+ }
+ } else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
+ /* We do IP header normalization and packet reassembly here */
+ action = PF_DROP;
+ goto done;
+ }
+ m = *m0; /* pf_normalize messes with m0 */
+ h = mtod(m, struct ip *);
+
+ off = h->ip_hl << 2;
+ if (off < (int)sizeof(struct ip)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_SHORT);
+ log = 1;
+ goto done;
+ }
+
+ pd.src = (struct pf_addr *)&h->ip_src;
+ pd.dst = (struct pf_addr *)&h->ip_dst;
+ pd.sport = pd.dport = NULL;
+ pd.ip_sum = &h->ip_sum;
+ pd.proto_sum = NULL;
+ pd.proto = h->ip_p;
+ pd.dir = dir;
+ pd.sidx = (dir == PF_IN) ? 0 : 1;
+ pd.didx = (dir == PF_IN) ? 1 : 0;
+ pd.af = AF_INET;
+ pd.tos = h->ip_tos;
+ pd.tot_len = ntohs(h->ip_len);
+
+ /* handle fragments that didn't get reassembled by normalization */
+ if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
+ action = pf_test_fragment(&r, dir, kif, m, h,
+ &pd, &a, &ruleset);
+ goto done;
+ }
+
+ switch (h->ip_p) {
+
+ case IPPROTO_TCP: {
+ struct tcphdr th;
+
+ pd.hdr.tcp = &th;
+ if (!pf_pull_hdr(m, off, &th, sizeof(th),
+ &action, &reason, AF_INET)) {
+ log = action != PF_PASS;
+ goto done;
+ }
+ pd.p_len = pd.tot_len - off - (th.th_off << 2);
+ if ((th.th_flags & TH_ACK) && pd.p_len == 0)
+ pqid = 1;
+ action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
+ if (action == PF_DROP)
+ goto done;
+ action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
+ &reason);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+ case IPPROTO_UDP: {
+ struct udphdr uh;
+
+ pd.hdr.udp = &uh;
+ if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
+ &action, &reason, AF_INET)) {
+ log = action != PF_PASS;
+ goto done;
+ }
+ if (uh.uh_dport == 0 ||
+ ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
+ ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_SHORT);
+ goto done;
+ }
+ action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+ case IPPROTO_ICMP: {
+ struct icmp ih;
+
+ pd.hdr.icmp = &ih;
+ if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
+ &action, &reason, AF_INET)) {
+ log = action != PF_PASS;
+ goto done;
+ }
+ action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
+ &reason);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+#ifdef INET6
+ case IPPROTO_ICMPV6: {
+ action = PF_DROP;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
+ goto done;
+ }
+#endif
+
+ default:
+ action = pf_test_state_other(&s, dir, kif, m, &pd);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+done:
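+ /*
+ * Post-matching processing: police IP options, tag the packet, select the
+ * FIB, assign priority/queue, handle divert(4) and loopback redirects, log
+ * and update counters, then dispose of the mbuf according to the action.
+ */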
+ PF_RULES_RUNLOCK();
+ if (action == PF_PASS && h->ip_hl > 5 &&
+ !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_IPOPTIONS);
+ log = r->log;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: dropping packet with ip options\n"));
+ }
+
+ if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+ }
+ if (r->rtableid >= 0)
+ M_SETFIB(m, r->rtableid);
+
+ if (r->scrub_flags & PFSTATE_SETPRIO) {
+ if (pd.tos & IPTOS_LOWDELAY)
+ pqid = 1;
+ if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+ log = 1;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: failed to allocate 802.1q mtag\n"));
+ }
+ }
+
+#ifdef ALTQ
+ if (action == PF_PASS && r->qid) {
+ if (pd.pf_mtag == NULL &&
+ ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+ } else {
+ if (s != NULL)
+ pd.pf_mtag->qid_hash = pf_state_hash(s);
+ if (pqid || (pd.tos & IPTOS_LOWDELAY))
+ pd.pf_mtag->qid = r->pqid;
+ else
+ pd.pf_mtag->qid = r->qid;
+ /* Add hints for ecn. */
+ pd.pf_mtag->hdr = h;
+ }
+
+ }
+#endif /* ALTQ */
+
+ /*
+ * connections redirected to loopback should not match sockets
+ * bound specifically to loopback due to security implications,
+ * see tcp_input() and in_pcblookup_listen().
+ */
+ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
+ pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
+ (s->nat_rule.ptr->action == PF_RDR ||
+ s->nat_rule.ptr->action == PF_BINAT) &&
+ (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
+ m->m_flags |= M_SKIP_FIREWALL;
+
+ if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL &&
+ !PACKET_LOOPED(&pd)) {
+
+ ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
+ sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
+ if (ipfwtag != NULL) {
+ ((struct ipfw_rule_ref *)(ipfwtag+1))->info =
+ ntohs(r->divert.port);
+ ((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir;
+
+ if (s)
+ PF_STATE_UNLOCK(s);
+
+ m_tag_prepend(m, ipfwtag);
+ if (m->m_flags & M_FASTFWD_OURS) {
+ if (pd.pf_mtag == NULL &&
+ ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+ log = 1;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: failed to allocate tag\n"));
+ } else {
+ pd.pf_mtag->flags |=
+ PF_FASTFWD_OURS_PRESENT;
+ m->m_flags &= ~M_FASTFWD_OURS;
+ }
+ }
+ ip_divert_ptr(*m0, dir == PF_IN ? DIR_IN : DIR_OUT);
+ *m0 = NULL;
+
+ return (action);
+ } else {
+ /* XXX: ipfw has the same behaviour! */
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+ log = 1;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: failed to allocate divert tag\n"));
+ }
+ }
+
+ if (log) {
+ struct pf_rule *lr;
+
+ if (s != NULL && s->nat_rule.ptr != NULL &&
+ s->nat_rule.ptr->log & PF_LOG_ALL)
+ lr = s->nat_rule.ptr;
+ else
+ lr = r;
+ PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd,
+ (s == NULL));
+ }
+
+ kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
+ kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
+
+ if (action == PF_PASS || r->action == PF_DROP) {
+ dirndx = (dir == PF_OUT);
+ r->packets[dirndx]++;
+ r->bytes[dirndx] += pd.tot_len;
+ if (a != NULL) {
+ a->packets[dirndx]++;
+ a->bytes[dirndx] += pd.tot_len;
+ }
+ if (s != NULL) {
+ if (s->nat_rule.ptr != NULL) {
+ s->nat_rule.ptr->packets[dirndx]++;
+ s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
+ }
+ if (s->src_node != NULL) {
+ s->src_node->packets[dirndx]++;
+ s->src_node->bytes[dirndx] += pd.tot_len;
+ }
+ if (s->nat_src_node != NULL) {
+ s->nat_src_node->packets[dirndx]++;
+ s->nat_src_node->bytes[dirndx] += pd.tot_len;
+ }
+ dirndx = (dir == s->direction) ? 0 : 1;
+ s->packets[dirndx]++;
+ s->bytes[dirndx] += pd.tot_len;
+ }
+ tr = r;
+ nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
+ if (nr != NULL && r == &V_pf_default_rule)
+ tr = nr;
+ if (tr->src.addr.type == PF_ADDR_TABLE)
+ pfr_update_stats(tr->src.addr.p.tbl,
+ (s == NULL) ? pd.src :
+ &s->key[(s->direction == PF_IN)]->
+ addr[(s->direction == PF_OUT)],
+ pd.af, pd.tot_len, dir == PF_OUT,
+ r->action == PF_PASS, tr->src.neg);
+ if (tr->dst.addr.type == PF_ADDR_TABLE)
+ pfr_update_stats(tr->dst.addr.p.tbl,
+ (s == NULL) ? pd.dst :
+ &s->key[(s->direction == PF_IN)]->
+ addr[(s->direction == PF_IN)],
+ pd.af, pd.tot_len, dir == PF_OUT,
+ r->action == PF_PASS, tr->dst.neg);
+ }
+
+ switch (action) {
+ case PF_SYNPROXY_DROP:
+ m_freem(*m0);
+ case PF_DEFER:
+ *m0 = NULL;
+ action = PF_PASS;
+ break;
+ case PF_DROP:
+ m_freem(*m0);
+ *m0 = NULL;
+ break;
+ default:
+ /* pf_route() returns unlocked. */
+ if (r->rt) {
+ pf_route(m0, r, dir, kif->pfik_ifp, s, &pd);
+ return (action);
+ }
+ break;
+ }
+ if (s)
+ PF_STATE_UNLOCK(s);
+
+ return (action);
+}
+#endif /* INET */
+
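+/*
+ * pf_test6() mirrors pf_test() for IPv6: after normalization it walks the
+ * extension header chain (fragment, routing, AH, hop-by-hop and destination
+ * options) to locate the transport protocol before doing the same state and
+ * rule matching. Packets that were reassembled during normalization are
+ * refragmented via pf_refragment6() when they are being forwarded.
+ */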
+#ifdef INET6
+int
+pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
+{
+ struct pfi_kif *kif;
+ u_short action, reason = 0, log = 0;
+ struct mbuf *m = *m0, *n = NULL;
+ struct m_tag *mtag;
+ struct ip6_hdr *h = NULL;
+ struct pf_rule *a = NULL, *r = &V_pf_default_rule, *tr, *nr;
+ struct pf_state *s = NULL;
+ struct pf_ruleset *ruleset = NULL;
+ struct pf_pdesc pd;
+ int off, terminal = 0, dirndx, rh_cnt = 0, pqid = 0;
+ int fwdir = dir;
+
+ M_ASSERTPKTHDR(m);
+
+ /* Detect packet forwarding.
+ * If the input interface is different from the output interface, we are
+ * forwarding.
+ * We do need to be careful about bridges. If the
+ * net.link.bridge.pfil_bridge sysctl is set we can be filtering on a
+ * bridge, so if the input interface is a bridge member and the output
+ * interface is its bridge or a member of the same bridge we're not
+ * actually forwarding but bridging.
+ */
+ if (dir == PF_OUT && m->m_pkthdr.rcvif && ifp != m->m_pkthdr.rcvif &&
+ (m->m_pkthdr.rcvif->if_bridge == NULL ||
+ (m->m_pkthdr.rcvif->if_bridge != ifp->if_softc &&
+ m->m_pkthdr.rcvif->if_bridge != ifp->if_bridge)))
+ fwdir = PF_FWD;
+
+ if (!V_pf_status.running)
+ return (PF_PASS);
+
+ memset(&pd, 0, sizeof(pd));
+ pd.pf_mtag = pf_find_mtag(m);
+
+ if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED)
+ return (PF_PASS);
+
+ kif = (struct pfi_kif *)ifp->if_pf_kif;
+ if (kif == NULL) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
+ return (PF_DROP);
+ }
+ if (kif->pfik_flags & PFI_IFLAG_SKIP)
+ return (PF_PASS);
+
+ if (m->m_flags & M_SKIP_FIREWALL)
+ return (PF_PASS);
+
+ PF_RULES_RLOCK();
+
+ /* We do IP header normalization and packet reassembly here */
+ if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
+ action = PF_DROP;
+ goto done;
+ }
+ m = *m0; /* pf_normalize messes with m0 */
+ h = mtod(m, struct ip6_hdr *);
+
+#if 1
+ /*
+ * We do not support jumbograms yet. If we keep going, a zero ip6_plen
+ * will do something bad, so drop the packet for now.
+ */
+ if (htons(h->ip6_plen) == 0) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_NORM); /*XXX*/
+ goto done;
+ }
+#endif
+
+ pd.src = (struct pf_addr *)&h->ip6_src;
+ pd.dst = (struct pf_addr *)&h->ip6_dst;
+ pd.sport = pd.dport = NULL;
+ pd.ip_sum = NULL;
+ pd.proto_sum = NULL;
+ pd.dir = dir;
+ pd.sidx = (dir == PF_IN) ? 0 : 1;
+ pd.didx = (dir == PF_IN) ? 1 : 0;
+ pd.af = AF_INET6;
+ pd.tos = 0;
+ pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
+
+ off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
+ pd.proto = h->ip6_nxt;
+ do {
+ switch (pd.proto) {
+ case IPPROTO_FRAGMENT:
+ action = pf_test_fragment(&r, dir, kif, m, h,
+ &pd, &a, &ruleset);
+ if (action == PF_DROP)
+ REASON_SET(&reason, PFRES_FRAG);
+ goto done;
+ case IPPROTO_ROUTING: {
+ struct ip6_rthdr rthdr;
+
+ if (rh_cnt++) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: IPv6 more than one rthdr\n"));
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_IPOPTIONS);
+ log = 1;
+ goto done;
+ }
+ if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
+ &reason, pd.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: IPv6 short rthdr\n"));
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_SHORT);
+ log = 1;
+ goto done;
+ }
+ if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: IPv6 rthdr0\n"));
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_IPOPTIONS);
+ log = 1;
+ goto done;
+ }
+ /* FALLTHROUGH */
+ }
+ case IPPROTO_AH:
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_DSTOPTS: {
+ /* get next header and header length */
+ struct ip6_ext opt6;
+
+ if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
+ NULL, &reason, pd.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: IPv6 short opt\n"));
+ action = PF_DROP;
+ log = 1;
+ goto done;
+ }
+ if (pd.proto == IPPROTO_AH)
+ off += (opt6.ip6e_len + 2) * 4;
+ else
+ off += (opt6.ip6e_len + 1) * 8;
+ pd.proto = opt6.ip6e_nxt;
+ /* go to the next header */
+ break;
+ }
+ default:
+ terminal++;
+ break;
+ }
+ } while (!terminal);
+
+ /* if there's no routing header, use unmodified mbuf for checksumming */
+ if (!n)
+ n = m;
+
+ switch (pd.proto) {
+
+ case IPPROTO_TCP: {
+ struct tcphdr th;
+
+ pd.hdr.tcp = &th;
+ if (!pf_pull_hdr(m, off, &th, sizeof(th),
+ &action, &reason, AF_INET6)) {
+ log = action != PF_PASS;
+ goto done;
+ }
+ pd.p_len = pd.tot_len - off - (th.th_off << 2);
+ action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
+ if (action == PF_DROP)
+ goto done;
+ action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
+ &reason);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+ case IPPROTO_UDP: {
+ struct udphdr uh;
+
+ pd.hdr.udp = &uh;
+ if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
+ &action, &reason, AF_INET6)) {
+ log = action != PF_PASS;
+ goto done;
+ }
+ if (uh.uh_dport == 0 ||
+ ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
+ ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_SHORT);
+ goto done;
+ }
+ action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+ case IPPROTO_ICMP: {
+ action = PF_DROP;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: dropping IPv6 packet with ICMPv4 payload\n"));
+ goto done;
+ }
+
+ case IPPROTO_ICMPV6: {
+ struct icmp6_hdr ih;
+
+ pd.hdr.icmp6 = &ih;
+ if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
+ &action, &reason, AF_INET6)) {
+ log = action != PF_PASS;
+ goto done;
+ }
+ action = pf_test_state_icmp(&s, dir, kif,
+ m, off, h, &pd, &reason);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+ default:
+ action = pf_test_state_other(&s, dir, kif, m, &pd);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+done:
+ PF_RULES_RUNLOCK();
+ if (n != m) {
+ m_freem(n);
+ n = NULL;
+ }
+
+ /* handle dangerous IPv6 extension headers. */
+ if (action == PF_PASS && rh_cnt &&
+ !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_IPOPTIONS);
+ log = r->log;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: dropping packet with dangerous v6 headers\n"));
+ }
+
+ if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+ }
+ if (r->rtableid >= 0)
+ M_SETFIB(m, r->rtableid);
+
+ if (r->scrub_flags & PFSTATE_SETPRIO) {
+ if (pd.tos & IPTOS_LOWDELAY)
+ pqid = 1;
+ if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+ log = 1;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: failed to allocate 802.1q mtag\n"));
+ }
+ }
+
+#ifdef ALTQ
+ if (action == PF_PASS && r->qid) {
+ if (pd.pf_mtag == NULL &&
+ ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+ } else {
+ if (s != NULL)
+ pd.pf_mtag->qid_hash = pf_state_hash(s);
+ if (pd.tos & IPTOS_LOWDELAY)
+ pd.pf_mtag->qid = r->pqid;
+ else
+ pd.pf_mtag->qid = r->qid;
+ /* Add hints for ecn. */
+ pd.pf_mtag->hdr = h;
+ }
+ }
+#endif /* ALTQ */
+
+ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
+ pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
+ (s->nat_rule.ptr->action == PF_RDR ||
+ s->nat_rule.ptr->action == PF_BINAT) &&
+ IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
+ m->m_flags |= M_SKIP_FIREWALL;
+
+ /* XXX: Anybody working on it?! */
+ if (r->divert.port)
+ printf("pf: divert(9) is not supported for IPv6\n");
+
+ if (log) {
+ struct pf_rule *lr;
+
+ if (s != NULL && s->nat_rule.ptr != NULL &&
+ s->nat_rule.ptr->log & PF_LOG_ALL)
+ lr = s->nat_rule.ptr;
+ else
+ lr = r;
+ PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset,
+ &pd, (s == NULL));
+ }
+
+ kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
+ kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
+
+ if (action == PF_PASS || r->action == PF_DROP) {
+ dirndx = (dir == PF_OUT);
+ r->packets[dirndx]++;
+ r->bytes[dirndx] += pd.tot_len;
+ if (a != NULL) {
+ a->packets[dirndx]++;
+ a->bytes[dirndx] += pd.tot_len;
+ }
+ if (s != NULL) {
+ if (s->nat_rule.ptr != NULL) {
+ s->nat_rule.ptr->packets[dirndx]++;
+ s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
+ }
+ if (s->src_node != NULL) {
+ s->src_node->packets[dirndx]++;
+ s->src_node->bytes[dirndx] += pd.tot_len;
+ }
+ if (s->nat_src_node != NULL) {
+ s->nat_src_node->packets[dirndx]++;
+ s->nat_src_node->bytes[dirndx] += pd.tot_len;
+ }
+ dirndx = (dir == s->direction) ? 0 : 1;
+ s->packets[dirndx]++;
+ s->bytes[dirndx] += pd.tot_len;
+ }
+ tr = r;
+ nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
+ if (nr != NULL && r == &V_pf_default_rule)
+ tr = nr;
+ if (tr->src.addr.type == PF_ADDR_TABLE)
+ pfr_update_stats(tr->src.addr.p.tbl,
+ (s == NULL) ? pd.src :
+ &s->key[(s->direction == PF_IN)]->addr[0],
+ pd.af, pd.tot_len, dir == PF_OUT,
+ r->action == PF_PASS, tr->src.neg);
+ if (tr->dst.addr.type == PF_ADDR_TABLE)
+ pfr_update_stats(tr->dst.addr.p.tbl,
+ (s == NULL) ? pd.dst :
+ &s->key[(s->direction == PF_IN)]->addr[1],
+ pd.af, pd.tot_len, dir == PF_OUT,
+ r->action == PF_PASS, tr->dst.neg);
+ }
+
+ switch (action) {
+ case PF_SYNPROXY_DROP:
+ m_freem(*m0);
+ case PF_DEFER:
+ *m0 = NULL;
+ action = PF_PASS;
+ break;
+ case PF_DROP:
+ m_freem(*m0);
+ *m0 = NULL;
+ break;
+ default:
+ /* pf_route6() returns unlocked. */
+ if (r->rt) {
+ pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
+ return (action);
+ }
+ break;
+ }
+
+ if (s)
+ PF_STATE_UNLOCK(s);
+
+ /* If reassembled packet passed, create new fragments. */
+ if (action == PF_PASS && *m0 && fwdir == PF_FWD &&
+ (mtag = m_tag_find(m, PF_REASSEMBLED, NULL)) != NULL)
+ action = pf_refragment6(ifp, m0, mtag);
+
+ return (action);
+}
+#endif /* INET6 */
diff --git a/freebsd/sys/netpfil/pf/pf.h b/freebsd/sys/netpfil/pf/pf.h
new file mode 100644
index 00000000..ac0e0fb9
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/pf.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2001 Daniel Hartmeier
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $OpenBSD: pfvar.h,v 1.282 2009/01/29 15:12:28 pyr Exp $
+ * $FreeBSD$
+ */
+
+#ifndef _NET_PF_H_
+#define _NET_PF_H_
+
+#define PF_TCPS_PROXY_SRC ((TCP_NSTATES)+0)
+#define PF_TCPS_PROXY_DST ((TCP_NSTATES)+1)
+
+#define PF_MD5_DIGEST_LENGTH 16
+#ifdef MD5_DIGEST_LENGTH
+#if PF_MD5_DIGEST_LENGTH != MD5_DIGEST_LENGTH
+#error
+#endif
+#endif
+
+enum { PF_INOUT, PF_IN, PF_OUT, PF_FWD };
+enum { PF_PASS, PF_DROP, PF_SCRUB, PF_NOSCRUB, PF_NAT, PF_NONAT,
+ PF_BINAT, PF_NOBINAT, PF_RDR, PF_NORDR, PF_SYNPROXY_DROP, PF_DEFER };
+enum { PF_RULESET_SCRUB, PF_RULESET_FILTER, PF_RULESET_NAT,
+ PF_RULESET_BINAT, PF_RULESET_RDR, PF_RULESET_MAX };
+enum { PF_OP_NONE, PF_OP_IRG, PF_OP_EQ, PF_OP_NE, PF_OP_LT,
+ PF_OP_LE, PF_OP_GT, PF_OP_GE, PF_OP_XRG, PF_OP_RRG };
+enum { PF_DEBUG_NONE, PF_DEBUG_URGENT, PF_DEBUG_MISC, PF_DEBUG_NOISY };
+enum { PF_CHANGE_NONE, PF_CHANGE_ADD_HEAD, PF_CHANGE_ADD_TAIL,
+ PF_CHANGE_ADD_BEFORE, PF_CHANGE_ADD_AFTER,
+ PF_CHANGE_REMOVE, PF_CHANGE_GET_TICKET };
+enum { PF_GET_NONE, PF_GET_CLR_CNTR };
+enum { PF_SK_WIRE, PF_SK_STACK, PF_SK_BOTH };
+
+/*
+ * Note about PFTM_*: real indices into pf_rule.timeout[] come before
+ * PFTM_MAX, special cases afterwards. See pf_state_expires().
+ */
+enum { PFTM_TCP_FIRST_PACKET, PFTM_TCP_OPENING, PFTM_TCP_ESTABLISHED,
+ PFTM_TCP_CLOSING, PFTM_TCP_FIN_WAIT, PFTM_TCP_CLOSED,
+ PFTM_UDP_FIRST_PACKET, PFTM_UDP_SINGLE, PFTM_UDP_MULTIPLE,
+ PFTM_ICMP_FIRST_PACKET, PFTM_ICMP_ERROR_REPLY,
+ PFTM_OTHER_FIRST_PACKET, PFTM_OTHER_SINGLE,
+ PFTM_OTHER_MULTIPLE, PFTM_FRAG, PFTM_INTERVAL,
+ PFTM_ADAPTIVE_START, PFTM_ADAPTIVE_END, PFTM_SRC_NODE,
+ PFTM_TS_DIFF, PFTM_MAX, PFTM_PURGE, PFTM_UNLINKED };
+
+/* PFTM default values */
+#define PFTM_TCP_FIRST_PACKET_VAL 120 /* First TCP packet */
+#define PFTM_TCP_OPENING_VAL 30 /* No response yet */
+#define PFTM_TCP_ESTABLISHED_VAL 24*60*60 /* Established */
+#define PFTM_TCP_CLOSING_VAL 15 * 60 /* Half closed */
+#define PFTM_TCP_FIN_WAIT_VAL 45 /* Got both FINs */
+#define PFTM_TCP_CLOSED_VAL 90 /* Got a RST */
+#define PFTM_UDP_FIRST_PACKET_VAL 60 /* First UDP packet */
+#define PFTM_UDP_SINGLE_VAL 30 /* Unidirectional */
+#define PFTM_UDP_MULTIPLE_VAL 60 /* Bidirectional */
+#define PFTM_ICMP_FIRST_PACKET_VAL 20 /* First ICMP packet */
+#define PFTM_ICMP_ERROR_REPLY_VAL 10 /* Got error response */
+#define PFTM_OTHER_FIRST_PACKET_VAL 60 /* First packet */
+#define PFTM_OTHER_SINGLE_VAL 30 /* Unidirectional */
+#define PFTM_OTHER_MULTIPLE_VAL 60 /* Bidirectional */
+#define PFTM_FRAG_VAL 30 /* Fragment expire */
+#define PFTM_INTERVAL_VAL 10 /* Expire interval */
+#define PFTM_SRC_NODE_VAL 0 /* Source tracking */
+#define PFTM_TS_DIFF_VAL 30 /* Allowed TS diff */
+
+enum { PF_NOPFROUTE, PF_FASTROUTE, PF_ROUTETO, PF_DUPTO, PF_REPLYTO };
+enum { PF_LIMIT_STATES, PF_LIMIT_SRC_NODES, PF_LIMIT_FRAGS,
+ PF_LIMIT_TABLE_ENTRIES, PF_LIMIT_MAX };
+#define PF_POOL_IDMASK 0x0f
+enum { PF_POOL_NONE, PF_POOL_BITMASK, PF_POOL_RANDOM,
+ PF_POOL_SRCHASH, PF_POOL_ROUNDROBIN };
+enum { PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, PF_ADDR_DYNIFTL,
+ PF_ADDR_TABLE, PF_ADDR_URPFFAILED,
+ PF_ADDR_RANGE };
+#define PF_POOL_TYPEMASK 0x0f
+#define PF_POOL_STICKYADDR 0x20
+#define PF_WSCALE_FLAG 0x80
+#define PF_WSCALE_MASK 0x0f
+
+#define PF_LOG 0x01
+#define PF_LOG_ALL 0x02
+#define PF_LOG_SOCKET_LOOKUP 0x04
+
+/* Reason codes for passing/dropping a packet */
+#define PFRES_MATCH 0 /* Explicit match of a rule */
+#define PFRES_BADOFF 1 /* Bad offset for pull_hdr */
+#define PFRES_FRAG 2 /* Dropping following fragment */
+#define PFRES_SHORT 3 /* Dropping short packet */
+#define PFRES_NORM 4 /* Dropping by normalizer */
+#define PFRES_MEMORY 5 /* Dropped due to lacking mem */
+#define PFRES_TS 6 /* Bad TCP Timestamp (RFC1323) */
+#define PFRES_CONGEST 7 /* Congestion (of ipintrq) */
+#define PFRES_IPOPTIONS 8 /* IP option */
+#define PFRES_PROTCKSUM 9 /* Protocol checksum invalid */
+#define PFRES_BADSTATE 10 /* State mismatch */
+#define PFRES_STATEINS 11 /* State insertion failure */
+#define PFRES_MAXSTATES 12 /* State limit */
+#define PFRES_SRCLIMIT 13 /* Source node/conn limit */
+#define PFRES_SYNPROXY 14 /* SYN proxy */
+#define PFRES_MAPFAILED 15 /* pf_map_addr() failed */
+#define PFRES_MAX 16 /* total+1 */
+
+#define PFRES_NAMES { \
+ "match", \
+ "bad-offset", \
+ "fragment", \
+ "short", \
+ "normalize", \
+ "memory", \
+ "bad-timestamp", \
+ "congestion", \
+ "ip-option", \
+ "proto-cksum", \
+ "state-mismatch", \
+ "state-insert", \
+ "state-limit", \
+ "src-limit", \
+ "synproxy", \
+ "map-failed", \
+ NULL \
+}
+
+/* Counters for other things we want to keep track of */
+#define LCNT_STATES 0 /* states */
+#define LCNT_SRCSTATES 1 /* max-src-states */
+#define LCNT_SRCNODES 2 /* max-src-nodes */
+#define LCNT_SRCCONN 3 /* max-src-conn */
+#define LCNT_SRCCONNRATE 4 /* max-src-conn-rate */
+#define LCNT_OVERLOAD_TABLE 5 /* entry added to overload table */
+#define LCNT_OVERLOAD_FLUSH 6 /* state entries flushed */
+#define LCNT_MAX 7 /* total+1 */
+
+#define LCNT_NAMES { \
+ "max states per rule", \
+ "max-src-states", \
+ "max-src-nodes", \
+ "max-src-conn", \
+ "max-src-conn-rate", \
+ "overload table insertion", \
+ "overload flush states", \
+ NULL \
+}
+
+/* state operation counters */
+#define FCNT_STATE_SEARCH 0
+#define FCNT_STATE_INSERT 1
+#define FCNT_STATE_REMOVALS 2
+#define FCNT_MAX 3
+
+/* src_node operation counters */
+#define SCNT_SRC_NODE_SEARCH 0
+#define SCNT_SRC_NODE_INSERT 1
+#define SCNT_SRC_NODE_REMOVALS 2
+#define SCNT_MAX 3
+
+#define PF_TABLE_NAME_SIZE 32
+#define PF_QNAME_SIZE 64
+
+struct pf_status {
+ uint64_t counters[PFRES_MAX];
+ uint64_t lcounters[LCNT_MAX];
+ uint64_t fcounters[FCNT_MAX];
+ uint64_t scounters[SCNT_MAX];
+ uint64_t pcounters[2][2][3];
+ uint64_t bcounters[2][2];
+ uint32_t running;
+ uint32_t states;
+ uint32_t src_nodes;
+ uint32_t since;
+ uint32_t debug;
+ uint32_t hostid;
+ char ifname[IFNAMSIZ];
+ uint8_t pf_chksum[PF_MD5_DIGEST_LENGTH];
+};
+
+#endif /* _NET_PF_H_ */
diff --git a/freebsd/sys/netpfil/pf/pf_altq.h b/freebsd/sys/netpfil/pf/pf_altq.h
new file mode 100644
index 00000000..3efd4ff7
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/pf_altq.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2001 Daniel Hartmeier
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $OpenBSD: pfvar.h,v 1.282 2009/01/29 15:12:28 pyr Exp $
+ * $FreeBSD$
+ */
+
+#ifndef _NET_PF_ALTQ_H_
+#define _NET_PF_ALTQ_H_
+
+struct cbq_opts {
+ u_int minburst;
+ u_int maxburst;
+ u_int pktsize;
+ u_int maxpktsize;
+ u_int ns_per_byte;
+ u_int maxidle;
+ int minidle;
+ u_int offtime;
+ int flags;
+};
+
+struct codel_opts {
+ u_int target;
+ u_int interval;
+ int ecn;
+};
+
+struct priq_opts {
+ int flags;
+};
+
+struct hfsc_opts {
+ /* real-time service curve */
+ u_int rtsc_m1; /* slope of the 1st segment in bps */
+ u_int rtsc_d; /* the x-projection of m1 in msec */
+ u_int rtsc_m2; /* slope of the 2nd segment in bps */
+ /* link-sharing service curve */
+ u_int lssc_m1;
+ u_int lssc_d;
+ u_int lssc_m2;
+ /* upper-limit service curve */
+ u_int ulsc_m1;
+ u_int ulsc_d;
+ u_int ulsc_m2;
+ int flags;
+};
+
+/*
+ * XXX this needs some work
+ */
+struct fairq_opts {
+ u_int nbuckets;
+ u_int hogs_m1;
+ int flags;
+
+ /* link sharing service curve */
+ u_int lssc_m1;
+ u_int lssc_d;
+ u_int lssc_m2;
+};
+
+struct pf_altq {
+ char ifname[IFNAMSIZ];
+
+ void *altq_disc; /* discipline-specific state */
+ TAILQ_ENTRY(pf_altq) entries;
+
+ /* scheduler spec */
+ uint8_t scheduler; /* scheduler type */
+ uint16_t tbrsize; /* tokenbucket regulator size */
+ uint32_t ifbandwidth; /* interface bandwidth */
+
+ /* queue spec */
+ char qname[PF_QNAME_SIZE]; /* queue name */
+ char parent[PF_QNAME_SIZE]; /* parent name */
+ uint32_t parent_qid; /* parent queue id */
+ uint32_t bandwidth; /* queue bandwidth */
+ uint8_t priority; /* priority */
+ uint8_t local_flags; /* dynamic interface */
+#define PFALTQ_FLAG_IF_REMOVED 0x01
+
+ uint16_t qlimit; /* queue size limit */
+ uint16_t flags; /* misc flags */
+ union {
+ struct cbq_opts cbq_opts;
+ struct codel_opts codel_opts;
+ struct priq_opts priq_opts;
+ struct hfsc_opts hfsc_opts;
+ struct fairq_opts fairq_opts;
+ } pq_u;
+
+ uint32_t qid; /* return value */
+};
+
+#endif /* _NET_PF_ALTQ_H_ */
diff --git a/freebsd/sys/netpfil/pf/pf_if.c b/freebsd/sys/netpfil/pf/pf_if.c
new file mode 100644
index 00000000..d1c54b22
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/pf_if.c
@@ -0,0 +1,924 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2003 Cedric Berger
+ * Copyright (c) 2005 Henning Brauer <henning@openbsd.org>
+ * Copyright (c) 2005 Ryan McBride <mcbride@openbsd.org>
+ * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $OpenBSD: pf_if.c,v 1.54 2008/06/14 16:55:28 mk Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/local/opt_inet.h>
+#include <rtems/bsd/local/opt_inet6.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/kernel.h>
+#include <sys/eventhandler.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/vnet.h>
+#include <net/pfvar.h>
+#include <net/route.h>
+
+VNET_DEFINE(struct pfi_kif *, pfi_all);
+static VNET_DEFINE(long, pfi_update);
+#define V_pfi_update VNET(pfi_update)
+#define PFI_BUFFER_MAX 0x10000
+
+VNET_DECLARE(int, pf_vnet_active);
+#define V_pf_vnet_active VNET(pf_vnet_active)
+
+static VNET_DEFINE(struct pfr_addr *, pfi_buffer);
+static VNET_DEFINE(int, pfi_buffer_cnt);
+static VNET_DEFINE(int, pfi_buffer_max);
+#define V_pfi_buffer VNET(pfi_buffer)
+#define V_pfi_buffer_cnt VNET(pfi_buffer_cnt)
+#define V_pfi_buffer_max VNET(pfi_buffer_max)
+
+eventhandler_tag pfi_attach_cookie;
+eventhandler_tag pfi_detach_cookie;
+eventhandler_tag pfi_attach_group_cookie;
+eventhandler_tag pfi_change_group_cookie;
+eventhandler_tag pfi_detach_group_cookie;
+eventhandler_tag pfi_ifaddr_event_cookie;
+
+static void pfi_attach_ifnet(struct ifnet *);
+static void pfi_attach_ifgroup(struct ifg_group *);
+
+static void pfi_kif_update(struct pfi_kif *);
+static void pfi_dynaddr_update(struct pfi_dynaddr *dyn);
+static void pfi_table_update(struct pfr_ktable *, struct pfi_kif *, int,
+ int);
+static void pfi_instance_add(struct ifnet *, int, int);
+static void pfi_address_add(struct sockaddr *, int, int);
+static int pfi_if_compare(struct pfi_kif *, struct pfi_kif *);
+static int pfi_skip_if(const char *, struct pfi_kif *);
+static int pfi_unmask(void *);
+static void pfi_attach_ifnet_event(void * __unused, struct ifnet *);
+static void pfi_detach_ifnet_event(void * __unused, struct ifnet *);
+static void pfi_attach_group_event(void *, struct ifg_group *);
+static void pfi_change_group_event(void *, char *);
+static void pfi_detach_group_event(void *, struct ifg_group *);
+static void pfi_ifaddr_event(void * __unused, struct ifnet *);
+
+RB_HEAD(pfi_ifhead, pfi_kif);
+static RB_PROTOTYPE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare);
+static RB_GENERATE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare);
+static VNET_DEFINE(struct pfi_ifhead, pfi_ifs);
+#define V_pfi_ifs VNET(pfi_ifs)
+
+MALLOC_DEFINE(PFI_MTYPE, "pf_ifnet", "pf(4) interface database");
+
+LIST_HEAD(pfi_list, pfi_kif);
+static VNET_DEFINE(struct pfi_list, pfi_unlinked_kifs);
+#define V_pfi_unlinked_kifs VNET(pfi_unlinked_kifs)
+static struct mtx pfi_unlnkdkifs_mtx;
+MTX_SYSINIT(pfi_unlnkdkifs_mtx, &pfi_unlnkdkifs_mtx, "pf unlinked interfaces",
+ MTX_DEF);
+
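+/*
+ * Per-VNET setup: allocate the address scratch buffer, attach the "all"
+ * pseudo-interface and create kifs for every existing interface group and
+ * ifnet. The global event handlers are registered separately in
+ * pfi_initialize().
+ */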
+void
+pfi_initialize_vnet(void)
+{
+ struct ifg_group *ifg;
+ struct ifnet *ifp;
+ struct pfi_kif *kif;
+
+ V_pfi_buffer_max = 64;
+ V_pfi_buffer = malloc(V_pfi_buffer_max * sizeof(*V_pfi_buffer),
+ PFI_MTYPE, M_WAITOK);
+
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+ PF_RULES_WLOCK();
+ V_pfi_all = pfi_kif_attach(kif, IFG_ALL);
+ PF_RULES_WUNLOCK();
+
+ IFNET_RLOCK();
+ TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
+ pfi_attach_ifgroup(ifg);
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link)
+ pfi_attach_ifnet(ifp);
+ IFNET_RUNLOCK();
+}
+
+void
+pfi_initialize(void)
+{
+
+ pfi_attach_cookie = EVENTHANDLER_REGISTER(ifnet_arrival_event,
+ pfi_attach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
+ pfi_detach_cookie = EVENTHANDLER_REGISTER(ifnet_departure_event,
+ pfi_detach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
+ pfi_attach_group_cookie = EVENTHANDLER_REGISTER(group_attach_event,
+ pfi_attach_group_event, curvnet, EVENTHANDLER_PRI_ANY);
+ pfi_change_group_cookie = EVENTHANDLER_REGISTER(group_change_event,
+ pfi_change_group_event, curvnet, EVENTHANDLER_PRI_ANY);
+ pfi_detach_group_cookie = EVENTHANDLER_REGISTER(group_detach_event,
+ pfi_detach_group_event, curvnet, EVENTHANDLER_PRI_ANY);
+ pfi_ifaddr_event_cookie = EVENTHANDLER_REGISTER(ifaddr_event,
+ pfi_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY);
+}
+
+void
+pfi_cleanup_vnet(void)
+{
+ struct pfi_kif *kif;
+
+ PF_RULES_WASSERT();
+
+ V_pfi_all = NULL;
+ while ((kif = RB_MIN(pfi_ifhead, &V_pfi_ifs))) {
+ RB_REMOVE(pfi_ifhead, &V_pfi_ifs, kif);
+ if (kif->pfik_group)
+ kif->pfik_group->ifg_pf_kif = NULL;
+ if (kif->pfik_ifp)
+ kif->pfik_ifp->if_pf_kif = NULL;
+ free(kif, PFI_MTYPE);
+ }
+
+ mtx_lock(&pfi_unlnkdkifs_mtx);
+ while ((kif = LIST_FIRST(&V_pfi_unlinked_kifs))) {
+ LIST_REMOVE(kif, pfik_list);
+ free(kif, PFI_MTYPE);
+ }
+ mtx_unlock(&pfi_unlnkdkifs_mtx);
+
+ free(V_pfi_buffer, PFI_MTYPE);
+}
+
+void
+pfi_cleanup(void)
+{
+
+ EVENTHANDLER_DEREGISTER(ifnet_arrival_event, pfi_attach_cookie);
+ EVENTHANDLER_DEREGISTER(ifnet_departure_event, pfi_detach_cookie);
+ EVENTHANDLER_DEREGISTER(group_attach_event, pfi_attach_group_cookie);
+ EVENTHANDLER_DEREGISTER(group_change_event, pfi_change_group_cookie);
+ EVENTHANDLER_DEREGISTER(group_detach_event, pfi_detach_group_cookie);
+ EVENTHANDLER_DEREGISTER(ifaddr_event, pfi_ifaddr_event_cookie);
+}
+
+struct pfi_kif *
+pfi_kif_find(const char *kif_name)
+{
+ struct pfi_kif_cmp s;
+
+ PF_RULES_ASSERT();
+
+ bzero(&s, sizeof(s));
+ strlcpy(s.pfik_name, kif_name, sizeof(s.pfik_name));
+
+ return (RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kif *)&s));
+}
+
+struct pfi_kif *
+pfi_kif_attach(struct pfi_kif *kif, const char *kif_name)
+{
+ struct pfi_kif *kif1;
+
+ PF_RULES_WASSERT();
+ KASSERT(kif != NULL, ("%s: null kif", __func__));
+
+ kif1 = pfi_kif_find(kif_name);
+ if (kif1 != NULL) {
+ free(kif, PFI_MTYPE);
+ return (kif1);
+ }
+
+ bzero(kif, sizeof(*kif));
+ strlcpy(kif->pfik_name, kif_name, sizeof(kif->pfik_name));
+ /*
+ * It seems that the value of time_second is in an uninitialized state
+ * when pf sets the interface statistics clear time during the boot
+ * phase if pf was statically linked into the kernel. Instead of
+ * storing that bogus time value, have pfi_get_ifaces() handle this
+ * case: it substitutes time_second whenever it sees a time of 0.
+ */
+ kif->pfik_tzero = time_second > 1 ? time_second : 0;
+ TAILQ_INIT(&kif->pfik_dynaddrs);
+
+ RB_INSERT(pfi_ifhead, &V_pfi_ifs, kif);
+
+ return (kif);
+}
+
+void
+pfi_kif_ref(struct pfi_kif *kif)
+{
+
+ PF_RULES_WASSERT();
+ kif->pfik_rulerefs++;
+}
+
+void
+pfi_kif_unref(struct pfi_kif *kif)
+{
+
+ PF_RULES_WASSERT();
+ KASSERT(kif->pfik_rulerefs > 0, ("%s: %p has zero refs", __func__, kif));
+
+ kif->pfik_rulerefs--;
+
+ if (kif->pfik_rulerefs > 0)
+ return;
+
+ /* kif referencing an existing ifnet or group should exist. */
+ if (kif->pfik_ifp != NULL || kif->pfik_group != NULL || kif == V_pfi_all)
+ return;
+
+ RB_REMOVE(pfi_ifhead, &V_pfi_ifs, kif);
+
+ kif->pfik_flags |= PFI_IFLAG_REFS;
+
+ mtx_lock(&pfi_unlnkdkifs_mtx);
+ LIST_INSERT_HEAD(&V_pfi_unlinked_kifs, kif, pfik_list);
+ mtx_unlock(&pfi_unlnkdkifs_mtx);
+}
+
+void
+pfi_kif_purge(void)
+{
+ struct pfi_kif *kif, *kif1;
+
+ /*
+ * Do naive mark-and-sweep garbage collecting of old kifs.
+ * Reference flag is raised by pf_purge_expired_states().
+ */
+ mtx_lock(&pfi_unlnkdkifs_mtx);
+ LIST_FOREACH_SAFE(kif, &V_pfi_unlinked_kifs, pfik_list, kif1) {
+ if (!(kif->pfik_flags & PFI_IFLAG_REFS)) {
+ LIST_REMOVE(kif, pfik_list);
+ free(kif, PFI_MTYPE);
+ } else
+ kif->pfik_flags &= ~PFI_IFLAG_REFS;
+ }
+ mtx_unlock(&pfi_unlnkdkifs_mtx);
+}
+
+int
+pfi_kif_match(struct pfi_kif *rule_kif, struct pfi_kif *packet_kif)
+{
+ struct ifg_list *p;
+
+ if (rule_kif == NULL || rule_kif == packet_kif)
+ return (1);
+
+ if (rule_kif->pfik_group != NULL)
+ /* XXXGL: locking? */
+ TAILQ_FOREACH(p, &packet_kif->pfik_ifp->if_groups, ifgl_next)
+ if (p->ifgl_group == rule_kif->pfik_group)
+ return (1);
+
+ return (0);
+}
+
+static void
+pfi_attach_ifnet(struct ifnet *ifp)
+{
+ struct pfi_kif *kif;
+
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+
+ PF_RULES_WLOCK();
+ V_pfi_update++;
+ kif = pfi_kif_attach(kif, ifp->if_xname);
+
+ kif->pfik_ifp = ifp;
+ ifp->if_pf_kif = kif;
+
+ pfi_kif_update(kif);
+ PF_RULES_WUNLOCK();
+}
+
+static void
+pfi_attach_ifgroup(struct ifg_group *ifg)
+{
+ struct pfi_kif *kif;
+
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+
+ PF_RULES_WLOCK();
+ V_pfi_update++;
+ kif = pfi_kif_attach(kif, ifg->ifg_group);
+
+ kif->pfik_group = ifg;
+ ifg->ifg_pf_kif = kif;
+ PF_RULES_WUNLOCK();
+}
+
+int
+pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ switch (dyn->pfid_acnt4) {
+ case 0:
+ return (0);
+ case 1:
+ return (PF_MATCHA(0, &dyn->pfid_addr4,
+ &dyn->pfid_mask4, a, AF_INET));
+ default:
+ return (pfr_match_addr(dyn->pfid_kt, a, AF_INET));
+ }
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ switch (dyn->pfid_acnt6) {
+ case 0:
+ return (0);
+ case 1:
+ return (PF_MATCHA(0, &dyn->pfid_addr6,
+ &dyn->pfid_mask6, a, AF_INET6));
+ default:
+ return (pfr_match_addr(dyn->pfid_kt, a, AF_INET6));
+ }
+ break;
+#endif /* INET6 */
+ default:
+ return (0);
+ }
+}
+
+int
+pfi_dynaddr_setup(struct pf_addr_wrap *aw, sa_family_t af)
+{
+ struct pfi_dynaddr *dyn;
+ char tblname[PF_TABLE_NAME_SIZE];
+ struct pf_ruleset *ruleset = NULL;
+ struct pfi_kif *kif;
+ int rv = 0;
+
+ PF_RULES_WASSERT();
+ KASSERT(aw->type == PF_ADDR_DYNIFTL, ("%s: type %u",
+ __func__, aw->type));
+ KASSERT(aw->p.dyn == NULL, ("%s: dyn is %p", __func__, aw->p.dyn));
+
+ if ((dyn = malloc(sizeof(*dyn), PFI_MTYPE, M_NOWAIT | M_ZERO)) == NULL)
+ return (ENOMEM);
+
+ if ((kif = malloc(sizeof(*kif), PFI_MTYPE, M_NOWAIT)) == NULL) {
+ free(dyn, PFI_MTYPE);
+ return (ENOMEM);
+ }
+
+ if (!strcmp(aw->v.ifname, "self"))
+ dyn->pfid_kif = pfi_kif_attach(kif, IFG_ALL);
+ else
+ dyn->pfid_kif = pfi_kif_attach(kif, aw->v.ifname);
+ pfi_kif_ref(dyn->pfid_kif);
+
+ dyn->pfid_net = pfi_unmask(&aw->v.a.mask);
+ if (af == AF_INET && dyn->pfid_net == 32)
+ dyn->pfid_net = 128;
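+ /* Compose the name of the backing table, e.g. "em0:network/24". */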
+ strlcpy(tblname, aw->v.ifname, sizeof(tblname));
+ if (aw->iflags & PFI_AFLAG_NETWORK)
+ strlcat(tblname, ":network", sizeof(tblname));
+ if (aw->iflags & PFI_AFLAG_BROADCAST)
+ strlcat(tblname, ":broadcast", sizeof(tblname));
+ if (aw->iflags & PFI_AFLAG_PEER)
+ strlcat(tblname, ":peer", sizeof(tblname));
+ if (aw->iflags & PFI_AFLAG_NOALIAS)
+ strlcat(tblname, ":0", sizeof(tblname));
+ if (dyn->pfid_net != 128)
+ snprintf(tblname + strlen(tblname),
+ sizeof(tblname) - strlen(tblname), "/%d", dyn->pfid_net);
+ if ((ruleset = pf_find_or_create_ruleset(PF_RESERVED_ANCHOR)) == NULL) {
+ rv = ENOMEM;
+ goto _bad;
+ }
+
+ if ((dyn->pfid_kt = pfr_attach_table(ruleset, tblname)) == NULL) {
+ rv = ENOMEM;
+ goto _bad;
+ }
+
+ dyn->pfid_kt->pfrkt_flags |= PFR_TFLAG_ACTIVE;
+ dyn->pfid_iflags = aw->iflags;
+ dyn->pfid_af = af;
+
+ TAILQ_INSERT_TAIL(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry);
+ aw->p.dyn = dyn;
+ pfi_kif_update(dyn->pfid_kif);
+
+ return (0);
+
+_bad:
+ if (dyn->pfid_kt != NULL)
+ pfr_detach_table(dyn->pfid_kt);
+ if (ruleset != NULL)
+ pf_remove_if_empty_ruleset(ruleset);
+ if (dyn->pfid_kif != NULL)
+ pfi_kif_unref(dyn->pfid_kif);
+ free(dyn, PFI_MTYPE);
+
+ return (rv);
+}
+
+static void
+pfi_kif_update(struct pfi_kif *kif)
+{
+ struct ifg_list *ifgl;
+ struct pfi_dynaddr *p;
+
+ PF_RULES_WASSERT();
+
+ /* update all dynaddr */
+ TAILQ_FOREACH(p, &kif->pfik_dynaddrs, entry)
+ pfi_dynaddr_update(p);
+
+ /* again for all groups kif is member of */
+ if (kif->pfik_ifp != NULL) {
+ IF_ADDR_RLOCK(kif->pfik_ifp);
+ TAILQ_FOREACH(ifgl, &kif->pfik_ifp->if_groups, ifgl_next)
+ pfi_kif_update((struct pfi_kif *)
+ ifgl->ifgl_group->ifg_pf_kif);
+ IF_ADDR_RUNLOCK(kif->pfik_ifp);
+ }
+}
+
+static void
+pfi_dynaddr_update(struct pfi_dynaddr *dyn)
+{
+ struct pfi_kif *kif;
+ struct pfr_ktable *kt;
+
+ PF_RULES_WASSERT();
+ KASSERT(dyn && dyn->pfid_kif && dyn->pfid_kt,
+ ("%s: bad argument", __func__));
+
+ kif = dyn->pfid_kif;
+ kt = dyn->pfid_kt;
+
+ if (kt->pfrkt_larg != V_pfi_update) {
+ /* this table needs to be brought up-to-date */
+ pfi_table_update(kt, kif, dyn->pfid_net, dyn->pfid_iflags);
+ kt->pfrkt_larg = V_pfi_update;
+ }
+ pfr_dynaddr_update(kt, dyn);
+}
+
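+/*
+ * Refresh a dynamic table: gather the addresses of the interface (or of all
+ * members of the group) into V_pfi_buffer and install them in the table via
+ * pfr_set_addrs().
+ */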
+static void
+pfi_table_update(struct pfr_ktable *kt, struct pfi_kif *kif, int net, int flags)
+{
+ int e, size2 = 0;
+ struct ifg_member *ifgm;
+
+ V_pfi_buffer_cnt = 0;
+
+ if (kif->pfik_ifp != NULL)
+ pfi_instance_add(kif->pfik_ifp, net, flags);
+ else if (kif->pfik_group != NULL) {
+ IFNET_RLOCK_NOSLEEP();
+ TAILQ_FOREACH(ifgm, &kif->pfik_group->ifg_members, ifgm_next)
+ pfi_instance_add(ifgm->ifgm_ifp, net, flags);
+ IFNET_RUNLOCK_NOSLEEP();
+ }
+
+ if ((e = pfr_set_addrs(&kt->pfrkt_t, V_pfi_buffer, V_pfi_buffer_cnt, &size2,
+ NULL, NULL, NULL, 0, PFR_TFLAG_ALLMASK)))
+ printf("%s: cannot set %d new addresses into table %s: %d\n",
+ __func__, V_pfi_buffer_cnt, kt->pfrkt_name, e);
+}
+
+static void
+pfi_instance_add(struct ifnet *ifp, int net, int flags)
+{
+ struct ifaddr *ia;
+ int got4 = 0, got6 = 0;
+ int net2, af;
+
+ IF_ADDR_RLOCK(ifp);
+ TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_list) {
+ if (ia->ifa_addr == NULL)
+ continue;
+ af = ia->ifa_addr->sa_family;
+ if (af != AF_INET && af != AF_INET6)
+ continue;
+ /*
+ * XXX: For point-to-point interfaces, (ifname:0) and IPv4,
+ * jump over addresses without a proper route to work
+ * around a problem with ppp not fully removing the
+ * address used during IPCP.
+ */
+ if ((ifp->if_flags & IFF_POINTOPOINT) &&
+ !(ia->ifa_flags & IFA_ROUTE) &&
+ (flags & PFI_AFLAG_NOALIAS) && (af == AF_INET))
+ continue;
+ if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6)
+ continue;
+ if ((flags & PFI_AFLAG_BROADCAST) &&
+ !(ifp->if_flags & IFF_BROADCAST))
+ continue;
+ if ((flags & PFI_AFLAG_PEER) &&
+ !(ifp->if_flags & IFF_POINTOPOINT))
+ continue;
+ if ((flags & PFI_AFLAG_NETWORK) && af == AF_INET6 &&
+ IN6_IS_ADDR_LINKLOCAL(
+ &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr))
+ continue;
+ if (flags & PFI_AFLAG_NOALIAS) {
+ if (af == AF_INET && got4)
+ continue;
+ if (af == AF_INET6 && got6)
+ continue;
+ }
+ if (af == AF_INET)
+ got4 = 1;
+ else if (af == AF_INET6)
+ got6 = 1;
+ net2 = net;
+ if (net2 == 128 && (flags & PFI_AFLAG_NETWORK)) {
+ if (af == AF_INET)
+ net2 = pfi_unmask(&((struct sockaddr_in *)
+ ia->ifa_netmask)->sin_addr);
+ else if (af == AF_INET6)
+ net2 = pfi_unmask(&((struct sockaddr_in6 *)
+ ia->ifa_netmask)->sin6_addr);
+ }
+ if (af == AF_INET && net2 > 32)
+ net2 = 32;
+ if (flags & PFI_AFLAG_BROADCAST)
+ pfi_address_add(ia->ifa_broadaddr, af, net2);
+ else if (flags & PFI_AFLAG_PEER)
+ pfi_address_add(ia->ifa_dstaddr, af, net2);
+ else
+ pfi_address_add(ia->ifa_addr, af, net2);
+ }
+ IF_ADDR_RUNLOCK(ifp);
+}
+
+static void
+pfi_address_add(struct sockaddr *sa, int af, int net)
+{
+ struct pfr_addr *p;
+ int i;
+
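+ /* Grow the address buffer by doubling, capped at PFI_BUFFER_MAX entries. */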
+ if (V_pfi_buffer_cnt >= V_pfi_buffer_max) {
+ int new_max = V_pfi_buffer_max * 2;
+
+ if (new_max > PFI_BUFFER_MAX) {
+ printf("%s: address buffer full (%d/%d)\n", __func__,
+ V_pfi_buffer_cnt, PFI_BUFFER_MAX);
+ return;
+ }
+ p = malloc(new_max * sizeof(*V_pfi_buffer), PFI_MTYPE,
+ M_NOWAIT);
+ if (p == NULL) {
+ printf("%s: no memory to grow buffer (%d/%d)\n",
+ __func__, V_pfi_buffer_cnt, PFI_BUFFER_MAX);
+ return;
+ }
+ memcpy(p, V_pfi_buffer, V_pfi_buffer_max * sizeof(*V_pfi_buffer));
+ /* no need to zero buffer */
+ free(V_pfi_buffer, PFI_MTYPE);
+ V_pfi_buffer = p;
+ V_pfi_buffer_max = new_max;
+ }
+ if (af == AF_INET && net > 32)
+ net = 128;
+ p = V_pfi_buffer + V_pfi_buffer_cnt++;
+ bzero(p, sizeof(*p));
+ p->pfra_af = af;
+ p->pfra_net = net;
+ if (af == AF_INET)
+ p->pfra_ip4addr = ((struct sockaddr_in *)sa)->sin_addr;
+ else if (af == AF_INET6) {
+ p->pfra_ip6addr = ((struct sockaddr_in6 *)sa)->sin6_addr;
+ if (IN6_IS_SCOPE_EMBED(&p->pfra_ip6addr))
+ p->pfra_ip6addr.s6_addr16[1] = 0;
+ }
+ /* mask network address bits */
+ if (net < 128)
+ ((caddr_t)p)[p->pfra_net/8] &= ~(0xFF >> (p->pfra_net%8));
+ for (i = (p->pfra_net+7)/8; i < sizeof(p->pfra_u); i++)
+ ((caddr_t)p)[i] = 0;
+}
+
+void
+pfi_dynaddr_remove(struct pfi_dynaddr *dyn)
+{
+
+ KASSERT(dyn->pfid_kif != NULL, ("%s: null pfid_kif", __func__));
+ KASSERT(dyn->pfid_kt != NULL, ("%s: null pfid_kt", __func__));
+
+ TAILQ_REMOVE(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry);
+ pfi_kif_unref(dyn->pfid_kif);
+ pfr_detach_table(dyn->pfid_kt);
+ free(dyn, PFI_MTYPE);
+}
+
+void
+pfi_dynaddr_copyout(struct pf_addr_wrap *aw)
+{
+
+ KASSERT(aw->type == PF_ADDR_DYNIFTL,
+ ("%s: type %u", __func__, aw->type));
+
+ if (aw->p.dyn == NULL || aw->p.dyn->pfid_kif == NULL)
+ return;
+ aw->p.dyncnt = aw->p.dyn->pfid_acnt4 + aw->p.dyn->pfid_acnt6;
+}
+
+static int
+pfi_if_compare(struct pfi_kif *p, struct pfi_kif *q)
+{
+ return (strncmp(p->pfik_name, q->pfik_name, IFNAMSIZ));
+}
+
+void
+pfi_update_status(const char *name, struct pf_status *pfs)
+{
+ struct pfi_kif *p;
+ struct pfi_kif_cmp key;
+ struct ifg_member p_member, *ifgm;
+ TAILQ_HEAD(, ifg_member) ifg_members;
+ int i, j, k;
+
+ strlcpy(key.pfik_name, name, sizeof(key.pfik_name));
+ p = RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kif *)&key);
+ if (p == NULL)
+ return;
+
+ if (p->pfik_group != NULL) {
+ bcopy(&p->pfik_group->ifg_members, &ifg_members,
+ sizeof(ifg_members));
+ } else {
+ /* build a temporary list for p only */
+ bzero(&p_member, sizeof(p_member));
+ p_member.ifgm_ifp = p->pfik_ifp;
+ TAILQ_INIT(&ifg_members);
+ TAILQ_INSERT_TAIL(&ifg_members, &p_member, ifgm_next);
+ }
+ if (pfs) {
+ bzero(pfs->pcounters, sizeof(pfs->pcounters));
+ bzero(pfs->bcounters, sizeof(pfs->bcounters));
+ }
+ TAILQ_FOREACH(ifgm, &ifg_members, ifgm_next) {
+ if (ifgm->ifgm_ifp == NULL || ifgm->ifgm_ifp->if_pf_kif == NULL)
+ continue;
+ p = (struct pfi_kif *)ifgm->ifgm_ifp->if_pf_kif;
+
+ /* just clear statistics */
+ if (pfs == NULL) {
+ bzero(p->pfik_packets, sizeof(p->pfik_packets));
+ bzero(p->pfik_bytes, sizeof(p->pfik_bytes));
+ p->pfik_tzero = time_second;
+ continue;
+ }
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < 2; j++)
+ for (k = 0; k < 2; k++) {
+ pfs->pcounters[i][j][k] +=
+ p->pfik_packets[i][j][k];
+ pfs->bcounters[i][j] +=
+ p->pfik_bytes[i][j][k];
+ }
+ }
+}
+
+void
+pfi_get_ifaces(const char *name, struct pfi_kif *buf, int *size)
+{
+ struct pfi_kif *p, *nextp;
+ int n = 0;
+
+ for (p = RB_MIN(pfi_ifhead, &V_pfi_ifs); p; p = nextp) {
+ nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p);
+ if (pfi_skip_if(name, p))
+ continue;
+ if (*size <= n++)
+ break;
+ if (!p->pfik_tzero)
+ p->pfik_tzero = time_second;
+ bcopy(p, buf++, sizeof(*buf));
+ nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p);
+ }
+ *size = n;
+}
+
+static int
+pfi_skip_if(const char *filter, struct pfi_kif *p)
+{
+ int n;
+
+ if (filter == NULL || !*filter)
+ return (0);
+ if (!strcmp(p->pfik_name, filter))
+ return (0); /* exact match */
+ n = strlen(filter);
+ if (n < 1 || n >= IFNAMSIZ)
+ return (1); /* sanity check */
+ if (filter[n-1] >= '0' && filter[n-1] <= '9')
+ return (1); /* only do exact match in that case */
+ if (strncmp(p->pfik_name, filter, n))
+ return (1); /* prefix doesn't match */
+ return (p->pfik_name[n] < '0' || p->pfik_name[n] > '9');
+}
+
+int
+pfi_set_flags(const char *name, int flags)
+{
+ struct pfi_kif *p;
+
+ RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) {
+ if (pfi_skip_if(name, p))
+ continue;
+ p->pfik_flags |= flags;
+ }
+ return (0);
+}
+
+int
+pfi_clear_flags(const char *name, int flags)
+{
+ struct pfi_kif *p;
+
+ RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) {
+ if (pfi_skip_if(name, p))
+ continue;
+ p->pfik_flags &= ~flags;
+ }
+ return (0);
+}
+
+/* from pf_print_state.c */
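+/* Return the prefix length of a mask by counting its leading one bits. */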
+static int
+pfi_unmask(void *addr)
+{
+ struct pf_addr *m = addr;
+ int i = 31, j = 0, b = 0;
+ u_int32_t tmp;
+
+ while (j < 4 && m->addr32[j] == 0xffffffff) {
+ b += 32;
+ j++;
+ }
+ if (j < 4) {
+ tmp = ntohl(m->addr32[j]);
+ for (i = 31; tmp & (1 << i); --i)
+ b++;
+ }
+ return (b);
+}
+
+static void
+pfi_attach_ifnet_event(void *arg __unused, struct ifnet *ifp)
+{
+
+ CURVNET_SET(ifp->if_vnet);
+ if (V_pf_vnet_active == 0) {
+ /* Avoid teardown race in the least expensive way. */
+ CURVNET_RESTORE();
+ return;
+ }
+ pfi_attach_ifnet(ifp);
+#ifdef ALTQ
+ PF_RULES_WLOCK();
+ pf_altq_ifnet_event(ifp, 0);
+ PF_RULES_WUNLOCK();
+#endif
+ CURVNET_RESTORE();
+}
+
+static void
+pfi_detach_ifnet_event(void *arg __unused, struct ifnet *ifp)
+{
+ struct pfi_kif *kif = (struct pfi_kif *)ifp->if_pf_kif;
+
+ if (kif == NULL)
+ return;
+
+ CURVNET_SET(ifp->if_vnet);
+ if (V_pf_vnet_active == 0) {
+ /* Avoid teardown race in the least expensive way. */
+ CURVNET_RESTORE();
+ return;
+ }
+ PF_RULES_WLOCK();
+ V_pfi_update++;
+ pfi_kif_update(kif);
+
+ kif->pfik_ifp = NULL;
+ ifp->if_pf_kif = NULL;
+#ifdef ALTQ
+ pf_altq_ifnet_event(ifp, 1);
+#endif
+ PF_RULES_WUNLOCK();
+ CURVNET_RESTORE();
+}
+
+static void
+pfi_attach_group_event(void *arg, struct ifg_group *ifg)
+{
+
+ CURVNET_SET((struct vnet *)arg);
+ if (V_pf_vnet_active == 0) {
+ /* Avoid teardown race in the least expensive way. */
+ CURVNET_RESTORE();
+ return;
+ }
+ pfi_attach_ifgroup(ifg);
+ CURVNET_RESTORE();
+}
+
+static void
+pfi_change_group_event(void *arg, char *gname)
+{
+ struct pfi_kif *kif;
+
+ CURVNET_SET((struct vnet *)arg);
+ if (V_pf_vnet_active == 0) {
+ /* Avoid teardown race in the least expensive way. */
+ CURVNET_RESTORE();
+ return;
+ }
+
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+ PF_RULES_WLOCK();
+ V_pfi_update++;
+ kif = pfi_kif_attach(kif, gname);
+ pfi_kif_update(kif);
+ PF_RULES_WUNLOCK();
+ CURVNET_RESTORE();
+}
+
+static void
+pfi_detach_group_event(void *arg, struct ifg_group *ifg)
+{
+ struct pfi_kif *kif = (struct pfi_kif *)ifg->ifg_pf_kif;
+
+ if (kif == NULL)
+ return;
+
+ CURVNET_SET((struct vnet *)arg);
+ if (V_pf_vnet_active == 0) {
+ /* Avoid teardown race in the least expensive way. */
+ CURVNET_RESTORE();
+ return;
+ }
+ PF_RULES_WLOCK();
+ V_pfi_update++;
+
+ kif->pfik_group = NULL;
+ ifg->ifg_pf_kif = NULL;
+ PF_RULES_WUNLOCK();
+ CURVNET_RESTORE();
+}
+
+static void
+pfi_ifaddr_event(void *arg __unused, struct ifnet *ifp)
+{
+ if (ifp->if_pf_kif == NULL)
+ return;
+
+ CURVNET_SET(ifp->if_vnet);
+ if (V_pf_vnet_active == 0) {
+ /* Avoid teardown race in the least expensive way. */
+ CURVNET_RESTORE();
+ return;
+ }
+ PF_RULES_WLOCK();
+ if (ifp && ifp->if_pf_kif) {
+ V_pfi_update++;
+ pfi_kif_update(ifp->if_pf_kif);
+ }
+ PF_RULES_WUNLOCK();
+ CURVNET_RESTORE();
+}
diff --git a/freebsd/sys/netpfil/pf/pf_ioctl.c b/freebsd/sys/netpfil/pf/pf_ioctl.c
new file mode 100644
index 00000000..9c1523ca
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/pf_ioctl.c
@@ -0,0 +1,3872 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002,2003 Henning Brauer
+ * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ * $OpenBSD: pf_ioctl.c,v 1.213 2009/02/15 21:46:12 mbalmer Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/local/opt_inet.h>
+#include <rtems/bsd/local/opt_inet6.h>
+#include <rtems/bsd/local/opt_bpf.h>
+#include <rtems/bsd/local/opt_pf.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/endian.h>
+#include <sys/fcntl.h>
+#include <sys/filio.h>
+#include <sys/interrupt.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/md5.h>
+#include <sys/ucred.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/vnet.h>
+#include <net/route.h>
+#include <net/pfil.h>
+#include <net/pfvar.h>
+#include <net/if_pfsync.h>
+#include <net/if_pflog.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet6/ip6_var.h>
+#include <netinet/ip_icmp.h>
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif /* INET6 */
+
+#ifdef ALTQ
+#include <net/altq/altq.h>
+#endif
+
+static struct pf_pool *pf_get_pool(char *, u_int32_t, u_int8_t, u_int32_t,
+ u_int8_t, u_int8_t, u_int8_t);
+
+static void pf_mv_pool(struct pf_palist *, struct pf_palist *);
+static void pf_empty_pool(struct pf_palist *);
+static int pfioctl(struct cdev *, u_long, caddr_t, int,
+ struct thread *);
+#ifdef ALTQ
+static int pf_begin_altq(u_int32_t *);
+static int pf_rollback_altq(u_int32_t);
+static int pf_commit_altq(u_int32_t);
+static int pf_enable_altq(struct pf_altq *);
+static int pf_disable_altq(struct pf_altq *);
+static u_int32_t pf_qname2qid(char *);
+static void pf_qid_unref(u_int32_t);
+#endif /* ALTQ */
+static int pf_begin_rules(u_int32_t *, int, const char *);
+static int pf_rollback_rules(u_int32_t, int, char *);
+static int pf_setup_pfsync_matching(struct pf_ruleset *);
+static void pf_hash_rule(MD5_CTX *, struct pf_rule *);
+static void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *);
+static int pf_commit_rules(u_int32_t, int, char *);
+static int pf_addr_setup(struct pf_ruleset *,
+ struct pf_addr_wrap *, sa_family_t);
+static void pf_addr_copyout(struct pf_addr_wrap *);
+
+VNET_DEFINE(struct pf_rule, pf_default_rule);
+
+#ifdef ALTQ
+static VNET_DEFINE(int, pf_altq_running);
+#define V_pf_altq_running VNET(pf_altq_running)
+#endif
+
+#define TAGID_MAX 50000
+struct pf_tagname {
+ TAILQ_ENTRY(pf_tagname) entries;
+ char name[PF_TAG_NAME_SIZE];
+ uint16_t tag;
+ int ref;
+};
+
+TAILQ_HEAD(pf_tags, pf_tagname);
+#define V_pf_tags VNET(pf_tags)
+VNET_DEFINE(struct pf_tags, pf_tags);
+#define V_pf_qids VNET(pf_qids)
+VNET_DEFINE(struct pf_tags, pf_qids);
+static MALLOC_DEFINE(M_PFTAG, "pf_tag", "pf(4) tag names");
+static MALLOC_DEFINE(M_PFALTQ, "pf_altq", "pf(4) altq configuration db");
+static MALLOC_DEFINE(M_PFRULE, "pf_rule", "pf(4) rules");
+
+#if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE)
+#error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE
+#endif
+
+static u_int16_t tagname2tag(struct pf_tags *, char *);
+static u_int16_t pf_tagname2tag(char *);
+static void tag_unref(struct pf_tags *, u_int16_t);
+
+#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
+
+struct cdev *pf_dev;
+
+/*
+ * XXX - These are new and need to be checked when moving to a new version
+ */
+static void pf_clear_states(void);
+static int pf_clear_tables(void);
+static void pf_clear_srcnodes(struct pf_src_node *);
+static void pf_kill_srcnodes(struct pfioc_src_node_kill *);
+static void pf_tbladdr_copyout(struct pf_addr_wrap *);
+
+/*
+ * Wrapper functions for pfil(9) hooks
+ */
+#ifdef INET
+static int pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp,
+ int dir, struct inpcb *inp);
+static int pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp,
+ int dir, struct inpcb *inp);
+#endif
+#ifdef INET6
+static int pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp,
+ int dir, struct inpcb *inp);
+static int pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp,
+ int dir, struct inpcb *inp);
+#endif
+
+static int hook_pf(void);
+static int dehook_pf(void);
+static int shutdown_pf(void);
+static int pf_load(void);
+static int pf_unload(void);
+
+static struct cdevsw pf_cdevsw = {
+ .d_ioctl = pfioctl,
+ .d_name = PF_NAME,
+ .d_version = D_VERSION,
+};
+
+static volatile VNET_DEFINE(int, pf_pfil_hooked);
+#define V_pf_pfil_hooked VNET(pf_pfil_hooked)
+
+/*
+ * We need a flag that is neither hooked nor running to know when
+ * the VNET is "valid". We primarily need this to control (global)
+ * external events, e.g., eventhandlers.
+ */
+VNET_DEFINE(int, pf_vnet_active);
+#define V_pf_vnet_active VNET(pf_vnet_active)
+
+int pf_end_threads;
+
+struct rwlock pf_rules_lock;
+struct sx pf_ioctl_lock;
+
+/* pfsync */
+pfsync_state_import_t *pfsync_state_import_ptr = NULL;
+pfsync_insert_state_t *pfsync_insert_state_ptr = NULL;
+pfsync_update_state_t *pfsync_update_state_ptr = NULL;
+pfsync_delete_state_t *pfsync_delete_state_ptr = NULL;
+pfsync_clear_states_t *pfsync_clear_states_ptr = NULL;
+pfsync_defer_t *pfsync_defer_ptr = NULL;
+/* pflog */
+pflog_packet_t *pflog_packet_ptr = NULL;
+
+static void
+pfattach_vnet(void)
+{
+ u_int32_t *my_timeout = V_pf_default_rule.timeout;
+
+ pf_initialize();
+ pfr_initialize();
+ pfi_initialize_vnet();
+ pf_normalize_init();
+
+ V_pf_limits[PF_LIMIT_STATES].limit = PFSTATE_HIWAT;
+ V_pf_limits[PF_LIMIT_SRC_NODES].limit = PFSNODE_HIWAT;
+
+ RB_INIT(&V_pf_anchors);
+ pf_init_ruleset(&pf_main_ruleset);
+
+ /* default rule should never be garbage collected */
+ V_pf_default_rule.entries.tqe_prev = &V_pf_default_rule.entries.tqe_next;
+#ifdef PF_DEFAULT_TO_DROP
+ V_pf_default_rule.action = PF_DROP;
+#else
+ V_pf_default_rule.action = PF_PASS;
+#endif
+ V_pf_default_rule.nr = -1;
+ V_pf_default_rule.rtableid = -1;
+
+ V_pf_default_rule.states_cur = counter_u64_alloc(M_WAITOK);
+ V_pf_default_rule.states_tot = counter_u64_alloc(M_WAITOK);
+ V_pf_default_rule.src_nodes = counter_u64_alloc(M_WAITOK);
+
+ /* initialize default timeouts */
+ my_timeout[PFTM_TCP_FIRST_PACKET] = PFTM_TCP_FIRST_PACKET_VAL;
+ my_timeout[PFTM_TCP_OPENING] = PFTM_TCP_OPENING_VAL;
+ my_timeout[PFTM_TCP_ESTABLISHED] = PFTM_TCP_ESTABLISHED_VAL;
+ my_timeout[PFTM_TCP_CLOSING] = PFTM_TCP_CLOSING_VAL;
+ my_timeout[PFTM_TCP_FIN_WAIT] = PFTM_TCP_FIN_WAIT_VAL;
+ my_timeout[PFTM_TCP_CLOSED] = PFTM_TCP_CLOSED_VAL;
+ my_timeout[PFTM_UDP_FIRST_PACKET] = PFTM_UDP_FIRST_PACKET_VAL;
+ my_timeout[PFTM_UDP_SINGLE] = PFTM_UDP_SINGLE_VAL;
+ my_timeout[PFTM_UDP_MULTIPLE] = PFTM_UDP_MULTIPLE_VAL;
+ my_timeout[PFTM_ICMP_FIRST_PACKET] = PFTM_ICMP_FIRST_PACKET_VAL;
+ my_timeout[PFTM_ICMP_ERROR_REPLY] = PFTM_ICMP_ERROR_REPLY_VAL;
+ my_timeout[PFTM_OTHER_FIRST_PACKET] = PFTM_OTHER_FIRST_PACKET_VAL;
+ my_timeout[PFTM_OTHER_SINGLE] = PFTM_OTHER_SINGLE_VAL;
+ my_timeout[PFTM_OTHER_MULTIPLE] = PFTM_OTHER_MULTIPLE_VAL;
+ my_timeout[PFTM_FRAG] = PFTM_FRAG_VAL;
+ my_timeout[PFTM_INTERVAL] = PFTM_INTERVAL_VAL;
+ my_timeout[PFTM_SRC_NODE] = PFTM_SRC_NODE_VAL;
+ my_timeout[PFTM_TS_DIFF] = PFTM_TS_DIFF_VAL;
+ my_timeout[PFTM_ADAPTIVE_START] = PFSTATE_ADAPT_START;
+ my_timeout[PFTM_ADAPTIVE_END] = PFSTATE_ADAPT_END;
+
+ bzero(&V_pf_status, sizeof(V_pf_status));
+ V_pf_status.debug = PF_DEBUG_URGENT;
+
+ V_pf_pfil_hooked = 0;
+
+ /* XXX do our best to avoid a conflict */
+ V_pf_status.hostid = arc4random();
+
+ for (int i = 0; i < PFRES_MAX; i++)
+ V_pf_status.counters[i] = counter_u64_alloc(M_WAITOK);
+ for (int i = 0; i < LCNT_MAX; i++)
+ V_pf_status.lcounters[i] = counter_u64_alloc(M_WAITOK);
+ for (int i = 0; i < FCNT_MAX; i++)
+ V_pf_status.fcounters[i] = counter_u64_alloc(M_WAITOK);
+ for (int i = 0; i < SCNT_MAX; i++)
+ V_pf_status.scounters[i] = counter_u64_alloc(M_WAITOK);
+
+ if (swi_add(NULL, "pf send", pf_intr, curvnet, SWI_NET,
+ INTR_MPSAFE, &V_pf_swi_cookie) != 0)
+ /* XXXGL: leaked all above. */
+ return;
+}
+
+
+static struct pf_pool *
+pf_get_pool(char *anchor, u_int32_t ticket, u_int8_t rule_action,
+ u_int32_t rule_number, u_int8_t r_last, u_int8_t active,
+ u_int8_t check_ticket)
+{
+ struct pf_ruleset *ruleset;
+ struct pf_rule *rule;
+ int rs_num;
+
+ ruleset = pf_find_ruleset(anchor);
+ if (ruleset == NULL)
+ return (NULL);
+ rs_num = pf_get_ruleset_number(rule_action);
+ if (rs_num >= PF_RULESET_MAX)
+ return (NULL);
+ if (active) {
+ if (check_ticket && ticket !=
+ ruleset->rules[rs_num].active.ticket)
+ return (NULL);
+ if (r_last)
+ rule = TAILQ_LAST(ruleset->rules[rs_num].active.ptr,
+ pf_rulequeue);
+ else
+ rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr);
+ } else {
+ if (check_ticket && ticket !=
+ ruleset->rules[rs_num].inactive.ticket)
+ return (NULL);
+ if (r_last)
+ rule = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr,
+ pf_rulequeue);
+ else
+ rule = TAILQ_FIRST(ruleset->rules[rs_num].inactive.ptr);
+ }
+ if (!r_last) {
+ while ((rule != NULL) && (rule->nr != rule_number))
+ rule = TAILQ_NEXT(rule, entries);
+ }
+ if (rule == NULL)
+ return (NULL);
+
+ return (&rule->rpool);
+}
+
+static void
+pf_mv_pool(struct pf_palist *poola, struct pf_palist *poolb)
+{
+ struct pf_pooladdr *mv_pool_pa;
+
+ while ((mv_pool_pa = TAILQ_FIRST(poola)) != NULL) {
+ TAILQ_REMOVE(poola, mv_pool_pa, entries);
+ TAILQ_INSERT_TAIL(poolb, mv_pool_pa, entries);
+ }
+}
+
+static void
+pf_empty_pool(struct pf_palist *poola)
+{
+ struct pf_pooladdr *pa;
+
+ while ((pa = TAILQ_FIRST(poola)) != NULL) {
+ switch (pa->addr.type) {
+ case PF_ADDR_DYNIFTL:
+ pfi_dynaddr_remove(pa->addr.p.dyn);
+ break;
+ case PF_ADDR_TABLE:
+			/* XXX: this could be an unfinished pooladdr on pabuf */
+ if (pa->addr.p.tbl != NULL)
+ pfr_detach_table(pa->addr.p.tbl);
+ break;
+ }
+ if (pa->kif)
+ pfi_kif_unref(pa->kif);
+ TAILQ_REMOVE(poola, pa, entries);
+ free(pa, M_PFRULE);
+ }
+}
+
+static void
+pf_unlink_rule(struct pf_rulequeue *rulequeue, struct pf_rule *rule)
+{
+
+ PF_RULES_WASSERT();
+
+ TAILQ_REMOVE(rulequeue, rule, entries);
+
+ PF_UNLNKDRULES_LOCK();
+ rule->rule_flag |= PFRULE_REFS;
+ TAILQ_INSERT_TAIL(&V_pf_unlinked_rules, rule, entries);
+ PF_UNLNKDRULES_UNLOCK();
+}
+
+void
+pf_free_rule(struct pf_rule *rule)
+{
+
+ PF_RULES_WASSERT();
+
+ if (rule->tag)
+ tag_unref(&V_pf_tags, rule->tag);
+ if (rule->match_tag)
+ tag_unref(&V_pf_tags, rule->match_tag);
+#ifdef ALTQ
+ if (rule->pqid != rule->qid)
+ pf_qid_unref(rule->pqid);
+ pf_qid_unref(rule->qid);
+#endif
+ switch (rule->src.addr.type) {
+ case PF_ADDR_DYNIFTL:
+ pfi_dynaddr_remove(rule->src.addr.p.dyn);
+ break;
+ case PF_ADDR_TABLE:
+ pfr_detach_table(rule->src.addr.p.tbl);
+ break;
+ }
+ switch (rule->dst.addr.type) {
+ case PF_ADDR_DYNIFTL:
+ pfi_dynaddr_remove(rule->dst.addr.p.dyn);
+ break;
+ case PF_ADDR_TABLE:
+ pfr_detach_table(rule->dst.addr.p.tbl);
+ break;
+ }
+ if (rule->overload_tbl)
+ pfr_detach_table(rule->overload_tbl);
+ if (rule->kif)
+ pfi_kif_unref(rule->kif);
+ pf_anchor_remove(rule);
+ pf_empty_pool(&rule->rpool.list);
+ counter_u64_free(rule->states_cur);
+ counter_u64_free(rule->states_tot);
+ counter_u64_free(rule->src_nodes);
+ free(rule, M_PFRULE);
+}
+
+static u_int16_t
+tagname2tag(struct pf_tags *head, char *tagname)
+{
+ struct pf_tagname *tag, *p = NULL;
+ u_int16_t new_tagid = 1;
+
+ PF_RULES_WASSERT();
+
+ TAILQ_FOREACH(tag, head, entries)
+ if (strcmp(tagname, tag->name) == 0) {
+ tag->ref++;
+ return (tag->tag);
+ }
+
+ /*
+ * to avoid fragmentation, we do a linear search from the beginning
+ * and take the first free slot we find. if there is none or the list
+ * is empty, append a new entry at the end.
+ */
+
+ /* new entry */
+ if (!TAILQ_EMPTY(head))
+ for (p = TAILQ_FIRST(head); p != NULL &&
+ p->tag == new_tagid; p = TAILQ_NEXT(p, entries))
+ new_tagid = p->tag + 1;
+
+ if (new_tagid > TAGID_MAX)
+ return (0);
+
+ /* allocate and fill new struct pf_tagname */
+ tag = malloc(sizeof(*tag), M_PFTAG, M_NOWAIT|M_ZERO);
+ if (tag == NULL)
+ return (0);
+ strlcpy(tag->name, tagname, sizeof(tag->name));
+ tag->tag = new_tagid;
+ tag->ref++;
+
+ if (p != NULL) /* insert new entry before p */
+ TAILQ_INSERT_BEFORE(p, tag, entries);
+ else /* either list empty or no free slot in between */
+ TAILQ_INSERT_TAIL(head, tag, entries);
+
+ return (tag->tag);
+}
+
+static void
+tag_unref(struct pf_tags *head, u_int16_t tag)
+{
+ struct pf_tagname *p, *next;
+
+ PF_RULES_WASSERT();
+
+ for (p = TAILQ_FIRST(head); p != NULL; p = next) {
+ next = TAILQ_NEXT(p, entries);
+ if (tag == p->tag) {
+ if (--p->ref == 0) {
+ TAILQ_REMOVE(head, p, entries);
+ free(p, M_PFTAG);
+ }
+ break;
+ }
+ }
+}
+
+static u_int16_t
+pf_tagname2tag(char *tagname)
+{
+ return (tagname2tag(&V_pf_tags, tagname));
+}
+
+#ifdef ALTQ
+static u_int32_t
+pf_qname2qid(char *qname)
+{
+ return ((u_int32_t)tagname2tag(&V_pf_qids, qname));
+}
+
+static void
+pf_qid_unref(u_int32_t qid)
+{
+ tag_unref(&V_pf_qids, (u_int16_t)qid);
+}
+
+static int
+pf_begin_altq(u_int32_t *ticket)
+{
+ struct pf_altq *altq;
+ int error = 0;
+
+ PF_RULES_WASSERT();
+
+ /* Purge the old altq list */
+ while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) {
+ TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries);
+ if (altq->qname[0] == 0 &&
+ (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
+ /* detach and destroy the discipline */
+ error = altq_remove(altq);
+ } else
+ pf_qid_unref(altq->qid);
+ free(altq, M_PFALTQ);
+ }
+ if (error)
+ return (error);
+ *ticket = ++V_ticket_altqs_inactive;
+ V_altqs_inactive_open = 1;
+ return (0);
+}
+
+static int
+pf_rollback_altq(u_int32_t ticket)
+{
+ struct pf_altq *altq;
+ int error = 0;
+
+ PF_RULES_WASSERT();
+
+ if (!V_altqs_inactive_open || ticket != V_ticket_altqs_inactive)
+ return (0);
+ /* Purge the old altq list */
+ while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) {
+ TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries);
+ if (altq->qname[0] == 0 &&
+ (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
+ /* detach and destroy the discipline */
+ error = altq_remove(altq);
+ } else
+ pf_qid_unref(altq->qid);
+ free(altq, M_PFALTQ);
+ }
+ V_altqs_inactive_open = 0;
+ return (error);
+}
+
+static int
+pf_commit_altq(u_int32_t ticket)
+{
+ struct pf_altqqueue *old_altqs;
+ struct pf_altq *altq;
+ int err, error = 0;
+
+ PF_RULES_WASSERT();
+
+ if (!V_altqs_inactive_open || ticket != V_ticket_altqs_inactive)
+ return (EBUSY);
+
+ /* swap altqs, keep the old. */
+ old_altqs = V_pf_altqs_active;
+ V_pf_altqs_active = V_pf_altqs_inactive;
+ V_pf_altqs_inactive = old_altqs;
+ V_ticket_altqs_active = V_ticket_altqs_inactive;
+
+ /* Attach new disciplines */
+ TAILQ_FOREACH(altq, V_pf_altqs_active, entries) {
+ if (altq->qname[0] == 0 &&
+ (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
+ /* attach the discipline */
+ error = altq_pfattach(altq);
+ if (error == 0 && V_pf_altq_running)
+ error = pf_enable_altq(altq);
+ if (error != 0)
+ return (error);
+ }
+ }
+
+ /* Purge the old altq list */
+ while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) {
+ TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries);
+ if (altq->qname[0] == 0 &&
+ (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
+ /* detach and destroy the discipline */
+ if (V_pf_altq_running)
+ error = pf_disable_altq(altq);
+ err = altq_pfdetach(altq);
+ if (err != 0 && error == 0)
+ error = err;
+ err = altq_remove(altq);
+ if (err != 0 && error == 0)
+ error = err;
+ } else
+ pf_qid_unref(altq->qid);
+ free(altq, M_PFALTQ);
+ }
+
+ V_altqs_inactive_open = 0;
+ return (error);
+}
+
+static int
+pf_enable_altq(struct pf_altq *altq)
+{
+ struct ifnet *ifp;
+ struct tb_profile tb;
+ int error = 0;
+
+ if ((ifp = ifunit(altq->ifname)) == NULL)
+ return (EINVAL);
+
+ if (ifp->if_snd.altq_type != ALTQT_NONE)
+ error = altq_enable(&ifp->if_snd);
+
+ /* set tokenbucket regulator */
+ if (error == 0 && ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+ tb.rate = altq->ifbandwidth;
+ tb.depth = altq->tbrsize;
+ error = tbr_set(&ifp->if_snd, &tb);
+ }
+
+ return (error);
+}
+
+static int
+pf_disable_altq(struct pf_altq *altq)
+{
+ struct ifnet *ifp;
+ struct tb_profile tb;
+ int error;
+
+ if ((ifp = ifunit(altq->ifname)) == NULL)
+ return (EINVAL);
+
+ /*
+	 * when the discipline is no longer referenced, it has been
+	 * overridden by a new one; in that case, just return.
+ */
+ if (altq->altq_disc != ifp->if_snd.altq_disc)
+ return (0);
+
+ error = altq_disable(&ifp->if_snd);
+
+ if (error == 0) {
+ /* clear tokenbucket regulator */
+ tb.rate = 0;
+ error = tbr_set(&ifp->if_snd, &tb);
+ }
+
+ return (error);
+}
+
+void
+pf_altq_ifnet_event(struct ifnet *ifp, int remove)
+{
+ struct ifnet *ifp1;
+ struct pf_altq *a1, *a2, *a3;
+ u_int32_t ticket;
+ int error = 0;
+
+ /* Interrupt userland queue modifications */
+ if (V_altqs_inactive_open)
+ pf_rollback_altq(V_ticket_altqs_inactive);
+
+ /* Start new altq ruleset */
+ if (pf_begin_altq(&ticket))
+ return;
+
+ /* Copy the current active set */
+ TAILQ_FOREACH(a1, V_pf_altqs_active, entries) {
+ a2 = malloc(sizeof(*a2), M_PFALTQ, M_NOWAIT);
+ if (a2 == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ bcopy(a1, a2, sizeof(struct pf_altq));
+
+ if (a2->qname[0] != 0) {
+ if ((a2->qid = pf_qname2qid(a2->qname)) == 0) {
+ error = EBUSY;
+ free(a2, M_PFALTQ);
+ break;
+ }
+ a2->altq_disc = NULL;
+ TAILQ_FOREACH(a3, V_pf_altqs_inactive, entries) {
+ if (strncmp(a3->ifname, a2->ifname,
+ IFNAMSIZ) == 0 && a3->qname[0] == 0) {
+ a2->altq_disc = a3->altq_disc;
+ break;
+ }
+ }
+ }
+ /* Deactivate the interface in question */
+ a2->local_flags &= ~PFALTQ_FLAG_IF_REMOVED;
+ if ((ifp1 = ifunit(a2->ifname)) == NULL ||
+ (remove && ifp1 == ifp)) {
+ a2->local_flags |= PFALTQ_FLAG_IF_REMOVED;
+ } else {
+ error = altq_add(a2);
+
+ if (ticket != V_ticket_altqs_inactive)
+ error = EBUSY;
+
+ if (error) {
+ free(a2, M_PFALTQ);
+ break;
+ }
+ }
+
+ TAILQ_INSERT_TAIL(V_pf_altqs_inactive, a2, entries);
+ }
+
+ if (error != 0)
+ pf_rollback_altq(ticket);
+ else
+ pf_commit_altq(ticket);
+}
+#endif /* ALTQ */
+
+static int
+pf_begin_rules(u_int32_t *ticket, int rs_num, const char *anchor)
+{
+ struct pf_ruleset *rs;
+ struct pf_rule *rule;
+
+ PF_RULES_WASSERT();
+
+ if (rs_num < 0 || rs_num >= PF_RULESET_MAX)
+ return (EINVAL);
+ rs = pf_find_or_create_ruleset(anchor);
+ if (rs == NULL)
+ return (EINVAL);
+ while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) {
+ pf_unlink_rule(rs->rules[rs_num].inactive.ptr, rule);
+ rs->rules[rs_num].inactive.rcount--;
+ }
+ *ticket = ++rs->rules[rs_num].inactive.ticket;
+ rs->rules[rs_num].inactive.open = 1;
+ return (0);
+}
+
+static int
+pf_rollback_rules(u_int32_t ticket, int rs_num, char *anchor)
+{
+ struct pf_ruleset *rs;
+ struct pf_rule *rule;
+
+ PF_RULES_WASSERT();
+
+ if (rs_num < 0 || rs_num >= PF_RULESET_MAX)
+ return (EINVAL);
+ rs = pf_find_ruleset(anchor);
+ if (rs == NULL || !rs->rules[rs_num].inactive.open ||
+ rs->rules[rs_num].inactive.ticket != ticket)
+ return (0);
+ while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) {
+ pf_unlink_rule(rs->rules[rs_num].inactive.ptr, rule);
+ rs->rules[rs_num].inactive.rcount--;
+ }
+ rs->rules[rs_num].inactive.open = 0;
+ return (0);
+}
+
+#define PF_MD5_UPD(st, elm) \
+ MD5Update(ctx, (u_int8_t *) &(st)->elm, sizeof((st)->elm))
+
+#define PF_MD5_UPD_STR(st, elm) \
+ MD5Update(ctx, (u_int8_t *) (st)->elm, strlen((st)->elm))
+
+#define PF_MD5_UPD_HTONL(st, elm, stor) do { \
+ (stor) = htonl((st)->elm); \
+ MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int32_t));\
+} while (0)
+
+#define PF_MD5_UPD_HTONS(st, elm, stor) do { \
+ (stor) = htons((st)->elm); \
+ MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int16_t));\
+} while (0)
+
+static void
+pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr)
+{
+ PF_MD5_UPD(pfr, addr.type);
+ switch (pfr->addr.type) {
+ case PF_ADDR_DYNIFTL:
+ PF_MD5_UPD(pfr, addr.v.ifname);
+ PF_MD5_UPD(pfr, addr.iflags);
+ break;
+ case PF_ADDR_TABLE:
+ PF_MD5_UPD(pfr, addr.v.tblname);
+ break;
+ case PF_ADDR_ADDRMASK:
+ /* XXX ignore af? */
+ PF_MD5_UPD(pfr, addr.v.a.addr.addr32);
+ PF_MD5_UPD(pfr, addr.v.a.mask.addr32);
+ break;
+ }
+
+ PF_MD5_UPD(pfr, port[0]);
+ PF_MD5_UPD(pfr, port[1]);
+ PF_MD5_UPD(pfr, neg);
+ PF_MD5_UPD(pfr, port_op);
+}
+
+static void
+pf_hash_rule(MD5_CTX *ctx, struct pf_rule *rule)
+{
+ u_int16_t x;
+ u_int32_t y;
+
+ pf_hash_rule_addr(ctx, &rule->src);
+ pf_hash_rule_addr(ctx, &rule->dst);
+ PF_MD5_UPD_STR(rule, label);
+ PF_MD5_UPD_STR(rule, ifname);
+ PF_MD5_UPD_STR(rule, match_tagname);
+ PF_MD5_UPD_HTONS(rule, match_tag, x); /* dup? */
+ PF_MD5_UPD_HTONL(rule, os_fingerprint, y);
+ PF_MD5_UPD_HTONL(rule, prob, y);
+ PF_MD5_UPD_HTONL(rule, uid.uid[0], y);
+ PF_MD5_UPD_HTONL(rule, uid.uid[1], y);
+ PF_MD5_UPD(rule, uid.op);
+ PF_MD5_UPD_HTONL(rule, gid.gid[0], y);
+ PF_MD5_UPD_HTONL(rule, gid.gid[1], y);
+ PF_MD5_UPD(rule, gid.op);
+ PF_MD5_UPD_HTONL(rule, rule_flag, y);
+ PF_MD5_UPD(rule, action);
+ PF_MD5_UPD(rule, direction);
+ PF_MD5_UPD(rule, af);
+ PF_MD5_UPD(rule, quick);
+ PF_MD5_UPD(rule, ifnot);
+ PF_MD5_UPD(rule, match_tag_not);
+ PF_MD5_UPD(rule, natpass);
+ PF_MD5_UPD(rule, keep_state);
+ PF_MD5_UPD(rule, proto);
+ PF_MD5_UPD(rule, type);
+ PF_MD5_UPD(rule, code);
+ PF_MD5_UPD(rule, flags);
+ PF_MD5_UPD(rule, flagset);
+ PF_MD5_UPD(rule, allow_opts);
+ PF_MD5_UPD(rule, rt);
+ PF_MD5_UPD(rule, tos);
+}
+
+static int
+pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
+{
+ struct pf_ruleset *rs;
+ struct pf_rule *rule, **old_array;
+ struct pf_rulequeue *old_rules;
+ int error;
+ u_int32_t old_rcount;
+
+ PF_RULES_WASSERT();
+
+ if (rs_num < 0 || rs_num >= PF_RULESET_MAX)
+ return (EINVAL);
+ rs = pf_find_ruleset(anchor);
+ if (rs == NULL || !rs->rules[rs_num].inactive.open ||
+ ticket != rs->rules[rs_num].inactive.ticket)
+ return (EBUSY);
+
+ /* Calculate checksum for the main ruleset */
+ if (rs == &pf_main_ruleset) {
+ error = pf_setup_pfsync_matching(rs);
+ if (error != 0)
+ return (error);
+ }
+
+ /* Swap rules, keep the old. */
+ old_rules = rs->rules[rs_num].active.ptr;
+ old_rcount = rs->rules[rs_num].active.rcount;
+ old_array = rs->rules[rs_num].active.ptr_array;
+
+ rs->rules[rs_num].active.ptr =
+ rs->rules[rs_num].inactive.ptr;
+ rs->rules[rs_num].active.ptr_array =
+ rs->rules[rs_num].inactive.ptr_array;
+ rs->rules[rs_num].active.rcount =
+ rs->rules[rs_num].inactive.rcount;
+ rs->rules[rs_num].inactive.ptr = old_rules;
+ rs->rules[rs_num].inactive.ptr_array = old_array;
+ rs->rules[rs_num].inactive.rcount = old_rcount;
+
+ rs->rules[rs_num].active.ticket =
+ rs->rules[rs_num].inactive.ticket;
+ pf_calc_skip_steps(rs->rules[rs_num].active.ptr);
+
+
+ /* Purge the old rule list. */
+ while ((rule = TAILQ_FIRST(old_rules)) != NULL)
+ pf_unlink_rule(old_rules, rule);
+ if (rs->rules[rs_num].inactive.ptr_array)
+ free(rs->rules[rs_num].inactive.ptr_array, M_TEMP);
+ rs->rules[rs_num].inactive.ptr_array = NULL;
+ rs->rules[rs_num].inactive.rcount = 0;
+ rs->rules[rs_num].inactive.open = 0;
+ pf_remove_if_empty_ruleset(rs);
+
+ return (0);
+}
+
+static int
+pf_setup_pfsync_matching(struct pf_ruleset *rs)
+{
+ MD5_CTX ctx;
+ struct pf_rule *rule;
+ int rs_cnt;
+ u_int8_t digest[PF_MD5_DIGEST_LENGTH];
+
+ MD5Init(&ctx);
+ for (rs_cnt = 0; rs_cnt < PF_RULESET_MAX; rs_cnt++) {
+ /* XXX PF_RULESET_SCRUB as well? */
+ if (rs_cnt == PF_RULESET_SCRUB)
+ continue;
+
+ if (rs->rules[rs_cnt].inactive.ptr_array)
+ free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP);
+ rs->rules[rs_cnt].inactive.ptr_array = NULL;
+
+ if (rs->rules[rs_cnt].inactive.rcount) {
+ rs->rules[rs_cnt].inactive.ptr_array =
+ malloc(sizeof(caddr_t) *
+ rs->rules[rs_cnt].inactive.rcount,
+ M_TEMP, M_NOWAIT);
+
+ if (!rs->rules[rs_cnt].inactive.ptr_array)
+ return (ENOMEM);
+ }
+
+ TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr,
+ entries) {
+ pf_hash_rule(&ctx, rule);
+ (rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule;
+ }
+ }
+
+ MD5Final(digest, &ctx);
+ memcpy(V_pf_status.pf_chksum, digest, sizeof(V_pf_status.pf_chksum));
+ return (0);
+}
+
+static int
+pf_addr_setup(struct pf_ruleset *ruleset, struct pf_addr_wrap *addr,
+ sa_family_t af)
+{
+ int error = 0;
+
+ switch (addr->type) {
+ case PF_ADDR_TABLE:
+ addr->p.tbl = pfr_attach_table(ruleset, addr->v.tblname);
+ if (addr->p.tbl == NULL)
+ error = ENOMEM;
+ break;
+ case PF_ADDR_DYNIFTL:
+ error = pfi_dynaddr_setup(addr, af);
+ break;
+ }
+
+ return (error);
+}
+
+static void
+pf_addr_copyout(struct pf_addr_wrap *addr)
+{
+
+ switch (addr->type) {
+ case PF_ADDR_DYNIFTL:
+ pfi_dynaddr_copyout(addr);
+ break;
+ case PF_ADDR_TABLE:
+ pf_tbladdr_copyout(addr);
+ break;
+ }
+}
+
+static int
+pfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
+{
+ int error = 0;
+
+ /* XXX keep in sync with switch() below */
+ if (securelevel_gt(td->td_ucred, 2))
+ switch (cmd) {
+ case DIOCGETRULES:
+ case DIOCGETRULE:
+ case DIOCGETADDRS:
+ case DIOCGETADDR:
+ case DIOCGETSTATE:
+ case DIOCSETSTATUSIF:
+ case DIOCGETSTATUS:
+ case DIOCCLRSTATUS:
+ case DIOCNATLOOK:
+ case DIOCSETDEBUG:
+ case DIOCGETSTATES:
+ case DIOCGETTIMEOUT:
+ case DIOCCLRRULECTRS:
+ case DIOCGETLIMIT:
+ case DIOCGETALTQS:
+ case DIOCGETALTQ:
+ case DIOCGETQSTATS:
+ case DIOCGETRULESETS:
+ case DIOCGETRULESET:
+ case DIOCRGETTABLES:
+ case DIOCRGETTSTATS:
+ case DIOCRCLRTSTATS:
+ case DIOCRCLRADDRS:
+ case DIOCRADDADDRS:
+ case DIOCRDELADDRS:
+ case DIOCRSETADDRS:
+ case DIOCRGETADDRS:
+ case DIOCRGETASTATS:
+ case DIOCRCLRASTATS:
+ case DIOCRTSTADDRS:
+ case DIOCOSFPGET:
+ case DIOCGETSRCNODES:
+ case DIOCCLRSRCNODES:
+ case DIOCIGETIFACES:
+ case DIOCGIFSPEED:
+ case DIOCSETIFFLAG:
+ case DIOCCLRIFFLAG:
+ break;
+ case DIOCRCLRTABLES:
+ case DIOCRADDTABLES:
+ case DIOCRDELTABLES:
+ case DIOCRSETTFLAGS:
+ if (((struct pfioc_table *)addr)->pfrio_flags &
+ PFR_FLAG_DUMMY)
+ break; /* dummy operation ok */
+ return (EPERM);
+ default:
+ return (EPERM);
+ }
+
+ if (!(flags & FWRITE))
+ switch (cmd) {
+ case DIOCGETRULES:
+ case DIOCGETADDRS:
+ case DIOCGETADDR:
+ case DIOCGETSTATE:
+ case DIOCGETSTATUS:
+ case DIOCGETSTATES:
+ case DIOCGETTIMEOUT:
+ case DIOCGETLIMIT:
+ case DIOCGETALTQS:
+ case DIOCGETALTQ:
+ case DIOCGETQSTATS:
+ case DIOCGETRULESETS:
+ case DIOCGETRULESET:
+ case DIOCNATLOOK:
+ case DIOCRGETTABLES:
+ case DIOCRGETTSTATS:
+ case DIOCRGETADDRS:
+ case DIOCRGETASTATS:
+ case DIOCRTSTADDRS:
+ case DIOCOSFPGET:
+ case DIOCGETSRCNODES:
+ case DIOCIGETIFACES:
+ case DIOCGIFSPEED:
+ break;
+ case DIOCRCLRTABLES:
+ case DIOCRADDTABLES:
+ case DIOCRDELTABLES:
+ case DIOCRCLRTSTATS:
+ case DIOCRCLRADDRS:
+ case DIOCRADDADDRS:
+ case DIOCRDELADDRS:
+ case DIOCRSETADDRS:
+ case DIOCRSETTFLAGS:
+ if (((struct pfioc_table *)addr)->pfrio_flags &
+ PFR_FLAG_DUMMY) {
+ flags |= FWRITE; /* need write lock for dummy */
+ break; /* dummy operation ok */
+ }
+ return (EACCES);
+ case DIOCGETRULE:
+ if (((struct pfioc_rule *)addr)->action ==
+ PF_GET_CLR_CNTR)
+ return (EACCES);
+ break;
+ default:
+ return (EACCES);
+ }
+
+ CURVNET_SET(TD_TO_VNET(td));
+
+ switch (cmd) {
+ case DIOCSTART:
+ sx_xlock(&pf_ioctl_lock);
+ if (V_pf_status.running)
+ error = EEXIST;
+ else {
+ int cpu;
+
+ error = hook_pf();
+ if (error) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: pfil registration failed\n"));
+ break;
+ }
+ V_pf_status.running = 1;
+ V_pf_status.since = time_second;
+
+ CPU_FOREACH(cpu)
+ V_pf_stateid[cpu] = time_second;
+
+ DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n"));
+ }
+ break;
+
+ case DIOCSTOP:
+ sx_xlock(&pf_ioctl_lock);
+ if (!V_pf_status.running)
+ error = ENOENT;
+ else {
+ V_pf_status.running = 0;
+ error = dehook_pf();
+ if (error) {
+ V_pf_status.running = 1;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: pfil unregistration failed\n"));
+ }
+ V_pf_status.since = time_second;
+ DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n"));
+ }
+ break;
+
+ case DIOCADDRULE: {
+ struct pfioc_rule *pr = (struct pfioc_rule *)addr;
+ struct pf_ruleset *ruleset;
+ struct pf_rule *rule, *tail;
+ struct pf_pooladdr *pa;
+ struct pfi_kif *kif = NULL;
+ int rs_num;
+
+ if (pr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
+ error = EINVAL;
+ break;
+ }
+#ifndef INET
+ if (pr->rule.af == AF_INET) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET */
+#ifndef INET6
+ if (pr->rule.af == AF_INET6) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET6 */
+
+ rule = malloc(sizeof(*rule), M_PFRULE, M_WAITOK);
+ bcopy(&pr->rule, rule, sizeof(struct pf_rule));
+ if (rule->ifname[0])
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+ rule->states_cur = counter_u64_alloc(M_WAITOK);
+ rule->states_tot = counter_u64_alloc(M_WAITOK);
+ rule->src_nodes = counter_u64_alloc(M_WAITOK);
+#ifndef __rtems__
+ rule->cuid = td->td_ucred->cr_ruid;
+ rule->cpid = td->td_proc ? td->td_proc->p_pid : 0;
+#else /* __rtems__ */
+ rule->cuid = BSD_DEFAULT_UID;
+ rule->cpid = BSD_DEFAULT_PID;
+#endif /* __rtems__ */
+ TAILQ_INIT(&rule->rpool.list);
+
+#define ERROUT(x) { error = (x); goto DIOCADDRULE_error; }
+
+ PF_RULES_WLOCK();
+ pr->anchor[sizeof(pr->anchor) - 1] = 0;
+ ruleset = pf_find_ruleset(pr->anchor);
+ if (ruleset == NULL)
+ ERROUT(EINVAL);
+ rs_num = pf_get_ruleset_number(pr->rule.action);
+ if (rs_num >= PF_RULESET_MAX)
+ ERROUT(EINVAL);
+ if (pr->ticket != ruleset->rules[rs_num].inactive.ticket) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("ticket: %d != [%d]%d\n", pr->ticket, rs_num,
+ ruleset->rules[rs_num].inactive.ticket));
+ ERROUT(EBUSY);
+ }
+ if (pr->pool_ticket != V_ticket_pabuf) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pool_ticket: %d != %d\n", pr->pool_ticket,
+ V_ticket_pabuf));
+ ERROUT(EBUSY);
+ }
+
+ tail = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr,
+ pf_rulequeue);
+ if (tail)
+ rule->nr = tail->nr + 1;
+ else
+ rule->nr = 0;
+ if (rule->ifname[0]) {
+ rule->kif = pfi_kif_attach(kif, rule->ifname);
+ pfi_kif_ref(rule->kif);
+ } else
+ rule->kif = NULL;
+
+ if (rule->rtableid > 0 && rule->rtableid >= rt_numfibs)
+ error = EBUSY;
+
+#ifdef ALTQ
+ /* set queue IDs */
+ if (rule->qname[0] != 0) {
+ if ((rule->qid = pf_qname2qid(rule->qname)) == 0)
+ error = EBUSY;
+ else if (rule->pqname[0] != 0) {
+ if ((rule->pqid =
+ pf_qname2qid(rule->pqname)) == 0)
+ error = EBUSY;
+ } else
+ rule->pqid = rule->qid;
+ }
+#endif
+ if (rule->tagname[0])
+ if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0)
+ error = EBUSY;
+ if (rule->match_tagname[0])
+ if ((rule->match_tag =
+ pf_tagname2tag(rule->match_tagname)) == 0)
+ error = EBUSY;
+ if (rule->rt && !rule->direction)
+ error = EINVAL;
+ if (!rule->log)
+ rule->logif = 0;
+ if (rule->logif >= PFLOGIFS_MAX)
+ error = EINVAL;
+ if (pf_addr_setup(ruleset, &rule->src.addr, rule->af))
+ error = ENOMEM;
+ if (pf_addr_setup(ruleset, &rule->dst.addr, rule->af))
+ error = ENOMEM;
+ if (pf_anchor_setup(rule, ruleset, pr->anchor_call))
+ error = EINVAL;
+ if (rule->scrub_flags & PFSTATE_SETPRIO &&
+ (rule->set_prio[0] > PF_PRIO_MAX ||
+ rule->set_prio[1] > PF_PRIO_MAX))
+ error = EINVAL;
+ TAILQ_FOREACH(pa, &V_pf_pabuf, entries)
+ if (pa->addr.type == PF_ADDR_TABLE) {
+ pa->addr.p.tbl = pfr_attach_table(ruleset,
+ pa->addr.v.tblname);
+ if (pa->addr.p.tbl == NULL)
+ error = ENOMEM;
+ }
+
+ rule->overload_tbl = NULL;
+ if (rule->overload_tblname[0]) {
+ if ((rule->overload_tbl = pfr_attach_table(ruleset,
+ rule->overload_tblname)) == NULL)
+ error = EINVAL;
+ else
+ rule->overload_tbl->pfrkt_flags |=
+ PFR_TFLAG_ACTIVE;
+ }
+
+ pf_mv_pool(&V_pf_pabuf, &rule->rpool.list);
+ if (((((rule->action == PF_NAT) || (rule->action == PF_RDR) ||
+ (rule->action == PF_BINAT)) && rule->anchor == NULL) ||
+ (rule->rt > PF_FASTROUTE)) &&
+ (TAILQ_FIRST(&rule->rpool.list) == NULL))
+ error = EINVAL;
+
+ if (error) {
+ pf_free_rule(rule);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ rule->rpool.cur = TAILQ_FIRST(&rule->rpool.list);
+ rule->evaluations = rule->packets[0] = rule->packets[1] =
+ rule->bytes[0] = rule->bytes[1] = 0;
+ TAILQ_INSERT_TAIL(ruleset->rules[rs_num].inactive.ptr,
+ rule, entries);
+ ruleset->rules[rs_num].inactive.rcount++;
+ PF_RULES_WUNLOCK();
+ break;
+
+#undef ERROUT
+DIOCADDRULE_error:
+ PF_RULES_WUNLOCK();
+ counter_u64_free(rule->states_cur);
+ counter_u64_free(rule->states_tot);
+ counter_u64_free(rule->src_nodes);
+ free(rule, M_PFRULE);
+ if (kif)
+ free(kif, PFI_MTYPE);
+ break;
+ }
+
+ case DIOCGETRULES: {
+ struct pfioc_rule *pr = (struct pfioc_rule *)addr;
+ struct pf_ruleset *ruleset;
+ struct pf_rule *tail;
+ int rs_num;
+
+ PF_RULES_WLOCK();
+ pr->anchor[sizeof(pr->anchor) - 1] = 0;
+ ruleset = pf_find_ruleset(pr->anchor);
+ if (ruleset == NULL) {
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ rs_num = pf_get_ruleset_number(pr->rule.action);
+ if (rs_num >= PF_RULESET_MAX) {
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ tail = TAILQ_LAST(ruleset->rules[rs_num].active.ptr,
+ pf_rulequeue);
+ if (tail)
+ pr->nr = tail->nr + 1;
+ else
+ pr->nr = 0;
+ pr->ticket = ruleset->rules[rs_num].active.ticket;
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCGETRULE: {
+ struct pfioc_rule *pr = (struct pfioc_rule *)addr;
+ struct pf_ruleset *ruleset;
+ struct pf_rule *rule;
+ int rs_num, i;
+
+ PF_RULES_WLOCK();
+ pr->anchor[sizeof(pr->anchor) - 1] = 0;
+ ruleset = pf_find_ruleset(pr->anchor);
+ if (ruleset == NULL) {
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ rs_num = pf_get_ruleset_number(pr->rule.action);
+ if (rs_num >= PF_RULESET_MAX) {
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ if (pr->ticket != ruleset->rules[rs_num].active.ticket) {
+ PF_RULES_WUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr);
+ while ((rule != NULL) && (rule->nr != pr->nr))
+ rule = TAILQ_NEXT(rule, entries);
+ if (rule == NULL) {
+ PF_RULES_WUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ bcopy(rule, &pr->rule, sizeof(struct pf_rule));
+ pr->rule.u_states_cur = counter_u64_fetch(rule->states_cur);
+ pr->rule.u_states_tot = counter_u64_fetch(rule->states_tot);
+ pr->rule.u_src_nodes = counter_u64_fetch(rule->src_nodes);
+ if (pf_anchor_copyout(ruleset, rule, pr)) {
+ PF_RULES_WUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ pf_addr_copyout(&pr->rule.src.addr);
+ pf_addr_copyout(&pr->rule.dst.addr);
+ for (i = 0; i < PF_SKIP_COUNT; ++i)
+ if (rule->skip[i].ptr == NULL)
+ pr->rule.skip[i].nr = -1;
+ else
+ pr->rule.skip[i].nr =
+ rule->skip[i].ptr->nr;
+
+ if (pr->action == PF_GET_CLR_CNTR) {
+ rule->evaluations = 0;
+ rule->packets[0] = rule->packets[1] = 0;
+ rule->bytes[0] = rule->bytes[1] = 0;
+ counter_u64_zero(rule->states_tot);
+ }
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCCHANGERULE: {
+ struct pfioc_rule *pcr = (struct pfioc_rule *)addr;
+ struct pf_ruleset *ruleset;
+ struct pf_rule *oldrule = NULL, *newrule = NULL;
+ struct pfi_kif *kif = NULL;
+ struct pf_pooladdr *pa;
+ u_int32_t nr = 0;
+ int rs_num;
+
+ if (pcr->action < PF_CHANGE_ADD_HEAD ||
+ pcr->action > PF_CHANGE_GET_TICKET) {
+ error = EINVAL;
+ break;
+ }
+ if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
+ error = EINVAL;
+ break;
+ }
+
+ if (pcr->action != PF_CHANGE_REMOVE) {
+#ifndef INET
+ if (pcr->rule.af == AF_INET) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET */
+#ifndef INET6
+ if (pcr->rule.af == AF_INET6) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET6 */
+ newrule = malloc(sizeof(*newrule), M_PFRULE, M_WAITOK);
+ bcopy(&pcr->rule, newrule, sizeof(struct pf_rule));
+ if (newrule->ifname[0])
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+ newrule->states_cur = counter_u64_alloc(M_WAITOK);
+ newrule->states_tot = counter_u64_alloc(M_WAITOK);
+ newrule->src_nodes = counter_u64_alloc(M_WAITOK);
+#ifndef __rtems__
+ newrule->cuid = td->td_ucred->cr_ruid;
+ newrule->cpid = td->td_proc ? td->td_proc->p_pid : 0;
+#else /* __rtems__ */
+ newrule->cuid = BSD_DEFAULT_UID;
+ newrule->cpid = BSD_DEFAULT_PID;
+#endif /* __rtems__ */
+ TAILQ_INIT(&newrule->rpool.list);
+ }
+
+#define ERROUT(x) { error = (x); goto DIOCCHANGERULE_error; }
+
+ PF_RULES_WLOCK();
+ if (!(pcr->action == PF_CHANGE_REMOVE ||
+ pcr->action == PF_CHANGE_GET_TICKET) &&
+ pcr->pool_ticket != V_ticket_pabuf)
+ ERROUT(EBUSY);
+
+ ruleset = pf_find_ruleset(pcr->anchor);
+ if (ruleset == NULL)
+ ERROUT(EINVAL);
+
+ rs_num = pf_get_ruleset_number(pcr->rule.action);
+ if (rs_num >= PF_RULESET_MAX)
+ ERROUT(EINVAL);
+
+ if (pcr->action == PF_CHANGE_GET_TICKET) {
+ pcr->ticket = ++ruleset->rules[rs_num].active.ticket;
+ ERROUT(0);
+ } else if (pcr->ticket !=
+ ruleset->rules[rs_num].active.ticket)
+ ERROUT(EINVAL);
+
+ if (pcr->action != PF_CHANGE_REMOVE) {
+ if (newrule->ifname[0]) {
+ newrule->kif = pfi_kif_attach(kif,
+ newrule->ifname);
+ pfi_kif_ref(newrule->kif);
+ } else
+ newrule->kif = NULL;
+
+ if (newrule->rtableid > 0 &&
+ newrule->rtableid >= rt_numfibs)
+ error = EBUSY;
+
+#ifdef ALTQ
+ /* set queue IDs */
+ if (newrule->qname[0] != 0) {
+ if ((newrule->qid =
+ pf_qname2qid(newrule->qname)) == 0)
+ error = EBUSY;
+ else if (newrule->pqname[0] != 0) {
+ if ((newrule->pqid =
+ pf_qname2qid(newrule->pqname)) == 0)
+ error = EBUSY;
+ } else
+ newrule->pqid = newrule->qid;
+ }
+#endif /* ALTQ */
+ if (newrule->tagname[0])
+ if ((newrule->tag =
+ pf_tagname2tag(newrule->tagname)) == 0)
+ error = EBUSY;
+ if (newrule->match_tagname[0])
+ if ((newrule->match_tag = pf_tagname2tag(
+ newrule->match_tagname)) == 0)
+ error = EBUSY;
+ if (newrule->rt && !newrule->direction)
+ error = EINVAL;
+ if (!newrule->log)
+ newrule->logif = 0;
+ if (newrule->logif >= PFLOGIFS_MAX)
+ error = EINVAL;
+ if (pf_addr_setup(ruleset, &newrule->src.addr, newrule->af))
+ error = ENOMEM;
+ if (pf_addr_setup(ruleset, &newrule->dst.addr, newrule->af))
+ error = ENOMEM;
+ if (pf_anchor_setup(newrule, ruleset, pcr->anchor_call))
+ error = EINVAL;
+ TAILQ_FOREACH(pa, &V_pf_pabuf, entries)
+ if (pa->addr.type == PF_ADDR_TABLE) {
+ pa->addr.p.tbl =
+ pfr_attach_table(ruleset,
+ pa->addr.v.tblname);
+ if (pa->addr.p.tbl == NULL)
+ error = ENOMEM;
+ }
+
+ newrule->overload_tbl = NULL;
+ if (newrule->overload_tblname[0]) {
+ if ((newrule->overload_tbl = pfr_attach_table(
+ ruleset, newrule->overload_tblname)) ==
+ NULL)
+ error = EINVAL;
+ else
+ newrule->overload_tbl->pfrkt_flags |=
+ PFR_TFLAG_ACTIVE;
+ }
+
+ pf_mv_pool(&V_pf_pabuf, &newrule->rpool.list);
+ if (((((newrule->action == PF_NAT) ||
+ (newrule->action == PF_RDR) ||
+ (newrule->action == PF_BINAT) ||
+ (newrule->rt > PF_FASTROUTE)) &&
+ !newrule->anchor)) &&
+ (TAILQ_FIRST(&newrule->rpool.list) == NULL))
+ error = EINVAL;
+
+ if (error) {
+ pf_free_rule(newrule);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ newrule->rpool.cur = TAILQ_FIRST(&newrule->rpool.list);
+ newrule->evaluations = 0;
+ newrule->packets[0] = newrule->packets[1] = 0;
+ newrule->bytes[0] = newrule->bytes[1] = 0;
+ }
+ pf_empty_pool(&V_pf_pabuf);
+
+ if (pcr->action == PF_CHANGE_ADD_HEAD)
+ oldrule = TAILQ_FIRST(
+ ruleset->rules[rs_num].active.ptr);
+ else if (pcr->action == PF_CHANGE_ADD_TAIL)
+ oldrule = TAILQ_LAST(
+ ruleset->rules[rs_num].active.ptr, pf_rulequeue);
+ else {
+ oldrule = TAILQ_FIRST(
+ ruleset->rules[rs_num].active.ptr);
+ while ((oldrule != NULL) && (oldrule->nr != pcr->nr))
+ oldrule = TAILQ_NEXT(oldrule, entries);
+ if (oldrule == NULL) {
+ if (newrule != NULL)
+ pf_free_rule(newrule);
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ }
+
+ if (pcr->action == PF_CHANGE_REMOVE) {
+ pf_unlink_rule(ruleset->rules[rs_num].active.ptr,
+ oldrule);
+ ruleset->rules[rs_num].active.rcount--;
+ } else {
+ if (oldrule == NULL)
+ TAILQ_INSERT_TAIL(
+ ruleset->rules[rs_num].active.ptr,
+ newrule, entries);
+ else if (pcr->action == PF_CHANGE_ADD_HEAD ||
+ pcr->action == PF_CHANGE_ADD_BEFORE)
+ TAILQ_INSERT_BEFORE(oldrule, newrule, entries);
+ else
+ TAILQ_INSERT_AFTER(
+ ruleset->rules[rs_num].active.ptr,
+ oldrule, newrule, entries);
+ ruleset->rules[rs_num].active.rcount++;
+ }
+
+ nr = 0;
+ TAILQ_FOREACH(oldrule,
+ ruleset->rules[rs_num].active.ptr, entries)
+ oldrule->nr = nr++;
+
+ ruleset->rules[rs_num].active.ticket++;
+
+ pf_calc_skip_steps(ruleset->rules[rs_num].active.ptr);
+ pf_remove_if_empty_ruleset(ruleset);
+
+ PF_RULES_WUNLOCK();
+ break;
+
+#undef ERROUT
+DIOCCHANGERULE_error:
+ PF_RULES_WUNLOCK();
+ if (newrule != NULL) {
+ counter_u64_free(newrule->states_cur);
+ counter_u64_free(newrule->states_tot);
+ counter_u64_free(newrule->src_nodes);
+ free(newrule, M_PFRULE);
+ }
+ if (kif != NULL)
+ free(kif, PFI_MTYPE);
+ break;
+ }
+
+ case DIOCCLRSTATES: {
+ struct pf_state *s;
+ struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
+ u_int i, killed = 0;
+
+ for (i = 0; i <= pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+
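+			/*
+			 * pf_unlink_state() drops the hash row lock, so
+			 * restart the bucket scan after each removal.
+			 */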
+relock_DIOCCLRSTATES:
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry)
+ if (!psk->psk_ifname[0] ||
+ !strcmp(psk->psk_ifname,
+ s->kif->pfik_name)) {
+ /*
+ * Don't send out individual
+ * delete messages.
+ */
+ s->state_flags |= PFSTATE_NOSYNC;
+ pf_unlink_state(s, PF_ENTER_LOCKED);
+ killed++;
+ goto relock_DIOCCLRSTATES;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+ psk->psk_killed = killed;
+ if (pfsync_clear_states_ptr != NULL)
+ pfsync_clear_states_ptr(V_pf_status.hostid, psk->psk_ifname);
+ break;
+ }
+
+ case DIOCKILLSTATES: {
+ struct pf_state *s;
+ struct pf_state_key *sk;
+ struct pf_addr *srcaddr, *dstaddr;
+ u_int16_t srcport, dstport;
+ struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
+ u_int i, killed = 0;
+
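+		/* A nonzero id selects a single state by id/creatorid. */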
+ if (psk->psk_pfcmp.id) {
+ if (psk->psk_pfcmp.creatorid == 0)
+ psk->psk_pfcmp.creatorid = V_pf_status.hostid;
+ if ((s = pf_find_state_byid(psk->psk_pfcmp.id,
+ psk->psk_pfcmp.creatorid))) {
+ pf_unlink_state(s, PF_ENTER_LOCKED);
+ psk->psk_killed = 1;
+ }
+ break;
+ }
+
+ for (i = 0; i <= pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+
+relock_DIOCKILLSTATES:
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ sk = s->key[PF_SK_WIRE];
+ if (s->direction == PF_OUT) {
+ srcaddr = &sk->addr[1];
+ dstaddr = &sk->addr[0];
+ srcport = sk->port[1];
+ dstport = sk->port[0];
+ } else {
+ srcaddr = &sk->addr[0];
+ dstaddr = &sk->addr[1];
+ srcport = sk->port[0];
+ dstport = sk->port[1];
+ }
+
+ if ((!psk->psk_af || sk->af == psk->psk_af)
+ && (!psk->psk_proto || psk->psk_proto ==
+ sk->proto) &&
+ PF_MATCHA(psk->psk_src.neg,
+ &psk->psk_src.addr.v.a.addr,
+ &psk->psk_src.addr.v.a.mask,
+ srcaddr, sk->af) &&
+ PF_MATCHA(psk->psk_dst.neg,
+ &psk->psk_dst.addr.v.a.addr,
+ &psk->psk_dst.addr.v.a.mask,
+ dstaddr, sk->af) &&
+ (psk->psk_src.port_op == 0 ||
+ pf_match_port(psk->psk_src.port_op,
+ psk->psk_src.port[0], psk->psk_src.port[1],
+ srcport)) &&
+ (psk->psk_dst.port_op == 0 ||
+ pf_match_port(psk->psk_dst.port_op,
+ psk->psk_dst.port[0], psk->psk_dst.port[1],
+ dstport)) &&
+ (!psk->psk_label[0] ||
+ (s->rule.ptr->label[0] &&
+ !strcmp(psk->psk_label,
+ s->rule.ptr->label))) &&
+ (!psk->psk_ifname[0] ||
+ !strcmp(psk->psk_ifname,
+ s->kif->pfik_name))) {
+ pf_unlink_state(s, PF_ENTER_LOCKED);
+ killed++;
+ goto relock_DIOCKILLSTATES;
+ }
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+ psk->psk_killed = killed;
+ break;
+ }
+
+ case DIOCADDSTATE: {
+ struct pfioc_state *ps = (struct pfioc_state *)addr;
+ struct pfsync_state *sp = &ps->state;
+
+ if (sp->timeout >= PFTM_MAX) {
+ error = EINVAL;
+ break;
+ }
+ if (pfsync_state_import_ptr != NULL) {
+ PF_RULES_RLOCK();
+ error = pfsync_state_import_ptr(sp, PFSYNC_SI_IOCTL);
+ PF_RULES_RUNLOCK();
+ } else
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ case DIOCGETSTATE: {
+ struct pfioc_state *ps = (struct pfioc_state *)addr;
+ struct pf_state *s;
+
+ s = pf_find_state_byid(ps->state.id, ps->state.creatorid);
+ if (s == NULL) {
+ error = ENOENT;
+ break;
+ }
+
+ pfsync_state_export(&ps->state, s);
+ PF_STATE_UNLOCK(s);
+ break;
+ }
+
+ case DIOCGETSTATES: {
+ struct pfioc_states *ps = (struct pfioc_states *)addr;
+ struct pf_state *s;
+ struct pfsync_state *pstore, *p;
+ int i, nr;
+
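+		/* A zero-length request only reports the space needed. */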
+ if (ps->ps_len == 0) {
+ nr = uma_zone_get_cur(V_pf_state_z);
+ ps->ps_len = sizeof(struct pfsync_state) * nr;
+ break;
+ }
+
+ p = pstore = malloc(ps->ps_len, M_TEMP, M_WAITOK);
+ nr = 0;
+
+ for (i = 0; i <= pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+
+ if (s->timeout == PFTM_UNLINKED)
+ continue;
+
+ if ((nr+1) * sizeof(*p) > ps->ps_len) {
+ PF_HASHROW_UNLOCK(ih);
+ goto DIOCGETSTATES_full;
+ }
+ pfsync_state_export(p, s);
+ p++;
+ nr++;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+DIOCGETSTATES_full:
+ error = copyout(pstore, ps->ps_states,
+ sizeof(struct pfsync_state) * nr);
+ if (error) {
+ free(pstore, M_TEMP);
+ break;
+ }
+ ps->ps_len = sizeof(struct pfsync_state) * nr;
+ free(pstore, M_TEMP);
+
+ break;
+ }
+
+ case DIOCGETSTATUS: {
+ struct pf_status *s = (struct pf_status *)addr;
+
+ PF_RULES_RLOCK();
+ s->running = V_pf_status.running;
+ s->since = V_pf_status.since;
+ s->debug = V_pf_status.debug;
+ s->hostid = V_pf_status.hostid;
+ s->states = V_pf_status.states;
+ s->src_nodes = V_pf_status.src_nodes;
+
+ for (int i = 0; i < PFRES_MAX; i++)
+ s->counters[i] =
+ counter_u64_fetch(V_pf_status.counters[i]);
+ for (int i = 0; i < LCNT_MAX; i++)
+ s->lcounters[i] =
+ counter_u64_fetch(V_pf_status.lcounters[i]);
+ for (int i = 0; i < FCNT_MAX; i++)
+ s->fcounters[i] =
+ counter_u64_fetch(V_pf_status.fcounters[i]);
+ for (int i = 0; i < SCNT_MAX; i++)
+ s->scounters[i] =
+ counter_u64_fetch(V_pf_status.scounters[i]);
+
+ bcopy(V_pf_status.ifname, s->ifname, IFNAMSIZ);
+ bcopy(V_pf_status.pf_chksum, s->pf_chksum,
+ PF_MD5_DIGEST_LENGTH);
+
+ pfi_update_status(s->ifname, s);
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCSETSTATUSIF: {
+ struct pfioc_if *pi = (struct pfioc_if *)addr;
+
+ if (pi->ifname[0] == 0) {
+ bzero(V_pf_status.ifname, IFNAMSIZ);
+ break;
+ }
+ PF_RULES_WLOCK();
+ strlcpy(V_pf_status.ifname, pi->ifname, IFNAMSIZ);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCCLRSTATUS: {
+ PF_RULES_WLOCK();
+ for (int i = 0; i < PFRES_MAX; i++)
+ counter_u64_zero(V_pf_status.counters[i]);
+ for (int i = 0; i < FCNT_MAX; i++)
+ counter_u64_zero(V_pf_status.fcounters[i]);
+ for (int i = 0; i < SCNT_MAX; i++)
+ counter_u64_zero(V_pf_status.scounters[i]);
+ V_pf_status.since = time_second;
+ if (*V_pf_status.ifname)
+ pfi_update_status(V_pf_status.ifname, NULL);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCNATLOOK: {
+ struct pfioc_natlook *pnl = (struct pfioc_natlook *)addr;
+ struct pf_state_key *sk;
+ struct pf_state *state;
+ struct pf_state_key_cmp key;
+ int m = 0, direction = pnl->direction;
+ int sidx, didx;
+
+ /* NATLOOK src and dst are reversed, so reverse sidx/didx */
+ sidx = (direction == PF_IN) ? 1 : 0;
+ didx = (direction == PF_IN) ? 0 : 1;
+
+ if (!pnl->proto ||
+ PF_AZERO(&pnl->saddr, pnl->af) ||
+ PF_AZERO(&pnl->daddr, pnl->af) ||
+ ((pnl->proto == IPPROTO_TCP ||
+ pnl->proto == IPPROTO_UDP) &&
+ (!pnl->dport || !pnl->sport)))
+ error = EINVAL;
+ else {
+ bzero(&key, sizeof(key));
+ key.af = pnl->af;
+ key.proto = pnl->proto;
+ PF_ACPY(&key.addr[sidx], &pnl->saddr, pnl->af);
+ key.port[sidx] = pnl->sport;
+ PF_ACPY(&key.addr[didx], &pnl->daddr, pnl->af);
+ key.port[didx] = pnl->dport;
+
+ state = pf_find_state_all(&key, direction, &m);
+
+ if (m > 1)
+ error = E2BIG; /* more than one state */
+ else if (state != NULL) {
+ /* XXXGL: not locked read */
+ sk = state->key[sidx];
+ PF_ACPY(&pnl->rsaddr, &sk->addr[sidx], sk->af);
+ pnl->rsport = sk->port[sidx];
+ PF_ACPY(&pnl->rdaddr, &sk->addr[didx], sk->af);
+ pnl->rdport = sk->port[didx];
+ } else
+ error = ENOENT;
+ }
+ break;
+ }
+
+ case DIOCSETTIMEOUT: {
+ struct pfioc_tm *pt = (struct pfioc_tm *)addr;
+ int old;
+
+ if (pt->timeout < 0 || pt->timeout >= PFTM_MAX ||
+ pt->seconds < 0) {
+ error = EINVAL;
+ break;
+ }
+ PF_RULES_WLOCK();
+ old = V_pf_default_rule.timeout[pt->timeout];
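+		/*
+		 * Never allow a zero purge interval; if the interval was
+		 * shortened, wake the purge thread so it takes effect sooner.
+		 */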
+ if (pt->timeout == PFTM_INTERVAL && pt->seconds == 0)
+ pt->seconds = 1;
+ V_pf_default_rule.timeout[pt->timeout] = pt->seconds;
+ if (pt->timeout == PFTM_INTERVAL && pt->seconds < old)
+ wakeup(pf_purge_thread);
+ pt->seconds = old;
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCGETTIMEOUT: {
+ struct pfioc_tm *pt = (struct pfioc_tm *)addr;
+
+ if (pt->timeout < 0 || pt->timeout >= PFTM_MAX) {
+ error = EINVAL;
+ break;
+ }
+ PF_RULES_RLOCK();
+ pt->seconds = V_pf_default_rule.timeout[pt->timeout];
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCGETLIMIT: {
+ struct pfioc_limit *pl = (struct pfioc_limit *)addr;
+
+ if (pl->index < 0 || pl->index >= PF_LIMIT_MAX) {
+ error = EINVAL;
+ break;
+ }
+ PF_RULES_RLOCK();
+ pl->limit = V_pf_limits[pl->index].limit;
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCSETLIMIT: {
+ struct pfioc_limit *pl = (struct pfioc_limit *)addr;
+ int old_limit;
+
+ PF_RULES_WLOCK();
+ if (pl->index < 0 || pl->index >= PF_LIMIT_MAX ||
+ V_pf_limits[pl->index].zone == NULL) {
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ uma_zone_set_max(V_pf_limits[pl->index].zone, pl->limit);
+ old_limit = V_pf_limits[pl->index].limit;
+ V_pf_limits[pl->index].limit = pl->limit;
+ pl->limit = old_limit;
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCSETDEBUG: {
+ u_int32_t *level = (u_int32_t *)addr;
+
+ PF_RULES_WLOCK();
+ V_pf_status.debug = *level;
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCCLRRULECTRS: {
+ /* obsoleted by DIOCGETRULE with action=PF_GET_CLR_CNTR */
+ struct pf_ruleset *ruleset = &pf_main_ruleset;
+ struct pf_rule *rule;
+
+ PF_RULES_WLOCK();
+ TAILQ_FOREACH(rule,
+ ruleset->rules[PF_RULESET_FILTER].active.ptr, entries) {
+ rule->evaluations = 0;
+ rule->packets[0] = rule->packets[1] = 0;
+ rule->bytes[0] = rule->bytes[1] = 0;
+ }
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCGIFSPEED: {
+ struct pf_ifspeed *psp = (struct pf_ifspeed *)addr;
+ struct pf_ifspeed ps;
+ struct ifnet *ifp;
+
+ if (psp->ifname[0] != 0) {
+ /* Can we completely trust user-land? */
+ strlcpy(ps.ifname, psp->ifname, IFNAMSIZ);
+ ifp = ifunit(ps.ifname);
+ if (ifp != NULL)
+ psp->baudrate = ifp->if_baudrate;
+ else
+ error = EINVAL;
+ } else
+ error = EINVAL;
+ break;
+ }
+
+#ifdef ALTQ
+ case DIOCSTARTALTQ: {
+ struct pf_altq *altq;
+
+ PF_RULES_WLOCK();
+ /* enable all altq interfaces on active list */
+ TAILQ_FOREACH(altq, V_pf_altqs_active, entries) {
+ if (altq->qname[0] == 0 && (altq->local_flags &
+ PFALTQ_FLAG_IF_REMOVED) == 0) {
+ error = pf_enable_altq(altq);
+ if (error != 0)
+ break;
+ }
+ }
+ if (error == 0)
+ V_pf_altq_running = 1;
+ PF_RULES_WUNLOCK();
+ DPFPRINTF(PF_DEBUG_MISC, ("altq: started\n"));
+ break;
+ }
+
+ case DIOCSTOPALTQ: {
+ struct pf_altq *altq;
+
+ PF_RULES_WLOCK();
+ /* disable all altq interfaces on active list */
+ TAILQ_FOREACH(altq, V_pf_altqs_active, entries) {
+ if (altq->qname[0] == 0 && (altq->local_flags &
+ PFALTQ_FLAG_IF_REMOVED) == 0) {
+ error = pf_disable_altq(altq);
+ if (error != 0)
+ break;
+ }
+ }
+ if (error == 0)
+ V_pf_altq_running = 0;
+ PF_RULES_WUNLOCK();
+ DPFPRINTF(PF_DEBUG_MISC, ("altq: stopped\n"));
+ break;
+ }
+
+ case DIOCADDALTQ: {
+ struct pfioc_altq *pa = (struct pfioc_altq *)addr;
+ struct pf_altq *altq, *a;
+ struct ifnet *ifp;
+
+ altq = malloc(sizeof(*altq), M_PFALTQ, M_WAITOK);
+ bcopy(&pa->altq, altq, sizeof(struct pf_altq));
+ altq->local_flags = 0;
+
+ PF_RULES_WLOCK();
+ if (pa->ticket != V_ticket_altqs_inactive) {
+ PF_RULES_WUNLOCK();
+ free(altq, M_PFALTQ);
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * if this is for a queue, find the discipline and
+ * copy the necessary fields
+ */
+ if (altq->qname[0] != 0) {
+ if ((altq->qid = pf_qname2qid(altq->qname)) == 0) {
+ PF_RULES_WUNLOCK();
+ error = EBUSY;
+ free(altq, M_PFALTQ);
+ break;
+ }
+ altq->altq_disc = NULL;
+ TAILQ_FOREACH(a, V_pf_altqs_inactive, entries) {
+ if (strncmp(a->ifname, altq->ifname,
+ IFNAMSIZ) == 0 && a->qname[0] == 0) {
+ altq->altq_disc = a->altq_disc;
+ break;
+ }
+ }
+ }
+
+ if ((ifp = ifunit(altq->ifname)) == NULL)
+ altq->local_flags |= PFALTQ_FLAG_IF_REMOVED;
+ else
+ error = altq_add(altq);
+
+ if (error) {
+ PF_RULES_WUNLOCK();
+ free(altq, M_PFALTQ);
+ break;
+ }
+
+ TAILQ_INSERT_TAIL(V_pf_altqs_inactive, altq, entries);
+ bcopy(altq, &pa->altq, sizeof(struct pf_altq));
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCGETALTQS: {
+ struct pfioc_altq *pa = (struct pfioc_altq *)addr;
+ struct pf_altq *altq;
+
+ PF_RULES_RLOCK();
+ pa->nr = 0;
+ TAILQ_FOREACH(altq, V_pf_altqs_active, entries)
+ pa->nr++;
+ pa->ticket = V_ticket_altqs_active;
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCGETALTQ: {
+ struct pfioc_altq *pa = (struct pfioc_altq *)addr;
+ struct pf_altq *altq;
+ u_int32_t nr;
+
+ PF_RULES_RLOCK();
+ if (pa->ticket != V_ticket_altqs_active) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ nr = 0;
+ altq = TAILQ_FIRST(V_pf_altqs_active);
+ while ((altq != NULL) && (nr < pa->nr)) {
+ altq = TAILQ_NEXT(altq, entries);
+ nr++;
+ }
+ if (altq == NULL) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ bcopy(altq, &pa->altq, sizeof(struct pf_altq));
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCCHANGEALTQ:
+ /* CHANGEALTQ not supported yet! */
+ error = ENODEV;
+ break;
+
+ case DIOCGETQSTATS: {
+ struct pfioc_qstats *pq = (struct pfioc_qstats *)addr;
+ struct pf_altq *altq;
+ u_int32_t nr;
+ int nbytes;
+
+ PF_RULES_RLOCK();
+ if (pq->ticket != V_ticket_altqs_active) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ nbytes = pq->nbytes;
+ nr = 0;
+ altq = TAILQ_FIRST(V_pf_altqs_active);
+ while ((altq != NULL) && (nr < pq->nr)) {
+ altq = TAILQ_NEXT(altq, entries);
+ nr++;
+ }
+ if (altq == NULL) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+
+ if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) {
+ PF_RULES_RUNLOCK();
+ error = ENXIO;
+ break;
+ }
+ PF_RULES_RUNLOCK();
+ error = altq_getqstats(altq, pq->buf, &nbytes);
+ if (error == 0) {
+ pq->scheduler = altq->scheduler;
+ pq->nbytes = nbytes;
+ }
+ break;
+ }
+#endif /* ALTQ */
+
+ case DIOCBEGINADDRS: {
+ struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr;
+
+ PF_RULES_WLOCK();
+ pf_empty_pool(&V_pf_pabuf);
+ pp->ticket = ++V_ticket_pabuf;
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCADDADDR: {
+ struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr;
+ struct pf_pooladdr *pa;
+ struct pfi_kif *kif = NULL;
+
+#ifndef INET
+ if (pp->af == AF_INET) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET */
+#ifndef INET6
+ if (pp->af == AF_INET6) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET6 */
+ if (pp->addr.addr.type != PF_ADDR_ADDRMASK &&
+ pp->addr.addr.type != PF_ADDR_DYNIFTL &&
+ pp->addr.addr.type != PF_ADDR_TABLE) {
+ error = EINVAL;
+ break;
+ }
+ pa = malloc(sizeof(*pa), M_PFRULE, M_WAITOK);
+ bcopy(&pp->addr, pa, sizeof(struct pf_pooladdr));
+ if (pa->ifname[0])
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+ PF_RULES_WLOCK();
+ if (pp->ticket != V_ticket_pabuf) {
+ PF_RULES_WUNLOCK();
+ if (pa->ifname[0])
+ free(kif, PFI_MTYPE);
+ free(pa, M_PFRULE);
+ error = EBUSY;
+ break;
+ }
+ if (pa->ifname[0]) {
+ pa->kif = pfi_kif_attach(kif, pa->ifname);
+ pfi_kif_ref(pa->kif);
+ } else
+ pa->kif = NULL;
+ if (pa->addr.type == PF_ADDR_DYNIFTL && ((error =
+ pfi_dynaddr_setup(&pa->addr, pp->af)) != 0)) {
+ if (pa->ifname[0])
+ pfi_kif_unref(pa->kif);
+ PF_RULES_WUNLOCK();
+ free(pa, M_PFRULE);
+ break;
+ }
+ TAILQ_INSERT_TAIL(&V_pf_pabuf, pa, entries);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCGETADDRS: {
+ struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr;
+ struct pf_pool *pool;
+ struct pf_pooladdr *pa;
+
+ PF_RULES_RLOCK();
+ pp->nr = 0;
+ pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action,
+ pp->r_num, 0, 1, 0);
+ if (pool == NULL) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ TAILQ_FOREACH(pa, &pool->list, entries)
+ pp->nr++;
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCGETADDR: {
+ struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr;
+ struct pf_pool *pool;
+ struct pf_pooladdr *pa;
+ u_int32_t nr = 0;
+
+ PF_RULES_RLOCK();
+ pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action,
+ pp->r_num, 0, 1, 1);
+ if (pool == NULL) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ pa = TAILQ_FIRST(&pool->list);
+ while ((pa != NULL) && (nr < pp->nr)) {
+ pa = TAILQ_NEXT(pa, entries);
+ nr++;
+ }
+ if (pa == NULL) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ bcopy(pa, &pp->addr, sizeof(struct pf_pooladdr));
+ pf_addr_copyout(&pp->addr.addr);
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCCHANGEADDR: {
+ struct pfioc_pooladdr *pca = (struct pfioc_pooladdr *)addr;
+ struct pf_pool *pool;
+ struct pf_pooladdr *oldpa = NULL, *newpa = NULL;
+ struct pf_ruleset *ruleset;
+ struct pfi_kif *kif = NULL;
+
+ if (pca->action < PF_CHANGE_ADD_HEAD ||
+ pca->action > PF_CHANGE_REMOVE) {
+ error = EINVAL;
+ break;
+ }
+ if (pca->addr.addr.type != PF_ADDR_ADDRMASK &&
+ pca->addr.addr.type != PF_ADDR_DYNIFTL &&
+ pca->addr.addr.type != PF_ADDR_TABLE) {
+ error = EINVAL;
+ break;
+ }
+
+ if (pca->action != PF_CHANGE_REMOVE) {
+#ifndef INET
+ if (pca->af == AF_INET) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET */
+#ifndef INET6
+ if (pca->af == AF_INET6) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET6 */
+ newpa = malloc(sizeof(*newpa), M_PFRULE, M_WAITOK);
+ bcopy(&pca->addr, newpa, sizeof(struct pf_pooladdr));
+ if (newpa->ifname[0])
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+ newpa->kif = NULL;
+ }
+
+#define ERROUT(x) { error = (x); goto DIOCCHANGEADDR_error; }
+ PF_RULES_WLOCK();
+ ruleset = pf_find_ruleset(pca->anchor);
+ if (ruleset == NULL)
+ ERROUT(EBUSY);
+
+ pool = pf_get_pool(pca->anchor, pca->ticket, pca->r_action,
+ pca->r_num, pca->r_last, 1, 1);
+ if (pool == NULL)
+ ERROUT(EBUSY);
+
+ if (pca->action != PF_CHANGE_REMOVE) {
+ if (newpa->ifname[0]) {
+ newpa->kif = pfi_kif_attach(kif, newpa->ifname);
+ pfi_kif_ref(newpa->kif);
+ kif = NULL;
+ }
+
+ switch (newpa->addr.type) {
+ case PF_ADDR_DYNIFTL:
+ error = pfi_dynaddr_setup(&newpa->addr,
+ pca->af);
+ break;
+ case PF_ADDR_TABLE:
+ newpa->addr.p.tbl = pfr_attach_table(ruleset,
+ newpa->addr.v.tblname);
+ if (newpa->addr.p.tbl == NULL)
+ error = ENOMEM;
+ break;
+ }
+ if (error)
+ goto DIOCCHANGEADDR_error;
+ }
+
+ switch (pca->action) {
+ case PF_CHANGE_ADD_HEAD:
+ oldpa = TAILQ_FIRST(&pool->list);
+ break;
+ case PF_CHANGE_ADD_TAIL:
+ oldpa = TAILQ_LAST(&pool->list, pf_palist);
+ break;
+ default:
+ oldpa = TAILQ_FIRST(&pool->list);
+ for (int i = 0; oldpa && i < pca->nr; i++)
+ oldpa = TAILQ_NEXT(oldpa, entries);
+
+ if (oldpa == NULL)
+ ERROUT(EINVAL);
+ }
+
+ if (pca->action == PF_CHANGE_REMOVE) {
+ TAILQ_REMOVE(&pool->list, oldpa, entries);
+ switch (oldpa->addr.type) {
+ case PF_ADDR_DYNIFTL:
+ pfi_dynaddr_remove(oldpa->addr.p.dyn);
+ break;
+ case PF_ADDR_TABLE:
+ pfr_detach_table(oldpa->addr.p.tbl);
+ break;
+ }
+ if (oldpa->kif)
+ pfi_kif_unref(oldpa->kif);
+ free(oldpa, M_PFRULE);
+ } else {
+ if (oldpa == NULL)
+ TAILQ_INSERT_TAIL(&pool->list, newpa, entries);
+ else if (pca->action == PF_CHANGE_ADD_HEAD ||
+ pca->action == PF_CHANGE_ADD_BEFORE)
+ TAILQ_INSERT_BEFORE(oldpa, newpa, entries);
+ else
+ TAILQ_INSERT_AFTER(&pool->list, oldpa,
+ newpa, entries);
+ }
+
+ pool->cur = TAILQ_FIRST(&pool->list);
+ PF_ACPY(&pool->counter, &pool->cur->addr.v.a.addr, pca->af);
+ PF_RULES_WUNLOCK();
+ break;
+
+#undef ERROUT
+DIOCCHANGEADDR_error:
+ if (newpa != NULL && newpa->kif)
+ pfi_kif_unref(newpa->kif);
+ PF_RULES_WUNLOCK();
+ if (newpa != NULL)
+ free(newpa, M_PFRULE);
+ if (kif != NULL)
+ free(kif, PFI_MTYPE);
+ break;
+ }
+
+ case DIOCGETRULESETS: {
+ struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr;
+ struct pf_ruleset *ruleset;
+ struct pf_anchor *anchor;
+
+ PF_RULES_RLOCK();
+ pr->path[sizeof(pr->path) - 1] = 0;
+ if ((ruleset = pf_find_ruleset(pr->path)) == NULL) {
+ PF_RULES_RUNLOCK();
+ error = ENOENT;
+ break;
+ }
+ pr->nr = 0;
+ if (ruleset->anchor == NULL) {
+ /* XXX kludge for pf_main_ruleset */
+ RB_FOREACH(anchor, pf_anchor_global, &V_pf_anchors)
+ if (anchor->parent == NULL)
+ pr->nr++;
+ } else {
+ RB_FOREACH(anchor, pf_anchor_node,
+ &ruleset->anchor->children)
+ pr->nr++;
+ }
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCGETRULESET: {
+ struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr;
+ struct pf_ruleset *ruleset;
+ struct pf_anchor *anchor;
+ u_int32_t nr = 0;
+
+ PF_RULES_RLOCK();
+ pr->path[sizeof(pr->path) - 1] = 0;
+ if ((ruleset = pf_find_ruleset(pr->path)) == NULL) {
+ PF_RULES_RUNLOCK();
+ error = ENOENT;
+ break;
+ }
+ pr->name[0] = 0;
+ if (ruleset->anchor == NULL) {
+ /* XXX kludge for pf_main_ruleset */
+ RB_FOREACH(anchor, pf_anchor_global, &V_pf_anchors)
+ if (anchor->parent == NULL && nr++ == pr->nr) {
+ strlcpy(pr->name, anchor->name,
+ sizeof(pr->name));
+ break;
+ }
+ } else {
+ RB_FOREACH(anchor, pf_anchor_node,
+ &ruleset->anchor->children)
+ if (nr++ == pr->nr) {
+ strlcpy(pr->name, anchor->name,
+ sizeof(pr->name));
+ break;
+ }
+ }
+ if (!pr->name[0])
+ error = EBUSY;
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCRCLRTABLES: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+
+ if (io->pfrio_esize != 0) {
+ error = ENODEV;
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel,
+ io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCRADDTABLES: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_table *pfrts;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_table)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_table);
+ pfrts = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfrts, totlen);
+ if (error) {
+ free(pfrts, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_add_tables(pfrts, io->pfrio_size,
+ &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ free(pfrts, M_TEMP);
+ break;
+ }
+
+ case DIOCRDELTABLES: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_table *pfrts;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_table)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_table);
+ pfrts = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfrts, totlen);
+ if (error) {
+ free(pfrts, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_del_tables(pfrts, io->pfrio_size,
+ &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ free(pfrts, M_TEMP);
+ break;
+ }
+
+ case DIOCRGETTABLES: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_table *pfrts;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_table)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_table);
+ pfrts = malloc(totlen, M_TEMP, M_WAITOK);
+ PF_RULES_RLOCK();
+ error = pfr_get_tables(&io->pfrio_table, pfrts,
+ &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_RUNLOCK();
+ if (error == 0)
+ error = copyout(pfrts, io->pfrio_buffer, totlen);
+ free(pfrts, M_TEMP);
+ break;
+ }
+
+ case DIOCRGETTSTATS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_tstats *pfrtstats;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_tstats)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_tstats);
+ pfrtstats = malloc(totlen, M_TEMP, M_WAITOK);
+ PF_RULES_WLOCK();
+ error = pfr_get_tstats(&io->pfrio_table, pfrtstats,
+ &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ if (error == 0)
+ error = copyout(pfrtstats, io->pfrio_buffer, totlen);
+ free(pfrtstats, M_TEMP);
+ break;
+ }
+
+ case DIOCRCLRTSTATS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_table *pfrts;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_table)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_table);
+ pfrts = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfrts, totlen);
+ if (error) {
+ free(pfrts, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_clr_tstats(pfrts, io->pfrio_size,
+ &io->pfrio_nzero, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ free(pfrts, M_TEMP);
+ break;
+ }
+
+ case DIOCRSETTFLAGS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_table *pfrts;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_table)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_table);
+ pfrts = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfrts, totlen);
+ if (error) {
+ free(pfrts, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_set_tflags(pfrts, io->pfrio_size,
+ io->pfrio_setflag, io->pfrio_clrflag, &io->pfrio_nchange,
+ &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ free(pfrts, M_TEMP);
+ break;
+ }
+
+ case DIOCRCLRADDRS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+
+ if (io->pfrio_esize != 0) {
+ error = ENODEV;
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel,
+ io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCRADDADDRS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfras, totlen);
+ if (error) {
+ free(pfras, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_add_addrs(&io->pfrio_table, pfras,
+ io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags |
+ PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK)
+ error = copyout(pfras, io->pfrio_buffer, totlen);
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCRDELADDRS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfras, totlen);
+ if (error) {
+ free(pfras, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_del_addrs(&io->pfrio_table, pfras,
+ io->pfrio_size, &io->pfrio_ndel, io->pfrio_flags |
+ PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK)
+ error = copyout(pfras, io->pfrio_buffer, totlen);
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCRSETADDRS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen, count;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ count = max(io->pfrio_size, io->pfrio_size2);
+ totlen = count * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfras, totlen);
+ if (error) {
+ free(pfras, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_set_addrs(&io->pfrio_table, pfras,
+ io->pfrio_size, &io->pfrio_size2, &io->pfrio_nadd,
+ &io->pfrio_ndel, &io->pfrio_nchange, io->pfrio_flags |
+ PFR_FLAG_USERIOCTL, 0);
+ PF_RULES_WUNLOCK();
+ if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK)
+ error = copyout(pfras, io->pfrio_buffer, totlen);
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCRGETADDRS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ PF_RULES_RLOCK();
+ error = pfr_get_addrs(&io->pfrio_table, pfras,
+ &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_RUNLOCK();
+ if (error == 0)
+ error = copyout(pfras, io->pfrio_buffer, totlen);
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCRGETASTATS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_astats *pfrastats;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_astats)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_astats);
+ pfrastats = malloc(totlen, M_TEMP, M_WAITOK);
+ PF_RULES_RLOCK();
+ error = pfr_get_astats(&io->pfrio_table, pfrastats,
+ &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_RUNLOCK();
+ if (error == 0)
+ error = copyout(pfrastats, io->pfrio_buffer, totlen);
+ free(pfrastats, M_TEMP);
+ break;
+ }
+
+ case DIOCRCLRASTATS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfras, totlen);
+ if (error) {
+ free(pfras, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_clr_astats(&io->pfrio_table, pfras,
+ io->pfrio_size, &io->pfrio_nzero, io->pfrio_flags |
+ PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK)
+ error = copyout(pfras, io->pfrio_buffer, totlen);
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCRTSTADDRS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfras, totlen);
+ if (error) {
+ free(pfras, M_TEMP);
+ break;
+ }
+ PF_RULES_RLOCK();
+ error = pfr_tst_addrs(&io->pfrio_table, pfras,
+ io->pfrio_size, &io->pfrio_nmatch, io->pfrio_flags |
+ PFR_FLAG_USERIOCTL);
+ PF_RULES_RUNLOCK();
+ if (error == 0)
+ error = copyout(pfras, io->pfrio_buffer, totlen);
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCRINADEFINE: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfras, totlen);
+ if (error) {
+ free(pfras, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_ina_define(&io->pfrio_table, pfras,
+ io->pfrio_size, &io->pfrio_nadd, &io->pfrio_naddr,
+ io->pfrio_ticket, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCOSFPADD: {
+ struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr;
+ PF_RULES_WLOCK();
+ error = pf_osfp_add(io);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCOSFPGET: {
+ struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr;
+ PF_RULES_RLOCK();
+ error = pf_osfp_get(io);
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCXBEGIN: {
+ struct pfioc_trans *io = (struct pfioc_trans *)addr;
+ struct pfioc_trans_e *ioes, *ioe;
+ size_t totlen;
+ int i;
+
+ if (io->esize != sizeof(*ioe)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = sizeof(struct pfioc_trans_e) * io->size;
+ ioes = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->array, ioes, totlen);
+ if (error) {
+ free(ioes, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
+ switch (ioe->rs_num) {
+#ifdef ALTQ
+ case PF_RULESET_ALTQ:
+ if (ioe->anchor[0]) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EINVAL;
+ goto fail;
+ }
+ if ((error = pf_begin_altq(&ioe->ticket))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail;
+ }
+ break;
+#endif /* ALTQ */
+ case PF_RULESET_TABLE:
+ {
+ struct pfr_table table;
+
+ bzero(&table, sizeof(table));
+ strlcpy(table.pfrt_anchor, ioe->anchor,
+ sizeof(table.pfrt_anchor));
+ if ((error = pfr_ina_begin(&table,
+ &ioe->ticket, NULL, 0))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail;
+ }
+ break;
+ }
+ default:
+ if ((error = pf_begin_rules(&ioe->ticket,
+ ioe->rs_num, ioe->anchor))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail;
+ }
+ break;
+ }
+ }
+ PF_RULES_WUNLOCK();
+ error = copyout(ioes, io->array, totlen);
+ free(ioes, M_TEMP);
+ break;
+ }
+
+ case DIOCXROLLBACK: {
+ struct pfioc_trans *io = (struct pfioc_trans *)addr;
+ struct pfioc_trans_e *ioe, *ioes;
+ size_t totlen;
+ int i;
+
+ if (io->esize != sizeof(*ioe)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = sizeof(struct pfioc_trans_e) * io->size;
+ ioes = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->array, ioes, totlen);
+ if (error) {
+ free(ioes, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
+ switch (ioe->rs_num) {
+#ifdef ALTQ
+ case PF_RULESET_ALTQ:
+ if (ioe->anchor[0]) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EINVAL;
+ goto fail;
+ }
+ if ((error = pf_rollback_altq(ioe->ticket))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail; /* really bad */
+ }
+ break;
+#endif /* ALTQ */
+ case PF_RULESET_TABLE:
+ {
+ struct pfr_table table;
+
+ bzero(&table, sizeof(table));
+ strlcpy(table.pfrt_anchor, ioe->anchor,
+ sizeof(table.pfrt_anchor));
+ if ((error = pfr_ina_rollback(&table,
+ ioe->ticket, NULL, 0))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail; /* really bad */
+ }
+ break;
+ }
+ default:
+ if ((error = pf_rollback_rules(ioe->ticket,
+ ioe->rs_num, ioe->anchor))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail; /* really bad */
+ }
+ break;
+ }
+ }
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ break;
+ }
+
+ case DIOCXCOMMIT: {
+ struct pfioc_trans *io = (struct pfioc_trans *)addr;
+ struct pfioc_trans_e *ioe, *ioes;
+ struct pf_ruleset *rs;
+ size_t totlen;
+ int i;
+
+ if (io->esize != sizeof(*ioe)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = sizeof(struct pfioc_trans_e) * io->size;
+ ioes = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->array, ioes, totlen);
+ if (error) {
+ free(ioes, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ /* First make sure everything will succeed. */
+ for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
+ switch (ioe->rs_num) {
+#ifdef ALTQ
+ case PF_RULESET_ALTQ:
+ if (ioe->anchor[0]) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EINVAL;
+ goto fail;
+ }
+ if (!V_altqs_inactive_open || ioe->ticket !=
+ V_ticket_altqs_inactive) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EBUSY;
+ goto fail;
+ }
+ break;
+#endif /* ALTQ */
+ case PF_RULESET_TABLE:
+ rs = pf_find_ruleset(ioe->anchor);
+ if (rs == NULL || !rs->topen || ioe->ticket !=
+ rs->tticket) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EBUSY;
+ goto fail;
+ }
+ break;
+ default:
+ if (ioe->rs_num < 0 || ioe->rs_num >=
+ PF_RULESET_MAX) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EINVAL;
+ goto fail;
+ }
+ rs = pf_find_ruleset(ioe->anchor);
+ if (rs == NULL ||
+ !rs->rules[ioe->rs_num].inactive.open ||
+ rs->rules[ioe->rs_num].inactive.ticket !=
+ ioe->ticket) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EBUSY;
+ goto fail;
+ }
+ break;
+ }
+ }
+ /* Now do the commit - no errors should happen here. */
+ for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
+ switch (ioe->rs_num) {
+#ifdef ALTQ
+ case PF_RULESET_ALTQ:
+ if ((error = pf_commit_altq(ioe->ticket))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail; /* really bad */
+ }
+ break;
+#endif /* ALTQ */
+ case PF_RULESET_TABLE:
+ {
+ struct pfr_table table;
+
+ bzero(&table, sizeof(table));
+ strlcpy(table.pfrt_anchor, ioe->anchor,
+ sizeof(table.pfrt_anchor));
+ if ((error = pfr_ina_commit(&table,
+ ioe->ticket, NULL, NULL, 0))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail; /* really bad */
+ }
+ break;
+ }
+ default:
+ if ((error = pf_commit_rules(ioe->ticket,
+ ioe->rs_num, ioe->anchor))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail; /* really bad */
+ }
+ break;
+ }
+ }
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ break;
+ }
+
+ case DIOCGETSRCNODES: {
+ struct pfioc_src_nodes *psn = (struct pfioc_src_nodes *)addr;
+ struct pf_srchash *sh;
+ struct pf_src_node *n, *p, *pstore;
+ uint32_t i, nr = 0;
+
+ if (psn->psn_len == 0) {
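+ /* Caller is only asking for the buffer size it needs to allocate. */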
+ for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask;
+ i++, sh++) {
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH(n, &sh->nodes, entry)
+ nr++;
+ PF_HASHROW_UNLOCK(sh);
+ }
+ psn->psn_len = sizeof(struct pf_src_node) * nr;
+ break;
+ }
+
+ p = pstore = malloc(psn->psn_len, M_TEMP, M_WAITOK);
+ for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask;
+ i++, sh++) {
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH(n, &sh->nodes, entry) {
+ int secs = time_uptime, diff;
+
+ if ((nr + 1) * sizeof(*p) > (unsigned)psn->psn_len)
+ break;
+
+ bcopy(n, p, sizeof(struct pf_src_node));
+ if (n->rule.ptr != NULL)
+ p->rule.nr = n->rule.ptr->nr;
+ p->creation = secs - p->creation;
+ if (p->expire > secs)
+ p->expire -= secs;
+ else
+ p->expire = 0;
+
+ /* Adjust the connection rate estimate. */
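+ /* (decay the count by the fraction of the rate window already elapsed) */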
+ diff = secs - n->conn_rate.last;
+ if (diff >= n->conn_rate.seconds)
+ p->conn_rate.count = 0;
+ else
+ p->conn_rate.count -=
+ n->conn_rate.count * diff /
+ n->conn_rate.seconds;
+ p++;
+ nr++;
+ }
+ PF_HASHROW_UNLOCK(sh);
+ }
+ error = copyout(pstore, psn->psn_src_nodes,
+ sizeof(struct pf_src_node) * nr);
+ if (error) {
+ free(pstore, M_TEMP);
+ break;
+ }
+ psn->psn_len = sizeof(struct pf_src_node) * nr;
+ free(pstore, M_TEMP);
+ break;
+ }
+
+ case DIOCCLRSRCNODES: {
+
+ pf_clear_srcnodes(NULL);
+ pf_purge_expired_src_nodes();
+ break;
+ }
+
+ case DIOCKILLSRCNODES:
+ pf_kill_srcnodes((struct pfioc_src_node_kill *)addr);
+ break;
+
+ case DIOCSETHOSTID: {
+ u_int32_t *hostid = (u_int32_t *)addr;
+
+ PF_RULES_WLOCK();
+ if (*hostid == 0)
+ V_pf_status.hostid = arc4random();
+ else
+ V_pf_status.hostid = *hostid;
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCOSFPFLUSH:
+ PF_RULES_WLOCK();
+ pf_osfp_flush();
+ PF_RULES_WUNLOCK();
+ break;
+
+ case DIOCIGETIFACES: {
+ struct pfioc_iface *io = (struct pfioc_iface *)addr;
+ struct pfi_kif *ifstore;
+ size_t bufsiz;
+
+ if (io->pfiio_esize != sizeof(struct pfi_kif)) {
+ error = ENODEV;
+ break;
+ }
+
+ bufsiz = io->pfiio_size * sizeof(struct pfi_kif);
+ ifstore = malloc(bufsiz, M_TEMP, M_WAITOK);
+ PF_RULES_RLOCK();
+ pfi_get_ifaces(io->pfiio_name, ifstore, &io->pfiio_size);
+ PF_RULES_RUNLOCK();
+ error = copyout(ifstore, io->pfiio_buffer, bufsiz);
+ free(ifstore, M_TEMP);
+ break;
+ }
+
+ case DIOCSETIFFLAG: {
+ struct pfioc_iface *io = (struct pfioc_iface *)addr;
+
+ PF_RULES_WLOCK();
+ error = pfi_set_flags(io->pfiio_name, io->pfiio_flags);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCCLRIFFLAG: {
+ struct pfioc_iface *io = (struct pfioc_iface *)addr;
+
+ PF_RULES_WLOCK();
+ error = pfi_clear_flags(io->pfiio_name, io->pfiio_flags);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ default:
+ error = ENODEV;
+ break;
+ }
+fail:
+ if (sx_xlocked(&pf_ioctl_lock))
+ sx_xunlock(&pf_ioctl_lock);
+ CURVNET_RESTORE();
+
+ return (error);
+}
+
+void
+pfsync_state_export(struct pfsync_state *sp, struct pf_state *st)
+{
+ bzero(sp, sizeof(struct pfsync_state));
+
+ /* copy from state key */
+ sp->key[PF_SK_WIRE].addr[0] = st->key[PF_SK_WIRE]->addr[0];
+ sp->key[PF_SK_WIRE].addr[1] = st->key[PF_SK_WIRE]->addr[1];
+ sp->key[PF_SK_WIRE].port[0] = st->key[PF_SK_WIRE]->port[0];
+ sp->key[PF_SK_WIRE].port[1] = st->key[PF_SK_WIRE]->port[1];
+ sp->key[PF_SK_STACK].addr[0] = st->key[PF_SK_STACK]->addr[0];
+ sp->key[PF_SK_STACK].addr[1] = st->key[PF_SK_STACK]->addr[1];
+ sp->key[PF_SK_STACK].port[0] = st->key[PF_SK_STACK]->port[0];
+ sp->key[PF_SK_STACK].port[1] = st->key[PF_SK_STACK]->port[1];
+ sp->proto = st->key[PF_SK_WIRE]->proto;
+ sp->af = st->key[PF_SK_WIRE]->af;
+
+ /* copy from state */
+ strlcpy(sp->ifname, st->kif->pfik_name, sizeof(sp->ifname));
+ bcopy(&st->rt_addr, &sp->rt_addr, sizeof(sp->rt_addr));
+ sp->creation = htonl(time_uptime - st->creation);
+ sp->expire = pf_state_expires(st);
+ if (sp->expire <= time_uptime)
+ sp->expire = htonl(0);
+ else
+ sp->expire = htonl(sp->expire - time_uptime);
+
+ sp->direction = st->direction;
+ sp->log = st->log;
+ sp->timeout = st->timeout;
+ sp->state_flags = st->state_flags;
+ if (st->src_node)
+ sp->sync_flags |= PFSYNC_FLAG_SRCNODE;
+ if (st->nat_src_node)
+ sp->sync_flags |= PFSYNC_FLAG_NATSRCNODE;
+
+ sp->id = st->id;
+ sp->creatorid = st->creatorid;
+ pf_state_peer_hton(&st->src, &sp->src);
+ pf_state_peer_hton(&st->dst, &sp->dst);
+
+ if (st->rule.ptr == NULL)
+ sp->rule = htonl(-1);
+ else
+ sp->rule = htonl(st->rule.ptr->nr);
+ if (st->anchor.ptr == NULL)
+ sp->anchor = htonl(-1);
+ else
+ sp->anchor = htonl(st->anchor.ptr->nr);
+ if (st->nat_rule.ptr == NULL)
+ sp->nat_rule = htonl(-1);
+ else
+ sp->nat_rule = htonl(st->nat_rule.ptr->nr);
+
+ pf_state_counter_hton(st->packets[0], sp->packets[0]);
+ pf_state_counter_hton(st->packets[1], sp->packets[1]);
+ pf_state_counter_hton(st->bytes[0], sp->bytes[0]);
+ pf_state_counter_hton(st->bytes[1], sp->bytes[1]);
+}
+
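+/*
+ * Prepare a table address wrap for copyout to userland: clear the
+ * kernel table pointer and report the table's address count instead
+ * (-1 if the table is not active).
+ */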
+static void
+pf_tbladdr_copyout(struct pf_addr_wrap *aw)
+{
+ struct pfr_ktable *kt;
+
+ KASSERT(aw->type == PF_ADDR_TABLE, ("%s: type %u", __func__, aw->type));
+
+ kt = aw->p.tbl;
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
+ kt = kt->pfrkt_root;
+ aw->p.tbl = NULL;
+ aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ?
+ kt->pfrkt_cnt : -1;
+}
+
+/*
+ * XXX - Check for version mismatch!!!
+ */
+static void
+pf_clear_states(void)
+{
+ struct pf_state *s;
+ u_int i;
+
+ for (i = 0; i <= pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+relock:
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ s->timeout = PFTM_PURGE;
+ /* Don't send out individual delete messages. */
+ s->state_flags |= PFSTATE_NOSYNC;
+ pf_unlink_state(s, PF_ENTER_LOCKED);
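+ /* pf_unlink_state() drops the hash row lock, so restart the scan. */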
+ goto relock;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+}
+
+static int
+pf_clear_tables(void)
+{
+ struct pfioc_table io;
+ int error;
+
+ bzero(&io, sizeof(io));
+
+ error = pfr_clr_tables(&io.pfrio_table, &io.pfrio_ndel,
+ io.pfrio_flags);
+
+ return (error);
+}
+
+static void
+pf_clear_srcnodes(struct pf_src_node *n)
+{
+ struct pf_state *s;
+ int i;
+
+ for (i = 0; i <= pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ if (n == NULL || n == s->src_node)
+ s->src_node = NULL;
+ if (n == NULL || n == s->nat_src_node)
+ s->nat_src_node = NULL;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+
+ if (n == NULL) {
+ struct pf_srchash *sh;
+
+ for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask;
+ i++, sh++) {
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH(n, &sh->nodes, entry) {
+ n->expire = 1;
+ n->states = 0;
+ }
+ PF_HASHROW_UNLOCK(sh);
+ }
+ } else {
+ /* XXX: hash slot should already be locked here. */
+ n->expire = 1;
+ n->states = 0;
+ }
+}
+
+static void
+pf_kill_srcnodes(struct pfioc_src_node_kill *psnk)
+{
+ struct pf_src_node_list kill;
+
+ LIST_INIT(&kill);
+ for (int i = 0; i <= pf_srchashmask; i++) {
+ struct pf_srchash *sh = &V_pf_srchash[i];
+ struct pf_src_node *sn, *tmp;
+
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH_SAFE(sn, &sh->nodes, entry, tmp)
+ if (PF_MATCHA(psnk->psnk_src.neg,
+ &psnk->psnk_src.addr.v.a.addr,
+ &psnk->psnk_src.addr.v.a.mask,
+ &sn->addr, sn->af) &&
+ PF_MATCHA(psnk->psnk_dst.neg,
+ &psnk->psnk_dst.addr.v.a.addr,
+ &psnk->psnk_dst.addr.v.a.mask,
+ &sn->raddr, sn->af)) {
+ pf_unlink_src_node(sn);
+ LIST_INSERT_HEAD(&kill, sn, entry);
+ sn->expire = 1;
+ }
+ PF_HASHROW_UNLOCK(sh);
+ }
+
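+ /* Detach states from the source nodes marked for removal above. */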
+ for (int i = 0; i <= pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+ struct pf_state *s;
+
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ if (s->src_node && s->src_node->expire == 1)
+ s->src_node = NULL;
+ if (s->nat_src_node && s->nat_src_node->expire == 1)
+ s->nat_src_node = NULL;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+
+ psnk->psnk_killed = pf_free_src_nodes(&kill);
+}
+
+/*
+ * XXX - Check for version mismatch!!!
+ */
+
+/*
+ * Duplicate pfctl -Fa operation to get rid of as much as we can.
+ */
+static int
+shutdown_pf(void)
+{
+ int error = 0;
+ u_int32_t t[5];
+ char nn = '\0';
+
+ do {
+ if ((error = pf_begin_rules(&t[0], PF_RULESET_SCRUB, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: SCRUB\n"));
+ break;
+ }
+ if ((error = pf_begin_rules(&t[1], PF_RULESET_FILTER, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: FILTER\n"));
+ break; /* XXX: rollback? */
+ }
+ if ((error = pf_begin_rules(&t[2], PF_RULESET_NAT, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: NAT\n"));
+ break; /* XXX: rollback? */
+ }
+ if ((error = pf_begin_rules(&t[3], PF_RULESET_BINAT, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: BINAT\n"));
+ break; /* XXX: rollback? */
+ }
+ if ((error = pf_begin_rules(&t[4], PF_RULESET_RDR, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: RDR\n"));
+ break; /* XXX: rollback? */
+ }
+
+ /* XXX: these should always succeed here */
+ pf_commit_rules(t[0], PF_RULESET_SCRUB, &nn);
+ pf_commit_rules(t[1], PF_RULESET_FILTER, &nn);
+ pf_commit_rules(t[2], PF_RULESET_NAT, &nn);
+ pf_commit_rules(t[3], PF_RULESET_BINAT, &nn);
+ pf_commit_rules(t[4], PF_RULESET_RDR, &nn);
+
+ if ((error = pf_clear_tables()) != 0)
+ break;
+
+#ifdef ALTQ
+ if ((error = pf_begin_altq(&t[0])) != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: ALTQ\n"));
+ break;
+ }
+ pf_commit_altq(t[0]);
+#endif
+
+ pf_clear_states();
+
+ pf_clear_srcnodes(NULL);
+
+ /* status does not use malloced mem so no need to cleanup */
+ /* fingerprints and interfaces have their own cleanup code */
+
+ /* Free counters last as we updated them during shutdown. */
+ counter_u64_free(V_pf_default_rule.states_cur);
+ counter_u64_free(V_pf_default_rule.states_tot);
+ counter_u64_free(V_pf_default_rule.src_nodes);
+
+ for (int i = 0; i < PFRES_MAX; i++)
+ counter_u64_free(V_pf_status.counters[i]);
+ for (int i = 0; i < LCNT_MAX; i++)
+ counter_u64_free(V_pf_status.lcounters[i]);
+ for (int i = 0; i < FCNT_MAX; i++)
+ counter_u64_free(V_pf_status.fcounters[i]);
+ for (int i = 0; i < SCNT_MAX; i++)
+ counter_u64_free(V_pf_status.scounters[i]);
+ } while(0);
+
+ return (error);
+}
+
+#ifdef INET
+static int
+pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+ int chk;
+
+ chk = pf_test(PF_IN, ifp, m, inp);
+ if (chk && *m) {
+ m_freem(*m);
+ *m = NULL;
+ }
+
+ if (chk != PF_PASS)
+ return (EACCES);
+ return (0);
+}
+
+static int
+pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+ int chk;
+
+ chk = pf_test(PF_OUT, ifp, m, inp);
+ if (chk && *m) {
+ m_freem(*m);
+ *m = NULL;
+ }
+
+ if (chk != PF_PASS)
+ return (EACCES);
+ return (0);
+}
+#endif
+
+#ifdef INET6
+static int
+pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+ int chk;
+
+ /*
+ * In case of loopback traffic IPv6 uses the real interface in
+ * order to support scoped addresses. In order to support stateful
+ * filtering we have to change this to lo0, as is the case with IPv4.
+ */
+ CURVNET_SET(ifp->if_vnet);
+ chk = pf_test6(PF_IN, (*m)->m_flags & M_LOOP ? V_loif : ifp, m, inp);
+ CURVNET_RESTORE();
+ if (chk && *m) {
+ m_freem(*m);
+ *m = NULL;
+ }
+ if (chk != PF_PASS)
+ return (EACCES);
+ return (0);
+}
+
+static int
+pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+ int chk;
+
+ CURVNET_SET(ifp->if_vnet);
+ chk = pf_test6(PF_OUT, ifp, m, inp);
+ CURVNET_RESTORE();
+ if (chk && *m) {
+ m_freem(*m);
+ *m = NULL;
+ }
+ if (chk != PF_PASS)
+ return (EACCES);
+ return (0);
+}
+#endif /* INET6 */
+
+static int
+hook_pf(void)
+{
+#ifdef INET
+ struct pfil_head *pfh_inet;
+#endif
+#ifdef INET6
+ struct pfil_head *pfh_inet6;
+#endif
+
+ if (V_pf_pfil_hooked)
+ return (0);
+
+#ifdef INET
+ pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET);
+ if (pfh_inet == NULL)
+ return (ESRCH); /* XXX */
+ pfil_add_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet);
+ pfil_add_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet);
+#endif
+#ifdef INET6
+ pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
+ if (pfh_inet6 == NULL) {
+#ifdef INET
+ pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK,
+ pfh_inet);
+ pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK,
+ pfh_inet);
+#endif
+ return (ESRCH); /* XXX */
+ }
+ pfil_add_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet6);
+ pfil_add_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet6);
+#endif
+
+ V_pf_pfil_hooked = 1;
+ return (0);
+}
+
+static int
+dehook_pf(void)
+{
+#ifdef INET
+ struct pfil_head *pfh_inet;
+#endif
+#ifdef INET6
+ struct pfil_head *pfh_inet6;
+#endif
+
+ if (V_pf_pfil_hooked == 0)
+ return (0);
+
+#ifdef INET
+ pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET);
+ if (pfh_inet == NULL)
+ return (ESRCH); /* XXX */
+ pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK,
+ pfh_inet);
+ pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK,
+ pfh_inet);
+#endif
+#ifdef INET6
+ pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
+ if (pfh_inet6 == NULL)
+ return (ESRCH); /* XXX */
+ pfil_remove_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK,
+ pfh_inet6);
+ pfil_remove_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK,
+ pfh_inet6);
+#endif
+
+ V_pf_pfil_hooked = 0;
+ return (0);
+}
+
+static void
+pf_load_vnet(void)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ V_pf_pfil_hooked = 0;
+ TAILQ_INIT(&V_pf_tags);
+ TAILQ_INIT(&V_pf_qids);
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+
+ pfattach_vnet();
+ V_pf_vnet_active = 1;
+}
+
+static int
+pf_load(void)
+{
+ int error;
+
+ rw_init(&pf_rules_lock, "pf rulesets");
+ sx_init(&pf_ioctl_lock, "pf ioctl");
+
+ pf_mtag_initialize();
+
+ pf_dev = make_dev(&pf_cdevsw, 0, 0, 0, 0600, PF_NAME);
+ if (pf_dev == NULL)
+ return (ENOMEM);
+
+ pf_end_threads = 0;
+ error = kproc_create(pf_purge_thread, NULL, NULL, 0, 0, "pf purge");
+ if (error != 0)
+ return (error);
+
+ pfi_initialize();
+
+ return (0);
+}
+
+static void
+pf_unload_vnet(void)
+{
+ int error;
+
+ V_pf_vnet_active = 0;
+ V_pf_status.running = 0;
+ swi_remove(V_pf_swi_cookie);
+ error = dehook_pf();
+ if (error) {
+ /*
+ * Should not happen!
+ * XXX Due to error code ESRCH, kldunload will show
+ * a message like 'No such process'.
+ */
+ printf("%s: pfil unregistration failed\n", __FUNCTION__);
+ return;
+ }
+
+ pf_unload_vnet_purge();
+
+ PF_RULES_WLOCK();
+ shutdown_pf();
+ PF_RULES_WUNLOCK();
+
+ pf_normalize_cleanup();
+ PF_RULES_WLOCK();
+ pfi_cleanup_vnet();
+ PF_RULES_WUNLOCK();
+ pfr_cleanup();
+ pf_osfp_flush();
+ pf_cleanup();
+ if (IS_DEFAULT_VNET(curvnet))
+ pf_mtag_cleanup();
+}
+
+static int
+pf_unload(void)
+{
+ int error = 0;
+
+ pf_end_threads = 1;
+ while (pf_end_threads < 2) {
+ wakeup_one(pf_purge_thread);
+ rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftmo", 0);
+ }
+
+ if (pf_dev != NULL)
+ destroy_dev(pf_dev);
+
+ pfi_cleanup();
+
+ rw_destroy(&pf_rules_lock);
+ sx_destroy(&pf_ioctl_lock);
+
+ return (error);
+}
+
+static void
+vnet_pf_init(void *unused __unused)
+{
+
+ pf_load_vnet();
+}
+VNET_SYSINIT(vnet_pf_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_THIRD,
+ vnet_pf_init, NULL);
+
+static void
+vnet_pf_uninit(const void *unused __unused)
+{
+
+ pf_unload_vnet();
+}
+VNET_SYSUNINIT(vnet_pf_uninit, SI_SUB_PROTO_FIREWALL, SI_ORDER_THIRD,
+ vnet_pf_uninit, NULL);
+
+
+static int
+pf_modevent(module_t mod, int type, void *data)
+{
+ int error = 0;
+
+ switch(type) {
+ case MOD_LOAD:
+ error = pf_load();
+ break;
+ case MOD_QUIESCE:
+ /*
+ * Module should not be unloaded due to race conditions.
+ */
+ error = EBUSY;
+ break;
+ case MOD_UNLOAD:
+ error = pf_unload();
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+static moduledata_t pf_mod = {
+ "pf",
+ pf_modevent,
+ 0
+};
+
+DECLARE_MODULE(pf, pf_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_SECOND);
+MODULE_VERSION(pf, PF_MODVER);
diff --git a/freebsd/sys/netpfil/pf/pf_lb.c b/freebsd/sys/netpfil/pf/pf_lb.c
new file mode 100644
index 00000000..033c3879
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/pf_lb.c
@@ -0,0 +1,681 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2008 Henning Brauer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ * $OpenBSD: pf_lb.c,v 1.2 2009/02/12 02:13:15 sthen Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/local/opt_pf.h>
+#include <rtems/bsd/local/opt_inet.h>
+#include <rtems/bsd/local/opt_inet6.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/vnet.h>
+#include <net/pfvar.h>
+#include <net/if_pflog.h>
+
+#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
+
+static void pf_hash(struct pf_addr *, struct pf_addr *,
+ struct pf_poolhashkey *, sa_family_t);
+static struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *,
+ int, int, struct pfi_kif *,
+ struct pf_addr *, u_int16_t, struct pf_addr *,
+ uint16_t, int, struct pf_anchor_stackframe *);
+static int pf_get_sport(sa_family_t, uint8_t, struct pf_rule *,
+ struct pf_addr *, uint16_t, struct pf_addr *, uint16_t, struct pf_addr *,
+ uint16_t *, uint16_t, uint16_t, struct pf_src_node **);
+
+#define mix(a,b,c) \
+ do { \
+ a -= b; a -= c; a ^= (c >> 13); \
+ b -= c; b -= a; b ^= (a << 8); \
+ c -= a; c -= b; c ^= (b >> 13); \
+ a -= b; a -= c; a ^= (c >> 12); \
+ b -= c; b -= a; b ^= (a << 16); \
+ c -= a; c -= b; c ^= (b >> 5); \
+ a -= b; a -= c; a ^= (c >> 3); \
+ b -= c; b -= a; b ^= (a << 10); \
+ c -= a; c -= b; c ^= (b >> 15); \
+ } while (0)
+
+/*
+ * hash function based on bridge_hash in if_bridge.c
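+ * (the mix() macro above is the three-word mixing step of Bob Jenkins'
+ * lookup hash; 0x9e3779b9 is its golden-ratio seed)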
+ */
+static void
+pf_hash(struct pf_addr *inaddr, struct pf_addr *hash,
+ struct pf_poolhashkey *key, sa_family_t af)
+{
+ u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0];
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ a += inaddr->addr32[0];
+ b += key->key32[1];
+ mix(a, b, c);
+ hash->addr32[0] = c + key->key32[2];
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ a += inaddr->addr32[0];
+ b += inaddr->addr32[2];
+ mix(a, b, c);
+ hash->addr32[0] = c;
+ a += inaddr->addr32[1];
+ b += inaddr->addr32[3];
+ c += key->key32[1];
+ mix(a, b, c);
+ hash->addr32[1] = c;
+ a += inaddr->addr32[2];
+ b += inaddr->addr32[1];
+ c += key->key32[2];
+ mix(a, b, c);
+ hash->addr32[2] = c;
+ a += inaddr->addr32[3];
+ b += inaddr->addr32[0];
+ c += key->key32[3];
+ mix(a, b, c);
+ hash->addr32[3] = c;
+ break;
+#endif /* INET6 */
+ }
+}
+
+static struct pf_rule *
+pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
+ int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport,
+ struct pf_addr *daddr, uint16_t dport, int rs_num,
+ struct pf_anchor_stackframe *anchor_stack)
+{
+ struct pf_rule *r, *rm = NULL;
+ struct pf_ruleset *ruleset = NULL;
+ int tag = -1;
+ int rtableid = -1;
+ int asd = 0;
+
+ r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr);
+ while (r && rm == NULL) {
+ struct pf_rule_addr *src = NULL, *dst = NULL;
+ struct pf_addr_wrap *xdst = NULL;
+
+ if (r->action == PF_BINAT && direction == PF_IN) {
+ src = &r->dst;
+ if (r->rpool.cur != NULL)
+ xdst = &r->rpool.cur->addr;
+ } else {
+ src = &r->src;
+ dst = &r->dst;
+ }
+
+ r->evaluations++;
+ if (pfi_kif_match(r->kif, kif) == r->ifnot)
+ r = r->skip[PF_SKIP_IFP].ptr;
+ else if (r->direction && r->direction != direction)
+ r = r->skip[PF_SKIP_DIR].ptr;
+ else if (r->af && r->af != pd->af)
+ r = r->skip[PF_SKIP_AF].ptr;
+ else if (r->proto && r->proto != pd->proto)
+ r = r->skip[PF_SKIP_PROTO].ptr;
+ else if (PF_MISMATCHAW(&src->addr, saddr, pd->af,
+ src->neg, kif, M_GETFIB(m)))
+ r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
+ PF_SKIP_DST_ADDR].ptr;
+ else if (src->port_op && !pf_match_port(src->port_op,
+ src->port[0], src->port[1], sport))
+ r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT :
+ PF_SKIP_DST_PORT].ptr;
+ else if (dst != NULL &&
+ PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL,
+ M_GETFIB(m)))
+ r = r->skip[PF_SKIP_DST_ADDR].ptr;
+ else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af,
+ 0, NULL, M_GETFIB(m)))
+ r = TAILQ_NEXT(r, entries);
+ else if (dst != NULL && dst->port_op &&
+ !pf_match_port(dst->port_op, dst->port[0],
+ dst->port[1], dport))
+ r = r->skip[PF_SKIP_DST_PORT].ptr;
+ else if (r->match_tag && !pf_match_tag(m, r, &tag,
+ pd->pf_mtag ? pd->pf_mtag->tag : 0))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto !=
+ IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m,
+ off, pd->hdr.tcp), r->os_fingerprint)))
+ r = TAILQ_NEXT(r, entries);
+ else {
+ if (r->tag)
+ tag = r->tag;
+ if (r->rtableid >= 0)
+ rtableid = r->rtableid;
+ if (r->anchor == NULL) {
+ rm = r;
+ } else
+ pf_step_into_anchor(anchor_stack, &asd,
+ &ruleset, rs_num, &r, NULL, NULL);
+ }
+ if (r == NULL)
+ pf_step_out_of_anchor(anchor_stack, &asd, &ruleset,
+ rs_num, &r, NULL, NULL);
+ }
+
+ if (tag > 0 && pf_tag_packet(m, pd, tag))
+ return (NULL);
+ if (rtableid >= 0)
+ M_SETFIB(m, rtableid);
+
+ if (rm != NULL && (rm->action == PF_NONAT ||
+ rm->action == PF_NORDR || rm->action == PF_NOBINAT))
+ return (NULL);
+ return (rm);
+}
+
+static int
+pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r,
+ struct pf_addr *saddr, uint16_t sport, struct pf_addr *daddr,
+ uint16_t dport, struct pf_addr *naddr, uint16_t *nport, uint16_t low,
+ uint16_t high, struct pf_src_node **sn)
+{
+ struct pf_state_key_cmp key;
+ struct pf_addr init_addr;
+
+ bzero(&init_addr, sizeof(init_addr));
+ if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
+ return (1);
+
+ if (proto == IPPROTO_ICMP) {
+ low = 1;
+ high = 65535;
+ }
+
+ bzero(&key, sizeof(key));
+ key.af = af;
+ key.proto = proto;
+ key.port[0] = dport;
+ PF_ACPY(&key.addr[0], daddr, key.af);
+
+ do {
+ PF_ACPY(&key.addr[1], naddr, key.af);
+
+ /*
+ * Port search: start at a random port and step through
+ * the range, similar to the port selection loop in in_pcbbind().
+ */
+ if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
+ proto == IPPROTO_ICMP) || (low == 0 && high == 0)) {
+ /*
+ * XXX bug: icmp states don't use the id on both sides.
+ * (traceroute -I through nat)
+ */
+ key.port[1] = sport;
+ if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
+ *nport = sport;
+ return (0);
+ }
+ } else if (low == high) {
+ key.port[1] = htons(low);
+ if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
+ *nport = htons(low);
+ return (0);
+ }
+ } else {
+ uint16_t tmp, cut;
+
+ if (low > high) {
+ tmp = low;
+ low = high;
+ high = tmp;
+ }
+ /* low < high */
+ cut = arc4random() % (1 + high - low) + low;
+ /* low <= cut <= high */
+ for (tmp = cut; tmp <= high; ++(tmp)) {
+ key.port[1] = htons(tmp);
+ if (pf_find_state_all(&key, PF_IN, NULL) ==
+ NULL) {
+ *nport = htons(tmp);
+ return (0);
+ }
+ }
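+ /* Nothing free above the cut; scan downward from cut - 1 to low. */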
+ for (tmp = cut - 1; tmp >= low; --(tmp)) {
+ key.port[1] = htons(tmp);
+ if (pf_find_state_all(&key, PF_IN, NULL) ==
+ NULL) {
+ *nport = htons(tmp);
+ return (0);
+ }
+ }
+ }
+
+ switch (r->rpool.opts & PF_POOL_TYPEMASK) {
+ case PF_POOL_RANDOM:
+ case PF_POOL_ROUNDROBIN:
+ if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
+ return (1);
+ break;
+ case PF_POOL_NONE:
+ case PF_POOL_SRCHASH:
+ case PF_POOL_BITMASK:
+ default:
+ return (1);
+ }
+ } while (! PF_AEQ(&init_addr, naddr, af) );
+ return (1); /* none available */
+}
+
+int
+pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
+ struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn)
+{
+ struct pf_pool *rpool = &r->rpool;
+ struct pf_addr *raddr = NULL, *rmask = NULL;
+
+ /* Try to find a src_node if none was given and this
+ is a sticky-address rule. */
+ if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR &&
+ (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE)
+ *sn = pf_find_src_node(saddr, r, af, 0);
+
+ /* If a src_node was found or explicitly given and it has a non-zero
+ route address, use this address. A zeroed address is found if the
+ src node was created just a moment ago in pf_create_state and it
+ needs to be filled in with the routing decision calculated here. */
+ if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) {
+ PF_ACPY(naddr, &(*sn)->raddr, af);
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf_map_addr: src tracking maps ");
+ pf_print_host(saddr, 0, af);
+ printf(" to ");
+ pf_print_host(naddr, 0, af);
+ printf("\n");
+ }
+ return (0);
+ }
+
+ /* Find the route using the chosen algorithm. Store the found route
+ in the src_node if one was given or found. */
+ if (rpool->cur->addr.type == PF_ADDR_NOROUTE)
+ return (1);
+ if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 &&
+ (rpool->opts & PF_POOL_TYPEMASK) !=
+ PF_POOL_ROUNDROBIN)
+ return (1);
+ raddr = &rpool->cur->addr.p.dyn->pfid_addr4;
+ rmask = &rpool->cur->addr.p.dyn->pfid_mask4;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 &&
+ (rpool->opts & PF_POOL_TYPEMASK) !=
+ PF_POOL_ROUNDROBIN)
+ return (1);
+ raddr = &rpool->cur->addr.p.dyn->pfid_addr6;
+ rmask = &rpool->cur->addr.p.dyn->pfid_mask6;
+ break;
+#endif /* INET6 */
+ }
+ } else if (rpool->cur->addr.type == PF_ADDR_TABLE) {
+ if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN)
+ return (1); /* unsupported */
+ } else {
+ raddr = &rpool->cur->addr.v.a.addr;
+ rmask = &rpool->cur->addr.v.a.mask;
+ }
+
+ switch (rpool->opts & PF_POOL_TYPEMASK) {
+ case PF_POOL_NONE:
+ PF_ACPY(naddr, raddr, af);
+ break;
+ case PF_POOL_BITMASK:
+ PF_POOLMASK(naddr, raddr, rmask, saddr, af);
+ break;
+ case PF_POOL_RANDOM:
+ if (init_addr != NULL && PF_AZERO(init_addr, af)) {
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ rpool->counter.addr32[0] = htonl(arc4random());
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
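+ /*
+ * Randomize the host part one 32-bit word at a time,
+ * starting with the least significant word and stopping
+ * at the first word that is entirely covered by the
+ * pool mask.
+ */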
+ if (rmask->addr32[3] != 0xffffffff)
+ rpool->counter.addr32[3] =
+ htonl(arc4random());
+ else
+ break;
+ if (rmask->addr32[2] != 0xffffffff)
+ rpool->counter.addr32[2] =
+ htonl(arc4random());
+ else
+ break;
+ if (rmask->addr32[1] != 0xffffffff)
+ rpool->counter.addr32[1] =
+ htonl(arc4random());
+ else
+ break;
+ if (rmask->addr32[0] != 0xffffffff)
+ rpool->counter.addr32[0] =
+ htonl(arc4random());
+ break;
+#endif /* INET6 */
+ }
+ PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
+ PF_ACPY(init_addr, naddr, af);
+
+ } else {
+ PF_AINC(&rpool->counter, af);
+ PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
+ }
+ break;
+ case PF_POOL_SRCHASH:
+ {
+ unsigned char hash[16];
+
+ pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
+ PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af);
+ break;
+ }
+ case PF_POOL_ROUNDROBIN:
+ {
+ struct pf_pooladdr *acur = rpool->cur;
+
+ /*
+ * XXXGL: in the round-robin case we need to store
+ * the round-robin machine state in the rule, so the
+ * forwarding thread needs to modify the rule.
+ *
+ * This is done without locking, because performance is assumed
+ * to be more important than round-robin precision.
+ *
+ * In the simplest case we just update the "rpool->cur"
+ * pointer. However, if the pool contains tables or dynamic
+ * addresses, then "tblidx" is also used to store machine
+ * state. Since "tblidx" is an int, concurrent access to it
+ * cannot lead to inconsistency, only to a loss of precision.
+ *
+ * Things get worse if the table contains prefixes rather than
+ * individual hosts. In that case the counter also stores
+ * machine state, and for an IPv6 address the counter cannot be
+ * updated atomically. Using round-robin on a table containing
+ * IPv6 (or even IPv4) prefixes would probably cause a panic.
+ */
+
+ if (rpool->cur->addr.type == PF_ADDR_TABLE) {
+ if (!pfr_pool_get(rpool->cur->addr.p.tbl,
+ &rpool->tblidx, &rpool->counter, af))
+ goto get_addr;
+ } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
+ if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
+ &rpool->tblidx, &rpool->counter, af))
+ goto get_addr;
+ } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
+ goto get_addr;
+
+ try_next:
+ if (TAILQ_NEXT(rpool->cur, entries) == NULL)
+ rpool->cur = TAILQ_FIRST(&rpool->list);
+ else
+ rpool->cur = TAILQ_NEXT(rpool->cur, entries);
+ if (rpool->cur->addr.type == PF_ADDR_TABLE) {
+ rpool->tblidx = -1;
+ if (pfr_pool_get(rpool->cur->addr.p.tbl,
+ &rpool->tblidx, &rpool->counter, af)) {
+ /* table contains no address of type 'af' */
+ if (rpool->cur != acur)
+ goto try_next;
+ return (1);
+ }
+ } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
+ rpool->tblidx = -1;
+ if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
+ &rpool->tblidx, &rpool->counter, af)) {
+ /* table contains no address of type 'af' */
+ if (rpool->cur != acur)
+ goto try_next;
+ return (1);
+ }
+ } else {
+ raddr = &rpool->cur->addr.v.a.addr;
+ rmask = &rpool->cur->addr.v.a.mask;
+ PF_ACPY(&rpool->counter, raddr, af);
+ }
+
+ get_addr:
+ PF_ACPY(naddr, &rpool->counter, af);
+ if (init_addr != NULL && PF_AZERO(init_addr, af))
+ PF_ACPY(init_addr, naddr, af);
+ PF_AINC(&rpool->counter, af);
+ break;
+ }
+ }
+ if (*sn != NULL)
+ PF_ACPY(&(*sn)->raddr, naddr, af);
+
+ if (V_pf_status.debug >= PF_DEBUG_MISC &&
+ (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
+ printf("pf_map_addr: selected address ");
+ pf_print_host(naddr, 0, af);
+ printf("\n");
+ }
+
+ return (0);
+}
+
+struct pf_rule *
+pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
+ struct pfi_kif *kif, struct pf_src_node **sn,
+ struct pf_state_key **skp, struct pf_state_key **nkp,
+ struct pf_addr *saddr, struct pf_addr *daddr,
+ uint16_t sport, uint16_t dport, struct pf_anchor_stackframe *anchor_stack)
+{
+ struct pf_rule *r = NULL;
+ struct pf_addr *naddr;
+ uint16_t *nport;
+
+ PF_RULES_RASSERT();
+ KASSERT(*skp == NULL, ("*skp not NULL"));
+ KASSERT(*nkp == NULL, ("*nkp not NULL"));
+
+ if (direction == PF_OUT) {
+ r = pf_match_translation(pd, m, off, direction, kif, saddr,
+ sport, daddr, dport, PF_RULESET_BINAT, anchor_stack);
+ if (r == NULL)
+ r = pf_match_translation(pd, m, off, direction, kif,
+ saddr, sport, daddr, dport, PF_RULESET_NAT,
+ anchor_stack);
+ } else {
+ r = pf_match_translation(pd, m, off, direction, kif, saddr,
+ sport, daddr, dport, PF_RULESET_RDR, anchor_stack);
+ if (r == NULL)
+ r = pf_match_translation(pd, m, off, direction, kif,
+ saddr, sport, daddr, dport, PF_RULESET_BINAT,
+ anchor_stack);
+ }
+
+ if (r == NULL)
+ return (NULL);
+
+ switch (r->action) {
+ case PF_NONAT:
+ case PF_NOBINAT:
+ case PF_NORDR:
+ return (NULL);
+ }
+
+ *skp = pf_state_key_setup(pd, saddr, daddr, sport, dport);
+ if (*skp == NULL)
+ return (NULL);
+ *nkp = pf_state_key_clone(*skp);
+ if (*nkp == NULL) {
+ uma_zfree(V_pf_state_key_z, *skp);
+ *skp = NULL;
+ return (NULL);
+ }
+
+ /* XXX We only modify one side for now. */
+ naddr = &(*nkp)->addr[1];
+ nport = &(*nkp)->port[1];
+
+ switch (r->action) {
+ case PF_NAT:
+ if (pf_get_sport(pd->af, pd->proto, r, saddr, sport, daddr,
+ dport, naddr, nport, r->rpool.proxy_port[0],
+ r->rpool.proxy_port[1], sn)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: NAT proxy port allocation (%u-%u) failed\n",
+ r->rpool.proxy_port[0], r->rpool.proxy_port[1]));
+ goto notrans;
+ }
+ break;
+ case PF_BINAT:
+ switch (direction) {
+ case PF_OUT:
+ if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET:
+ if (r->rpool.cur->addr.p.dyn->
+ pfid_acnt4 < 1)
+ goto notrans;
+ PF_POOLMASK(naddr,
+ &r->rpool.cur->addr.p.dyn->
+ pfid_addr4,
+ &r->rpool.cur->addr.p.dyn->
+ pfid_mask4, saddr, AF_INET);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (r->rpool.cur->addr.p.dyn->
+ pfid_acnt6 < 1)
+ goto notrans;
+ PF_POOLMASK(naddr,
+ &r->rpool.cur->addr.p.dyn->
+ pfid_addr6,
+ &r->rpool.cur->addr.p.dyn->
+ pfid_mask6, saddr, AF_INET6);
+ break;
+#endif /* INET6 */
+ }
+ } else
+ PF_POOLMASK(naddr,
+ &r->rpool.cur->addr.v.a.addr,
+ &r->rpool.cur->addr.v.a.mask, saddr,
+ pd->af);
+ break;
+ case PF_IN:
+ if (r->src.addr.type == PF_ADDR_DYNIFTL) {
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET:
+ if (r->src.addr.p.dyn->pfid_acnt4 < 1)
+ goto notrans;
+ PF_POOLMASK(naddr,
+ &r->src.addr.p.dyn->pfid_addr4,
+ &r->src.addr.p.dyn->pfid_mask4,
+ daddr, AF_INET);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (r->src.addr.p.dyn->pfid_acnt6 < 1)
+ goto notrans;
+ PF_POOLMASK(naddr,
+ &r->src.addr.p.dyn->pfid_addr6,
+ &r->src.addr.p.dyn->pfid_mask6,
+ daddr, AF_INET6);
+ break;
+#endif /* INET6 */
+ }
+ } else
+ PF_POOLMASK(naddr, &r->src.addr.v.a.addr,
+ &r->src.addr.v.a.mask, daddr, pd->af);
+ break;
+ }
+ break;
+ case PF_RDR: {
+ if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn))
+ goto notrans;
+ if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK)
+ PF_POOLMASK(naddr, naddr, &r->rpool.cur->addr.v.a.mask,
+ daddr, pd->af);
+
+ if (r->rpool.proxy_port[1]) {
+ uint32_t tmp_nport;
+
+ tmp_nport = ((ntohs(dport) - ntohs(r->dst.port[0])) %
+ (r->rpool.proxy_port[1] - r->rpool.proxy_port[0] +
+ 1)) + r->rpool.proxy_port[0];
+
+ /* Wrap around if necessary. */
+ if (tmp_nport > 65535)
+ tmp_nport -= 65535;
+ *nport = htons((uint16_t)tmp_nport);
+ } else if (r->rpool.proxy_port[0])
+ *nport = htons(r->rpool.proxy_port[0]);
+ break;
+ }
+ default:
+ panic("%s: unknown action %u", __func__, r->action);
+ }
+
+ /* Return success only if translation really happened. */
+ if (bcmp(*skp, *nkp, sizeof(struct pf_state_key_cmp)))
+ return (r);
+
+notrans:
+ uma_zfree(V_pf_state_key_z, *nkp);
+ uma_zfree(V_pf_state_key_z, *skp);
+ *skp = *nkp = NULL;
+ *sn = NULL;
+
+ return (NULL);
+}
diff --git a/freebsd/sys/netpfil/pf/pf_mtag.h b/freebsd/sys/netpfil/pf/pf_mtag.h
new file mode 100644
index 00000000..fd8554ae
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/pf_mtag.h
@@ -0,0 +1,64 @@
+/* $FreeBSD$ */
+/*
+ * Copyright (c) 2001 Daniel Hartmeier
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _NET_PF_MTAG_H_
+#define _NET_PF_MTAG_H_
+
+#ifdef _KERNEL
+
+#define PF_TAG_GENERATED 0x01
+#define PF_TAG_FRAGCACHE 0x02
+#define PF_TAG_TRANSLATE_LOCALHOST 0x04
+#define PF_PACKET_LOOPED 0x08
+#define PF_FASTFWD_OURS_PRESENT 0x10
+#define PF_REASSEMBLED 0x20
+
+struct pf_mtag {
+ void *hdr; /* saved hdr pos in mbuf, for ECN */
+ u_int32_t qid; /* queue id */
+ u_int32_t qid_hash; /* queue hashid used by WFQ like algos */
+ u_int16_t tag; /* tag id */
+ u_int8_t flags;
+ u_int8_t routed;
+};
+
+static __inline struct pf_mtag *
+pf_find_mtag(struct mbuf *m)
+{
+ struct m_tag *mtag;
+
+ if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) == NULL)
+ return (NULL);
+
+ return ((struct pf_mtag *)(mtag + 1));
+}
+#endif /* _KERNEL */
+#endif /* _NET_PF_MTAG_H_ */
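pf_find_mtag() above only looks up an existing PACKET_TAG_PF tag; the payload it returns is the struct pf_mtag stored directly behind the m_tag header. A hedged sketch of what the allocation counterpart typically looks like (kernel context assumed; the helper name is illustrative and not part of this header):

static __inline struct pf_mtag *
pf_get_mtag_sketch(struct mbuf *m)
{
	struct m_tag *mtag;

	/* Reuse an existing tag if one is already attached. */
	if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL)
		return ((struct pf_mtag *)(mtag + 1));

	/* Otherwise allocate and prepend one, same layout as above. */
	mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag),
	    M_NOWAIT | M_ZERO);
	if (mtag == NULL)
		return (NULL);
	m_tag_prepend(m, mtag);
	return ((struct pf_mtag *)(mtag + 1));
}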
diff --git a/freebsd/sys/netpfil/pf/pf_norm.c b/freebsd/sys/netpfil/pf/pf_norm.c
new file mode 100644
index 00000000..86d2c8eb
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/pf_norm.c
@@ -0,0 +1,1843 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright 2001 Niels Provos <provos@citi.umich.edu>
+ * Copyright 2011 Alexander Bluhm <bluhm@openbsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/local/opt_inet.h>
+#include <rtems/bsd/local/opt_inet6.h>
+#include <rtems/bsd/local/opt_pf.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/vnet.h>
+#include <net/pfvar.h>
+#include <net/if_pflog.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet6/ip6_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif /* INET6 */
+
+struct pf_frent {
+ TAILQ_ENTRY(pf_frent) fr_next;
+ struct mbuf *fe_m;
+ uint16_t fe_hdrlen; /* ipv4 header length with ip options
+ ipv6, extension, fragment header */
+ uint16_t fe_extoff; /* last extension header offset or 0 */
+ uint16_t fe_len; /* fragment length */
+ uint16_t fe_off; /* fragment offset */
+ uint16_t fe_mff; /* more fragment flag */
+};
+
+struct pf_fragment_cmp {
+ struct pf_addr frc_src;
+ struct pf_addr frc_dst;
+ uint32_t frc_id;
+ sa_family_t frc_af;
+ uint8_t frc_proto;
+};
+
+struct pf_fragment {
+ struct pf_fragment_cmp fr_key;
+#define fr_src fr_key.frc_src
+#define fr_dst fr_key.frc_dst
+#define fr_id fr_key.frc_id
+#define fr_af fr_key.frc_af
+#define fr_proto fr_key.frc_proto
+
+ RB_ENTRY(pf_fragment) fr_entry;
+ TAILQ_ENTRY(pf_fragment) frag_next;
+ uint32_t fr_timeout;
+ uint16_t fr_maxlen; /* maximum length of single fragment */
+ TAILQ_HEAD(pf_fragq, pf_frent) fr_queue;
+};
+
+struct pf_fragment_tag {
+ uint16_t ft_hdrlen; /* header length of reassembled pkt */
+ uint16_t ft_extoff; /* last extension header offset or 0 */
+ uint16_t ft_maxlen; /* maximum fragment payload length */
+ uint32_t ft_id; /* fragment id */
+};
+
+static struct mtx pf_frag_mtx;
+MTX_SYSINIT(pf_frag_mtx, &pf_frag_mtx, "pf fragments", MTX_DEF);
+#define PF_FRAG_LOCK() mtx_lock(&pf_frag_mtx)
+#define PF_FRAG_UNLOCK() mtx_unlock(&pf_frag_mtx)
+#define PF_FRAG_ASSERT() mtx_assert(&pf_frag_mtx, MA_OWNED)
+
+VNET_DEFINE(uma_zone_t, pf_state_scrub_z); /* XXX: shared with pfsync */
+
+static VNET_DEFINE(uma_zone_t, pf_frent_z);
+#define V_pf_frent_z VNET(pf_frent_z)
+static VNET_DEFINE(uma_zone_t, pf_frag_z);
+#define V_pf_frag_z VNET(pf_frag_z)
+
+TAILQ_HEAD(pf_fragqueue, pf_fragment);
+TAILQ_HEAD(pf_cachequeue, pf_fragment);
+static VNET_DEFINE(struct pf_fragqueue, pf_fragqueue);
+#define V_pf_fragqueue VNET(pf_fragqueue)
+RB_HEAD(pf_frag_tree, pf_fragment);
+static VNET_DEFINE(struct pf_frag_tree, pf_frag_tree);
+#define V_pf_frag_tree VNET(pf_frag_tree)
+static int pf_frag_compare(struct pf_fragment *,
+ struct pf_fragment *);
+static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
+static RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
+
+static void pf_flush_fragments(void);
+static void pf_free_fragment(struct pf_fragment *);
+static void pf_remove_fragment(struct pf_fragment *);
+static int pf_normalize_tcpopt(struct pf_rule *, struct mbuf *,
+ struct tcphdr *, int, sa_family_t);
+static struct pf_frent *pf_create_fragment(u_short *);
+static struct pf_fragment *pf_find_fragment(struct pf_fragment_cmp *key,
+ struct pf_frag_tree *tree);
+static struct pf_fragment *pf_fillup_fragment(struct pf_fragment_cmp *,
+ struct pf_frent *, u_short *);
+static int pf_isfull_fragment(struct pf_fragment *);
+static struct mbuf *pf_join_fragment(struct pf_fragment *);
+#ifdef INET
+static void pf_scrub_ip(struct mbuf **, uint32_t, uint8_t, uint8_t);
+static int pf_reassemble(struct mbuf **, struct ip *, int, u_short *);
+#endif /* INET */
+#ifdef INET6
+static int pf_reassemble6(struct mbuf **, struct ip6_hdr *,
+ struct ip6_frag *, uint16_t, uint16_t, u_short *);
+static void pf_scrub_ip6(struct mbuf **, uint8_t);
+#endif /* INET6 */
+
+#define DPFPRINTF(x) do { \
+ if (V_pf_status.debug >= PF_DEBUG_MISC) { \
+ printf("%s: ", __func__); \
+ printf x ; \
+ } \
+} while(0)
+
+#ifdef INET
+static void
+pf_ip2key(struct ip *ip, int dir, struct pf_fragment_cmp *key)
+{
+
+ key->frc_src.v4 = ip->ip_src;
+ key->frc_dst.v4 = ip->ip_dst;
+ key->frc_af = AF_INET;
+ key->frc_proto = ip->ip_p;
+ key->frc_id = ip->ip_id;
+}
+#endif /* INET */
+
+void
+pf_normalize_init(void)
+{
+
+ V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ V_pf_state_scrub_z = uma_zcreate("pf state scrubs",
+ sizeof(struct pf_state_scrub), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+
+ V_pf_limits[PF_LIMIT_FRAGS].zone = V_pf_frent_z;
+ V_pf_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT;
+ uma_zone_set_max(V_pf_frent_z, PFFRAG_FRENT_HIWAT);
+ uma_zone_set_warning(V_pf_frent_z, "PF frag entries limit reached");
+
+ TAILQ_INIT(&V_pf_fragqueue);
+}
+
+void
+pf_normalize_cleanup(void)
+{
+
+ uma_zdestroy(V_pf_state_scrub_z);
+ uma_zdestroy(V_pf_frent_z);
+ uma_zdestroy(V_pf_frag_z);
+}
+
+static int
+pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
+{
+ int diff;
+
+ if ((diff = a->fr_id - b->fr_id) != 0)
+ return (diff);
+ if ((diff = a->fr_proto - b->fr_proto) != 0)
+ return (diff);
+ if ((diff = a->fr_af - b->fr_af) != 0)
+ return (diff);
+ if ((diff = pf_addr_cmp(&a->fr_src, &b->fr_src, a->fr_af)) != 0)
+ return (diff);
+ if ((diff = pf_addr_cmp(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0)
+ return (diff);
+ return (0);
+}
+
+void
+pf_purge_expired_fragments(void)
+{
+ struct pf_fragment *frag;
+ u_int32_t expire = time_uptime -
+ V_pf_default_rule.timeout[PFTM_FRAG];
+
+ PF_FRAG_LOCK();
+ while ((frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue)) != NULL) {
+ if (frag->fr_timeout > expire)
+ break;
+
+ DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
+ pf_free_fragment(frag);
+ }
+
+ PF_FRAG_UNLOCK();
+}
+
+/*
+ * Try to flush old fragments to make space for new ones
+ */
+static void
+pf_flush_fragments(void)
+{
+ struct pf_fragment *frag;
+ int goal;
+
+ PF_FRAG_ASSERT();
+
+ goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10;
+	DPFPRINTF(("trying to free %d frag entries\n", goal));
+ while (goal < uma_zone_get_cur(V_pf_frent_z)) {
+ frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue);
+ if (frag)
+ pf_free_fragment(frag);
+ else
+ break;
+ }
+}
+
+/* Frees the fragments and all associated entries */
+static void
+pf_free_fragment(struct pf_fragment *frag)
+{
+ struct pf_frent *frent;
+
+ PF_FRAG_ASSERT();
+
+ /* Free all fragments */
+ for (frent = TAILQ_FIRST(&frag->fr_queue); frent;
+ frent = TAILQ_FIRST(&frag->fr_queue)) {
+ TAILQ_REMOVE(&frag->fr_queue, frent, fr_next);
+
+ m_freem(frent->fe_m);
+ uma_zfree(V_pf_frent_z, frent);
+ }
+
+ pf_remove_fragment(frag);
+}
+
+static struct pf_fragment *
+pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree)
+{
+ struct pf_fragment *frag;
+
+ PF_FRAG_ASSERT();
+
+ frag = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key);
+ if (frag != NULL) {
+ /* XXX Are we sure we want to update the timeout? */
+ frag->fr_timeout = time_uptime;
+ TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
+ TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
+ }
+
+ return (frag);
+}
+
+/* Removes a fragment from the fragment queue and frees the fragment */
+static void
+pf_remove_fragment(struct pf_fragment *frag)
+{
+
+ PF_FRAG_ASSERT();
+
+ RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag);
+ TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
+ uma_zfree(V_pf_frag_z, frag);
+}
+
+static struct pf_frent *
+pf_create_fragment(u_short *reason)
+{
+ struct pf_frent *frent;
+
+ PF_FRAG_ASSERT();
+
+ frent = uma_zalloc(V_pf_frent_z, M_NOWAIT);
+ if (frent == NULL) {
+ pf_flush_fragments();
+ frent = uma_zalloc(V_pf_frent_z, M_NOWAIT);
+ if (frent == NULL) {
+ REASON_SET(reason, PFRES_MEMORY);
+ return (NULL);
+ }
+ }
+
+ return (frent);
+}
+
+static struct pf_fragment *
+pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent,
+ u_short *reason)
+{
+ struct pf_frent *after, *next, *prev;
+ struct pf_fragment *frag;
+ uint16_t total;
+
+ PF_FRAG_ASSERT();
+
+ /* No empty fragments. */
+ if (frent->fe_len == 0) {
+ DPFPRINTF(("bad fragment: len 0"));
+ goto bad_fragment;
+ }
+
+ /* All fragments are 8 byte aligned. */
+ if (frent->fe_mff && (frent->fe_len & 0x7)) {
+ DPFPRINTF(("bad fragment: mff and len %d", frent->fe_len));
+ goto bad_fragment;
+ }
+
+ /* Respect maximum length, IP_MAXPACKET == IPV6_MAXPACKET. */
+ if (frent->fe_off + frent->fe_len > IP_MAXPACKET) {
+ DPFPRINTF(("bad fragment: max packet %d",
+ frent->fe_off + frent->fe_len));
+ goto bad_fragment;
+ }
+
+ DPFPRINTF((key->frc_af == AF_INET ?
+ "reass frag %d @ %d-%d" : "reass frag %#08x @ %d-%d",
+ key->frc_id, frent->fe_off, frent->fe_off + frent->fe_len));
+
+ /* Fully buffer all of the fragments in this fragment queue. */
+ frag = pf_find_fragment(key, &V_pf_frag_tree);
+
+ /* Create a new reassembly queue for this packet. */
+ if (frag == NULL) {
+ frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
+ if (frag == NULL) {
+ pf_flush_fragments();
+ frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
+ if (frag == NULL) {
+ REASON_SET(reason, PFRES_MEMORY);
+ goto drop_fragment;
+ }
+ }
+
+ *(struct pf_fragment_cmp *)frag = *key;
+ frag->fr_timeout = time_uptime;
+ frag->fr_maxlen = frent->fe_len;
+ TAILQ_INIT(&frag->fr_queue);
+
+ RB_INSERT(pf_frag_tree, &V_pf_frag_tree, frag);
+ TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
+
+ /* We do not have a previous fragment. */
+ TAILQ_INSERT_HEAD(&frag->fr_queue, frent, fr_next);
+
+ return (frag);
+ }
+
+ KASSERT(!TAILQ_EMPTY(&frag->fr_queue), ("!TAILQ_EMPTY()->fr_queue"));
+
+ /* Remember maximum fragment len for refragmentation. */
+ if (frent->fe_len > frag->fr_maxlen)
+ frag->fr_maxlen = frent->fe_len;
+
+ /* Maximum data we have seen already. */
+ total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
+ TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
+
+ /* Non terminal fragments must have more fragments flag. */
+ if (frent->fe_off + frent->fe_len < total && !frent->fe_mff)
+ goto bad_fragment;
+
+ /* Check if we saw the last fragment already. */
+ if (!TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_mff) {
+ if (frent->fe_off + frent->fe_len > total ||
+ (frent->fe_off + frent->fe_len == total && frent->fe_mff))
+ goto bad_fragment;
+ } else {
+ if (frent->fe_off + frent->fe_len == total && !frent->fe_mff)
+ goto bad_fragment;
+ }
+
+ /* Find a fragment after the current one. */
+ prev = NULL;
+ TAILQ_FOREACH(after, &frag->fr_queue, fr_next) {
+ if (after->fe_off > frent->fe_off)
+ break;
+ prev = after;
+ }
+
+ KASSERT(prev != NULL || after != NULL,
+ ("prev != NULL || after != NULL"));
+
+ if (prev != NULL && prev->fe_off + prev->fe_len > frent->fe_off) {
+ uint16_t precut;
+
+ precut = prev->fe_off + prev->fe_len - frent->fe_off;
+ if (precut >= frent->fe_len)
+ goto bad_fragment;
+ DPFPRINTF(("overlap -%d", precut));
+ m_adj(frent->fe_m, precut);
+ frent->fe_off += precut;
+ frent->fe_len -= precut;
+ }
+
+ for (; after != NULL && frent->fe_off + frent->fe_len > after->fe_off;
+ after = next) {
+ uint16_t aftercut;
+
+ aftercut = frent->fe_off + frent->fe_len - after->fe_off;
+ DPFPRINTF(("adjust overlap %d", aftercut));
+ if (aftercut < after->fe_len) {
+ m_adj(after->fe_m, aftercut);
+ after->fe_off += aftercut;
+ after->fe_len -= aftercut;
+ break;
+ }
+
+ /* This fragment is completely overlapped, lose it. */
+ next = TAILQ_NEXT(after, fr_next);
+ m_freem(after->fe_m);
+ TAILQ_REMOVE(&frag->fr_queue, after, fr_next);
+ uma_zfree(V_pf_frent_z, after);
+ }
+
+ if (prev == NULL)
+ TAILQ_INSERT_HEAD(&frag->fr_queue, frent, fr_next);
+ else
+ TAILQ_INSERT_AFTER(&frag->fr_queue, prev, frent, fr_next);
+
+ return (frag);
+
+bad_fragment:
+ REASON_SET(reason, PFRES_FRAG);
+drop_fragment:
+ uma_zfree(V_pf_frent_z, frent);
+ return (NULL);
+}
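When a new fragment overlaps data already queued, pf_fillup_fragment() above trims the overlapping prefix (precut) off the new entry and clips or drops fully covered successors (aftercut). A small userland illustration of the precut case, with made-up offsets:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t prev_off = 0, prev_len = 1480;	/* assumed queued fragment */
	uint16_t fe_off = 1472, fe_len = 1480;	/* assumed new fragment */

	if (prev_off + prev_len > fe_off) {
		uint16_t precut = prev_off + prev_len - fe_off;

		/* In the kernel, m_adj(frent->fe_m, precut) drops the bytes. */
		fe_off += precut;
		fe_len -= precut;
		printf("trimmed %u bytes; fragment now covers %u-%u\n",
		    precut, fe_off, fe_off + fe_len);
	}
	return (0);
}

With these values precut is 8, so the new fragment is kept as offsets 1480-2952.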
+
+static int
+pf_isfull_fragment(struct pf_fragment *frag)
+{
+ struct pf_frent *frent, *next;
+ uint16_t off, total;
+
+ /* Check if we are completely reassembled */
+ if (TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_mff)
+ return (0);
+
+ /* Maximum data we have seen already */
+ total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
+ TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
+
+ /* Check if we have all the data */
+ off = 0;
+ for (frent = TAILQ_FIRST(&frag->fr_queue); frent; frent = next) {
+ next = TAILQ_NEXT(frent, fr_next);
+
+ off += frent->fe_len;
+ if (off < total && (next == NULL || next->fe_off != off)) {
+ DPFPRINTF(("missing fragment at %d, next %d, total %d",
+ off, next == NULL ? -1 : next->fe_off, total));
+ return (0);
+ }
+ }
+ DPFPRINTF(("%d < %d?", off, total));
+ if (off < total)
+ return (0);
+ KASSERT(off == total, ("off == total"));
+
+ return (1);
+}
+
+static struct mbuf *
+pf_join_fragment(struct pf_fragment *frag)
+{
+ struct mbuf *m, *m2;
+ struct pf_frent *frent, *next;
+
+ frent = TAILQ_FIRST(&frag->fr_queue);
+ next = TAILQ_NEXT(frent, fr_next);
+
+ m = frent->fe_m;
+ m_adj(m, (frent->fe_hdrlen + frent->fe_len) - m->m_pkthdr.len);
+ uma_zfree(V_pf_frent_z, frent);
+ for (frent = next; frent != NULL; frent = next) {
+ next = TAILQ_NEXT(frent, fr_next);
+
+ m2 = frent->fe_m;
+ /* Strip off ip header. */
+ m_adj(m2, frent->fe_hdrlen);
+ /* Strip off any trailing bytes. */
+ m_adj(m2, frent->fe_len - m2->m_pkthdr.len);
+
+ uma_zfree(V_pf_frent_z, frent);
+ m_cat(m, m2);
+ }
+
+ /* Remove from fragment queue. */
+ pf_remove_fragment(frag);
+
+ return (m);
+}
+
+#ifdef INET
+static int
+pf_reassemble(struct mbuf **m0, struct ip *ip, int dir, u_short *reason)
+{
+ struct mbuf *m = *m0;
+ struct pf_frent *frent;
+ struct pf_fragment *frag;
+ struct pf_fragment_cmp key;
+ uint16_t total, hdrlen;
+
+ /* Get an entry for the fragment queue */
+ if ((frent = pf_create_fragment(reason)) == NULL)
+ return (PF_DROP);
+
+ frent->fe_m = m;
+ frent->fe_hdrlen = ip->ip_hl << 2;
+ frent->fe_extoff = 0;
+ frent->fe_len = ntohs(ip->ip_len) - (ip->ip_hl << 2);
+ frent->fe_off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
+ frent->fe_mff = ntohs(ip->ip_off) & IP_MF;
+
+ pf_ip2key(ip, dir, &key);
+
+ if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
+ return (PF_DROP);
+
+ /* The mbuf is part of the fragment entry, no direct free or access */
+ m = *m0 = NULL;
+
+ if (!pf_isfull_fragment(frag))
+ return (PF_PASS); /* drop because *m0 is NULL, no error */
+
+ /* We have all the data */
+ frent = TAILQ_FIRST(&frag->fr_queue);
+ KASSERT(frent != NULL, ("frent != NULL"));
+ total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
+ TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
+ hdrlen = frent->fe_hdrlen;
+
+ m = *m0 = pf_join_fragment(frag);
+ frag = NULL;
+
+ if (m->m_flags & M_PKTHDR) {
+ int plen = 0;
+ for (m = *m0; m; m = m->m_next)
+ plen += m->m_len;
+ m = *m0;
+ m->m_pkthdr.len = plen;
+ }
+
+ ip = mtod(m, struct ip *);
+ ip->ip_len = htons(hdrlen + total);
+ ip->ip_off &= ~(IP_MF|IP_OFFMASK);
+
+ if (hdrlen + total > IP_MAXPACKET) {
+ DPFPRINTF(("drop: too big: %d", total));
+ ip->ip_len = 0;
+ REASON_SET(reason, PFRES_SHORT);
+ /* PF_DROP requires a valid mbuf *m0 in pf_test() */
+ return (PF_DROP);
+ }
+
+ DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len)));
+ return (PF_PASS);
+}
+#endif /* INET */
+
+#ifdef INET6
+static int
+pf_reassemble6(struct mbuf **m0, struct ip6_hdr *ip6, struct ip6_frag *fraghdr,
+ uint16_t hdrlen, uint16_t extoff, u_short *reason)
+{
+ struct mbuf *m = *m0;
+ struct pf_frent *frent;
+ struct pf_fragment *frag;
+ struct pf_fragment_cmp key;
+ struct m_tag *mtag;
+ struct pf_fragment_tag *ftag;
+ int off;
+ uint32_t frag_id;
+ uint16_t total, maxlen;
+ uint8_t proto;
+
+ PF_FRAG_LOCK();
+
+ /* Get an entry for the fragment queue. */
+ if ((frent = pf_create_fragment(reason)) == NULL) {
+ PF_FRAG_UNLOCK();
+ return (PF_DROP);
+ }
+
+ frent->fe_m = m;
+ frent->fe_hdrlen = hdrlen;
+ frent->fe_extoff = extoff;
+ frent->fe_len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - hdrlen;
+ frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK);
+ frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG;
+
+ key.frc_src.v6 = ip6->ip6_src;
+ key.frc_dst.v6 = ip6->ip6_dst;
+ key.frc_af = AF_INET6;
+ /* Only the first fragment's protocol is relevant. */
+ key.frc_proto = 0;
+ key.frc_id = fraghdr->ip6f_ident;
+
+ if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
+ PF_FRAG_UNLOCK();
+ return (PF_DROP);
+ }
+
+ /* The mbuf is part of the fragment entry, no direct free or access. */
+ m = *m0 = NULL;
+
+ if (!pf_isfull_fragment(frag)) {
+ PF_FRAG_UNLOCK();
+ return (PF_PASS); /* Drop because *m0 is NULL, no error. */
+ }
+
+ /* We have all the data. */
+ extoff = frent->fe_extoff;
+ maxlen = frag->fr_maxlen;
+ frag_id = frag->fr_id;
+ frent = TAILQ_FIRST(&frag->fr_queue);
+ KASSERT(frent != NULL, ("frent != NULL"));
+ total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
+ TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
+ hdrlen = frent->fe_hdrlen - sizeof(struct ip6_frag);
+
+ m = *m0 = pf_join_fragment(frag);
+ frag = NULL;
+
+ PF_FRAG_UNLOCK();
+
+ /* Take protocol from first fragment header. */
+ m = m_getptr(m, hdrlen + offsetof(struct ip6_frag, ip6f_nxt), &off);
+ KASSERT(m, ("%s: short mbuf chain", __func__));
+ proto = *(mtod(m, caddr_t) + off);
+ m = *m0;
+
+ /* Delete frag6 header */
+ if (ip6_deletefraghdr(m, hdrlen, M_NOWAIT) != 0)
+ goto fail;
+
+ if (m->m_flags & M_PKTHDR) {
+ int plen = 0;
+ for (m = *m0; m; m = m->m_next)
+ plen += m->m_len;
+ m = *m0;
+ m->m_pkthdr.len = plen;
+ }
+
+ if ((mtag = m_tag_get(PF_REASSEMBLED, sizeof(struct pf_fragment_tag),
+ M_NOWAIT)) == NULL)
+ goto fail;
+ ftag = (struct pf_fragment_tag *)(mtag + 1);
+ ftag->ft_hdrlen = hdrlen;
+ ftag->ft_extoff = extoff;
+ ftag->ft_maxlen = maxlen;
+ ftag->ft_id = frag_id;
+ m_tag_prepend(m, mtag);
+
+ ip6 = mtod(m, struct ip6_hdr *);
+ ip6->ip6_plen = htons(hdrlen - sizeof(struct ip6_hdr) + total);
+ if (extoff) {
+ /* Write protocol into next field of last extension header. */
+ m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt),
+ &off);
+ KASSERT(m, ("%s: short mbuf chain", __func__));
+ *(mtod(m, char *) + off) = proto;
+ m = *m0;
+ } else
+ ip6->ip6_nxt = proto;
+
+ if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) {
+ DPFPRINTF(("drop: too big: %d", total));
+ ip6->ip6_plen = 0;
+ REASON_SET(reason, PFRES_SHORT);
+ /* PF_DROP requires a valid mbuf *m0 in pf_test6(). */
+ return (PF_DROP);
+ }
+
+ DPFPRINTF(("complete: %p(%d)", m, ntohs(ip6->ip6_plen)));
+ return (PF_PASS);
+
+fail:
+ REASON_SET(reason, PFRES_MEMORY);
+ /* PF_DROP requires a valid mbuf *m0 in pf_test6(), will free later. */
+ return (PF_DROP);
+}
+#endif /* INET6 */
+
+#ifdef INET6
+int
+pf_refragment6(struct ifnet *ifp, struct mbuf **m0, struct m_tag *mtag)
+{
+ struct mbuf *m = *m0, *t;
+ struct pf_fragment_tag *ftag = (struct pf_fragment_tag *)(mtag + 1);
+ struct pf_pdesc pd;
+ uint32_t frag_id;
+ uint16_t hdrlen, extoff, maxlen;
+ uint8_t proto;
+ int error, action;
+
+ hdrlen = ftag->ft_hdrlen;
+ extoff = ftag->ft_extoff;
+ maxlen = ftag->ft_maxlen;
+ frag_id = ftag->ft_id;
+ m_tag_delete(m, mtag);
+ mtag = NULL;
+ ftag = NULL;
+
+ if (extoff) {
+ int off;
+
+ /* Use protocol from next field of last extension header */
+ m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt),
+ &off);
+ KASSERT((m != NULL), ("pf_refragment6: short mbuf chain"));
+ proto = *(mtod(m, caddr_t) + off);
+ *(mtod(m, char *) + off) = IPPROTO_FRAGMENT;
+ m = *m0;
+ } else {
+ struct ip6_hdr *hdr;
+
+ hdr = mtod(m, struct ip6_hdr *);
+ proto = hdr->ip6_nxt;
+ hdr->ip6_nxt = IPPROTO_FRAGMENT;
+ }
+
+ /*
+ * Maxlen may be less than 8 if there was only a single
+ * fragment. As it was fragmented before, add a fragment
+ * header also for a single fragment. If total or maxlen
+ * is less than 8, ip6_fragment() will return EMSGSIZE and
+ * we drop the packet.
+ */
+ error = ip6_fragment(ifp, m, hdrlen, proto, maxlen, frag_id);
+ m = (*m0)->m_nextpkt;
+ (*m0)->m_nextpkt = NULL;
+ if (error == 0) {
+ /* The first mbuf contains the unfragmented packet. */
+ m_freem(*m0);
+ *m0 = NULL;
+ action = PF_PASS;
+ } else {
+ /* Drop expects an mbuf to free. */
+ DPFPRINTF(("refragment error %d", error));
+ action = PF_DROP;
+ }
+ for (t = m; m; m = t) {
+ t = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m->m_flags |= M_SKIP_FIREWALL;
+ memset(&pd, 0, sizeof(pd));
+ pd.pf_mtag = pf_find_mtag(m);
+ if (error == 0)
+ ip6_forward(m, 0);
+ else
+ m_freem(m);
+ }
+
+ return (action);
+}
+#endif /* INET6 */
+
+#ifdef INET
+int
+pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason,
+ struct pf_pdesc *pd)
+{
+ struct mbuf *m = *m0;
+ struct pf_rule *r;
+ struct ip *h = mtod(m, struct ip *);
+ int mff = (ntohs(h->ip_off) & IP_MF);
+ int hlen = h->ip_hl << 2;
+ u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
+ u_int16_t max;
+ int ip_len;
+ int ip_off;
+ int tag = -1;
+ int verdict;
+
+ PF_RULES_RASSERT();
+
+ r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
+ while (r != NULL) {
+ r->evaluations++;
+ if (pfi_kif_match(r->kif, kif) == r->ifnot)
+ r = r->skip[PF_SKIP_IFP].ptr;
+ else if (r->direction && r->direction != dir)
+ r = r->skip[PF_SKIP_DIR].ptr;
+ else if (r->af && r->af != AF_INET)
+ r = r->skip[PF_SKIP_AF].ptr;
+ else if (r->proto && r->proto != h->ip_p)
+ r = r->skip[PF_SKIP_PROTO].ptr;
+ else if (PF_MISMATCHAW(&r->src.addr,
+ (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
+ r->src.neg, kif, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_SRC_ADDR].ptr;
+ else if (PF_MISMATCHAW(&r->dst.addr,
+ (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
+ r->dst.neg, NULL, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_DST_ADDR].ptr;
+ else if (r->match_tag && !pf_match_tag(m, r, &tag,
+ pd->pf_mtag ? pd->pf_mtag->tag : 0))
+ r = TAILQ_NEXT(r, entries);
+ else
+ break;
+ }
+
+ if (r == NULL || r->action == PF_NOSCRUB)
+ return (PF_PASS);
+ else {
+ r->packets[dir == PF_OUT]++;
+ r->bytes[dir == PF_OUT] += pd->tot_len;
+ }
+
+ /* Check for illegal packets */
+ if (hlen < (int)sizeof(struct ip)) {
+ REASON_SET(reason, PFRES_NORM);
+ goto drop;
+ }
+
+ if (hlen > ntohs(h->ip_len)) {
+ REASON_SET(reason, PFRES_NORM);
+ goto drop;
+ }
+
+ /* Clear IP_DF if the rule uses the no-df option */
+ if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
+ u_int16_t ip_off = h->ip_off;
+
+ h->ip_off &= htons(~IP_DF);
+ h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
+ }
+
+ /* We will need other tests here */
+ if (!fragoff && !mff)
+ goto no_fragment;
+
+ /* We're dealing with a fragment now. Don't allow fragments
+ * with IP_DF to enter the cache. If the flag was cleared by
+ * no-df above, fine. Otherwise drop it.
+ */
+ if (h->ip_off & htons(IP_DF)) {
+ DPFPRINTF(("IP_DF\n"));
+ goto bad;
+ }
+
+ ip_len = ntohs(h->ip_len) - hlen;
+ ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
+
+ /* All fragments are 8 byte aligned */
+ if (mff && (ip_len & 0x7)) {
+ DPFPRINTF(("mff and %d\n", ip_len));
+ goto bad;
+ }
+
+ /* Respect maximum length */
+ if (fragoff + ip_len > IP_MAXPACKET) {
+ DPFPRINTF(("max packet %d\n", fragoff + ip_len));
+ goto bad;
+ }
+ max = fragoff + ip_len;
+
+ /* Fully buffer all of the fragments
+ * Might return a completely reassembled mbuf, or NULL */
+ PF_FRAG_LOCK();
+ DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max));
+ verdict = pf_reassemble(m0, h, dir, reason);
+ PF_FRAG_UNLOCK();
+
+ if (verdict != PF_PASS)
+ return (PF_DROP);
+
+ m = *m0;
+ if (m == NULL)
+ return (PF_DROP);
+
+ h = mtod(m, struct ip *);
+
+ no_fragment:
+ /* At this point, only IP_DF is allowed in ip_off */
+ if (h->ip_off & ~htons(IP_DF)) {
+ u_int16_t ip_off = h->ip_off;
+
+ h->ip_off &= htons(IP_DF);
+ h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
+ }
+
+ pf_scrub_ip(&m, r->rule_flag, r->min_ttl, r->set_tos);
+
+ return (PF_PASS);
+
+ bad:
+ DPFPRINTF(("dropping bad fragment\n"));
+ REASON_SET(reason, PFRES_FRAG);
+ drop:
+ if (r != NULL && r->log)
+ PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd,
+ 1);
+
+ return (PF_DROP);
+}
+#endif
+
+#ifdef INET6
+int
+pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif,
+ u_short *reason, struct pf_pdesc *pd)
+{
+ struct mbuf *m = *m0;
+ struct pf_rule *r;
+ struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
+ int extoff;
+ int off;
+ struct ip6_ext ext;
+ struct ip6_opt opt;
+ struct ip6_opt_jumbo jumbo;
+ struct ip6_frag frag;
+ u_int32_t jumbolen = 0, plen;
+ int optend;
+ int ooff;
+ u_int8_t proto;
+ int terminal;
+
+ PF_RULES_RASSERT();
+
+ r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
+ while (r != NULL) {
+ r->evaluations++;
+ if (pfi_kif_match(r->kif, kif) == r->ifnot)
+ r = r->skip[PF_SKIP_IFP].ptr;
+ else if (r->direction && r->direction != dir)
+ r = r->skip[PF_SKIP_DIR].ptr;
+ else if (r->af && r->af != AF_INET6)
+ r = r->skip[PF_SKIP_AF].ptr;
+#if 0 /* header chain! */
+ else if (r->proto && r->proto != h->ip6_nxt)
+ r = r->skip[PF_SKIP_PROTO].ptr;
+#endif
+ else if (PF_MISMATCHAW(&r->src.addr,
+ (struct pf_addr *)&h->ip6_src, AF_INET6,
+ r->src.neg, kif, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_SRC_ADDR].ptr;
+ else if (PF_MISMATCHAW(&r->dst.addr,
+ (struct pf_addr *)&h->ip6_dst, AF_INET6,
+ r->dst.neg, NULL, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_DST_ADDR].ptr;
+ else
+ break;
+ }
+
+ if (r == NULL || r->action == PF_NOSCRUB)
+ return (PF_PASS);
+ else {
+ r->packets[dir == PF_OUT]++;
+ r->bytes[dir == PF_OUT] += pd->tot_len;
+ }
+
+ /* Check for illegal packets */
+ if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len)
+ goto drop;
+
+ extoff = 0;
+ off = sizeof(struct ip6_hdr);
+ proto = h->ip6_nxt;
+ terminal = 0;
+ do {
+ switch (proto) {
+ case IPPROTO_FRAGMENT:
+ goto fragment;
+ break;
+ case IPPROTO_AH:
+ case IPPROTO_ROUTING:
+ case IPPROTO_DSTOPTS:
+ if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
+ NULL, AF_INET6))
+ goto shortpkt;
+ extoff = off;
+ if (proto == IPPROTO_AH)
+ off += (ext.ip6e_len + 2) * 4;
+ else
+ off += (ext.ip6e_len + 1) * 8;
+ proto = ext.ip6e_nxt;
+ break;
+ case IPPROTO_HOPOPTS:
+ if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
+ NULL, AF_INET6))
+ goto shortpkt;
+ extoff = off;
+ optend = off + (ext.ip6e_len + 1) * 8;
+ ooff = off + sizeof(ext);
+ do {
+ if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
+ sizeof(opt.ip6o_type), NULL, NULL,
+ AF_INET6))
+ goto shortpkt;
+ if (opt.ip6o_type == IP6OPT_PAD1) {
+ ooff++;
+ continue;
+ }
+ if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt),
+ NULL, NULL, AF_INET6))
+ goto shortpkt;
+ if (ooff + sizeof(opt) + opt.ip6o_len > optend)
+ goto drop;
+ switch (opt.ip6o_type) {
+ case IP6OPT_JUMBO:
+ if (h->ip6_plen != 0)
+ goto drop;
+ if (!pf_pull_hdr(m, ooff, &jumbo,
+ sizeof(jumbo), NULL, NULL,
+ AF_INET6))
+ goto shortpkt;
+ memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
+ sizeof(jumbolen));
+ jumbolen = ntohl(jumbolen);
+ if (jumbolen <= IPV6_MAXPACKET)
+ goto drop;
+ if (sizeof(struct ip6_hdr) + jumbolen !=
+ m->m_pkthdr.len)
+ goto drop;
+ break;
+ default:
+ break;
+ }
+ ooff += sizeof(opt) + opt.ip6o_len;
+ } while (ooff < optend);
+
+ off = optend;
+ proto = ext.ip6e_nxt;
+ break;
+ default:
+ terminal = 1;
+ break;
+ }
+ } while (!terminal);
+
+ /* jumbo payload option must be present, or plen > 0 */
+ if (ntohs(h->ip6_plen) == 0)
+ plen = jumbolen;
+ else
+ plen = ntohs(h->ip6_plen);
+ if (plen == 0)
+ goto drop;
+ if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
+ goto shortpkt;
+
+ pf_scrub_ip6(&m, r->min_ttl);
+
+ return (PF_PASS);
+
+ fragment:
+ /* Jumbo payload packets cannot be fragmented. */
+ plen = ntohs(h->ip6_plen);
+ if (plen == 0 || jumbolen)
+ goto drop;
+ if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
+ goto shortpkt;
+
+ if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6))
+ goto shortpkt;
+
+ /* Offset now points to data portion. */
+ off += sizeof(frag);
+
+ /* Returns PF_DROP or *m0 is NULL or completely reassembled mbuf. */
+ if (pf_reassemble6(m0, h, &frag, off, extoff, reason) != PF_PASS)
+ return (PF_DROP);
+ m = *m0;
+ if (m == NULL)
+ return (PF_DROP);
+
+ pd->flags |= PFDESC_IP_REAS;
+ return (PF_PASS);
+
+ shortpkt:
+ REASON_SET(reason, PFRES_SHORT);
+ if (r != NULL && r->log)
+ PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
+ 1);
+ return (PF_DROP);
+
+ drop:
+ REASON_SET(reason, PFRES_NORM);
+ if (r != NULL && r->log)
+ PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
+ 1);
+ return (PF_DROP);
+}
+#endif /* INET6 */
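The extension-header walk above advances by (ip6e_len + 2) * 4 bytes for AH and (ip6e_len + 1) * 8 bytes for the other chained headers, matching how RFC 4302 and RFC 8200 encode their length fields. A tiny sketch with assumed length values:

#include <stdio.h>

int
main(void)
{
	unsigned ah_len = 4;	/* assumed ip6e_len of an AH header */
	unsigned dst_len = 1;	/* assumed ip6e_len of a dstopts header */

	/* AH counts 32-bit words minus two... */
	printf("AH advances %u bytes\n", (ah_len + 2) * 4);
	/* ...other extension headers count 8-octet units minus one. */
	printf("dstopts advances %u bytes\n", (dst_len + 1) * 8);
	return (0);
}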
+
+int
+pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff,
+ int off, void *h, struct pf_pdesc *pd)
+{
+ struct pf_rule *r, *rm = NULL;
+ struct tcphdr *th = pd->hdr.tcp;
+ int rewrite = 0;
+ u_short reason;
+ u_int8_t flags;
+ sa_family_t af = pd->af;
+
+ PF_RULES_RASSERT();
+
+ r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
+ while (r != NULL) {
+ r->evaluations++;
+ if (pfi_kif_match(r->kif, kif) == r->ifnot)
+ r = r->skip[PF_SKIP_IFP].ptr;
+ else if (r->direction && r->direction != dir)
+ r = r->skip[PF_SKIP_DIR].ptr;
+ else if (r->af && r->af != af)
+ r = r->skip[PF_SKIP_AF].ptr;
+ else if (r->proto && r->proto != pd->proto)
+ r = r->skip[PF_SKIP_PROTO].ptr;
+ else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
+ r->src.neg, kif, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_SRC_ADDR].ptr;
+ else if (r->src.port_op && !pf_match_port(r->src.port_op,
+ r->src.port[0], r->src.port[1], th->th_sport))
+ r = r->skip[PF_SKIP_SRC_PORT].ptr;
+ else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
+ r->dst.neg, NULL, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_DST_ADDR].ptr;
+ else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
+ r->dst.port[0], r->dst.port[1], th->th_dport))
+ r = r->skip[PF_SKIP_DST_PORT].ptr;
+ else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
+ pf_osfp_fingerprint(pd, m, off, th),
+ r->os_fingerprint))
+ r = TAILQ_NEXT(r, entries);
+ else {
+ rm = r;
+ break;
+ }
+ }
+
+ if (rm == NULL || rm->action == PF_NOSCRUB)
+ return (PF_PASS);
+ else {
+ r->packets[dir == PF_OUT]++;
+ r->bytes[dir == PF_OUT] += pd->tot_len;
+ }
+
+ if (rm->rule_flag & PFRULE_REASSEMBLE_TCP)
+ pd->flags |= PFDESC_TCP_NORM;
+
+ flags = th->th_flags;
+ if (flags & TH_SYN) {
+ /* Illegal packet */
+ if (flags & TH_RST)
+ goto tcp_drop;
+
+ if (flags & TH_FIN)
+ goto tcp_drop;
+ } else {
+ /* Illegal packet */
+ if (!(flags & (TH_ACK|TH_RST)))
+ goto tcp_drop;
+ }
+
+ if (!(flags & TH_ACK)) {
+ /* These flags are only valid if ACK is set */
+ if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG))
+ goto tcp_drop;
+ }
+
+ /* Check for illegal header length */
+ if (th->th_off < (sizeof(struct tcphdr) >> 2))
+ goto tcp_drop;
+
+ /* If flags changed, or reserved data set, then adjust */
+ if (flags != th->th_flags || th->th_x2 != 0) {
+ u_int16_t ov, nv;
+
+ ov = *(u_int16_t *)(&th->th_ack + 1);
+ th->th_flags = flags;
+ th->th_x2 = 0;
+ nv = *(u_int16_t *)(&th->th_ack + 1);
+
+ th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, ov, nv, 0);
+ rewrite = 1;
+ }
+
+ /* Remove urgent pointer, if TH_URG is not set */
+ if (!(flags & TH_URG) && th->th_urp) {
+ th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, th->th_urp,
+ 0, 0);
+ th->th_urp = 0;
+ rewrite = 1;
+ }
+
+ /* Process options */
+ if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af))
+ rewrite = 1;
+
+ /* copy back packet headers if we sanitized */
+ if (rewrite)
+ m_copyback(m, off, sizeof(*th), (caddr_t)th);
+
+ return (PF_PASS);
+
+ tcp_drop:
+ REASON_SET(&reason, PFRES_NORM);
+ if (rm != NULL && r->log)
+ PFLOG_PACKET(kif, m, AF_INET, dir, reason, r, NULL, NULL, pd,
+ 1);
+ return (PF_DROP);
+}
+
+int
+pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd,
+ struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst)
+{
+ u_int32_t tsval, tsecr;
+ u_int8_t hdr[60];
+ u_int8_t *opt;
+
+ KASSERT((src->scrub == NULL),
+ ("pf_normalize_tcp_init: src->scrub != NULL"));
+
+ src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT);
+ if (src->scrub == NULL)
+ return (1);
+
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET: {
+ struct ip *h = mtod(m, struct ip *);
+ src->scrub->pfss_ttl = h->ip_ttl;
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6: {
+ struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
+ src->scrub->pfss_ttl = h->ip6_hlim;
+ break;
+ }
+#endif /* INET6 */
+ }
+
+
+ /*
+ * All normalizations below are only begun if we see the start of
+	 * the connection. They must all set an enabled bit in pfss_flags
+ */
+ if ((th->th_flags & TH_SYN) == 0)
+ return (0);
+
+
+ if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
+ pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
+ /* Diddle with TCP options */
+ int hlen;
+ opt = hdr + sizeof(struct tcphdr);
+ hlen = (th->th_off << 2) - sizeof(struct tcphdr);
+ while (hlen >= TCPOLEN_TIMESTAMP) {
+ switch (*opt) {
+ case TCPOPT_EOL: /* FALLTHROUGH */
+ case TCPOPT_NOP:
+ opt++;
+ hlen--;
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (opt[1] >= TCPOLEN_TIMESTAMP) {
+ src->scrub->pfss_flags |=
+ PFSS_TIMESTAMP;
+ src->scrub->pfss_ts_mod =
+ htonl(arc4random());
+
+ /* note PFSS_PAWS not set yet */
+ memcpy(&tsval, &opt[2],
+ sizeof(u_int32_t));
+ memcpy(&tsecr, &opt[6],
+ sizeof(u_int32_t));
+ src->scrub->pfss_tsval0 = ntohl(tsval);
+ src->scrub->pfss_tsval = ntohl(tsval);
+ src->scrub->pfss_tsecr = ntohl(tsecr);
+ getmicrouptime(&src->scrub->pfss_last);
+ }
+ /* FALLTHROUGH */
+ default:
+ hlen -= MAX(opt[1], 2);
+ opt += MAX(opt[1], 2);
+ break;
+ }
+ }
+ }
+
+ return (0);
+}
+
+void
+pf_normalize_tcp_cleanup(struct pf_state *state)
+{
+ if (state->src.scrub)
+ uma_zfree(V_pf_state_scrub_z, state->src.scrub);
+ if (state->dst.scrub)
+ uma_zfree(V_pf_state_scrub_z, state->dst.scrub);
+
+ /* Someday... flush the TCP segment reassembly descriptors. */
+}
+
+int
+pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd,
+ u_short *reason, struct tcphdr *th, struct pf_state *state,
+ struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
+{
+ struct timeval uptime;
+ u_int32_t tsval, tsecr;
+ u_int tsval_from_last;
+ u_int8_t hdr[60];
+ u_int8_t *opt;
+ int copyback = 0;
+ int got_ts = 0;
+
+ KASSERT((src->scrub || dst->scrub),
+ ("%s: src->scrub && dst->scrub!", __func__));
+
+ /*
+ * Enforce the minimum TTL seen for this connection. Negate a common
+ * technique to evade an intrusion detection system and confuse
+ * firewall state code.
+ */
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET: {
+ if (src->scrub) {
+ struct ip *h = mtod(m, struct ip *);
+ if (h->ip_ttl > src->scrub->pfss_ttl)
+ src->scrub->pfss_ttl = h->ip_ttl;
+ h->ip_ttl = src->scrub->pfss_ttl;
+ }
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6: {
+ if (src->scrub) {
+ struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
+ if (h->ip6_hlim > src->scrub->pfss_ttl)
+ src->scrub->pfss_ttl = h->ip6_hlim;
+ h->ip6_hlim = src->scrub->pfss_ttl;
+ }
+ break;
+ }
+#endif /* INET6 */
+ }
+
+ if (th->th_off > (sizeof(struct tcphdr) >> 2) &&
+ ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
+ (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
+ pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
+ /* Diddle with TCP options */
+ int hlen;
+ opt = hdr + sizeof(struct tcphdr);
+ hlen = (th->th_off << 2) - sizeof(struct tcphdr);
+ while (hlen >= TCPOLEN_TIMESTAMP) {
+ switch (*opt) {
+ case TCPOPT_EOL: /* FALLTHROUGH */
+ case TCPOPT_NOP:
+ opt++;
+ hlen--;
+ break;
+ case TCPOPT_TIMESTAMP:
+ /* Modulate the timestamps. Can be used for
+ * NAT detection, OS uptime determination or
+ * reboot detection.
+ */
+
+ if (got_ts) {
+ /* Huh? Multiple timestamps!? */
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ DPFPRINTF(("multiple TS??"));
+ pf_print_state(state);
+ printf("\n");
+ }
+ REASON_SET(reason, PFRES_TS);
+ return (PF_DROP);
+ }
+ if (opt[1] >= TCPOLEN_TIMESTAMP) {
+ memcpy(&tsval, &opt[2],
+ sizeof(u_int32_t));
+ if (tsval && src->scrub &&
+ (src->scrub->pfss_flags &
+ PFSS_TIMESTAMP)) {
+ tsval = ntohl(tsval);
+ pf_change_proto_a(m, &opt[2],
+ &th->th_sum,
+ htonl(tsval +
+ src->scrub->pfss_ts_mod),
+ 0);
+ copyback = 1;
+ }
+
+ /* Modulate TS reply iff valid (!0) */
+ memcpy(&tsecr, &opt[6],
+ sizeof(u_int32_t));
+ if (tsecr && dst->scrub &&
+ (dst->scrub->pfss_flags &
+ PFSS_TIMESTAMP)) {
+ tsecr = ntohl(tsecr)
+ - dst->scrub->pfss_ts_mod;
+ pf_change_proto_a(m, &opt[6],
+ &th->th_sum, htonl(tsecr),
+ 0);
+ copyback = 1;
+ }
+ got_ts = 1;
+ }
+ /* FALLTHROUGH */
+ default:
+ hlen -= MAX(opt[1], 2);
+ opt += MAX(opt[1], 2);
+ break;
+ }
+ }
+ if (copyback) {
+			/* Copyback the options, caller copies back header */
+ *writeback = 1;
+ m_copyback(m, off + sizeof(struct tcphdr),
+ (th->th_off << 2) - sizeof(struct tcphdr), hdr +
+ sizeof(struct tcphdr));
+ }
+ }
+
+
+ /*
+ * Must invalidate PAWS checks on connections idle for too long.
+ * The fastest allowed timestamp clock is 1ms. That turns out to
+ * be about 24 days before it wraps. XXX Right now our lowerbound
+ * TS echo check only works for the first 12 days of a connection
+ * when the TS has exhausted half its 32bit space
+ */
+#define TS_MAX_IDLE (24*24*60*60)
+#define TS_MAX_CONN (12*24*60*60) /* XXX remove when better tsecr check */
+
+ getmicrouptime(&uptime);
+ if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
+ (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
+ time_uptime - state->creation > TS_MAX_CONN)) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ DPFPRINTF(("src idled out of PAWS\n"));
+ pf_print_state(state);
+ printf("\n");
+ }
+ src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
+ | PFSS_PAWS_IDLED;
+ }
+ if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
+ uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ DPFPRINTF(("dst idled out of PAWS\n"));
+ pf_print_state(state);
+ printf("\n");
+ }
+ dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
+ | PFSS_PAWS_IDLED;
+ }
+
+ if (got_ts && src->scrub && dst->scrub &&
+ (src->scrub->pfss_flags & PFSS_PAWS) &&
+ (dst->scrub->pfss_flags & PFSS_PAWS)) {
+ /* Validate that the timestamps are "in-window".
+ * RFC1323 describes TCP Timestamp options that allow
+ * measurement of RTT (round trip time) and PAWS
+ * (protection against wrapped sequence numbers). PAWS
+ * gives us a set of rules for rejecting packets on
+ * long fat pipes (packets that were somehow delayed
+ * in transit longer than the time it took to send the
+ * full TCP sequence space of 4Gb). We can use these
+ * rules and infer a few others that will let us treat
+ * the 32bit timestamp and the 32bit echoed timestamp
+ * as sequence numbers to prevent a blind attacker from
+ * inserting packets into a connection.
+ *
+ * RFC1323 tells us:
+ * - The timestamp on this packet must be greater than
+ * or equal to the last value echoed by the other
+ * endpoint. The RFC says those will be discarded
+ * since it is a dup that has already been acked.
+ * This gives us a lowerbound on the timestamp.
+ * timestamp >= other last echoed timestamp
+ * - The timestamp will be less than or equal to
+ * the last timestamp plus the time between the
+ * last packet and now. The RFC defines the max
+ * clock rate as 1ms. We will allow clocks to be
+ * up to 10% fast and will allow a total difference
+		 * of 30 seconds due to a route change. And this
+ * gives us an upperbound on the timestamp.
+ * timestamp <= last timestamp + max ticks
+ * We have to be careful here. Windows will send an
+ * initial timestamp of zero and then initialize it
+ * to a random value after the 3whs; presumably to
+ * avoid a DoS by having to call an expensive RNG
+ * during a SYN flood. Proof MS has at least one
+ * good security geek.
+ *
+ * - The TCP timestamp option must also echo the other
+		 * endpoint's timestamp. The timestamp echoed is the
+ * one carried on the earliest unacknowledged segment
+ * on the left edge of the sequence window. The RFC
+ * states that the host will reject any echoed
+ * timestamps that were larger than any ever sent.
+ * This gives us an upperbound on the TS echo.
+ * tescr <= largest_tsval
+ * - The lowerbound on the TS echo is a little more
+ * tricky to determine. The other endpoint's echoed
+ * values will not decrease. But there may be
+ * network conditions that re-order packets and
+ * cause our view of them to decrease. For now the
+ * only lowerbound we can safely determine is that
+ * the TS echo will never be less than the original
+ * TS. XXX There is probably a better lowerbound.
+ * Remove TS_MAX_CONN with better lowerbound check.
+ * tescr >= other original TS
+ *
+ * It is also important to note that the fastest
+ * timestamp clock of 1ms will wrap its 32bit space in
+ * 24 days. So we just disable TS checking after 24
+ * days of idle time. We actually must use a 12d
+ * connection limit until we can come up with a better
+ * lowerbound to the TS echo check.
+ */
+ struct timeval delta_ts;
+ int ts_fudge;
+
+
+ /*
+ * PFTM_TS_DIFF is how many seconds of leeway to allow
+ * a host's timestamp. This can happen if the previous
+ * packet got delayed in transit for much longer than
+ * this packet.
+ */
+ if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
+ ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF];
+
+ /* Calculate max ticks since the last timestamp */
+#define TS_MAXFREQ 1100 /* RFC max TS freq of 1Khz + 10% skew */
+#define TS_MICROSECS 1000000 /* microseconds per second */
+ delta_ts = uptime;
+ timevalsub(&delta_ts, &src->scrub->pfss_last);
+ tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
+ tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);
+
+ if ((src->state >= TCPS_ESTABLISHED &&
+ dst->state >= TCPS_ESTABLISHED) &&
+ (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
+ SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
+ (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
+ SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
+ /* Bad RFC1323 implementation or an insertion attack.
+ *
+ * - Solaris 2.6 and 2.7 are known to send another ACK
+ * after the FIN,FIN|ACK,ACK closing that carries
+ * an old timestamp.
+ */
+
+ DPFPRINTF(("Timestamp failed %c%c%c%c\n",
+ SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
+ SEQ_GT(tsval, src->scrub->pfss_tsval +
+ tsval_from_last) ? '1' : ' ',
+ SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
+ SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
+ DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u "
+ "idle: %jus %lums\n",
+ tsval, tsecr, tsval_from_last,
+ (uintmax_t)delta_ts.tv_sec,
+ delta_ts.tv_usec / 1000));
+ DPFPRINTF((" src->tsval: %u tsecr: %u\n",
+ src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
+ DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u"
+ "\n", dst->scrub->pfss_tsval,
+ dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ pf_print_state(state);
+ pf_print_flags(th->th_flags);
+ printf("\n");
+ }
+ REASON_SET(reason, PFRES_TS);
+ return (PF_DROP);
+ }
+
+ /* XXX I'd really like to require tsecr but it's optional */
+
+ } else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
+ ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
+ || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
+ src->scrub && dst->scrub &&
+ (src->scrub->pfss_flags & PFSS_PAWS) &&
+ (dst->scrub->pfss_flags & PFSS_PAWS)) {
+ /* Didn't send a timestamp. Timestamps aren't really useful
+ * when:
+ * - connection opening or closing (often not even sent).
+		 * but we must not let an attacker put a FIN on a
+ * data packet to sneak it through our ESTABLISHED check.
+ * - on a TCP reset. RFC suggests not even looking at TS.
+ * - on an empty ACK. The TS will not be echoed so it will
+ * probably not help keep the RTT calculation in sync and
+ * there isn't as much danger when the sequence numbers
+ * got wrapped. So some stacks don't include TS on empty
+ * ACKs :-(
+ *
+ * To minimize the disruption to mostly RFC1323 conformant
+ * stacks, we will only require timestamps on data packets.
+ *
+ * And what do ya know, we cannot require timestamps on data
+ * packets. There appear to be devices that do legitimate
+ * TCP connection hijacking. There are HTTP devices that allow
+ * a 3whs (with timestamps) and then buffer the HTTP request.
+ * If the intermediate device has the HTTP response cache, it
+ * will spoof the response but not bother timestamping its
+ * packets. So we can look for the presence of a timestamp in
+ * the first data packet and if there, require it in all future
+ * packets.
+ */
+
+ if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
+ /*
+ * Hey! Someone tried to sneak a packet in. Or the
+ * stack changed its RFC1323 behavior?!?!
+ */
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ DPFPRINTF(("Did not receive expected RFC1323 "
+ "timestamp\n"));
+ pf_print_state(state);
+ pf_print_flags(th->th_flags);
+ printf("\n");
+ }
+ REASON_SET(reason, PFRES_TS);
+ return (PF_DROP);
+ }
+ }
+
+
+ /*
+ * We will note if a host sends his data packets with or without
+ * timestamps. And require all data packets to contain a timestamp
+ * if the first does. PAWS implicitly requires that all data packets be
+ * timestamped. But I think there are middle-man devices that hijack
+ * TCP streams immediately after the 3whs and don't timestamp their
+ * packets (seen in a WWW accelerator or cache).
+ */
+ if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
+ (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
+ if (got_ts)
+ src->scrub->pfss_flags |= PFSS_DATA_TS;
+ else {
+ src->scrub->pfss_flags |= PFSS_DATA_NOTS;
+ if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
+ (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
+ /* Don't warn if other host rejected RFC1323 */
+ DPFPRINTF(("Broken RFC1323 stack did not "
+ "timestamp data packet. Disabled PAWS "
+ "security.\n"));
+ pf_print_state(state);
+ pf_print_flags(th->th_flags);
+ printf("\n");
+ }
+ }
+ }
+
+
+ /*
+ * Update PAWS values
+ */
+ if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
+ (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) {
+ getmicrouptime(&src->scrub->pfss_last);
+ if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
+ (src->scrub->pfss_flags & PFSS_PAWS) == 0)
+ src->scrub->pfss_tsval = tsval;
+
+ if (tsecr) {
+ if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
+ (src->scrub->pfss_flags & PFSS_PAWS) == 0)
+ src->scrub->pfss_tsecr = tsecr;
+
+ if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
+ (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
+ src->scrub->pfss_tsval0 == 0)) {
+ /* tsval0 MUST be the lowest timestamp */
+ src->scrub->pfss_tsval0 = tsval;
+ }
+
+ /* Only fully initialized after a TS gets echoed */
+ if ((src->scrub->pfss_flags & PFSS_PAWS) == 0)
+ src->scrub->pfss_flags |= PFSS_PAWS;
+ }
+ }
+
+ /* I have a dream.... TCP segment reassembly.... */
+ return (0);
+}
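The PAWS upper bound above allows the peer's timestamp to advance by at most TS_MAXFREQ ticks per second of observed idle time, plus a per-rule fudge. A standalone rendering of that arithmetic with assumed idle time and fudge (the real values come from pfss_last and PFTM_TS_DIFF):

#include <stdio.h>

#define TS_MAXFREQ	1100		/* 1 kHz clock + 10% skew, as above */
#define TS_MICROSECS	1000000

int
main(void)
{
	long idle_sec = 3, idle_usec = 250000;	/* assumed idle interval */
	int ts_fudge = 30;			/* assumed PFTM_TS_DIFF seconds */
	unsigned tsval_from_last;

	tsval_from_last = (idle_sec + ts_fudge) * TS_MAXFREQ;
	tsval_from_last += idle_usec / (TS_MICROSECS / TS_MAXFREQ);

	/* tsval may exceed the last seen value by at most this many ticks. */
	printf("upper bound: %u ticks\n", tsval_from_last);
	return (0);
}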
+
+static int
+pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th,
+ int off, sa_family_t af)
+{
+ u_int16_t *mss;
+ int thoff;
+ int opt, cnt, optlen = 0;
+ int rewrite = 0;
+ u_char opts[TCP_MAXOLEN];
+ u_char *optp = opts;
+
+ thoff = th->th_off << 2;
+ cnt = thoff - sizeof(struct tcphdr);
+
+ if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt,
+ NULL, NULL, af))
+ return (rewrite);
+
+ for (; cnt > 0; cnt -= optlen, optp += optlen) {
+ opt = optp[0];
+ if (opt == TCPOPT_EOL)
+ break;
+ if (opt == TCPOPT_NOP)
+ optlen = 1;
+ else {
+ if (cnt < 2)
+ break;
+ optlen = optp[1];
+ if (optlen < 2 || optlen > cnt)
+ break;
+ }
+ switch (opt) {
+ case TCPOPT_MAXSEG:
+ mss = (u_int16_t *)(optp + 2);
+ if ((ntohs(*mss)) > r->max_mss) {
+ th->th_sum = pf_proto_cksum_fixup(m,
+ th->th_sum, *mss, htons(r->max_mss), 0);
+ *mss = htons(r->max_mss);
+ rewrite = 1;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (rewrite)
+ m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts);
+
+ return (rewrite);
+}
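pf_normalize_tcpopt() above clamps an advertised MSS that exceeds the rule's max-mss and fixes up the TCP checksum via pf_proto_cksum_fixup(). A userland sketch of just the clamp decision, with assumed values and the checksum update omitted:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t max_mss = 1400;	/* assumed r->max_mss */
	uint16_t mss = htons(1460);	/* assumed option value, wire order */

	if (ntohs(mss) > max_mss) {
		/* The kernel also adjusts th_sum here. */
		mss = htons(max_mss);
		printf("clamped MSS to %u\n", ntohs(mss));
	}
	return (0);
}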
+
+#ifdef INET
+static void
+pf_scrub_ip(struct mbuf **m0, u_int32_t flags, u_int8_t min_ttl, u_int8_t tos)
+{
+ struct mbuf *m = *m0;
+ struct ip *h = mtod(m, struct ip *);
+
+ /* Clear IP_DF if no-df was requested */
+ if (flags & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
+ u_int16_t ip_off = h->ip_off;
+
+ h->ip_off &= htons(~IP_DF);
+ h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
+ }
+
+ /* Enforce a minimum ttl, may cause endless packet loops */
+ if (min_ttl && h->ip_ttl < min_ttl) {
+ u_int16_t ip_ttl = h->ip_ttl;
+
+ h->ip_ttl = min_ttl;
+ h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
+ }
+
+ /* Enforce tos */
+ if (flags & PFRULE_SET_TOS) {
+ u_int16_t ov, nv;
+
+ ov = *(u_int16_t *)h;
+ h->ip_tos = tos;
+ nv = *(u_int16_t *)h;
+
+ h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0);
+ }
+
+ /* random-id, but not for fragments */
+ if (flags & PFRULE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) {
+ uint16_t ip_id = h->ip_id;
+
+ ip_fillid(h);
+ h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0);
+ }
+}
+#endif /* INET */
+
+#ifdef INET6
+static void
+pf_scrub_ip6(struct mbuf **m0, u_int8_t min_ttl)
+{
+ struct mbuf *m = *m0;
+ struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
+
+ /* Enforce a minimum ttl, may cause endless packet loops */
+ if (min_ttl && h->ip6_hlim < min_ttl)
+ h->ip6_hlim = min_ttl;
+}
+#endif
diff --git a/freebsd/sys/netpfil/pf/pf_osfp.c b/freebsd/sys/netpfil/pf/pf_osfp.c
new file mode 100644
index 00000000..33bef4c8
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/pf_osfp.c
@@ -0,0 +1,530 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2003 Mike Frantzen <frantzen@w4g.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * $OpenBSD: pf_osfp.c,v 1.14 2008/06/12 18:17:01 henning Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+
+#include <net/if.h>
+#include <net/vnet.h>
+#include <net/pfvar.h>
+
+#include <netinet/ip6.h>
+
+static MALLOC_DEFINE(M_PFOSFP, "pf_osfp", "pf(4) operating system fingerprints");
+#define DPFPRINTF(format, x...) \
+ if (V_pf_status.debug >= PF_DEBUG_NOISY) \
+ printf(format , ##x)
+
+SLIST_HEAD(pf_osfp_list, pf_os_fingerprint);
+static VNET_DEFINE(struct pf_osfp_list, pf_osfp_list) =
+ SLIST_HEAD_INITIALIZER();
+#define V_pf_osfp_list VNET(pf_osfp_list)
+
+static struct pf_osfp_enlist *pf_osfp_fingerprint_hdr(const struct ip *,
+ const struct ip6_hdr *,
+ const struct tcphdr *);
+static struct pf_os_fingerprint *pf_osfp_find(struct pf_osfp_list *,
+ struct pf_os_fingerprint *, u_int8_t);
+static struct pf_os_fingerprint *pf_osfp_find_exact(struct pf_osfp_list *,
+ struct pf_os_fingerprint *);
+static void pf_osfp_insert(struct pf_osfp_list *,
+ struct pf_os_fingerprint *);
+#ifdef PFDEBUG
+static struct pf_os_fingerprint *pf_osfp_validate(void);
+#endif
+
+/*
+ * Passively fingerprint the OS of the host (IPv4 TCP SYN packets only)
+ * Returns the list of possible OSes.
+ */
+struct pf_osfp_enlist *
+pf_osfp_fingerprint(struct pf_pdesc *pd, struct mbuf *m, int off,
+ const struct tcphdr *tcp)
+{
+ struct ip *ip;
+ struct ip6_hdr *ip6;
+ char hdr[60];
+
+ if ((pd->af != PF_INET && pd->af != PF_INET6) ||
+ pd->proto != IPPROTO_TCP || (tcp->th_off << 2) < sizeof(*tcp))
+ return (NULL);
+
+ if (pd->af == PF_INET) {
+ ip = mtod(m, struct ip *);
+ ip6 = (struct ip6_hdr *)NULL;
+ } else {
+ ip = (struct ip *)NULL;
+ ip6 = mtod(m, struct ip6_hdr *);
+ }
+ if (!pf_pull_hdr(m, off, hdr, tcp->th_off << 2, NULL, NULL,
+ pd->af)) return (NULL);
+
+ return (pf_osfp_fingerprint_hdr(ip, ip6, (struct tcphdr *)hdr));
+}
+
+static struct pf_osfp_enlist *
+pf_osfp_fingerprint_hdr(const struct ip *ip, const struct ip6_hdr *ip6, const struct tcphdr *tcp)
+{
+ struct pf_os_fingerprint fp, *fpresult;
+ int cnt, optlen = 0;
+ const u_int8_t *optp;
+ char srcname[128];
+
+ if ((tcp->th_flags & (TH_SYN|TH_ACK)) != TH_SYN)
+ return (NULL);
+ if (ip) {
+ if ((ip->ip_off & htons(IP_OFFMASK)) != 0)
+ return (NULL);
+ }
+
+ memset(&fp, 0, sizeof(fp));
+
+ if (ip) {
+ fp.fp_psize = ntohs(ip->ip_len);
+ fp.fp_ttl = ip->ip_ttl;
+ if (ip->ip_off & htons(IP_DF))
+ fp.fp_flags |= PF_OSFP_DF;
+ strlcpy(srcname, inet_ntoa(ip->ip_src), sizeof(srcname));
+ }
+#ifdef INET6
+ else if (ip6) {
+ /* jumbo payload? */
+ fp.fp_psize = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
+ fp.fp_ttl = ip6->ip6_hlim;
+ fp.fp_flags |= PF_OSFP_DF;
+ fp.fp_flags |= PF_OSFP_INET6;
+ strlcpy(srcname, ip6_sprintf((struct in6_addr *)&ip6->ip6_src),
+ sizeof(srcname));
+ }
+#endif
+ else
+ return (NULL);
+ fp.fp_wsize = ntohs(tcp->th_win);
+
+
+ cnt = (tcp->th_off << 2) - sizeof(*tcp);
+ optp = (const u_int8_t *)((const char *)tcp + sizeof(*tcp));
+ for (; cnt > 0; cnt -= optlen, optp += optlen) {
+ if (*optp == TCPOPT_EOL)
+ break;
+
+ fp.fp_optcnt++;
+ if (*optp == TCPOPT_NOP) {
+ fp.fp_tcpopts = (fp.fp_tcpopts << PF_OSFP_TCPOPT_BITS) |
+ PF_OSFP_TCPOPT_NOP;
+ optlen = 1;
+ } else {
+ if (cnt < 2)
+ return (NULL);
+ optlen = optp[1];
+ if (optlen > cnt || optlen < 2)
+ return (NULL);
+ switch (*optp) {
+ case TCPOPT_MAXSEG:
+ if (optlen >= TCPOLEN_MAXSEG)
+ memcpy(&fp.fp_mss, &optp[2],
+ sizeof(fp.fp_mss));
+ fp.fp_tcpopts = (fp.fp_tcpopts <<
+ PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_MSS;
+ NTOHS(fp.fp_mss);
+ break;
+ case TCPOPT_WINDOW:
+ if (optlen >= TCPOLEN_WINDOW)
+ memcpy(&fp.fp_wscale, &optp[2],
+ sizeof(fp.fp_wscale));
+ NTOHS(fp.fp_wscale);
+ fp.fp_tcpopts = (fp.fp_tcpopts <<
+ PF_OSFP_TCPOPT_BITS) |
+ PF_OSFP_TCPOPT_WSCALE;
+ break;
+ case TCPOPT_SACK_PERMITTED:
+ fp.fp_tcpopts = (fp.fp_tcpopts <<
+ PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_SACK;
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (optlen >= TCPOLEN_TIMESTAMP) {
+ u_int32_t ts;
+ memcpy(&ts, &optp[2], sizeof(ts));
+ if (ts == 0)
+ fp.fp_flags |= PF_OSFP_TS0;
+
+ }
+ fp.fp_tcpopts = (fp.fp_tcpopts <<
+ PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_TS;
+ break;
+ default:
+ return (NULL);
+ }
+ }
+ optlen = MAX(optlen, 1); /* paranoia */
+ }
+
+ DPFPRINTF("fingerprinted %s:%d %d:%d:%d:%d:%llx (%d) "
+ "(TS=%s,M=%s%d,W=%s%d)\n",
+ srcname, ntohs(tcp->th_sport),
+ fp.fp_wsize, fp.fp_ttl, (fp.fp_flags & PF_OSFP_DF) != 0,
+ fp.fp_psize, (long long int)fp.fp_tcpopts, fp.fp_optcnt,
+ (fp.fp_flags & PF_OSFP_TS0) ? "0" : "",
+ (fp.fp_flags & PF_OSFP_MSS_MOD) ? "%" :
+ (fp.fp_flags & PF_OSFP_MSS_DC) ? "*" : "",
+ fp.fp_mss,
+ (fp.fp_flags & PF_OSFP_WSCALE_MOD) ? "%" :
+ (fp.fp_flags & PF_OSFP_WSCALE_DC) ? "*" : "",
+ fp.fp_wscale);
+
+ if ((fpresult = pf_osfp_find(&V_pf_osfp_list, &fp,
+ PF_OSFP_MAXTTL_OFFSET)))
+ return (&fpresult->fp_oses);
+ return (NULL);
+}
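
Editorial note: the option-parsing loop above condenses the order of the TCP options into a single integer; each option seen shifts fp_tcpopts left by PF_OSFP_TCPOPT_BITS and ORs in that option's code, so two hosts produce the same signature only if they emit the same options in the same order. Below is a minimal userland sketch of that packing scheme; the constants are illustrative placeholders, not the values defined in pf.h.

#include <stdio.h>
#include <stdint.h>

/* Illustrative encoding constants; the real ones live in pf.h. */
#define OSFP_TCPOPT_BITS	3
#define OSFP_TCPOPT_NOP		1
#define OSFP_TCPOPT_MSS		2
#define OSFP_TCPOPT_WSCALE	3
#define OSFP_TCPOPT_SACK	4
#define OSFP_TCPOPT_TS		5

int
main(void)
{
	/* TCP options in the order they appear in a typical SYN. */
	int opts[] = { OSFP_TCPOPT_MSS, OSFP_TCPOPT_NOP, OSFP_TCPOPT_WSCALE,
	    OSFP_TCPOPT_SACK, OSFP_TCPOPT_TS };
	uint64_t packed = 0;
	size_t i;

	for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++)
		packed = (packed << OSFP_TCPOPT_BITS) | (uint64_t)opts[i];

	/* A different option order or option set yields a different value. */
	printf("packed option signature: 0x%llx\n",
	    (unsigned long long)packed);
	return (0);
}

The packed value is what pf_osfp_find() later compares verbatim (f->fp_tcpopts != find->fp_tcpopts).
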
+
+/* Match a fingerprint ID against a list of OSes */
+int
+pf_osfp_match(struct pf_osfp_enlist *list, pf_osfp_t os)
+{
+ struct pf_osfp_entry *entry;
+ int os_class, os_version, os_subtype;
+ int en_class, en_version, en_subtype;
+
+ if (os == PF_OSFP_ANY)
+ return (1);
+ if (list == NULL) {
+ DPFPRINTF("osfp no match against %x\n", os);
+ return (os == PF_OSFP_UNKNOWN);
+ }
+ PF_OSFP_UNPACK(os, os_class, os_version, os_subtype);
+ SLIST_FOREACH(entry, list, fp_entry) {
+ PF_OSFP_UNPACK(entry->fp_os, en_class, en_version, en_subtype);
+ if ((os_class == PF_OSFP_ANY || en_class == os_class) &&
+ (os_version == PF_OSFP_ANY || en_version == os_version) &&
+ (os_subtype == PF_OSFP_ANY || en_subtype == os_subtype)) {
+ DPFPRINTF("osfp matched %s %s %s %x==%x\n",
+ entry->fp_class_nm, entry->fp_version_nm,
+ entry->fp_subtype_nm, os, entry->fp_os);
+ return (1);
+ }
+ }
+ DPFPRINTF("fingerprint 0x%x didn't match\n", os);
+ return (0);
+}
+
+/* Flush the fingerprint list */
+void
+pf_osfp_flush(void)
+{
+ struct pf_os_fingerprint *fp;
+ struct pf_osfp_entry *entry;
+
+ while ((fp = SLIST_FIRST(&V_pf_osfp_list))) {
+ SLIST_REMOVE_HEAD(&V_pf_osfp_list, fp_next);
+ while ((entry = SLIST_FIRST(&fp->fp_oses))) {
+ SLIST_REMOVE_HEAD(&fp->fp_oses, fp_entry);
+ free(entry, M_PFOSFP);
+ }
+ free(fp, M_PFOSFP);
+ }
+}
+
+
+/* Add a fingerprint */
+int
+pf_osfp_add(struct pf_osfp_ioctl *fpioc)
+{
+ struct pf_os_fingerprint *fp, fpadd;
+ struct pf_osfp_entry *entry;
+
+ PF_RULES_WASSERT();
+
+ memset(&fpadd, 0, sizeof(fpadd));
+ fpadd.fp_tcpopts = fpioc->fp_tcpopts;
+ fpadd.fp_wsize = fpioc->fp_wsize;
+ fpadd.fp_psize = fpioc->fp_psize;
+ fpadd.fp_mss = fpioc->fp_mss;
+ fpadd.fp_flags = fpioc->fp_flags;
+ fpadd.fp_optcnt = fpioc->fp_optcnt;
+ fpadd.fp_wscale = fpioc->fp_wscale;
+ fpadd.fp_ttl = fpioc->fp_ttl;
+
+#if 0 /* XXX RYAN wants to fix logging */
+ DPFPRINTF("adding osfp %s %s %s = %s%d:%d:%d:%s%d:0x%llx %d "
+ "(TS=%s,M=%s%d,W=%s%d) %x\n",
+ fpioc->fp_os.fp_class_nm, fpioc->fp_os.fp_version_nm,
+ fpioc->fp_os.fp_subtype_nm,
+ (fpadd.fp_flags & PF_OSFP_WSIZE_MOD) ? "%" :
+ (fpadd.fp_flags & PF_OSFP_WSIZE_MSS) ? "S" :
+ (fpadd.fp_flags & PF_OSFP_WSIZE_MTU) ? "T" :
+ (fpadd.fp_flags & PF_OSFP_WSIZE_DC) ? "*" : "",
+ fpadd.fp_wsize,
+ fpadd.fp_ttl,
+ (fpadd.fp_flags & PF_OSFP_DF) ? 1 : 0,
+ (fpadd.fp_flags & PF_OSFP_PSIZE_MOD) ? "%" :
+ (fpadd.fp_flags & PF_OSFP_PSIZE_DC) ? "*" : "",
+ fpadd.fp_psize,
+ (long long int)fpadd.fp_tcpopts, fpadd.fp_optcnt,
+ (fpadd.fp_flags & PF_OSFP_TS0) ? "0" : "",
+ (fpadd.fp_flags & PF_OSFP_MSS_MOD) ? "%" :
+ (fpadd.fp_flags & PF_OSFP_MSS_DC) ? "*" : "",
+ fpadd.fp_mss,
+ (fpadd.fp_flags & PF_OSFP_WSCALE_MOD) ? "%" :
+ (fpadd.fp_flags & PF_OSFP_WSCALE_DC) ? "*" : "",
+ fpadd.fp_wscale,
+ fpioc->fp_os.fp_os);
+#endif
+
+ if ((fp = pf_osfp_find_exact(&V_pf_osfp_list, &fpadd))) {
+ SLIST_FOREACH(entry, &fp->fp_oses, fp_entry) {
+ if (PF_OSFP_ENTRY_EQ(entry, &fpioc->fp_os))
+ return (EEXIST);
+ }
+ if ((entry = malloc(sizeof(*entry), M_PFOSFP, M_NOWAIT))
+ == NULL)
+ return (ENOMEM);
+ } else {
+ if ((fp = malloc(sizeof(*fp), M_PFOSFP, M_ZERO | M_NOWAIT))
+ == NULL)
+ return (ENOMEM);
+ fp->fp_tcpopts = fpioc->fp_tcpopts;
+ fp->fp_wsize = fpioc->fp_wsize;
+ fp->fp_psize = fpioc->fp_psize;
+ fp->fp_mss = fpioc->fp_mss;
+ fp->fp_flags = fpioc->fp_flags;
+ fp->fp_optcnt = fpioc->fp_optcnt;
+ fp->fp_wscale = fpioc->fp_wscale;
+ fp->fp_ttl = fpioc->fp_ttl;
+ SLIST_INIT(&fp->fp_oses);
+ if ((entry = malloc(sizeof(*entry), M_PFOSFP, M_NOWAIT))
+ == NULL) {
+ free(fp, M_PFOSFP);
+ return (ENOMEM);
+ }
+ pf_osfp_insert(&V_pf_osfp_list, fp);
+ }
+ memcpy(entry, &fpioc->fp_os, sizeof(*entry));
+
+ /* Make sure the strings are NUL terminated */
+ entry->fp_class_nm[sizeof(entry->fp_class_nm)-1] = '\0';
+ entry->fp_version_nm[sizeof(entry->fp_version_nm)-1] = '\0';
+ entry->fp_subtype_nm[sizeof(entry->fp_subtype_nm)-1] = '\0';
+
+ SLIST_INSERT_HEAD(&fp->fp_oses, entry, fp_entry);
+
+#ifdef PFDEBUG
+ if ((fp = pf_osfp_validate()))
+ printf("Invalid fingerprint list\n");
+#endif /* PFDEBUG */
+ return (0);
+}
+
+
+/* Find a fingerprint in the list */
+static struct pf_os_fingerprint *
+pf_osfp_find(struct pf_osfp_list *list, struct pf_os_fingerprint *find,
+ u_int8_t ttldiff)
+{
+ struct pf_os_fingerprint *f;
+
+#define MATCH_INT(_MOD, _DC, _field) \
+ if ((f->fp_flags & _DC) == 0) { \
+ if ((f->fp_flags & _MOD) == 0) { \
+ if (f->_field != find->_field) \
+ continue; \
+ } else { \
+ if (f->_field == 0 || find->_field % f->_field) \
+ continue; \
+ } \
+ }
+
+ SLIST_FOREACH(f, list, fp_next) {
+ if (f->fp_tcpopts != find->fp_tcpopts ||
+ f->fp_optcnt != find->fp_optcnt ||
+ f->fp_ttl < find->fp_ttl ||
+ f->fp_ttl - find->fp_ttl > ttldiff ||
+ (f->fp_flags & (PF_OSFP_DF|PF_OSFP_TS0)) !=
+ (find->fp_flags & (PF_OSFP_DF|PF_OSFP_TS0)))
+ continue;
+
+ MATCH_INT(PF_OSFP_PSIZE_MOD, PF_OSFP_PSIZE_DC, fp_psize)
+ MATCH_INT(PF_OSFP_MSS_MOD, PF_OSFP_MSS_DC, fp_mss)
+ MATCH_INT(PF_OSFP_WSCALE_MOD, PF_OSFP_WSCALE_DC, fp_wscale)
+ if ((f->fp_flags & PF_OSFP_WSIZE_DC) == 0) {
+ if (f->fp_flags & PF_OSFP_WSIZE_MSS) {
+ if (find->fp_mss == 0)
+ continue;
+
+/*
+ * Some "smart" NAT devices and DSL routers will tweak the MSS and
+ * set it to whatever is suitable for the link type.
+ */
+#define SMART_MSS 1460
+ if ((find->fp_wsize % find->fp_mss ||
+ find->fp_wsize / find->fp_mss !=
+ f->fp_wsize) &&
+ (find->fp_wsize % SMART_MSS ||
+ find->fp_wsize / SMART_MSS !=
+ f->fp_wsize))
+ continue;
+ } else if (f->fp_flags & PF_OSFP_WSIZE_MTU) {
+ if (find->fp_mss == 0)
+ continue;
+
+#define MTUOFF (sizeof(struct ip) + sizeof(struct tcphdr))
+#define SMART_MTU (SMART_MSS + MTUOFF)
+ if ((find->fp_wsize % (find->fp_mss + MTUOFF) ||
+ find->fp_wsize / (find->fp_mss + MTUOFF) !=
+ f->fp_wsize) &&
+ (find->fp_wsize % SMART_MTU ||
+ find->fp_wsize / SMART_MTU !=
+ f->fp_wsize))
+ continue;
+ } else if (f->fp_flags & PF_OSFP_WSIZE_MOD) {
+ if (f->fp_wsize == 0 || find->fp_wsize %
+ f->fp_wsize)
+ continue;
+ } else {
+ if (f->fp_wsize != find->fp_wsize)
+ continue;
+ }
+ }
+ return (f);
+ }
+
+ return (NULL);
+}
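
Editorial note: MATCH_INT gives each integer field three comparison modes: exact match by default, a wildcard when the don't-care (_DC) flag is set, and a modulus match when the _MOD flag is set, where the observed value must be a non-zero multiple of the signature value. The window-size check at the bottom extends the same idea by allowing the window to be expressed as a multiple of the MSS or MTU. A standalone sketch of the three-way MATCH_INT test follows; the flag bits are illustrative, not the pf.h values.

#include <stdbool.h>
#include <stdio.h>

#define F_MOD	0x1	/* illustrative flag bits, not the pf.h values */
#define F_DC	0x2

/*
 * Compare one observed value against one signature value under the
 * same MOD/DC semantics MATCH_INT applies in pf_osfp_find().
 */
static bool
match_int(unsigned flags, unsigned sig, unsigned observed)
{
	if (flags & F_DC)		/* "*": don't care, always matches */
		return (true);
	if (flags & F_MOD)		/* "%": must be a multiple of sig */
		return (sig != 0 && observed % sig == 0);
	return (sig == observed);	/* default: exact match */
}

int
main(void)
{
	printf("%d\n", match_int(0, 1460, 1460));	/* 1: exact */
	printf("%d\n", match_int(F_MOD, 512, 1024));	/* 1: multiple of 512 */
	printf("%d\n", match_int(F_MOD, 512, 1000));	/* 0: not a multiple */
	printf("%d\n", match_int(F_DC, 0, 777));	/* 1: wildcard */
	return (0);
}
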
+
+/* Find an exact fingerprint in the list */
+static struct pf_os_fingerprint *
+pf_osfp_find_exact(struct pf_osfp_list *list, struct pf_os_fingerprint *find)
+{
+ struct pf_os_fingerprint *f;
+
+ SLIST_FOREACH(f, list, fp_next) {
+ if (f->fp_tcpopts == find->fp_tcpopts &&
+ f->fp_wsize == find->fp_wsize &&
+ f->fp_psize == find->fp_psize &&
+ f->fp_mss == find->fp_mss &&
+ f->fp_flags == find->fp_flags &&
+ f->fp_optcnt == find->fp_optcnt &&
+ f->fp_wscale == find->fp_wscale &&
+ f->fp_ttl == find->fp_ttl)
+ return (f);
+ }
+
+ return (NULL);
+}
+
+/* Insert a fingerprint into the list */
+static void
+pf_osfp_insert(struct pf_osfp_list *list, struct pf_os_fingerprint *ins)
+{
+ struct pf_os_fingerprint *f, *prev = NULL;
+
+	/* XXX should become semi tree-based; could key on TCP options */
+
+ SLIST_FOREACH(f, list, fp_next)
+ prev = f;
+ if (prev)
+ SLIST_INSERT_AFTER(prev, ins, fp_next);
+ else
+ SLIST_INSERT_HEAD(list, ins, fp_next);
+}
+
+/* Fill a fingerprint by its number (from an ioctl) */
+int
+pf_osfp_get(struct pf_osfp_ioctl *fpioc)
+{
+ struct pf_os_fingerprint *fp;
+ struct pf_osfp_entry *entry;
+ int num = fpioc->fp_getnum;
+ int i = 0;
+
+
+ memset(fpioc, 0, sizeof(*fpioc));
+ SLIST_FOREACH(fp, &V_pf_osfp_list, fp_next) {
+ SLIST_FOREACH(entry, &fp->fp_oses, fp_entry) {
+ if (i++ == num) {
+ fpioc->fp_mss = fp->fp_mss;
+ fpioc->fp_wsize = fp->fp_wsize;
+ fpioc->fp_flags = fp->fp_flags;
+ fpioc->fp_psize = fp->fp_psize;
+ fpioc->fp_ttl = fp->fp_ttl;
+ fpioc->fp_wscale = fp->fp_wscale;
+ fpioc->fp_getnum = num;
+ memcpy(&fpioc->fp_os, entry,
+ sizeof(fpioc->fp_os));
+ return (0);
+ }
+ }
+ }
+
+ return (EBUSY);
+}
+
+
+#ifdef PFDEBUG
+/* Validate that each signature is reachable */
+static struct pf_os_fingerprint *
+pf_osfp_validate(void)
+{
+ struct pf_os_fingerprint *f, *f2, find;
+
+ SLIST_FOREACH(f, &V_pf_osfp_list, fp_next) {
+ memcpy(&find, f, sizeof(find));
+
+ /* We do a few MSS/th_win percolations to make things unique */
+ if (find.fp_mss == 0)
+ find.fp_mss = 128;
+ if (f->fp_flags & PF_OSFP_WSIZE_MSS)
+ find.fp_wsize *= find.fp_mss;
+ else if (f->fp_flags & PF_OSFP_WSIZE_MTU)
+ find.fp_wsize *= (find.fp_mss + 40);
+ else if (f->fp_flags & PF_OSFP_WSIZE_MOD)
+ find.fp_wsize *= 2;
+ if (f != (f2 = pf_osfp_find(&V_pf_osfp_list, &find, 0))) {
+ if (f2)
+ printf("Found \"%s %s %s\" instead of "
+ "\"%s %s %s\"\n",
+ SLIST_FIRST(&f2->fp_oses)->fp_class_nm,
+ SLIST_FIRST(&f2->fp_oses)->fp_version_nm,
+ SLIST_FIRST(&f2->fp_oses)->fp_subtype_nm,
+ SLIST_FIRST(&f->fp_oses)->fp_class_nm,
+ SLIST_FIRST(&f->fp_oses)->fp_version_nm,
+ SLIST_FIRST(&f->fp_oses)->fp_subtype_nm);
+ else
+ printf("Couldn't find \"%s %s %s\"\n",
+ SLIST_FIRST(&f->fp_oses)->fp_class_nm,
+ SLIST_FIRST(&f->fp_oses)->fp_version_nm,
+ SLIST_FIRST(&f->fp_oses)->fp_subtype_nm);
+ return (f);
+ }
+ }
+ return (NULL);
+}
+#endif /* PFDEBUG */
diff --git a/freebsd/sys/netpfil/pf/pf_ruleset.c b/freebsd/sys/netpfil/pf/pf_ruleset.c
new file mode 100644
index 00000000..e16643aa
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/pf_ruleset.c
@@ -0,0 +1,426 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002,2003 Henning Brauer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ * $OpenBSD: pf_ruleset.c,v 1.2 2008/12/18 15:31:37 dhill Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/socket.h>
+#ifdef _KERNEL
+# include <sys/systm.h>
+# include <sys/refcount.h>
+#endif /* _KERNEL */
+#include <sys/mbuf.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+
+#include <net/if.h>
+#include <net/vnet.h>
+#include <net/pfvar.h>
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif /* INET6 */
+
+
+#ifdef _KERNEL
+#define DPFPRINTF(format, x...) \
+ if (V_pf_status.debug >= PF_DEBUG_NOISY) \
+ printf(format , ##x)
+#define rs_malloc(x) malloc(x, M_TEMP, M_NOWAIT|M_ZERO)
+#define rs_free(x) free(x, M_TEMP)
+
+#else
+/* Userland equivalents so we can lend code to pfctl et al. */
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define rs_malloc(x) calloc(1, x)
+#define rs_free(x) free(x)
+
+#ifdef PFDEBUG
+#include <sys/stdarg.h>
+#define DPFPRINTF(format, x...) fprintf(stderr, format , ##x)
+#else
+#define DPFPRINTF(format, x...) ((void)0)
+#endif /* PFDEBUG */
+#endif /* _KERNEL */
+
+#ifdef _KERNEL
+VNET_DEFINE(struct pf_anchor_global, pf_anchors);
+VNET_DEFINE(struct pf_anchor, pf_main_anchor);
+#else /* ! _KERNEL */
+struct pf_anchor_global pf_anchors;
+struct pf_anchor pf_main_anchor;
+#undef V_pf_anchors
+#define V_pf_anchors pf_anchors
+#undef pf_main_ruleset
+#define pf_main_ruleset pf_main_anchor.ruleset
+#endif /* _KERNEL */
+
+static __inline int pf_anchor_compare(struct pf_anchor *, struct pf_anchor *);
+
+static struct pf_anchor *pf_find_anchor(const char *);
+
+RB_GENERATE(pf_anchor_global, pf_anchor, entry_global, pf_anchor_compare);
+RB_GENERATE(pf_anchor_node, pf_anchor, entry_node, pf_anchor_compare);
+
+static __inline int
+pf_anchor_compare(struct pf_anchor *a, struct pf_anchor *b)
+{
+ int c = strcmp(a->path, b->path);
+
+ return (c ? (c < 0 ? -1 : 1) : 0);
+}
+
+int
+pf_get_ruleset_number(u_int8_t action)
+{
+ switch (action) {
+ case PF_SCRUB:
+ case PF_NOSCRUB:
+ return (PF_RULESET_SCRUB);
+ break;
+ case PF_PASS:
+ case PF_DROP:
+ return (PF_RULESET_FILTER);
+ break;
+ case PF_NAT:
+ case PF_NONAT:
+ return (PF_RULESET_NAT);
+ break;
+ case PF_BINAT:
+ case PF_NOBINAT:
+ return (PF_RULESET_BINAT);
+ break;
+ case PF_RDR:
+ case PF_NORDR:
+ return (PF_RULESET_RDR);
+ break;
+ default:
+ return (PF_RULESET_MAX);
+ break;
+ }
+}
+
+void
+pf_init_ruleset(struct pf_ruleset *ruleset)
+{
+ int i;
+
+ memset(ruleset, 0, sizeof(struct pf_ruleset));
+ for (i = 0; i < PF_RULESET_MAX; i++) {
+ TAILQ_INIT(&ruleset->rules[i].queues[0]);
+ TAILQ_INIT(&ruleset->rules[i].queues[1]);
+ ruleset->rules[i].active.ptr = &ruleset->rules[i].queues[0];
+ ruleset->rules[i].inactive.ptr = &ruleset->rules[i].queues[1];
+ }
+}
+
+static struct pf_anchor *
+pf_find_anchor(const char *path)
+{
+ struct pf_anchor *key, *found;
+
+ key = (struct pf_anchor *)rs_malloc(sizeof(*key));
+ if (key == NULL)
+ return (NULL);
+ strlcpy(key->path, path, sizeof(key->path));
+ found = RB_FIND(pf_anchor_global, &V_pf_anchors, key);
+ rs_free(key);
+ return (found);
+}
+
+struct pf_ruleset *
+pf_find_ruleset(const char *path)
+{
+ struct pf_anchor *anchor;
+
+ while (*path == '/')
+ path++;
+ if (!*path)
+ return (&pf_main_ruleset);
+ anchor = pf_find_anchor(path);
+ if (anchor == NULL)
+ return (NULL);
+ else
+ return (&anchor->ruleset);
+}
+
+struct pf_ruleset *
+pf_find_or_create_ruleset(const char *path)
+{
+ char *p, *q, *r;
+ struct pf_ruleset *ruleset;
+ struct pf_anchor *anchor = NULL, *dup, *parent = NULL;
+
+ if (path[0] == 0)
+ return (&pf_main_ruleset);
+ while (*path == '/')
+ path++;
+ ruleset = pf_find_ruleset(path);
+ if (ruleset != NULL)
+ return (ruleset);
+ p = (char *)rs_malloc(MAXPATHLEN);
+ if (p == NULL)
+ return (NULL);
+ strlcpy(p, path, MAXPATHLEN);
+ while (parent == NULL && (q = strrchr(p, '/')) != NULL) {
+ *q = 0;
+ if ((ruleset = pf_find_ruleset(p)) != NULL) {
+ parent = ruleset->anchor;
+ break;
+ }
+ }
+ if (q == NULL)
+ q = p;
+ else
+ q++;
+ strlcpy(p, path, MAXPATHLEN);
+ if (!*q) {
+ rs_free(p);
+ return (NULL);
+ }
+ while ((r = strchr(q, '/')) != NULL || *q) {
+ if (r != NULL)
+ *r = 0;
+ if (!*q || strlen(q) >= PF_ANCHOR_NAME_SIZE ||
+ (parent != NULL && strlen(parent->path) >=
+ MAXPATHLEN - PF_ANCHOR_NAME_SIZE - 1)) {
+ rs_free(p);
+ return (NULL);
+ }
+ anchor = (struct pf_anchor *)rs_malloc(sizeof(*anchor));
+ if (anchor == NULL) {
+ rs_free(p);
+ return (NULL);
+ }
+ RB_INIT(&anchor->children);
+ strlcpy(anchor->name, q, sizeof(anchor->name));
+ if (parent != NULL) {
+ strlcpy(anchor->path, parent->path,
+ sizeof(anchor->path));
+ strlcat(anchor->path, "/", sizeof(anchor->path));
+ }
+ strlcat(anchor->path, anchor->name, sizeof(anchor->path));
+ if ((dup = RB_INSERT(pf_anchor_global, &V_pf_anchors, anchor)) !=
+ NULL) {
+ printf("pf_find_or_create_ruleset: RB_INSERT1 "
+ "'%s' '%s' collides with '%s' '%s'\n",
+ anchor->path, anchor->name, dup->path, dup->name);
+ rs_free(anchor);
+ rs_free(p);
+ return (NULL);
+ }
+ if (parent != NULL) {
+ anchor->parent = parent;
+ if ((dup = RB_INSERT(pf_anchor_node, &parent->children,
+ anchor)) != NULL) {
+ printf("pf_find_or_create_ruleset: "
+ "RB_INSERT2 '%s' '%s' collides with "
+ "'%s' '%s'\n", anchor->path, anchor->name,
+ dup->path, dup->name);
+ RB_REMOVE(pf_anchor_global, &V_pf_anchors,
+ anchor);
+ rs_free(anchor);
+ rs_free(p);
+ return (NULL);
+ }
+ }
+ pf_init_ruleset(&anchor->ruleset);
+ anchor->ruleset.anchor = anchor;
+ parent = anchor;
+ if (r != NULL)
+ q = r + 1;
+ else
+ *q = 0;
+ }
+ rs_free(p);
+ return (&anchor->ruleset);
+}
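
Editorial note: pf_find_or_create_ruleset() walks the anchor path one '/'-separated component at a time and creates any missing intermediate anchors, so a path like "a/b/c" implies anchors "a", "a/b" and "a/b/c", each linked to its parent. A small userland sketch of just that path walk (no pf structures, hypothetical helper name) might look like this:

#include <stdio.h>
#include <string.h>

#define MAXPATH	256

/*
 * Print every cumulative anchor path implied by an anchor path,
 * mimicking the component walk in pf_find_or_create_ruleset():
 * "a/b/c" -> "a", "a/b", "a/b/c".
 */
static void
walk_anchor_path(const char *path)
{
	char buf[MAXPATH], sofar[MAXPATH];
	char *q, *r;

	snprintf(buf, sizeof(buf), "%s", path);
	sofar[0] = '\0';
	for (q = buf; q != NULL && *q != '\0'; q = r) {
		if ((r = strchr(q, '/')) != NULL)
			*r++ = '\0';		/* cut off this component */
		if (*q == '\0')			/* skip empty components */
			continue;
		if (sofar[0] != '\0')
			strncat(sofar, "/", sizeof(sofar) - strlen(sofar) - 1);
		strncat(sofar, q, sizeof(sofar) - strlen(sofar) - 1);
		printf("anchor: %s\n", sofar);	/* create-if-missing point */
	}
}

int
main(void)
{
	walk_anchor_path("ftp-proxy/clients/trusted");
	return (0);
}
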
+
+void
+pf_remove_if_empty_ruleset(struct pf_ruleset *ruleset)
+{
+ struct pf_anchor *parent;
+ int i;
+
+ while (ruleset != NULL) {
+ if (ruleset == &pf_main_ruleset || ruleset->anchor == NULL ||
+ !RB_EMPTY(&ruleset->anchor->children) ||
+ ruleset->anchor->refcnt > 0 || ruleset->tables > 0 ||
+ ruleset->topen)
+ return;
+ for (i = 0; i < PF_RULESET_MAX; ++i)
+ if (!TAILQ_EMPTY(ruleset->rules[i].active.ptr) ||
+ !TAILQ_EMPTY(ruleset->rules[i].inactive.ptr) ||
+ ruleset->rules[i].inactive.open)
+ return;
+ RB_REMOVE(pf_anchor_global, &V_pf_anchors, ruleset->anchor);
+ if ((parent = ruleset->anchor->parent) != NULL)
+ RB_REMOVE(pf_anchor_node, &parent->children,
+ ruleset->anchor);
+ rs_free(ruleset->anchor);
+ if (parent == NULL)
+ return;
+ ruleset = &parent->ruleset;
+ }
+}
+
+int
+pf_anchor_setup(struct pf_rule *r, const struct pf_ruleset *s,
+ const char *name)
+{
+ char *p, *path;
+ struct pf_ruleset *ruleset;
+
+ r->anchor = NULL;
+ r->anchor_relative = 0;
+ r->anchor_wildcard = 0;
+ if (!name[0])
+ return (0);
+ path = (char *)rs_malloc(MAXPATHLEN);
+ if (path == NULL)
+ return (1);
+ if (name[0] == '/')
+ strlcpy(path, name + 1, MAXPATHLEN);
+ else {
+ /* relative path */
+ r->anchor_relative = 1;
+ if (s->anchor == NULL || !s->anchor->path[0])
+ path[0] = 0;
+ else
+ strlcpy(path, s->anchor->path, MAXPATHLEN);
+ while (name[0] == '.' && name[1] == '.' && name[2] == '/') {
+ if (!path[0]) {
+ printf("pf_anchor_setup: .. beyond root\n");
+ rs_free(path);
+ return (1);
+ }
+ if ((p = strrchr(path, '/')) != NULL)
+ *p = 0;
+ else
+ path[0] = 0;
+ r->anchor_relative++;
+ name += 3;
+ }
+ if (path[0])
+ strlcat(path, "/", MAXPATHLEN);
+ strlcat(path, name, MAXPATHLEN);
+ }
+ if ((p = strrchr(path, '/')) != NULL && !strcmp(p, "/*")) {
+ r->anchor_wildcard = 1;
+ *p = 0;
+ }
+ ruleset = pf_find_or_create_ruleset(path);
+ rs_free(path);
+ if (ruleset == NULL || ruleset->anchor == NULL) {
+ printf("pf_anchor_setup: ruleset\n");
+ return (1);
+ }
+ r->anchor = ruleset->anchor;
+ r->anchor->refcnt++;
+ return (0);
+}
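
Editorial note: for a relative call name, pf_anchor_setup() strips one trailing component from the current anchor path for every leading "../" and records how far it climbed in anchor_relative. The sketch below isolates that resolution step under the same rules (hypothetical helper, returns the number of levels climbed rather than the kernel's anchor_relative counter).

#include <stdio.h>
#include <string.h>

#define MAXPATH	256

/*
 * Resolve a relative anchor reference against the current anchor path,
 * the way pf_anchor_setup() does: each leading "../" removes one
 * trailing component, then the remainder is appended.  Returns the
 * number of "../" levels consumed, or -1 if the reference climbs
 * past the root.
 */
static int
resolve_relative(const char *cur, const char *name, char *out, size_t outlen)
{
	char path[MAXPATH];
	char *p;
	int up = 0;

	snprintf(path, sizeof(path), "%s", cur);
	while (name[0] == '.' && name[1] == '.' && name[2] == '/') {
		if (path[0] == '\0')
			return (-1);		/* ".." beyond root */
		if ((p = strrchr(path, '/')) != NULL)
			*p = '\0';
		else
			path[0] = '\0';
		up++;
		name += 3;
	}
	if (path[0] != '\0')
		snprintf(out, outlen, "%s/%s", path, name);
	else
		snprintf(out, outlen, "%s", name);
	return (up);
}

int
main(void)
{
	char out[MAXPATH];
	int up;

	up = resolve_relative("spam/filters/bulk", "../../other",
	    out, sizeof(out));
	printf("resolved to \"%s\" after %d level(s) up\n", out, up);
	/* expected: "spam/other" after 2 level(s) up */
	return (0);
}
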
+
+int
+pf_anchor_copyout(const struct pf_ruleset *rs, const struct pf_rule *r,
+ struct pfioc_rule *pr)
+{
+ pr->anchor_call[0] = 0;
+ if (r->anchor == NULL)
+ return (0);
+ if (!r->anchor_relative) {
+ strlcpy(pr->anchor_call, "/", sizeof(pr->anchor_call));
+ strlcat(pr->anchor_call, r->anchor->path,
+ sizeof(pr->anchor_call));
+ } else {
+ char *a, *p;
+ int i;
+
+ a = (char *)rs_malloc(MAXPATHLEN);
+ if (a == NULL)
+ return (1);
+ if (rs->anchor == NULL)
+ a[0] = 0;
+ else
+ strlcpy(a, rs->anchor->path, MAXPATHLEN);
+ for (i = 1; i < r->anchor_relative; ++i) {
+ if ((p = strrchr(a, '/')) == NULL)
+ p = a;
+ *p = 0;
+ strlcat(pr->anchor_call, "../",
+ sizeof(pr->anchor_call));
+ }
+ if (strncmp(a, r->anchor->path, strlen(a))) {
+ printf("pf_anchor_copyout: '%s' '%s'\n", a,
+ r->anchor->path);
+ rs_free(a);
+ return (1);
+ }
+ if (strlen(r->anchor->path) > strlen(a))
+ strlcat(pr->anchor_call, r->anchor->path + (a[0] ?
+ strlen(a) + 1 : 0), sizeof(pr->anchor_call));
+ rs_free(a);
+ }
+ if (r->anchor_wildcard)
+ strlcat(pr->anchor_call, pr->anchor_call[0] ? "/*" : "*",
+ sizeof(pr->anchor_call));
+ return (0);
+}
+
+void
+pf_anchor_remove(struct pf_rule *r)
+{
+ if (r->anchor == NULL)
+ return;
+ if (r->anchor->refcnt <= 0) {
+ printf("pf_anchor_remove: broken refcount\n");
+ r->anchor = NULL;
+ return;
+ }
+ if (!--r->anchor->refcnt)
+ pf_remove_if_empty_ruleset(&r->anchor->ruleset);
+ r->anchor = NULL;
+}
diff --git a/freebsd/sys/netpfil/pf/pf_table.c b/freebsd/sys/netpfil/pf/pf_table.c
new file mode 100644
index 00000000..26b6f4e9
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/pf_table.c
@@ -0,0 +1,2195 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2002 Cedric Berger
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $OpenBSD: pf_table.c,v 1.79 2008/10/08 06:24:50 mcbride Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/local/opt_inet.h>
+#include <rtems/bsd/local/opt_inet6.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/vnet.h>
+#include <net/pfvar.h>
+
+#define ACCEPT_FLAGS(flags, oklist) \
+ do { \
+ if ((flags & ~(oklist)) & \
+ PFR_FLAG_ALLMASK) \
+ return (EINVAL); \
+ } while (0)
+
+#define FILLIN_SIN(sin, addr) \
+ do { \
+ (sin).sin_len = sizeof(sin); \
+ (sin).sin_family = AF_INET; \
+ (sin).sin_addr = (addr); \
+ } while (0)
+
+#define FILLIN_SIN6(sin6, addr) \
+ do { \
+ (sin6).sin6_len = sizeof(sin6); \
+ (sin6).sin6_family = AF_INET6; \
+ (sin6).sin6_addr = (addr); \
+ } while (0)
+
+#define SWAP(type, a1, a2) \
+ do { \
+ type tmp = a1; \
+ a1 = a2; \
+ a2 = tmp; \
+ } while (0)
+
+#define SUNION2PF(su, af) (((af)==AF_INET) ? \
+ (struct pf_addr *)&(su)->sin.sin_addr : \
+ (struct pf_addr *)&(su)->sin6.sin6_addr)
+
+#define AF_BITS(af) (((af)==AF_INET)?32:128)
+#define ADDR_NETWORK(ad) ((ad)->pfra_net < AF_BITS((ad)->pfra_af))
+#define KENTRY_NETWORK(ke) ((ke)->pfrke_net < AF_BITS((ke)->pfrke_af))
+#define KENTRY_RNF_ROOT(ke) \
+ ((((struct radix_node *)(ke))->rn_flags & RNF_ROOT) != 0)
+
+#define NO_ADDRESSES (-1)
+#define ENQUEUE_UNMARKED_ONLY (1)
+#define INVERT_NEG_FLAG (1)
+
+struct pfr_walktree {
+ enum pfrw_op {
+ PFRW_MARK,
+ PFRW_SWEEP,
+ PFRW_ENQUEUE,
+ PFRW_GET_ADDRS,
+ PFRW_GET_ASTATS,
+ PFRW_POOL_GET,
+ PFRW_DYNADDR_UPDATE
+ } pfrw_op;
+ union {
+ struct pfr_addr *pfrw1_addr;
+ struct pfr_astats *pfrw1_astats;
+ struct pfr_kentryworkq *pfrw1_workq;
+ struct pfr_kentry *pfrw1_kentry;
+ struct pfi_dynaddr *pfrw1_dyn;
+ } pfrw_1;
+ int pfrw_free;
+};
+#define pfrw_addr pfrw_1.pfrw1_addr
+#define pfrw_astats pfrw_1.pfrw1_astats
+#define pfrw_workq pfrw_1.pfrw1_workq
+#define pfrw_kentry pfrw_1.pfrw1_kentry
+#define pfrw_dyn pfrw_1.pfrw1_dyn
+#define pfrw_cnt pfrw_free
+
+#define senderr(e) do { rv = (e); goto _bad; } while (0)
+
+static MALLOC_DEFINE(M_PFTABLE, "pf_table", "pf(4) tables structures");
+static VNET_DEFINE(uma_zone_t, pfr_kentry_z);
+#define V_pfr_kentry_z VNET(pfr_kentry_z)
+static VNET_DEFINE(uma_zone_t, pfr_kcounters_z);
+#define V_pfr_kcounters_z VNET(pfr_kcounters_z)
+
+static struct pf_addr pfr_ffaddr = {
+ .addr32 = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
+};
+
+static void pfr_copyout_addr(struct pfr_addr *,
+ struct pfr_kentry *ke);
+static int pfr_validate_addr(struct pfr_addr *);
+static void pfr_enqueue_addrs(struct pfr_ktable *,
+ struct pfr_kentryworkq *, int *, int);
+static void pfr_mark_addrs(struct pfr_ktable *);
+static struct pfr_kentry
+ *pfr_lookup_addr(struct pfr_ktable *,
+ struct pfr_addr *, int);
+static struct pfr_kentry *pfr_create_kentry(struct pfr_addr *);
+static void pfr_destroy_kentries(struct pfr_kentryworkq *);
+static void pfr_destroy_kentry(struct pfr_kentry *);
+static void pfr_insert_kentries(struct pfr_ktable *,
+ struct pfr_kentryworkq *, long);
+static void pfr_remove_kentries(struct pfr_ktable *,
+ struct pfr_kentryworkq *);
+static void pfr_clstats_kentries(struct pfr_kentryworkq *, long,
+ int);
+static void pfr_reset_feedback(struct pfr_addr *, int);
+static void pfr_prepare_network(union sockaddr_union *, int, int);
+static int pfr_route_kentry(struct pfr_ktable *,
+ struct pfr_kentry *);
+static int pfr_unroute_kentry(struct pfr_ktable *,
+ struct pfr_kentry *);
+static int pfr_walktree(struct radix_node *, void *);
+static int pfr_validate_table(struct pfr_table *, int, int);
+static int pfr_fix_anchor(char *);
+static void pfr_commit_ktable(struct pfr_ktable *, long);
+static void pfr_insert_ktables(struct pfr_ktableworkq *);
+static void pfr_insert_ktable(struct pfr_ktable *);
+static void pfr_setflags_ktables(struct pfr_ktableworkq *);
+static void pfr_setflags_ktable(struct pfr_ktable *, int);
+static void pfr_clstats_ktables(struct pfr_ktableworkq *, long,
+ int);
+static void pfr_clstats_ktable(struct pfr_ktable *, long, int);
+static struct pfr_ktable
+ *pfr_create_ktable(struct pfr_table *, long, int);
+static void pfr_destroy_ktables(struct pfr_ktableworkq *, int);
+static void pfr_destroy_ktable(struct pfr_ktable *, int);
+static int pfr_ktable_compare(struct pfr_ktable *,
+ struct pfr_ktable *);
+static struct pfr_ktable
+ *pfr_lookup_table(struct pfr_table *);
+static void pfr_clean_node_mask(struct pfr_ktable *,
+ struct pfr_kentryworkq *);
+static int pfr_table_count(struct pfr_table *, int);
+static int pfr_skip_table(struct pfr_table *,
+ struct pfr_ktable *, int);
+static struct pfr_kentry
+ *pfr_kentry_byidx(struct pfr_ktable *, int, int);
+
+static RB_PROTOTYPE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare);
+static RB_GENERATE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare);
+
+struct pfr_ktablehead pfr_ktables;
+struct pfr_table pfr_nulltable;
+int pfr_ktable_cnt;
+
+void
+pfr_initialize(void)
+{
+
+ V_pfr_kentry_z = uma_zcreate("pf table entries",
+ sizeof(struct pfr_kentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
+ 0);
+ V_pfr_kcounters_z = uma_zcreate("pf table counters",
+ sizeof(struct pfr_kcounters), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ V_pf_limits[PF_LIMIT_TABLE_ENTRIES].zone = V_pfr_kentry_z;
+ V_pf_limits[PF_LIMIT_TABLE_ENTRIES].limit = PFR_KENTRY_HIWAT;
+}
+
+void
+pfr_cleanup(void)
+{
+
+ uma_zdestroy(V_pfr_kentry_z);
+ uma_zdestroy(V_pfr_kcounters_z);
+}
+
+int
+pfr_clr_addrs(struct pfr_table *tbl, int *ndel, int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_kentryworkq workq;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_flags & PFR_TFLAG_CONST)
+ return (EPERM);
+ pfr_enqueue_addrs(kt, &workq, ndel, 0);
+
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ pfr_remove_kentries(kt, &workq);
+ KASSERT(kt->pfrkt_cnt == 0, ("%s: non-null pfrkt_cnt", __func__));
+ }
+ return (0);
+}
+
+int
+pfr_add_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *nadd, int flags)
+{
+ struct pfr_ktable *kt, *tmpkt;
+ struct pfr_kentryworkq workq;
+ struct pfr_kentry *p, *q;
+ struct pfr_addr *ad;
+ int i, rv, xadd = 0;
+ long tzero = time_second;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK);
+ if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_flags & PFR_TFLAG_CONST)
+ return (EPERM);
+ tmpkt = pfr_create_ktable(&pfr_nulltable, 0, 0);
+ if (tmpkt == NULL)
+ return (ENOMEM);
+ SLIST_INIT(&workq);
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ senderr(EINVAL);
+ p = pfr_lookup_addr(kt, ad, 1);
+ q = pfr_lookup_addr(tmpkt, ad, 1);
+ if (flags & PFR_FLAG_FEEDBACK) {
+ if (q != NULL)
+ ad->pfra_fback = PFR_FB_DUPLICATE;
+ else if (p == NULL)
+ ad->pfra_fback = PFR_FB_ADDED;
+ else if (p->pfrke_not != ad->pfra_not)
+ ad->pfra_fback = PFR_FB_CONFLICT;
+ else
+ ad->pfra_fback = PFR_FB_NONE;
+ }
+ if (p == NULL && q == NULL) {
+ p = pfr_create_kentry(ad);
+ if (p == NULL)
+ senderr(ENOMEM);
+ if (pfr_route_kentry(tmpkt, p)) {
+ pfr_destroy_kentry(p);
+ ad->pfra_fback = PFR_FB_NONE;
+ } else {
+ SLIST_INSERT_HEAD(&workq, p, pfrke_workq);
+ xadd++;
+ }
+ }
+ }
+ pfr_clean_node_mask(tmpkt, &workq);
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_insert_kentries(kt, &workq, tzero);
+ else
+ pfr_destroy_kentries(&workq);
+ if (nadd != NULL)
+ *nadd = xadd;
+ pfr_destroy_ktable(tmpkt, 0);
+ return (0);
+_bad:
+ pfr_clean_node_mask(tmpkt, &workq);
+ pfr_destroy_kentries(&workq);
+ if (flags & PFR_FLAG_FEEDBACK)
+ pfr_reset_feedback(addr, size);
+ pfr_destroy_ktable(tmpkt, 0);
+ return (rv);
+}
+
+int
+pfr_del_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *ndel, int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_kentryworkq workq;
+ struct pfr_kentry *p;
+ struct pfr_addr *ad;
+ int i, rv, xdel = 0, log = 1;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK);
+ if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_flags & PFR_TFLAG_CONST)
+ return (EPERM);
+	/*
+	 * There are two algorithms to choose from here,
+	 * with:
+	 *   n: number of addresses to delete
+	 *   N: number of addresses in the table
+	 *
+	 * One is O(N) and is better for large 'n';
+	 * one is O(n*log(N)) and is better for small 'n'.
+	 *
+	 * The following code tries to decide which one is best.
+	 */
+ for (i = kt->pfrkt_cnt; i > 0; i >>= 1)
+ log++;
+ if (size > kt->pfrkt_cnt/log) {
+ /* full table scan */
+ pfr_mark_addrs(kt);
+ } else {
+ /* iterate over addresses to delete */
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ return (EINVAL);
+ p = pfr_lookup_addr(kt, ad, 1);
+ if (p != NULL)
+ p->pfrke_mark = 0;
+ }
+ }
+ SLIST_INIT(&workq);
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ senderr(EINVAL);
+ p = pfr_lookup_addr(kt, ad, 1);
+ if (flags & PFR_FLAG_FEEDBACK) {
+ if (p == NULL)
+ ad->pfra_fback = PFR_FB_NONE;
+ else if (p->pfrke_not != ad->pfra_not)
+ ad->pfra_fback = PFR_FB_CONFLICT;
+ else if (p->pfrke_mark)
+ ad->pfra_fback = PFR_FB_DUPLICATE;
+ else
+ ad->pfra_fback = PFR_FB_DELETED;
+ }
+ if (p != NULL && p->pfrke_not == ad->pfra_not &&
+ !p->pfrke_mark) {
+ p->pfrke_mark = 1;
+ SLIST_INSERT_HEAD(&workq, p, pfrke_workq);
+ xdel++;
+ }
+ }
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_remove_kentries(kt, &workq);
+ if (ndel != NULL)
+ *ndel = xdel;
+ return (0);
+_bad:
+ if (flags & PFR_FLAG_FEEDBACK)
+ pfr_reset_feedback(addr, size);
+ return (rv);
+}
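
Editorial note: the strategy choice above approximates log2(N) by counting how often the table size can be halved, then switches to the full-table mark-and-sweep once the number of requested deletions exceeds N/log2(N). A standalone sketch of that decision, with a hypothetical helper name:

#include <stdio.h>

/*
 * Mirror the strategy choice in pfr_del_addrs(): return 1 when a full
 * O(N) table scan is expected to be cheaper than n individual
 * O(log N) lookups, using the same integer log2 approximation.
 */
static int
prefer_full_scan(int n_to_delete, int table_cnt)
{
	int i, log = 1;

	for (i = table_cnt; i > 0; i >>= 1)
		log++;
	return (n_to_delete > table_cnt / log);
}

int
main(void)
{
	printf("%d\n", prefer_full_scan(10, 100000));	 /* 0: few deletions, do lookups */
	printf("%d\n", prefer_full_scan(50000, 100000)); /* 1: bulk delete, scan table */
	return (0);
}
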
+
+int
+pfr_set_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *size2, int *nadd, int *ndel, int *nchange, int flags,
+ u_int32_t ignore_pfrt_flags)
+{
+ struct pfr_ktable *kt, *tmpkt;
+ struct pfr_kentryworkq addq, delq, changeq;
+ struct pfr_kentry *p, *q;
+ struct pfr_addr ad;
+ int i, rv, xadd = 0, xdel = 0, xchange = 0;
+ long tzero = time_second;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK);
+ if (pfr_validate_table(tbl, ignore_pfrt_flags, flags &
+ PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_flags & PFR_TFLAG_CONST)
+ return (EPERM);
+ tmpkt = pfr_create_ktable(&pfr_nulltable, 0, 0);
+ if (tmpkt == NULL)
+ return (ENOMEM);
+ pfr_mark_addrs(kt);
+ SLIST_INIT(&addq);
+ SLIST_INIT(&delq);
+ SLIST_INIT(&changeq);
+ for (i = 0; i < size; i++) {
+		/*
+		 * XXXGL: understand pf_if usage of this function
+		 * and make ad a moving pointer
+		 */
+ bcopy(addr + i, &ad, sizeof(ad));
+ if (pfr_validate_addr(&ad))
+ senderr(EINVAL);
+ ad.pfra_fback = PFR_FB_NONE;
+ p = pfr_lookup_addr(kt, &ad, 1);
+ if (p != NULL) {
+ if (p->pfrke_mark) {
+ ad.pfra_fback = PFR_FB_DUPLICATE;
+ goto _skip;
+ }
+ p->pfrke_mark = 1;
+ if (p->pfrke_not != ad.pfra_not) {
+ SLIST_INSERT_HEAD(&changeq, p, pfrke_workq);
+ ad.pfra_fback = PFR_FB_CHANGED;
+ xchange++;
+ }
+ } else {
+ q = pfr_lookup_addr(tmpkt, &ad, 1);
+ if (q != NULL) {
+ ad.pfra_fback = PFR_FB_DUPLICATE;
+ goto _skip;
+ }
+ p = pfr_create_kentry(&ad);
+ if (p == NULL)
+ senderr(ENOMEM);
+ if (pfr_route_kentry(tmpkt, p)) {
+ pfr_destroy_kentry(p);
+ ad.pfra_fback = PFR_FB_NONE;
+ } else {
+ SLIST_INSERT_HEAD(&addq, p, pfrke_workq);
+ ad.pfra_fback = PFR_FB_ADDED;
+ xadd++;
+ }
+ }
+_skip:
+ if (flags & PFR_FLAG_FEEDBACK)
+ bcopy(&ad, addr + i, sizeof(ad));
+ }
+ pfr_enqueue_addrs(kt, &delq, &xdel, ENQUEUE_UNMARKED_ONLY);
+ if ((flags & PFR_FLAG_FEEDBACK) && *size2) {
+ if (*size2 < size+xdel) {
+ *size2 = size+xdel;
+ senderr(0);
+ }
+ i = 0;
+ SLIST_FOREACH(p, &delq, pfrke_workq) {
+ pfr_copyout_addr(&ad, p);
+ ad.pfra_fback = PFR_FB_DELETED;
+ bcopy(&ad, addr + size + i, sizeof(ad));
+ i++;
+ }
+ }
+ pfr_clean_node_mask(tmpkt, &addq);
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ pfr_insert_kentries(kt, &addq, tzero);
+ pfr_remove_kentries(kt, &delq);
+ pfr_clstats_kentries(&changeq, tzero, INVERT_NEG_FLAG);
+ } else
+ pfr_destroy_kentries(&addq);
+ if (nadd != NULL)
+ *nadd = xadd;
+ if (ndel != NULL)
+ *ndel = xdel;
+ if (nchange != NULL)
+ *nchange = xchange;
+ if ((flags & PFR_FLAG_FEEDBACK) && size2)
+ *size2 = size+xdel;
+ pfr_destroy_ktable(tmpkt, 0);
+ return (0);
+_bad:
+ pfr_clean_node_mask(tmpkt, &addq);
+ pfr_destroy_kentries(&addq);
+ if (flags & PFR_FLAG_FEEDBACK)
+ pfr_reset_feedback(addr, size);
+ pfr_destroy_ktable(tmpkt, 0);
+ return (rv);
+}
+
+int
+pfr_tst_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *nmatch, int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_kentry *p;
+ struct pfr_addr *ad;
+ int i, xmatch = 0;
+
+ PF_RULES_RASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_REPLACE);
+ if (pfr_validate_table(tbl, 0, 0))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ return (EINVAL);
+ if (ADDR_NETWORK(ad))
+ return (EINVAL);
+ p = pfr_lookup_addr(kt, ad, 0);
+ if (flags & PFR_FLAG_REPLACE)
+ pfr_copyout_addr(ad, p);
+ ad->pfra_fback = (p == NULL) ? PFR_FB_NONE :
+ (p->pfrke_not ? PFR_FB_NOTMATCH : PFR_FB_MATCH);
+ if (p != NULL && !p->pfrke_not)
+ xmatch++;
+ }
+ if (nmatch != NULL)
+ *nmatch = xmatch;
+ return (0);
+}
+
+int
+pfr_get_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int *size,
+ int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_walktree w;
+ int rv;
+
+ PF_RULES_RASSERT();
+
+ ACCEPT_FLAGS(flags, 0);
+ if (pfr_validate_table(tbl, 0, 0))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_cnt > *size) {
+ *size = kt->pfrkt_cnt;
+ return (0);
+ }
+
+ bzero(&w, sizeof(w));
+ w.pfrw_op = PFRW_GET_ADDRS;
+ w.pfrw_addr = addr;
+ w.pfrw_free = kt->pfrkt_cnt;
+ rv = kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh, pfr_walktree, &w);
+ if (!rv)
+ rv = kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh,
+ pfr_walktree, &w);
+ if (rv)
+ return (rv);
+
+ KASSERT(w.pfrw_free == 0, ("%s: corruption detected (%d)", __func__,
+ w.pfrw_free));
+
+ *size = kt->pfrkt_cnt;
+ return (0);
+}
+
+int
+pfr_get_astats(struct pfr_table *tbl, struct pfr_astats *addr, int *size,
+ int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_walktree w;
+ struct pfr_kentryworkq workq;
+ int rv;
+ long tzero = time_second;
+
+ PF_RULES_RASSERT();
+
+ /* XXX PFR_FLAG_CLSTATS disabled */
+ ACCEPT_FLAGS(flags, 0);
+ if (pfr_validate_table(tbl, 0, 0))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_cnt > *size) {
+ *size = kt->pfrkt_cnt;
+ return (0);
+ }
+
+ bzero(&w, sizeof(w));
+ w.pfrw_op = PFRW_GET_ASTATS;
+ w.pfrw_astats = addr;
+ w.pfrw_free = kt->pfrkt_cnt;
+ rv = kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh, pfr_walktree, &w);
+ if (!rv)
+ rv = kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh,
+ pfr_walktree, &w);
+ if (!rv && (flags & PFR_FLAG_CLSTATS)) {
+ pfr_enqueue_addrs(kt, &workq, NULL, 0);
+ pfr_clstats_kentries(&workq, tzero, 0);
+ }
+ if (rv)
+ return (rv);
+
+ if (w.pfrw_free) {
+ printf("pfr_get_astats: corruption detected (%d).\n",
+ w.pfrw_free);
+ return (ENOTTY);
+ }
+ *size = kt->pfrkt_cnt;
+ return (0);
+}
+
+int
+pfr_clr_astats(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *nzero, int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_kentryworkq workq;
+ struct pfr_kentry *p;
+ struct pfr_addr *ad;
+ int i, rv, xzero = 0;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK);
+ if (pfr_validate_table(tbl, 0, 0))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ SLIST_INIT(&workq);
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ senderr(EINVAL);
+ p = pfr_lookup_addr(kt, ad, 1);
+ if (flags & PFR_FLAG_FEEDBACK) {
+ ad->pfra_fback = (p != NULL) ?
+ PFR_FB_CLEARED : PFR_FB_NONE;
+ }
+ if (p != NULL) {
+ SLIST_INSERT_HEAD(&workq, p, pfrke_workq);
+ xzero++;
+ }
+ }
+
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_clstats_kentries(&workq, 0, 0);
+ if (nzero != NULL)
+ *nzero = xzero;
+ return (0);
+_bad:
+ if (flags & PFR_FLAG_FEEDBACK)
+ pfr_reset_feedback(addr, size);
+ return (rv);
+}
+
+static int
+pfr_validate_addr(struct pfr_addr *ad)
+{
+ int i;
+
+ switch (ad->pfra_af) {
+#ifdef INET
+ case AF_INET:
+ if (ad->pfra_net > 32)
+ return (-1);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (ad->pfra_net > 128)
+ return (-1);
+ break;
+#endif /* INET6 */
+ default:
+ return (-1);
+ }
+ if (ad->pfra_net < 128 &&
+ (((caddr_t)ad)[ad->pfra_net/8] & (0xFF >> (ad->pfra_net%8))))
+ return (-1);
+ for (i = (ad->pfra_net+7)/8; i < sizeof(ad->pfra_u); i++)
+ if (((caddr_t)ad)[i])
+ return (-1);
+ if (ad->pfra_not && ad->pfra_not != 1)
+ return (-1);
+ if (ad->pfra_fback)
+ return (-1);
+ return (0);
+}
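
Editorial note: the bit checks above reject entries whose host part is not all-zero: the byte containing the prefix boundary is masked with 0xFF >> (net % 8), and every byte after it must be zero (the byte indexing works because the address union is the first member of struct pfr_addr). A hedged userland sketch of the same rule applied to a raw address buffer:

#include <stdio.h>

/*
 * Return 0 if every bit past the first 'net' prefix bits of the
 * 'len'-byte address in 'addr' is zero, -1 otherwise -- the same
 * condition pfr_validate_addr() enforces on table entries.
 */
static int
check_host_bits(const unsigned char *addr, int net, int len)
{
	int i;

	if (net < len * 8 && (addr[net / 8] & (0xFF >> (net % 8))))
		return (-1);
	for (i = (net + 7) / 8; i < len; i++)
		if (addr[i])
			return (-1);
	return (0);
}

int
main(void)
{
	unsigned char ok[4]  = { 192, 168, 4, 0 };	/* 192.168.4.0/24 */
	unsigned char bad[4] = { 192, 168, 4, 1 };	/* host bits set  */

	printf("%d\n", check_host_bits(ok, 24, 4));	/* 0  */
	printf("%d\n", check_host_bits(bad, 24, 4));	/* -1 */
	return (0);
}
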
+
+static void
+pfr_enqueue_addrs(struct pfr_ktable *kt, struct pfr_kentryworkq *workq,
+ int *naddr, int sweep)
+{
+ struct pfr_walktree w;
+
+ SLIST_INIT(workq);
+ bzero(&w, sizeof(w));
+ w.pfrw_op = sweep ? PFRW_SWEEP : PFRW_ENQUEUE;
+ w.pfrw_workq = workq;
+ if (kt->pfrkt_ip4 != NULL)
+ if (kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh,
+ pfr_walktree, &w))
+ printf("pfr_enqueue_addrs: IPv4 walktree failed.\n");
+ if (kt->pfrkt_ip6 != NULL)
+ if (kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh,
+ pfr_walktree, &w))
+ printf("pfr_enqueue_addrs: IPv6 walktree failed.\n");
+ if (naddr != NULL)
+ *naddr = w.pfrw_cnt;
+}
+
+static void
+pfr_mark_addrs(struct pfr_ktable *kt)
+{
+ struct pfr_walktree w;
+
+ bzero(&w, sizeof(w));
+ w.pfrw_op = PFRW_MARK;
+ if (kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh, pfr_walktree, &w))
+ printf("pfr_mark_addrs: IPv4 walktree failed.\n");
+ if (kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh, pfr_walktree, &w))
+ printf("pfr_mark_addrs: IPv6 walktree failed.\n");
+}
+
+
+static struct pfr_kentry *
+pfr_lookup_addr(struct pfr_ktable *kt, struct pfr_addr *ad, int exact)
+{
+ union sockaddr_union sa, mask;
+ struct radix_head *head = NULL;
+ struct pfr_kentry *ke;
+
+ PF_RULES_ASSERT();
+
+ bzero(&sa, sizeof(sa));
+ if (ad->pfra_af == AF_INET) {
+ FILLIN_SIN(sa.sin, ad->pfra_ip4addr);
+ head = &kt->pfrkt_ip4->rh;
+ } else if ( ad->pfra_af == AF_INET6 ) {
+ FILLIN_SIN6(sa.sin6, ad->pfra_ip6addr);
+ head = &kt->pfrkt_ip6->rh;
+ }
+ if (ADDR_NETWORK(ad)) {
+ pfr_prepare_network(&mask, ad->pfra_af, ad->pfra_net);
+ ke = (struct pfr_kentry *)rn_lookup(&sa, &mask, head);
+ if (ke && KENTRY_RNF_ROOT(ke))
+ ke = NULL;
+ } else {
+ ke = (struct pfr_kentry *)rn_match(&sa, head);
+ if (ke && KENTRY_RNF_ROOT(ke))
+ ke = NULL;
+ if (exact && ke && KENTRY_NETWORK(ke))
+ ke = NULL;
+ }
+ return (ke);
+}
+
+static struct pfr_kentry *
+pfr_create_kentry(struct pfr_addr *ad)
+{
+ struct pfr_kentry *ke;
+
+ ke = uma_zalloc(V_pfr_kentry_z, M_NOWAIT | M_ZERO);
+ if (ke == NULL)
+ return (NULL);
+
+ if (ad->pfra_af == AF_INET)
+ FILLIN_SIN(ke->pfrke_sa.sin, ad->pfra_ip4addr);
+ else if (ad->pfra_af == AF_INET6)
+ FILLIN_SIN6(ke->pfrke_sa.sin6, ad->pfra_ip6addr);
+ ke->pfrke_af = ad->pfra_af;
+ ke->pfrke_net = ad->pfra_net;
+ ke->pfrke_not = ad->pfra_not;
+ return (ke);
+}
+
+static void
+pfr_destroy_kentries(struct pfr_kentryworkq *workq)
+{
+ struct pfr_kentry *p, *q;
+
+ for (p = SLIST_FIRST(workq); p != NULL; p = q) {
+ q = SLIST_NEXT(p, pfrke_workq);
+ pfr_destroy_kentry(p);
+ }
+}
+
+static void
+pfr_destroy_kentry(struct pfr_kentry *ke)
+{
+ if (ke->pfrke_counters)
+ uma_zfree(V_pfr_kcounters_z, ke->pfrke_counters);
+ uma_zfree(V_pfr_kentry_z, ke);
+}
+
+static void
+pfr_insert_kentries(struct pfr_ktable *kt,
+ struct pfr_kentryworkq *workq, long tzero)
+{
+ struct pfr_kentry *p;
+ int rv, n = 0;
+
+ SLIST_FOREACH(p, workq, pfrke_workq) {
+ rv = pfr_route_kentry(kt, p);
+ if (rv) {
+ printf("pfr_insert_kentries: cannot route entry "
+ "(code=%d).\n", rv);
+ break;
+ }
+ p->pfrke_tzero = tzero;
+ n++;
+ }
+ kt->pfrkt_cnt += n;
+}
+
+int
+pfr_insert_kentry(struct pfr_ktable *kt, struct pfr_addr *ad, long tzero)
+{
+ struct pfr_kentry *p;
+ int rv;
+
+ p = pfr_lookup_addr(kt, ad, 1);
+ if (p != NULL)
+ return (0);
+ p = pfr_create_kentry(ad);
+ if (p == NULL)
+ return (ENOMEM);
+
+ rv = pfr_route_kentry(kt, p);
+ if (rv)
+ return (rv);
+
+ p->pfrke_tzero = tzero;
+ kt->pfrkt_cnt++;
+
+ return (0);
+}
+
+static void
+pfr_remove_kentries(struct pfr_ktable *kt,
+ struct pfr_kentryworkq *workq)
+{
+ struct pfr_kentry *p;
+ int n = 0;
+
+ SLIST_FOREACH(p, workq, pfrke_workq) {
+ pfr_unroute_kentry(kt, p);
+ n++;
+ }
+ kt->pfrkt_cnt -= n;
+ pfr_destroy_kentries(workq);
+}
+
+static void
+pfr_clean_node_mask(struct pfr_ktable *kt,
+ struct pfr_kentryworkq *workq)
+{
+ struct pfr_kentry *p;
+
+ SLIST_FOREACH(p, workq, pfrke_workq)
+ pfr_unroute_kentry(kt, p);
+}
+
+static void
+pfr_clstats_kentries(struct pfr_kentryworkq *workq, long tzero, int negchange)
+{
+ struct pfr_kentry *p;
+
+ SLIST_FOREACH(p, workq, pfrke_workq) {
+ if (negchange)
+ p->pfrke_not = !p->pfrke_not;
+ if (p->pfrke_counters) {
+ uma_zfree(V_pfr_kcounters_z, p->pfrke_counters);
+ p->pfrke_counters = NULL;
+ }
+ p->pfrke_tzero = tzero;
+ }
+}
+
+static void
+pfr_reset_feedback(struct pfr_addr *addr, int size)
+{
+ struct pfr_addr *ad;
+ int i;
+
+ for (i = 0, ad = addr; i < size; i++, ad++)
+ ad->pfra_fback = PFR_FB_NONE;
+}
+
+static void
+pfr_prepare_network(union sockaddr_union *sa, int af, int net)
+{
+ int i;
+
+ bzero(sa, sizeof(*sa));
+ if (af == AF_INET) {
+ sa->sin.sin_len = sizeof(sa->sin);
+ sa->sin.sin_family = AF_INET;
+ sa->sin.sin_addr.s_addr = net ? htonl(-1 << (32-net)) : 0;
+ } else if (af == AF_INET6) {
+ sa->sin6.sin6_len = sizeof(sa->sin6);
+ sa->sin6.sin6_family = AF_INET6;
+ for (i = 0; i < 4; i++) {
+ if (net <= 32) {
+ sa->sin6.sin6_addr.s6_addr32[i] =
+ net ? htonl(-1 << (32-net)) : 0;
+ break;
+ }
+ sa->sin6.sin6_addr.s6_addr32[i] = 0xFFFFFFFF;
+ net -= 32;
+ }
+ }
+}
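
Editorial note: pfr_prepare_network() turns a prefix length into a netmask: for IPv4 a single htonl(-1 << (32 - net)); for IPv6 it fills 32-bit words with all-ones until fewer than 32 bits remain. The sketch below reproduces the word-filling logic in host byte order only, as an illustration rather than a drop-in replacement.

#include <stdio.h>
#include <stdint.h>

/*
 * Fill 'words' 32-bit mask words for a prefix of 'net' bits (host
 * byte order), the way pfr_prepare_network() fills an IPv6 mask.
 */
static void
prefix_to_mask(uint32_t *mask, int words, int net)
{
	int i;

	for (i = 0; i < words; i++) {
		if (net <= 32) {
			mask[i] = net ?
			    (uint32_t)(0xFFFFFFFFu << (32 - net)) : 0;
			net = 0;		/* remaining words stay zero */
		} else {
			mask[i] = 0xFFFFFFFFu;
			net -= 32;
		}
	}
}

int
main(void)
{
	uint32_t m6[4];
	int i;

	prefix_to_mask(m6, 4, 64);		/* a /64 IPv6 prefix */
	for (i = 0; i < 4; i++)
		printf("%08x ", m6[i]);
	printf("\n");	/* ffffffff ffffffff 00000000 00000000 */
	return (0);
}
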
+
+static int
+pfr_route_kentry(struct pfr_ktable *kt, struct pfr_kentry *ke)
+{
+ union sockaddr_union mask;
+ struct radix_node *rn;
+ struct radix_head *head = NULL;
+
+ PF_RULES_WASSERT();
+
+ bzero(ke->pfrke_node, sizeof(ke->pfrke_node));
+ if (ke->pfrke_af == AF_INET)
+ head = &kt->pfrkt_ip4->rh;
+ else if (ke->pfrke_af == AF_INET6)
+ head = &kt->pfrkt_ip6->rh;
+
+ if (KENTRY_NETWORK(ke)) {
+ pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net);
+ rn = rn_addroute(&ke->pfrke_sa, &mask, head, ke->pfrke_node);
+ } else
+ rn = rn_addroute(&ke->pfrke_sa, NULL, head, ke->pfrke_node);
+
+ return (rn == NULL ? -1 : 0);
+}
+
+static int
+pfr_unroute_kentry(struct pfr_ktable *kt, struct pfr_kentry *ke)
+{
+ union sockaddr_union mask;
+ struct radix_node *rn;
+ struct radix_head *head = NULL;
+
+ if (ke->pfrke_af == AF_INET)
+ head = &kt->pfrkt_ip4->rh;
+ else if (ke->pfrke_af == AF_INET6)
+ head = &kt->pfrkt_ip6->rh;
+
+ if (KENTRY_NETWORK(ke)) {
+ pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net);
+ rn = rn_delete(&ke->pfrke_sa, &mask, head);
+ } else
+ rn = rn_delete(&ke->pfrke_sa, NULL, head);
+
+ if (rn == NULL) {
+ printf("pfr_unroute_kentry: delete failed.\n");
+ return (-1);
+ }
+ return (0);
+}
+
+static void
+pfr_copyout_addr(struct pfr_addr *ad, struct pfr_kentry *ke)
+{
+ bzero(ad, sizeof(*ad));
+ if (ke == NULL)
+ return;
+ ad->pfra_af = ke->pfrke_af;
+ ad->pfra_net = ke->pfrke_net;
+ ad->pfra_not = ke->pfrke_not;
+ if (ad->pfra_af == AF_INET)
+ ad->pfra_ip4addr = ke->pfrke_sa.sin.sin_addr;
+ else if (ad->pfra_af == AF_INET6)
+ ad->pfra_ip6addr = ke->pfrke_sa.sin6.sin6_addr;
+}
+
+static int
+pfr_walktree(struct radix_node *rn, void *arg)
+{
+ struct pfr_kentry *ke = (struct pfr_kentry *)rn;
+ struct pfr_walktree *w = arg;
+
+ switch (w->pfrw_op) {
+ case PFRW_MARK:
+ ke->pfrke_mark = 0;
+ break;
+ case PFRW_SWEEP:
+ if (ke->pfrke_mark)
+ break;
+ /* FALLTHROUGH */
+ case PFRW_ENQUEUE:
+ SLIST_INSERT_HEAD(w->pfrw_workq, ke, pfrke_workq);
+ w->pfrw_cnt++;
+ break;
+ case PFRW_GET_ADDRS:
+ if (w->pfrw_free-- > 0) {
+ pfr_copyout_addr(w->pfrw_addr, ke);
+ w->pfrw_addr++;
+ }
+ break;
+ case PFRW_GET_ASTATS:
+ if (w->pfrw_free-- > 0) {
+ struct pfr_astats as;
+
+ pfr_copyout_addr(&as.pfras_a, ke);
+
+ if (ke->pfrke_counters) {
+ bcopy(ke->pfrke_counters->pfrkc_packets,
+ as.pfras_packets, sizeof(as.pfras_packets));
+ bcopy(ke->pfrke_counters->pfrkc_bytes,
+ as.pfras_bytes, sizeof(as.pfras_bytes));
+ } else {
+ bzero(as.pfras_packets, sizeof(as.pfras_packets));
+ bzero(as.pfras_bytes, sizeof(as.pfras_bytes));
+ as.pfras_a.pfra_fback = PFR_FB_NOCOUNT;
+ }
+ as.pfras_tzero = ke->pfrke_tzero;
+
+ bcopy(&as, w->pfrw_astats, sizeof(as));
+ w->pfrw_astats++;
+ }
+ break;
+ case PFRW_POOL_GET:
+ if (ke->pfrke_not)
+ break; /* negative entries are ignored */
+ if (!w->pfrw_cnt--) {
+ w->pfrw_kentry = ke;
+ return (1); /* finish search */
+ }
+ break;
+ case PFRW_DYNADDR_UPDATE:
+ {
+ union sockaddr_union pfr_mask;
+
+ if (ke->pfrke_af == AF_INET) {
+ if (w->pfrw_dyn->pfid_acnt4++ > 0)
+ break;
+ pfr_prepare_network(&pfr_mask, AF_INET, ke->pfrke_net);
+ w->pfrw_dyn->pfid_addr4 = *SUNION2PF(&ke->pfrke_sa,
+ AF_INET);
+ w->pfrw_dyn->pfid_mask4 = *SUNION2PF(&pfr_mask,
+ AF_INET);
+ } else if (ke->pfrke_af == AF_INET6){
+ if (w->pfrw_dyn->pfid_acnt6++ > 0)
+ break;
+ pfr_prepare_network(&pfr_mask, AF_INET6, ke->pfrke_net);
+ w->pfrw_dyn->pfid_addr6 = *SUNION2PF(&ke->pfrke_sa,
+ AF_INET6);
+ w->pfrw_dyn->pfid_mask6 = *SUNION2PF(&pfr_mask,
+ AF_INET6);
+ }
+ break;
+ }
+ }
+ return (0);
+}
+
+int
+pfr_clr_tables(struct pfr_table *filter, int *ndel, int flags)
+{
+ struct pfr_ktableworkq workq;
+ struct pfr_ktable *p;
+ int xdel = 0;
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_ALLRSETS);
+ if (pfr_fix_anchor(filter->pfrt_anchor))
+ return (EINVAL);
+ if (pfr_table_count(filter, flags) < 0)
+ return (ENOENT);
+
+ SLIST_INIT(&workq);
+ RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) {
+ if (pfr_skip_table(filter, p, flags))
+ continue;
+ if (!strcmp(p->pfrkt_anchor, PF_RESERVED_ANCHOR))
+ continue;
+ if (!(p->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ continue;
+ p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_ACTIVE;
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ xdel++;
+ }
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_setflags_ktables(&workq);
+ if (ndel != NULL)
+ *ndel = xdel;
+ return (0);
+}
+
+int
+pfr_add_tables(struct pfr_table *tbl, int size, int *nadd, int flags)
+{
+ struct pfr_ktableworkq addq, changeq;
+ struct pfr_ktable *p, *q, *r, key;
+ int i, rv, xadd = 0;
+ long tzero = time_second;
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ SLIST_INIT(&addq);
+ SLIST_INIT(&changeq);
+ for (i = 0; i < size; i++) {
+ bcopy(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t));
+ if (pfr_validate_table(&key.pfrkt_t, PFR_TFLAG_USRMASK,
+ flags & PFR_FLAG_USERIOCTL))
+ senderr(EINVAL);
+ key.pfrkt_flags |= PFR_TFLAG_ACTIVE;
+ p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
+ if (p == NULL) {
+ p = pfr_create_ktable(&key.pfrkt_t, tzero, 1);
+ if (p == NULL)
+ senderr(ENOMEM);
+ SLIST_FOREACH(q, &addq, pfrkt_workq) {
+ if (!pfr_ktable_compare(p, q))
+ goto _skip;
+ }
+ SLIST_INSERT_HEAD(&addq, p, pfrkt_workq);
+ xadd++;
+ if (!key.pfrkt_anchor[0])
+ goto _skip;
+
+ /* find or create root table */
+ bzero(key.pfrkt_anchor, sizeof(key.pfrkt_anchor));
+ r = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
+ if (r != NULL) {
+ p->pfrkt_root = r;
+ goto _skip;
+ }
+ SLIST_FOREACH(q, &addq, pfrkt_workq) {
+ if (!pfr_ktable_compare(&key, q)) {
+ p->pfrkt_root = q;
+ goto _skip;
+ }
+ }
+ key.pfrkt_flags = 0;
+ r = pfr_create_ktable(&key.pfrkt_t, 0, 1);
+ if (r == NULL)
+ senderr(ENOMEM);
+ SLIST_INSERT_HEAD(&addq, r, pfrkt_workq);
+ p->pfrkt_root = r;
+ } else if (!(p->pfrkt_flags & PFR_TFLAG_ACTIVE)) {
+ SLIST_FOREACH(q, &changeq, pfrkt_workq)
+ if (!pfr_ktable_compare(&key, q))
+ goto _skip;
+ p->pfrkt_nflags = (p->pfrkt_flags &
+ ~PFR_TFLAG_USRMASK) | key.pfrkt_flags;
+ SLIST_INSERT_HEAD(&changeq, p, pfrkt_workq);
+ xadd++;
+ }
+_skip:
+ ;
+ }
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ pfr_insert_ktables(&addq);
+ pfr_setflags_ktables(&changeq);
+ } else
+ pfr_destroy_ktables(&addq, 0);
+ if (nadd != NULL)
+ *nadd = xadd;
+ return (0);
+_bad:
+ pfr_destroy_ktables(&addq, 0);
+ return (rv);
+}
+
+int
+pfr_del_tables(struct pfr_table *tbl, int size, int *ndel, int flags)
+{
+ struct pfr_ktableworkq workq;
+ struct pfr_ktable *p, *q, key;
+ int i, xdel = 0;
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ SLIST_INIT(&workq);
+ for (i = 0; i < size; i++) {
+ bcopy(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t));
+ if (pfr_validate_table(&key.pfrkt_t, 0,
+ flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
+ if (p != NULL && (p->pfrkt_flags & PFR_TFLAG_ACTIVE)) {
+ SLIST_FOREACH(q, &workq, pfrkt_workq)
+ if (!pfr_ktable_compare(p, q))
+ goto _skip;
+ p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_ACTIVE;
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ xdel++;
+ }
+_skip:
+ ;
+ }
+
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_setflags_ktables(&workq);
+ if (ndel != NULL)
+ *ndel = xdel;
+ return (0);
+}
+
+int
+pfr_get_tables(struct pfr_table *filter, struct pfr_table *tbl, int *size,
+ int flags)
+{
+ struct pfr_ktable *p;
+ int n, nn;
+
+ PF_RULES_RASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_ALLRSETS);
+ if (pfr_fix_anchor(filter->pfrt_anchor))
+ return (EINVAL);
+ n = nn = pfr_table_count(filter, flags);
+ if (n < 0)
+ return (ENOENT);
+ if (n > *size) {
+ *size = n;
+ return (0);
+ }
+ RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) {
+ if (pfr_skip_table(filter, p, flags))
+ continue;
+ if (n-- <= 0)
+ continue;
+ bcopy(&p->pfrkt_t, tbl++, sizeof(*tbl));
+ }
+
+ KASSERT(n == 0, ("%s: corruption detected (%d)", __func__, n));
+
+ *size = nn;
+ return (0);
+}
+
+int
+pfr_get_tstats(struct pfr_table *filter, struct pfr_tstats *tbl, int *size,
+ int flags)
+{
+ struct pfr_ktable *p;
+ struct pfr_ktableworkq workq;
+ int n, nn;
+ long tzero = time_second;
+
+ /* XXX PFR_FLAG_CLSTATS disabled */
+ ACCEPT_FLAGS(flags, PFR_FLAG_ALLRSETS);
+ if (pfr_fix_anchor(filter->pfrt_anchor))
+ return (EINVAL);
+ n = nn = pfr_table_count(filter, flags);
+ if (n < 0)
+ return (ENOENT);
+ if (n > *size) {
+ *size = n;
+ return (0);
+ }
+ SLIST_INIT(&workq);
+ RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) {
+ if (pfr_skip_table(filter, p, flags))
+ continue;
+ if (n-- <= 0)
+ continue;
+ bcopy(&p->pfrkt_ts, tbl++, sizeof(*tbl));
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ }
+ if (flags & PFR_FLAG_CLSTATS)
+ pfr_clstats_ktables(&workq, tzero,
+ flags & PFR_FLAG_ADDRSTOO);
+
+ KASSERT(n == 0, ("%s: corruption detected (%d)", __func__, n));
+
+ *size = nn;
+ return (0);
+}
+
+int
+pfr_clr_tstats(struct pfr_table *tbl, int size, int *nzero, int flags)
+{
+ struct pfr_ktableworkq workq;
+ struct pfr_ktable *p, key;
+ int i, xzero = 0;
+ long tzero = time_second;
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_ADDRSTOO);
+ SLIST_INIT(&workq);
+ for (i = 0; i < size; i++) {
+ bcopy(tbl + i, &key.pfrkt_t, sizeof(key.pfrkt_t));
+ if (pfr_validate_table(&key.pfrkt_t, 0, 0))
+ return (EINVAL);
+ p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
+ if (p != NULL) {
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ xzero++;
+ }
+ }
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_clstats_ktables(&workq, tzero, flags & PFR_FLAG_ADDRSTOO);
+ if (nzero != NULL)
+ *nzero = xzero;
+ return (0);
+}
+
+int
+pfr_set_tflags(struct pfr_table *tbl, int size, int setflag, int clrflag,
+ int *nchange, int *ndel, int flags)
+{
+ struct pfr_ktableworkq workq;
+ struct pfr_ktable *p, *q, key;
+ int i, xchange = 0, xdel = 0;
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ if ((setflag & ~PFR_TFLAG_USRMASK) ||
+ (clrflag & ~PFR_TFLAG_USRMASK) ||
+ (setflag & clrflag))
+ return (EINVAL);
+ SLIST_INIT(&workq);
+ for (i = 0; i < size; i++) {
+ bcopy(tbl + i, &key.pfrkt_t, sizeof(key.pfrkt_t));
+ if (pfr_validate_table(&key.pfrkt_t, 0,
+ flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
+ if (p != NULL && (p->pfrkt_flags & PFR_TFLAG_ACTIVE)) {
+ p->pfrkt_nflags = (p->pfrkt_flags | setflag) &
+ ~clrflag;
+ if (p->pfrkt_nflags == p->pfrkt_flags)
+ goto _skip;
+ SLIST_FOREACH(q, &workq, pfrkt_workq)
+ if (!pfr_ktable_compare(p, q))
+ goto _skip;
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ if ((p->pfrkt_flags & PFR_TFLAG_PERSIST) &&
+ (clrflag & PFR_TFLAG_PERSIST) &&
+ !(p->pfrkt_flags & PFR_TFLAG_REFERENCED))
+ xdel++;
+ else
+ xchange++;
+ }
+_skip:
+ ;
+ }
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_setflags_ktables(&workq);
+ if (nchange != NULL)
+ *nchange = xchange;
+ if (ndel != NULL)
+ *ndel = xdel;
+ return (0);
+}
+
+int
+pfr_ina_begin(struct pfr_table *trs, u_int32_t *ticket, int *ndel, int flags)
+{
+ struct pfr_ktableworkq workq;
+ struct pfr_ktable *p;
+ struct pf_ruleset *rs;
+ int xdel = 0;
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ rs = pf_find_or_create_ruleset(trs->pfrt_anchor);
+ if (rs == NULL)
+ return (ENOMEM);
+ SLIST_INIT(&workq);
+ RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) {
+ if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) ||
+ pfr_skip_table(trs, p, 0))
+ continue;
+ p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_INACTIVE;
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ xdel++;
+ }
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ pfr_setflags_ktables(&workq);
+ if (ticket != NULL)
+ *ticket = ++rs->tticket;
+ rs->topen = 1;
+ } else
+ pf_remove_if_empty_ruleset(rs);
+ if (ndel != NULL)
+ *ndel = xdel;
+ return (0);
+}
+
+int
+pfr_ina_define(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *nadd, int *naddr, u_int32_t ticket, int flags)
+{
+ struct pfr_ktableworkq tableq;
+ struct pfr_kentryworkq addrq;
+ struct pfr_ktable *kt, *rt, *shadow, key;
+ struct pfr_kentry *p;
+ struct pfr_addr *ad;
+ struct pf_ruleset *rs;
+ int i, rv, xadd = 0, xaddr = 0;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_ADDRSTOO);
+ if (size && !(flags & PFR_FLAG_ADDRSTOO))
+ return (EINVAL);
+ if (pfr_validate_table(tbl, PFR_TFLAG_USRMASK,
+ flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ rs = pf_find_ruleset(tbl->pfrt_anchor);
+ if (rs == NULL || !rs->topen || ticket != rs->tticket)
+ return (EBUSY);
+ tbl->pfrt_flags |= PFR_TFLAG_INACTIVE;
+ SLIST_INIT(&tableq);
+ kt = RB_FIND(pfr_ktablehead, &pfr_ktables, (struct pfr_ktable *)tbl);
+ if (kt == NULL) {
+ kt = pfr_create_ktable(tbl, 0, 1);
+ if (kt == NULL)
+ return (ENOMEM);
+ SLIST_INSERT_HEAD(&tableq, kt, pfrkt_workq);
+ xadd++;
+ if (!tbl->pfrt_anchor[0])
+ goto _skip;
+
+ /* find or create root table */
+ bzero(&key, sizeof(key));
+ strlcpy(key.pfrkt_name, tbl->pfrt_name, sizeof(key.pfrkt_name));
+ rt = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
+ if (rt != NULL) {
+ kt->pfrkt_root = rt;
+ goto _skip;
+ }
+ rt = pfr_create_ktable(&key.pfrkt_t, 0, 1);
+ if (rt == NULL) {
+ pfr_destroy_ktables(&tableq, 0);
+ return (ENOMEM);
+ }
+ SLIST_INSERT_HEAD(&tableq, rt, pfrkt_workq);
+ kt->pfrkt_root = rt;
+ } else if (!(kt->pfrkt_flags & PFR_TFLAG_INACTIVE))
+ xadd++;
+_skip:
+ shadow = pfr_create_ktable(tbl, 0, 0);
+ if (shadow == NULL) {
+ pfr_destroy_ktables(&tableq, 0);
+ return (ENOMEM);
+ }
+ SLIST_INIT(&addrq);
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ senderr(EINVAL);
+ if (pfr_lookup_addr(shadow, ad, 1) != NULL)
+ continue;
+ p = pfr_create_kentry(ad);
+ if (p == NULL)
+ senderr(ENOMEM);
+ if (pfr_route_kentry(shadow, p)) {
+ pfr_destroy_kentry(p);
+ continue;
+ }
+ SLIST_INSERT_HEAD(&addrq, p, pfrke_workq);
+ xaddr++;
+ }
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ if (kt->pfrkt_shadow != NULL)
+ pfr_destroy_ktable(kt->pfrkt_shadow, 1);
+ kt->pfrkt_flags |= PFR_TFLAG_INACTIVE;
+ pfr_insert_ktables(&tableq);
+ shadow->pfrkt_cnt = (flags & PFR_FLAG_ADDRSTOO) ?
+ xaddr : NO_ADDRESSES;
+ kt->pfrkt_shadow = shadow;
+ } else {
+ pfr_clean_node_mask(shadow, &addrq);
+ pfr_destroy_ktable(shadow, 0);
+ pfr_destroy_ktables(&tableq, 0);
+ pfr_destroy_kentries(&addrq);
+ }
+ if (nadd != NULL)
+ *nadd = xadd;
+ if (naddr != NULL)
+ *naddr = xaddr;
+ return (0);
+_bad:
+ pfr_destroy_ktable(shadow, 0);
+ pfr_destroy_ktables(&tableq, 0);
+ pfr_destroy_kentries(&addrq);
+ return (rv);
+}
+
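+/*
+ * Abort an open transaction: drop PFR_TFLAG_INACTIVE from the staged
+ * tables, which lets pfr_setflags_ktables() tear down their shadow
+ * copies, then close the ruleset.
+ */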
+int
+pfr_ina_rollback(struct pfr_table *trs, u_int32_t ticket, int *ndel, int flags)
+{
+ struct pfr_ktableworkq workq;
+ struct pfr_ktable *p;
+ struct pf_ruleset *rs;
+ int xdel = 0;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ rs = pf_find_ruleset(trs->pfrt_anchor);
+ if (rs == NULL || !rs->topen || ticket != rs->tticket)
+ return (0);
+ SLIST_INIT(&workq);
+ RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) {
+ if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) ||
+ pfr_skip_table(trs, p, 0))
+ continue;
+ p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_INACTIVE;
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ xdel++;
+ }
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ pfr_setflags_ktables(&workq);
+ rs->topen = 0;
+ pf_remove_if_empty_ruleset(rs);
+ }
+ if (ndel != NULL)
+ *ndel = xdel;
+ return (0);
+}
+
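+/*
+ * Commit an open transaction: every table staged in this anchor has
+ * its shadow merged into (or swapped with) the live table by
+ * pfr_commit_ktable().  *nadd counts tables becoming newly active,
+ * *nchange tables that already were.
+ */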
+int
+pfr_ina_commit(struct pfr_table *trs, u_int32_t ticket, int *nadd,
+ int *nchange, int flags)
+{
+ struct pfr_ktable *p, *q;
+ struct pfr_ktableworkq workq;
+ struct pf_ruleset *rs;
+ int xadd = 0, xchange = 0;
+ long tzero = time_second;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ rs = pf_find_ruleset(trs->pfrt_anchor);
+ if (rs == NULL || !rs->topen || ticket != rs->tticket)
+ return (EBUSY);
+
+ SLIST_INIT(&workq);
+ RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) {
+ if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) ||
+ pfr_skip_table(trs, p, 0))
+ continue;
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ if (p->pfrkt_flags & PFR_TFLAG_ACTIVE)
+ xchange++;
+ else
+ xadd++;
+ }
+
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ for (p = SLIST_FIRST(&workq); p != NULL; p = q) {
+ q = SLIST_NEXT(p, pfrkt_workq);
+ pfr_commit_ktable(p, tzero);
+ }
+ rs->topen = 0;
+ pf_remove_if_empty_ruleset(rs);
+ }
+ if (nadd != NULL)
+ *nadd = xadd;
+ if (nchange != NULL)
+ *nchange = xchange;
+
+ return (0);
+}
+
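+/*
+ * Three cases: the shadow carries no addresses (NO_ADDRESSES), so only
+ * the flags and stats are touched; the table is already active, so the
+ * shadow is merged entry by entry via the add/delete/change queues; or
+ * the table is not active, so the radix heads and counters are simply
+ * swapped in.  The shadow is destroyed afterwards in every case.
+ */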
+static void
+pfr_commit_ktable(struct pfr_ktable *kt, long tzero)
+{
+ struct pfr_ktable *shadow = kt->pfrkt_shadow;
+ int nflags;
+
+ PF_RULES_WASSERT();
+
+ if (shadow->pfrkt_cnt == NO_ADDRESSES) {
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ pfr_clstats_ktable(kt, tzero, 1);
+ } else if (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) {
+ /* kt might contain addresses */
+ struct pfr_kentryworkq addrq, addq, changeq, delq, garbageq;
+ struct pfr_kentry *p, *q, *next;
+ struct pfr_addr ad;
+
+ pfr_enqueue_addrs(shadow, &addrq, NULL, 0);
+ pfr_mark_addrs(kt);
+ SLIST_INIT(&addq);
+ SLIST_INIT(&changeq);
+ SLIST_INIT(&delq);
+ SLIST_INIT(&garbageq);
+ pfr_clean_node_mask(shadow, &addrq);
+ for (p = SLIST_FIRST(&addrq); p != NULL; p = next) {
+ next = SLIST_NEXT(p, pfrke_workq); /* XXX */
+ pfr_copyout_addr(&ad, p);
+ q = pfr_lookup_addr(kt, &ad, 1);
+ if (q != NULL) {
+ if (q->pfrke_not != p->pfrke_not)
+ SLIST_INSERT_HEAD(&changeq, q,
+ pfrke_workq);
+ q->pfrke_mark = 1;
+ SLIST_INSERT_HEAD(&garbageq, p, pfrke_workq);
+ } else {
+ p->pfrke_tzero = tzero;
+ SLIST_INSERT_HEAD(&addq, p, pfrke_workq);
+ }
+ }
+ pfr_enqueue_addrs(kt, &delq, NULL, ENQUEUE_UNMARKED_ONLY);
+ pfr_insert_kentries(kt, &addq, tzero);
+ pfr_remove_kentries(kt, &delq);
+ pfr_clstats_kentries(&changeq, tzero, INVERT_NEG_FLAG);
+ pfr_destroy_kentries(&garbageq);
+ } else {
+ /* kt cannot contain addresses */
+ SWAP(struct radix_node_head *, kt->pfrkt_ip4,
+ shadow->pfrkt_ip4);
+ SWAP(struct radix_node_head *, kt->pfrkt_ip6,
+ shadow->pfrkt_ip6);
+ SWAP(int, kt->pfrkt_cnt, shadow->pfrkt_cnt);
+ pfr_clstats_ktable(kt, tzero, 1);
+ }
+ nflags = ((shadow->pfrkt_flags & PFR_TFLAG_USRMASK) |
+ (kt->pfrkt_flags & PFR_TFLAG_SETMASK) | PFR_TFLAG_ACTIVE)
+ & ~PFR_TFLAG_INACTIVE;
+ pfr_destroy_ktable(shadow, 0);
+ kt->pfrkt_shadow = NULL;
+ pfr_setflags_ktable(kt, nflags);
+}
+
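+/*
+ * Sanity-check a table spec from userland: non-empty, NUL-terminated
+ * and NUL-padded name, a valid anchor path and no flags outside
+ * allowedflags; with no_reserved the reserved anchor is rejected too.
+ */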
+static int
+pfr_validate_table(struct pfr_table *tbl, int allowedflags, int no_reserved)
+{
+ int i;
+
+ if (!tbl->pfrt_name[0])
+ return (-1);
+ if (no_reserved && !strcmp(tbl->pfrt_anchor, PF_RESERVED_ANCHOR))
+ return (-1);
+ if (tbl->pfrt_name[PF_TABLE_NAME_SIZE-1])
+ return (-1);
+ for (i = strlen(tbl->pfrt_name); i < PF_TABLE_NAME_SIZE; i++)
+ if (tbl->pfrt_name[i])
+ return (-1);
+ if (pfr_fix_anchor(tbl->pfrt_anchor))
+ return (-1);
+ if (tbl->pfrt_flags & ~allowedflags)
+ return (-1);
+ return (0);
+}
+
+/*
+ * Rewrite anchors referenced by tables to remove slashes
+ * and check for validity.
+ */
+static int
+pfr_fix_anchor(char *anchor)
+{
+ size_t siz = MAXPATHLEN;
+ int i;
+
+ if (anchor[0] == '/') {
+ char *path;
+ int off;
+
+ path = anchor;
+ off = 1;
+ while (*++path == '/')
+ off++;
+ bcopy(path, anchor, siz - off);
+ memset(anchor + siz - off, 0, off);
+ }
+ if (anchor[siz - 1])
+ return (-1);
+ for (i = strlen(anchor); i < siz; i++)
+ if (anchor[i])
+ return (-1);
+ return (0);
+}
+
+static int
+pfr_table_count(struct pfr_table *filter, int flags)
+{
+ struct pf_ruleset *rs;
+
+ PF_RULES_ASSERT();
+
+ if (flags & PFR_FLAG_ALLRSETS)
+ return (pfr_ktable_cnt);
+ if (filter->pfrt_anchor[0]) {
+ rs = pf_find_ruleset(filter->pfrt_anchor);
+ return ((rs != NULL) ? rs->tables : -1);
+ }
+ return (pf_main_ruleset.tables);
+}
+
+static int
+pfr_skip_table(struct pfr_table *filter, struct pfr_ktable *kt, int flags)
+{
+ if (flags & PFR_FLAG_ALLRSETS)
+ return (0);
+ if (strcmp(filter->pfrt_anchor, kt->pfrkt_anchor))
+ return (1);
+ return (0);
+}
+
+static void
+pfr_insert_ktables(struct pfr_ktableworkq *workq)
+{
+ struct pfr_ktable *p;
+
+ SLIST_FOREACH(p, workq, pfrkt_workq)
+ pfr_insert_ktable(p);
+}
+
+static void
+pfr_insert_ktable(struct pfr_ktable *kt)
+{
+
+ PF_RULES_WASSERT();
+
+ RB_INSERT(pfr_ktablehead, &pfr_ktables, kt);
+ pfr_ktable_cnt++;
+ if (kt->pfrkt_root != NULL)
+ if (!kt->pfrkt_root->pfrkt_refcnt[PFR_REFCNT_ANCHOR]++)
+ pfr_setflags_ktable(kt->pfrkt_root,
+ kt->pfrkt_root->pfrkt_flags|PFR_TFLAG_REFDANCHOR);
+}
+
+static void
+pfr_setflags_ktables(struct pfr_ktableworkq *workq)
+{
+ struct pfr_ktable *p, *q;
+
+ for (p = SLIST_FIRST(workq); p; p = q) {
+ q = SLIST_NEXT(p, pfrkt_workq);
+ pfr_setflags_ktable(p, p->pfrkt_nflags);
+ }
+}
+
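+/*
+ * Apply a new flag set to a ktable.  A table that is neither referenced
+ * nor persistent loses PFR_TFLAG_ACTIVE; once no PFR_TFLAG_SETMASK bit
+ * is left the table is unlinked and destroyed, dropping its hold on the
+ * root (anchor) table.  Dropping ACTIVE flushes the addresses, and
+ * leaving the inactive set destroys any shadow copy.
+ */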
+static void
+pfr_setflags_ktable(struct pfr_ktable *kt, int newf)
+{
+ struct pfr_kentryworkq addrq;
+
+ PF_RULES_WASSERT();
+
+ if (!(newf & PFR_TFLAG_REFERENCED) &&
+ !(newf & PFR_TFLAG_PERSIST))
+ newf &= ~PFR_TFLAG_ACTIVE;
+ if (!(newf & PFR_TFLAG_ACTIVE))
+ newf &= ~PFR_TFLAG_USRMASK;
+ if (!(newf & PFR_TFLAG_SETMASK)) {
+ RB_REMOVE(pfr_ktablehead, &pfr_ktables, kt);
+ if (kt->pfrkt_root != NULL)
+ if (!--kt->pfrkt_root->pfrkt_refcnt[PFR_REFCNT_ANCHOR])
+ pfr_setflags_ktable(kt->pfrkt_root,
+ kt->pfrkt_root->pfrkt_flags &
+ ~PFR_TFLAG_REFDANCHOR);
+ pfr_destroy_ktable(kt, 1);
+ pfr_ktable_cnt--;
+ return;
+ }
+ if (!(newf & PFR_TFLAG_ACTIVE) && kt->pfrkt_cnt) {
+ pfr_enqueue_addrs(kt, &addrq, NULL, 0);
+ pfr_remove_kentries(kt, &addrq);
+ }
+ if (!(newf & PFR_TFLAG_INACTIVE) && kt->pfrkt_shadow != NULL) {
+ pfr_destroy_ktable(kt->pfrkt_shadow, 1);
+ kt->pfrkt_shadow = NULL;
+ }
+ kt->pfrkt_flags = newf;
+}
+
+static void
+pfr_clstats_ktables(struct pfr_ktableworkq *workq, long tzero, int recurse)
+{
+ struct pfr_ktable *p;
+
+ SLIST_FOREACH(p, workq, pfrkt_workq)
+ pfr_clstats_ktable(p, tzero, recurse);
+}
+
+static void
+pfr_clstats_ktable(struct pfr_ktable *kt, long tzero, int recurse)
+{
+ struct pfr_kentryworkq addrq;
+
+ if (recurse) {
+ pfr_enqueue_addrs(kt, &addrq, NULL, 0);
+ pfr_clstats_kentries(&addrq, tzero, 0);
+ }
+ bzero(kt->pfrkt_packets, sizeof(kt->pfrkt_packets));
+ bzero(kt->pfrkt_bytes, sizeof(kt->pfrkt_bytes));
+ kt->pfrkt_match = kt->pfrkt_nomatch = 0;
+ kt->pfrkt_tzero = tzero;
+}
+
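+/*
+ * Allocate a ktable, optionally attaching it to its anchor's ruleset,
+ * and set up one radix head per address family.
+ */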
+static struct pfr_ktable *
+pfr_create_ktable(struct pfr_table *tbl, long tzero, int attachruleset)
+{
+ struct pfr_ktable *kt;
+ struct pf_ruleset *rs;
+
+ PF_RULES_WASSERT();
+
+ kt = malloc(sizeof(*kt), M_PFTABLE, M_NOWAIT|M_ZERO);
+ if (kt == NULL)
+ return (NULL);
+ kt->pfrkt_t = *tbl;
+
+ if (attachruleset) {
+ rs = pf_find_or_create_ruleset(tbl->pfrt_anchor);
+ if (!rs) {
+ pfr_destroy_ktable(kt, 0);
+ return (NULL);
+ }
+ kt->pfrkt_rs = rs;
+ rs->tables++;
+ }
+
+ if (!rn_inithead((void **)&kt->pfrkt_ip4,
+ offsetof(struct sockaddr_in, sin_addr) * 8) ||
+ !rn_inithead((void **)&kt->pfrkt_ip6,
+ offsetof(struct sockaddr_in6, sin6_addr) * 8)) {
+ pfr_destroy_ktable(kt, 0);
+ return (NULL);
+ }
+ kt->pfrkt_tzero = tzero;
+
+ return (kt);
+}
+
+static void
+pfr_destroy_ktables(struct pfr_ktableworkq *workq, int flushaddr)
+{
+ struct pfr_ktable *p, *q;
+
+ for (p = SLIST_FIRST(workq); p; p = q) {
+ q = SLIST_NEXT(p, pfrkt_workq);
+ pfr_destroy_ktable(p, flushaddr);
+ }
+}
+
+static void
+pfr_destroy_ktable(struct pfr_ktable *kt, int flushaddr)
+{
+ struct pfr_kentryworkq addrq;
+
+ if (flushaddr) {
+ pfr_enqueue_addrs(kt, &addrq, NULL, 0);
+ pfr_clean_node_mask(kt, &addrq);
+ pfr_destroy_kentries(&addrq);
+ }
+ if (kt->pfrkt_ip4 != NULL)
+ rn_detachhead((void **)&kt->pfrkt_ip4);
+ if (kt->pfrkt_ip6 != NULL)
+ rn_detachhead((void **)&kt->pfrkt_ip6);
+ if (kt->pfrkt_shadow != NULL)
+ pfr_destroy_ktable(kt->pfrkt_shadow, flushaddr);
+ if (kt->pfrkt_rs != NULL) {
+ kt->pfrkt_rs->tables--;
+ pf_remove_if_empty_ruleset(kt->pfrkt_rs);
+ }
+ free(kt, M_PFTABLE);
+}
+
+static int
+pfr_ktable_compare(struct pfr_ktable *p, struct pfr_ktable *q)
+{
+ int d;
+
+ if ((d = strncmp(p->pfrkt_name, q->pfrkt_name, PF_TABLE_NAME_SIZE)))
+ return (d);
+ return (strcmp(p->pfrkt_anchor, q->pfrkt_anchor));
+}
+
+static struct pfr_ktable *
+pfr_lookup_table(struct pfr_table *tbl)
+{
+	/* struct pfr_ktable starts like a struct pfr_table */
+ return (RB_FIND(pfr_ktablehead, &pfr_ktables,
+ (struct pfr_ktable *)tbl));
+}
+
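+/*
+ * Match an address against the table: fall back to the root
+ * (anchor-less) table if this one is not active, then do a
+ * longest-prefix lookup in the per-family radix tree.  Returns 1 only
+ * for a non-negated entry and updates the match/nomatch counters.
+ */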
+int
+pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af)
+{
+ struct pfr_kentry *ke = NULL;
+ int match;
+
+ PF_RULES_RASSERT();
+
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
+ kt = kt->pfrkt_root;
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (0);
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ {
+ struct sockaddr_in sin;
+
+ bzero(&sin, sizeof(sin));
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = a->addr32[0];
+ ke = (struct pfr_kentry *)rn_match(&sin, &kt->pfrkt_ip4->rh);
+ if (ke && KENTRY_RNF_ROOT(ke))
+ ke = NULL;
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sockaddr_in6 sin6;
+
+ bzero(&sin6, sizeof(sin6));
+ sin6.sin6_len = sizeof(sin6);
+ sin6.sin6_family = AF_INET6;
+ bcopy(a, &sin6.sin6_addr, sizeof(sin6.sin6_addr));
+ ke = (struct pfr_kentry *)rn_match(&sin6, &kt->pfrkt_ip6->rh);
+ if (ke && KENTRY_RNF_ROOT(ke))
+ ke = NULL;
+ break;
+ }
+#endif /* INET6 */
+ }
+ match = (ke && !ke->pfrke_not);
+ if (match)
+ kt->pfrkt_match++;
+ else
+ kt->pfrkt_nomatch++;
+ return (match);
+}
+
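+/*
+ * Account a packet against the table: the same radix lookup as
+ * pfr_match_addr(), but instead of returning the result it bumps the
+ * per-table (and, with PFR_TFLAG_COUNTERS, lazily allocated per-entry)
+ * packet and byte counters, falling back to PFR_OP_XPASS when the
+ * lookup no longer agrees with the rule that matched.
+ */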
+void
+pfr_update_stats(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af,
+ u_int64_t len, int dir_out, int op_pass, int notrule)
+{
+ struct pfr_kentry *ke = NULL;
+
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
+ kt = kt->pfrkt_root;
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return;
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ {
+ struct sockaddr_in sin;
+
+ bzero(&sin, sizeof(sin));
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = a->addr32[0];
+ ke = (struct pfr_kentry *)rn_match(&sin, &kt->pfrkt_ip4->rh);
+ if (ke && KENTRY_RNF_ROOT(ke))
+ ke = NULL;
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sockaddr_in6 sin6;
+
+ bzero(&sin6, sizeof(sin6));
+ sin6.sin6_len = sizeof(sin6);
+ sin6.sin6_family = AF_INET6;
+ bcopy(a, &sin6.sin6_addr, sizeof(sin6.sin6_addr));
+ ke = (struct pfr_kentry *)rn_match(&sin6, &kt->pfrkt_ip6->rh);
+ if (ke && KENTRY_RNF_ROOT(ke))
+ ke = NULL;
+ break;
+ }
+#endif /* INET6 */
+ default:
+ panic("%s: unknown address family %u", __func__, af);
+ }
+ if ((ke == NULL || ke->pfrke_not) != notrule) {
+ if (op_pass != PFR_OP_PASS)
+ printf("pfr_update_stats: assertion failed.\n");
+ op_pass = PFR_OP_XPASS;
+ }
+ kt->pfrkt_packets[dir_out][op_pass]++;
+ kt->pfrkt_bytes[dir_out][op_pass] += len;
+ if (ke != NULL && op_pass != PFR_OP_XPASS &&
+ (kt->pfrkt_flags & PFR_TFLAG_COUNTERS)) {
+ if (ke->pfrke_counters == NULL)
+ ke->pfrke_counters = uma_zalloc(V_pfr_kcounters_z,
+ M_NOWAIT | M_ZERO);
+ if (ke->pfrke_counters != NULL) {
+ ke->pfrke_counters->pfrkc_packets[dir_out][op_pass]++;
+ ke->pfrke_counters->pfrkc_bytes[dir_out][op_pass] += len;
+ }
+ }
+}
+
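+/*
+ * Resolve a table reference from a rule: find or create the ktable for
+ * name in the rule's anchor (plus a root table at the top level for
+ * anchored rules), take a rule reference and mark the table referenced.
+ */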
+struct pfr_ktable *
+pfr_attach_table(struct pf_ruleset *rs, char *name)
+{
+ struct pfr_ktable *kt, *rt;
+ struct pfr_table tbl;
+ struct pf_anchor *ac = rs->anchor;
+
+ PF_RULES_WASSERT();
+
+ bzero(&tbl, sizeof(tbl));
+ strlcpy(tbl.pfrt_name, name, sizeof(tbl.pfrt_name));
+ if (ac != NULL)
+ strlcpy(tbl.pfrt_anchor, ac->path, sizeof(tbl.pfrt_anchor));
+ kt = pfr_lookup_table(&tbl);
+ if (kt == NULL) {
+ kt = pfr_create_ktable(&tbl, time_second, 1);
+ if (kt == NULL)
+ return (NULL);
+ if (ac != NULL) {
+ bzero(tbl.pfrt_anchor, sizeof(tbl.pfrt_anchor));
+ rt = pfr_lookup_table(&tbl);
+ if (rt == NULL) {
+ rt = pfr_create_ktable(&tbl, 0, 1);
+ if (rt == NULL) {
+ pfr_destroy_ktable(kt, 0);
+ return (NULL);
+ }
+ pfr_insert_ktable(rt);
+ }
+ kt->pfrkt_root = rt;
+ }
+ pfr_insert_ktable(kt);
+ }
+ if (!kt->pfrkt_refcnt[PFR_REFCNT_RULE]++)
+ pfr_setflags_ktable(kt, kt->pfrkt_flags|PFR_TFLAG_REFERENCED);
+ return (kt);
+}
+
+void
+pfr_detach_table(struct pfr_ktable *kt)
+{
+
+ PF_RULES_WASSERT();
+ KASSERT(kt->pfrkt_refcnt[PFR_REFCNT_RULE] > 0, ("%s: refcount %d\n",
+ __func__, kt->pfrkt_refcnt[PFR_REFCNT_RULE]));
+
+ if (!--kt->pfrkt_refcnt[PFR_REFCNT_RULE])
+ pfr_setflags_ktable(kt, kt->pfrkt_flags&~PFR_TFLAG_REFERENCED);
+}
+
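+/*
+ * Pick an address from a table used as an address pool: *pidx is the
+ * current block index and *counter the last address handed out.
+ * Nested, more specific blocks are skipped by advancing the counter
+ * past them; when a block is exhausted we move on to the next one.
+ * Returns 0 on success, 1 if the table has no further entry, -1 if it
+ * is not active.
+ */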
+int
+pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter,
+ sa_family_t af)
+{
+ struct pf_addr *addr, *cur, *mask;
+ union sockaddr_union uaddr, umask;
+ struct pfr_kentry *ke, *ke2 = NULL;
+ int idx = -1, use_counter = 0;
+
+ switch (af) {
+ case AF_INET:
+ uaddr.sin.sin_len = sizeof(struct sockaddr_in);
+ uaddr.sin.sin_family = AF_INET;
+ break;
+ case AF_INET6:
+ uaddr.sin6.sin6_len = sizeof(struct sockaddr_in6);
+ uaddr.sin6.sin6_family = AF_INET6;
+ break;
+ }
+ addr = SUNION2PF(&uaddr, af);
+
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
+ kt = kt->pfrkt_root;
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (-1);
+
+ if (pidx != NULL)
+ idx = *pidx;
+ if (counter != NULL && idx >= 0)
+ use_counter = 1;
+ if (idx < 0)
+ idx = 0;
+
+_next_block:
+ ke = pfr_kentry_byidx(kt, idx, af);
+ if (ke == NULL) {
+ kt->pfrkt_nomatch++;
+ return (1);
+ }
+ pfr_prepare_network(&umask, af, ke->pfrke_net);
+ cur = SUNION2PF(&ke->pfrke_sa, af);
+ mask = SUNION2PF(&umask, af);
+
+ if (use_counter) {
+ /* is supplied address within block? */
+ if (!PF_MATCHA(0, cur, mask, counter, af)) {
+ /* no, go to next block in table */
+ idx++;
+ use_counter = 0;
+ goto _next_block;
+ }
+ PF_ACPY(addr, counter, af);
+ } else {
+ /* use first address of block */
+ PF_ACPY(addr, cur, af);
+ }
+
+ if (!KENTRY_NETWORK(ke)) {
+ /* this is a single IP address - no possible nested block */
+ PF_ACPY(counter, addr, af);
+ *pidx = idx;
+ kt->pfrkt_match++;
+ return (0);
+ }
+ for (;;) {
+ /* we don't want to use a nested block */
+ switch (af) {
+ case AF_INET:
+ ke2 = (struct pfr_kentry *)rn_match(&uaddr,
+ &kt->pfrkt_ip4->rh);
+ break;
+ case AF_INET6:
+ ke2 = (struct pfr_kentry *)rn_match(&uaddr,
+ &kt->pfrkt_ip6->rh);
+ break;
+ }
+ /* no need to check KENTRY_RNF_ROOT() here */
+ if (ke2 == ke) {
+			/* lookup returned the same block - perfect */
+ PF_ACPY(counter, addr, af);
+ *pidx = idx;
+ kt->pfrkt_match++;
+ return (0);
+ }
+
+ /* we need to increase the counter past the nested block */
+ pfr_prepare_network(&umask, AF_INET, ke2->pfrke_net);
+ PF_POOLMASK(addr, addr, SUNION2PF(&umask, af), &pfr_ffaddr, af);
+ PF_AINC(addr, af);
+ if (!PF_MATCHA(0, cur, mask, addr, af)) {
+ /* ok, we reached the end of our main block */
+ /* go to next block in table */
+ idx++;
+ use_counter = 0;
+ goto _next_block;
+ }
+ }
+}
+
+static struct pfr_kentry *
+pfr_kentry_byidx(struct pfr_ktable *kt, int idx, int af)
+{
+ struct pfr_walktree w;
+
+ bzero(&w, sizeof(w));
+ w.pfrw_op = PFRW_POOL_GET;
+ w.pfrw_cnt = idx;
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh, pfr_walktree, &w);
+ return (w.pfrw_kentry);
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh, pfr_walktree, &w);
+ return (w.pfrw_kentry);
+#endif /* INET6 */
+ default:
+ return (NULL);
+ }
+}
+
+void
+pfr_dynaddr_update(struct pfr_ktable *kt, struct pfi_dynaddr *dyn)
+{
+ struct pfr_walktree w;
+
+ bzero(&w, sizeof(w));
+ w.pfrw_op = PFRW_DYNADDR_UPDATE;
+ w.pfrw_dyn = dyn;
+
+ dyn->pfid_acnt4 = 0;
+ dyn->pfid_acnt6 = 0;
+ if (!dyn->pfid_af || dyn->pfid_af == AF_INET)
+ kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh, pfr_walktree, &w);
+ if (!dyn->pfid_af || dyn->pfid_af == AF_INET6)
+ kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh, pfr_walktree, &w);
+}