author		Sebastian Huber <sebastian.huber@embedded-brains.de>	2016-10-07 15:10:20 +0200
committer	Sebastian Huber <sebastian.huber@embedded-brains.de>	2017-01-10 09:53:31 +0100
commit		c40e45b75eb76d79a05c7fa85c1fa9b5c728a12f (patch)
tree		ad4f2519067709f00ab98b3c591186c26dc3a21f /freebsd/sys/netpfil
parent		userspace-header-gen.py: Simplify program ports (diff)
download	rtems-libbsd-c40e45b75eb76d79a05c7fa85c1fa9b5c728a12f.tar.bz2
Update to FreeBSD head 2016-08-23
Git mirror commit 9fe7c416e6abb28b1398fd3e5687099846800cfd.
Diffstat (limited to 'freebsd/sys/netpfil')
58 files changed, 44729 insertions, 7825 deletions
diff --git a/freebsd/sys/netpfil/ipfw/dn_aqm.h b/freebsd/sys/netpfil/ipfw/dn_aqm.h new file mode 100644 index 00000000..d01e98eb --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/dn_aqm.h @@ -0,0 +1,167 @@ +/*- + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * API for writing an Active Queue Management algorithm for Dummynet + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_AQM_H +#define _IP_DN_AQM_H + + +/* NOW is the current time in millisecond*/ +#define NOW ((dn_cfg.curr_time * tick) / 1000) + +#define AQM_UNOW (dn_cfg.curr_time * tick) +#define AQM_TIME_1US ((aqm_time_t)(1)) +#define AQM_TIME_1MS ((aqm_time_t)(1000)) +#define AQM_TIME_1S ((aqm_time_t)(AQM_TIME_1MS * 1000)) + +/* aqm time allows to store up to 4294 seconds */ +typedef uint32_t aqm_time_t; +typedef int32_t aqm_stime_t; + +#define DN_AQM_MTAG_TS 55345 + +/* Macro for variable bounding */ +#define BOUND_VAR(x,l,h) ((x) > (h)? (h) : ((x) > (l)? (x) : (l))) + +/* sysctl variable to count number of dropped packets */ +extern unsigned long io_pkt_drop; + +/* + * Structure for holding data and function pointers that together represent a + * AQM algorithm. + */ + struct dn_aqm { +#define DN_AQM_NAME_MAX 50 + char name[DN_AQM_NAME_MAX]; /* name of AQM algorithm */ + uint32_t type; /* AQM type number */ + + /* Methods implemented by AQM algorithm: + * + * enqueue enqueue packet 'm' on queue 'q'. + * Return 0 on success, 1 on drop. + * + * dequeue dequeue a packet from queue 'q'. + * Return a packet, NULL if no packet available. + * + * config configure AQM algorithm + * If required, this function should allocate space to store + * the configurations and set 'fs->aqmcfg' to point to this space. + * 'dn_extra_parms' includes array of parameters send + * from ipfw userland command. + * Return 0 on success, non-zero otherwise. + * + * deconfig deconfigure AQM algorithm. + * The allocated configuration memory space should be freed here. + * Return 0 on success, non-zero otherwise. 
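A minimal usage sketch of the time and bounding macros above (the variable names are hypothetical, not from this header):

	/* clamp a hypothetical user-supplied interval into [1ms, 5s] */
	aqm_time_t ival = BOUND_VAR(user_ival, AQM_TIME_1MS, 5 * AQM_TIME_1S);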
+ * + * init initialise AQM status variables of queue 'q' + * This function is used to allocate space and init AQM status for a + * queue and q->aqm_status to point to this space. + * Return 0 on success, non-zero otherwise. + * + * cleanup cleanup AQM status variables of queue 'q' + * The allocated memory space for AQM status should be freed here. + * Return 0 on success, non-zero otherwise. + * + * getconfig retrieve AQM configurations + * This function is used to return AQM parameters to userland + * command. The function should fill 'dn_extra_parms' struct with + * the AQM configurations using 'par' array. + * + */ + + int (*enqueue)(struct dn_queue *, struct mbuf *); + struct mbuf * (*dequeue)(struct dn_queue *); + int (*config)(struct dn_fsk *, struct dn_extra_parms *ep, int); + int (*deconfig)(struct dn_fsk *); + int (*init)(struct dn_queue *); + int (*cleanup)(struct dn_queue *); + int (*getconfig)(struct dn_fsk *, struct dn_extra_parms *); + + int ref_count; /*Number of queues instances in the system */ + int cfg_ref_count; /*Number of AQM instances in the system */ + SLIST_ENTRY (dn_aqm) next; /* Next AQM in the list */ +}; + +/* Helper function to update queue and scheduler statistics. + * negative len + drop -> drop + * negative len -> dequeue + * positive len -> enqueue + * positive len + drop -> drop during enqueue + */ +__inline static void +update_stats(struct dn_queue *q, int len, int drop) +{ + int inc = 0; + struct dn_flow *sni; + struct dn_flow *qni; + + sni = &q->_si->ni; + qni = &q->ni; + + if (len < 0) + inc = -1; + else if(len > 0) + inc = 1; + + if (drop) { + qni->drops++; + sni->drops++; + io_pkt_drop++; + } else { + /*update queue stats */ + qni->length += inc; + qni->len_bytes += len; + + /*update scheduler instance stats */ + sni->length += inc; + sni->len_bytes += len; + } + /* tot_pkts is updated in dn_enqueue function */ +} + + +/* kernel module related function */ +int +dn_aqm_modevent(module_t mod, int cmd, void *arg); + +#define DECLARE_DNAQM_MODULE(name, dnaqm) \ + static moduledata_t name##_mod = { \ + #name, dn_aqm_modevent, dnaqm \ + }; \ + DECLARE_MODULE(name, name##_mod, \ + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ + MODULE_DEPEND(name, dummynet, 3, 3, 3) + +#endif diff --git a/freebsd/sys/netpfil/ipfw/dn_aqm_codel.h b/freebsd/sys/netpfil/ipfw/dn_aqm_codel.h new file mode 100644 index 00000000..f5618e76 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/dn_aqm_codel.h @@ -0,0 +1,222 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Copyright (C) 2011-2014 Kathleen Nichols <nichols@pollere.com>. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * o Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * + * o Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
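The sign convention documented above for update_stats() translates into three call shapes; the first two bracket normal traffic, and the last one (len == 0 with drop set) is the form the CoDel code later in this diff uses when discarding an already-dequeued packet:

	update_stats(q, m->m_pkthdr.len, 0);	/* enqueue: lengths grow */
	update_stats(q, -m->m_pkthdr.len, 0);	/* dequeue: lengths shrink */
	update_stats(q, 0, 1);			/* count a drop only */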
+ * + * o The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General Public + * License ("GPL") version 2, in which case the provisions of the GPL + * apply INSTEAD OF those given above. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _IP_DN_AQM_CODEL_H +#define _IP_DN_AQM_CODEL_H + + +// XXX How to choose MTAG? +#define FIX_POINT_BITS 16 + +enum { + CODEL_ECN_ENABLED = 1 +}; + +/* Codel parameters */ +struct dn_aqm_codel_parms { + aqm_time_t target; + aqm_time_t interval; + uint32_t flags; +}; + +/* codel status variables */ +struct codel_status { + uint32_t count; /* number of dropped pkts since entering drop state */ + uint16_t dropping; /* dropping state */ + aqm_time_t drop_next_time; /* time for next drop */ + aqm_time_t first_above_time; /* time for first ts over target we observed */ + uint16_t isqrt; /* last isqrt for control low */ + uint16_t maxpkt_size; /* max packet size seen so far */ +}; + +struct mbuf *codel_extract_head(struct dn_queue *, aqm_time_t *); +aqm_time_t control_law(struct codel_status *, + struct dn_aqm_codel_parms *, aqm_time_t ); + +__inline static struct mbuf * +codel_dodequeue(struct dn_queue *q, aqm_time_t now, uint16_t *ok_to_drop) +{ + struct mbuf * m; + struct dn_aqm_codel_parms *cprms; + struct codel_status *cst; + aqm_time_t pkt_ts, sojourn_time; + + *ok_to_drop = 0; + m = codel_extract_head(q, &pkt_ts); + + cst = q->aqm_status; + + if (m == NULL) { + /* queue is empty - we can't be above target */ + cst->first_above_time= 0; + return m; + } + + cprms = q->fs->aqmcfg; + + /* To span a large range of bandwidths, CoDel runs two + * different AQMs in parallel. One is sojourn-time-based + * and takes effect when the time to send an MTU-sized + * packet is less than target. The 1st term of the "if" + * below does this. The other is backlog-based and takes + * effect when the time to send an MTU-sized packet is >= + * target. The goal here is to keep the output link + * utilization high by never allowing the queue to get + * smaller than the amount that arrives in a typical + * interarrival time (MTU-sized packets arriving spaced + * by the amount of time it takes to send such a packet on + * the bottleneck). The 2nd term of the "if" does this. + */ + sojourn_time = now - pkt_ts; + if (sojourn_time < cprms->target || q->ni.len_bytes <= cst->maxpkt_size) { + /* went below - stay below for at least interval */ + cst->first_above_time = 0; + } else { + if (cst->first_above_time == 0) { + /* just went above from below. 
if still above at
+			 * first_above_time, will say it's ok to drop. */
+			cst->first_above_time = now + cprms->interval;
+		} else if (now >= cst->first_above_time) {
+			*ok_to_drop = 1;
+		}
+	}
+	return m;
+}
+
+/*
+ * Dequeue a packet from queue 'q'
+ */
+__inline static struct mbuf *
+codel_dequeue(struct dn_queue *q)
+{
+	struct mbuf *m;
+	struct dn_aqm_codel_parms *cprms;
+	struct codel_status *cst;
+	aqm_time_t now;
+	uint16_t ok_to_drop;
+
+	cst = q->aqm_status;
+	cprms = q->fs->aqmcfg;
+	now = AQM_UNOW;
+
+	m = codel_dodequeue(q, now, &ok_to_drop);
+	if (cst->dropping) {
+		if (!ok_to_drop) {
+			/* sojourn time below target - leave dropping state */
+			cst->dropping = false;
+		}
+		/*
+		 * Time for the next drop. Drop current packet and dequeue
+		 * next. If the dequeue doesn't take us out of dropping
+		 * state, schedule the next drop. A large backlog might
+		 * result in drop rates so high that the next drop should
+		 * happen now, hence the 'while' loop.
+		 */
+		while (now >= cst->drop_next_time && cst->dropping) {
+
+			/* mark the packet */
+			if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) {
+				cst->count++;
+				/* schedule the next mark. */
+				cst->drop_next_time = control_law(cst, cprms,
+					cst->drop_next_time);
+				return m;
+			}
+
+			/* drop the packet */
+			update_stats(q, 0, 1);
+			FREE_PKT(m);
+			m = codel_dodequeue(q, now, &ok_to_drop);
+
+			if (!ok_to_drop) {
+				/* leave dropping state */
+				cst->dropping = false;
+			} else {
+				cst->count++;
+				/* schedule the next drop. */
+				cst->drop_next_time = control_law(cst, cprms,
+					cst->drop_next_time);
+			}
+		}
+	/* If we get here we're not in dropping state. The 'ok_to_drop'
+	 * return from dodequeue means that the sojourn time has been
+	 * above 'target' for 'interval' so enter dropping state.
+	 */
+	} else if (ok_to_drop) {
+
+		/* if ECN option is disabled or the packet cannot be marked,
+		 * drop the packet and extract another.
+		 */
+		if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) {
+			update_stats(q, 0, 1);
+			FREE_PKT(m);
+			m = codel_dodequeue(q, now, &ok_to_drop);
+		}
+
+		cst->dropping = true;
+
+		/* If min went above target close to when it last went
+		 * below, assume that the drop rate that controlled the
+		 * queue on the last cycle is a good starting point to
+		 * control it now. ('drop_next' will be at most 'interval'
+		 * later than the time of the last drop so 'now - drop_next'
+		 * is a good approximation of the time from the last drop
+		 * until now.)
+		 */
+		cst->count = (cst->count > 2 && ((aqm_stime_t)now -
+			(aqm_stime_t)cst->drop_next_time) < 8 * cprms->interval) ?
+			cst->count - 2 : 1;
+		/* we don't have to set an initial guess for Newton's method
+		 * isqrt as we initialize isqrt in the control_law function
+		 * when count == 1 */
+		cst->drop_next_time = control_law(cst, cprms, now);
+	}
+
+	return m;
+}
+
+#endif
diff --git a/freebsd/sys/netpfil/ipfw/dn_aqm_pie.h b/freebsd/sys/netpfil/ipfw/dn_aqm_pie.h
new file mode 100644
index 00000000..aa2fceba
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/dn_aqm_pie.h
@@ -0,0 +1,153 @@
+/*
+ * PIE - Proportional Integral controller Enhanced AQM algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
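control_law() is only declared in the CoDel header; its definition is not part of this hunk. Per the comments above, it schedules the next drop interval/sqrt(count) after the previous one, using a fixed-point Newton's-method isqrt. Below is a floating-point sketch of that schedule for illustration only (it assumes <math.h>, which the kernel code cannot use):

	static aqm_time_t
	control_law_sketch(struct codel_status *cst,
	    struct dn_aqm_codel_parms *cprms, aqm_time_t t)
	{
		/* the next drop time moves closer as the drop count grows */
		return (t + (aqm_time_t)(cprms->interval /
		    sqrt((double)cst->count)));
	}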
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _IP_DN_AQM_PIE_H
+#define _IP_DN_AQM_PIE_H
+
+#define DN_AQM_PIE 2
+#define PIE_DQ_THRESHOLD_BITS 14
+/* 2^14 = 16KB */
+#define PIE_DQ_THRESHOLD (1UL << PIE_DQ_THRESHOLD_BITS)
+#define MEAN_PKTSIZE 800
+
+/* 31 bits because random() generates values in the range 0..(2**31)-1 */
+#define PIE_PROB_BITS 31
+#define PIE_MAX_PROB ((1ULL<<PIE_PROB_BITS) -1)
+
+/* for 16 bits, we have 3 bits for the integer part and 13 bits for the fraction */
+#define PIE_FIX_POINT_BITS 13
+#define PIE_SCALE (1UL<<PIE_FIX_POINT_BITS)
+
+
+/* PIE options */
+enum {
+	PIE_ECN_ENABLED = 1,
+	PIE_CAPDROP_ENABLED = 2,
+	PIE_ON_OFF_MODE_ENABLED = 4,
+	PIE_DEPRATEEST_ENABLED = 8,
+	PIE_DERAND_ENABLED = 16
+};
+
+/* PIE parameters */
+struct dn_aqm_pie_parms {
+	aqm_time_t	qdelay_ref;	/* AQM Latency Target (default: 15ms) */
+	aqm_time_t	tupdate;	/* drop probability update period (default: 15ms) */
+	aqm_time_t	max_burst;	/* AQM Max Burst Allowance (default: 150ms) */
+	uint16_t	max_ecnth;	/* AQM Max ECN Marking Threshold (default: 10%) */
+	uint16_t	alpha;		/* (default: 1/8) */
+	uint16_t	beta;		/* (default: 1+1/4) */
+	uint32_t	flags;		/* PIE options */
+};
+
+/* PIE status variables */
+struct pie_status {
+	struct callout	aqm_pie_callout;
+	aqm_time_t	burst_allowance;
+	uint32_t	drop_prob;
+	aqm_time_t	current_qdelay;
+	aqm_time_t	qdelay_old;
+	uint64_t	accu_prob;
+	aqm_time_t	measurement_start;
+	aqm_time_t	avg_dq_time;
+	uint32_t	dq_count;
+	uint32_t	sflags;
+	struct dn_aqm_pie_parms *parms;	/* pointer to PIE configurations */
+	/* pointer to parent queue of FQ-PIE sub-queues, or queue of owner fs. */
+	struct dn_queue	*pq;
+	struct mtx	lock_mtx;
+	uint32_t	one_third_q_size; /* 1/3 of queue size, for speed optimization */
+};
+
+enum {
+	ENQUE = 1,
+	DROP,
+	MARKECN
+};
+
+/* PIE current state */
+enum {
+	PIE_ACTIVE = 1,
+	PIE_INMEASUREMENT = 2
+};
+
+/*
+ * Check whether enqueue should drop the packet to control delay,
+ * based on the PIE algorithm.
+ * Return DROP if it is time to drop, or ENQUE otherwise.
+ * This function is used by PIE and FQ-PIE.
+ */ +__inline static int +drop_early(struct pie_status *pst, uint32_t qlen) +{ + struct dn_aqm_pie_parms *pprms; + + pprms = pst->parms; + + /* queue is not congested */ + + if ((pst->qdelay_old < (pprms->qdelay_ref >> 1) + && pst->drop_prob < PIE_MAX_PROB / 5 ) + || qlen <= 2 * MEAN_PKTSIZE) + return ENQUE; + + + if (pst->drop_prob == 0) + pst->accu_prob = 0; + + /* increment accu_prob */ + if (pprms->flags & PIE_DERAND_ENABLED) + pst->accu_prob += pst->drop_prob; + + /* De-randomize option + * if accu_prob < 0.85 -> enqueue + * if accu_prob>8.5 ->drop + * between 0.85 and 8.5 || !De-randomize --> drop on prob + * + * (0.85 = 17/20 ,8.5 = 17/2) + */ + if (pprms->flags & PIE_DERAND_ENABLED) { + if(pst->accu_prob < (uint64_t) (PIE_MAX_PROB * 17 / 20)) + return ENQUE; + if( pst->accu_prob >= (uint64_t) (PIE_MAX_PROB * 17 / 2)) + return DROP; + } + + if (random() < pst->drop_prob) { + pst->accu_prob = 0; + return DROP; + } + + return ENQUE; +} + +#endif diff --git a/freebsd/sys/netpfil/ipfw/dn_heap.c b/freebsd/sys/netpfil/ipfw/dn_heap.c deleted file mode 100644 index 15e2870d..00000000 --- a/freebsd/sys/netpfil/ipfw/dn_heap.c +++ /dev/null @@ -1,554 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/*- - * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * Binary heap and hash tables, used in dummynet - * - * $FreeBSD$ - */ - -#include <sys/cdefs.h> -#include <rtems/bsd/sys/param.h> -#ifdef _KERNEL -__FBSDID("$FreeBSD$"); -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/kernel.h> -#include <netpfil/ipfw/dn_heap.h> -#ifndef log -#define log(x, arg...) -#endif - -#else /* !_KERNEL */ - -#include <stdio.h> -#include <dn_test.h> -#include <strings.h> -#include <stdlib.h> - -#include "dn_heap.h" -#define log(x, arg...) fprintf(stderr, ## arg) -#define panic(x...) fprintf(stderr, ## x), exit(1) -#define MALLOC_DEFINE(a, b, c) -static void *my_malloc(int s) { return malloc(s); } -static void my_free(void *p) { free(p); } -#define malloc(s, t, w) my_malloc(s) -#define free(p, t) my_free(p) -#endif /* !_KERNEL */ - -MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap"); - -/* - * Heap management functions. - * - * In the heap, first node is element 0. 
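The fixed-point thresholds in drop_early() above scale probabilities so that 1.0 corresponds to PIE_MAX_PROB; 0.85 and 8.5 therefore appear as 17/20 and 17/2 of that constant. A worked example with an illustrative drop probability of 0.1:

	uint64_t accu = 0;
	uint32_t p = PIE_MAX_PROB / 10;	/* a drop_prob of 0.1 */
	/* through 8 enqueues: accu = 0.8 < 0.85, packets are force-enqueued;
	 * at the 9th enqueue: accu = 0.9 > 0.85, random dropping may begin;
	 * at the 85th enqueue: accu = 8.5, the drop becomes unconditional. */
	accu += p;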
Children of i are 2i+1 and 2i+2. - * Some macros help finding parent/children so we can optimize them. - * - * heap_init() is called to expand the heap when needed. - * Increment size in blocks of 16 entries. - * Returns 1 on error, 0 on success - */ -#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) -#define HEAP_LEFT(x) ( (x)+(x) + 1 ) -#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } -#define HEAP_INCREMENT 15 - -static int -heap_resize(struct dn_heap *h, unsigned int new_size) -{ - struct dn_heap_entry *p; - - if (h->size >= new_size ) /* have enough room */ - return 0; -#if 1 /* round to the next power of 2 */ - new_size |= new_size >> 1; - new_size |= new_size >> 2; - new_size |= new_size >> 4; - new_size |= new_size >> 8; - new_size |= new_size >> 16; -#else - new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT; -#endif - p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT); - if (p == NULL) { - printf("--- %s, resize %d failed\n", __func__, new_size ); - return 1; /* error */ - } - if (h->size > 0) { - bcopy(h->p, p, h->size * sizeof(*p) ); - free(h->p, M_DN_HEAP); - } - h->p = p; - h->size = new_size; - return 0; -} - -int -heap_init(struct dn_heap *h, int size, int ofs) -{ - if (heap_resize(h, size)) - return 1; - h->elements = 0; - h->ofs = ofs; - return 0; -} - -/* - * Insert element in heap. Normally, p != NULL, we insert p in - * a new position and bubble up. If p == NULL, then the element is - * already in place, and key is the position where to start the - * bubble-up. - * Returns 1 on failure (cannot allocate new heap entry) - * - * If ofs > 0 the position (index, int) of the element in the heap is - * also stored in the element itself at the given offset in bytes. - */ -#define SET_OFFSET(h, i) do { \ - if (h->ofs > 0) \ - *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \ - } while (0) -/* - * RESET_OFFSET is used for sanity checks. It sets ofs - * to an invalid value. - */ -#define RESET_OFFSET(h, i) do { \ - if (h->ofs > 0) \ - *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \ - } while (0) - -int -heap_insert(struct dn_heap *h, uint64_t key1, void *p) -{ - int son = h->elements; - - //log("%s key %llu p %p\n", __FUNCTION__, key1, p); - if (p == NULL) { /* data already there, set starting point */ - son = key1; - } else { /* insert new element at the end, possibly resize */ - son = h->elements; - if (son == h->size) /* need resize... */ - // XXX expand by 16 or so - if (heap_resize(h, h->elements+16) ) - return 1; /* failure... 
*/ - h->p[son].object = p; - h->p[son].key = key1; - h->elements++; - } - /* make sure that son >= father along the path */ - while (son > 0) { - int father = HEAP_FATHER(son); - struct dn_heap_entry tmp; - - if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) - break; /* found right position */ - /* son smaller than father, swap and repeat */ - HEAP_SWAP(h->p[son], h->p[father], tmp); - SET_OFFSET(h, son); - son = father; - } - SET_OFFSET(h, son); - return 0; -} - -/* - * remove top element from heap, or obj if obj != NULL - */ -void -heap_extract(struct dn_heap *h, void *obj) -{ - int child, father, max = h->elements - 1; - - if (max < 0) { - printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h); - return; - } - if (obj == NULL) - father = 0; /* default: move up smallest child */ - else { /* extract specific element, index is at offset */ - if (h->ofs <= 0) - panic("%s: extract from middle not set on %p\n", - __FUNCTION__, h); - father = *((int *)((char *)obj + h->ofs)); - if (father < 0 || father >= h->elements) { - panic("%s: father %d out of bound 0..%d\n", - __FUNCTION__, father, h->elements); - } - } - /* - * below, father is the index of the empty element, which - * we replace at each step with the smallest child until we - * reach the bottom level. - */ - // XXX why removing RESET_OFFSET increases runtime by 10% ? - RESET_OFFSET(h, father); - while ( (child = HEAP_LEFT(father)) <= max ) { - if (child != max && - DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) - child++; /* take right child, otherwise left */ - h->p[father] = h->p[child]; - SET_OFFSET(h, father); - father = child; - } - h->elements--; - if (father != max) { - /* - * Fill hole with last entry and bubble up, - * reusing the insert code - */ - h->p[father] = h->p[max]; - heap_insert(h, father, NULL); - } -} - -#if 0 -/* - * change object position and update references - * XXX this one is never used! - */ -static void -heap_move(struct dn_heap *h, uint64_t new_key, void *object) -{ - int temp, i, max = h->elements-1; - struct dn_heap_entry *p, buf; - - if (h->ofs <= 0) - panic("cannot move items on this heap"); - p = h->p; /* shortcut */ - - i = *((int *)((char *)object + h->ofs)); - if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */ - p[i].key = new_key; - for (; i>0 && - DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key); - i = temp ) { /* bubble up */ - HEAP_SWAP(p[i], p[temp], buf); - SET_OFFSET(h, i); - } - } else { /* must move down */ - p[i].key = new_key; - while ( (temp = HEAP_LEFT(i)) <= max ) { - /* found left child */ - if (temp != max && - DN_KEY_LT(p[temp+1].key, p[temp].key)) - temp++; /* select child with min key */ - if (DN_KEY_LT(>p[temp].key, new_key)) { - /* go down */ - HEAP_SWAP(p[i], p[temp], buf); - SET_OFFSET(h, i); - } else - break; - i = temp; - } - } - SET_OFFSET(h, i); -} -#endif /* heap_move, unused */ - -/* - * heapify() will reorganize data inside an array to maintain the - * heap property. It is needed when we delete a bunch of entries. 
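A sketch of the offset mechanism described above, using a hypothetical caller type: the object reserves an int32_t field for its heap index, hands that field's byte offset to heap_init(), and heap_extract() can then remove the object from the middle of the heap.

	#include <stddef.h>			/* offsetof */

	struct my_event {			/* hypothetical user object */
		uint64_t deadline;
		int32_t  heap_pos;		/* maintained via SET_OFFSET() */
	};

	struct my_event ev = { .deadline = 1000 };
	struct dn_heap h = { 0 };

	heap_init(&h, 16, offsetof(struct my_event, heap_pos));
	heap_insert(&h, ev.deadline, &ev);	/* key + pointer */
	heap_extract(&h, &ev);			/* remove it from anywhere */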
- */ -static void -heapify(struct dn_heap *h) -{ - int i; - - for (i = 0; i < h->elements; i++ ) - heap_insert(h, i , NULL); -} - -int -heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t), - uintptr_t arg) -{ - int i, ret, found; - - for (i = found = 0 ; i < h->elements ;) { - ret = fn(h->p[i].object, arg); - if (ret & HEAP_SCAN_DEL) { - h->elements-- ; - h->p[i] = h->p[h->elements] ; - found++ ; - } else - i++ ; - if (ret & HEAP_SCAN_END) - break; - } - if (found) - heapify(h); - return found; -} - -/* - * cleanup the heap and free data structure - */ -void -heap_free(struct dn_heap *h) -{ - if (h->size >0 ) - free(h->p, M_DN_HEAP); - bzero(h, sizeof(*h) ); -} - -/* - * hash table support. - */ - -struct dn_ht { - int buckets; /* how many buckets, really buckets - 1*/ - int entries; /* how many entries */ - int ofs; /* offset of link field */ - uint32_t (*hash)(uintptr_t, int, void *arg); - int (*match)(void *_el, uintptr_t key, int, void *); - void *(*newh)(uintptr_t, int, void *); - void **ht; /* bucket heads */ -}; -/* - * Initialize, allocating bucket pointers inline. - * Recycle previous record if possible. - * If the 'newh' function is not supplied, we assume that the - * key passed to ht_find is the same object to be stored in. - */ -struct dn_ht * -dn_ht_init(struct dn_ht *ht, int buckets, int ofs, - uint32_t (*h)(uintptr_t, int, void *), - int (*match)(void *, uintptr_t, int, void *), - void *(*newh)(uintptr_t, int, void *)) -{ - int l; - - /* - * Notes about rounding bucket size to a power of two. - * Given the original bucket size, we compute the nearest lower and - * higher power of two, minus 1 (respectively b_min and b_max) because - * this value will be used to do an AND with the index returned - * by hash function. - * To choice between these two values, the original bucket size is - * compared with b_min. If the original size is greater than 4/3 b_min, - * we round the bucket size to b_max, else to b_min. - * This ratio try to round to the nearest power of two, advantaging - * the greater size if the different between two power is relatively - * big. - * Rounding the bucket size to a power of two avoid the use of - * module when calculating the correct bucket. - * The ht->buckets variable store the bucket size - 1 to simply - * do an AND between the index returned by hash function and ht->bucket - * instead of a module. 
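The bucket-rounding rule in the comment above, restated as a standalone sketch (illustrative, not the committed code): the requested size is rounded to the nearest power of two, and the stored mask (buckets - 1) replaces a modulo with an AND.

	static int
	round_buckets_sketch(int want)
	{
		int b = want;

		b |= b >> 1; b |= b >> 2; b |= b >> 4;
		b |= b >> 8; b |= b >> 16;	/* b = next (2^k)-1 >= want */
		/* keep the smaller power unless 'want' exceeds it by > 4/3 */
		return ((b >> 1) * 4 / 3 < want) ? b : (b >> 1);
	}

A lookup then computes its bucket as hash(key) & mask instead of hash(key) % nbuckets.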
- */ - int b_min; /* min buckets */ - int b_max; /* max buckets */ - int b_ori; /* original buckets */ - - if (h == NULL || match == NULL) { - printf("--- missing hash or match function"); - return NULL; - } - if (buckets < 1 || buckets > 65536) - return NULL; - - b_ori = buckets; - /* calculate next power of 2, - 1*/ - buckets |= buckets >> 1; - buckets |= buckets >> 2; - buckets |= buckets >> 4; - buckets |= buckets >> 8; - buckets |= buckets >> 16; - - b_max = buckets; /* Next power */ - b_min = buckets >> 1; /* Previous power */ - - /* Calculate the 'nearest' bucket size */ - if (b_min * 4000 / 3000 < b_ori) - buckets = b_max; - else - buckets = b_min; - - if (ht) { /* see if we can reuse */ - if (buckets <= ht->buckets) { - ht->buckets = buckets; - } else { - /* free pointers if not allocated inline */ - if (ht->ht != (void *)(ht + 1)) - free(ht->ht, M_DN_HEAP); - free(ht, M_DN_HEAP); - ht = NULL; - } - } - if (ht == NULL) { - /* Allocate buckets + 1 entries because buckets is use to - * do the AND with the index returned by hash function - */ - l = sizeof(*ht) + (buckets + 1) * sizeof(void **); - ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO); - } - if (ht) { - ht->ht = (void **)(ht + 1); - ht->buckets = buckets; - ht->ofs = ofs; - ht->hash = h; - ht->match = match; - ht->newh = newh; - } - return ht; -} - -/* dummy callback for dn_ht_free to unlink all */ -static int -do_del(void *obj, void *arg) -{ - return DNHT_SCAN_DEL; -} - -void -dn_ht_free(struct dn_ht *ht, int flags) -{ - if (ht == NULL) - return; - if (flags & DNHT_REMOVE) { - (void)dn_ht_scan(ht, do_del, NULL); - } else { - if (ht->ht && ht->ht != (void *)(ht + 1)) - free(ht->ht, M_DN_HEAP); - free(ht, M_DN_HEAP); - } -} - -int -dn_ht_entries(struct dn_ht *ht) -{ - return ht ? ht->entries : 0; -} - -/* lookup and optionally create or delete element */ -void * -dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg) -{ - int i; - void **pp, *p; - - if (ht == NULL) /* easy on an empty hash */ - return NULL; - i = (ht->buckets == 1) ? 0 : - (ht->hash(key, flags, arg) & ht->buckets); - - for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) { - if (flags & DNHT_MATCH_PTR) { - if (key == (uintptr_t)p) - break; - } else if (ht->match(p, key, flags, arg)) /* found match */ - break; - } - if (p) { - if (flags & DNHT_REMOVE) { - /* link in the next element */ - *pp = *(void **)((char *)p + ht->ofs); - *(void **)((char *)p + ht->ofs) = NULL; - ht->entries--; - } - } else if (flags & DNHT_INSERT) { - // printf("%s before calling new, bucket %d ofs %d\n", - // __FUNCTION__, i, ht->ofs); - p = ht->newh ? ht->newh(key, flags, arg) : (void *)key; - // printf("%s newh returns %p\n", __FUNCTION__, p); - if (p) { - ht->entries++; - *(void **)((char *)p + ht->ofs) = ht->ht[i]; - ht->ht[i] = p; - } - } - return p; -} - -/* - * do a scan with the option to delete the object. Extract next before - * running the callback because the element may be destroyed there. 
- */ -int -dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg) -{ - int i, ret, found = 0; - void **curp, *cur, *next; - - if (ht == NULL || fn == NULL) - return 0; - for (i = 0; i <= ht->buckets; i++) { - curp = &ht->ht[i]; - while ( (cur = *curp) != NULL) { - next = *(void **)((char *)cur + ht->ofs); - ret = fn(cur, arg); - if (ret & DNHT_SCAN_DEL) { - found++; - ht->entries--; - *curp = next; - } else { - curp = (void **)((char *)cur + ht->ofs); - } - if (ret & DNHT_SCAN_END) - return found; - } - } - return found; -} - -/* - * Similar to dn_ht_scan(), except that the scan is performed only - * in the bucket 'bucket'. The function returns a correct bucket number if - * the original is invalid. - * If the callback returns DNHT_SCAN_END, the function move the ht->ht[i] - * pointer to the last entry processed. Moreover, the bucket number passed - * by caller is decremented, because usually the caller increment it. - */ -int -dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *), - void *arg) -{ - int i, ret, found = 0; - void **curp, *cur, *next; - - if (ht == NULL || fn == NULL) - return 0; - if (*bucket > ht->buckets) - *bucket = 0; - i = *bucket; - - curp = &ht->ht[i]; - while ( (cur = *curp) != NULL) { - next = *(void **)((char *)cur + ht->ofs); - ret = fn(cur, arg); - if (ret & DNHT_SCAN_DEL) { - found++; - ht->entries--; - *curp = next; - } else { - curp = (void **)((char *)cur + ht->ofs); - } - if (ret & DNHT_SCAN_END) - return found; - } - return found; -} diff --git a/freebsd/sys/netpfil/ipfw/dn_heap.h b/freebsd/sys/netpfil/ipfw/dn_heap.h index c95473ad..cb6e03ef 100644 --- a/freebsd/sys/netpfil/ipfw/dn_heap.h +++ b/freebsd/sys/netpfil/ipfw/dn_heap.h @@ -83,7 +83,7 @@ enum { * heap_insert() adds a key-pointer pair to the heap * * HEAP_TOP() returns a pointer to the top element of the heap, - * but makes no checks on its existance (XXX should we change ?) + * but makes no checks on its existence (XXX should we change ?) * * heap_extract() removes the entry at the top, returing the pointer. * (the key should have been read before). @@ -146,7 +146,7 @@ int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); * of the dn_ht_find(), and of the callbacks: * * DNHT_KEY_IS_OBJ means the key is the object pointer. - * It is usally of interest for the hash and match functions. + * It is usually of interest for the hash and match functions. * * DNHT_MATCH_PTR during a lookup, match pointers instead * of calling match(). 
Normally used when removing specific diff --git a/freebsd/sys/netpfil/ipfw/dn_sched.h b/freebsd/sys/netpfil/ipfw/dn_sched.h index ab823fe7..ab32771b 100644 --- a/freebsd/sys/netpfil/ipfw/dn_sched.h +++ b/freebsd/sys/netpfil/ipfw/dn_sched.h @@ -132,6 +132,10 @@ struct dn_alg { int (*free_fsk)(struct dn_fsk *f); int (*new_queue)(struct dn_queue *q); int (*free_queue)(struct dn_queue *q); +#ifdef NEW_AQM + /* Getting scheduler extra parameters */ + int (*getconfig)(struct dn_schk *, struct dn_extra_parms *); +#endif /* run-time fields */ int ref_count; /* XXX number of instances in the system */ @@ -165,7 +169,13 @@ dn_dequeue(struct dn_queue *q) struct mbuf *m = q->mq.head; if (m == NULL) return NULL; +#ifdef NEW_AQM + /* Call AQM dequeue function */ + if (q->fs->aqmfp && q->fs->aqmfp->dequeue ) + return q->fs->aqmfp->dequeue(q); +#endif q->mq.head = m->m_nextpkt; + q->mq.count--; /* Update stats for the queue */ q->ni.length--; @@ -186,6 +196,6 @@ int dn_sched_modevent(module_t mod, int cmd, void *arg); #name, dn_sched_modevent, dnsched \ }; \ DECLARE_MODULE(name, name##_mod, \ - SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ - MODULE_DEPEND(name, dummynet, 3, 3, 3); + SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY); \ + MODULE_DEPEND(name, dummynet, 3, 3, 3) #endif /* _DN_SCHED_H */ diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_fifo.c b/freebsd/sys/netpfil/ipfw/dn_sched_fifo.c deleted file mode 100644 index 154a7ac6..00000000 --- a/freebsd/sys/netpfil/ipfw/dn_sched_fifo.c +++ /dev/null @@ -1,122 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/* - * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -/* - * $FreeBSD$ - */ - -#ifdef _KERNEL -#include <sys/malloc.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <net/if.h> /* IFNAMSIZ */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ipfw_rule_ref */ -#include <netinet/ip_fw.h> /* flow_id */ -#include <netinet/ip_dummynet.h> -#include <netpfil/ipfw/dn_heap.h> -#include <netpfil/ipfw/ip_dn_private.h> -#include <netpfil/ipfw/dn_sched.h> -#else -#include <dn_test.h> -#endif - -/* - * This file implements a FIFO scheduler for a single queue. - * The queue is allocated as part of the scheduler instance, - * and there is a single flowset is in the template which stores - * queue size and policy. - * Enqueue and dequeue use the default library functions. - */ -static int -fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m) -{ - /* XXX if called with q != NULL and m=NULL, this is a - * re-enqueue from an existing scheduler, which we should - * handle. - */ - return dn_enqueue((struct dn_queue *)(si+1), m, 0); -} - -static struct mbuf * -fifo_dequeue(struct dn_sch_inst *si) -{ - return dn_dequeue((struct dn_queue *)(si + 1)); -} - -static int -fifo_new_sched(struct dn_sch_inst *si) -{ - /* This scheduler instance contains the queue */ - struct dn_queue *q = (struct dn_queue *)(si + 1); - - set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); - q->_si = si; - q->fs = si->sched->fs; - return 0; -} - -static int -fifo_free_sched(struct dn_sch_inst *si) -{ - struct dn_queue *q = (struct dn_queue *)(si + 1); - dn_free_pkts(q->mq.head); - bzero(q, sizeof(*q)); - return 0; -} - -/* - * FIFO scheduler descriptor - * contains the type of the scheduler, the name, the size of extra - * data structures, and function pointers. - */ -static struct dn_alg fifo_desc = { - _SI( .type = ) DN_SCHED_FIFO, - _SI( .name = ) "FIFO", - _SI( .flags = ) 0, - - _SI( .schk_datalen = ) 0, - _SI( .si_datalen = ) sizeof(struct dn_queue), - _SI( .q_datalen = ) 0, - - _SI( .enqueue = ) fifo_enqueue, - _SI( .dequeue = ) fifo_dequeue, - _SI( .config = ) NULL, - _SI( .destroy = ) NULL, - _SI( .new_sched = ) fifo_new_sched, - _SI( .free_sched = ) fifo_free_sched, - _SI( .new_fsk = ) NULL, - _SI( .free_fsk = ) NULL, - _SI( .new_queue = ) NULL, - _SI( .free_queue = ) NULL, -}; - -DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_fq_codel.h b/freebsd/sys/netpfil/ipfw/dn_sched_fq_codel.h new file mode 100644 index 00000000..4b65781e --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/dn_sched_fq_codel.h @@ -0,0 +1,167 @@ +/*- + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * FQ_Codel Structures and helper functions + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_SCHED_FQ_CODEL_H +#define _IP_DN_SCHED_FQ_CODEL_H + +/* list of queues */ +STAILQ_HEAD(fq_codel_list, fq_codel_flow) ; + +/* fq_codel parameters including codel */ +struct dn_sch_fq_codel_parms { + struct dn_aqm_codel_parms ccfg; /* CoDel Parameters */ + /* FQ_CODEL Parameters */ + uint32_t flows_cnt; /* number of flows */ + uint32_t limit; /* hard limit of fq_codel queue size*/ + uint32_t quantum; +}; /* defaults */ + +/* flow (sub-queue) stats */ +struct flow_stats { + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue length, in packets */ + uint32_t len_bytes; /* Queue length, in bytes */ + uint32_t drops; +}; + +/* A flow of packets (sub-queue).*/ +struct fq_codel_flow { + struct mq mq; /* list of packets */ + struct flow_stats stats; /* statistics */ + int deficit; + int active; /* 1: flow is active (in a list) */ + struct codel_status cst; + STAILQ_ENTRY(fq_codel_flow) flowchain; +}; + +/* extra fq_codel scheduler configurations */ +struct fq_codel_schk { + struct dn_sch_fq_codel_parms cfg; +}; + +/* fq_codel scheduler instance */ +struct fq_codel_si { + struct dn_sch_inst _si; /* standard scheduler instance */ + struct dn_queue main_q; /* main queue is after si directly */ + + struct fq_codel_flow *flows; /* array of flows (queues) */ + uint32_t perturbation; /* random value */ + struct fq_codel_list newflows; /* list of new queues */ + struct fq_codel_list oldflows; /* list of old queues */ +}; + +/* Helper function to update queue&main-queue and scheduler statistics. 
+ * negative len + drop -> drop + * negative len -> dequeue + * positive len -> enqueue + * positive len + drop -> drop during enqueue + */ +__inline static void +fq_update_stats(struct fq_codel_flow *q, struct fq_codel_si *si, int len, + int drop) +{ + int inc = 0; + + if (len < 0) + inc = -1; + else if (len > 0) + inc = 1; + + if (drop) { + si->main_q.ni.drops ++; + q->stats.drops ++; + si->_si.ni.drops ++; + io_pkt_drop ++; + } + + if (!drop || (drop && len < 0)) { + /* Update stats for the main queue */ + si->main_q.ni.length += inc; + si->main_q.ni.len_bytes += len; + + /*update sub-queue stats */ + q->stats.length += inc; + q->stats.len_bytes += len; + + /*update scheduler instance stats */ + si->_si.ni.length += inc; + si->_si.ni.len_bytes += len; + } + + if (inc > 0) { + si->main_q.ni.tot_bytes += len; + si->main_q.ni.tot_pkts ++; + + q->stats.tot_bytes +=len; + q->stats.tot_pkts++; + + si->_si.ni.tot_bytes +=len; + si->_si.ni.tot_pkts ++; + } + +} + +/* extract the head of fq_codel sub-queue */ +__inline static struct mbuf * +fq_codel_extract_head(struct fq_codel_flow *q, aqm_time_t *pkt_ts, struct fq_codel_si *si) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 0); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + + /* extract packet timestamp*/ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL){ + D("timestamp tag is not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + + return m; +} + + +#endif diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h b/freebsd/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h new file mode 100644 index 00000000..da663dc8 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h @@ -0,0 +1,187 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Copyright (C) 2011-2014 Kathleen Nichols <nichols@pollere.com>. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * o Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * + * o Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * o The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General Public + * License ("GPL") version 2, in which case the provisions of the GPL + * apply INSTEAD OF those given above. 
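fq_codel_extract_head() later in this hunk reads an arrival timestamp out of an mbuf tag; the enqueue side that attaches that tag is not part of this header. A sketch of that producer side for an mbuf m being enqueued, assuming the standard mbuf-tag API (illustrative, not the committed enqueue code):

	struct m_tag *mtag;

	mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS,
	    sizeof(aqm_time_t), M_NOWAIT);
	if (mtag != NULL) {
		*(aqm_time_t *)(mtag + 1) = AQM_UNOW;	/* stamp arrival time */
		m_tag_prepend(m, mtag);
	}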
+ + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _IP_DN_SCHED_FQ_CODEL_HELPER_H +#define _IP_DN_SCHED_FQ_CODEL_HELPER_H + +__inline static struct mbuf * +fqc_dodequeue(struct fq_codel_flow *q, aqm_time_t now, uint16_t *ok_to_drop, + struct fq_codel_si *si) +{ + struct mbuf * m; + struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1); + aqm_time_t pkt_ts, sojourn_time; + + *ok_to_drop = 0; + m = fq_codel_extract_head(q, &pkt_ts, si); + + if (m == NULL) { + /*queue is empty - we can't be above target*/ + q->cst.first_above_time= 0; + return m; + } + + /* To span a large range of bandwidths, CoDel runs two + * different AQMs in parallel. One is sojourn-time-based + * and takes effect when the time to send an MTU-sized + * packet is less than target. The 1st term of the "if" + * below does this. The other is backlog-based and takes + * effect when the time to send an MTU-sized packet is >= + * target. The goal here is to keep the output link + * utilization high by never allowing the queue to get + * smaller than the amount that arrives in a typical + * interarrival time (MTU-sized packets arriving spaced + * by the amount of time it takes to send such a packet on + * the bottleneck). The 2nd term of the "if" does this. + */ + sojourn_time = now - pkt_ts; + if (sojourn_time < schk->cfg.ccfg.target || q->stats.len_bytes <= q->cst.maxpkt_size) { + /* went below - stay below for at least interval */ + q->cst.first_above_time = 0; + } else { + if (q->cst.first_above_time == 0) { + /* just went above from below. if still above at + * first_above_time, will say it's ok to drop. */ + q->cst.first_above_time = now + schk->cfg.ccfg.interval; + } else if (now >= q->cst.first_above_time) { + *ok_to_drop = 1; + } + } + return m; +} + +/* Codel dequeue function */ +__inline static struct mbuf * +fqc_codel_dequeue(struct fq_codel_flow *q, struct fq_codel_si *si) +{ + struct mbuf *m; + struct dn_aqm_codel_parms *cprms; + struct codel_status *cst; + aqm_time_t now; + uint16_t ok_to_drop; + struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1); + + cst = &q->cst; + cprms = &schk->cfg.ccfg; + + now = AQM_UNOW; + m = fqc_dodequeue(q, now, &ok_to_drop, si); + + if (cst->dropping) { + if (!ok_to_drop) { + /* sojourn time below target - leave dropping state */ + cst->dropping = false; + } + + /* Time for the next drop. Drop current packet and dequeue + * next. If the dequeue doesn't take us out of dropping + * state, schedule the next drop. A large backlog might + * result in drop rates so high that the next drop should + * happen now, hence the 'while' loop. 
+ */ + while (now >= cst->drop_next_time && cst->dropping) { + + /* mark the packet */ + if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) { + cst->count++; + /* schedule the next mark. */ + cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time); + return m; + } + + /* drop the packet */ + fq_update_stats(q, si, 0, 1); + m_freem(m); + m = fqc_dodequeue(q, now, &ok_to_drop, si); + + if (!ok_to_drop) { + /* leave dropping state */ + cst->dropping = false; + } else { + cst->count++; + /* schedule the next drop. */ + cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time); + } + } + /* If we get here we're not in dropping state. The 'ok_to_drop' + * return from dodequeue means that the sojourn time has been + * above 'target' for 'interval' so enter dropping state. + */ + } else if (ok_to_drop) { + + /* if ECN option is disabled or the packet cannot be marked, + * drop the packet and extract another. + */ + if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) { + fq_update_stats(q, si, 0, 1); + m_freem(m); + m = fqc_dodequeue(q, now, &ok_to_drop,si); + } + + cst->dropping = true; + + /* If min went above target close to when it last went + * below, assume that the drop rate that controlled the + * queue on the last cycle is a good starting point to + * control it now. ('drop_next' will be at most 'interval' + * later than the time of the last drop so 'now - drop_next' + * is a good approximation of the time from the last drop + * until now.) + */ + cst->count = (cst->count > 2 && ((aqm_stime_t)now - + (aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)? cst->count - 2 : 1; + + /* we don't have to set initial guess for Newton's method isqrt as + * we initilaize isqrt in control_law function when count == 1 */ + cst->drop_next_time = control_law(cst, cprms, now); + } + + return m; +} + +#endif diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_prio.c b/freebsd/sys/netpfil/ipfw/dn_sched_prio.c deleted file mode 100644 index 0679db9d..00000000 --- a/freebsd/sys/netpfil/ipfw/dn_sched_prio.c +++ /dev/null @@ -1,231 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/* - * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -/* - * $FreeBSD$ - */ -#ifdef _KERNEL -#include <sys/malloc.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <net/if.h> /* IFNAMSIZ */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ipfw_rule_ref */ -#include <netinet/ip_fw.h> /* flow_id */ -#include <netinet/ip_dummynet.h> -#include <netpfil/ipfw/dn_heap.h> -#include <netpfil/ipfw/ip_dn_private.h> -#include <netpfil/ipfw/dn_sched.h> -#else -#include <dn_test.h> -#endif - -#define DN_SCHED_PRIO 5 //XXX - -#if !defined(_KERNEL) || !defined(__linux__) -#define test_bit(ix, pData) ((*pData) & (1<<(ix))) -#define __set_bit(ix, pData) (*pData) |= (1<<(ix)) -#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) -#endif - -#ifdef __MIPSEL__ -#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) -#endif - -/* Size of the array of queues pointers. */ -#define BITMAP_T unsigned long -#define MAXPRIO (sizeof(BITMAP_T) * 8) - -/* - * The scheduler instance contains an array of pointers to queues, - * one for each priority, and a bitmap listing backlogged queues. - */ -struct prio_si { - BITMAP_T bitmap; /* array bitmap */ - struct dn_queue *q_array[MAXPRIO]; /* Array of queues pointers */ -}; - -/* - * If a queue with the same priority is already backlogged, use - * that one instead of the queue passed as argument. - */ -static int -prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) -{ - struct prio_si *si = (struct prio_si *)(_si + 1); - int prio = q->fs->fs.par[0]; - - if (test_bit(prio, &si->bitmap) == 0) { - /* No queue with this priority, insert */ - __set_bit(prio, &si->bitmap); - si->q_array[prio] = q; - } else { /* use the existing queue */ - q = si->q_array[prio]; - } - if (dn_enqueue(q, m, 0)) - return 1; - return 0; -} - -/* - * Packets are dequeued only from the highest priority queue. - * The function ffs() return the lowest bit in the bitmap that rapresent - * the array index (-1) which contains the pointer to the highest priority - * queue. - * After the dequeue, if this queue become empty, it is index is removed - * from the bitmap. - * Scheduler is idle if the bitmap is empty - * - * NOTE: highest priority is 0, lowest is sched->max_prio_q - */ -static struct mbuf * -prio_dequeue(struct dn_sch_inst *_si) -{ - struct prio_si *si = (struct prio_si *)(_si + 1); - struct mbuf *m; - struct dn_queue *q; - int prio; - - if (si->bitmap == 0) /* scheduler idle */ - return NULL; - - prio = ffs(si->bitmap) - 1; - - /* Take the highest priority queue in the scheduler */ - q = si->q_array[prio]; - // assert(q) - - m = dn_dequeue(q); - if (q->mq.head == NULL) { - /* Queue is now empty, remove from scheduler - * and mark it - */ - si->q_array[prio] = NULL; - __clear_bit(prio, &si->bitmap); - } - return m; -} - -static int -prio_new_sched(struct dn_sch_inst *_si) -{ - struct prio_si *si = (struct prio_si *)(_si + 1); - - bzero(si->q_array, sizeof(si->q_array)); - si->bitmap = 0; - - return 0; -} - -static int -prio_new_fsk(struct dn_fsk *fs) -{ - /* Check if the prioritiy is between 0 and MAXPRIO-1 */ - ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority"); - return 0; -} - -static int -prio_new_queue(struct dn_queue *q) -{ - struct prio_si *si = (struct prio_si *)(q->_si + 1); - int prio = q->fs->fs.par[0]; - struct dn_queue *oldq; - - q->ni.oid.subtype = DN_SCHED_PRIO; - - if (q->mq.head == NULL) - return 0; - - /* Queue already full, must insert in the scheduler or append - * mbufs to existing queue. 
This partly duplicates prio_enqueue - */ - if (test_bit(prio, &si->bitmap) == 0) { - /* No queue with this priority, insert */ - __set_bit(prio, &si->bitmap); - si->q_array[prio] = q; - } else if ( (oldq = si->q_array[prio]) != q) { - /* must append to the existing queue. - * can simply append q->mq.head to q2->... - * and add the counters to those of q2 - */ - oldq->mq.tail->m_nextpkt = q->mq.head; - oldq->mq.tail = q->mq.tail; - oldq->ni.length += q->ni.length; - q->ni.length = 0; - oldq->ni.len_bytes += q->ni.len_bytes; - q->ni.len_bytes = 0; - q->mq.tail = q->mq.head = NULL; - } - return 0; -} - -static int -prio_free_queue(struct dn_queue *q) -{ - int prio = q->fs->fs.par[0]; - struct prio_si *si = (struct prio_si *)(q->_si + 1); - - if (si->q_array[prio] == q) { - si->q_array[prio] = NULL; - __clear_bit(prio, &si->bitmap); - } - return 0; -} - - -static struct dn_alg prio_desc = { - _SI( .type = ) DN_SCHED_PRIO, - _SI( .name = ) "PRIO", - _SI( .flags = ) DN_MULTIQUEUE, - - /* we need extra space in the si and the queue */ - _SI( .schk_datalen = ) 0, - _SI( .si_datalen = ) sizeof(struct prio_si), - _SI( .q_datalen = ) 0, - - _SI( .enqueue = ) prio_enqueue, - _SI( .dequeue = ) prio_dequeue, - - _SI( .config = ) NULL, - _SI( .destroy = ) NULL, - _SI( .new_sched = ) prio_new_sched, - _SI( .free_sched = ) NULL, - - _SI( .new_fsk = ) prio_new_fsk, - _SI( .free_fsk = ) NULL, - - _SI( .new_queue = ) prio_new_queue, - _SI( .free_queue = ) prio_free_queue, -}; - - -DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc); diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_qfq.c b/freebsd/sys/netpfil/ipfw/dn_sched_qfq.c deleted file mode 100644 index 461c40a5..00000000 --- a/freebsd/sys/netpfil/ipfw/dn_sched_qfq.c +++ /dev/null @@ -1,866 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/* - * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -/* - * $FreeBSD$ - */ - -#ifdef _KERNEL -#include <sys/malloc.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <net/if.h> /* IFNAMSIZ */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ipfw_rule_ref */ -#include <netinet/ip_fw.h> /* flow_id */ -#include <netinet/ip_dummynet.h> -#include <netpfil/ipfw/dn_heap.h> -#include <netpfil/ipfw/ip_dn_private.h> -#include <netpfil/ipfw/dn_sched.h> -#else -#include <dn_test.h> -#endif - -#ifdef QFQ_DEBUG -struct qfq_sched; -static void dump_sched(struct qfq_sched *q, const char *msg); -#define NO(x) x -#else -#define NO(x) -#endif -#define DN_SCHED_QFQ 4 // XXX Where? -typedef unsigned long bitmap; - -/* - * bitmaps ops are critical. Some linux versions have __fls - * and the bitmap ops. Some machines have ffs - */ -#if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) -int fls(unsigned int n) -{ - int i = 0; - for (i = 0; n > 0; n >>= 1, i++) - ; - return i; -} -#endif - -#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) -static inline unsigned long __fls(unsigned long word) -{ - return fls(word) - 1; -} -#endif - -#if !defined(_KERNEL) || !defined(__linux__) -#ifdef QFQ_DEBUG -int test_bit(int ix, bitmap *p) -{ - if (ix < 0 || ix > 31) - D("bad index %d", ix); - return *p & (1<<ix); -} -void __set_bit(int ix, bitmap *p) -{ - if (ix < 0 || ix > 31) - D("bad index %d", ix); - *p |= (1<<ix); -} -void __clear_bit(int ix, bitmap *p) -{ - if (ix < 0 || ix > 31) - D("bad index %d", ix); - *p &= ~(1<<ix); -} -#else /* !QFQ_DEBUG */ -/* XXX do we have fast version, or leave it to the compiler ? */ -#define test_bit(ix, pData) ((*pData) & (1<<(ix))) -#define __set_bit(ix, pData) (*pData) |= (1<<(ix)) -#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) -#endif /* !QFQ_DEBUG */ -#endif /* !__linux__ */ - -#ifdef __MIPSEL__ -#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) -#endif - -/*-------------------------------------------*/ -/* - -Virtual time computations. - -S, F and V are all computed in fixed point arithmetic with -FRAC_BITS decimal bits. - - QFQ_MAX_INDEX is the maximum index allowed for a group. We need - one bit per index. - QFQ_MAX_WSHIFT is the maximum power of two supported as a weight. - The layout of the bits is as below: - - [ MTU_SHIFT ][ FRAC_BITS ] - [ MAX_INDEX ][ MIN_SLOT_SHIFT ] - ^.__grp->index = 0 - *.__grp->slot_shift - - where MIN_SLOT_SHIFT is derived by difference from the others. - -The max group index corresponds to Lmax/w_min, where -Lmax=1<<MTU_SHIFT, w_min = 1 . -From this, and knowing how many groups (MAX_INDEX) we want, -we can derive the shift corresponding to each group. - -Because we often need to compute - F = S + len/w_i and V = V + len/wsum -instead of storing w_i store the value - inv_w = (1<<FRAC_BITS)/w_i -so we can do F = S + len * inv_w * wsum. -We use W_TOT in the formulas so we can easily move between -static and adaptive weight sum. - -The per-scheduler-instance data contain all the data structures -for the scheduler: bitmaps and bucket lists. - - */ -/* - * Maximum number of consecutive slots occupied by backlogged classes - * inside a group. This is approx lmax/lmin + 5. - * XXX check because it poses constraints on MAX_INDEX - */ -#define QFQ_MAX_SLOTS 32 -/* - * Shifts used for class<->group mapping. 
Class weights are - * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the - * group with the smallest index that can support the L_i / r_i - * configured for the class. - * - * grp->index is the index of the group; and grp->slot_shift - * is the shift for the corresponding (scaled) sigma_i. - * - * When computing the group index, we do (len<<FP_SHIFT)/weight, - * then compute an FLS (which is like a log2()), and if the result - * is below the MAX_INDEX region we use 0 (which is the same as - * using a larger len). - */ -#define QFQ_MAX_INDEX 19 -#define QFQ_MAX_WSHIFT 16 /* log2(max_weight) */ - -#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) -#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT) -//#define IWSUM (q->i_wsum) -#define IWSUM ((1<<FRAC_BITS)/QFQ_MAX_WSUM) - -#define FRAC_BITS 30 /* fixed point arithmetic */ -#define ONE_FP (1UL << FRAC_BITS) - -#define QFQ_MTU_SHIFT 11 /* log2(max_len) */ -#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX) - -/* - * Possible group states, also indexes for the bitmaps array in - * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3 - */ -enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; - -struct qfq_group; -/* - * additional queue info. Some of this info should come from - * the flowset, we copy them here for faster processing. - * This is an overlay of the struct dn_queue - */ -struct qfq_class { - struct dn_queue _q; - uint64_t S, F; /* flow timestamps (exact) */ - struct qfq_class *next; /* Link for the slot list. */ - - /* group we belong to. In principle we would need the index, - * which is log_2(lmax/weight), but we never reference it - * directly, only the group. - */ - struct qfq_group *grp; - - /* these are copied from the flowset. */ - uint32_t inv_w; /* ONE_FP/weight */ - uint32_t lmax; /* Max packet size for this flow. */ -}; - -/* Group descriptor, see the paper for details. - * Basically this contains the bucket lists - */ -struct qfq_group { - uint64_t S, F; /* group timestamps (approx). */ - unsigned int slot_shift; /* Slot shift. */ - unsigned int index; /* Group index. */ - unsigned int front; /* Index of the front slot. */ - bitmap full_slots; /* non-empty slots */ - - /* Array of lists of active classes. */ - struct qfq_class *slots[QFQ_MAX_SLOTS]; -}; - -/* scheduler instance descriptor. */ -struct qfq_sched { - uint64_t V; /* Precise virtual time. */ - uint32_t wsum; /* weight sum */ - NO(uint32_t i_wsum; /* ONE_FP/w_sum */ - uint32_t _queued; /* debugging */ - uint32_t loops; /* debugging */) - bitmap bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ - struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ -}; - -/*---- support functions ----------------------------*/ - -/* Generic comparison function, handling wraparound. */ -static inline int qfq_gt(uint64_t a, uint64_t b) -{ - return (int64_t)(a - b) > 0; -} - -/* Round a precise timestamp to its slotted value. */ -static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift) -{ - return ts & ~((1ULL << shift) - 1); -} - -/* return the pointer to the group with lowest index in the bitmap */ -static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, - unsigned long bitmap) -{ - int index = ffs(bitmap) - 1; // zero-based - return &q->groups[index]; -} - -/* - * Calculate a flow index, given its weight and maximum packet length. - * index = log_2(maxlen/weight) but we need to apply the scaling. - * This is used only once at flow creation. 
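The group-index computation this comment describes, restated as a runnable sketch: slot_size = maxlen * inv_w in FRAC_BITS fixed point, then a find-last-set gives roughly log2(), and exact powers of two drop into the group below. The GCC/Clang builtin __builtin_clzll stands in for the kernel's __fls(), and calc_index is an illustrative stand-in for qfq_calc_index():

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS		30
#define ONE_FP			(1UL << FRAC_BITS)
#define QFQ_MTU_SHIFT		11
#define QFQ_MAX_INDEX		19
#define QFQ_MIN_SLOT_SHIFT	(FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)

/* index ~ floor(log2(maxlen/weight)), scaled by FRAC_BITS, clamped at 0 */
static int
calc_index(uint32_t weight, unsigned int maxlen)
{
	uint64_t slot_size = (uint64_t)maxlen * (ONE_FP / weight);
	uint64_t size_map = slot_size >> QFQ_MIN_SLOT_SHIFT;
	int index;

	if (size_map == 0)
		return 0;
	index = 63 - __builtin_clzll(size_map) + 1;	/* ~ log2() + 1 */
	/* an exact power of two falls back into the group below */
	index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
	return index < 0 ? 0 : index;
}

int
main(void)
{
	printf("w=1    len=1500 -> group %d\n", calc_index(1, 1500));    /* 19 */
	printf("w=64   len=1500 -> group %d\n", calc_index(64, 1500));   /* 13 */
	printf("w=1024 len=64   -> group %d\n", calc_index(1024, 64));   /* 4 */
	return 0;
}

The last case shows the correction term: weight 1024 with 64-byte packets gives slot_size exactly 2^26, so the flow lands in group 4 rather than 5.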
- */ -static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen) -{ - uint64_t slot_size = (uint64_t)maxlen *inv_w; - unsigned long size_map; - int index = 0; - - size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT); - if (!size_map) - goto out; - - index = __fls(size_map) + 1; // basically a log_2() - index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); - - if (index < 0) - index = 0; - -out: - ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index); - return index; -} -/*---- end support functions ----*/ - -/*-------- API calls --------------------------------*/ -/* - * Validate and copy parameters from flowset. - */ -static int -qfq_new_queue(struct dn_queue *_q) -{ - struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); - struct qfq_class *cl = (struct qfq_class *)_q; - int i; - uint32_t w; /* approximated weight */ - - /* import parameters from the flowset. They should be correct - * already. - */ - w = _q->fs->fs.par[0]; - cl->lmax = _q->fs->fs.par[1]; - if (!w || w > QFQ_MAX_WEIGHT) { - w = 1; - D("rounding weight to 1"); - } - cl->inv_w = ONE_FP/w; - w = ONE_FP/cl->inv_w; - if (q->wsum + w > QFQ_MAX_WSUM) - return EINVAL; - - i = qfq_calc_index(cl->inv_w, cl->lmax); - cl->grp = &q->groups[i]; - q->wsum += w; - // XXX cl->S = q->V; ? - // XXX compute q->i_wsum - return 0; -} - -/* remove an empty queue */ -static int -qfq_free_queue(struct dn_queue *_q) -{ - struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); - struct qfq_class *cl = (struct qfq_class *)_q; - if (cl->inv_w) { - q->wsum -= ONE_FP/cl->inv_w; - cl->inv_w = 0; /* reset weight to avoid run twice */ - } - return 0; -} - -/* Calculate a mask to mimic what would be ffs_from(). */ -static inline unsigned long -mask_from(unsigned long bitmap, int from) -{ - return bitmap & ~((1UL << from) - 1); -} - -/* - * The state computation relies on ER=0, IR=1, EB=2, IB=3 - * First compute eligibility comparing grp->S, q->V, - * then check if someone is blocking us and possibly add EB - */ -static inline unsigned int -qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp) -{ - /* if S > V we are not eligible */ - unsigned int state = qfq_gt(grp->S, q->V); - unsigned long mask = mask_from(q->bitmaps[ER], grp->index); - struct qfq_group *next; - - if (mask) { - next = qfq_ffs(q, mask); - if (qfq_gt(grp->F, next->F)) - state |= EB; - } - - return state; -} - -/* - * In principle - * q->bitmaps[dst] |= q->bitmaps[src] & mask; - * q->bitmaps[src] &= ~mask; - * but we should make sure that src != dst - */ -static inline void -qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst) -{ - q->bitmaps[dst] |= q->bitmaps[src] & mask; - q->bitmaps[src] &= ~mask; -} - -static inline void -qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish) -{ - unsigned long mask = mask_from(q->bitmaps[ER], index + 1); - struct qfq_group *next; - - if (mask) { - next = qfq_ffs(q, mask); - if (!qfq_gt(next->F, old_finish)) - return; - } - - mask = (1UL << index) - 1; - qfq_move_groups(q, mask, EB, ER); - qfq_move_groups(q, mask, IB, IR); -} - -/* - * perhaps - * - old_V ^= q->V; - old_V >>= QFQ_MIN_SLOT_SHIFT; - if (old_V) { - ... 
- } - * - */ -static inline void -qfq_make_eligible(struct qfq_sched *q, uint64_t old_V) -{ - unsigned long mask, vslot, old_vslot; - - vslot = q->V >> QFQ_MIN_SLOT_SHIFT; - old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; - - if (vslot != old_vslot) { - mask = (2UL << (__fls(vslot ^ old_vslot))) - 1; - qfq_move_groups(q, mask, IR, ER); - qfq_move_groups(q, mask, IB, EB); - } -} - -/* - * XXX we should make sure that slot becomes less than 32. - * This is guaranteed by the input values. - * roundedS is always cl->S rounded on grp->slot_shift bits. - */ -static inline void -qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS) -{ - uint64_t slot = (roundedS - grp->S) >> grp->slot_shift; - unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; - - cl->next = grp->slots[i]; - grp->slots[i] = cl; - __set_bit(slot, &grp->full_slots); -} - -/* - * remove the entry from the slot - */ -static inline void -qfq_front_slot_remove(struct qfq_group *grp) -{ - struct qfq_class **h = &grp->slots[grp->front]; - - *h = (*h)->next; - if (!*h) - __clear_bit(0, &grp->full_slots); -} - -/* - * Returns the first full queue in a group. As a side effect, - * adjust the bucket list so the first non-empty bucket is at - * position 0 in full_slots. - */ -static inline struct qfq_class * -qfq_slot_scan(struct qfq_group *grp) -{ - int i; - - ND("grp %d full %x", grp->index, grp->full_slots); - if (!grp->full_slots) - return NULL; - - i = ffs(grp->full_slots) - 1; // zero-based - if (i > 0) { - grp->front = (grp->front + i) % QFQ_MAX_SLOTS; - grp->full_slots >>= i; - } - - return grp->slots[grp->front]; -} - -/* - * adjust the bucket list. When the start time of a group decreases, - * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to - * move the objects. The mask of occupied slots must be shifted - * because we use ffs() to find the first non-empty slot. - * This covers decreases in the group's start time, but what about - * increases of the start time ? - * Here too we should make sure that i is less than 32 - */ -static inline void -qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS) -{ - unsigned int i = (grp->S - roundedS) >> grp->slot_shift; - - grp->full_slots <<= i; - grp->front = (grp->front - i) % QFQ_MAX_SLOTS; -} - - -static inline void -qfq_update_eligible(struct qfq_sched *q, uint64_t old_V) -{ - bitmap ineligible; - - ineligible = q->bitmaps[IR] | q->bitmaps[IB]; - if (ineligible) { - if (!q->bitmaps[ER]) { - struct qfq_group *grp; - grp = qfq_ffs(q, ineligible); - if (qfq_gt(grp->S, q->V)) - q->V = grp->S; - } - qfq_make_eligible(q, old_V); - } -} - -/* - * Updates the class, returns true if also the group needs to be updated. 
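The constant-time update named above, new S equals old F and F advances by len * inv_w, is the heart of QFQ's per-packet work. A small sketch of the fixed-point arithmetic (struct flow and serve are illustrative stand-ins for qfq_class and qfq_update_class):

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS	30
#define ONE_FP		(1ULL << FRAC_BITS)

struct flow {
	uint64_t S, F;		/* start and finish timestamps */
	uint32_t inv_w;		/* ONE_FP / weight */
};

/* After serving a packet, the new start time is the old finish time,
 * and the finish time advances by len/weight in fixed point. */
static void
serve(struct flow *f, unsigned int next_len)
{
	f->S = f->F;		/* cl->S = cl->F */
	if (next_len != 0)	/* still backlogged: set next finish time */
		f->F = f->S + (uint64_t)next_len * f->inv_w;
}

int
main(void)
{
	struct flow f = { .S = 0, .F = 0, .inv_w = ONE_FP / 2 };

	f.F = f.S + 1500ULL * f.inv_w;	/* head packet, 1500 bytes, w = 2 */
	serve(&f, 1500);		/* serve it; another packet queued */
	printf("F - S = %llu bytes of virtual service\n",	/* 750 */
	    (unsigned long long)((f.F - f.S) >> FRAC_BITS));
	return 0;
}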
- */ -static inline int -qfq_update_class(struct qfq_sched *q, struct qfq_group *grp, - struct qfq_class *cl) -{ - - cl->S = cl->F; - if (cl->_q.mq.head == NULL) { - qfq_front_slot_remove(grp); - } else { - unsigned int len; - uint64_t roundedS; - - len = cl->_q.mq.head->m_pkthdr.len; - cl->F = cl->S + (uint64_t)len * cl->inv_w; - roundedS = qfq_round_down(cl->S, grp->slot_shift); - if (roundedS == grp->S) - return 0; - - qfq_front_slot_remove(grp); - qfq_slot_insert(grp, cl, roundedS); - } - return 1; -} - -static struct mbuf * -qfq_dequeue(struct dn_sch_inst *si) -{ - struct qfq_sched *q = (struct qfq_sched *)(si + 1); - struct qfq_group *grp; - struct qfq_class *cl; - struct mbuf *m; - uint64_t old_V; - - NO(q->loops++;) - if (!q->bitmaps[ER]) { - NO(if (q->queued) - dump_sched(q, "start dequeue");) - return NULL; - } - - grp = qfq_ffs(q, q->bitmaps[ER]); - - cl = grp->slots[grp->front]; - /* extract from the first bucket in the bucket list */ - m = dn_dequeue(&cl->_q); - - if (!m) { - D("BUG/* non-workconserving leaf */"); - return NULL; - } - NO(q->queued--;) - old_V = q->V; - q->V += (uint64_t)m->m_pkthdr.len * IWSUM; - ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V); - - if (qfq_update_class(q, grp, cl)) { - uint64_t old_F = grp->F; - cl = qfq_slot_scan(grp); - if (!cl) { /* group gone, remove from ER */ - __clear_bit(grp->index, &q->bitmaps[ER]); - // grp->S = grp->F + 1; // XXX debugging only - } else { - uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift); - unsigned int s; - - if (grp->S == roundedS) - goto skip_unblock; - grp->S = roundedS; - grp->F = roundedS + (2ULL << grp->slot_shift); - /* remove from ER and put in the new set */ - __clear_bit(grp->index, &q->bitmaps[ER]); - s = qfq_calc_state(q, grp); - __set_bit(grp->index, &q->bitmaps[s]); - } - /* we need to unblock even if the group has gone away */ - qfq_unblock_groups(q, grp->index, old_F); - } - -skip_unblock: - qfq_update_eligible(q, old_V); - NO(if (!q->bitmaps[ER] && q->queued) - dump_sched(q, "end dequeue");) - - return m; -} - -/* - * Assign a reasonable start time for a new flow k in group i. - * Admissible values for \hat(F) are multiples of \sigma_i - * no greater than V+\sigma_i . Larger values mean that - * we had a wraparound so we consider the timestamp to be stale. - * - * If F is not stale and F >= V then we set S = F. - * Otherwise we should assign S = V, but this may violate - * the ordering in ER. So, if we have groups in ER, set S to - * the F_j of the first group j which would be blocking us. - * We are guaranteed not to move S backward because - * otherwise our group i would still be blocked. 
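Stripped of the slot rounding and the ER-bitmap scan, the policy in this comment reduces to: reuse F as the new start time when it is neither stale nor behind V, otherwise restart from V. A sketch of just that rule (new_start is an illustrative name, and 'stale' stands for the wraparound test on the rounded finish time against V plus one slot):

#include <stdint.h>
#include <stdio.h>

static uint64_t
new_start(uint64_t V, uint64_t F, int stale)
{
	if (!stale && F >= V)
		return F;	/* keep the old finish time as new start */
	return V;		/* stale or behind: restart at virtual time */
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)new_start(100, 120, 0)); /* 120 */
	printf("%llu\n", (unsigned long long)new_start(100, 80, 0));  /* 100 */
	printf("%llu\n", (unsigned long long)new_start(100, 900, 1)); /* 100 */
	return 0;
}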
- */ -static inline void -qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) -{ - unsigned long mask; - uint32_t limit, roundedF; - int slot_shift = cl->grp->slot_shift; - - roundedF = qfq_round_down(cl->F, slot_shift); - limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); - - if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { - /* timestamp was stale */ - mask = mask_from(q->bitmaps[ER], cl->grp->index); - if (mask) { - struct qfq_group *next = qfq_ffs(q, mask); - if (qfq_gt(roundedF, next->F)) { - cl->S = next->F; - return; - } - } - cl->S = q->V; - } else { /* timestamp is not stale */ - cl->S = cl->F; - } -} - -static int -qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m) -{ - struct qfq_sched *q = (struct qfq_sched *)(si + 1); - struct qfq_group *grp; - struct qfq_class *cl = (struct qfq_class *)_q; - uint64_t roundedS; - int s; - - NO(q->loops++;) - DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len, - _q, cl->inv_w, cl->grp->index); - /* XXX verify that the packet obeys the parameters */ - if (m != _q->mq.head) { - if (dn_enqueue(_q, m, 0)) /* packet was dropped */ - return 1; - NO(q->queued++;) - if (m != _q->mq.head) - return 0; - } - /* If reach this point, queue q was idle */ - grp = cl->grp; - qfq_update_start(q, cl); /* adjust start time */ - /* compute new finish time and rounded start. */ - cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w; - roundedS = qfq_round_down(cl->S, grp->slot_shift); - - /* - * insert cl in the correct bucket. - * If cl->S >= grp->S we don't need to adjust the - * bucket list and simply go to the insertion phase. - * Otherwise grp->S is decreasing, we must make room - * in the bucket list, and also recompute the group state. - * Finally, if there were no flows in this group and nobody - * was in ER make sure to adjust V. - */ - if (grp->full_slots) { - if (!qfq_gt(grp->S, cl->S)) - goto skip_update; - /* create a slot for this cl->S */ - qfq_slot_rotate(q, grp, roundedS); - /* group was surely ineligible, remove */ - __clear_bit(grp->index, &q->bitmaps[IR]); - __clear_bit(grp->index, &q->bitmaps[IB]); - } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) - q->V = roundedS; - - grp->S = roundedS; - grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i - s = qfq_calc_state(q, grp); - __set_bit(grp->index, &q->bitmaps[s]); - ND("new state %d 0x%x", s, q->bitmaps[s]); - ND("S %llx F %llx V %llx", cl->S, cl->F, q->V); -skip_update: - qfq_slot_insert(grp, cl, roundedS); - - return 0; -} - - -#if 0 -static inline void -qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, - struct qfq_class *cl, struct qfq_class **pprev) -{ - unsigned int i, offset; - uint64_t roundedS; - - roundedS = qfq_round_down(cl->S, grp->slot_shift); - offset = (roundedS - grp->S) >> grp->slot_shift; - i = (grp->front + offset) % QFQ_MAX_SLOTS; - -#ifdef notyet - if (!pprev) { - pprev = &grp->slots[i]; - while (*pprev && *pprev != cl) - pprev = &(*pprev)->next; - } -#endif - - *pprev = cl->next; - if (!grp->slots[i]) - __clear_bit(offset, &grp->full_slots); -} - -/* - * called to forcibly destroy a queue. - * If the queue is not in the front bucket, or if it has - * other queues in the front bucket, we can simply remove - * the queue with no other side effects. - * Otherwise we must propagate the event up. - * XXX description to be completed. 
- */ -static void -qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl, - struct qfq_class **pprev) -{ - struct qfq_group *grp = &q->groups[cl->index]; - unsigned long mask; - uint64_t roundedS; - int s; - - cl->F = cl->S; // not needed if the class goes away. - qfq_slot_remove(q, grp, cl, pprev); - - if (!grp->full_slots) { - /* nothing left in the group, remove from all sets. - * Do ER last because if we were blocking other groups - * we must unblock them. - */ - __clear_bit(grp->index, &q->bitmaps[IR]); - __clear_bit(grp->index, &q->bitmaps[EB]); - __clear_bit(grp->index, &q->bitmaps[IB]); - - if (test_bit(grp->index, &q->bitmaps[ER]) && - !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { - mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); - if (mask) - mask = ~((1UL << __fls(mask)) - 1); - else - mask = ~0UL; - qfq_move_groups(q, mask, EB, ER); - qfq_move_groups(q, mask, IB, IR); - } - __clear_bit(grp->index, &q->bitmaps[ER]); - } else if (!grp->slots[grp->front]) { - cl = qfq_slot_scan(grp); - roundedS = qfq_round_down(cl->S, grp->slot_shift); - if (grp->S != roundedS) { - __clear_bit(grp->index, &q->bitmaps[ER]); - __clear_bit(grp->index, &q->bitmaps[IR]); - __clear_bit(grp->index, &q->bitmaps[EB]); - __clear_bit(grp->index, &q->bitmaps[IB]); - grp->S = roundedS; - grp->F = roundedS + (2ULL << grp->slot_shift); - s = qfq_calc_state(q, grp); - __set_bit(grp->index, &q->bitmaps[s]); - } - } - qfq_update_eligible(q, q->V); -} -#endif - -static int -qfq_new_fsk(struct dn_fsk *f) -{ - ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight"); - ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen"); - ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]); - return 0; -} - -/* - * initialize a new scheduler instance - */ -static int -qfq_new_sched(struct dn_sch_inst *si) -{ - struct qfq_sched *q = (struct qfq_sched *)(si + 1); - struct qfq_group *grp; - int i; - - for (i = 0; i <= QFQ_MAX_INDEX; i++) { - grp = &q->groups[i]; - grp->index = i; - grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - - (QFQ_MAX_INDEX - i); - } - return 0; -} - -/* - * QFQ scheduler descriptor - */ -static struct dn_alg qfq_desc = { - _SI( .type = ) DN_SCHED_QFQ, - _SI( .name = ) "QFQ", - _SI( .flags = ) DN_MULTIQUEUE, - - _SI( .schk_datalen = ) 0, - _SI( .si_datalen = ) sizeof(struct qfq_sched), - _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue), - - _SI( .enqueue = ) qfq_enqueue, - _SI( .dequeue = ) qfq_dequeue, - - _SI( .config = ) NULL, - _SI( .destroy = ) NULL, - _SI( .new_sched = ) qfq_new_sched, - _SI( .free_sched = ) NULL, - _SI( .new_fsk = ) qfq_new_fsk, - _SI( .free_fsk = ) NULL, - _SI( .new_queue = ) qfq_new_queue, - _SI( .free_queue = ) qfq_free_queue, -}; - -DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); - -#ifdef QFQ_DEBUG -static void -dump_groups(struct qfq_sched *q, uint32_t mask) -{ - int i, j; - - for (i = 0; i < QFQ_MAX_INDEX + 1; i++) { - struct qfq_group *g = &q->groups[i]; - - if (0 == (mask & (1<<i))) - continue; - for (j = 0; j < QFQ_MAX_SLOTS; j++) { - if (g->slots[j]) - D(" bucket %d %p", j, g->slots[j]); - } - D("full_slots 0x%x", g->full_slots); - D(" %2d S 0x%20llx F 0x%llx %c", i, - g->S, g->F, - mask & (1<<i) ? 
'1' : '0'); - } -} - -static void -dump_sched(struct qfq_sched *q, const char *msg) -{ - D("--- in %s: ---", msg); - ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V); - D(" ER 0x%08x", q->bitmaps[ER]); - D(" EB 0x%08x", q->bitmaps[EB]); - D(" IR 0x%08x", q->bitmaps[IR]); - D(" IB 0x%08x", q->bitmaps[IB]); - dump_groups(q, 0xffffffff); -}; -#endif /* QFQ_DEBUG */ diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_rr.c b/freebsd/sys/netpfil/ipfw/dn_sched_rr.c deleted file mode 100644 index c1862ab0..00000000 --- a/freebsd/sys/netpfil/ipfw/dn_sched_rr.c +++ /dev/null @@ -1,309 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/* - * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - */ - -#ifdef _KERNEL -#include <sys/malloc.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <net/if.h> /* IFNAMSIZ */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ipfw_rule_ref */ -#include <netinet/ip_fw.h> /* flow_id */ -#include <netinet/ip_dummynet.h> -#include <netpfil/ipfw/dn_heap.h> -#include <netpfil/ipfw/ip_dn_private.h> -#include <netpfil/ipfw/dn_sched.h> -#else -#include <dn_test.h> -#endif - -#define DN_SCHED_RR 3 // XXX Where? 
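The round-robin scheduler deleted below is a classic deficit round robin: each queue carries a byte credit, a head packet larger than the remaining credit adds one quantum and passes the turn, and a queue keeps the turn while its credit lasts. A compact sketch of that credit loop over a fixed array instead of the kernel's circular list (struct q and rr_next are illustrative, not dummynet API):

#include <stdio.h>

#define NQ	2

struct q {
	int credit, quantum;
	int pkts[4], n, head;	/* pending packet lengths */
};

static int
rr_next(struct q *qs, int *cur)
{
	for (int guard = 0; guard < 64; guard++) {	/* demo bound */
		struct q *q = &qs[*cur];

		if (q->head == q->n) {			/* empty, skip */
			*cur = (*cur + 1) % NQ;
		} else if (q->pkts[q->head] > q->credit) {
			q->credit += q->quantum;	/* too big, wait */
			*cur = (*cur + 1) % NQ;
		} else {
			q->credit -= q->pkts[q->head];
			return q->pkts[q->head++];	/* keep the turn */
		}
	}
	return -1;					/* nothing to send */
}

int
main(void)
{
	struct q qs[NQ] = {
		{ .credit = 500, .quantum = 500, .pkts = { 1500, 100 }, .n = 2 },
		{ .credit = 500, .quantum = 500, .pkts = { 300, 300 }, .n = 2 },
	};
	int cur = 0;

	for (int i = 0; i < 4; i++)
		printf("sent %d bytes\n", rr_next(qs, &cur));
	return 0;
}

The run prints 300, 300, 1500, 100: the big packet waits until its queue has banked three quantums of credit, which is exactly how the scheduler below keeps long-term shares proportional to the quantum.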
- -struct rr_queue { - struct dn_queue q; /* Standard queue */ - int status; /* 1: queue is in the list */ - int credit; /* Number of bytes to transmit */ - int quantum; /* quantum * C */ - struct rr_queue *qnext; /* */ -}; - -/* struct rr_schk contains global config parameters - * and is right after dn_schk - */ -struct rr_schk { - int min_q; /* Min quantum */ - int max_q; /* Max quantum */ - int q_bytes; /* Bytes per quantum */ -}; - -/* per-instance round robin list, right after dn_sch_inst */ -struct rr_si { - struct rr_queue *head, *tail; /* Pointer to current queue */ -}; - -/* Append a queue to the rr list */ -static inline void -rr_append(struct rr_queue *q, struct rr_si *si) -{ - q->status = 1; /* mark as in-rr_list */ - q->credit = q->quantum; /* initialize credit */ - - /* append to the tail */ - if (si->head == NULL) - si->head = q; - else - si->tail->qnext = q; - si->tail = q; /* advance the tail pointer */ - q->qnext = si->head; /* make it circular */ -} - -/* Remove the head queue from circular list. */ -static inline void -rr_remove_head(struct rr_si *si) -{ - if (si->head == NULL) - return; /* empty queue */ - si->head->status = 0; - - if (si->head == si->tail) { - si->head = si->tail = NULL; - return; - } - - si->head = si->head->qnext; - si->tail->qnext = si->head; -} - -/* Remove a queue from circular list. - * XXX see if ti can be merge with remove_queue() - */ -static inline void -remove_queue_q(struct rr_queue *q, struct rr_si *si) -{ - struct rr_queue *prev; - - if (q->status != 1) - return; - if (q == si->head) { - rr_remove_head(si); - return; - } - - for (prev = si->head; prev; prev = prev->qnext) { - if (prev->qnext != q) - continue; - prev->qnext = q->qnext; - if (q == si->tail) - si->tail = prev; - q->status = 0; - break; - } -} - - -static inline void -next_pointer(struct rr_si *si) -{ - if (si->head == NULL) - return; /* empty queue */ - - si->head = si->head->qnext; - si->tail = si->tail->qnext; -} - -static int -rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) -{ - struct rr_si *si; - struct rr_queue *rrq; - - if (m != q->mq.head) { - if (dn_enqueue(q, m, 0)) /* packet was dropped */ - return 1; - if (m != q->mq.head) - return 0; - } - - /* If reach this point, queue q was idle */ - si = (struct rr_si *)(_si + 1); - rrq = (struct rr_queue *)q; - - if (rrq->status == 1) /* Queue is already in the queue list */ - return 0; - - /* Insert the queue in the queue list */ - rr_append(rrq, si); - - return 0; -} - -static struct mbuf * -rr_dequeue(struct dn_sch_inst *_si) -{ - /* Access scheduler instance private data */ - struct rr_si *si = (struct rr_si *)(_si + 1); - struct rr_queue *rrq; - uint64_t len; - - while ( (rrq = si->head) ) { - struct mbuf *m = rrq->q.mq.head; - if ( m == NULL) { - /* empty queue, remove from list */ - rr_remove_head(si); - continue; - } - len = m->m_pkthdr.len; - - if (len > rrq->credit) { - /* Packet too big */ - rrq->credit += rrq->quantum; - /* Try next queue */ - next_pointer(si); - } else { - rrq->credit -= len; - return dn_dequeue(&rrq->q); - } - } - - /* no packet to dequeue*/ - return NULL; -} - -static int -rr_config(struct dn_schk *_schk) -{ - struct rr_schk *schk = (struct rr_schk *)(_schk + 1); - ND("called"); - - /* use reasonable quantums (64..2k bytes, default 1500) */ - schk->min_q = 64; - schk->max_q = 2048; - schk->q_bytes = 1500; /* quantum */ - - return 0; -} - -static int -rr_new_sched(struct dn_sch_inst *_si) -{ - struct rr_si *si = (struct rr_si *)(_si + 1); - - ND("called"); - si->head = 
si->tail = NULL; - - return 0; -} - -static int -rr_free_sched(struct dn_sch_inst *_si) -{ - ND("called"); - /* Nothing to do? */ - return 0; -} - -static int -rr_new_fsk(struct dn_fsk *fs) -{ - struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1); - /* par[0] is the weight, par[1] is the quantum step */ - ipdn_bound_var(&fs->fs.par[0], 1, - 1, 65536, "RR weight"); - ipdn_bound_var(&fs->fs.par[1], schk->q_bytes, - schk->min_q, schk->max_q, "RR quantum"); - return 0; -} - -static int -rr_new_queue(struct dn_queue *_q) -{ - struct rr_queue *q = (struct rr_queue *)_q; - - _q->ni.oid.subtype = DN_SCHED_RR; - - q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1]; - ND("called, q->quantum %d", q->quantum); - q->credit = q->quantum; - q->status = 0; - - if (_q->mq.head != NULL) { - /* Queue NOT empty, insert in the queue list */ - rr_append(q, (struct rr_si *)(_q->_si + 1)); - } - return 0; -} - -static int -rr_free_queue(struct dn_queue *_q) -{ - struct rr_queue *q = (struct rr_queue *)_q; - - ND("called"); - if (q->status == 1) { - struct rr_si *si = (struct rr_si *)(_q->_si + 1); - remove_queue_q(q, si); - } - return 0; -} - -/* - * RR scheduler descriptor - * contains the type of the scheduler, the name, the size of the - * structures and function pointers. - */ -static struct dn_alg rr_desc = { - _SI( .type = ) DN_SCHED_RR, - _SI( .name = ) "RR", - _SI( .flags = ) DN_MULTIQUEUE, - - _SI( .schk_datalen = ) 0, - _SI( .si_datalen = ) sizeof(struct rr_si), - _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue), - - _SI( .enqueue = ) rr_enqueue, - _SI( .dequeue = ) rr_dequeue, - - _SI( .config = ) rr_config, - _SI( .destroy = ) NULL, - _SI( .new_sched = ) rr_new_sched, - _SI( .free_sched = ) rr_free_sched, - _SI( .new_fsk = ) rr_new_fsk, - _SI( .free_fsk = ) NULL, - _SI( .new_queue = ) rr_new_queue, - _SI( .free_queue = ) rr_free_queue, -}; - - -DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc); diff --git a/freebsd/sys/netpfil/ipfw/dn_sched_wf2q.c b/freebsd/sys/netpfil/ipfw/dn_sched_wf2q.c deleted file mode 100644 index 77c4bbad..00000000 --- a/freebsd/sys/netpfil/ipfw/dn_sched_wf2q.c +++ /dev/null @@ -1,375 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/* - * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa - * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - */ - -#ifdef _KERNEL -#include <sys/malloc.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <net/if.h> /* IFNAMSIZ */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ipfw_rule_ref */ -#include <netinet/ip_fw.h> /* flow_id */ -#include <netinet/ip_dummynet.h> -#include <netpfil/ipfw/dn_heap.h> -#include <netpfil/ipfw/ip_dn_private.h> -#include <netpfil/ipfw/dn_sched.h> -#else -#include <dn_test.h> -#endif - -#ifndef MAX64 -#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) -#endif - -/* - * timestamps are computed on 64 bit using fixed point arithmetic. - * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len - * and sum of weights, respectively. FRAC_BITS is the number of - * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large - * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w - * using an unsigned 32-bit division, and to avoid wraparounds we need - * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64 - * As an example - * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19 - */ -#ifndef FRAC_BITS -#define FRAC_BITS 28 /* shift for fixed point arithmetic */ -#define ONE_FP (1UL << FRAC_BITS) -#endif - -/* - * Private information for the scheduler instance: - * sch_heap (key is Finish time) returns the next queue to serve - * ne_heap (key is Start time) stores not-eligible queues - * idle_heap (key=start/finish time) stores idle flows. It must - * support extract-from-middle. - * A flow is only in 1 of the three heaps. - * XXX todo: use a more efficient data structure, e.g. a tree sorted - * by F with min_subtree(S) in each node - */ -struct wf2qp_si { - struct dn_heap sch_heap; /* top extract - key Finish time */ - struct dn_heap ne_heap; /* top extract - key Start time */ - struct dn_heap idle_heap; /* random extract - key Start=Finish time */ - uint64_t V; /* virtual time */ - uint32_t inv_wsum; /* inverse of sum of weights */ - uint32_t wsum; /* sum of weights */ -}; - -struct wf2qp_queue { - struct dn_queue _q; - uint64_t S, F; /* start time, finish time */ - uint32_t inv_w; /* ONE_FP / weight */ - int32_t heap_pos; /* position (index) of struct in heap */ -}; - -/* - * This file implements a WF2Q+ scheduler as it has been in dummynet - * since 2000. - * The scheduler supports per-flow queues and has O(log N) complexity. - * - * WF2Q+ needs to drain entries from the idle heap so that we - * can keep the sum of weights up to date. We can do it whenever - * we get a chance, or periodically, or following some other - * strategy. The function idle_check() drains at most N elements - * from the idle heap. 
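Behind the three heaps sits the usual WF2Q+ virtual clock: V advances by len/wsum for every packet sent, and a flow whose start time S is still ahead of V is not eligible and waits in ne_heap. A sketch of that bookkeeping in the same fixed-point style (struct state and transmit are illustrative; wsum = 4 keeps ONE_FP/wsum exact so the demo reaches eligibility precisely):

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS	28
#define ONE_FP		(1ULL << FRAC_BITS)

struct state {
	uint64_t V;			/* virtual time */
	uint32_t wsum, inv_wsum;	/* ONE_FP / wsum */
};

/* V advances by len * inv_wsum, i.e. len/wsum in fixed point */
static void
transmit(struct state *st, unsigned int len)
{
	st->V += (uint64_t)len * st->inv_wsum;
}

int
main(void)
{
	struct state st = { .V = 0, .wsum = 4 };
	const uint64_t S = 1500ULL * ONE_FP;	/* our flow's start time */

	st.inv_wsum = ONE_FP / st.wsum;
	while (S > st.V) {			/* S > V: not eligible yet */
		transmit(&st, 1500);		/* other flows send */
		printf("V=%llu eligible=%d\n",
		    (unsigned long long)(st.V >> FRAC_BITS), S <= st.V);
	}
	return 0;
}

V climbs 375, 750, 1125, 1500; the flow becomes eligible exactly when V catches up with S, which is the condition that moves a queue from ne_heap into sch_heap.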
- */ -static void -idle_check(struct wf2qp_si *si, int n, int force) -{ - struct dn_heap *h = &si->idle_heap; - while (n-- > 0 && h->elements > 0 && - (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) { - struct dn_queue *q = HEAP_TOP(h)->object; - struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; - - heap_extract(h, NULL); - /* XXX to let the flowset delete the queue we should - * mark it as 'unused' by the scheduler. - */ - alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */ - si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */ - if (si->wsum > 0) - si->inv_wsum = ONE_FP/si->wsum; - } -} - -static int -wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) -{ - struct dn_fsk *fs = q->fs; - struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); - struct wf2qp_queue *alg_fq; - uint64_t len = m->m_pkthdr.len; - - if (m != q->mq.head) { - if (dn_enqueue(q, m, 0)) /* packet was dropped */ - return 1; - if (m != q->mq.head) /* queue was already busy */ - return 0; - } - - /* If reach this point, queue q was idle */ - alg_fq = (struct wf2qp_queue *)q; - - if (DN_KEY_LT(alg_fq->F, alg_fq->S)) { - /* F<S means timestamps are invalid ->brand new queue. */ - alg_fq->S = si->V; /* init start time */ - si->wsum += fs->fs.par[0]; /* add weight of new queue. */ - si->inv_wsum = ONE_FP/si->wsum; - } else { /* if it was idle then it was in the idle heap */ - heap_extract(&si->idle_heap, q); - alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */ - } - alg_fq->F = alg_fq->S + len * alg_fq->inv_w; - - /* if nothing is backlogged, make sure this flow is eligible */ - if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0) - si->V = MAX64(alg_fq->S, si->V); - - /* - * Look at eligibility. A flow is not eligibile if S>V (when - * this happens, it means that there is some other flow already - * scheduled for the same pipe, so the sch_heap cannot be - * empty). If the flow is not eligible we just store it in the - * ne_heap. Otherwise, we store in the sch_heap. - * Note that for all flows in sch_heap (SCH), S_i <= V, - * and for all flows in ne_heap (NEH), S_i > V. - * So when we need to compute max(V, min(S_i)) forall i in - * SCH+NEH, we only need to look into NEH. - */ - if (DN_KEY_LT(si->V, alg_fq->S)) { - /* S>V means flow Not eligible. */ - if (si->sch_heap.elements == 0) - D("++ ouch! not eligible but empty scheduler!"); - heap_insert(&si->ne_heap, alg_fq->S, q); - } else { - heap_insert(&si->sch_heap, alg_fq->F, q); - } - return 0; -} - -/* XXX invariant: sch > 0 || V >= min(S in neh) */ -static struct mbuf * -wf2qp_dequeue(struct dn_sch_inst *_si) -{ - /* Access scheduler instance private data */ - struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); - struct mbuf *m; - struct dn_queue *q; - struct dn_heap *sch = &si->sch_heap; - struct dn_heap *neh = &si->ne_heap; - struct wf2qp_queue *alg_fq; - - if (sch->elements == 0 && neh->elements == 0) { - /* we have nothing to do. We could kill the idle heap - * altogether and reset V - */ - idle_check(si, 0x7fffffff, 1); - si->V = 0; - si->wsum = 0; /* should be set already */ - return NULL; /* quick return if nothing to do */ - } - idle_check(si, 1, 0); /* drain something from the idle heap */ - - /* make sure at least one element is eligible, bumping V - * and moving entries that have become eligible. - * We need to repeat the first part twice, before and - * after extracting the candidate, or enqueue() will - * find the data structure in a wrong state. 
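The eligibility pass described above, bump V to the smallest start time when nothing is eligible and then migrate every flow with S <= V, can be shown in isolation. In this sketch small sorted arrays stand in for the dn_heap structures and bare keys stand in for queues (make_eligible is an illustrative name):

#include <stdint.h>
#include <stdio.h>

#define N 4

static uint64_t V;
static uint64_t neh[N], sch[N];	/* start-time keys, kept sorted */
static int nne, nsch;

static void
make_eligible(void)
{
	if (nsch == 0 && nne > 0)
		V = V > neh[0] ? V : neh[0];	/* V = max(V, min S) */
	while (nne > 0 && neh[0] <= V) {
		sch[nsch++] = neh[0];		/* flow becomes eligible */
		for (int i = 1; i < nne; i++)	/* pop the front key */
			neh[i - 1] = neh[i];
		nne--;
	}
}

int
main(void)
{
	neh[nne++] = 10;	/* two ineligible flows, sorted by S */
	neh[nne++] = 30;
	V = 0;
	make_eligible();
	printf("V=%llu eligible=%d pending=%d\n",	/* V=10 1 1 */
	    (unsigned long long)V, nsch, nne);
	return 0;
}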
- */ - m = NULL; - for(;;) { - /* - * Compute V = max(V, min(S_i)). Remember that all elements - * in sch have by definition S_i <= V so if sch is not empty, - * V is surely the max and we must not update it. Conversely, - * if sch is empty we only need to look at neh. - * We don't need to move the queues, as it will be done at the - * next enqueue - */ - if (sch->elements == 0 && neh->elements > 0) { - si->V = MAX64(si->V, HEAP_TOP(neh)->key); - } - while (neh->elements > 0 && - DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) { - q = HEAP_TOP(neh)->object; - alg_fq = (struct wf2qp_queue *)q; - heap_extract(neh, NULL); - heap_insert(sch, alg_fq->F, q); - } - if (m) /* pkt found in previous iteration */ - break; - /* ok we have at least one eligible pkt */ - q = HEAP_TOP(sch)->object; - alg_fq = (struct wf2qp_queue *)q; - m = dn_dequeue(q); - heap_extract(sch, NULL); /* Remove queue from heap. */ - si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum; - alg_fq->S = alg_fq->F; /* Update start time. */ - if (q->mq.head == 0) { /* not backlogged any more. */ - heap_insert(&si->idle_heap, alg_fq->F, q); - } else { /* Still backlogged. */ - /* Update F, store in neh or sch */ - uint64_t len = q->mq.head->m_pkthdr.len; - alg_fq->F += len * alg_fq->inv_w; - if (DN_KEY_LEQ(alg_fq->S, si->V)) { - heap_insert(sch, alg_fq->F, q); - } else { - heap_insert(neh, alg_fq->S, q); - } - } - } - return m; -} - -static int -wf2qp_new_sched(struct dn_sch_inst *_si) -{ - struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); - int ofs = offsetof(struct wf2qp_queue, heap_pos); - - /* all heaps support extract from middle */ - if (heap_init(&si->idle_heap, 16, ofs) || - heap_init(&si->sch_heap, 16, ofs) || - heap_init(&si->ne_heap, 16, ofs)) { - heap_free(&si->ne_heap); - heap_free(&si->sch_heap); - heap_free(&si->idle_heap); - return ENOMEM; - } - return 0; -} - -static int -wf2qp_free_sched(struct dn_sch_inst *_si) -{ - struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); - - heap_free(&si->sch_heap); - heap_free(&si->ne_heap); - heap_free(&si->idle_heap); - - return 0; -} - -static int -wf2qp_new_fsk(struct dn_fsk *fs) -{ - ipdn_bound_var(&fs->fs.par[0], 1, - 1, 100, "WF2Q+ weight"); - return 0; -} - -static int -wf2qp_new_queue(struct dn_queue *_q) -{ - struct wf2qp_queue *q = (struct wf2qp_queue *)_q; - - _q->ni.oid.subtype = DN_SCHED_WF2QP; - q->F = 0; /* not strictly necessary */ - q->S = q->F + 1; /* mark timestamp as invalid. */ - q->inv_w = ONE_FP / _q->fs->fs.par[0]; - if (_q->mq.head != NULL) { - wf2qp_enqueue(_q->_si, _q, _q->mq.head); - } - return 0; -} - -/* - * Called when the infrastructure removes a queue (e.g. flowset - * is reconfigured). Nothing to do if we did not 'own' the queue, - * otherwise remove it from the right heap and adjust the sum - * of weights. - */ -static int -wf2qp_free_queue(struct dn_queue *q) -{ - struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; - struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1); - - if (alg_fq->S >= alg_fq->F + 1) - return 0; /* nothing to do, not in any heap */ - si->wsum -= q->fs->fs.par[0]; - if (si->wsum > 0) - si->inv_wsum = ONE_FP/si->wsum; - - /* extract from the heap. XXX TODO we may need to adjust V - * to make sure the invariants hold. 
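The extraction below leans on an invariant worth stating plainly: a queue is in exactly one heap at a time, the idle heap if empty, ne_heap if backlogged with S > V, and sch_heap otherwise. A sketch of that membership rule (which_heap is an illustrative name):

#include <stdint.h>
#include <stdio.h>

enum heap { IDLE, NOT_ELIGIBLE, SCHED };

/* Mirror of the test used by wf2qp_free_queue() below. */
static enum heap
which_heap(int backlogged, uint64_t S, uint64_t V)
{
	if (!backlogged)
		return IDLE;		/* empty: idle_heap */
	return S > V ? NOT_ELIGIBLE : SCHED;
}

int
main(void)
{
	printf("%d %d %d\n",
	    which_heap(0, 5, 10),	/* 0: IDLE */
	    which_heap(1, 20, 10),	/* 1: NOT_ELIGIBLE */
	    which_heap(1, 5, 10));	/* 2: SCHED */
	return 0;
}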
- */ - if (q->mq.head == NULL) { - heap_extract(&si->idle_heap, q); - } else if (DN_KEY_LT(si->V, alg_fq->S)) { - heap_extract(&si->ne_heap, q); - } else { - heap_extract(&si->sch_heap, q); - } - return 0; -} - -/* - * WF2Q+ scheduler descriptor - * contains the type of the scheduler, the name, the size of the - * structures and function pointers. - */ -static struct dn_alg wf2qp_desc = { - _SI( .type = ) DN_SCHED_WF2QP, - _SI( .name = ) "WF2Q+", - _SI( .flags = ) DN_MULTIQUEUE, - - /* we need extra space in the si and the queue */ - _SI( .schk_datalen = ) 0, - _SI( .si_datalen = ) sizeof(struct wf2qp_si), - _SI( .q_datalen = ) sizeof(struct wf2qp_queue) - - sizeof(struct dn_queue), - - _SI( .enqueue = ) wf2qp_enqueue, - _SI( .dequeue = ) wf2qp_dequeue, - - _SI( .config = ) NULL, - _SI( .destroy = ) NULL, - _SI( .new_sched = ) wf2qp_new_sched, - _SI( .free_sched = ) wf2qp_free_sched, - - _SI( .new_fsk = ) wf2qp_new_fsk, - _SI( .free_fsk = ) NULL, - - _SI( .new_queue = ) wf2qp_new_queue, - _SI( .free_queue = ) wf2qp_free_queue, -}; - - -DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc); diff --git a/freebsd/sys/netpfil/ipfw/ip_dn_glue.c b/freebsd/sys/netpfil/ipfw/ip_dn_glue.c deleted file mode 100644 index 8e0cc36d..00000000 --- a/freebsd/sys/netpfil/ipfw/ip_dn_glue.c +++ /dev/null @@ -1,848 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/*- - * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
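The last deletion, ip_dn_glue.c, translated the RELENG_7/RELENG_8 binary sockopt layouts into the current request format, which is packed as a chain of variable-length objects that each begin with a length/type header. A sketch of the oid_fill()/o_next() packing pattern that appears further down, with struct hdr as a simplified stand-in for struct dn_id:

#include <stdint.h>
#include <stdio.h>

struct hdr {
	uint16_t len;
	uint16_t type;
};

/* Fill the header at *p and advance the cursor past the object,
 * mirroring the o_next() helper below. */
static void *
o_next(void **p, int len, int type)
{
	struct hdr *h = *p;

	h->len = (uint16_t)len;
	h->type = (uint16_t)type;
	*p = (char *)*p + len;		/* bump past this object */
	return h;
}

int
main(void)
{
	uint16_t buf[128];		/* aligned backing store */
	void *p = buf;
	struct hdr *cmd, *sch;

	cmd = o_next(&p, sizeof(struct hdr), 1);	/* command header */
	sch = o_next(&p, sizeof(struct hdr) + 32, 2);	/* scheduler blob */
	printf("packed %ld bytes: types %u then %u\n",
	    (long)((char *)p - (char *)buf), cmd->type, sch->type);
	return 0;
}

This is how dn_compat_configure() below turns one old-style pipe into a DN_CMD_CONFIG header followed by DN_SCH, DN_LINK, DN_FS and, optionally, DN_PROFILE objects in a single buffer.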
- */ - -/* - * $FreeBSD$ - * - * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8 - */ - -#include <rtems/bsd/local/opt_inet6.h> - -#include <rtems/bsd/sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/kernel.h> -#include <rtems/bsd/sys/lock.h> -#include <sys/module.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/rwlock.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/time.h> -#include <sys/taskqueue.h> -#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ -#include <netinet/ip_fw.h> -#include <netinet/ip_dummynet.h> - -#include <netpfil/ipfw/ip_fw_private.h> -#include <netpfil/ipfw/dn_heap.h> -#include <netpfil/ipfw/ip_dn_private.h> -#include <netpfil/ipfw/dn_sched.h> - -/* FREEBSD7.2 ip_dummynet.h r191715*/ - -struct dn_heap_entry7 { - int64_t key; /* sorting key. Topmost element is smallest one */ - void *object; /* object pointer */ -}; - -struct dn_heap7 { - int size; - int elements; - int offset; /* XXX if > 0 this is the offset of direct ptr to obj */ - struct dn_heap_entry7 *p; /* really an array of "size" entries */ -}; - -/* Common to 7.2 and 8 */ -struct dn_flow_set { - SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ - - u_short fs_nr ; /* flow_set number */ - u_short flags_fs; -#define DNOLD_HAVE_FLOW_MASK 0x0001 -#define DNOLD_IS_RED 0x0002 -#define DNOLD_IS_GENTLE_RED 0x0004 -#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ -#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */ -#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */ -#define DNOLD_IS_PIPE 0x4000 -#define DNOLD_IS_QUEUE 0x8000 - - struct dn_pipe7 *pipe ; /* pointer to parent pipe */ - u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ - - int weight ; /* WFQ queue weight */ - int qsize ; /* queue size in slots or bytes */ - int plr ; /* pkt loss rate (2^31-1 means 100%) */ - - struct ipfw_flow_id flow_mask ; - - /* hash table of queues onto this flow_set */ - int rq_size ; /* number of slots */ - int rq_elements ; /* active elements */ - struct dn_flow_queue7 **rq; /* array of rq_size entries */ - - u_int32_t last_expired ; /* do not expire too frequently */ - int backlogged ; /* #active queues for this flowset */ - - /* RED parameters */ -#define SCALE_RED 16 -#define SCALE(x) ( (x) << SCALE_RED ) -#define SCALE_VAL(x) ( (x) >> SCALE_RED ) -#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) - int w_q ; /* queue weight (scaled) */ - int max_th ; /* maximum threshold for queue (scaled) */ - int min_th ; /* minimum threshold for queue (scaled) */ - int max_p ; /* maximum value for p_b (scaled) */ - u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ - u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ - u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ - u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ - u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ - u_int lookup_depth ; /* depth of lookup table */ - int lookup_step ; /* granularity inside the lookup table */ - int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ - int avg_pkt_size ; /* medium packet size */ - int max_pkt_size ; /* max packet size */ -}; -SLIST_HEAD(dn_flow_set_head, dn_flow_set); - -#define DN_IS_PIPE 0x4000 -#define DN_IS_QUEUE 0x8000 -struct dn_flow_queue7 { - struct dn_flow_queue7 *next ; - struct ipfw_flow_id id ; - - 
struct mbuf *head, *tail ; /* queue of packets */ - u_int len ; - u_int len_bytes ; - - u_long numbytes; - - u_int64_t tot_pkts ; /* statistics counters */ - u_int64_t tot_bytes ; - u_int32_t drops ; - - int hash_slot ; /* debugging/diagnostic */ - - /* RED parameters */ - int avg ; /* average queue length est. (scaled) */ - int count ; /* arrivals since last RED drop */ - int random ; /* random value (scaled) */ - u_int32_t q_time; /* start of queue idle time */ - - /* WF2Q+ support */ - struct dn_flow_set *fs ; /* parent flow set */ - int heap_pos ; /* position (index) of struct in heap */ - int64_t sched_time ; /* current time when queue enters ready_heap */ - - int64_t S,F ; /* start time, finish time */ -}; - -struct dn_pipe7 { /* a pipe */ - SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */ - - int pipe_nr ; /* number */ - int bandwidth; /* really, bytes/tick. */ - int delay ; /* really, ticks */ - - struct mbuf *head, *tail ; /* packets in delay line */ - - /* WF2Q+ */ - struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ - struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ - struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ - - int64_t V ; /* virtual time */ - int sum; /* sum of weights of all active sessions */ - - int numbytes; - - int64_t sched_time ; /* time pipe was scheduled in ready_heap */ - - /* - * When the tx clock come from an interface (if_name[0] != '\0'), its name - * is stored below, whereas the ifp is filled when the rule is configured. - */ - char if_name[IFNAMSIZ]; - struct ifnet *ifp ; - int ready ; /* set if ifp != NULL and we got a signal from it */ - - struct dn_flow_set fs ; /* used with fixed-rate flows */ -}; -SLIST_HEAD(dn_pipe_head7, dn_pipe7); - - -/* FREEBSD8 ip_dummynet.h r196045 */ -struct dn_flow_queue8 { - struct dn_flow_queue8 *next ; - struct ipfw_flow_id id ; - - struct mbuf *head, *tail ; /* queue of packets */ - u_int len ; - u_int len_bytes ; - - uint64_t numbytes ; /* credit for transmission (dynamic queues) */ - int64_t extra_bits; /* extra bits simulating unavailable channel */ - - u_int64_t tot_pkts ; /* statistics counters */ - u_int64_t tot_bytes ; - u_int32_t drops ; - - int hash_slot ; /* debugging/diagnostic */ - - /* RED parameters */ - int avg ; /* average queue length est. (scaled) */ - int count ; /* arrivals since last RED drop */ - int random ; /* random value (scaled) */ - int64_t idle_time; /* start of queue idle time */ - - /* WF2Q+ support */ - struct dn_flow_set *fs ; /* parent flow set */ - int heap_pos ; /* position (index) of struct in heap */ - int64_t sched_time ; /* current time when queue enters ready_heap */ - - int64_t S,F ; /* start time, finish time */ -}; - -struct dn_pipe8 { /* a pipe */ - SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */ - - int pipe_nr ; /* number */ - int bandwidth; /* really, bytes/tick. */ - int delay ; /* really, ticks */ - - struct mbuf *head, *tail ; /* packets in delay line */ - - /* WF2Q+ */ - struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ - struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ - struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ - - int64_t V ; /* virtual time */ - int sum; /* sum of weights of all active sessions */ - - /* Same as in dn_flow_queue, numbytes can become large */ - int64_t numbytes; /* bits I can transmit (more or less). 
*/ - uint64_t burst; /* burst size, scaled: bits * hz */ - - int64_t sched_time ; /* time pipe was scheduled in ready_heap */ - int64_t idle_time; /* start of pipe idle time */ - - char if_name[IFNAMSIZ]; - struct ifnet *ifp ; - int ready ; /* set if ifp != NULL and we got a signal from it */ - - struct dn_flow_set fs ; /* used with fixed-rate flows */ - - /* fields to simulate a delay profile */ -#define ED_MAX_NAME_LEN 32 - char name[ED_MAX_NAME_LEN]; - int loss_level; - int samples_no; - int *samples; -}; - -#define ED_MAX_SAMPLES_NO 1024 -struct dn_pipe_max8 { - struct dn_pipe8 pipe; - int samples[ED_MAX_SAMPLES_NO]; -}; -SLIST_HEAD(dn_pipe_head8, dn_pipe8); - -/* - * Changes from 7.2 to 8: - * dn_pipe: - * numbytes from int to int64_t - * add burst (int64_t) - * add idle_time (int64_t) - * add profile - * add struct dn_pipe_max - * add flag DN_HAS_PROFILE - * - * dn_flow_queue - * numbytes from u_long to int64_t - * add extra_bits (int64_t) - * q_time from u_int32_t to int64_t and name idle_time - * - * dn_flow_set unchanged - * - */ - -/* NOTE:XXX copied from dummynet.c */ -#define O_NEXT(p, len) ((void *)((char *)p + len)) -static void -oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) -{ - oid->len = len; - oid->type = type; - oid->subtype = 0; - oid->id = id; -} -/* make room in the buffer and move the pointer forward */ -static void * -o_next(struct dn_id **o, int len, int type) -{ - struct dn_id *ret = *o; - oid_fill(ret, len, type, 0); - *o = O_NEXT(*o, len); - return ret; -} - - -static size_t pipesize7 = sizeof(struct dn_pipe7); -static size_t pipesize8 = sizeof(struct dn_pipe8); -static size_t pipesizemax8 = sizeof(struct dn_pipe_max8); - -/* Indicate 'ipfw' version - * 1: from FreeBSD 7.2 - * 0: from FreeBSD 8 - * -1: unknow (for now is unused) - * - * It is update when a IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives - * NOTE: if a IP_DUMMYNET_GET arrives and the 'ipfw' version is unknow, - * it is suppose to be the FreeBSD 8 version. - */ -static int is7 = 0; - -static int -convertflags2new(int src) -{ - int dst = 0; - - if (src & DNOLD_HAVE_FLOW_MASK) - dst |= DN_HAVE_MASK; - if (src & DNOLD_QSIZE_IS_BYTES) - dst |= DN_QSIZE_BYTES; - if (src & DNOLD_NOERROR) - dst |= DN_NOERROR; - if (src & DNOLD_IS_RED) - dst |= DN_IS_RED; - if (src & DNOLD_IS_GENTLE_RED) - dst |= DN_IS_GENTLE_RED; - if (src & DNOLD_HAS_PROFILE) - dst |= DN_HAS_PROFILE; - - return dst; -} - -static int -convertflags2old(int src) -{ - int dst = 0; - - if (src & DN_HAVE_MASK) - dst |= DNOLD_HAVE_FLOW_MASK; - if (src & DN_IS_RED) - dst |= DNOLD_IS_RED; - if (src & DN_IS_GENTLE_RED) - dst |= DNOLD_IS_GENTLE_RED; - if (src & DN_NOERROR) - dst |= DNOLD_NOERROR; - if (src & DN_HAS_PROFILE) - dst |= DNOLD_HAS_PROFILE; - if (src & DN_QSIZE_BYTES) - dst |= DNOLD_QSIZE_IS_BYTES; - - return dst; -} - -static int -dn_compat_del(void *v) -{ - struct dn_pipe7 *p = (struct dn_pipe7 *) v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; - struct { - struct dn_id oid; - uintptr_t a[1]; /* add more if we want a list */ - } cmd; - - /* XXX DN_API_VERSION ??? 
-static int -dn_compat_del(void *v) -{ - struct dn_pipe7 *p = (struct dn_pipe7 *) v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; - struct { - struct dn_id oid; - uintptr_t a[1]; /* add more if we want a list */ - } cmd; - - /* XXX DN_API_VERSION ??? */ - oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); - - if (is7) { - if (p->pipe_nr == 0 && p->fs.fs_nr == 0) - return EINVAL; - if (p->pipe_nr != 0 && p->fs.fs_nr != 0) - return EINVAL; - } else { - if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) - return EINVAL; - if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) - return EINVAL; - } - - if (p->pipe_nr != 0) { /* pipe x delete */ - cmd.a[0] = p->pipe_nr; - cmd.oid.subtype = DN_LINK; - } else { /* queue x delete */ - cmd.oid.subtype = DN_FS; - cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr; - } - - return do_config(&cmd, cmd.oid.len); -} - -static int -dn_compat_config_queue(struct dn_fs *fs, void* v) -{ - struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - struct dn_flow_set *f; - - if (is7) - f = &p7->fs; - else - f = &p8->fs; - - fs->fs_nr = f->fs_nr; - fs->sched_nr = f->parent_nr; - fs->flow_mask = f->flow_mask; - fs->buckets = f->rq_size; - fs->qsize = f->qsize; - fs->plr = f->plr; - fs->par[0] = f->weight; - fs->flags = convertflags2new(f->flags_fs); - if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { - fs->w_q = f->w_q; - fs->max_th = f->max_th; - fs->min_th = f->min_th; - fs->max_p = f->max_p; - } - - return 0; -} - -static int -dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, - struct dn_fs *fs, void* v) -{ - struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - int i = p7->pipe_nr; - - sch->sched_nr = i; - sch->oid.subtype = 0; - p->link_nr = i; - fs->fs_nr = i + 2*DN_MAX_ID; - fs->sched_nr = i + DN_MAX_ID; - - /* Common to 7 and 8 */ - p->bandwidth = p7->bandwidth; - p->delay = p7->delay; - if (!is7) { - /* FreeBSD 8 has burst */ - p->burst = p8->burst; - } - - /* fill the FIFO flowset */ - dn_compat_config_queue(fs, v); - fs->fs_nr = i + 2*DN_MAX_ID; - fs->sched_nr = i + DN_MAX_ID; - - /* Move scheduler-related parameters from fs to sch */ - sch->buckets = fs->buckets; /*XXX*/ - fs->buckets = 0; - if (fs->flags & DN_HAVE_MASK) { - sch->flags |= DN_HAVE_MASK; - fs->flags &= ~DN_HAVE_MASK; - sch->sched_mask = fs->flow_mask; - bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); - } - - return 0; -} -
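For reference, dn_compat_config_pipe() above spreads one legacy pipe number i across the three new-style objects using DN_MAX_ID offsets. This is just a summary of the assignments in the code, not new behavior:

/*
 * Number-space layout for legacy 'pipe i config':
 *
 *   scheduler   sch->sched_nr = i
 *   link        p->link_nr    = i
 *   FIFO fs     fs->fs_nr     = i + 2*DN_MAX_ID
 *   fs parent   fs->sched_nr  = i + DN_MAX_ID
 *
 * Plain 'queue x config' flowsets keep fs_nr < DN_MAX_ID, which is how
 * copy_data_helper_compat() later tells the two kinds apart.
 */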
-static int -dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, - void *v) -{ - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - - p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); - - pf->link_nr = p->link_nr; - pf->loss_level = p8->loss_level; -// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant? - pf->samples_no = p8->samples_no; - strncpy(pf->name, p8->name,sizeof(pf->name)); - bcopy(p8->samples, pf->samples, sizeof(pf->samples)); - - return 0; -} - -/* - * If p->pipe_nr != 0 the command is 'pipe x config', so we need to create - * the three main structs, else only a flowset is created - */ -static int -dn_compat_configure(void *v) -{ - struct dn_id *buf = NULL, *base; - struct dn_sch *sch = NULL; - struct dn_link *p = NULL; - struct dn_fs *fs = NULL; - struct dn_profile *pf = NULL; - int lmax; - int error; - - struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - - int i; /* number of objects to configure */ - - lmax = sizeof(struct dn_id); /* command header */ - lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + - sizeof(struct dn_fs) + sizeof(struct dn_profile); - - base = buf = malloc(lmax, M_DUMMYNET, M_WAIT|M_ZERO); - o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); - base->id = DN_API_VERSION; - - /* pipe_nr is the same in p7 and p8 */ - i = p7->pipe_nr; - if (i != 0) { /* pipe config */ - sch = o_next(&buf, sizeof(*sch), DN_SCH); - p = o_next(&buf, sizeof(*p), DN_LINK); - fs = o_next(&buf, sizeof(*fs), DN_FS); - - error = dn_compat_config_pipe(sch, p, fs, v); - if (error) { - free(buf, M_DUMMYNET); - return error; - } - if (!is7 && p8->samples_no > 0) { - /* Add profiles */ - pf = o_next(&buf, sizeof(*pf), DN_PROFILE); - error = dn_compat_config_profile(pf, p, v); - if (error) { - free(buf, M_DUMMYNET); - return error; - } - } - } else { /* queue config */ - fs = o_next(&buf, sizeof(*fs), DN_FS); - error = dn_compat_config_queue(fs, v); - if (error) { - free(buf, M_DUMMYNET); - return error; - } - } - error = do_config(base, (char *)buf - (char *)base); - - if (buf) - free(buf, M_DUMMYNET); - return error; -} - -int -dn_compat_calc_size(void) -{ - int need = 0; - /* XXX use FreeBSD 8 struct size */ - /* NOTE: - * - half scheduler: schk_count/2 - * - all flowset: fsk_count - * - all flowset queues: queue_count - * - all pipe queue: si_count - */ - need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2; - need += dn_cfg.fsk_count * sizeof(struct dn_flow_set); - need += dn_cfg.si_count * sizeof(struct dn_flow_queue8); - need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8); - - return need; -} - -int -dn_c_copy_q (void *_ni, void *arg) -{ - struct copy_args *a = arg; - struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start; - struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start; - struct dn_flow *ni = (struct dn_flow *)_ni; - int size = 0; - - /* XXX hash slot not set */ - /* No difference between 7.2/8 */ - fq7->len = ni->length; - fq7->len_bytes = ni->len_bytes; - fq7->id = ni->fid; - - if (is7) { - size = sizeof(struct dn_flow_queue7); - fq7->tot_pkts = ni->tot_pkts; - fq7->tot_bytes = ni->tot_bytes; - fq7->drops = ni->drops; - } else { - size = sizeof(struct dn_flow_queue8); - fq8->tot_pkts = ni->tot_pkts; - fq8->tot_bytes = ni->tot_bytes; - fq8->drops = ni->drops; - } - - *a->start += size; - return 0; -} - -int -dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq) -{ - struct dn_link *l = &s->link; - struct dn_fsk *f = s->fs; - - struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start; - struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start; - struct dn_flow_set *fs; - int size = 0; - - if (is7) { - fs = &pipe7->fs; - size = sizeof(struct dn_pipe7); - } else { - fs = &pipe8->fs; - size = sizeof(struct dn_pipe8); - } - - /* These 4 fields are the same in pipe7 and pipe8 */ - pipe7->next.sle_next = 
(struct dn_pipe7 *)DN_IS_PIPE; - pipe7->bandwidth = l->bandwidth; - pipe7->delay = l->delay * 1000 / hz; - pipe7->pipe_nr = l->link_nr - DN_MAX_ID; - - if (!is7) { - if (s->profile) { - struct dn_profile *pf = s->profile; - strncpy(pipe8->name, pf->name, sizeof(pf->name)); - pipe8->loss_level = pf->loss_level; - pipe8->samples_no = pf->samples_no; - } - pipe8->burst = div64(l->burst , 8 * hz); - } - - fs->flow_mask = s->sch.sched_mask; - fs->rq_size = s->sch.buckets ? s->sch.buckets : 1; - - fs->parent_nr = l->link_nr - DN_MAX_ID; - fs->qsize = f->fs.qsize; - fs->plr = f->fs.plr; - fs->w_q = f->fs.w_q; - fs->max_th = f->max_th; - fs->min_th = f->min_th; - fs->max_p = f->fs.max_p; - fs->rq_elements = nq; - - fs->flags_fs = convertflags2old(f->fs.flags); - - *a->start += size; - return 0; -} - - -int -dn_compat_copy_pipe(struct copy_args *a, void *_o) -{ - int have = a->end - *a->start; - int need = 0; - int pipe_size = sizeof(struct dn_pipe8); - int queue_size = sizeof(struct dn_flow_queue8); - int n_queue = 0; /* number of queues */ - - struct dn_schk *s = (struct dn_schk *)_o; - /* calculate needed space: - * - struct dn_pipe - * - if there are instances, dn_queue * n_instances - */ - n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) : - (s->siht ? 1 : 0)); - need = pipe_size + queue_size * n_queue; - if (have < need) { - D("have %d < need %d", have, need); - return 1; - } - /* copy pipe */ - dn_c_copy_pipe(s, a, n_queue); - - /* copy queues */ - if (s->sch.flags & DN_HAVE_MASK) - dn_ht_scan(s->siht, dn_c_copy_q, a); - else if (s->siht) - dn_c_copy_q(s->siht, a); - return 0; -} - -int -dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq) -{ - struct dn_flow_set *fs = (struct dn_flow_set *)*a->start; - - fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; - fs->fs_nr = f->fs.fs_nr; - fs->qsize = f->fs.qsize; - fs->plr = f->fs.plr; - fs->w_q = f->fs.w_q; - fs->max_th = f->max_th; - fs->min_th = f->min_th; - fs->max_p = f->fs.max_p; - fs->flow_mask = f->fs.flow_mask; - fs->rq_elements = nq; - fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1); - fs->parent_nr = f->fs.sched_nr; - fs->weight = f->fs.par[0]; - - fs->flags_fs = convertflags2old(f->fs.flags); - *a->start += sizeof(struct dn_flow_set); - return 0; -} - -int -dn_compat_copy_queue(struct copy_args *a, void *_o) -{ - int have = a->end - *a->start; - int need = 0; - int fs_size = sizeof(struct dn_flow_set); - int queue_size = sizeof(struct dn_flow_queue8); - - struct dn_fsk *fs = (struct dn_fsk *)_o; - int n_queue = 0; /* number of queues */ - - n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) : - (fs->qht ? 1 : 0)); - - need = fs_size + queue_size * n_queue; - if (have < need) { - D("have < need"); - return 1; - } - - /* copy flowset */ - dn_c_copy_fs(fs, a, n_queue); - - /* copy queues */ - if (fs->fs.flags & DN_HAVE_MASK) - dn_ht_scan(fs->qht, dn_c_copy_q, a); - else if (fs->qht) - dn_c_copy_q(fs->qht, a); - - return 0; -} - -int -copy_data_helper_compat(void *_o, void *_arg) -{ - struct copy_args *a = _arg; - - if (a->type == DN_COMPAT_PIPE) { - struct dn_schk *s = _o; - if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) { - return 0; /* not old type */ - } - /* copy pipe parameters, and if instance exists, copy - * other parameters and eventually queues. 
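dn_c_copy_pipe() above exports kernel units back to the legacy ABI: link delay is held in ticks internally but in milliseconds in the old structs, and burst is stored scaled as bits * hz. A worked example, assuming hz = 1000 (numbers are illustrative):

/*
 * Assumed hz = 1000:
 *   delay: 50 ticks         -> pipe7->delay = 50 * 1000 / hz = 50 ms
 *   burst: 10000 user bytes -> held internally as 10000 * 8 * hz
 *          (bits * hz), so the export div64(l->burst, 8 * hz)
 *          returns 10000 bytes again.
 */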
- */ - if(dn_compat_copy_pipe(a, _o)) - return DNHT_SCAN_END; - } else if (a->type == DN_COMPAT_QUEUE) { - struct dn_fsk *fs = _o; - if (fs->fs.fs_nr >= DN_MAX_ID) - return 0; - if (dn_compat_copy_queue(a, _o)) - return DNHT_SCAN_END; - } - return 0; -} - -/* Main function to manage old requests */ -int -ip_dummynet_compat(struct sockopt *sopt) -{ - int error=0; - void *v = NULL; - struct dn_id oid; - - /* Length of data, used to find the ipfw version... */ - int len = sopt->sopt_valsize; - - /* len can be 0 if command was dummynet_flush */ - if (len == pipesize7) { - D("setting compatibility with FreeBSD 7.2"); - is7 = 1; - } - else if (len == pipesize8 || len == pipesizemax8) { - D("setting compatibility with FreeBSD 8"); - is7 = 0; - } - - switch (sopt->sopt_name) { - default: - printf("dummynet: -- unknown option %d", sopt->sopt_name); - error = EINVAL; - break; - - case IP_DUMMYNET_FLUSH: - oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); - do_config(&oid, oid.len); - break; - - case IP_DUMMYNET_DEL: - v = malloc(len, M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, v, len, len); - if (error) - break; - error = dn_compat_del(v); - free(v, M_TEMP); - break; - - case IP_DUMMYNET_CONFIGURE: - v = malloc(len, M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, v, len, len); - if (error) - break; - error = dn_compat_configure(v); - free(v, M_TEMP); - break; - - case IP_DUMMYNET_GET: { - void *buf; - int ret; - int original_size = sopt->sopt_valsize; - int size; - - ret = dummynet_get(sopt, &buf); - if (ret) - return 0;//XXX ? - size = sopt->sopt_valsize; - sopt->sopt_valsize = original_size; - D("size=%d, buf=%p", size, buf); - ret = sooptcopyout(sopt, buf, size); - if (ret) - printf(" %s ERROR sooptcopyout\n", __FUNCTION__); - if (buf) - free(buf, M_DUMMYNET); - } - } - - return error; -} - - diff --git a/freebsd/sys/netpfil/ipfw/ip_dn_io.c b/freebsd/sys/netpfil/ipfw/ip_dn_io.c deleted file mode 100644 index 23392a55..00000000 --- a/freebsd/sys/netpfil/ipfw/ip_dn_io.c +++ /dev/null @@ -1,852 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/*- - * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * Dummynet portions related to packet handling. 
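For reference, ip_dummynet_compat() above is reached through raw-IP socket options, and the payload size is exactly what drives the is7 sniffing. A hypothetical minimal userland sketch (struct dn_pipe7 stands in for the legacy struct dn_pipe that an old ipfw binary would use; all names outside the sockopt are illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <err.h>

static void
legacy_pipe_config(void)
{
	struct dn_pipe7 p;	/* sizeof(p) == pipesize7 => kernel sets is7 = 1 */
	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (s < 0)
		err(1, "socket");
	memset(&p, 0, sizeof(p));
	p.pipe_nr = 10;		/* 'pipe 10 config ...' */
	p.bandwidth = 500000;
	if (setsockopt(s, IPPROTO_IP, IP_DUMMYNET_CONFIGURE,
	    &p, sizeof(p)) < 0)
		err(1, "IP_DUMMYNET_CONFIGURE");
}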
- */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <rtems/bsd/local/opt_inet6.h> - -#include <rtems/bsd/sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/kernel.h> -#include <rtems/bsd/sys/lock.h> -#include <sys/module.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/rwlock.h> -#include <sys/socket.h> -#include <sys/time.h> -#include <sys/sysctl.h> - -#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ -#include <net/netisr.h> -#include <net/vnet.h> - -#include <netinet/in.h> -#include <netinet/ip.h> /* ip_len, ip_off */ -#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ -#include <netinet/ip_fw.h> -#include <netinet/ip_dummynet.h> -#include <netinet/if_ether.h> /* various ether_* routines */ -#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */ -#include <netinet6/ip6_var.h> - -#include <netpfil/ipfw/ip_fw_private.h> -#include <netpfil/ipfw/dn_heap.h> -#include <netpfil/ipfw/ip_dn_private.h> -#include <netpfil/ipfw/dn_sched.h> - -/* - * We keep a private variable for the simulation time, but we could - * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) - * instead of dn_cfg.curr_time - */ - -struct dn_parms dn_cfg; -//VNET_DEFINE(struct dn_parms, _base_dn_cfg); - -static long tick_last; /* Last tick duration (usec). */ -static long tick_delta; /* Last vs standard tick diff (usec). */ -static long tick_delta_sum; /* Accumulated tick difference (usec).*/ -static long tick_adjustment; /* Tick adjustments done. */ -static long tick_lost; /* Lost(coalesced) ticks number. */ -/* Adjusted vs non-adjusted curr_time difference (ticks). */ -static long tick_diff; - -static unsigned long io_pkt; -static unsigned long io_pkt_fast; -static unsigned long io_pkt_drop; - -/* - * We use a heap to store entities for which we have pending timer events. - * The heap is checked at every tick and all entities with expired events - * are extracted. 
- */ - -MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); - -extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); - -#ifdef SYSCTL_NODE - -SYSBEGIN(f4) - -SYSCTL_DECL(_net_inet); -SYSCTL_DECL(_net_inet_ip); -static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); - -/* wrapper to pass dn_cfg fields to SYSCTL_* */ -//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) -#define DC(x) (&(dn_cfg.x)) -/* parameters */ - -static int -sysctl_hash_size(SYSCTL_HANDLER_ARGS) -{ - int error, value; - - value = dn_cfg.hash_size; - error = sysctl_handle_int(oidp, &value, 0, req); - if (error != 0 || req->newptr == NULL) - return (error); - if (value < 16 || value > 65536) - return (EINVAL); - dn_cfg.hash_size = value; - return (0); -} - -SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, hash_size, - CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_hash_size, - "I", "Default hash table size"); - -static int -sysctl_limits(SYSCTL_HANDLER_ARGS) -{ - int error; - long value; - - if (arg2 != 0) - value = dn_cfg.slot_limit; - else - value = dn_cfg.byte_limit; - error = sysctl_handle_long(oidp, &value, 0, req); - - if (error != 0 || req->newptr == NULL) - return (error); - if (arg2 != 0) { - if (value < 1) - return (EINVAL); - dn_cfg.slot_limit = value; - } else { - if (value < 1500) - return (EINVAL); - dn_cfg.byte_limit = value; - } - return (0); -} - -SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, - CTLTYPE_LONG | CTLFLAG_RW, 0, 1, sysctl_limits, - "L", "Upper limit in slots for pipe queue."); -SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, - CTLTYPE_LONG | CTLFLAG_RW, 0, 0, sysctl_limits, - "L", "Upper limit in bytes for pipe queue."); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, - CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io."); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, - CTLFLAG_RW, DC(debug), 0, "Dummynet debug level"); - -/* RED parameters */ -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, - CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, - CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, - CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size"); - -/* time adjustment */ -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, - CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, - CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, - CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, - CTLFLAG_RD, &tick_diff, 0, - "Adjusted vs non-adjusted curr_time difference (ticks)."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, - CTLFLAG_RD, &tick_lost, 0, - "Number of ticks coalesced by dummynet taskqueue."); - -/* Drain parameters */ -SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire, - CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes"); -SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle, - CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes"); - -/* statistics */ -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, - CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, - CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances"); 
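These knobs are reachable through the standard sysctl(3) interface; a minimal hypothetical userland sketch, with the OID names taken from the declarations above:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int v = 0, newv = 256;
	size_t len = sizeof(v);

	/* read the current hash table size */
	if (sysctlbyname("net.inet.ip.dummynet.hash_size",
	    &v, &len, NULL, 0) == 0)
		printf("hash_size = %d\n", v);
	/* sysctl_hash_size() above rejects values outside [16, 65536] */
	return (sysctlbyname("net.inet.ip.dummynet.hash_size",
	    NULL, NULL, &newv, sizeof(newv)));
}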
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, - CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, - CTLFLAG_RD, DC(queue_count), 0, "Number of queues"); -SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, - CTLFLAG_RD, &io_pkt, 0, - "Number of packets passed to dummynet."); -SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, - CTLFLAG_RD, &io_pkt_fast, 0, - "Number of packets bypassed dummynet scheduler."); -SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, - CTLFLAG_RD, &io_pkt_drop, 0, - "Number of packets dropped by dummynet."); -#undef DC -SYSEND - -#endif - -static void dummynet_send(struct mbuf *); - -/* - * Packets processed by dummynet have an mbuf tag associated with - * them that carries their dummynet state. - * Outside dummynet, only the 'rule' field is relevant, and it must - * be at the beginning of the structure. - */ -struct dn_pkt_tag { - struct ipfw_rule_ref rule; /* matching rule */ - - /* second part, dummynet specific */ - int dn_dir; /* action when packet comes out.*/ - /* see ip_fw_private.h */ - uint64_t output_time; /* when the pkt is due for delivery*/ - struct ifnet *ifp; /* interface, for ip_output */ - struct _ip6dn_args ip6opt; /* XXX ipv6 options */ -}; - -/* - * Return the mbuf tag holding the dummynet state (it should - * be the first one on the list). - */ -static struct dn_pkt_tag * -dn_tag_get(struct mbuf *m) -{ - struct m_tag *mtag = m_tag_first(m); - KASSERT(mtag != NULL && - mtag->m_tag_cookie == MTAG_ABI_COMPAT && - mtag->m_tag_id == PACKET_TAG_DUMMYNET, - ("packet on dummynet queue w/o dummynet tag!")); - return (struct dn_pkt_tag *)(mtag+1); -} - -static inline void -mq_append(struct mq *q, struct mbuf *m) -{ - if (q->head == NULL) - q->head = m; - else - q->tail->m_nextpkt = m; - q->tail = m; - m->m_nextpkt = NULL; -} - -/* - * Dispose of a list of packets. Use a function so if we need to do - * more work, this is a central point to do it. - */ -void dn_free_pkts(struct mbuf *mnext) -{ - struct mbuf *m; - - while ((m = mnext) != NULL) { - mnext = m->m_nextpkt; - FREE_PKT(m); - } -} - -static int -red_drops (struct dn_queue *q, int len) -{ - /* - * RED algorithm - * - * RED calculates the average queue size (avg) using a low-pass filter - * with an exponential weighted (w_q) moving average: - * avg <- (1-w_q) * avg + w_q * q_size - * where q_size is the queue length (measured in bytes or packets). - * - * If q_size == 0, we compute the idle time for the link, and set - * avg = (1 - w_q)^(idle/s) - * where s is the time needed for transmitting a medium-sized packet. - * - * Now, if avg < min_th the packet is enqueued. - * If avg > max_th the packet is dropped. Otherwise, the packet is - * dropped with probability P, a function of avg. - */ - - struct dn_fsk *fs = q->fs; - int64_t p_b = 0; - - /* Queue in bytes or packets? */ - uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ? - q->ni.len_bytes : q->ni.length; - - /* Average queue size estimation. */ - if (q_size != 0) { - /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ - int diff = SCALE(q_size) - q->avg; - int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); - - q->avg += (int)v; - } else { - /* - * Queue is empty, find for how long the queue has been - * empty and use a lookup table for computing - * (1 - w_q)^(idle_time/s) where s is the time to send a - * (small) packet. - * XXX check wraps... 
- */ - if (q->avg) { - u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step); - - q->avg = (t < fs->lookup_depth) ? - SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; - } - } - - /* Should I drop? */ - if (q->avg < fs->min_th) { - q->count = -1; - return (0); /* accept packet */ - } - if (q->avg >= fs->max_th) { /* average queue >= max threshold */ - if (fs->fs.flags & DN_IS_GENTLE_RED) { - /* - * According to Gentle-RED, if avg is greater than - * max_th the packet is dropped with a probability - * p_b = c_3 * avg - c_4 - * where c_3 = (1 - max_p) / max_th - * c_4 = 1 - 2 * max_p - */ - p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - - fs->c_4; - } else { - q->count = -1; - return (1); - } - } else if (q->avg > fs->min_th) { - /* - * We compute p_b using the linear dropping function - * p_b = c_1 * avg - c_2 - * where c_1 = max_p / (max_th - min_th) - * c_2 = max_p * min_th / (max_th - min_th) - */ - p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; - } - - if (fs->fs.flags & DN_QSIZE_BYTES) - p_b = div64((p_b * len) , fs->max_pkt_size); - if (++q->count == 0) - q->random = random() & 0xffff; - else { - /* - * q->count counts packets arrived since last drop, so a greater - * value of q->count means a greater packet drop probability. - */ - if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { - q->count = 0; - /* After a drop we calculate a new random value. */ - q->random = random() & 0xffff; - return (1); /* drop */ - } - } - /* End of RED algorithm. */ - - return (0); /* accept */ - -} -
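A small worked example may help decode the scaled arithmetic in red_drops() above (illustrative numbers, not from the source):

/*
 * Assume min_th = 5 and max_th = 15 (packets), max_p = 0.1:
 *   c_1 = max_p / (max_th - min_th)          = 0.01
 *   c_2 = max_p * min_th / (max_th - min_th) = 0.05
 * At avg = 10 packets the linear branch gives
 *   p_b = c_1 * avg - c_2 = 0.05
 * and the final test scales p_b by q->count, so after e.g. 10
 * arrivals without a drop the comparison against q->random uses
 * roughly 0.5 -- persistent congestion is dropped sooner.
 */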
-/* - * Enqueue a packet in q, subject to space and queue management policy - * (whose parameters are in q->fs). - * Update stats for the queue and the scheduler. - * Return 0 on success, 1 on drop. The packet is consumed anyway. - */ -int -dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) -{ - struct dn_fs *f; - struct dn_flow *ni; /* stats for scheduler instance */ - uint64_t len; - - if (q->fs == NULL || q->_si == NULL) { - printf("%s fs %p si %p, dropping\n", - __FUNCTION__, q->fs, q->_si); - FREE_PKT(m); - return 1; - } - f = &(q->fs->fs); - ni = &q->_si->ni; - len = m->m_pkthdr.len; - /* Update statistics, then check reasons to drop pkt. */ - q->ni.tot_bytes += len; - q->ni.tot_pkts++; - ni->tot_bytes += len; - ni->tot_pkts++; - if (drop) - goto drop; - if (f->plr && random() < f->plr) - goto drop; - if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) - goto drop; - if (f->flags & DN_QSIZE_BYTES) { - if (q->ni.len_bytes > f->qsize) - goto drop; - } else if (q->ni.length >= f->qsize) { - goto drop; - } - mq_append(&q->mq, m); - q->ni.length++; - q->ni.len_bytes += len; - ni->length++; - ni->len_bytes += len; - return 0; - -drop: - io_pkt_drop++; - q->ni.drops++; - ni->drops++; - FREE_PKT(m); - return 1; -} - -/* - * Fetch packets from the delay line which are due now. If there are - * leftover packets, reinsert the delay line in the heap. - * Runs under scheduler lock. - */ -static void -transmit_event(struct mq *q, struct delay_line *dline, uint64_t now) -{ - struct mbuf *m; - struct dn_pkt_tag *pkt = NULL; - - dline->oid.subtype = 0; /* not in heap */ - while ((m = dline->mq.head) != NULL) { - pkt = dn_tag_get(m); - if (!DN_KEY_LEQ(pkt->output_time, now)) - break; - dline->mq.head = m->m_nextpkt; - mq_append(q, m); - } - if (m != NULL) { - dline->oid.subtype = 1; /* in heap */ - heap_insert(&dn_cfg.evheap, pkt->output_time, dline); - } -} - -/* - * Convert the additional MAC overheads/delays into an equivalent - * number of bits for the given data rate. The samples are - * in milliseconds so we need to divide by 1000. - */ -static uint64_t -extra_bits(struct mbuf *m, struct dn_schk *s) -{ - int index; - uint64_t bits; - struct dn_profile *pf = s->profile; - - if (!pf || pf->samples_no == 0) - return 0; - index = random() % pf->samples_no; - bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000); - if (index >= pf->loss_level) { - struct dn_pkt_tag *dt = dn_tag_get(m); - if (dt) - dt->dn_dir = DIR_DROP; - } - return bits; -} - -/* - * Send traffic from a scheduler instance due by 'now'. - * Return a pointer to the head of the queue. - */ -static struct mbuf * -serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) -{ - struct mq def_q; - struct dn_schk *s = si->sched; - struct mbuf *m = NULL; - int delay_line_idle = (si->dline.mq.head == NULL); - int done, bw; - - if (q == NULL) { - q = &def_q; - q->head = NULL; - } - - bw = s->link.bandwidth; - si->kflags &= ~DN_ACTIVE; - - if (bw > 0) - si->credit += (now - si->sched_time) * bw; - else - si->credit = 0; - si->sched_time = now; - done = 0; - while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { - uint64_t len_scaled; - - done++; - len_scaled = (bw == 0) ? 0 : hz * - (m->m_pkthdr.len * 8 + extra_bits(m, s)); - si->credit -= len_scaled; - /* Move packet in the delay line */ - dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay ; - mq_append(&si->dline.mq, m); - } - - /* - * If credit >= 0 the instance is idle, mark time. - * Otherwise put back in the heap, and adjust the output - * time of the last inserted packet, m, which was too early. - */ - if (si->credit >= 0) { - si->idle_time = now; - } else { - uint64_t t; - KASSERT (bw > 0, ("bw=0 and credit<0 ?")); - t = div64(bw - 1 - si->credit, bw); - if (m) - dn_tag_get(m)->output_time += t; - si->kflags |= DN_ACTIVE; - heap_insert(&dn_cfg.evheap, now + t, si); - } - if (delay_line_idle && done) - transmit_event(q, &si->dline, now); - return q->head; -} - -/* - * The timer handler for dummynet. Time is computed in ticks, but - * the code is tolerant to the actual rate at which this is called. - * Once complete, the function reschedules itself for the next tick. - */ -void -dummynet_task(void *context, int pending) -{ - struct timeval t; - struct mq q = { NULL, NULL }; /* queue to accumulate results */ - - CURVNET_SET((struct vnet *)context); - - DN_BH_WLOCK(); - - /* Update number of lost(coalesced) ticks. */ - tick_lost += pending - 1; - - getmicrouptime(&t); - /* Last tick duration (usec). */ - tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + - (t.tv_usec - dn_cfg.prev_t.tv_usec); - /* Last tick vs standard tick difference (usec). */ - tick_delta = (tick_last * hz - 1000000) / hz; - /* Accumulated tick difference (usec). */ - tick_delta_sum += tick_delta; - - dn_cfg.prev_t = t; - - /* - * Adjust curr_time if the accumulated tick difference is - * greater than the 'standard' tick. Since curr_time should - * be monotonically increasing, we do positive adjustments - * as required, and throttle curr_time in case of negative - * adjustment. 
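To make the drift correction below concrete, here is a worked example with an assumed hz of 1000, so one tick is nominally 1000 us (numbers are illustrative):

/*
 * Assume hz = 1000. If getmicrouptime() shows the last tick really
 * took 1002 us, then
 *   tick_delta = (1002 * 1000 - 1000000) / 1000 = 2 us
 * and after 500 such ticks tick_delta_sum reaches 1000 us >= tick,
 * so diff = 1: curr_time advances by one extra tick and the modulo
 * resets tick_delta_sum to 0.
 */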
- */ - dn_cfg.curr_time++; - if (tick_delta_sum - tick >= 0) { - int diff = tick_delta_sum / tick; - - dn_cfg.curr_time += diff; - tick_diff += diff; - tick_delta_sum %= tick; - tick_adjustment++; - } else if (tick_delta_sum + tick <= 0) { - dn_cfg.curr_time--; - tick_diff--; - tick_delta_sum += tick; - tick_adjustment++; - } - - /* serve pending events, accumulate in q */ - for (;;) { - struct dn_id *p; /* generic parameter to handler */ - - if (dn_cfg.evheap.elements == 0 || - DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) - break; - p = HEAP_TOP(&dn_cfg.evheap)->object; - heap_extract(&dn_cfg.evheap, NULL); - - if (p->type == DN_SCH_I) { - serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); - } else { /* extracted a delay line */ - transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); - } - } - if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) { - dn_cfg.expire_cycle = 0; - dn_drain_scheduler(); - dn_drain_queue(); - } - - DN_BH_WUNLOCK(); - dn_reschedule(); - if (q.head != NULL) - dummynet_send(q.head); - CURVNET_RESTORE(); -} - -/* - * forward a chain of packets to the proper destination. - * This runs outside the dummynet lock. - */ -static void -dummynet_send(struct mbuf *m) -{ - struct mbuf *n; - - for (; m != NULL; m = n) { - struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ - struct m_tag *tag; - int dst; - - n = m->m_nextpkt; - m->m_nextpkt = NULL; - tag = m_tag_first(m); - if (tag == NULL) { /* should not happen */ - dst = DIR_DROP; - } else { - struct dn_pkt_tag *pkt = dn_tag_get(m); - /* extract the dummynet info, rename the tag - * to carry reinject info. - */ - dst = pkt->dn_dir; - ifp = pkt->ifp; - tag->m_tag_cookie = MTAG_IPFW_RULE; - tag->m_tag_id = 0; - } - - switch (dst) { - case DIR_OUT: - SET_HOST_IPLEN(mtod(m, struct ip *)); - ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); - break ; - - case DIR_IN : - /* put header in network format for ip_input() */ - //SET_NET_IPLEN(mtod(m, struct ip *)); - netisr_dispatch(NETISR_IP, m); - break; - -#ifdef INET6 - case DIR_IN | PROTO_IPV6: - netisr_dispatch(NETISR_IPV6, m); - break; - - case DIR_OUT | PROTO_IPV6: - ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); - break; -#endif - - case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ - if (bridge_dn_p != NULL) - ((*bridge_dn_p)(m, ifp)); - else - printf("dummynet: if_bridge not loaded\n"); - - break; - - case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ - /* - * The Ethernet code assumes the Ethernet header is - * contiguous in the first mbuf header. - * Insure this is true. - */ - if (m->m_len < ETHER_HDR_LEN && - (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { - printf("dummynet/ether: pullup failed, " - "dropping packet\n"); - break; - } - ether_demux(m->m_pkthdr.rcvif, m); - break; - - case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */ - ether_output_frame(ifp, m); - break; - - case DIR_DROP: - /* drop the packet after some time */ - FREE_PKT(m); - break; - - default: - printf("dummynet: bad switch %d!\n", dst); - FREE_PKT(m); - break; - } - } -} - -static inline int -tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) -{ - struct dn_pkt_tag *dt; - struct m_tag *mtag; - - mtag = m_tag_get(PACKET_TAG_DUMMYNET, - sizeof(*dt), M_NOWAIT | M_ZERO); - if (mtag == NULL) - return 1; /* Cannot allocate packet header. */ - m_tag_prepend(m, mtag); /* Attach to mbuf chain. 
*/ - dt = (struct dn_pkt_tag *)(mtag + 1); - dt->rule = fwa->rule; - dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ - dt->dn_dir = dir; - dt->ifp = fwa->oif; - /* dt->output_time is updated as we move through */ - dt->output_time = dn_cfg.curr_time; - return 0; -} - - -/* - * dummynet hook for packets. - * We use the argument to locate the flowset fs and the sched_set sch - * associated to it. Then we apply flow_mask and sched_mask to - * determine the queue and scheduler instances. - * - * dir where shall we send the packet after dummynet. - * *m0 the mbuf with the packet - * ifp the 'ifp' parameter from the caller. - * NULL in ip_input, destination interface in ip_output, - */ -int -dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) -{ - struct mbuf *m = *m0; - struct dn_fsk *fs = NULL; - struct dn_sch_inst *si; - struct dn_queue *q = NULL; /* default */ - - int fs_id = (fwa->rule.info & IPFW_INFO_MASK) + - ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); - DN_BH_WLOCK(); - io_pkt++; - /* we could actually tag outside the lock, but who cares... */ - if (tag_mbuf(m, dir, fwa)) - goto dropit; - if (dn_cfg.busy) { - /* if the upper half is busy doing something expensive, - * let's queue the packet and move forward - */ - mq_append(&dn_cfg.pending, m); - m = *m0 = NULL; /* consumed */ - goto done; /* already active, nothing to do */ - } - /* XXX locate_flowset could be optimised with a direct ref. */ - fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); - if (fs == NULL) - goto dropit; /* This queue/pipe does not exist! */ - if (fs->sched == NULL) /* should not happen */ - goto dropit; - /* find scheduler instance, possibly applying sched_mask */ - si = ipdn_si_find(fs->sched, &(fwa->f_id)); - if (si == NULL) - goto dropit; - /* - * If the scheduler supports multiple queues, find the right one - * (otherwise it will be ignored by enqueue). - */ - if (fs->sched->fp->flags & DN_MULTIQUEUE) { - q = ipdn_q_find(fs, si, &(fwa->f_id)); - if (q == NULL) - goto dropit; - } - if (fs->sched->fp->enqueue(si, q, m)) { - /* packet was dropped by enqueue() */ - m = *m0 = NULL; - goto dropit; - } - - if (si->kflags & DN_ACTIVE) { - m = *m0 = NULL; /* consumed */ - goto done; /* already active, nothing to do */ - } - - /* compute the initial allowance */ - if (si->idle_time < dn_cfg.curr_time) { - /* Do this only on the first packet on an idle pipe */ - struct dn_link *p = &fs->sched->link; - - si->sched_time = dn_cfg.curr_time; - si->credit = dn_cfg.io_fast ? p->bandwidth : 0; - if (p->burst) { - uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; - if (burst > p->burst) - burst = p->burst; - si->credit += burst; - } - } - /* pass through scheduler and delay line */ - m = serve_sched(NULL, si, dn_cfg.curr_time); - - /* optimization -- pass it back to ipfw for immediate send */ - /* XXX Don't call dummynet_send() if the scheduler returns the packet - * just enqueued. This avoids a lock order reversal. - * - */ - if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) { - /* fast io, rename the tag to carry reinject info. 
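The tag dance above gives each packet a two-phase life: PACKET_TAG_DUMMYNET while inside dummynet, then the same m_tag renamed to MTAG_IPFW_RULE so ipfw sees reinject information on the way out. A hedged sketch of how a consumer could test for the second phase (illustrative helper, not in the file):

static int
was_reinjected(struct mbuf *m)
{
	/* after the rename, the tag is found under the ipfw cookie */
	return (m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL) != NULL);
}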
*/ - struct m_tag *tag = m_tag_first(m); - - tag->m_tag_cookie = MTAG_IPFW_RULE; - tag->m_tag_id = 0; - io_pkt_fast++; - if (m->m_nextpkt != NULL) { - printf("dummynet: fast io: pkt chain detected!\n"); - m->m_nextpkt = NULL; - } - m = NULL; - } else { - *m0 = NULL; - } -done: - DN_BH_WUNLOCK(); - if (m) - dummynet_send(m); - return 0; - -dropit: - io_pkt_drop++; - DN_BH_WUNLOCK(); - if (m) - FREE_PKT(m); - *m0 = NULL; - return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS; -} diff --git a/freebsd/sys/netpfil/ipfw/ip_dn_private.h b/freebsd/sys/netpfil/ipfw/ip_dn_private.h index 159ddc9a..2fce1366 100644 --- a/freebsd/sys/netpfil/ipfw/ip_dn_private.h +++ b/freebsd/sys/netpfil/ipfw/ip_dn_private.h @@ -81,8 +81,13 @@ SLIST_HEAD(dn_fsk_head, dn_fsk); SLIST_HEAD(dn_queue_head, dn_queue); SLIST_HEAD(dn_alg_head, dn_alg); +#ifdef NEW_AQM +SLIST_HEAD(dn_aqm_head, dn_aqm); /* for new AQMs */ +#endif + struct mq { /* a basic queue of packets*/ struct mbuf *head, *tail; + int count; }; static inline void @@ -91,7 +96,7 @@ set_oid(struct dn_id *o, int type, int len) o->type = type; o->len = len; o->subtype = 0; -}; +} /* * configuration and global data for a dummynet instance @@ -135,6 +140,9 @@ struct dn_parms { /* list of flowsets without a scheduler -- use sch_chain */ struct dn_fsk_head fsu; /* list of unlinked flowsets */ struct dn_alg_head schedlist; /* list of algorithms */ +#ifdef NEW_AQM + struct dn_aqm_head aqmlist; /* list of AQMs */ +#endif /* Store the fs/sch to scan when draining. The value is the * bucket number of the hash table. Expire can be disabled @@ -231,6 +239,10 @@ struct dn_fsk { /* kernel side of a flowset */ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ int avg_pkt_size ; /* medium packet size */ int max_pkt_size ; /* max packet size */ +#ifdef NEW_AQM + struct dn_aqm *aqmfp; /* Pointer to AQM functions */ + void *aqmcfg; /* configuration parameters for AQM */ +#endif }; /* @@ -253,6 +265,9 @@ struct dn_queue { int count; /* arrivals since last RED drop */ int random; /* random value (scaled) */ uint64_t q_time; /* start of queue idle time */ +#ifdef NEW_AQM + void *aqm_status; /* per-queue status variables*/ +#endif }; @@ -400,4 +415,49 @@ int do_config(void *p, int l); void dn_drain_scheduler(void); void dn_drain_queue(void); +#ifdef NEW_AQM +int ecn_mark(struct mbuf* m); + +/* moved from ip_dn_io.c to here to be available for AQMs modules*/ +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ +#ifdef USERSPACE + // buffers from netmap need to be copied + // XXX note that the routine is not expected to fail + ND("append %p to %p", m, q); + if (m->m_flags & M_STACK) { + struct mbuf *m_new; + void *p; + int l, ofs; + + ofs = m->m_data - m->__m_extbuf; + // XXX allocate + MGETHDR(m_new, M_NOWAIT, MT_DATA); + ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p", + m, m->__m_extbuf, m->__m_extlen, ofs, m_new); + p = m_new->__m_extbuf; /* new pointer */ + l = m_new->__m_extlen; /* new len */ + if (l <= m->__m_extlen) { + panic("extlen too large"); + } + + *m_new = *m; // copy + m_new->m_flags &= ~M_STACK; + m_new->__m_extbuf = p; // point to new buffer + _pkt_copy(m->__m_extbuf, p, m->__m_extlen); + m_new->m_data = p + ofs; + m = m_new; + } +#endif /* USERSPACE */ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->count++; + q->tail = m; + m->m_nextpkt = NULL; +} +#endif /* NEW_AQM */ + #endif /* _IP_DN_PRIVATE_H */ diff --git a/freebsd/sys/netpfil/ipfw/ip_dummynet.c b/freebsd/sys/netpfil/ipfw/ip_dummynet.c deleted 
file mode 100644 index 40c37d80..00000000 --- a/freebsd/sys/netpfil/ipfw/ip_dummynet.c +++ /dev/null @@ -1,2309 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/*- - * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa - * Portions Copyright (c) 2000 Akamba Corp. - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -/* - * Configuration and internal object management for dummynet. - */ - -#include <rtems/bsd/local/opt_inet6.h> - -#include <rtems/bsd/sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/kernel.h> -#include <rtems/bsd/sys/lock.h> -#include <sys/module.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/rwlock.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/time.h> -#include <sys/taskqueue.h> -#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ -#include <netinet/ip_fw.h> -#include <netinet/ip_dummynet.h> - -#include <netpfil/ipfw/ip_fw_private.h> -#include <netpfil/ipfw/dn_heap.h> -#include <netpfil/ipfw/ip_dn_private.h> -#include <netpfil/ipfw/dn_sched.h> - -/* which objects to copy */ -#define DN_C_LINK 0x01 -#define DN_C_SCH 0x02 -#define DN_C_FLOW 0x04 -#define DN_C_FS 0x08 -#define DN_C_QUEUE 0x10 - -/* we use this argument in case of a schk_new */ -struct schk_new_arg { - struct dn_alg *fp; - struct dn_sch *sch; -}; - -/*---- callout hooks. ----*/ -static struct callout dn_timeout; -static struct task dn_task; -static struct taskqueue *dn_tq = NULL; - -static void -dummynet(void * __unused unused) -{ - - taskqueue_enqueue(dn_tq, &dn_task); -} - -void -dn_reschedule(void) -{ - callout_reset(&dn_timeout, 1, dummynet, NULL); -} -/*----- end of callout hooks -----*/ - -/* Return a scheduler descriptor given the type or name. 
*/ -static struct dn_alg * -find_sched_type(int type, char *name) -{ - struct dn_alg *d; - - SLIST_FOREACH(d, &dn_cfg.schedlist, next) { - if (d->type == type || (name && !strcasecmp(d->name, name))) - return d; - } - return NULL; /* not found */ -} - -int -ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) -{ - int oldv = *v; - const char *op = NULL; - if (dflt < lo) - dflt = lo; - if (dflt > hi) - dflt = hi; - if (oldv < lo) { - *v = dflt; - op = "Bump"; - } else if (oldv > hi) { - *v = hi; - op = "Clamp"; - } else - return *v; - if (op && msg) - printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); - return *v; -} - -/*---- flow_id mask, hash and compare functions ---*/ -/* - * The flow_id includes the 5-tuple, the queue/pipe number - * which we store in the extra area in host order, - * and for ipv6 also the flow_id6. - * XXX see if we want the tos byte (can store in 'flags') - */ -static struct ipfw_flow_id * -flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id) -{ - int is_v6 = IS_IP6_FLOW_ID(id); - - id->dst_port &= mask->dst_port; - id->src_port &= mask->src_port; - id->proto &= mask->proto; - id->extra &= mask->extra; - if (is_v6) { - APPLY_MASK(&id->dst_ip6, &mask->dst_ip6); - APPLY_MASK(&id->src_ip6, &mask->src_ip6); - id->flow_id6 &= mask->flow_id6; - } else { - id->dst_ip &= mask->dst_ip; - id->src_ip &= mask->src_ip; - } - return id; -} - -/* computes an OR of two masks, result in dst and also returned */ -static struct ipfw_flow_id * -flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst) -{ - int is_v6 = IS_IP6_FLOW_ID(dst); - - dst->dst_port |= src->dst_port; - dst->src_port |= src->src_port; - dst->proto |= src->proto; - dst->extra |= src->extra; - if (is_v6) { -#define OR_MASK(_d, _s) \ - (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \ - (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \ - (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \ - (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3]; - OR_MASK(&dst->dst_ip6, &src->dst_ip6); - OR_MASK(&dst->src_ip6, &src->src_ip6); -#undef OR_MASK - dst->flow_id6 |= src->flow_id6; - } else { - dst->dst_ip |= src->dst_ip; - dst->src_ip |= src->src_ip; - } - return dst; -} - -static int -nonzero_mask(struct ipfw_flow_id *m) -{ - if (m->dst_port || m->src_port || m->proto || m->extra) - return 1; - if (IS_IP6_FLOW_ID(m)) { - return - m->dst_ip6.__u6_addr.__u6_addr32[0] || - m->dst_ip6.__u6_addr.__u6_addr32[1] || - m->dst_ip6.__u6_addr.__u6_addr32[2] || - m->dst_ip6.__u6_addr.__u6_addr32[3] || - m->src_ip6.__u6_addr.__u6_addr32[0] || - m->src_ip6.__u6_addr.__u6_addr32[1] || - m->src_ip6.__u6_addr.__u6_addr32[2] || - m->src_ip6.__u6_addr.__u6_addr32[3] || - m->flow_id6; - } else { - return m->dst_ip || m->src_ip; - } -} - -/* XXX we may want a better hash function */ -static uint32_t -flow_id_hash(struct ipfw_flow_id *id) -{ - uint32_t i; - - if (IS_IP6_FLOW_ID(id)) { - uint32_t *d = (uint32_t *)&id->dst_ip6; - uint32_t *s = (uint32_t *)&id->src_ip6; - i = (d[0] ) ^ (d[1]) ^ - (d[2] ) ^ (d[3]) ^ - (d[0] >> 15) ^ (d[1] >> 15) ^ - (d[2] >> 15) ^ (d[3] >> 15) ^ - (s[0] << 1) ^ (s[1] << 1) ^ - (s[2] << 1) ^ (s[3] << 1) ^ - (s[0] << 16) ^ (s[1] << 16) ^ - (s[2] << 16) ^ (s[3] << 16) ^ - (id->dst_port << 1) ^ (id->src_port) ^ - (id->extra) ^ - (id->proto ) ^ (id->flow_id6); - } else { - i = (id->dst_ip) ^ (id->dst_ip >> 15) ^ - (id->src_ip << 1) ^ (id->src_ip >> 16) ^ - (id->extra) ^ - (id->dst_port << 1) ^ (id->src_port) ^ (id->proto); - } 
- return i; -} - -/* Like bcmp, returns 0 if ids match, 1 otherwise. */ -static int -flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2) -{ - int is_v6 = IS_IP6_FLOW_ID(id1); - - if (!is_v6) { - if (IS_IP6_FLOW_ID(id2)) - return 1; /* different address families */ - - return (id1->dst_ip == id2->dst_ip && - id1->src_ip == id2->src_ip && - id1->dst_port == id2->dst_port && - id1->src_port == id2->src_port && - id1->proto == id2->proto && - id1->extra == id2->extra) ? 0 : 1; - } - /* the ipv6 case */ - return ( - !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) && - !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) && - id1->dst_port == id2->dst_port && - id1->src_port == id2->src_port && - id1->proto == id2->proto && - id1->extra == id2->extra && - id1->flow_id6 == id2->flow_id6) ? 0 : 1; -} -/*--------- end of flow-id mask, hash and compare ---------*/ - -/*--- support functions for the qht hashtable ---- - * Entries are hashed by flow-id - */ -static uint32_t -q_hash(uintptr_t key, int flags, void *arg) -{ - /* compute the hash slot from the flow id */ - struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? - &((struct dn_queue *)key)->ni.fid : - (struct ipfw_flow_id *)key; - - return flow_id_hash(id); -} - -static int -q_match(void *obj, uintptr_t key, int flags, void *arg) -{ - struct dn_queue *o = (struct dn_queue *)obj; - struct ipfw_flow_id *id2; - - if (flags & DNHT_KEY_IS_OBJ) { - /* compare pointers */ - id2 = &((struct dn_queue *)key)->ni.fid; - } else { - id2 = (struct ipfw_flow_id *)key; - } - return (0 == flow_id_cmp(&o->ni.fid, id2)); -} - -/* - * create a new queue instance for the given 'key'. - */ -static void * -q_new(uintptr_t key, int flags, void *arg) -{ - struct dn_queue *q, *template = arg; - struct dn_fsk *fs = template->fs; - int size = sizeof(*q) + fs->sched->fp->q_datalen; - - q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO); - if (q == NULL) { - D("no memory for new queue"); - return NULL; - } - - set_oid(&q->ni.oid, DN_QUEUE, size); - if (fs->fs.flags & DN_QHT_HASH) - q->ni.fid = *(struct ipfw_flow_id *)key; - q->fs = fs; - q->_si = template->_si; - q->_si->q_count++; - - if (fs->sched->fp->new_queue) - fs->sched->fp->new_queue(q); - dn_cfg.queue_count++; - return q; -} - -/* - * Notify schedulers that a queue is going away. - * If (flags & DN_DESTROY), also free the packets. - * The version for callbacks is called q_delete_cb(). - */ -static void -dn_delete_queue(struct dn_queue *q, int flags) -{ - struct dn_fsk *fs = q->fs; - - // D("fs %p si %p\n", fs, q->_si); - /* notify the parent scheduler that the queue is going away */ - if (fs && fs->sched->fp->free_queue) - fs->sched->fp->free_queue(q); - q->_si->q_count--; - q->_si = NULL; - if (flags & DN_DESTROY) { - if (q->mq.head) - dn_free_pkts(q->mq.head); - bzero(q, sizeof(*q)); // safety - free(q, M_DUMMYNET); - dn_cfg.queue_count--; - } -} - -static int -q_delete_cb(void *q, void *arg) -{ - int flags = (int)(uintptr_t)arg; - dn_delete_queue(q, flags); - return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0; -} - -/* - * calls dn_delete_queue/q_delete_cb on all queues, - * which notifies the parent scheduler and possibly drains packets. 
- * flags & DN_DESTROY: drains queues and destroy qht; - */ -static void -qht_delete(struct dn_fsk *fs, int flags) -{ - ND("fs %d start flags %d qht %p", - fs->fs.fs_nr, flags, fs->qht); - if (!fs->qht) - return; - if (fs->fs.flags & DN_QHT_HASH) { - dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags); - if (flags & DN_DESTROY) { - dn_ht_free(fs->qht, 0); - fs->qht = NULL; - } - } else { - dn_delete_queue((struct dn_queue *)(fs->qht), flags); - if (flags & DN_DESTROY) - fs->qht = NULL; - } -} - -/* - * Find and possibly create the queue for a MULTIQUEUE scheduler. - * We never call it for !MULTIQUEUE (the queue is in the sch_inst). - */ -struct dn_queue * -ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si, - struct ipfw_flow_id *id) -{ - struct dn_queue template; - - template._si = si; - template.fs = fs; - - if (fs->fs.flags & DN_QHT_HASH) { - struct ipfw_flow_id masked_id; - if (fs->qht == NULL) { - fs->qht = dn_ht_init(NULL, fs->fs.buckets, - offsetof(struct dn_queue, q_next), - q_hash, q_match, q_new); - if (fs->qht == NULL) - return NULL; - } - masked_id = *id; - flow_id_mask(&fs->fsk_mask, &masked_id); - return dn_ht_find(fs->qht, (uintptr_t)&masked_id, - DNHT_INSERT, &template); - } else { - if (fs->qht == NULL) - fs->qht = q_new(0, 0, &template); - return (struct dn_queue *)fs->qht; - } -} -/*--- end of queue hash table ---*/ - -/*--- support functions for the sch_inst hashtable ---- - * - * These are hashed by flow-id - */ -static uint32_t -si_hash(uintptr_t key, int flags, void *arg) -{ - /* compute the hash slot from the flow id */ - struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? - &((struct dn_sch_inst *)key)->ni.fid : - (struct ipfw_flow_id *)key; - - return flow_id_hash(id); -} - -static int -si_match(void *obj, uintptr_t key, int flags, void *arg) -{ - struct dn_sch_inst *o = obj; - struct ipfw_flow_id *id2; - - id2 = (flags & DNHT_KEY_IS_OBJ) ? - &((struct dn_sch_inst *)key)->ni.fid : - (struct ipfw_flow_id *)key; - return flow_id_cmp(&o->ni.fid, id2) == 0; -} - -/* - * create a new instance for the given 'key' - * Allocate memory for instance, delay line and scheduler private data. - */ -static void * -si_new(uintptr_t key, int flags, void *arg) -{ - struct dn_schk *s = arg; - struct dn_sch_inst *si; - int l = sizeof(*si) + s->fp->si_datalen; - - si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); - if (si == NULL) - goto error; - - /* Set length only for the part passed up to userland. */ - set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow)); - set_oid(&(si->dline.oid), DN_DELAY_LINE, - sizeof(struct delay_line)); - /* mark si and dline as outside the event queue */ - si->ni.oid.id = si->dline.oid.id = -1; - - si->sched = s; - si->dline.si = si; - - if (s->fp->new_sched && s->fp->new_sched(si)) { - D("new_sched error"); - goto error; - } - if (s->sch.flags & DN_HAVE_MASK) - si->ni.fid = *(struct ipfw_flow_id *)key; - - dn_cfg.si_count++; - return si; - -error: - if (si) { - bzero(si, sizeof(*si)); // safety - free(si, M_DUMMYNET); - } - return NULL; -} - -/* - * Callback from siht to delete all scheduler instances. Remove - * si and delay line from the system heap, destroy all queues. - * We assume that all flowset have been notified and do not - * point to us anymore. 
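The DNHT_KEY_IS_OBJ branching in the hash/match callbacks above lets the table code pass either a bare flow id or an already-stored object through the same function. A hedged sketch of the resulting invariant (illustrative helper, not from the file):

static void
q_hash_equivalence(struct dn_queue *q)
{
	/* hashing the object or its embedded flow id must land in the
	 * same bucket, which is what makes rehashing safe */
	uint32_t a = q_hash((uintptr_t)q, DNHT_KEY_IS_OBJ, NULL);
	uint32_t b = q_hash((uintptr_t)&q->ni.fid, 0, NULL);

	KASSERT(a == b, ("q_hash: object/key disagree"));
}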
- */ -static int -si_destroy(void *_si, void *arg) -{ - struct dn_sch_inst *si = _si; - struct dn_schk *s = si->sched; - struct delay_line *dl = &si->dline; - - if (dl->oid.subtype) /* remove delay line from event heap */ - heap_extract(&dn_cfg.evheap, dl); - dn_free_pkts(dl->mq.head); /* drain delay line */ - if (si->kflags & DN_ACTIVE) /* remove si from event heap */ - heap_extract(&dn_cfg.evheap, si); - if (s->fp->free_sched) - s->fp->free_sched(si); - bzero(si, sizeof(*si)); /* safety */ - free(si, M_DUMMYNET); - dn_cfg.si_count--; - return DNHT_SCAN_DEL; -} - -/* - * Find the scheduler instance for this packet. If we need to apply - * a mask, do on a local copy of the flow_id to preserve the original. - * Assume siht is always initialized if we have a mask. - */ -struct dn_sch_inst * -ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id) -{ - - if (s->sch.flags & DN_HAVE_MASK) { - struct ipfw_flow_id id_t = *id; - flow_id_mask(&s->sch.sched_mask, &id_t); - return dn_ht_find(s->siht, (uintptr_t)&id_t, - DNHT_INSERT, s); - } - if (!s->siht) - s->siht = si_new(0, 0, s); - return (struct dn_sch_inst *)s->siht; -} - -/* callback to flush credit for the scheduler instance */ -static int -si_reset_credit(void *_si, void *arg) -{ - struct dn_sch_inst *si = _si; - struct dn_link *p = &si->sched->link; - - si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0); - return 0; -} - -static void -schk_reset_credit(struct dn_schk *s) -{ - if (s->sch.flags & DN_HAVE_MASK) - dn_ht_scan(s->siht, si_reset_credit, NULL); - else if (s->siht) - si_reset_credit(s->siht, NULL); -} -/*---- end of sch_inst hashtable ---------------------*/ - -/*------------------------------------------------------- - * flowset hash (fshash) support. Entries are hashed by fs_nr. - * New allocations are put in the fsunlinked list, from which - * they are removed when they point to a specific scheduler. - */ -static uint32_t -fsk_hash(uintptr_t key, int flags, void *arg) -{ - uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : - ((struct dn_fsk *)key)->fs.fs_nr; - - return ( (i>>8)^(i>>4)^i ); -} - -static int -fsk_match(void *obj, uintptr_t key, int flags, void *arg) -{ - struct dn_fsk *fs = obj; - int i = !(flags & DNHT_KEY_IS_OBJ) ? key : - ((struct dn_fsk *)key)->fs.fs_nr; - - return (fs->fs.fs_nr == i); -} - -static void * -fsk_new(uintptr_t key, int flags, void *arg) -{ - struct dn_fsk *fs; - - fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO); - if (fs) { - set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs)); - dn_cfg.fsk_count++; - fs->drain_bucket = 0; - SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); - } - return fs; -} - -/* - * detach flowset from its current scheduler. Flags as follows: - * DN_DETACH removes from the fsk_list - * DN_DESTROY deletes individual queues - * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked). - */ -static void -fsk_detach(struct dn_fsk *fs, int flags) -{ - if (flags & DN_DELETE_FS) - flags |= DN_DESTROY; - ND("fs %d from sched %d flags %s %s %s", - fs->fs.fs_nr, fs->fs.sched_nr, - (flags & DN_DELETE_FS) ? "DEL_FS":"", - (flags & DN_DESTROY) ? "DEL":"", - (flags & DN_DETACH) ? "DET":""); - if (flags & DN_DETACH) { /* detach from the list */ - struct dn_fsk_head *h; - h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu; - SLIST_REMOVE(h, fs, dn_fsk, sch_chain); - } - /* Free the RED parameters, they will be recomputed on - * subsequent attach if needed. 
- */ - if (fs->w_q_lookup) - free(fs->w_q_lookup, M_DUMMYNET); - fs->w_q_lookup = NULL; - qht_delete(fs, flags); - if (fs->sched && fs->sched->fp->free_fsk) - fs->sched->fp->free_fsk(fs); - fs->sched = NULL; - if (flags & DN_DELETE_FS) { - bzero(fs, sizeof(*fs)); /* safety */ - free(fs, M_DUMMYNET); - dn_cfg.fsk_count--; - } else { - SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); - } -} - -/* - * Detach or destroy all flowsets in a list. - * flags specifies what to do: - * DN_DESTROY: flush all queues - * DN_DELETE_FS: DN_DESTROY + destroy flowset - * DN_DELETE_FS implies DN_DESTROY - */ -static void -fsk_detach_list(struct dn_fsk_head *h, int flags) -{ - struct dn_fsk *fs; - int n = 0; /* only for stats */ - - ND("head %p flags %x", h, flags); - while ((fs = SLIST_FIRST(h))) { - SLIST_REMOVE_HEAD(h, sch_chain); - n++; - fsk_detach(fs, flags); - } - ND("done %d flowsets", n); -} - -/* - * called on 'queue X delete' -- removes the flowset from fshash, - * deletes all queues for the flowset, and removes the flowset. - */ -static int -delete_fs(int i, int locked) -{ - struct dn_fsk *fs; - int err = 0; - - if (!locked) - DN_BH_WLOCK(); - fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL); - ND("fs %d found %p", i, fs); - if (fs) { - fsk_detach(fs, DN_DETACH | DN_DELETE_FS); - err = 0; - } else - err = EINVAL; - if (!locked) - DN_BH_WUNLOCK(); - return err; -} - -/*----- end of flowset hashtable support -------------*/ - -/*------------------------------------------------------------ - * Scheduler hash. When searching by index we pass sched_nr, - * otherwise we pass struct dn_sch * which is the first field in - * struct dn_schk so we can cast between the two. We use this trick - * because in the create phase we only have a struct dn_sch to - * search on (but it should be fixed). - */ -static uint32_t -schk_hash(uintptr_t key, int flags, void *_arg) -{ - uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : - ((struct dn_schk *)key)->sch.sched_nr; - return ( (i>>8)^(i>>4)^i ); -} - -static int -schk_match(void *obj, uintptr_t key, int flags, void *_arg) -{ - struct dn_schk *s = (struct dn_schk *)obj; - int i = !(flags & DNHT_KEY_IS_OBJ) ? key : - ((struct dn_schk *)key)->sch.sched_nr; - return (s->sch.sched_nr == i); -} - -/* - * Create the entry and initialize with the sched hash if needed. - * Leave s->fp unset so we can tell whether a dn_ht_find() returns - * a new object or a previously existing one. - */ -static void * -schk_new(uintptr_t key, int flags, void *arg) -{ - struct schk_new_arg *a = arg; - struct dn_schk *s; - int l = sizeof(*s) +a->fp->schk_datalen; - - s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); - if (s == NULL) - return NULL; - set_oid(&s->link.oid, DN_LINK, sizeof(s->link)); - s->sch = *a->sch; // copy initial values - s->link.link_nr = s->sch.sched_nr; - SLIST_INIT(&s->fsk_list); - /* initialize the hash table or create the single instance */ - s->fp = a->fp; /* si_new needs this */ - s->drain_bucket = 0; - if (s->sch.flags & DN_HAVE_MASK) { - s->siht = dn_ht_init(NULL, s->sch.buckets, - offsetof(struct dn_sch_inst, si_next), - si_hash, si_match, si_new); - if (s->siht == NULL) { - free(s, M_DUMMYNET); - return NULL; - } - } - s->fp = NULL; /* mark as a new scheduler */ - dn_cfg.schk_count++; - return s; -} - -/* - * Callback for sched delete. Notify all attached flowsets to - * detach from the scheduler, destroy the internal flowset, and - * all instances. The scheduler goes away too. 
- * arg is 0 (only detach flowsets and destroy instances) - * DN_DESTROY (detach & delete queues, delete schk) - * or DN_DELETE_FS (delete queues and flowsets, delete schk) - */ -static int -schk_delete_cb(void *obj, void *arg) -{ - struct dn_schk *s = obj; -#if 0 - int a = (int)arg; - ND("sched %d arg %s%s", - s->sch.sched_nr, - a&DN_DESTROY ? "DEL ":"", - a&DN_DELETE_FS ? "DEL_FS":""); -#endif - fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0); - /* no more flowset pointing to us now */ - if (s->sch.flags & DN_HAVE_MASK) { - dn_ht_scan(s->siht, si_destroy, NULL); - dn_ht_free(s->siht, 0); - } else if (s->siht) - si_destroy(s->siht, NULL); - if (s->profile) { - free(s->profile, M_DUMMYNET); - s->profile = NULL; - } - s->siht = NULL; - if (s->fp->destroy) - s->fp->destroy(s); - bzero(s, sizeof(*s)); // safety - free(obj, M_DUMMYNET); - dn_cfg.schk_count--; - return DNHT_SCAN_DEL; -} - -/* - * called on a 'sched X delete' command. Deletes a single scheduler. - * This is done by removing from the schedhash, unlinking all - * flowsets and deleting their traffic. - */ -static int -delete_schk(int i) -{ - struct dn_schk *s; - - s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); - ND("%d %p", i, s); - if (!s) - return EINVAL; - delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */ - /* then detach flowsets, delete traffic */ - schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY); - return 0; -} -/*--- end of schk hashtable support ---*/ - -static int -copy_obj(char **start, char *end, void *_o, const char *msg, int i) -{ - struct dn_id *o = _o; - int have = end - *start; - - if (have < o->len || o->len == 0 || o->type == 0) { - D("(WARN) type %d %s %d have %d need %d", - o->type, msg, i, have, o->len); - return 1; - } - ND("type %d %s %d len %d", o->type, msg, i, o->len); - bcopy(_o, *start, o->len); - if (o->type == DN_LINK) { - /* Adjust burst parameter for link */ - struct dn_link *l = (struct dn_link *)*start; - l->burst = div64(l->burst, 8 * hz); - l->delay = l->delay * 1000 / hz; - } else if (o->type == DN_SCH) { - /* Set id->id to the number of instances */ - struct dn_schk *s = _o; - struct dn_id *id = (struct dn_id *)(*start); - id->id = (s->sch.flags & DN_HAVE_MASK) ? - dn_ht_entries(s->siht) : (s->siht ? 1 : 0); - } - *start += o->len; - return 0; -} - -/* Specific function to copy a queue. - * Copies only the user-visible part of a queue (which is in - * a struct dn_flow), and sets len accordingly. 
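copy_obj() above undoes, on export, the unit conversions that config_link() applies on input: a link's burst is kept internally as bytes * 8 * hz and its delay as clock ticks, while userland speaks bytes and milliseconds. A round-trip sketch of the arithmetic; the hz value is an assumption for illustration only:

#include <stdio.h>
#include <stdint.h>

#define HZ 1000     /* assumed tick rate for illustration */

int
main(void)
{
    /* userland -> kernel, as in config_link() */
    uint64_t burst_bytes = 10000, delay_ms = 20;
    uint64_t kburst = burst_bytes * 8 * HZ;     /* bytes -> bits * hz */
    uint64_t kdelay = delay_ms * HZ / 1000;     /* ms -> ticks */

    /* kernel -> userland, as in copy_obj() for a DN_LINK */
    printf("burst %llu bytes -> %llu -> %llu bytes\n",
        (unsigned long long)burst_bytes, (unsigned long long)kburst,
        (unsigned long long)(kburst / (8 * HZ)));
    printf("delay %llu ms -> %llu ticks -> %llu ms\n",
        (unsigned long long)delay_ms, (unsigned long long)kdelay,
        (unsigned long long)(kdelay * 1000 / HZ));
    return 0;
}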
- */ -static int -copy_obj_q(char **start, char *end, void *_o, const char *msg, int i) -{ - struct dn_id *o = _o; - int have = end - *start; - int len = sizeof(struct dn_flow); /* see above comment */ - - if (have < len || o->len == 0 || o->type != DN_QUEUE) { - D("ERROR type %d %s %d have %d need %d", - o->type, msg, i, have, len); - return 1; - } - ND("type %d %s %d len %d", o->type, msg, i, len); - bcopy(_o, *start, len); - ((struct dn_id*)(*start))->len = len; - *start += len; - return 0; -} - -static int -copy_q_cb(void *obj, void *arg) -{ - struct dn_queue *q = obj; - struct copy_args *a = arg; - struct dn_flow *ni = (struct dn_flow *)(*a->start); - if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1)) - return DNHT_SCAN_END; - ni->oid.type = DN_FLOW; /* override the DN_QUEUE */ - ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL); - return 0; -} - -static int -copy_q(struct copy_args *a, struct dn_fsk *fs, int flags) -{ - if (!fs->qht) - return 0; - if (fs->fs.flags & DN_QHT_HASH) - dn_ht_scan(fs->qht, copy_q_cb, a); - else - copy_q_cb(fs->qht, a); - return 0; -} - -/* - * This routine only copies the initial part of a profile ? XXX - */ -static int -copy_profile(struct copy_args *a, struct dn_profile *p) -{ - int have = a->end - *a->start; - /* XXX here we check for max length */ - int profile_len = sizeof(struct dn_profile) - - ED_MAX_SAMPLES_NO*sizeof(int); - - if (p == NULL) - return 0; - if (have < profile_len) { - D("error have %d need %d", have, profile_len); - return 1; - } - bcopy(p, *a->start, profile_len); - ((struct dn_id *)(*a->start))->len = profile_len; - *a->start += profile_len; - return 0; -} - -static int -copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags) -{ - struct dn_fs *ufs = (struct dn_fs *)(*a->start); - if (!fs) - return 0; - ND("flowset %d", fs->fs.fs_nr); - if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr)) - return DNHT_SCAN_END; - ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ? - dn_ht_entries(fs->qht) : (fs->qht ? 
1 : 0); - if (flags) { /* copy queues */ - copy_q(a, fs, 0); - } - return 0; -} - -static int -copy_si_cb(void *obj, void *arg) -{ - struct dn_sch_inst *si = obj; - struct copy_args *a = arg; - struct dn_flow *ni = (struct dn_flow *)(*a->start); - if (copy_obj(a->start, a->end, &si->ni, "inst", - si->sched->sch.sched_nr)) - return DNHT_SCAN_END; - ni->oid.type = DN_FLOW; /* override the DN_SCH_I */ - ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL); - return 0; -} - -static int -copy_si(struct copy_args *a, struct dn_schk *s, int flags) -{ - if (s->sch.flags & DN_HAVE_MASK) - dn_ht_scan(s->siht, copy_si_cb, a); - else if (s->siht) - copy_si_cb(s->siht, a); - return 0; -} - -/* - * compute a list of children of a scheduler and copy up - */ -static int -copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags) -{ - struct dn_fsk *fs; - struct dn_id *o; - uint32_t *p; - - int n = 0, space = sizeof(*o); - SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { - if (fs->fs.fs_nr < DN_MAX_ID) - n++; - } - space += n * sizeof(uint32_t); - DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n); - if (a->end - *(a->start) < space) - return DNHT_SCAN_END; - o = (struct dn_id *)(*(a->start)); - o->len = space; - *a->start += o->len; - o->type = DN_TEXT; - p = (uint32_t *)(o+1); - SLIST_FOREACH(fs, &s->fsk_list, sch_chain) - if (fs->fs.fs_nr < DN_MAX_ID) - *p++ = fs->fs.fs_nr; - return 0; -} - -static int -copy_data_helper(void *_o, void *_arg) -{ - struct copy_args *a = _arg; - uint32_t *r = a->extra->r; /* start of first range */ - uint32_t *lim; /* first invalid pointer */ - int n; - - lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len); - - if (a->type == DN_LINK || a->type == DN_SCH) { - /* pipe|sched show, we receive a dn_schk */ - struct dn_schk *s = _o; - - n = s->sch.sched_nr; - if (a->type == DN_SCH && n >= DN_MAX_ID) - return 0; /* not a scheduler */ - if (a->type == DN_LINK && n <= DN_MAX_ID) - return 0; /* not a pipe */ - - /* see if the object is within one of our ranges */ - for (;r < lim; r += 2) { - if (n < r[0] || n > r[1]) - continue; - /* Found a valid entry, copy and we are done */ - if (a->flags & DN_C_LINK) { - if (copy_obj(a->start, a->end, - &s->link, "link", n)) - return DNHT_SCAN_END; - if (copy_profile(a, s->profile)) - return DNHT_SCAN_END; - if (copy_flowset(a, s->fs, 0)) - return DNHT_SCAN_END; - } - if (a->flags & DN_C_SCH) { - if (copy_obj(a->start, a->end, - &s->sch, "sched", n)) - return DNHT_SCAN_END; - /* list all attached flowsets */ - if (copy_fsk_list(a, s, 0)) - return DNHT_SCAN_END; - } - if (a->flags & DN_C_FLOW) - copy_si(a, s, 0); - break; - } - } else if (a->type == DN_FS) { - /* queue show, skip internal flowsets */ - struct dn_fsk *fs = _o; - - n = fs->fs.fs_nr; - if (n >= DN_MAX_ID) - return 0; - /* see if the object is within one of our ranges */ - for (;r < lim; r += 2) { - if (n < r[0] || n > r[1]) - continue; - if (copy_flowset(a, fs, 0)) - return DNHT_SCAN_END; - copy_q(a, fs, 0); - break; /* we are done */ - } - } - return 0; -} - -static inline struct dn_schk * -locate_scheduler(int i) -{ - return dn_ht_find(dn_cfg.schedhash, i, 0, NULL); -} - -/* - * red parameters are in fixed point arithmetic. - */ -static int -config_red(struct dn_fsk *fs) -{ - int64_t s, idle, weight, w0; - int t, i; - - fs->w_q = fs->fs.w_q; - fs->max_p = fs->fs.max_p; - ND("called"); - /* Doing stuff that was in userland */ - i = fs->sched->link.bandwidth; - s = (i <= 0) ? 
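copy_data_helper() above filters objects through the {low, high} pairs that follow the request header. A sketch of just that range test, with a hand-built pair array standing in for struct copy_range:

#include <stdio.h>
#include <stdint.h>

/* the {low, high} pair scan from copy_data_helper(), isolated */
static int
in_ranges(uint32_t n, const uint32_t *r, const uint32_t *lim)
{
    for (; r < lim; r += 2)
        if (n >= r[0] && n <= r[1])
            return (1);
    return (0);
}

int
main(void)
{
    uint32_t ranges[] = { 1, 10, 100, 199 };    /* two ranges */
    const uint32_t *lim = ranges + 4;

    printf("7 in ranges: %d\n", in_ranges(7, ranges, lim));
    printf("50 in ranges: %d\n", in_ranges(50, ranges, lim));
    return 0;
}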
0 : - hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i; - - idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */ - fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth); - /* fs->lookup_step not scaled, */ - if (!fs->lookup_step) - fs->lookup_step = 1; - w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled - - for (t = fs->lookup_step; t > 1; --t) - weight = SCALE_MUL(weight, w0); - fs->lookup_weight = (int)(weight); // scaled - - /* Now doing stuff that was in kerneland */ - fs->min_th = SCALE(fs->fs.min_th); - fs->max_th = SCALE(fs->fs.max_th); - - fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th); - fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th)); - - if (fs->fs.flags & DN_IS_GENTLE_RED) { - fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th; - fs->c_4 = SCALE(1) - 2 * fs->max_p; - } - - /* If the lookup table already exist, free and create it again. */ - if (fs->w_q_lookup) { - free(fs->w_q_lookup, M_DUMMYNET); - fs->w_q_lookup = NULL; - } - if (dn_cfg.red_lookup_depth == 0) { - printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth" - "must be > 0\n"); - fs->fs.flags &= ~DN_IS_RED; - fs->fs.flags &= ~DN_IS_GENTLE_RED; - return (EINVAL); - } - fs->lookup_depth = dn_cfg.red_lookup_depth; - fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int), - M_DUMMYNET, M_NOWAIT); - if (fs->w_q_lookup == NULL) { - printf("dummynet: sorry, cannot allocate red lookup table\n"); - fs->fs.flags &= ~DN_IS_RED; - fs->fs.flags &= ~DN_IS_GENTLE_RED; - return(ENOSPC); - } - - /* Fill the lookup table with (1 - w_q)^x */ - fs->w_q_lookup[0] = SCALE(1) - fs->w_q; - - for (i = 1; i < fs->lookup_depth; i++) - fs->w_q_lookup[i] = - SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight); - - if (dn_cfg.red_avg_pkt_size < 1) - dn_cfg.red_avg_pkt_size = 512; - fs->avg_pkt_size = dn_cfg.red_avg_pkt_size; - if (dn_cfg.red_max_pkt_size < 1) - dn_cfg.red_max_pkt_size = 1500; - fs->max_pkt_size = dn_cfg.red_max_pkt_size; - ND("exit"); - return 0; -} - -/* Scan all flowset attached to this scheduler and update red */ -static void -update_red(struct dn_schk *s) -{ - struct dn_fsk *fs; - SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { - if (fs && (fs->fs.flags & DN_IS_RED)) - config_red(fs); - } -} - -/* attach flowset to scheduler s, possibly requeue */ -static void -fsk_attach(struct dn_fsk *fs, struct dn_schk *s) -{ - ND("remove fs %d from fsunlinked, link to sched %d", - fs->fs.fs_nr, s->sch.sched_nr); - SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain); - fs->sched = s; - SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain); - if (s->fp->new_fsk) - s->fp->new_fsk(fs); - /* XXX compute fsk_mask */ - fs->fsk_mask = fs->fs.flow_mask; - if (fs->sched->sch.flags & DN_HAVE_MASK) - flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask); - if (fs->qht) { - /* - * we must drain qht according to the old - * type, and reinsert according to the new one. - * The requeue is complex -- in general we need to - * reclassify every single packet. - * For the time being, let's hope qht is never set - * when we reach this point. - */ - D("XXX TODO requeue from fs %d to sch %d", - fs->fs.fs_nr, s->sch.sched_nr); - fs->qht = NULL; - } - /* set the new type for qht */ - if (nonzero_mask(&fs->fsk_mask)) - fs->fs.flags |= DN_QHT_HASH; - else - fs->fs.flags &= ~DN_QHT_HASH; - - /* XXX config_red() can fail... 
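config_red() above precomputes powers of (1 - w_q) in fixed point so the forwarding path never touches floating point. A standalone sketch of how such a table decays, assuming a 16-bit shift for the SCALE macros and lookup_step == 1 for simplicity (the real code raises the weight to lookup_step per slot):

#include <stdio.h>
#include <stdint.h>

#define FP_BITS 16      /* assumed precision of the SCALE macros */
#define SCALE(x)        ((int64_t)(x) << FP_BITS)
#define SCALE_MUL(a, b) (((a) * (b)) >> FP_BITS)

int
main(void)
{
    double w_q = 0.002;                 /* RED queue weight */
    int64_t w_q_fp = (int64_t)(w_q * SCALE(1));
    int64_t w0 = SCALE(1) - w_q_fp;     /* (1 - w_q), scaled */
    int64_t lut[8];

    /* lut[i] = (1 - w_q)^(i+1), as w_q_lookup[] is filled above */
    lut[0] = w0;
    for (int i = 1; i < 8; i++)
        lut[i] = SCALE_MUL(lut[i - 1], w0);
    for (int i = 0; i < 8; i++)
        printf("(1-w_q)^%d ~ %.6f\n", i + 1, (double)lut[i] / SCALE(1));
    return 0;
}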
*/ - if (fs->fs.flags & DN_IS_RED) - config_red(fs); -} - -/* update all flowsets which may refer to this scheduler */ -static void -update_fs(struct dn_schk *s) -{ - struct dn_fsk *fs, *tmp; - - SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) { - if (s->sch.sched_nr != fs->fs.sched_nr) { - D("fs %d for sch %d not %d still unlinked", - fs->fs.fs_nr, fs->fs.sched_nr, - s->sch.sched_nr); - continue; - } - fsk_attach(fs, s); - } -} - -/* - * Configuration -- to preserve backward compatibility we use - * the following scheme (N is 65536) - * NUMBER SCHED LINK FLOWSET - * 1 .. N-1 (1)WFQ (2)WFQ (3)queue - * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1 - * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1 - * - * "pipe i config" configures #1, #2 and #3 - * "sched i config" configures #1 and possibly #6 - * "queue i config" configures #3 - * #1 is configured with 'pipe i config' or 'sched i config' - * #2 is configured with 'pipe i config', and created if not - * existing with 'sched i config' - * #3 is configured with 'queue i config' - * #4 is automatically configured after #1, can only be FIFO - * #5 is automatically configured after #2 - * #6 is automatically created when #1 is !MULTIQUEUE, - * and can be updated. - * #7 is automatically configured after #2 - */ - -/* - * configure a link (and its FIFO instance) - */ -static int -config_link(struct dn_link *p, struct dn_id *arg) -{ - int i; - - if (p->oid.len != sizeof(*p)) { - D("invalid pipe len %d", p->oid.len); - return EINVAL; - } - i = p->link_nr; - if (i <= 0 || i >= DN_MAX_ID) - return EINVAL; - /* - * The config program passes parameters as follows: - * bw = bits/second (0 means no limits), - * delay = ms, must be translated into ticks. - * qsize = slots/bytes - * burst ??? - */ - p->delay = (p->delay * hz) / 1000; - /* Scale burst size: bytes -> bits * hz */ - p->burst *= 8 * hz; - - DN_BH_WLOCK(); - /* do it twice, base link and FIFO link */ - for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { - struct dn_schk *s = locate_scheduler(i); - if (s == NULL) { - DN_BH_WUNLOCK(); - D("sched %d not found", i); - return EINVAL; - } - /* remove profile if exists */ - if (s->profile) { - free(s->profile, M_DUMMYNET); - s->profile = NULL; - } - /* copy all parameters */ - s->link.oid = p->oid; - s->link.link_nr = i; - s->link.delay = p->delay; - if (s->link.bandwidth != p->bandwidth) { - /* XXX bandwidth changes, need to update red params */ - s->link.bandwidth = p->bandwidth; - update_red(s); - } - s->link.burst = p->burst; - schk_reset_credit(s); - } - dn_cfg.id++; - DN_BH_WUNLOCK(); - return 0; -} - -/* - * configure a flowset. 
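To make the numbering scheme above concrete, here is how the derived object numbers fall out for one pipe. A small sketch using the N = 65536 stated in the comment (DN_MAX_ID); output formatting is illustrative only:

#include <stdio.h>

#define DN_MAX_ID 65536     /* the "N" in the scheme above */

int
main(void)
{
    int i = 100;    /* "ipfw pipe 100 config ..." */

    printf("sched/link:              %d\n", i);
    printf("internal fs of %d:       %d\n", i, i + DN_MAX_ID);
    printf("FIFO sched/link:         %d\n", i + DN_MAX_ID);
    printf("internal fs of FIFO %d:  %d\n", i + DN_MAX_ID,
        i + 2 * DN_MAX_ID);
    return 0;
}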
Can be called from inside with locked=1, - */ -static struct dn_fsk * -config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) -{ - int i; - struct dn_fsk *fs; - - if (nfs->oid.len != sizeof(*nfs)) { - D("invalid flowset len %d", nfs->oid.len); - return NULL; - } - i = nfs->fs_nr; - if (i <= 0 || i >= 3*DN_MAX_ID) - return NULL; - ND("flowset %d", i); - /* XXX other sanity checks */ - if (nfs->flags & DN_QSIZE_BYTES) { - ipdn_bound_var(&nfs->qsize, 16384, - 1500, dn_cfg.byte_limit, NULL); // "queue byte size"); - } else { - ipdn_bound_var(&nfs->qsize, 50, - 1, dn_cfg.slot_limit, NULL); // "queue slot size"); - } - if (nfs->flags & DN_HAVE_MASK) { - /* make sure we have some buckets */ - ipdn_bound_var(&nfs->buckets, dn_cfg.hash_size, - 1, dn_cfg.max_hash_size, "flowset buckets"); - } else { - nfs->buckets = 1; /* we only need 1 */ - } - if (!locked) - DN_BH_WLOCK(); - do { /* exit with break when done */ - struct dn_schk *s; - int flags = nfs->sched_nr ? DNHT_INSERT : 0; - int j; - int oldc = dn_cfg.fsk_count; - fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL); - if (fs == NULL) { - D("missing sched for flowset %d", i); - break; - } - /* grab some defaults from the existing one */ - if (nfs->sched_nr == 0) /* reuse */ - nfs->sched_nr = fs->fs.sched_nr; - for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) { - if (nfs->par[j] == -1) /* reuse */ - nfs->par[j] = fs->fs.par[j]; - } - if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { - ND("flowset %d unchanged", i); - break; /* no change, nothing to do */ - } - if (oldc != dn_cfg.fsk_count) /* new item */ - dn_cfg.id++; - s = locate_scheduler(nfs->sched_nr); - /* detach from old scheduler if needed, preserving - * queues if we need to reattach. Then update the - * configuration, and possibly attach to the new sched. - */ - DX(2, "fs %d changed sched %d@%p to %d@%p", - fs->fs.fs_nr, - fs->fs.sched_nr, fs->sched, nfs->sched_nr, s); - if (fs->sched) { - int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY); - flags |= DN_DESTROY; /* XXX temporary */ - fsk_detach(fs, flags); - } - fs->fs = *nfs; /* copy configuration */ - if (s != NULL) - fsk_attach(fs, s); - } while (0); - if (!locked) - DN_BH_WUNLOCK(); - return fs; -} - -/* - * config/reconfig a scheduler and its FIFO variant. - * For !MULTIQUEUE schedulers, also set up the flowset. - * - * On reconfigurations (detected because s->fp is set), - * detach existing flowsets preserving traffic, preserve link, - * and delete the old scheduler creating a new one. - */ -static int -config_sched(struct dn_sch *_nsch, struct dn_id *arg) -{ - struct dn_schk *s; - struct schk_new_arg a; /* argument for schk_new */ - int i; - struct dn_link p; /* copy of oldlink */ - struct dn_profile *pf = NULL; /* copy of old link profile */ - /* Used to preserv mask parameter */ - struct ipfw_flow_id new_mask; - int new_buckets = 0; - int new_flags = 0; - int pipe_cmd; - int err = ENOMEM; - - a.sch = _nsch; - if (a.sch->oid.len != sizeof(*a.sch)) { - D("bad sched len %d", a.sch->oid.len); - return EINVAL; - } - i = a.sch->sched_nr; - if (i <= 0 || i >= DN_MAX_ID) - return EINVAL; - /* make sure we have some buckets */ - if (a.sch->flags & DN_HAVE_MASK) - ipdn_bound_var(&a.sch->buckets, dn_cfg.hash_size, - 1, dn_cfg.max_hash_size, "sched buckets"); - /* XXX other sanity checks */ - bzero(&p, sizeof(p)); - - pipe_cmd = a.sch->flags & DN_PIPE_CMD; - a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set? 
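config_fs() above clamps qsize and the bucket count through ipdn_bound_var(). A sketch of the assumed fall-back-to-default semantics; the kernel helper also logs via the message argument, which is omitted here:

#include <stdio.h>

/* assumed semantics: out-of-range values fall back to a default */
static int
bound_var(int *v, int dflt, int lo, int hi)
{
    if (*v < lo || *v > hi)
        *v = dflt;
    return (*v);
}

int
main(void)
{
    int qsize = 0;      /* invalid: a queue needs at least one slot */

    bound_var(&qsize, 50, 1, 100);      /* cf. the qsize clamp above */
    printf("qsize -> %d slots\n", qsize);
    return 0;
}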
- if (pipe_cmd) { - /* Copy mask parameter */ - new_mask = a.sch->sched_mask; - new_buckets = a.sch->buckets; - new_flags = a.sch->flags; - } - DN_BH_WLOCK(); -again: /* run twice, for wfq and fifo */ - /* - * lookup the type. If not supplied, use the previous one - * or default to WF2Q+. Otherwise, return an error. - */ - dn_cfg.id++; - a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name); - if (a.fp != NULL) { - /* found. Lookup or create entry */ - s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a); - } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) { - /* No type. search existing s* or retry with WF2Q+ */ - s = dn_ht_find(dn_cfg.schedhash, i, 0, &a); - if (s != NULL) { - a.fp = s->fp; - /* Scheduler exists, skip to FIFO scheduler - * if command was pipe config... - */ - if (pipe_cmd) - goto next; - } else { - /* New scheduler, create a wf2q+ with no mask - * if command was pipe config... - */ - if (pipe_cmd) { - /* clear mask parameter */ - bzero(&a.sch->sched_mask, sizeof(new_mask)); - a.sch->buckets = 0; - a.sch->flags &= ~DN_HAVE_MASK; - } - a.sch->oid.subtype = DN_SCHED_WF2QP; - goto again; - } - } else { - D("invalid scheduler type %d %s", - a.sch->oid.subtype, a.sch->name); - err = EINVAL; - goto error; - } - /* normalize name and subtype */ - a.sch->oid.subtype = a.fp->type; - bzero(a.sch->name, sizeof(a.sch->name)); - strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name)); - if (s == NULL) { - D("cannot allocate scheduler %d", i); - goto error; - } - /* restore existing link if any */ - if (p.link_nr) { - s->link = p; - if (!pf || pf->link_nr != p.link_nr) { /* no saved value */ - s->profile = NULL; /* XXX maybe not needed */ - } else { - s->profile = malloc(sizeof(struct dn_profile), - M_DUMMYNET, M_NOWAIT | M_ZERO); - if (s->profile == NULL) { - D("cannot allocate profile"); - goto error; //XXX - } - bcopy(pf, s->profile, sizeof(*pf)); - } - } - p.link_nr = 0; - if (s->fp == NULL) { - DX(2, "sched %d new type %s", i, a.fp->name); - } else if (s->fp != a.fp || - bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) { - /* already existing. */ - DX(2, "sched %d type changed from %s to %s", - i, s->fp->name, a.fp->name); - DX(4, " type/sub %d/%d -> %d/%d", - s->sch.oid.type, s->sch.oid.subtype, - a.sch->oid.type, a.sch->oid.subtype); - if (s->link.link_nr == 0) - D("XXX WARNING link 0 for sched %d", i); - p = s->link; /* preserve link */ - if (s->profile) {/* preserve profile */ - if (!pf) - pf = malloc(sizeof(*pf), - M_DUMMYNET, M_NOWAIT | M_ZERO); - if (pf) /* XXX should issue a warning otherwise */ - bcopy(s->profile, pf, sizeof(*pf)); - } - /* remove from the hash */ - dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); - /* Detach flowsets, preserve queues. 
*/ - // schk_delete_cb(s, NULL); - // XXX temporarily, kill queues - schk_delete_cb(s, (void *)DN_DESTROY); - goto again; - } else { - DX(4, "sched %d unchanged type %s", i, a.fp->name); - } - /* complete initialization */ - s->sch = *a.sch; - s->fp = a.fp; - s->cfg = arg; - // XXX schk_reset_credit(s); - /* create the internal flowset if needed, - * trying to reuse existing ones if available - */ - if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) { - s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL); - if (!s->fs) { - struct dn_fs fs; - bzero(&fs, sizeof(fs)); - set_oid(&fs.oid, DN_FS, sizeof(fs)); - fs.fs_nr = i + DN_MAX_ID; - fs.sched_nr = i; - s->fs = config_fs(&fs, NULL, 1 /* locked */); - } - if (!s->fs) { - schk_delete_cb(s, (void *)DN_DESTROY); - D("error creating internal fs for %d", i); - goto error; - } - } - /* call init function after the flowset is created */ - if (s->fp->config) - s->fp->config(s); - update_fs(s); -next: - if (i < DN_MAX_ID) { /* now configure the FIFO instance */ - i += DN_MAX_ID; - if (pipe_cmd) { - /* Restore mask parameter for FIFO */ - a.sch->sched_mask = new_mask; - a.sch->buckets = new_buckets; - a.sch->flags = new_flags; - } else { - /* sched config shouldn't modify the FIFO scheduler */ - if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) { - /* FIFO already exist, don't touch it */ - err = 0; /* and this is not an error */ - goto error; - } - } - a.sch->sched_nr = i; - a.sch->oid.subtype = DN_SCHED_FIFO; - bzero(a.sch->name, sizeof(a.sch->name)); - goto again; - } - err = 0; -error: - DN_BH_WUNLOCK(); - if (pf) - free(pf, M_DUMMYNET); - return err; -} - -/* - * attach a profile to a link - */ -static int -config_profile(struct dn_profile *pf, struct dn_id *arg) -{ - struct dn_schk *s; - int i, olen, err = 0; - - if (pf->oid.len < sizeof(*pf)) { - D("short profile len %d", pf->oid.len); - return EINVAL; - } - i = pf->link_nr; - if (i <= 0 || i >= DN_MAX_ID) - return EINVAL; - /* XXX other sanity checks */ - DN_BH_WLOCK(); - for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { - s = locate_scheduler(i); - - if (s == NULL) { - err = EINVAL; - break; - } - dn_cfg.id++; - /* - * If we had a profile and the new one does not fit, - * or it is deleted, then we need to free memory. - */ - if (s->profile && (pf->samples_no == 0 || - s->profile->oid.len < pf->oid.len)) { - free(s->profile, M_DUMMYNET); - s->profile = NULL; - } - if (pf->samples_no == 0) - continue; - /* - * new profile, possibly allocate memory - * and copy data. - */ - if (s->profile == NULL) - s->profile = malloc(pf->oid.len, - M_DUMMYNET, M_NOWAIT | M_ZERO); - if (s->profile == NULL) { - D("no memory for profile %d", i); - err = ENOMEM; - break; - } - /* preserve larger length XXX double check */ - olen = s->profile->oid.len; - if (olen < pf->oid.len) - olen = pf->oid.len; - bcopy(pf, s->profile, pf->oid.len); - s->profile->oid.len = olen; - } - DN_BH_WUNLOCK(); - return err; -} - -/* - * Delete all objects: - */ -static void -dummynet_flush(void) -{ - - /* delete all schedulers and related links/queues/flowsets */ - dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, - (void *)(uintptr_t)DN_DELETE_FS); - /* delete all remaining (unlinked) flowsets */ - DX(4, "still %d unlinked fs", dn_cfg.fsk_count); - dn_ht_free(dn_cfg.fshash, DNHT_REMOVE); - fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS); - /* Reinitialize system heap... */ - heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); -} - -/* - * Main handler for configuration. We are guaranteed to be called - * with an oid which is at least a dn_id. 
- * - the first object is the command (config, delete, flush, ...) - * - config_link must be issued after the corresponding config_sched - * - parameters (DN_TXT) for an object must preceed the object - * processed on a config_sched. - */ -int -do_config(void *p, int l) -{ - struct dn_id *next, *o; - int err = 0, err2 = 0; - struct dn_id *arg = NULL; - uintptr_t *a; - - o = p; - if (o->id != DN_API_VERSION) { - D("invalid api version got %d need %d", - o->id, DN_API_VERSION); - return EINVAL; - } - for (; l >= sizeof(*o); o = next) { - struct dn_id *prev = arg; - if (o->len < sizeof(*o) || l < o->len) { - D("bad len o->len %d len %d", o->len, l); - err = EINVAL; - break; - } - l -= o->len; - next = (struct dn_id *)((char *)o + o->len); - err = 0; - switch (o->type) { - default: - D("cmd %d not implemented", o->type); - break; - -#ifdef EMULATE_SYSCTL - /* sysctl emulation. - * if we recognize the command, jump to the correct - * handler and return - */ - case DN_SYSCTL_SET: - err = kesysctl_emu_set(p, l); - return err; -#endif - - case DN_CMD_CONFIG: /* simply a header */ - break; - - case DN_CMD_DELETE: - /* the argument is in the first uintptr_t after o */ - a = (uintptr_t *)(o+1); - if (o->len < sizeof(*o) + sizeof(*a)) { - err = EINVAL; - break; - } - switch (o->subtype) { - case DN_LINK: - /* delete base and derived schedulers */ - DN_BH_WLOCK(); - err = delete_schk(*a); - err2 = delete_schk(*a + DN_MAX_ID); - DN_BH_WUNLOCK(); - if (!err) - err = err2; - break; - - default: - D("invalid delete type %d", - o->subtype); - err = EINVAL; - break; - - case DN_FS: - err = (*a <1 || *a >= DN_MAX_ID) ? - EINVAL : delete_fs(*a, 0) ; - break; - } - break; - - case DN_CMD_FLUSH: - DN_BH_WLOCK(); - dummynet_flush(); - DN_BH_WUNLOCK(); - break; - case DN_TEXT: /* store argument the next block */ - prev = NULL; - arg = o; - break; - case DN_LINK: - err = config_link((struct dn_link *)o, arg); - break; - case DN_PROFILE: - err = config_profile((struct dn_profile *)o, arg); - break; - case DN_SCH: - err = config_sched((struct dn_sch *)o, arg); - break; - case DN_FS: - err = (NULL==config_fs((struct dn_fs *)o, arg, 0)); - break; - } - if (prev) - arg = NULL; - if (err != 0) - break; - } - return err; -} - -static int -compute_space(struct dn_id *cmd, struct copy_args *a) -{ - int x = 0, need = 0; - int profile_size = sizeof(struct dn_profile) - - ED_MAX_SAMPLES_NO*sizeof(int); - - /* NOTE about compute space: - * NP = dn_cfg.schk_count - * NSI = dn_cfg.si_count - * NF = dn_cfg.fsk_count - * NQ = dn_cfg.queue_count - * - ipfw pipe show - * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler - * link, scheduler template, flowset - * integrated in scheduler and header - * for flowset list - * (NSI)*(dn_flow) all scheduler instance (includes - * the queue instance) - * - ipfw sched show - * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler - * link, scheduler template, flowset - * integrated in scheduler and header - * for flowset list - * (NSI * dn_flow) all scheduler instances - * (NF * sizeof(uint_32)) space for flowset list linked to scheduler - * (NQ * dn_queue) all queue [XXXfor now not listed] - * - ipfw queue show - * (NF * dn_fs) all flowset - * (NQ * dn_queue) all queues - */ - switch (cmd->subtype) { - default: - return -1; - /* XXX where do LINK and SCH differ ? */ - /* 'ipfw sched show' could list all queues associated to - * a scheduler. 
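do_config() above treats its input as a stream of length-prefixed objects and steps through them by o->len. A minimal userland walk of such a stream; the 8-byte header below is a stand-in for struct dn_id, kept only close enough to show the traversal and the length sanity check:

#include <stdio.h>
#include <stdint.h>

/* stand-in header, close enough to struct dn_id for the traversal */
struct hdr {
    uint16_t len;       /* total object length, header included */
    uint8_t  type;
    uint8_t  subtype;
    uint32_t id;
};

static void
walk(void *p, int l)
{
    struct hdr *o, *next;

    for (o = p; l >= (int)sizeof(*o); o = next) {
        if (o->len < sizeof(*o) || l < o->len) {
            printf("bad len %u, stop\n", o->len);
            return;
        }
        printf("object type %u, len %u\n", o->type, o->len);
        l -= o->len;
        next = (struct hdr *)((char *)o + o->len);
    }
}

int
main(void)
{
    uint32_t buf[8] = { 0 };
    struct hdr *o = (struct hdr *)buf;

    o->len = 16; o->type = 1;               /* command + payload */
    o = (struct hdr *)((char *)buf + 16);
    o->len = 16; o->type = 2;               /* following object */
    walk(buf, (int)sizeof(buf));
    return 0;
}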
This feature for now is disabled - */ - case DN_LINK: /* pipe show */ - x = DN_C_LINK | DN_C_SCH | DN_C_FLOW; - need += dn_cfg.schk_count * - (sizeof(struct dn_fs) + profile_size) / 2; - need += dn_cfg.fsk_count * sizeof(uint32_t); - break; - case DN_SCH: /* sched show */ - need += dn_cfg.schk_count * - (sizeof(struct dn_fs) + profile_size) / 2; - need += dn_cfg.fsk_count * sizeof(uint32_t); - x = DN_C_SCH | DN_C_LINK | DN_C_FLOW; - break; - case DN_FS: /* queue show */ - x = DN_C_FS | DN_C_QUEUE; - break; - case DN_GET_COMPAT: /* compatibility mode */ - need = dn_compat_calc_size(); - break; - } - a->flags = x; - if (x & DN_C_SCH) { - need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2; - /* NOT also, each fs might be attached to a sched */ - need += dn_cfg.schk_count * sizeof(struct dn_id) / 2; - } - if (x & DN_C_FS) - need += dn_cfg.fsk_count * sizeof(struct dn_fs); - if (x & DN_C_LINK) { - need += dn_cfg.schk_count * sizeof(struct dn_link) / 2; - } - /* - * When exporting a queue to userland, only pass up the - * struct dn_flow, which is the only visible part. - */ - - if (x & DN_C_QUEUE) - need += dn_cfg.queue_count * sizeof(struct dn_flow); - if (x & DN_C_FLOW) - need += dn_cfg.si_count * (sizeof(struct dn_flow)); - return need; -} - -/* - * If compat != NULL dummynet_get is called in compatibility mode. - * *compat will be the pointer to the buffer to pass to ipfw - */ -int -dummynet_get(struct sockopt *sopt, void **compat) -{ - int have, i, need, error; - char *start = NULL, *buf; - size_t sopt_valsize; - struct dn_id *cmd; - struct copy_args a; - struct copy_range r; - int l = sizeof(struct dn_id); - - bzero(&a, sizeof(a)); - bzero(&r, sizeof(r)); - - /* save and restore original sopt_valsize around copyin */ - sopt_valsize = sopt->sopt_valsize; - - cmd = &r.o; - - if (!compat) { - /* copy at least an oid, and possibly a full object */ - error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); - sopt->sopt_valsize = sopt_valsize; - if (error) - goto done; - l = cmd->len; -#ifdef EMULATE_SYSCTL - /* sysctl emulation. */ - if (cmd->type == DN_SYSCTL_GET) - return kesysctl_emu_get(sopt); -#endif - if (l > sizeof(r)) { - /* request larger than default, allocate buffer */ - cmd = malloc(l, M_DUMMYNET, M_WAITOK); - error = sooptcopyin(sopt, cmd, l, l); - sopt->sopt_valsize = sopt_valsize; - if (error) - goto done; - } - } else { /* compatibility */ - error = 0; - cmd->type = DN_CMD_GET; - cmd->len = sizeof(struct dn_id); - cmd->subtype = DN_GET_COMPAT; - // cmd->id = sopt_valsize; - D("compatibility mode"); - } - a.extra = (struct copy_range *)cmd; - if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ - uint32_t *rp = (uint32_t *)(cmd + 1); - cmd->len += 2* sizeof(uint32_t); - rp[0] = 1; - rp[1] = DN_MAX_ID - 1; - if (cmd->subtype == DN_LINK) { - rp[0] += DN_MAX_ID; - rp[1] += DN_MAX_ID; - } - } - /* Count space (under lock) and allocate (outside lock). - * Exit with lock held if we manage to get enough buffer. - * Try a few times then give up. 
- */ - for (have = 0, i = 0; i < 10; i++) { - DN_BH_WLOCK(); - need = compute_space(cmd, &a); - - /* if there is a range, ignore value from compute_space() */ - if (l > sizeof(*cmd)) - need = sopt_valsize - sizeof(*cmd); - - if (need < 0) { - DN_BH_WUNLOCK(); - error = EINVAL; - goto done; - } - need += sizeof(*cmd); - cmd->id = need; - if (have >= need) - break; - - DN_BH_WUNLOCK(); - if (start) - free(start, M_DUMMYNET); - start = NULL; - if (need > sopt_valsize) - break; - - have = need; - start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO); - } - - if (start == NULL) { - if (compat) { - *compat = NULL; - error = 1; // XXX - } else { - error = sooptcopyout(sopt, cmd, sizeof(*cmd)); - } - goto done; - } - ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, " - "%d:%d si %d, %d:%d queues %d", - dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH, - dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK, - dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS, - dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I, - dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE); - sopt->sopt_valsize = sopt_valsize; - a.type = cmd->subtype; - - if (compat == NULL) { - bcopy(cmd, start, sizeof(*cmd)); - ((struct dn_id*)(start))->len = sizeof(struct dn_id); - buf = start + sizeof(*cmd); - } else - buf = start; - a.start = &buf; - a.end = start + have; - /* start copying other objects */ - if (compat) { - a.type = DN_COMPAT_PIPE; - dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a); - a.type = DN_COMPAT_QUEUE; - dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a); - } else if (a.type == DN_FS) { - dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a); - } else { - dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a); - } - DN_BH_WUNLOCK(); - - if (compat) { - *compat = start; - sopt->sopt_valsize = buf - start; - /* free() is done by ip_dummynet_compat() */ - start = NULL; //XXX hack - } else { - error = sooptcopyout(sopt, start, buf - start); - } -done: - if (cmd && cmd != &r.o) - free(cmd, M_DUMMYNET); - if (start) - free(start, M_DUMMYNET); - return error; -} - -/* Callback called on scheduler instance to delete it if idle */ -static int -drain_scheduler_cb(void *_si, void *arg) -{ - struct dn_sch_inst *si = _si; - - if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL) - return 0; - - if (si->sched->fp->flags & DN_MULTIQUEUE) { - if (si->q_count == 0) - return si_destroy(si, NULL); - else - return 0; - } else { /* !DN_MULTIQUEUE */ - if ((si+1)->ni.length == 0) - return si_destroy(si, NULL); - else - return 0; - } - return 0; /* unreachable */ -} - -/* Callback called on scheduler to check if it has instances */ -static int -drain_scheduler_sch_cb(void *_s, void *arg) -{ - struct dn_schk *s = _s; - - if (s->sch.flags & DN_HAVE_MASK) { - dn_ht_scan_bucket(s->siht, &s->drain_bucket, - drain_scheduler_cb, NULL); - s->drain_bucket++; - } else { - if (s->siht) { - if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL) - s->siht = NULL; - } - } - return 0; -} - -/* Called every tick, try to delete a 'bucket' of scheduler */ -void -dn_drain_scheduler(void) -{ - dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch, - drain_scheduler_sch_cb, NULL); - dn_cfg.drain_sch++; -} - -/* Callback called on queue to delete if it is idle */ -static int -drain_queue_cb(void *_q, void *arg) -{ - struct dn_queue *q = _q; - - if (q->ni.length == 0) { - dn_delete_queue(q, DN_DESTROY); - return DNHT_SCAN_DEL; /* queue is deleted */ - } - - return 0; /* queue isn't deleted */ -} - -/* Callback called on flowset used to 
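dummynet_get() above sizes the reply under the lock but allocates with M_WAITOK outside it, looping because objects may come and go while the lock is dropped. The shape of that loop, reduced to userland; the lock calls are left as comments and the growing sizes are faked:

#include <stdio.h>
#include <stdlib.h>

/* pretend the object set grew once between lock drops */
static int
compute_need(void)
{
    static const int sizes[] = { 1500, 2000, 2000 };
    static int call;

    return (sizes[call < 2 ? call++ : 2]);
}

int
main(void)
{
    char *buf = NULL;
    int have = 0, need = 0, i;

    for (i = 0; i < 10; i++) {
        /* DN_BH_WLOCK(); */
        need = compute_need();
        if (have >= need)
            break;              /* keep the lock and start copying */
        /* DN_BH_WUNLOCK(); */
        free(buf);
        if ((buf = malloc(need)) == NULL)
            return (1);         /* may sleep, hence outside the lock */
        have = need;
    }
    printf("settled after %d pass(es) with %d bytes\n", i + 1, have);
    free(buf);
    return 0;
}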
check if it has queues */ -static int -drain_queue_fs_cb(void *_fs, void *arg) -{ - struct dn_fsk *fs = _fs; - - if (fs->fs.flags & DN_QHT_HASH) { - /* Flowset has a hash table for queues */ - dn_ht_scan_bucket(fs->qht, &fs->drain_bucket, - drain_queue_cb, NULL); - fs->drain_bucket++; - } else { - /* No hash table for this flowset, null the pointer - * if the queue is deleted - */ - if (fs->qht) { - if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL) - fs->qht = NULL; - } - } - return 0; -} - -/* Called every tick, try to delete a 'bucket' of queue */ -void -dn_drain_queue(void) -{ - /* scan a bucket of flowset */ - dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs, - drain_queue_fs_cb, NULL); - dn_cfg.drain_fs++; -} - -/* - * Handler for the various dummynet socket options - */ -static int -ip_dn_ctl(struct sockopt *sopt) -{ - void *p = NULL; - int error, l; - - error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); - if (error) - return (error); - - /* Disallow sets in really-really secure mode. */ - if (sopt->sopt_dir == SOPT_SET) { - error = securelevel_ge(sopt->sopt_td->td_ucred, 3); - if (error) - return (error); - } - - switch (sopt->sopt_name) { - default : - D("dummynet: unknown option %d", sopt->sopt_name); - error = EINVAL; - break; - - case IP_DUMMYNET_FLUSH: - case IP_DUMMYNET_CONFIGURE: - case IP_DUMMYNET_DEL: /* remove a pipe or queue */ - case IP_DUMMYNET_GET: - D("dummynet: compat option %d", sopt->sopt_name); - error = ip_dummynet_compat(sopt); - break; - - case IP_DUMMYNET3 : - if (sopt->sopt_dir == SOPT_GET) { - error = dummynet_get(sopt, NULL); - break; - } - l = sopt->sopt_valsize; - if (l < sizeof(struct dn_id) || l > 12000) { - D("argument len %d invalid", l); - break; - } - p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ? - error = sooptcopyin(sopt, p, l, l); - if (error) - break ; - error = do_config(p, l); - break; - } - - if (p != NULL) - free(p, M_TEMP); - - return error ; -} - - -static void -ip_dn_init(void) -{ - if (dn_cfg.init_done) - return; - printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet); - dn_cfg.init_done = 1; - /* Set defaults here. MSVC does not accept initializers, - * and this is also useful for vimages - */ - /* queue limits */ - dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */ - dn_cfg.byte_limit = 1024 * 1024; - dn_cfg.expire = 1; - - /* RED parameters */ - dn_cfg.red_lookup_depth = 256; /* default lookup table depth */ - dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */ - dn_cfg.red_max_pkt_size = 1500; /* default max packet size */ - - /* hash tables */ - dn_cfg.max_hash_size = 65536; /* max in the hash tables */ - dn_cfg.hash_size = 64; /* default hash size */ - - /* create hash tables for schedulers and flowsets. - * In both we search by key and by pointer. 
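The dn_drain_scheduler()/dn_drain_queue() routines above bound cleanup cost by visiting one hash bucket per tick through a rotating cursor (drain_sch and drain_fs in dn_cfg). The idea in miniature, with a plain counter standing in for the cursor and the bucket scan:

#include <stdio.h>

#define NBUCKETS 16     /* stand-in for the hash table width */

int
main(void)
{
    unsigned cursor = 0;    /* cf. dn_cfg.drain_sch / drain_fs */

    for (int tick = 0; tick < 5; tick++)
        printf("tick %d: scan bucket %u for idle entries\n",
            tick, cursor++ % NBUCKETS);
    return 0;
}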
- */ - dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size, - offsetof(struct dn_schk, schk_next), - schk_hash, schk_match, schk_new); - dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size, - offsetof(struct dn_fsk, fsk_next), - fsk_hash, fsk_match, fsk_new); - - /* bucket index to drain object */ - dn_cfg.drain_fs = 0; - dn_cfg.drain_sch = 0; - - heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); - SLIST_INIT(&dn_cfg.fsu); - SLIST_INIT(&dn_cfg.schedlist); - - DN_LOCK_INIT(); - - TASK_INIT(&dn_task, 0, dummynet_task, curvnet); - dn_tq = taskqueue_create("dummynet", M_WAITOK, - taskqueue_thread_enqueue, &dn_tq); - taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet"); - - callout_init(&dn_timeout, CALLOUT_MPSAFE); - callout_reset(&dn_timeout, 1, dummynet, NULL); - - /* Initialize curr_time adjustment mechanics. */ - getmicrouptime(&dn_cfg.prev_t); -} - -static void -ip_dn_destroy(int last) -{ - callout_drain(&dn_timeout); - - DN_BH_WLOCK(); - if (last) { - ND("removing last instance\n"); - ip_dn_ctl_ptr = NULL; - ip_dn_io_ptr = NULL; - } - - dummynet_flush(); - DN_BH_WUNLOCK(); - taskqueue_drain(dn_tq, &dn_task); - taskqueue_free(dn_tq); - - dn_ht_free(dn_cfg.schedhash, 0); - dn_ht_free(dn_cfg.fshash, 0); - heap_free(&dn_cfg.evheap); - - DN_LOCK_DESTROY(); -} - -static int -dummynet_modevent(module_t mod, int type, void *data) -{ - - if (type == MOD_LOAD) { - if (ip_dn_io_ptr) { - printf("DUMMYNET already loaded\n"); - return EEXIST ; - } - ip_dn_init(); - ip_dn_ctl_ptr = ip_dn_ctl; - ip_dn_io_ptr = dummynet_io; - return 0; - } else if (type == MOD_UNLOAD) { - ip_dn_destroy(1 /* last */); - return 0; - } else - return EOPNOTSUPP; -} - -/* modevent helpers for the modules */ -static int -load_dn_sched(struct dn_alg *d) -{ - struct dn_alg *s; - - if (d == NULL) - return 1; /* error */ - ip_dn_init(); /* just in case, we need the lock */ - - /* Check that mandatory funcs exists */ - if (d->enqueue == NULL || d->dequeue == NULL) { - D("missing enqueue or dequeue for %s", d->name); - return 1; - } - - /* Search if scheduler already exists */ - DN_BH_WLOCK(); - SLIST_FOREACH(s, &dn_cfg.schedlist, next) { - if (strcmp(s->name, d->name) == 0) { - D("%s already loaded", d->name); - break; /* scheduler already exists */ - } - } - if (s == NULL) - SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next); - DN_BH_WUNLOCK(); - D("dn_sched %s %sloaded", d->name, s ? "not ":""); - return s ? 1 : 0; -} - -static int -unload_dn_sched(struct dn_alg *s) -{ - struct dn_alg *tmp, *r; - int err = EINVAL; - - ND("called for %s", s->name); - - DN_BH_WLOCK(); - SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { - if (strcmp(s->name, r->name) != 0) - continue; - ND("ref_count = %d", r->ref_count); - err = (r->ref_count != 0) ? EBUSY : 0; - if (err == 0) - SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); - break; - } - DN_BH_WUNLOCK(); - D("dn_sched %s %sunloaded", s->name, err ? "not ":""); - return err; -} - -int -dn_sched_modevent(module_t mod, int cmd, void *arg) -{ - struct dn_alg *sch = arg; - - if (cmd == MOD_LOAD) - return load_dn_sched(sch); - else if (cmd == MOD_UNLOAD) - return unload_dn_sched(sch); - else - return EINVAL; -} - -static moduledata_t dummynet_mod = { - "dummynet", dummynet_modevent, NULL -}; - -#define DN_SI_SUB SI_SUB_PROTO_IFATTACHDOMAIN -#define DN_MODEV_ORD (SI_ORDER_ANY - 128) /* after ipfw */ -DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD); -MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); -MODULE_VERSION(dummynet, 3); - -/* - * Starting up. 
Done in order after dummynet_modevent() has been called. - * VNET_SYSINIT is also called for each existing vnet and each new vnet. - */ -//VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL); - -/* - * Shutdown handlers up shop. These are done in REVERSE ORDER, but still - * after dummynet_modevent() has been called. Not called on reboot. - * VNET_SYSUNINIT is also called for each exiting vnet as it exits. - * or when the module is unloaded. - */ -//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL); - -/* end of file */ diff --git a/freebsd/sys/netpfil/ipfw/ip_fw2.c b/freebsd/sys/netpfil/ipfw/ip_fw2.c index 224ba937..a3a11819 100644 --- a/freebsd/sys/netpfil/ipfw/ip_fw2.c +++ b/freebsd/sys/netpfil/ipfw/ip_fw2.c @@ -36,7 +36,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_ipdivert.h> #include <rtems/bsd/local/opt_inet.h> #ifndef INET -#error IPFIREWALL requires INET. +#error "IPFIREWALL requires INET" #endif /* INET */ #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_ipsec.h> @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/sys/param.h> #include <sys/systm.h> #include <sys/condvar.h> +#include <sys/counter.h> #include <sys/eventhandler.h> #include <sys/malloc.h> #include <sys/mbuf.h> @@ -54,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include <sys/priv.h> #include <sys/proc.h> #include <sys/rwlock.h> +#include <sys/rmlock.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sysctl.h> @@ -61,11 +63,13 @@ __FBSDID("$FreeBSD$"); #include <sys/ucred.h> #include <net/ethernet.h> /* for ETHERTYPE_IP */ #include <net/if.h> +#include <net/if_var.h> #include <net/route.h> -#include <net/pf_mtag.h> #include <net/pfil.h> #include <net/vnet.h> +#include <netpfil/pf/pf_mtag.h> + #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/in_pcb.h> @@ -82,7 +86,9 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip6.h> #include <netinet/icmp6.h> +#include <netinet/in_fib.h> #ifdef INET6 +#include <netinet6/in6_fib.h> #include <netinet6/in6_pcb.h> #include <netinet6/scope6_var.h> #include <netinet6/ip6_var.h> @@ -101,10 +107,6 @@ __FBSDID("$FreeBSD$"); * All ipfw global variables are here. */ -/* ipfw_vnet_ready controls when we are open for business */ -static VNET_DEFINE(int, ipfw_vnet_ready) = 0; -#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) - static VNET_DEFINE(int, fw_deny_unknown_exthdrs); #define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) @@ -121,9 +123,20 @@ VNET_DEFINE(int, autoinc_step); VNET_DEFINE(int, fw_one_pass) = 1; VNET_DEFINE(unsigned int, fw_tables_max); +VNET_DEFINE(unsigned int, fw_tables_sets) = 0; /* Don't use set-aware tables */ /* Use 128 tables by default */ static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; +#ifndef LINEAR_SKIPTO +static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, + int tablearg, int jump_backwards); +#define JUMP(ch, f, num, targ, back) jump_fast(ch, f, num, targ, back) +#else +static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, + int tablearg, int jump_backwards); +#define JUMP(ch, f, num, targ, back) jump_linear(ch, f, num, targ, back) +#endif + /* * Each rule belongs to one of 32 different sets (0..31). * The variable set_disable contains one bit per set. 
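The new JUMP() selection above picks between two skipto strategies: jump_fast() binary-searches the sorted rule map (caching the result in the rule), while jump_linear() indexes a rulenum-to-position array when LINEAR_SKIPTO is set. A sketch contrasting the two lookups over stand-in data (the kernel uses chain->map and chain->idxmap):

#include <stdio.h>

static int rulenums[] = { 100, 200, 300, 65535 };   /* sorted rules */
static int idxmap[65536];       /* jump_linear: rulenum -> position */

static int
find_rule(int num)              /* ipfw_find_rule(), simplified */
{
    int lo = 0, hi = 3, mid;

    while (lo < hi) {
        mid = (lo + hi) / 2;
        if (rulenums[mid] < num)
            lo = mid + 1;
        else
            hi = mid;
    }
    return (lo);
}

int
main(void)
{
    for (int n = 0, pos = 0; n < 65536; n++) {
        while (rulenums[pos] < n)
            pos++;
        idxmap[n] = pos;        /* first rule with number >= n */
    }
    printf("skipto 250: search -> pos %d, idxmap -> pos %d\n",
        find_rule(250), idxmap[250]);
    return 0;
}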
@@ -144,6 +157,9 @@ VNET_DEFINE(int, verbose_limit); /* layer3_chain contains the list of rules for layer 3 */ VNET_DEFINE(struct ip_fw_chain, layer3_chain); +/* ipfw_vnet_ready controls when we are open for business */ +VNET_DEFINE(int, ipfw_vnet_ready) = 0; + VNET_DEFINE(int, ipfw_nat_ready) = 0; ipfw_nat_t *ipfw_nat_ptr = NULL; @@ -156,45 +172,51 @@ ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #ifdef SYSCTL_NODE uint32_t dummy_def = IPFW_DEFAULT_RULE; static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); +static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS); SYSBEGIN(f3) SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, - CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, + CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, "Only do a single pass through ipfw when using dummynet(4)"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, - CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose, - CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, + CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, "Log matches to ipfw rules"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, - CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, &dummy_def, 0, "The default/max possible rule number."); -SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, - CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", - "Maximum number of tables"); +SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", + "Maximum number of concurrently used tables"); +SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, + 0, 0, sysctl_ipfw_tables_sets, "IU", + "Use per-set namespace for tables"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, "Make the default rule accept all packets."); -TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept); -TUNABLE_INT("net.inet.ip.fw.tables_max", &default_fw_tables); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, - CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, +TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, + CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, "Number of static rules"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); -SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, - CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, + CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, + &VNET_NAME(fw_deny_unknown_exthdrs), 0, "Deny packets with unknown IPv6 Extension Headers"); -SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, - CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, 
+SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, + CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, + &VNET_NAME(fw_permit_single_frag6), 0, "Permit single packet IPv6 fragments"); #endif /* INET6 */ @@ -352,15 +374,18 @@ tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) } static int -iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg) +iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, + uint32_t *tablearg) { + if (ifp == NULL) /* no iface with this packet, match fails */ - return 0; + return (0); + /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ if (cmd->name[0] == '\1') /* use tablearg to match */ - return ipfw_lookup_table_extended(chain, cmd->p.glob, - ifp->if_xname, tablearg, IPFW_TABLE_INTERFACE); + return ipfw_lookup_table_extended(chain, cmd->p.kidx, 0, + &ifp->if_index, tablearg); /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) @@ -370,7 +395,7 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uin return(1); } } else { -#ifdef __FreeBSD__ /* and OSX too ? */ +#if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? */ struct ifaddr *ia; if_addr_rlock(ifp); @@ -413,50 +438,33 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uin static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { -#ifndef __FreeBSD__ +#if defined(USERSPACE) || !defined(__FreeBSD__) return 0; #else - struct route ro; - struct sockaddr_in *dst; - - bzero(&ro, sizeof(ro)); - - dst = (struct sockaddr_in *)&(ro.ro_dst); - dst->sin_family = AF_INET; - dst->sin_len = sizeof(*dst); - dst->sin_addr = src; - in_rtalloc_ign(&ro, 0, fib); + struct nhop4_basic nh4; - if (ro.ro_rt == NULL) - return 0; + if (fib4_lookup_nh_basic(fib, src, NHR_IFAIF, 0, &nh4) != 0) + return (0); /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): - * if useloopback == 1 routing entry (via lo0) for our own address + * routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. 
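The verify_path() rewrite here (its remaining checks continue just below) replaces the rtalloc/RTFREE dance with a single nexthop lookup plus three tests. A sketch of the resulting decision logic with stand-in nexthop and interface types; the flag values and names are illustrative, not the kernel's:

#include <stdio.h>
#include <string.h>

#define NHF_DEFAULT   0x1       /* illustrative flag values */
#define NHF_REJECT    0x2
#define NHF_BLACKHOLE 0x4

struct nhop {                   /* stand-in for struct nhop4_basic */
    int nh_flags;
    const char *nh_ifp;
};

static int
rpf_ok(const struct nhop *nh, const char *ifp)
{
    if (nh == NULL)
        return (0);             /* no route back to the source */
    if (ifp != NULL)            /* antispoof: must match that iface */
        return (strcmp(nh->nh_ifp, ifp) == 0);
    if (nh->nh_flags & NHF_DEFAULT)
        return (0);             /* default route does not count */
    if (nh->nh_flags & (NHF_REJECT | NHF_BLACKHOLE))
        return (0);
    return (1);                 /* found a usable route */
}

int
main(void)
{
    struct nhop via_em0 = { 0, "em0" };

    printf("routed via em0, arrived on em0: %d\n",
        rpf_ok(&via_em0, "em0"));
    printf("routed via em0, arrived on em1: %d\n",
        rpf_ok(&via_em0, "em1"));
    return 0;
}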
*/ - if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { - RTFREE(ro.ro_rt); - return 0; - } + if (ifp != NULL && ifp != nh4.nh_ifp) + return (0); /* if no ifp provided, check if rtentry is not default route */ - if (ifp == NULL && - satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { - RTFREE(ro.ro_rt); - return 0; - } + if (ifp == NULL && (nh4.nh_flags & NHF_DEFAULT) != 0) + return (0); /* or if this is a blackhole/reject route */ - if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - RTFREE(ro.ro_rt); - return 0; - } + if (ifp == NULL && (nh4.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) + return (0); /* found valid route */ - RTFREE(ro.ro_rt); return 1; #endif /* __FreeBSD__ */ } @@ -482,79 +490,62 @@ flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) } /* support for IP6_*_ME opcodes */ +static const struct in6_addr lla_mask = {{{ + 0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}}}; + static int -search_ip6_addr_net (struct in6_addr * ip6_addr) +ipfw_localip6(struct in6_addr *in6) { - struct ifnet *mdc; - struct ifaddr *mdc2; - struct in6_ifaddr *fdm; - struct in6_addr copia; - - TAILQ_FOREACH(mdc, &V_ifnet, if_link) { - if_addr_rlock(mdc); - TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) { - if (mdc2->ifa_addr->sa_family == AF_INET6) { - fdm = (struct in6_ifaddr *)mdc2; - copia = fdm->ia_addr.sin6_addr; - /* need for leaving scope_id in the sock_addr */ - in6_clearscope(&copia); - if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) { - if_addr_runlock(mdc); - return 1; - } - } + struct rm_priotracker in6_ifa_tracker; + struct in6_ifaddr *ia; + + if (IN6_IS_ADDR_MULTICAST(in6)) + return (0); + + if (!IN6_IS_ADDR_LINKLOCAL(in6)) + return (in6_localip(in6)); + + IN6_IFADDR_RLOCK(&in6_ifa_tracker); + TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { + if (!IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) + continue; + if (IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr, + in6, &lla_mask)) { + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); + return (1); } - if_addr_runlock(mdc); } - return 0; + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); + return (0); } static int verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) { - struct route_in6 ro; - struct sockaddr_in6 *dst; + struct nhop6_basic nh6; - bzero(&ro, sizeof(ro)); - - dst = (struct sockaddr_in6 * )&(ro.ro_dst); - dst->sin6_family = AF_INET6; - dst->sin6_len = sizeof(*dst); - dst->sin6_addr = *src; + if (IN6_IS_SCOPE_LINKLOCAL(src)) + return (1); - in6_rtalloc_ign(&ro, 0, fib); - if (ro.ro_rt == NULL) - return 0; + if (fib6_lookup_nh_basic(fib, src, 0, NHR_IFAIF, 0, &nh6) != 0) + return (0); - /* - * if ifp is provided, check for equality with rtentry - * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, - * to support the case of sending packets to an address of our own. - * (where the former interface is the first argument of if_simloop() - * (=ifp), the latter is lo0) - */ - if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { - RTFREE(ro.ro_rt); - return 0; - } + /* If ifp is provided, check for equality with route table. 
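ipfw_localip6() above compares link-local addresses through lla_mask because bytes 2-3 of a stored link-local address carry the embedded zone id, which must not affect the match. A byte-wise sketch of that masked comparison; raw arrays stand in for struct in6_addr:

#include <stdio.h>
#include <stdint.h>

/* byte-wise equivalent of IN6_ARE_MASKED_ADDR_EQUAL() */
static int
masked_equal(const uint8_t *a, const uint8_t *b, const uint8_t *m)
{
    for (int i = 0; i < 16; i++)
        if ((a[i] & m[i]) != (b[i] & m[i]))
            return (0);
    return (1);
}

int
main(void)
{
    /* lla_mask: bytes 2-3 (the embedded zone id) are ignored */
    uint8_t mask[16] = { 0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
    uint8_t a[16] = { 0xfe, 0x80, 0x00, 0x01 };     /* fe80::1, zone 1 */
    uint8_t b[16] = { 0xfe, 0x80, 0x00, 0x02 };     /* fe80::1, zone 2 */

    a[15] = b[15] = 1;      /* both are ...::1 */
    printf("same address once the zone id is masked: %d\n",
        masked_equal(a, b, mask));
    return 0;
}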
*/ + if (ifp != NULL && ifp != nh6.nh_ifp) + return (0); /* if no ifp provided, check if rtentry is not default route */ - if (ifp == NULL && - IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { - RTFREE(ro.ro_rt); - return 0; - } + if (ifp == NULL && (nh6.nh_flags & NHF_DEFAULT) != 0) + return (0); /* or if this is a blackhole/reject route */ - if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - RTFREE(ro.ro_rt); - return 0; - } + if (ifp == NULL && (nh6.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) + return (0); /* found valid route */ - RTFREE(ro.ro_rt); return 1; - } static int @@ -632,8 +623,6 @@ send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ - /* We need the IP header in host order for icmp_error(). */ - SET_HOST_IPLEN(ip); icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = @@ -666,6 +655,9 @@ static int check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, struct ucred **uc) { +#if defined(USERSPACE) + return 0; // not supported in userspace +#else #ifndef __FreeBSD__ /* XXX */ return cred_check(insn, proto, oif, @@ -776,6 +768,7 @@ check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, #endif /* __rtems__ */ return (match); #endif /* __FreeBSD__ */ +#endif /* not supported in userspace */ } /* @@ -793,9 +786,10 @@ set_match(struct ip_fw_args *args, int slot, args->rule.rulenum = chain->map[slot]->rulenum; } +#ifndef LINEAR_SKIPTO /* * Helper function to enable cached rule lookups using - * x_next and next_rule fields in ipfw rule. + * cached_id and cached_pos fields in ipfw rule. */ static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, @@ -803,28 +797,51 @@ jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, { int f_pos; - /* If possible use cached f_pos (in f->next_rule), - * whose version is written in f->next_rule + /* If possible use cached f_pos (in f->cached_pos), + * whose version is written in f->cached_id * (horrible hacks to avoid changing the ABI). */ - if (num != IP_FW_TABLEARG && (uintptr_t)f->x_next == chain->id) - f_pos = (uintptr_t)f->next_rule; + if (num != IP_FW_TARG && f->cached_id == chain->id) + f_pos = f->cached_pos; else { - int i = IP_FW_ARG_TABLEARG(num); + int i = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && i <= f->rulenum) i = f->rulenum + 1; - f_pos = ipfw_find_rule(chain, i, 0); + if (chain->idxmap != NULL) + f_pos = chain->idxmap[i]; + else + f_pos = ipfw_find_rule(chain, i, 0); /* update the cache */ - if (num != IP_FW_TABLEARG) { - f->next_rule = (void *)(uintptr_t)f_pos; - f->x_next = (void *)(uintptr_t)chain->id; + if (num != IP_FW_TARG) { + f->cached_id = chain->id; + f->cached_pos = f_pos; } } return (f_pos); } +#else +/* + * Helper function to enable real fast rule lookups. + */ +static int +jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, + int tablearg, int jump_backwards) +{ + int f_pos; + + num = IP_FW_ARG_TABLEARG(chain, num, skipto); + /* make sure we do not jump backward */ + if (jump_backwards == 0 && num <= f->rulenum) + num = f->rulenum + 1; + f_pos = chain->idxmap[num]; + + return (f_pos); +} +#endif +#define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) /* * The main check routine for the firewall. 
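The TARG() wrapper defined above resolves an opcode argument either to its literal value or, when it equals the tablearg sentinel, to the value produced by the last table lookup. A sketch of the substitution; the 65535 sentinel value is an assumption for illustration:

#include <stdio.h>
#include <stdint.h>

#define IP_FW_TARG 65535    /* assumed "use tablearg" sentinel */

static uint32_t
targ(uint16_t arg1, uint32_t tablearg)
{
    return (arg1 == IP_FW_TARG ? tablearg : arg1);
}

int
main(void)
{
    uint32_t tablearg = 42;     /* value from the last table lookup */

    printf("pipe tablearg -> pipe %u\n",
        (unsigned)targ(IP_FW_TARG, tablearg));
    printf("pipe 7        -> pipe %u\n",
        (unsigned)targ(7, tablearg));
    return 0;
}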
* @@ -929,7 +946,7 @@ ipfw_chk(struct ip_fw_args *args) * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header - * or there is a single packet fragement (fragement header added + * or there is a single packet fragment (fragment header added * without needed). We will treat a single packet fragment as if * there was no fragment header (or log/block depending on the * V_fw_permit_single_frag6 sysctl setting). @@ -964,6 +981,7 @@ ipfw_chk(struct ip_fw_args *args) * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ int dyn_dir = MATCH_UNKNOWN; + uint16_t dyn_name = 0; ipfw_dyn_rule *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; @@ -984,6 +1002,7 @@ ipfw_chk(struct ip_fw_args *args) int is_ipv4 = 0; int done = 0; /* flag to exit the outer loop */ + IPFW_RLOCK_TRACKER; if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) return (IP_FW_PASS); /* accept */ @@ -1249,9 +1268,9 @@ do { \ args->f_id.dst_port = dst_port = ntohs(dst_port); } - IPFW_RLOCK(chain); + IPFW_PF_RLOCK(chain); if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ - IPFW_RUNLOCK(chain); + IPFW_PF_RUNLOCK(chain); return (IP_FW_PASS); /* accept */ } if (args->rule.slot) { @@ -1471,9 +1490,10 @@ do { \ proto != IPPROTO_UDP) break; else if (v == 2) - key = htonl(dst_port); + key = dst_port; else if (v == 3) - key = htonl(src_port); + key = src_port; +#ifndef USERSPACE else if (v == 4 || v == 5) { check_uidgid( (ipfw_insn_u32 *)cmd, @@ -1499,8 +1519,9 @@ do { \ else if (v == 5 /* O_JAIL */) key = ucred_cache.xid; #endif /* !__FreeBSD__ */ - key = htonl(key); - } else + } +#endif /* !USERSPACE */ + else break; } match = ipfw_lookup_table(chain, @@ -1517,8 +1538,9 @@ do { \ void *pkey = (cmd->opcode == O_IP_DST_LOOKUP) ? 
&args->f_id.dst_ip6: &args->f_id.src_ip6; match = ipfw_lookup_table_extended(chain, - cmd->arg1, pkey, &v, - IPFW_TABLE_CIDR); + cmd->arg1, + sizeof(struct in6_addr), + pkey, &v); if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; if (match) @@ -1526,6 +1548,17 @@ do { \ } break; + case O_IP_FLOW_LOOKUP: + { + uint32_t v = 0; + match = ipfw_lookup_table_extended(chain, + cmd->arg1, 0, &args->f_id, &v); + if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) + match = ((ipfw_insn_u32 *)cmd)->d[0] == v; + if (match) + tablearg = v; + } + break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { @@ -1551,7 +1584,7 @@ do { \ #ifdef INET6 /* FALLTHROUGH */ case O_IP6_SRC_ME: - match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); + match= is_ipv6 && ipfw_localip6(&args->f_id.src_ip6); #endif break; @@ -1590,7 +1623,7 @@ do { \ #ifdef INET6 /* FALLTHROUGH */ case O_IP6_DST_ME: - match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); + match= is_ipv6 && ipfw_localip6(&args->f_id.dst_ip6); #endif break; @@ -1697,7 +1730,7 @@ do { \ break; /* DSCP bitmask is stored as low_u32 high_u32 */ - if (x > 32) + if (x >= 32) match = *(p + 1) & (1 << (x - 32)); else match = *p & (1 << x); @@ -1732,9 +1765,11 @@ do { \ break; case O_TCPOPTS: - PULLUP_LEN(hlen, ulp, (TCP(ulp)->th_off << 2)); - match = (proto == IPPROTO_TCP && offset == 0 && - tcpopts_match(TCP(ulp), cmd)); + if (proto == IPPROTO_TCP && offset == 0 && ulp){ + PULLUP_LEN(hlen, ulp, + (TCP(ulp)->th_off << 2)); + match = tcpopts_match(TCP(ulp), cmd); + } break; case O_TCPSEQ: @@ -1778,27 +1813,37 @@ do { \ case O_ALTQ: { struct pf_mtag *at; + struct m_tag *mtag; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; + /* + * ALTQ uses mbuf tags from another + * packet filtering system - pf(4). + * We allocate a tag in its format + * and fill it in, pretending to be pf(4). + */ match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; - at = pf_get_mtag(m); - if (at == NULL) { + mtag = m_tag_get(PACKET_TAG_PF, + sizeof(struct pf_mtag), M_NOWAIT | M_ZERO); + if (mtag == NULL) { /* * Let the packet fall back to the * default ALTQ. */ break; } + m_tag_prepend(m, mtag); + at = (struct pf_mtag *)(mtag + 1); at->qid = altq->qid; at->hdr = ip; break; } case O_LOG: - ipfw_log(f, hlen, args, m, + ipfw_log(chain, f, hlen, args, m, oif, offset | ip6f_mf, tablearg, ip); match = 1; break; @@ -1920,7 +1965,7 @@ do { \ case O_TAG: { struct m_tag *mtag; - uint32_t tag = IP_FW_ARG_TABLEARG(cmd->arg1); + uint32_t tag = TARG(cmd->arg1, tag); /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); @@ -1954,6 +1999,7 @@ do { \ break; case O_SOCKARG: { +#ifndef USERSPACE /* not supported in userspace */ struct inpcb *inp = args->inp; struct inpcbinfo *pi; @@ -1972,7 +2018,7 @@ do { \ * certainly be inp_user_cookie? 
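The one-character change from 'x > 32' to 'x >= 32' in the O_DSCP case is a real off-by-one fix: the 64-bit DSCP membership mask is stored as two 32-bit words, low word first, so code point 32 lives in bit 0 of the high word, and the old test sent x == 32 into the low-word branch, where a 32-bit shift by 32 is undefined behavior. A standalone check of the corrected logic:

#include <assert.h>
#include <stdint.h>

/* DSCP bitmask stored as low_u32 high_u32, as in the rule body */
static int
dscp_match(const uint32_t *p, unsigned x)
{
	if (x >= 32)			/* code points 32..63 */
		return ((p[1] & (1u << (x - 32))) != 0);
	return ((p[0] & (1u << x)) != 0);	/* code points 0..31 */
}

int
main(void)
{
	uint32_t mask[2] = { 0, 1 };	/* only code point 32 is set */

	assert(dscp_match(mask, 32) == 1);	/* bit 0 of the high word */
	assert(dscp_match(mask, 31) == 0);
	assert(dscp_match(mask, 33) == 0);
	/* the pre-fix 'x > 32' test evaluated p[0] & (1 << 32) for
	 * x == 32, an undefined shift that missed the high word */
	return (0);
}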
*/ - /* For incomming packet, lookup up the + /* For incoming packet, lookup up the inpcb using the src/dest ip/port tuple */ if (inp == NULL) { inp = in_pcblookup(pi, @@ -1994,12 +2040,13 @@ do { \ match = 1; } } +#endif /* !USERSPACE */ break; } case O_TAGGED: { struct m_tag *mtag; - uint32_t tag = IP_FW_ARG_TABLEARG(cmd->arg1); + uint32_t tag = TARG(cmd->arg1, tag); if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, @@ -2070,7 +2117,7 @@ do { \ */ case O_LIMIT: case O_KEEP_STATE: - if (ipfw_install_state(f, + if (ipfw_install_state(chain, f, (ipfw_insn_limit *)cmd, args, tablearg)) { /* error or limit violation */ retval = IP_FW_DENY; @@ -2085,17 +2132,35 @@ do { \ /* * dynamic rules are checked at the first * keep-state or check-state occurrence, - * with the result being stored in dyn_dir. + * with the result being stored in dyn_dir + * and dyn_name. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). + * + * (dyn_dir == MATCH_UNKNOWN) means this is + * first lookup for such f_id. Do lookup. + * + * (dyn_dir != MATCH_UNKNOWN && + * dyn_name != 0 && dyn_name != cmd->arg1) + * means previous lookup didn't find dynamic + * rule for specific state name and current + * lookup will search rule with another state + * name. Redo lookup. + * + * (dyn_dir != MATCH_UNKNOWN && dyn_name == 0) + * means previous lookup was for `any' name + * and it didn't find rule. No need to do + * lookup again. */ - if (dyn_dir == MATCH_UNKNOWN && + if ((dyn_dir == MATCH_UNKNOWN || + (dyn_name != 0 && + dyn_name != cmd->arg1)) && (q = ipfw_lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? - TCP(ulp) : NULL)) - != NULL) { + TCP(ulp): NULL, + (dyn_name = cmd->arg1))) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of @@ -2137,7 +2202,7 @@ do { \ case O_PIPE: case O_QUEUE: set_match(args, f_pos, chain); - args->rule.info = IP_FW_ARG_TABLEARG(cmd->arg1); + args->rule.info = TARG(cmd->arg1, pipe); if (cmd->opcode == O_PIPE) args->rule.info |= IPFW_IS_PIPE; if (V_fw_one_pass) @@ -2157,7 +2222,7 @@ do { \ retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; set_match(args, f_pos, chain); - args->rule.info = IP_FW_ARG_TABLEARG(cmd->arg1); + args->rule.info = TARG(cmd->arg1, divert); break; case O_COUNT: @@ -2167,7 +2232,7 @@ do { \ case O_SKIPTO: IPFW_INC_RULE_COUNTER(f, pktlen); - f_pos = jump_fast(chain, f, cmd->arg1, tablearg, 0); + f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0); /* * Skip disabled rules, and re-enter * the inner loop with the correct @@ -2256,7 +2321,7 @@ do { \ if (IS_CALL) { stack[mtag->m_tag_id] = f->rulenum; mtag->m_tag_id++; - f_pos = jump_fast(chain, f, cmd->arg1, + f_pos = JUMP(chain, f, cmd->arg1, tablearg, 1); } else { /* `return' action */ mtag->m_tag_id--; @@ -2328,13 +2393,48 @@ do { \ if (q == NULL || q->rule != f || dyn_dir == MATCH_FORWARD) { struct sockaddr_in *sa; + sa = &(((ipfw_insn_sa *)cmd)->sa); if (sa->sin_addr.s_addr == INADDR_ANY) { - bcopy(sa, &args->hopstore, - sizeof(*sa)); - args->hopstore.sin_addr.s_addr = - htonl(tablearg); - args->next_hop = &args->hopstore; +#ifdef INET6 + /* + * We use O_FORWARD_IP opcode for + * fwd rule with tablearg, but tables + * now support IPv6 addresses. And + * when we are inspecting IPv6 packet, + * we can use nh6 field from + * table_value as next_hop6 address. 
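The expanded comment above enumerates three lookup states; compressed into a predicate, the dynamic lookup for a PROBE_STATE/CHECK_STATE opcode is redone only when this flow id was never looked up, or when the previous lookup used a different non-zero state name. A toy model of that decision; MATCH_UNKNOWN is given an illustrative value here, and dyn_name == 0 stands for the wildcard 'any' lookup:

#include <stdint.h>
#include <stdio.h>

#define MATCH_UNKNOWN	-1	/* illustrative value, not the kernel's */

/*
 * Decide whether ipfw_chk() must (re)run the dynamic-state lookup
 * for an opcode naming state 'want'.
 */
static int
need_dyn_lookup(int dyn_dir, uint16_t dyn_name, uint16_t want)
{
	if (dyn_dir == MATCH_UNKNOWN)
		return (1);	/* never looked up this f_id */
	if (dyn_name != 0 && dyn_name != want)
		return (1);	/* previous lookup used another name */
	/* previous lookup was 'any' (and missed) or used this name */
	return (0);
}

int
main(void)
{
	printf("%d\n", need_dyn_lookup(MATCH_UNKNOWN, 0, 5));	/* 1 */
	printf("%d\n", need_dyn_lookup(0, 3, 5));		/* 1 */
	printf("%d\n", need_dyn_lookup(0, 0, 5));		/* 0 */
	return (0);
}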
+ */ + if (is_ipv6) { + struct sockaddr_in6 *sa6; + + sa6 = args->next_hop6 = + &args->hopstore6; + sa6->sin6_family = AF_INET6; + sa6->sin6_len = sizeof(*sa6); + sa6->sin6_addr = TARG_VAL( + chain, tablearg, nh6); + /* + * Set sin6_scope_id only for + * link-local unicast addresses. + */ + if (IN6_IS_ADDR_LINKLOCAL( + &sa6->sin6_addr)) + sa6->sin6_scope_id = + TARG_VAL(chain, + tablearg, + zoneid); + } else +#endif + { + sa = args->next_hop = + &args->hopstore; + sa->sin_family = AF_INET; + sa->sin_len = sizeof(*sa); + sa->sin_addr.s_addr = htonl( + TARG_VAL(chain, tablearg, + nh4)); + } } else { args->next_hop = sa; } @@ -2364,7 +2464,7 @@ do { \ case O_NETGRAPH: case O_NGTEE: set_match(args, f_pos, chain); - args->rule.info = IP_FW_ARG_TABLEARG(cmd->arg1); + args->rule.info = TARG(cmd->arg1, netgraph); if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = (cmd->opcode == O_NETGRAPH) ? @@ -2377,7 +2477,7 @@ do { \ uint32_t fib; IPFW_INC_RULE_COUNTER(f, pktlen); - fib = IP_FW_ARG_TABLEARG(cmd->arg1); + fib = TARG(cmd->arg1, fib) & 0x7FFF; if (fib >= rt_numfibs) fib = 0; M_SETFIB(m, fib); @@ -2389,15 +2489,16 @@ do { \ case O_SETDSCP: { uint16_t code; - code = IP_FW_ARG_TABLEARG(cmd->arg1) & 0x3F; + code = TARG(cmd->arg1, dscp) & 0x3F; l = 0; /* exit inner loop */ if (is_ipv4) { - uint16_t a; + uint16_t old; - a = ip->ip_tos; - ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03); - a += ntohs(ip->ip_sum) - ip->ip_tos; - ip->ip_sum = htons(a); + old = *(uint16_t *)ip; + ip->ip_tos = (code << 2) | + (ip->ip_tos & 0x03); + ip->ip_sum = cksum_adjust(ip->ip_sum, + old, *(uint16_t *)ip); } else if (is_ipv6) { uint8_t *v; @@ -2425,20 +2526,20 @@ do { \ set_match(args, f_pos, chain); /* Check if this is 'global' nat rule */ - if (cmd->arg1 == 0) { + if (cmd->arg1 == IP_FW_NAT44_GLOBAL) { retval = ipfw_nat_ptr(args, NULL, m); break; } t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { - nat_id = IP_FW_ARG_TABLEARG(cmd->arg1); + nat_id = TARG(cmd->arg1, nat); t = (*lookup_nat_ptr)(&chain->nat, nat_id); if (t == NULL) { retval = IP_FW_DENY; break; } - if (cmd->arg1 != IP_FW_TABLEARG) + if (cmd->arg1 != IP_FW_TARG) ((ipfw_insn_nat *)cmd)->nat = t; } retval = ipfw_nat_ptr(args, t, m); @@ -2454,11 +2555,6 @@ do { \ /* if not fragmented, go to next rule */ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) break; - /* - * ip_reass() expects len & off in host - * byte order. - */ - SET_HOST_IPLEN(ip); args->m = m = ip_reass(m); @@ -2472,7 +2568,6 @@ do { \ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; - SET_NET_IPLEN(ip); ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); @@ -2484,6 +2579,11 @@ do { \ done = 1; /* exit outer loop */ break; } + case O_EXTERNAL_ACTION: + l = 0; /* in any case exit inner loop */ + retval = ipfw_run_eaction(chain, args, + cmd, &done); + break; default: panic("-- unknown opcode %d\n", cmd->opcode); @@ -2521,7 +2621,7 @@ do { \ retval = IP_FW_DENY; printf("ipfw: ouch!, skip past end of rules, denying packet\n"); } - IPFW_RUNLOCK(chain); + IPFW_PF_RUNLOCK(chain); #ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); @@ -2553,7 +2653,27 @@ sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) return (ipfw_resize_tables(&V_layer3_chain, ntables)); } + +/* + * Switches table namespace between global and per-set. 
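The O_SETDSCP hunk stops byte-swapping header fields and instead patches the IPv4 checksum incrementally from the 16-bit word that covers the tos byte, in the style of RFC 1624. The sketch below models that update rule and cross-checks it against a full recompute; it illustrates the technique only and is not the kernel's cksum_adjust() macro:

#include <assert.h>
#include <stdint.h>

/*
 * RFC 1624 incremental update (eq. 3: HC' = ~(~HC + ~m + m')):
 * adjust checksum 'csum' for one 16-bit word going oldw -> neww.
 */
static uint16_t
cksum_adj(uint16_t csum, uint16_t oldw, uint16_t neww)
{
	uint32_t sum;

	sum = (uint32_t)(uint16_t)~csum + (uint16_t)~oldw + neww;
	sum = (sum >> 16) + (sum & 0xffff);	/* fold the carries */
	sum += sum >> 16;
	return ((uint16_t)~sum);
}

/* plain one's complement checksum over n 16-bit words, for checking */
static uint16_t
cksum_full(const uint16_t *p, int n)
{
	uint32_t sum = 0;

	while (n-- > 0)
		sum += *p++;
	sum = (sum >> 16) + (sum & 0xffff);
	sum += sum >> 16;
	return ((uint16_t)~sum);
}

int
main(void)
{
	/* a 20-byte IPv4 header as ten 16-bit words, checksum zeroed */
	uint16_t h[10] = { 0x4500, 0x0054, 0x1c46, 0x4000,
	    0x4011, 0x0000, 0x0a00, 0x0001, 0x0a00, 0x0002 };
	uint16_t w;

	h[5] = cksum_full(h, 10);	/* seed a valid checksum */
	w = h[0];
	h[0] = 0x45b8;			/* rewrite the tos byte (DSCP 46) */
	h[5] = cksum_adj(h[5], w, h[0]);

	w = h[5];			/* verify against full recompute */
	h[5] = 0;
	assert(w == cksum_full(h, 10));
	return (0);
}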
+ */ +static int +sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS) +{ + int error; + unsigned int sets; + + sets = V_fw_tables_sets; + + error = sysctl_handle_int(oidp, &sets, 0, req); + /* Read operation or some error */ + if ((error != 0) || (req->newptr == NULL)) + return (error); + + return (ipfw_switch_tables_namespace(&V_layer3_chain, sets)); +} #endif + /* * Module and VNET glue */ @@ -2607,7 +2727,8 @@ ipfw_init(void) if (default_fw_tables > IPFW_TABLES_MAX) default_fw_tables = IPFW_TABLES_MAX; - ipfw_log_bpf(1); /* init */ + ipfw_init_sopt_handler(); + ipfw_iface_init(); return (error); } @@ -2619,7 +2740,8 @@ static void ipfw_destroy(void) { - ipfw_log_bpf(0); /* uninit */ + ipfw_iface_destroy(); + ipfw_destroy_sopt_handler(); printf("IP firewall unloaded\n"); } #endif /* __rtems__ */ @@ -2631,12 +2753,14 @@ ipfw_destroy(void) static int vnet_ipfw_init(const void *unused) { - int error; + int error, first; struct ip_fw *rule = NULL; struct ip_fw_chain *chain; chain = &V_layer3_chain; + first = IS_DEFAULT_VNET(curvnet) ? 1 : 0; + /* First set up some values that are compile time options */ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ V_fw_deny_unknown_exthdrs = 1; @@ -2650,16 +2774,19 @@ vnet_ipfw_init(const void *unused) LIST_INIT(&chain->nat); #endif + /* Init shared services hash table */ + ipfw_init_srv(chain); + + ipfw_init_obj_rewriter(); + ipfw_init_counters(); /* insert the default rule and create the initial map */ chain->n_rules = 1; - chain->static_len = sizeof(struct ip_fw); chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_WAITOK | M_ZERO); - if (chain->map) - rule = malloc(chain->static_len, M_IPFW, M_WAITOK | M_ZERO); + rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw)); /* Set initial number of tables */ V_fw_tables_max = default_fw_tables; - error = ipfw_init_tables(chain); + error = ipfw_init_tables(chain, first); if (error) { printf("ipfw2: setting up tables failed\n"); free(chain->map, M_IPFW); @@ -2676,18 +2803,24 @@ vnet_ipfw_init(const void *unused) rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; chain->default_rule = chain->map[0] = rule; chain->id = rule->id = 1; + /* Pre-calculate rules length for legacy dump format */ + chain->static_len = sizeof(struct ip_fw_rule0); IPFW_LOCK_INIT(chain); ipfw_dyn_init(chain); + ipfw_eaction_init(chain, first); +#ifdef LINEAR_SKIPTO + ipfw_init_skipto_cache(chain); +#endif + ipfw_bpf_init(first); /* First set up some values that are compile time options */ V_ipfw_vnet_ready = 1; /* Open for business */ /* - * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr) - * and pfil hooks for ipv4 and ipv6. Even if the latter two fail - * we still keep the module alive because the sockopt and - * layer2 paths are still useful. + * Hook the sockopt handler and pfil hooks for ipv4 and ipv6. + * Even if the latter two fail we still keep the module alive + * because the sockopt and layer2 paths are still useful. * ipfw[6]_hook return 0 on success, ENOENT on failure, * so we can ignore the exact return value and just set a flag. * @@ -2697,8 +2830,7 @@ vnet_ipfw_init(const void *unused) * In layer2 we have the same behaviour, except that V_ether_ipfw * is checked on each packet because there are no pfil hooks. 
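vnet_ipfw_init() and vnet_ipfw_uninit() now thread a first/last flag through the per-VNET constructors, so process-wide state (the BPF clones, opcode rewriters, counters) is created exactly once by the default VNET while per-instance state is rebuilt for every VNET. A trivial model of that split, with invented function names:

#include <stdio.h>

static int global_ready;	/* stands in for once-only shared state */

static void
instance_init(int first)
{
	if (first) {
		global_ready = 1;	/* once-only, like ipfw_bpf_init(1) */
		printf("global init\n");
	}
	printf("per-instance init\n");	/* like ipfw_init_srv() */
}

static void
instance_uninit(int last)
{
	printf("per-instance teardown\n");
	if (last) {
		global_ready = 0;	/* like ipfw_bpf_uninit(1) */
		printf("global teardown\n");
	}
}

int
main(void)
{
	instance_init(1);	/* default VNET: IS_DEFAULT_VNET() true */
	instance_init(0);	/* a later VNET reuses the shared state */
	instance_uninit(0);
	instance_uninit(1);
	return (0);
}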
*/ - V_ip_fw_ctl_ptr = ipfw_ctl; - V_ip_fw_chk_ptr = ipfw_chk; + V_ip_fw_ctl_ptr = ipfw_ctl3; error = ipfw_attach_hooks(1); return (error); } @@ -2710,9 +2842,9 @@ vnet_ipfw_init(const void *unused) static int vnet_ipfw_uninit(const void *unused) { - struct ip_fw *reap, *rule; + struct ip_fw *reap; struct ip_fw_chain *chain = &V_layer3_chain; - int i; + int i, last; V_ipfw_vnet_ready = 0; /* tell new callers to go away */ /* @@ -2721,33 +2853,39 @@ vnet_ipfw_uninit(const void *unused) * sure the update is propagated and nobody will be in. */ (void)ipfw_attach_hooks(0 /* detach */); - V_ip_fw_chk_ptr = NULL; V_ip_fw_ctl_ptr = NULL; + + last = IS_DEFAULT_VNET(curvnet) ? 1 : 0; + IPFW_UH_WLOCK(chain); IPFW_UH_WUNLOCK(chain); - IPFW_UH_WLOCK(chain); - IPFW_WLOCK(chain); ipfw_dyn_uninit(0); /* run the callout_drain */ - IPFW_WUNLOCK(chain); - ipfw_destroy_tables(chain); + IPFW_UH_WLOCK(chain); + reap = NULL; IPFW_WLOCK(chain); - for (i = 0; i < chain->n_rules; i++) { - rule = chain->map[i]; - rule->x_next = reap; - reap = rule; - } - if (chain->map) - free(chain->map, M_IPFW); + for (i = 0; i < chain->n_rules; i++) + ipfw_reap_add(chain, &reap, chain->map[i]); + free(chain->map, M_IPFW); +#ifdef LINEAR_SKIPTO + ipfw_destroy_skipto_cache(chain); +#endif IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); + ipfw_destroy_tables(chain, last); + ipfw_eaction_uninit(chain, last); if (reap != NULL) ipfw_reap_rules(reap); + vnet_ipfw_iface_destroy(chain); + ipfw_destroy_srv(chain); IPFW_LOCK_DESTROY(chain); ipfw_dyn_uninit(1); /* free the remaining parts */ - return 0; + ipfw_destroy_counters(); + ipfw_destroy_obj_rewriter(); + ipfw_bpf_uninit(last); + return (0); } #endif /* __rtems__ */ @@ -2793,13 +2931,14 @@ static moduledata_t ipfwmod = { }; /* Define startup order. */ -#define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN +#define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL #define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ #define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ #define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); -MODULE_VERSION(ipfw, 2); +FEATURE(ipfw_ctl3, "ipfw new sockopt calls"); +MODULE_VERSION(ipfw, 3); /* should declare some dependencies here */ /* diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_bpf.c b/freebsd/sys/netpfil/ipfw/ip_fw_bpf.c new file mode 100644 index 00000000..3127809b --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/ip_fw_bpf.c @@ -0,0 +1,211 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2016 Yandex LLC + * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_pflog.h> +#include <net/if_var.h> +#include <net/if_clone.h> +#include <net/if_types.h> +#include <net/vnet.h> +#include <net/bpf.h> + +#include <netinet/in.h> +#include <netinet/ip_fw.h> +#include <netinet/ip_var.h> +#include <netpfil/ipfw/ip_fw_private.h> + +static VNET_DEFINE(struct ifnet *, log_if); +static VNET_DEFINE(struct ifnet *, pflog_if); +static VNET_DEFINE(struct if_clone *, ipfw_cloner); +static VNET_DEFINE(struct if_clone *, ipfwlog_cloner); +#define V_ipfw_cloner VNET(ipfw_cloner) +#define V_ipfwlog_cloner VNET(ipfwlog_cloner) +#define V_log_if VNET(log_if) +#define V_pflog_if VNET(pflog_if) + +static struct rmlock log_if_lock; +#define LOGIF_LOCK_INIT(x) rm_init(&log_if_lock, "ipfw log_if lock") +#define LOGIF_LOCK_DESTROY(x) rm_destroy(&log_if_lock) +#define LOGIF_RLOCK_TRACKER struct rm_priotracker _log_tracker +#define LOGIF_RLOCK(x) rm_rlock(&log_if_lock, &_log_tracker) +#define LOGIF_RUNLOCK(x) rm_runlock(&log_if_lock, &_log_tracker) +#define LOGIF_WLOCK(x) rm_wlock(&log_if_lock) +#define LOGIF_WUNLOCK(x) rm_wunlock(&log_if_lock) + +static const char ipfwname[] = "ipfw"; +static const char ipfwlogname[] = "ipfwlog"; + +static int +ipfw_bpf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr) +{ + + return (EINVAL); +} + +static int +ipfw_bpf_output(struct ifnet *ifp, struct mbuf *m, + const struct sockaddr *dst, struct route *ro) +{ + + if (m != NULL) + FREE_PKT(m); + return (0); +} + +static void +ipfw_clone_destroy(struct ifnet *ifp) +{ + + LOGIF_WLOCK(); + if (ifp->if_hdrlen == ETHER_HDR_LEN) + V_log_if = NULL; + else + V_pflog_if = NULL; + LOGIF_WUNLOCK(); + + bpfdetach(ifp); + if_detach(ifp); + if_free(ifp); +} + +static int +ipfw_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + struct ifnet *ifp; + + ifp = if_alloc(IFT_PFLOG); + if (ifp == NULL) + return (ENOSPC); + if_initname(ifp, ipfwname, unit); + ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_mtu = 65536; + ifp->if_ioctl = ipfw_bpf_ioctl; + ifp->if_output = ipfw_bpf_output; + ifp->if_hdrlen = ETHER_HDR_LEN; + if_attach(ifp); + bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); + LOGIF_WLOCK(); + if (V_log_if != NULL) { + LOGIF_WUNLOCK(); + bpfdetach(ifp); + if_detach(ifp); + if_free(ifp); + return (EEXIST); + } + V_log_if = ifp; + LOGIF_WUNLOCK(); + return (0); +} + +static int +ipfwlog_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + struct ifnet *ifp; + + ifp = if_alloc(IFT_PFLOG); + if (ifp == NULL) + return (ENOSPC); + if_initname(ifp, ipfwlogname, unit); + ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_mtu = 65536; + ifp->if_ioctl = ipfw_bpf_ioctl; + ifp->if_output = ipfw_bpf_output; + ifp->if_hdrlen = PFLOG_HDRLEN; 
+ if_attach(ifp); + bpfattach(ifp, DLT_PFLOG, PFLOG_HDRLEN); + LOGIF_WLOCK(); + if (V_pflog_if != NULL) { + LOGIF_WUNLOCK(); + bpfdetach(ifp); + if_detach(ifp); + if_free(ifp); + return (EEXIST); + } + V_pflog_if = ifp; + LOGIF_WUNLOCK(); + return (0); +} + +void +ipfw_bpf_mtap2(void *data, u_int dlen, struct mbuf *m) +{ + LOGIF_RLOCK_TRACKER; + + LOGIF_RLOCK(); + if (dlen == ETHER_HDR_LEN) { + if (V_log_if == NULL) { + LOGIF_RUNLOCK(); + return; + } + BPF_MTAP2(V_log_if, data, dlen, m); + } else if (dlen == PFLOG_HDRLEN) { + if (V_pflog_if == NULL) { + LOGIF_RUNLOCK(); + return; + } + BPF_MTAP2(V_pflog_if, data, dlen, m); + } + LOGIF_RUNLOCK(); +} + +void +ipfw_bpf_init(int first) +{ + + if (first) { + LOGIF_LOCK_INIT(); + V_log_if = NULL; + V_pflog_if = NULL; + } + V_ipfw_cloner = if_clone_simple(ipfwname, ipfw_clone_create, + ipfw_clone_destroy, 0); + V_ipfwlog_cloner = if_clone_simple(ipfwlogname, ipfwlog_clone_create, + ipfw_clone_destroy, 0); +} + +void +ipfw_bpf_uninit(int last) +{ + + if_clone_detach(V_ipfw_cloner); + if_clone_detach(V_ipfwlog_cloner); + if (last) + LOGIF_LOCK_DESTROY(); +} + diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_dynamic.c b/freebsd/sys/netpfil/ipfw/ip_fw_dynamic.c new file mode 100644 index 00000000..4696faac --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/ip_fw_dynamic.c @@ -0,0 +1,1822 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#define DEB(x) +#define DDB(x) x + +/* + * Dynamic rule support for ipfw + */ + +#include <rtems/bsd/local/opt_ipfw.h> +#include <rtems/bsd/local/opt_inet.h> +#ifndef INET +#error IPFIREWALL requires INET. 
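Both clone constructors in ip_fw_bpf.c follow the same publish-once pattern: build the interface completely, then take the write lock and install it in V_log_if or V_pflog_if only if no other creator won the race; a loser tears its interface back down and returns EEXIST. The sketch below models the same idea with a C11 compare-and-swap instead of the kernel's rmlock, purely for illustration:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_if { int unit; };	/* toy stand-in for struct ifnet */

static _Atomic(struct toy_if *) log_if;	/* the published singleton */

static int
clone_create(int unit)
{
	struct toy_if *ifp, *expected = NULL;

	ifp = malloc(sizeof(*ifp));	/* build it fully first */
	if (ifp == NULL)
		return (1);		/* kernel returns ENOSPC */
	ifp->unit = unit;

	/* publish only if nobody beat us to it */
	if (!atomic_compare_exchange_strong(&log_if, &expected, ifp)) {
		free(ifp);		/* lost the race: undo everything */
		return (2);		/* kernel returns EEXIST */
	}
	return (0);
}

int
main(void)
{
	printf("%d\n", clone_create(0));	/* 0: published */
	printf("%d\n", clone_create(1));	/* 2: already exists */
	return (0);
}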
+#endif /* INET */ +#include <rtems/bsd/local/opt_inet6.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <net/ethernet.h> /* for ETHERTYPE_IP */ +#include <net/if.h> +#include <net/if_var.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> /* ip_defttl */ +#include <netinet/ip_fw.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> + +#include <netinet/ip6.h> /* IN6_ARE_ADDR_EQUAL */ +#ifdef INET6 +#include <netinet6/in6_var.h> +#include <netinet6/ip6_var.h> +#endif + +#include <netpfil/ipfw/ip_fw_private.h> + +#include <machine/in_cksum.h> /* XXX for in_cksum */ + +#ifdef MAC +#include <security/mac/mac_framework.h> +#endif + +/* + * Description of dynamic rules. + * + * Dynamic rules are stored in lists accessed through a hash table + * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can + * be modified through the sysctl variable dyn_buckets which is + * updated when the table becomes empty. + * + * XXX currently there is only one list, ipfw_dyn. + * + * When a packet is received, its address fields are first masked + * with the mask defined for the rule, then hashed, then matched + * against the entries in the corresponding list. + * Dynamic rules can be used for different purposes: + * + stateful rules; + * + enforcing limits on the number of sessions; + * + in-kernel NAT (not implemented yet) + * + * The lifetime of dynamic rules is regulated by dyn_*_lifetime, + * measured in seconds and depending on the flags. + * + * The total number of dynamic rules is equal to UMA zone items count. + * The max number of dynamic rules is dyn_max. When we reach + * the maximum number of rules we do not create anymore. This is + * done to avoid consuming too much memory, but also too much + * time when searching on each packet (ideally, we should try instead + * to put a limit on the length of the list on each bucket...). + * + * Each dynamic rule holds a pointer to the parent ipfw rule so + * we know what action to perform. Dynamic rules are removed when + * the parent rule is deleted. This can be changed by dyn_keep_states + * sysctl. + * + * There are some limitations with dynamic rules -- we do not + * obey the 'randomized match', and we do not do multiple + * passes through the firewall. XXX check the latter!!! 
+ */ + +struct ipfw_dyn_bucket { + struct mtx mtx; /* Bucket protecting lock */ + ipfw_dyn_rule *head; /* Pointer to first rule */ +}; + +/* + * Static variables followed by global ones + */ +static VNET_DEFINE(struct ipfw_dyn_bucket *, ipfw_dyn_v); +static VNET_DEFINE(u_int32_t, dyn_buckets_max); +static VNET_DEFINE(u_int32_t, curr_dyn_buckets); +static VNET_DEFINE(struct callout, ipfw_timeout); +#define V_ipfw_dyn_v VNET(ipfw_dyn_v) +#define V_dyn_buckets_max VNET(dyn_buckets_max) +#define V_curr_dyn_buckets VNET(curr_dyn_buckets) +#define V_ipfw_timeout VNET(ipfw_timeout) + +static VNET_DEFINE(uma_zone_t, ipfw_dyn_rule_zone); +#define V_ipfw_dyn_rule_zone VNET(ipfw_dyn_rule_zone) + +#define IPFW_BUCK_LOCK_INIT(b) \ + mtx_init(&(b)->mtx, "IPFW dynamic bucket", NULL, MTX_DEF) +#define IPFW_BUCK_LOCK_DESTROY(b) \ + mtx_destroy(&(b)->mtx) +#define IPFW_BUCK_LOCK(i) mtx_lock(&V_ipfw_dyn_v[(i)].mtx) +#define IPFW_BUCK_UNLOCK(i) mtx_unlock(&V_ipfw_dyn_v[(i)].mtx) +#define IPFW_BUCK_ASSERT(i) mtx_assert(&V_ipfw_dyn_v[(i)].mtx, MA_OWNED) + + +static VNET_DEFINE(int, dyn_keep_states); +#define V_dyn_keep_states VNET(dyn_keep_states) + +/* + * Timeouts for various events in handing dynamic rules. + */ +static VNET_DEFINE(u_int32_t, dyn_ack_lifetime); +static VNET_DEFINE(u_int32_t, dyn_syn_lifetime); +static VNET_DEFINE(u_int32_t, dyn_fin_lifetime); +static VNET_DEFINE(u_int32_t, dyn_rst_lifetime); +static VNET_DEFINE(u_int32_t, dyn_udp_lifetime); +static VNET_DEFINE(u_int32_t, dyn_short_lifetime); + +#define V_dyn_ack_lifetime VNET(dyn_ack_lifetime) +#define V_dyn_syn_lifetime VNET(dyn_syn_lifetime) +#define V_dyn_fin_lifetime VNET(dyn_fin_lifetime) +#define V_dyn_rst_lifetime VNET(dyn_rst_lifetime) +#define V_dyn_udp_lifetime VNET(dyn_udp_lifetime) +#define V_dyn_short_lifetime VNET(dyn_short_lifetime) + +/* + * Keepalives are sent if dyn_keepalive is set. They are sent every + * dyn_keepalive_period seconds, in the last dyn_keepalive_interval + * seconds of lifetime of a rule. + * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower + * than dyn_keepalive_period. + */ + +static VNET_DEFINE(u_int32_t, dyn_keepalive_interval); +static VNET_DEFINE(u_int32_t, dyn_keepalive_period); +static VNET_DEFINE(u_int32_t, dyn_keepalive); +static VNET_DEFINE(time_t, dyn_keepalive_last); + +#define V_dyn_keepalive_interval VNET(dyn_keepalive_interval) +#define V_dyn_keepalive_period VNET(dyn_keepalive_period) +#define V_dyn_keepalive VNET(dyn_keepalive) +#define V_dyn_keepalive_last VNET(dyn_keepalive_last) + +static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */ + +#define DYN_COUNT uma_zone_get_cur(V_ipfw_dyn_rule_zone) +#define V_dyn_max VNET(dyn_max) + +/* for userspace, we emulate the uma_zone_counter with ipfw_dyn_count */ +static int ipfw_dyn_count; /* number of objects */ + +#ifdef USERSPACE /* emulation of UMA object counters for userspace */ +#define uma_zone_get_cur(x) ipfw_dyn_count +#endif /* USERSPACE */ + +static int last_log; /* Log ratelimiting */ + +static void ipfw_dyn_tick(void *vnetx); +static void check_dyn_rules(struct ip_fw_chain *, ipfw_range_tlv *, int, int); +#ifdef SYSCTL_NODE + +static int sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS); +static int sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS); + +SYSBEGIN(f2) + +SYSCTL_DECL(_net_inet_ip_fw); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_buckets_max), 0, + "Max number of dyn. 
buckets"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, + CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, + "Current Number of dyn. buckets"); +SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RD, 0, 0, sysctl_ipfw_dyn_count, "IU", + "Number of dyn. rules"); +SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_dyn_max, "IU", + "Max number of dyn. rules"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, + "Lifetime of dyn. rules for acks"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, + "Lifetime of dyn. rules for syn"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, + "Lifetime of dyn. rules for fin"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, + "Lifetime of dyn. rules for rst"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, + "Lifetime of dyn. rules for UDP"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, + "Lifetime of dyn. rules for other situations"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, + "Enable keepalives for dyn. rules"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keep_states, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0, + "Do not flush dynamic states on rule deletion"); + +SYSEND + +#endif /* SYSCTL_NODE */ + + +#ifdef INET6 +static __inline int +hash_packet6(struct ipfw_flow_id *id) +{ + u_int32_t i; + i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ + (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ + (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ + (id->src_ip6.__u6_addr.__u6_addr32[3]) ^ + (id->dst_port) ^ (id->src_port); + return i; +} +#endif + +/* + * IMPORTANT: the hash function for dynamic rules must be commutative + * in source and destination (ip,port), because rules are bidirectional + * and we want to find both in the same bucket. + */ +static __inline int +hash_packet(struct ipfw_flow_id *id, int buckets) +{ + u_int32_t i; + +#ifdef INET6 + if (IS_IP6_FLOW_ID(id)) + i = hash_packet6(id); + else +#endif /* INET6 */ + i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); + i &= (buckets - 1); + return i; +} + +#if 0 +#define DYN_DEBUG(fmt, ...) do { \ + printf("%s: " fmt "\n", __func__, __VA_ARGS__); \ +} while (0) +#else +#define DYN_DEBUG(fmt, ...) +#endif + +static char *default_state_name = "default"; +struct dyn_state_obj { + struct named_object no; + char name[64]; +}; + +#define DYN_STATE_OBJ(ch, cmd) \ + ((struct dyn_state_obj *)SRV_OBJECT(ch, (cmd)->arg1)) +/* + * Classifier callback. + * Return 0 if opcode contains object that should be referenced + * or rewritten. 
+ */ +static int +dyn_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +{ + + DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1); + /* Don't rewrite "check-state any" */ + if (cmd->arg1 == 0 && + cmd->opcode == O_CHECK_STATE) + return (1); + + *puidx = cmd->arg1; + *ptype = 0; + return (0); +} + +static void +dyn_update(ipfw_insn *cmd, uint16_t idx) +{ + + cmd->arg1 = idx; + DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1); +} + +static int +dyn_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, + struct named_object **pno) +{ + ipfw_obj_ntlv *ntlv; + const char *name; + + DYN_DEBUG("uidx %d", ti->uidx); + if (ti->uidx != 0) { + if (ti->tlvs == NULL) + return (EINVAL); + /* Search ntlv in the buffer provided by user */ + ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, + IPFW_TLV_STATE_NAME); + if (ntlv == NULL) + return (EINVAL); + name = ntlv->name; + } else + name = default_state_name; + /* + * Search named object with corresponding name. + * Since states objects are global - ignore the set value + * and use zero instead. + */ + *pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0, + IPFW_TLV_STATE_NAME, name); + /* + * We always return success here. + * The caller will check *pno and mark object as unresolved, + * then it will automatically create "default" object. + */ + return (0); +} + +static struct named_object * +dyn_findbykidx(struct ip_fw_chain *ch, uint16_t idx) +{ + + DYN_DEBUG("kidx %d", idx); + return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx)); +} + +static int +dyn_create(struct ip_fw_chain *ch, struct tid_info *ti, + uint16_t *pkidx) +{ + struct namedobj_instance *ni; + struct dyn_state_obj *obj; + struct named_object *no; + ipfw_obj_ntlv *ntlv; + char *name; + + DYN_DEBUG("uidx %d", ti->uidx); + if (ti->uidx != 0) { + if (ti->tlvs == NULL) + return (EINVAL); + ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, + IPFW_TLV_STATE_NAME); + if (ntlv == NULL) + return (EINVAL); + name = ntlv->name; + } else + name = default_state_name; + + ni = CHAIN_TO_SRV(ch); + obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO); + obj->no.name = obj->name; + obj->no.etlv = IPFW_TLV_STATE_NAME; + strlcpy(obj->name, name, sizeof(obj->name)); + + IPFW_UH_WLOCK(ch); + no = ipfw_objhash_lookup_name_type(ni, 0, + IPFW_TLV_STATE_NAME, name); + if (no != NULL) { + /* + * Object is already created. + * Just return its kidx and bump refcount. 
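dyn_create() above follows the standard named-object life cycle used throughout this import: look the name up first and just bump the refcount on a hit, otherwise allocate a kernel index and insert the new object. A toy registry showing that find-or-create shape, with a fixed-size array in place of the kernel's objhash and invented names:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct state_obj {		/* toy dyn_state_obj */
	char name[64];
	uint16_t kidx;
	int refcnt;
};

static struct state_obj objs[16];
static int nobjs;

/* find-or-create by name; returns the index via *pkidx */
static int
obj_ref(const char *name, uint16_t *pkidx)
{
	int i;

	for (i = 0; i < nobjs; i++)
		if (strcmp(objs[i].name, name) == 0) {
			objs[i].refcnt++;	/* already created */
			*pkidx = objs[i].kidx;
			return (0);
		}
	if (nobjs == 16)
		return (1);		/* kernel returns ENOSPC */
	snprintf(objs[nobjs].name, sizeof(objs[nobjs].name), "%s", name);
	objs[nobjs].kidx = (uint16_t)(nobjs + 1);
	objs[nobjs].refcnt = 1;
	*pkidx = objs[nobjs].kidx;
	nobjs++;
	return (0);
}

int
main(void)
{
	uint16_t k1, k2;

	obj_ref("default", &k1);
	obj_ref("default", &k2);	/* same kidx, refcnt now 2 */
	printf("%u %u refcnt=%d\n", k1, k2, objs[0].refcnt);
	return (0);
}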
+ */ + *pkidx = no->kidx; + no->refcnt++; + IPFW_UH_WUNLOCK(ch); + free(obj, M_IPFW); + DYN_DEBUG("\tfound kidx %d", *pkidx); + return (0); + } + if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) { + DYN_DEBUG("\talloc_idx failed for %s", name); + IPFW_UH_WUNLOCK(ch); + free(obj, M_IPFW); + return (ENOSPC); + } + ipfw_objhash_add(ni, &obj->no); + IPFW_WLOCK(ch); + SRV_OBJECT(ch, obj->no.kidx) = obj; + IPFW_WUNLOCK(ch); + obj->no.refcnt++; + *pkidx = obj->no.kidx; + IPFW_UH_WUNLOCK(ch); + DYN_DEBUG("\tcreated kidx %d", *pkidx); + return (0); +} + +static void +dyn_destroy(struct ip_fw_chain *ch, struct named_object *no) +{ + struct dyn_state_obj *obj; + + IPFW_UH_WLOCK_ASSERT(ch); + + KASSERT(no->refcnt == 1, + ("Destroying object '%s' (type %u, idx %u) with refcnt %u", + no->name, no->etlv, no->kidx, no->refcnt)); + + DYN_DEBUG("kidx %d", no->kidx); + IPFW_WLOCK(ch); + obj = SRV_OBJECT(ch, no->kidx); + SRV_OBJECT(ch, no->kidx) = NULL; + IPFW_WUNLOCK(ch); + ipfw_objhash_del(CHAIN_TO_SRV(ch), no); + ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), no->kidx); + + free(obj, M_IPFW); +} + +static struct opcode_obj_rewrite dyn_opcodes[] = { + { + O_KEEP_STATE, IPFW_TLV_STATE_NAME, + dyn_classify, dyn_update, + dyn_findbyname, dyn_findbykidx, + dyn_create, dyn_destroy + }, + { + O_CHECK_STATE, IPFW_TLV_STATE_NAME, + dyn_classify, dyn_update, + dyn_findbyname, dyn_findbykidx, + dyn_create, dyn_destroy + }, + { + O_PROBE_STATE, IPFW_TLV_STATE_NAME, + dyn_classify, dyn_update, + dyn_findbyname, dyn_findbykidx, + dyn_create, dyn_destroy + }, + { + O_LIMIT, IPFW_TLV_STATE_NAME, + dyn_classify, dyn_update, + dyn_findbyname, dyn_findbykidx, + dyn_create, dyn_destroy + }, +}; +/** + * Print customizable flow id description via log(9) facility. + */ +static void +print_dyn_rule_flags(struct ipfw_flow_id *id, int dyn_type, int log_flags, + char *prefix, char *postfix) +{ + struct in_addr da; +#ifdef INET6 + char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; +#else + char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; +#endif + +#ifdef INET6 + if (IS_IP6_FLOW_ID(id)) { + ip6_sprintf(src, &id->src_ip6); + ip6_sprintf(dst, &id->dst_ip6); + } else +#endif + { + da.s_addr = htonl(id->src_ip); + inet_ntop(AF_INET, &da, src, sizeof(src)); + da.s_addr = htonl(id->dst_ip); + inet_ntop(AF_INET, &da, dst, sizeof(dst)); + } + log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n", + prefix, dyn_type, src, id->src_port, dst, + id->dst_port, DYN_COUNT, postfix); +} + +#define print_dyn_rule(id, dtype, prefix, postfix) \ + print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix) + +#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define TIME_LE(a,b) ((int)((a)-(b)) < 0) + +static void +dyn_update_proto_state(ipfw_dyn_rule *q, const struct ipfw_flow_id *id, + const struct tcphdr *tcp, int dir) +{ + uint32_t ack; + u_char flags; + + if (id->proto == IPPROTO_TCP) { + flags = id->_flags & (TH_FIN | TH_SYN | TH_RST); +#define BOTH_SYN (TH_SYN | (TH_SYN << 8)) +#define BOTH_FIN (TH_FIN | (TH_FIN << 8)) +#define TCP_FLAGS (TH_FLAGS | (TH_FLAGS << 8)) +#define ACK_FWD 0x10000 /* fwd ack seen */ +#define ACK_REV 0x20000 /* rev ack seen */ + + q->state |= (dir == MATCH_FORWARD) ? 
flags : (flags << 8); + switch (q->state & TCP_FLAGS) { + case TH_SYN: /* opening */ + q->expire = time_uptime + V_dyn_syn_lifetime; + break; + + case BOTH_SYN: /* move to established */ + case BOTH_SYN | TH_FIN: /* one side tries to close */ + case BOTH_SYN | (TH_FIN << 8): +#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) + if (tcp == NULL) + break; + + ack = ntohl(tcp->th_ack); + if (dir == MATCH_FORWARD) { + if (q->ack_fwd == 0 || + _SEQ_GE(ack, q->ack_fwd)) { + q->ack_fwd = ack; + q->state |= ACK_FWD; + } + } else { + if (q->ack_rev == 0 || + _SEQ_GE(ack, q->ack_rev)) { + q->ack_rev = ack; + q->state |= ACK_REV; + } + } + if ((q->state & (ACK_FWD | ACK_REV)) == + (ACK_FWD | ACK_REV)) { + q->expire = time_uptime + V_dyn_ack_lifetime; + q->state &= ~(ACK_FWD | ACK_REV); + } + break; + + case BOTH_SYN | BOTH_FIN: /* both sides closed */ + if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) + V_dyn_fin_lifetime = + V_dyn_keepalive_period - 1; + q->expire = time_uptime + V_dyn_fin_lifetime; + break; + + default: +#if 0 + /* + * reset or some invalid combination, but can also + * occur if we use keep-state the wrong way. + */ + if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) + printf("invalid state: 0x%x\n", q->state); +#endif + if (V_dyn_rst_lifetime >= V_dyn_keepalive_period) + V_dyn_rst_lifetime = + V_dyn_keepalive_period - 1; + q->expire = time_uptime + V_dyn_rst_lifetime; + break; + } + } else if (id->proto == IPPROTO_UDP) { + q->expire = time_uptime + V_dyn_udp_lifetime; + } else { + /* other protocols */ + q->expire = time_uptime + V_dyn_short_lifetime; + } +} + +/* + * Lookup a dynamic rule, locked version. + */ +static ipfw_dyn_rule * +lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int i, int *match_direction, + struct tcphdr *tcp, uint16_t kidx) +{ + /* + * Stateful ipfw extensions. + * Lookup into dynamic session queue. 
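dyn_update_proto_state() tracks a TCP session by folding forward-direction flags into the low byte of q->state and reverse-direction flags into the next byte, so one switch can tell opening, established, half-closed and closed sessions apart. A reduced model of that folding, using the real tcp.h flag values but a toy direction encoding, and ignoring the lifetime and ACK bookkeeping:

#include <stdint.h>
#include <stdio.h>

#define TH_FIN	0x01		/* as in netinet/tcp.h */
#define TH_SYN	0x02
#define DIR_FWD	0		/* toy direction encoding */
#define DIR_REV	1
#define BOTH_SYN (TH_SYN | (TH_SYN << 8))
#define BOTH_FIN (TH_FIN | (TH_FIN << 8))

static const char *
tcp_session_phase(uint32_t *state, uint8_t flags, int dir)
{
	*state |= (dir == DIR_FWD) ? flags : (uint32_t)flags << 8;
	if ((*state & BOTH_SYN) != BOTH_SYN)
		return ("opening");	/* only one SYN seen */
	if ((*state & BOTH_FIN) == BOTH_FIN)
		return ("closed");	/* both sides sent FIN */
	if (*state & BOTH_FIN)
		return ("half-closed");
	return ("established");
}

int
main(void)
{
	uint32_t state = 0;

	printf("%s\n", tcp_session_phase(&state, TH_SYN, DIR_FWD));
	printf("%s\n", tcp_session_phase(&state, TH_SYN, DIR_REV));
	printf("%s\n", tcp_session_phase(&state, TH_FIN, DIR_REV));
	printf("%s\n", tcp_session_phase(&state, TH_FIN, DIR_FWD));
	return (0);
}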
+ */ + ipfw_dyn_rule *prev, *q = NULL; + int dir; + + IPFW_BUCK_ASSERT(i); + + dir = MATCH_NONE; + for (prev = NULL, q = V_ipfw_dyn_v[i].head; q; prev = q, q = q->next) { + if (q->dyn_type == O_LIMIT_PARENT) + continue; + + if (pkt->proto != q->id.proto) + continue; + + if (kidx != 0 && kidx != q->kidx) + continue; + + if (IS_IP6_FLOW_ID(pkt)) { + if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.src_ip6) && + IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.dst_ip6) && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port) { + dir = MATCH_FORWARD; + break; + } + if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.dst_ip6) && + IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.src_ip6) && + pkt->src_port == q->id.dst_port && + pkt->dst_port == q->id.src_port) { + dir = MATCH_REVERSE; + break; + } + } else { + if (pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port) { + dir = MATCH_FORWARD; + break; + } + if (pkt->src_ip == q->id.dst_ip && + pkt->dst_ip == q->id.src_ip && + pkt->src_port == q->id.dst_port && + pkt->dst_port == q->id.src_port) { + dir = MATCH_REVERSE; + break; + } + } + } + if (q == NULL) + goto done; /* q = NULL, not found */ + + if (prev != NULL) { /* found and not in front */ + prev->next = q->next; + q->next = V_ipfw_dyn_v[i].head; + V_ipfw_dyn_v[i].head = q; + } + + /* update state according to flags */ + dyn_update_proto_state(q, pkt, tcp, dir); +done: + if (match_direction != NULL) + *match_direction = dir; + return (q); +} + +ipfw_dyn_rule * +ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, + struct tcphdr *tcp, uint16_t kidx) +{ + ipfw_dyn_rule *q; + int i; + + i = hash_packet(pkt, V_curr_dyn_buckets); + + IPFW_BUCK_LOCK(i); + q = lookup_dyn_rule_locked(pkt, i, match_direction, tcp, kidx); + if (q == NULL) + IPFW_BUCK_UNLOCK(i); + /* NB: return table locked when q is not NULL */ + return q; +} + +/* + * Unlock bucket mtx + * @p - pointer to dynamic rule + */ +void +ipfw_dyn_unlock(ipfw_dyn_rule *q) +{ + + IPFW_BUCK_UNLOCK(q->bucket); +} + +static int +resize_dynamic_table(struct ip_fw_chain *chain, int nbuckets) +{ + int i, k, nbuckets_old; + ipfw_dyn_rule *q; + struct ipfw_dyn_bucket *dyn_v, *dyn_v_old; + + /* Check if given number is power of 2 and less than 64k */ + if ((nbuckets > 65536) || (!powerof2(nbuckets))) + return 1; + + CTR3(KTR_NET, "%s: resize dynamic hash: %d -> %d", __func__, + V_curr_dyn_buckets, nbuckets); + + /* Allocate and initialize new hash */ + dyn_v = malloc(nbuckets * sizeof(*dyn_v), M_IPFW, + M_WAITOK | M_ZERO); + + for (i = 0 ; i < nbuckets; i++) + IPFW_BUCK_LOCK_INIT(&dyn_v[i]); + + /* + * Call upper half lock, as get_map() do to ease + * read-only access to dynamic rules hash from sysctl + */ + IPFW_UH_WLOCK(chain); + + /* + * Acquire chain write lock to permit hash access + * for main traffic path without additional locks + */ + IPFW_WLOCK(chain); + + /* Save old values */ + nbuckets_old = V_curr_dyn_buckets; + dyn_v_old = V_ipfw_dyn_v; + + /* Skip relinking if array is not set up */ + if (V_ipfw_dyn_v == NULL) + V_curr_dyn_buckets = 0; + + /* Re-link all dynamic states */ + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { + while (V_ipfw_dyn_v[i].head != NULL) { + /* Remove from current chain */ + q = V_ipfw_dyn_v[i].head; + V_ipfw_dyn_v[i].head = q->next; + + /* Get new hash value */ + k = hash_packet(&q->id, nbuckets); + q->bucket = k; + /* Add to the new head */ + q->next = dyn_v[k].head; + dyn_v[k].head = q; + } + } + + /* Update current 
pointers/buckets values */ + V_curr_dyn_buckets = nbuckets; + V_ipfw_dyn_v = dyn_v; + + IPFW_WUNLOCK(chain); + + IPFW_UH_WUNLOCK(chain); + + /* Start periodic callout on initial creation */ + if (dyn_v_old == NULL) { + callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, curvnet, 0); + return (0); + } + + /* Destroy all mutexes */ + for (i = 0 ; i < nbuckets_old ; i++) + IPFW_BUCK_LOCK_DESTROY(&dyn_v_old[i]); + + /* Free old hash */ + free(dyn_v_old, M_IPFW); + + return 0; +} + +/** + * Install state of type 'type' for a dynamic session. + * The hash table contains two type of rules: + * - regular rules (O_KEEP_STATE) + * - rules for sessions with limited number of sess per user + * (O_LIMIT). When they are created, the parent is + * increased by 1, and decreased on delete. In this case, + * the third parameter is the parent rule and not the chain. + * - "parent" rules for the above (O_LIMIT_PARENT). + */ +static ipfw_dyn_rule * +add_dyn_rule(struct ipfw_flow_id *id, int i, uint8_t dyn_type, + struct ip_fw *rule, uint16_t kidx) +{ + ipfw_dyn_rule *r; + + IPFW_BUCK_ASSERT(i); + + r = uma_zalloc(V_ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); + if (r == NULL) { + if (last_log != time_uptime) { + last_log = time_uptime; + log(LOG_DEBUG, + "ipfw: Cannot allocate dynamic state, " + "consider increasing net.inet.ip.fw.dyn_max\n"); + } + return NULL; + } + ipfw_dyn_count++; + + /* + * refcount on parent is already incremented, so + * it is safe to use parent unlocked. + */ + if (dyn_type == O_LIMIT) { + ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; + if ( parent->dyn_type != O_LIMIT_PARENT) + panic("invalid parent"); + r->parent = parent; + rule = parent->rule; + } + + r->id = *id; + r->expire = time_uptime + V_dyn_syn_lifetime; + r->rule = rule; + r->dyn_type = dyn_type; + IPFW_ZERO_DYN_COUNTER(r); + r->count = 0; + r->kidx = kidx; + r->bucket = i; + r->next = V_ipfw_dyn_v[i].head; + V_ipfw_dyn_v[i].head = r; + DEB(print_dyn_rule(id, dyn_type, "add dyn entry", "total");) + return r; +} + +/** + * lookup dynamic parent rule using pkt and rule as search keys. + * If the lookup fails, then install one. + */ +static ipfw_dyn_rule * +lookup_dyn_parent(struct ipfw_flow_id *pkt, int *pindex, struct ip_fw *rule, + uint16_t kidx) +{ + ipfw_dyn_rule *q; + int i, is_v6; + + is_v6 = IS_IP6_FLOW_ID(pkt); + i = hash_packet( pkt, V_curr_dyn_buckets ); + *pindex = i; + IPFW_BUCK_LOCK(i); + for (q = V_ipfw_dyn_v[i].head ; q != NULL ; q=q->next) + if (q->dyn_type == O_LIMIT_PARENT && + kidx == q->kidx && + rule == q->rule && + pkt->proto == q->id.proto && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port && + ( + (is_v6 && + IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), + &(q->id.src_ip6)) && + IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), + &(q->id.dst_ip6))) || + (!is_v6 && + pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip) + ) + ) { + q->expire = time_uptime + V_dyn_short_lifetime; + DEB(print_dyn_rule(pkt, q->dyn_type, + "lookup_dyn_parent found", "");) + return q; + } + + /* Add virtual limiting rule */ + return add_dyn_rule(pkt, i, O_LIMIT_PARENT, rule, kidx); +} + +/** + * Install dynamic state for rule type cmd->o.opcode + * + * Returns 1 (failure) if state is not installed because of errors or because + * session limitations are enforced. 
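resize_dynamic_table() above depends on bucket counts being powers of two: relinking a state into the grown table is just hash & (nbuckets - 1). The kernel recomputes hash_packet() per entry; the sketch below stores the full hash with each toy entry instead, which is structurally the same relink:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct state {
	uint32_t hash;		/* full hash kept with the entry */
	struct state *next;
};

/* move every entry from src[sbuckets] into dst[dbuckets] */
static void
relink(struct state **src, int sbuckets, struct state **dst, int dbuckets)
{
	struct state *q;
	int i, k;

	for (i = 0; i < sbuckets; i++)
		while ((q = src[i]) != NULL) {
			src[i] = q->next;		/* unlink old head */
			k = q->hash & (dbuckets - 1);	/* new bucket */
			q->next = dst[k];		/* push new head */
			dst[k] = q;
		}
}

int
main(void)
{
	struct state a = { 0x12345678, NULL };
	struct state *src[4] = { NULL }, *dst[16] = { NULL };

	src[a.hash & 3] = &a;
	relink(src, 4, dst, 16);
	assert(dst[a.hash & 15] == &a);
	printf("moved to bucket %u\n", (unsigned)(a.hash & 15));
	return (0);
}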
+ */ +int +ipfw_install_state(struct ip_fw_chain *chain, struct ip_fw *rule, + ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg) +{ + ipfw_dyn_rule *q; + int i; + + DEB(print_dyn_rule(&args->f_id, cmd->o.opcode, "install_state", + (cmd->o.arg1 == 0 ? "": DYN_STATE_OBJ(chain, &cmd->o)->name));) + + i = hash_packet(&args->f_id, V_curr_dyn_buckets); + + IPFW_BUCK_LOCK(i); + + q = lookup_dyn_rule_locked(&args->f_id, i, NULL, NULL, cmd->o.arg1); + if (q != NULL) { /* should never occur */ + DEB( + if (last_log != time_uptime) { + last_log = time_uptime; + printf("ipfw: %s: entry already present, done\n", + __func__); + }) + IPFW_BUCK_UNLOCK(i); + return (0); + } + + /* + * State limiting is done via uma(9) zone limiting. + * Save pointer to newly-installed rule and reject + * packet if add_dyn_rule() returned NULL. + * Note q is currently set to NULL. + */ + + switch (cmd->o.opcode) { + case O_KEEP_STATE: /* bidir rule */ + q = add_dyn_rule(&args->f_id, i, O_KEEP_STATE, rule, + cmd->o.arg1); + break; + + case O_LIMIT: { /* limit number of sessions */ + struct ipfw_flow_id id; + ipfw_dyn_rule *parent; + uint32_t conn_limit; + uint16_t limit_mask = cmd->limit_mask; + int pindex; + + conn_limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit); + + DEB( + if (cmd->conn_limit == IP_FW_TARG) + printf("ipfw: %s: O_LIMIT rule, conn_limit: %u " + "(tablearg)\n", __func__, conn_limit); + else + printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n", + __func__, conn_limit); + ) + + id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0; + id.proto = args->f_id.proto; + id.addr_type = args->f_id.addr_type; + id.fib = M_GETFIB(args->m); + + if (IS_IP6_FLOW_ID (&(args->f_id))) { + bzero(&id.src_ip6, sizeof(id.src_ip6)); + bzero(&id.dst_ip6, sizeof(id.dst_ip6)); + + if (limit_mask & DYN_SRC_ADDR) + id.src_ip6 = args->f_id.src_ip6; + if (limit_mask & DYN_DST_ADDR) + id.dst_ip6 = args->f_id.dst_ip6; + } else { + if (limit_mask & DYN_SRC_ADDR) + id.src_ip = args->f_id.src_ip; + if (limit_mask & DYN_DST_ADDR) + id.dst_ip = args->f_id.dst_ip; + } + if (limit_mask & DYN_SRC_PORT) + id.src_port = args->f_id.src_port; + if (limit_mask & DYN_DST_PORT) + id.dst_port = args->f_id.dst_port; + + /* + * We have to release lock for previous bucket to + * avoid possible deadlock + */ + IPFW_BUCK_UNLOCK(i); + + parent = lookup_dyn_parent(&id, &pindex, rule, cmd->o.arg1); + if (parent == NULL) { + printf("ipfw: %s: add parent failed\n", __func__); + IPFW_BUCK_UNLOCK(pindex); + return (1); + } + + if (parent->count >= conn_limit) { + if (V_fw_verbose && last_log != time_uptime) { + last_log = time_uptime; + char sbuf[24]; + last_log = time_uptime; + snprintf(sbuf, sizeof(sbuf), + "%d drop session", + parent->rule->rulenum); + print_dyn_rule_flags(&args->f_id, + cmd->o.opcode, + LOG_SECURITY | LOG_DEBUG, + sbuf, "too many entries"); + } + IPFW_BUCK_UNLOCK(pindex); + return (1); + } + /* Increment counter on parent */ + parent->count++; + IPFW_BUCK_UNLOCK(pindex); + + IPFW_BUCK_LOCK(i); + q = add_dyn_rule(&args->f_id, i, O_LIMIT, + (struct ip_fw *)parent, cmd->o.arg1); + if (q == NULL) { + /* Decrement index and notify caller */ + IPFW_BUCK_UNLOCK(i); + IPFW_BUCK_LOCK(pindex); + parent->count--; + IPFW_BUCK_UNLOCK(pindex); + return (1); + } + break; + } + default: + printf("ipfw: %s: unknown dynamic rule type %u\n", + __func__, cmd->o.opcode); + } + + if (q == NULL) { + IPFW_BUCK_UNLOCK(i); + return (1); /* Notify caller about failure */ + } + + dyn_update_proto_state(q, &args->f_id, NULL, MATCH_FORWARD); + 
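The O_LIMIT branch above derives the parent key by zeroing a flow id and copying back only the fields selected by limit_mask, so for example 'limit src-addr N' folds all sessions from one source onto a single parent counter. A compact sketch of that key derivation; the types are toys and the mask bit values are chosen for illustration, not taken from ip_fw.h:

#include <stdint.h>
#include <stdio.h>

#define DYN_SRC_ADDR	0x1	/* illustrative bit assignments */
#define DYN_DST_ADDR	0x2
#define DYN_SRC_PORT	0x4
#define DYN_DST_PORT	0x8

struct flow {			/* toy ipfw_flow_id */
	uint32_t src_ip, dst_ip;
	uint16_t src_port, dst_port;
};

/* derive the limit key: keep only the masked fields */
static struct flow
limit_key(const struct flow *f, unsigned mask)
{
	struct flow id = { 0, 0, 0, 0 };

	if (mask & DYN_SRC_ADDR) id.src_ip = f->src_ip;
	if (mask & DYN_DST_ADDR) id.dst_ip = f->dst_ip;
	if (mask & DYN_SRC_PORT) id.src_port = f->src_port;
	if (mask & DYN_DST_PORT) id.dst_port = f->dst_port;
	return (id);
}

int
main(void)
{
	struct flow f = { 0x0a000001, 0x0a000002, 1234, 80 };
	struct flow k = limit_key(&f, DYN_SRC_ADDR);

	/* every flow from 10.0.0.1 maps to the same parent key */
	printf("%08x %08x %u %u\n", k.src_ip, k.dst_ip,
	    k.src_port, k.dst_port);
	return (0);
}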
IPFW_BUCK_UNLOCK(i); + return (0); +} + +/* + * Generate a TCP packet, containing either a RST or a keepalive. + * When flags & TH_RST, we are sending a RST packet, because of a + * "reset" action matched the packet. + * Otherwise we are sending a keepalive, and flags & TH_ + * The 'replyto' mbuf is the mbuf being replied to, if any, and is required + * so that MAC can label the reply appropriately. + */ +struct mbuf * +ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, + u_int32_t ack, int flags) +{ + struct mbuf *m = NULL; /* stupid compiler */ + int len, dir; + struct ip *h = NULL; /* stupid compiler */ +#ifdef INET6 + struct ip6_hdr *h6 = NULL; +#endif + struct tcphdr *th = NULL; + + MGETHDR(m, M_NOWAIT, MT_DATA); + if (m == NULL) + return (NULL); + + M_SETFIB(m, id->fib); +#ifdef MAC + if (replyto != NULL) + mac_netinet_firewall_reply(replyto, m); + else + mac_netinet_firewall_send(m); +#else + (void)replyto; /* don't warn about unused arg */ +#endif + + switch (id->addr_type) { + case 4: + len = sizeof(struct ip) + sizeof(struct tcphdr); + break; +#ifdef INET6 + case 6: + len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + break; +#endif + default: + /* XXX: log me?!? */ + FREE_PKT(m); + return (NULL); + } + dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); + + m->m_data += max_linkhdr; + m->m_flags |= M_SKIP_FIREWALL; + m->m_pkthdr.len = m->m_len = len; + m->m_pkthdr.rcvif = NULL; + bzero(m->m_data, len); + + switch (id->addr_type) { + case 4: + h = mtod(m, struct ip *); + + /* prepare for checksum */ + h->ip_p = IPPROTO_TCP; + h->ip_len = htons(sizeof(struct tcphdr)); + if (dir) { + h->ip_src.s_addr = htonl(id->src_ip); + h->ip_dst.s_addr = htonl(id->dst_ip); + } else { + h->ip_src.s_addr = htonl(id->dst_ip); + h->ip_dst.s_addr = htonl(id->src_ip); + } + + th = (struct tcphdr *)(h + 1); + break; +#ifdef INET6 + case 6: + h6 = mtod(m, struct ip6_hdr *); + + /* prepare for checksum */ + h6->ip6_nxt = IPPROTO_TCP; + h6->ip6_plen = htons(sizeof(struct tcphdr)); + if (dir) { + h6->ip6_src = id->src_ip6; + h6->ip6_dst = id->dst_ip6; + } else { + h6->ip6_src = id->dst_ip6; + h6->ip6_dst = id->src_ip6; + } + + th = (struct tcphdr *)(h6 + 1); + break; +#endif + } + + if (dir) { + th->th_sport = htons(id->src_port); + th->th_dport = htons(id->dst_port); + } else { + th->th_sport = htons(id->dst_port); + th->th_dport = htons(id->src_port); + } + th->th_off = sizeof(struct tcphdr) >> 2; + + if (flags & TH_RST) { + if (flags & TH_ACK) { + th->th_seq = htonl(ack); + th->th_flags = TH_RST; + } else { + if (flags & TH_SYN) + seq++; + th->th_ack = htonl(seq); + th->th_flags = TH_RST | TH_ACK; + } + } else { + /* + * Keepalive - use caller provided sequence numbers + */ + th->th_seq = htonl(seq); + th->th_ack = htonl(ack); + th->th_flags = TH_ACK; + } + + switch (id->addr_type) { + case 4: + th->th_sum = in_cksum(m, len); + + /* finish the ip header */ + h->ip_v = 4; + h->ip_hl = sizeof(*h) >> 2; + h->ip_tos = IPTOS_LOWDELAY; + h->ip_off = htons(0); + h->ip_len = htons(len); + h->ip_ttl = V_ip_defttl; + h->ip_sum = 0; + break; +#ifdef INET6 + case 6: + th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), + sizeof(struct tcphdr)); + + /* finish the ip6 header */ + h6->ip6_vfc |= IPV6_VERSION; + h6->ip6_hlim = IPV6_DEFHLIM; + break; +#endif + } + + return (m); +} + +/* + * Queue keepalive packets for given dynamic rule + */ +static struct mbuf ** +ipfw_dyn_send_ka(struct mbuf **mtailp, ipfw_dyn_rule *q) +{ + struct mbuf *m_rev, *m_fwd; + + m_rev = (q->state & ACK_REV) ? 
NULL : + ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN); + m_fwd = (q->state & ACK_FWD) ? NULL : + ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1, q->ack_rev, 0); + + if (m_rev != NULL) { + *mtailp = m_rev; + mtailp = &(*mtailp)->m_nextpkt; + } + if (m_fwd != NULL) { + *mtailp = m_fwd; + mtailp = &(*mtailp)->m_nextpkt; + } + + return (mtailp); +} + +/* + * This procedure is used to perform various maintenance + * on dynamic hash list. Currently it is called every second. + */ +static void +ipfw_dyn_tick(void * vnetx) +{ + struct ip_fw_chain *chain; + int check_ka = 0; +#ifdef VIMAGE + struct vnet *vp = vnetx; +#endif + + CURVNET_SET(vp); + + chain = &V_layer3_chain; + + /* Run keepalive checks every keepalive_period iff ka is enabled */ + if ((V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) && + (V_dyn_keepalive != 0)) { + V_dyn_keepalive_last = time_uptime; + check_ka = 1; + } + + check_dyn_rules(chain, NULL, check_ka, 1); + + callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, vnetx, 0); + + CURVNET_RESTORE(); +} + + +/* + * Walk through all dynamic states doing generic maintenance: + * 1) free expired states + * 2) free all states based on deleted rule / set + * 3) send keepalives for states if needed + * + * @chain - pointer to current ipfw rules chain + * @rule - delete all states originated by given rule if != NULL + * @set - delete all states originated by any rule in set @set if != RESVD_SET + * @check_ka - perform checking/sending keepalives + * @timer - indicate call from timer routine. + * + * Timer routine must call this function unlocked to permit + * sending keepalives/resizing table. + * + * Others has to call function with IPFW_UH_WLOCK held. + * Additionally, function assume that dynamic rule/set is + * ALREADY deleted so no new states can be generated by + * 'deleted' rules. + * + * Write lock is needed to ensure that unused parent rules + * are not freed by other instance (see stage 2, 3) + */ +static void +check_dyn_rules(struct ip_fw_chain *chain, ipfw_range_tlv *rt, + int check_ka, int timer) +{ + struct mbuf *m0, *m, *mnext, **mtailp; + struct ip *h; + int i, dyn_count, new_buckets = 0, max_buckets; + int expired = 0, expired_limits = 0, parents = 0, total = 0; + ipfw_dyn_rule *q, *q_prev, *q_next; + ipfw_dyn_rule *exp_head, **exptailp; + ipfw_dyn_rule *exp_lhead, **expltailp; + + KASSERT(V_ipfw_dyn_v != NULL, ("%s: dynamic table not allocated", + __func__)); + + /* Avoid possible LOR */ + KASSERT(!check_ka || timer, ("%s: keepalive check with lock held", + __func__)); + + /* + * Do not perform any checks if we currently have no dynamic states + */ + if (DYN_COUNT == 0) + return; + + /* Expired states */ + exp_head = NULL; + exptailp = &exp_head; + + /* Expired limit states */ + exp_lhead = NULL; + expltailp = &exp_lhead; + + /* + * We make a chain of packets to go out here -- not deferring + * until after we drop the IPFW dynamic rule lock would result + * in a lock order reversal with the normal packet input -> ipfw + * call stack. 
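Keepalive generation is throttled twice: ipfw_dyn_tick() enables the check only once per dyn_keepalive_period seconds, and check_dyn_rules() then selects just the established states whose expiry falls within the next dyn_keepalive_interval seconds, using the wrap-safe TIME_LEQ() comparison defined earlier in this file. A standalone check of that window test:

#include <assert.h>
#include <stdint.h>

/* wrap-safe comparison, as in the TIME_LEQ macro above */
#define TIME_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)

static int
needs_keepalive(uint32_t expire, uint32_t now, uint32_t interval)
{
	return (TIME_LEQ(expire, now + interval));
}

int
main(void)
{
	assert(needs_keepalive(105, 100, 20) == 1);	/* expiring soon */
	assert(needs_keepalive(300, 100, 20) == 0);	/* plenty of time */
	/* still correct when the tick counter wraps around */
	assert(needs_keepalive(2, 0xfffffff0u, 20) == 1);
	return (0);
}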
+ */ + m0 = NULL; + mtailp = &m0; + + /* Protect from hash resizing */ + if (timer != 0) + IPFW_UH_WLOCK(chain); + else + IPFW_UH_WLOCK_ASSERT(chain); + +#define NEXT_RULE() { q_prev = q; q = q->next ; continue; } + + /* Stage 1: perform requested deletion */ + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { + IPFW_BUCK_LOCK(i); + for (q = V_ipfw_dyn_v[i].head, q_prev = q; q ; ) { + /* account every rule */ + total++; + + /* Skip parent rules entirely */ + if (q->dyn_type == O_LIMIT_PARENT) { + parents++; + NEXT_RULE(); + } + + /* + * Remove rules which are: + * 1) expired + * 2) matching the deletion range + */ + if ((TIME_LEQ(q->expire, time_uptime)) || + (rt != NULL && ipfw_match_range(q->rule, rt))) { + if (TIME_LE(time_uptime, q->expire) && + q->dyn_type == O_KEEP_STATE && + V_dyn_keep_states != 0) { + /* + * Do not delete state if + * it is not expired and + * dyn_keep_states is ON. + * However we need to re-link it + * to any other stable rule + */ + q->rule = chain->default_rule; + NEXT_RULE(); + } + + /* Unlink q from current list */ + q_next = q->next; + if (q == V_ipfw_dyn_v[i].head) + V_ipfw_dyn_v[i].head = q_next; + else + q_prev->next = q_next; + + q->next = NULL; + + /* queue q to expire list */ + if (q->dyn_type != O_LIMIT) { + *exptailp = q; + exptailp = &(*exptailp)->next; + DEB(print_dyn_rule(&q->id, q->dyn_type, + "unlink entry", "left"); + ) + } else { + /* Separate list for limit rules */ + *expltailp = q; + expltailp = &(*expltailp)->next; + expired_limits++; + DEB(print_dyn_rule(&q->id, q->dyn_type, + "unlink limit entry", "left"); + ) + } + + q = q_next; + expired++; + continue; + } + + /* + * Check if we need to send keepalive: + * we need to ensure it is time to do KA, + * this is established TCP session, and + * expire time is within keepalive interval + */ + if ((check_ka != 0) && (q->id.proto == IPPROTO_TCP) && + ((q->state & BOTH_SYN) == BOTH_SYN) && + (TIME_LEQ(q->expire, time_uptime + + V_dyn_keepalive_interval))) + mtailp = ipfw_dyn_send_ka(mtailp, q); + + NEXT_RULE(); + } + IPFW_BUCK_UNLOCK(i); + } + + /* Stage 2: decrement counters from O_LIMIT parents */ + if (expired_limits != 0) { + /* + * XXX: Note that deleting a set with more than one + * heavily-used LIMIT rule can result in overwhelming + * locking due to lack of per-hash value sorting + * + * We should probably think about: + * 1) pre-allocating hash of size, say, + * MAX(16, V_curr_dyn_buckets / 1024) + * 2) checking if expired_limits is large enough + * 3) If yes, init hash (or its part), re-link + * current list and start decrementing procedure in + * each bucket separately + */ + + /* + * Small optimization: do not unlock bucket until + * we see the next item resides in a different bucket + */ + if (exp_lhead != NULL) { + i = exp_lhead->parent->bucket; + IPFW_BUCK_LOCK(i); + } + for (q = exp_lhead; q != NULL; q = q->next) { + if (i != q->parent->bucket) { + IPFW_BUCK_UNLOCK(i); + i = q->parent->bucket; + IPFW_BUCK_LOCK(i); + } + + /* Decrease parent refcount */ + q->parent->count--; + } + if (exp_lhead != NULL) + IPFW_BUCK_UNLOCK(i); + } + + /* + * We protect ourselves from unused parent deletion + * (from the timer function) by holding UH write lock. 
+
+	/* Stage 3: remove unused parent rules */
+	if ((parents != 0) && (expired != 0)) {
+		for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+			IPFW_BUCK_LOCK(i);
+			for (q = V_ipfw_dyn_v[i].head, q_prev = q ; q ; ) {
+				if (q->dyn_type != O_LIMIT_PARENT)
+					NEXT_RULE();
+
+				if (q->count != 0)
+					NEXT_RULE();
+
+				/* Parent rule without consumers */
+
+				/* Unlink q from current list */
+				q_next = q->next;
+				if (q == V_ipfw_dyn_v[i].head)
+					V_ipfw_dyn_v[i].head = q_next;
+				else
+					q_prev->next = q_next;
+
+				q->next = NULL;
+
+				/* Add to expired list */
+				*exptailp = q;
+				exptailp = &(*exptailp)->next;
+
+				DEB(print_dyn_rule(&q->id, q->dyn_type,
+				    "unlink parent entry", "left");
+				)
+
+				expired++;
+
+				q = q_next;
+			}
+			IPFW_BUCK_UNLOCK(i);
+		}
+	}
+
+#undef NEXT_RULE
+
+	if (timer != 0) {
+		/*
+		 * Check if we need to resize the hash:
+		 * if the current number of states exceeds twice the number
+		 * of buckets in the hash, grow the hash size to the minimum
+		 * power of 2 which is bigger than the current state count.
+		 * Limit hash size to 64k.
+		 */
+		max_buckets = (V_dyn_buckets_max > 65536) ?
+		    65536 : V_dyn_buckets_max;
+
+		dyn_count = DYN_COUNT;
+
+		if ((dyn_count > V_curr_dyn_buckets * 2) &&
+		    (dyn_count < max_buckets)) {
+			new_buckets = V_curr_dyn_buckets;
+			while (new_buckets < dyn_count) {
+				new_buckets *= 2;
+
+				if (new_buckets >= max_buckets)
+					break;
+			}
+		}
+
+		IPFW_UH_WUNLOCK(chain);
+	}
+
+	/* Finally delete old states and limits, if any */
+	for (q = exp_head; q != NULL; q = q_next) {
+		q_next = q->next;
+		uma_zfree(V_ipfw_dyn_rule_zone, q);
+		ipfw_dyn_count--;
+	}
+
+	for (q = exp_lhead; q != NULL; q = q_next) {
+		q_next = q->next;
+		uma_zfree(V_ipfw_dyn_rule_zone, q);
+		ipfw_dyn_count--;
+	}
+
+	/*
+	 * The rest of this code MUST be called from the timer routine only,
+	 * without holding any locks.
+	 */
+	if (timer == 0)
+		return;
+
+	/* Send keepalive packets if any */
+	for (m = m0; m != NULL; m = mnext) {
+		mnext = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		h = mtod(m, struct ip *);
+		if (h->ip_v == 4)
+			ip_output(m, NULL, NULL, 0, NULL, NULL);
+#ifdef INET6
+		else
+			ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
+#endif
+	}
+
+	/* Run table resize without holding any locks */
+	if (new_buckets != 0)
+		resize_dynamic_table(chain, new_buckets);
+}
+
+/*
+ * Deletes all dynamic states originated by rules in the matched range.
+ *
+ * @chain - pointer to current ipfw rules chain
+ * @rt - delete all states originated by rules in matched range.
+ *
+ * Function has to be called with IPFW_UH_WLOCK held.
+ * Additionally, the function assumes that the dynamic rule/set is
+ * ALREADY deleted so no new states can be generated by
+ * 'deleted' rules.
+ */
+void
+ipfw_expire_dyn_rules(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
+{
+
+	check_dyn_rules(chain, rt, 0, 0);
+}
+
+/*
+ * Check if rule contains at least one dynamic opcode.
+ *
+ * Returns 1 if such opcode is found, 0 otherwise.
+ */ +int +ipfw_is_dyn_rule(struct ip_fw *rule) +{ + int cmdlen, l; + ipfw_insn *cmd; + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + switch (cmd->opcode) { + case O_LIMIT: + case O_KEEP_STATE: + case O_PROBE_STATE: + case O_CHECK_STATE: + return (1); + } + } + + return (0); +} + +void +ipfw_dyn_init(struct ip_fw_chain *chain) +{ + + V_ipfw_dyn_v = NULL; + V_dyn_buckets_max = 256; /* must be power of 2 */ + V_curr_dyn_buckets = 256; /* must be power of 2 */ + + V_dyn_ack_lifetime = 300; + V_dyn_syn_lifetime = 20; + V_dyn_fin_lifetime = 1; + V_dyn_rst_lifetime = 1; + V_dyn_udp_lifetime = 10; + V_dyn_short_lifetime = 5; + + V_dyn_keepalive_interval = 20; + V_dyn_keepalive_period = 5; + V_dyn_keepalive = 1; /* do send keepalives */ + V_dyn_keepalive_last = time_uptime; + + V_dyn_max = 16384; /* max # of dynamic rules */ + + V_ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", + sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + + /* Enforce limit on dynamic rules */ + uma_zone_set_max(V_ipfw_dyn_rule_zone, V_dyn_max); + + callout_init(&V_ipfw_timeout, 1); + + /* + * This can potentially be done on first dynamic rule + * being added to chain. + */ + resize_dynamic_table(chain, V_curr_dyn_buckets); + IPFW_ADD_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes); +} + +void +ipfw_dyn_uninit(int pass) +{ + int i; + + if (pass == 0) { + callout_drain(&V_ipfw_timeout); + return; + } + IPFW_DEL_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes); + + if (V_ipfw_dyn_v != NULL) { + /* + * Skip deleting all dynamic states - + * uma_zdestroy() does this more efficiently; + */ + + /* Destroy all mutexes */ + for (i = 0 ; i < V_curr_dyn_buckets ; i++) + IPFW_BUCK_LOCK_DESTROY(&V_ipfw_dyn_v[i]); + free(V_ipfw_dyn_v, M_IPFW); + V_ipfw_dyn_v = NULL; + } + + uma_zdestroy(V_ipfw_dyn_rule_zone); +} + +#ifdef SYSCTL_NODE +/* + * Get/set maximum number of dynamic states in given VNET instance. + */ +static int +sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS) +{ + int error; + unsigned int nstates; + + nstates = V_dyn_max; + + error = sysctl_handle_int(oidp, &nstates, 0, req); + /* Read operation or some error */ + if ((error != 0) || (req->newptr == NULL)) + return (error); + + V_dyn_max = nstates; + uma_zone_set_max(V_ipfw_dyn_rule_zone, V_dyn_max); + + return (0); +} + +/* + * Get current number of dynamic states in given VNET instance. + */ +static int +sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS) +{ + int error; + unsigned int nstates; + + nstates = DYN_COUNT; + + error = sysctl_handle_int(oidp, &nstates, 0, req); + + return (error); +} +#endif + +/* + * Returns size of dynamic states in legacy format + */ +int +ipfw_dyn_len(void) +{ + + return (V_ipfw_dyn_v == NULL) ? 0 : + (DYN_COUNT * sizeof(ipfw_dyn_rule)); +} + +/* + * Returns number of dynamic states. + * Used by dump format v1 (current). + */ +int +ipfw_dyn_get_count(void) +{ + + return (V_ipfw_dyn_v == NULL) ? 0 : DYN_COUNT; +} + +static void +export_dyn_rule(ipfw_dyn_rule *src, ipfw_dyn_rule *dst) +{ + + memcpy(dst, src, sizeof(*src)); + memcpy(&(dst->rule), &(src->rule->rulenum), sizeof(src->rule->rulenum)); + /* + * store set number into high word of + * dst->rule pointer. + */ + memcpy((char *)&dst->rule + sizeof(src->rule->rulenum), + &(src->rule->set), sizeof(src->rule->set)); + /* + * store a non-null value in "next". + * The userland code will interpret a + * NULL here as a marker + * for the last dynamic rule. 
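+	 *
+	 * (Editor's note: the v1 dump format, see ipfw_dump_states()
+	 * below, marks the last state with an explicit IPFW_DF_LAST
+	 * flag instead.)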
+	 */
+	memcpy(&dst->next, &dst, sizeof(dst));
+	dst->expire =
+	    TIME_LEQ(dst->expire, time_uptime) ? 0 : dst->expire - time_uptime;
+}
+
+/*
+ * Fills in the buffer given by @sd with dynamic states.
+ * Used by dump format v1 (current).
+ *
+ * Returns 0 on success.
+ */
+int
+ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd)
+{
+	ipfw_dyn_rule *p;
+	ipfw_obj_dyntlv *dst, *last;
+	ipfw_obj_ctlv *ctlv;
+	int i;
+	size_t sz;
+
+	if (V_ipfw_dyn_v == NULL)
+		return (0);
+
+	IPFW_UH_RLOCK_ASSERT(chain);
+
+	ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv));
+	if (ctlv == NULL)
+		return (ENOMEM);
+	sz = sizeof(ipfw_obj_dyntlv);
+	ctlv->head.type = IPFW_TLV_DYNSTATE_LIST;
+	ctlv->objsize = sz;
+	last = NULL;
+
+	for (i = 0 ; i < V_curr_dyn_buckets; i++) {
+		IPFW_BUCK_LOCK(i);
+		for (p = V_ipfw_dyn_v[i].head ; p != NULL; p = p->next) {
+			dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd, sz);
+			if (dst == NULL) {
+				IPFW_BUCK_UNLOCK(i);
+				return (ENOMEM);
+			}
+
+			export_dyn_rule(p, &dst->state);
+			dst->head.length = sz;
+			dst->head.type = IPFW_TLV_DYN_ENT;
+			last = dst;
+		}
+		IPFW_BUCK_UNLOCK(i);
+	}
+
+	if (last != NULL) /* mark last dynamic rule */
+		last->head.flags = IPFW_DF_LAST;
+
+	return (0);
+}
+
+/*
+ * Fills the given buffer with dynamic states (legacy format).
+ * IPFW_UH_RLOCK has to be held while calling.
+ */
+void
+ipfw_get_dynamic(struct ip_fw_chain *chain, char **pbp, const char *ep)
+{
+	ipfw_dyn_rule *p, *last = NULL;
+	char *bp;
+	int i;
+
+	if (V_ipfw_dyn_v == NULL)
+		return;
+	bp = *pbp;
+
+	IPFW_UH_RLOCK_ASSERT(chain);
+
+	for (i = 0 ; i < V_curr_dyn_buckets; i++) {
+		IPFW_BUCK_LOCK(i);
+		for (p = V_ipfw_dyn_v[i].head ; p != NULL; p = p->next) {
+			if (bp + sizeof *p <= ep) {
+				ipfw_dyn_rule *dst =
+				    (ipfw_dyn_rule *)bp;
+
+				export_dyn_rule(p, dst);
+				last = dst;
+				bp += sizeof(ipfw_dyn_rule);
+			}
+		}
+		IPFW_BUCK_UNLOCK(i);
+	}
+
+	if (last != NULL) /* mark last dynamic rule */
+		bzero(&last->next, sizeof(last));
+	*pbp = bp;
+}
+/* end of file */
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_eaction.c b/freebsd/sys/netpfil/ipfw/ip_fw_eaction.c
new file mode 100644
index 00000000..2c6ba8b9
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_eaction.c
@@ -0,0 +1,383 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2016 Yandex LLC
+ * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/hash.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/queue.h>
+#include <net/pfil.h>
+
+#include <net/if.h>	/* ip_fw.h requires IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>	/* struct ipfw_rule_ref */
+#include <netinet/ip_fw.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+
+#include <rtems/bsd/local/opt_ipfw.h>
+
+/*
+ * External actions support for ipfw.
+ *
+ * This code provides a KPI for implementing loadable modules that
+ * can provide handlers for external action opcodes in ipfw rules.
+ * A module should implement an opcode handler of type ipfw_eaction_t.
+ * This handler will be called by the ipfw_chk() function when an
+ * O_EXTERNAL_ACTION opcode is matched. The handler must return a
+ * value to be used as the return value of ipfw_chk(), i.e. IP_FW_PASS,
+ * IP_FW_DENY (see ip_fw_private.h).
+ * Also, the last argument must be set by the handler. If it is zero,
+ * the search continues with the next rule. If it is non-zero,
+ * the search terminates.
+ *
+ * A module that implements an external action should register its
+ * handler and name with the ipfw_add_eaction() function.
+ * This function returns an eaction_id that can be used by the module.
+ *
+ * It is possible to pass some additional information to an external
+ * action handler via the O_EXTERNAL_INSTANCE opcode. This opcode
+ * comes right after the O_EXTERNAL_ACTION opcode. cmd->arg1 will
+ * contain the index of the named object related to the instance of
+ * the external action.
+ *
+ * In case an eaction module uses named instances, it should register
+ * opcode rewriting routines for the O_EXTERNAL_INSTANCE opcode. The
+ * classifier callback can look back into the O_EXTERNAL_ACTION opcode
+ * (it must be in (ipfw_insn *)(cmd - 1)). From the arg1 of
+ * O_EXTERNAL_ACTION it can determine the eaction_id and compare it
+ * with its own. The macro IPFW_TLV_EACTION_NAME(eaction_id) can be
+ * used to determine the type of named_object related to the external
+ * action instance.
+ *
+ * On module unload, the handler should be deregistered with the
+ * ipfw_del_eaction() function using the known eaction_id.
+ */
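+/*
+ * (Editor's sketch, not part of the original source.)  A minimal
+ * hypothetical module using this KPI; "myact", the handler body and
+ * the variable names are invented for illustration:
+ *
+ *	static uint16_t myact_id;
+ *
+ *	static int
+ *	myact_handler(struct ip_fw_chain *ch, struct ip_fw_args *args,
+ *	    ipfw_insn *cmd, int *done)
+ *	{
+ *
+ *		*done = 1;		// terminate the rule search
+ *		return (IP_FW_PASS);	// used as ipfw_chk() return value
+ *	}
+ *
+ *	// on module load (0 would indicate failure):
+ *	myact_id = ipfw_add_eaction(&V_layer3_chain, myact_handler, "myact");
+ *	// on module unload:
+ *	ipfw_del_eaction(&V_layer3_chain, myact_id);
+ */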
+struct eaction_obj {
+	struct named_object	no;
+	ipfw_eaction_t		*handler;
+	char			name[64];
+};
+
+#define	EACTION_OBJ(ch, cmd)			\
+    ((struct eaction_obj *)SRV_OBJECT((ch), (cmd)->arg1))
+
+#if 0
+#define	EACTION_DEBUG(fmt, ...)	do {			\
+	printf("%s: " fmt "\n", __func__, ## __VA_ARGS__);	\
+} while (0)
+#else
+#define	EACTION_DEBUG(fmt, ...)
+#endif
+
+const char *default_eaction_typename = "drop";
+static int
+default_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args,
+    ipfw_insn *cmd, int *done)
+{
+
+	*done = 1; /* terminate the search */
+	return (IP_FW_DENY);
+}
+
+/*
+ * Opcode rewriting callbacks.
+ */
+static int
+eaction_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
+{
+
+	EACTION_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1);
+	*puidx = cmd->arg1;
+	*ptype = 0;
+	return (0);
+}
+
+static void
+eaction_update(ipfw_insn *cmd, uint16_t idx)
+{
+
+	cmd->arg1 = idx;
+	EACTION_DEBUG("opcode %d, arg1 -> %d", cmd->opcode, cmd->arg1);
+}
+
+static int
+eaction_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
+    struct named_object **pno)
+{
+	ipfw_obj_ntlv *ntlv;
+
+	if (ti->tlvs == NULL)
+		return (EINVAL);
+
+	/* Search ntlv in the buffer provided by user */
+	ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
+	    IPFW_TLV_EACTION);
+	if (ntlv == NULL)
+		return (EINVAL);
+	EACTION_DEBUG("name %s, uidx %u, type %u", ntlv->name,
+	    ti->uidx, ti->type);
+	/*
+	 * Search named object with corresponding name.
+	 * Since eaction objects are global - ignore the set value
+	 * and use zero instead.
+	 */
+	*pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch),
+	    0, IPFW_TLV_EACTION, ntlv->name);
+	if (*pno == NULL)
+		return (ESRCH);
+	return (0);
+}
+
+static struct named_object *
+eaction_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
+{
+
+	EACTION_DEBUG("kidx %u", idx);
+	return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx));
+}
+
+static struct opcode_obj_rewrite eaction_opcodes[] = {
+	{
+		.opcode = O_EXTERNAL_ACTION,
+		.etlv = IPFW_TLV_EACTION,
+		.classifier = eaction_classify,
+		.update = eaction_update,
+		.find_byname = eaction_findbyname,
+		.find_bykidx = eaction_findbykidx,
+	},
+};
+
+static int
+create_eaction_obj(struct ip_fw_chain *ch, ipfw_eaction_t handler,
+    const char *name, uint16_t *eaction_id)
+{
+	struct namedobj_instance *ni;
+	struct eaction_obj *obj;
+
+	IPFW_UH_UNLOCK_ASSERT(ch);
+
+	ni = CHAIN_TO_SRV(ch);
+	obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO);
+	obj->no.name = obj->name;
+	obj->no.etlv = IPFW_TLV_EACTION;
+	obj->handler = handler;
+	strlcpy(obj->name, name, sizeof(obj->name));
+
+	IPFW_UH_WLOCK(ch);
+	if (ipfw_objhash_lookup_name_type(ni, 0, IPFW_TLV_EACTION,
+	    name) != NULL) {
+		/*
+		 * Object is already created.
+		 * We don't allow eactions with the same name.
+		 */
+		IPFW_UH_WUNLOCK(ch);
+		free(obj, M_IPFW);
+		EACTION_DEBUG("External action with typename "
+		    "'%s' already exists", name);
+		return (EEXIST);
+	}
+	if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) {
+		IPFW_UH_WUNLOCK(ch);
+		free(obj, M_IPFW);
+		EACTION_DEBUG("alloc_idx failed");
+		return (ENOSPC);
+	}
+	ipfw_objhash_add(ni, &obj->no);
+	IPFW_WLOCK(ch);
+	SRV_OBJECT(ch, obj->no.kidx) = obj;
+	IPFW_WUNLOCK(ch);
+	obj->no.refcnt++;
+	IPFW_UH_WUNLOCK(ch);
+
+	if (eaction_id != NULL)
+		*eaction_id = obj->no.kidx;
+	return (0);
+}
+
+static void
+destroy_eaction_obj(struct ip_fw_chain *ch, struct named_object *no)
+{
+	struct namedobj_instance *ni;
+	struct eaction_obj *obj;
+
+	IPFW_UH_WLOCK_ASSERT(ch);
+
+	ni = CHAIN_TO_SRV(ch);
+	IPFW_WLOCK(ch);
+	obj = SRV_OBJECT(ch, no->kidx);
+	SRV_OBJECT(ch, no->kidx) = NULL;
+	IPFW_WUNLOCK(ch);
+	ipfw_objhash_del(ni, no);
+	ipfw_objhash_free_idx(ni, no->kidx);
+	free(obj, M_IPFW);
+}
+
+/*
+ * Resets all eaction opcodes to default handlers.
+ */ +static void +reset_eaction_obj(struct ip_fw_chain *ch, uint16_t eaction_id) +{ + struct named_object *no; + struct ip_fw *rule; + ipfw_insn *cmd; + int i; + + IPFW_UH_WLOCK_ASSERT(ch); + + no = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0, + IPFW_TLV_EACTION, default_eaction_typename); + if (no == NULL) + panic("Default external action handler is not found"); + if (eaction_id == no->kidx) + panic("Wrong eaction_id"); + EACTION_DEBUG("replace id %u with %u", eaction_id, no->kidx); + IPFW_WLOCK(ch); + for (i = 0; i < ch->n_rules; i++) { + rule = ch->map[i]; + cmd = ACTION_PTR(rule); + if (cmd->opcode != O_EXTERNAL_ACTION) + continue; + if (cmd->arg1 != eaction_id) + continue; + cmd->arg1 = no->kidx; /* Set to default id */ + /* + * XXX: we only bump refcount on default_eaction. + * Refcount on the original object will be just + * ignored on destroy. But on default_eaction it + * will be decremented on rule deletion. + */ + no->refcnt++; + /* + * Since named_object related to this instance will be + * also destroyed, truncate the chain of opcodes to + * remove O_EXTERNAL_INSTANCE opcode. + */ + if (rule->act_ofs < rule->cmd_len - 1) { + EACTION_DEBUG("truncate rule %d", rule->rulenum); + rule->cmd_len--; + } + } + IPFW_WUNLOCK(ch); +} + +/* + * Initialize external actions framework. + * Create object with default eaction handler "drop". + */ +int +ipfw_eaction_init(struct ip_fw_chain *ch, int first) +{ + int error; + + error = create_eaction_obj(ch, default_eaction, + default_eaction_typename, NULL); + if (error != 0) + return (error); + IPFW_ADD_OBJ_REWRITER(first, eaction_opcodes); + EACTION_DEBUG("External actions support initialized"); + return (0); +} + +void +ipfw_eaction_uninit(struct ip_fw_chain *ch, int last) +{ + struct namedobj_instance *ni; + struct named_object *no; + + ni = CHAIN_TO_SRV(ch); + + IPFW_UH_WLOCK(ch); + no = ipfw_objhash_lookup_name_type(ni, 0, IPFW_TLV_EACTION, + default_eaction_typename); + if (no != NULL) + destroy_eaction_obj(ch, no); + IPFW_UH_WUNLOCK(ch); + IPFW_DEL_OBJ_REWRITER(last, eaction_opcodes); + EACTION_DEBUG("External actions support uninitialized"); +} + +/* + * Registers external action handler to the global array. + * On success it returns eaction id, otherwise - zero. + */ +uint16_t +ipfw_add_eaction(struct ip_fw_chain *ch, ipfw_eaction_t handler, + const char *name) +{ + uint16_t eaction_id; + + eaction_id = 0; + if (ipfw_check_object_name_generic(name) == 0) { + create_eaction_obj(ch, handler, name, &eaction_id); + EACTION_DEBUG("Registered external action '%s' with id %u", + name, eaction_id); + } + return (eaction_id); +} + +/* + * Deregisters external action handler with id eaction_id. 
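+ *
+ * (Editor's note, derived from the code below.)  If the eaction is
+ * still referenced by rules (refcnt > 1), those rules are first reset
+ * to point at the default "drop" handler via reset_eaction_obj(), and
+ * only then is the named object destroyed.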
+ */ +int +ipfw_del_eaction(struct ip_fw_chain *ch, uint16_t eaction_id) +{ + struct named_object *no; + + IPFW_UH_WLOCK(ch); + no = ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), eaction_id); + if (no == NULL || no->etlv != IPFW_TLV_EACTION) { + IPFW_UH_WUNLOCK(ch); + return (EINVAL); + } + if (no->refcnt > 1) + reset_eaction_obj(ch, eaction_id); + EACTION_DEBUG("External action '%s' with id %u unregistered", + no->name, eaction_id); + destroy_eaction_obj(ch, no); + IPFW_UH_WUNLOCK(ch); + return (0); +} + +int +ipfw_run_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args, + ipfw_insn *cmd, int *done) +{ + + return (EACTION_OBJ(ch, cmd)->handler(ch, args, cmd, done)); +} diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_iface.c b/freebsd/sys/netpfil/ipfw/ip_fw_iface.c new file mode 100644 index 00000000..f8973a91 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/ip_fw_iface.c @@ -0,0 +1,541 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2014 Yandex LLC. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Kernel interface tracking API. + * + */ + +#include <rtems/bsd/local/opt_ipfw.h> +#include <rtems/bsd/local/opt_inet.h> +#ifndef INET +#error IPFIREWALL requires INET. 
+#endif /* INET */ +#include <rtems/bsd/local/opt_inet6.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/rwlock.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <sys/queue.h> +#include <sys/eventhandler.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ +#include <netinet/ip_fw.h> + +#include <netpfil/ipfw/ip_fw_private.h> + +#define CHAIN_TO_II(ch) ((struct namedobj_instance *)ch->ifcfg) + +#define DEFAULT_IFACES 128 + +static void handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif, + uint16_t ifindex); +static void handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif, + uint16_t ifindex); +static int list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd); + +static struct ipfw_sopt_handler scodes[] = { + { IP_FW_XIFLIST, 0, HDIR_GET, list_ifaces }, +}; + +/* + * FreeBSD Kernel interface. + */ +static void ipfw_kifhandler(void *arg, struct ifnet *ifp); +static int ipfw_kiflookup(char *name); +static void iface_khandler_register(void); +static void iface_khandler_deregister(void); + +static eventhandler_tag ipfw_ifdetach_event, ipfw_ifattach_event; +static int num_vnets = 0; +static struct mtx vnet_mtx; + +/* + * Checks if kernel interface is contained in our tracked + * interface list and calls attach/detach handler. + */ +static void +ipfw_kifhandler(void *arg, struct ifnet *ifp) +{ + struct ip_fw_chain *ch; + struct ipfw_iface *iif; + struct namedobj_instance *ii; + uintptr_t htype; + + if (V_ipfw_vnet_ready == 0) + return; + + ch = &V_layer3_chain; + htype = (uintptr_t)arg; + + IPFW_UH_WLOCK(ch); + ii = CHAIN_TO_II(ch); + if (ii == NULL) { + IPFW_UH_WUNLOCK(ch); + return; + } + iif = (struct ipfw_iface*)ipfw_objhash_lookup_name(ii, 0, + if_name(ifp)); + if (iif != NULL) { + if (htype == 1) + handle_ifattach(ch, iif, ifp->if_index); + else + handle_ifdetach(ch, iif, ifp->if_index); + } + IPFW_UH_WUNLOCK(ch); +} + +/* + * Reference current VNET as iface tracking API user. + * Registers interface tracking handlers for first VNET. + */ +static void +iface_khandler_register() +{ + int create; + + create = 0; + + mtx_lock(&vnet_mtx); + if (num_vnets == 0) + create = 1; + num_vnets++; + mtx_unlock(&vnet_mtx); + + if (create == 0) + return; + + printf("IPFW: starting up interface tracker\n"); + + ipfw_ifdetach_event = EVENTHANDLER_REGISTER( + ifnet_departure_event, ipfw_kifhandler, NULL, + EVENTHANDLER_PRI_ANY); + ipfw_ifattach_event = EVENTHANDLER_REGISTER( + ifnet_arrival_event, ipfw_kifhandler, (void*)((uintptr_t)1), + EVENTHANDLER_PRI_ANY); +} + +/* + * + * Detach interface event handlers on last VNET instance + * detach. + */ +static void +iface_khandler_deregister() +{ + int destroy; + + destroy = 0; + mtx_lock(&vnet_mtx); + if (num_vnets == 1) + destroy = 1; + num_vnets--; + mtx_unlock(&vnet_mtx); + + if (destroy == 0) + return; + + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, + ipfw_ifattach_event); + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + ipfw_ifdetach_event); +} + +/* + * Retrieves ifindex for given @name. + * + * Returns ifindex or 0. + */ +static int +ipfw_kiflookup(char *name) +{ + struct ifnet *ifp; + int ifindex; + + ifindex = 0; + + if ((ifp = ifunit_ref(name)) != NULL) { + ifindex = ifp->if_index; + if_rele(ifp); + } + + return (ifindex); +} + +/* + * Global ipfw startup hook. 
+ * Since we perform lazy initialization, do nothing except + * mutex init. + */ +int +ipfw_iface_init() +{ + + mtx_init(&vnet_mtx, "IPFW ifhandler mtx", NULL, MTX_DEF); + IPFW_ADD_SOPT_HANDLER(1, scodes); + return (0); +} + +/* + * Global ipfw destroy hook. + * Unregister khandlers iff init has been done. + */ +void +ipfw_iface_destroy() +{ + + IPFW_DEL_SOPT_HANDLER(1, scodes); + mtx_destroy(&vnet_mtx); +} + +/* + * Perform actual init on internal request. + * Inits both namehash and global khandler. + */ +static void +vnet_ipfw_iface_init(struct ip_fw_chain *ch) +{ + struct namedobj_instance *ii; + + ii = ipfw_objhash_create(DEFAULT_IFACES); + IPFW_UH_WLOCK(ch); + if (ch->ifcfg == NULL) { + ch->ifcfg = ii; + ii = NULL; + } + IPFW_UH_WUNLOCK(ch); + + if (ii != NULL) { + /* Already initialized. Free namehash. */ + ipfw_objhash_destroy(ii); + } else { + /* We're the first ones. Init kernel hooks. */ + iface_khandler_register(); + } +} + +static int +destroy_iface(struct namedobj_instance *ii, struct named_object *no, + void *arg) +{ + + /* Assume all consumers have been already detached */ + free(no, M_IPFW); + return (0); +} + +/* + * Per-VNET ipfw detach hook. + * + */ +void +vnet_ipfw_iface_destroy(struct ip_fw_chain *ch) +{ + struct namedobj_instance *ii; + + IPFW_UH_WLOCK(ch); + ii = CHAIN_TO_II(ch); + ch->ifcfg = NULL; + IPFW_UH_WUNLOCK(ch); + + if (ii != NULL) { + ipfw_objhash_foreach(ii, destroy_iface, ch); + ipfw_objhash_destroy(ii); + iface_khandler_deregister(); + } +} + +/* + * Notify the subsystem that we are interested in tracking + * interface @name. This function has to be called without + * holding any locks to permit allocating the necessary states + * for proper interface tracking. + * + * Returns 0 on success. + */ +int +ipfw_iface_ref(struct ip_fw_chain *ch, char *name, + struct ipfw_ifc *ic) +{ + struct namedobj_instance *ii; + struct ipfw_iface *iif, *tmp; + + if (strlen(name) >= sizeof(iif->ifname)) + return (EINVAL); + + IPFW_UH_WLOCK(ch); + + ii = CHAIN_TO_II(ch); + if (ii == NULL) { + + /* + * First request to subsystem. + * Let's perform init. + */ + IPFW_UH_WUNLOCK(ch); + vnet_ipfw_iface_init(ch); + IPFW_UH_WLOCK(ch); + ii = CHAIN_TO_II(ch); + } + + iif = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name); + + if (iif != NULL) { + iif->no.refcnt++; + ic->iface = iif; + IPFW_UH_WUNLOCK(ch); + return (0); + } + + IPFW_UH_WUNLOCK(ch); + + /* Not found. Let's create one */ + iif = malloc(sizeof(struct ipfw_iface), M_IPFW, M_WAITOK | M_ZERO); + TAILQ_INIT(&iif->consumers); + iif->no.name = iif->ifname; + strlcpy(iif->ifname, name, sizeof(iif->ifname)); + + /* + * Ref & link to the list. + * + * We assume ifnet_arrival_event / ifnet_departure_event + * are not holding any locks. + */ + iif->no.refcnt = 1; + IPFW_UH_WLOCK(ch); + + tmp = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name); + if (tmp != NULL) { + /* Interface has been created since unlock. Ref and return */ + tmp->no.refcnt++; + ic->iface = tmp; + IPFW_UH_WUNLOCK(ch); + free(iif, M_IPFW); + return (0); + } + + iif->ifindex = ipfw_kiflookup(name); + if (iif->ifindex != 0) + iif->resolved = 1; + + ipfw_objhash_add(ii, &iif->no); + ic->iface = iif; + + IPFW_UH_WUNLOCK(ch); + + return (0); +} + +/* + * Adds @ic to the list of iif interface consumers. + * Must be called with holding both UH+WLOCK. + * Callback may be immediately called (if interface exists). 
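+ *
+ * (Editor's sketch, hypothetical consumer; field names taken from the
+ * code below.)  A typical consumer first resolves the tracker without
+ * any locks held, then subscribes under both locks:
+ *
+ *	struct ipfw_ifc ic;
+ *
+ *	error = ipfw_iface_ref(ch, "em0", &ic);	// no locks held here
+ *	...
+ *	ic.cb = my_ifindex_cb;		// invoked as cb(ch, cbdata, ifindex)
+ *	ic.cbdata = arg;
+ *	IPFW_UH_WLOCK(ch);
+ *	IPFW_WLOCK(ch);
+ *	ipfw_iface_add_notify(ch, &ic);
+ *	IPFW_WUNLOCK(ch);
+ *	IPFW_UH_WUNLOCK(ch);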
+ */ +void +ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic) +{ + struct ipfw_iface *iif; + + IPFW_UH_WLOCK_ASSERT(ch); + IPFW_WLOCK_ASSERT(ch); + + iif = ic->iface; + + TAILQ_INSERT_TAIL(&iif->consumers, ic, next); + if (iif->resolved != 0) + ic->cb(ch, ic->cbdata, iif->ifindex); +} + +/* + * Unlinks interface tracker object @ic from interface. + * Must be called while holding UH lock. + */ +void +ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic) +{ + struct ipfw_iface *iif; + + IPFW_UH_WLOCK_ASSERT(ch); + + iif = ic->iface; + TAILQ_REMOVE(&iif->consumers, ic, next); +} + +/* + * Unreference interface specified by @ic. + * Must be called while holding UH lock. + */ +void +ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic) +{ + struct ipfw_iface *iif; + + IPFW_UH_WLOCK_ASSERT(ch); + + iif = ic->iface; + ic->iface = NULL; + + iif->no.refcnt--; + /* TODO: check for references & delete */ +} + +/* + * Interface arrival handler. + */ +static void +handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif, + uint16_t ifindex) +{ + struct ipfw_ifc *ic; + + IPFW_UH_WLOCK_ASSERT(ch); + + iif->gencnt++; + iif->resolved = 1; + iif->ifindex = ifindex; + + IPFW_WLOCK(ch); + TAILQ_FOREACH(ic, &iif->consumers, next) + ic->cb(ch, ic->cbdata, iif->ifindex); + IPFW_WUNLOCK(ch); +} + +/* + * Interface departure handler. + */ +static void +handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif, + uint16_t ifindex) +{ + struct ipfw_ifc *ic; + + IPFW_UH_WLOCK_ASSERT(ch); + + IPFW_WLOCK(ch); + TAILQ_FOREACH(ic, &iif->consumers, next) + ic->cb(ch, ic->cbdata, 0); + IPFW_WUNLOCK(ch); + + iif->gencnt++; + iif->resolved = 0; + iif->ifindex = 0; +} + +struct dump_iface_args { + struct ip_fw_chain *ch; + struct sockopt_data *sd; +}; + +static int +export_iface_internal(struct namedobj_instance *ii, struct named_object *no, + void *arg) +{ + ipfw_iface_info *i; + struct dump_iface_args *da; + struct ipfw_iface *iif; + + da = (struct dump_iface_args *)arg; + + i = (ipfw_iface_info *)ipfw_get_sopt_space(da->sd, sizeof(*i)); + KASSERT(i != NULL, ("previously checked buffer is not enough")); + + iif = (struct ipfw_iface *)no; + + strlcpy(i->ifname, iif->ifname, sizeof(i->ifname)); + if (iif->resolved) + i->flags |= IPFW_IFFLAG_RESOLVED; + i->ifindex = iif->ifindex; + i->refcnt = iif->no.refcnt; + i->gencnt = iif->gencnt; + return (0); +} + +/* + * Lists all interface currently tracked by ipfw. 
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size + * Reply: [ ipfw_obj_lheader ipfw_iface_info x N ] + * + * Returns 0 on success + */ +static int +list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct namedobj_instance *ii; + struct _ipfw_obj_lheader *olh; + struct dump_iface_args da; + uint32_t count, size; + + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); + if (olh == NULL) + return (EINVAL); + if (sd->valsize < olh->size) + return (EINVAL); + + IPFW_UH_RLOCK(ch); + ii = CHAIN_TO_II(ch); + if (ii != NULL) + count = ipfw_objhash_count(ii); + else + count = 0; + size = count * sizeof(ipfw_iface_info) + sizeof(ipfw_obj_lheader); + + /* Fill in header regadless of buffer size */ + olh->count = count; + olh->objsize = sizeof(ipfw_iface_info); + + if (size > olh->size) { + olh->size = size; + IPFW_UH_RUNLOCK(ch); + return (ENOMEM); + } + olh->size = size; + + da.ch = ch; + da.sd = sd; + + if (ii != NULL) + ipfw_objhash_foreach(ii, export_iface_internal, &da); + IPFW_UH_RUNLOCK(ch); + + return (0); +} + diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_log.c b/freebsd/sys/netpfil/ipfw/ip_fw_log.c index 60b0df7d..658e1256 100644 --- a/freebsd/sys/netpfil/ipfw/ip_fw_log.c +++ b/freebsd/sys/netpfil/ipfw/ip_fw_log.c @@ -41,16 +41,15 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/sys/param.h> #include <sys/systm.h> -#include <sys/mbuf.h> #include <sys/kernel.h> +#include <sys/mbuf.h> #include <sys/socket.h> #include <sys/sysctl.h> #include <sys/syslog.h> #include <net/ethernet.h> /* for ETHERTYPE_IP */ #include <net/if.h> +#include <net/if_var.h> #include <net/vnet.h> -#include <net/if_types.h> /* for IFT_ETHER */ -#include <net/bpf.h> /* for BPF */ #include <netinet/in.h> #include <netinet/ip.h> @@ -83,111 +82,48 @@ __FBSDID("$FreeBSD$"); #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) +#ifdef __APPLE__ +#undef snprintf +#define snprintf sprintf +#define SNPARGS(buf, len) buf + len +#define SNP(buf) buf +#else /* !__APPLE__ */ #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? 
sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) +#endif /* !__APPLE__ */ -#ifdef WITHOUT_BPF -void -ipfw_log_bpf(int onoff) -{ -} -#else /* !WITHOUT_BPF */ -static struct ifnet *log_if; /* hook to attach to bpf */ - -/* we use this dummy function for all ifnet callbacks */ -static int -log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr) -{ - return EINVAL; -} - -static int -ipfw_log_output(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *dst, struct route *ro) -{ - if (m != NULL) - m_freem(m); - return EINVAL; -} - -static void -ipfw_log_start(struct ifnet* ifp) -{ - panic("ipfw_log_start() must not be called"); -} - -static const u_char ipfwbroadcastaddr[6] = - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - -void -ipfw_log_bpf(int onoff) -{ - struct ifnet *ifp; - - if (onoff) { - if (log_if) - return; - ifp = if_alloc(IFT_ETHER); - if (ifp == NULL) - return; - if_initname(ifp, "ipfw", 0); - ifp->if_mtu = 65536; - ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; - ifp->if_init = (void *)log_dummy; - ifp->if_ioctl = log_dummy; - ifp->if_start = ipfw_log_start; - ifp->if_output = ipfw_log_output; - ifp->if_addrlen = 6; - ifp->if_hdrlen = 14; - if_attach(ifp); - ifp->if_broadcastaddr = ipfwbroadcastaddr; - ifp->if_baudrate = IF_Mbps(10); - bpfattach(ifp, DLT_EN10MB, 14); - log_if = ifp; - } else { - if (log_if) { - ether_ifdetach(log_if); - if_free(log_if); - } - log_if = NULL; - } -} -#endif /* !WITHOUT_BPF */ - +#define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) /* * We enter here when we have a rule with O_LOG. * XXX this function alone takes about 2Kbytes of code! */ void -ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, - struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, - struct ip *ip) +ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, + struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, + u_short offset, uint32_t tablearg, struct ip *ip) { char *action; int limit_reached = 0; char action2[92], proto[128], fragment[32]; if (V_fw_verbose == 0) { -#ifndef WITHOUT_BPF - - if (log_if == NULL || log_if->if_bpf == NULL) - return; - if (args->eh) /* layer2, use orig hdr */ - BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m); + ipfw_bpf_mtap2(args->eh, ETHER_HDR_LEN, m); else { /* Add fake header. Later we will store * more info in the header. */ if (ip->ip_v == 4) - BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m); - else if (ip->ip_v == 6) - BPF_MTAP2(log_if, "DDDDDDSSSSSS\x86\xdd", ETHER_HDR_LEN, m); + ipfw_bpf_mtap2("DDDDDDSSSSSS\x08\x00", + ETHER_HDR_LEN, m); + else if (ip->ip_v == 6) + ipfw_bpf_mtap2("DDDDDDSSSSSS\x86\xdd", + ETHER_HDR_LEN, m); else /* Obviously bogus EtherType. 
*/ - BPF_MTAP2(log_if, "DDDDDDSSSSSS\xff\xff", ETHER_HDR_LEN, m); + ipfw_bpf_mtap2("DDDDDDSSSSSS\xff\xff", + ETHER_HDR_LEN, m); } -#endif /* !WITHOUT_BPF */ return; } /* the old 'log' function */ @@ -254,27 +190,27 @@ ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, break; case O_DIVERT: snprintf(SNPARGS(action2, 0), "Divert %d", - cmd->arg1); + TARG(cmd->arg1, divert)); break; case O_TEE: snprintf(SNPARGS(action2, 0), "Tee %d", - cmd->arg1); + TARG(cmd->arg1, divert)); break; case O_SETFIB: snprintf(SNPARGS(action2, 0), "SetFib %d", - IP_FW_ARG_TABLEARG(cmd->arg1)); + TARG(cmd->arg1, fib) & 0x7FFF); break; case O_SKIPTO: snprintf(SNPARGS(action2, 0), "SkipTo %d", - IP_FW_ARG_TABLEARG(cmd->arg1)); + TARG(cmd->arg1, skipto)); break; case O_PIPE: snprintf(SNPARGS(action2, 0), "Pipe %d", - IP_FW_ARG_TABLEARG(cmd->arg1)); + TARG(cmd->arg1, pipe)); break; case O_QUEUE: snprintf(SNPARGS(action2, 0), "Queue %d", - IP_FW_ARG_TABLEARG(cmd->arg1)); + TARG(cmd->arg1, pipe)); break; case O_FORWARD_IP: { ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; @@ -435,7 +371,7 @@ ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { - if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) + if (offset || ip6f_mf) snprintf(SNPARGS(fragment, 0), " (frag %08x:%d@%d%s)", args->f_id.extra, diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_nat.c b/freebsd/sys/netpfil/ipfw/ip_fw_nat.c index 5d4dcc9f..58bc1f3c 100644 --- a/freebsd/sys/netpfil/ipfw/ip_fw_nat.c +++ b/freebsd/sys/netpfil/ipfw/ip_fw_nat.c @@ -33,17 +33,18 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/eventhandler.h> #include <sys/malloc.h> +#include <sys/mbuf.h> #include <sys/kernel.h> #include <rtems/bsd/sys/lock.h> #include <sys/module.h> #include <sys/rwlock.h> - -#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ +#include <sys/rmlock.h> #include <netinet/libalias/alias.h> #include <netinet/libalias/alias_local.h> #include <net/if.h> +#include <net/if_var.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip_var.h> @@ -55,6 +56,45 @@ __FBSDID("$FreeBSD$"); #include <machine/in_cksum.h> /* XXX for in_cksum */ +struct cfg_spool { + LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ + struct in_addr addr; + uint16_t port; +}; + +/* Nat redirect configuration. */ +struct cfg_redir { + LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ + uint16_t mode; /* type of redirect mode */ + uint16_t proto; /* protocol: tcp/udp */ + struct in_addr laddr; /* local ip address */ + struct in_addr paddr; /* public ip address */ + struct in_addr raddr; /* remote ip address */ + uint16_t lport; /* local port */ + uint16_t pport; /* public port */ + uint16_t rport; /* remote port */ + uint16_t pport_cnt; /* number of public ports */ + uint16_t rport_cnt; /* number of remote ports */ + struct alias_link **alink; + u_int16_t spool_cnt; /* num of entry in spool chain */ + /* chain of spool instances */ + LIST_HEAD(spool_chain, cfg_spool) spool_chain; +}; + +/* Nat configuration data struct. 
*/ +struct cfg_nat { + /* chain of nat instances */ + LIST_ENTRY(cfg_nat) _next; + int id; /* nat id */ + struct in_addr ip; /* nat ip address */ + struct libalias *lib; /* libalias instance */ + int mode; /* aliasing mode */ + int redir_cnt; /* number of entry in spool chain */ + /* chain of redir instances */ + LIST_HEAD(redir_chain, cfg_redir) redir_chain; + char if_name[IF_NAMESIZE]; /* interface name */ +}; + static eventhandler_tag ifaddr_event_tag; static void @@ -66,8 +106,12 @@ ifaddr_change(void *arg __unused, struct ifnet *ifp) KASSERT(curvnet == ifp->if_vnet, ("curvnet(%p) differs from iface vnet(%p)", curvnet, ifp->if_vnet)); + + if (V_ipfw_vnet_ready == 0 || V_ipfw_nat_ready == 0) + return; + chain = &V_layer3_chain; - IPFW_WLOCK(chain); + IPFW_UH_WLOCK(chain); /* Check every nat entry... */ LIST_FOREACH(ptr, &chain->nat, _next) { /* ...using nic 'ifp->if_xname' as dynamic alias address. */ @@ -79,13 +123,15 @@ ifaddr_change(void *arg __unused, struct ifnet *ifp) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; + IPFW_WLOCK(chain); ptr->ip = ((struct sockaddr_in *) (ifa->ifa_addr))->sin_addr; LibAliasSetAddress(ptr->lib, ptr->ip); + IPFW_WUNLOCK(chain); } if_addr_runlock(ifp); } - IPFW_WUNLOCK(chain); + IPFW_UH_WUNLOCK(chain); } /* @@ -117,11 +163,11 @@ del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) LIST_FOREACH_SAFE(r, head, _next, tmp_r) { num = 1; /* Number of alias_link to delete. */ switch (r->mode) { - case REDIR_PORT: + case NAT44_REDIR_PORT: num = r->pport_cnt; /* FALLTHROUGH */ - case REDIR_ADDR: - case REDIR_PROTO: + case NAT44_REDIR_ADDR: + case NAT44_REDIR_PROTO: /* Delete all libalias redirect entry. */ for (i = 0; i < num; i++) LibAliasRedirectDelete(n->lib, r->alink[i]); @@ -142,27 +188,41 @@ del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) } } -static void +static int add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) { - struct cfg_redir *r, *ser_r; - struct cfg_spool *s, *ser_s; + struct cfg_redir *r; + struct cfg_spool *s; + struct nat44_cfg_redir *ser_r; + struct nat44_cfg_spool *ser_s; + int cnt, off, i; for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { - ser_r = (struct cfg_redir *)&buf[off]; - r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); - memcpy(r, ser_r, SOF_REDIR); + ser_r = (struct nat44_cfg_redir *)&buf[off]; + r = malloc(sizeof(*r), M_IPFW, M_WAITOK | M_ZERO); + r->mode = ser_r->mode; + r->laddr = ser_r->laddr; + r->paddr = ser_r->paddr; + r->raddr = ser_r->raddr; + r->lport = ser_r->lport; + r->pport = ser_r->pport; + r->rport = ser_r->rport; + r->pport_cnt = ser_r->pport_cnt; + r->rport_cnt = ser_r->rport_cnt; + r->proto = ser_r->proto; + r->spool_cnt = ser_r->spool_cnt; + //memcpy(r, ser_r, SOF_REDIR); LIST_INIT(&r->spool_chain); - off += SOF_REDIR; + off += sizeof(struct nat44_cfg_redir); r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt, M_IPFW, M_WAITOK | M_ZERO); switch (r->mode) { - case REDIR_ADDR: + case NAT44_REDIR_ADDR: r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr, r->paddr); break; - case REDIR_PORT: + case NAT44_REDIR_PORT: for (i = 0 ; i < r->pport_cnt; i++) { /* If remotePort is all ports, set it to 0. 
*/ u_short remotePortCopy = r->rport + i; @@ -178,7 +238,7 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) } } break; - case REDIR_PROTO: + case NAT44_REDIR_PROTO: r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr, r->raddr, r->paddr, r->proto); break; @@ -186,25 +246,41 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) printf("unknown redirect mode: %u\n", r->mode); break; } - /* XXX perhaps return an error instead of panic ? */ - if (r->alink[0] == NULL) - panic("LibAliasRedirect* returned NULL"); + if (r->alink[0] == NULL) { + printf("LibAliasRedirect* returned NULL\n"); + free(r->alink, M_IPFW); + free(r, M_IPFW); + return (EINVAL); + } /* LSNAT handling. */ for (i = 0; i < r->spool_cnt; i++) { - ser_s = (struct cfg_spool *)&buf[off]; - s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); - memcpy(s, ser_s, SOF_SPOOL); + ser_s = (struct nat44_cfg_spool *)&buf[off]; + s = malloc(sizeof(*s), M_IPFW, M_WAITOK | M_ZERO); + s->addr = ser_s->addr; + s->port = ser_s->port; LibAliasAddServer(ptr->lib, r->alink[0], s->addr, htons(s->port)); - off += SOF_SPOOL; + off += sizeof(struct nat44_cfg_spool); /* Hook spool entry. */ LIST_INSERT_HEAD(&r->spool_chain, s, _next); } /* And finally hook this redir entry. */ LIST_INSERT_HEAD(&ptr->redir_chain, r, _next); } + + return (0); +} + +static void +free_nat_instance(struct cfg_nat *ptr) +{ + + del_redir_spool_cfg(ptr, &ptr->redir_chain); + LibAliasUninit(ptr->lib); + free(ptr, M_IPFW); } + /* * ipfw_nat - perform mbuf header translation. * @@ -345,11 +421,11 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) if (ldt) { struct tcphdr *th; struct udphdr *uh; - u_short cksum; + uint16_t ip_len, cksum; - ip->ip_len = ntohs(ip->ip_len); + ip_len = ntohs(ip->ip_len); cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, - htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2))); + htons(ip->ip_p + ip_len - (ip->ip_hl << 2))); switch (ip->ip_p) { case IPPROTO_TCP: @@ -375,7 +451,6 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) in_delayed_cksum(mcl); mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } - ip->ip_len = htons(ip->ip_len); } args->m = mcl; return (IP_FW_NAT); @@ -393,60 +468,68 @@ lookup_nat(struct nat_list *l, int nat_id) return res; } -static int -ipfw_nat_cfg(struct sockopt *sopt) +static struct cfg_nat * +lookup_nat_name(struct nat_list *l, char *name) { - struct cfg_nat *cfg, *ptr; - char *buf; - struct ip_fw_chain *chain = &V_layer3_chain; - size_t len; - int gencnt, error = 0; + struct cfg_nat *res; + int id; + char *errptr; - len = sopt->sopt_valsize; - buf = malloc(len, M_TEMP, M_WAITOK | M_ZERO); - if ((error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat))) != 0) - goto out; + id = strtol(name, &errptr, 10); + if (id == 0 || *errptr != '\0') + return (NULL); - cfg = (struct cfg_nat *)buf; - if (cfg->id < 0) { - error = EINVAL; - goto out; + LIST_FOREACH(res, l, _next) { + if (res->id == id) + break; } + return (res); +} + +/* IP_FW3 configuration routines */ + +static void +nat44_config(struct ip_fw_chain *chain, struct nat44_cfg_nat *ucfg) +{ + struct cfg_nat *ptr, *tcfg; + int gencnt; /* * Find/create nat rule. */ - IPFW_WLOCK(chain); + IPFW_UH_WLOCK(chain); gencnt = chain->gencnt; - ptr = lookup_nat(&chain->nat, cfg->id); + ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { - IPFW_WUNLOCK(chain); + IPFW_UH_WUNLOCK(chain); /* New rule: allocate and init new instance. 
*/ ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_WAITOK | M_ZERO); ptr->lib = LibAliasInit(NULL); LIST_INIT(&ptr->redir_chain); } else { /* Entry already present: temporarily unhook it. */ + IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); - flush_nat_ptrs(chain, cfg->id); + flush_nat_ptrs(chain, ptr->id); IPFW_WUNLOCK(chain); + IPFW_UH_WUNLOCK(chain); } /* - * Basic nat configuration. + * Basic nat (re)configuration. */ - ptr->id = cfg->id; + ptr->id = strtol(ucfg->name, NULL, 10); /* * XXX - what if this rule doesn't nat any ip and just * redirect? * do we set aliasaddress to 0.0.0.0? */ - ptr->ip = cfg->ip; - ptr->redir_cnt = cfg->redir_cnt; - ptr->mode = cfg->mode; - LibAliasSetMode(ptr->lib, cfg->mode, ~0); + ptr->ip = ucfg->ip; + ptr->redir_cnt = ucfg->redir_cnt; + ptr->mode = ucfg->mode; + strlcpy(ptr->if_name, ucfg->if_name, sizeof(ptr->if_name)); + LibAliasSetMode(ptr->lib, ptr->mode, ~0); LibAliasSetAddress(ptr->lib, ptr->ip); - memcpy(ptr->if_name, cfg->if_name, IF_NAMESIZE); /* * Redir and LSNAT configuration. @@ -454,16 +537,453 @@ ipfw_nat_cfg(struct sockopt *sopt) /* Delete old cfgs. */ del_redir_spool_cfg(ptr, &ptr->redir_chain); /* Add new entries. */ - add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr); + add_redir_spool_cfg((char *)(ucfg + 1), ptr); + IPFW_UH_WLOCK(chain); - IPFW_WLOCK(chain); /* Extra check to avoid race with another ipfw_nat_cfg() */ - if (gencnt != chain->gencnt && - ((cfg = lookup_nat(&chain->nat, ptr->id)) != NULL)) - LIST_REMOVE(cfg, _next); + tcfg = NULL; + if (gencnt != chain->gencnt) + tcfg = lookup_nat_name(&chain->nat, ucfg->name); + IPFW_WLOCK(chain); + if (tcfg != NULL) + LIST_REMOVE(tcfg, _next); LIST_INSERT_HEAD(&chain->nat, ptr, _next); + IPFW_WUNLOCK(chain); chain->gencnt++; + + IPFW_UH_WUNLOCK(chain); + + if (tcfg != NULL) + free_nat_instance(ptr); +} + +/* + * Creates/configure nat44 instance + * Data layout (v0)(current): + * Request: [ ipfw_obj_header nat44_cfg_nat .. ] + * + * Returns 0 on success + */ +static int +nat44_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_header *oh; + struct nat44_cfg_nat *ucfg; + int id; + size_t read; + char *errptr; + + /* Check minimum header size */ + if (sd->valsize < (sizeof(*oh) + sizeof(*ucfg))) + return (EINVAL); + + oh = (ipfw_obj_header *)sd->kbuf; + + /* Basic length checks for TLVs */ + if (oh->ntlv.head.length != sizeof(oh->ntlv)) + return (EINVAL); + + ucfg = (struct nat44_cfg_nat *)(oh + 1); + + /* Check if name is properly terminated and looks like number */ + if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) + return (EINVAL); + id = strtol(ucfg->name, &errptr, 10); + if (id == 0 || *errptr != '\0') + return (EINVAL); + + read = sizeof(*oh) + sizeof(*ucfg); + /* Check number of redirs */ + if (sd->valsize < read + ucfg->redir_cnt*sizeof(struct nat44_cfg_redir)) + return (EINVAL); + + nat44_config(chain, ucfg); + return (0); +} + +/* + * Destroys given nat instances. 
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat44_destroy(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+    struct sockopt_data *sd)
+{
+	ipfw_obj_header *oh;
+	struct cfg_nat *ptr;
+	ipfw_obj_ntlv *ntlv;
+
+	/* Check minimum header size */
+	if (sd->valsize < sizeof(*oh))
+		return (EINVAL);
+
+	oh = (ipfw_obj_header *)sd->kbuf;
+
+	/* Basic length checks for TLVs */
+	if (oh->ntlv.head.length != sizeof(oh->ntlv))
+		return (EINVAL);
+
+	ntlv = &oh->ntlv;
+	/* Check if name is properly terminated */
+	if (strnlen(ntlv->name, sizeof(ntlv->name)) == sizeof(ntlv->name))
+		return (EINVAL);
+
+	IPFW_UH_WLOCK(chain);
+	ptr = lookup_nat_name(&chain->nat, ntlv->name);
+	if (ptr == NULL) {
+		IPFW_UH_WUNLOCK(chain);
+		return (ESRCH);
+	}
+	IPFW_WLOCK(chain);
+	LIST_REMOVE(ptr, _next);
+	flush_nat_ptrs(chain, ptr->id);
+	IPFW_WUNLOCK(chain);
+	IPFW_UH_WUNLOCK(chain);
+
+	free_nat_instance(ptr);
+
+	return (0);
+}
+
+static void
+export_nat_cfg(struct cfg_nat *ptr, struct nat44_cfg_nat *ucfg)
+{
+
+	snprintf(ucfg->name, sizeof(ucfg->name), "%d", ptr->id);
+	ucfg->ip = ptr->ip;
+	ucfg->redir_cnt = ptr->redir_cnt;
+	ucfg->mode = ptr->mode;
+	strlcpy(ucfg->if_name, ptr->if_name, sizeof(ucfg->if_name));
+}
+
+/*
+ * Gets config for given nat instance
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header nat44_cfg_nat .. ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat44_get_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+    struct sockopt_data *sd)
+{
+	ipfw_obj_header *oh;
+	struct nat44_cfg_nat *ucfg;
+	struct cfg_nat *ptr;
+	struct cfg_redir *r;
+	struct cfg_spool *s;
+	struct nat44_cfg_redir *ser_r;
+	struct nat44_cfg_spool *ser_s;
+	size_t sz;
+
+	sz = sizeof(*oh) + sizeof(*ucfg);
+	/* Check minimum header size */
+	if (sd->valsize < sz)
+		return (EINVAL);
+
+	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
+
+	/* Basic length checks for TLVs */
+	if (oh->ntlv.head.length != sizeof(oh->ntlv))
+		return (EINVAL);
+
+	ucfg = (struct nat44_cfg_nat *)(oh + 1);
+
+	/* Check if name is properly terminated */
+	if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name))
+		return (EINVAL);
+
+	IPFW_UH_RLOCK(chain);
+	ptr = lookup_nat_name(&chain->nat, ucfg->name);
+	if (ptr == NULL) {
+		IPFW_UH_RUNLOCK(chain);
+		return (ESRCH);
+	}
+
+	export_nat_cfg(ptr, ucfg);
+
+	/* Estimate memory amount */
+	sz = sizeof(ipfw_obj_header) + sizeof(struct nat44_cfg_nat);
+	LIST_FOREACH(r, &ptr->redir_chain, _next) {
+		sz += sizeof(struct nat44_cfg_redir);
+		LIST_FOREACH(s, &r->spool_chain, _next)
+			sz += sizeof(struct nat44_cfg_spool);
+	}
+
+	ucfg->size = sz;
+	if (sd->valsize < sz) {
+
+		/*
+		 * Submitted buffer size is not enough.
+		 * We've already filled in the @ucfg structure with
+		 * the relevant info including size, so we
+		 * can return. Buffer will be flushed automatically.
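+		 *
+		 * (Editor's note, assumed userland behaviour.)  The caller
+		 * is expected to retry with the size reported back in
+		 * ucfg->size, roughly:
+		 *
+		 *	do {
+		 *		optlen = cfg->size;
+		 *		error = getsockopt(s, IPPROTO_IP, IP_FW3,
+		 *		    buf, &optlen);
+		 *	} while (error != 0 && errno == ENOMEM);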
+		 */
+		IPFW_UH_RUNLOCK(chain);
+		return (ENOMEM);
+	}
+
+	/* Size OK, let's copy data */
+	LIST_FOREACH(r, &ptr->redir_chain, _next) {
+		ser_r = (struct nat44_cfg_redir *)ipfw_get_sopt_space(sd,
+		    sizeof(*ser_r));
+		ser_r->mode = r->mode;
+		ser_r->laddr = r->laddr;
+		ser_r->paddr = r->paddr;
+		ser_r->raddr = r->raddr;
+		ser_r->lport = r->lport;
+		ser_r->pport = r->pport;
+		ser_r->rport = r->rport;
+		ser_r->pport_cnt = r->pport_cnt;
+		ser_r->rport_cnt = r->rport_cnt;
+		ser_r->proto = r->proto;
+		ser_r->spool_cnt = r->spool_cnt;
+
+		LIST_FOREACH(s, &r->spool_chain, _next) {
+			ser_s = (struct nat44_cfg_spool *)ipfw_get_sopt_space(
+			    sd, sizeof(*ser_s));
+
+			ser_s->addr = s->addr;
+			ser_s->port = s->port;
+		}
+	}
+
+	IPFW_UH_RUNLOCK(chain);
+
+	return (0);
+}
+
+/*
+ * Lists all nat44 instances currently available in kernel.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ]
+ * Reply: [ ipfw_obj_lheader nat44_cfg_nat x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat44_list_nat(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+    struct sockopt_data *sd)
+{
+	ipfw_obj_lheader *olh;
+	struct nat44_cfg_nat *ucfg;
+	struct cfg_nat *ptr;
+	int nat_count;
+
+	/* Check minimum header size */
+	if (sd->valsize < sizeof(ipfw_obj_lheader))
+		return (EINVAL);
+
+	olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh));
+	IPFW_UH_RLOCK(chain);
+	nat_count = 0;
+	LIST_FOREACH(ptr, &chain->nat, _next)
+		nat_count++;
+
+	olh->count = nat_count;
+	olh->objsize = sizeof(struct nat44_cfg_nat);
+	olh->size = sizeof(*olh) + olh->count * olh->objsize;
+
+	if (sd->valsize < olh->size) {
+		IPFW_UH_RUNLOCK(chain);
+		return (ENOMEM);
+	}
+
+	LIST_FOREACH(ptr, &chain->nat, _next) {
+		ucfg = (struct nat44_cfg_nat *)ipfw_get_sopt_space(sd,
+		    sizeof(*ucfg));
+		export_nat_cfg(ptr, ucfg);
+	}
+
+	IPFW_UH_RUNLOCK(chain);
+
+	return (0);
+}
+
+/*
+ * Gets log for given nat instance
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header nat44_cfg_nat ]
+ * Reply: [ ipfw_obj_header nat44_cfg_nat LOGBUFFER ]
+ *
+ * Returns 0 on success
+ */
+static int
+nat44_get_log(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
+    struct sockopt_data *sd)
+{
+	ipfw_obj_header *oh;
+	struct nat44_cfg_nat *ucfg;
+	struct cfg_nat *ptr;
+	void *pbuf;
+	size_t sz;
+
+	sz = sizeof(*oh) + sizeof(*ucfg);
+	/* Check minimum header size */
+	if (sd->valsize < sz)
+		return (EINVAL);
+
+	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
+
+	/* Basic length checks for TLVs */
+	if (oh->ntlv.head.length != sizeof(oh->ntlv))
+		return (EINVAL);
+
+	ucfg = (struct nat44_cfg_nat *)(oh + 1);
+
+	/* Check if name is properly terminated */
+	if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name))
+		return (EINVAL);
+
+	IPFW_UH_RLOCK(chain);
+	ptr = lookup_nat_name(&chain->nat, ucfg->name);
+	if (ptr == NULL) {
+		IPFW_UH_RUNLOCK(chain);
+		return (ESRCH);
+	}
+
+	if (ptr->lib->logDesc == NULL) {
+		IPFW_UH_RUNLOCK(chain);
+		return (ENOENT);
+	}
+
+	export_nat_cfg(ptr, ucfg);
+
+	/* Estimate memory amount */
+	ucfg->size = sizeof(struct nat44_cfg_nat) + LIBALIAS_BUF_SIZE;
+	if (sd->valsize < sz + sizeof(*oh)) {
+
+		/*
+		 * Submitted buffer size is not enough.
+		 * We've already filled in the @ucfg structure with
+		 * the relevant info including size, so we
+		 * can return. Buffer will be flushed automatically.
+ */ + IPFW_UH_RUNLOCK(chain); + return (ENOMEM); + } + + pbuf = (void *)ipfw_get_sopt_space(sd, LIBALIAS_BUF_SIZE); + memcpy(pbuf, ptr->lib->logDesc, LIBALIAS_BUF_SIZE); + + IPFW_UH_RUNLOCK(chain); + + return (0); +} + +static struct ipfw_sopt_handler scodes[] = { + { IP_FW_NAT44_XCONFIG, 0, HDIR_SET, nat44_cfg }, + { IP_FW_NAT44_DESTROY, 0, HDIR_SET, nat44_destroy }, + { IP_FW_NAT44_XGETCONFIG, 0, HDIR_GET, nat44_get_cfg }, + { IP_FW_NAT44_LIST_NAT, 0, HDIR_GET, nat44_list_nat }, + { IP_FW_NAT44_XGETLOG, 0, HDIR_GET, nat44_get_log }, +}; + + +/* + * Legacy configuration routines + */ + +struct cfg_spool_legacy { + LIST_ENTRY(cfg_spool_legacy) _next; + struct in_addr addr; + u_short port; +}; + +struct cfg_redir_legacy { + LIST_ENTRY(cfg_redir) _next; + u_int16_t mode; + struct in_addr laddr; + struct in_addr paddr; + struct in_addr raddr; + u_short lport; + u_short pport; + u_short rport; + u_short pport_cnt; + u_short rport_cnt; + int proto; + struct alias_link **alink; + u_int16_t spool_cnt; + LIST_HEAD(, cfg_spool_legacy) spool_chain; +}; + +struct cfg_nat_legacy { + LIST_ENTRY(cfg_nat_legacy) _next; + int id; + struct in_addr ip; + char if_name[IF_NAMESIZE]; + int mode; + struct libalias *lib; + int redir_cnt; + LIST_HEAD(, cfg_redir_legacy) redir_chain; +}; + +static int +ipfw_nat_cfg(struct sockopt *sopt) +{ + struct cfg_nat_legacy *cfg; + struct nat44_cfg_nat *ucfg; + struct cfg_redir_legacy *rdir; + struct nat44_cfg_redir *urdir; + char *buf; + size_t len, len2; + int error, i; + + len = sopt->sopt_valsize; + len2 = len + 128; + + /* + * Allocate 2x buffer to store converted structures. + * new redir_cfg has shrunk, so we're sure that + * new buffer size is enough. + */ + buf = malloc(roundup2(len, 8) + len2, M_TEMP, M_WAITOK | M_ZERO); + error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat_legacy)); + if (error != 0) + goto out; + + cfg = (struct cfg_nat_legacy *)buf; + if (cfg->id < 0) { + error = EINVAL; + goto out; + } + + ucfg = (struct nat44_cfg_nat *)&buf[roundup2(len, 8)]; + snprintf(ucfg->name, sizeof(ucfg->name), "%d", cfg->id); + strlcpy(ucfg->if_name, cfg->if_name, sizeof(ucfg->if_name)); + ucfg->ip = cfg->ip; + ucfg->mode = cfg->mode; + ucfg->redir_cnt = cfg->redir_cnt; + + if (len < sizeof(*cfg) + cfg->redir_cnt * sizeof(*rdir)) { + error = EINVAL; + goto out; + } + + urdir = (struct nat44_cfg_redir *)(ucfg + 1); + rdir = (struct cfg_redir_legacy *)(cfg + 1); + for (i = 0; i < cfg->redir_cnt; i++) { + urdir->mode = rdir->mode; + urdir->laddr = rdir->laddr; + urdir->paddr = rdir->paddr; + urdir->raddr = rdir->raddr; + urdir->lport = rdir->lport; + urdir->pport = rdir->pport; + urdir->rport = rdir->rport; + urdir->pport_cnt = rdir->pport_cnt; + urdir->rport_cnt = rdir->rport_cnt; + urdir->proto = rdir->proto; + urdir->spool_cnt = rdir->spool_cnt; + + urdir++; + rdir++; + } + + nat44_config(&V_layer3_chain, ucfg); out: free(buf, M_TEMP); @@ -479,18 +999,18 @@ ipfw_nat_del(struct sockopt *sopt) sooptcopyin(sopt, &i, sizeof i, sizeof i); /* XXX validate i */ - IPFW_WLOCK(chain); + IPFW_UH_WLOCK(chain); ptr = lookup_nat(&chain->nat, i); if (ptr == NULL) { - IPFW_WUNLOCK(chain); + IPFW_UH_WUNLOCK(chain); return (EINVAL); } + IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, i); IPFW_WUNLOCK(chain); - del_redir_spool_cfg(ptr, &ptr->redir_chain); - LibAliasUninit(ptr->lib); - free(ptr, M_IPFW); + IPFW_UH_WUNLOCK(chain); + free_nat_instance(ptr); return (0); } @@ -499,28 +1019,31 @@ ipfw_nat_get_cfg(struct sockopt *sopt) { struct ip_fw_chain *chain = 
&V_layer3_chain; struct cfg_nat *n; + struct cfg_nat_legacy *ucfg; struct cfg_redir *r; struct cfg_spool *s; + struct cfg_redir_legacy *ser_r; + struct cfg_spool_legacy *ser_s; char *data; int gencnt, nat_cnt, len, error; nat_cnt = 0; len = sizeof(nat_cnt); - IPFW_RLOCK(chain); + IPFW_UH_RLOCK(chain); retry: gencnt = chain->gencnt; /* Estimate memory amount */ LIST_FOREACH(n, &chain->nat, _next) { nat_cnt++; - len += sizeof(struct cfg_nat); + len += sizeof(struct cfg_nat_legacy); LIST_FOREACH(r, &n->redir_chain, _next) { - len += sizeof(struct cfg_redir); + len += sizeof(struct cfg_redir_legacy); LIST_FOREACH(s, &r->spool_chain, _next) - len += sizeof(struct cfg_spool); + len += sizeof(struct cfg_spool_legacy); } } - IPFW_RUNLOCK(chain); + IPFW_UH_RUNLOCK(chain); data = malloc(len, M_TEMP, M_WAITOK | M_ZERO); bcopy(&nat_cnt, data, sizeof(nat_cnt)); @@ -528,25 +1051,43 @@ retry: nat_cnt = 0; len = sizeof(nat_cnt); - IPFW_RLOCK(chain); + IPFW_UH_RLOCK(chain); if (gencnt != chain->gencnt) { free(data, M_TEMP); goto retry; } /* Serialize all the data. */ LIST_FOREACH(n, &chain->nat, _next) { - bcopy(n, &data[len], sizeof(struct cfg_nat)); - len += sizeof(struct cfg_nat); + ucfg = (struct cfg_nat_legacy *)&data[len]; + ucfg->id = n->id; + ucfg->ip = n->ip; + ucfg->redir_cnt = n->redir_cnt; + ucfg->mode = n->mode; + strlcpy(ucfg->if_name, n->if_name, sizeof(ucfg->if_name)); + len += sizeof(struct cfg_nat_legacy); LIST_FOREACH(r, &n->redir_chain, _next) { - bcopy(r, &data[len], sizeof(struct cfg_redir)); - len += sizeof(struct cfg_redir); + ser_r = (struct cfg_redir_legacy *)&data[len]; + ser_r->mode = r->mode; + ser_r->laddr = r->laddr; + ser_r->paddr = r->paddr; + ser_r->raddr = r->raddr; + ser_r->lport = r->lport; + ser_r->pport = r->pport; + ser_r->rport = r->rport; + ser_r->pport_cnt = r->pport_cnt; + ser_r->rport_cnt = r->rport_cnt; + ser_r->proto = r->proto; + ser_r->spool_cnt = r->spool_cnt; + len += sizeof(struct cfg_redir_legacy); LIST_FOREACH(s, &r->spool_chain, _next) { - bcopy(s, &data[len], sizeof(struct cfg_spool)); - len += sizeof(struct cfg_spool); + ser_s = (struct cfg_spool_legacy *)&data[len]; + ser_s->addr = s->addr; + ser_s->port = s->port; + len += sizeof(struct cfg_spool_legacy); } } } - IPFW_RUNLOCK(chain); + IPFW_UH_RUNLOCK(chain); error = sooptcopyout(sopt, data, len); free(data, M_TEMP); @@ -561,6 +1102,7 @@ ipfw_nat_get_log(struct sockopt *sopt) struct cfg_nat *ptr; int i, size; struct ip_fw_chain *chain; + IPFW_RLOCK_TRACKER; chain = &V_layer3_chain; @@ -609,14 +1151,12 @@ vnet_ipfw_nat_uninit(const void *arg __unused) chain = &V_layer3_chain; IPFW_WLOCK(chain); + V_ipfw_nat_ready = 0; LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { LIST_REMOVE(ptr, _next); - del_redir_spool_cfg(ptr, &ptr->redir_chain); - LibAliasUninit(ptr->lib); - free(ptr, M_IPFW); + free_nat_instance(ptr); } flush_nat_ptrs(chain, -1 /* flush all */); - V_ipfw_nat_ready = 0; IPFW_WUNLOCK(chain); return (0); } @@ -632,6 +1172,7 @@ ipfw_nat_init(void) ipfw_nat_del_ptr = ipfw_nat_del; ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg; ipfw_nat_get_log_ptr = ipfw_nat_get_log; + IPFW_ADD_SOPT_HANDLER(1, scodes); ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change, NULL, EVENTHANDLER_PRI_ANY); @@ -643,6 +1184,7 @@ ipfw_nat_destroy(void) EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); /* deregister ipfw_nat */ + IPFW_DEL_SOPT_HANDLER(1, scodes); ipfw_nat_ptr = NULL; lookup_nat_ptr = NULL; ipfw_nat_cfg_ptr = NULL; @@ -677,14 +1219,14 @@ static moduledata_t ipfw_nat_mod = { }; /* 
Define startup order. */ -#define IPFW_NAT_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN -#define IPFW_NAT_MODEVENT_ORDER (SI_ORDER_ANY - 128) +#define IPFW_NAT_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL +#define IPFW_NAT_MODEVENT_ORDER (SI_ORDER_ANY - 128) /* after ipfw */ #define IPFW_NAT_MODULE_ORDER (IPFW_NAT_MODEVENT_ORDER + 1) #define IPFW_NAT_VNET_ORDER (IPFW_NAT_MODEVENT_ORDER + 2) DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, IPFW_NAT_SI_SUB_FIREWALL, SI_ORDER_ANY); MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1); -MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2); +MODULE_DEPEND(ipfw_nat, ipfw, 3, 3, 3); MODULE_VERSION(ipfw_nat, 1); SYSINIT(ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER, diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_pfil.c b/freebsd/sys/netpfil/ipfw/ip_fw_pfil.c index d2e1b448..59c13aa5 100644 --- a/freebsd/sys/netpfil/ipfw/ip_fw_pfil.c +++ b/freebsd/sys/netpfil/ipfw/ip_fw_pfil.c @@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/route.h> +#include <net/ethernet.h> #include <net/pfil.h> #include <net/vnet.h> @@ -60,6 +61,7 @@ __FBSDID("$FreeBSD$"); #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/ip6_var.h> +#include <netinet6/scope6_var.h> #endif #include <netgraph/ng_ipfw.h> @@ -76,26 +78,39 @@ static VNET_DEFINE(int, fw6_enable) = 1; #define V_fw6_enable VNET(fw6_enable) #endif +static VNET_DEFINE(int, fwlink_enable) = 0; +#define V_fwlink_enable VNET(fwlink_enable) + int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); /* Forward declarations. */ static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int); +int ipfw_check_packet(void *, struct mbuf **, struct ifnet *, int, + struct inpcb *); +int ipfw_check_frame(void *, struct mbuf **, struct ifnet *, int, + struct inpcb *); #ifdef SYSCTL_NODE SYSBEGIN(f1) SYSCTL_DECL(_net_inet_ip_fw); -SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, - ipfw_chg_hook, "I", "Enable ipfw"); +SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, + &VNET_NAME(fw_enable), 0, ipfw_chg_hook, "I", "Enable ipfw"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6_fw); -SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, - ipfw_chg_hook, "I", "Enable ipfw+6"); +SYSCTL_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, + &VNET_NAME(fw6_enable), 0, ipfw_chg_hook, "I", "Enable ipfw+6"); #endif /* INET6 */ +SYSCTL_DECL(_net_link_ether); +SYSCTL_PROC(_net_link_ether, OID_AUTO, ipfw, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, + &VNET_NAME(fwlink_enable), 0, ipfw_chg_hook, "I", + "Pass ether pkts through firewall"); + SYSEND #endif /* SYSCTL_NODE */ @@ -106,7 +121,7 @@ SYSEND * The packet may be consumed. */ int -ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, +ipfw_check_packet(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp) { struct ip_fw_args args; @@ -114,10 +129,6 @@ ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, int ipfw; int ret; - /* all the processing now uses ip_len in net format */ - if (mtod(*m0, struct ip *)->ip_v == 4) - SET_NET_IPLEN(mtod(*m0, struct ip *)); - /* convert dir to IPFW values */ dir = (dir == PFIL_IN) ? 
DIR_IN : DIR_OUT; bzero(&args, sizeof(args)); @@ -131,11 +142,8 @@ again: if (tag != NULL) { args.rule = *((struct ipfw_rule_ref *)(tag+1)); m_tag_delete(*m0, tag); - if (args.rule.info & IPFW_ONEPASS) { - if (mtod(*m0, struct ip *)->ip_v == 4) - SET_HOST_IPLEN(mtod(*m0, struct ip *)); + if (args.rule.info & IPFW_ONEPASS) return (0); - } } args.m = *m0; @@ -192,8 +200,20 @@ again: } #ifdef INET6 if (args.next_hop6 != NULL) { - bcopy(args.next_hop6, (fwd_tag+1), len); - if (in6_localip(&args.next_hop6->sin6_addr)) + struct sockaddr_in6 *sa6; + + sa6 = (struct sockaddr_in6 *)(fwd_tag + 1); + bcopy(args.next_hop6, sa6, len); + /* + * If nh6 address is link-local we should convert + * it to kernel internal form before doing any + * comparisons. + */ + if (sa6_embedscope(sa6, V_ip6_use_defzone) != 0) { + ret = EACCES; + break; + } + if (in6_localip(&sa6->sin6_addr)) (*m0)->m_flags |= M_FASTFWD_OURS; (*m0)->m_flags |= M_IP6_NEXTHOP; } @@ -279,8 +299,112 @@ again: FREE_PKT(*m0); *m0 = NULL; } - if (*m0 && mtod(*m0, struct ip *)->ip_v == 4) - SET_HOST_IPLEN(mtod(*m0, struct ip *)); + + return ret; +} + +/* + * ipfw processing for ethernet packets (in and out). + * Interface is NULL from ether_demux, and ifp from + * ether_output_frame. + */ +int +ipfw_check_frame(void *arg, struct mbuf **m0, struct ifnet *dst, int dir, + struct inpcb *inp) +{ + struct ether_header *eh; + struct ether_header save_eh; + struct mbuf *m; + int i, ret; + struct ip_fw_args args; + struct m_tag *mtag; + + /* fetch start point from rule, if any */ + mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); + if (mtag == NULL) { + args.rule.slot = 0; + } else { + /* dummynet packet, already partially processed */ + struct ipfw_rule_ref *r; + + /* XXX can we free it after use ? */ + mtag->m_tag_id = PACKET_TAG_NONE; + r = (struct ipfw_rule_ref *)(mtag + 1); + if (r->info & IPFW_ONEPASS) + return (0); + args.rule = *r; + } + + /* I need some amt of data to be contiguous */ + m = *m0; + i = min(m->m_pkthdr.len, max_protohdr); + if (m->m_len < i) { + m = m_pullup(m, i); + if (m == NULL) { + *m0 = m; + return (0); + } + } + eh = mtod(m, struct ether_header *); + save_eh = *eh; /* save copy for restore below */ + m_adj(m, ETHER_HDR_LEN); /* strip ethernet header */ + + args.m = m; /* the packet we are looking at */ + args.oif = dir == PFIL_OUT ? dst: NULL; /* destination, if any */ + args.next_hop = NULL; /* we do not support forward yet */ + args.next_hop6 = NULL; /* we do not support forward yet */ + args.eh = &save_eh; /* MAC header for bridged/MAC packets */ + args.inp = NULL; /* used by ipfw uid/gid/jail rules */ + i = ipfw_chk(&args); + m = args.m; + if (m != NULL) { + /* + * Restore Ethernet header, as needed, in case the + * mbuf chain was replaced by ipfw. + */ + M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); + if (m == NULL) { + *m0 = NULL; + return (0); + } + if (eh != mtod(m, struct ether_header *)) + bcopy(&save_eh, mtod(m, struct ether_header *), + ETHER_HDR_LEN); + } + *m0 = m; + + ret = 0; + /* Check result of ipfw_chk() */ + switch (i) { + case IP_FW_PASS: + break; + + case IP_FW_DENY: + ret = EACCES; + break; /* i.e. drop */ + + case IP_FW_DUMMYNET: + ret = EACCES; + int dir; + + if (ip_dn_io_ptr == NULL) + break; /* i.e. drop */ + + *m0 = NULL; + dir = PROTO_LAYER2 | (dst ?
DIR_OUT : DIR_IN); + ip_dn_io_ptr(&m, dir, &args); + return 0; + + default: + KASSERT(0, ("%s: unknown retval", __func__)); + } + + if (ret != 0) { + if (*m0) + FREE_PKT(*m0); + *m0 = NULL; + } + return ret; } @@ -303,7 +427,7 @@ ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, clone = *m0; /* use the original mbuf */ *m0 = NULL; } else { - clone = m_dup(*m0, M_DONTWAIT); + clone = m_dup(*m0, M_NOWAIT); /* If we cannot duplicate the mbuf, we sacrifice the divert * chain and continue with the tee-ed packet. */ @@ -325,7 +449,6 @@ ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, int hlen; struct mbuf *reass; - SET_HOST_IPLEN(ip); /* ip_reass wants host order */ reass = ip_reass(clone); /* Reassemble packet. */ if (reass == NULL) return 0; /* not an error */ @@ -336,7 +459,6 @@ ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, */ ip = mtod(reass, struct ip *); hlen = ip->ip_hl << 2; - SET_NET_IPLEN(ip); ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); @@ -385,13 +507,16 @@ static int ipfw_hook(int onoff, int pf) { struct pfil_head *pfh; + pfil_func_t hook_func; pfh = pfil_head_get(PFIL_TYPE_AF, pf); if (pfh == NULL) return ENOENT; + hook_func = (pf == AF_LINK) ? ipfw_check_frame : ipfw_check_packet; + (void) (onoff ? pfil_add_hook : pfil_remove_hook) - (ipfw_check_hook, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh); + (hook_func, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh); return 0; } @@ -415,51 +540,50 @@ ipfw_attach_hooks(int arg) printf("ipfw6_hook() error\n"); } #endif + if (arg == 0) /* detach */ + ipfw_hook(0, AF_LINK); + else if (V_fwlink_enable && ipfw_hook(1, AF_LINK) != 0) { + error = ENOENT; + printf("ipfw_link_hook() error\n"); + } return error; } int ipfw_chg_hook(SYSCTL_HANDLER_ARGS) { - int enable; - int oldenable; + int newval; int error; int af; - if (arg1 == &VNET_NAME(fw_enable)) { - enable = V_fw_enable; + if (arg1 == &V_fw_enable) af = AF_INET; - } #ifdef INET6 - else if (arg1 == &VNET_NAME(fw6_enable)) { - enable = V_fw6_enable; + else if (arg1 == &V_fw6_enable) af = AF_INET6; - } #endif + else if (arg1 == &V_fwlink_enable) + af = AF_LINK; else return (EINVAL); - oldenable = enable; - - error = sysctl_handle_int(oidp, &enable, 0, req); + newval = *(int *)arg1; + /* Handle sysctl change */ + error = sysctl_handle_int(oidp, &newval, 0, req); if (error) return (error); - enable = (enable) ? 1 : 0; + /* Formalize new value */ + newval = (newval) ? 
1 : 0; - if (enable == oldenable) + if (*(int *)arg1 == newval) return (0); - error = ipfw_hook(enable, af); + error = ipfw_hook(newval, af); if (error) return (error); - if (af == AF_INET) - V_fw_enable = enable; -#ifdef INET6 - else if (af == AF_INET6) - V_fw6_enable = enable; -#endif + *(int *)arg1 = newval; return (0); } diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_private.h b/freebsd/sys/netpfil/ipfw/ip_fw_private.h index ceabf88d..3b483625 100644 --- a/freebsd/sys/netpfil/ipfw/ip_fw_private.h +++ b/freebsd/sys/netpfil/ipfw/ip_fw_private.h @@ -66,14 +66,12 @@ enum { */ struct _ip6dn_args { struct ip6_pktopts *opt_or; - struct route_in6 ro_or; int flags_or; struct ip6_moptions *im6o_or; struct ifnet *origifp_or; struct ifnet *ifp_or; struct sockaddr_in6 dst_or; u_long mtu_or; - struct route_in6 ro_pmtu_or; }; @@ -104,7 +102,10 @@ struct ip_fw_args { struct inpcb *inp; struct _ip6dn_args dummypar; /* dummynet->ip6_output */ - struct sockaddr_in hopstore; /* store here if cannot use a pointer */ + union { /* store here if cannot use a pointer */ + struct sockaddr_in hopstore; + struct sockaddr_in6 hopstore6; + }; }; MALLOC_DECLARE(M_IPFW); @@ -152,10 +153,13 @@ void ipfw_nat_destroy(void); /* In ip_fw_log.c */ struct ip; -void ipfw_log_bpf(int); -void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, - struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, - struct ip *ip); +struct ip_fw_chain; +void ipfw_bpf_init(int); +void ipfw_bpf_uninit(int); +void ipfw_bpf_mtap2(void *, u_int, struct mbuf *); +void ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, + struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, + u_short offset, uint32_t tablearg, struct ip *ip); VNET_DECLARE(u_int64_t, norule_counter); #define V_norule_counter VNET(norule_counter) VNET_DECLARE(int, verbose_limit); @@ -176,22 +180,26 @@ enum { /* result for matching dynamic rules */ * Eventually we may implement it with a callback on the function. 
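 * A minimal usage sketch (assuming the ip_fw2.c call sites): * q = ipfw_lookup_dyn_rule(&args->f_id, &dir, tcp, kidx); * if (q != NULL) { * ... act on the parent rule ... * ipfw_dyn_unlock(q); * } * i.e. the caller releases the lookup result explicitly.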
*/ struct ip_fw_chain; -void ipfw_expire_dyn_rules(struct ip_fw_chain *, struct ip_fw *, int); +struct sockopt_data; +int ipfw_is_dyn_rule(struct ip_fw *rule); +void ipfw_expire_dyn_rules(struct ip_fw_chain *, ipfw_range_tlv *); void ipfw_dyn_unlock(ipfw_dyn_rule *q); struct tcphdr; struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, u_int32_t, u_int32_t, int); -int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, - struct ip_fw_args *args, uint32_t tablearg); +int ipfw_install_state(struct ip_fw_chain *chain, struct ip_fw *rule, + ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg); ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, - int *match_direction, struct tcphdr *tcp); + int *match_direction, struct tcphdr *tcp, uint16_t kidx); void ipfw_remove_dyn_children(struct ip_fw *rule); void ipfw_get_dynamic(struct ip_fw_chain *chain, char **bp, const char *ep); +int ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd); void ipfw_dyn_init(struct ip_fw_chain *); /* per-vnet initialization */ void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ int ipfw_dyn_len(void); +int ipfw_dyn_get_count(void); /* common variables */ VNET_DECLARE(int, fw_one_pass); @@ -203,6 +211,9 @@ VNET_DECLARE(int, fw_verbose); VNET_DECLARE(struct ip_fw_chain, layer3_chain); #define V_layer3_chain VNET(layer3_chain) +VNET_DECLARE(int, ipfw_vnet_ready); +#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) + VNET_DECLARE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) @@ -212,23 +223,66 @@ VNET_DECLARE(int, autoinc_step); VNET_DECLARE(unsigned int, fw_tables_max); #define V_fw_tables_max VNET(fw_tables_max) +VNET_DECLARE(unsigned int, fw_tables_sets); +#define V_fw_tables_sets VNET(fw_tables_sets) + +struct tables_config; + +#ifdef _KERNEL +/* + * Here we have the structure representing an ipfw rule. + * + * It starts with a general area + * followed by an array of one or more instructions, which the code + * accesses as an array of 32-bit values. + * + * Given a rule pointer r: + * + * r->cmd is the start of the first instruction. + * ACTION_PTR(r) is the start of the first action (things to do + * once a rule matched). 
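+ * A typical walk over the opcode array (the same pattern the + * import/export helpers in ip_fw_sockopt.c use) is: + * for (l = r->cmd_len, cmd = r->cmd, cmdlen = 0; l > 0; + * l -= cmdlen, cmd += cmdlen) { + * cmdlen = F_LEN(cmd); + * ... inspect cmd->opcode ... + * }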
+ */ + +struct ip_fw { + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ + uint8_t flags; /* currently unused */ + counter_u64_t cntr; /* Pointer to rule counters */ + uint32_t timestamp; /* tv_sec of last match */ + uint32_t id; /* rule id */ + uint32_t cached_id; /* used by jump_fast */ + uint32_t cached_pos; /* used by jump_fast */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + +#define IPFW_RULE_CNTR_SIZE (2 * sizeof(uint64_t)) + +#endif + struct ip_fw_chain { struct ip_fw **map; /* array of rule ptrs to ease lookup */ uint32_t id; /* ruleset id */ int n_rules; /* number of static rules */ - LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ - struct radix_node_head **tables; /* IPv4 tables */ - struct radix_node_head **xtables; /* extended tables */ - uint8_t *tabletype; /* Array of table types */ + void *tablestate; /* runtime table info */ + void *valuestate; /* runtime table value info */ + int *idxmap; /* skipto array of rules */ + void **srvstate; /* runtime service mappings */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t rwmtx; #else - struct rwlock rwmtx; + struct rmlock rwmtx; #endif - int static_len; /* total len of static rules */ + int static_len; /* total len of static rules (v0) */ uint32_t gencnt; /* NAT generation count */ - struct ip_fw *reap; /* list of rules to reap */ + LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ struct ip_fw *default_rule; + struct tables_config *tblcfg; /* tables module data */ + void *ifcfg; /* interface module data */ + int *idxmap_back; /* standby skipto array of rules */ + struct namedobj_instance *srvmap; /* cfg name->number mappings */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t uh_lock; #else @@ -236,13 +290,81 @@ struct ip_fw_chain { #endif }; +/* 64-byte structure representing multi-field table value */ +struct table_value { + uint32_t tag; /* O_TAG/O_TAGGED */ + uint32_t pipe; /* O_PIPE/O_QUEUE */ + uint16_t divert; /* O_DIVERT/O_TEE */ + uint16_t skipto; /* skipto, CALLRET */ + uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ + uint32_t fib; /* O_SETFIB */ + uint32_t nat; /* O_NAT */ + uint32_t nh4; + uint8_t dscp; + uint8_t spare0; + uint16_t spare1; + /* -- 32 bytes -- */ + struct in6_addr nh6; + uint32_t limit; /* O_LIMIT */ + uint32_t zoneid; /* scope zone id for nh6 */ + uint64_t refcnt; /* Number of references */ +}; + + +struct named_object { + TAILQ_ENTRY(named_object) nn_next; /* namehash */ + TAILQ_ENTRY(named_object) nv_next; /* valuehash */ + char *name; /* object name */ + uint16_t etlv; /* Export TLV id */ + uint8_t subtype;/* object subtype within class */ + uint8_t set; /* set object belongs to */ + uint16_t kidx; /* object kernel index */ + uint16_t spare; + uint32_t ocnt; /* object counter for internal use */ + uint32_t refcnt; /* number of references */ +}; +TAILQ_HEAD(namedobjects_head, named_object); + struct sockopt; /* used by tcp_var.h */ +struct sockopt_data { + caddr_t kbuf; /* allocated buffer */ + size_t ksize; /* given buffer size */ + size_t koff; /* data already used */ + size_t kavail; /* number of bytes available */ + size_t ktotal; /* total bytes pushed */ + struct sockopt *sopt; /* socket data */ + caddr_t sopt_val; /* sopt user buffer */ + size_t valsize; /* original data size */ +}; + +struct ipfw_ifc; + +typedef void (ipfw_ifc_cb)(struct ip_fw_chain *ch, void *cbdata, + uint16_t ifindex); + +struct ipfw_iface { + 
struct named_object no; + char ifname[64]; + int resolved; + uint16_t ifindex; + uint16_t spare; + uint64_t gencnt; + TAILQ_HEAD(, ipfw_ifc) consumers; +}; + +struct ipfw_ifc { + TAILQ_ENTRY(ipfw_ifc) next; + struct ipfw_iface *iface; + ipfw_ifc_cb *cb; + void *cbdata; +}; /* Macro for working with various counters */ #define IPFW_INC_RULE_COUNTER(_cntr, _bytes) do { \ - (_cntr)->pcnt++; \ - (_cntr)->bcnt += _bytes; \ - (_cntr)->timestamp = time_uptime; \ + counter_u64_add((_cntr)->cntr, 1); \ + counter_u64_add((_cntr)->cntr + 1, _bytes); \ + if ((_cntr)->timestamp != time_uptime) \ + (_cntr)->timestamp = time_uptime; \ } while (0) #define IPFW_INC_DYN_COUNTER(_cntr, _bytes) do { \ @@ -251,8 +373,8 @@ struct sockopt; /* used by tcp_var.h */ } while (0) #define IPFW_ZERO_RULE_COUNTER(_cntr) do { \ - (_cntr)->pcnt = 0; \ - (_cntr)->bcnt = 0; \ + counter_u64_zero((_cntr)->cntr); \ + counter_u64_zero((_cntr)->cntr + 1); \ (_cntr)->timestamp = 0; \ } while (0) @@ -261,12 +383,15 @@ struct sockopt; /* used by tcp_var.h */ (_cntr)->bcnt = 0; \ } while (0) -#define IP_FW_ARG_TABLEARG(a) ((a) == IP_FW_TABLEARG) ? tablearg : (a) +#define TARG_VAL(ch, k, f) ((struct table_value *)((ch)->valuestate))[k].f +#define IP_FW_ARG_TABLEARG(ch, a, f) \ + (((a) == IP_FW_TARG) ? TARG_VAL(ch, tablearg, f) : (a)) /* * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c * so the variable and the macros must be here. */ +#if defined( __linux__ ) || defined( _WIN32 ) #define IPFW_LOCK_INIT(_chain) do { \ rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ @@ -280,49 +405,354 @@ struct sockopt; /* used by tcp_var.h */ #define IPFW_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_RLOCKED) #define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) -#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) -#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) -#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) -#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) +#define IPFW_RLOCK_TRACKER +#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) +#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) +#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) +#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) +#define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) +#define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) +#else /* FreeBSD */ +#define IPFW_LOCK_INIT(_chain) do { \ + rm_init(&(_chain)->rwmtx, "IPFW static rules"); \ + rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ + } while (0) + +#define IPFW_LOCK_DESTROY(_chain) do { \ + rm_destroy(&(_chain)->rwmtx); \ + rw_destroy(&(_chain)->uh_lock); \ + } while (0) + +#define IPFW_RLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_RLOCKED) +#define IPFW_WLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_WLOCKED) + +#define IPFW_RLOCK_TRACKER struct rm_priotracker _tracker +#define IPFW_RLOCK(p) rm_rlock(&(p)->rwmtx, &_tracker) +#define IPFW_RUNLOCK(p) rm_runlock(&(p)->rwmtx, &_tracker) +#define IPFW_WLOCK(p) rm_wlock(&(p)->rwmtx) +#define IPFW_WUNLOCK(p) rm_wunlock(&(p)->rwmtx) +#define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) +#define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) +#endif #define IPFW_UH_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_RLOCKED) #define IPFW_UH_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_WLOCKED) +#define IPFW_UH_UNLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_UNLOCKED) #define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) #define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) #define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) #define IPFW_UH_WUNLOCK(p) 
rw_wunlock(&(p)->uh_lock) +struct obj_idx { + uint16_t uidx; /* internal index supplied by userland */ + uint16_t kidx; /* kernel object index */ + uint16_t off; /* tlv offset from rule end in 4-byte words */ + uint8_t spare; + uint8_t type; /* object type within its category */ +}; + +struct rule_check_info { + uint16_t flags; /* rule-specific check flags */ + uint16_t object_opcodes; /* num of opcodes referencing objects */ + uint16_t urule_numoff; /* offset of rulenum in bytes */ + uint8_t version; /* rule version */ + uint8_t spare; + ipfw_obj_ctlv *ctlv; /* name TLV container */ + struct ip_fw *krule; /* resulting rule pointer */ + caddr_t urule; /* original rule pointer */ + struct obj_idx obuf[8]; /* table references storage */ +}; + +/* Legacy interface support */ +/* + * FreeBSD 8 export rule format + */ +struct ip_fw_rule0 { + struct ip_fw *x_next; /* linked list of rules */ + struct ip_fw *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ + uint8_t _pad; /* padding */ + uint32_t id; /* rule id */ + + /* These fields are present in all rules. */ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + +struct ip_fw_bcounter0 { + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ +}; + +/* Kernel rule length */ +/* + * RULE _K_ SIZE _V_ -> + * get kernel size from userland rule version _V_. + * RULE _U_ SIZE _V_ -> + * get user size version _V_ from kernel rule + * RULESIZE _V_ -> + * get user size rule length + */ +/* FreeBSD8 <> current kernel format */ +#define RULEUSIZE0(r) (sizeof(struct ip_fw_rule0) + (r)->cmd_len * 4 - 4) +#define RULEKSIZE0(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) +/* FreeBSD11 <> current kernel format */ +#define RULEUSIZE1(r) (roundup2(sizeof(struct ip_fw_rule) + \ + (r)->cmd_len * 4 - 4, 8)) +#define RULEKSIZE1(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) + +/* + * Tables/Objects index rewriting code + */ + +/* Default and maximum number of ipfw tables/objects. */ +#define IPFW_TABLES_MAX 65536 +#define IPFW_TABLES_DEFAULT 128 +#define IPFW_OBJECTS_MAX 65536 +#define IPFW_OBJECTS_DEFAULT 1024 + +#define CHAIN_TO_SRV(ch) ((ch)->srvmap) +#define SRV_OBJECT(ch, idx) ((ch)->srvstate[(idx)]) + +struct tid_info { + uint32_t set; /* table set */ + uint16_t uidx; /* table index */ + uint8_t type; /* table type */ + uint8_t atype; + uint8_t spare; + int tlen; /* Total TLV size block */ + void *tlvs; /* Pointer to first TLV */ +}; + +/* + * Classifier callback. Checks if @cmd opcode contains kernel object reference. + * If true, returns its index and type. + * Returns 0 if match is found, 1 otherwise. + */ +typedef int (ipfw_obj_rw_cl)(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype); +/* + * Updater callback. Sets kernel object reference index to @puidx + */ +typedef void (ipfw_obj_rw_upd)(ipfw_insn *cmd, uint16_t puidx); +/* + * Finder callback. Tries to find named object by name (specified via @ti). + * Stores found named object pointer in @pno. + * If object was not found, NULL is stored. + * + * Return 0 if input data was valid.
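+ * A conforming finder can be a thin wrapper (sketch only, with + * MY_ETLV standing in for the module's export TLV id): + * static int + * example_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, + * struct named_object **pno) + * { + * return (ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti, + * MY_ETLV, pno)); + * }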
+ */ +typedef int (ipfw_obj_fname_cb)(struct ip_fw_chain *ch, + struct tid_info *ti, struct named_object **pno); +/* + * Another finder callback. Tries to find named object by kernel index. + * + * Returns pointer to named object or NULL. + */ +typedef struct named_object *(ipfw_obj_fidx_cb)(struct ip_fw_chain *ch, + uint16_t kidx); +/* + * Object creator callback. Tries to create object specified by @ti. + * Stores newly-allocated object index in @pkidx. + * + * Returns 0 on success. + */ +typedef int (ipfw_obj_create_cb)(struct ip_fw_chain *ch, struct tid_info *ti, + uint16_t *pkidx); +/* + * Object destroy callback. Intended to free resources allocated by + * create_object callback. + */ +typedef void (ipfw_obj_destroy_cb)(struct ip_fw_chain *ch, + struct named_object *no); +/* + * Sets handler callback. Handles moving and swapping sets of named objects. + * SWAP_ALL moves all named objects from set `set' to `new_set' and vice versa; + * TEST_ALL checks that there aren't any named objects with conflicting names; + * MOVE_ALL moves all named objects from set `set' to `new_set'; + * COUNT_ONE used to count number of references used by object with kidx `set'; + * TEST_ONE checks that named object with kidx `set' can be moved to `new_set'; + * MOVE_ONE moves named object with kidx `set' to set `new_set'. + */ +enum ipfw_sets_cmd { + SWAP_ALL = 0, TEST_ALL, MOVE_ALL, COUNT_ONE, TEST_ONE, MOVE_ONE +}; +typedef int (ipfw_obj_sets_cb)(struct ip_fw_chain *ch, + uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd); + + +struct opcode_obj_rewrite { + uint32_t opcode; /* Opcode to act upon */ + uint32_t etlv; /* Relevant export TLV id */ + ipfw_obj_rw_cl *classifier; /* Check if rewrite is needed */ + ipfw_obj_rw_upd *update; /* update cmd with new value */ + ipfw_obj_fname_cb *find_byname; /* Find named object by name */ + ipfw_obj_fidx_cb *find_bykidx; /* Find named object by kidx */ + ipfw_obj_create_cb *create_object; /* Create named object */ + ipfw_obj_destroy_cb *destroy_object;/* Destroy named object */ + ipfw_obj_sets_cb *manage_sets; /* Swap or move sets */ +}; + +#define IPFW_ADD_OBJ_REWRITER(f, c) do { \ + if ((f) != 0) \ + ipfw_add_obj_rewriter(c, \ + sizeof(c) / sizeof(c[0])); \ + } while(0) +#define IPFW_DEL_OBJ_REWRITER(l, c) do { \ + if ((l) != 0) \ + ipfw_del_obj_rewriter(c, \ + sizeof(c) / sizeof(c[0])); \ + } while(0) + +/* In ip_fw_iface.c */ +int ipfw_iface_init(void); +void ipfw_iface_destroy(void); +void vnet_ipfw_iface_destroy(struct ip_fw_chain *ch); +int ipfw_iface_ref(struct ip_fw_chain *ch, char *name, + struct ipfw_ifc *ic); +void ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic); +void ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); +void ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); + /* In ip_fw_sockopt.c */ +void ipfw_init_skipto_cache(struct ip_fw_chain *chain); +void ipfw_destroy_skipto_cache(struct ip_fw_chain *chain); int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); -int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule); -int ipfw_ctl(struct sockopt *sopt); +int ipfw_ctl3(struct sockopt *sopt); int ipfw_chk(struct ip_fw_args *args); +void ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, + struct ip_fw *rule); void ipfw_reap_rules(struct ip_fw *head); - -/* In ip_fw_pfil */ -int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, - struct inpcb *inp); +void ipfw_init_counters(void); +void ipfw_destroy_counters(void); +struct ip_fw
*ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize); +int ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt); + +typedef int (sopt_handler_f)(struct ip_fw_chain *ch, + ip_fw3_opheader *op3, struct sockopt_data *sd); +struct ipfw_sopt_handler { + uint16_t opcode; + uint8_t version; + uint8_t dir; + sopt_handler_f *handler; + uint64_t refcnt; +}; +#define HDIR_SET 0x01 /* Handler is used to set some data */ +#define HDIR_GET 0x02 /* Handler is used to retrieve data */ +#define HDIR_BOTH HDIR_GET|HDIR_SET + +void ipfw_init_sopt_handler(void); +void ipfw_destroy_sopt_handler(void); +void ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); +int ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); +caddr_t ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed); +caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed); +#define IPFW_ADD_SOPT_HANDLER(f, c) do { \ + if ((f) != 0) \ + ipfw_add_sopt_handler(c, \ + sizeof(c) / sizeof(c[0])); \ + } while(0) +#define IPFW_DEL_SOPT_HANDLER(l, c) do { \ + if ((l) != 0) \ + ipfw_del_sopt_handler(c, \ + sizeof(c) / sizeof(c[0])); \ + } while(0) + +struct namedobj_instance; +typedef int (objhash_cb_t)(struct namedobj_instance *ni, struct named_object *, + void *arg); +typedef uint32_t (objhash_hash_f)(struct namedobj_instance *ni, const void *key, + uint32_t kopt); +typedef int (objhash_cmp_f)(struct named_object *no, const void *key, + uint32_t kopt); +struct namedobj_instance *ipfw_objhash_create(uint32_t items); +void ipfw_objhash_destroy(struct namedobj_instance *); +void ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks); +void ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, + void **idx, int *blocks); +void ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, + void **idx, int *blocks); +void ipfw_objhash_bitmap_free(void *idx, int blocks); +void ipfw_objhash_set_hashf(struct namedobj_instance *ni, objhash_hash_f *f); +struct named_object *ipfw_objhash_lookup_name(struct namedobj_instance *ni, + uint32_t set, char *name); +struct named_object *ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, + uint32_t set, uint32_t type, const char *name); +struct named_object *ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, + uint16_t idx); +int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, + struct named_object *b); +void ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no); +void ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no); +uint32_t ipfw_objhash_count(struct namedobj_instance *ni); +uint32_t ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type); +int ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, + void *arg); +int ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f, + void *arg, uint16_t type); +int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx); +int ipfw_objhash_alloc_idx(void *n, uint16_t *pidx); +void ipfw_objhash_set_funcs(struct namedobj_instance *ni, + objhash_hash_f *hash_f, objhash_cmp_f *cmp_f); +int ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti, + uint32_t etlv, struct named_object **pno); +void ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv); +ipfw_obj_ntlv *ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx, + uint32_t etlv); +void ipfw_init_obj_rewriter(void); +void ipfw_destroy_obj_rewriter(void); +void ipfw_add_obj_rewriter(struct 
opcode_obj_rewrite *rw, size_t count); +int ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count); + +int create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd, + struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti); +void update_opcode_kidx(ipfw_insn *cmd, uint16_t idx); +int classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx); +void ipfw_init_srv(struct ip_fw_chain *ch); +void ipfw_destroy_srv(struct ip_fw_chain *ch); +int ipfw_check_object_name_generic(const char *name); +int ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type, + uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd); + +/* In ip_fw_eaction.c */ +typedef int (ipfw_eaction_t)(struct ip_fw_chain *ch, struct ip_fw_args *args, + ipfw_insn *cmd, int *done); +int ipfw_eaction_init(struct ip_fw_chain *ch, int first); +void ipfw_eaction_uninit(struct ip_fw_chain *ch, int last); + +uint16_t ipfw_add_eaction(struct ip_fw_chain *ch, ipfw_eaction_t handler, + const char *name); +int ipfw_del_eaction(struct ip_fw_chain *ch, uint16_t eaction_id); +int ipfw_run_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args, + ipfw_insn *cmd, int *done); /* In ip_fw_table.c */ -struct radix_node; +struct table_info; + +typedef int (table_lookup_t)(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val); + int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint32_t *val); -int ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, - uint32_t *val, int type); -int ipfw_init_tables(struct ip_fw_chain *ch); -void ipfw_destroy_tables(struct ip_fw_chain *ch); -int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl); -int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, - uint8_t plen, uint8_t mlen, uint8_t type, uint32_t value); -int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, - uint8_t plen, uint8_t mlen, uint8_t type); -int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt); -int ipfw_dump_table_entry(struct radix_node *rn, void *arg); -int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl); -int ipfw_count_xtable(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt); -int ipfw_dump_xtable(struct ip_fw_chain *ch, ipfw_xtable *tbl); +int ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, + uint16_t plen, void *paddr, uint32_t *val); +struct named_object *ipfw_objhash_lookup_table_kidx(struct ip_fw_chain *ch, + uint16_t kidx); +int ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint16_t *kidx); +void ipfw_unref_table(struct ip_fw_chain *ch, uint16_t kidx); +int ipfw_init_tables(struct ip_fw_chain *ch, int first); int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables); +int ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int nsets); +void ipfw_destroy_tables(struct ip_fw_chain *ch, int last); /* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ @@ -341,5 +771,22 @@ extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; +/* Helper functions for IP checksum adjustment */ +static __inline uint16_t +cksum_add(uint16_t sum, uint16_t a) +{ + uint16_t res; + + res = sum + a; + return (res + (res < a)); +} + +static __inline uint16_t +cksum_adjust(uint16_t oldsum, uint16_t old, uint16_t new) +{ + + return (~cksum_add(cksum_add(~oldsum, ~old), new)); +} + #endif /* _KERNEL */ #endif /* _IPFW2_PRIVATE_H */ diff --git 
a/freebsd/sys/netpfil/ipfw/ip_fw_sockopt.c b/freebsd/sys/netpfil/ipfw/ip_fw_sockopt.c index 95cd8c81..468e4ad4 100644 --- a/freebsd/sys/netpfil/ipfw/ip_fw_sockopt.c +++ b/freebsd/sys/netpfil/ipfw/ip_fw_sockopt.c @@ -2,6 +2,8 @@ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * Copyright (c) 2014 Yandex LLC + * Copyright (c) 2014 Alexander V. Chernikov * * Supported by: Valeria Paoli * @@ -31,8 +33,8 @@ __FBSDID("$FreeBSD$"); /* - * Sockopt support for ipfw. The routines here implement - * the upper half of the ipfw code. + * Control socket and rule management routines for ipfw. + * Control is currently implemented via IP_FW3 setsockopt() code. */ #include <rtems/bsd/local/opt_ipfw.h> @@ -51,30 +53,174 @@ __FBSDID("$FreeBSD$"); #include <sys/priv.h> #include <sys/proc.h> #include <sys/rwlock.h> +#include <sys/rmlock.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sysctl.h> #include <sys/syslog.h> +#include <sys/fnv_hash.h> #include <net/if.h> #include <net/route.h> #include <net/vnet.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> #include <netinet/in.h> #include <netinet/ip_var.h> /* hooks */ #include <netinet/ip_fw.h> #include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/ip_fw_table.h> #ifdef MAC #include <security/mac/mac_framework.h> #endif +static int ipfw_ctl(struct sockopt *sopt); +static int check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, + struct rule_check_info *ci); +static int check_ipfw_rule1(struct ip_fw_rule *rule, int size, + struct rule_check_info *ci); +static int check_ipfw_rule0(struct ip_fw_rule0 *rule, int size, + struct rule_check_info *ci); +static int rewrite_rule_uidx(struct ip_fw_chain *chain, + struct rule_check_info *ci); + +#define NAMEDOBJ_HASH_SIZE 32 + +struct namedobj_instance { + struct namedobjects_head *names; + struct namedobjects_head *values; + uint32_t nn_size; /* names hash size */ + uint32_t nv_size; /* number hash size */ + u_long *idx_mask; /* used items bitmask */ + uint32_t max_blocks; /* number of "long" blocks in bitmask */ + uint32_t count; /* number of items */ + uint16_t free_off[IPFW_MAX_SETS]; /* first possible free offset */ + objhash_hash_f *hash_f; + objhash_cmp_f *cmp_f; +}; +#define BLOCK_ITEMS (8 * sizeof(u_long)) /* Number of items for ffsl() */ + +static uint32_t objhash_hash_name(struct namedobj_instance *ni, + const void *key, uint32_t kopt); +static uint32_t objhash_hash_idx(struct namedobj_instance *ni, uint32_t val); +static int objhash_cmp_name(struct named_object *no, const void *name, + uint32_t set); + MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); +static int dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); + +/* ctl3 handler data */ +struct mtx ctl3_lock; +#define CTL3_LOCK_INIT() mtx_init(&ctl3_lock, "ctl3_lock", NULL, 
MTX_DEF) +#define CTL3_LOCK_DESTROY() mtx_destroy(&ctl3_lock) +#define CTL3_LOCK() mtx_lock(&ctl3_lock) +#define CTL3_UNLOCK() mtx_unlock(&ctl3_lock) + +static struct ipfw_sopt_handler *ctl3_handlers; +static size_t ctl3_hsize; +static uint64_t ctl3_refct, ctl3_gencnt; +#define CTL3_SMALLBUF 4096 /* small page-size write buffer */ +#define CTL3_LARGEBUF 16 * 1024 * 1024 /* handle large rulesets */ + +static int ipfw_flush_sopt_data(struct sockopt_data *sd); + +static struct ipfw_sopt_handler scodes[] = { + { IP_FW_XGET, 0, HDIR_GET, dump_config }, + { IP_FW_XADD, 0, HDIR_BOTH, add_rules }, + { IP_FW_XDEL, 0, HDIR_BOTH, del_rules }, + { IP_FW_XZERO, 0, HDIR_SET, clear_rules }, + { IP_FW_XRESETLOG, 0, HDIR_SET, clear_rules }, + { IP_FW_XMOVE, 0, HDIR_SET, move_rules }, + { IP_FW_SET_SWAP, 0, HDIR_SET, manage_sets }, + { IP_FW_SET_MOVE, 0, HDIR_SET, manage_sets }, + { IP_FW_SET_ENABLE, 0, HDIR_SET, manage_sets }, + { IP_FW_DUMP_SOPTCODES, 0, HDIR_GET, dump_soptcodes }, + { IP_FW_DUMP_SRVOBJECTS,0, HDIR_GET, dump_srvobjects }, +}; + +static int +set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule); +static struct opcode_obj_rewrite *find_op_rw(ipfw_insn *cmd, + uint16_t *puidx, uint8_t *ptype); +static int mark_object_kidx(struct ip_fw_chain *ch, struct ip_fw *rule, + uint32_t *bmask); +static int ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule, + struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti); +static int ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, + struct tid_info *ti, struct obj_idx *pidx, int *unresolved); +static void unref_rule_objects(struct ip_fw_chain *chain, struct ip_fw *rule); +static void unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, + struct obj_idx *oib, struct obj_idx *end); +static int export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx, + struct sockopt_data *sd); + +/* + * Opcode object rewriter variables + */ +struct opcode_obj_rewrite *ctl3_rewriters; +static size_t ctl3_rsize; + /* - * static variables followed by global ones (none in this file) + * static variables followed by global ones */ +static VNET_DEFINE(uma_zone_t, ipfw_cntr_zone); +#define V_ipfw_cntr_zone VNET(ipfw_cntr_zone) + +void +ipfw_init_counters() +{ + + V_ipfw_cntr_zone = uma_zcreate("IPFW counters", + IPFW_RULE_CNTR_SIZE, NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, UMA_ZONE_PCPU); +} + +void +ipfw_destroy_counters() +{ + + uma_zdestroy(V_ipfw_cntr_zone); +} + +struct ip_fw * +ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize) +{ + struct ip_fw *rule; + + rule = malloc(rulesize, M_IPFW, M_WAITOK | M_ZERO); + rule->cntr = uma_zalloc(V_ipfw_cntr_zone, M_WAITOK | M_ZERO); + + return (rule); +} + +static void +free_rule(struct ip_fw *rule) +{ + + uma_zfree(V_ipfw_cntr_zone, rule->cntr); + free(rule, M_IPFW); +} + + /* * Find the smallest rule >= key, id. * We could use bsearch but it is so simple that we code it directly @@ -96,11 +242,109 @@ ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) lo = i + 1; /* continue from the next one */ else /* r->id >= id */ hi = i; /* this might be good */ - }; + } return hi; } /* + * Builds skipto cache on rule set @map. 
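+ * The cache maps each possible 16-bit rule number to the map[] + * slot of the first rule numbered >= it, so a skipto target can + * be resolved in O(1), roughly (sketch; the real consumer is the + * jump_fast path in ip_fw2.c): + * f_pos = chain->idxmap[rulenum]; + * instead of binary-searching via ipfw_find_rule().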
+ */ +static void +update_skipto_cache(struct ip_fw_chain *chain, struct ip_fw **map) +{ + int *smap, rulenum; + int i, mi; + + IPFW_UH_WLOCK_ASSERT(chain); + + mi = 0; + rulenum = map[mi]->rulenum; + smap = chain->idxmap_back; + + if (smap == NULL) + return; + + for (i = 0; i < 65536; i++) { + smap[i] = mi; + /* Use the same rule index until i < rulenum */ + if (i != rulenum || i == 65535) + continue; + /* Find next rule with num > i */ + rulenum = map[++mi]->rulenum; + while (rulenum == i) + rulenum = map[++mi]->rulenum; + } +} + +/* + * Swaps prepared (backup) index with current one. + */ +static void +swap_skipto_cache(struct ip_fw_chain *chain) +{ + int *map; + + IPFW_UH_WLOCK_ASSERT(chain); + IPFW_WLOCK_ASSERT(chain); + + map = chain->idxmap; + chain->idxmap = chain->idxmap_back; + chain->idxmap_back = map; +} + +/* + * Allocate and initialize skipto cache. + */ +void +ipfw_init_skipto_cache(struct ip_fw_chain *chain) +{ + int *idxmap, *idxmap_back; + + idxmap = malloc(65536 * sizeof(uint32_t *), M_IPFW, + M_WAITOK | M_ZERO); + idxmap_back = malloc(65536 * sizeof(uint32_t *), M_IPFW, + M_WAITOK | M_ZERO); + + /* + * Note we may be called at any time after initialization, + * for example, on first skipto rule, so we need to + * provide valid chain->idxmap on return + */ + + IPFW_UH_WLOCK(chain); + if (chain->idxmap != NULL) { + IPFW_UH_WUNLOCK(chain); + free(idxmap, M_IPFW); + free(idxmap_back, M_IPFW); + return; + } + + /* Set backup pointer first to permit building cache */ + chain->idxmap_back = idxmap_back; + update_skipto_cache(chain, chain->map); + IPFW_WLOCK(chain); + /* It is now safe to set chain->idxmap ptr */ + chain->idxmap = idxmap; + swap_skipto_cache(chain); + IPFW_WUNLOCK(chain); + IPFW_UH_WUNLOCK(chain); +} + +/* + * Destroys skipto cache. + */ +void +ipfw_destroy_skipto_cache(struct ip_fw_chain *chain) +{ + + if (chain->idxmap != NULL) + free(chain->idxmap, M_IPFW); + if (chain->idxmap != NULL) + free(chain->idxmap_back, M_IPFW); +} + + +/* * allocate a new map, returns the chain locked. extra is the number * of entries to add or delete. */ @@ -110,11 +354,12 @@ get_map(struct ip_fw_chain *chain, int extra, int locked) for (;;) { struct ip_fw **map; - int i; + int i, mflags; + + mflags = M_ZERO | ((locked != 0) ? M_NOWAIT : M_WAITOK); i = chain->n_rules + extra; - map = malloc(i * sizeof(struct ip_fw *), M_IPFW, - locked ? 
M_NOWAIT : M_WAITOK); + map = malloc(i * sizeof(struct ip_fw *), M_IPFW, mflags); if (map == NULL) { printf("%s: cannot allocate map\n", __FUNCTION__); return NULL; @@ -143,69 +388,403 @@ swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) chain->n_rules = new_len; old_map = chain->map; chain->map = new_map; + swap_skipto_cache(chain); IPFW_WUNLOCK(chain); return old_map; } + +static void +export_cntr1_base(struct ip_fw *krule, struct ip_fw_bcounter *cntr) +{ + struct timeval boottime; + + cntr->size = sizeof(*cntr); + + if (krule->cntr != NULL) { + cntr->pcnt = counter_u64_fetch(krule->cntr); + cntr->bcnt = counter_u64_fetch(krule->cntr + 1); + cntr->timestamp = krule->timestamp; + } + if (cntr->timestamp > 0) { + getboottime(&boottime); + cntr->timestamp += boottime.tv_sec; + } +} + +static void +export_cntr0_base(struct ip_fw *krule, struct ip_fw_bcounter0 *cntr) +{ + struct timeval boottime; + + if (krule->cntr != NULL) { + cntr->pcnt = counter_u64_fetch(krule->cntr); + cntr->bcnt = counter_u64_fetch(krule->cntr + 1); + cntr->timestamp = krule->timestamp; + } + if (cntr->timestamp > 0) { + getboottime(&boottime); + cntr->timestamp += boottime.tv_sec; + } +} + +/* + * Copies rule @urule from v1 userland format (current). + * to kernel @krule. + * Assume @krule is zeroed. + */ +static void +import_rule1(struct rule_check_info *ci) +{ + struct ip_fw_rule *urule; + struct ip_fw *krule; + + urule = (struct ip_fw_rule *)ci->urule; + krule = (struct ip_fw *)ci->krule; + + /* copy header */ + krule->act_ofs = urule->act_ofs; + krule->cmd_len = urule->cmd_len; + krule->rulenum = urule->rulenum; + krule->set = urule->set; + krule->flags = urule->flags; + + /* Save rulenum offset */ + ci->urule_numoff = offsetof(struct ip_fw_rule, rulenum); + + /* Copy opcodes */ + memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); +} + +/* + * Export rule into v1 format (Current). + * Layout: + * [ ipfw_obj_tlv(IPFW_TLV_RULE_ENT) + * [ ip_fw_rule ] OR + * [ ip_fw_bcounter ip_fw_rule] (depends on rcntrs). + * ] + * Assume @data is zeroed. + */ +static void +export_rule1(struct ip_fw *krule, caddr_t data, int len, int rcntrs) +{ + struct ip_fw_bcounter *cntr; + struct ip_fw_rule *urule; + ipfw_obj_tlv *tlv; + + /* Fill in TLV header */ + tlv = (ipfw_obj_tlv *)data; + tlv->type = IPFW_TLV_RULE_ENT; + tlv->length = len; + + if (rcntrs != 0) { + /* Copy counters */ + cntr = (struct ip_fw_bcounter *)(tlv + 1); + urule = (struct ip_fw_rule *)(cntr + 1); + export_cntr1_base(krule, cntr); + } else + urule = (struct ip_fw_rule *)(tlv + 1); + + /* copy header */ + urule->act_ofs = krule->act_ofs; + urule->cmd_len = krule->cmd_len; + urule->rulenum = krule->rulenum; + urule->set = krule->set; + urule->flags = krule->flags; + urule->id = krule->id; + + /* Copy opcodes */ + memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t)); +} + + +/* + * Copies rule @urule from FreeBSD8 userland format (v0) + * to kernel @krule. + * Assume @krule is zeroed. 
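+ * (The caller is assumed to have sized and obtained @krule via + * the kernel-size macro, roughly: + * ci->krule = ipfw_alloc_rule(chain, RULEKSIZE0(urule)); + * a sketch of the allocation step in the IP_FW_ADD path, so the + * opcode copy below fits.)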
+ */ +static void +import_rule0(struct rule_check_info *ci) +{ + struct ip_fw_rule0 *urule; + struct ip_fw *krule; + int cmdlen, l; + ipfw_insn *cmd; + ipfw_insn_limit *lcmd; + ipfw_insn_if *cmdif; + + urule = (struct ip_fw_rule0 *)ci->urule; + krule = (struct ip_fw *)ci->krule; + + /* copy header */ + krule->act_ofs = urule->act_ofs; + krule->cmd_len = urule->cmd_len; + krule->rulenum = urule->rulenum; + krule->set = urule->set; + if ((urule->_pad & 1) != 0) + krule->flags |= IPFW_RULE_NOOPT; + + /* Save rulenum offset */ + ci->urule_numoff = offsetof(struct ip_fw_rule0, rulenum); + + /* Copy opcodes */ + memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); + + /* + * Alter opcodes: + * 1) convert tablearg value from 65535 to 0 + * 2) Add high bit to O_SETFIB/O_SETDSCP values (to make room + * for targ). + * 3) convert table number in iface opcodes to u16 + * 4) convert old `nat global` into new 65535 + */ + l = krule->cmd_len; + cmd = krule->cmd; + cmdlen = 0; + + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + switch (cmd->opcode) { + /* Opcodes supporting tablearg */ + case O_TAG: + case O_TAGGED: + case O_PIPE: + case O_QUEUE: + case O_DIVERT: + case O_TEE: + case O_SKIPTO: + case O_CALLRETURN: + case O_NETGRAPH: + case O_NGTEE: + case O_NAT: + if (cmd->arg1 == IP_FW_TABLEARG) + cmd->arg1 = IP_FW_TARG; + else if (cmd->arg1 == 0) + cmd->arg1 = IP_FW_NAT44_GLOBAL; + break; + case O_SETFIB: + case O_SETDSCP: + if (cmd->arg1 == IP_FW_TABLEARG) + cmd->arg1 = IP_FW_TARG; + else + cmd->arg1 |= 0x8000; + break; + case O_LIMIT: + lcmd = (ipfw_insn_limit *)cmd; + if (lcmd->conn_limit == IP_FW_TABLEARG) + lcmd->conn_limit = IP_FW_TARG; + break; + /* Interface tables */ + case O_XMIT: + case O_RECV: + case O_VIA: + /* Interface table, possibly */ + cmdif = (ipfw_insn_if *)cmd; + if (cmdif->name[0] != '\1') + break; + + cmdif->p.kidx = (uint16_t)cmdif->p.glob; + break; + } + } +} + +/* + * Copies rule @krule from kernel to FreeBSD8 userland format (v0) + */ +static void +export_rule0(struct ip_fw *krule, struct ip_fw_rule0 *urule, int len) +{ + int cmdlen, l; + ipfw_insn *cmd; + ipfw_insn_limit *lcmd; + ipfw_insn_if *cmdif; + + /* copy header */ + memset(urule, 0, len); + urule->act_ofs = krule->act_ofs; + urule->cmd_len = krule->cmd_len; + urule->rulenum = krule->rulenum; + urule->set = krule->set; + if ((krule->flags & IPFW_RULE_NOOPT) != 0) + urule->_pad |= 1; + + /* Copy opcodes */ + memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t)); + + /* Export counters */ + export_cntr0_base(krule, (struct ip_fw_bcounter0 *)&urule->pcnt); + + /* + * Alter opcodes: + * 1) convert tablearg value from 0 to 65535 + * 2) Remove highest bit from O_SETFIB/O_SETDSCP values. 
+ * 3) convert table number in iface opcodes to int + */ + l = urule->cmd_len; + cmd = urule->cmd; + cmdlen = 0; + + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + switch (cmd->opcode) { + /* Opcodes supporting tablearg */ + case O_TAG: + case O_TAGGED: + case O_PIPE: + case O_QUEUE: + case O_DIVERT: + case O_TEE: + case O_SKIPTO: + case O_CALLRETURN: + case O_NETGRAPH: + case O_NGTEE: + case O_NAT: + if (cmd->arg1 == IP_FW_TARG) + cmd->arg1 = IP_FW_TABLEARG; + else if (cmd->arg1 == IP_FW_NAT44_GLOBAL) + cmd->arg1 = 0; + break; + case O_SETFIB: + case O_SETDSCP: + if (cmd->arg1 == IP_FW_TARG) + cmd->arg1 = IP_FW_TABLEARG; + else + cmd->arg1 &= ~0x8000; + break; + case O_LIMIT: + lcmd = (ipfw_insn_limit *)cmd; + if (lcmd->conn_limit == IP_FW_TARG) + lcmd->conn_limit = IP_FW_TABLEARG; + break; + /* Interface tables */ + case O_XMIT: + case O_RECV: + case O_VIA: + /* Interface table, possibly */ + cmdif = (ipfw_insn_if *)cmd; + if (cmdif->name[0] != '\1') + break; + + cmdif->p.glob = cmdif->p.kidx; + break; + } + } +} + /* - * Add a new rule to the list. Copy the rule into a malloc'ed area, then - * possibly create a rule number and add the rule to the list. + * Add new rule(s) to the list possibly creating rule number for each. * Update the rule_number in the input struct so the caller knows it as well. - * XXX DO NOT USE FOR THE DEFAULT RULE. * Must be called without IPFW_UH held */ -int -ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) +static int +commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, int count) { - struct ip_fw *rule; - int i, l, insert_before; + int error, i, insert_before, tcount; + uint16_t rulenum, *pnum; + struct rule_check_info *ci; + struct ip_fw *krule; struct ip_fw **map; /* the new array of pointers */ - if (chain->map == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE - 1) - return (EINVAL); + /* Check if we need to do table/obj index remap */ + tcount = 0; + for (ci = rci, i = 0; i < count; ci++, i++) { + if (ci->object_opcodes == 0) + continue; + + /* + * Rule has some object opcodes. + * We need to find (and create non-existing) + * kernel objects, and reference existing ones. + */ + error = rewrite_rule_uidx(chain, ci); + if (error != 0) { + + /* + * rewrite failed, state for current rule + * has been reverted. Check if we need to + * revert more. + */ + if (tcount > 0) { + + /* + * We have some more table rules + * we need to rollback. 
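+ * Every earlier ci->krule that carried object opcodes + * already holds object references, and each of them + * must be released again under IPFW_UH_WLOCK; that is + * what the loop below does.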
+ */ + + IPFW_UH_WLOCK(chain); + while (ci != rci) { + ci--; + if (ci->object_opcodes == 0) + continue; + unref_rule_objects(chain,ci->krule); + + } + IPFW_UH_WUNLOCK(chain); + + } + + return (error); + } + + tcount++; + } - l = RULESIZE(input_rule); - rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO); - if (rule == NULL) - return (ENOSPC); /* get_map returns with IPFW_UH_WLOCK if successful */ - map = get_map(chain, 1, 0 /* not locked */); + map = get_map(chain, count, 0 /* not locked */); if (map == NULL) { - free(rule, M_IPFW); - return ENOSPC; - } + if (tcount > 0) { + /* Unbind tables */ + IPFW_UH_WLOCK(chain); + for (ci = rci, i = 0; i < count; ci++, i++) { + if (ci->object_opcodes == 0) + continue; + + unref_rule_objects(chain, ci->krule); + } + IPFW_UH_WUNLOCK(chain); + } - bcopy(input_rule, rule, l); - /* clear fields not settable from userland */ - rule->x_next = NULL; - rule->next_rule = NULL; - IPFW_ZERO_RULE_COUNTER(rule); + return (ENOSPC); + } if (V_autoinc_step < 1) V_autoinc_step = 1; else if (V_autoinc_step > 1000) V_autoinc_step = 1000; + + /* FIXME: Handle count > 1 */ + ci = rci; + krule = ci->krule; + rulenum = krule->rulenum; + /* find the insertion point, we will insert before */ - insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE; + insert_before = rulenum ? rulenum + 1 : IPFW_DEFAULT_RULE; i = ipfw_find_rule(chain, insert_before, 0); /* duplicate first part */ if (i > 0) bcopy(chain->map, map, i * sizeof(struct ip_fw *)); - map[i] = rule; + map[i] = krule; /* duplicate remaining part, we always have the default rule */ bcopy(chain->map + i, map + i + 1, sizeof(struct ip_fw *) *(chain->n_rules - i)); - if (rule->rulenum == 0) { - /* write back the number */ - rule->rulenum = i > 0 ? map[i-1]->rulenum : 0; - if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) - rule->rulenum += V_autoinc_step; - input_rule->rulenum = rule->rulenum; + if (rulenum == 0) { + /* Compute rule number and write it back */ + rulenum = i > 0 ? map[i-1]->rulenum : 0; + if (rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) + rulenum += V_autoinc_step; + krule->rulenum = rulenum; + /* Save number to userland rule */ + pnum = (uint16_t *)((caddr_t)ci->urule + ci->urule_numoff); + *pnum = rulenum; } - rule->id = chain->id + 1; + krule->id = chain->id + 1; + update_skipto_cache(chain, map); map = swap_map(chain, map, chain->n_rules + 1); - chain->static_len += l; + chain->static_len += RULEUSIZE0(krule); IPFW_UH_WUNLOCK(chain); if (map) free(map, M_IPFW); @@ -213,6 +792,23 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) } /* + * Adds @rule to the list of rules to reap + */ +void +ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, + struct ip_fw *rule) +{ + + IPFW_UH_WLOCK_ASSERT(chain); + + /* Unlink rule from everywhere */ + unref_rule_objects(chain, rule); + + *((struct ip_fw **)rule) = *head; + *head = rule; +} + +/* * Reclaim storage associated with a list of rules. This is * typically the list created using remove_rule. * A NULL pointer on input is handled correctly. @@ -223,22 +819,12 @@ ipfw_reap_rules(struct ip_fw *head) struct ip_fw *rule; while ((rule = head) != NULL) { - head = head->x_next; - free(rule, M_IPFW); + head = *((struct ip_fw **)head); + free_rule(rule); } } /* - * Used by del_entry() to check if a rule should be kept. - * Returns 1 if the rule must be kept, 0 otherwise. - * - * Called with cmd = {0,1,5}. 
- * cmd == 0 matches on rule numbers, excludes rules in RESVD_SET if n == 0 ; - * cmd == 1 matches on set numbers only, rule numbers are ignored; - * cmd == 5 matches on rule and set numbers. - * - * n == 0 is a wildcard for rule numbers, there is no wildcard for sets. - * * Rules to keep are * (default || reserved || !match_set || !match_number) * where @@ -255,14 +841,608 @@ ipfw_reap_rules(struct ip_fw *head) * // number is ignored for cmd == 1 or n == 0 * */ +int +ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt) +{ + + /* Don't match default rule for modification queries */ + if (rule->rulenum == IPFW_DEFAULT_RULE && + (rt->flags & IPFW_RCFLAG_DEFAULT) == 0) + return (0); + + /* Don't match rules in reserved set for flush requests */ + if ((rt->flags & IPFW_RCFLAG_ALL) != 0 && rule->set == RESVD_SET) + return (0); + + /* If we're filtering by set, don't match other sets */ + if ((rt->flags & IPFW_RCFLAG_SET) != 0 && rule->set != rt->set) + return (0); + + if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 && + (rule->rulenum < rt->start_rule || rule->rulenum > rt->end_rule)) + return (0); + + return (1); +} + +struct manage_sets_args { + uint16_t set; + uint8_t new_set; +}; + +static int +swap_sets_cb(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + struct manage_sets_args *args; + + args = (struct manage_sets_args *)arg; + if (no->set == (uint8_t)args->set) + no->set = args->new_set; + else if (no->set == args->new_set) + no->set = (uint8_t)args->set; + return (0); +} + +static int +move_sets_cb(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + struct manage_sets_args *args; + + args = (struct manage_sets_args *)arg; + if (no->set == (uint8_t)args->set) + no->set = args->new_set; + return (0); +} + +static int +test_sets_cb(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + struct manage_sets_args *args; + + args = (struct manage_sets_args *)arg; + if (no->set != (uint8_t)args->set) + return (0); + if (ipfw_objhash_lookup_name_type(ni, args->new_set, + no->etlv, no->name) != NULL) + return (EEXIST); + return (0); +} + +/* + * Generic function to handler moving and swapping sets. + */ +int +ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type, + uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd) +{ + struct manage_sets_args args; + struct named_object *no; + + args.set = set; + args.new_set = new_set; + switch (cmd) { + case SWAP_ALL: + return (ipfw_objhash_foreach_type(ni, swap_sets_cb, + &args, type)); + case TEST_ALL: + return (ipfw_objhash_foreach_type(ni, test_sets_cb, + &args, type)); + case MOVE_ALL: + return (ipfw_objhash_foreach_type(ni, move_sets_cb, + &args, type)); + case COUNT_ONE: + /* + * @set used to pass kidx. + * When @new_set is zero - reset object counter, + * otherwise increment it. + */ + no = ipfw_objhash_lookup_kidx(ni, set); + if (new_set != 0) + no->ocnt++; + else + no->ocnt = 0; + return (0); + case TEST_ONE: + /* @set used to pass kidx */ + no = ipfw_objhash_lookup_kidx(ni, set); + /* + * First check number of references: + * when it differs, this mean other rules are holding + * reference to given object, so it is not possible to + * change its set. Note that refcnt may account references + * to some going-to-be-added rules. Since we don't know + * their numbers (and even if they will be added) it is + * perfectly OK to return error here. 
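/*
 * [Illustration, not part of the patch] Sketch of how a caller would
 * describe "rules 100..199 in set 3" for ipfw_match_range() above; it
 * assumes the userland-visible ipfw_range_tlv and IPFW_RCFLAG_* macros
 * from <netinet/ip_fw.h>, and the concrete numbers are made up.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_fw.h>
#include <string.h>

static void
fill_example_range(ipfw_range_tlv *rt)
{
	memset(rt, 0, sizeof(*rt));
	rt->head.length = sizeof(*rt);	/* checked by check_range_tlv() */
	rt->flags = IPFW_RCFLAG_RANGE | IPFW_RCFLAG_SET;
	rt->start_rule = 100;
	rt->end_rule = 199;
	rt->set = 3;
	/*
	 * ipfw_match_range(rule, rt) then returns 1 only for rules
	 * numbered 100..199 in set 3; the default rule is matched only
	 * when IPFW_RCFLAG_DEFAULT is also set.
	 */
}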
+ */ + if (no->ocnt != no->refcnt) + return (EBUSY); + if (ipfw_objhash_lookup_name_type(ni, new_set, type, + no->name) != NULL) + return (EEXIST); + return (0); + case MOVE_ONE: + /* @set used to pass kidx */ + no = ipfw_objhash_lookup_kidx(ni, set); + no->set = new_set; + return (0); + } + return (EINVAL); +} + +/* + * Delete rules matching range @rt. + * Saves number of deleted rules in @ndel. + * + * Returns 0 on success. + */ +static int +delete_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int *ndel) +{ + struct ip_fw *reap, *rule, **map; + int end, start; + int i, n, ndyn, ofs; + + reap = NULL; + IPFW_UH_WLOCK(chain); /* arbitrate writers */ + + /* + * Stage 1: Determine range to inspect. + * Range is half-inclusive, e.g [start, end). + */ + start = 0; + end = chain->n_rules - 1; + + if ((rt->flags & IPFW_RCFLAG_RANGE) != 0) { + start = ipfw_find_rule(chain, rt->start_rule, 0); + + end = ipfw_find_rule(chain, rt->end_rule, 0); + if (rt->end_rule != IPFW_DEFAULT_RULE) + while (chain->map[end]->rulenum == rt->end_rule) + end++; + } + + /* Allocate new map of the same size */ + map = get_map(chain, 0, 1 /* locked */); + if (map == NULL) { + IPFW_UH_WUNLOCK(chain); + return (ENOMEM); + } + + n = 0; + ndyn = 0; + ofs = start; + /* 1. bcopy the initial part of the map */ + if (start > 0) + bcopy(chain->map, map, start * sizeof(struct ip_fw *)); + /* 2. copy active rules between start and end */ + for (i = start; i < end; i++) { + rule = chain->map[i]; + if (ipfw_match_range(rule, rt) == 0) { + map[ofs++] = rule; + continue; + } + + n++; + if (ipfw_is_dyn_rule(rule) != 0) + ndyn++; + } + /* 3. copy the final part of the map */ + bcopy(chain->map + end, map + ofs, + (chain->n_rules - end) * sizeof(struct ip_fw *)); + /* 4. recalculate skipto cache */ + update_skipto_cache(chain, map); + /* 5. swap the maps (under UH_WLOCK + WHLOCK) */ + map = swap_map(chain, map, chain->n_rules - n); + /* 6. Remove all dynamic states originated by deleted rules */ + if (ndyn > 0) + ipfw_expire_dyn_rules(chain, rt); + /* 7. now remove the rules deleted from the old map */ + for (i = start; i < end; i++) { + rule = map[i]; + if (ipfw_match_range(rule, rt) == 0) + continue; + chain->static_len -= RULEUSIZE0(rule); + ipfw_reap_add(chain, &reap, rule); + } + IPFW_UH_WUNLOCK(chain); + + ipfw_reap_rules(reap); + if (map != NULL) + free(map, M_IPFW); + *ndel = n; + return (0); +} + +static int +move_objects(struct ip_fw_chain *ch, ipfw_range_tlv *rt) +{ + struct opcode_obj_rewrite *rw; + struct ip_fw *rule; + ipfw_insn *cmd; + int cmdlen, i, l, c; + uint16_t kidx; + + IPFW_UH_WLOCK_ASSERT(ch); + + /* Stage 1: count number of references by given rules */ + for (c = 0, i = 0; i < ch->n_rules - 1; i++) { + rule = ch->map[i]; + if (ipfw_match_range(rule, rt) == 0) + continue; + if (rule->set == rt->new_set) /* nothing to do */ + continue; + /* Search opcodes with named objects */ + for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd; + l > 0; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + rw = find_op_rw(cmd, &kidx, NULL); + if (rw == NULL || rw->manage_sets == NULL) + continue; + /* + * When manage_sets() returns non-zero value to + * COUNT_ONE command, consider this as an object + * doesn't support sets (e.g. disabled with sysctl). + * So, skip checks for this object. 
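/*
 * [Illustration, not part of the patch] delete_range() above never edits
 * the live map: it builds a filtered copy (prefix, filtered middle window,
 * suffix) and publishes it atomically via swap_map(). The same pattern on
 * a toy integer array:
 */
#include <stdio.h>
#include <string.h>

#define N 8

int
main(void)
{
	int oldmap[N] = { 10, 20, 30, 40, 50, 60, 70, 80 };
	int newmap[N];
	int start = 2, end = 6;		/* half-open window [start, end) */
	int i, ofs;

	/* 1. copy the initial part of the map */
	memcpy(newmap, oldmap, start * sizeof(int));
	/* 2. copy window entries that do NOT "match" (here: drop 40, 60) */
	for (ofs = start, i = start; i < end; i++)
		if (oldmap[i] != 40 && oldmap[i] != 60)
			newmap[ofs++] = oldmap[i];
	/* 3. copy the final part of the map */
	memcpy(newmap + ofs, oldmap + end, (N - end) * sizeof(int));

	for (i = 0; i < ofs + N - end; i++)
		printf("%d ", newmap[i]);	/* 10 20 30 50 70 80 */
	printf("\n");
	return (0);
}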
+ */ + if (rw->manage_sets(ch, kidx, 1, COUNT_ONE) != 0) + continue; + c++; + } + } + if (c == 0) /* No objects found */ + return (0); + /* Stage 2: verify "ownership" */ + for (c = 0, i = 0; (i < ch->n_rules - 1) && c == 0; i++) { + rule = ch->map[i]; + if (ipfw_match_range(rule, rt) == 0) + continue; + if (rule->set == rt->new_set) /* nothing to do */ + continue; + /* Search opcodes with named objects */ + for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd; + l > 0 && c == 0; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + rw = find_op_rw(cmd, &kidx, NULL); + if (rw == NULL || rw->manage_sets == NULL) + continue; + /* Test for ownership and conflicting names */ + c = rw->manage_sets(ch, kidx, + (uint8_t)rt->new_set, TEST_ONE); + } + } + /* Stage 3: change set and cleanup */ + for (i = 0; i < ch->n_rules - 1; i++) { + rule = ch->map[i]; + if (ipfw_match_range(rule, rt) == 0) + continue; + if (rule->set == rt->new_set) /* nothing to do */ + continue; + /* Search opcodes with named objects */ + for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd; + l > 0; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + rw = find_op_rw(cmd, &kidx, NULL); + if (rw == NULL || rw->manage_sets == NULL) + continue; + /* cleanup object counter */ + rw->manage_sets(ch, kidx, + 0 /* reset counter */, COUNT_ONE); + if (c != 0) + continue; + /* change set */ + rw->manage_sets(ch, kidx, + (uint8_t)rt->new_set, MOVE_ONE); + } + } + return (c); +} + +/* + * Moves all rules matching range @rt to set @rt->new_set. + * + * Returns 0 on success. + */ +static int +move_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt) +{ + struct ip_fw *rule; + int i; + + IPFW_UH_WLOCK(chain); + + /* + * Move rules with matching parameters to a new set. + * This one is much more complex. We have to ensure + * that all referenced tables (if any) are referenced + * by the given rule subset only. Otherwise, we can't move + * them to the new set and have to return an error. + */ + if ((i = move_objects(chain, rt)) != 0) { + IPFW_UH_WUNLOCK(chain); + return (i); + } + + /* XXX: We have to do the swap holding WLOCK */ + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + if (ipfw_match_range(rule, rt) == 0) + continue; + rule->set = rt->new_set; + } + + IPFW_UH_WUNLOCK(chain); + + return (0); +} + +/* + * Clear counters for a specific rule. + * Normally run under IPFW_UH_RLOCK, but these are idempotent ops + * so we only care that rules do not disappear. + */ +static void +clear_counters(struct ip_fw *rule, int log_only) +{ + ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); + + if (log_only == 0) + IPFW_ZERO_RULE_COUNTER(rule); + if (l->o.opcode == O_LOG) + l->log_left = l->max_log; +} + +/* + * Flushes rule counters and/or log values on the matching range. + * + * Returns number of items cleared.
+ */ +static int +clear_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int log_only) +{ + struct ip_fw *rule; + int num; + int i; + + num = 0; + rt->flags |= IPFW_RCFLAG_DEFAULT; + + IPFW_UH_WLOCK(chain); /* arbitrate writers */ + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + if (ipfw_match_range(rule, rt) == 0) + continue; + clear_counters(rule, log_only); + num++; + } + IPFW_UH_WUNLOCK(chain); + + return (num); +} + +static int +check_range_tlv(ipfw_range_tlv *rt) +{ + + if (rt->head.length != sizeof(*rt)) + return (1); + if (rt->start_rule > rt->end_rule) + return (1); + if (rt->set >= IPFW_MAX_SETS || rt->new_set >= IPFW_MAX_SETS) + return (1); + + if ((rt->flags & IPFW_RCFLAG_USER) != rt->flags) + return (1); + + return (0); +} + +/* + * Delete rules matching specified parameters + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_range_tlv ] + * Reply: [ ipfw_obj_header ipfw_range_tlv ] + * + * Saves number of deleted rules in ipfw_range_tlv->new_set. + * + * Returns 0 on success. + */ +static int +del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_range_header *rh; + int error, ndel; + + if (sd->valsize != sizeof(*rh)) + return (EINVAL); + + rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); + + if (check_range_tlv(&rh->range) != 0) + return (EINVAL); + + ndel = 0; + if ((error = delete_range(chain, &rh->range, &ndel)) != 0) + return (error); + + /* Save number of rules deleted */ + rh->range.new_set = ndel; + return (0); +} + +/* + * Move rules/sets matching specified parameters + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_range_tlv ] + * + * Returns 0 on success. + */ +static int +move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_range_header *rh; + + if (sd->valsize != sizeof(*rh)) + return (EINVAL); + + rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); + + if (check_range_tlv(&rh->range) != 0) + return (EINVAL); + + return (move_range(chain, &rh->range)); +} + +/* + * Clear rule accounting data matching specified parameters + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_range_tlv ] + * Reply: [ ipfw_obj_header ipfw_range_tlv ] + * + * Saves number of cleared rules in ipfw_range_tlv->new_set. + * + * Returns 0 on success. + */ static int -keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n) +clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) { - return - (rule->rulenum == IPFW_DEFAULT_RULE) || - (cmd == 0 && n == 0 && rule->set == RESVD_SET) || - !(cmd == 0 || rule->set == set) || - !(cmd == 1 || n == 0 || n == rule->rulenum); + ipfw_range_header *rh; + int log_only, num; + char *msg; + + if (sd->valsize != sizeof(*rh)) + return (EINVAL); + + rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); + + if (check_range_tlv(&rh->range) != 0) + return (EINVAL); + + log_only = (op3->opcode == IP_FW_XRESETLOG); + + num = clear_range(chain, &rh->range, log_only); + + if (rh->range.flags & IPFW_RCFLAG_ALL) + msg = log_only ? "All logging counts reset" : + "Accounting cleared"; + else + msg = log_only ? 
"logging count reset" : "cleared"; + + if (V_fw_verbose) { + int lev = LOG_SECURITY | LOG_NOTICE; + log(lev, "ipfw: %s.\n", msg); + } + + /* Save number of rules cleared */ + rh->range.new_set = num; + return (0); +} + +static void +enable_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt) +{ + uint32_t v_set; + + IPFW_UH_WLOCK_ASSERT(chain); + + /* Change enabled/disabled sets mask */ + v_set = (V_set_disable | rt->set) & ~rt->new_set; + v_set &= ~(1 << RESVD_SET); /* set RESVD_SET always enabled */ + IPFW_WLOCK(chain); + V_set_disable = v_set; + IPFW_WUNLOCK(chain); +} + +static int +swap_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int mv) +{ + struct opcode_obj_rewrite *rw; + struct ip_fw *rule; + int i; + + IPFW_UH_WLOCK_ASSERT(chain); + + if (rt->set == rt->new_set) /* nothing to do */ + return (0); + + if (mv != 0) { + /* + * Berfore moving the rules we need to check that + * there aren't any conflicting named objects. + */ + for (rw = ctl3_rewriters; + rw < ctl3_rewriters + ctl3_rsize; rw++) { + if (rw->manage_sets == NULL) + continue; + i = rw->manage_sets(chain, (uint8_t)rt->set, + (uint8_t)rt->new_set, TEST_ALL); + if (i != 0) + return (EEXIST); + } + } + /* Swap or move two sets */ + for (i = 0; i < chain->n_rules - 1; i++) { + rule = chain->map[i]; + if (rule->set == (uint8_t)rt->set) + rule->set = (uint8_t)rt->new_set; + else if (rule->set == (uint8_t)rt->new_set && mv == 0) + rule->set = (uint8_t)rt->set; + } + for (rw = ctl3_rewriters; rw < ctl3_rewriters + ctl3_rsize; rw++) { + if (rw->manage_sets == NULL) + continue; + rw->manage_sets(chain, (uint8_t)rt->set, + (uint8_t)rt->new_set, mv != 0 ? MOVE_ALL: SWAP_ALL); + } + return (0); +} + +/* + * Swaps or moves set + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_range_tlv ] + * + * Returns 0 on success. + */ +static int +manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_range_header *rh; + int ret; + + if (sd->valsize != sizeof(*rh)) + return (EINVAL); + + rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); + + if (rh->range.head.length != sizeof(ipfw_range_tlv)) + return (1); + /* enable_sets() expects bitmasks. 
*/ + if (op3->opcode != IP_FW_SET_ENABLE && + (rh->range.set >= IPFW_MAX_SETS || + rh->range.new_set >= IPFW_MAX_SETS)) + return (EINVAL); + + ret = 0; + IPFW_UH_WLOCK(chain); + switch (op3->opcode) { + case IP_FW_SET_SWAP: + case IP_FW_SET_MOVE: + ret = swap_sets(chain, &rh->range, + op3->opcode == IP_FW_SET_MOVE); + break; + case IP_FW_SET_ENABLE: + enable_sets(chain, &rh->range); + break; + } + IPFW_UH_WUNLOCK(chain); + + return (ret); } /** @@ -282,12 +1462,11 @@ keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n) static int del_entry(struct ip_fw_chain *chain, uint32_t arg) { - struct ip_fw *rule; uint32_t num; /* rule number or old_set */ uint8_t cmd, new_set; - int start, end, i, ofs, n; - struct ip_fw **map = NULL; + int do_del, ndel; int error = 0; + ipfw_range_tlv rt; num = arg & 0xffff; cmd = (arg >> 24) & 0xff; @@ -303,149 +1482,60 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg) return EINVAL; } - IPFW_UH_WLOCK(chain); /* arbitrate writers */ - chain->reap = NULL; /* prepare for deletions */ + /* Convert old requests into new representation */ + memset(&rt, 0, sizeof(rt)); + rt.start_rule = num; + rt.end_rule = num; + rt.set = num; + rt.new_set = new_set; + do_del = 0; switch (cmd) { - case 0: /* delete rules "num" (num == 0 matches all) */ - case 1: /* delete all rules in set N */ - case 5: /* delete rules with number N and set "new_set". */ - - /* - * Locate first rule to delete (start), the rule after - * the last one to delete (end), and count how many - * rules to delete (n). Always use keep_rule() to - * determine which rules to keep. - */ - n = 0; - if (cmd == 1) { - /* look for a specific set including RESVD_SET. - * Must scan the entire range, ignore num. - */ - new_set = num; - for (start = -1, end = i = 0; i < chain->n_rules; i++) { - if (keep_rule(chain->map[i], cmd, new_set, 0)) - continue; - if (start < 0) - start = i; - end = i; - n++; - } - end++; /* first non-matching */ - } else { - /* Optimized search on rule numbers */ - start = ipfw_find_rule(chain, num, 0); - for (end = start; end < chain->n_rules; end++) { - rule = chain->map[end]; - if (num > 0 && rule->rulenum != num) - break; - if (!keep_rule(rule, cmd, new_set, num)) - n++; - } - } - - if (n == 0) { - /* A flush request (arg == 0 or cmd == 1) on empty - * ruleset returns with no error. On the contrary, - * if there is no match on a specific request, - * we return EINVAL. - */ - if (arg != 0 && cmd != 1) - error = EINVAL; - break; - } - - /* We have something to delete. Allocate the new map */ - map = get_map(chain, -n, 1 /* locked */); - if (map == NULL) { - error = EINVAL; - break; - } - - /* 1. bcopy the initial part of the map */ - if (start > 0) - bcopy(chain->map, map, start * sizeof(struct ip_fw *)); - /* 2. copy active rules between start and end */ - for (i = ofs = start; i < end; i++) { - rule = chain->map[i]; - if (keep_rule(rule, cmd, new_set, num)) - map[ofs++] = rule; - } - /* 3. copy the final part of the map */ - bcopy(chain->map + end, map + ofs, - (chain->n_rules - end) * sizeof(struct ip_fw *)); - /* 4. swap the maps (under BH_LOCK) */ - map = swap_map(chain, map, chain->n_rules - n); - /* 5. 
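/*
 * [Illustration, not part of the patch] The mask arithmetic performed by
 * enable_sets() above, reduced to a standalone program. Here rt->set is
 * read as "sets to disable" and rt->new_set as "sets to enable", with
 * RESVD_SET (31) forced to stay enabled; all values are made up.
 */
#include <stdio.h>
#include <stdint.h>

#define TOY_RESVD_SET 31		/* stands in for RESVD_SET */

int
main(void)
{
	uint32_t set_disable = 0x00000006;	/* sets 1 and 2 disabled */
	uint32_t to_disable = 1u << 3;		/* disable set 3 */
	uint32_t to_enable = 1u << 1;		/* enable set 1 */
	uint32_t v_set;

	v_set = (set_disable | to_disable) & ~to_enable;
	v_set &= ~(1u << TOY_RESVD_SET);	/* RESVD_SET stays enabled */
	printf("new disable mask: 0x%08x\n", v_set);	/* 0x0000000c */
	return (0);
}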
now remove the rules deleted from the old map */ - if (cmd == 1) - ipfw_expire_dyn_rules(chain, NULL, new_set); - for (i = start; i < end; i++) { - rule = map[i]; - if (keep_rule(rule, cmd, new_set, num)) - continue; - chain->static_len -= RULESIZE(rule); - if (cmd != 1) - ipfw_expire_dyn_rules(chain, rule, RESVD_SET); - rule->x_next = chain->reap; - chain->reap = rule; - } + case 0: /* delete rules numbered "rulenum" */ + if (num == 0) + rt.flags |= IPFW_RCFLAG_ALL; + else + rt.flags |= IPFW_RCFLAG_RANGE; + do_del = 1; break; - - /* - * In the next 3 cases the loop stops at (n_rules - 1) - * because the default rule is never eligible.. - */ - - case 2: /* move rules with given RULE number to new set */ - for (i = 0; i < chain->n_rules - 1; i++) { - rule = chain->map[i]; - if (rule->rulenum == num) - rule->set = new_set; - } + case 1: /* delete rules in set "rulenum" */ + rt.flags |= IPFW_RCFLAG_SET; + do_del = 1; break; - - case 3: /* move rules with given SET number to new set */ - for (i = 0; i < chain->n_rules - 1; i++) { - rule = chain->map[i]; - if (rule->set == num) - rule->set = new_set; - } + case 5: /* delete rules "rulenum" and set "new_set" */ + rt.flags |= IPFW_RCFLAG_RANGE | IPFW_RCFLAG_SET; + rt.set = new_set; + rt.new_set = 0; + do_del = 1; break; - - case 4: /* swap two sets */ - for (i = 0; i < chain->n_rules - 1; i++) { - rule = chain->map[i]; - if (rule->set == num) - rule->set = new_set; - else if (rule->set == new_set) - rule->set = num; - } + case 2: /* move rules "rulenum" to set "new_set" */ + rt.flags |= IPFW_RCFLAG_RANGE; break; + case 3: /* move rules from set "rulenum" to set "new_set" */ + IPFW_UH_WLOCK(chain); + error = swap_sets(chain, &rt, 1); + IPFW_UH_WUNLOCK(chain); + return (error); + case 4: /* swap sets "rulenum" and "new_set" */ + IPFW_UH_WLOCK(chain); + error = swap_sets(chain, &rt, 0); + IPFW_UH_WUNLOCK(chain); + return (error); + default: + return (ENOTSUP); } - rule = chain->reap; - chain->reap = NULL; - IPFW_UH_WUNLOCK(chain); - ipfw_reap_rules(rule); - if (map) - free(map, M_IPFW); - return error; -} + if (do_del != 0) { + if ((error = delete_range(chain, &rt, &ndel)) != 0) + return (error); -/* - * Clear counters for a specific rule. - * Normally run under IPFW_UH_RLOCK, but these are idempotent ops - * so we only care that rules do not disappear. - */ -static void -clear_counters(struct ip_fw *rule, int log_only) -{ - ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); + if (ndel == 0 && (cmd != 1 && num != 0)) + return (EINVAL); - if (log_only == 0) - IPFW_ZERO_RULE_COUNTER(rule); - if (l->o.opcode == O_LOG) - l->log_left = l->max_log; + return (0); + } + + return (move_range(chain, &rt)); } /** @@ -516,23 +1606,57 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) return (0); } + /* - * Check validity of the structure before insert. - * Rules are simple, so this mostly need to check rule sizes. 
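/*
 * [Illustration, not part of the patch] How the legacy 32-bit sockopt
 * argument decoded by del_entry() above packs its fields. The num/cmd
 * extraction matches the code shown; the new_set shift is an assumption
 * based on the surrounding context, and the values are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* cmd 5 ("delete rules N in set S"), set 4, rule number 1000 */
	uint32_t arg = (5u << 24) | (4u << 16) | 1000u;
	uint32_t num = arg & 0xffff;		/* rule number or old set */
	uint8_t cmd = (arg >> 24) & 0xff;	/* operation selector */
	uint8_t new_set = (arg >> 16) & 0xff;	/* target/new set */

	printf("cmd=%u num=%u new_set=%u\n", cmd, num, new_set);
	return (0);
}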
+ * Check rule head in FreeBSD11 format + * */ static int -check_ipfw_struct(struct ip_fw *rule, int size) +check_ipfw_rule1(struct ip_fw_rule *rule, int size, + struct rule_check_info *ci) { - int l, cmdlen = 0; - int have_action=0; - ipfw_insn *cmd; + int l; + + if (size < sizeof(*rule)) { + printf("ipfw: rule too short\n"); + return (EINVAL); + } + + /* Check for valid cmd_len */ + l = roundup2(RULESIZE(rule), sizeof(uint64_t)); + if (l != size) { + printf("ipfw: size mismatch (have %d want %d)\n", size, l); + return (EINVAL); + } + if (rule->act_ofs >= rule->cmd_len) { + printf("ipfw: bogus action offset (%u > %u)\n", + rule->act_ofs, rule->cmd_len - 1); + return (EINVAL); + } + + if (rule->rulenum > IPFW_DEFAULT_RULE - 1) + return (EINVAL); + + return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci)); +} + +/* + * Check rule head in FreeBSD8 format + * + */ +static int +check_ipfw_rule0(struct ip_fw_rule0 *rule, int size, + struct rule_check_info *ci) +{ + int l; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } - /* first, check for valid size */ - l = RULESIZE(rule); + + /* Check for valid cmd_len */ + l = sizeof(*rule) + rule->cmd_len * 4 - 4; if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); @@ -542,12 +1666,26 @@ check_ipfw_struct(struct ip_fw *rule, int size) rule->act_ofs, rule->cmd_len - 1); return (EINVAL); } + + if (rule->rulenum > IPFW_DEFAULT_RULE - 1) + return (EINVAL); + + return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci)); +} + +static int +check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci) +{ + int cmdlen, l; + int have_action; + + have_action = 0; + /* * Now go for the individual checks. Very simple ones, basically only * instruction sizes. */ - for (l = rule->cmd_len, cmd = rule->cmd ; - l > 0 ; l -= cmdlen, cmd += cmdlen) { + for (l = cmd_len; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (cmdlen > l) { printf("ipfw: opcode %d size truncated\n", @@ -557,6 +1695,10 @@ check_ipfw_struct(struct ip_fw *rule, int size) switch (cmd->opcode) { case O_PROBE_STATE: case O_KEEP_STATE: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + ci->object_opcodes++; + break; case O_PROTO: case O_IP_SRC_ME: case O_IP_DST_ME: @@ -588,6 +1730,35 @@ check_ipfw_struct(struct ip_fw *rule, int size) goto bad_size; break; + case O_EXTERNAL_ACTION: + if (cmd->arg1 == 0 || + cmdlen != F_INSN_SIZE(ipfw_insn)) { + printf("ipfw: invalid external " + "action opcode\n"); + return (EINVAL); + } + ci->object_opcodes++; + /* Do we have O_EXTERNAL_INSTANCE opcode? 
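/*
 * [Illustration, not part of the patch] A worked example of the
 * FreeBSD8-format size check in check_ipfw_rule0() above: cmd_len counts
 * 32-bit instruction words, and one word is already included in
 * sizeof(struct ip_fw_rule0) (the trailing one-element cmd array), hence
 * the "- 4". The struct size below is a made-up stand-in.
 */
#include <stdio.h>

int
main(void)
{
	int rule0_size = 96;	/* hypothetical sizeof(struct ip_fw_rule0) */
	int cmd_len = 5;	/* five 32-bit instruction words */
	int l = rule0_size + cmd_len * 4 - 4;

	printf("expected payload: %d bytes\n", l);	/* 112 */
	return (0);
}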
*/ + if (l != cmdlen) { + l -= cmdlen; + cmd += cmdlen; + cmdlen = F_LEN(cmd); + if (cmd->opcode != O_EXTERNAL_INSTANCE) { + printf("ipfw: invalid opcode " + "next to external action %u\n", + cmd->opcode); + return (EINVAL); + } + if (cmd->arg1 == 0 || + cmdlen != F_INSN_SIZE(ipfw_insn)) { + printf("ipfw: invalid external " + "action instance opcode\n"); + return (EINVAL); + } + ci->object_opcodes++; + } + goto check_action; + case O_FIB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; @@ -601,10 +1772,10 @@ check_ipfw_struct(struct ip_fw *rule, int size) case O_SETFIB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; - if ((cmd->arg1 != IP_FW_TABLEARG) && - (cmd->arg1 >= rt_numfibs)) { + if ((cmd->arg1 != IP_FW_TARG) && + ((cmd->arg1 & 0x7FFF) >= rt_numfibs)) { printf("ipfw: invalid fib number %d\n", - cmd->arg1); + cmd->arg1 & 0x7FFF); return EINVAL; } goto check_action; @@ -625,6 +1796,7 @@ check_ipfw_struct(struct ip_fw *rule, int size) case O_LIMIT: if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) goto bad_size; + ci->object_opcodes++; break; case O_LOG: @@ -639,7 +1811,7 @@ check_ipfw_struct(struct ip_fw *rule, int size) case O_IP_SRC_MASK: case O_IP_DST_MASK: /* only odd command lengths */ - if ( !(cmdlen & 1) || cmdlen > 31) + if ((cmdlen & 1) == 0) goto bad_size; break; @@ -666,6 +1838,18 @@ check_ipfw_struct(struct ip_fw *rule, int size) cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; + ci->object_opcodes++; + break; + case O_IP_FLOW_LOOKUP: + if (cmd->arg1 >= V_fw_tables_max) { + printf("ipfw: invalid table number %d\n", + cmd->arg1); + return (EINVAL); + } + if (cmdlen != F_INSN_SIZE(ipfw_insn) && + cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + ci->object_opcodes++; break; case O_MACADDR2: if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) @@ -700,6 +1884,7 @@ check_ipfw_struct(struct ip_fw *rule, int size) case O_VIA: if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) goto bad_size; + ci->object_opcodes++; break; case O_ALTQ: @@ -742,8 +1927,10 @@ check_ipfw_struct(struct ip_fw *rule, int size) if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) goto bad_size; goto check_action; - case O_FORWARD_MAC: /* XXX not implemented yet */ case O_CHECK_STATE: + ci->object_opcodes++; + /* FALLTHROUGH */ + case O_FORWARD_MAC: /* XXX not implemented yet */ case O_COUNT: case O_ACCEPT: case O_DENY: @@ -763,14 +1950,14 @@ check_action: printf("ipfw: opcode %d, multiple actions" " not allowed\n", cmd->opcode); - return EINVAL; + return (EINVAL); } have_action = 1; if (l != cmdlen) { printf("ipfw: opcode %d, action must be" " last opcode\n", cmd->opcode); - return EINVAL; + return (EINVAL); } break; #ifdef INET6 @@ -813,25 +2000,25 @@ check_action: case O_IP6_DST_MASK: case O_ICMP6TYPE: printf("ipfw: no IPv6 support in kernel\n"); - return EPROTONOSUPPORT; + return (EPROTONOSUPPORT); #endif default: printf("ipfw: opcode %d, unknown opcode\n", cmd->opcode); - return EINVAL; + return (EINVAL); } } } if (have_action == 0) { printf("ipfw: missing action\n"); - return EINVAL; + return (EINVAL); } return 0; bad_size: printf("ipfw: opcode %d size %d wrong\n", cmd->opcode, cmdlen); - return EINVAL; + return (EINVAL); } @@ -863,8 +2050,8 @@ struct ip_fw7 { ipfw_insn cmd[1]; /* storage for commands */ }; - int convert_rule_to_7(struct ip_fw *rule); -int convert_rule_to_8(struct ip_fw *rule); +static int convert_rule_to_7(struct ip_fw_rule0 *rule); +static int convert_rule_to_8(struct ip_fw_rule0 *rule); #ifndef RULESIZE7 #define RULESIZE7(rule) (sizeof(struct ip_fw7) + 
\ @@ -882,10 +2069,15 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) { char *bp = buf; char *ep = bp + space; - struct ip_fw *rule, *dst; - int l, i; + struct ip_fw *rule; + struct ip_fw_rule0 *dst; + struct timeval boottime; + int error, i, l, warnflag; time_t boot_seconds; + warnflag = 0; + + getboottime(&boottime); boot_seconds = boottime.tv_sec; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; @@ -894,9 +2086,12 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) /* Convert rule to FreeBSd 7.2 format */ l = RULESIZE7(rule); if (bp + l + sizeof(uint32_t) <= ep) { - int error; bcopy(rule, bp, l + sizeof(uint32_t)); - error = convert_rule_to_7((struct ip_fw *) bp); + error = set_legacy_obj_kidx(chain, + (struct ip_fw_rule0 *)bp); + if (error != 0) + return (0); + error = convert_rule_to_7((struct ip_fw_rule0 *) bp); if (error) return 0; /*XXX correct? */ /* @@ -914,76 +2109,1631 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) continue; /* go to next rule */ } - /* normal mode, don't touch rules */ - l = RULESIZE(rule); + l = RULEUSIZE0(rule); if (bp + l > ep) { /* should not happen */ printf("overflow dumping static rules\n"); break; } - dst = (struct ip_fw *)bp; - bcopy(rule, dst, l); + dst = (struct ip_fw_rule0 *)bp; + export_rule0(rule, dst, l); + error = set_legacy_obj_kidx(chain, dst); + /* * XXX HACK. Store the disable mask in the "next" * pointer in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? + * + * XXX: "ipfw set show" (ab)uses IP_FW_GET to read disabled mask + * so we need to fail _after_ saving at least one mask. */ bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); if (dst->timestamp) dst->timestamp += boot_seconds; bp += l; + + if (error != 0) { + if (error == 2) { + /* Non-fatal table rewrite error. */ + warnflag = 1; + continue; + } + printf("Stop on rule %d. Fail to convert table\n", + rule->rulenum); + break; + } } + if (warnflag != 0) + printf("ipfw: process %s is using legacy interfaces," + " consider rebuilding\n", ""); ipfw_get_dynamic(chain, &bp, ep); /* protected by the dynamic lock */ return (bp - (char *)buf); } -#define IP_FW3_OPLENGTH(x) ((x)->sopt_valsize - sizeof(ip_fw3_opheader)) -/** - * {set|get}sockopt parser. +struct dump_args { + uint32_t b; /* start rule */ + uint32_t e; /* end rule */ + uint32_t rcount; /* number of rules */ + uint32_t rsize; /* rules size */ + uint32_t tcount; /* number of tables */ + int rcounters; /* counters */ +}; + +void +ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv) +{ + + ntlv->head.type = no->etlv; + ntlv->head.length = sizeof(*ntlv); + ntlv->idx = no->kidx; + strlcpy(ntlv->name, no->name, sizeof(ntlv->name)); +} + +/* + * Export named object info in instance @ni, identified by @kidx + * to ipfw_obj_ntlv. TLV is allocated from @sd space. + * + * Returns 0 on success. + */ +static int +export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx, + struct sockopt_data *sd) +{ + struct named_object *no; + ipfw_obj_ntlv *ntlv; + + no = ipfw_objhash_lookup_kidx(ni, kidx); + KASSERT(no != NULL, ("invalid object kernel index passed")); + + ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); + if (ntlv == NULL) + return (ENOMEM); + + ipfw_export_obj_ntlv(no, ntlv); + return (0); +} + +/* + * Dumps static rules with table TLVs in buffer @sd. + * + * Returns 0 on success. 
+ */ +static int +dump_static_rules(struct ip_fw_chain *chain, struct dump_args *da, + uint32_t *bmask, struct sockopt_data *sd) +{ + int error; + int i, l; + uint32_t tcount; + ipfw_obj_ctlv *ctlv; + struct ip_fw *krule; + struct namedobj_instance *ni; + caddr_t dst; + + /* Dump table names first (if any) */ + if (da->tcount > 0) { + /* Header first */ + ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); + if (ctlv == NULL) + return (ENOMEM); + ctlv->head.type = IPFW_TLV_TBLNAME_LIST; + ctlv->head.length = da->tcount * sizeof(ipfw_obj_ntlv) + + sizeof(*ctlv); + ctlv->count = da->tcount; + ctlv->objsize = sizeof(ipfw_obj_ntlv); + } + + i = 0; + tcount = da->tcount; + ni = ipfw_get_table_objhash(chain); + while (tcount > 0) { + if ((bmask[i / 32] & (1 << (i % 32))) == 0) { + i++; + continue; + } + + /* Jump to shared named object bitmask */ + if (i >= IPFW_TABLES_MAX) { + ni = CHAIN_TO_SRV(chain); + i -= IPFW_TABLES_MAX; + bmask += IPFW_TABLES_MAX / 32; + } + + if ((error = export_objhash_ntlv(ni, i, sd)) != 0) + return (error); + + i++; + tcount--; + } + + /* Dump rules */ + ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); + if (ctlv == NULL) + return (ENOMEM); + ctlv->head.type = IPFW_TLV_RULE_LIST; + ctlv->head.length = da->rsize + sizeof(*ctlv); + ctlv->count = da->rcount; + + for (i = da->b; i < da->e; i++) { + krule = chain->map[i]; + + l = RULEUSIZE1(krule) + sizeof(ipfw_obj_tlv); + if (da->rcounters != 0) + l += sizeof(struct ip_fw_bcounter); + dst = (caddr_t)ipfw_get_sopt_space(sd, l); + if (dst == NULL) + return (ENOMEM); + + export_rule1(krule, dst, l, da->rcounters); + } + + return (0); +} + +/* + * Marks every object index used in @rule with bit in @bmask. + * Used to generate bitmask of referenced tables/objects for given ruleset + * or its part. + * + * Returns number of newly-referenced objects. + */ +static int +mark_object_kidx(struct ip_fw_chain *ch, struct ip_fw *rule, + uint32_t *bmask) +{ + struct opcode_obj_rewrite *rw; + ipfw_insn *cmd; + int bidx, cmdlen, l, count; + uint16_t kidx; + uint8_t subtype; + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + count = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + rw = find_op_rw(cmd, &kidx, &subtype); + if (rw == NULL) + continue; + + bidx = kidx / 32; + /* + * Maintain separate bitmasks for table and + * non-table objects. + */ + if (rw->etlv != IPFW_TLV_TBL_NAME) + bidx += IPFW_TABLES_MAX / 32; + + if ((bmask[bidx] & (1 << (kidx % 32))) == 0) + count++; + + bmask[bidx] |= 1 << (kidx % 32); + } + + return (count); +} + +/* + * Dumps requested objects data + * Data layout (version 0)(current): + * Request: [ ipfw_cfg_lheader ] + IPFW_CFG_GET_* flags + * size = ipfw_cfg_lheader.size + * Reply: [ ipfw_cfg_lheader + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) + * ipfw_obj_tlv(IPFW_TLV_RULE_ENT) [ ip_fw_bcounter (optional) ip_fw_rule ] + * ] (optional) + * [ ipfw_obj_ctlv(IPFW_TLV_STATE_LIST) ipfw_obj_dyntlv x N ] (optional) + * ] + * * NOTE IPFW_TLV_STATE_LIST has the single valid field: objsize. + * The rest (size, count) are set to zero and needs to be ignored. + * + * Returns 0 on success. 
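/*
 * [Illustration, not part of the patch] The per-kidx bitmask bookkeeping
 * used by mark_object_kidx() and dump_static_rules() above: one bit per
 * kernel index, with non-table objects offset past the table region.
 * TOY_TABLES_MAX stands in for IPFW_TABLES_MAX; values are made up.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define TOY_TABLES_MAX 64

int
main(void)
{
	/* 2x space, as in dump_config(): tables first, then other objects */
	uint32_t bmask[2 * TOY_TABLES_MAX / 32];
	uint16_t kidx = 37;
	int bidx, is_table = 0, newly_seen;

	memset(bmask, 0, sizeof(bmask));
	bidx = kidx / 32;
	if (!is_table)
		bidx += TOY_TABLES_MAX / 32;	/* jump to second region */
	newly_seen = (bmask[bidx] & (1u << (kidx % 32))) == 0;
	bmask[bidx] |= 1u << (kidx % 32);
	printf("kidx %u newly referenced: %d\n", kidx, newly_seen);
	return (0);
}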
*/ +static int +dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_cfg_lheader *hdr; + struct ip_fw *rule; + size_t sz, rnum; + uint32_t hdr_flags; + int error, i; + struct dump_args da; + uint32_t *bmask; + + hdr = (ipfw_cfg_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr)); + if (hdr == NULL) + return (EINVAL); + + error = 0; + bmask = NULL; + /* Allocate needed state. Note we allocate 2xspace mask, for table&srv */ + if (hdr->flags & IPFW_CFG_GET_STATIC) + bmask = malloc(IPFW_TABLES_MAX / 4, M_TEMP, M_WAITOK | M_ZERO); + + IPFW_UH_RLOCK(chain); + + /* + * STAGE 1: Determine size/count for objects in range. + * Prepare used tables bitmask. + */ + sz = sizeof(ipfw_cfg_lheader); + memset(&da, 0, sizeof(da)); + + da.b = 0; + da.e = chain->n_rules; + + if (hdr->end_rule != 0) { + /* Handle custom range */ + if ((rnum = hdr->start_rule) > IPFW_DEFAULT_RULE) + rnum = IPFW_DEFAULT_RULE; + da.b = ipfw_find_rule(chain, rnum, 0); + rnum = hdr->end_rule; + rnum = (rnum < IPFW_DEFAULT_RULE) ? rnum+1 : IPFW_DEFAULT_RULE; + da.e = ipfw_find_rule(chain, rnum, 0) + 1; + } + + if (hdr->flags & IPFW_CFG_GET_STATIC) { + for (i = da.b; i < da.e; i++) { + rule = chain->map[i]; + da.rsize += RULEUSIZE1(rule) + sizeof(ipfw_obj_tlv); + da.rcount++; + /* Update bitmask of used objects for given range */ + da.tcount += mark_object_kidx(chain, rule, bmask); + } + /* Add counters if requested */ + if (hdr->flags & IPFW_CFG_GET_COUNTERS) { + da.rsize += sizeof(struct ip_fw_bcounter) * da.rcount; + da.rcounters = 1; + } + + if (da.tcount > 0) + sz += da.tcount * sizeof(ipfw_obj_ntlv) + + sizeof(ipfw_obj_ctlv); + sz += da.rsize + sizeof(ipfw_obj_ctlv); + } + + if (hdr->flags & IPFW_CFG_GET_STATES) + sz += ipfw_dyn_get_count() * sizeof(ipfw_obj_dyntlv) + + sizeof(ipfw_obj_ctlv); + + + /* + * Fill header anyway. + * Note we have to save header fields to stable storage + * buffer inside @sd can be flushed after dumping rules + */ + hdr->size = sz; + hdr->set_mask = ~V_set_disable; + hdr_flags = hdr->flags; + hdr = NULL; + + if (sd->valsize < sz) { + error = ENOMEM; + goto cleanup; + } + + /* STAGE2: Store actual data */ + if (hdr_flags & IPFW_CFG_GET_STATIC) { + error = dump_static_rules(chain, &da, bmask, sd); + if (error != 0) + goto cleanup; + } + + if (hdr_flags & IPFW_CFG_GET_STATES) + error = ipfw_dump_states(chain, sd); + +cleanup: + IPFW_UH_RUNLOCK(chain); + + if (bmask != NULL) + free(bmask, M_TEMP); + + return (error); +} + int -ipfw_ctl(struct sockopt *sopt) +ipfw_check_object_name_generic(const char *name) +{ + int nsize; + + nsize = sizeof(((ipfw_obj_ntlv *)0)->name); + if (strnlen(name, nsize) == nsize) + return (EINVAL); + if (name[0] == '\0') + return (EINVAL); + return (0); +} + +/* + * Creates non-existent objects referenced by rule. + * + * Return 0 on success. + */ +int +create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd, + struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti) +{ + struct opcode_obj_rewrite *rw; + struct obj_idx *p; + uint16_t kidx; + int error; + + /* + * Compatibility stuff: do actual creation for non-existing, + * but referenced objects. 
+ */ + for (p = oib; p < pidx; p++) { + if (p->kidx != 0) + continue; + + ti->uidx = p->uidx; + ti->type = p->type; + ti->atype = 0; + + rw = find_op_rw(cmd + p->off, NULL, NULL); + KASSERT(rw != NULL, ("Unable to find handler for op %d", + (cmd + p->off)->opcode)); + + if (rw->create_object == NULL) + error = EOPNOTSUPP; + else + error = rw->create_object(ch, ti, &kidx); + if (error == 0) { + p->kidx = kidx; + continue; + } + + /* + * Error happened. We have to rollback everything. + * Drop all already acquired references. + */ + IPFW_UH_WLOCK(ch); + unref_oib_objects(ch, cmd, oib, pidx); + IPFW_UH_WUNLOCK(ch); + + return (error); + } + + return (0); +} + +/* + * Compatibility function for old ipfw(8) binaries. + * Rewrites table/nat kernel indices with userland ones. + * Convert tables matching '/^\d+$/' to their atoi() value. + * Use number 65535 for other tables. + * + * Returns 0 on success. + */ +static int +set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule) +{ + struct opcode_obj_rewrite *rw; + struct named_object *no; + ipfw_insn *cmd; + char *end; + long val; + int cmdlen, error, l; + uint16_t kidx, uidx; + uint8_t subtype; + + error = 0; + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + /* Check if is index in given opcode */ + rw = find_op_rw(cmd, &kidx, &subtype); + if (rw == NULL) + continue; + + /* Try to find referenced kernel object */ + no = rw->find_bykidx(ch, kidx); + if (no == NULL) + continue; + + val = strtol(no->name, &end, 10); + if (*end == '\0' && val < 65535) { + uidx = val; + } else { + + /* + * We are called via legacy opcode. + * Save error and show table as fake number + * not to make ipfw(8) hang. + */ + uidx = 65535; + error = 2; + } + + rw->update(cmd, uidx); + } + + return (error); +} + + +/* + * Unreferences all already-referenced objects in given @cmd rule, + * using information in @oib. + * + * Used to rollback partially converted rule on error. + */ +static void +unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, + struct obj_idx *end) +{ + struct opcode_obj_rewrite *rw; + struct named_object *no; + struct obj_idx *p; + + IPFW_UH_WLOCK_ASSERT(ch); + + for (p = oib; p < end; p++) { + if (p->kidx == 0) + continue; + + rw = find_op_rw(cmd + p->off, NULL, NULL); + KASSERT(rw != NULL, ("Unable to find handler for op %d", + (cmd + p->off)->opcode)); + + /* Find & unref by existing idx */ + no = rw->find_bykidx(ch, p->kidx); + KASSERT(no != NULL, ("Ref'd object %d disappeared", p->kidx)); + no->refcnt--; + } +} + +/* + * Remove references from every object used in @rule. + * Used at rule removal code. 
+ */ +static void +unref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule) +{ + struct opcode_obj_rewrite *rw; + struct named_object *no; + ipfw_insn *cmd; + int cmdlen, l; + uint16_t kidx; + uint8_t subtype; + + IPFW_UH_WLOCK_ASSERT(ch); + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + rw = find_op_rw(cmd, &kidx, &subtype); + if (rw == NULL) + continue; + no = rw->find_bykidx(ch, kidx); + + KASSERT(no != NULL, ("table id %d not found", kidx)); + KASSERT(no->subtype == subtype, + ("wrong type %d (%d) for table id %d", + no->subtype, subtype, kidx)); + KASSERT(no->refcnt > 0, ("refcount for table %d is %d", + kidx, no->refcnt)); + + if (no->refcnt == 1 && rw->destroy_object != NULL) + rw->destroy_object(ch, no); + else + no->refcnt--; + } +} + + +/* + * Find and reference object (if any) stored in instruction @cmd. + * + * Saves object info in @pidx, sets + * - @unresolved to 1 if the object should exist but was not found + * + * Returns non-zero value in case of error. + */ +static int +ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, struct tid_info *ti, + struct obj_idx *pidx, int *unresolved) +{ + struct named_object *no; + struct opcode_obj_rewrite *rw; + int error; + + /* Check if this opcode is a candidate for rewrite */ + rw = find_op_rw(cmd, &ti->uidx, &ti->type); + if (rw == NULL) + return (0); + + /* Need to rewrite. Save necessary fields */ + pidx->uidx = ti->uidx; + pidx->type = ti->type; + + /* Try to find referenced kernel object */ + error = rw->find_byname(ch, ti, &no); + if (error != 0) + return (error); + if (no == NULL) { + /* + * Report the unresolved object for automatic + * creation. + */ + *unresolved = 1; + return (0); + } + + /* Found. Bump refcount and update kidx. */ + no->refcnt++; + rw->update(cmd, no->kidx); + return (0); +} + +/* + * Finds and bumps refcount for objects referenced by given @rule. + * Auto-creates non-existing tables. + * Fills in @oib array with userland/kernel indexes. + * + * Returns 0 on success. + */ +static int +ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule, + struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti) +{ + struct obj_idx *pidx; + ipfw_insn *cmd; + int cmdlen, error, l, unresolved; + + pidx = oib; + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + error = 0; + + IPFW_UH_WLOCK(ch); + + /* Increase refcount on each existing referenced table. */ + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + unresolved = 0; + + error = ref_opcode_object(ch, cmd, ti, pidx, &unresolved); + if (error != 0) + break; + /* + * Compatibility stuff for old clients: + * prepare to automatically create non-existing objects. + */ + if (unresolved != 0) { + pidx->off = rule->cmd_len - l; + pidx++; + } + } + + if (error != 0) { + /* Unref everything we have already done */ + unref_oib_objects(ch, rule->cmd, oib, pidx); + IPFW_UH_WUNLOCK(ch); + return (error); + } + IPFW_UH_WUNLOCK(ch); + + /* Perform auto-creation for non-existing objects */ + if (pidx != oib) + error = create_objects_compat(ch, rule->cmd, oib, pidx, ti); + + /* Calculate real number of dynamic objects */ + ci->object_opcodes = (uint16_t)(pidx - oib); + + return (error); +} + +/* + * Checks if the opcode is referencing a table of the appropriate type. + * Takes a reference on the table if one is found. + * Rewrites user-supplied opcode values with kernel ones. + * + * Returns 0 on success and appropriate error code otherwise.
+ */ +static int +rewrite_rule_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci) +{ + int error; + ipfw_insn *cmd; + uint8_t type; + struct obj_idx *p, *pidx_first, *pidx_last; + struct tid_info ti; + + /* + * Prepare an array for storing opcode indices. + * Use stack allocation by default. + */ + if (ci->object_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) { + /* Stack */ + pidx_first = ci->obuf; + } else + pidx_first = malloc( + ci->object_opcodes * sizeof(struct obj_idx), + M_IPFW, M_WAITOK | M_ZERO); + + error = 0; + type = 0; + memset(&ti, 0, sizeof(ti)); + + /* Use set rule is assigned to. */ + ti.set = ci->krule->set; + if (ci->ctlv != NULL) { + ti.tlvs = (void *)(ci->ctlv + 1); + ti.tlen = ci->ctlv->head.length - sizeof(ipfw_obj_ctlv); + } + + /* Reference all used tables and other objects */ + error = ref_rule_objects(chain, ci->krule, ci, pidx_first, &ti); + if (error != 0) + goto free; + /* + * Note that ref_rule_objects() might have updated ci->object_opcodes + * to reflect actual number of object opcodes. + */ + + /* Perform rewrite of remaining opcodes */ + p = pidx_first; + pidx_last = pidx_first + ci->object_opcodes; + for (p = pidx_first; p < pidx_last; p++) { + cmd = ci->krule->cmd + p->off; + update_opcode_kidx(cmd, p->kidx); + } + +free: + if (pidx_first != ci->obuf) + free(pidx_first, M_IPFW); + + return (error); +} + +/* + * Adds one or more rules to ipfw @chain. + * Data layout (version 0)(current): + * Request: + * [ + * ip_fw3_opheader + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1) + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] (*2) (*3) + * ] + * Reply: + * [ + * ip_fw3_opheader + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] + * ] + * + * Rules in reply are modified to store their actual ruleset number. + * + * (*1) TLVs inside IPFW_TLV_TBL_LIST needs to be sorted ascending + * according to their idx field and there has to be no duplicates. + * (*2) Numbered rules inside IPFW_TLV_RULE_LIST needs to be sorted ascending. + * (*3) Each ip_fw structure needs to be aligned to u64 boundary. + * + * Returns 0 on success. + */ +static int +add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_ctlv *ctlv, *rtlv, *tstate; + ipfw_obj_ntlv *ntlv; + int clen, error, idx; + uint32_t count, read; + struct ip_fw_rule *r; + struct rule_check_info rci, *ci, *cbuf; + int i, rsize; + + op3 = (ip_fw3_opheader *)ipfw_get_sopt_space(sd, sd->valsize); + ctlv = (ipfw_obj_ctlv *)(op3 + 1); + + read = sizeof(ip_fw3_opheader); + rtlv = NULL; + tstate = NULL; + cbuf = NULL; + memset(&rci, 0, sizeof(struct rule_check_info)); + + if (read + sizeof(*ctlv) > sd->valsize) + return (EINVAL); + + if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) { + clen = ctlv->head.length; + /* Check size and alignment */ + if (clen > sd->valsize || clen < sizeof(*ctlv)) + return (EINVAL); + if ((clen % sizeof(uint64_t)) != 0) + return (EINVAL); + + /* + * Some table names or other named objects. + * Check for validness. + */ + count = (ctlv->head.length - sizeof(*ctlv)) / sizeof(*ntlv); + if (ctlv->count != count || ctlv->objsize != sizeof(*ntlv)) + return (EINVAL); + + /* + * Check each TLV. + * Ensure TLVs are sorted ascending and + * there are no duplicates. 
+ */ + idx = -1; + ntlv = (ipfw_obj_ntlv *)(ctlv + 1); + while (count > 0) { + if (ntlv->head.length != sizeof(ipfw_obj_ntlv)) + return (EINVAL); + + error = ipfw_check_object_name_generic(ntlv->name); + if (error != 0) + return (error); + + if (ntlv->idx <= idx) + return (EINVAL); + + idx = ntlv->idx; + count--; + ntlv++; + } + + tstate = ctlv; + read += ctlv->head.length; + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); + } + + if (read + sizeof(*ctlv) > sd->valsize) + return (EINVAL); + + if (ctlv->head.type == IPFW_TLV_RULE_LIST) { + clen = ctlv->head.length; + if (clen + read > sd->valsize || clen < sizeof(*ctlv)) + return (EINVAL); + if ((clen % sizeof(uint64_t)) != 0) + return (EINVAL); + + /* + * TODO: Permit adding multiple rules at once + */ + if (ctlv->count != 1) + return (ENOTSUP); + + clen -= sizeof(*ctlv); + + if (ctlv->count > clen / sizeof(struct ip_fw_rule)) + return (EINVAL); + + /* Allocate state for each rule or use stack */ + if (ctlv->count == 1) { + memset(&rci, 0, sizeof(struct rule_check_info)); + cbuf = &rci; + } else + cbuf = malloc(ctlv->count * sizeof(*ci), M_TEMP, + M_WAITOK | M_ZERO); + ci = cbuf; + + /* + * Check each rule for validness. + * Ensure numbered rules are sorted ascending + * and properly aligned + */ + idx = 0; + r = (struct ip_fw_rule *)(ctlv + 1); + count = 0; + error = 0; + while (clen > 0) { + rsize = roundup2(RULESIZE(r), sizeof(uint64_t)); + if (rsize > clen || ctlv->count <= count) { + error = EINVAL; + break; + } + + ci->ctlv = tstate; + error = check_ipfw_rule1(r, rsize, ci); + if (error != 0) + break; + + /* Check sorting */ + if (r->rulenum != 0 && r->rulenum < idx) { + printf("rulenum %d idx %d\n", r->rulenum, idx); + error = EINVAL; + break; + } + idx = r->rulenum; + + ci->urule = (caddr_t)r; + + rsize = roundup2(rsize, sizeof(uint64_t)); + clen -= rsize; + r = (struct ip_fw_rule *)((caddr_t)r + rsize); + count++; + ci++; + } + + if (ctlv->count != count || error != 0) { + if (cbuf != &rci) + free(cbuf, M_TEMP); + return (EINVAL); + } + + rtlv = ctlv; + read += ctlv->head.length; + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); + } + + if (read != sd->valsize || rtlv == NULL || rtlv->count == 0) { + if (cbuf != NULL && cbuf != &rci) + free(cbuf, M_TEMP); + return (EINVAL); + } + + /* + * Passed rules seems to be valid. + * Allocate storage and try to add them to chain. + */ + for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) { + clen = RULEKSIZE1((struct ip_fw_rule *)ci->urule); + ci->krule = ipfw_alloc_rule(chain, clen); + import_rule1(ci); + } + + if ((error = commit_rules(chain, cbuf, rtlv->count)) != 0) { + /* Free allocate krules */ + for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) + free_rule(ci->krule); + } + + if (cbuf != NULL && cbuf != &rci) + free(cbuf, M_TEMP); + + return (error); +} + +/* + * Lists all sopts currently registered. 
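/*
 * [Illustration, not part of the patch] The "sorted ascending, no
 * duplicates" validation add_rules() applies to the name-TLV list above,
 * reduced to its core; toy data.
 */
#include <stdio.h>

int
main(void)
{
	int idx[] = { 1, 2, 2, 5 };	/* duplicate on purpose */
	int i, prev = -1;

	for (i = 0; i < 4; i++) {
		if (idx[i] <= prev) {
			/* add_rules() returns EINVAL here */
			printf("reject: idx %d not above %d\n", idx[i], prev);
			return (1);
		}
		prev = idx[i];
	}
	printf("accepted\n");
	return (0);
}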
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size + * Reply: [ ipfw_obj_lheader ipfw_sopt_info x N ] + * + * Returns 0 on success + */ +static int +dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct _ipfw_obj_lheader *olh; + ipfw_sopt_info *i; + struct ipfw_sopt_handler *sh; + uint32_t count, n, size; + + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); + if (olh == NULL) + return (EINVAL); + if (sd->valsize < olh->size) + return (EINVAL); + + CTL3_LOCK(); + count = ctl3_hsize; + size = count * sizeof(ipfw_sopt_info) + sizeof(ipfw_obj_lheader); + + /* Fill in header regadless of buffer size */ + olh->count = count; + olh->objsize = sizeof(ipfw_sopt_info); + + if (size > olh->size) { + olh->size = size; + CTL3_UNLOCK(); + return (ENOMEM); + } + olh->size = size; + + for (n = 1; n <= count; n++) { + i = (ipfw_sopt_info *)ipfw_get_sopt_space(sd, sizeof(*i)); + KASSERT(i != NULL, ("previously checked buffer is not enough")); + sh = &ctl3_handlers[n]; + i->opcode = sh->opcode; + i->version = sh->version; + i->refcnt = sh->refcnt; + } + CTL3_UNLOCK(); + + return (0); +} + +/* + * Compares two opcodes. + * Used both in qsort() and bsearch(). + * + * Returns 0 if match is found. + */ +static int +compare_opcodes(const void *_a, const void *_b) +{ + const struct opcode_obj_rewrite *a, *b; + + a = (const struct opcode_obj_rewrite *)_a; + b = (const struct opcode_obj_rewrite *)_b; + + if (a->opcode < b->opcode) + return (-1); + else if (a->opcode > b->opcode) + return (1); + + return (0); +} + +/* + * XXX: Rewrite bsearch() + */ +static int +find_op_rw_range(uint16_t op, struct opcode_obj_rewrite **plo, + struct opcode_obj_rewrite **phi) +{ + struct opcode_obj_rewrite *ctl3_max, *lo, *hi, h, *rw; + + memset(&h, 0, sizeof(h)); + h.opcode = op; + + rw = (struct opcode_obj_rewrite *)bsearch(&h, ctl3_rewriters, + ctl3_rsize, sizeof(h), compare_opcodes); + if (rw == NULL) + return (1); + + /* Find the first element matching the same opcode */ + lo = rw; + for ( ; lo > ctl3_rewriters && (lo - 1)->opcode == op; lo--) + ; + + /* Find the last element matching the same opcode */ + hi = rw; + ctl3_max = ctl3_rewriters + ctl3_rsize; + for ( ; (hi + 1) < ctl3_max && (hi + 1)->opcode == op; hi++) + ; + + *plo = lo; + *phi = hi; + + return (0); +} + +/* + * Finds opcode object rewriter based on @code. + * + * Returns pointer to handler or NULL. 
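/*
 * [Illustration, not part of the patch] bsearch() may land on any element
 * equal to the key, which is why find_op_rw_range() above walks left and
 * right from the hit to find the first and last match. The same technique
 * with toy integer keys:
 */
#include <stdio.h>
#include <stdlib.h>

static int
cmp_int(const void *a, const void *b)
{
	int x = *(const int *)a, y = *(const int *)b;

	return ((x > y) - (x < y));
}

int
main(void)
{
	int v[] = { 1, 3, 3, 3, 7, 9 };	/* sorted, key 3 duplicated */
	int key = 3, n = 6;
	int *hit, *lo, *hi;

	hit = bsearch(&key, v, n, sizeof(int), cmp_int);
	if (hit == NULL)
		return (1);
	for (lo = hit; lo > v && lo[-1] == key; lo--)
		;
	for (hi = hit; hi + 1 < v + n && hi[1] == key; hi++)
		;
	printf("key %d spans indices %td..%td\n", key, lo - v, hi - v);
	return (0);
}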
+ */ +static struct opcode_obj_rewrite * +find_op_rw(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +{ + struct opcode_obj_rewrite *rw, *lo, *hi; + uint16_t uidx; + uint8_t subtype; + + if (find_op_rw_range(cmd->opcode, &lo, &hi) != 0) + return (NULL); + + for (rw = lo; rw <= hi; rw++) { + if (rw->classifier(cmd, &uidx, &subtype) == 0) { + if (puidx != NULL) + *puidx = uidx; + if (ptype != NULL) + *ptype = subtype; + return (rw); + } + } + + return (NULL); +} +int +classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx) +{ + + if (find_op_rw(cmd, puidx, NULL) == 0) + return (1); + return (0); +} + +void +update_opcode_kidx(ipfw_insn *cmd, uint16_t idx) +{ + struct opcode_obj_rewrite *rw; + + rw = find_op_rw(cmd, NULL, NULL); + KASSERT(rw != NULL, ("No handler to update opcode %d", cmd->opcode)); + rw->update(cmd, idx); +} + +void +ipfw_init_obj_rewriter() +{ + + ctl3_rewriters = NULL; + ctl3_rsize = 0; +} + +void +ipfw_destroy_obj_rewriter() +{ + + if (ctl3_rewriters != NULL) + free(ctl3_rewriters, M_IPFW); + ctl3_rewriters = NULL; + ctl3_rsize = 0; +} + +/* + * Adds one or more opcode object rewrite handlers to the global array. + * Function may sleep. + */ +void +ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count) +{ + size_t sz; + struct opcode_obj_rewrite *tmp; + + CTL3_LOCK(); + + for (;;) { + sz = ctl3_rsize + count; + CTL3_UNLOCK(); + tmp = malloc(sizeof(*rw) * sz, M_IPFW, M_WAITOK | M_ZERO); + CTL3_LOCK(); + if (ctl3_rsize + count <= sz) + break; + + /* Retry */ + free(tmp, M_IPFW); + } + + /* Merge old & new arrays */ + sz = ctl3_rsize + count; + memcpy(tmp, ctl3_rewriters, ctl3_rsize * sizeof(*rw)); + memcpy(&tmp[ctl3_rsize], rw, count * sizeof(*rw)); + qsort(tmp, sz, sizeof(*rw), compare_opcodes); + /* Switch new and free old */ + if (ctl3_rewriters != NULL) + free(ctl3_rewriters, M_IPFW); + ctl3_rewriters = tmp; + ctl3_rsize = sz; + + CTL3_UNLOCK(); +} + +/* + * Removes one or more object rewrite handlers from the global array. + */ +int +ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count) +{ + size_t sz; + struct opcode_obj_rewrite *ctl3_max, *ktmp, *lo, *hi; + int i; + + CTL3_LOCK(); + + for (i = 0; i < count; i++) { + if (find_op_rw_range(rw[i].opcode, &lo, &hi) != 0) + continue; + + for (ktmp = lo; ktmp <= hi; ktmp++) { + if (ktmp->classifier != rw[i].classifier) + continue; + + ctl3_max = ctl3_rewriters + ctl3_rsize; + sz = (ctl3_max - (ktmp + 1)) * sizeof(*ktmp); + memmove(ktmp, ktmp + 1, sz); + ctl3_rsize--; + break; + } + + } + + if (ctl3_rsize == 0) { + if (ctl3_rewriters != NULL) + free(ctl3_rewriters, M_IPFW); + ctl3_rewriters = NULL; + } + + CTL3_UNLOCK(); + + return (0); +} + +static int +export_objhash_ntlv_internal(struct namedobj_instance *ni, + struct named_object *no, void *arg) +{ + struct sockopt_data *sd; + ipfw_obj_ntlv *ntlv; + + sd = (struct sockopt_data *)arg; + ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); + if (ntlv == NULL) + return (ENOMEM); + ipfw_export_obj_ntlv(no, ntlv); + return (0); +} + +/* + * Lists all service objects. 
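/*
 * [Illustration, not part of the patch] ipfw_add_obj_rewriter() above must
 * not sleep in malloc() while holding CTL3_LOCK, so it sizes the array,
 * drops the lock to allocate, retakes it, and retries if the array grew in
 * the meantime. A self-contained pthread sketch of the same pattern
 * (hypothetical names):
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *arr;
static size_t arr_size;

static void
add_items(const int *items, size_t count)
{
	size_t sz;
	int *tmp;

	pthread_mutex_lock(&lock);
	for (;;) {
		sz = arr_size + count;
		pthread_mutex_unlock(&lock);
		tmp = malloc(sz * sizeof(int));	/* may block; lock is free */
		pthread_mutex_lock(&lock);
		if (arr_size + count <= sz)	/* still fits: proceed */
			break;
		free(tmp);	/* someone grew the array; retry */
	}
	if (arr != NULL)
		memcpy(tmp, arr, arr_size * sizeof(int));
	memcpy(tmp + arr_size, items, count * sizeof(int));
	free(arr);
	arr = tmp;
	arr_size += count;
	pthread_mutex_unlock(&lock);
}

int
main(void)
{
	int a[] = { 3, 1 }, b[] = { 2 };
	size_t i;

	add_items(a, 2);
	add_items(b, 1);
	for (i = 0; i < arr_size; i++)
		printf("%d ", arr[i]);	/* 3 1 2 */
	printf("\n");
	return (0);
}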
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ] size = ipfw_obj_lheader.size + * Reply: [ ipfw_obj_lheader [ ipfw_obj_ntlv x N ] (optional) ] + * Returns 0 on success + */ +static int +dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_lheader *hdr; + int count; + + hdr = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr)); + if (hdr == NULL) + return (EINVAL); + + IPFW_UH_RLOCK(chain); + count = ipfw_objhash_count(CHAIN_TO_SRV(chain)); + hdr->size = sizeof(ipfw_obj_lheader) + count * sizeof(ipfw_obj_ntlv); + if (sd->valsize < hdr->size) { + IPFW_UH_RUNLOCK(chain); + return (ENOMEM); + } + hdr->count = count; + hdr->objsize = sizeof(ipfw_obj_ntlv); + if (count > 0) + ipfw_objhash_foreach(CHAIN_TO_SRV(chain), + export_objhash_ntlv_internal, sd); + IPFW_UH_RUNLOCK(chain); + return (0); +} + +/* + * Compares two sopt handlers (code, version and handler ptr). + * Used both as qsort() and bsearch(). + * Does not compare handler for latter case. + * + * Returns 0 if match is found. + */ +static int +compare_sh(const void *_a, const void *_b) +{ + const struct ipfw_sopt_handler *a, *b; + + a = (const struct ipfw_sopt_handler *)_a; + b = (const struct ipfw_sopt_handler *)_b; + + if (a->opcode < b->opcode) + return (-1); + else if (a->opcode > b->opcode) + return (1); + + if (a->version < b->version) + return (-1); + else if (a->version > b->version) + return (1); + + /* bsearch helper */ + if (a->handler == NULL) + return (0); + + if ((uintptr_t)a->handler < (uintptr_t)b->handler) + return (-1); + else if ((uintptr_t)a->handler > (uintptr_t)b->handler) + return (1); + + return (0); +} + +/* + * Finds sopt handler based on @code and @version. + * + * Returns pointer to handler or NULL. + */ +static struct ipfw_sopt_handler * +find_sh(uint16_t code, uint8_t version, sopt_handler_f *handler) +{ + struct ipfw_sopt_handler *sh, h; + + memset(&h, 0, sizeof(h)); + h.opcode = code; + h.version = version; + h.handler = handler; + + sh = (struct ipfw_sopt_handler *)bsearch(&h, ctl3_handlers, + ctl3_hsize, sizeof(h), compare_sh); + + return (sh); +} + +static int +find_ref_sh(uint16_t opcode, uint8_t version, struct ipfw_sopt_handler *psh) +{ + struct ipfw_sopt_handler *sh; + + CTL3_LOCK(); + if ((sh = find_sh(opcode, version, NULL)) == NULL) { + CTL3_UNLOCK(); + printf("ipfw: ipfw_ctl3 invalid option %d""v""%d\n", + opcode, version); + return (EINVAL); + } + sh->refcnt++; + ctl3_refct++; + /* Copy handler data to requested buffer */ + *psh = *sh; + CTL3_UNLOCK(); + + return (0); +} + +static void +find_unref_sh(struct ipfw_sopt_handler *psh) +{ + struct ipfw_sopt_handler *sh; + + CTL3_LOCK(); + sh = find_sh(psh->opcode, psh->version, NULL); + KASSERT(sh != NULL, ("ctl3 handler disappeared")); + sh->refcnt--; + ctl3_refct--; + CTL3_UNLOCK(); +} + +void +ipfw_init_sopt_handler() +{ + + CTL3_LOCK_INIT(); + IPFW_ADD_SOPT_HANDLER(1, scodes); +} + +void +ipfw_destroy_sopt_handler() +{ + + IPFW_DEL_SOPT_HANDLER(1, scodes); + CTL3_LOCK_DESTROY(); +} + +/* + * Adds one or more sockopt handlers to the global array. + * Function may sleep. 
+ */ +void +ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count) +{ + size_t sz; + struct ipfw_sopt_handler *tmp; + + CTL3_LOCK(); + + for (;;) { + sz = ctl3_hsize + count; + CTL3_UNLOCK(); + tmp = malloc(sizeof(*sh) * sz, M_IPFW, M_WAITOK | M_ZERO); + CTL3_LOCK(); + if (ctl3_hsize + count <= sz) + break; + + /* Retry */ + free(tmp, M_IPFW); + } + + /* Merge old & new arrays */ + sz = ctl3_hsize + count; + memcpy(tmp, ctl3_handlers, ctl3_hsize * sizeof(*sh)); + memcpy(&tmp[ctl3_hsize], sh, count * sizeof(*sh)); + qsort(tmp, sz, sizeof(*sh), compare_sh); + /* Switch new and free old */ + if (ctl3_handlers != NULL) + free(ctl3_handlers, M_IPFW); + ctl3_handlers = tmp; + ctl3_hsize = sz; + ctl3_gencnt++; + + CTL3_UNLOCK(); +} + +/* + * Removes one or more sockopt handlers from the global array. + */ +int +ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count) +{ + size_t sz; + struct ipfw_sopt_handler *tmp, *h; + int i; + + CTL3_LOCK(); + + for (i = 0; i < count; i++) { + tmp = &sh[i]; + h = find_sh(tmp->opcode, tmp->version, tmp->handler); + if (h == NULL) + continue; + + sz = (ctl3_handlers + ctl3_hsize - (h + 1)) * sizeof(*h); + memmove(h, h + 1, sz); + ctl3_hsize--; + } + + if (ctl3_hsize == 0) { + if (ctl3_handlers != NULL) + free(ctl3_handlers, M_IPFW); + ctl3_handlers = NULL; + } + + ctl3_gencnt++; + + CTL3_UNLOCK(); + + return (0); +} + +/* + * Writes data accumulated in @sd to sockopt buffer. + * Zeroes internal @sd buffer. + */ +static int +ipfw_flush_sopt_data(struct sockopt_data *sd) +{ + struct sockopt *sopt; + int error; + size_t sz; + + sz = sd->koff; + if (sz == 0) + return (0); + + sopt = sd->sopt; + + if (sopt->sopt_dir == SOPT_GET) { + error = copyout(sd->kbuf, sopt->sopt_val, sz); + if (error != 0) + return (error); + } + + memset(sd->kbuf, 0, sd->ksize); + sd->ktotal += sz; + sd->koff = 0; + if (sd->ktotal + sd->ksize < sd->valsize) + sd->kavail = sd->ksize; + else + sd->kavail = sd->valsize - sd->ktotal; + + /* Update sopt buffer data */ + sopt->sopt_valsize = sd->ktotal; + sopt->sopt_val = sd->sopt_val + sd->ktotal; + + return (0); +} + +/* + * Ensures that @sd buffer has contiguous @neeeded number of + * bytes. + * + * Returns pointer to requested space or NULL. + */ +caddr_t +ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed) { -#define RULE_MAXSIZE (256*sizeof(u_int32_t)) int error; - size_t size, len, valsize; - struct ip_fw *buf, *rule; + caddr_t addr; + + if (sd->kavail < needed) { + /* + * Flush data and try another time. + */ + error = ipfw_flush_sopt_data(sd); + + if (sd->kavail < needed || error != 0) + return (NULL); + } + + addr = sd->kbuf + sd->koff; + sd->koff += needed; + sd->kavail -= needed; + return (addr); +} + +/* + * Requests @needed contiguous bytes from @sd buffer. + * Function is used to notify subsystem that we are + * interesed in first @needed bytes (request header) + * and the rest buffer can be safely zeroed. + * + * Returns pointer to requested space or NULL. + */ +caddr_t +ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed) +{ + caddr_t addr; + + if ((addr = ipfw_get_sopt_space(sd, needed)) == NULL) + return (NULL); + + if (sd->kavail > 0) + memset(sd->kbuf + sd->koff, 0, sd->kavail); + + return (addr); +} + +/* + * New sockopt handler. 
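ipfw_get_sopt_space() hides the sliding window completely: when the current chunk is exhausted it calls ipfw_flush_sopt_data(), which copies the finished chunk out to the (possibly wired) user buffer and rewinds the kernel buffer. A GET handler therefore just keeps asking for per-record space; a minimal sketch modeled on export_objhash_ntlv_internal() above:

static int
export_one(struct sockopt_data *sd, struct named_object *no)
{
	ipfw_obj_ntlv *ntlv;

	ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv));
	if (ntlv == NULL)
		return (ENOMEM);	/* user buffer exhausted */
	ipfw_export_obj_ntlv(no, ntlv);
	return (0);
}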
+ */ +int +ipfw_ctl3(struct sockopt *sopt) +{ + int error, locked; + size_t size, valsize; struct ip_fw_chain *chain; - u_int32_t rulenum[2]; - uint32_t opt; - char xbuf[128]; + char xbuf[256]; + struct sockopt_data sdata; + struct ipfw_sopt_handler h; ip_fw3_opheader *op3 = NULL; error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); - if (error) + if (error != 0) + return (error); + + if (sopt->sopt_name != IP_FW3) + return (ipfw_ctl(sopt)); + + chain = &V_layer3_chain; + error = 0; + + /* Save original valsize before it is altered via sooptcopyin() */ + valsize = sopt->sopt_valsize; + memset(&sdata, 0, sizeof(sdata)); + /* Read op3 header first to determine actual operation */ + op3 = (ip_fw3_opheader *)xbuf; + error = sooptcopyin(sopt, op3, sizeof(*op3), sizeof(*op3)); + if (error != 0) + return (error); + sopt->sopt_valsize = valsize; + + /* + * Find and reference command. + */ + error = find_ref_sh(op3->opcode, op3->version, &h); + if (error != 0) return (error); /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ - if (sopt->sopt_name == IP_FW_ADD || - (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { + if ((h.dir & HDIR_SET) != 0 && h.opcode != IP_FW_XRESETLOG) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); - if (error) + if (error != 0) { + find_unref_sh(&h); return (error); + } } + /* + * Fill in sockopt_data structure that may be useful for + * IP_FW3 get requests. + */ + locked = 0; + if (valsize <= sizeof(xbuf)) { + /* use on-stack buffer */ + sdata.kbuf = xbuf; + sdata.ksize = sizeof(xbuf); + sdata.kavail = valsize; + } else { + + /* + * Determine opcode type/buffer size: + * allocate sliding-window buf for data export or + * contiguous buffer for special ops. + */ + if ((h.dir & HDIR_SET) != 0) { + /* Set request. Allocate contigous buffer. */ + if (valsize > CTL3_LARGEBUF) { + find_unref_sh(&h); + return (EFBIG); + } + + size = valsize; + } else { + /* Get request. Allocate sliding window buffer */ + size = (valsize<CTL3_SMALLBUF) ? valsize:CTL3_SMALLBUF; + + if (size < valsize) { + /* We have to wire user buffer */ + error = vslock(sopt->sopt_val, valsize); + if (error != 0) + return (error); + locked = 1; + } + } + + sdata.kbuf = malloc(size, M_TEMP, M_WAITOK | M_ZERO); + sdata.ksize = size; + sdata.kavail = size; + } + + sdata.sopt = sopt; + sdata.sopt_val = sopt->sopt_val; + sdata.valsize = valsize; + + /* + * Copy either all request (if valsize < bsize_max) + * or first bsize_max bytes to guarantee most consumers + * that all necessary data has been copied). + * Anyway, copy not less than sizeof(ip_fw3_opheader). + */ + if ((error = sooptcopyin(sopt, sdata.kbuf, sdata.ksize, + sizeof(ip_fw3_opheader))) != 0) + return (error); + op3 = (ip_fw3_opheader *)sdata.kbuf; + + /* Finally, run handler */ + error = h.handler(chain, op3, &sdata); + find_unref_sh(&h); + + /* Flush state and free buffers */ + if (error == 0) + error = ipfw_flush_sopt_data(&sdata); + else + ipfw_flush_sopt_data(&sdata); + + if (locked != 0) + vsunlock(sdata.sopt_val, valsize); + + /* Restore original pointer and set number of bytes written */ + sopt->sopt_val = sdata.sopt_val; + sopt->sopt_valsize = sdata.ktotal; + if (sdata.kbuf != xbuf) + free(sdata.kbuf, M_TEMP); + + return (error); +} + +/** + * {set|get}sockopt parser. 
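From userland the whole machinery is driven through a single socket option: the buffer passed to getsockopt()/setsockopt() starts with an ip_fw3_opheader whose opcode/version pair selects the kernel handler via find_ref_sh(). A hedged sketch of the GET side; the raw-socket setup mirrors what ipfw(8) does:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_fw.h>
#include <string.h>

static int
ipfw3_get(int sock, uint16_t opcode, void *buf, socklen_t len)
{
	ip_fw3_opheader *op3;

	/* opcode/version sit at the front of the option buffer;
	 * len must be at least sizeof(*op3) */
	op3 = (ip_fw3_opheader *)buf;
	memset(op3, 0, sizeof(*op3));
	op3->opcode = opcode;

	return (getsockopt(sock, IPPROTO_IP, IP_FW3, buf, &len));
}

/* int sock = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); */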
+ */ +int +ipfw_ctl(struct sockopt *sopt) +{ +#define RULE_MAXSIZE (512*sizeof(u_int32_t)) + int error; + size_t size, valsize; + struct ip_fw *buf; + struct ip_fw_rule0 *rule; + struct ip_fw_chain *chain; + u_int32_t rulenum[2]; + uint32_t opt; + struct rule_check_info ci; + IPFW_RLOCK_TRACKER; + chain = &V_layer3_chain; error = 0; /* Save original valsize before it is altered via sooptcopyin() */ valsize = sopt->sopt_valsize; - if ((opt = sopt->sopt_name) == IP_FW3) { - /* - * Copy not less than sizeof(ip_fw3_opheader). - * We hope any IP_FW3 command will fit into 128-byte buffer. - */ - if ((error = sooptcopyin(sopt, xbuf, sizeof(xbuf), - sizeof(ip_fw3_opheader))) != 0) + opt = sopt->sopt_name; + + /* + * Disallow modifications in really-really secure mode, but still allow + * the logging counters to be reset. + */ + if (opt == IP_FW_ADD || + (sopt->sopt_dir == SOPT_SET && opt != IP_FW_RESETLOG)) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error != 0) return (error); - op3 = (ip_fw3_opheader *)xbuf; - opt = op3->opcode; } switch (opt) { @@ -1006,9 +3756,7 @@ ipfw_ctl(struct sockopt *sopt) size += ipfw_dyn_len(); if (size >= sopt->sopt_valsize) break; - buf = malloc(size, M_TEMP, M_WAITOK); - if (buf == NULL) - break; + buf = malloc(size, M_TEMP, M_WAITOK | M_ZERO); IPFW_UH_RLOCK(chain); /* check again how much space we need */ want = chain->static_len + ipfw_dyn_len(); @@ -1033,6 +3781,8 @@ ipfw_ctl(struct sockopt *sopt) error = sooptcopyin(sopt, rule, RULE_MAXSIZE, sizeof(struct ip_fw7) ); + memset(&ci, 0, sizeof(struct rule_check_info)); + /* * If the size of commands equals RULESIZE7 then we assume * a FreeBSD7.2 binary is talking to us (set is7=1). @@ -1042,25 +3792,30 @@ ipfw_ctl(struct sockopt *sopt) * the first ipfw command is 'ipfw [pipe] list') * the ipfw binary may crash or loop infinitly... 
*/ - if (sopt->sopt_valsize == RULESIZE7(rule)) { + size = sopt->sopt_valsize; + if (size == RULESIZE7(rule)) { is7 = 1; error = convert_rule_to_8(rule); if (error) { free(rule, M_TEMP); return error; } - if (error == 0) - error = check_ipfw_struct(rule, RULESIZE(rule)); - } else { + size = RULESIZE(rule); + } else is7 = 0; if (error == 0) - error = check_ipfw_struct(rule, sopt->sopt_valsize); - } + error = check_ipfw_rule0(rule, size, &ci); if (error == 0) { - /* locking is done within ipfw_add_rule() */ - error = ipfw_add_rule(chain, rule); - size = RULESIZE(rule); - if (!error && sopt->sopt_dir == SOPT_GET) { + /* locking is done within add_rule() */ + struct ip_fw *krule; + krule = ipfw_alloc_rule(chain, RULEKSIZE0(rule)); + ci.urule = (caddr_t)rule; + ci.krule = krule; + import_rule0(&ci); + error = commit_rules(chain, &ci, 1); + if (error != 0) + free_rule(ci.krule); + else if (sopt->sopt_dir == SOPT_GET) { if (is7) { error = convert_rule_to_7(rule); size = RULESIZE7(rule); @@ -1119,82 +3874,64 @@ ipfw_ctl(struct sockopt *sopt) sopt->sopt_name == IP_FW_RESETLOG); break; - /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/ + /*--- TABLE opcodes ---*/ case IP_FW_TABLE_ADD: - { - ipfw_table_entry ent; - - error = sooptcopyin(sopt, &ent, - sizeof(ent), sizeof(ent)); - if (error) - break; - error = ipfw_add_table_entry(chain, ent.tbl, - &ent.addr, sizeof(ent.addr), ent.masklen, - IPFW_TABLE_CIDR, ent.value); - } - break; - case IP_FW_TABLE_DEL: { ipfw_table_entry ent; + struct tentry_info tei; + struct tid_info ti; + struct table_value v; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; - error = ipfw_del_table_entry(chain, ent.tbl, - &ent.addr, sizeof(ent.addr), ent.masklen, IPFW_TABLE_CIDR); - } - break; - - case IP_FW_TABLE_XADD: /* IP_FW3 */ - case IP_FW_TABLE_XDEL: /* IP_FW3 */ - { - ipfw_table_xentry *xent = (ipfw_table_xentry *)(op3 + 1); - - /* Check minimum header size */ - if (IP_FW3_OPLENGTH(sopt) < offsetof(ipfw_table_xentry, k)) { - error = EINVAL; - break; - } - /* Check if len field is valid */ - if (xent->len > sizeof(ipfw_table_xentry)) { - error = EINVAL; - break; - } - - len = xent->len - offsetof(ipfw_table_xentry, k); - - error = (opt == IP_FW_TABLE_XADD) ? - ipfw_add_table_entry(chain, xent->tbl, &xent->k, - len, xent->masklen, xent->type, xent->value) : - ipfw_del_table_entry(chain, xent->tbl, &xent->k, - len, xent->masklen, xent->type); + memset(&tei, 0, sizeof(tei)); + tei.paddr = &ent.addr; + tei.subtype = AF_INET; + tei.masklen = ent.masklen; + ipfw_import_table_value_legacy(ent.value, &v); + tei.pvalue = &v; + memset(&ti, 0, sizeof(ti)); + ti.uidx = ent.tbl; + ti.type = IPFW_TABLE_CIDR; + + error = (opt == IP_FW_TABLE_ADD) ? 
+ add_table_entry(chain, &ti, &tei, 0, 1) : + del_table_entry(chain, &ti, &tei, 0, 1); } break; + case IP_FW_TABLE_FLUSH: { u_int16_t tbl; + struct tid_info ti; error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)); if (error) break; - error = ipfw_flush_table(chain, tbl); + memset(&ti, 0, sizeof(ti)); + ti.uidx = tbl; + error = flush_table(chain, &ti); } break; case IP_FW_TABLE_GETSIZE: { u_int32_t tbl, cnt; + struct tid_info ti; if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)))) break; + memset(&ti, 0, sizeof(ti)); + ti.uidx = tbl; IPFW_RLOCK(chain); - error = ipfw_count_table(chain, tbl, &cnt); + error = ipfw_count_table(chain, &ti, &cnt); IPFW_RUNLOCK(chain); if (error) break; @@ -1205,6 +3942,7 @@ ipfw_ctl(struct sockopt *sopt) case IP_FW_TABLE_LIST: { ipfw_table *tbl; + struct tid_info ti; if (sopt->sopt_valsize < sizeof(*tbl)) { error = EINVAL; @@ -1219,8 +3957,10 @@ ipfw_ctl(struct sockopt *sopt) } tbl->size = (size - sizeof(*tbl)) / sizeof(ipfw_table_entry); + memset(&ti, 0, sizeof(ti)); + ti.uidx = tbl->tbl; IPFW_RLOCK(chain); - error = ipfw_dump_table(chain, tbl); + error = ipfw_dump_table_legacy(chain, &ti, tbl); IPFW_RUNLOCK(chain); if (error) { free(tbl, M_TEMP); @@ -1231,62 +3971,6 @@ ipfw_ctl(struct sockopt *sopt) } break; - case IP_FW_TABLE_XGETSIZE: /* IP_FW3 */ - { - uint32_t *tbl; - - if (IP_FW3_OPLENGTH(sopt) < sizeof(uint32_t)) { - error = EINVAL; - break; - } - - tbl = (uint32_t *)(op3 + 1); - - IPFW_RLOCK(chain); - error = ipfw_count_xtable(chain, *tbl, tbl); - IPFW_RUNLOCK(chain); - if (error) - break; - error = sooptcopyout(sopt, op3, sopt->sopt_valsize); - } - break; - - case IP_FW_TABLE_XLIST: /* IP_FW3 */ - { - ipfw_xtable *tbl; - - if ((size = valsize) < sizeof(ipfw_xtable)) { - error = EINVAL; - break; - } - - tbl = malloc(size, M_TEMP, M_ZERO | M_WAITOK); - memcpy(tbl, op3, sizeof(ipfw_xtable)); - - /* Get maximum number of entries we can store */ - tbl->size = (size - sizeof(ipfw_xtable)) / - sizeof(ipfw_table_xentry); - IPFW_RLOCK(chain); - error = ipfw_dump_xtable(chain, tbl); - IPFW_RUNLOCK(chain); - if (error) { - free(tbl, M_TEMP); - break; - } - - /* Revert size field back to bytes */ - tbl->size = tbl->size * sizeof(ipfw_table_xentry) + - sizeof(ipfw_table); - /* - * Since we call sooptcopyin() with small buffer, sopt_valsize is - * decreased to reflect supplied buffer size. 
Set it back to original value - */ - sopt->sopt_valsize = valsize; - error = sooptcopyout(sopt, tbl, size); - free(tbl, M_TEMP); - } - break; - /*--- NAT operations are protected by the IPFW_LOCK ---*/ case IP_FW_NAT_CFG: if (IPFW_NAT_LOADED) @@ -1336,18 +4020,16 @@ ipfw_ctl(struct sockopt *sopt) return (error); #undef RULE_MAXSIZE } - - #define RULE_MAXSIZE (256*sizeof(u_int32_t)) /* Functions to convert rules 7.2 <==> 8.0 */ -int -convert_rule_to_7(struct ip_fw *rule) +static int +convert_rule_to_7(struct ip_fw_rule0 *rule) { /* Used to modify original rule */ struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; /* copy of original rule, version 8 */ - struct ip_fw *tmp; + struct ip_fw_rule0 *tmp; /* Used to copy commands */ ipfw_insn *ccmd, *dst; @@ -1360,13 +4042,12 @@ convert_rule_to_7(struct ip_fw *rule) bcopy(rule, tmp, RULE_MAXSIZE); /* Copy fields */ - rule7->_pad = tmp->_pad; + //rule7->_pad = tmp->_pad; rule7->set = tmp->set; rule7->rulenum = tmp->rulenum; rule7->cmd_len = tmp->cmd_len; rule7->act_ofs = tmp->act_ofs; rule7->next_rule = (struct ip_fw7 *)tmp->next_rule; - rule7->next = (struct ip_fw7 *)tmp->x_next; rule7->cmd_len = tmp->cmd_len; rule7->pcnt = tmp->pcnt; rule7->bcnt = tmp->bcnt; @@ -1396,8 +4077,8 @@ convert_rule_to_7(struct ip_fw *rule) return 0; } -int -convert_rule_to_8(struct ip_fw *rule) +static int +convert_rule_to_8(struct ip_fw_rule0 *rule) { /* Used to modify original rule */ struct ip_fw7 *rule7 = (struct ip_fw7 *) rule; @@ -1439,7 +4120,6 @@ convert_rule_to_8(struct ip_fw *rule) rule->cmd_len = tmp->cmd_len; rule->act_ofs = tmp->act_ofs; rule->next_rule = (struct ip_fw *)tmp->next_rule; - rule->x_next = (struct ip_fw *)tmp->next; rule->cmd_len = tmp->cmd_len; rule->id = 0; /* XXX see if is ok = 0 */ rule->pcnt = tmp->pcnt; @@ -1450,4 +4130,486 @@ convert_rule_to_8(struct ip_fw *rule) return 0; } +/* + * Named object api + * + */ + +void +ipfw_init_srv(struct ip_fw_chain *ch) +{ + + ch->srvmap = ipfw_objhash_create(IPFW_OBJECTS_DEFAULT); + ch->srvstate = malloc(sizeof(void *) * IPFW_OBJECTS_DEFAULT, + M_IPFW, M_WAITOK | M_ZERO); +} + +void +ipfw_destroy_srv(struct ip_fw_chain *ch) +{ + + free(ch->srvstate, M_IPFW); + ipfw_objhash_destroy(ch->srvmap); +} + +/* + * Allocate new bitmask which can be used to enlarge/shrink + * named instance index. + */ +void +ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks) +{ + size_t size; + int max_blocks; + u_long *idx_mask; + + KASSERT((items % BLOCK_ITEMS) == 0, + ("bitmask size needs to power of 2 and greater or equal to %zu", + BLOCK_ITEMS)); + + max_blocks = items / BLOCK_ITEMS; + size = items / 8; + idx_mask = malloc(size * IPFW_MAX_SETS, M_IPFW, M_WAITOK); + /* Mark all as free */ + memset(idx_mask, 0xFF, size * IPFW_MAX_SETS); + *idx_mask &= ~(u_long)1; /* Skip index 0 */ + + *idx = idx_mask; + *pblocks = max_blocks; +} + +/* + * Copy current bitmask index to new one. + */ +void +ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks) +{ + int old_blocks, new_blocks; + u_long *old_idx, *new_idx; + int i; + + old_idx = ni->idx_mask; + old_blocks = ni->max_blocks; + new_idx = *idx; + new_blocks = *blocks; + + for (i = 0; i < IPFW_MAX_SETS; i++) { + memcpy(&new_idx[new_blocks * i], &old_idx[old_blocks * i], + old_blocks * sizeof(u_long)); + } +} + +/* + * Swaps current @ni index with new one. 
+ */ +void +ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks) +{ + int old_blocks; + u_long *old_idx; + + old_idx = ni->idx_mask; + old_blocks = ni->max_blocks; + + ni->idx_mask = *idx; + ni->max_blocks = *blocks; + + /* Save old values */ + *idx = old_idx; + *blocks = old_blocks; +} + +void +ipfw_objhash_bitmap_free(void *idx, int blocks) +{ + + free(idx, M_IPFW); +} + +/* + * Creates named hash instance. + * Must be called without holding any locks. + * Return pointer to new instance. + */ +struct namedobj_instance * +ipfw_objhash_create(uint32_t items) +{ + struct namedobj_instance *ni; + int i; + size_t size; + + size = sizeof(struct namedobj_instance) + + sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE + + sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE; + + ni = malloc(size, M_IPFW, M_WAITOK | M_ZERO); + ni->nn_size = NAMEDOBJ_HASH_SIZE; + ni->nv_size = NAMEDOBJ_HASH_SIZE; + + ni->names = (struct namedobjects_head *)(ni +1); + ni->values = &ni->names[ni->nn_size]; + + for (i = 0; i < ni->nn_size; i++) + TAILQ_INIT(&ni->names[i]); + + for (i = 0; i < ni->nv_size; i++) + TAILQ_INIT(&ni->values[i]); + + /* Set default hashing/comparison functions */ + ni->hash_f = objhash_hash_name; + ni->cmp_f = objhash_cmp_name; + + /* Allocate bitmask separately due to possible resize */ + ipfw_objhash_bitmap_alloc(items, (void*)&ni->idx_mask, &ni->max_blocks); + + return (ni); +} + +void +ipfw_objhash_destroy(struct namedobj_instance *ni) +{ + + free(ni->idx_mask, M_IPFW); + free(ni, M_IPFW); +} + +void +ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f, + objhash_cmp_f *cmp_f) +{ + + ni->hash_f = hash_f; + ni->cmp_f = cmp_f; +} + +static uint32_t +objhash_hash_name(struct namedobj_instance *ni, const void *name, uint32_t set) +{ + + return (fnv_32_str((const char *)name, FNV1_32_INIT)); +} + +static int +objhash_cmp_name(struct named_object *no, const void *name, uint32_t set) +{ + + if ((strcmp(no->name, (const char *)name) == 0) && (no->set == set)) + return (0); + + return (1); +} + +static uint32_t +objhash_hash_idx(struct namedobj_instance *ni, uint32_t val) +{ + uint32_t v; + + v = val % (ni->nv_size - 1); + + return (v); +} + +struct named_object * +ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name) +{ + struct named_object *no; + uint32_t hash; + + hash = ni->hash_f(ni, name, set) % ni->nn_size; + + TAILQ_FOREACH(no, &ni->names[hash], nn_next) { + if (ni->cmp_f(no, name, set) == 0) + return (no); + } + + return (NULL); +} + +/* + * Find named object by @uid. + * Check @tlvs for valid data inside. + * + * Returns pointer to found TLV or NULL. + */ +ipfw_obj_ntlv * +ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx, uint32_t etlv) +{ + ipfw_obj_ntlv *ntlv; + uintptr_t pa, pe; + int l; + + pa = (uintptr_t)tlvs; + pe = pa + len; + l = 0; + for (; pa < pe; pa += l) { + ntlv = (ipfw_obj_ntlv *)pa; + l = ntlv->head.length; + + if (l != sizeof(*ntlv)) + return (NULL); + + if (ntlv->idx != uidx) + continue; + /* + * When userland has specified zero TLV type, do + * not compare it with eltv. In some cases userland + * doesn't know what type should it have. Use only + * uidx and name for search named_object. + */ + if (ntlv->head.type != 0 && + ntlv->head.type != (uint16_t)etlv) + continue; + + if (ipfw_check_object_name_generic(ntlv->name) != 0) + return (NULL); + + return (ntlv); + } + + return (NULL); +} + +/* + * Finds object config based on either legacy index + * or name in ntlv. 
+ * Note @ti structure contains unchecked data from userland. + * + * Returns 0 in success and fills in @pno with found config + */ +int +ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti, + uint32_t etlv, struct named_object **pno) +{ + char *name; + ipfw_obj_ntlv *ntlv; + uint32_t set; + + if (ti->tlvs == NULL) + return (EINVAL); + + ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, etlv); + if (ntlv == NULL) + return (EINVAL); + name = ntlv->name; + + /* + * Use set provided by @ti instead of @ntlv one. + * This is needed due to different sets behavior + * controlled by V_fw_tables_sets. + */ + set = ti->set; + *pno = ipfw_objhash_lookup_name(ni, set, name); + if (*pno == NULL) + return (ESRCH); + return (0); +} + +/* + * Find named object by name, considering also its TLV type. + */ +struct named_object * +ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, uint32_t set, + uint32_t type, const char *name) +{ + struct named_object *no; + uint32_t hash; + + hash = ni->hash_f(ni, name, set) % ni->nn_size; + + TAILQ_FOREACH(no, &ni->names[hash], nn_next) { + if (ni->cmp_f(no, name, set) == 0 && + no->etlv == (uint16_t)type) + return (no); + } + + return (NULL); +} + +struct named_object * +ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t kidx) +{ + struct named_object *no; + uint32_t hash; + + hash = objhash_hash_idx(ni, kidx); + + TAILQ_FOREACH(no, &ni->values[hash], nv_next) { + if (no->kidx == kidx) + return (no); + } + + return (NULL); +} + +int +ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, + struct named_object *b) +{ + + if ((strcmp(a->name, b->name) == 0) && a->set == b->set) + return (1); + + return (0); +} + +void +ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no) +{ + uint32_t hash; + + hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size; + TAILQ_INSERT_HEAD(&ni->names[hash], no, nn_next); + + hash = objhash_hash_idx(ni, no->kidx); + TAILQ_INSERT_HEAD(&ni->values[hash], no, nv_next); + + ni->count++; +} + +void +ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no) +{ + uint32_t hash; + + hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size; + TAILQ_REMOVE(&ni->names[hash], no, nn_next); + + hash = objhash_hash_idx(ni, no->kidx); + TAILQ_REMOVE(&ni->values[hash], no, nv_next); + + ni->count--; +} + +uint32_t +ipfw_objhash_count(struct namedobj_instance *ni) +{ + + return (ni->count); +} + +uint32_t +ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type) +{ + struct named_object *no; + uint32_t count; + int i; + + count = 0; + for (i = 0; i < ni->nn_size; i++) { + TAILQ_FOREACH(no, &ni->names[i], nn_next) { + if (no->etlv == type) + count++; + } + } + return (count); +} + +/* + * Runs @func for each found named object. + * It is safe to delete objects from callback + */ +int +ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg) +{ + struct named_object *no, *no_tmp; + int i, ret; + + for (i = 0; i < ni->nn_size; i++) { + TAILQ_FOREACH_SAFE(no, &ni->names[i], nn_next, no_tmp) { + ret = f(ni, no, arg); + if (ret != 0) + return (ret); + } + } + return (0); +} + +/* + * Runs @f for each found named object with type @type. 
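The foreach walkers take an objhash_cb_t callback with the shape visible in the calls above: f(ni, no, arg). Returning non-zero stops the walk and propagates that value, and TAILQ_FOREACH_SAFE makes it legal to unlink @no from inside the callback. A minimal sketch:

static int
count_refs(struct namedobj_instance *ni, struct named_object *no,
    void *arg)
{
	uint32_t *refs = arg;

	*refs += no->refcnt;
	return (0);		/* keep walking */
}

/* uint32_t refs = 0; */
/* ipfw_objhash_foreach(ni, count_refs, &refs); */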
+ * It is safe to delete objects from callback
+ */
+int
+ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f,
+    void *arg, uint16_t type)
+{
+	struct named_object *no, *no_tmp;
+	int i, ret;
+
+	for (i = 0; i < ni->nn_size; i++) {
+		TAILQ_FOREACH_SAFE(no, &ni->names[i], nn_next, no_tmp) {
+			if (no->etlv != type)
+				continue;
+			ret = f(ni, no, arg);
+			if (ret != 0)
+				return (ret);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Removes index from given set.
+ * Returns 0 on success.
+ */
+int
+ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx)
+{
+	u_long *mask;
+	int i, v;
+
+	i = idx / BLOCK_ITEMS;
+	v = idx % BLOCK_ITEMS;
+
+	if (i >= ni->max_blocks)
+		return (1);
+
+	mask = &ni->idx_mask[i];
+
+	if ((*mask & ((u_long)1 << v)) != 0)
+		return (1);
+
+	/* Mark as free */
+	*mask |= (u_long)1 << v;
+
+	/* Update free offset */
+	if (ni->free_off[0] > i)
+		ni->free_off[0] = i;
+
+	return (0);
+}
+
+/*
+ * Allocates new index in given instance and stores it in @pidx.
+ * Returns 0 on success.
+ */
+int
+ipfw_objhash_alloc_idx(void *n, uint16_t *pidx)
+{
+	struct namedobj_instance *ni;
+	u_long *mask;
+	int i, off, v;
+
+	ni = (struct namedobj_instance *)n;
+
+	off = ni->free_off[0];
+	mask = &ni->idx_mask[off];
+
+	for (i = off; i < ni->max_blocks; i++, mask++) {
+		if ((v = ffsl(*mask)) == 0)
+			continue;
+
+		/* Mark as busy */
+		*mask &= ~((u_long)1 << (v - 1));
+
+		ni->free_off[0] = i;
+
+		v = BLOCK_ITEMS * i + v - 1;
+
+		*pidx = v;
+		return (0);
+	}
+
+	return (1);
+}
+
 /* end of file */
diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_table.c b/freebsd/sys/netpfil/ipfw/ip_fw_table.c
index 71579795..9d2baad2 100644
--- a/freebsd/sys/netpfil/ipfw/ip_fw_table.c
+++ b/freebsd/sys/netpfil/ipfw/ip_fw_table.c
@@ -2,6 +2,8 @@
 /*-
  * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
+ * Copyright (c) 2014 Yandex LLC
+ * Copyright (c) 2014 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,24 +31,18 @@ __FBSDID("$FreeBSD$");
 
 /*
- * Lookup table support for ipfw
+ * Lookup table support for ipfw.
  *
- * Lookup tables are implemented (at the moment) using the radix
- * tree used for routing tables. Tables store key-value entries, where
- * keys are network prefixes (addr/masklen), and values are integers.
- * As a degenerate case we can interpret keys as 32-bit integers
- * (with a /32 mask).
+ * This file contains handlers for all generic tables' operations:
+ * add/del/flush entries, list/dump tables etc..
  *
- * The table is protected by the IPFW lock even for manipulation coming
- * from userland, because operations are typically fast.
+ * Table data modification is protected by both UH and runtime lock
+ * while reading configuration/data is protected by UH lock.
+ *
+ * Lookup algorithms for all table types are located in ip_fw_table_algo.c
 */
 
 #include <rtems/bsd/local/opt_ipfw.h>
-#include <rtems/bsd/local/opt_inet.h>
-#ifndef INET
-#error IPFIREWALL requires INET.
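The index bitmaps above keep a set bit for every free slot, so ipfw_objhash_alloc_idx() reduces to ffsl() over the blocks at or past free_off, and freeing an index is a single bit set plus a possible free_off rewind. The same scheme in isolation, as a self-contained sketch assuming BLOCK_ITEMS is the bit width of u_long:

#include <sys/param.h>	/* NBBY, u_long */
#include <stdint.h>
#include <strings.h>	/* ffsl() in userland */

#define	EX_BLOCK_ITEMS	(sizeof(u_long) * NBBY)

static u_long ex_mask[2] = {
	~(u_long)1,	/* bit 0 clear: index 0 is reserved */
	~(u_long)0,	/* all indexes in block 1 free */
};

static int
ex_alloc_idx(uint16_t *pidx)
{
	int i, v;

	for (i = 0; i < 2; i++) {
		if ((v = ffsl(ex_mask[i])) == 0)
			continue;			/* block full */
		ex_mask[i] &= ~((u_long)1 << (v - 1));	/* mark busy */
		*pidx = EX_BLOCK_ITEMS * i + v - 1;
		return (0);
	}
	return (1);					/* no free index */
}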
-#endif /* INET */ -#include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/sys/param.h> #include <sys/systm.h> @@ -54,713 +50,3296 @@ __FBSDID("$FreeBSD$"); #include <sys/kernel.h> #include <rtems/bsd/sys/lock.h> #include <sys/rwlock.h> +#include <sys/rmlock.h> #include <sys/socket.h> +#include <sys/socketvar.h> #include <sys/queue.h> #include <net/if.h> /* ip_fw.h requires IFNAMSIZ */ -#include <net/radix.h> -#include <net/route.h> -#include <net/vnet.h> #include <netinet/in.h> #include <netinet/ip_var.h> /* struct ipfw_rule_ref */ #include <netinet/ip_fw.h> #include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/ip_fw_table.h> -#ifdef MAC -#include <security/mac/mac_framework.h> -#endif + /* + * Table has the following `type` concepts: + * + * `no.type` represents lookup key type (addr, ifp, uid, etc..) + * vmask represents bitmask of table values which are present at the moment. + * Special IPFW_VTYPE_LEGACY ( (uint32_t)-1 ) represents old + * single-value-for-all approach. + */ +struct table_config { + struct named_object no; + uint8_t tflags; /* type flags */ + uint8_t locked; /* 1 if locked from changes */ + uint8_t linked; /* 1 if already linked */ + uint8_t ochanged; /* used by set swapping */ + uint8_t vshared; /* 1 if using shared value array */ + uint8_t spare[3]; + uint32_t count; /* Number of records */ + uint32_t limit; /* Max number of records */ + uint32_t vmask; /* bitmask with supported values */ + uint32_t ocount; /* used by set swapping */ + uint64_t gencnt; /* generation count */ + char tablename[64]; /* table name */ + struct table_algo *ta; /* Callbacks for given algo */ + void *astate; /* algorithm state */ + struct table_info ti_copy; /* data to put to table_info */ + struct namedobj_instance *vi; +}; -MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); +static int find_table_err(struct namedobj_instance *ni, struct tid_info *ti, + struct table_config **tc); +static struct table_config *find_table(struct namedobj_instance *ni, + struct tid_info *ti); +static struct table_config *alloc_table_config(struct ip_fw_chain *ch, + struct tid_info *ti, struct table_algo *ta, char *adata, uint8_t tflags); +static void free_table_config(struct namedobj_instance *ni, + struct table_config *tc); +static int create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti, + char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int ref); +static void link_table(struct ip_fw_chain *ch, struct table_config *tc); +static void unlink_table(struct ip_fw_chain *ch, struct table_config *tc); +static int find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti, + struct tentry_info *tei, uint32_t count, int op, struct table_config **ptc); +#define OP_ADD 1 +#define OP_DEL 0 +static int export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh, + struct sockopt_data *sd); +static void export_table_info(struct ip_fw_chain *ch, struct table_config *tc, + ipfw_xtable_info *i); +static int dump_table_tentry(void *e, void *arg); +static int dump_table_xentry(void *e, void *arg); + +static int swap_tables(struct ip_fw_chain *ch, struct tid_info *a, + struct tid_info *b); + +static int check_table_name(const char *name); +static int check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts, + struct table_config *tc, struct table_info *ti, uint32_t count); +static int destroy_table(struct ip_fw_chain *ch, struct tid_info *ti); + +static struct table_algo *find_table_algo(struct tables_config *tableconf, + struct tid_info *ti, char *name); + +static void 
objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti); +static void ntlv_to_ti(struct _ipfw_obj_ntlv *ntlv, struct tid_info *ti); + +#define CHAIN_TO_NI(chain) (CHAIN_TO_TCFG(chain)->namehash) +#define KIDX_TO_TI(ch, k) (&(((struct table_info *)(ch)->tablestate)[k])) + +#define TA_BUF_SZ 128 /* On-stack buffer for add/delete state */ -struct table_entry { - struct radix_node rn[2]; - struct sockaddr_in addr, mask; - u_int32_t value; -}; +void +rollback_toperation_state(struct ip_fw_chain *ch, void *object) +{ + struct tables_config *tcfg; + struct op_state *os; -struct xaddr_iface { - uint8_t if_len; /* length of this struct */ - uint8_t pad[7]; /* Align name */ - char ifname[IF_NAMESIZE]; /* Interface name */ -}; + tcfg = CHAIN_TO_TCFG(ch); + TAILQ_FOREACH(os, &tcfg->state_list, next) + os->func(object, os); +} + +void +add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts) +{ + struct tables_config *tcfg; + + tcfg = CHAIN_TO_TCFG(ch); + TAILQ_INSERT_HEAD(&tcfg->state_list, &ts->opstate, next); +} + +void +del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts) +{ + struct tables_config *tcfg; + + tcfg = CHAIN_TO_TCFG(ch); + TAILQ_REMOVE(&tcfg->state_list, &ts->opstate, next); +} + +void +tc_ref(struct table_config *tc) +{ + + tc->no.refcnt++; +} + +void +tc_unref(struct table_config *tc) +{ + + tc->no.refcnt--; +} + +static struct table_value * +get_table_value(struct ip_fw_chain *ch, struct table_config *tc, uint32_t kidx) +{ + struct table_value *pval; + + pval = (struct table_value *)ch->valuestate; + + return (&pval[kidx]); +} -struct table_xentry { - struct radix_node rn[2]; - union { -#ifdef INET6 - struct sockaddr_in6 addr6; -#endif - struct xaddr_iface iface; - } a; - union { -#ifdef INET6 - struct sockaddr_in6 mask6; -#endif - struct xaddr_iface ifmask; - } m; - u_int32_t value; -}; /* - * The radix code expects addr and mask to be array of bytes, - * with the first byte being the length of the array. rn_inithead - * is called with the offset in bits of the lookup key within the - * array. If we use a sockaddr_in as the underlying type, - * sin_len is conveniently located at offset 0, sin_addr is at - * offset 4 and normally aligned. - * But for portability, let's avoid assumption and make the code explicit + * Checks if we're able to insert/update entry @tei into table + * w.r.t @tc limits. + * May alter @tei to indicate insertion error / insert + * options. + * + * Returns 0 if operation can be performed/ */ -#define KEY_LEN(v) *((uint8_t *)&(v)) -#define KEY_OFS (8*offsetof(struct sockaddr_in, sin_addr)) +static int +check_table_limit(struct table_config *tc, struct tentry_info *tei) +{ + + if (tc->limit == 0 || tc->count < tc->limit) + return (0); + + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) { + /* Notify userland on error cause */ + tei->flags |= TEI_FLAGS_LIMIT; + return (EFBIG); + } + + /* + * We have UPDATE flag set. + * Permit updating record (if found), + * but restrict adding new one since we've + * already hit the limit. + */ + tei->flags |= TEI_FLAGS_DONTADD; + + return (0); +} + /* - * Do not require radix to compare more than actual IPv4/IPv6 address + * Convert algorithm callback return code into + * one of pre-defined states known by userland. 
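rollback_toperation_state() gives every sleeping table operation a chance to learn that the chain changed underneath it: before dropping the UH lock the operation links its tableop_state into the per-chain list, and any writer that invalidates table state walks that list and fires os->func(). A condensed sketch of the publish/check cycle, with error handling elided:

static int
example_slow_op(struct ip_fw_chain *ch, struct table_config *tc)
{
	struct tableop_state ts;

	IPFW_UH_WLOCK(ch);
	memset(&ts, 0, sizeof(ts));
	ts.ch = ch;
	ts.tc = tc;
	ts.opstate.func = rollback_add_entry;	/* sets ts.modified */
	add_toperation_state(ch, &ts);
	IPFW_UH_WUNLOCK(ch);

	/* sleepable work: M_WAITOK allocations, algo callbacks, ... */

	IPFW_UH_WLOCK(ch);
	del_toperation_state(ch, &ts);
	if (ts.modified != 0) {
		/* table was swapped/resized: roll back and restart */
	}
	IPFW_UH_WUNLOCK(ch);
	return (0);
}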
*/ -#define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t)) -#define KEY_LEN_INET6 (offsetof(struct sockaddr_in6, sin6_addr) + sizeof(struct in6_addr)) -#define KEY_LEN_IFACE (offsetof(struct xaddr_iface, ifname)) +static void +store_tei_result(struct tentry_info *tei, int op, int error, uint32_t num) +{ + int flag; -#define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr)) -#define OFF_LEN_INET6 (8 * offsetof(struct sockaddr_in6, sin6_addr)) -#define OFF_LEN_IFACE (8 * offsetof(struct xaddr_iface, ifname)) + flag = 0; + switch (error) { + case 0: + if (op == OP_ADD && num != 0) + flag = TEI_FLAGS_ADDED; + if (op == OP_DEL) + flag = TEI_FLAGS_DELETED; + break; + case ENOENT: + flag = TEI_FLAGS_NOTFOUND; + break; + case EEXIST: + flag = TEI_FLAGS_EXISTS; + break; + default: + flag = TEI_FLAGS_ERROR; + } -#ifdef INET6 -static inline void -ipv6_writemask(struct in6_addr *addr6, uint8_t mask) + tei->flags |= flag; +} + +/* + * Creates and references table with default parameters. + * Saves table config, algo and allocated kidx info @ptc, @pta and + * @pkidx if non-zero. + * Used for table auto-creation to support old binaries. + * + * Returns 0 on success. + */ +static int +create_table_compat(struct ip_fw_chain *ch, struct tid_info *ti, + uint16_t *pkidx) { - uint32_t *cp; + ipfw_xtable_info xi; + int error; + + memset(&xi, 0, sizeof(xi)); + /* Set default value mask for legacy clients */ + xi.vmask = IPFW_VTYPE_LEGACY; + + error = create_table_internal(ch, ti, NULL, &xi, pkidx, 1); + if (error != 0) + return (error); - for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) - *cp++ = 0xFFFFFFFF; - *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); + return (0); +} + +/* + * Find and reference existing table optionally + * creating new one. + * + * Saves found table config into @ptc. + * Note function may drop/acquire UH_WLOCK. + * Returns 0 if table was found/created and referenced + * or non-zero return code. + */ +static int +find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti, + struct tentry_info *tei, uint32_t count, int op, + struct table_config **ptc) +{ + struct namedobj_instance *ni; + struct table_config *tc; + uint16_t kidx; + int error; + + IPFW_UH_WLOCK_ASSERT(ch); + + ni = CHAIN_TO_NI(ch); + tc = NULL; + if ((tc = find_table(ni, ti)) != NULL) { + /* check table type */ + if (tc->no.subtype != ti->type) + return (EINVAL); + + if (tc->locked != 0) + return (EACCES); + + /* Try to exit early on limit hit */ + if (op == OP_ADD && count == 1 && + check_table_limit(tc, tei) != 0) + return (EFBIG); + + /* Reference and return */ + tc->no.refcnt++; + *ptc = tc; + return (0); + } + + if (op == OP_DEL) + return (ESRCH); + + /* Compatibility mode: create new table for old clients */ + if ((tei->flags & TEI_FLAGS_COMPAT) == 0) + return (ESRCH); + + IPFW_UH_WUNLOCK(ch); + error = create_table_compat(ch, ti, &kidx); + IPFW_UH_WLOCK(ch); + + if (error != 0) + return (error); + + tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx); + KASSERT(tc != NULL, ("create_table_compat returned bad idx %d", kidx)); + + /* OK, now we've got referenced table. */ + *ptc = tc; + return (0); +} + +/* + * Rolls back already @added to @tc entries using state array @ta_buf_m. + * Assume the following layout: + * 1) ADD state (ta_buf_m[0] ... t_buf_m[added - 1]) for handling update cases + * 2) DEL state (ta_buf_m[count[ ... 
t_buf_m[count + added - 1]) + * for storing deleted state + */ +static void +rollback_added_entries(struct ip_fw_chain *ch, struct table_config *tc, + struct table_info *tinfo, struct tentry_info *tei, caddr_t ta_buf_m, + uint32_t count, uint32_t added) +{ + struct table_algo *ta; + struct tentry_info *ptei; + caddr_t v, vv; + size_t ta_buf_sz; + int error, i; + uint32_t num; + + IPFW_UH_WLOCK_ASSERT(ch); + + ta = tc->ta; + ta_buf_sz = ta->ta_buf_size; + v = ta_buf_m; + vv = v + count * ta_buf_sz; + for (i = 0; i < added; i++, v += ta_buf_sz, vv += ta_buf_sz) { + ptei = &tei[i]; + if ((ptei->flags & TEI_FLAGS_UPDATED) != 0) { + + /* + * We have old value stored by previous + * call in @ptei->value. Do add once again + * to restore it. + */ + error = ta->add(tc->astate, tinfo, ptei, v, &num); + KASSERT(error == 0, ("rollback UPDATE fail")); + KASSERT(num == 0, ("rollback UPDATE fail2")); + continue; + } + + error = ta->prepare_del(ch, ptei, vv); + KASSERT(error == 0, ("pre-rollback INSERT failed")); + error = ta->del(tc->astate, tinfo, ptei, vv, &num); + KASSERT(error == 0, ("rollback INSERT failed")); + tc->count -= num; + } +} + +/* + * Prepares add/del state for all @count entries in @tei. + * Uses either stack buffer (@ta_buf) or allocates a new one. + * Stores pointer to allocated buffer back to @ta_buf. + * + * Returns 0 on success. + */ +static int +prepare_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta, + struct tentry_info *tei, uint32_t count, int op, caddr_t *ta_buf) +{ + caddr_t ta_buf_m, v; + size_t ta_buf_sz, sz; + struct tentry_info *ptei; + int error, i; + + error = 0; + ta_buf_sz = ta->ta_buf_size; + if (count == 1) { + /* Sigle add/delete, use on-stack buffer */ + memset(*ta_buf, 0, TA_BUF_SZ); + ta_buf_m = *ta_buf; + } else { + + /* + * Multiple adds/deletes, allocate larger buffer + * + * Note we need 2xcount buffer for add case: + * we have hold both ADD state + * and DELETE state (this may be needed + * if we need to rollback all changes) + */ + sz = count * ta_buf_sz; + ta_buf_m = malloc((op == OP_ADD) ? sz * 2 : sz, M_TEMP, + M_WAITOK | M_ZERO); + } + + v = ta_buf_m; + for (i = 0; i < count; i++, v += ta_buf_sz) { + ptei = &tei[i]; + error = (op == OP_ADD) ? + ta->prepare_add(ch, ptei, v) : ta->prepare_del(ch, ptei, v); + + /* + * Some syntax error (incorrect mask, or address, or + * anything). Return error regardless of atomicity + * settings. + */ + if (error != 0) + break; + } + + *ta_buf = ta_buf_m; + return (error); } -#endif +/* + * Flushes allocated state for each @count entries in @tei. + * Frees @ta_buf_m if differs from stack buffer @ta_buf. 
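For an OP_ADD batch the buffer prepared above is laid out as two count-sized arrays of ta->ta_buf_size slots:

	ta_buf_m: [ ADD state 0 .. count-1 | DEL state 0 .. count-1 ]

The first half feeds ta->add(); the second half is only filled by rollback_added_entries() when an atomic batch has to be undone, which is why OP_ADD allocates sz * 2. A single-entry operation skips malloc() entirely and reuses the caller's TA_BUF_SZ on-stack buffer.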
+ */ +static void +flush_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta, + struct tentry_info *tei, uint32_t count, int rollback, + caddr_t ta_buf_m, caddr_t ta_buf) +{ + caddr_t v; + struct tentry_info *ptei; + size_t ta_buf_sz; + int i; + + ta_buf_sz = ta->ta_buf_size; + + /* Run cleaning callback anyway */ + v = ta_buf_m; + for (i = 0; i < count; i++, v += ta_buf_sz) { + ptei = &tei[i]; + ta->flush_entry(ch, ptei, v); + if (ptei->ptv != NULL) { + free(ptei->ptv, M_IPFW); + ptei->ptv = NULL; + } + } + + /* Clean up "deleted" state in case of rollback */ + if (rollback != 0) { + v = ta_buf_m + count * ta_buf_sz; + for (i = 0; i < count; i++, v += ta_buf_sz) + ta->flush_entry(ch, &tei[i], v); + } + + if (ta_buf_m != ta_buf) + free(ta_buf_m, M_TEMP); +} + + +static void +rollback_add_entry(void *object, struct op_state *_state) +{ + struct ip_fw_chain *ch; + struct tableop_state *ts; + + ts = (struct tableop_state *)_state; + + if (ts->tc != object && ts->ch != object) + return; + + ch = ts->ch; + + IPFW_UH_WLOCK_ASSERT(ch); + + /* Call specifid unlockers */ + rollback_table_values(ts); + + /* Indicate we've called */ + ts->modified = 1; +} + +/* + * Adds/updates one or more entries in table @ti. + * + * Function may drop/reacquire UH wlock multiple times due to + * items alloc, algorithm callbacks (check_space), value linkage + * (new values, value storage realloc), etc.. + * Other processes like other adds (which may involve storage resize), + * table swaps (which changes table data and may change algo type), + * table modify (which may change value mask) may be executed + * simultaneously so we need to deal with it. + * + * The following approach was implemented: + * we have per-chain linked list, protected with UH lock. + * add_table_entry prepares special on-stack structure wthich is passed + * to its descendants. Users add this structure to this list before unlock. + * After performing needed operations and acquiring UH lock back, each user + * checks if structure has changed. If true, it rolls local state back and + * returns without error to the caller. + * add_table_entry() on its own checks if structure has changed and restarts + * its operation from the beginning (goto restart). + * + * Functions which are modifying fields of interest (currently + * resize_shared_value_storage() and swap_tables() ) + * traverses given list while holding UH lock immediately before + * performing their operations calling function provided be list entry + * ( currently rollback_add_entry ) which performs rollback for all necessary + * state and sets appropriate values in structure indicating rollback + * has happened. + * + * Algo interaction: + * Function references @ti first to ensure table won't + * disappear or change its type. + * After that, prepare_add callback is called for each @tei entry. + * Next, we try to add each entry under UH+WHLOCK + * using add() callback. + * Finally, we free all state by calling flush_entry callback + * for each @tei. + * + * Returns 0 on success. 
+ */ int -ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, - uint8_t plen, uint8_t mlen, uint8_t type, uint32_t value) -{ - struct radix_node_head *rnh, **rnh_ptr; - struct table_entry *ent; - struct table_xentry *xent; - struct radix_node *rn; - in_addr_t addr; - int offset; - void *ent_ptr; - struct sockaddr *addr_ptr, *mask_ptr; - char c; - - if (tbl >= V_fw_tables_max) - return (EINVAL); +add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, + struct tentry_info *tei, uint8_t flags, uint32_t count) +{ + struct table_config *tc; + struct table_algo *ta; + uint16_t kidx; + int error, first_error, i, rollback; + uint32_t num, numadd; + struct tentry_info *ptei; + struct tableop_state ts; + char ta_buf[TA_BUF_SZ]; + caddr_t ta_buf_m, v; + + memset(&ts, 0, sizeof(ts)); + ta = NULL; + IPFW_UH_WLOCK(ch); - switch (type) { - case IPFW_TABLE_CIDR: - if (plen == sizeof(in_addr_t)) { -#ifdef INET - /* IPv4 case */ - if (mlen > 32) - return (EINVAL); - ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); - ent->value = value; - /* Set 'total' structure length */ - KEY_LEN(ent->addr) = KEY_LEN_INET; - KEY_LEN(ent->mask) = KEY_LEN_INET; - /* Set offset of IPv4 address in bits */ - offset = OFF_LEN_INET; - ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); - addr = *((in_addr_t *)paddr); - ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; - /* Set pointers */ - rnh_ptr = &ch->tables[tbl]; - ent_ptr = ent; - addr_ptr = (struct sockaddr *)&ent->addr; - mask_ptr = (struct sockaddr *)&ent->mask; -#endif -#ifdef INET6 - } else if (plen == sizeof(struct in6_addr)) { - /* IPv6 case */ - if (mlen > 128) - return (EINVAL); - xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO); - xent->value = value; - /* Set 'total' structure length */ - KEY_LEN(xent->a.addr6) = KEY_LEN_INET6; - KEY_LEN(xent->m.mask6) = KEY_LEN_INET6; - /* Set offset of IPv6 address in bits */ - offset = OFF_LEN_INET6; - ipv6_writemask(&xent->m.mask6.sin6_addr, mlen); - memcpy(&xent->a.addr6.sin6_addr, paddr, sizeof(struct in6_addr)); - APPLY_MASK(&xent->a.addr6.sin6_addr, &xent->m.mask6.sin6_addr); - /* Set pointers */ - rnh_ptr = &ch->xtables[tbl]; - ent_ptr = xent; - addr_ptr = (struct sockaddr *)&xent->a.addr6; - mask_ptr = (struct sockaddr *)&xent->m.mask6; -#endif - } else { - /* Unknown CIDR type */ - return (EINVAL); + /* + * Find and reference existing table. + */ +restart: + if (ts.modified != 0) { + IPFW_UH_WUNLOCK(ch); + flush_batch_buffer(ch, ta, tei, count, rollback, + ta_buf_m, ta_buf); + memset(&ts, 0, sizeof(ts)); + ta = NULL; + IPFW_UH_WLOCK(ch); + } + + error = find_ref_table(ch, ti, tei, count, OP_ADD, &tc); + if (error != 0) { + IPFW_UH_WUNLOCK(ch); + return (error); + } + ta = tc->ta; + + /* Fill in tablestate */ + ts.ch = ch; + ts.opstate.func = rollback_add_entry; + ts.tc = tc; + ts.vshared = tc->vshared; + ts.vmask = tc->vmask; + ts.ta = ta; + ts.tei = tei; + ts.count = count; + rollback = 0; + add_toperation_state(ch, &ts); + IPFW_UH_WUNLOCK(ch); + + /* Allocate memory and prepare record(s) */ + /* Pass stack buffer by default */ + ta_buf_m = ta_buf; + error = prepare_batch_buffer(ch, ta, tei, count, OP_ADD, &ta_buf_m); + + IPFW_UH_WLOCK(ch); + del_toperation_state(ch, &ts); + /* Drop reference we've used in first search */ + tc->no.refcnt--; + + /* Check prepare_batch_buffer() error */ + if (error != 0) + goto cleanup; + + /* + * Check if table swap has happened. + * (so table algo might be changed). 
+ * Restart operation to achieve consistent behavior. + */ + if (ts.modified != 0) + goto restart; + + /* + * Link all values values to shared/per-table value array. + * + * May release/reacquire UH_WLOCK. + */ + error = ipfw_link_table_values(ch, &ts); + if (error != 0) + goto cleanup; + if (ts.modified != 0) + goto restart; + + /* + * Ensure we are able to add all entries without additional + * memory allocations. May release/reacquire UH_WLOCK. + */ + kidx = tc->no.kidx; + error = check_table_space(ch, &ts, tc, KIDX_TO_TI(ch, kidx), count); + if (error != 0) + goto cleanup; + if (ts.modified != 0) + goto restart; + + /* We've got valid table in @tc. Let's try to add data */ + kidx = tc->no.kidx; + ta = tc->ta; + numadd = 0; + first_error = 0; + + IPFW_WLOCK(ch); + + v = ta_buf_m; + for (i = 0; i < count; i++, v += ta->ta_buf_size) { + ptei = &tei[i]; + num = 0; + /* check limit before adding */ + if ((error = check_table_limit(tc, ptei)) == 0) { + error = ta->add(tc->astate, KIDX_TO_TI(ch, kidx), + ptei, v, &num); + /* Set status flag to inform userland */ + store_tei_result(ptei, OP_ADD, error, num); } + if (error == 0) { + /* Update number of records to ease limit checking */ + tc->count += num; + numadd += num; + continue; + } + + if (first_error == 0) + first_error = error; + + /* + * Some error have happened. Check our atomicity + * settings: continue if atomicity is not required, + * rollback changes otherwise. + */ + if ((flags & IPFW_CTF_ATOMIC) == 0) + continue; + + rollback_added_entries(ch, tc, KIDX_TO_TI(ch, kidx), + tei, ta_buf_m, count, i); + + rollback = 1; break; + } + + IPFW_WUNLOCK(ch); + + ipfw_garbage_table_values(ch, tc, tei, count, rollback); + + /* Permit post-add algorithm grow/rehash. */ + if (numadd != 0) + check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0); + + /* Return first error to user, if any */ + error = first_error; + +cleanup: + IPFW_UH_WUNLOCK(ch); + + flush_batch_buffer(ch, ta, tei, count, rollback, ta_buf_m, ta_buf); - case IPFW_TABLE_INTERFACE: - /* Check if string is terminated */ - c = ((char *)paddr)[IF_NAMESIZE - 1]; - ((char *)paddr)[IF_NAMESIZE - 1] = '\0'; - if (((mlen = strlen((char *)paddr)) == IF_NAMESIZE - 1) && (c != '\0')) - return (EINVAL); + return (error); +} - /* Include last \0 into comparison */ - mlen++; - - xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO); - xent->value = value; - /* Set 'total' structure length */ - KEY_LEN(xent->a.iface) = KEY_LEN_IFACE + mlen; - KEY_LEN(xent->m.ifmask) = KEY_LEN_IFACE + mlen; - /* Set offset of interface name in bits */ - offset = OFF_LEN_IFACE; - memcpy(xent->a.iface.ifname, paddr, mlen); - /* Assume direct match */ - /* TODO: Add interface pattern matching */ -#if 0 - memset(xent->m.ifmask.ifname, 0xFF, IF_NAMESIZE); - mask_ptr = (struct sockaddr *)&xent->m.ifmask; -#endif - /* Set pointers */ - rnh_ptr = &ch->xtables[tbl]; - ent_ptr = xent; - addr_ptr = (struct sockaddr *)&xent->a.iface; - mask_ptr = NULL; - break; +/* + * Deletes one or more entries in table @ti. + * + * Returns 0 on success. + */ +int +del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, + struct tentry_info *tei, uint8_t flags, uint32_t count) +{ + struct table_config *tc; + struct table_algo *ta; + struct tentry_info *ptei; + uint16_t kidx; + int error, first_error, i; + uint32_t num, numdel; + char ta_buf[TA_BUF_SZ]; + caddr_t ta_buf_m, v; - default: - return (EINVAL); + /* + * Find and reference existing table. 
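Callers batch entries simply by passing an array of tentry_info; with IPFW_CTF_ATOMIC set, the first failing entry makes add_table_entry() undo everything inserted so far instead of continuing. A hedged sketch against an existing table, with key/value setup elided and the table number invented:

static int
example_atomic_add(struct ip_fw_chain *ch)
{
	struct tentry_info tei[2];
	struct tid_info ti;

	memset(tei, 0, sizeof(tei));
	memset(&ti, 0, sizeof(ti));
	ti.uidx = 10;			/* hypothetical table number */
	ti.type = IPFW_TABLE_CIDR;
	/* fill tei[i].paddr, tei[i].subtype, tei[i].masklen, tei[i].pvalue */

	/* all-or-nothing: a partial batch is rolled back on error */
	return (add_table_entry(ch, &ti, tei, IPFW_CTF_ATOMIC, 2));
}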
+ */ + IPFW_UH_WLOCK(ch); + error = find_ref_table(ch, ti, tei, count, OP_DEL, &tc); + if (error != 0) { + IPFW_UH_WUNLOCK(ch); + return (error); + } + ta = tc->ta; + IPFW_UH_WUNLOCK(ch); + + /* Allocate memory and prepare record(s) */ + /* Pass stack buffer by default */ + ta_buf_m = ta_buf; + error = prepare_batch_buffer(ch, ta, tei, count, OP_DEL, &ta_buf_m); + if (error != 0) + goto cleanup; + + IPFW_UH_WLOCK(ch); + + /* Drop reference we've used in first search */ + tc->no.refcnt--; + + /* + * Check if table algo is still the same. + * (changed ta may be the result of table swap). + */ + if (ta != tc->ta) { + IPFW_UH_WUNLOCK(ch); + error = EINVAL; + goto cleanup; } + kidx = tc->no.kidx; + numdel = 0; + first_error = 0; + IPFW_WLOCK(ch); + v = ta_buf_m; + for (i = 0; i < count; i++, v += ta->ta_buf_size) { + ptei = &tei[i]; + num = 0; + error = ta->del(tc->astate, KIDX_TO_TI(ch, kidx), ptei, v, + &num); + /* Save state for userland */ + store_tei_result(ptei, OP_DEL, error, num); + if (error != 0 && first_error == 0) + first_error = error; + tc->count -= num; + numdel += num; + } + IPFW_WUNLOCK(ch); - /* Check if tabletype is valid */ - if ((ch->tabletype[tbl] != 0) && (ch->tabletype[tbl] != type)) { - IPFW_WUNLOCK(ch); - free(ent_ptr, M_IPFW_TBL); - return (EINVAL); + /* Unlink non-used values */ + ipfw_garbage_table_values(ch, tc, tei, count, 0); + + if (numdel != 0) { + /* Run post-del hook to permit shrinking */ + check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0); } - /* Check if radix tree exists */ - if ((rnh = *rnh_ptr) == NULL) { - IPFW_WUNLOCK(ch); - /* Create radix for a new table */ - if (!rn_inithead((void **)&rnh, offset)) { - free(ent_ptr, M_IPFW_TBL); - return (ENOMEM); + IPFW_UH_WUNLOCK(ch); + + /* Return first error to user, if any */ + error = first_error; + +cleanup: + flush_batch_buffer(ch, ta, tei, count, 0, ta_buf_m, ta_buf); + + return (error); +} + +/* + * Ensure that table @tc has enough space to add @count entries without + * need for reallocation. + * + * Callbacks order: + * 0) need_modify() (UH_WLOCK) - checks if @count items can be added w/o resize. + * + * 1) alloc_modify (no locks, M_WAITOK) - alloc new state based on @pflags. + * 2) prepare_modifyt (UH_WLOCK) - copy old data into new storage + * 3) modify (UH_WLOCK + WLOCK) - switch pointers + * 4) flush_modify (UH_WLOCK) - free state, if needed + * + * Returns 0 on success. + */ +static int +check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts, + struct table_config *tc, struct table_info *ti, uint32_t count) +{ + struct table_algo *ta; + uint64_t pflags; + char ta_buf[TA_BUF_SZ]; + int error; + + IPFW_UH_WLOCK_ASSERT(ch); + + error = 0; + ta = tc->ta; + if (ta->need_modify == NULL) + return (0); + + /* Acquire reference not to loose @tc between locks/unlocks */ + tc->no.refcnt++; + + /* + * TODO: think about avoiding race between large add/large delete + * operation on algorithm which implements shrinking along with + * growing. + */ + while (true) { + pflags = 0; + if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) { + error = 0; + break; } - IPFW_WLOCK(ch); - if (*rnh_ptr != NULL) { - /* Tree is already attached by other thread */ - rn_detachhead((void **)&rnh); - rnh = *rnh_ptr; - /* Check table type another time */ - if (ch->tabletype[tbl] != type) { - IPFW_WUNLOCK(ch); - free(ent_ptr, M_IPFW_TBL); - return (EINVAL); - } - } else { - *rnh_ptr = rnh; - /* - * Set table type. 
It can be set already - * (if we have IPv6-only table) but setting - * it another time does not hurt + /* We have to shrink/grow table */ + if (ts != NULL) + add_toperation_state(ch, ts); + IPFW_UH_WUNLOCK(ch); + + memset(&ta_buf, 0, sizeof(ta_buf)); + error = ta->prepare_mod(ta_buf, &pflags); + + IPFW_UH_WLOCK(ch); + if (ts != NULL) + del_toperation_state(ch, ts); + + if (error != 0) + break; + + if (ts != NULL && ts->modified != 0) { + + /* + * Swap operation has happened + * so we're currently operating on other + * table data. Stop doing this. + */ + ta->flush_mod(ta_buf); + break; + } + + /* Check if we still need to alter table */ + ti = KIDX_TO_TI(ch, tc->no.kidx); + if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) { + IPFW_UH_WUNLOCK(ch); + + /* + * Other thread has already performed resize. + * Flush our state and return. */ - ch->tabletype[tbl] = type; + ta->flush_mod(ta_buf); + break; + } + + error = ta->fill_mod(tc->astate, ti, ta_buf, &pflags); + if (error == 0) { + /* Do actual modification */ + IPFW_WLOCK(ch); + ta->modify(tc->astate, ti, ta_buf, pflags); + IPFW_WUNLOCK(ch); } + + /* Anyway, flush data and retry */ + ta->flush_mod(ta_buf); } - rn = rnh->rnh_addaddr(addr_ptr, mask_ptr, rnh, ent_ptr); - IPFW_WUNLOCK(ch); + tc->no.refcnt--; + return (error); +} - if (rn == NULL) { - free(ent_ptr, M_IPFW_TBL); - return (EEXIST); +/* + * Adds or deletes record in table. + * Data layout (v0): + * Request: [ ip_fw3_opheader ipfw_table_xentry ] + * + * Returns 0 on success + */ +static int +manage_table_ent_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_table_xentry *xent; + struct tentry_info tei; + struct tid_info ti; + struct table_value v; + int error, hdrlen, read; + + hdrlen = offsetof(ipfw_table_xentry, k); + + /* Check minimum header size */ + if (sd->valsize < (sizeof(*op3) + hdrlen)) + return (EINVAL); + + read = sizeof(ip_fw3_opheader); + + /* Check if xentry len field is valid */ + xent = (ipfw_table_xentry *)(op3 + 1); + if (xent->len < hdrlen || xent->len + read > sd->valsize) + return (EINVAL); + + memset(&tei, 0, sizeof(tei)); + tei.paddr = &xent->k; + tei.masklen = xent->masklen; + ipfw_import_table_value_legacy(xent->value, &v); + tei.pvalue = &v; + /* Old requests compatibility */ + tei.flags = TEI_FLAGS_COMPAT; + if (xent->type == IPFW_TABLE_ADDR) { + if (xent->len - hdrlen == sizeof(in_addr_t)) + tei.subtype = AF_INET; + else + tei.subtype = AF_INET6; } - return (0); + + memset(&ti, 0, sizeof(ti)); + ti.uidx = xent->tbl; + ti.type = xent->type; + + error = (op3->opcode == IP_FW_TABLE_XADD) ? + add_table_entry(ch, &ti, &tei, 0, 1) : + del_table_entry(ch, &ti, &tei, 0, 1); + + return (error); } -int -ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, - uint8_t plen, uint8_t mlen, uint8_t type) +/* + * Adds or deletes record in table. 
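From the algorithm side, supporting resize means implementing the five callbacks in the order check_table_space() drives them. The signatures below match the call sites above; the example state layout and all ex_* names are invented for illustration:

struct ex_mod_state {
	void	*new_storage;
	size_t	 new_size;
};

static int
ex_need_modify(void *astate, struct table_info *ti, uint32_t count,
    uint64_t *pflags)
{

	/* return non-zero if resize needed; encode target size in *pflags */
	return (0);
}

static int
ex_prepare_mod(void *ta_buf, uint64_t *pflags)
{
	struct ex_mod_state *ms = ta_buf;

	/* no locks held here, so M_WAITOK is fine */
	ms->new_size = (size_t)*pflags;
	ms->new_storage = malloc(ms->new_size, M_IPFW, M_WAITOK | M_ZERO);
	return (0);
}

static int
ex_fill_mod(void *astate, struct table_info *ti, void *ta_buf,
    uint64_t *pflags)
{

	/* UH_WLOCK held: copy old data into ms->new_storage */
	return (0);
}

static void
ex_modify(void *astate, struct table_info *ti, void *ta_buf,
    uint64_t pflags)
{

	/* UH_WLOCK + WLOCK held: switch pointers only, no slow work */
}

static void
ex_flush_mod(void *ta_buf)
{

	/* free whatever ex_modify() did not consume */
}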
+ * Data layout (v1)(current): + * Request: [ ipfw_obj_header + * ipfw_obj_ctlv(IPFW_TLV_TBLENT_LIST) [ ipfw_obj_tentry x N ] + * ] + * + * Returns 0 on success + */ +static int +manage_table_ent_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) { - struct radix_node_head *rnh, **rnh_ptr; - struct table_entry *ent; - in_addr_t addr; - struct sockaddr_in sa, mask; - struct sockaddr *sa_ptr, *mask_ptr; - char c; + ipfw_obj_tentry *tent, *ptent; + ipfw_obj_ctlv *ctlv; + ipfw_obj_header *oh; + struct tentry_info *ptei, tei, *tei_buf; + struct tid_info ti; + int error, i, kidx, read; + + /* Check minimum header size */ + if (sd->valsize < (sizeof(*oh) + sizeof(*ctlv))) + return (EINVAL); - if (tbl >= V_fw_tables_max) + /* Check if passed data is too long */ + if (sd->valsize != sd->kavail) return (EINVAL); - switch (type) { - case IPFW_TABLE_CIDR: - if (plen == sizeof(in_addr_t)) { - /* Set 'total' structure length */ - KEY_LEN(sa) = KEY_LEN_INET; - KEY_LEN(mask) = KEY_LEN_INET; - mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); - addr = *((in_addr_t *)paddr); - sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr; - rnh_ptr = &ch->tables[tbl]; - sa_ptr = (struct sockaddr *)&sa; - mask_ptr = (struct sockaddr *)&mask; -#ifdef INET6 - } else if (plen == sizeof(struct in6_addr)) { - /* IPv6 case */ - if (mlen > 128) - return (EINVAL); - struct sockaddr_in6 sa6, mask6; - memset(&sa6, 0, sizeof(struct sockaddr_in6)); - memset(&mask6, 0, sizeof(struct sockaddr_in6)); - /* Set 'total' structure length */ - KEY_LEN(sa6) = KEY_LEN_INET6; - KEY_LEN(mask6) = KEY_LEN_INET6; - ipv6_writemask(&mask6.sin6_addr, mlen); - memcpy(&sa6.sin6_addr, paddr, sizeof(struct in6_addr)); - APPLY_MASK(&sa6.sin6_addr, &mask6.sin6_addr); - rnh_ptr = &ch->xtables[tbl]; - sa_ptr = (struct sockaddr *)&sa6; - mask_ptr = (struct sockaddr *)&mask6; -#endif - } else { - /* Unknown CIDR type */ - return (EINVAL); - } - break; + oh = (ipfw_obj_header *)sd->kbuf; - case IPFW_TABLE_INTERFACE: - /* Check if string is terminated */ - c = ((char *)paddr)[IF_NAMESIZE - 1]; - ((char *)paddr)[IF_NAMESIZE - 1] = '\0'; - if (((mlen = strlen((char *)paddr)) == IF_NAMESIZE - 1) && (c != '\0')) - return (EINVAL); + /* Basic length checks for TLVs */ + if (oh->ntlv.head.length != sizeof(oh->ntlv)) + return (EINVAL); - struct xaddr_iface ifname, ifmask; - memset(&ifname, 0, sizeof(ifname)); - - /* Include last \0 into comparison */ - mlen++; - - /* Set 'total' structure length */ - KEY_LEN(ifname) = KEY_LEN_IFACE + mlen; - KEY_LEN(ifmask) = KEY_LEN_IFACE + mlen; - /* Assume direct match */ - /* FIXME: Add interface pattern matching */ -#if 0 - memset(ifmask.ifname, 0xFF, IF_NAMESIZE); - mask_ptr = (struct sockaddr *)&ifmask; -#endif - mask_ptr = NULL; - memcpy(ifname.ifname, paddr, mlen); - /* Set pointers */ - rnh_ptr = &ch->xtables[tbl]; - sa_ptr = (struct sockaddr *)&ifname; + read = sizeof(*oh); - break; + ctlv = (ipfw_obj_ctlv *)(oh + 1); + if (ctlv->head.length + read != sd->valsize) + return (EINVAL); - default: + read += sizeof(*ctlv); + tent = (ipfw_obj_tentry *)(ctlv + 1); + if (ctlv->count * sizeof(*tent) + read != sd->valsize) return (EINVAL); + + if (ctlv->count == 0) + return (0); + + /* + * Mark entire buffer as "read". + * This instructs sopt api write it back + * after function return. 
+ */ + ipfw_get_sopt_header(sd, sd->valsize); + + /* Perform basic checks for each entry */ + ptent = tent; + kidx = tent->idx; + for (i = 0; i < ctlv->count; i++, ptent++) { + if (ptent->head.length != sizeof(*ptent)) + return (EINVAL); + if (ptent->idx != kidx) + return (ENOTSUP); } - IPFW_WLOCK(ch); - if ((rnh = *rnh_ptr) == NULL) { - IPFW_WUNLOCK(ch); + /* Convert data into kernel request objects */ + objheader_to_ti(oh, &ti); + ti.type = oh->ntlv.type; + ti.uidx = kidx; + + /* Use on-stack buffer for single add/del */ + if (ctlv->count == 1) { + memset(&tei, 0, sizeof(tei)); + tei_buf = &tei; + } else + tei_buf = malloc(ctlv->count * sizeof(tei), M_TEMP, + M_WAITOK | M_ZERO); + + ptei = tei_buf; + ptent = tent; + for (i = 0; i < ctlv->count; i++, ptent++, ptei++) { + ptei->paddr = &ptent->k; + ptei->subtype = ptent->subtype; + ptei->masklen = ptent->masklen; + if (ptent->head.flags & IPFW_TF_UPDATE) + ptei->flags |= TEI_FLAGS_UPDATE; + + ipfw_import_table_value_v1(&ptent->v.value); + ptei->pvalue = (struct table_value *)&ptent->v.value; + } + + error = (oh->opheader.opcode == IP_FW_TABLE_XADD) ? + add_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count) : + del_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count); + + /* Translate result back to userland */ + ptei = tei_buf; + ptent = tent; + for (i = 0; i < ctlv->count; i++, ptent++, ptei++) { + if (ptei->flags & TEI_FLAGS_ADDED) + ptent->result = IPFW_TR_ADDED; + else if (ptei->flags & TEI_FLAGS_DELETED) + ptent->result = IPFW_TR_DELETED; + else if (ptei->flags & TEI_FLAGS_UPDATED) + ptent->result = IPFW_TR_UPDATED; + else if (ptei->flags & TEI_FLAGS_LIMIT) + ptent->result = IPFW_TR_LIMIT; + else if (ptei->flags & TEI_FLAGS_ERROR) + ptent->result = IPFW_TR_ERROR; + else if (ptei->flags & TEI_FLAGS_NOTFOUND) + ptent->result = IPFW_TR_NOTFOUND; + else if (ptei->flags & TEI_FLAGS_EXISTS) + ptent->result = IPFW_TR_EXISTS; + ipfw_export_table_value_v1(ptei->pvalue, &ptent->v.value); + } + + if (tei_buf != &tei) + free(tei_buf, M_TEMP); + + return (error); +} + +/* + * Looks up an entry in given table. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_obj_tentry ] + * Reply: [ ipfw_obj_header ipfw_obj_tentry ] + * + * Returns 0 on success + */ +static int +find_table_entry(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_tentry *tent; + ipfw_obj_header *oh; + struct tid_info ti; + struct table_config *tc; + struct table_algo *ta; + struct table_info *kti; + struct namedobj_instance *ni; + int error; + size_t sz; + + /* Check minimum header size */ + sz = sizeof(*oh) + sizeof(*tent); + if (sd->valsize != sz) + return (EINVAL); + + oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); + tent = (ipfw_obj_tentry *)(oh + 1); + + /* Basic length checks for TLVs */ + if (oh->ntlv.head.length != sizeof(oh->ntlv)) + return (EINVAL); + + objheader_to_ti(oh, &ti); + ti.type = oh->ntlv.type; + ti.uidx = tent->idx; + + IPFW_UH_RLOCK(ch); + ni = CHAIN_TO_NI(ch); + + /* + * Find existing table and check its type . 
+ */ + ta = NULL; + if ((tc = find_table(ni, &ti)) == NULL) { + IPFW_UH_RUNLOCK(ch); return (ESRCH); } - if (ch->tabletype[tbl] != type) { - IPFW_WUNLOCK(ch); + /* check table type */ + if (tc->no.subtype != ti.type) { + IPFW_UH_RUNLOCK(ch); return (EINVAL); } - ent = (struct table_entry *)rnh->rnh_deladdr(sa_ptr, mask_ptr, rnh); - IPFW_WUNLOCK(ch); + kti = KIDX_TO_TI(ch, tc->no.kidx); + ta = tc->ta; - if (ent == NULL) - return (ESRCH); + if (ta->find_tentry == NULL) + return (ENOTSUP); - free(ent, M_IPFW_TBL); - return (0); + error = ta->find_tentry(tc->astate, kti, tent); + + IPFW_UH_RUNLOCK(ch); + + return (error); } +/* + * Flushes all entries or destroys given table. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ] + * + * Returns 0 on success + */ static int -flush_table_entry(struct radix_node *rn, void *arg) +flush_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) { - struct radix_node_head * const rnh = arg; - struct table_entry *ent; + int error; + struct _ipfw_obj_header *oh; + struct tid_info ti; - ent = (struct table_entry *) - rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); - if (ent != NULL) - free(ent, M_IPFW_TBL); - return (0); + if (sd->valsize != sizeof(*oh)) + return (EINVAL); + + oh = (struct _ipfw_obj_header *)op3; + objheader_to_ti(oh, &ti); + + if (op3->opcode == IP_FW_TABLE_XDESTROY) + error = destroy_table(ch, &ti); + else if (op3->opcode == IP_FW_TABLE_XFLUSH) + error = flush_table(ch, &ti); + else + return (ENOTSUP); + + return (error); } +static void +restart_flush(void *object, struct op_state *_state) +{ + struct tableop_state *ts; + + ts = (struct tableop_state *)_state; + + if (ts->tc != object) + return; + + /* Indicate we've called */ + ts->modified = 1; +} + +/* + * Flushes given table. + * + * Function create new table instance with the same + * parameters, swaps it with old one and + * flushes state without holding runtime WLOCK. + * + * Returns 0 on success. + */ int -ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl) +flush_table(struct ip_fw_chain *ch, struct tid_info *ti) { - struct radix_node_head *rnh, *xrnh; + struct namedobj_instance *ni; + struct table_config *tc; + struct table_algo *ta; + struct table_info ti_old, ti_new, *tablestate; + void *astate_old, *astate_new; + char algostate[64], *pstate; + struct tableop_state ts; + int error, need_gc; + uint16_t kidx; + uint8_t tflags; - if (tbl >= V_fw_tables_max) - return (EINVAL); + /* + * Stage 1: save table algorithm. + * Reference found table to ensure it won't disappear. 
+ */
+	IPFW_UH_WLOCK(ch);
+	ni = CHAIN_TO_NI(ch);
+	if ((tc = find_table(ni, ti)) == NULL) {
+		IPFW_UH_WUNLOCK(ch);
+		return (ESRCH);
+	}
+	need_gc = 0;
+	astate_new = NULL;
+	memset(&ti_new, 0, sizeof(ti_new));
+restart:
+	/* Set up swap handler */
+	memset(&ts, 0, sizeof(ts));
+	ts.opstate.func = restart_flush;
+	ts.tc = tc;
+
+	ta = tc->ta;
+	/* Do not flush readonly tables */
+	if ((ta->flags & TA_FLAG_READONLY) != 0) {
+		IPFW_UH_WUNLOCK(ch);
+		return (EACCES);
+	}
+	/* Save startup algo parameters */
+	if (ta->print_config != NULL) {
+		ta->print_config(tc->astate, KIDX_TO_TI(ch, tc->no.kidx),
+		    algostate, sizeof(algostate));
+		pstate = algostate;
+	} else
+		pstate = NULL;
+	tflags = tc->tflags;
+	tc->no.refcnt++;
+	add_toperation_state(ch, &ts);
+	IPFW_UH_WUNLOCK(ch);
+
+	/*
+	 * Stage 1.5: if this is not the first attempt, destroy previous state
+	 */
+	if (need_gc != 0) {
+		ta->destroy(astate_new, &ti_new);
+		need_gc = 0;
+	}
 	/*
-	 * We free both (IPv4 and extended) radix trees and
-	 * clear table type here to permit table to be reused
-	 * for different type without module reload
+	 * Stage 2: allocate new table instance using same algo.
 	 */
+	memset(&ti_new, 0, sizeof(struct table_info));
+	error = ta->init(ch, &astate_new, &ti_new, pstate, tflags);
+
+	/*
+	 * Stage 3: swap old state pointers with newly-allocated ones.
+	 * Decrease refcount.
+	 */
+	IPFW_UH_WLOCK(ch);
+	tc->no.refcnt--;
+	del_toperation_state(ch, &ts);
+
+	if (error != 0) {
+		IPFW_UH_WUNLOCK(ch);
+		return (error);
+	}
+
+	/*
+	 * Restart operation if table swap has happened:
+	 * even if algo may be the same, algo init parameters
+	 * may change. Restart operation instead of doing
+	 * complex checks.
+	 */
+	if (ts.modified != 0) {
+		/* Delay destroying data since we're holding UH lock */
+		need_gc = 1;
+		goto restart;
+	}
+
+	ni = CHAIN_TO_NI(ch);
+	kidx = tc->no.kidx;
+	tablestate = (struct table_info *)ch->tablestate;
 	IPFW_WLOCK(ch);
-	/* Set IPv4 table pointer to zero */
-	if ((rnh = ch->tables[tbl]) != NULL)
-		ch->tables[tbl] = NULL;
-	/* Set extended table pointer to zero */
-	if ((xrnh = ch->xtables[tbl]) != NULL)
-		ch->xtables[tbl] = NULL;
-	/* Zero table type */
-	ch->tabletype[tbl] = 0;
+	ti_old = tablestate[kidx];
+	tablestate[kidx] = ti_new;
 	IPFW_WUNLOCK(ch);
-	if (rnh != NULL) {
-		rnh->rnh_walktree(rnh, flush_table_entry, rnh);
-		rn_detachhead((void **)&rnh);
+	astate_old = tc->astate;
+	tc->astate = astate_new;
+	tc->ti_copy = ti_new;
+	tc->count = 0;
+
+	/* Notify algo on real @ti address */
+	if (ta->change_ti != NULL)
+		ta->change_ti(tc->astate, &tablestate[kidx]);
+
+	/*
+	 * Stage 4: unref values.
+	 */
+	ipfw_unref_table_values(ch, tc, ta, astate_old, &ti_old);
+	IPFW_UH_WUNLOCK(ch);
+
+	/*
+	 * Stage 5: perform real flush/destroy.
+	 */
+	ta->destroy(astate_old, &ti_old);
+
+	return (0);
+}
+
+/*
+ * Swaps two tables.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_obj_ntlv ]
+ *
+ * Returns 0 on success
+ */
+static int
+swap_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+    struct sockopt_data *sd)
+{
+	int error;
+	struct _ipfw_obj_header *oh;
+	struct tid_info ti_a, ti_b;
+
+	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_obj_ntlv))
+		return (EINVAL);
+
+	oh = (struct _ipfw_obj_header *)op3;
+	ntlv_to_ti(&oh->ntlv, &ti_a);
+	ntlv_to_ti((ipfw_obj_ntlv *)(oh + 1), &ti_b);
+
+	error = swap_tables(ch, &ti_a, &ti_b);
+
+	return (error);
+}
+
+/*
+ * Swaps two tables of the same type/valtype.
+ *
+ * Checks whether the tables are compatible and their limits
+ * permit the swap, then actually performs the swap.
+ * + * Each table consists of 2 different parts: + * config: + * @tc (with name, set, kidx) and rule bindings, which is "stable". + * number of items + * table algo + * runtime: + * runtime data @ti (ch->tablestate) + * runtime cache in @tc + * algo-specific data (@tc->astate) + * + * So we switch: + * all runtime data + * number of items + * table algo + * + * After that we call @ti change handler for each table. + * + * Note that referencing @tc won't protect tc->ta from change. + * XXX: Do we need to restrict swap between locked tables? + * XXX: Do we need to exchange ftype? + * + * Returns 0 on success. + */ +static int +swap_tables(struct ip_fw_chain *ch, struct tid_info *a, + struct tid_info *b) +{ + struct namedobj_instance *ni; + struct table_config *tc_a, *tc_b; + struct table_algo *ta; + struct table_info ti, *tablestate; + void *astate; + uint32_t count; + + /* + * Stage 1: find both tables and ensure they are of + * the same type. + */ + IPFW_UH_WLOCK(ch); + ni = CHAIN_TO_NI(ch); + if ((tc_a = find_table(ni, a)) == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } + if ((tc_b = find_table(ni, b)) == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } + + /* It is very easy to swap between the same table */ + if (tc_a == tc_b) { + IPFW_UH_WUNLOCK(ch); + return (0); + } + + /* Check type and value are the same */ + if (tc_a->no.subtype!=tc_b->no.subtype || tc_a->tflags!=tc_b->tflags) { + IPFW_UH_WUNLOCK(ch); + return (EINVAL); } - if (xrnh != NULL) { - xrnh->rnh_walktree(xrnh, flush_table_entry, xrnh); - rn_detachhead((void **)&xrnh); + /* Check limits before swap */ + if ((tc_a->limit != 0 && tc_b->count > tc_a->limit) || + (tc_b->limit != 0 && tc_a->count > tc_b->limit)) { + IPFW_UH_WUNLOCK(ch); + return (EFBIG); } + /* Check if one of the tables is readonly */ + if (((tc_a->ta->flags | tc_b->ta->flags) & TA_FLAG_READONLY) != 0) { + IPFW_UH_WUNLOCK(ch); + return (EACCES); + } + + /* Notify we're going to swap */ + rollback_toperation_state(ch, tc_a); + rollback_toperation_state(ch, tc_b); + + /* Everything is fine, prepare to swap */ + tablestate = (struct table_info *)ch->tablestate; + ti = tablestate[tc_a->no.kidx]; + ta = tc_a->ta; + astate = tc_a->astate; + count = tc_a->count; + + IPFW_WLOCK(ch); + /* a <- b */ + tablestate[tc_a->no.kidx] = tablestate[tc_b->no.kidx]; + tc_a->ta = tc_b->ta; + tc_a->astate = tc_b->astate; + tc_a->count = tc_b->count; + /* b <- a */ + tablestate[tc_b->no.kidx] = ti; + tc_b->ta = ta; + tc_b->astate = astate; + tc_b->count = count; + IPFW_WUNLOCK(ch); + + /* Ensure tc.ti copies are in sync */ + tc_a->ti_copy = tablestate[tc_a->no.kidx]; + tc_b->ti_copy = tablestate[tc_b->no.kidx]; + + /* Notify both tables on @ti change */ + if (tc_a->ta->change_ti != NULL) + tc_a->ta->change_ti(tc_a->astate, &tablestate[tc_a->no.kidx]); + if (tc_b->ta->change_ti != NULL) + tc_b->ta->change_ti(tc_b->astate, &tablestate[tc_b->no.kidx]); + + IPFW_UH_WUNLOCK(ch); + return (0); } -void -ipfw_destroy_tables(struct ip_fw_chain *ch) +/* + * Destroys table specified by @ti. 
+ * Data layout (v0)(current):
+ * Request: [ ip_fw3_opheader ]
+ *
+ * Returns 0 on success
+ */
+static int
+destroy_table(struct ip_fw_chain *ch, struct tid_info *ti)
 {
-	uint16_t tbl;
+	struct namedobj_instance *ni;
+	struct table_config *tc;
 
-	/* Flush all tables */
-	for (tbl = 0; tbl < V_fw_tables_max; tbl++)
-		ipfw_flush_table(ch, tbl);
+	IPFW_UH_WLOCK(ch);
 
-	/* Free pointers itself */
-	free(ch->tables, M_IPFW);
-	free(ch->xtables, M_IPFW);
-	free(ch->tabletype, M_IPFW);
+	ni = CHAIN_TO_NI(ch);
+	if ((tc = find_table(ni, ti)) == NULL) {
+		IPFW_UH_WUNLOCK(ch);
+		return (ESRCH);
+	}
+
+	/* Do not permit destroying referenced tables */
+	if (tc->no.refcnt > 0) {
+		IPFW_UH_WUNLOCK(ch);
+		return (EBUSY);
+	}
+
+	IPFW_WLOCK(ch);
+	unlink_table(ch, tc);
+	IPFW_WUNLOCK(ch);
+
+	/* Free obj index */
+	if (ipfw_objhash_free_idx(ni, tc->no.kidx) != 0)
+		printf("Error unlinking kidx %d from table %s\n",
+		    tc->no.kidx, tc->tablename);
+
+	/* Unref values used in tables while holding UH lock */
+	ipfw_unref_table_values(ch, tc, tc->ta, tc->astate, &tc->ti_copy);
+	IPFW_UH_WUNLOCK(ch);
+
+	free_table_config(ni, tc);
+
+	return (0);
 }
 
-int
-ipfw_init_tables(struct ip_fw_chain *ch)
+static uint32_t
+roundup2p(uint32_t v)
 {
-	/* Allocate pointers */
-	ch->tables = malloc(V_fw_tables_max * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO);
-	ch->xtables = malloc(V_fw_tables_max * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO);
-	ch->tabletype = malloc(V_fw_tables_max * sizeof(uint8_t), M_IPFW, M_WAITOK | M_ZERO);
-	return (0);
+
+	v--;
+	v |= v >> 1;
+	v |= v >> 2;
+	v |= v >> 4;
+	v |= v >> 8;
+	v |= v >> 16;
+	v++;
+
+	return (v);
 }
 
+/*
+ * Grow tables index.
+ *
+ * Returns 0 on success.
+ */
 int
 ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables)
 {
-	struct radix_node_head **tables, **xtables, *rnh;
-	struct radix_node_head **tables_old, **xtables_old;
-	uint8_t *tabletype, *tabletype_old;
 	unsigned int ntables_old, tbl;
+	struct namedobj_instance *ni;
+	void *new_idx, *old_tablestate, *tablestate;
+	struct table_info *ti;
+	struct table_config *tc;
+	int i, new_blocks;
 
 	/* Check new value for validity */
+	if (ntables == 0)
+		return (EINVAL);
 	if (ntables > IPFW_TABLES_MAX)
 		ntables = IPFW_TABLES_MAX;
+	/* Align to the nearest power of 2 */
+	ntables = (unsigned int)roundup2p(ntables);
 
 	/* Allocate new pointers */
-	tables = malloc(ntables * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO);
-	xtables = malloc(ntables * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO);
-	tabletype = malloc(ntables * sizeof(uint8_t), M_IPFW, M_WAITOK | M_ZERO);
+	tablestate = malloc(ntables * sizeof(struct table_info),
+	    M_IPFW, M_WAITOK | M_ZERO);
 
-	IPFW_WLOCK(ch);
+	ipfw_objhash_bitmap_alloc(ntables, (void *)&new_idx, &new_blocks);
+
+	IPFW_UH_WLOCK(ch);
 	tbl = (ntables >= V_fw_tables_max) ? V_fw_tables_max : ntables;
+	ni = CHAIN_TO_NI(ch);
 
-	/* Copy old table pointers */
-	memcpy(tables, ch->tables, sizeof(void *) * tbl);
-	memcpy(xtables, ch->xtables, sizeof(void *) * tbl);
-	memcpy(tabletype, ch->tabletype, sizeof(uint8_t) * tbl);
+	/* Temporarily restrict decreasing max_tables */
+	if (ntables < V_fw_tables_max) {
 
-	/* Change pointers and number of tables */
-	tables_old = ch->tables;
-	xtables_old = ch->xtables;
-	tabletype_old = ch->tabletype;
-	ch->tables = tables;
-	ch->xtables = xtables;
-	ch->tabletype = tabletype;
+		/*
+		 * FIXME: Check if we really can shrink
+		 */
+		IPFW_UH_WUNLOCK(ch);
+		return (EINVAL);
+	}
+
+	/* Copy table info/indices */
+	memcpy(tablestate, ch->tablestate, sizeof(struct table_info) * tbl);
+	ipfw_objhash_bitmap_merge(ni, &new_idx, &new_blocks);
+
+	IPFW_WLOCK(ch);
+
+	/* Change pointers */
+	old_tablestate = ch->tablestate;
+	ch->tablestate = tablestate;
+	ipfw_objhash_bitmap_swap(ni, &new_idx, &new_blocks);
 
 	ntables_old = V_fw_tables_max;
 	V_fw_tables_max = ntables;
 
 	IPFW_WUNLOCK(ch);
 
-	/* Check if we need to destroy radix trees */
-	if (ntables < ntables_old) {
-		for (tbl = ntables; tbl < ntables_old; tbl++) {
-			if ((rnh = tables_old[tbl]) != NULL) {
-				rnh->rnh_walktree(rnh, flush_table_entry, rnh);
-				rn_detachhead((void **)&rnh);
-			}
+	/* Notify all consumers that their @ti pointer has changed */
+	ti = (struct table_info *)ch->tablestate;
+	for (i = 0; i < tbl; i++, ti++) {
+		if (ti->lookup == NULL)
+			continue;
+		tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, i);
+		if (tc == NULL || tc->ta->change_ti == NULL)
+			continue;
 
-			if ((rnh = xtables_old[tbl]) != NULL) {
-				rnh->rnh_walktree(rnh, flush_table_entry, rnh);
-				rn_detachhead((void **)&rnh);
-			}
-		}
+		tc->ta->change_ti(tc->astate, ti);
 	}
 
+	IPFW_UH_WUNLOCK(ch);
+
 	/* Free old pointers */
-	free(tables_old, M_IPFW);
-	free(xtables_old, M_IPFW);
-	free(tabletype_old, M_IPFW);
+	free(old_tablestate, M_IPFW);
+	ipfw_objhash_bitmap_free(new_idx, new_blocks);
+
+	return (0);
+}
+
+/*
+ * Lookup table's named object by its @kidx.
+ */
+struct named_object *
+ipfw_objhash_lookup_table_kidx(struct ip_fw_chain *ch, uint16_t kidx)
+{
+
+	return (ipfw_objhash_lookup_kidx(CHAIN_TO_NI(ch), kidx));
+}
+
+/*
+ * Take reference to table specified in @ntlv.
+ * On success return its @kidx.
+ */
+int
+ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint16_t *kidx)
+{
+	struct tid_info ti;
+	struct table_config *tc;
+	int error;
+
+	IPFW_UH_WLOCK_ASSERT(ch);
+
+	ntlv_to_ti(ntlv, &ti);
+	error = find_table_err(CHAIN_TO_NI(ch), &ti, &tc);
+	if (error != 0)
+		return (error);
+
+	if (tc == NULL)
+		return (ESRCH);
+
+	tc_ref(tc);
+	*kidx = tc->no.kidx;
 	return (0);
 }
 
+void
+ipfw_unref_table(struct ip_fw_chain *ch, uint16_t kidx)
+{
+
+	struct namedobj_instance *ni;
+	struct named_object *no;
+
+	IPFW_UH_WLOCK_ASSERT(ch);
+	ni = CHAIN_TO_NI(ch);
+	no = ipfw_objhash_lookup_kidx(ni, kidx);
+	KASSERT(no != NULL, ("Table with index %d not found", kidx));
+	no->refcnt--;
+}
+
+/*
+ * Lookup an IP @addr in table @tbl.
+ * Stores found value in @val.
+ *
+ * Returns 1 if @addr was found.
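+ *
+ * A minimal sketch of the intended kernel-side use (illustrative only;
+ * @tbl must be the kidx of an existing table, since no bounds checking
+ * is performed here):
+ *
+ *	uint32_t v;
+ *
+ *	if (ipfw_lookup_table(ch, kidx, ip->ip_src.s_addr, &v) != 0)
+ *		tablearg = v;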
+ */
 int
 ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
     uint32_t *val)
 {
-	struct radix_node_head *rnh;
-	struct table_entry *ent;
-	struct sockaddr_in sa;
+	struct table_info *ti;
 
-	if (tbl >= V_fw_tables_max)
-		return (0);
-	if ((rnh = ch->tables[tbl]) == NULL)
-		return (0);
-	KEY_LEN(sa) = KEY_LEN_INET;
-	sa.sin_addr.s_addr = addr;
-	ent = (struct table_entry *)(rnh->rnh_matchaddr(&sa, rnh));
-	if (ent != NULL) {
-		*val = ent->value;
-		return (1);
+	ti = KIDX_TO_TI(ch, tbl);
+
+	return (ti->lookup(ti, &addr, sizeof(in_addr_t), val));
+}
+
+/*
+ * Lookup an arbitrary key @paddr of length @plen in table @tbl.
+ * Stores found value in @val.
+ *
+ * Returns 1 if key was found.
+ */
+int
+ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen,
+    void *paddr, uint32_t *val)
+{
+	struct table_info *ti;
+
+	ti = KIDX_TO_TI(ch, tbl);
+
+	return (ti->lookup(ti, paddr, plen, val));
+}
+
+/*
+ * Info/List/dump support for tables.
+ *
+ */
+
+/*
+ * High-level 'get' cmds sysctl handlers
+ */
+
+/*
+ * Lists all tables currently available in kernel.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
+ * Reply: [ ipfw_obj_lheader ipfw_xtable_info x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+list_tables(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+    struct sockopt_data *sd)
+{
+	struct _ipfw_obj_lheader *olh;
+	int error;
+
+	olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
+	if (olh == NULL)
+		return (EINVAL);
+	if (sd->valsize < olh->size)
+		return (EINVAL);
+
+	IPFW_UH_RLOCK(ch);
+	error = export_tables(ch, olh, sd);
+	IPFW_UH_RUNLOCK(ch);
+
+	return (error);
+}
+
+/*
+ * Store table info to buffer provided by @sd.
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_header ipfw_xtable_info(empty)]
+ * Reply: [ ipfw_obj_header ipfw_xtable_info ]
+ *
+ * Returns 0 on success.
+ */
+static int
+describe_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+    struct sockopt_data *sd)
+{
+	struct _ipfw_obj_header *oh;
+	struct table_config *tc;
+	struct tid_info ti;
+	size_t sz;
+
+	sz = sizeof(*oh) + sizeof(ipfw_xtable_info);
+	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
+	if (oh == NULL)
+		return (EINVAL);
+
+	objheader_to_ti(oh, &ti);
+
+	IPFW_UH_RLOCK(ch);
+	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
+		IPFW_UH_RUNLOCK(ch);
+		return (ESRCH);
 	}
+
+	export_table_info(ch, tc, (ipfw_xtable_info *)(oh + 1));
+	IPFW_UH_RUNLOCK(ch);
+
 	return (0);
 }
 
-int
-ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
-    uint32_t *val, int type)
+/*
+ * Modifies existing table.
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_xtable_info ] + * + * Returns 0 on success + */ +static int +modify_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) { - struct radix_node_head *rnh; - struct table_xentry *xent; - struct sockaddr_in6 sa6; - struct xaddr_iface iface; + struct _ipfw_obj_header *oh; + ipfw_xtable_info *i; + char *tname; + struct tid_info ti; + struct namedobj_instance *ni; + struct table_config *tc; + + if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info)) + return (EINVAL); - if (tbl >= V_fw_tables_max) - return (0); - if ((rnh = ch->xtables[tbl]) == NULL) - return (0); + oh = (struct _ipfw_obj_header *)sd->kbuf; + i = (ipfw_xtable_info *)(oh + 1); - switch (type) { - case IPFW_TABLE_CIDR: - KEY_LEN(sa6) = KEY_LEN_INET6; - memcpy(&sa6.sin6_addr, paddr, sizeof(struct in6_addr)); - xent = (struct table_xentry *)(rnh->rnh_matchaddr(&sa6, rnh)); - break; + /* + * Verify user-supplied strings. + * Check for null-terminated/zero-length strings/ + */ + tname = oh->ntlv.name; + if (check_table_name(tname) != 0) + return (EINVAL); - case IPFW_TABLE_INTERFACE: - KEY_LEN(iface) = KEY_LEN_IFACE + - strlcpy(iface.ifname, (char *)paddr, IF_NAMESIZE) + 1; - /* Assume direct match */ - /* FIXME: Add interface pattern matching */ - xent = (struct table_xentry *)(rnh->rnh_matchaddr(&iface, rnh)); - break; + objheader_to_ti(oh, &ti); + ti.type = i->type; - default: - return (0); + IPFW_UH_WLOCK(ch); + ni = CHAIN_TO_NI(ch); + if ((tc = find_table(ni, &ti)) == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); } - if (xent != NULL) { - *val = xent->value; - return (1); + /* Do not support any modifications for readonly tables */ + if ((tc->ta->flags & TA_FLAG_READONLY) != 0) { + IPFW_UH_WUNLOCK(ch); + return (EACCES); } + + if ((i->mflags & IPFW_TMFLAGS_LIMIT) != 0) + tc->limit = i->limit; + if ((i->mflags & IPFW_TMFLAGS_LOCK) != 0) + tc->locked = ((i->flags & IPFW_TGFLAGS_LOCKED) != 0); + IPFW_UH_WUNLOCK(ch); + return (0); } +/* + * Creates new table. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_xtable_info ] + * + * Returns 0 on success + */ static int -count_table_entry(struct radix_node *rn, void *arg) +create_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) { - u_int32_t * const cnt = arg; + struct _ipfw_obj_header *oh; + ipfw_xtable_info *i; + char *tname, *aname; + struct tid_info ti; + struct namedobj_instance *ni; + + if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info)) + return (EINVAL); + + oh = (struct _ipfw_obj_header *)sd->kbuf; + i = (ipfw_xtable_info *)(oh + 1); + + /* + * Verify user-supplied strings. + * Check for null-terminated/zero-length strings/ + */ + tname = oh->ntlv.name; + aname = i->algoname; + if (check_table_name(tname) != 0 || + strnlen(aname, sizeof(i->algoname)) == sizeof(i->algoname)) + return (EINVAL); + + if (aname[0] == '\0') { + /* Use default algorithm */ + aname = NULL; + } + + objheader_to_ti(oh, &ti); + ti.type = i->type; + + ni = CHAIN_TO_NI(ch); + + IPFW_UH_RLOCK(ch); + if (find_table(ni, &ti) != NULL) { + IPFW_UH_RUNLOCK(ch); + return (EEXIST); + } + IPFW_UH_RUNLOCK(ch); + + return (create_table_internal(ch, &ti, aname, i, NULL, 0)); +} + +/* + * Creates new table based on @ti and @aname. + * + * Assume @aname to be checked and valid. + * Stores allocated table kidx inside @pkidx (if non-NULL). + * Reference created table if @compat is non-zero. + * + * Returns 0 on success. 
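+ *
+ * For example, the compat path that auto-creates a table referenced by
+ * a legacy rule invokes it roughly like this (sketch; see the
+ * create_table_compat() callback referenced in @opcodes below):
+ *
+ *	ipfw_xtable_info xi;
+ *	uint16_t kidx;
+ *
+ *	memset(&xi, 0, sizeof(xi));
+ *	error = create_table_internal(ch, ti, NULL, &xi, &kidx, 1);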
+ */ +static int +create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti, + char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int compat) +{ + struct namedobj_instance *ni; + struct table_config *tc, *tc_new, *tmp; + struct table_algo *ta; + uint16_t kidx; + + ni = CHAIN_TO_NI(ch); + + ta = find_table_algo(CHAIN_TO_TCFG(ch), ti, aname); + if (ta == NULL) + return (ENOTSUP); + + tc = alloc_table_config(ch, ti, ta, aname, i->tflags); + if (tc == NULL) + return (ENOMEM); + + tc->vmask = i->vmask; + tc->limit = i->limit; + if (ta->flags & TA_FLAG_READONLY) + tc->locked = 1; + else + tc->locked = (i->flags & IPFW_TGFLAGS_LOCKED) != 0; + + IPFW_UH_WLOCK(ch); + + /* Check if table has been already created */ + tc_new = find_table(ni, ti); + if (tc_new != NULL) { + + /* + * Compat: do not fail if we're + * requesting to create existing table + * which has the same type + */ + if (compat == 0 || tc_new->no.subtype != tc->no.subtype) { + IPFW_UH_WUNLOCK(ch); + free_table_config(ni, tc); + return (EEXIST); + } + + /* Exchange tc and tc_new for proper refcounting & freeing */ + tmp = tc; + tc = tc_new; + tc_new = tmp; + } else { + /* New table */ + if (ipfw_objhash_alloc_idx(ni, &kidx) != 0) { + IPFW_UH_WUNLOCK(ch); + printf("Unable to allocate table index." + " Consider increasing net.inet.ip.fw.tables_max"); + free_table_config(ni, tc); + return (EBUSY); + } + tc->no.kidx = kidx; + tc->no.etlv = IPFW_TLV_TBL_NAME; + + IPFW_WLOCK(ch); + link_table(ch, tc); + IPFW_WUNLOCK(ch); + } + + if (compat != 0) + tc->no.refcnt++; + if (pkidx != NULL) + *pkidx = tc->no.kidx; + + IPFW_UH_WUNLOCK(ch); + + if (tc_new != NULL) + free_table_config(ni, tc_new); - (*cnt)++; return (0); } +static void +ntlv_to_ti(ipfw_obj_ntlv *ntlv, struct tid_info *ti) +{ + + memset(ti, 0, sizeof(struct tid_info)); + ti->set = ntlv->set; + ti->uidx = ntlv->idx; + ti->tlvs = ntlv; + ti->tlen = ntlv->head.length; +} + +static void +objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti) +{ + + ntlv_to_ti(&oh->ntlv, ti); +} + +struct namedobj_instance * +ipfw_get_table_objhash(struct ip_fw_chain *ch) +{ + + return (CHAIN_TO_NI(ch)); +} + +/* + * Exports basic table info as name TLV. + * Used inside dump_static_rules() to provide info + * about all tables referenced by current ruleset. + * + * Returns 0 on success. 
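+ *
+ * A consumer walking such a dump can translate the kidx stored in rule
+ * opcodes back into a table name, e.g. (hypothetical sketch):
+ *
+ *	if (ntlv->head.type == IPFW_TLV_TBL_NAME)
+ *		printf("table %u is named %s\n", ntlv->idx, ntlv->name);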
+ */
 int
-ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
+ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx,
+    struct sockopt_data *sd)
+{
+	struct namedobj_instance *ni;
+	struct named_object *no;
+	ipfw_obj_ntlv *ntlv;
+
+	ni = CHAIN_TO_NI(ch);
+
+	no = ipfw_objhash_lookup_kidx(ni, kidx);
+	KASSERT(no != NULL, ("invalid table kidx passed"));
+
+	ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv));
+	if (ntlv == NULL)
+		return (ENOMEM);
+
+	ntlv->head.type = IPFW_TLV_TBL_NAME;
+	ntlv->head.length = sizeof(*ntlv);
+	ntlv->idx = no->kidx;
+	strlcpy(ntlv->name, no->name, sizeof(ntlv->name));
+
+	return (0);
+}
+
+struct dump_args {
+	struct ip_fw_chain *ch;
+	struct table_info *ti;
+	struct table_config *tc;
+	struct sockopt_data *sd;
+	uint32_t cnt;
+	uint16_t uidx;
+	int error;
+	uint32_t size;
+	ipfw_table_entry *ent;
+	ta_foreach_f *f;
+	void *farg;
+	ipfw_obj_tentry tent;
+};
+
+static int
+count_ext_entries(void *e, void *arg)
 {
-	struct radix_node_head *rnh;
+	struct dump_args *da;
 
-	if (tbl >= V_fw_tables_max)
+	da = (struct dump_args *)arg;
+	da->cnt++;
+
+	return (0);
+}
+
+/*
+ * Gets number of items from table either using
+ * internal counter or calling algo callback for
+ * externally-managed tables.
+ *
+ * Returns number of records.
+ */
+static uint32_t
+table_get_count(struct ip_fw_chain *ch, struct table_config *tc)
+{
+	struct table_info *ti;
+	struct table_algo *ta;
+	struct dump_args da;
+
+	ti = KIDX_TO_TI(ch, tc->no.kidx);
+	ta = tc->ta;
+
+	/* Use internal counter for self-managed tables */
+	if ((ta->flags & TA_FLAG_READONLY) == 0)
+		return (tc->count);
+
+	/* Use callback to quickly get number of items */
+	if ((ta->flags & TA_FLAG_EXTCOUNTER) != 0)
+		return (ta->get_count(tc->astate, ti));
+
+	/* Count number of items ourselves */
+	memset(&da, 0, sizeof(da));
+	ta->foreach(tc->astate, ti, count_ext_entries, &da);
+
+	return (da.cnt);
+}
+
+/*
+ * Exports table @tc info into standard ipfw_xtable_info format.
+ */
+static void
+export_table_info(struct ip_fw_chain *ch, struct table_config *tc,
+    ipfw_xtable_info *i)
+{
+	struct table_info *ti;
+	struct table_algo *ta;
+
+	i->type = tc->no.subtype;
+	i->tflags = tc->tflags;
+	i->vmask = tc->vmask;
+	i->set = tc->no.set;
+	i->kidx = tc->no.kidx;
+	i->refcnt = tc->no.refcnt;
+	i->count = table_get_count(ch, tc);
+	i->limit = tc->limit;
+	i->flags |= (tc->locked != 0) ?
+	    IPFW_TGFLAGS_LOCKED : 0;
+	i->size = i->count * sizeof(ipfw_obj_tentry);
+	i->size += sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info);
+	strlcpy(i->tablename, tc->tablename, sizeof(i->tablename));
+	ti = KIDX_TO_TI(ch, tc->no.kidx);
+	ta = tc->ta;
+	if (ta->print_config != NULL) {
+		/* Use algo function to print table config to string */
+		ta->print_config(tc->astate, ti, i->algoname,
+		    sizeof(i->algoname));
+	} else
+		strlcpy(i->algoname, ta->name, sizeof(i->algoname));
+	/* Dump algo-specific data, if possible */
+	if (ta->dump_tinfo != NULL) {
+		ta->dump_tinfo(tc->astate, ti, &i->ta_info);
+		i->ta_info.flags |= IPFW_TATFLAGS_DATA;
+	}
+}
+
+struct dump_table_args {
+	struct ip_fw_chain *ch;
+	struct sockopt_data *sd;
+};
+
+static int
+export_table_internal(struct namedobj_instance *ni, struct named_object *no,
+    void *arg)
+{
+	ipfw_xtable_info *i;
+	struct dump_table_args *dta;
+
+	dta = (struct dump_table_args *)arg;
+
+	i = (ipfw_xtable_info *)ipfw_get_sopt_space(dta->sd, sizeof(*i));
+	KASSERT(i != NULL, ("previously checked buffer is not enough"));
+
+	export_table_info(dta->ch, (struct table_config *)no, i);
+	return (0);
+}
+
+/*
+ * Export all tables as ipfw_xtable_info structures to
+ * storage provided by @sd.
+ *
+ * If supplied buffer is too small, fills in required size
+ * and returns ENOMEM.
+ * Returns 0 on success.
+ */
+static int
+export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh,
+    struct sockopt_data *sd)
+{
+	uint32_t size;
+	uint32_t count;
+	struct dump_table_args dta;
+
+	count = ipfw_objhash_count(CHAIN_TO_NI(ch));
+	size = count * sizeof(ipfw_xtable_info) + sizeof(ipfw_obj_lheader);
+
+	/* Fill in header regardless of buffer size */
+	olh->count = count;
+	olh->objsize = sizeof(ipfw_xtable_info);
+
+	if (size > olh->size) {
+		olh->size = size;
+		return (ENOMEM);
+	}
+
+	olh->size = size;
+
+	dta.ch = ch;
+	dta.sd = sd;
+
+	ipfw_objhash_foreach(CHAIN_TO_NI(ch), export_table_internal, &dta);
+
+	return (0);
+}
+
+/*
+ * Dumps all table data
+ * Data layout (v1)(current):
+ * Request: [ ipfw_obj_header ], size = ipfw_xtable_info.size
+ * Reply: [ ipfw_obj_header ipfw_xtable_info ipfw_obj_tentry x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+dump_table_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+    struct sockopt_data *sd)
+{
+	struct _ipfw_obj_header *oh;
+	ipfw_xtable_info *i;
+	struct tid_info ti;
+	struct table_config *tc;
+	struct table_algo *ta;
+	struct dump_args da;
+	uint32_t sz;
+
+	sz = sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info);
+	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
+	if (oh == NULL)
+		return (EINVAL);
+
+	i = (ipfw_xtable_info *)(oh + 1);
+	objheader_to_ti(oh, &ti);
+
+	IPFW_UH_RLOCK(ch);
+	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
+		IPFW_UH_RUNLOCK(ch);
+		return (ESRCH);
+	}
+	export_table_info(ch, tc, i);
+
+	if (sd->valsize < i->size) {
+
+		/*
+		 * Submitted buffer size is not enough.
+		 * We've already filled in @i structure with
+		 * relevant table info including size, so we
+		 * can return. Buffer will be flushed automatically.
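+		 *
+		 * Userland is expected to retry with a buffer of at
+		 * least the returned size, e.g. (hypothetical sketch
+		 * using the IP_FW3 socket option, where @info points
+		 * at the ipfw_xtable_info following the header; error
+		 * handling and request re-initialization elided):
+		 *
+		 *	for (;;) {
+		 *		l = sz;
+		 *		if (getsockopt(s, IPPROTO_IP, IP_FW3,
+		 *		    oh, &l) == 0)
+		 *			break;
+		 *		if (errno != ENOMEM)
+		 *			break;
+		 *		sz = info->size;
+		 *		oh = realloc(oh, sz);
+		 *	}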
+		 */
+		IPFW_UH_RUNLOCK(ch);
+		return (ENOMEM);
+	}
+
+	/*
+	 * Do the actual dump in eXtended format
+	 */
+	memset(&da, 0, sizeof(da));
+	da.ch = ch;
+	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
+	da.tc = tc;
+	da.sd = sd;
+
+	ta = tc->ta;
+
+	ta->foreach(tc->astate, da.ti, dump_table_tentry, &da);
+	IPFW_UH_RUNLOCK(ch);
+
+	return (da.error);
+}
+
+/*
+ * Dumps all table data
+ * Data layout (version 0)(legacy):
+ * Request: [ ipfw_xtable ], size = IP_FW_TABLE_XGETSIZE()
+ * Reply: [ ipfw_xtable ipfw_table_xentry x N ]
+ *
+ * Returns 0 on success
+ */
+static int
+dump_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+    struct sockopt_data *sd)
+{
+	ipfw_xtable *xtbl;
+	struct tid_info ti;
+	struct table_config *tc;
+	struct table_algo *ta;
+	struct dump_args da;
+	size_t sz, count;
+
+	xtbl = (ipfw_xtable *)ipfw_get_sopt_header(sd, sizeof(ipfw_xtable));
+	if (xtbl == NULL)
 		return (EINVAL);
-	*cnt = 0;
-	if ((rnh = ch->tables[tbl]) == NULL)
+
+	memset(&ti, 0, sizeof(ti));
+	ti.uidx = xtbl->tbl;
+
+	IPFW_UH_RLOCK(ch);
+	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
+		IPFW_UH_RUNLOCK(ch);
 		return (0);
-	rnh->rnh_walktree(rnh, count_table_entry, cnt);
+	}
+	count = table_get_count(ch, tc);
+	sz = count * sizeof(ipfw_table_xentry) + sizeof(ipfw_xtable);
+
+	xtbl->cnt = count;
+	xtbl->size = sz;
+	xtbl->type = tc->no.subtype;
+	xtbl->tbl = ti.uidx;
+
+	if (sd->valsize < sz) {
+
+		/*
+		 * Submitted buffer size is not enough.
+		 * We've already filled in the @xtbl header with
+		 * relevant table info including size, so we
+		 * can return. Buffer will be flushed automatically.
+		 */
+		IPFW_UH_RUNLOCK(ch);
+		return (ENOMEM);
+	}
+
+	/* Do the actual dump in eXtended format */
+	memset(&da, 0, sizeof(da));
+	da.ch = ch;
+	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
+	da.tc = tc;
+	da.sd = sd;
+
+	ta = tc->ta;
+
+	ta->foreach(tc->astate, da.ti, dump_table_xentry, &da);
+	IPFW_UH_RUNLOCK(ch);
+
+	return (0);
+}
+
+/*
+ * Legacy function to retrieve number of items in table.
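+ *
+ * A legacy caller passes a single uint32_t holding the table number
+ * right after the op header and receives the required byte count in
+ * the same slot, roughly (hypothetical sketch):
+ *
+ *	struct {
+ *		ip_fw3_opheader op3;
+ *		uint32_t tbl;
+ *	} req = { .tbl = 13 };
+ *	socklen_t l = sizeof(req);
+ *
+ *	req.op3.opcode = IP_FW_TABLE_XGETSIZE;
+ *	if (getsockopt(s, IPPROTO_IP, IP_FW3, &req, &l) == 0)
+ *		printf("%u bytes needed\n", req.tbl);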
+ */ +static int +get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + uint32_t *tbl; + struct tid_info ti; + size_t sz; + int error; + + sz = sizeof(*op3) + sizeof(uint32_t); + op3 = (ip_fw3_opheader *)ipfw_get_sopt_header(sd, sz); + if (op3 == NULL) + return (EINVAL); + + tbl = (uint32_t *)(op3 + 1); + memset(&ti, 0, sizeof(ti)); + ti.uidx = *tbl; + IPFW_UH_RLOCK(ch); + error = ipfw_count_xtable(ch, &ti, tbl); + IPFW_UH_RUNLOCK(ch); + return (error); +} + +/* + * Legacy IP_FW_TABLE_GETSIZE handler + */ +int +ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt) +{ + struct table_config *tc; + + if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) + return (ESRCH); + *cnt = table_get_count(ch, tc); + return (0); +} + +/* + * Legacy IP_FW_TABLE_XGETSIZE handler + */ +int +ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt) +{ + struct table_config *tc; + uint32_t count; + + if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) { + *cnt = 0; + return (0); /* 'table all list' requires success */ + } + + count = table_get_count(ch, tc); + *cnt = count * sizeof(ipfw_table_xentry); + if (count > 0) + *cnt += sizeof(ipfw_xtable); return (0); } static int -dump_table_entry(struct radix_node *rn, void *arg) +dump_table_entry(void *e, void *arg) { - struct table_entry * const n = (struct table_entry *)rn; - ipfw_table * const tbl = arg; + struct dump_args *da; + struct table_config *tc; + struct table_algo *ta; ipfw_table_entry *ent; + struct table_value *pval; + int error; + + da = (struct dump_args *)arg; + + tc = da->tc; + ta = tc->ta; - if (tbl->cnt == tbl->size) + /* Out of memory, returning */ + if (da->cnt == da->size) return (1); - ent = &tbl->ent[tbl->cnt]; - ent->tbl = tbl->tbl; - if (in_nullhost(n->mask.sin_addr)) - ent->masklen = 0; - else - ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); - ent->addr = n->addr.sin_addr.s_addr; - ent->value = n->value; - tbl->cnt++; + ent = da->ent++; + ent->tbl = da->uidx; + da->cnt++; + + error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent); + if (error != 0) + return (error); + + ent->addr = da->tent.k.addr.s_addr; + ent->masklen = da->tent.masklen; + pval = get_table_value(da->ch, da->tc, da->tent.v.kidx); + ent->value = ipfw_export_table_value_legacy(pval); + return (0); } +/* + * Dumps table in pre-8.1 legacy format. + */ int -ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl) +ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti, + ipfw_table *tbl) { - struct radix_node_head *rnh; + struct table_config *tc; + struct table_algo *ta; + struct dump_args da; - if (tbl->tbl >= V_fw_tables_max) - return (EINVAL); tbl->cnt = 0; - if ((rnh = ch->tables[tbl->tbl]) == NULL) + + if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) + return (0); /* XXX: We should return ESRCH */ + + ta = tc->ta; + + /* This dump format supports IPv4 only */ + if (tc->no.subtype != IPFW_TABLE_ADDR) return (0); - rnh->rnh_walktree(rnh, dump_table_entry, tbl); + + memset(&da, 0, sizeof(da)); + da.ch = ch; + da.ti = KIDX_TO_TI(ch, tc->no.kidx); + da.tc = tc; + da.ent = &tbl->ent[0]; + da.size = tbl->size; + + tbl->cnt = 0; + ta->foreach(tc->astate, da.ti, dump_table_entry, &da); + tbl->cnt = da.cnt; + + return (0); +} + +/* + * Dumps table entry in eXtended format (v1)(current). 
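+ *
+ * Like the other dump helpers it is driven by ta->foreach(), whose
+ * callback contract here is: return 0 to continue the walk, non-zero
+ * to stop it (compare count_ext_entries() above). A minimal callback
+ * sketch:
+ *
+ *	static int
+ *	count_cb(void *e, void *arg)
+ *	{
+ *
+ *		(*(uint32_t *)arg)++;
+ *		return (0);
+ *	}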
+ */ +static int +dump_table_tentry(void *e, void *arg) +{ + struct dump_args *da; + struct table_config *tc; + struct table_algo *ta; + struct table_value *pval; + ipfw_obj_tentry *tent; + int error; + + da = (struct dump_args *)arg; + + tc = da->tc; + ta = tc->ta; + + tent = (ipfw_obj_tentry *)ipfw_get_sopt_space(da->sd, sizeof(*tent)); + /* Out of memory, returning */ + if (tent == NULL) { + da->error = ENOMEM; + return (1); + } + tent->head.length = sizeof(ipfw_obj_tentry); + tent->idx = da->uidx; + + error = ta->dump_tentry(tc->astate, da->ti, e, tent); + if (error != 0) + return (error); + + pval = get_table_value(da->ch, da->tc, tent->v.kidx); + ipfw_export_table_value_v1(pval, &tent->v.value); + + return (0); +} + +/* + * Dumps table entry in eXtended format (v0). + */ +static int +dump_table_xentry(void *e, void *arg) +{ + struct dump_args *da; + struct table_config *tc; + struct table_algo *ta; + ipfw_table_xentry *xent; + ipfw_obj_tentry *tent; + struct table_value *pval; + int error; + + da = (struct dump_args *)arg; + + tc = da->tc; + ta = tc->ta; + + xent = (ipfw_table_xentry *)ipfw_get_sopt_space(da->sd, sizeof(*xent)); + /* Out of memory, returning */ + if (xent == NULL) + return (1); + xent->len = sizeof(ipfw_table_xentry); + xent->tbl = da->uidx; + + memset(&da->tent, 0, sizeof(da->tent)); + tent = &da->tent; + error = ta->dump_tentry(tc->astate, da->ti, e, tent); + if (error != 0) + return (error); + + /* Convert current format to previous one */ + xent->masklen = tent->masklen; + pval = get_table_value(da->ch, da->tc, da->tent.v.kidx); + xent->value = ipfw_export_table_value_legacy(pval); + /* Apply some hacks */ + if (tc->no.subtype == IPFW_TABLE_ADDR && tent->subtype == AF_INET) { + xent->k.addr6.s6_addr32[3] = tent->k.addr.s_addr; + xent->flags = IPFW_TCF_INET; + } else + memcpy(&xent->k, &tent->k, sizeof(xent->k)); + return (0); } +/* + * Helper function to export table algo data + * to tentry format before calling user function. + * + * Returns 0 on success. + */ static int -count_table_xentry(struct radix_node *rn, void *arg) +prepare_table_tentry(void *e, void *arg) { - uint32_t * const cnt = arg; + struct dump_args *da; + struct table_config *tc; + struct table_algo *ta; + int error; + + da = (struct dump_args *)arg; + + tc = da->tc; + ta = tc->ta; + + error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent); + if (error != 0) + return (error); + + da->f(&da->tent, da->farg); - (*cnt) += sizeof(ipfw_table_xentry); return (0); } +/* + * Allow external consumers to read table entries in standard format. + */ int -ipfw_count_xtable(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) +ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx, + ta_foreach_f *f, void *arg) +{ + struct namedobj_instance *ni; + struct table_config *tc; + struct table_algo *ta; + struct dump_args da; + + ni = CHAIN_TO_NI(ch); + + tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx); + if (tc == NULL) + return (ESRCH); + + ta = tc->ta; + + memset(&da, 0, sizeof(da)); + da.ch = ch; + da.ti = KIDX_TO_TI(ch, tc->no.kidx); + da.tc = tc; + da.f = f; + da.farg = arg; + + ta->foreach(tc->astate, da.ti, prepare_table_tentry, &da); + + return (0); +} + +/* + * Table algorithms + */ + +/* + * Finds algorithm by index, table type or supplied name. + * + * Returns pointer to algo or NULL. 
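+ *
+ * Since only the first word of @name is compared, algorithm parameters
+ * may follow the name itself; e.g. (assuming an "addr:chash" algorithm
+ * is registered) both of these calls resolve to the same algo:
+ *
+ *	ta = find_table_algo(tcfg, ti, "addr:chash");
+ *	ta = find_table_algo(tcfg, ti, "addr:chash hsize=32");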
+ */ +static struct table_algo * +find_table_algo(struct tables_config *tcfg, struct tid_info *ti, char *name) { - struct radix_node_head *rnh; + int i, l; + struct table_algo *ta; + + if (ti->type > IPFW_TABLE_MAXTYPE) + return (NULL); + + /* Search by index */ + if (ti->atype != 0) { + if (ti->atype > tcfg->algo_count) + return (NULL); + return (tcfg->algo[ti->atype]); + } + + if (name == NULL) { + /* Return default algorithm for given type if set */ + return (tcfg->def_algo[ti->type]); + } + + /* Search by name */ + /* TODO: better search */ + for (i = 1; i <= tcfg->algo_count; i++) { + ta = tcfg->algo[i]; + + /* + * One can supply additional algorithm + * parameters so we compare only the first word + * of supplied name: + * 'addr:chash hsize=32' + * '^^^^^^^^^' + * + */ + l = strlen(ta->name); + if (strncmp(name, ta->name, l) != 0) + continue; + if (name[l] != '\0' && name[l] != ' ') + continue; + /* Check if we're requesting proper table type */ + if (ti->type != 0 && ti->type != ta->type) + return (NULL); + return (ta); + } - if (tbl >= V_fw_tables_max) + return (NULL); +} + +/* + * Register new table algo @ta. + * Stores algo id inside @idx. + * + * Returns 0 on success. + */ +int +ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, size_t size, + int *idx) +{ + struct tables_config *tcfg; + struct table_algo *ta_new; + size_t sz; + + if (size > sizeof(struct table_algo)) return (EINVAL); - *cnt = 0; - if ((rnh = ch->tables[tbl]) != NULL) - rnh->rnh_walktree(rnh, count_table_xentry, cnt); - if ((rnh = ch->xtables[tbl]) != NULL) - rnh->rnh_walktree(rnh, count_table_xentry, cnt); - /* Return zero if table is empty */ - if (*cnt > 0) - (*cnt) += sizeof(ipfw_xtable); + + /* Check for the required on-stack size for add/del */ + sz = roundup2(ta->ta_buf_size, sizeof(void *)); + if (sz > TA_BUF_SZ) + return (EINVAL); + + KASSERT(ta->type <= IPFW_TABLE_MAXTYPE,("Increase IPFW_TABLE_MAXTYPE")); + + /* Copy algorithm data to stable storage. */ + ta_new = malloc(sizeof(struct table_algo), M_IPFW, M_WAITOK | M_ZERO); + memcpy(ta_new, ta, size); + + tcfg = CHAIN_TO_TCFG(ch); + + KASSERT(tcfg->algo_count < 255, ("Increase algo array size")); + + tcfg->algo[++tcfg->algo_count] = ta_new; + ta_new->idx = tcfg->algo_count; + + /* Set algorithm as default one for given type */ + if ((ta_new->flags & TA_FLAG_DEFAULT) != 0 && + tcfg->def_algo[ta_new->type] == NULL) + tcfg->def_algo[ta_new->type] = ta_new; + + *idx = ta_new->idx; + return (0); } +/* + * Unregisters table algo using @idx as id. + * XXX: It is NOT safe to call this function in any place + * other than ipfw instance destroy handler. + */ +void +ipfw_del_table_algo(struct ip_fw_chain *ch, int idx) +{ + struct tables_config *tcfg; + struct table_algo *ta; + + tcfg = CHAIN_TO_TCFG(ch); + + KASSERT(idx <= tcfg->algo_count, ("algo idx %d out of range 1..%d", + idx, tcfg->algo_count)); + ta = tcfg->algo[idx]; + KASSERT(ta != NULL, ("algo idx %d is NULL", idx)); + + if (tcfg->def_algo[ta->type] == ta) + tcfg->def_algo[ta->type] = NULL; + + free(ta, M_IPFW); +} + +/* + * Lists all table algorithms currently available. 
+ * Data layout (v0)(current):
+ * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
+ * Reply: [ ipfw_obj_lheader ipfw_ta_info x N ]
+ *
+ * Returns 0 on success
+ */
 static int
-dump_table_xentry_base(struct radix_node *rn, void *arg)
+list_table_algo(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
+    struct sockopt_data *sd)
 {
-	struct table_entry * const n = (struct table_entry *)rn;
-	ipfw_xtable * const tbl = arg;
-	ipfw_table_xentry *xent;
+	struct _ipfw_obj_lheader *olh;
+	struct tables_config *tcfg;
+	ipfw_ta_info *i;
+	struct table_algo *ta;
+	uint32_t count, n, size;
+
+	olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
+	if (olh == NULL)
+		return (EINVAL);
+	if (sd->valsize < olh->size)
+		return (EINVAL);
+
+	IPFW_UH_RLOCK(ch);
+	tcfg = CHAIN_TO_TCFG(ch);
+	count = tcfg->algo_count;
+	size = count * sizeof(ipfw_ta_info) + sizeof(ipfw_obj_lheader);
+
+	/* Fill in header regardless of buffer size */
+	olh->count = count;
+	olh->objsize = sizeof(ipfw_ta_info);
+
+	if (size > olh->size) {
+		olh->size = size;
+		IPFW_UH_RUNLOCK(ch);
+		return (ENOMEM);
+	}
+	olh->size = size;
+
+	for (n = 1; n <= count; n++) {
+		i = (ipfw_ta_info *)ipfw_get_sopt_space(sd, sizeof(*i));
+		KASSERT(i != NULL, ("previously checked buffer is not enough"));
+		ta = tcfg->algo[n];
+		strlcpy(i->algoname, ta->name, sizeof(i->algoname));
+		i->type = ta->type;
+		i->refcnt = ta->refcnt;
+	}
+
+	IPFW_UH_RUNLOCK(ch);
 
-	/* Out of memory, returning */
-	if (tbl->cnt == tbl->size)
-		return (1);
-	xent = &tbl->xent[tbl->cnt];
-	xent->len = sizeof(ipfw_table_xentry);
-	xent->tbl = tbl->tbl;
-	if (in_nullhost(n->mask.sin_addr))
-		xent->masklen = 0;
-	else
-		xent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr));
-	/* Save IPv4 address as deprecated IPv6 compatible */
-	xent->k.addr6.s6_addr32[3] = n->addr.sin_addr.s_addr;
-	xent->value = n->value;
-	tbl->cnt++;
 	return (0);
 }
 
 static int
-dump_table_xentry_extended(struct radix_node *rn, void *arg)
+classify_srcdst(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
 {
-	struct table_xentry * const n = (struct table_xentry *)rn;
-	ipfw_xtable * const tbl = arg;
-	ipfw_table_xentry *xent;
-#ifdef INET6
-	int i;
-	uint32_t *v;
-#endif
-	/* Out of memory, returning */
-	if (tbl->cnt == tbl->size)
+	/* Basic IPv4/IPv6 or u32 lookups */
+	*puidx = cmd->arg1;
+	/* Assume ADDR by default */
+	*ptype = IPFW_TABLE_ADDR;
+	int v;
+
+	if (F_LEN(cmd) > F_INSN_SIZE(ipfw_insn_u32)) {
+		/*
+		 * generic lookup. The key must be
+		 * in 32bit big-endian format.
+		 */
+		v = ((ipfw_insn_u32 *)cmd)->d[1];
+		switch (v) {
+		case 0:
+		case 1:
+			/* IPv4 src/dst */
+			break;
+		case 2:
+		case 3:
+			/* src/dst port */
+			*ptype = IPFW_TABLE_NUMBER;
+			break;
+		case 4:
+			/* uid/gid */
+			*ptype = IPFW_TABLE_NUMBER;
+			break;
+		case 5:
+			/* jid */
+			*ptype = IPFW_TABLE_NUMBER;
+			break;
+		case 6:
+			/* dscp */
+			*ptype = IPFW_TABLE_NUMBER;
+			break;
+		}
+	}
+
+	return (0);
+}
+
+static int
+classify_via(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
+{
+	ipfw_insn_if *cmdif;
+
+	/* Interface table, possibly */
+	cmdif = (ipfw_insn_if *)cmd;
+	if (cmdif->name[0] != '\1')
 		return (1);
-	xent = &tbl->xent[tbl->cnt];
-	xent->len = sizeof(ipfw_table_xentry);
-	xent->tbl = tbl->tbl;
-
-	switch (tbl->type) {
-#ifdef INET6
-	case IPFW_TABLE_CIDR:
-		/* Count IPv6 mask */
-		v = (uint32_t *)&n->m.mask6.sin6_addr;
-		for (i = 0; i < sizeof(struct in6_addr) / 4; i++, v++)
-			xent->masklen += bitcount32(*v);
-		memcpy(&xent->k, &n->a.addr6.sin6_addr, sizeof(struct in6_addr));
-		break;
-#endif
-	case IPFW_TABLE_INTERFACE:
-		/* Assume exact mask */
-		xent->masklen = 8 * IF_NAMESIZE;
-		memcpy(&xent->k, &n->a.iface.ifname, IF_NAMESIZE);
+
+	*ptype = IPFW_TABLE_INTERFACE;
+	*puidx = cmdif->p.kidx;
+
+	return (0);
+}
+
+static int
+classify_flow(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
+{
+
+	*puidx = cmd->arg1;
+	*ptype = IPFW_TABLE_FLOW;
+
+	return (0);
+}
+
+static void
+update_arg1(ipfw_insn *cmd, uint16_t idx)
+{
+
+	cmd->arg1 = idx;
+}
+
+static void
+update_via(ipfw_insn *cmd, uint16_t idx)
+{
+	ipfw_insn_if *cmdif;
+
+	cmdif = (ipfw_insn_if *)cmd;
+	cmdif->p.kidx = idx;
+}
+
+static int
+table_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
+    struct named_object **pno)
+{
+	struct table_config *tc;
+	int error;
+
+	IPFW_UH_WLOCK_ASSERT(ch);
+
+	error = find_table_err(CHAIN_TO_NI(ch), ti, &tc);
+	if (error != 0)
+		return (error);
+
+	*pno = &tc->no;
+	return (0);
+}
+
+/* XXX: sets-sets! */
+static struct named_object *
+table_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
+{
+	struct namedobj_instance *ni;
+	struct table_config *tc;
+
+	IPFW_UH_WLOCK_ASSERT(ch);
+	ni = CHAIN_TO_NI(ch);
+	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, idx);
+	KASSERT(tc != NULL, ("Table with index %d not found", idx));
+
+	return (&tc->no);
+}
+
+static int
+table_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set,
+    enum ipfw_sets_cmd cmd)
+{
+
+	switch (cmd) {
+	case SWAP_ALL:
+	case TEST_ALL:
+		/*
+		 * Return success for TEST_ALL, since nothing prevents
+		 * moving rules from one set to another. All tables are
+		 * accessible from all sets when per-set tables sysctl
+		 * is disabled.
+		 */
+	case MOVE_ALL:
+	case TEST_ONE:
+	case MOVE_ONE:
+		/*
+		 * NOTE: we need to use ipfw_objhash_del/ipfw_objhash_add
+		 * if set number will be used in hash function. Currently
+		 * we can just use generic handler that replaces set value.
+		 */
+		if (V_fw_tables_sets == 0)
+			return (0);
 		break;
-
-	default:
-		/* unknown, skip entry */
+	case COUNT_ONE:
+		/*
+		 * Return EOPNOTSUPP for COUNT_ONE when the per-set sysctl
+		 * is disabled. This allows table opcodes to be skipped
+		 * from additional checks when specific rules are moved
+		 * to another set.
+		 */
+		if (V_fw_tables_sets == 0)
+			return (EOPNOTSUPP);
+	}
+	/* Use generic sets handler when per-set sysctl is enabled.
*/ + return (ipfw_obj_manage_sets(CHAIN_TO_NI(ch), IPFW_TLV_TBL_NAME, + set, new_set, cmd)); +} + +static struct opcode_obj_rewrite opcodes[] = { + { + .opcode = O_IP_SRC_LOOKUP, + .etlv = IPFW_TLV_TBL_NAME, + .classifier = classify_srcdst, + .update = update_arg1, + .find_byname = table_findbyname, + .find_bykidx = table_findbykidx, + .create_object = create_table_compat, + .manage_sets = table_manage_sets, + }, + { + .opcode = O_IP_DST_LOOKUP, + .etlv = IPFW_TLV_TBL_NAME, + .classifier = classify_srcdst, + .update = update_arg1, + .find_byname = table_findbyname, + .find_bykidx = table_findbykidx, + .create_object = create_table_compat, + .manage_sets = table_manage_sets, + }, + { + .opcode = O_IP_FLOW_LOOKUP, + .etlv = IPFW_TLV_TBL_NAME, + .classifier = classify_flow, + .update = update_arg1, + .find_byname = table_findbyname, + .find_bykidx = table_findbykidx, + .create_object = create_table_compat, + .manage_sets = table_manage_sets, + }, + { + .opcode = O_XMIT, + .etlv = IPFW_TLV_TBL_NAME, + .classifier = classify_via, + .update = update_via, + .find_byname = table_findbyname, + .find_bykidx = table_findbykidx, + .create_object = create_table_compat, + .manage_sets = table_manage_sets, + }, + { + .opcode = O_RECV, + .etlv = IPFW_TLV_TBL_NAME, + .classifier = classify_via, + .update = update_via, + .find_byname = table_findbyname, + .find_bykidx = table_findbykidx, + .create_object = create_table_compat, + .manage_sets = table_manage_sets, + }, + { + .opcode = O_VIA, + .etlv = IPFW_TLV_TBL_NAME, + .classifier = classify_via, + .update = update_via, + .find_byname = table_findbyname, + .find_bykidx = table_findbykidx, + .create_object = create_table_compat, + .manage_sets = table_manage_sets, + }, +}; + +static int +test_sets_cb(struct namedobj_instance *ni __unused, struct named_object *no, + void *arg __unused) +{ + + /* Check that there aren't any tables in not default set */ + if (no->set != 0) + return (EBUSY); + return (0); +} + +/* + * Switch between "set 0" and "rule's set" table binding, + * Check all ruleset bindings and permits changing + * IFF each binding has both rule AND table in default set (set 0). + * + * Returns 0 on success. + */ +int +ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int sets) +{ + struct opcode_obj_rewrite *rw; + struct namedobj_instance *ni; + struct named_object *no; + struct ip_fw *rule; + ipfw_insn *cmd; + int cmdlen, i, l; + uint16_t kidx; + uint8_t subtype; + + IPFW_UH_WLOCK(ch); + + if (V_fw_tables_sets == sets) { + IPFW_UH_WUNLOCK(ch); return (0); } + ni = CHAIN_TO_NI(ch); + if (sets == 0) { + /* + * Prevent disabling sets support if we have some tables + * in not default sets. + */ + if (ipfw_objhash_foreach_type(ni, test_sets_cb, + NULL, IPFW_TLV_TBL_NAME) != 0) { + IPFW_UH_WUNLOCK(ch); + return (EBUSY); + } + } + /* + * Scan all rules and examine tables opcodes. 
+ */ + for (i = 0; i < ch->n_rules; i++) { + rule = ch->map[i]; + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + /* Check only tables opcodes */ + for (kidx = 0, rw = opcodes; + rw < opcodes + nitems(opcodes); rw++) { + if (rw->opcode != cmd->opcode) + continue; + if (rw->classifier(cmd, &kidx, &subtype) == 0) + break; + } + if (kidx == 0) + continue; + no = ipfw_objhash_lookup_kidx(ni, kidx); + /* Check if both table object and rule has the set 0 */ + if (no->set != 0 || rule->set != 0) { + IPFW_UH_WUNLOCK(ch); + return (EBUSY); + } + + } + } + V_fw_tables_sets = sets; + IPFW_UH_WUNLOCK(ch); + return (0); +} + +/* + * Checks table name for validity. + * Enforce basic length checks, the rest + * should be done in userland. + * + * Returns 0 if name is considered valid. + */ +static int +check_table_name(const char *name) +{ + + /* + * TODO: do some more complicated checks + */ + return (ipfw_check_object_name_generic(name)); +} + +/* + * Finds table config based on either legacy index + * or name in ntlv. + * Note @ti structure contains unchecked data from userland. + * + * Returns 0 in success and fills in @tc with found config + */ +static int +find_table_err(struct namedobj_instance *ni, struct tid_info *ti, + struct table_config **tc) +{ + char *name, bname[16]; + struct named_object *no; + ipfw_obj_ntlv *ntlv; + uint32_t set; + + if (ti->tlvs != NULL) { + ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, + IPFW_TLV_TBL_NAME); + if (ntlv == NULL) + return (EINVAL); + name = ntlv->name; + + /* + * Use set provided by @ti instead of @ntlv one. + * This is needed due to different sets behavior + * controlled by V_fw_tables_sets. + */ + set = (V_fw_tables_sets != 0) ? ti->set : 0; + } else { + snprintf(bname, sizeof(bname), "%d", ti->uidx); + name = bname; + set = 0; + } + + no = ipfw_objhash_lookup_name(ni, set, name); + *tc = (struct table_config *)no; + + return (0); +} + +/* + * Finds table config based on either legacy index + * or name in ntlv. + * Note @ti structure contains unchecked data from userland. + * + * Returns pointer to table_config or NULL. + */ +static struct table_config * +find_table(struct namedobj_instance *ni, struct tid_info *ti) +{ + struct table_config *tc; + + if (find_table_err(ni, ti, &tc) != 0) + return (NULL); + + return (tc); +} + +/* + * Allocate new table config structure using + * specified @algo and @aname. + * + * Returns pointer to config or NULL. 
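+ *
+ * A typical (sketched) call sequence, mirroring create_table_internal()
+ * above, with free_table_config() used on later error paths:
+ *
+ *	tc = alloc_table_config(ch, ti, ta, aname, tflags);
+ *	if (tc == NULL)
+ *		return (ENOMEM);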
+ */ +static struct table_config * +alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti, + struct table_algo *ta, char *aname, uint8_t tflags) +{ + char *name, bname[16]; + struct table_config *tc; + int error; + ipfw_obj_ntlv *ntlv; + uint32_t set; + + if (ti->tlvs != NULL) { + ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, + IPFW_TLV_TBL_NAME); + if (ntlv == NULL) + return (NULL); + name = ntlv->name; + set = ntlv->set; + } else { + /* Compat part: convert number to string representation */ + snprintf(bname, sizeof(bname), "%d", ti->uidx); + name = bname; + set = 0; + } + + tc = malloc(sizeof(struct table_config), M_IPFW, M_WAITOK | M_ZERO); + tc->no.name = tc->tablename; + tc->no.subtype = ta->type; + tc->no.set = set; + tc->tflags = tflags; + tc->ta = ta; + strlcpy(tc->tablename, name, sizeof(tc->tablename)); + /* Set "shared" value type by default */ + tc->vshared = 1; + + /* Preallocate data structures for new tables */ + error = ta->init(ch, &tc->astate, &tc->ti_copy, aname, tflags); + if (error != 0) { + free(tc, M_IPFW); + return (NULL); + } + + return (tc); +} + +/* + * Destroys table state and config. + */ +static void +free_table_config(struct namedobj_instance *ni, struct table_config *tc) +{ + + KASSERT(tc->linked == 0, ("free() on linked config")); + /* UH lock MUST NOT be held */ + + /* + * We're using ta without any locking/referencing. + * TODO: fix this if we're going to use unloadable algos. + */ + tc->ta->destroy(tc->astate, &tc->ti_copy); + free(tc, M_IPFW); +} + +/* + * Links @tc to @chain table named instance. + * Sets appropriate type/states in @chain table info. + */ +static void +link_table(struct ip_fw_chain *ch, struct table_config *tc) +{ + struct namedobj_instance *ni; + struct table_info *ti; + uint16_t kidx; + + IPFW_UH_WLOCK_ASSERT(ch); + IPFW_WLOCK_ASSERT(ch); + + ni = CHAIN_TO_NI(ch); + kidx = tc->no.kidx; + + ipfw_objhash_add(ni, &tc->no); + + ti = KIDX_TO_TI(ch, kidx); + *ti = tc->ti_copy; + + /* Notify algo on real @ti address */ + if (tc->ta->change_ti != NULL) + tc->ta->change_ti(tc->astate, ti); + + tc->linked = 1; + tc->ta->refcnt++; +} + +/* + * Unlinks @tc from @chain table named instance. + * Zeroes states in @chain and stores them in @tc. + */ +static void +unlink_table(struct ip_fw_chain *ch, struct table_config *tc) +{ + struct namedobj_instance *ni; + struct table_info *ti; + uint16_t kidx; + + IPFW_UH_WLOCK_ASSERT(ch); + IPFW_WLOCK_ASSERT(ch); + + ni = CHAIN_TO_NI(ch); + kidx = tc->no.kidx; + + /* Clear state. 
@ti copy is already saved inside @tc */
+	ipfw_objhash_del(ni, &tc->no);
+	ti = KIDX_TO_TI(ch, kidx);
+	memset(ti, 0, sizeof(struct table_info));
+	tc->linked = 0;
+	tc->ta->refcnt--;
+
+	/* Notify algo on real @ti address */
+	if (tc->ta->change_ti != NULL)
+		tc->ta->change_ti(tc->astate, NULL);
+}
+
+static struct ipfw_sopt_handler	scodes[] = {
+	{ IP_FW_TABLE_XCREATE,	0,	HDIR_SET,	create_table },
+	{ IP_FW_TABLE_XDESTROY,	0,	HDIR_SET,	flush_table_v0 },
+	{ IP_FW_TABLE_XFLUSH,	0,	HDIR_SET,	flush_table_v0 },
+	{ IP_FW_TABLE_XMODIFY,	0,	HDIR_BOTH,	modify_table },
+	{ IP_FW_TABLE_XINFO,	0,	HDIR_GET,	describe_table },
+	{ IP_FW_TABLES_XLIST,	0,	HDIR_GET,	list_tables },
+	{ IP_FW_TABLE_XLIST,	0,	HDIR_GET,	dump_table_v0 },
+	{ IP_FW_TABLE_XLIST,	1,	HDIR_GET,	dump_table_v1 },
+	{ IP_FW_TABLE_XADD,	0,	HDIR_BOTH,	manage_table_ent_v0 },
+	{ IP_FW_TABLE_XADD,	1,	HDIR_BOTH,	manage_table_ent_v1 },
+	{ IP_FW_TABLE_XDEL,	0,	HDIR_BOTH,	manage_table_ent_v0 },
+	{ IP_FW_TABLE_XDEL,	1,	HDIR_BOTH,	manage_table_ent_v1 },
+	{ IP_FW_TABLE_XFIND,	0,	HDIR_GET,	find_table_entry },
+	{ IP_FW_TABLE_XSWAP,	0,	HDIR_SET,	swap_table },
+	{ IP_FW_TABLES_ALIST,	0,	HDIR_GET,	list_table_algo },
+	{ IP_FW_TABLE_XGETSIZE,	0,	HDIR_GET,	get_table_size },
+};
-	xent->value = n->value;
-	tbl->cnt++;
+static int
+destroy_table_locked(struct namedobj_instance *ni, struct named_object *no,
+    void *arg)
+{
+
+	unlink_table((struct ip_fw_chain *)arg, (struct table_config *)no);
+	if (ipfw_objhash_free_idx(ni, no->kidx) != 0)
+		printf("Error unlinking kidx %d from table %s\n",
+		    no->kidx, no->name);
+	free_table_config(ni, (struct table_config *)no);
 	return (0);
 }
 
+/*
+ * Shuts tables module down.
+ */
+void
+ipfw_destroy_tables(struct ip_fw_chain *ch, int last)
+{
+
+	IPFW_DEL_SOPT_HANDLER(last, scodes);
+	IPFW_DEL_OBJ_REWRITER(last, opcodes);
+
+	/* Remove all tables from working set */
+	IPFW_UH_WLOCK(ch);
+	IPFW_WLOCK(ch);
+	ipfw_objhash_foreach(CHAIN_TO_NI(ch), destroy_table_locked, ch);
+	IPFW_WUNLOCK(ch);
+	IPFW_UH_WUNLOCK(ch);
+
+	/* Free the pointer array itself */
+	free(ch->tablestate, M_IPFW);
+
+	ipfw_table_value_destroy(ch, last);
+	ipfw_table_algo_destroy(ch);
+
+	ipfw_objhash_destroy(CHAIN_TO_NI(ch));
+	free(CHAIN_TO_TCFG(ch), M_IPFW);
+}
+
+/*
+ * Starts tables module.
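+ * Mirror image of ipfw_destroy_tables() above: allocates the per-chain
+ * table_info array and the name hash, then registers the opcode
+ * rewriters and the scodes[] sockopt handlers.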
+ */ int -ipfw_dump_xtable(struct ip_fw_chain *ch, ipfw_xtable *tbl) +ipfw_init_tables(struct ip_fw_chain *ch, int first) { - struct radix_node_head *rnh; + struct tables_config *tcfg; - if (tbl->tbl >= V_fw_tables_max) - return (EINVAL); - tbl->cnt = 0; - tbl->type = ch->tabletype[tbl->tbl]; - if ((rnh = ch->tables[tbl->tbl]) != NULL) - rnh->rnh_walktree(rnh, dump_table_xentry_base, tbl); - if ((rnh = ch->xtables[tbl->tbl]) != NULL) - rnh->rnh_walktree(rnh, dump_table_xentry_extended, tbl); + /* Allocate pointers */ + ch->tablestate = malloc(V_fw_tables_max * sizeof(struct table_info), + M_IPFW, M_WAITOK | M_ZERO); + + tcfg = malloc(sizeof(struct tables_config), M_IPFW, M_WAITOK | M_ZERO); + tcfg->namehash = ipfw_objhash_create(V_fw_tables_max); + ch->tblcfg = tcfg; + + ipfw_table_value_init(ch, first); + ipfw_table_algo_init(ch); + + IPFW_ADD_OBJ_REWRITER(first, opcodes); + IPFW_ADD_SOPT_HANDLER(first, scodes); return (0); } -/* end of file */ + + diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_table.h b/freebsd/sys/netpfil/ipfw/ip_fw_table.h new file mode 100644 index 00000000..d6578482 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/ip_fw_table.h @@ -0,0 +1,234 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IPFW2_TABLE_H +#define _IPFW2_TABLE_H + +/* + * Internal constants and data structures used by ipfw tables + * not meant to be exported outside the kernel. 
+ */ +#ifdef _KERNEL + +struct table_algo; +struct tables_config { + struct namedobj_instance *namehash; + struct namedobj_instance *valhash; + uint32_t val_size; + uint32_t algo_count; + struct table_algo *algo[256]; + struct table_algo *def_algo[IPFW_TABLE_MAXTYPE + 1]; + TAILQ_HEAD(op_state_l,op_state) state_list; +}; +#define CHAIN_TO_TCFG(chain) ((struct tables_config *)(chain)->tblcfg) + +struct table_info { + table_lookup_t *lookup; /* Lookup function */ + void *state; /* Lookup radix/other structure */ + void *xstate; /* eXtended state */ + u_long data; /* Hints for given func */ +}; + +struct table_value; +struct tentry_info { + void *paddr; + struct table_value *pvalue; + void *ptv; /* Temporary field to hold obj */ + uint8_t masklen; /* mask length */ + uint8_t subtype; + uint16_t flags; /* record flags */ + uint32_t value; /* value index */ +}; +#define TEI_FLAGS_UPDATE 0x0001 /* Add or update rec if exists */ +#define TEI_FLAGS_UPDATED 0x0002 /* Entry has been updated */ +#define TEI_FLAGS_COMPAT 0x0004 /* Called from old ABI */ +#define TEI_FLAGS_DONTADD 0x0008 /* Do not create new rec */ +#define TEI_FLAGS_ADDED 0x0010 /* Entry was added */ +#define TEI_FLAGS_DELETED 0x0020 /* Entry was deleted */ +#define TEI_FLAGS_LIMIT 0x0040 /* Limit was hit */ +#define TEI_FLAGS_ERROR 0x0080 /* Unknown request error */ +#define TEI_FLAGS_NOTFOUND 0x0100 /* Entry was not found */ +#define TEI_FLAGS_EXISTS 0x0200 /* Entry already exists */ + +typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state, + struct table_info *ti, char *data, uint8_t tflags); +typedef void (ta_destroy)(void *ta_state, struct table_info *ti); +typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +typedef int (ta_add)(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +typedef int (ta_del)(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +typedef void (ta_flush_entry)(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); + +typedef int (ta_need_modify)(void *ta_state, struct table_info *ti, + uint32_t count, uint64_t *pflags); +typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags); +typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti, + void *ta_buf, uint64_t *pflags); +typedef void (ta_modify)(void *ta_state, struct table_info *ti, + void *ta_buf, uint64_t pflags); +typedef void (ta_flush_mod)(void *ta_buf); + +typedef void (ta_change_ti)(void *ta_state, struct table_info *ti); +typedef void (ta_print_config)(void *ta_state, struct table_info *ti, char *buf, + size_t bufsize); + +typedef int ta_foreach_f(void *node, void *arg); +typedef void ta_foreach(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg); +typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent); +typedef int ta_find_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +typedef uint32_t ta_get_count(void *ta_state, struct table_info *ti); + +struct table_algo { + char name[16]; + uint32_t idx; + uint32_t type; + uint32_t refcnt; + uint32_t flags; + uint32_t vlimit; + size_t ta_buf_size; + ta_init *init; + ta_destroy *destroy; + ta_prepare_add *prepare_add; + ta_prepare_del *prepare_del; + ta_add *add; + ta_del *del; 
+ ta_flush_entry *flush_entry; + ta_find_tentry *find_tentry; + ta_need_modify *need_modify; + ta_prepare_mod *prepare_mod; + ta_fill_mod *fill_mod; + ta_modify *modify; + ta_flush_mod *flush_mod; + ta_change_ti *change_ti; + ta_foreach *foreach; + ta_dump_tentry *dump_tentry; + ta_print_config *print_config; + ta_dump_tinfo *dump_tinfo; + ta_get_count *get_count; +}; +#define TA_FLAG_DEFAULT 0x01 /* Algo is default for given type */ +#define TA_FLAG_READONLY 0x02 /* Algo does not support modifications*/ +#define TA_FLAG_EXTCOUNTER 0x04 /* Algo has external counter available*/ + +int ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, + size_t size, int *idx); +void ipfw_del_table_algo(struct ip_fw_chain *ch, int idx); + +void ipfw_table_algo_init(struct ip_fw_chain *chain); +void ipfw_table_algo_destroy(struct ip_fw_chain *chain); + +MALLOC_DECLARE(M_IPFW_TBL); +/* Exported to support legacy opcodes */ +int add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, + struct tentry_info *tei, uint8_t flags, uint32_t count); +int del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, + struct tentry_info *tei, uint8_t flags, uint32_t count); +int flush_table(struct ip_fw_chain *ch, struct tid_info *ti); +void ipfw_import_table_value_legacy(uint32_t value, struct table_value *v); +uint32_t ipfw_export_table_value_legacy(struct table_value *v); +int ipfw_get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd); + +/* ipfw_table_value.c functions */ +struct table_config; +struct tableop_state; +void ipfw_table_value_init(struct ip_fw_chain *ch, int first); +void ipfw_table_value_destroy(struct ip_fw_chain *ch, int last); +int ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts); +void ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc, + struct tentry_info *tei, uint32_t count, int rollback); +void ipfw_import_table_value_v1(ipfw_table_value *iv); +void ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *iv); +void ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc, + struct table_algo *ta, void *astate, struct table_info *ti); +void rollback_table_values(struct tableop_state *ts); + +int ipfw_rewrite_table_uidx(struct ip_fw_chain *chain, + struct rule_check_info *ci); +int ipfw_mark_table_kidx(struct ip_fw_chain *chain, struct ip_fw *rule, + uint32_t *bmask); +int ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx, + struct sockopt_data *sd); +void ipfw_unref_rule_tables(struct ip_fw_chain *chain, struct ip_fw *rule); +struct namedobj_instance *ipfw_get_table_objhash(struct ip_fw_chain *ch); + +/* utility functions */ +int ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt, + uint32_t new_set); +void ipfw_swap_tables_sets(struct ip_fw_chain *ch, uint32_t old_set, + uint32_t new_set, int mv); +int ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx, + ta_foreach_f f, void *arg); + +/* internal functions */ +void tc_ref(struct table_config *tc); +void tc_unref(struct table_config *tc); + +struct op_state; +typedef void (op_rollback_f)(void *object, struct op_state *state); +struct op_state { + TAILQ_ENTRY(op_state) next; /* chain link */ + op_rollback_f *func; +}; + +struct tableop_state { + struct op_state opstate; + struct ip_fw_chain *ch; + struct table_config *tc; + struct table_algo *ta; + struct tentry_info *tei; + uint32_t count; + uint32_t vmask; + int vshared; + int modified; +}; + +void add_toperation_state(struct 
ip_fw_chain *ch, struct tableop_state *ts); +void del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts); +void rollback_toperation_state(struct ip_fw_chain *ch, void *object); + +/* Legacy interfaces */ +int ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, + uint32_t *cnt); +int ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, + uint32_t *cnt); +int ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti, + ipfw_table *tbl); + + +#endif /* _KERNEL */ +#endif /* _IPFW2_TABLE_H */ diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_table_algo.c b/freebsd/sys/netpfil/ipfw/ip_fw_table_algo.c new file mode 100644 index 00000000..e4c82131 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/ip_fw_table_algo.c @@ -0,0 +1,4112 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2014 Yandex LLC + * Copyright (c) 2014 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Lookup table algorithms. + * + */ + +#include <rtems/bsd/local/opt_ipfw.h> +#include <rtems/bsd/local/opt_inet.h> +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#include <rtems/bsd/local/opt_inet6.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/rwlock.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <sys/queue.h> +#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */ +#include <net/radix.h> +#include <net/route.h> +#include <net/route_var.h> + +#include <netinet/in.h> +#include <netinet/in_fib.h> +#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ +#include <netinet/ip_fw.h> +#include <netinet6/in6_fib.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/ip_fw_table.h> + + +/* + * IPFW table lookup algorithms. + * + * What is needed to add another table algo? + * + * Algo init: + * * struct table_algo has to be filled with: + * name: "type:algoname" format, e.g. "addr:radix". Currently + * there are the following types: "addr", "iface", "number" and "flow". + * type: one of IPFW_TABLE_* types + * flags: one or more TA_FLAGS_* + * ta_buf_size: size of structure used to store add/del item state. 
+ *    Needs to be less than TA_BUF_SZ.
+ *  callbacks: see below for description.
+ * * ipfw_add_table_algo / ipfw_del_table_algo have to be called
+ *
+ * Callbacks description:
+ *
+ * -init: request to initialize new table instance.
+ * typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state,
+ *     struct table_info *ti, char *data, uint8_t tflags);
+ * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success.
+ *
+ *  Allocate all structures needed for normal operations.
+ *  * Caller may want to parse @data for some algo-specific
+ *    options provided by userland.
+ *  * Caller may want to save configuration state pointer to @ta_state.
+ *  * Caller needs to save desired runtime structure pointer(s)
+ *    inside @ti fields. Note that it is not correct to save
+ *    @ti pointer at this moment. Use -change_ti hook for that.
+ *  * Caller has to set ti->lookup to an appropriate function pointer.
+ *
+ *
+ *
+ * -destroy: request to destroy table instance.
+ * typedef void (ta_destroy)(void *ta_state, struct table_info *ti);
+ * MANDATORY, unlocked. (M_WAITOK).
+ *
+ * Frees all table entries and all table structures allocated by -init.
+ *
+ *
+ *
+ * -prepare_add: request to allocate state for adding new entry.
+ * typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei,
+ *     void *ta_buf);
+ * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success.
+ *
+ * Allocates state and fills it in with all necessary data (EXCEPT value)
+ * from @tei to minimize operations needed to be done under WLOCK.
+ * "value" field has to be copied to new entry in @add callback.
+ * Buffer ta_buf of size ta->ta_buf_size may be used to store
+ * allocated state.
+ *
+ *
+ *
+ * -prepare_del: request to set state for deleting existing entry.
+ * typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei,
+ *     void *ta_buf);
+ * MANDATORY, locked, UH. (M_NOWAIT). Returns 0 on success.
+ *
+ * Buffer ta_buf of size ta->ta_buf_size may be used to store
+ * allocated state. Caller should use on-stack ta_buf allocation
+ * instead of doing malloc().
+ *
+ *
+ *
+ * -add: request to insert new entry into runtime/config structures.
+ * typedef int (ta_add)(void *ta_state, struct table_info *ti,
+ *     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+ * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success.
+ *
+ * Insert new entry using previously-allocated state in @ta_buf.
+ * * @tei may have the following flags:
+ *   TEI_FLAGS_UPDATE: request to add or update entry.
+ *   TEI_FLAGS_DONTADD: request to update (but not add) entry.
+ * * Caller is required to do the following:
+ *   copy real entry value from @tei
+ *   entry added: return 0, store 1 in @pnum
+ *   entry updated: return 0, store 0 in @pnum, store old value in @tei,
+ *     add TEI_FLAGS_UPDATED flag to @tei.
+ *   entry exists: return EEXIST
+ *   entry not found: return ENOENT
+ *   other error: return non-zero error code.
+ *
+ *
+ *
+ * -del: request to delete existing entry from runtime/config structures.
+ * typedef int (ta_del)(void *ta_state, struct table_info *ti,
+ *     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+ * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success.
+ *
+ * Delete entry using the state previously set up in @ta_buf.
+ * * Caller is required to do the following:
+ *   entry deleted: return 0, store 1 in @pnum, store old value in @tei.
+ *   entry not found: return ENOENT
+ *   other error: return non-zero error code.
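+ *
+ * As an illustration only (a sketch of how the core drives these hooks,
+ * not code from this file), a typical add boils down to:
+ *
+ *	error = ta->prepare_add(ch, tei, tb);	// unlocked, may sleep
+ *	IPFW_UH_WLOCK(ch);
+ *	IPFW_WLOCK(ch);
+ *	error = ta->add(ta_state, ti, tei, tb, &num);
+ *	IPFW_WUNLOCK(ch);
+ *	IPFW_UH_WUNLOCK(ch);
+ *	ta->flush_entry(ch, tei, tb);	// frees unused/replaced state
+ *
+ * where @tb is a caller-provided buffer of ta->ta_buf_size bytes.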
+ *
+ *
+ *
+ * -flush_entry: flush entry state created by -prepare_add / -del / others
+ * typedef void (ta_flush_entry)(struct ip_fw_chain *ch,
+ *     struct tentry_info *tei, void *ta_buf);
+ * MANDATORY, may be locked. (M_NOWAIT).
+ *
+ * Delete state allocated by:
+ * -prepare_add (-add returned EEXIST|UPDATED)
+ * -prepare_del (if any)
+ * -del
+ * * Caller is required to handle empty @ta_buf correctly.
+ *
+ *
+ * -find_tentry: finds entry specified by key in @tent
+ * typedef int ta_find_tentry(void *ta_state, struct table_info *ti,
+ *     ipfw_obj_tentry *tent);
+ * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 on success.
+ *
+ * Finds entry specified by given key.
+ * * Caller is required to do the following:
+ *   entry found: return 0, export entry to @tent
+ *   entry not found: return ENOENT
+ *
+ *
+ * -need_modify: checks if @ti has enough space to hold another @count items.
+ * typedef int (ta_need_modify)(void *ta_state, struct table_info *ti,
+ *     uint32_t count, uint64_t *pflags);
+ * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 if it has.
+ *
+ * Checks if given table has enough space to add @count items without
+ * resize. Caller may use @pflags to store desired modification data.
+ *
+ *
+ *
+ * -prepare_mod: allocate structures for table modification.
+ * typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags);
+ * OPTIONAL(need_modify), unlocked. (M_WAITOK). Returns 0 on success.
+ *
+ * Allocate all needed state for table modification. Caller
+ * should use `struct mod_item` to store new state in @ta_buf.
+ * Up to TA_BUF_SZ (128 bytes) can be stored in @ta_buf.
+ *
+ *
+ *
+ * -fill_mod: copy some data to the new state.
+ * typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti,
+ *     void *ta_buf, uint64_t *pflags);
+ * OPTIONAL(need_modify), locked (UH). (M_NOWAIT). Returns 0 on success.
+ *
+ * Copy as much data as we can to minimize changes under WLOCK.
+ * For example, an array can be merged inside this callback.
+ *
+ *
+ *
+ * -modify: perform final modification.
+ * typedef void (ta_modify)(void *ta_state, struct table_info *ti,
+ *     void *ta_buf, uint64_t pflags);
+ * OPTIONAL(need_modify), locked (UH+WLOCK). (M_NOWAIT).
+ *
+ * Performs all changes necessary to switch to new structures.
+ * * Caller should save old pointers to @ta_buf storage.
+ *
+ *
+ *
+ * -flush_mod: flush table modification state.
+ * typedef void (ta_flush_mod)(void *ta_buf);
+ * OPTIONAL(need_modify), unlocked. (M_WAITOK).
+ *
+ * Performs flush for the following:
+ * - prepare_mod (modification was not necessary)
+ * - modify (for the old state)
+ *
+ *
+ *
+ * -change_ti: monitor table info pointer changes
+ * typedef void (ta_change_ti)(void *ta_state, struct table_info *ti);
+ * OPTIONAL, locked (UH). (M_NOWAIT).
+ *
+ * Called when the @ti pointer changes. Called immediately after -init
+ * to set initial state.
+ *
+ *
+ *
+ * -foreach: calls @f for each table entry
+ * typedef void ta_foreach(void *ta_state, struct table_info *ti,
+ *     ta_foreach_f *f, void *arg);
+ * MANDATORY, locked(UH). (M_NOWAIT).
+ *
+ * Runs callback with specified argument for each table entry.
+ * Typically used for dumping table entries.
+ *
+ *
+ *
+ * -dump_tentry: dump table entry in current @tentry format.
+ * typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e,
+ *     ipfw_obj_tentry *tent);
+ * MANDATORY, locked(UH). (M_NOWAIT). Returns 0 on success.
+ *
+ * Dumps entry @e to @tent.
+ *
+ *
+ * -print_config: prints custom algorithm options into buffer.
+ * typedef void (ta_print_config)(void *ta_state, struct table_info *ti, + * char *buf, size_t bufsize); + * OPTIONAL. locked(UH). (M_NOWAIT). + * + * Prints custom algorithm options in the format suitable to pass + * back to -init callback. + * + * + * + * -dump_tinfo: dumps algo-specific info. + * typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti, + * ipfw_ta_tinfo *tinfo); + * OPTIONAL. locked(UH). (M_NOWAIT). + * + * Dumps options like items size/hash size, etc. + */ + +MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); + +/* + * Utility structures/functions common to more than one algo + */ + +struct mod_item { + void *main_ptr; + size_t size; + void *main_ptr6; + size_t size6; +}; + +static int badd(const void *key, void *item, void *base, size_t nmemb, + size_t size, int (*compar) (const void *, const void *)); +static int bdel(const void *key, void *base, size_t nmemb, size_t size, + int (*compar) (const void *, const void *)); + + +/* + * ADDR implementation using radix + * + */ + +/* + * The radix code expects addr and mask to be array of bytes, + * with the first byte being the length of the array. rn_inithead + * is called with the offset in bits of the lookup key within the + * array. If we use a sockaddr_in as the underlying type, + * sin_len is conveniently located at offset 0, sin_addr is at + * offset 4 and normally aligned. + * But for portability, let's avoid assumption and make the code explicit + */ +#define KEY_LEN(v) *((uint8_t *)&(v)) +/* + * Do not require radix to compare more than actual IPv4/IPv6 address + */ +#define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t)) +#define KEY_LEN_INET6 (offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr)) + +#define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr)) +#define OFF_LEN_INET6 (8 * offsetof(struct sa_in6, sin6_addr)) + +struct radix_addr_entry { + struct radix_node rn[2]; + struct sockaddr_in addr; + uint32_t value; + uint8_t masklen; +}; + +struct sa_in6 { + uint8_t sin6_len; + uint8_t sin6_family; + uint8_t pad[2]; + struct in6_addr sin6_addr; +}; + +struct radix_addr_xentry { + struct radix_node rn[2]; + struct sa_in6 addr6; + uint32_t value; + uint8_t masklen; +}; + +struct radix_cfg { + struct radix_node_head *head4; + struct radix_node_head *head6; + size_t count4; + size_t count6; +}; + +struct ta_buf_radix +{ + void *ent_ptr; + struct sockaddr *addr_ptr; + struct sockaddr *mask_ptr; + union { + struct { + struct sockaddr_in sa; + struct sockaddr_in ma; + } a4; + struct { + struct sa_in6 sa; + struct sa_in6 ma; + } a6; + } addr; +}; + +static int ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val); +static int ta_init_radix(struct ip_fw_chain *ch, void **ta_state, + struct table_info *ti, char *data, uint8_t tflags); +static int flush_radix_entry(struct radix_node *rn, void *arg); +static void ta_destroy_radix(void *ta_state, struct table_info *ti); +static void ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +static int ta_dump_radix_tentry(void *ta_state, struct table_info *ti, + void *e, ipfw_obj_tentry *tent); +static int ta_find_radix_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +static void ta_foreach_radix(void *ta_state, struct table_info *ti, + ta_foreach_f *f, void *arg); +static void tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa, + struct sockaddr *ma, int *set_mask); +static int ta_prepare_add_radix(struct ip_fw_chain *ch, 
struct tentry_info *tei, + void *ta_buf); +static int ta_add_radix(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static int ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_del_radix(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_need_modify_radix(void *ta_state, struct table_info *ti, + uint32_t count, uint64_t *pflags); + +static int +ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct radix_node_head *rnh; + + if (keylen == sizeof(in_addr_t)) { + struct radix_addr_entry *ent; + struct sockaddr_in sa; + KEY_LEN(sa) = KEY_LEN_INET; + sa.sin_addr.s_addr = *((in_addr_t *)key); + rnh = (struct radix_node_head *)ti->state; + ent = (struct radix_addr_entry *)(rnh->rnh_matchaddr(&sa, &rnh->rh)); + if (ent != NULL) { + *val = ent->value; + return (1); + } + } else { + struct radix_addr_xentry *xent; + struct sa_in6 sa6; + KEY_LEN(sa6) = KEY_LEN_INET6; + memcpy(&sa6.sin6_addr, key, sizeof(struct in6_addr)); + rnh = (struct radix_node_head *)ti->xstate; + xent = (struct radix_addr_xentry *)(rnh->rnh_matchaddr(&sa6, &rnh->rh)); + if (xent != NULL) { + *val = xent->value; + return (1); + } + } + + return (0); +} + +/* + * New table + */ +static int +ta_init_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, + char *data, uint8_t tflags) +{ + struct radix_cfg *cfg; + + if (!rn_inithead(&ti->state, OFF_LEN_INET)) + return (ENOMEM); + if (!rn_inithead(&ti->xstate, OFF_LEN_INET6)) { + rn_detachhead(&ti->state); + return (ENOMEM); + } + + cfg = malloc(sizeof(struct radix_cfg), M_IPFW, M_WAITOK | M_ZERO); + + *ta_state = cfg; + ti->lookup = ta_lookup_radix; + + return (0); +} + +static int +flush_radix_entry(struct radix_node *rn, void *arg) +{ + struct radix_node_head * const rnh = arg; + struct radix_addr_entry *ent; + + ent = (struct radix_addr_entry *) + rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, &rnh->rh); + if (ent != NULL) + free(ent, M_IPFW_TBL); + return (0); +} + +static void +ta_destroy_radix(void *ta_state, struct table_info *ti) +{ + struct radix_cfg *cfg; + struct radix_node_head *rnh; + + cfg = (struct radix_cfg *)ta_state; + + rnh = (struct radix_node_head *)(ti->state); + rnh->rnh_walktree(&rnh->rh, flush_radix_entry, rnh); + rn_detachhead(&ti->state); + + rnh = (struct radix_node_head *)(ti->xstate); + rnh->rnh_walktree(&rnh->rh, flush_radix_entry, rnh); + rn_detachhead(&ti->xstate); + + free(cfg, M_IPFW); +} + +/* + * Provide algo-specific table info + */ +static void +ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) +{ + struct radix_cfg *cfg; + + cfg = (struct radix_cfg *)ta_state; + + tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; + tinfo->taclass4 = IPFW_TACLASS_RADIX; + tinfo->count4 = cfg->count4; + tinfo->itemsize4 = sizeof(struct radix_addr_entry); + tinfo->taclass6 = IPFW_TACLASS_RADIX; + tinfo->count6 = cfg->count6; + tinfo->itemsize6 = sizeof(struct radix_addr_xentry); +} + +static int +ta_dump_radix_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent) +{ + struct radix_addr_entry *n; +#ifdef INET6 + struct radix_addr_xentry *xn; +#endif + + n = (struct radix_addr_entry *)e; + + /* Guess IPv4/IPv6 radix by sockaddr family */ + if (n->addr.sin_family == AF_INET) { + 
tent->k.addr.s_addr = n->addr.sin_addr.s_addr; + tent->masklen = n->masklen; + tent->subtype = AF_INET; + tent->v.kidx = n->value; +#ifdef INET6 + } else { + xn = (struct radix_addr_xentry *)e; + memcpy(&tent->k, &xn->addr6.sin6_addr, sizeof(struct in6_addr)); + tent->masklen = xn->masklen; + tent->subtype = AF_INET6; + tent->v.kidx = xn->value; +#endif + } + + return (0); +} + +static int +ta_find_radix_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent) +{ + struct radix_node_head *rnh; + void *e; + + e = NULL; + if (tent->subtype == AF_INET) { + struct sockaddr_in sa; + KEY_LEN(sa) = KEY_LEN_INET; + sa.sin_addr.s_addr = tent->k.addr.s_addr; + rnh = (struct radix_node_head *)ti->state; + e = rnh->rnh_matchaddr(&sa, &rnh->rh); + } else { + struct sa_in6 sa6; + KEY_LEN(sa6) = KEY_LEN_INET6; + memcpy(&sa6.sin6_addr, &tent->k.addr6, sizeof(struct in6_addr)); + rnh = (struct radix_node_head *)ti->xstate; + e = rnh->rnh_matchaddr(&sa6, &rnh->rh); + } + + if (e != NULL) { + ta_dump_radix_tentry(ta_state, ti, e, tent); + return (0); + } + + return (ENOENT); +} + +static void +ta_foreach_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg) +{ + struct radix_node_head *rnh; + + rnh = (struct radix_node_head *)(ti->state); + rnh->rnh_walktree(&rnh->rh, (walktree_f_t *)f, arg); + + rnh = (struct radix_node_head *)(ti->xstate); + rnh->rnh_walktree(&rnh->rh, (walktree_f_t *)f, arg); +} + + +#ifdef INET6 +static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask); + +static inline void +ipv6_writemask(struct in6_addr *addr6, uint8_t mask) +{ + uint32_t *cp; + + for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) + *cp++ = 0xFFFFFFFF; + if (mask > 0) + *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); +} +#endif + +static void +tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa, + struct sockaddr *ma, int *set_mask) +{ + int mlen; +#ifdef INET + struct sockaddr_in *addr, *mask; +#endif +#ifdef INET6 + struct sa_in6 *addr6, *mask6; +#endif + in_addr_t a4; + + mlen = tei->masklen; + + if (tei->subtype == AF_INET) { +#ifdef INET + addr = (struct sockaddr_in *)sa; + mask = (struct sockaddr_in *)ma; + /* Set 'total' structure length */ + KEY_LEN(*addr) = KEY_LEN_INET; + KEY_LEN(*mask) = KEY_LEN_INET; + addr->sin_family = AF_INET; + mask->sin_addr.s_addr = + htonl(mlen ? 
~((1 << (32 - mlen)) - 1) : 0); + a4 = *((in_addr_t *)tei->paddr); + addr->sin_addr.s_addr = a4 & mask->sin_addr.s_addr; + if (mlen != 32) + *set_mask = 1; + else + *set_mask = 0; +#endif +#ifdef INET6 + } else if (tei->subtype == AF_INET6) { + /* IPv6 case */ + addr6 = (struct sa_in6 *)sa; + mask6 = (struct sa_in6 *)ma; + /* Set 'total' structure length */ + KEY_LEN(*addr6) = KEY_LEN_INET6; + KEY_LEN(*mask6) = KEY_LEN_INET6; + addr6->sin6_family = AF_INET6; + ipv6_writemask(&mask6->sin6_addr, mlen); + memcpy(&addr6->sin6_addr, tei->paddr, sizeof(struct in6_addr)); + APPLY_MASK(&addr6->sin6_addr, &mask6->sin6_addr); + if (mlen != 128) + *set_mask = 1; + else + *set_mask = 0; +#endif + } +} + +static int +ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_radix *tb; + struct radix_addr_entry *ent; +#ifdef INET6 + struct radix_addr_xentry *xent; +#endif + struct sockaddr *addr, *mask; + int mlen, set_mask; + + tb = (struct ta_buf_radix *)ta_buf; + + mlen = tei->masklen; + set_mask = 0; + + if (tei->subtype == AF_INET) { +#ifdef INET + if (mlen > 32) + return (EINVAL); + ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); + ent->masklen = mlen; + + addr = (struct sockaddr *)&ent->addr; + mask = (struct sockaddr *)&tb->addr.a4.ma; + tb->ent_ptr = ent; +#endif +#ifdef INET6 + } else if (tei->subtype == AF_INET6) { + /* IPv6 case */ + if (mlen > 128) + return (EINVAL); + xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO); + xent->masklen = mlen; + + addr = (struct sockaddr *)&xent->addr6; + mask = (struct sockaddr *)&tb->addr.a6.ma; + tb->ent_ptr = xent; +#endif + } else { + /* Unknown CIDR type */ + return (EINVAL); + } + + tei_to_sockaddr_ent(tei, addr, mask, &set_mask); + /* Set pointers */ + tb->addr_ptr = addr; + if (set_mask != 0) + tb->mask_ptr = mask; + + return (0); +} + +static int +ta_add_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct radix_cfg *cfg; + struct radix_node_head *rnh; + struct radix_node *rn; + struct ta_buf_radix *tb; + uint32_t *old_value, value; + + cfg = (struct radix_cfg *)ta_state; + tb = (struct ta_buf_radix *)ta_buf; + + /* Save current entry value from @tei */ + if (tei->subtype == AF_INET) { + rnh = ti->state; + ((struct radix_addr_entry *)tb->ent_ptr)->value = tei->value; + } else { + rnh = ti->xstate; + ((struct radix_addr_xentry *)tb->ent_ptr)->value = tei->value; + } + + /* Search for an entry first */ + rn = rnh->rnh_lookup(tb->addr_ptr, tb->mask_ptr, &rnh->rh); + if (rn != NULL) { + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) + return (EEXIST); + /* Record already exists. 
Update value if we're asked to */
+		if (tei->subtype == AF_INET)
+			old_value = &((struct radix_addr_entry *)rn)->value;
+		else
+			old_value = &((struct radix_addr_xentry *)rn)->value;
+
+		value = *old_value;
+		*old_value = tei->value;
+		tei->value = value;
+
+		/* Indicate that update has happened instead of addition */
+		tei->flags |= TEI_FLAGS_UPDATED;
+		*pnum = 0;
+
+		return (0);
+	}
+
+	if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
+		return (EFBIG);
+
+	rn = rnh->rnh_addaddr(tb->addr_ptr, tb->mask_ptr, &rnh->rh,tb->ent_ptr);
+	if (rn == NULL) {
+		/* Unknown error */
+		return (EINVAL);
+	}
+
+	if (tei->subtype == AF_INET)
+		cfg->count4++;
+	else
+		cfg->count6++;
+	tb->ent_ptr = NULL;
+	*pnum = 1;
+
+	return (0);
+}
+
+static int
+ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
+    void *ta_buf)
+{
+	struct ta_buf_radix *tb;
+	struct sockaddr *addr, *mask;
+	int mlen, set_mask;
+
+	tb = (struct ta_buf_radix *)ta_buf;
+
+	mlen = tei->masklen;
+	set_mask = 0;
+
+	if (tei->subtype == AF_INET) {
+		if (mlen > 32)
+			return (EINVAL);
+
+		addr = (struct sockaddr *)&tb->addr.a4.sa;
+		mask = (struct sockaddr *)&tb->addr.a4.ma;
+#ifdef INET6
+	} else if (tei->subtype == AF_INET6) {
+		if (mlen > 128)
+			return (EINVAL);
+
+		addr = (struct sockaddr *)&tb->addr.a6.sa;
+		mask = (struct sockaddr *)&tb->addr.a6.ma;
+#endif
+	} else
+		return (EINVAL);
+
+	tei_to_sockaddr_ent(tei, addr, mask, &set_mask);
+	tb->addr_ptr = addr;
+	if (set_mask != 0)
+		tb->mask_ptr = mask;
+
+	return (0);
+}
+
+static int
+ta_del_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei,
+    void *ta_buf, uint32_t *pnum)
+{
+	struct radix_cfg *cfg;
+	struct radix_node_head *rnh;
+	struct radix_node *rn;
+	struct ta_buf_radix *tb;
+
+	cfg = (struct radix_cfg *)ta_state;
+	tb = (struct ta_buf_radix *)ta_buf;
+
+	if (tei->subtype == AF_INET)
+		rnh = ti->state;
+	else
+		rnh = ti->xstate;
+
+	rn = rnh->rnh_deladdr(tb->addr_ptr, tb->mask_ptr, &rnh->rh);
+
+	if (rn == NULL)
+		return (ENOENT);
+
+	/* Save entry value to @tei */
+	if (tei->subtype == AF_INET)
+		tei->value = ((struct radix_addr_entry *)rn)->value;
+	else
+		tei->value = ((struct radix_addr_xentry *)rn)->value;
+
+	tb->ent_ptr = rn;
+
+	if (tei->subtype == AF_INET)
+		cfg->count4--;
+	else
+		cfg->count6--;
+	*pnum = 1;
+
+	return (0);
+}
+
+static void
+ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
+    void *ta_buf)
+{
+	struct ta_buf_radix *tb;
+
+	tb = (struct ta_buf_radix *)ta_buf;
+
+	if (tb->ent_ptr != NULL)
+		free(tb->ent_ptr, M_IPFW_TBL);
+}
+
+static int
+ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count,
+    uint64_t *pflags)
+{
+
+	/*
+	 * The radix implementation does not require additional memory
+	 * allocations other than the nodes themselves. Adding new masks
+	 * to the tree does, but there is no API to call for that (and
+	 * we do not know which sizes would be needed).
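+	 * Returning 0 here tells the core that no resize pass is needed,
+	 * which is why addr_radix below leaves -prepare_mod, -fill_mod,
+	 * -modify and -flush_mod unset.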
+	 */
+	return (0);
+}
+
+struct table_algo addr_radix = {
+	.name		= "addr:radix",
+	.type		= IPFW_TABLE_ADDR,
+	.flags		= TA_FLAG_DEFAULT,
+	.ta_buf_size	= sizeof(struct ta_buf_radix),
+	.init		= ta_init_radix,
+	.destroy	= ta_destroy_radix,
+	.prepare_add	= ta_prepare_add_radix,
+	.prepare_del	= ta_prepare_del_radix,
+	.add		= ta_add_radix,
+	.del		= ta_del_radix,
+	.flush_entry	= ta_flush_radix_entry,
+	.foreach	= ta_foreach_radix,
+	.dump_tentry	= ta_dump_radix_tentry,
+	.find_tentry	= ta_find_radix_tentry,
+	.dump_tinfo	= ta_dump_radix_tinfo,
+	.need_modify	= ta_need_modify_radix,
+};
+
+
+/*
+ * addr:hash cmds
+ *
+ *
+ * ti->data:
+ * [inv.mask4][inv.mask6][log2hsize4][log2hsize6]
+ * [        8][        8][         8][         8]
+ *
+ * inv.mask4: 32 - mask
+ * inv.mask6:
+ * 1) _slow lookup: mask
+ * 2) _aligned: mask / 8
+ * 3) _64: 8
+ *
+ *
+ * pflags:
+ * [hsize4][hsize6]
+ * [    16][    16]
+ */
+
+struct chashentry;
+
+SLIST_HEAD(chashbhead, chashentry);
+
+struct chash_cfg {
+	struct chashbhead *head4;
+	struct chashbhead *head6;
+	size_t	size4;
+	size_t	size6;
+	size_t	items4;
+	size_t	items6;
+	uint8_t	mask4;
+	uint8_t	mask6;
+};
+
+struct chashentry {
+	SLIST_ENTRY(chashentry)	next;
+	uint32_t	value;
+	uint32_t	type;
+	union {
+		uint32_t	a4;	/* Host format */
+		struct in6_addr	a6;	/* Network format */
+	} a;
+};
+
+struct ta_buf_chash
+{
+	void *ent_ptr;
+	struct chashentry ent;
+};
+
+#ifdef INET
+static __inline uint32_t hash_ip(uint32_t addr, int hsize);
+#endif
+#ifdef INET6
+static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize);
+static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize);
+static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key,
+    int mask, int hsize);
+static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key, int mask,
+    int hsize);
+#endif
+static int ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen,
+    uint32_t *val);
+static int ta_lookup_chash_aligned(struct table_info *ti, void *key,
+    uint32_t keylen, uint32_t *val);
+static int ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen,
+    uint32_t *val);
+static int chash_parse_opts(struct chash_cfg *cfg, char *data);
+static void ta_print_chash_config(void *ta_state, struct table_info *ti,
+    char *buf, size_t bufsize);
+static int ta_log2(uint32_t v);
+static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state,
+    struct table_info *ti, char *data, uint8_t tflags);
+static void ta_destroy_chash(void *ta_state, struct table_info *ti);
+static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti,
+    ipfw_ta_tinfo *tinfo);
+static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti,
+    void *e, ipfw_obj_tentry *tent);
+static uint32_t hash_ent(struct chashentry *ent, int af, int mlen,
+    uint32_t size);
+static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent);
+static int ta_find_chash_tentry(void *ta_state, struct table_info *ti,
+    ipfw_obj_tentry *tent);
+static void ta_foreach_chash(void *ta_state, struct table_info *ti,
+    ta_foreach_f *f, void *arg);
+static int ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
+    void *ta_buf);
+static int ta_add_chash(void *ta_state, struct table_info *ti,
+    struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
+static int ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
+    void *ta_buf);
+static int ta_del_chash(void *ta_state, struct table_info *ti,
+    struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
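+
+/*
+ * Worked example of the ti->data layout above (illustrative values
+ * only): a table created with "masks=/24,/64" and the default
+ * 128-bucket hashes is set up by ta_init_chash() as
+ * ti->data = (32 - 24) << 24 | (128 - 64) << 16 | 7 << 8 | 7
+ *          = 0x08400707, with ta_lookup_chash_64 as the lookup hook.
+ */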
+static void ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_need_modify_chash(void *ta_state, struct table_info *ti, + uint32_t count, uint64_t *pflags); +static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags); +static int ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t *pflags); +static void ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags); +static void ta_flush_mod_chash(void *ta_buf); + + +#ifdef INET +static __inline uint32_t +hash_ip(uint32_t addr, int hsize) +{ + + return (addr % (hsize - 1)); +} +#endif + +#ifdef INET6 +static __inline uint32_t +hash_ip6(struct in6_addr *addr6, int hsize) +{ + uint32_t i; + + i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1] ^ + addr6->s6_addr32[2] ^ addr6->s6_addr32[3]; + + return (i % (hsize - 1)); +} + + +static __inline uint16_t +hash_ip64(struct in6_addr *addr6, int hsize) +{ + uint32_t i; + + i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1]; + + return (i % (hsize - 1)); +} + + +static __inline uint32_t +hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize) +{ + struct in6_addr mask6; + + ipv6_writemask(&mask6, mask); + memcpy(addr6, key, sizeof(struct in6_addr)); + APPLY_MASK(addr6, &mask6); + return (hash_ip6(addr6, hsize)); +} + +static __inline uint32_t +hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize) +{ + uint64_t *paddr; + + paddr = (uint64_t *)addr6; + *paddr = 0; + *(paddr + 1) = 0; + memcpy(addr6, key, mask); + return (hash_ip6(addr6, hsize)); +} +#endif + +static int +ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct chashbhead *head; + struct chashentry *ent; + uint16_t hash, hsize; + uint8_t imask; + + if (keylen == sizeof(in_addr_t)) { +#ifdef INET + head = (struct chashbhead *)ti->state; + imask = ti->data >> 24; + hsize = 1 << ((ti->data & 0xFFFF) >> 8); + uint32_t a; + a = ntohl(*((in_addr_t *)key)); + a = a >> imask; + hash = hash_ip(a, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + if (ent->a.a4 == a) { + *val = ent->value; + return (1); + } + } +#endif + } else { +#ifdef INET6 + /* IPv6: worst scenario: non-round mask */ + struct in6_addr addr6; + head = (struct chashbhead *)ti->xstate; + imask = (ti->data & 0xFF0000) >> 16; + hsize = 1 << (ti->data & 0xFF); + hash = hash_ip6_slow(&addr6, key, imask, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + if (memcmp(&ent->a.a6, &addr6, 16) == 0) { + *val = ent->value; + return (1); + } + } +#endif + } + + return (0); +} + +static int +ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct chashbhead *head; + struct chashentry *ent; + uint16_t hash, hsize; + uint8_t imask; + + if (keylen == sizeof(in_addr_t)) { +#ifdef INET + head = (struct chashbhead *)ti->state; + imask = ti->data >> 24; + hsize = 1 << ((ti->data & 0xFFFF) >> 8); + uint32_t a; + a = ntohl(*((in_addr_t *)key)); + a = a >> imask; + hash = hash_ip(a, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + if (ent->a.a4 == a) { + *val = ent->value; + return (1); + } + } +#endif + } else { +#ifdef INET6 + /* IPv6: aligned to 8bit mask */ + struct in6_addr addr6; + uint64_t *paddr, *ptmp; + head = (struct chashbhead *)ti->xstate; + imask = (ti->data & 0xFF0000) >> 16; + hsize = 1 << (ti->data & 0xFF); + + hash = hash_ip6_al(&addr6, key, imask, hsize); + paddr = (uint64_t *)&addr6; + SLIST_FOREACH(ent, &head[hash], next) { + ptmp = (uint64_t *)&ent->a.a6; 
+ if (paddr[0] == ptmp[0] && paddr[1] == ptmp[1]) { + *val = ent->value; + return (1); + } + } +#endif + } + + return (0); +} + +static int +ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct chashbhead *head; + struct chashentry *ent; + uint16_t hash, hsize; + uint8_t imask; + + if (keylen == sizeof(in_addr_t)) { +#ifdef INET + head = (struct chashbhead *)ti->state; + imask = ti->data >> 24; + hsize = 1 << ((ti->data & 0xFFFF) >> 8); + uint32_t a; + a = ntohl(*((in_addr_t *)key)); + a = a >> imask; + hash = hash_ip(a, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + if (ent->a.a4 == a) { + *val = ent->value; + return (1); + } + } +#endif + } else { +#ifdef INET6 + /* IPv6: /64 */ + uint64_t a6, *paddr; + head = (struct chashbhead *)ti->xstate; + paddr = (uint64_t *)key; + hsize = 1 << (ti->data & 0xFF); + a6 = *paddr; + hash = hash_ip64((struct in6_addr *)key, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + paddr = (uint64_t *)&ent->a.a6; + if (a6 == *paddr) { + *val = ent->value; + return (1); + } + } +#endif + } + + return (0); +} + +static int +chash_parse_opts(struct chash_cfg *cfg, char *data) +{ + char *pdel, *pend, *s; + int mask4, mask6; + + mask4 = cfg->mask4; + mask6 = cfg->mask6; + + if (data == NULL) + return (0); + if ((pdel = strchr(data, ' ')) == NULL) + return (0); + while (*pdel == ' ') + pdel++; + if (strncmp(pdel, "masks=", 6) != 0) + return (EINVAL); + if ((s = strchr(pdel, ' ')) != NULL) + *s++ = '\0'; + + pdel += 6; + /* Need /XX[,/YY] */ + if (*pdel++ != '/') + return (EINVAL); + mask4 = strtol(pdel, &pend, 10); + if (*pend == ',') { + /* ,/YY */ + pdel = pend + 1; + if (*pdel++ != '/') + return (EINVAL); + mask6 = strtol(pdel, &pend, 10); + if (*pend != '\0') + return (EINVAL); + } else if (*pend != '\0') + return (EINVAL); + + if (mask4 < 0 || mask4 > 32 || mask6 < 0 || mask6 > 128) + return (EINVAL); + + cfg->mask4 = mask4; + cfg->mask6 = mask6; + + return (0); +} + +static void +ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf, + size_t bufsize) +{ + struct chash_cfg *cfg; + + cfg = (struct chash_cfg *)ta_state; + + if (cfg->mask4 != 32 || cfg->mask6 != 128) + snprintf(buf, bufsize, "%s masks=/%d,/%d", "addr:hash", + cfg->mask4, cfg->mask6); + else + snprintf(buf, bufsize, "%s", "addr:hash"); +} + +static int +ta_log2(uint32_t v) +{ + uint32_t r; + + r = 0; + while (v >>= 1) + r++; + + return (r); +} + +/* + * New table. 
+ * We assume 'data' to be either NULL or the following format: + * 'addr:hash [masks=/32[,/128]]' + */ +static int +ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, + char *data, uint8_t tflags) +{ + int error, i; + uint32_t hsize; + struct chash_cfg *cfg; + + cfg = malloc(sizeof(struct chash_cfg), M_IPFW, M_WAITOK | M_ZERO); + + cfg->mask4 = 32; + cfg->mask6 = 128; + + if ((error = chash_parse_opts(cfg, data)) != 0) { + free(cfg, M_IPFW); + return (error); + } + + cfg->size4 = 128; + cfg->size6 = 128; + + cfg->head4 = malloc(sizeof(struct chashbhead) * cfg->size4, M_IPFW, + M_WAITOK | M_ZERO); + cfg->head6 = malloc(sizeof(struct chashbhead) * cfg->size6, M_IPFW, + M_WAITOK | M_ZERO); + for (i = 0; i < cfg->size4; i++) + SLIST_INIT(&cfg->head4[i]); + for (i = 0; i < cfg->size6; i++) + SLIST_INIT(&cfg->head6[i]); + + + *ta_state = cfg; + ti->state = cfg->head4; + ti->xstate = cfg->head6; + + /* Store data depending on v6 mask length */ + hsize = ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6); + if (cfg->mask6 == 64) { + ti->data = (32 - cfg->mask4) << 24 | (128 - cfg->mask6) << 16| + hsize; + ti->lookup = ta_lookup_chash_64; + } else if ((cfg->mask6 % 8) == 0) { + ti->data = (32 - cfg->mask4) << 24 | + cfg->mask6 << 13 | hsize; + ti->lookup = ta_lookup_chash_aligned; + } else { + /* don't do that! */ + ti->data = (32 - cfg->mask4) << 24 | + cfg->mask6 << 16 | hsize; + ti->lookup = ta_lookup_chash_slow; + } + + return (0); +} + +static void +ta_destroy_chash(void *ta_state, struct table_info *ti) +{ + struct chash_cfg *cfg; + struct chashentry *ent, *ent_next; + int i; + + cfg = (struct chash_cfg *)ta_state; + + for (i = 0; i < cfg->size4; i++) + SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next) + free(ent, M_IPFW_TBL); + + for (i = 0; i < cfg->size6; i++) + SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next) + free(ent, M_IPFW_TBL); + + free(cfg->head4, M_IPFW); + free(cfg->head6, M_IPFW); + + free(cfg, M_IPFW); +} + +static void +ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) +{ + struct chash_cfg *cfg; + + cfg = (struct chash_cfg *)ta_state; + + tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; + tinfo->taclass4 = IPFW_TACLASS_HASH; + tinfo->size4 = cfg->size4; + tinfo->count4 = cfg->items4; + tinfo->itemsize4 = sizeof(struct chashentry); + tinfo->taclass6 = IPFW_TACLASS_HASH; + tinfo->size6 = cfg->size6; + tinfo->count6 = cfg->items6; + tinfo->itemsize6 = sizeof(struct chashentry); +} + +static int +ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent) +{ + struct chash_cfg *cfg; + struct chashentry *ent; + + cfg = (struct chash_cfg *)ta_state; + ent = (struct chashentry *)e; + + if (ent->type == AF_INET) { + tent->k.addr.s_addr = htonl(ent->a.a4 << (32 - cfg->mask4)); + tent->masklen = cfg->mask4; + tent->subtype = AF_INET; + tent->v.kidx = ent->value; +#ifdef INET6 + } else { + memcpy(&tent->k, &ent->a.a6, sizeof(struct in6_addr)); + tent->masklen = cfg->mask6; + tent->subtype = AF_INET6; + tent->v.kidx = ent->value; +#endif + } + + return (0); +} + +static uint32_t +hash_ent(struct chashentry *ent, int af, int mlen, uint32_t size) +{ + uint32_t hash; + + hash = 0; + + if (af == AF_INET) { +#ifdef INET + hash = hash_ip(ent->a.a4, size); +#endif + } else { +#ifdef INET6 + if (mlen == 64) + hash = hash_ip64(&ent->a.a6, size); + else + hash = hash_ip6(&ent->a.a6, size); +#endif + } + + return (hash); +} + +static int +tei_to_chash_ent(struct tentry_info *tei, struct 
chashentry *ent)
+{
+	int mlen;
+#ifdef INET6
+	struct in6_addr mask6;
+#endif
+
+
+	mlen = tei->masklen;
+
+	if (tei->subtype == AF_INET) {
+#ifdef INET
+		if (mlen > 32)
+			return (EINVAL);
+		ent->type = AF_INET;
+
+		/* Calculate masked address */
+		ent->a.a4 = ntohl(*((in_addr_t *)tei->paddr)) >> (32 - mlen);
+#endif
+#ifdef INET6
+	} else if (tei->subtype == AF_INET6) {
+		/* IPv6 case */
+		if (mlen > 128)
+			return (EINVAL);
+		ent->type = AF_INET6;
+
+		ipv6_writemask(&mask6, mlen);
+		memcpy(&ent->a.a6, tei->paddr, sizeof(struct in6_addr));
+		APPLY_MASK(&ent->a.a6, &mask6);
+#endif
+	} else {
+		/* Unknown CIDR type */
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+static int
+ta_find_chash_tentry(void *ta_state, struct table_info *ti,
+    ipfw_obj_tentry *tent)
+{
+	struct chash_cfg *cfg;
+	struct chashbhead *head;
+	struct chashentry ent, *tmp;
+	struct tentry_info tei;
+	int error;
+	uint32_t hash;
+
+	cfg = (struct chash_cfg *)ta_state;
+
+	memset(&ent, 0, sizeof(ent));
+	memset(&tei, 0, sizeof(tei));
+
+	if (tent->subtype == AF_INET) {
+		tei.paddr = &tent->k.addr;
+		tei.masklen = cfg->mask4;
+		tei.subtype = AF_INET;
+
+		if ((error = tei_to_chash_ent(&tei, &ent)) != 0)
+			return (error);
+
+		head = cfg->head4;
+		hash = hash_ent(&ent, AF_INET, cfg->mask4, cfg->size4);
+		/* Check for existence */
+		SLIST_FOREACH(tmp, &head[hash], next) {
+			if (tmp->a.a4 != ent.a.a4)
+				continue;
+
+			ta_dump_chash_tentry(ta_state, ti, tmp, tent);
+			return (0);
+		}
+	} else {
+		tei.paddr = &tent->k.addr6;
+		tei.masklen = cfg->mask6;
+		tei.subtype = AF_INET6;
+
+		if ((error = tei_to_chash_ent(&tei, &ent)) != 0)
+			return (error);
+
+		head = cfg->head6;
+		hash = hash_ent(&ent, AF_INET6, cfg->mask6, cfg->size6);
+		/* Check for existence */
+		SLIST_FOREACH(tmp, &head[hash], next) {
+			if (memcmp(&tmp->a.a6, &ent.a.a6, 16) != 0)
+				continue;
+			ta_dump_chash_tentry(ta_state, ti, tmp, tent);
+			return (0);
+		}
+	}
+
+	return (ENOENT);
+}
+
+static void
+ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f,
+    void *arg)
+{
+	struct chash_cfg *cfg;
+	struct chashentry *ent, *ent_next;
+	int i;
+
+	cfg = (struct chash_cfg *)ta_state;
+
+	for (i = 0; i < cfg->size4; i++)
+		SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next)
+			f(ent, arg);
+
+	for (i = 0; i < cfg->size6; i++)
+		SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next)
+			f(ent, arg);
+}
+
+static int
+ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
+    void *ta_buf)
+{
+	struct ta_buf_chash *tb;
+	struct chashentry *ent;
+	int error;
+
+	tb = (struct ta_buf_chash *)ta_buf;
+
+	ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO);
+
+	error = tei_to_chash_ent(tei, ent);
+	if (error != 0) {
+		free(ent, M_IPFW_TBL);
+		return (error);
+	}
+	tb->ent_ptr = ent;
+
+	return (0);
+}
+
+static int
+ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
+    void *ta_buf, uint32_t *pnum)
+{
+	struct chash_cfg *cfg;
+	struct chashbhead *head;
+	struct chashentry *ent, *tmp;
+	struct ta_buf_chash *tb;
+	int exists;
+	uint32_t hash, value;
+
+	cfg = (struct chash_cfg *)ta_state;
+	tb = (struct ta_buf_chash *)ta_buf;
+	ent = (struct chashentry *)tb->ent_ptr;
+	hash = 0;
+	exists = 0;
+
+	/* Read current value from @tei */
+	ent->value = tei->value;
+
+	/* Select hash bucket and check for existence */
+	if (tei->subtype == AF_INET) {
+		if (tei->masklen != cfg->mask4)
+			return (EINVAL);
+		head = cfg->head4;
+		hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4);
+
+		/* Check for existence */
+		SLIST_FOREACH(tmp, &head[hash], next) {
+			if
(tmp->a.a4 == ent->a.a4) { + exists = 1; + break; + } + } + } else { + if (tei->masklen != cfg->mask6) + return (EINVAL); + head = cfg->head6; + hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6); + /* Check for existence */ + SLIST_FOREACH(tmp, &head[hash], next) { + if (memcmp(&tmp->a.a6, &ent->a.a6, 16) == 0) { + exists = 1; + break; + } + } + } + + if (exists == 1) { + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) + return (EEXIST); + /* Record already exists. Update value if we're asked to */ + value = tmp->value; + tmp->value = tei->value; + tei->value = value; + /* Indicate that update has happened instead of addition */ + tei->flags |= TEI_FLAGS_UPDATED; + *pnum = 0; + } else { + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) + return (EFBIG); + SLIST_INSERT_HEAD(&head[hash], ent, next); + tb->ent_ptr = NULL; + *pnum = 1; + + /* Update counters */ + if (tei->subtype == AF_INET) + cfg->items4++; + else + cfg->items6++; + } + + return (0); +} + +static int +ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_chash *tb; + + tb = (struct ta_buf_chash *)ta_buf; + + return (tei_to_chash_ent(tei, &tb->ent)); +} + +static int +ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct chash_cfg *cfg; + struct chashbhead *head; + struct chashentry *tmp, *tmp_next, *ent; + struct ta_buf_chash *tb; + uint32_t hash; + + cfg = (struct chash_cfg *)ta_state; + tb = (struct ta_buf_chash *)ta_buf; + ent = &tb->ent; + + if (tei->subtype == AF_INET) { + if (tei->masklen != cfg->mask4) + return (EINVAL); + head = cfg->head4; + hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4); + + SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) { + if (tmp->a.a4 != ent->a.a4) + continue; + + SLIST_REMOVE(&head[hash], tmp, chashentry, next); + cfg->items4--; + tb->ent_ptr = tmp; + tei->value = tmp->value; + *pnum = 1; + return (0); + } + } else { + if (tei->masklen != cfg->mask6) + return (EINVAL); + head = cfg->head6; + hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6); + SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) { + if (memcmp(&tmp->a.a6, &ent->a.a6, 16) != 0) + continue; + + SLIST_REMOVE(&head[hash], tmp, chashentry, next); + cfg->items6--; + tb->ent_ptr = tmp; + tei->value = tmp->value; + *pnum = 1; + return (0); + } + } + + return (ENOENT); +} + +static void +ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_chash *tb; + + tb = (struct ta_buf_chash *)ta_buf; + + if (tb->ent_ptr != NULL) + free(tb->ent_ptr, M_IPFW_TBL); +} + +/* + * Hash growing callbacks. + */ + +static int +ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count, + uint64_t *pflags) +{ + struct chash_cfg *cfg; + uint64_t data; + + /* + * Since we don't know exact number of IPv4/IPv6 records in @count, + * ignore non-zero @count value at all. Check current hash sizes + * and return appropriate data. + */ + + cfg = (struct chash_cfg *)ta_state; + + data = 0; + if (cfg->items4 > cfg->size4 && cfg->size4 < 65536) + data |= (cfg->size4 * 2) << 16; + if (cfg->items6 > cfg->size6 && cfg->size6 < 65536) + data |= cfg->size6 * 2; + + if (data != 0) { + *pflags = data; + return (1); + } + + return (0); +} + +/* + * Allocate new, larger chash. 
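+ *
+ * For example (illustrative numbers only): if ta_need_modify_chash()
+ * above found the 128-bucket IPv4 hash overloaded, it encoded
+ * (128 * 2) << 16 into @pflags, so this callback will see
+ * mi->size == 256 and preallocate 256 fresh SLIST heads before
+ * any lock is taken.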
+ */ +static int +ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags) +{ + struct mod_item *mi; + struct chashbhead *head; + int i; + + mi = (struct mod_item *)ta_buf; + + memset(mi, 0, sizeof(struct mod_item)); + mi->size = (*pflags >> 16) & 0xFFFF; + mi->size6 = *pflags & 0xFFFF; + if (mi->size > 0) { + head = malloc(sizeof(struct chashbhead) * mi->size, + M_IPFW, M_WAITOK | M_ZERO); + for (i = 0; i < mi->size; i++) + SLIST_INIT(&head[i]); + mi->main_ptr = head; + } + + if (mi->size6 > 0) { + head = malloc(sizeof(struct chashbhead) * mi->size6, + M_IPFW, M_WAITOK | M_ZERO); + for (i = 0; i < mi->size6; i++) + SLIST_INIT(&head[i]); + mi->main_ptr6 = head; + } + + return (0); +} + +/* + * Copy data from old runtime array to new one. + */ +static int +ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t *pflags) +{ + + /* It is not possible to do a rehash if we're not holding the WLOCK. */ + return (0); +} + +/* + * Switch old & new arrays. + */ +static void +ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags) +{ + struct mod_item *mi; + struct chash_cfg *cfg; + struct chashbhead *old_head, *new_head; + struct chashentry *ent, *ent_next; + int af, i, mlen; + uint32_t nhash; + size_t old_size, new_size; + + mi = (struct mod_item *)ta_buf; + cfg = (struct chash_cfg *)ta_state; + + /* Check which hash we need to grow and whether we still need to */ + if (mi->size > 0 && cfg->size4 < mi->size) { + new_head = (struct chashbhead *)mi->main_ptr; + new_size = mi->size; + old_size = cfg->size4; + old_head = ti->state; + mlen = cfg->mask4; + af = AF_INET; + + for (i = 0; i < old_size; i++) { + SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { + nhash = hash_ent(ent, af, mlen, new_size); + SLIST_INSERT_HEAD(&new_head[nhash], ent, next); + } + } + + ti->state = new_head; + cfg->head4 = new_head; + cfg->size4 = mi->size; + mi->main_ptr = old_head; + } + + if (mi->size6 > 0 && cfg->size6 < mi->size6) { + new_head = (struct chashbhead *)mi->main_ptr6; + new_size = mi->size6; + old_size = cfg->size6; + old_head = ti->xstate; + mlen = cfg->mask6; + af = AF_INET6; + + for (i = 0; i < old_size; i++) { + SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { + nhash = hash_ent(ent, af, mlen, new_size); + SLIST_INSERT_HEAD(&new_head[nhash], ent, next); + } + } + + ti->xstate = new_head; + cfg->head6 = new_head; + cfg->size6 = mi->size6; + mi->main_ptr6 = old_head; + } + + /* Update lower 32 bits with new values */ + ti->data &= 0xFFFFFFFF00000000; + ti->data |= ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6); +} + +/* + * Free unneeded array.
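+ *
+ * Editorial note: the three mod callbacks form one grow transaction
+ * driven by the table framework: prepare_mod() allocates the larger
+ * bucket arrays while no locks are held (M_WAITOK is safe there),
+ * modify() re-links every entry and swaps the pointers under the
+ * writer lock, and this function frees whichever arrays lost the swap.
+ * A hedged sketch of the assumed calling sequence (simplified; the
+ * wlock()/wunlock() pair stands in for the framework's real locking):
+ *
+ *    uint64_t flags;
+ *    if (ta->need_modify(state, ti, count, &flags) != 0) {
+ *        ta->prepare_mod(buf, &flags);    // may sleep
+ *        wlock();
+ *        ta->fill_mod(state, ti, buf, &flags);
+ *        ta->modify(state, ti, buf, flags);
+ *        wunlock();
+ *        ta->flush_mod(buf);              // frees the losing array
+ *    }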
+ */ +static void +ta_flush_mod_chash(void *ta_buf) +{ + struct mod_item *mi; + + mi = (struct mod_item *)ta_buf; + if (mi->main_ptr != NULL) + free(mi->main_ptr, M_IPFW); + if (mi->main_ptr6 != NULL) + free(mi->main_ptr6, M_IPFW); +} + +struct table_algo addr_hash = { + .name = "addr:hash", + .type = IPFW_TABLE_ADDR, + .ta_buf_size = sizeof(struct ta_buf_chash), + .init = ta_init_chash, + .destroy = ta_destroy_chash, + .prepare_add = ta_prepare_add_chash, + .prepare_del = ta_prepare_del_chash, + .add = ta_add_chash, + .del = ta_del_chash, + .flush_entry = ta_flush_chash_entry, + .foreach = ta_foreach_chash, + .dump_tentry = ta_dump_chash_tentry, + .find_tentry = ta_find_chash_tentry, + .print_config = ta_print_chash_config, + .dump_tinfo = ta_dump_chash_tinfo, + .need_modify = ta_need_modify_chash, + .prepare_mod = ta_prepare_mod_chash, + .fill_mod = ta_fill_mod_chash, + .modify = ta_modify_chash, + .flush_mod = ta_flush_mod_chash, +}; + + +/* + * Iface table cmds. + * + * Implementation: + * + * Runtime part: + * - sorted array of "struct ifidx" pointed to by ti->state. + * Array is allocated with rounding up to IFIDX_CHUNK. Only existing + * interfaces are stored in the array; however, its allocated size is + * sufficient to hold all table records if needed. + * - current array size is stored in ti->data + * + * Table data: + * - "struct iftable_cfg" is allocated to store table state (ta_state). + * - All table records are stored inside namedobj instance. + * + */ + +struct ifidx { + uint16_t kidx; + uint16_t spare; + uint32_t value; +}; +#define DEFAULT_IFIDX_SIZE 64 + +struct iftable_cfg; + +struct ifentry { + struct named_object no; + struct ipfw_ifc ic; + struct iftable_cfg *icfg; + uint32_t value; + int linked; +}; + +struct iftable_cfg { + struct namedobj_instance *ii; + struct ip_fw_chain *ch; + struct table_info *ti; + void *main_ptr; + size_t size; /* Number of items allocated in array */ + size_t count; /* Number of all items */ + size_t used; /* Number of items _active_ now */ +}; + +struct ta_buf_ifidx +{ + struct ifentry *ife; + uint32_t value; +}; + +int compare_ifidx(const void *k, const void *v); +static struct ifidx * ifidx_find(struct table_info *ti, void *key); +static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val); +static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, + struct table_info *ti, char *data, uint8_t tflags); +static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti); +static int destroy_ifidx_locked(struct namedobj_instance *ii, + struct named_object *no, void *arg); +static void ta_destroy_ifidx(void *ta_state, struct table_info *ti); +static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_add_ifidx(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_del_ifidx(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static void ta_flush_ifidx_entry(struct ip_fw_chain *ch, + struct tentry_info *tei, void *ta_buf); +static void if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex); +static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti, + uint32_t count, uint64_t *pflags); +static int ta_prepare_mod_ifidx(void *ta_buf,
uint64_t *pflags); +static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, + void *ta_buf, uint64_t *pflags); +static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags); +static void ta_flush_mod_ifidx(void *ta_buf); +static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent); +static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +static int foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, + void *arg); +static void ta_foreach_ifidx(void *ta_state, struct table_info *ti, + ta_foreach_f *f, void *arg); + +int +compare_ifidx(const void *k, const void *v) +{ + const struct ifidx *ifidx; + uint16_t key; + + key = *((const uint16_t *)k); + ifidx = (const struct ifidx *)v; + + if (key < ifidx->kidx) + return (-1); + else if (key > ifidx->kidx) + return (1); + + return (0); +} + +/* + * Adds item @item with key @key into ascending-sorted array @base. + * Assumes @base has enough additional storage. + * + * Returns 1 on success, 0 on duplicate key. + */ +static int +badd(const void *key, void *item, void *base, size_t nmemb, + size_t size, int (*compar) (const void *, const void *)) +{ + int min, max, mid, shift, res; + caddr_t paddr; + + if (nmemb == 0) { + memcpy(base, item, size); + return (1); + } + + /* Binary search */ + min = 0; + max = nmemb - 1; + mid = 0; + while (min <= max) { + mid = (min + max) / 2; + res = compar(key, (const void *)((caddr_t)base + mid * size)); + if (res == 0) + return (0); + + if (res > 0) + min = mid + 1; + else + max = mid - 1; + } + + /* Item not found. */ + res = compar(key, (const void *)((caddr_t)base + mid * size)); + if (res > 0) + shift = mid + 1; + else + shift = mid; + + paddr = (caddr_t)base + shift * size; + if (nmemb > shift) + memmove(paddr + size, paddr, (nmemb - shift) * size); + + memcpy(paddr, item, size); + + return (1); +} + +/* + * Deletes item with key @key from ascending-sorted array @base. + * + * Returns 1 on success, 0 for non-existent key. + */ +static int +bdel(const void *key, void *base, size_t nmemb, size_t size, + int (*compar) (const void *, const void *)) +{ + caddr_t item; + size_t sz; + + item = (caddr_t)bsearch(key, base, nmemb, size, compar); + + if (item == NULL) + return (0); + + sz = (caddr_t)base + nmemb * size - item; + + if (sz > 0) + memmove(item, item + size, sz); + + return (1); +} + +static struct ifidx * +ifidx_find(struct table_info *ti, void *key) +{ + struct ifidx *ifi; + + ifi = bsearch(key, ti->state, ti->data, sizeof(struct ifidx), + compare_ifidx); + + return (ifi); +} + +static int +ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct ifidx *ifi; + + ifi = ifidx_find(ti, key); + + if (ifi != NULL) { + *val = ifi->value; + return (1); + } + + return (0); +} + +static int +ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, + char *data, uint8_t tflags) +{ + struct iftable_cfg *icfg; + + icfg = malloc(sizeof(struct iftable_cfg), M_IPFW, M_WAITOK | M_ZERO); + + icfg->ii = ipfw_objhash_create(DEFAULT_IFIDX_SIZE); + icfg->size = DEFAULT_IFIDX_SIZE; + icfg->main_ptr = malloc(sizeof(struct ifidx) * icfg->size, M_IPFW, + M_WAITOK | M_ZERO); + icfg->ch = ch; + + *ta_state = icfg; + ti->state = icfg->main_ptr; + ti->lookup = ta_lookup_ifidx; + + return (0); +} + +/* + * Handle tableinfo @ti pointer change (on table array resize). 
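+ *
+ * Editorial note: the runtime state of this algorithm is the plain
+ * sorted array maintained by the badd()/bdel() helpers above, so a
+ * stand-alone demonstration of their semantics is easy. A hedged
+ * userland sketch (the array contents are made up for illustration):
+ *
+ *    struct ifidx arr[4] = { { .kidx = 2 }, { .kidx = 7 } };
+ *    struct ifidx it = { .kidx = 5, .value = 42 };
+ *    uint16_t key = 5;
+ *    badd(&key, &it, arr, 2, sizeof(arr[0]), compare_ifidx);
+ *    // arr is now { 2, 5, 7 }; bdel(&key, arr, 3, sizeof(arr[0]),
+ *    // compare_ifidx) removes the entry again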
+ */ +static void +ta_change_ti_ifidx(void *ta_state, struct table_info *ti) +{ + struct iftable_cfg *icfg; + + icfg = (struct iftable_cfg *)ta_state; + icfg->ti = ti; +} + +static int +destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no, + void *arg) +{ + struct ifentry *ife; + struct ip_fw_chain *ch; + + ch = (struct ip_fw_chain *)arg; + ife = (struct ifentry *)no; + + ipfw_iface_del_notify(ch, &ife->ic); + ipfw_iface_unref(ch, &ife->ic); + free(ife, M_IPFW_TBL); + return (0); +} + + +/* + * Destroys table @ti + */ +static void +ta_destroy_ifidx(void *ta_state, struct table_info *ti) +{ + struct iftable_cfg *icfg; + struct ip_fw_chain *ch; + + icfg = (struct iftable_cfg *)ta_state; + ch = icfg->ch; + + if (icfg->main_ptr != NULL) + free(icfg->main_ptr, M_IPFW); + + IPFW_UH_WLOCK(ch); + ipfw_objhash_foreach(icfg->ii, destroy_ifidx_locked, ch); + IPFW_UH_WUNLOCK(ch); + + ipfw_objhash_destroy(icfg->ii); + + free(icfg, M_IPFW); +} + +/* + * Provide algo-specific table info + */ +static void +ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) +{ + struct iftable_cfg *cfg; + + cfg = (struct iftable_cfg *)ta_state; + + tinfo->taclass4 = IPFW_TACLASS_ARRAY; + tinfo->size4 = cfg->size; + tinfo->count4 = cfg->used; + tinfo->itemsize4 = sizeof(struct ifidx); +} + +/* + * Prepare state to add to the table: + * allocate an ifentry and reference the needed interface. + */ +static int +ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_ifidx *tb; + char *ifname; + struct ifentry *ife; + + tb = (struct ta_buf_ifidx *)ta_buf; + + /* Check if string is terminated */ + ifname = (char *)tei->paddr; + if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) + return (EINVAL); + + ife = malloc(sizeof(struct ifentry), M_IPFW_TBL, M_WAITOK | M_ZERO); + ife->ic.cb = if_notifier; + ife->ic.cbdata = ife; + + if (ipfw_iface_ref(ch, ifname, &ife->ic) != 0) { + free(ife, M_IPFW_TBL); + return (EINVAL); + } + + /* Use ipfw_iface 'ifname' field as stable storage */ + ife->no.name = ife->ic.iface->ifname; + + tb->ife = ife; + + return (0); +} + +static int +ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct iftable_cfg *icfg; + struct ifentry *ife, *tmp; + struct ta_buf_ifidx *tb; + struct ipfw_iface *iif; + struct ifidx *ifi; + char *ifname; + uint32_t value; + + tb = (struct ta_buf_ifidx *)ta_buf; + ifname = (char *)tei->paddr; + icfg = (struct iftable_cfg *)ta_state; + ife = tb->ife; + + ife->icfg = icfg; + ife->value = tei->value; + + tmp = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); + + if (tmp != NULL) { + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) + return (EEXIST); + + /* Exchange values in @tmp and @tei */ + value = tmp->value; + tmp->value = tei->value; + tei->value = value; + + iif = tmp->ic.iface; + if (iif->resolved != 0) { + /* We have to update runtime value, too */ + ifi = ifidx_find(ti, &iif->ifindex); + ifi->value = ife->value; + } + + /* Indicate that update has happened instead of addition */ + tei->flags |= TEI_FLAGS_UPDATED; + *pnum = 0; + return (0); + } + + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) + return (EFBIG); + + /* Link to internal list */ + ipfw_objhash_add(icfg->ii, &ife->no); + + /* Link notifier (possibly running its callback) */ + ipfw_iface_add_notify(icfg->ch, &ife->ic); + icfg->count++; + + tb->ife = NULL; + *pnum = 1; + + return (0); +} + +/* + * Prepare to delete key from table.
+ * Do basic interface name checks. + */ +static int +ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_ifidx *tb; + char *ifname; + + tb = (struct ta_buf_ifidx *)ta_buf; + + /* Check if string is terminated */ + ifname = (char *)tei->paddr; + if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) + return (EINVAL); + + return (0); +} + +/* + * Remove key from both configuration list and + * runtime array. Remove the interface notification. + */ +static int +ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct iftable_cfg *icfg; + struct ifentry *ife; + struct ta_buf_ifidx *tb; + char *ifname; + uint16_t ifindex; + int res; + + tb = (struct ta_buf_ifidx *)ta_buf; + ifname = (char *)tei->paddr; + icfg = (struct iftable_cfg *)ta_state; + ife = tb->ife; + + ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); + + if (ife == NULL) + return (ENOENT); + + if (ife->linked != 0) { + /* We have to remove item from runtime */ + ifindex = ife->ic.iface->ifindex; + + res = bdel(&ifindex, icfg->main_ptr, icfg->used, + sizeof(struct ifidx), compare_ifidx); + + KASSERT(res == 1, ("index %d does not exist", ifindex)); + icfg->used--; + ti->data = icfg->used; + ife->linked = 0; + } + + /* Unlink from local list */ + ipfw_objhash_del(icfg->ii, &ife->no); + /* Unlink notifier and deref */ + ipfw_iface_del_notify(icfg->ch, &ife->ic); + ipfw_iface_unref(icfg->ch, &ife->ic); + + icfg->count--; + tei->value = ife->value; + + tb->ife = ife; + *pnum = 1; + + return (0); +} + +/* + * Flush deleted entry. + * Drops interface reference and frees entry. + */ +static void +ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_ifidx *tb; + + tb = (struct ta_buf_ifidx *)ta_buf; + + if (tb->ife != NULL) + free(tb->ife, M_IPFW_TBL); +} + + +/* + * Handle interface announce/withdrawal for a particular table. + * Every real runtime array modification happens here. + */ +static void +if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex) +{ + struct ifentry *ife; + struct ifidx ifi; + struct iftable_cfg *icfg; + struct table_info *ti; + int res; + + ife = (struct ifentry *)cbdata; + icfg = ife->icfg; + ti = icfg->ti; + + KASSERT(ti != NULL, ("ti=NULL, check change_ti handler")); + + if (ife->linked == 0 && ifindex != 0) { + /* Interface announce */ + ifi.kidx = ifindex; + ifi.spare = 0; + ifi.value = ife->value; + res = badd(&ifindex, &ifi, icfg->main_ptr, icfg->used, + sizeof(struct ifidx), compare_ifidx); + KASSERT(res == 1, ("index %d already exists", ifindex)); + icfg->used++; + ti->data = icfg->used; + ife->linked = 1; + } else if (ife->linked != 0 && ifindex == 0) { + /* Interface withdrawal */ + ifindex = ife->ic.iface->ifindex; + + res = bdel(&ifindex, icfg->main_ptr, icfg->used, + sizeof(struct ifidx), compare_ifidx); + + KASSERT(res == 1, ("index %d does not exist", ifindex)); + icfg->used--; + ti->data = icfg->used; + ife->linked = 0; + } +} + + +/* + * Table growing callbacks. + */ + +static int +ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count, + uint64_t *pflags) +{ + struct iftable_cfg *cfg; + uint32_t size; + + cfg = (struct iftable_cfg *)ta_state; + + size = cfg->size; + while (size < cfg->count + count) + size *= 2; + + if (size != cfg->size) { + *pflags = size; + return (1); + } + + return (0); +} + +/* + * Allocate new, larger runtime ifidx array.
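+ *
+ * Editorial note: ta_need_modify_ifidx() above sizes the array against
+ * cfg->count (all configured records), not cfg->used (resolved ones),
+ * so every record can appear in the runtime array at once without a
+ * further resize. The doubling rule, restated as a hedged sketch:
+ *
+ *    uint32_t size = cfg->size;              // current capacity
+ *    while (size < cfg->count + count)       // count = pending adds
+ *        size *= 2;
+ *    // a resize is requested only when size grew past cfg->size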
+ */ +static int +ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags) +{ + struct mod_item *mi; + + mi = (struct mod_item *)ta_buf; + + memset(mi, 0, sizeof(struct mod_item)); + mi->size = *pflags; + mi->main_ptr = malloc(sizeof(struct ifidx) * mi->size, M_IPFW, + M_WAITOK | M_ZERO); + + return (0); +} + +/* + * Copy data from old runtime array to new one. + */ +static int +ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t *pflags) +{ + struct mod_item *mi; + struct iftable_cfg *icfg; + + mi = (struct mod_item *)ta_buf; + icfg = (struct iftable_cfg *)ta_state; + + /* Check if we still need to grow array */ + if (icfg->size >= mi->size) { + *pflags = 0; + return (0); + } + + memcpy(mi->main_ptr, icfg->main_ptr, icfg->used * sizeof(struct ifidx)); + + return (0); +} + +/* + * Switch old & new arrays. + */ +static void +ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags) +{ + struct mod_item *mi; + struct iftable_cfg *icfg; + void *old_ptr; + + mi = (struct mod_item *)ta_buf; + icfg = (struct iftable_cfg *)ta_state; + + old_ptr = icfg->main_ptr; + icfg->main_ptr = mi->main_ptr; + icfg->size = mi->size; + ti->state = icfg->main_ptr; + + mi->main_ptr = old_ptr; +} + +/* + * Free unneeded array. + */ +static void +ta_flush_mod_ifidx(void *ta_buf) +{ + struct mod_item *mi; + + mi = (struct mod_item *)ta_buf; + if (mi->main_ptr != NULL) + free(mi->main_ptr, M_IPFW); +} + +static int +ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent) +{ + struct ifentry *ife; + + ife = (struct ifentry *)e; + + tent->masklen = 8 * IF_NAMESIZE; + memcpy(&tent->k, ife->no.name, IF_NAMESIZE); + tent->v.kidx = ife->value; + + return (0); +} + +static int +ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent) +{ + struct iftable_cfg *icfg; + struct ifentry *ife; + char *ifname; + + icfg = (struct iftable_cfg *)ta_state; + ifname = tent->k.iface; + + if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) + return (EINVAL); + + ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); + + if (ife != NULL) { + ta_dump_ifidx_tentry(ta_state, ti, ife, tent); + return (0); + } + + return (ENOENT); +} + +struct wa_ifidx { + ta_foreach_f *f; + void *arg; +}; + +static int +foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, + void *arg) +{ + struct ifentry *ife; + struct wa_ifidx *wa; + + ife = (struct ifentry *)no; + wa = (struct wa_ifidx *)arg; + + wa->f(ife, wa->arg); + return (0); +} + +static void +ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg) +{ + struct iftable_cfg *icfg; + struct wa_ifidx wa; + + icfg = (struct iftable_cfg *)ta_state; + + wa.f = f; + wa.arg = arg; + + ipfw_objhash_foreach(icfg->ii, foreach_ifidx, &wa); +} + +struct table_algo iface_idx = { + .name = "iface:array", + .type = IPFW_TABLE_INTERFACE, + .flags = TA_FLAG_DEFAULT, + .ta_buf_size = sizeof(struct ta_buf_ifidx), + .init = ta_init_ifidx, + .destroy = ta_destroy_ifidx, + .prepare_add = ta_prepare_add_ifidx, + .prepare_del = ta_prepare_del_ifidx, + .add = ta_add_ifidx, + .del = ta_del_ifidx, + .flush_entry = ta_flush_ifidx_entry, + .foreach = ta_foreach_ifidx, + .dump_tentry = ta_dump_ifidx_tentry, + .find_tentry = ta_find_ifidx_tentry, + .dump_tinfo = ta_dump_ifidx_tinfo, + .need_modify = ta_need_modify_ifidx, + .prepare_mod = ta_prepare_mod_ifidx, + .fill_mod = ta_fill_mod_ifidx, + .modify = ta_modify_ifidx, + .flush_mod = ta_flush_mod_ifidx, +
.change_ti = ta_change_ti_ifidx, +}; + +/* + * Number array cmds. + * + * Implementation: + * + * Runtime part: + * - sorted array of "struct numarray" pointed to by ti->state. + * Array is allocated with rounding up to NUMARRAY_CHUNK. + * - current array size is stored in ti->data + * + */ + +struct numarray { + uint32_t number; + uint32_t value; +}; + +struct numarray_cfg { + void *main_ptr; + size_t size; /* Number of items allocated in array */ + size_t used; /* Number of items _active_ now */ +}; + +struct ta_buf_numarray +{ + struct numarray na; +}; + +int compare_numarray(const void *k, const void *v); +static struct numarray *numarray_find(struct table_info *ti, void *key); +static int ta_lookup_numarray(struct table_info *ti, void *key, + uint32_t keylen, uint32_t *val); +static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, + struct table_info *ti, char *data, uint8_t tflags); +static void ta_destroy_numarray(void *ta_state, struct table_info *ti); +static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +static int ta_prepare_add_numarray(struct ip_fw_chain *ch, + struct tentry_info *tei, void *ta_buf); +static int ta_add_numarray(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static int ta_del_numarray(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static void ta_flush_numarray_entry(struct ip_fw_chain *ch, + struct tentry_info *tei, void *ta_buf); +static int ta_need_modify_numarray(void *ta_state, struct table_info *ti, + uint32_t count, uint64_t *pflags); +static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags); +static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti, + void *ta_buf, uint64_t *pflags); +static void ta_modify_numarray(void *ta_state, struct table_info *ti, + void *ta_buf, uint64_t pflags); +static void ta_flush_mod_numarray(void *ta_buf); +static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, + void *e, ipfw_obj_tentry *tent); +static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +static void ta_foreach_numarray(void *ta_state, struct table_info *ti, + ta_foreach_f *f, void *arg); + +int +compare_numarray(const void *k, const void *v) +{ + const struct numarray *na; + uint32_t key; + + key = *((const uint32_t *)k); + na = (const struct numarray *)v; + + if (key < na->number) + return (-1); + else if (key > na->number) + return (1); + + return (0); +} + +static struct numarray * +numarray_find(struct table_info *ti, void *key) +{ + struct numarray *ri; + + ri = bsearch(key, ti->state, ti->data, sizeof(struct numarray), + compare_numarray); + + return (ri); +} + +static int +ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct numarray *ri; + + ri = numarray_find(ti, key); + + if (ri != NULL) { + *val = ri->value; + return (1); + } + + return (0); +} + +static int +ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, + char *data, uint8_t tflags) +{ + struct numarray_cfg *cfg; + + cfg = malloc(sizeof(*cfg), M_IPFW, M_WAITOK | M_ZERO); + + cfg->size = 16; + cfg->main_ptr = malloc(sizeof(struct numarray) * cfg->size, M_IPFW, + M_WAITOK | M_ZERO); + + *ta_state = cfg; + ti->state = cfg->main_ptr; + ti->lookup = ta_lookup_numarray; + + return (0); +} + +/* + * Destroys table @ti + */ +static void +ta_destroy_numarray(void *ta_state, struct table_info
*ti) +{ + struct numarray_cfg *cfg; + + cfg = (struct numarray_cfg *)ta_state; + + if (cfg->main_ptr != NULL) + free(cfg->main_ptr, M_IPFW); + + free(cfg, M_IPFW); +} + +/* + * Provide algo-specific table info + */ +static void +ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) +{ + struct numarray_cfg *cfg; + + cfg = (struct numarray_cfg *)ta_state; + + tinfo->taclass4 = IPFW_TACLASS_ARRAY; + tinfo->size4 = cfg->size; + tinfo->count4 = cfg->used; + tinfo->itemsize4 = sizeof(struct numarray); +} + +/* + * Prepare for addition/deletion to an array. + */ +static int +ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_numarray *tb; + + tb = (struct ta_buf_numarray *)ta_buf; + + tb->na.number = *((uint32_t *)tei->paddr); + + return (0); +} + +static int +ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct numarray_cfg *cfg; + struct ta_buf_numarray *tb; + struct numarray *ri; + int res; + uint32_t value; + + tb = (struct ta_buf_numarray *)ta_buf; + cfg = (struct numarray_cfg *)ta_state; + + /* Read current value from @tei */ + tb->na.value = tei->value; + + ri = numarray_find(ti, &tb->na.number); + + if (ri != NULL) { + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) + return (EEXIST); + + /* Exchange values between ri and @tei */ + value = ri->value; + ri->value = tei->value; + tei->value = value; + /* Indicate that update has happened instead of addition */ + tei->flags |= TEI_FLAGS_UPDATED; + *pnum = 0; + return (0); + } + + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) + return (EFBIG); + + res = badd(&tb->na.number, &tb->na, cfg->main_ptr, cfg->used, + sizeof(struct numarray), compare_numarray); + + KASSERT(res == 1, ("number %u already exists", tb->na.number)); + cfg->used++; + ti->data = cfg->used; + *pnum = 1; + + return (0); +} + +/* + * Remove the specified number from the runtime array. + */ +static int +ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct numarray_cfg *cfg; + struct ta_buf_numarray *tb; + struct numarray *ri; + int res; + + tb = (struct ta_buf_numarray *)ta_buf; + cfg = (struct numarray_cfg *)ta_state; + + ri = numarray_find(ti, &tb->na.number); + if (ri == NULL) + return (ENOENT); + + tei->value = ri->value; + + res = bdel(&tb->na.number, cfg->main_ptr, cfg->used, + sizeof(struct numarray), compare_numarray); + + KASSERT(res == 1, ("number %u does not exist", tb->na.number)); + cfg->used--; + ti->data = cfg->used; + *pnum = 1; + + return (0); +} + +static void +ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + + /* We don't have any state, do nothing */ +} + + +/* + * Table growing callbacks. + */ + +static int +ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count, + uint64_t *pflags) +{ + struct numarray_cfg *cfg; + size_t size; + + cfg = (struct numarray_cfg *)ta_state; + + size = cfg->size; + while (size < cfg->used + count) + size *= 2; + + if (size != cfg->size) { + *pflags = size; + return (1); + } + + return (0); +} + +/* + * Allocate new, larger runtime array.
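+ *
+ * Editorial note: like the other add callbacks in this file,
+ * ta_add_numarray() above handles TEI_FLAGS_UPDATE by swapping value
+ * indices through @tei rather than overwriting: the caller still needs
+ * the old index to drop its reference. The idiom, as a hedged sketch:
+ *
+ *    uint32_t value = ri->value;    // remember the old value index
+ *    ri->value = tei->value;        // install the new one
+ *    tei->value = value;            // hand the old one back for unref
+ *    tei->flags |= TEI_FLAGS_UPDATED;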
+ */ +static int +ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags) +{ + struct mod_item *mi; + + mi = (struct mod_item *)ta_buf; + + memset(mi, 0, sizeof(struct mod_item)); + mi->size = *pflags; + mi->main_ptr = malloc(sizeof(struct numarray) * mi->size, M_IPFW, + M_WAITOK | M_ZERO); + + return (0); +} + +/* + * Copy data from old runtime array to new one. + */ +static int +ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t *pflags) +{ + struct mod_item *mi; + struct numarray_cfg *cfg; + + mi = (struct mod_item *)ta_buf; + cfg = (struct numarray_cfg *)ta_state; + + /* Check if we still need to grow array */ + if (cfg->size >= mi->size) { + *pflags = 0; + return (0); + } + + memcpy(mi->main_ptr, cfg->main_ptr, cfg->used * sizeof(struct numarray)); + + return (0); +} + +/* + * Switch old & new arrays. + */ +static void +ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags) +{ + struct mod_item *mi; + struct numarray_cfg *cfg; + void *old_ptr; + + mi = (struct mod_item *)ta_buf; + cfg = (struct numarray_cfg *)ta_state; + + old_ptr = cfg->main_ptr; + cfg->main_ptr = mi->main_ptr; + cfg->size = mi->size; + ti->state = cfg->main_ptr; + + mi->main_ptr = old_ptr; +} + +/* + * Free unneeded array. + */ +static void +ta_flush_mod_numarray(void *ta_buf) +{ + struct mod_item *mi; + + mi = (struct mod_item *)ta_buf; + if (mi->main_ptr != NULL) + free(mi->main_ptr, M_IPFW); +} + +static int +ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent) +{ + struct numarray *na; + + na = (struct numarray *)e; + + tent->k.key = na->number; + tent->v.kidx = na->value; + + return (0); +} + +static int +ta_find_numarray_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent) +{ + struct numarray_cfg *cfg; + struct numarray *ri; + + cfg = (struct numarray_cfg *)ta_state; + + ri = numarray_find(ti, &tent->k.key); + + if (ri != NULL) { + ta_dump_numarray_tentry(ta_state, ti, ri, tent); + return (0); + } + + return (ENOENT); +} + +static void +ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg) +{ + struct numarray_cfg *cfg; + struct numarray *array; + int i; + + cfg = (struct numarray_cfg *)ta_state; + array = cfg->main_ptr; + + for (i = 0; i < cfg->used; i++) + f(&array[i], arg); +} + +struct table_algo number_array = { + .name = "number:array", + .type = IPFW_TABLE_NUMBER, + .ta_buf_size = sizeof(struct ta_buf_numarray), + .init = ta_init_numarray, + .destroy = ta_destroy_numarray, + .prepare_add = ta_prepare_add_numarray, + .prepare_del = ta_prepare_add_numarray, + .add = ta_add_numarray, + .del = ta_del_numarray, + .flush_entry = ta_flush_numarray_entry, + .foreach = ta_foreach_numarray, + .dump_tentry = ta_dump_numarray_tentry, + .find_tentry = ta_find_numarray_tentry, + .dump_tinfo = ta_dump_numarray_tinfo, + .need_modify = ta_need_modify_numarray, + .prepare_mod = ta_prepare_mod_numarray, + .fill_mod = ta_fill_mod_numarray, + .modify = ta_modify_numarray, + .flush_mod = ta_flush_mod_numarray, +}; + +/* + * flow:hash cmds + * + * + * ti->data: + * [inv.mask4][inv.mask6][log2hsize4][log2hsize6] + * [ 8][ 8][ 8][ 8] + * + * inv.mask4: 32 - mask + * inv.mask6: + * 1) _slow lookup: mask + * 2) _aligned: (128 - mask) / 8 + * 3) _64: 8 + * + * + * pflags: + * [hsize4][hsize6] + * [ 16][ 16] + */ + +struct fhashentry; + +SLIST_HEAD(fhashbhead, fhashentry); + +struct fhashentry { + SLIST_ENTRY(fhashentry) next; + uint8_t af; + uint8_t proto; + uint16_t
spare0; + uint16_t dport; + uint16_t sport; + uint32_t value; + uint32_t spare1; +}; + +struct fhashentry4 { + struct fhashentry e; + struct in_addr dip; + struct in_addr sip; +}; + +struct fhashentry6 { + struct fhashentry e; + struct in6_addr dip6; + struct in6_addr sip6; +}; + +struct fhash_cfg { + struct fhashbhead *head; + size_t size; + size_t items; + struct fhashentry4 fe4; + struct fhashentry6 fe6; +}; + +struct ta_buf_fhash { + void *ent_ptr; + struct fhashentry6 fe6; +}; + +static __inline int cmp_flow_ent(struct fhashentry *a, + struct fhashentry *b, size_t sz); +static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize); +static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize); +static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size); +static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val); +static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, +struct table_info *ti, char *data, uint8_t tflags); +static void ta_destroy_fhash(void *ta_state, struct table_info *ti); +static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, + void *e, ipfw_obj_tentry *tent); +static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent); +static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +static void ta_foreach_fhash(void *ta_state, struct table_info *ti, + ta_foreach_f *f, void *arg); +static int ta_prepare_add_fhash(struct ip_fw_chain *ch, + struct tentry_info *tei, void *ta_buf); +static int ta_add_fhash(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_del_fhash(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_need_modify_fhash(void *ta_state, struct table_info *ti, + uint32_t count, uint64_t *pflags); +static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags); +static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti, + void *ta_buf, uint64_t *pflags); +static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags); +static void ta_flush_mod_fhash(void *ta_buf); + +static __inline int +cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz) +{ + uint64_t *ka, *kb; + + ka = (uint64_t *)(&a->next + 1); + kb = (uint64_t *)(&b->next + 1); + + if (*ka == *kb && (memcmp(a + 1, b + 1, sz) == 0)) + return (1); + + return (0); +} + +static __inline uint32_t +hash_flow4(struct fhashentry4 *f, int hsize) +{ + uint32_t i; + + i = (f->dip.s_addr) ^ (f->sip.s_addr) ^ (f->e.dport) ^ (f->e.sport); + + return (i % (hsize - 1)); +} + +static __inline uint32_t +hash_flow6(struct fhashentry6 *f, int hsize) +{ + uint32_t i; + + i = (f->dip6.__u6_addr.__u6_addr32[2]) ^ + (f->dip6.__u6_addr.__u6_addr32[3]) ^ + (f->sip6.__u6_addr.__u6_addr32[2]) ^ + (f->sip6.__u6_addr.__u6_addr32[3]) ^ + (f->e.dport) ^ (f->e.sport); + + return (i % (hsize - 1)); +} + +static uint32_t +hash_flow_ent(struct fhashentry *ent, uint32_t size) +{ + uint32_t hash; + + if (ent->af == AF_INET) { + hash = hash_flow4((struct fhashentry4 *)ent, size); + } else { + hash = hash_flow6((struct fhashentry6 
*)ent, size); + } + + return (hash); +} + +static int +ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct fhashbhead *head; + struct fhashentry *ent; + struct fhashentry4 *m4; + struct ipfw_flow_id *id; + uint16_t hash, hsize; + + id = (struct ipfw_flow_id *)key; + head = (struct fhashbhead *)ti->state; + hsize = ti->data; + m4 = (struct fhashentry4 *)ti->xstate; + + if (id->addr_type == 4) { + struct fhashentry4 f; + + /* Copy hash mask */ + f = *m4; + + f.dip.s_addr &= id->dst_ip; + f.sip.s_addr &= id->src_ip; + f.e.dport &= id->dst_port; + f.e.sport &= id->src_port; + f.e.proto &= id->proto; + hash = hash_flow4(&f, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + if (cmp_flow_ent(ent, &f.e, 2 * 4) != 0) { + *val = ent->value; + return (1); + } + } + } else if (id->addr_type == 6) { + struct fhashentry6 f; + uint64_t *fp, *idp; + + /* Copy hash mask */ + f = *((struct fhashentry6 *)(m4 + 1)); + + /* Handle lack of __u6_addr.__u6_addr64 */ + fp = (uint64_t *)&f.dip6; + idp = (uint64_t *)&id->dst_ip6; + /* src IPv6 is stored after dst IPv6 */ + *fp++ &= *idp++; + *fp++ &= *idp++; + *fp++ &= *idp++; + *fp &= *idp; + f.e.dport &= id->dst_port; + f.e.sport &= id->src_port; + f.e.proto &= id->proto; + hash = hash_flow6(&f, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + if (cmp_flow_ent(ent, &f.e, 2 * 16) != 0) { + *val = ent->value; + return (1); + } + } + } + + return (0); +} + +/* + * New table. + */ +static int +ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, + char *data, uint8_t tflags) +{ + int i; + struct fhash_cfg *cfg; + struct fhashentry4 *fe4; + struct fhashentry6 *fe6; + + cfg = malloc(sizeof(struct fhash_cfg), M_IPFW, M_WAITOK | M_ZERO); + + cfg->size = 512; + + cfg->head = malloc(sizeof(struct fhashbhead) * cfg->size, M_IPFW, + M_WAITOK | M_ZERO); + for (i = 0; i < cfg->size; i++) + SLIST_INIT(&cfg->head[i]); + + /* Fill in fe masks based on @tflags */ + fe4 = &cfg->fe4; + fe6 = &cfg->fe6; + if (tflags & IPFW_TFFLAG_SRCIP) { + memset(&fe4->sip, 0xFF, sizeof(fe4->sip)); + memset(&fe6->sip6, 0xFF, sizeof(fe6->sip6)); + } + if (tflags & IPFW_TFFLAG_DSTIP) { + memset(&fe4->dip, 0xFF, sizeof(fe4->dip)); + memset(&fe6->dip6, 0xFF, sizeof(fe6->dip6)); + } + if (tflags & IPFW_TFFLAG_SRCPORT) { + memset(&fe4->e.sport, 0xFF, sizeof(fe4->e.sport)); + memset(&fe6->e.sport, 0xFF, sizeof(fe6->e.sport)); + } + if (tflags & IPFW_TFFLAG_DSTPORT) { + memset(&fe4->e.dport, 0xFF, sizeof(fe4->e.dport)); + memset(&fe6->e.dport, 0xFF, sizeof(fe6->e.dport)); + } + if (tflags & IPFW_TFFLAG_PROTO) { + memset(&fe4->e.proto, 0xFF, sizeof(fe4->e.proto)); + memset(&fe6->e.proto, 0xFF, sizeof(fe6->e.proto)); + } + + fe4->e.af = AF_INET; + fe6->e.af = AF_INET6; + + *ta_state = cfg; + ti->state = cfg->head; + ti->xstate = &cfg->fe4; + ti->data = cfg->size; + ti->lookup = ta_lookup_fhash; + + return (0); +} + +static void +ta_destroy_fhash(void *ta_state, struct table_info *ti) +{ + struct fhash_cfg *cfg; + struct fhashentry *ent, *ent_next; + int i; + + cfg = (struct fhash_cfg *)ta_state; + + for (i = 0; i < cfg->size; i++) + SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) + free(ent, M_IPFW_TBL); + + free(cfg->head, M_IPFW); + free(cfg, M_IPFW); +} + +/* + * Provide algo-specific table info + */ +static void +ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) +{ + struct fhash_cfg *cfg; + + cfg = (struct fhash_cfg *)ta_state; + + tinfo->flags = IPFW_TATFLAGS_AFITEM; + tinfo->taclass4 = 
IPFW_TACLASS_HASH; + tinfo->size4 = cfg->size; + tinfo->count4 = cfg->items; + tinfo->itemsize4 = sizeof(struct fhashentry4); + tinfo->itemsize6 = sizeof(struct fhashentry6); +} + +static int +ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent) +{ + struct fhash_cfg *cfg; + struct fhashentry *ent; + struct fhashentry4 *fe4; +#ifdef INET6 + struct fhashentry6 *fe6; +#endif + struct tflow_entry *tfe; + + cfg = (struct fhash_cfg *)ta_state; + ent = (struct fhashentry *)e; + tfe = &tent->k.flow; + + tfe->af = ent->af; + tfe->proto = ent->proto; + tfe->dport = htons(ent->dport); + tfe->sport = htons(ent->sport); + tent->v.kidx = ent->value; + tent->subtype = ent->af; + + if (ent->af == AF_INET) { + fe4 = (struct fhashentry4 *)ent; + tfe->a.a4.sip.s_addr = htonl(fe4->sip.s_addr); + tfe->a.a4.dip.s_addr = htonl(fe4->dip.s_addr); + tent->masklen = 32; +#ifdef INET6 + } else { + fe6 = (struct fhashentry6 *)ent; + tfe->a.a6.sip6 = fe6->sip6; + tfe->a.a6.dip6 = fe6->dip6; + tent->masklen = 128; +#endif + } + + return (0); +} + +static int +tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent) +{ +#ifdef INET + struct fhashentry4 *fe4; +#endif +#ifdef INET6 + struct fhashentry6 *fe6; +#endif + struct tflow_entry *tfe; + + tfe = (struct tflow_entry *)tei->paddr; + + ent->af = tei->subtype; + ent->proto = tfe->proto; + ent->dport = ntohs(tfe->dport); + ent->sport = ntohs(tfe->sport); + + if (tei->subtype == AF_INET) { +#ifdef INET + fe4 = (struct fhashentry4 *)ent; + fe4->sip.s_addr = ntohl(tfe->a.a4.sip.s_addr); + fe4->dip.s_addr = ntohl(tfe->a.a4.dip.s_addr); +#endif +#ifdef INET6 + } else if (tei->subtype == AF_INET6) { + fe6 = (struct fhashentry6 *)ent; + fe6->sip6 = tfe->a.a6.sip6; + fe6->dip6 = tfe->a.a6.dip6; +#endif + } else { + /* Unknown CIDR type */ + return (EINVAL); + } + + return (0); +} + + +static int +ta_find_fhash_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent) +{ + struct fhash_cfg *cfg; + struct fhashbhead *head; + struct fhashentry *ent, *tmp; + struct fhashentry6 fe6; + struct tentry_info tei; + int error; + uint32_t hash; + size_t sz; + + cfg = (struct fhash_cfg *)ta_state; + + ent = &fe6.e; + + memset(&fe6, 0, sizeof(fe6)); + memset(&tei, 0, sizeof(tei)); + + tei.paddr = &tent->k.flow; + tei.subtype = tent->subtype; + + if ((error = tei_to_fhash_ent(&tei, ent)) != 0) + return (error); + + head = cfg->head; + hash = hash_flow_ent(ent, cfg->size); + + if (tei.subtype == AF_INET) + sz = 2 * sizeof(struct in_addr); + else + sz = 2 * sizeof(struct in6_addr); + + /* Check for existence */ + SLIST_FOREACH(tmp, &head[hash], next) { + if (cmp_flow_ent(tmp, ent, sz) != 0) { + ta_dump_fhash_tentry(ta_state, ti, tmp, tent); + return (0); + } + } + + return (ENOENT); +} + +static void +ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg) +{ + struct fhash_cfg *cfg; + struct fhashentry *ent, *ent_next; + int i; + + cfg = (struct fhash_cfg *)ta_state; + + for (i = 0; i < cfg->size; i++) + SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) + f(ent, arg); +} + +static int +ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_fhash *tb; + struct fhashentry *ent; + size_t sz; + int error; + + tb = (struct ta_buf_fhash *)ta_buf; + + if (tei->subtype == AF_INET) + sz = sizeof(struct fhashentry4); + else if (tei->subtype == AF_INET6) + sz = sizeof(struct fhashentry6); + else + return (EINVAL); + + ent = malloc(sz, M_IPFW_TBL, M_WAITOK | 
M_ZERO); + + error = tei_to_fhash_ent(tei, ent); + if (error != 0) { + free(ent, M_IPFW_TBL); + return (error); + } + tb->ent_ptr = ent; + + return (0); +} + +static int +ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct fhash_cfg *cfg; + struct fhashbhead *head; + struct fhashentry *ent, *tmp; + struct ta_buf_fhash *tb; + int exists; + uint32_t hash, value; + size_t sz; + + cfg = (struct fhash_cfg *)ta_state; + tb = (struct ta_buf_fhash *)ta_buf; + ent = (struct fhashentry *)tb->ent_ptr; + exists = 0; + + /* Read current value from @tei */ + ent->value = tei->value; + + head = cfg->head; + hash = hash_flow_ent(ent, cfg->size); + + if (tei->subtype == AF_INET) + sz = 2 * sizeof(struct in_addr); + else + sz = 2 * sizeof(struct in6_addr); + + /* Check for existence */ + SLIST_FOREACH(tmp, &head[hash], next) { + if (cmp_flow_ent(tmp, ent, sz) != 0) { + exists = 1; + break; + } + } + + if (exists == 1) { + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) + return (EEXIST); + /* Record already exists. Update value if we're asked to */ + /* Exchange values between tmp and @tei */ + value = tmp->value; + tmp->value = tei->value; + tei->value = value; + /* Indicate that update has happened instead of addition */ + tei->flags |= TEI_FLAGS_UPDATED; + *pnum = 0; + } else { + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) + return (EFBIG); + + SLIST_INSERT_HEAD(&head[hash], ent, next); + tb->ent_ptr = NULL; + *pnum = 1; + + /* Update counters and check if we need to grow hash */ + cfg->items++; + } + + return (0); +} + +static int +ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_fhash *tb; + + tb = (struct ta_buf_fhash *)ta_buf; + + return (tei_to_fhash_ent(tei, &tb->fe6.e)); +} + +static int +ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct fhash_cfg *cfg; + struct fhashbhead *head; + struct fhashentry *ent, *tmp; + struct ta_buf_fhash *tb; + uint32_t hash; + size_t sz; + + cfg = (struct fhash_cfg *)ta_state; + tb = (struct ta_buf_fhash *)ta_buf; + ent = &tb->fe6.e; + + head = cfg->head; + hash = hash_flow_ent(ent, cfg->size); + + if (tei->subtype == AF_INET) + sz = 2 * sizeof(struct in_addr); + else + sz = 2 * sizeof(struct in6_addr); + + /* Check for existence */ + SLIST_FOREACH(tmp, &head[hash], next) { + if (cmp_flow_ent(tmp, ent, sz) == 0) + continue; + + SLIST_REMOVE(&head[hash], tmp, fhashentry, next); + tei->value = tmp->value; + *pnum = 1; + cfg->items--; + tb->ent_ptr = tmp; + return (0); + } + + return (ENOENT); +} + +static void +ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_fhash *tb; + + tb = (struct ta_buf_fhash *)ta_buf; + + if (tb->ent_ptr != NULL) + free(tb->ent_ptr, M_IPFW_TBL); +} + +/* + * Hash growing callbacks. + */ + +static int +ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count, + uint64_t *pflags) +{ + struct fhash_cfg *cfg; + + cfg = (struct fhash_cfg *)ta_state; + + if (cfg->items > cfg->size && cfg->size < 65536) { + *pflags = cfg->size * 2; + return (1); + } + + return (0); +} + +/* + * Allocate new, larger fhash. 
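+ *
+ * Editorial note: cmp_flow_ent() deliberately inverts memcmp()
+ * conventions, returning 1 on a match and 0 otherwise; the add, del
+ * and find paths above all rely on that. It compares the 8 bytes of
+ * af/proto/ports that directly follow the SLIST linkage, then @sz
+ * bytes of addresses stored behind struct fhashentry. A hedged
+ * restatement of its logic:
+ *
+ *    // match <=> 8-byte header equal && address block equal
+ *    return (*ka == *kb && memcmp(a + 1, b + 1, sz) == 0);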
+ */ +static int +ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags) +{ + struct mod_item *mi; + struct fhashbhead *head; + int i; + + mi = (struct mod_item *)ta_buf; + + memset(mi, 0, sizeof(struct mod_item)); + mi->size = *pflags; + head = malloc(sizeof(struct fhashbhead) * mi->size, M_IPFW, + M_WAITOK | M_ZERO); + for (i = 0; i < mi->size; i++) + SLIST_INIT(&head[i]); + + mi->main_ptr = head; + + return (0); +} + +/* + * Copy data from old runtime array to new one. + */ +static int +ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t *pflags) +{ + + /* It is not possible to do a rehash if we're not holding the WLOCK. */ + return (0); +} + +/* + * Switch old & new arrays. + */ +static void +ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags) +{ + struct mod_item *mi; + struct fhash_cfg *cfg; + struct fhashbhead *old_head, *new_head; + struct fhashentry *ent, *ent_next; + int i; + uint32_t nhash; + size_t old_size; + + mi = (struct mod_item *)ta_buf; + cfg = (struct fhash_cfg *)ta_state; + + old_size = cfg->size; + old_head = ti->state; + + new_head = (struct fhashbhead *)mi->main_ptr; + for (i = 0; i < old_size; i++) { + SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { + nhash = hash_flow_ent(ent, mi->size); + SLIST_INSERT_HEAD(&new_head[nhash], ent, next); + } + } + + ti->state = new_head; + ti->data = mi->size; + cfg->head = new_head; + cfg->size = mi->size; + + mi->main_ptr = old_head; +} + +/* + * Free unneeded array. + */ +static void +ta_flush_mod_fhash(void *ta_buf) +{ + struct mod_item *mi; + + mi = (struct mod_item *)ta_buf; + if (mi->main_ptr != NULL) + free(mi->main_ptr, M_IPFW); +} + +struct table_algo flow_hash = { + .name = "flow:hash", + .type = IPFW_TABLE_FLOW, + .flags = TA_FLAG_DEFAULT, + .ta_buf_size = sizeof(struct ta_buf_fhash), + .init = ta_init_fhash, + .destroy = ta_destroy_fhash, + .prepare_add = ta_prepare_add_fhash, + .prepare_del = ta_prepare_del_fhash, + .add = ta_add_fhash, + .del = ta_del_fhash, + .flush_entry = ta_flush_fhash_entry, + .foreach = ta_foreach_fhash, + .dump_tentry = ta_dump_fhash_tentry, + .find_tentry = ta_find_fhash_tentry, + .dump_tinfo = ta_dump_fhash_tinfo, + .need_modify = ta_need_modify_fhash, + .prepare_mod = ta_prepare_mod_fhash, + .fill_mod = ta_fill_mod_fhash, + .modify = ta_modify_fhash, + .flush_mod = ta_flush_mod_fhash, +}; + +/* + * Kernel fibs bindings.
+ * + * Implementation: + * + * Runtime part: + * - fully relies on route API + * - fib number is stored in ti->data + * + */ + +static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val); +static int kfib_parse_opts(int *pfib, char *data); +static void ta_print_kfib_config(void *ta_state, struct table_info *ti, + char *buf, size_t bufsize); +static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, + struct table_info *ti, char *data, uint8_t tflags); +static void ta_destroy_kfib(void *ta_state, struct table_info *ti); +static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +static int contigmask(uint8_t *p, int len); +static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent); +static int ta_dump_kfib_tentry_int(struct sockaddr *paddr, + struct sockaddr *pmask, ipfw_obj_tentry *tent); +static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +static void ta_foreach_kfib(void *ta_state, struct table_info *ti, + ta_foreach_f *f, void *arg); + + +static int +ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ +#ifdef INET + struct nhop4_basic nh4; + struct in_addr in; +#endif +#ifdef INET6 + struct nhop6_basic nh6; +#endif + int error; + + error = ENOENT; +#ifdef INET + if (keylen == 4) { + in.s_addr = *(in_addr_t *)key; + error = fib4_lookup_nh_basic(ti->data, + in, 0, 0, &nh4); + } +#endif +#ifdef INET6 + if (keylen == 6) + error = fib6_lookup_nh_basic(ti->data, + (struct in6_addr *)key, 0, 0, 0, &nh6); +#endif + + if (error != 0) + return (0); + + *val = 0; + + return (1); +} + +/* Parse 'fib=%d' */ +static int +kfib_parse_opts(int *pfib, char *data) +{ + char *pdel, *pend, *s; + int fibnum; + + if (data == NULL) + return (0); + if ((pdel = strchr(data, ' ')) == NULL) + return (0); + while (*pdel == ' ') + pdel++; + if (strncmp(pdel, "fib=", 4) != 0) + return (EINVAL); + if ((s = strchr(pdel, ' ')) != NULL) + *s++ = '\0'; + + pdel += 4; + /* Need \d+ */ + fibnum = strtol(pdel, &pend, 10); + if (*pend != '\0') + return (EINVAL); + + *pfib = fibnum; + + return (0); +} + +static void +ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf, + size_t bufsize) +{ + + if (ti->data != 0) + snprintf(buf, bufsize, "%s fib=%lu", "addr:kfib", ti->data); + else + snprintf(buf, bufsize, "%s", "addr:kfib"); +} + +static int +ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, + char *data, uint8_t tflags) +{ + int error, fibnum; + + fibnum = 0; + if ((error = kfib_parse_opts(&fibnum, data)) != 0) + return (error); + + if (fibnum >= rt_numfibs) + return (E2BIG); + + ti->data = fibnum; + ti->lookup = ta_lookup_kfib; + + return (0); +} + +/* + * Destroys table @ti + */ +static void +ta_destroy_kfib(void *ta_state, struct table_info *ti) +{ + +} + +/* + * Provide algo-specific table info + */ +static void +ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) +{ + + tinfo->flags = IPFW_TATFLAGS_AFDATA; + tinfo->taclass4 = IPFW_TACLASS_RADIX; + tinfo->count4 = 0; + tinfo->itemsize4 = sizeof(struct rtentry); + tinfo->taclass6 = IPFW_TACLASS_RADIX; + tinfo->count6 = 0; + tinfo->itemsize6 = sizeof(struct rtentry); +} + +static int +contigmask(uint8_t *p, int len) +{ + int i, n; + + for (i = 0; i < len ; i++) + if ( (p[i/8] & (1 << (7 - (i%8)))) == 0) /* first bit unset */ + break; + for (n= i + 1; n < len; n++) + if ( (p[n/8] & (1 << (7 - (n % 8)))) 
!= 0) + return (-1); /* mask not contiguous */ + return (i); +} + + +static int +ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent) +{ + struct rtentry *rte; + + rte = (struct rtentry *)e; + + return ta_dump_kfib_tentry_int(rt_key(rte), rt_mask(rte), tent); +} + +static int +ta_dump_kfib_tentry_int(struct sockaddr *paddr, struct sockaddr *pmask, + ipfw_obj_tentry *tent) +{ +#ifdef INET + struct sockaddr_in *addr, *mask; +#endif +#ifdef INET6 + struct sockaddr_in6 *addr6, *mask6; +#endif + int len; + + len = 0; + + /* Guess IPv4/IPv6 radix by sockaddr family */ +#ifdef INET + if (paddr->sa_family == AF_INET) { + addr = (struct sockaddr_in *)paddr; + mask = (struct sockaddr_in *)pmask; + tent->k.addr.s_addr = addr->sin_addr.s_addr; + len = 32; + if (mask != NULL) + len = contigmask((uint8_t *)&mask->sin_addr, 32); + if (len == -1) + len = 0; + tent->masklen = len; + tent->subtype = AF_INET; + tent->v.kidx = 0; /* Do we need to put GW here? */ + } +#endif +#ifdef INET6 + if (paddr->sa_family == AF_INET6) { + addr6 = (struct sockaddr_in6 *)paddr; + mask6 = (struct sockaddr_in6 *)pmask; + memcpy(&tent->k, &addr6->sin6_addr, sizeof(struct in6_addr)); + len = 128; + if (mask6 != NULL) + len = contigmask((uint8_t *)&mask6->sin6_addr, 128); + if (len == -1) + len = 0; + tent->masklen = len; + tent->subtype = AF_INET6; + tent->v.kidx = 0; + } +#endif + + return (0); +} + +static int +ta_find_kfib_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent) +{ + struct rt_addrinfo info; + struct sockaddr_in6 key6, dst6, mask6; + struct sockaddr *dst, *key, *mask; + + /* Prepare sockaddr for prefix/mask and info */ + bzero(&dst6, sizeof(dst6)); + dst6.sin6_len = sizeof(dst6); + dst = (struct sockaddr *)&dst6; + bzero(&mask6, sizeof(mask6)); + mask6.sin6_len = sizeof(mask6); + mask = (struct sockaddr *)&mask6; + + bzero(&info, sizeof(info)); + info.rti_info[RTAX_DST] = dst; + info.rti_info[RTAX_NETMASK] = mask; + + /* Prepare the lookup key */ + bzero(&key6, sizeof(key6)); + key6.sin6_family = tent->subtype; + key = (struct sockaddr *)&key6; + + if (tent->subtype == AF_INET) { + ((struct sockaddr_in *)&key6)->sin_addr = tent->k.addr; + key6.sin6_len = sizeof(struct sockaddr_in); + } else { + key6.sin6_addr = tent->k.addr6; + key6.sin6_len = sizeof(struct sockaddr_in6); + } + + if (rib_lookup_info(ti->data, key, 0, 0, &info) != 0) + return (ENOENT); + if ((info.rti_addrs & RTA_NETMASK) == 0) + mask = NULL; + + ta_dump_kfib_tentry_int(dst, mask, tent); + + return (0); +} + +static void +ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg) +{ + struct rib_head *rh; + int error; + + rh = rt_tables_get_rnh(ti->data, AF_INET); + if (rh != NULL) { + RIB_RLOCK(rh); + error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg); + RIB_RUNLOCK(rh); + } + + rh = rt_tables_get_rnh(ti->data, AF_INET6); + if (rh != NULL) { + RIB_RLOCK(rh); + error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg); + RIB_RUNLOCK(rh); + } +} + +struct table_algo addr_kfib = { + .name = "addr:kfib", + .type = IPFW_TABLE_ADDR, + .flags = TA_FLAG_READONLY, + .ta_buf_size = 0, + .init = ta_init_kfib, + .destroy = ta_destroy_kfib, + .foreach = ta_foreach_kfib, + .dump_tentry = ta_dump_kfib_tentry, + .find_tentry = ta_find_kfib_tentry, + .dump_tinfo = ta_dump_kfib_tinfo, + .print_config = ta_print_kfib_config, +}; + +void +ipfw_table_algo_init(struct ip_fw_chain *ch) +{ + size_t sz; + + /* + * Register all algorithms presented here. 
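+ *
+ * Editorial note: each algorithm above is a statically initialized
+ * struct table_algo; ipfw_add_table_algo() copies it and stores the
+ * assigned index through the last argument, which is what the destroy
+ * path below passes to ipfw_del_table_algo(). A hedged sketch of
+ * wiring up one more algorithm (my_algo, ta_buf_myhash and the
+ * callbacks are hypothetical, not part of this file):
+ *
+ *    static struct table_algo my_algo = {
+ *        .name = "addr:myhash",
+ *        .type = IPFW_TABLE_ADDR,
+ *        .ta_buf_size = sizeof(struct ta_buf_myhash),
+ *        .init = ta_init_myhash,
+ *        .destroy = ta_destroy_myhash,
+ *    };
+ *    ipfw_add_table_algo(ch, &my_algo, sizeof(my_algo), &my_algo.idx);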
+ */ + sz = sizeof(struct table_algo); + ipfw_add_table_algo(ch, &addr_radix, sz, &addr_radix.idx); + ipfw_add_table_algo(ch, &addr_hash, sz, &addr_hash.idx); + ipfw_add_table_algo(ch, &iface_idx, sz, &iface_idx.idx); + ipfw_add_table_algo(ch, &number_array, sz, &number_array.idx); + ipfw_add_table_algo(ch, &flow_hash, sz, &flow_hash.idx); + ipfw_add_table_algo(ch, &addr_kfib, sz, &addr_kfib.idx); +} + +void +ipfw_table_algo_destroy(struct ip_fw_chain *ch) +{ + + ipfw_del_table_algo(ch, addr_radix.idx); + ipfw_del_table_algo(ch, addr_hash.idx); + ipfw_del_table_algo(ch, iface_idx.idx); + ipfw_del_table_algo(ch, number_array.idx); + ipfw_del_table_algo(ch, flow_hash.idx); + ipfw_del_table_algo(ch, addr_kfib.idx); +} + + diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_table_value.c b/freebsd/sys/netpfil/ipfw/ip_fw_table_value.c new file mode 100644 index 00000000..ef42e401 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/ip_fw_table_value.c @@ -0,0 +1,810 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2014 Yandex LLC + * Copyright (c) 2014 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Multi-field value support for ipfw tables. + * + * This file contains necessary functions to convert + * large multi-field values into u32 indices suitable to be fed + * to various table algorithms. Other machinery, like proper refcounting + * and internal structure resizing, is also kept here.
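+ *
+ * Editorial note: the scheme can be pictured as one shared, refcounted
+ * array of struct table_value hanging off ch->valuestate; table entries
+ * store only the 32-bit index (kidx) of their value. A hedged sketch of
+ * how a runtime lookup resolves an entry to its full value (simplified,
+ * locking omitted):
+ *
+ *    uint32_t kidx;
+ *    if (ti->lookup(ti, key, keylen, &kidx) == 1) {
+ *        struct table_value *v;
+ *        v = &((struct table_value *)ch->valuestate)[kidx];
+ *        // v->tag, v->fib, v->nh4, ... are the rule-visible fields
+ *    }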
+ */ + +#include <rtems/bsd/local/opt_ipfw.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/hash.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/rwlock.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/queue.h> +#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */ + +#include <netinet/in.h> +#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ +#include <netinet/ip_fw.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/ip_fw_table.h> + +static uint32_t hash_table_value(struct namedobj_instance *ni, const void *key, + uint32_t kopt); +static int cmp_table_value(struct named_object *no, const void *key, + uint32_t kopt); + +static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd); + +static struct ipfw_sopt_handler scodes[] = { + { IP_FW_TABLE_VLIST, 0, HDIR_GET, list_table_values }, +}; + +#define CHAIN_TO_VI(chain) (CHAIN_TO_TCFG(chain)->valhash) + +struct table_val_link +{ + struct named_object no; + struct table_value *pval; /* Pointer to real table value */ +}; +#define VALDATA_START_SIZE 64 /* Allocate 64-items array by default */ + +struct vdump_args { + struct ip_fw_chain *ch; + struct sockopt_data *sd; + struct table_value *pval; + int error; +}; + + +static uint32_t +hash_table_value(struct namedobj_instance *ni, const void *key, uint32_t kopt) +{ + + return (hash32_buf(key, 56, 0)); +} + +static int +cmp_table_value(struct named_object *no, const void *key, uint32_t kopt) +{ + + return (memcmp(((struct table_val_link *)no)->pval, key, 56)); +} + +static void +mask_table_value(struct table_value *src, struct table_value *dst, + uint32_t mask) +{ +#define _MCPY(f, b) if ((mask & (b)) != 0) { dst->f = src->f; } + + memset(dst, 0, sizeof(*dst)); + _MCPY(tag, IPFW_VTYPE_TAG); + _MCPY(pipe, IPFW_VTYPE_PIPE); + _MCPY(divert, IPFW_VTYPE_DIVERT); + _MCPY(skipto, IPFW_VTYPE_SKIPTO); + _MCPY(netgraph, IPFW_VTYPE_NETGRAPH); + _MCPY(fib, IPFW_VTYPE_FIB); + _MCPY(nat, IPFW_VTYPE_NAT); + _MCPY(dscp, IPFW_VTYPE_DSCP); + _MCPY(nh4, IPFW_VTYPE_NH4); + _MCPY(nh6, IPFW_VTYPE_NH6); + _MCPY(zoneid, IPFW_VTYPE_NH6); +#undef _MCPY +} + +static void +get_value_ptrs(struct ip_fw_chain *ch, struct table_config *tc, int vshared, + struct table_value **ptv, struct namedobj_instance **pvi) +{ + struct table_value *pval; + struct namedobj_instance *vi; + + if (vshared != 0) { + pval = (struct table_value *)ch->valuestate; + vi = CHAIN_TO_VI(ch); + } else { + pval = NULL; + vi = NULL; + //pval = (struct table_value *)&tc->ti.data; + } + + if (ptv != NULL) + *ptv = pval; + if (pvi != NULL) + *pvi = vi; +} + +/* + * Update pointers to real values after @pval change. + */ +static int +update_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg) +{ + struct vdump_args *da; + struct table_val_link *ptv; + struct table_value *pval; + + da = (struct vdump_args *)arg; + ptv = (struct table_val_link *)no; + + pval = da->pval; + ptv->pval = &pval[ptv->no.kidx]; + ptv->no.name = (char *)&pval[ptv->no.kidx]; + return (0); +} + +/* + * Grows value storage shared among all tables. + * Drops/reacquires UH locks. + * Notifies other running adds on @ch shared storage resize. + * Note that the function does not guarantee that free space + * will be available after invocation, so the caller needs + * to retry the cycle itself. + * + * Returns 0 in case of no errors.
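+ *
+ * Editorial note: the function below uses the classic
+ * unlock/allocate/relock/re-check pattern: the UH lock is dropped
+ * around the M_WAITOK allocation, so another thread may grow the
+ * storage first and the new size must be validated again before
+ * committing. The core of the idiom, as a hedged sketch:
+ *
+ *    IPFW_UH_WUNLOCK(ch);
+ *    new = malloc(val_size * sizeof(*new), M_IPFW, M_WAITOK | M_ZERO);
+ *    IPFW_UH_WLOCK(ch);
+ *    if (tcfg->val_size >= val_size)    // lost the race, discard ours
+ *        goto done;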
+ */ +static int +resize_shared_value_storage(struct ip_fw_chain *ch) +{ + struct tables_config *tcfg; + struct namedobj_instance *vi; + struct table_value *pval, *valuestate, *old_valuestate; + void *new_idx; + struct vdump_args da; + int new_blocks; + int val_size, val_size_old; + + IPFW_UH_WLOCK_ASSERT(ch); + + valuestate = NULL; + new_idx = NULL; + + pval = (struct table_value *)ch->valuestate; + vi = CHAIN_TO_VI(ch); + tcfg = CHAIN_TO_TCFG(ch); + + val_size = tcfg->val_size * 2; + + if (val_size == (1 << 30)) + return (ENOSPC); + + IPFW_UH_WUNLOCK(ch); + + valuestate = malloc(sizeof(struct table_value) * val_size, M_IPFW, + M_WAITOK | M_ZERO); + ipfw_objhash_bitmap_alloc(val_size, (void *)&new_idx, + &new_blocks); + + IPFW_UH_WLOCK(ch); + + /* + * Check if we still need to resize + */ + if (tcfg->val_size >= val_size) + goto done; + + /* Update pointers and notify everyone we're changing @ch */ + pval = (struct table_value *)ch->valuestate; + rollback_toperation_state(ch, ch); + + /* Good. Let's merge */ + memcpy(valuestate, pval, sizeof(struct table_value) * tcfg->val_size); + ipfw_objhash_bitmap_merge(CHAIN_TO_VI(ch), &new_idx, &new_blocks); + + IPFW_WLOCK(ch); + /* Change pointers */ + old_valuestate = ch->valuestate; + ch->valuestate = valuestate; + valuestate = old_valuestate; + ipfw_objhash_bitmap_swap(CHAIN_TO_VI(ch), &new_idx, &new_blocks); + + val_size_old = tcfg->val_size; + tcfg->val_size = val_size; + val_size = val_size_old; + IPFW_WUNLOCK(ch); + /* Update pointers to reflect resize */ + memset(&da, 0, sizeof(da)); + da.pval = (struct table_value *)ch->valuestate; + ipfw_objhash_foreach(vi, update_tvalue, &da); + +done: + free(valuestate, M_IPFW); + ipfw_objhash_bitmap_free(new_idx, new_blocks); + + return (0); +} + +/* + * Drops reference for table value with index @kidx, stored in @pval and + * @vi. Frees value if it has no references. + */ +static void +unref_table_value(struct namedobj_instance *vi, struct table_value *pval, + uint32_t kidx) +{ + struct table_val_link *ptvl; + + KASSERT(pval[kidx].refcnt > 0, ("Refcount is 0 on kidx %d", kidx)); + if (--pval[kidx].refcnt > 0) + return; + + /* Last reference, delete item */ + ptvl = (struct table_val_link *)ipfw_objhash_lookup_kidx(vi, kidx); + KASSERT(ptvl != NULL, ("lookup on value kidx %d failed", kidx)); + ipfw_objhash_del(vi, &ptvl->no); + ipfw_objhash_free_idx(vi, kidx); + free(ptvl, M_IPFW); +} + +struct flush_args { + struct ip_fw_chain *ch; + struct table_algo *ta; + struct table_info *ti; + void *astate; + ipfw_obj_tentry tent; +}; + +static int +unref_table_value_cb(void *e, void *arg) +{ + struct flush_args *fa; + struct ip_fw_chain *ch; + struct table_algo *ta; + ipfw_obj_tentry *tent; + int error; + + fa = (struct flush_args *)arg; + + ta = fa->ta; + memset(&fa->tent, 0, sizeof(fa->tent)); + tent = &fa->tent; + error = ta->dump_tentry(fa->astate, fa->ti, e, tent); + if (error != 0) + return (error); + + ch = fa->ch; + + unref_table_value(CHAIN_TO_VI(ch), + (struct table_value *)ch->valuestate, tent->v.kidx); + + return (0); +} + +/* + * Drop references for each value used in @tc. + */ +void +ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc, + struct table_algo *ta, void *astate, struct table_info *ti) +{ + struct flush_args fa; + + IPFW_UH_WLOCK_ASSERT(ch); + + memset(&fa, 0, sizeof(fa)); + fa.ch = ch; + fa.ta = ta; + fa.astate = astate; + fa.ti = ti; + + ta->foreach(astate, ti, unref_table_value_cb, &fa); +} + +/* + * Table operation state handler. 
+ * Called when we are going to change something in @tc which + * may lead to inconsistencies in on-going table data addition. + * + * Here we rollback all already committed state (table values, currently) + * and set "modified" field to non-zero value to indicate + * that we need to restart original operation. + */ +void +rollback_table_values(struct tableop_state *ts) +{ + struct ip_fw_chain *ch; + struct table_value *pval; + struct tentry_info *ptei; + struct namedobj_instance *vi; + int i; + + ch = ts->ch; + + IPFW_UH_WLOCK_ASSERT(ch); + + /* Get current table value pointer */ + get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi); + + for (i = 0; i < ts->count; i++) { + ptei = &ts->tei[i]; + + if (ptei->value == 0) + continue; + + unref_table_value(vi, pval, ptei->value); + } +} + +/* + * Allocate new value index in either shared or per-table array. + * Function may drop/reacquire UH lock. + * + * Returns 0 on success. + */ +static int +alloc_table_vidx(struct ip_fw_chain *ch, struct tableop_state *ts, + struct namedobj_instance *vi, uint16_t *pvidx) +{ + int error, vlimit; + uint16_t vidx; + + IPFW_UH_WLOCK_ASSERT(ch); + + error = ipfw_objhash_alloc_idx(vi, &vidx); + if (error != 0) { + + /* + * We need to resize array. This involves + * lock/unlock, so we need to check "modified" + * state. + */ + ts->opstate.func(ts->tc, &ts->opstate); + error = resize_shared_value_storage(ch); + return (error); /* ts->modified should be set, we will restart */ + } + + vlimit = ts->ta->vlimit; + if (vlimit != 0 && vidx >= vlimit) { + + /* + * Algorithm is not able to store given index. + * We have to rollback state, start using + * per-table value array or return error + * if we're already using it. + * + * TODO: do not rollback state if + * atomicity is not required. + */ + if (ts->vshared != 0) { + /* shared -> per-table */ + return (ENOSPC); /* TODO: proper error */ + } + + /* per-table. Fail for now. */ + return (ENOSPC); /* TODO: proper error */ + } + + *pvidx = vidx; + return (0); +} + +/* + * Drops value reference for unused values (updates, deletes, partially + * successful adds or rollbacks). + */ +void +ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc, + struct tentry_info *tei, uint32_t count, int rollback) +{ + int i; + struct tentry_info *ptei; + struct table_value *pval; + struct namedobj_instance *vi; + + /* + * We have two slightly different ADD cases here: + * either (1) we are successful / partially successful, + * in that case we need + * * to ignore ADDED entries values + * * rollback every other values (either UPDATED since + * old value has been stored there, or some failure like + * EXISTS or LIMIT or simply "ignored" case. + * + * (2): atomic rollback of partially successful operation + * in that case we simply need to unref all entries. + * + * DELETE case is simpler: no atomic support there, so + * we simply unref all non-zero values. + */ + + /* + * Get current table value pointers. + * XXX: Properly read vshared + */ + get_value_ptrs(ch, tc, 1, &pval, &vi); + + for (i = 0; i < count; i++) { + ptei = &tei[i]; + + if (ptei->value == 0) { + + /* + * We may be deleting non-existing record. + * Skip. + */ + continue; + } + + if ((ptei->flags & TEI_FLAGS_ADDED) != 0 && rollback == 0) { + ptei->value = 0; + continue; + } + + unref_table_value(vi, pval, ptei->value); + ptei->value = 0; + } +} + +/* + * Main function used to link values of entries going to be added, + * to the index. 
Since we may perform many UH lock drops/acquires,
+ * handle changes by checking the tablestate "modified" field.
+ *
+ * Success: return 0.
+ */
+int
+ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts)
+{
+	int error, i, found;
+	struct namedobj_instance *vi;
+	struct table_config *tc;
+	struct tentry_info *tei, *ptei;
+	uint32_t count, vlimit;
+	uint16_t vidx;
+	struct table_val_link *ptv;
+	struct table_value tval, *pval;
+
+	/*
+	 * Stage 1: reference all existing values and
+	 * save their indices.
+	 */
+	IPFW_UH_WLOCK_ASSERT(ch);
+	get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
+
+	error = 0;
+	found = 0;
+	vlimit = ts->ta->vlimit;
+	vidx = 0;
+	tc = ts->tc;
+	tei = ts->tei;
+	count = ts->count;
+	for (i = 0; i < count; i++) {
+		ptei = &tei[i];
+		ptei->value = 0; /* Ensure value is always 0 in the beginning */
+		mask_table_value(ptei->pvalue, &tval, ts->vmask);
+		ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
+		    (char *)&tval);
+		if (ptv == NULL)
+			continue;
+		/* Deal with vlimit later */
+		if (vlimit > 0 && vlimit <= ptv->no.kidx)
+			continue;
+
+		/* Value found. Bump refcount */
+		ptv->pval->refcnt++;
+		ptei->value = ptv->no.kidx;
+		found++;
+	}
+
+	if (ts->count == found) {
+		/* We've found all values, no need to create new ones */
+		return (0);
+	}
+
+	/*
+	 * We have added some state here; let's attach the operation
+	 * state to the list to be able to roll back if necessary.
+	 */
+	add_toperation_state(ch, ts);
+	/* Ensure table won't disappear */
+	tc_ref(tc);
+	IPFW_UH_WUNLOCK(ch);
+
+	/*
+	 * Stage 2: allocate objects for non-existing values.
+	 */
+	for (i = 0; i < count; i++) {
+		ptei = &tei[i];
+		if (ptei->value != 0)
+			continue;
+		if (ptei->ptv != NULL)
+			continue;
+		ptei->ptv = malloc(sizeof(struct table_val_link), M_IPFW,
+		    M_WAITOK | M_ZERO);
+	}
+
+	/*
+	 * Stage 3: allocate index numbers for new values
+	 * and link them to the index.
+	 */
+	IPFW_UH_WLOCK(ch);
+	tc_unref(tc);
+	del_toperation_state(ch, ts);
+	if (ts->modified != 0) {
+
+		/*
+		 * In general, we should free all state/indexes here
+		 * and return. However, we keep the allocated state instead
+		 * to ensure we achieve some progress on each restart.
+		 */
+		return (0);
+	}
+
+	KASSERT(pval == ch->valuestate, ("resize_storage() notify failure"));
+
+	/* Let's try to link values */
+	for (i = 0; i < count; i++) {
+		ptei = &tei[i];
+
+		/* Check if the record has appeared */
+		mask_table_value(ptei->pvalue, &tval, ts->vmask);
+		ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
+		    (char *)&tval);
+		if (ptv != NULL) {
+			ptv->pval->refcnt++;
+			ptei->value = ptv->no.kidx;
+			continue;
+		}
+
+		/* May perform UH unlock/lock */
+		error = alloc_table_vidx(ch, ts, vi, &vidx);
+		if (error != 0) {
+			ts->opstate.func(ts->tc, &ts->opstate);
+			return (error);
+		}
+		/* Value storage resize has happened, return */
+		if (ts->modified != 0)
+			return (0);
+
+		/* Finally, we have allocated a valid index, let's add the entry */
+		ptei->value = vidx;
+		ptv = (struct table_val_link *)ptei->ptv;
+		ptei->ptv = NULL;
+
+		ptv->no.kidx = vidx;
+		ptv->no.name = (char *)&pval[vidx];
+		ptv->pval = &pval[vidx];
+		memcpy(ptv->pval, &tval, sizeof(struct table_value));
+		pval[vidx].refcnt = 1;
+		ipfw_objhash_add(vi, &ptv->no);
+	}
+
+	return (0);
+}
+
+/*
+ * Compatibility function used to import data from old
+ * IP_FW_TABLE_ADD / IP_FW_TABLE_XADD opcodes.
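+ *
+ * The single legacy u32 is simply fanned out to every field, so e.g.
+ * (illustration only):
+ *
+ *	struct table_value v;
+ *	ipfw_import_table_value_legacy(42, &v);
+ *	/* now v.tag == v.pipe == v.fib == ... == v.limit == 42 */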
+ */ +void +ipfw_import_table_value_legacy(uint32_t value, struct table_value *v) +{ + + memset(v, 0, sizeof(*v)); + v->tag = value; + v->pipe = value; + v->divert = value; + v->skipto = value; + v->netgraph = value; + v->fib = value; + v->nat = value; + v->nh4 = value; /* host format */ + v->dscp = value; + v->limit = value; +} + +/* + * Export data to legacy table dumps opcodes. + */ +uint32_t +ipfw_export_table_value_legacy(struct table_value *v) +{ + + /* + * TODO: provide more compatibility depending on + * vmask value. + */ + return (v->tag); +} + +/* + * Imports table value from current userland format. + * Saves value in kernel format to the same place. + */ +void +ipfw_import_table_value_v1(ipfw_table_value *iv) +{ + struct table_value v; + + memset(&v, 0, sizeof(v)); + v.tag = iv->tag; + v.pipe = iv->pipe; + v.divert = iv->divert; + v.skipto = iv->skipto; + v.netgraph = iv->netgraph; + v.fib = iv->fib; + v.nat = iv->nat; + v.dscp = iv->dscp; + v.nh4 = iv->nh4; + v.nh6 = iv->nh6; + v.limit = iv->limit; + v.zoneid = iv->zoneid; + + memcpy(iv, &v, sizeof(ipfw_table_value)); +} + +/* + * Export real table value @v to current userland format. + * Note that @v and @piv may point to the same memory. + */ +void +ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *piv) +{ + ipfw_table_value iv; + + memset(&iv, 0, sizeof(iv)); + iv.tag = v->tag; + iv.pipe = v->pipe; + iv.divert = v->divert; + iv.skipto = v->skipto; + iv.netgraph = v->netgraph; + iv.fib = v->fib; + iv.nat = v->nat; + iv.dscp = v->dscp; + iv.limit = v->limit; + iv.nh4 = v->nh4; + iv.nh6 = v->nh6; + iv.zoneid = v->zoneid; + + memcpy(piv, &iv, sizeof(iv)); +} + +/* + * Exports real value data into ipfw_table_value structure. + * Utilizes "spare1" field to store kernel index. 
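+ *
+ * E.g. (editor's sketch) the dumped record for kernel index 7 is the
+ * value blob itself with v->spare1 == 7, which lets userland correlate
+ * table entries (stored by index) with the values listed by this dump.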
+ */ +static int +dump_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg) +{ + struct vdump_args *da; + struct table_val_link *ptv; + struct table_value *v; + + da = (struct vdump_args *)arg; + ptv = (struct table_val_link *)no; + + v = (struct table_value *)ipfw_get_sopt_space(da->sd, sizeof(*v)); + /* Out of memory, returning */ + if (v == NULL) { + da->error = ENOMEM; + return (ENOMEM); + } + + memcpy(v, ptv->pval, sizeof(*v)); + v->spare1 = ptv->no.kidx; + return (0); +} + +/* + * Dumps all shared/table value data + * Data layout (v1)(current): + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size + * Reply: [ ipfw_obj_lheader ipfw_table_value x N ] + * + * Returns 0 on success + */ +static int +list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct _ipfw_obj_lheader *olh; + struct namedobj_instance *vi; + struct vdump_args da; + uint32_t count, size; + + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); + if (olh == NULL) + return (EINVAL); + if (sd->valsize < olh->size) + return (EINVAL); + + IPFW_UH_RLOCK(ch); + vi = CHAIN_TO_VI(ch); + + count = ipfw_objhash_count(vi); + size = count * sizeof(ipfw_table_value) + sizeof(ipfw_obj_lheader); + + /* Fill in header regadless of buffer size */ + olh->count = count; + olh->objsize = sizeof(ipfw_table_value); + + if (size > olh->size) { + olh->size = size; + IPFW_UH_RUNLOCK(ch); + return (ENOMEM); + } + olh->size = size; + + /* + * Do the actual value dump + */ + memset(&da, 0, sizeof(da)); + da.ch = ch; + da.sd = sd; + ipfw_objhash_foreach(vi, dump_tvalue, &da); + + IPFW_UH_RUNLOCK(ch); + + return (0); +} + +void +ipfw_table_value_init(struct ip_fw_chain *ch, int first) +{ + struct tables_config *tcfg; + + ch->valuestate = malloc(VALDATA_START_SIZE * sizeof(struct table_value), + M_IPFW, M_WAITOK | M_ZERO); + + tcfg = ch->tblcfg; + + tcfg->val_size = VALDATA_START_SIZE; + tcfg->valhash = ipfw_objhash_create(tcfg->val_size); + ipfw_objhash_set_funcs(tcfg->valhash, hash_table_value, + cmp_table_value); + + IPFW_ADD_SOPT_HANDLER(first, scodes); +} + +static int +destroy_value(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + + free(no, M_IPFW); + return (0); +} + +void +ipfw_table_value_destroy(struct ip_fw_chain *ch, int last) +{ + + IPFW_DEL_SOPT_HANDLER(last, scodes); + + free(ch->valuestate, M_IPFW); + ipfw_objhash_foreach(CHAIN_TO_VI(ch), destroy_value, ch); + ipfw_objhash_destroy(CHAIN_TO_VI(ch)); +} + diff --git a/freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.c b/freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.c new file mode 100644 index 00000000..03ca9599 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.c @@ -0,0 +1,131 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2015-2016 Yandex LLC + * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip_var.h> +#include <netinet/ip_fw.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/nat64/ip_fw_nat64.h> +#include <netpfil/ipfw/nat64/nat64_translate.h> + + +int nat64_debug = 0; +SYSCTL_DECL(_net_inet_ip_fw); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, nat64_debug, CTLFLAG_RW, + &nat64_debug, 0, "Debug level for NAT64 module"); + +int nat64_allow_private = 0; +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, nat64_allow_private, CTLFLAG_RW, + &nat64_allow_private, 0, + "Allow use of non-global IPv4 addresses with NAT64"); + +static int +vnet_ipfw_nat64_init(const void *arg __unused) +{ + struct ip_fw_chain *ch; + int first, error; + + ch = &V_layer3_chain; + first = IS_DEFAULT_VNET(curvnet) ? 1: 0; + error = nat64stl_init(ch, first); + if (error != 0) + return (error); + error = nat64lsn_init(ch, first); + if (error != 0) { + nat64stl_uninit(ch, first); + return (error); + } + return (0); +} + +static int +vnet_ipfw_nat64_uninit(const void *arg __unused) +{ + struct ip_fw_chain *ch; + int last; + + ch = &V_layer3_chain; + last = IS_DEFAULT_VNET(curvnet) ? 1: 0; + nat64stl_uninit(ch, last); + nat64lsn_uninit(ch, last); + return (0); +} + +static int +ipfw_nat64_modevent(module_t mod, int type, void *unused) +{ + + switch (type) { + case MOD_LOAD: + case MOD_UNLOAD: + break; + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t ipfw_nat64_mod = { + "ipfw_nat64", + ipfw_nat64_modevent, + 0 +}; + +/* Define startup order. 
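+ * (Editor's note, summarizing the constants below: the modevent handler
+ * is ordered right after ipfw at SI_ORDER_ANY - 128, the module itself
+ * one step later, and the per-VNET init/uninit hooks one step after
+ * that, all within the same SI_SUB_PROTO_IFATTACHDOMAIN stage.)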
*/ +#define IPFW_NAT64_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN +#define IPFW_NAT64_MODEVENT_ORDER (SI_ORDER_ANY - 128) /* after ipfw */ +#define IPFW_NAT64_MODULE_ORDER (IPFW_NAT64_MODEVENT_ORDER + 1) +#define IPFW_NAT64_VNET_ORDER (IPFW_NAT64_MODEVENT_ORDER + 2) + +DECLARE_MODULE(ipfw_nat64, ipfw_nat64_mod, IPFW_NAT64_SI_SUB_FIREWALL, + SI_ORDER_ANY); +MODULE_DEPEND(ipfw_nat64, ipfw, 3, 3, 3); +MODULE_VERSION(ipfw_nat64, 1); + +VNET_SYSINIT(vnet_ipfw_nat64_init, IPFW_NAT64_SI_SUB_FIREWALL, + IPFW_NAT64_VNET_ORDER, vnet_ipfw_nat64_init, NULL); +VNET_SYSUNINIT(vnet_ipfw_nat64_uninit, IPFW_NAT64_SI_SUB_FIREWALL, + IPFW_NAT64_VNET_ORDER, vnet_ipfw_nat64_uninit, NULL); diff --git a/freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.h b/freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.h new file mode 100644 index 00000000..1d2bb774 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/nat64/ip_fw_nat64.h @@ -0,0 +1,117 @@ +/*- + * Copyright (c) 2015-2016 Yandex LLC + * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IP_FW_NAT64_H_ +#define _IP_FW_NAT64_H_ + +#define DPRINTF(mask, fmt, ...) 
\ + if (nat64_debug & (mask)) \ + printf("NAT64: %s: " fmt "\n", __func__, ## __VA_ARGS__) +#define DP_GENERIC 0x0001 +#define DP_OBJ 0x0002 +#define DP_JQUEUE 0x0004 +#define DP_STATE 0x0008 +#define DP_DROPS 0x0010 +#define DP_ALL 0xFFFF +extern int nat64_debug; + +#if 0 +#define NAT64NOINLINE __noinline +#else +#define NAT64NOINLINE +#endif + +int nat64stl_init(struct ip_fw_chain *ch, int first); +void nat64stl_uninit(struct ip_fw_chain *ch, int last); +int nat64lsn_init(struct ip_fw_chain *ch, int first); +void nat64lsn_uninit(struct ip_fw_chain *ch, int last); + +struct ip_fw_nat64_stats { + counter_u64_t opcnt64; /* 6to4 of packets translated */ + counter_u64_t opcnt46; /* 4to6 of packets translated */ + counter_u64_t ofrags; /* number of fragments generated */ + counter_u64_t ifrags; /* number of fragments received */ + counter_u64_t oerrors; /* number of output errors */ + counter_u64_t noroute4; + counter_u64_t noroute6; + counter_u64_t nomatch4; /* No addr/port match */ + counter_u64_t noproto; /* Protocol not supported */ + counter_u64_t nomem; /* mbufs allocation failed */ + counter_u64_t dropped; /* number of packets silently + * dropped due to some errors/ + * unsupported/etc. + */ + + counter_u64_t jrequests; /* number of jobs requests queued */ + counter_u64_t jcalls; /* number of jobs handler calls */ + counter_u64_t jhostsreq; /* number of hosts requests */ + counter_u64_t jportreq; + counter_u64_t jhostfails; + counter_u64_t jportfails; + counter_u64_t jmaxlen; + counter_u64_t jnomem; + counter_u64_t jreinjected; + + counter_u64_t screated; + counter_u64_t sdeleted; + counter_u64_t spgcreated; + counter_u64_t spgdeleted; +}; + +#define IPFW_NAT64_VERSION 1 +#define NAT64STATS (sizeof(struct ip_fw_nat64_stats) / sizeof(uint64_t)) +typedef struct _nat64_stats_block { + counter_u64_t stats[NAT64STATS]; +} nat64_stats_block; +#define NAT64STAT_ADD(s, f, v) \ + counter_u64_add((s)->stats[ \ + offsetof(struct ip_fw_nat64_stats, f) / sizeof(uint64_t)], (v)) +#define NAT64STAT_INC(s, f) NAT64STAT_ADD(s, f, 1) +#define NAT64STAT_FETCH(s, f) \ + counter_u64_fetch((s)->stats[ \ + offsetof(struct ip_fw_nat64_stats, f) / sizeof(uint64_t)]) + +#define L3HDR(_ip, _t) ((_t)((u_int32_t *)(_ip) + (_ip)->ip_hl)) +#define TCP(p) ((struct tcphdr *)(p)) +#define UDP(p) ((struct udphdr *)(p)) +#define ICMP(p) ((struct icmphdr *)(p)) +#define ICMP6(p) ((struct icmp6_hdr *)(p)) + +#define NAT64SKIP 0 +#define NAT64RETURN 1 +#define NAT64MFREE -1 + +/* Well-known prefix 64:ff9b::/96 */ +#define IPV6_ADDR_INT32_WKPFX htonl(0x64ff9b) +#define IN6_IS_ADDR_WKPFX(a) \ + ((a)->s6_addr32[0] == IPV6_ADDR_INT32_WKPFX && \ + (a)->s6_addr32[1] == 0 && (a)->s6_addr32[2] == 0) + +#endif + diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64_translate.c b/freebsd/sys/netpfil/ipfw/nat64/nat64_translate.c new file mode 100644 index 00000000..d2507674 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/nat64/nat64_translate.c @@ -0,0 +1,1574 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2015-2016 Yandex LLC + * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <rtems/bsd/local/opt_ipfw.h> + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/counter.h> +#include <rtems/bsd/sys/errno.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/rmlock.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/queue.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_pflog.h> +#include <net/pfil.h> +#include <net/netisr.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_fw.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/ip_icmp.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet6/in6_var.h> +#include <netinet6/ip6_var.h> + +#include <netpfil/pf/pf.h> +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/nat64/ip_fw_nat64.h> +#include <netpfil/ipfw/nat64/nat64_translate.h> +#include <machine/in_cksum.h> + +static void +nat64_log(struct pfloghdr *logdata, struct mbuf *m, sa_family_t family) +{ + + logdata->dir = PF_OUT; + logdata->af = family; + ipfw_bpf_mtap2(logdata, PFLOG_HDRLEN, m); +} +#ifdef IPFIREWALL_NAT64_DIRECT_OUTPUT +static NAT64NOINLINE struct sockaddr* nat64_find_route4(struct route *ro, + in_addr_t dest, struct mbuf *m); +static NAT64NOINLINE struct sockaddr* nat64_find_route6(struct route_in6 *ro, + struct in6_addr *dest, struct mbuf *m); + +static NAT64NOINLINE int +nat64_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro, nat64_stats_block *stats, + void *logdata) +{ + int error; + + if (logdata != NULL) + nat64_log(logdata, m, dst->sa_family); + error = (*ifp->if_output)(ifp, m, dst, ro); + if (error != 0) + NAT64STAT_INC(stats, oerrors); + return (error); +} + +static NAT64NOINLINE int +nat64_output_one(struct mbuf *m, nat64_stats_block *stats, void *logdata) +{ + struct route_in6 ro6; + struct route ro4, *ro; + struct sockaddr *dst; + struct ifnet *ifp; + struct ip6_hdr *ip6; + struct ip *ip4; + int error; + + ip4 = mtod(m, struct ip *); + switch (ip4->ip_v) { + case IPVERSION: + ro = &ro4; + dst = nat64_find_route4(&ro4, ip4->ip_dst.s_addr, m); + if (dst == NULL) + NAT64STAT_INC(stats, noroute4); + break; + case (IPV6_VERSION >> 4): + ip6 = (struct ip6_hdr *)ip4; + ro = (struct route *)&ro6; + dst = nat64_find_route6(&ro6, &ip6->ip6_dst, m); + if (dst == NULL) + NAT64STAT_INC(stats, noroute6); + break; + default: + m_freem(m); + 
NAT64STAT_INC(stats, dropped); + DPRINTF(DP_DROPS, "dropped due to unknown IP version"); + return (EAFNOSUPPORT); + } + if (dst == NULL) { + FREE_ROUTE(ro); + m_freem(m); + return (EHOSTUNREACH); + } + if (logdata != NULL) + nat64_log(logdata, m, dst->sa_family); + ifp = ro->ro_rt->rt_ifp; + error = (*ifp->if_output)(ifp, m, dst, ro); + if (error != 0) + NAT64STAT_INC(stats, oerrors); + FREE_ROUTE(ro); + return (error); +} +#else /* !IPFIREWALL_NAT64_DIRECT_OUTPUT */ +static NAT64NOINLINE int +nat64_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro, nat64_stats_block *stats, + void *logdata) +{ + struct ip *ip4; + int ret, af; + + ip4 = mtod(m, struct ip *); + switch (ip4->ip_v) { + case IPVERSION: + af = AF_INET; + ret = NETISR_IP; + break; + case (IPV6_VERSION >> 4): + af = AF_INET6; + ret = NETISR_IPV6; + break; + default: + m_freem(m); + NAT64STAT_INC(stats, dropped); + DPRINTF(DP_DROPS, "unknown IP version"); + return (EAFNOSUPPORT); + } + if (logdata != NULL) + nat64_log(logdata, m, af); + ret = netisr_queue(ret, m); + if (ret != 0) + NAT64STAT_INC(stats, oerrors); + return (ret); +} + +static NAT64NOINLINE int +nat64_output_one(struct mbuf *m, nat64_stats_block *stats, void *logdata) +{ + + return (nat64_output(NULL, m, NULL, NULL, stats, logdata)); +} +#endif /* !IPFIREWALL_NAT64_DIRECT_OUTPUT */ + + +#if 0 +void print_ipv6_header(struct ip6_hdr *ip6, char *buf, size_t bufsize); + +void +print_ipv6_header(struct ip6_hdr *ip6, char *buf, size_t bufsize) +{ + char sbuf[INET6_ADDRSTRLEN], dbuf[INET6_ADDRSTRLEN]; + + inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf)); + inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf)); + snprintf(buf, bufsize, "%s -> %s %d", sbuf, dbuf, ip6->ip6_nxt); +} + + +static NAT64NOINLINE int +nat64_embed_ip4(struct nat64_cfg *cfg, in_addr_t ia, struct in6_addr *ip6) +{ + + /* assume the prefix is properly filled with zeros */ + bcopy(&cfg->prefix, ip6, sizeof(*ip6)); + switch (cfg->plen) { + case 32: + case 96: + ip6->s6_addr32[cfg->plen / 32] = ia; + break; + case 40: + case 48: + case 56: +#if BYTE_ORDER == BIG_ENDIAN + ip6->s6_addr32[1] = cfg->prefix.s6_addr32[1] | + (ia >> (cfg->plen % 32)); + ip6->s6_addr32[2] = ia << (24 - cfg->plen % 32); +#elif BYTE_ORDER == LITTLE_ENDIAN + ip6->s6_addr32[1] = cfg->prefix.s6_addr32[1] | + (ia << (cfg->plen % 32)); + ip6->s6_addr32[2] = ia >> (24 - cfg->plen % 32); +#endif + break; + case 64: +#if BYTE_ORDER == BIG_ENDIAN + ip6->s6_addr32[2] = ia >> 8; + ip6->s6_addr32[3] = ia << 24; +#elif BYTE_ORDER == LITTLE_ENDIAN + ip6->s6_addr32[2] = ia << 8; + ip6->s6_addr32[3] = ia >> 24; +#endif + break; + default: + return (0); + }; + ip6->s6_addr8[8] = 0; + return (1); +} + +static NAT64NOINLINE in_addr_t +nat64_extract_ip4(struct in6_addr *ip6, int plen) +{ + in_addr_t ia; + + /* + * According to RFC 6052 p2.2: + * IPv4-embedded IPv6 addresses are composed of a variable-length + * prefix, the embedded IPv4 address, and a variable length suffix. + * The suffix bits are reserved for future extensions and SHOULD + * be set to zero. 
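+ *
+ * Worked example (editor's illustration): with the well-known /96
+ * prefix 64:ff9b::/96, embedding 192.0.2.1 (0xc0000201) fills
+ * s6_addr32[3], giving 64:ff9b::c000:201.  For a /40 prefix the same
+ * four octets straddle s6_addr32[1] and s6_addr32[2], while bits 64..71
+ * (the "u" octet, s6_addr8[8]) stay zero.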
+ */ + switch (plen) { + case 32: + if (ip6->s6_addr32[3] != 0 || ip6->s6_addr32[2] != 0) + goto badip6; + break; + case 40: + if (ip6->s6_addr32[3] != 0 || + (ip6->s6_addr32[2] & htonl(0xff00ffff)) != 0) + goto badip6; + break; + case 48: + if (ip6->s6_addr32[3] != 0 || + (ip6->s6_addr32[2] & htonl(0xff0000ff)) != 0) + goto badip6; + break; + case 56: + if (ip6->s6_addr32[3] != 0 || ip6->s6_addr8[8] != 0) + goto badip6; + break; + case 64: + if (ip6->s6_addr8[8] != 0 || + (ip6->s6_addr32[3] & htonl(0x00ffffff)) != 0) + goto badip6; + }; + switch (plen) { + case 32: + case 96: + ia = ip6->s6_addr32[plen / 32]; + break; + case 40: + case 48: + case 56: +#if BYTE_ORDER == BIG_ENDIAN + ia = (ip6->s6_addr32[1] << (plen % 32)) | + (ip6->s6_addr32[2] >> (24 - plen % 32)); +#elif BYTE_ORDER == LITTLE_ENDIAN + ia = (ip6->s6_addr32[1] >> (plen % 32)) | + (ip6->s6_addr32[2] << (24 - plen % 32)); +#endif + break; + case 64: +#if BYTE_ORDER == BIG_ENDIAN + ia = (ip6->s6_addr32[2] << 8) | (ip6->s6_addr32[3] >> 24); +#elif BYTE_ORDER == LITTLE_ENDIAN + ia = (ip6->s6_addr32[2] >> 8) | (ip6->s6_addr32[3] << 24); +#endif + break; + default: + return (0); + }; + if (nat64_check_ip4(ia) != 0 || + nat64_check_private_ip4(ia) != 0) + goto badip4; + + return (ia); +badip4: + DPRINTF(DP_GENERIC, "invalid destination address: %08x", ia); + return (0); +badip6: + DPRINTF(DP_GENERIC, "invalid IPv4-embedded IPv6 address"); + return (0); +} +#endif + +/* + * According to RFC 1624 the equation for incremental checksum update is: + * HC' = ~(~HC + ~m + m') -- [Eqn. 3] + * HC' = HC - ~m - m' -- [Eqn. 4] + * So, when we are replacing IPv4 addresses to IPv6, we + * can assume, that new bytes previously were zeros, and vise versa - + * when we replacing IPv6 addresses to IPv4, now unused bytes become + * zeros. The payload length in pseudo header has bigger size, but one + * half of it should be zero. Using the equation 4 we get: + * HC' = HC - (~m0 + m0') -- m0 is first changed word + * HC' = (HC - (~m0 + m0')) - (~m1 + m1') -- m1 is second changed word + * HC' = HC - ~m0 - m0' - ~m1 - m1' - ... = + * = HC - sum(~m[i] + m'[i]) + * + * The function result should be used as follows: + * IPv6 to IPv4: HC' = cksum_add(HC, result) + * IPv4 to IPv6: HC' = cksum_add(HC, ~result) + */ +static NAT64NOINLINE uint16_t +nat64_cksum_convert(struct ip6_hdr *ip6, struct ip *ip) +{ + uint32_t sum; + uint16_t *p; + + sum = ~ip->ip_src.s_addr >> 16; + sum += ~ip->ip_src.s_addr & 0xffff; + sum += ~ip->ip_dst.s_addr >> 16; + sum += ~ip->ip_dst.s_addr & 0xffff; + + for (p = (uint16_t *)&ip6->ip6_src; + p < (uint16_t *)(&ip6->ip6_src + 2); p++) + sum += *p; + + while (sum >> 16) + sum = (sum & 0xffff) + (sum >> 16); + return (sum); +} + +#if __FreeBSD_version < 1100000 +#define ip_fillid(ip) (ip)->ip_id = ip_newid() +#endif +static NAT64NOINLINE void +nat64_init_ip4hdr(const struct ip6_hdr *ip6, const struct ip6_frag *frag, + uint16_t plen, uint8_t proto, struct ip *ip) +{ + + /* assume addresses are already initialized */ + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + ip->ip_tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + ip->ip_len = htons(sizeof(*ip) + plen); +#ifdef IPFIREWALL_NAT64_DIRECT_OUTPUT + ip->ip_ttl = ip6->ip6_hlim - IPV6_HLIMDEC; +#else + /* Forwarding code will decrement TTL. */ + ip->ip_ttl = ip6->ip6_hlim; +#endif + ip->ip_sum = 0; + ip->ip_p = (proto == IPPROTO_ICMPV6) ? 
IPPROTO_ICMP: proto; + ip_fillid(ip); + if (frag != NULL) { + ip->ip_off = htons(ntohs(frag->ip6f_offlg) >> 3); + if (frag->ip6f_offlg & IP6F_MORE_FRAG) + ip->ip_off |= htons(IP_MF); + } else { + ip->ip_off = htons(IP_DF); + } + ip->ip_sum = in_cksum_hdr(ip); +} + +#define FRAGSZ(mtu) ((mtu) - sizeof(struct ip6_hdr) - sizeof(struct ip6_frag)) +static NAT64NOINLINE int +nat64_fragment6(nat64_stats_block *stats, struct ip6_hdr *ip6, struct mbufq *mq, + struct mbuf *m, uint32_t mtu, uint16_t ip_id, uint16_t ip_off) +{ + struct ip6_frag ip6f; + struct mbuf *n; + uint16_t hlen, len, offset; + int plen; + + plen = ntohs(ip6->ip6_plen); + hlen = sizeof(struct ip6_hdr); + + /* Fragmentation isn't needed */ + if (ip_off == 0 && plen <= mtu - hlen) { + M_PREPEND(m, hlen, M_NOWAIT); + if (m == NULL) { + NAT64STAT_INC(stats, nomem); + return (ENOMEM); + } + bcopy(ip6, mtod(m, void *), hlen); + if (mbufq_enqueue(mq, m) != 0) { + m_freem(m); + NAT64STAT_INC(stats, dropped); + DPRINTF(DP_DROPS, "dropped due to mbufq overflow"); + return (ENOBUFS); + } + return (0); + } + + hlen += sizeof(struct ip6_frag); + ip6f.ip6f_reserved = 0; + ip6f.ip6f_nxt = ip6->ip6_nxt; + ip6->ip6_nxt = IPPROTO_FRAGMENT; + if (ip_off != 0) { + /* + * We have got an IPv4 fragment. + * Use offset value and ip_id from original fragment. + */ + ip6f.ip6f_ident = htonl(ntohs(ip_id)); + offset = (ntohs(ip_off) & IP_OFFMASK) << 3; + NAT64STAT_INC(stats, ifrags); + } else { + /* The packet size exceeds interface MTU */ + ip6f.ip6f_ident = htonl(ip6_randomid()); + offset = 0; /* First fragment*/ + } + while (plen > 0 && m != NULL) { + n = NULL; + len = FRAGSZ(mtu) & ~7; + if (len > plen) + len = plen; + ip6->ip6_plen = htons(len + sizeof(ip6f)); + ip6f.ip6f_offlg = ntohs(offset); + if (len < plen || (ip_off & htons(IP_MF)) != 0) + ip6f.ip6f_offlg |= IP6F_MORE_FRAG; + offset += len; + plen -= len; + if (plen > 0) { + n = m_split(m, len, M_NOWAIT); + if (n == NULL) + goto fail; + } + M_PREPEND(m, hlen, M_NOWAIT); + if (m == NULL) + goto fail; + bcopy(ip6, mtod(m, void *), sizeof(struct ip6_hdr)); + bcopy(&ip6f, mtodo(m, sizeof(struct ip6_hdr)), + sizeof(struct ip6_frag)); + if (mbufq_enqueue(mq, m) != 0) + goto fail; + m = n; + } + NAT64STAT_ADD(stats, ofrags, mbufq_len(mq)); + return (0); +fail: + if (m != NULL) + m_freem(m); + if (n != NULL) + m_freem(n); + mbufq_drain(mq); + NAT64STAT_INC(stats, nomem); + return (ENOMEM); +} + +#if __FreeBSD_version < 1100000 +#define rt_expire rt_rmx.rmx_expire +#define rt_mtu rt_rmx.rmx_mtu +#endif +static NAT64NOINLINE struct sockaddr* +nat64_find_route6(struct route_in6 *ro, struct in6_addr *dest, struct mbuf *m) +{ + struct sockaddr_in6 *dst; + struct rtentry *rt; + + bzero(ro, sizeof(*ro)); + dst = (struct sockaddr_in6 *)&ro->ro_dst; + dst->sin6_family = AF_INET6; + dst->sin6_len = sizeof(*dst); + dst->sin6_addr = *dest; + IN6_LOOKUP_ROUTE(ro, M_GETFIB(m)); + rt = ro->ro_rt; + if (rt && (rt->rt_flags & RTF_UP) && + (rt->rt_ifp->if_flags & IFF_UP) && + (rt->rt_ifp->if_drv_flags & IFF_DRV_RUNNING)) { + if (rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in6 *)rt->rt_gateway; + } else + return (NULL); + if (((rt->rt_flags & RTF_REJECT) && + (rt->rt_expire == 0 || + time_uptime < rt->rt_expire)) || + rt->rt_ifp->if_link_state == LINK_STATE_DOWN) + return (NULL); + return ((struct sockaddr *)dst); +} + +#define NAT64_ICMP6_PLEN 64 +static NAT64NOINLINE void +nat64_icmp6_reflect(struct mbuf *m, uint8_t type, uint8_t code, uint32_t mtu, + nat64_stats_block *stats, void *logdata) +{ + struct 
icmp6_hdr *icmp6; + struct ip6_hdr *ip6, *oip6; + struct mbuf *n; + int len, plen; + + len = 0; + plen = nat64_getlasthdr(m, &len); + if (plen < 0) { + DPRINTF(DP_DROPS, "mbuf isn't contigious"); + goto freeit; + } + /* + * Do not send ICMPv6 in reply to ICMPv6 errors. + */ + if (plen == IPPROTO_ICMPV6) { + if (m->m_len < len + sizeof(*icmp6)) { + DPRINTF(DP_DROPS, "mbuf isn't contigious"); + goto freeit; + } + icmp6 = mtodo(m, len); + if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST || + icmp6->icmp6_type == ND_REDIRECT) { + DPRINTF(DP_DROPS, "do not send ICMPv6 in reply to " + "ICMPv6 errors"); + goto freeit; + } + } + /* + if (icmp6_ratelimit(&ip6->ip6_src, type, code)) + goto freeit; + */ + ip6 = mtod(m, struct ip6_hdr *); + switch (type) { + case ICMP6_DST_UNREACH: + case ICMP6_PACKET_TOO_BIG: + case ICMP6_TIME_EXCEEDED: + case ICMP6_PARAM_PROB: + break; + default: + goto freeit; + } + /* Calculate length of ICMPv6 payload */ + len = (m->m_pkthdr.len > NAT64_ICMP6_PLEN) ? NAT64_ICMP6_PLEN: + m->m_pkthdr.len; + + /* Create new ICMPv6 datagram */ + plen = len + sizeof(struct icmp6_hdr); + n = m_get2(sizeof(struct ip6_hdr) + plen + max_hdr, M_NOWAIT, + MT_HEADER, M_PKTHDR); + if (n == NULL) { + NAT64STAT_INC(stats, nomem); + m_freem(m); + return; + } + /* + * Move pkthdr from original mbuf. We should have initialized some + * fields, because we can reinject this mbuf to netisr and it will + * go trough input path (it requires at least rcvif should be set). + * Also do M_ALIGN() to reduce chances of need to allocate new mbuf + * in the chain, when we will do M_PREPEND() or make some type of + * tunneling. + */ + m_move_pkthdr(n, m); + M_ALIGN(n, sizeof(struct ip6_hdr) + plen + max_hdr); + + n->m_len = n->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; + oip6 = mtod(n, struct ip6_hdr *); + oip6->ip6_src = ip6->ip6_dst; + oip6->ip6_dst = ip6->ip6_src; + oip6->ip6_nxt = IPPROTO_ICMPV6; + oip6->ip6_flow = 0; + oip6->ip6_vfc |= IPV6_VERSION; + oip6->ip6_hlim = V_ip6_defhlim; + oip6->ip6_plen = htons(plen); + + icmp6 = mtodo(n, sizeof(struct ip6_hdr)); + icmp6->icmp6_cksum = 0; + icmp6->icmp6_type = type; + icmp6->icmp6_code = code; + icmp6->icmp6_mtu = htonl(mtu); + + m_copydata(m, 0, len, mtodo(n, sizeof(struct ip6_hdr) + + sizeof(struct icmp6_hdr))); + icmp6->icmp6_cksum = in6_cksum(n, IPPROTO_ICMPV6, + sizeof(struct ip6_hdr), plen); + m_freem(m); + nat64_output_one(n, stats, logdata); + return; +freeit: + NAT64STAT_INC(stats, dropped); + m_freem(m); +} + +static NAT64NOINLINE struct sockaddr* +nat64_find_route4(struct route *ro, in_addr_t dest, struct mbuf *m) +{ + struct sockaddr_in *dst; + struct rtentry *rt; + + bzero(ro, sizeof(*ro)); + dst = (struct sockaddr_in *)&ro->ro_dst; + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr.s_addr = dest; + IN_LOOKUP_ROUTE(ro, M_GETFIB(m)); + rt = ro->ro_rt; + if (rt && (rt->rt_flags & RTF_UP) && + (rt->rt_ifp->if_flags & IFF_UP) && + (rt->rt_ifp->if_drv_flags & IFF_DRV_RUNNING)) { + if (rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *)rt->rt_gateway; + } else + return (NULL); + if (((rt->rt_flags & RTF_REJECT) && + (rt->rt_expire == 0 || + time_uptime < rt->rt_expire)) || + rt->rt_ifp->if_link_state == LINK_STATE_DOWN) + return (NULL); + return ((struct sockaddr *)dst); +} + +#define NAT64_ICMP_PLEN 64 +static NAT64NOINLINE void +nat64_icmp_reflect(struct mbuf *m, uint8_t type, + uint8_t code, uint16_t mtu, nat64_stats_block *stats, void *logdata) +{ + struct icmp *icmp; + struct ip *ip, *oip; + struct mbuf *n; + int len, 
plen; + + ip = mtod(m, struct ip *); + /* Do not send ICMP error if packet is not the first fragment */ + if (ip->ip_off & ~ntohs(IP_MF|IP_DF)) { + DPRINTF(DP_DROPS, "not first fragment"); + goto freeit; + } + /* Do not send ICMP in reply to ICMP errors */ + if (ip->ip_p == IPPROTO_ICMP) { + if (m->m_len < (ip->ip_hl << 2)) { + DPRINTF(DP_DROPS, "mbuf isn't contigious"); + goto freeit; + } + icmp = mtodo(m, ip->ip_hl << 2); + if (!ICMP_INFOTYPE(icmp->icmp_type)) { + DPRINTF(DP_DROPS, "do not send ICMP in reply to " + "ICMP errors"); + goto freeit; + } + } + switch (type) { + case ICMP_UNREACH: + case ICMP_TIMXCEED: + case ICMP_PARAMPROB: + break; + default: + goto freeit; + } + /* Calculate length of ICMP payload */ + len = (m->m_pkthdr.len > NAT64_ICMP_PLEN) ? (ip->ip_hl << 2) + 8: + m->m_pkthdr.len; + + /* Create new ICMPv4 datagram */ + plen = len + sizeof(struct icmphdr) + sizeof(uint32_t); + n = m_get2(sizeof(struct ip) + plen + max_hdr, M_NOWAIT, + MT_HEADER, M_PKTHDR); + if (n == NULL) { + NAT64STAT_INC(stats, nomem); + m_freem(m); + return; + } + m_move_pkthdr(n, m); + M_ALIGN(n, sizeof(struct ip) + plen + max_hdr); + + n->m_len = n->m_pkthdr.len = sizeof(struct ip) + plen; + oip = mtod(n, struct ip *); + oip->ip_v = IPVERSION; + oip->ip_hl = sizeof(struct ip) >> 2; + oip->ip_tos = 0; + oip->ip_len = htons(n->m_pkthdr.len); + oip->ip_ttl = V_ip_defttl; + oip->ip_p = IPPROTO_ICMP; + ip_fillid(oip); + oip->ip_off = htons(IP_DF); + oip->ip_src = ip->ip_dst; + oip->ip_dst = ip->ip_src; + oip->ip_sum = 0; + oip->ip_sum = in_cksum_hdr(oip); + + icmp = mtodo(n, sizeof(struct ip)); + icmp->icmp_type = type; + icmp->icmp_code = code; + icmp->icmp_cksum = 0; + icmp->icmp_pmvoid = 0; + icmp->icmp_nextmtu = htons(mtu); + m_copydata(m, 0, len, mtodo(n, sizeof(struct ip) + + sizeof(struct icmphdr) + sizeof(uint32_t))); + icmp->icmp_cksum = in_cksum_skip(n, sizeof(struct ip) + plen, + sizeof(struct ip)); + m_freem(m); + nat64_output_one(n, stats, logdata); + return; +freeit: + NAT64STAT_INC(stats, dropped); + m_freem(m); +} + +/* Translate ICMP echo request/reply into ICMPv6 */ +static void +nat64_icmp_handle_echo(struct ip6_hdr *ip6, struct icmp6_hdr *icmp6, + uint16_t id, uint8_t type) +{ + uint16_t old; + + old = *(uint16_t *)icmp6; /* save type+code in one word */ + icmp6->icmp6_type = type; + /* Reflect ICMPv6 -> ICMPv4 type translation in the cksum */ + icmp6->icmp6_cksum = cksum_adjust(icmp6->icmp6_cksum, + old, *(uint16_t *)icmp6); + if (id != 0) { + old = icmp6->icmp6_id; + icmp6->icmp6_id = id; + /* Reflect ICMP id translation in the cksum */ + icmp6->icmp6_cksum = cksum_adjust(icmp6->icmp6_cksum, + old, id); + } + /* Reflect IPv6 pseudo header in the cksum */ + icmp6->icmp6_cksum = ~in6_cksum_pseudo(ip6, ntohs(ip6->ip6_plen), + IPPROTO_ICMPV6, ~icmp6->icmp6_cksum); +} + +static NAT64NOINLINE struct mbuf * +nat64_icmp_translate(struct mbuf *m, struct ip6_hdr *ip6, uint16_t icmpid, + int offset, nat64_stats_block *stats) +{ + struct ip ip; + struct icmp *icmp; + struct tcphdr *tcp; + struct udphdr *udp; + struct ip6_hdr *eip6; + struct mbuf *n; + uint32_t mtu; + int len, hlen, plen; + uint8_t type, code; + + if (m->m_len < offset + ICMP_MINLEN) + m = m_pullup(m, offset + ICMP_MINLEN); + if (m == NULL) { + NAT64STAT_INC(stats, nomem); + return (m); + } + mtu = 0; + icmp = mtodo(m, offset); + /* RFC 7915 p4.2 */ + switch (icmp->icmp_type) { + case ICMP_ECHOREPLY: + type = ICMP6_ECHO_REPLY; + code = 0; + break; + case ICMP_UNREACH: + type = ICMP6_DST_UNREACH; + switch (icmp->icmp_code) { + 
case ICMP_UNREACH_NET: + case ICMP_UNREACH_HOST: + case ICMP_UNREACH_SRCFAIL: + case ICMP_UNREACH_NET_UNKNOWN: + case ICMP_UNREACH_HOST_UNKNOWN: + case ICMP_UNREACH_TOSNET: + case ICMP_UNREACH_TOSHOST: + code = ICMP6_DST_UNREACH_NOROUTE; + break; + case ICMP_UNREACH_PROTOCOL: + type = ICMP6_PARAM_PROB; + code = ICMP6_PARAMPROB_NEXTHEADER; + break; + case ICMP_UNREACH_PORT: + code = ICMP6_DST_UNREACH_NOPORT; + break; + case ICMP_UNREACH_NEEDFRAG: + type = ICMP6_PACKET_TOO_BIG; + code = 0; + /* XXX: needs an additional look */ + mtu = max(IPV6_MMTU, ntohs(icmp->icmp_nextmtu) + 20); + break; + case ICMP_UNREACH_NET_PROHIB: + case ICMP_UNREACH_HOST_PROHIB: + case ICMP_UNREACH_FILTER_PROHIB: + case ICMP_UNREACH_PRECEDENCE_CUTOFF: + code = ICMP6_DST_UNREACH_ADMIN; + break; + default: + DPRINTF(DP_DROPS, "Unsupported ICMP type %d, code %d", + icmp->icmp_type, icmp->icmp_code); + goto freeit; + } + break; + case ICMP_TIMXCEED: + type = ICMP6_TIME_EXCEEDED; + code = icmp->icmp_code; + break; + case ICMP_ECHO: + type = ICMP6_ECHO_REQUEST; + code = 0; + break; + case ICMP_PARAMPROB: + type = ICMP6_PARAM_PROB; + switch (icmp->icmp_code) { + case ICMP_PARAMPROB_ERRATPTR: + case ICMP_PARAMPROB_LENGTH: + code = ICMP6_PARAMPROB_HEADER; + switch (icmp->icmp_pptr) { + case 0: /* Version/IHL */ + case 1: /* Type Of Service */ + mtu = icmp->icmp_pptr; + break; + case 2: /* Total Length */ + case 3: mtu = 4; /* Payload Length */ + break; + case 8: /* Time to Live */ + mtu = 7; /* Hop Limit */ + break; + case 9: /* Protocol */ + mtu = 6; /* Next Header */ + break; + case 12: /* Source address */ + case 13: + case 14: + case 15: + mtu = 8; + break; + case 16: /* Destination address */ + case 17: + case 18: + case 19: + mtu = 24; + break; + default: /* Silently drop */ + DPRINTF(DP_DROPS, "Unsupported ICMP type %d," + " code %d, pptr %d", icmp->icmp_type, + icmp->icmp_code, icmp->icmp_pptr); + goto freeit; + } + break; + default: + DPRINTF(DP_DROPS, "Unsupported ICMP type %d," + " code %d, pptr %d", icmp->icmp_type, + icmp->icmp_code, icmp->icmp_pptr); + goto freeit; + } + break; + default: + DPRINTF(DP_DROPS, "Unsupported ICMP type %d, code %d", + icmp->icmp_type, icmp->icmp_code); + goto freeit; + } + /* + * For echo request/reply we can use original payload, + * but we need adjust icmp_cksum, because ICMPv6 cksum covers + * IPv6 pseudo header and ICMPv6 types differs from ICMPv4. + */ + if (type == ICMP6_ECHO_REQUEST || type == ICMP6_ECHO_REPLY) { + nat64_icmp_handle_echo(ip6, ICMP6(icmp), icmpid, type); + return (m); + } + /* + * For other types of ICMP messages we need to translate inner + * IPv4 header to IPv6 header. + * Assume ICMP src is the same as payload dst + * E.g. we have ( GWsrc1 , NATIP1 ) in outer header + * and ( NATIP1, Hostdst1 ) in ICMP copy header. + * In that case, we already have map for NATIP1 and GWsrc1. + * The only thing we need is to copy IPv6 map prefix to + * Hostdst1. 
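+ *
+ * I.e. (sketch): the translations of NATIP1 and GWsrc1 are already
+ * known from the outer header, so only Hostdst1 has to be formed, by
+ * writing its IPv4 address into the low 32 bits of the single /96
+ * mapping prefix (nat64_set_ip4() below).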
+ */ + hlen = offset + ICMP_MINLEN; + if (m->m_pkthdr.len < hlen + sizeof(struct ip) + ICMP_MINLEN) { + DPRINTF(DP_DROPS, "Message is too short %d", + m->m_pkthdr.len); + goto freeit; + } + m_copydata(m, hlen, sizeof(struct ip), (char *)&ip); + if (ip.ip_v != IPVERSION) { + DPRINTF(DP_DROPS, "Wrong IP version %d", ip.ip_v); + goto freeit; + } + hlen += ip.ip_hl << 2; /* Skip inner IP header */ + if (nat64_check_ip4(ip.ip_src.s_addr) != 0 || + nat64_check_ip4(ip.ip_dst.s_addr) != 0 || + nat64_check_private_ip4(ip.ip_src.s_addr) != 0 || + nat64_check_private_ip4(ip.ip_dst.s_addr) != 0) { + DPRINTF(DP_DROPS, "IP addresses checks failed %04x -> %04x", + ntohl(ip.ip_src.s_addr), ntohl(ip.ip_dst.s_addr)); + goto freeit; + } + if (m->m_pkthdr.len < hlen + ICMP_MINLEN) { + DPRINTF(DP_DROPS, "Message is too short %d", + m->m_pkthdr.len); + goto freeit; + } +#if 0 + /* + * Check that inner source matches the outer destination. + * XXX: We need some method to convert IPv4 into IPv6 address here, + * and compare IPv6 addresses. + */ + if (ip.ip_src.s_addr != nat64_get_ip4(&ip6->ip6_dst)) { + DPRINTF(DP_GENERIC, "Inner source doesn't match destination ", + "%04x vs %04x", ip.ip_src.s_addr, + nat64_get_ip4(&ip6->ip6_dst)); + goto freeit; + } +#endif + /* + * Create new mbuf for ICMPv6 datagram. + * NOTE: len is data length just after inner IP header. + */ + len = m->m_pkthdr.len - hlen; + if (sizeof(struct ip6_hdr) + + sizeof(struct icmp6_hdr) + len > NAT64_ICMP6_PLEN) + len = NAT64_ICMP6_PLEN - sizeof(struct icmp6_hdr) - + sizeof(struct ip6_hdr); + plen = sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr) + len; + n = m_get2(offset + plen + max_hdr, M_NOWAIT, MT_HEADER, M_PKTHDR); + if (n == NULL) { + NAT64STAT_INC(stats, nomem); + m_freem(m); + return (NULL); + } + m_move_pkthdr(n, m); + M_ALIGN(n, offset + plen + max_hdr); + n->m_len = n->m_pkthdr.len = offset + plen; + /* Adjust ip6_plen in outer header */ + ip6->ip6_plen = htons(plen); + /* Construct new inner IPv6 header */ + eip6 = mtodo(n, offset + sizeof(struct icmp6_hdr)); + eip6->ip6_src = ip6->ip6_dst; + /* Use the fact that we have single /96 prefix for IPv4 map */ + eip6->ip6_dst = ip6->ip6_src; + nat64_set_ip4(&eip6->ip6_dst, ip.ip_dst.s_addr); + + eip6->ip6_flow = htonl(ip.ip_tos << 20); + eip6->ip6_vfc |= IPV6_VERSION; + eip6->ip6_hlim = ip.ip_ttl; + eip6->ip6_plen = htons(ntohs(ip.ip_len) - (ip.ip_hl << 2)); + eip6->ip6_nxt = (ip.ip_p == IPPROTO_ICMP) ? IPPROTO_ICMPV6: ip.ip_p; + m_copydata(m, hlen, len, (char *)(eip6 + 1)); + /* + * We need to translate source port in the inner ULP header, + * and adjust ULP checksum. + */ + switch (ip.ip_p) { + case IPPROTO_TCP: + if (len < offsetof(struct tcphdr, th_sum)) + break; + tcp = TCP(eip6 + 1); + if (icmpid != 0) { + tcp->th_sum = cksum_adjust(tcp->th_sum, + tcp->th_sport, icmpid); + tcp->th_sport = icmpid; + } + tcp->th_sum = cksum_add(tcp->th_sum, + ~nat64_cksum_convert(eip6, &ip)); + break; + case IPPROTO_UDP: + if (len < offsetof(struct udphdr, uh_sum)) + break; + udp = UDP(eip6 + 1); + if (icmpid != 0) { + udp->uh_sum = cksum_adjust(udp->uh_sum, + udp->uh_sport, icmpid); + udp->uh_sport = icmpid; + } + udp->uh_sum = cksum_add(udp->uh_sum, + ~nat64_cksum_convert(eip6, &ip)); + break; + case IPPROTO_ICMP: + /* + * Check if this is an ICMP error message for echo request + * that we sent. I.e. ULP in the data containing invoking + * packet is IPPROTO_ICMP and its type is ICMP_ECHO. 
+ */
+	icmp = (struct icmp *)(eip6 + 1);
+	if (icmp->icmp_type != ICMP_ECHO) {
+		m_freem(n);
+		goto freeit;
+	}
+	/*
+	 * For our client this original datagram should look
+	 * like an ICMPv6 datagram with type ICMP6_ECHO_REQUEST.
+	 * Thus we need to adjust icmp_cksum and convert the type from
+	 * ICMP_ECHO to ICMP6_ECHO_REQUEST.
+	 */
+	nat64_icmp_handle_echo(eip6, ICMP6(icmp), icmpid,
+	    ICMP6_ECHO_REQUEST);
+	}
+	m_freem(m);
+	/* Convert ICMPv4 into ICMPv6 header */
+	icmp = mtodo(n, offset);
+	ICMP6(icmp)->icmp6_type = type;
+	ICMP6(icmp)->icmp6_code = code;
+	ICMP6(icmp)->icmp6_mtu = htonl(mtu);
+	ICMP6(icmp)->icmp6_cksum = 0;
+	ICMP6(icmp)->icmp6_cksum = cksum_add(
+	    ~in6_cksum_pseudo(ip6, plen, IPPROTO_ICMPV6, 0),
+	    in_cksum_skip(n, n->m_pkthdr.len, offset));
+	return (n);
+freeit:
+	m_freem(m);
+	NAT64STAT_INC(stats, dropped);
+	return (NULL);
+}
+
+int
+nat64_getlasthdr(struct mbuf *m, int *offset)
+{
+	struct ip6_hdr *ip6;
+	struct ip6_hbh *hbh;
+	int proto, hlen;
+
+	if (offset != NULL)
+		hlen = *offset;
+	else
+		hlen = 0;
+
+	if (m->m_len < hlen + sizeof(*ip6))
+		return (-1);
+
+	ip6 = mtodo(m, hlen);
+	hlen += sizeof(*ip6);
+	proto = ip6->ip6_nxt;
+	/* Skip extension headers */
+	while (proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING ||
+	    proto == IPPROTO_DSTOPTS) {
+		hbh = mtodo(m, hlen);
+		/*
+		 * We expect the mbuf to have contiguous data up to
+		 * the upper-level header.
+		 */
+		if (m->m_len < hlen)
+			return (-1);
+		/*
+		 * We don't support the Jumbo Payload option,
+		 * so return an error.
+		 */
+		if (proto == IPPROTO_HOPOPTS && ip6->ip6_plen == 0)
+			return (-1);
+		proto = hbh->ip6h_nxt;
+		hlen += hbh->ip6h_len << 3;
+	}
+	if (offset != NULL)
+		*offset = hlen;
+	return (proto);
+}
+
+int
+nat64_do_handle_ip4(struct mbuf *m, struct in6_addr *saddr,
+    struct in6_addr *daddr, uint16_t lport, nat64_stats_block *stats,
+    void *logdata)
+{
+	struct route_in6 ro;
+	struct ip6_hdr ip6;
+	struct ifnet *ifp;
+	struct ip *ip;
+	struct mbufq mq;
+	struct sockaddr *dst;
+	uint32_t mtu;
+	uint16_t ip_id, ip_off;
+	uint16_t *csum;
+	int plen, hlen;
+	uint8_t proto;
+
+	ip = mtod(m, struct ip*);
+
+	if (ip->ip_ttl <= IPTTLDEC) {
+		nat64_icmp_reflect(m, ICMP_TIMXCEED,
+		    ICMP_TIMXCEED_INTRANS, 0, stats, logdata);
+		return (NAT64RETURN);
+	}
+
+	ip6.ip6_dst = *daddr;
+	ip6.ip6_src = *saddr;
+
+	hlen = ip->ip_hl << 2;
+	plen = ntohs(ip->ip_len) - hlen;
+	proto = ip->ip_p;
+
+	/* Save ip_id and ip_off, both are in network byte order */
+	ip_id = ip->ip_id;
+	ip_off = ip->ip_off & htons(IP_OFFMASK | IP_MF);
+
+	/* Fragment length must be a multiple of 8 octets */
+	if ((ip->ip_off & htons(IP_MF)) != 0 && (plen & 0x7) != 0) {
+		nat64_icmp_reflect(m, ICMP_PARAMPROB,
+		    ICMP_PARAMPROB_LENGTH, 0, stats, logdata);
+		return (NAT64RETURN);
+	}
+	/* Fragmented ICMP is unsupported */
+	if (proto == IPPROTO_ICMP && ip_off != 0) {
+		DPRINTF(DP_DROPS, "dropped due to fragmented ICMP");
+		NAT64STAT_INC(stats, dropped);
+		return (NAT64MFREE);
+	}
+
+	dst = nat64_find_route6(&ro, &ip6.ip6_dst, m);
+	if (dst == NULL) {
+		FREE_ROUTE(&ro);
+		NAT64STAT_INC(stats, noroute6);
+		nat64_icmp_reflect(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0,
+		    stats, logdata);
+		return (NAT64RETURN);
+	}
+	ifp = ro.ro_rt->rt_ifp;
+	if (ro.ro_rt->rt_mtu != 0)
+		mtu = min(ro.ro_rt->rt_mtu, ifp->if_mtu);
+	else
+		mtu = ifp->if_mtu;
+	if (mtu < plen + sizeof(ip6) && (ip->ip_off & htons(IP_DF)) != 0) {
+		FREE_ROUTE(&ro);
+		nat64_icmp_reflect(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
+		    FRAGSZ(mtu) + sizeof(struct ip), stats, logdata);
+		return (NAT64RETURN);
+	}
+
+	
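+	/*
+	 * Route and MTU are known; what follows builds the IPv6 header,
+	 * patches the transport checksums incrementally (via
+	 * nat64_cksum_convert()) and finally fragments and queues the
+	 * result.
+	 */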
ip6.ip6_flow = htonl(ip->ip_tos << 20); + ip6.ip6_vfc |= IPV6_VERSION; +#ifdef IPFIREWALL_NAT64_DIRECT_OUTPUT + ip6.ip6_hlim = ip->ip_ttl - IPTTLDEC; +#else + /* Forwarding code will decrement HLIM. */ + ip6.ip6_hlim = ip->ip_ttl; +#endif + ip6.ip6_plen = htons(plen); + ip6.ip6_nxt = (proto == IPPROTO_ICMP) ? IPPROTO_ICMPV6: proto; + /* Convert checksums. */ + switch (proto) { + case IPPROTO_TCP: + csum = &TCP(mtodo(m, hlen))->th_sum; + if (lport != 0) { + struct tcphdr *tcp = TCP(mtodo(m, hlen)); + *csum = cksum_adjust(*csum, tcp->th_dport, lport); + tcp->th_dport = lport; + } + *csum = cksum_add(*csum, ~nat64_cksum_convert(&ip6, ip)); + break; + case IPPROTO_UDP: + csum = &UDP(mtodo(m, hlen))->uh_sum; + if (lport != 0) { + struct udphdr *udp = UDP(mtodo(m, hlen)); + *csum = cksum_adjust(*csum, udp->uh_dport, lport); + udp->uh_dport = lport; + } + *csum = cksum_add(*csum, ~nat64_cksum_convert(&ip6, ip)); + break; + case IPPROTO_ICMP: + m = nat64_icmp_translate(m, &ip6, lport, hlen, stats); + if (m == NULL) { + FREE_ROUTE(&ro); + /* stats already accounted */ + return (NAT64RETURN); + } + } + + m_adj(m, hlen); + mbufq_init(&mq, 255); + nat64_fragment6(stats, &ip6, &mq, m, mtu, ip_id, ip_off); + while ((m = mbufq_dequeue(&mq)) != NULL) { + if (nat64_output(ifp, m, dst, (struct route *)&ro, stats, + logdata) != 0) + break; + NAT64STAT_INC(stats, opcnt46); + } + mbufq_drain(&mq); + FREE_ROUTE(&ro); + return (NAT64RETURN); +} + +int +nat64_handle_icmp6(struct mbuf *m, int hlen, uint32_t aaddr, uint16_t aport, + nat64_stats_block *stats, void *logdata) +{ + struct ip ip; + struct icmp6_hdr *icmp6; + struct ip6_frag *ip6f; + struct ip6_hdr *ip6, *ip6i; + uint32_t mtu; + int plen, proto; + uint8_t type, code; + + if (hlen == 0) { + ip6 = mtod(m, struct ip6_hdr *); + if (nat64_check_ip6(&ip6->ip6_src) != 0 || + nat64_check_ip6(&ip6->ip6_dst) != 0) + return (NAT64SKIP); + + proto = nat64_getlasthdr(m, &hlen); + if (proto != IPPROTO_ICMPV6) { + DPRINTF(DP_DROPS, + "dropped due to mbuf isn't contigious"); + NAT64STAT_INC(stats, dropped); + return (NAT64MFREE); + } + } + + /* + * Translate ICMPv6 type and code to ICMPv4 (RFC7915). + * NOTE: ICMPv6 echo handled by nat64_do_handle_ip6(). + */ + icmp6 = mtodo(m, hlen); + mtu = 0; + switch (icmp6->icmp6_type) { + case ICMP6_DST_UNREACH: + type = ICMP_UNREACH; + switch (icmp6->icmp6_code) { + case ICMP6_DST_UNREACH_NOROUTE: + case ICMP6_DST_UNREACH_BEYONDSCOPE: + case ICMP6_DST_UNREACH_ADDR: + code = ICMP_UNREACH_HOST; + break; + case ICMP6_DST_UNREACH_ADMIN: + code = ICMP_UNREACH_HOST_PROHIB; + break; + case ICMP6_DST_UNREACH_NOPORT: + code = ICMP_UNREACH_PORT; + break; + default: + DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d," + " code %d", icmp6->icmp6_type, + icmp6->icmp6_code); + NAT64STAT_INC(stats, dropped); + return (NAT64MFREE); + } + break; + case ICMP6_PACKET_TOO_BIG: + type = ICMP_UNREACH; + code = ICMP_UNREACH_NEEDFRAG; + mtu = ntohl(icmp6->icmp6_mtu); + if (mtu < IPV6_MMTU) { + DPRINTF(DP_DROPS, "Wrong MTU %d in ICMPv6 type %d," + " code %d", mtu, icmp6->icmp6_type, + icmp6->icmp6_code); + NAT64STAT_INC(stats, dropped); + return (NAT64MFREE); + } + /* + * Adjust MTU to reflect difference between + * IPv6 an IPv4 headers. 
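+ *
+ * The difference is sizeof(struct ip6_hdr) - sizeof(struct ip) =
+ * 40 - 20 = 20 bytes, so e.g. an ICMPv6-reported MTU of 1500 becomes
+ * 1480 on the IPv4 side.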
+ */
+ mtu -= sizeof(struct ip6_hdr) - sizeof(struct ip);
+ break;
+ case ICMP6_TIME_EXCEED_TRANSIT:
+ type = ICMP_TIMXCEED;
+ code = ICMP_TIMXCEED_INTRANS;
+ break;
+ case ICMP6_PARAM_PROB:
+ switch (icmp6->icmp6_code) {
+ case ICMP6_PARAMPROB_HEADER:
+ type = ICMP_PARAMPROB;
+ code = ICMP_PARAMPROB_ERRATPTR;
+ mtu = ntohl(icmp6->icmp6_pptr);
+ switch (mtu) {
+ case 0: /* Version/Traffic Class */
+ case 1: /* Traffic Class/Flow Label */
+ break;
+ case 4: /* Payload Length */
+ case 5:
+ mtu = 2;
+ break;
+ case 6: /* Next Header */
+ mtu = 9;
+ break;
+ case 7: /* Hop Limit */
+ mtu = 8;
+ break;
+ default:
+ if (mtu >= 8 && mtu <= 23) {
+ mtu = 12; /* Source address */
+ break;
+ }
+ if (mtu >= 24 && mtu <= 39) {
+ mtu = 16; /* Destination address */
+ break;
+ }
+ DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d,"
+ " code %d, pptr %d", icmp6->icmp6_type,
+ icmp6->icmp6_code, mtu);
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ break;
+ case ICMP6_PARAMPROB_NEXTHEADER:
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_PROTOCOL;
+ break;
+ default:
+ DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d,"
+ " code %d, pptr %d", icmp6->icmp6_type,
+ icmp6->icmp6_code, ntohl(icmp6->icmp6_pptr));
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ break;
+ default:
+ DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d, code %d",
+ icmp6->icmp6_type, icmp6->icmp6_code);
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+
+ hlen += sizeof(struct icmp6_hdr);
+ if (m->m_pkthdr.len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) {
+ NAT64STAT_INC(stats, dropped);
+ DPRINTF(DP_DROPS, "Message is too short %d",
+ m->m_pkthdr.len);
+ return (NAT64MFREE);
+ }
+ /*
+ * We need at least ICMP_MINLEN bytes of the original datagram payload
+ * to generate an ICMP message. It is nice that ICMP_MINLEN is equal
+ * to sizeof(struct ip6_frag). So, if the embedded datagram had a
+ * fragment header, we will not have to do m_pullup() again.
+ *
+ * What we have here:
+ * Outer header: (IPv6iGW, v4mapPRefix+v4exthost)
+ * Inner header: (v4mapPRefix+v4host, IPv6iHost) [sport, dport]
+ * We need to translate it to:
+ *
+ * Outer header: (alias_host, v4exthost)
+ * Inner header: (v4exthost, alias_host) [sport, alias_port]
+ *
+ * Assume the caller function has checked that v4mapPRefix+v4host
+ * matches the configured prefix.
+ * The only two things we should be provided with are the mapping
+ * between IPv6iHost <> alias_host and between dport and alias_port.
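 * (Editor's illustration of the pointer remapping earlier in this
 * function: an ICMPv6 Parameter Problem whose pptr points at the Hop
 * Limit byte, offset 7 of the IPv6 header, is reported to the IPv4
 * client as offset 8, the TTL byte of the IPv4 header; pointers into
 * the 16-byte IPv6 source and destination addresses collapse to
 * offsets 12 and 16, the IPv4 address fields.)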
+ */
+ if (m->m_len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN)
+ m = m_pullup(m, hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN);
+ if (m == NULL) {
+ NAT64STAT_INC(stats, nomem);
+ return (NAT64RETURN);
+ }
+ ip6 = mtod(m, struct ip6_hdr *);
+ ip6i = mtodo(m, hlen);
+ ip6f = NULL;
+ proto = ip6i->ip6_nxt;
+ plen = ntohs(ip6i->ip6_plen);
+ hlen += sizeof(struct ip6_hdr);
+ if (proto == IPPROTO_FRAGMENT) {
+ if (m->m_pkthdr.len < hlen + sizeof(struct ip6_frag) +
+ ICMP_MINLEN)
+ goto fail;
+ ip6f = mtodo(m, hlen);
+ proto = ip6f->ip6f_nxt;
+ plen -= sizeof(struct ip6_frag);
+ hlen += sizeof(struct ip6_frag);
+ /* Adjust MTU to reflect frag header size */
+ if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG)
+ mtu -= sizeof(struct ip6_frag);
+ }
+ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
+ DPRINTF(DP_DROPS, "Unsupported proto %d in the inner header",
+ proto);
+ goto fail;
+ }
+ if (nat64_check_ip6(&ip6i->ip6_src) != 0 ||
+ nat64_check_ip6(&ip6i->ip6_dst) != 0) {
+ DPRINTF(DP_DROPS, "Inner addresses do not pass the check");
+ goto fail;
+ }
+ /* Check if outer dst is the same as inner src */
+ if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6i->ip6_src)) {
+ DPRINTF(DP_DROPS, "Inner src doesn't match outer dst");
+ goto fail;
+ }
+
+ /* Now we need to make a fake IPv4 packet to generate an ICMP message */
+ ip.ip_dst.s_addr = aaddr;
+ ip.ip_src.s_addr = nat64_get_ip4(&ip6i->ip6_src);
+ /* XXX: Make fake ulp header */
+#ifdef IPFIREWALL_NAT64_DIRECT_OUTPUT
+ ip6i->ip6_hlim += IPV6_HLIMDEC; /* init_ip4hdr will decrement it */
+#endif
+ nat64_init_ip4hdr(ip6i, ip6f, plen, proto, &ip);
+ m_adj(m, hlen - sizeof(struct ip));
+ bcopy(&ip, mtod(m, void *), sizeof(ip));
+ nat64_icmp_reflect(m, type, code, (uint16_t)mtu, stats, logdata);
+ return (NAT64RETURN);
+fail:
+ /*
+ * We must call m_freem() here because the mbuf pointer could
+ * have been changed by m_pullup().
+ */
+ m_freem(m);
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64RETURN);
+}
+
+int
+nat64_do_handle_ip6(struct mbuf *m, uint32_t aaddr, uint16_t aport,
+ nat64_stats_block *stats, void *logdata)
+{
+ struct route ro;
+ struct ip ip;
+ struct ifnet *ifp;
+ struct ip6_frag *frag;
+ struct ip6_hdr *ip6;
+ struct icmp6_hdr *icmp6;
+ struct sockaddr *dst;
+ uint16_t *csum;
+ uint32_t mtu;
+ int plen, hlen, proto;
+
+ /*
+ * XXX: we expect ipfw_chk() did m_pullup() up to the upper level
+ * protocol's headers. Also we skip some checks that ip6_input(),
+ * ip6_forward(), ip6_fastfwd() and ipfw_chk() already did.
+ */
+ ip6 = mtod(m, struct ip6_hdr *);
+ if (nat64_check_ip6(&ip6->ip6_src) != 0 ||
+ nat64_check_ip6(&ip6->ip6_dst) != 0) {
+ return (NAT64SKIP);
+ }
+
+ /* Starting from this point we must not return zero */
+ ip.ip_src.s_addr = aaddr;
+ if (nat64_check_ip4(ip.ip_src.s_addr) != 0) {
+ DPRINTF(DP_GENERIC, "invalid source address: %08x",
+ ip.ip_src.s_addr);
+ /* XXX: stats? */
+ return (NAT64MFREE);
+ }
+
+ ip.ip_dst.s_addr = nat64_get_ip4(&ip6->ip6_dst);
+ if (ip.ip_dst.s_addr == 0) {
+ /* XXX: stats?
*/
+ return (NAT64MFREE);
+ }
+
+ if (ip6->ip6_hlim <= IPV6_HLIMDEC) {
+ nat64_icmp6_reflect(m, ICMP6_TIME_EXCEEDED,
+ ICMP6_TIME_EXCEED_TRANSIT, 0, stats, logdata);
+ return (NAT64RETURN);
+ }
+
+ hlen = 0;
+ plen = ntohs(ip6->ip6_plen);
+ proto = nat64_getlasthdr(m, &hlen);
+ if (proto < 0) {
+ DPRINTF(DP_DROPS, "dropped due to non-contiguous mbuf");
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ frag = NULL;
+ if (proto == IPPROTO_FRAGMENT) {
+ /* ipfw_chk() should have done m_pullup() up to the fragment header */
+ if (m->m_len < hlen + sizeof(*frag)) {
+ DPRINTF(DP_DROPS,
+ "dropped due to non-contiguous mbuf");
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ frag = mtodo(m, hlen);
+ proto = frag->ip6f_nxt;
+ hlen += sizeof(*frag);
+ /* Fragmented ICMPv6 is unsupported */
+ if (proto == IPPROTO_ICMPV6) {
+ DPRINTF(DP_DROPS, "dropped due to fragmented ICMPv6");
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+ /* Fragment length must be multiple of 8 octets */
+ if ((frag->ip6f_offlg & IP6F_MORE_FRAG) != 0 &&
+ ((plen + sizeof(struct ip6_hdr) - hlen) & 0x7) != 0) {
+ nat64_icmp6_reflect(m, ICMP6_PARAM_PROB,
+ ICMP6_PARAMPROB_HEADER,
+ offsetof(struct ip6_hdr, ip6_plen), stats,
+ logdata);
+ return (NAT64RETURN);
+ }
+ }
+ plen -= hlen - sizeof(struct ip6_hdr);
+ if (plen < 0 || m->m_pkthdr.len < plen + hlen) {
+ DPRINTF(DP_DROPS, "plen %d, pkthdr.len %d, hlen %d",
+ plen, m->m_pkthdr.len, hlen);
+ NAT64STAT_INC(stats, dropped);
+ return (NAT64MFREE);
+ }
+
+ icmp6 = NULL; /* Make gcc happy */
+ if (proto == IPPROTO_ICMPV6) {
+ icmp6 = mtodo(m, hlen);
+ if (icmp6->icmp6_type != ICMP6_ECHO_REQUEST &&
+ icmp6->icmp6_type != ICMP6_ECHO_REPLY)
+ return (nat64_handle_icmp6(m, hlen, aaddr, aport,
+ stats, logdata));
+ }
+ dst = nat64_find_route4(&ro, ip.ip_dst.s_addr, m);
+ if (dst == NULL) {
+ FREE_ROUTE(&ro);
+ NAT64STAT_INC(stats, noroute4);
+ nat64_icmp6_reflect(m, ICMP6_DST_UNREACH,
+ ICMP6_DST_UNREACH_NOROUTE, 0, stats, logdata);
+ return (NAT64RETURN);
+ }
+
+ ifp = ro.ro_rt->rt_ifp;
+ if (ro.ro_rt->rt_mtu != 0)
+ mtu = min(ro.ro_rt->rt_mtu, ifp->if_mtu);
+ else
+ mtu = ifp->if_mtu;
+ if (mtu < plen + sizeof(ip)) {
+ FREE_ROUTE(&ro);
+ nat64_icmp6_reflect(m, ICMP6_PACKET_TOO_BIG, 0, mtu, stats,
+ logdata);
+ return (NAT64RETURN);
+ }
+ nat64_init_ip4hdr(ip6, frag, plen, proto, &ip);
+ /* Convert checksums.
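 * Editor's note, an illustration rather than part of the original
 * comment: cksum_adjust()/cksum_add() below are assumed to implement
 * the incremental internet-checksum update of RFC 1624,
 * HC' = ~(~HC + ~m + m'), so only the 16-bit words that actually
 * change (ports, pseudo-header) are folded into the existing sum. A
 * minimal user-space equivalent, with a hypothetical name, is:
 *
 *	static uint16_t
 *	cksum_fixup(uint16_t cksum, uint16_t oldw, uint16_t neww)
 *	{
 *		uint32_t sum;
 *
 *		sum = (uint32_t)(uint16_t)~cksum +
 *		    (uint16_t)~oldw + neww;
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		return ((uint16_t)~sum);
 *	}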
*/ + switch (proto) { + case IPPROTO_TCP: + csum = &TCP(mtodo(m, hlen))->th_sum; + if (aport != 0) { + struct tcphdr *tcp = TCP(mtodo(m, hlen)); + *csum = cksum_adjust(*csum, tcp->th_sport, aport); + tcp->th_sport = aport; + } + *csum = cksum_add(*csum, nat64_cksum_convert(ip6, &ip)); + break; + case IPPROTO_UDP: + csum = &UDP(mtodo(m, hlen))->uh_sum; + if (aport != 0) { + struct udphdr *udp = UDP(mtodo(m, hlen)); + *csum = cksum_adjust(*csum, udp->uh_sport, aport); + udp->uh_sport = aport; + } + *csum = cksum_add(*csum, nat64_cksum_convert(ip6, &ip)); + break; + case IPPROTO_ICMPV6: + /* Checksum in ICMPv6 covers pseudo header */ + csum = &icmp6->icmp6_cksum; + *csum = cksum_add(*csum, in6_cksum_pseudo(ip6, plen, + IPPROTO_ICMPV6, 0)); + /* Convert ICMPv6 types to ICMP */ + mtu = *(uint16_t *)icmp6; /* save old word for cksum_adjust */ + if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST) + icmp6->icmp6_type = ICMP_ECHO; + else /* ICMP6_ECHO_REPLY */ + icmp6->icmp6_type = ICMP_ECHOREPLY; + *csum = cksum_adjust(*csum, (uint16_t)mtu, *(uint16_t *)icmp6); + if (aport != 0) { + uint16_t old_id = icmp6->icmp6_id; + icmp6->icmp6_id = aport; + *csum = cksum_adjust(*csum, old_id, aport); + } + break; + }; + + m_adj(m, hlen - sizeof(ip)); + bcopy(&ip, mtod(m, void *), sizeof(ip)); + if (nat64_output(ifp, m, dst, &ro, stats, logdata) == 0) + NAT64STAT_INC(stats, opcnt64); + FREE_ROUTE(&ro); + return (NAT64RETURN); +} + diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64_translate.h b/freebsd/sys/netpfil/ipfw/nat64/nat64_translate.h new file mode 100644 index 00000000..9f653954 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/nat64/nat64_translate.h @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 2015-2016 Yandex LLC + * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _IP_FW_NAT64_TRANSLATE_H_ +#define _IP_FW_NAT64_TRANSLATE_H_ + +#ifdef RTALLOC_NOLOCK +#define IN_LOOKUP_ROUTE(ro, fib) rtalloc_fib_nolock((ro), 0, (fib)) +#define IN6_LOOKUP_ROUTE(ro, fib) in6_rtalloc_nolock((ro), (fib)) +#define FREE_ROUTE(ro) +#else +#define IN_LOOKUP_ROUTE(ro, fib) rtalloc_ign_fib((ro), 0, (fib)) +#define IN6_LOOKUP_ROUTE(ro, fib) in6_rtalloc((ro), (fib)) +#define FREE_ROUTE(ro) RO_RTFREE((ro)) +#endif + +static inline int +nat64_check_ip6(struct in6_addr *addr) +{ + + /* XXX: We should really check /8 */ + if (addr->s6_addr16[0] == 0 || /* 0000::/8 Reserved by IETF */ + IN6_IS_ADDR_MULTICAST(addr) || IN6_IS_ADDR_LINKLOCAL(addr)) + return (1); + return (0); +} + +extern int nat64_allow_private; +static inline int +nat64_check_private_ip4(in_addr_t ia) +{ + + if (nat64_allow_private) + return (0); + /* WKPFX must not be used to represent non-global IPv4 addresses */ +// if (cfg->flags & NAT64_WKPFX) { + /* IN_PRIVATE */ + if ((ia & htonl(0xff000000)) == htonl(0x0a000000) || + (ia & htonl(0xfff00000)) == htonl(0xac100000) || + (ia & htonl(0xffff0000)) == htonl(0xc0a80000)) + return (1); + /* + * RFC 5735: + * 192.0.0.0/24 - reserved for IETF protocol assignments + * 192.88.99.0/24 - for use as 6to4 relay anycast addresses + * 198.18.0.0/15 - for use in benchmark tests + * 192.0.2.0/24, 198.51.100.0/24, 203.0.113.0/24 - for use + * in documentation and example code + */ + if ((ia & htonl(0xffffff00)) == htonl(0xc0000000) || + (ia & htonl(0xffffff00)) == htonl(0xc0586300) || + (ia & htonl(0xfffffe00)) == htonl(0xc6120000) || + (ia & htonl(0xffffff00)) == htonl(0xc0000200) || + (ia & htonl(0xfffffe00)) == htonl(0xc6336400) || + (ia & htonl(0xffffff00)) == htonl(0xcb007100)) + return (1); +// } + return (0); +} + +static inline int +nat64_check_ip4(in_addr_t ia) +{ + + /* IN_LOOPBACK */ + if ((ia & htonl(0xff000000)) == htonl(0x7f000000)) + return (1); + /* IN_LINKLOCAL */ + if ((ia & htonl(0xffff0000)) == htonl(0xa9fe0000)) + return (1); + /* IN_MULTICAST & IN_EXPERIMENTAL */ + if ((ia & htonl(0xe0000000)) == htonl(0xe0000000)) + return (1); + return (0); +} + +#define nat64_get_ip4(_ip6) ((_ip6)->s6_addr32[3]) +#define nat64_set_ip4(_ip6, _ip4) (_ip6)->s6_addr32[3] = (_ip4) + +int nat64_getlasthdr(struct mbuf *m, int *offset); +int nat64_do_handle_ip4(struct mbuf *m, struct in6_addr *saddr, + struct in6_addr *daddr, uint16_t lport, nat64_stats_block *stats, + void *logdata); +int nat64_do_handle_ip6(struct mbuf *m, uint32_t aaddr, uint16_t aport, + nat64_stats_block *stats, void *logdata); +int nat64_handle_icmp6(struct mbuf *m, int hlen, uint32_t aaddr, uint16_t aport, + nat64_stats_block *stats, void *logdata); + +#endif + diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64lsn.c b/freebsd/sys/netpfil/ipfw/nat64/nat64lsn.c new file mode 100644 index 00000000..ce666213 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/nat64/nat64lsn.c @@ -0,0 +1,1772 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2015-2016 Yandex LLC + * Copyright (c) 2015 Alexander V. Chernikov <melifaro@FreeBSD.org> + * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/counter.h> +#include <rtems/bsd/sys/errno.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/rmlock.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/queue.h> +#include <sys/syslog.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_pflog.h> +#include <net/pfil.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_fw.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/ip_icmp.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet6/in6_var.h> +#include <netinet6/ip6_var.h> +#include <netinet6/ip_fw_nat64.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/nat64/ip_fw_nat64.h> +#include <netpfil/ipfw/nat64/nat64lsn.h> +#include <netpfil/ipfw/nat64/nat64_translate.h> +#include <netpfil/pf/pf.h> + +MALLOC_DEFINE(M_NAT64LSN, "NAT64LSN", "NAT64LSN"); + +static void nat64lsn_periodic(void *data); +#define PERIODIC_DELAY 4 +static uint8_t nat64lsn_proto_map[256]; +uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO]; + +#define NAT64_FLAG_FIN 0x01 /* FIN was seen */ +#define NAT64_FLAG_SYN 0x02 /* First syn in->out */ +#define NAT64_FLAG_ESTAB 0x04 /* Packet with Ack */ +#define NAT64_FLAGS_TCP (NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN) + +#define NAT64_FLAG_RDR 0x80 /* Port redirect */ +#define NAT64_LOOKUP(chain, cmd) \ + (struct nat64lsn_cfg *)SRV_OBJECT((chain), (cmd)->arg1) +/* + * Delayed job queue, used to create new hosts + * and new portgroups + */ +enum nat64lsn_jtype { + JTYPE_NEWHOST = 1, + JTYPE_NEWPORTGROUP, + JTYPE_DELPORTGROUP, +}; + +struct nat64lsn_job_item { + TAILQ_ENTRY(nat64lsn_job_item) next; + enum nat64lsn_jtype jtype; + struct nat64lsn_host *nh; + struct nat64lsn_portgroup *pg; + void *spare_idx; + struct in6_addr haddr; + uint8_t nat_proto; + uint8_t done; + int needs_idx; + int delcount; + unsigned int fhash; /* Flow hash */ + uint32_t aaddr; /* Last used address (net) */ + struct mbuf *m; + struct ipfw_flow_id f_id; + uint64_t delmask[NAT64LSN_PGPTRNMASK]; +}; + +static struct mtx jmtx; +#define JQUEUE_LOCK_INIT() mtx_init(&jmtx, "qlock", NULL, MTX_DEF) +#define JQUEUE_LOCK_DESTROY() mtx_destroy(&jmtx) +#define JQUEUE_LOCK() mtx_lock(&jmtx) +#define JQUEUE_UNLOCK() mtx_unlock(&jmtx) + +static void nat64lsn_enqueue_job(struct nat64lsn_cfg 
*cfg,
+ struct nat64lsn_job_item *ji);
+static void nat64lsn_enqueue_jobs(struct nat64lsn_cfg *cfg,
+ struct nat64lsn_job_head *jhead, int jlen);
+
+static struct nat64lsn_job_item *nat64lsn_create_job(struct nat64lsn_cfg *cfg,
+ const struct ipfw_flow_id *f_id, int jtype);
+static int nat64lsn_request_portgroup(struct nat64lsn_cfg *cfg,
+ const struct ipfw_flow_id *f_id, struct mbuf **pm, uint32_t aaddr,
+ int needs_idx);
+static int nat64lsn_request_host(struct nat64lsn_cfg *cfg,
+ const struct ipfw_flow_id *f_id, struct mbuf **pm);
+static int nat64lsn_translate4(struct nat64lsn_cfg *cfg,
+ const struct ipfw_flow_id *f_id, struct mbuf **pm);
+static int nat64lsn_translate6(struct nat64lsn_cfg *cfg,
+ struct ipfw_flow_id *f_id, struct mbuf **pm);
+
+static int alloc_portgroup(struct nat64lsn_job_item *ji);
+static void destroy_portgroup(struct nat64lsn_portgroup *pg);
+static void destroy_host6(struct nat64lsn_host *nh);
+static int alloc_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji);
+
+static int attach_portgroup(struct nat64lsn_cfg *cfg,
+ struct nat64lsn_job_item *ji);
+static int attach_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji);
+
+
+/* XXX tmp */
+static uma_zone_t nat64lsn_host_zone;
+static uma_zone_t nat64lsn_pg_zone;
+static uma_zone_t nat64lsn_pgidx_zone;
+
+static unsigned int nat64lsn_periodic_chkstates(struct nat64lsn_cfg *cfg,
+ struct nat64lsn_host *nh);
+
+#define I6_hash(x) (djb_hash((const unsigned char *)(x), 16))
+#define I6_first(_ph, h) (_ph)[h]
+#define I6_next(x) (x)->next
+#define I6_val(x) (&(x)->addr)
+#define I6_cmp(a, b) IN6_ARE_ADDR_EQUAL(a, b)
+#define I6_lock(a, b)
+#define I6_unlock(a, b)
+
+#define I6HASH_FIND(_cfg, _res, _a) \
+ CHT_FIND(_cfg->ih, _cfg->ihsize, I6_, _res, _a)
+#define I6HASH_INSERT(_cfg, _i) \
+ CHT_INSERT_HEAD(_cfg->ih, _cfg->ihsize, I6_, _i)
+#define I6HASH_REMOVE(_cfg, _res, _tmp, _a) \
+ CHT_REMOVE(_cfg->ih, _cfg->ihsize, I6_, _res, _tmp, _a)
+
+#define I6HASH_FOREACH_SAFE(_cfg, _x, _tmp, _cb, _arg) \
+ CHT_FOREACH_SAFE(_cfg->ih, _cfg->ihsize, I6_, _x, _tmp, _cb, _arg)
+
+#define HASH_IN4(x) djb_hash((const unsigned char *)(x), 8)
+
+static unsigned
+djb_hash(const unsigned char *h, const int len)
+{
+ unsigned int result = 0;
+ int i;
+
+ for (i = 0; i < len; i++)
+ result = 33 * result ^ h[i];
+
+ return (result);
+}
+
+/*
+static size_t
+bitmask_size(size_t num, int *level)
+{
+ size_t x;
+ int c;
+
+ for (c = 0, x = num; num > 1; num /= 64, c++)
+ ;
+
+ return (x);
+}
+
+static void
+bitmask_prepare(uint64_t *pmask, size_t bufsize, int level)
+{
+ size_t x, z;
+
+ memset(pmask, 0xFF, bufsize);
+ for (x = 0, z = 1; level > 1; x += z, z *= 64, level--)
+ ;
+ pmask[x] ~= 0x01;
+}
+*/
+
+static void
+nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family,
+ uint32_t n, uint32_t sn)
+{
+
+ memset(plog, 0, sizeof(*plog));
+ plog->length = PFLOG_REAL_HDRLEN;
+ plog->af = family;
+ plog->action = PF_NAT;
+ plog->dir = PF_IN;
+ plog->rulenr = htonl(n);
+ plog->subrulenr = htonl(sn);
+ plog->ruleset[0] = '\0';
+ strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname));
+ ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m);
+}
+/*
+ * Inspects ICMP packets to see whether the message contains a
+ * different packet header, in which case we need to alter @addr and @port.
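 * (Editor's illustration: for an ICMP error returned from the IPv4
 * side, the flow that identifies the NAT state is the one embedded in
 * the error payload, not the outer header. inspect_icmp_mbuf() below
 * therefore takes @addr and @port from the embedded datagram: the
 * inner source address, plus the inner TCP/UDP source port or, for an
 * embedded echo request, the ICMP ID.)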
+ */ +static int +inspect_icmp_mbuf(struct mbuf **m, uint8_t *nat_proto, uint32_t *addr, + uint16_t *port) +{ + struct ip *ip; + struct tcphdr *tcp; + struct udphdr *udp; + struct icmphdr *icmp; + int off; + uint8_t proto; + + ip = mtod(*m, struct ip *); /* Outer IP header */ + off = (ip->ip_hl << 2) + ICMP_MINLEN; + if ((*m)->m_len < off) + *m = m_pullup(*m, off); + if (*m == NULL) + return (ENOMEM); + + ip = mtod(*m, struct ip *); /* Outer IP header */ + icmp = L3HDR(ip, struct icmphdr *); + switch (icmp->icmp_type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + /* Use icmp ID as distinguisher */ + *port = ntohs(*((uint16_t *)(icmp + 1))); + return (0); + case ICMP_UNREACH: + case ICMP_TIMXCEED: + break; + default: + return (EOPNOTSUPP); + } + /* + * ICMP_UNREACH and ICMP_TIMXCEED contains IP header + 64 bits + * of ULP header. + */ + if ((*m)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN) + return (EINVAL); + if ((*m)->m_len < off + sizeof(struct ip) + ICMP_MINLEN) + *m = m_pullup(*m, off + sizeof(struct ip) + ICMP_MINLEN); + if (*m == NULL) + return (ENOMEM); + ip = mtodo(*m, off); /* Inner IP header */ + proto = ip->ip_p; + off += ip->ip_hl << 2; /* Skip inner IP header */ + *addr = ntohl(ip->ip_src.s_addr); + if ((*m)->m_len < off + ICMP_MINLEN) + *m = m_pullup(*m, off + ICMP_MINLEN); + if (*m == NULL) + return (ENOMEM); + switch (proto) { + case IPPROTO_TCP: + tcp = mtodo(*m, off); + *nat_proto = NAT_PROTO_TCP; + *port = ntohs(tcp->th_sport); + return (0); + case IPPROTO_UDP: + udp = mtodo(*m, off); + *nat_proto = NAT_PROTO_UDP; + *port = ntohs(udp->uh_sport); + return (0); + case IPPROTO_ICMP: + /* + * We will translate only ICMP errors for our ICMP + * echo requests. + */ + icmp = mtodo(*m, off); + if (icmp->icmp_type != ICMP_ECHO) + return (EOPNOTSUPP); + *port = ntohs(*((uint16_t *)(icmp + 1))); + return (0); + }; + return (EOPNOTSUPP); +} + +static inline uint8_t +convert_tcp_flags(uint8_t flags) +{ + uint8_t result; + + result = flags & (TH_FIN|TH_SYN); + result |= (flags & TH_RST) >> 2; /* Treat RST as FIN */ + result |= (flags & TH_ACK) >> 2; /* Treat ACK as estab */ + + return (result); +} + +static NAT64NOINLINE int +nat64lsn_translate4(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id, + struct mbuf **pm) +{ + struct pfloghdr loghdr, *logdata; + struct in6_addr src6; + struct nat64lsn_portgroup *pg; + struct nat64lsn_host *nh; + struct nat64lsn_state *st; + struct ip *ip; + uint32_t addr; + uint16_t state_flags, state_ts; + uint16_t port, lport; + uint8_t nat_proto; + int ret; + + addr = f_id->dst_ip; + port = f_id->dst_port; + if (addr < cfg->prefix4 || addr > cfg->pmask4) { + NAT64STAT_INC(&cfg->stats, nomatch4); + return (cfg->nomatch_verdict); + } + + /* Check if protocol is supported and get its short id */ + nat_proto = nat64lsn_proto_map[f_id->proto]; + if (nat_proto == 0) { + NAT64STAT_INC(&cfg->stats, noproto); + return (cfg->nomatch_verdict); + } + + /* We might need to handle icmp differently */ + if (nat_proto == NAT_PROTO_ICMP) { + ret = inspect_icmp_mbuf(pm, &nat_proto, &addr, &port); + if (ret != 0) { + if (ret == ENOMEM) + NAT64STAT_INC(&cfg->stats, nomem); + else + NAT64STAT_INC(&cfg->stats, noproto); + return (cfg->nomatch_verdict); + } + /* XXX: Check addr for validity */ + if (addr < cfg->prefix4 || addr > cfg->pmask4) { + NAT64STAT_INC(&cfg->stats, nomatch4); + return (cfg->nomatch_verdict); + } + } + + /* Calc portgroup offset w.r.t protocol */ + pg = GET_PORTGROUP(cfg, addr, nat_proto, port); + + /* Check if this port is occupied by 
any portgroup */ + if (pg == NULL) { + NAT64STAT_INC(&cfg->stats, nomatch4); +#if 0 + DPRINTF(DP_STATE, "NOMATCH %u %d %d (%d)", addr, nat_proto, port, + _GET_PORTGROUP_IDX(cfg, addr, nat_proto, port)); +#endif + return (cfg->nomatch_verdict); + } + + /* TODO: Check flags to see if we need to do some static mapping */ + nh = pg->host; + + /* Prepare some fields we might need to update */ + SET_AGE(state_ts); + ip = mtod(*pm, struct ip *); + if (ip->ip_p == IPPROTO_TCP) + state_flags = convert_tcp_flags( + L3HDR(ip, struct tcphdr *)->th_flags); + else + state_flags = 0; + + /* Lock host and get port mapping */ + NAT64_LOCK(nh); + + st = &pg->states[port & (NAT64_CHUNK_SIZE - 1)]; + if (st->timestamp != state_ts) + st->timestamp = state_ts; + if ((st->flags & state_flags) != state_flags) + st->flags |= state_flags; + lport = htons(st->u.s.lport); + + NAT64_UNLOCK(nh); + + if (cfg->flags & NAT64_LOG) { + logdata = &loghdr; + nat64lsn_log(logdata, *pm, AF_INET, pg->idx, st->cur.off); + } else + logdata = NULL; + + src6.s6_addr32[0] = cfg->prefix6.s6_addr32[0]; + src6.s6_addr32[1] = cfg->prefix6.s6_addr32[1]; + src6.s6_addr32[2] = cfg->prefix6.s6_addr32[2]; + src6.s6_addr32[3] = htonl(f_id->src_ip); + + ret = nat64_do_handle_ip4(*pm, &src6, &nh->addr, lport, + &cfg->stats, logdata); + + if (ret == NAT64SKIP) + return (IP_FW_PASS); + if (ret == NAT64MFREE) + m_freem(*pm); + *pm = NULL; + + return (IP_FW_DENY); +} + +void +nat64lsn_dump_state(const struct nat64lsn_cfg *cfg, + const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st, + const char *px, int off) +{ + char s[INET6_ADDRSTRLEN], a[INET_ADDRSTRLEN], d[INET_ADDRSTRLEN]; + + if ((nat64_debug & DP_STATE) == 0) + return; + inet_ntop(AF_INET6, &pg->host->addr, s, sizeof(s)); + inet_ntop(AF_INET, &pg->aaddr, a, sizeof(a)); + inet_ntop(AF_INET, &st->u.s.faddr, d, sizeof(d)); + + DPRINTF(DP_STATE, "%s: PG %d ST [%p|%d]: %s:%d/%d <%s:%d> " + "%s:%d AGE %d", px, pg->idx, st, off, + s, st->u.s.lport, pg->nat_proto, a, pg->aport + off, + d, st->u.s.fport, GET_AGE(st->timestamp)); +} + +/* + * Check if particular TCP state is stale and should be deleted. + * Return 1 if true, 0 otherwise. + */ +static int +nat64lsn_periodic_check_tcp(const struct nat64lsn_cfg *cfg, + const struct nat64lsn_state *st, int age) +{ + int ttl; + + if (st->flags & NAT64_FLAG_FIN) + ttl = cfg->st_close_ttl; + else if (st->flags & NAT64_FLAG_ESTAB) + ttl = cfg->st_estab_ttl; + else if (st->flags & NAT64_FLAG_SYN) + ttl = cfg->st_syn_ttl; + else + ttl = cfg->st_syn_ttl; + + if (age > ttl) + return (1); + return (0); +} + +/* + * Check if nat state @st is stale and should be deleted. + * Return 1 if true, 0 otherwise. + */ +static NAT64NOINLINE int +nat64lsn_periodic_chkstate(const struct nat64lsn_cfg *cfg, + const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st) +{ + int age, delete; + + age = GET_AGE(st->timestamp); + delete = 0; + + /* Skip immutable records */ + if (st->flags & NAT64_FLAG_RDR) + return (0); + + switch (pg->nat_proto) { + case NAT_PROTO_TCP: + delete = nat64lsn_periodic_check_tcp(cfg, st, age); + break; + case NAT_PROTO_UDP: + if (age > cfg->st_udp_ttl) + delete = 1; + break; + case NAT_PROTO_ICMP: + if (age > cfg->st_icmp_ttl) + delete = 1; + break; + } + + return (delete); +} + + +/* + * The following structures and functions + * are used to perform SLIST_FOREACH_SAFE() + * analog for states identified by struct st_ptr. 
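 * (Editor's illustration, mirroring nat64lsn_periodic_chkstates()
 * below; the deletion body is elided:
 *
 *	struct st_idx si, si_prev;
 *
 *	memset(&si_prev, 0, sizeof(si_prev));
 *	for (st_first(cfg, nh, &nh->phash[i], &si); si.st != NULL;
 *	    st_save_cond(&si_prev, &si), st_next(cfg, nh, &si)) {
 *		...inspect si.st; unlink it via si_prev.st->next,
 *		or via nh->phash[i] when si_prev.st is NULL...
 *	}
 *
 * st_save_cond() only advances si_prev when the current state
 * survived, which keeps the "previous" pointer valid across
 * deletions.)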
+ */ + +struct st_idx { + struct nat64lsn_portgroup *pg; + struct nat64lsn_state *st; + struct st_ptr sidx_next; +}; + +static struct st_idx * +st_first(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh, + struct st_ptr *sidx, struct st_idx *si) +{ + struct nat64lsn_portgroup *pg; + struct nat64lsn_state *st; + + if (sidx->idx == 0) { + memset(si, 0, sizeof(*si)); + return (si); + } + + pg = PORTGROUP_BYSIDX(cfg, nh, sidx->idx); + st = &pg->states[sidx->off]; + + si->pg = pg; + si->st = st; + si->sidx_next = st->next; + + return (si); +} + +static struct st_idx * +st_next(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh, + struct st_idx *si) +{ + struct st_ptr sidx; + struct nat64lsn_portgroup *pg; + struct nat64lsn_state *st; + + sidx = si->sidx_next; + if (sidx.idx == 0) { + memset(si, 0, sizeof(*si)); + si->st = NULL; + si->pg = NULL; + return (si); + } + + pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx); + st = &pg->states[sidx.off]; + + si->pg = pg; + si->st = st; + si->sidx_next = st->next; + + return (si); +} + +static struct st_idx * +st_save_cond(struct st_idx *si_dst, struct st_idx *si) +{ + if (si->st != NULL) + *si_dst = *si; + + return (si_dst); +} + +unsigned int +nat64lsn_periodic_chkstates(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh) +{ + struct st_idx si, si_prev; + int i; + unsigned int delcount; + + delcount = 0; + for (i = 0; i < nh->hsize; i++) { + memset(&si_prev, 0, sizeof(si_prev)); + for (st_first(cfg, nh, &nh->phash[i], &si); + si.st != NULL; + st_save_cond(&si_prev, &si), st_next(cfg, nh, &si)) { + if (nat64lsn_periodic_chkstate(cfg, si.pg, si.st) == 0) + continue; + nat64lsn_dump_state(cfg, si.pg, si.st, "DELETE STATE", + si.st->cur.off); + /* Unlink from hash */ + if (si_prev.st != NULL) + si_prev.st->next = si.st->next; + else + nh->phash[i] = si.st->next; + /* Delete state and free its data */ + PG_MARK_FREE_IDX(si.pg, si.st->cur.off); + memset(si.st, 0, sizeof(struct nat64lsn_state)); + si.st = NULL; + delcount++; + + /* Update portgroup timestamp */ + SET_AGE(si.pg->timestamp); + } + } + NAT64STAT_ADD(&cfg->stats, sdeleted, delcount); + return (delcount); +} + +/* + * Checks if portgroup is not used and can be deleted, + * Returns 1 if stale, 0 otherwise + */ +static int +stale_pg(const struct nat64lsn_cfg *cfg, const struct nat64lsn_portgroup *pg) +{ + + if (!PG_IS_EMPTY(pg)) + return (0); + if (GET_AGE(pg->timestamp) < cfg->pg_delete_delay) + return (0); + return (1); +} + +/* + * Checks if host record is not used and can be deleted, + * Returns 1 if stale, 0 otherwise + */ +static int +stale_nh(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh) +{ + + if (nh->pg_used != 0) + return (0); + if (GET_AGE(nh->timestamp) < cfg->nh_delete_delay) + return (0); + return (1); +} + +struct nat64lsn_periodic_data { + struct nat64lsn_cfg *cfg; + struct nat64lsn_job_head jhead; + int jlen; +}; + +static NAT64NOINLINE int +nat64lsn_periodic_chkhost(struct nat64lsn_host *nh, + struct nat64lsn_periodic_data *d) +{ + char a[INET6_ADDRSTRLEN]; + struct nat64lsn_portgroup *pg; + struct nat64lsn_job_item *ji; + uint64_t delmask[NAT64LSN_PGPTRNMASK]; + int delcount, i; + + delcount = 0; + memset(delmask, 0, sizeof(delmask)); + + inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); + DPRINTF(DP_JQUEUE, "Checking %s host %s on cpu %d", + stale_nh(d->cfg, nh) ? "stale" : "non-stale", a, curcpu); + if (!stale_nh(d->cfg, nh)) { + /* Non-stale host. 
Inspect internals */ + NAT64_LOCK(nh); + + /* Stage 1: Check&expire states */ + if (nat64lsn_periodic_chkstates(d->cfg, nh) != 0) + SET_AGE(nh->timestamp); + + /* Stage 2: Check if we need to expire */ + for (i = 0; i < nh->pg_used; i++) { + pg = PORTGROUP_BYSIDX(d->cfg, nh, i + 1); + if (pg == NULL) + continue; + + /* Check if we can delete portgroup */ + if (stale_pg(d->cfg, pg) == 0) + continue; + + DPRINTF(DP_JQUEUE, "Check PG %d", i); + delmask[i / 64] |= ((uint64_t)1 << (i % 64)); + delcount++; + } + + NAT64_UNLOCK(nh); + if (delcount == 0) + return (0); + } + + DPRINTF(DP_JQUEUE, "Queueing %d portgroups for deleting", delcount); + /* We have something to delete - add it to queue */ + ji = nat64lsn_create_job(d->cfg, NULL, JTYPE_DELPORTGROUP); + if (ji == NULL) + return (0); + + ji->haddr = nh->addr; + ji->delcount = delcount; + memcpy(ji->delmask, delmask, sizeof(ji->delmask)); + + TAILQ_INSERT_TAIL(&d->jhead, ji, next); + d->jlen++; + return (0); +} + +/* + * This procedure is used to perform various maintance + * on dynamic hash list. Currently it is called every second. + */ +static void +nat64lsn_periodic(void *data) +{ + struct ip_fw_chain *ch; + IPFW_RLOCK_TRACKER; + struct nat64lsn_cfg *cfg; + struct nat64lsn_periodic_data d; + struct nat64lsn_host *nh, *tmp; + + cfg = (struct nat64lsn_cfg *) data; + ch = cfg->ch; + CURVNET_SET(cfg->vp); + + memset(&d, 0, sizeof(d)); + d.cfg = cfg; + TAILQ_INIT(&d.jhead); + + IPFW_RLOCK(ch); + + /* Stage 1: foreach host, check all its portgroups */ + I6HASH_FOREACH_SAFE(cfg, nh, tmp, nat64lsn_periodic_chkhost, &d); + + /* Enqueue everything we have requested */ + nat64lsn_enqueue_jobs(cfg, &d.jhead, d.jlen); + + callout_schedule(&cfg->periodic, hz * PERIODIC_DELAY); + + IPFW_RUNLOCK(ch); + + CURVNET_RESTORE(); +} + +static NAT64NOINLINE void +reinject_mbuf(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) +{ + + if (ji->m == NULL) + return; + + /* Request has failed or packet type is wrong */ + if (ji->f_id.addr_type != 6 || ji->done == 0) { + m_freem(ji->m); + ji->m = NULL; + NAT64STAT_INC(&cfg->stats, dropped); + DPRINTF(DP_DROPS, "mbuf dropped: type %d, done %d", + ji->jtype, ji->done); + return; + } + + /* + * XXX: Limit recursion level + */ + + NAT64STAT_INC(&cfg->stats, jreinjected); + DPRINTF(DP_JQUEUE, "Reinject mbuf"); + nat64lsn_translate6(cfg, &ji->f_id, &ji->m); +} + +static void +destroy_portgroup(struct nat64lsn_portgroup *pg) +{ + + DPRINTF(DP_OBJ, "DESTROY PORTGROUP %d %p", pg->idx, pg); + uma_zfree(nat64lsn_pg_zone, pg); +} + +static NAT64NOINLINE int +alloc_portgroup(struct nat64lsn_job_item *ji) +{ + struct nat64lsn_portgroup *pg; + + pg = uma_zalloc(nat64lsn_pg_zone, M_NOWAIT); + if (pg == NULL) + return (1); + + if (ji->needs_idx != 0) { + ji->spare_idx = uma_zalloc(nat64lsn_pgidx_zone, M_NOWAIT); + /* Failed alloc isn't always fatal, so don't check */ + } + memset(&pg->freemask, 0xFF, sizeof(pg->freemask)); + pg->nat_proto = ji->nat_proto; + ji->pg = pg; + return (0); + +} + +static void +destroy_host6(struct nat64lsn_host *nh) +{ + char a[INET6_ADDRSTRLEN]; + int i; + + inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); + DPRINTF(DP_OBJ, "DESTROY HOST %s %p (pg used %d)", a, nh, + nh->pg_used); + NAT64_LOCK_DESTROY(nh); + for (i = 0; i < nh->pg_allocated / NAT64LSN_PGIDX_CHUNK; i++) + uma_zfree(nat64lsn_pgidx_zone, PORTGROUP_CHUNK(nh, i)); + uma_zfree(nat64lsn_host_zone, nh); +} + +static NAT64NOINLINE int +alloc_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) +{ + struct nat64lsn_host *nh; + char 
a[INET6_ADDRSTRLEN]; + + nh = uma_zalloc(nat64lsn_host_zone, M_NOWAIT); + if (nh == NULL) + return (1); + PORTGROUP_CHUNK(nh, 0) = uma_zalloc(nat64lsn_pgidx_zone, M_NOWAIT); + if (PORTGROUP_CHUNK(nh, 0) == NULL) { + uma_zfree(nat64lsn_host_zone, nh); + return (2); + } + if (alloc_portgroup(ji) != 0) { + NAT64STAT_INC(&cfg->stats, jportfails); + uma_zfree(nat64lsn_pgidx_zone, PORTGROUP_CHUNK(nh, 0)); + uma_zfree(nat64lsn_host_zone, nh); + return (3); + } + + NAT64_LOCK_INIT(nh); + nh->addr = ji->haddr; + nh->hsize = NAT64LSN_HSIZE; /* XXX: hardcoded size */ + nh->pg_allocated = NAT64LSN_PGIDX_CHUNK; + nh->pg_used = 0; + ji->nh = nh; + + inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); + DPRINTF(DP_OBJ, "ALLOC HOST %s %p", a, ji->nh); + return (0); +} + +/* + * Finds free @pg index inside @nh + */ +static NAT64NOINLINE int +find_nh_pg_idx(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh, int *idx) +{ + int i; + + for (i = 0; i < nh->pg_allocated; i++) { + if (PORTGROUP_BYSIDX(cfg, nh, i + 1) == NULL) { + *idx = i; + return (0); + } + } + return (1); +} + +static NAT64NOINLINE int +attach_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) +{ + char a[INET6_ADDRSTRLEN]; + struct nat64lsn_host *nh; + + I6HASH_FIND(cfg, nh, &ji->haddr); + if (nh == NULL) { + /* Add new host to list */ + nh = ji->nh; + I6HASH_INSERT(cfg, nh); + cfg->ihcount++; + ji->nh = NULL; + + inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); + DPRINTF(DP_OBJ, "ATTACH HOST %s %p", a, nh); + /* + * Try to add portgroup. + * Note it will automatically set + * 'done' on ji if successful. + */ + if (attach_portgroup(cfg, ji) != 0) { + DPRINTF(DP_DROPS, "%s %p failed to attach PG", + a, nh); + NAT64STAT_INC(&cfg->stats, jportfails); + return (1); + } + return (0); + } + + /* + * nh isn't NULL. This probably means we had several simultaneous + * host requests. The previous one request has already attached + * this host. Requeue attached mbuf and mark job as done, but + * leave nh and pg pointers not changed, so nat64lsn_do_request() + * will release all allocated resources. + */ + inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); + DPRINTF(DP_OBJ, "%s %p is already attached as %p", + a, ji->nh, nh); + ji->done = 1; + return (0); +} + +static NAT64NOINLINE int +find_pg_place_addr(const struct nat64lsn_cfg *cfg, int addr_off, + int nat_proto, uint16_t *aport, int *ppg_idx) +{ + int j, pg_idx; + + pg_idx = addr_off * _ADDR_PG_COUNT + + (nat_proto - 1) * _ADDR_PG_PROTO_COUNT; + + for (j = NAT64_MIN_CHUNK; j < _ADDR_PG_PROTO_COUNT; j++) { + if (cfg->pg[pg_idx + j] != NULL) + continue; + + *aport = j * NAT64_CHUNK_SIZE; + *ppg_idx = pg_idx + j; + return (1); + } + + return (0); +} + +/* + * XXX: This function needs to be rewritten to + * use free bitmask for faster pg finding, + * additionally, it should take into consideration + * a) randomization and + * b) previous addresses allocated to given nat instance + * + */ +static NAT64NOINLINE int +find_portgroup_place(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji, + uint32_t *aaddr, uint16_t *aport, int *ppg_idx) +{ + int i, nat_proto; + + /* + * XXX: Use bitmask index to be able to find/check if IP address + * has some spare pg's + */ + nat_proto = ji->nat_proto; + + /* First, try to use same address */ + if (ji->aaddr != 0) { + i = ntohl(ji->aaddr) - cfg->prefix4; + if (find_pg_place_addr(cfg, i, nat_proto, aport, + ppg_idx) != 0){ + /* Found! 
*/ + *aaddr = htonl(cfg->prefix4 + i); + return (0); + } + } + + /* Next, try to use random address based on flow hash */ + i = ji->fhash % (1 << (32 - cfg->plen4)); + if (find_pg_place_addr(cfg, i, nat_proto, aport, ppg_idx) != 0) { + /* Found! */ + *aaddr = htonl(cfg->prefix4 + i); + return (0); + } + + + /* Last one: simply find ANY available */ + for (i = 0; i < (1 << (32 - cfg->plen4)); i++) { + if (find_pg_place_addr(cfg, i, nat_proto, aport, + ppg_idx) != 0){ + /* Found! */ + *aaddr = htonl(cfg->prefix4 + i); + return (0); + } + } + + return (1); +} + +static NAT64NOINLINE int +attach_portgroup(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) +{ + char a[INET6_ADDRSTRLEN]; + struct nat64lsn_portgroup *pg; + struct nat64lsn_host *nh; + uint32_t aaddr; + uint16_t aport; + int nh_pg_idx, pg_idx; + + pg = ji->pg; + + /* + * Find source host and bind: we can't rely on + * pg->host + */ + I6HASH_FIND(cfg, nh, &ji->haddr); + if (nh == NULL) + return (1); + + /* Find spare port chunk */ + if (find_portgroup_place(cfg, ji, &aaddr, &aport, &pg_idx) != 0) { + inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); + DPRINTF(DP_OBJ | DP_DROPS, "empty PG not found for %s", a); + return (2); + } + + /* Expand PG indexes if needed */ + if (nh->pg_allocated < cfg->max_chunks && ji->spare_idx != NULL) { + PORTGROUP_CHUNK(nh, nh->pg_allocated / NAT64LSN_PGIDX_CHUNK) = + ji->spare_idx; + nh->pg_allocated += NAT64LSN_PGIDX_CHUNK; + ji->spare_idx = NULL; + } + + /* Find empty index to store PG in the @nh */ + if (find_nh_pg_idx(cfg, nh, &nh_pg_idx) != 0) { + inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); + DPRINTF(DP_OBJ | DP_DROPS, "free PG index not found for %s", + a); + return (3); + } + + cfg->pg[pg_idx] = pg; + cfg->protochunks[pg->nat_proto]++; + NAT64STAT_INC(&cfg->stats, spgcreated); + + pg->aaddr = aaddr; + pg->aport = aport; + pg->host = nh; + pg->idx = pg_idx; + SET_AGE(pg->timestamp); + + PORTGROUP_BYSIDX(cfg, nh, nh_pg_idx + 1) = pg; + if (nh->pg_used == nh_pg_idx) + nh->pg_used++; + SET_AGE(nh->timestamp); + + ji->pg = NULL; + ji->done = 1; + + return (0); +} + +static NAT64NOINLINE void +consider_del_portgroup(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) +{ + struct nat64lsn_host *nh, *nh_tmp; + struct nat64lsn_portgroup *pg, *pg_list[256]; + int i, pg_lidx, idx; + + /* Find source host */ + I6HASH_FIND(cfg, nh, &ji->haddr); + if (nh == NULL || nh->pg_used == 0) + return; + + memset(pg_list, 0, sizeof(pg_list)); + pg_lidx = 0; + + NAT64_LOCK(nh); + + for (i = nh->pg_used - 1; i >= 0; i--) { + if ((ji->delmask[i / 64] & ((uint64_t)1 << (i % 64))) == 0) + continue; + pg = PORTGROUP_BYSIDX(cfg, nh, i + 1); + + /* Check that PG isn't busy. 
*/ + if (stale_pg(cfg, pg) == 0) + continue; + + /* DO delete */ + pg_list[pg_lidx++] = pg; + PORTGROUP_BYSIDX(cfg, nh, i + 1) = NULL; + + idx = _GET_PORTGROUP_IDX(cfg, ntohl(pg->aaddr), pg->nat_proto, + pg->aport); + KASSERT(cfg->pg[idx] == pg, ("Non matched pg")); + cfg->pg[idx] = NULL; + cfg->protochunks[pg->nat_proto]--; + NAT64STAT_INC(&cfg->stats, spgdeleted); + + /* Decrease pg_used */ + while (nh->pg_used > 0 && + PORTGROUP_BYSIDX(cfg, nh, nh->pg_used) == NULL) + nh->pg_used--; + + /* Check if on-stack buffer has ended */ + if (pg_lidx == nitems(pg_list)) + break; + } + + NAT64_UNLOCK(nh); + + if (stale_nh(cfg, nh)) { + I6HASH_REMOVE(cfg, nh, nh_tmp, &ji->haddr); + KASSERT(nh != NULL, ("Unable to find address")); + cfg->ihcount--; + ji->nh = nh; + I6HASH_FIND(cfg, nh, &ji->haddr); + KASSERT(nh == NULL, ("Failed to delete address")); + } + + /* TODO: Delay freeing portgroups */ + while (pg_lidx > 0) { + pg_lidx--; + NAT64STAT_INC(&cfg->stats, spgdeleted); + destroy_portgroup(pg_list[pg_lidx]); + } +} + +/* + * Main request handler. + * Responsible for handling jqueue, e.g. + * creating new hosts, addind/deleting portgroups. + */ +static NAT64NOINLINE void +nat64lsn_do_request(void *data) +{ + IPFW_RLOCK_TRACKER; + struct nat64lsn_job_head jhead; + struct nat64lsn_job_item *ji; + int jcount, nhsize; + struct nat64lsn_cfg *cfg = (struct nat64lsn_cfg *) data; + struct ip_fw_chain *ch; + int delcount; + + CURVNET_SET(cfg->vp); + + TAILQ_INIT(&jhead); + + /* XXX: We're running unlocked here */ + + ch = cfg->ch; + delcount = 0; + IPFW_RLOCK(ch); + + /* Grab queue */ + JQUEUE_LOCK(); + TAILQ_SWAP(&jhead, &cfg->jhead, nat64lsn_job_item, next); + jcount = cfg->jlen; + cfg->jlen = 0; + JQUEUE_UNLOCK(); + + /* check if we need to resize hash */ + nhsize = 0; + if (cfg->ihcount > cfg->ihsize && cfg->ihsize < 65536) { + nhsize = cfg->ihsize; + for ( ; cfg->ihcount > nhsize && nhsize < 65536; nhsize *= 2) + ; + } else if (cfg->ihcount < cfg->ihsize * 4) { + nhsize = cfg->ihsize; + for ( ; cfg->ihcount < nhsize * 4 && nhsize > 32; nhsize /= 2) + ; + } + + IPFW_RUNLOCK(ch); + + if (TAILQ_EMPTY(&jhead)) { + CURVNET_RESTORE(); + return; + } + + NAT64STAT_INC(&cfg->stats, jcalls); + DPRINTF(DP_JQUEUE, "count=%d", jcount); + + /* + * TODO: + * What we should do here is to build a hash + * to ensure we don't have lots of duplicate requests. + * Skip this for now. 
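 * (Editor's sketch of the idea above, hypothetical and not existing
 * code: each job already carries ji->fhash and ji->haddr, so a dedup
 * pass could walk the grabbed TAILQ once, record the seen
 * (haddr, nat_proto) pairs in a small open-addressed table keyed by
 * fhash, and free any later job that repeats a pair before the
 * pre-allocation loop below runs.)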
+ * + * TODO: Limit per-call number of items + */ + + /* Pre-allocate everything for entire chain */ + TAILQ_FOREACH(ji, &jhead, next) { + switch (ji->jtype) { + case JTYPE_NEWHOST: + if (alloc_host6(cfg, ji) != 0) + NAT64STAT_INC(&cfg->stats, jhostfails); + break; + case JTYPE_NEWPORTGROUP: + if (alloc_portgroup(ji) != 0) + NAT64STAT_INC(&cfg->stats, jportfails); + break; + case JTYPE_DELPORTGROUP: + delcount += ji->delcount; + break; + default: + break; + } + } + + /* + * TODO: Alloc hew hash + */ + nhsize = 0; + if (nhsize > 0) { + /* XXX: */ + } + + /* Apply all changes in batch */ + IPFW_UH_WLOCK(ch); + IPFW_WLOCK(ch); + + TAILQ_FOREACH(ji, &jhead, next) { + switch (ji->jtype) { + case JTYPE_NEWHOST: + if (ji->nh != NULL) + attach_host6(cfg, ji); + break; + case JTYPE_NEWPORTGROUP: + if (ji->pg != NULL && + attach_portgroup(cfg, ji) != 0) + NAT64STAT_INC(&cfg->stats, jportfails); + break; + case JTYPE_DELPORTGROUP: + consider_del_portgroup(cfg, ji); + break; + } + } + + if (nhsize > 0) { + /* XXX: Move everything to new hash */ + } + + IPFW_WUNLOCK(ch); + IPFW_UH_WUNLOCK(ch); + + /* Flush unused entries */ + while (!TAILQ_EMPTY(&jhead)) { + ji = TAILQ_FIRST(&jhead); + TAILQ_REMOVE(&jhead, ji, next); + if (ji->nh != NULL) + destroy_host6(ji->nh); + if (ji->pg != NULL) + destroy_portgroup(ji->pg); + if (ji->m != NULL) + reinject_mbuf(cfg, ji); + if (ji->spare_idx != NULL) + uma_zfree(nat64lsn_pgidx_zone, ji->spare_idx); + free(ji, M_IPFW); + } + CURVNET_RESTORE(); +} + +static NAT64NOINLINE struct nat64lsn_job_item * +nat64lsn_create_job(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id, + int jtype) +{ + struct nat64lsn_job_item *ji; + struct in6_addr haddr; + uint8_t nat_proto; + + /* + * Do not try to lock possibly contested mutex if we're near the limit. + * Drop packet instead. 
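 * (Editor's note: the unlocked cfg->jlen >= cfg->jmaxlen test below
 * appears intentionally approximate; it may briefly over- or
 * under-count while another CPU is queueing, but it bounds the job
 * queue without taking JQUEUE_LOCK() on the fast path, and rejected
 * requests are counted in the jmaxlen statistic.)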
+ */
+ if (cfg->jlen >= cfg->jmaxlen) {
+ NAT64STAT_INC(&cfg->stats, jmaxlen);
+ return (NULL);
+ }
+
+ memset(&haddr, 0, sizeof(haddr));
+ nat_proto = 0;
+ if (f_id != NULL) {
+ haddr = f_id->src_ip6;
+ nat_proto = nat64lsn_proto_map[f_id->proto];
+
+ DPRINTF(DP_JQUEUE, "REQUEST pg nat_proto %d on proto %d",
+ nat_proto, f_id->proto);
+
+ if (nat_proto == 0)
+ return (NULL);
+ }
+
+ ji = malloc(sizeof(struct nat64lsn_job_item), M_IPFW,
+ M_NOWAIT | M_ZERO);
+
+ if (ji == NULL) {
+ NAT64STAT_INC(&cfg->stats, jnomem);
+ return (NULL);
+ }
+
+ ji->jtype = jtype;
+
+ if (f_id != NULL) {
+ ji->f_id = *f_id;
+ ji->haddr = haddr;
+ ji->nat_proto = nat_proto;
+ }
+
+ return (ji);
+}
+
+static NAT64NOINLINE void
+nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
+{
+
+ if (ji == NULL)
+ return;
+
+ JQUEUE_LOCK();
+ TAILQ_INSERT_TAIL(&cfg->jhead, ji, next);
+ cfg->jlen++;
+ NAT64STAT_INC(&cfg->stats, jrequests);
+
+ if (callout_pending(&cfg->jcallout) == 0)
+ callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg);
+ JQUEUE_UNLOCK();
+}
+
+static NAT64NOINLINE void
+nat64lsn_enqueue_jobs(struct nat64lsn_cfg *cfg,
+ struct nat64lsn_job_head *jhead, int jlen)
+{
+
+ if (TAILQ_EMPTY(jhead))
+ return;
+
+ /* Attach current queue to execution one */
+ JQUEUE_LOCK();
+ TAILQ_CONCAT(&cfg->jhead, jhead, next);
+ cfg->jlen += jlen;
+ NAT64STAT_ADD(&cfg->stats, jrequests, jlen);
+
+ if (callout_pending(&cfg->jcallout) == 0)
+ callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg);
+ JQUEUE_UNLOCK();
+}
+
+static unsigned int
+flow6_hash(const struct ipfw_flow_id *f_id)
+{
+ unsigned char hbuf[36];
+
+ memcpy(hbuf, &f_id->dst_ip6, 16);
+ memcpy(&hbuf[16], &f_id->src_ip6, 16);
+ memcpy(&hbuf[32], &f_id->dst_port, 2);
+ memcpy(&hbuf[34], &f_id->src_port, 2);
+
+ return (djb_hash(hbuf, sizeof(hbuf)));
+}
+
+static NAT64NOINLINE int
+nat64lsn_request_host(struct nat64lsn_cfg *cfg,
+ const struct ipfw_flow_id *f_id, struct mbuf **pm)
+{
+ struct nat64lsn_job_item *ji;
+ struct mbuf *m;
+
+ m = *pm;
+ *pm = NULL;
+
+ ji = nat64lsn_create_job(cfg, f_id, JTYPE_NEWHOST);
+ if (ji == NULL) {
+ m_freem(m);
+ NAT64STAT_INC(&cfg->stats, dropped);
+ DPRINTF(DP_DROPS, "failed to create job");
+ } else {
+ ji->m = m;
+ /* Provide pseudo-random value based on flow */
+ ji->fhash = flow6_hash(f_id);
+ nat64lsn_enqueue_job(cfg, ji);
+ NAT64STAT_INC(&cfg->stats, jhostsreq);
+ }
+
+ return (IP_FW_PASS);
+}
+
+static NAT64NOINLINE int
+nat64lsn_request_portgroup(struct nat64lsn_cfg *cfg,
+ const struct ipfw_flow_id *f_id, struct mbuf **pm, uint32_t aaddr,
+ int needs_idx)
+{
+ struct nat64lsn_job_item *ji;
+ struct mbuf *m;
+
+ m = *pm;
+ *pm = NULL;
+
+ ji = nat64lsn_create_job(cfg, f_id, JTYPE_NEWPORTGROUP);
+ if (ji == NULL) {
+ m_freem(m);
+ NAT64STAT_INC(&cfg->stats, dropped);
+ DPRINTF(DP_DROPS, "failed to create job");
+ } else {
+ ji->m = m;
+ /* Provide pseudo-random value based on flow */
+ ji->fhash = flow6_hash(f_id);
+ ji->aaddr = aaddr;
+ ji->needs_idx = needs_idx;
+ nat64lsn_enqueue_job(cfg, ji);
+ NAT64STAT_INC(&cfg->stats, jportreq);
+ }
+
+ return (IP_FW_PASS);
+}
+
+static NAT64NOINLINE struct nat64lsn_state *
+nat64lsn_create_state(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh,
+ int nat_proto, struct nat64lsn_state *kst, uint32_t *aaddr)
+{
+ struct nat64lsn_portgroup *pg;
+ struct nat64lsn_state *st;
+ int i, hval, off;
+
+ /* XXX: create additional bitmask for selecting proper portgroup */
+ for (i = 0; i < nh->pg_used; i++) {
+ pg = PORTGROUP_BYSIDX(cfg, nh, i + 1);
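/*
 * Editor's note: a self-contained user-space sketch (not part of the
 * patch) of how the flow hash above spreads flows across the alias
 * address pool; djb_hash() is reproduced from this file, and main()
 * exists only for illustration:
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	static unsigned
 *	djb_hash(const unsigned char *h, const int len)
 *	{
 *		unsigned int result = 0;
 *		int i;
 *
 *		for (i = 0; i < len; i++)
 *			result = 33 * result ^ h[i];
 *		return (result);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		unsigned char hbuf[36] = { 0 };
 *		unsigned plen4 = 28;	(a /28 pool: 16 alias addrs)
 *
 *		strcpy((char *)hbuf, "example flow key");
 *		printf("addr offset %u\n",
 *		    djb_hash(hbuf, sizeof(hbuf)) % (1u << (32 - plen4)));
 *		return (0);
 *	}
 *
 * This mirrors "i = ji->fhash % (1 << (32 - cfg->plen4))" used by
 * find_portgroup_place() earlier in this file.
 */
+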
if (pg == NULL) + continue; + if (*aaddr == 0) + *aaddr = pg->aaddr; + if (pg->nat_proto != nat_proto) + continue; + + off = PG_GET_FREE_IDX(pg); + if (off != 0) { + /* We have found spare state. Use it */ + off--; + PG_MARK_BUSY_IDX(pg, off); + st = &pg->states[off]; + + /* + * Fill in new info. Assume state was zeroed. + * Timestamp and flags will be filled by caller. + */ + st->u.s = kst->u.s; + st->cur.idx = i + 1; + st->cur.off = off; + + /* Insert into host hash table */ + hval = HASH_IN4(&st->u.hkey) & (nh->hsize - 1); + st->next = nh->phash[hval]; + nh->phash[hval] = st->cur; + + nat64lsn_dump_state(cfg, pg, st, "ALLOC STATE", off); + + NAT64STAT_INC(&cfg->stats, screated); + + return (st); + } + /* Saev last used alias affress */ + *aaddr = pg->aaddr; + } + + return (NULL); +} + +static NAT64NOINLINE int +nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id, + struct mbuf **pm) +{ + struct pfloghdr loghdr, *logdata; + char a[INET6_ADDRSTRLEN]; + struct nat64lsn_host *nh; + struct st_ptr sidx; + struct nat64lsn_state *st, kst; + struct nat64lsn_portgroup *pg; + struct icmp6_hdr *icmp6; + uint32_t aaddr; + int action, hval, nat_proto, proto; + uint16_t aport, state_ts, state_flags; + + /* Check if af/protocol is supported and get it short id */ + nat_proto = nat64lsn_proto_map[f_id->proto]; + if (nat_proto == 0) { + /* + * Since we can be called from jobs handler, we need + * to free mbuf by self, do not leave this task to + * ipfw_check_packet(). + */ + NAT64STAT_INC(&cfg->stats, noproto); + m_freem(*pm); + *pm = NULL; + return (IP_FW_DENY); + } + + /* Try to find host first */ + I6HASH_FIND(cfg, nh, &f_id->src_ip6); + + if (nh == NULL) + return (nat64lsn_request_host(cfg, f_id, pm)); + + /* Fill-in on-stack state structure */ + kst.u.s.faddr = f_id->dst_ip6.s6_addr32[3]; + kst.u.s.fport = f_id->dst_port; + kst.u.s.lport = f_id->src_port; + + /* Prepare some fields we might need to update */ + hval = 0; + proto = nat64_getlasthdr(*pm, &hval); + if (proto < 0) { + NAT64STAT_INC(&cfg->stats, dropped); + DPRINTF(DP_DROPS, "dropped due to mbuf isn't contigious"); + m_freem(*pm); + *pm = NULL; + return (IP_FW_DENY); + } + + SET_AGE(state_ts); + if (proto == IPPROTO_TCP) + state_flags = convert_tcp_flags( + TCP(mtodo(*pm, hval))->th_flags); + else + state_flags = 0; + if (proto == IPPROTO_ICMPV6) { + /* Alter local port data */ + icmp6 = mtodo(*pm, hval); + if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST || + icmp6->icmp6_type == ICMP6_ECHO_REPLY) + kst.u.s.lport = ntohs(icmp6->icmp6_id); + } + + hval = HASH_IN4(&kst.u.hkey) & (nh->hsize - 1); + pg = NULL; + st = NULL; + + /* OK, let's find state in host hash */ + NAT64_LOCK(nh); + sidx = nh->phash[hval]; + int k = 0; + while (sidx.idx != 0) { + pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx); + st = &pg->states[sidx.off]; + //DPRINTF("SISX: %d/%d next: %d/%d", sidx.idx, sidx.off, + //st->next.idx, st->next.off); + if (st->u.hkey == kst.u.hkey && pg->nat_proto == nat_proto) + break; + if (k++ > 1000) { + DPRINTF(DP_ALL, "XXX: too long %d/%d %d/%d\n", + sidx.idx, sidx.off, st->next.idx, st->next.off); + inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)); + DPRINTF(DP_GENERIC, "TR host %s %p on cpu %d", + a, nh, curcpu); + k = 0; + } + sidx = st->next; + } + + if (sidx.idx == 0) { + aaddr = 0; + st = nat64lsn_create_state(cfg, nh, nat_proto, &kst, &aaddr); + if (st == NULL) { + /* No free states. 
Request more if we can */
+ if (nh->pg_used >= cfg->max_chunks) {
+ /* Limit reached */
+ NAT64STAT_INC(&cfg->stats, dropped);
+ inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
+ DPRINTF(DP_DROPS, "PG limit reached"
+ " for host %s (used %u, allocated %u, "
+ "limit %u)", a,
+ nh->pg_used * NAT64_CHUNK_SIZE,
+ nh->pg_allocated * NAT64_CHUNK_SIZE,
+ cfg->max_chunks * NAT64_CHUNK_SIZE);
+ m_freem(*pm);
+ *pm = NULL;
+ NAT64_UNLOCK(nh);
+ return (IP_FW_DENY);
+ }
+ if ((nh->pg_allocated <=
+ nh->pg_used + NAT64LSN_REMAININGPG) &&
+ nh->pg_allocated < cfg->max_chunks)
+ action = 1; /* Request new indexes */
+ else
+ action = 0;
+ NAT64_UNLOCK(nh);
+ //DPRINTF("No state, unlock for %p", nh);
+ return (nat64lsn_request_portgroup(cfg, f_id,
+ pm, aaddr, action));
+ }
+
+ /* We've got new state. */
+ sidx = st->cur;
+ pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx);
+ }
+
+ /* Okay, state found */
+
+ /* Update necessary fields */
+ if (st->timestamp != state_ts)
+ st->timestamp = state_ts;
+ if ((st->flags & state_flags) != state_flags)
+ st->flags |= state_flags;
+
+ /* Copy needed state data */
+ aaddr = pg->aaddr;
+ aport = htons(pg->aport + sidx.off);
+
+ NAT64_UNLOCK(nh);
+
+ if (cfg->flags & NAT64_LOG) {
+ logdata = &loghdr;
+ nat64lsn_log(logdata, *pm, AF_INET6, pg->idx, st->cur.off);
+ } else
+ logdata = NULL;
+
+ action = nat64_do_handle_ip6(*pm, aaddr, aport, &cfg->stats, logdata);
+ if (action == NAT64SKIP)
+ return (IP_FW_PASS);
+ if (action == NAT64MFREE)
+ m_freem(*pm);
+ *pm = NULL; /* mark mbuf as consumed */
+ return (IP_FW_DENY);
+}
+
+/*
+ * Main dataplane entry point.
+ */
+int
+ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args,
+ ipfw_insn *cmd, int *done)
+{
+ ipfw_insn *icmd;
+ struct nat64lsn_cfg *cfg;
+ int ret;
+
+ IPFW_RLOCK_ASSERT(ch);
+
+ *done = 1; /* terminate the search */
+ icmd = cmd + 1;
+ if (cmd->opcode != O_EXTERNAL_ACTION ||
+ cmd->arg1 != V_nat64lsn_eid ||
+ icmd->opcode != O_EXTERNAL_INSTANCE ||
+ (cfg = NAT64_LOOKUP(ch, icmd)) == NULL)
+ return (0);
+
+ switch (args->f_id.addr_type) {
+ case 4:
+ ret = nat64lsn_translate4(cfg, &args->f_id, &args->m);
+ break;
+ case 6:
+ ret = nat64lsn_translate6(cfg, &args->f_id, &args->m);
+ break;
+ default:
+ return (0);
+ }
+ return (ret);
+}
+
+static int
+nat64lsn_ctor_host(void *mem, int size, void *arg, int flags)
+{
+ struct nat64lsn_host *nh;
+
+ nh = (struct nat64lsn_host *)mem;
+ memset(nh->pg_ptr, 0, sizeof(nh->pg_ptr));
+ memset(nh->phash, 0, sizeof(nh->phash));
+ return (0);
+}
+
+static int
+nat64lsn_ctor_pgidx(void *mem, int size, void *arg, int flags)
+{
+
+ memset(mem, 0, size);
+ return (0);
+}
+
+void
+nat64lsn_init_internal(void)
+{
+
+ memset(nat64lsn_proto_map, 0, sizeof(nat64lsn_proto_map));
+ /* Set up supported protocol map */
+ nat64lsn_proto_map[IPPROTO_TCP] = NAT_PROTO_TCP;
+ nat64lsn_proto_map[IPPROTO_UDP] = NAT_PROTO_UDP;
+ nat64lsn_proto_map[IPPROTO_ICMP] = NAT_PROTO_ICMP;
+ nat64lsn_proto_map[IPPROTO_ICMPV6] = NAT_PROTO_ICMP;
+ /* Fill in reverse proto map */
+ memset(nat64lsn_rproto_map, 0, sizeof(nat64lsn_rproto_map));
+ nat64lsn_rproto_map[NAT_PROTO_TCP] = IPPROTO_TCP;
+ nat64lsn_rproto_map[NAT_PROTO_UDP] = IPPROTO_UDP;
+ nat64lsn_rproto_map[NAT_PROTO_ICMP] = IPPROTO_ICMPV6;
+
+ JQUEUE_LOCK_INIT();
+ nat64lsn_host_zone = uma_zcreate("NAT64 hosts zone",
+ sizeof(struct nat64lsn_host), nat64lsn_ctor_host, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+ nat64lsn_pg_zone = uma_zcreate("NAT64 portgroups zone",
+ sizeof(struct nat64lsn_portgroup), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
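/*
 * Editor's note on sizing, an illustration rather than part of the
 * patch: assuming 64-bit pointers, struct nat64lsn_portgroup is 32
 * bytes of metadata (host pointer, alias address/port, timestamp,
 * proto, index, freemask) followed by 64 states of 16 bytes each,
 * i.e. the "1024+32 bytes per 64 states" noted in nat64lsn.h, so each
 * item of the portgroup zone serves one 64-port chunk. The index zone
 * below hands out arrays of NAT64LSN_PGIDX_CHUNK (32) portgroup
 * pointers, 256 bytes each on LP64, which is the unit that
 * alloc_host6()/attach_portgroup() grow pg_ptr[] by.
 */
+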
nat64lsn_pgidx_zone = uma_zcreate("NAT64 portgroup indexes zone", + sizeof(struct nat64lsn_portgroup *) * NAT64LSN_PGIDX_CHUNK, + nat64lsn_ctor_pgidx, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); +} + +void +nat64lsn_uninit_internal(void) +{ + + JQUEUE_LOCK_DESTROY(); + uma_zdestroy(nat64lsn_host_zone); + uma_zdestroy(nat64lsn_pg_zone); + uma_zdestroy(nat64lsn_pgidx_zone); +} + +void +nat64lsn_start_instance(struct nat64lsn_cfg *cfg) +{ + + callout_reset(&cfg->periodic, hz * PERIODIC_DELAY, + nat64lsn_periodic, cfg); +} + +struct nat64lsn_cfg * +nat64lsn_init_instance(struct ip_fw_chain *ch, size_t numaddr) +{ + struct nat64lsn_cfg *cfg; + + cfg = malloc(sizeof(struct nat64lsn_cfg), M_IPFW, M_WAITOK | M_ZERO); + TAILQ_INIT(&cfg->jhead); + cfg->vp = curvnet; + cfg->ch = ch; + COUNTER_ARRAY_ALLOC(cfg->stats.stats, NAT64STATS, M_WAITOK); + + cfg->ihsize = NAT64LSN_HSIZE; + cfg->ih = malloc(sizeof(void *) * cfg->ihsize, M_IPFW, + M_WAITOK | M_ZERO); + + cfg->pg = malloc(sizeof(void *) * numaddr * _ADDR_PG_COUNT, M_IPFW, + M_WAITOK | M_ZERO); + + callout_init(&cfg->periodic, CALLOUT_MPSAFE); + callout_init(&cfg->jcallout, CALLOUT_MPSAFE); + + return (cfg); +} + +/* + * Destroy all hosts callback. + * Called on module unload when all activity already finished, so + * can work without any locks. + */ +static NAT64NOINLINE int +nat64lsn_destroy_host(struct nat64lsn_host *nh, struct nat64lsn_cfg *cfg) +{ + struct nat64lsn_portgroup *pg; + int i; + + for (i = nh->pg_used; i > 0; i--) { + pg = PORTGROUP_BYSIDX(cfg, nh, i); + if (pg == NULL) + continue; + cfg->pg[pg->idx] = NULL; + destroy_portgroup(pg); + nh->pg_used--; + } + destroy_host6(nh); + cfg->ihcount--; + return (0); +} + +void +nat64lsn_destroy_instance(struct nat64lsn_cfg *cfg) +{ + struct nat64lsn_host *nh, *tmp; + + JQUEUE_LOCK(); + callout_drain(&cfg->jcallout); + JQUEUE_UNLOCK(); + + callout_drain(&cfg->periodic); + I6HASH_FOREACH_SAFE(cfg, nh, tmp, nat64lsn_destroy_host, cfg); + DPRINTF(DP_OBJ, "instance %s: hosts %d", cfg->name, cfg->ihcount); + + COUNTER_ARRAY_FREE(cfg->stats.stats, NAT64STATS); + free(cfg->ih, M_IPFW); + free(cfg->pg, M_IPFW); + free(cfg, M_IPFW); +} + diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64lsn.h b/freebsd/sys/netpfil/ipfw/nat64/nat64lsn.h new file mode 100644 index 00000000..e6ceb1dd --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/nat64/nat64lsn.h @@ -0,0 +1,351 @@ +/*- + * Copyright (c) 2015 Yandex LLC + * Copyright (c) 2015 Alexander V. Chernikov <melifaro@FreeBSD.org> + * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IP_FW_NAT64LSN_H_ +#define _IP_FW_NAT64LSN_H_ + +#define NAT64_CHUNK_SIZE_BITS 6 /* 64 ports */ +#define NAT64_CHUNK_SIZE (1 << NAT64_CHUNK_SIZE_BITS) + +#define NAT64_MIN_PORT 1024 +#define NAT64_MIN_CHUNK (NAT64_MIN_PORT >> NAT64_CHUNK_SIZE_BITS) + +struct st_ptr { + uint8_t idx; /* index in nh->pg_ptr array. + * NOTE: it starts from 1. + */ + uint8_t off; +}; +#define NAT64LSN_MAXPGPTR ((1 << (sizeof(uint8_t) * NBBY)) - 1) +#define NAT64LSN_PGPTRMASKBITS (sizeof(uint64_t) * NBBY) +#define NAT64LSN_PGPTRNMASK (roundup(NAT64LSN_MAXPGPTR, \ + NAT64LSN_PGPTRMASKBITS) / NAT64LSN_PGPTRMASKBITS) + +struct nat64lsn_portgroup; +/* sizeof(struct nat64lsn_host) = 64 + 64x2 + 8x8 = 256 bytes */ +struct nat64lsn_host { + struct rwlock h_lock; /* Host states lock */ + + struct in6_addr addr; + struct nat64lsn_host *next; + uint16_t timestamp; /* Last altered */ + uint16_t hsize; /* ports hash size */ + uint16_t pg_used; /* Number of portgroups used */ +#define NAT64LSN_REMAININGPG 8 /* Number of remaining PG before + * requesting of new chunk of indexes. + */ + uint16_t pg_allocated; /* Number of portgroups indexes + * allocated. + */ +#define NAT64LSN_HSIZE 64 + struct st_ptr phash[NAT64LSN_HSIZE]; /* XXX: hardcoded size */ + /* + * PG indexes are stored in chunks with 32 elements. + * The maximum count is limited to 255 due to st_ptr->idx is uint8_t. + */ +#define NAT64LSN_PGIDX_CHUNK 32 +#define NAT64LSN_PGNIDX (roundup(NAT64LSN_MAXPGPTR, \ + NAT64LSN_PGIDX_CHUNK) / NAT64LSN_PGIDX_CHUNK) + struct nat64lsn_portgroup **pg_ptr[NAT64LSN_PGNIDX]; /* PG indexes */ +}; + +#define NAT64_RLOCK_ASSERT(h) rw_assert(&(h)->h_lock, RA_RLOCKED) +#define NAT64_WLOCK_ASSERT(h) rw_assert(&(h)->h_lock, RA_WLOCKED) + +#define NAT64_RLOCK(h) rw_rlock(&(h)->h_lock) +#define NAT64_RUNLOCK(h) rw_runlock(&(h)->h_lock) +#define NAT64_WLOCK(h) rw_wlock(&(h)->h_lock) +#define NAT64_WUNLOCK(h) rw_wunlock(&(h)->h_lock) +#define NAT64_LOCK(h) NAT64_WLOCK(h) +#define NAT64_UNLOCK(h) NAT64_WUNLOCK(h) +#define NAT64_LOCK_INIT(h) do { \ + rw_init(&(h)->h_lock, "NAT64 host lock"); \ + } while (0) + +#define NAT64_LOCK_DESTROY(h) do { \ + rw_destroy(&(h)->h_lock); \ + } while (0) + +/* Internal proto index */ +#define NAT_PROTO_TCP 1 +#define NAT_PROTO_UDP 2 +#define NAT_PROTO_ICMP 3 + +#define NAT_MAX_PROTO 4 +extern uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO]; + +VNET_DECLARE(uint16_t, nat64lsn_eid); +#define V_nat64lsn_eid VNET(nat64lsn_eid) +#define IPFW_TLV_NAT64LSN_NAME IPFW_TLV_EACTION_NAME(V_nat64lsn_eid) + +/* Timestamp macro */ +#define _CT ((int)time_uptime % 65536) +#define SET_AGE(x) (x) = _CT +#define GET_AGE(x) ((_CT >= (x)) ? 
_CT - (x) : \ + (int)65536 + _CT - (x)) + +#ifdef __LP64__ +/* ffsl() is capable of checking 64-bit ints */ +#define _FFS64 +#endif + +/* 16 bytes */ +struct nat64lsn_state { + union { + struct { + in_addr_t faddr; /* Remote IPv4 address */ + uint16_t fport; /* Remote IPv4 port */ + uint16_t lport; /* Local IPv6 port */ + }s; + uint64_t hkey; + } u; + uint8_t nat_proto; + uint8_t flags; + uint16_t timestamp; + struct st_ptr cur; /* Index of portgroup in nat64lsn_host */ + struct st_ptr next; /* Next entry index */ +}; + +/* + * 1024+32 bytes per 64 states, used to store state + * AND for outside-in state lookup + */ +struct nat64lsn_portgroup { + struct nat64lsn_host *host; /* IPv6 source host info */ + in_addr_t aaddr; /* Alias addr, network format */ + uint16_t aport; /* Base port */ + uint16_t timestamp; + uint8_t nat_proto; + uint8_t spare[3]; + uint32_t idx; +#ifdef _FFS64 + uint64_t freemask; /* Mask of free entries */ +#else + uint32_t freemask[2]; /* Mask of free entries */ +#endif + struct nat64lsn_state states[NAT64_CHUNK_SIZE]; /* State storage */ +}; +#ifdef _FFS64 +#define PG_MARK_BUSY_IDX(_pg, _idx) (_pg)->freemask &= ~((uint64_t)1<<(_idx)) +#define PG_MARK_FREE_IDX(_pg, _idx) (_pg)->freemask |= ((uint64_t)1<<(_idx)) +#define PG_IS_FREE_IDX(_pg, _idx) ((_pg)->freemask & ((uint64_t)1<<(_idx))) +#define PG_IS_BUSY_IDX(_pg, _idx) (PG_IS_FREE_IDX(_pg, _idx) == 0) +#define PG_GET_FREE_IDX(_pg) (ffsll((_pg)->freemask)) +#define PG_IS_EMPTY(_pg) (((_pg)->freemask + 1) == 0) +#else +#define PG_MARK_BUSY_IDX(_pg, _idx) \ + (_pg)->freemask[(_idx) / 32] &= ~((u_long)1<<((_idx) % 32)) +#define PG_MARK_FREE_IDX(_pg, _idx) \ + (_pg)->freemask[(_idx) / 32] |= ((u_long)1<<((_idx) % 32)) +#define PG_IS_FREE_IDX(_pg, _idx) \ + ((_pg)->freemask[(_idx) / 32] & ((u_long)1<<((_idx) % 32))) +#define PG_IS_BUSY_IDX(_pg, _idx) (PG_IS_FREE_IDX(_pg, _idx) == 0) +#define PG_GET_FREE_IDX(_pg) _pg_get_free_idx(_pg) +#define PG_IS_EMPTY(_pg) \ + ((((_pg)->freemask[0] + 1) == 0 && ((_pg)->freemask[1] + 1) == 0)) + +static inline int +_pg_get_free_idx(const struct nat64lsn_portgroup *pg) +{ + int i; + + if ((i = ffsl(pg->freemask[0])) != 0) + return (i); + if ((i = ffsl(pg->freemask[1])) != 0) + return (i + 32); + return (0); +} + +#endif + +TAILQ_HEAD(nat64lsn_job_head, nat64lsn_job_item); + +#define NAT64LSN_FLAGSMASK (NAT64_LOG) +struct nat64lsn_cfg { + struct named_object no; + //struct nat64_exthost *ex; /* Pointer to external addr array */ + struct nat64lsn_portgroup **pg; /* XXX: array of pointers */ + struct nat64lsn_host **ih; /* Host hash */ + uint32_t prefix4; /* IPv4 prefix */ + uint32_t pmask4; /* IPv4 prefix mask */ + uint32_t ihsize; /* IPv6 host hash size */ + uint8_t plen4; + uint8_t plen6; + uint8_t nomatch_verdict;/* What to return to ipfw on no-match */ + uint8_t nomatch_final; /* Exit outer loop? 
*/ + struct in6_addr prefix6; /* IPv6 prefix to embed IPv4 hosts */ + + uint32_t ihcount; /* Number of items in host hash */ + int max_chunks; /* Max chunks per client */ + int agg_prefix_len; /* Prefix length to count */ + int agg_prefix_max; /* Max hosts per agg prefix */ + uint32_t jmaxlen; /* Max jobqueue length */ + uint32_t flags; + uint16_t min_chunk; /* Min port group # to use */ + uint16_t max_chunk; /* Max port group # to use */ + uint16_t nh_delete_delay; /* Stale host delete delay */ + uint16_t pg_delete_delay; /* Stale portgroup del delay */ + uint16_t st_syn_ttl; /* TCP syn expire */ + uint16_t st_close_ttl; /* TCP fin expire */ + uint16_t st_estab_ttl; /* TCP established expire */ + uint16_t st_udp_ttl; /* UDP expire */ + uint16_t st_icmp_ttl; /* ICMP expire */ + uint32_t protochunks[NAT_MAX_PROTO];/* Number of chunks used */ + + struct callout periodic; + struct callout jcallout; + struct ip_fw_chain *ch; + struct vnet *vp; + struct nat64lsn_job_head jhead; + int jlen; + char name[64]; /* Nat instance name */ + nat64_stats_block stats; +}; + +struct nat64lsn_cfg *nat64lsn_init_instance(struct ip_fw_chain *ch, + size_t numaddr); +void nat64lsn_destroy_instance(struct nat64lsn_cfg *cfg); +void nat64lsn_start_instance(struct nat64lsn_cfg *cfg); +void nat64lsn_init_internal(void); +void nat64lsn_uninit_internal(void); +int ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args, + ipfw_insn *cmd, int *done); + +void +nat64lsn_dump_state(const struct nat64lsn_cfg *cfg, + const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st, + const char *px, int off); +/* + * Portgroup layout + * addr x nat_proto x port_off + * + */ + +#define _ADDR_PG_PROTO_COUNT (65536 >> NAT64_CHUNK_SIZE_BITS) +#define _ADDR_PG_COUNT (_ADDR_PG_PROTO_COUNT * NAT_MAX_PROTO) + +#define GET_ADDR_IDX(_cfg, _addr) ((_addr) - ((_cfg)->prefix4)) +#define __GET_PORTGROUP_IDX(_proto, _port) \ + ((_proto - 1) * _ADDR_PG_PROTO_COUNT + \ + ((_port) >> NAT64_CHUNK_SIZE_BITS)) + +#define _GET_PORTGROUP_IDX(_cfg, _addr, _proto, _port) \ + GET_ADDR_IDX(_cfg, _addr) * _ADDR_PG_COUNT + \ + __GET_PORTGROUP_IDX(_proto, _port) +#define GET_PORTGROUP(_cfg, _addr, _proto, _port) \ + ((_cfg)->pg[_GET_PORTGROUP_IDX(_cfg, _addr, _proto, _port)]) + +#define PORTGROUP_CHUNK(_nh, _idx) \ + ((_nh)->pg_ptr[(_idx)]) +#define PORTGROUP_BYSIDX(_cfg, _nh, _idx) \ + (PORTGROUP_CHUNK(_nh, (_idx - 1) / NAT64LSN_PGIDX_CHUNK) \ + [((_idx) - 1) % NAT64LSN_PGIDX_CHUNK]) + + +/* Chained hash table */ +#define CHT_FIND(_ph, _hsize, _PX, _x, _key) do { \ + unsigned int _buck = _PX##hash(_key) & (_hsize - 1); \ + _PX##lock(_ph, _buck); \ + _x = _PX##first(_ph, _buck); \ + for ( ; _x != NULL; _x = _PX##next(_x)) { \ + if (_PX##cmp(_key, _PX##val(_x))) \ + break; \ + } \ + if (_x == NULL) \ + _PX##unlock(_ph, _buck); \ +} while(0) + +#define CHT_UNLOCK_BUCK(_ph, _PX, _buck) \ + _PX##unlock(_ph, _buck); + +#define CHT_UNLOCK_KEY(_ph, _hsize, _PX, _key) do { \ + unsigned int _buck = _PX##hash(_key) & (_hsize - 1); \ + _PX##unlock(_ph, _buck); \ +} while(0) + +#define CHT_INSERT_HEAD(_ph, _hsize, _PX, _i) do { \ + unsigned int _buck = _PX##hash(_PX##val(_i)) & (_hsize - 1); \ + _PX##lock(_ph, _buck); \ + _PX##next(_i) = _PX##first(_ph, _buck); \ + _PX##first(_ph, _buck) = _i; \ + _PX##unlock(_ph, _buck); \ +} while(0) + +#define CHT_REMOVE(_ph, _hsize, _PX, _x, _tmp, _key) do { \ + unsigned int _buck = _PX##hash(_key) & (_hsize - 1); \ + _PX##lock(_ph, _buck); \ + _x = _PX##first(_ph, _buck); \ + _tmp = NULL; \ + for ( ; _x != NULL; _tmp 
= _x, _x = _PX##next(_x)) { \ + if (_PX##cmp(_key, _PX##val(_x))) \ + break; \ + } \ + if (_x != NULL) { \ + if (_tmp == NULL) \ + _PX##first(_ph, _buck) = _PX##next(_x); \ + else \ + _PX##next(_tmp) = _PX##next(_x); \ + } \ + _PX##unlock(_ph, _buck); \ +} while(0) + +#define CHT_FOREACH_SAFE(_ph, _hsize, _PX, _x, _tmp, _cb, _arg) do { \ + for (unsigned int _i = 0; _i < _hsize; _i++) { \ + _PX##lock(_ph, _i); \ + _x = _PX##first(_ph, _i); \ + _tmp = NULL; \ + for (; _x != NULL; _tmp = _x, _x = _PX##next(_x)) { \ + if (_cb(_x, _arg) == 0) \ + continue; \ + if (_tmp == NULL) \ + _PX##first(_ph, _i) = _PX##next(_x); \ + else \ + _tmp = _PX##next(_x); \ + } \ + _PX##unlock(_ph, _i); \ + } \ +} while(0) + +#define CHT_RESIZE(_ph, _hsize, _nph, _nhsize, _PX, _x, _y) do { \ + unsigned int _buck; \ + for (unsigned int _i = 0; _i < _hsize; _i++) { \ + _x = _PX##first(_ph, _i); \ + _y = _x; \ + while (_y != NULL) { \ + _buck = _PX##hash(_PX##val(_x)) & (_nhsize - 1);\ + _y = _PX##next(_x); \ + _PX##next(_x) = _PX##first(_nph, _buck); \ + _PX##first(_nph, _buck) = _x; \ + } \ + } \ +} while(0) + +#endif /* _IP_FW_NAT64LSN_H_ */ + diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64lsn_control.c b/freebsd/sys/netpfil/ipfw/nat64/nat64lsn_control.c new file mode 100644 index 00000000..a20a52ea --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/nat64/nat64lsn_control.c @@ -0,0 +1,919 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2015 Yandex LLC + * Copyright (c) 2015 Alexander V. Chernikov <melifaro@FreeBSD.org> + * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
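The CHT_* macros above implement a generic chained hash with per-bucket locking: a consumer supplies its own operations through a token-pasted prefix (_PX##hash, _PX##first, _PX##next, _PX##val, _PX##cmp, _PX##lock, _PX##unlock). The sketch below shows the glue a hypothetical consumer would define, with locking stubbed out; it assumes the CHT_* macros above are in scope. Note that CHT_FIND unlocks the bucket only on a miss, so on a hit the caller must release it with CHT_UNLOCK_KEY (or CHT_UNLOCK_BUCK) when done.

    #include <stddef.h>

    struct ex_item {
    	struct ex_item	*next;
    	int		key;
    };

    static struct ex_item *ex_heads[64];	/* power-of-two bucket count */

    #define	ex_hash(k)	((unsigned int)(k) * 2654435761u)
    #define	ex_first(ph, b)	(ph)[(b)]
    #define	ex_next(x)	(x)->next
    #define	ex_val(x)	(x)->key
    #define	ex_cmp(k, v)	((k) == (v))
    #define	ex_lock(ph, b)	do { (void)(ph); (void)(b); } while (0)
    #define	ex_unlock(ph, b) do { (void)(ph); (void)(b); } while (0)

    static void
    ex_insert(struct ex_item *i)
    {

    	CHT_INSERT_HEAD(ex_heads, 64, ex_, i);
    }

    static struct ex_item *
    ex_lookup(int key)
    {
    	struct ex_item *x;

    	CHT_FIND(ex_heads, 64, ex_, x, key);
    	if (x != NULL)		/* hit: bucket is still held */
    		CHT_UNLOCK_KEY(ex_heads, 64, ex_, key);
    	return (x);
    }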
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/counter.h> +#include <rtems/bsd/sys/errno.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/rmlock.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/sockopt.h> +#include <sys/queue.h> + +#include <net/if.h> +#include <net/pfil.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_fw.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/nat64/ip_fw_nat64.h> +#include <netpfil/ipfw/nat64/nat64lsn.h> +#include <netinet6/ip_fw_nat64.h> + +VNET_DEFINE(uint16_t, nat64lsn_eid) = 0; + +static struct nat64lsn_cfg * +nat64lsn_find(struct namedobj_instance *ni, const char *name, uint8_t set) +{ + struct nat64lsn_cfg *cfg; + + cfg = (struct nat64lsn_cfg *)ipfw_objhash_lookup_name_type(ni, set, + IPFW_TLV_NAT64LSN_NAME, name); + + return (cfg); +} + +static void +nat64lsn_default_config(ipfw_nat64lsn_cfg *uc) +{ + + if (uc->max_ports == 0) + uc->max_ports = NAT64LSN_MAX_PORTS; + else + uc->max_ports = roundup(uc->max_ports, NAT64_CHUNK_SIZE); + if (uc->max_ports > NAT64_CHUNK_SIZE * NAT64LSN_MAXPGPTR) + uc->max_ports = NAT64_CHUNK_SIZE * NAT64LSN_MAXPGPTR; + if (uc->jmaxlen == 0) + uc->jmaxlen = NAT64LSN_JMAXLEN; + if (uc->jmaxlen > 65536) + uc->jmaxlen = 65536; + if (uc->nh_delete_delay == 0) + uc->nh_delete_delay = NAT64LSN_HOST_AGE; + if (uc->pg_delete_delay == 0) + uc->pg_delete_delay = NAT64LSN_PG_AGE; + if (uc->st_syn_ttl == 0) + uc->st_syn_ttl = NAT64LSN_TCP_SYN_AGE; + if (uc->st_close_ttl == 0) + uc->st_close_ttl = NAT64LSN_TCP_FIN_AGE; + if (uc->st_estab_ttl == 0) + uc->st_estab_ttl = NAT64LSN_TCP_EST_AGE; + if (uc->st_udp_ttl == 0) + uc->st_udp_ttl = NAT64LSN_UDP_AGE; + if (uc->st_icmp_ttl == 0) + uc->st_icmp_ttl = NAT64LSN_ICMP_AGE; +} + +/* + * Creates new nat64lsn instance. 
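nat64lsn_default_config() above normalizes a user-supplied configuration in place: zero-valued fields take compiled-in defaults, jmaxlen is clamped to 65536, and max_ports is rounded up to whole NAT64_CHUNK_SIZE port chunks and capped at NAT64_CHUNK_SIZE * NAT64LSN_MAXPGPTR. A standalone illustration of the rounding step (NAT64_CHUNK_SIZE is 64, per nat64lsn.h; ROUNDUP mirrors the kernel's roundup()):

    #include <stdio.h>

    #define	CHUNK		64	/* NAT64_CHUNK_SIZE */
    #define	ROUNDUP(x, y)	((((x) + ((y) - 1)) / (y)) * (y))

    int
    main(void)
    {
    	unsigned int max_ports = 1000;

    	max_ports = ROUNDUP(max_ports, CHUNK);
    	printf("%u ports -> %u chunks\n", max_ports, max_ports / CHUNK);
    	/* prints: 1024 ports -> 16 chunks */
    	return (0);
    }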
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ipfw_nat64lsn_cfg ] + * + * Returns 0 on success + */ +static int +nat64lsn_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_lheader *olh; + ipfw_nat64lsn_cfg *uc; + struct nat64lsn_cfg *cfg; + struct namedobj_instance *ni; + uint32_t addr4, mask4; + + if (sd->valsize != sizeof(*olh) + sizeof(*uc)) + return (EINVAL); + + olh = (ipfw_obj_lheader *)sd->kbuf; + uc = (ipfw_nat64lsn_cfg *)(olh + 1); + + if (ipfw_check_object_name_generic(uc->name) != 0) + return (EINVAL); + + if (uc->agg_prefix_len > 127 || uc->set >= IPFW_MAX_SETS) + return (EINVAL); + + if (uc->plen4 > 32) + return (EINVAL); + if (uc->plen6 > 128 || ((uc->plen6 % 8) != 0)) + return (EINVAL); + + /* XXX: Check prefix4 to be global */ + addr4 = ntohl(uc->prefix4.s_addr); + mask4 = ~((1 << (32 - uc->plen4)) - 1); + if ((addr4 & mask4) != addr4) + return (EINVAL); + + /* XXX: Check prefix6 */ + if (uc->min_port == 0) + uc->min_port = NAT64_MIN_PORT; + if (uc->max_port == 0) + uc->max_port = 65535; + if (uc->min_port > uc->max_port) + return (EINVAL); + uc->min_port = roundup(uc->min_port, NAT64_CHUNK_SIZE); + uc->max_port = roundup(uc->max_port, NAT64_CHUNK_SIZE); + + nat64lsn_default_config(uc); + + ni = CHAIN_TO_SRV(ch); + IPFW_UH_RLOCK(ch); + if (nat64lsn_find(ni, uc->name, uc->set) != NULL) { + IPFW_UH_RUNLOCK(ch); + return (EEXIST); + } + IPFW_UH_RUNLOCK(ch); + + cfg = nat64lsn_init_instance(ch, 1 << (32 - uc->plen4)); + strlcpy(cfg->name, uc->name, sizeof(cfg->name)); + cfg->no.name = cfg->name; + cfg->no.etlv = IPFW_TLV_NAT64LSN_NAME; + cfg->no.set = uc->set; + + cfg->prefix4 = addr4; + cfg->pmask4 = addr4 | ~mask4; + /* XXX: Copy 96 bits */ + cfg->plen6 = 96; + memcpy(&cfg->prefix6, &uc->prefix6, cfg->plen6 / 8); + cfg->plen4 = uc->plen4; + cfg->flags = uc->flags & NAT64LSN_FLAGSMASK; + cfg->max_chunks = uc->max_ports / NAT64_CHUNK_SIZE; + cfg->agg_prefix_len = uc->agg_prefix_len; + cfg->agg_prefix_max = uc->agg_prefix_max; + + cfg->min_chunk = uc->min_port / NAT64_CHUNK_SIZE; + cfg->max_chunk = uc->max_port / NAT64_CHUNK_SIZE; + + cfg->jmaxlen = uc->jmaxlen; + cfg->nh_delete_delay = uc->nh_delete_delay; + cfg->pg_delete_delay = uc->pg_delete_delay; + cfg->st_syn_ttl = uc->st_syn_ttl; + cfg->st_close_ttl = uc->st_close_ttl; + cfg->st_estab_ttl = uc->st_estab_ttl; + cfg->st_udp_ttl = uc->st_udp_ttl; + cfg->st_icmp_ttl = uc->st_icmp_ttl; + + cfg->nomatch_verdict = IP_FW_DENY; + cfg->nomatch_final = 1; /* Exit outer loop by default */ + + IPFW_UH_WLOCK(ch); + + if (nat64lsn_find(ni, uc->name, uc->set) != NULL) { + IPFW_UH_WUNLOCK(ch); + nat64lsn_destroy_instance(cfg); + return (EEXIST); + } + + if (ipfw_objhash_alloc_idx(CHAIN_TO_SRV(ch), &cfg->no.kidx) != 0) { + IPFW_UH_WUNLOCK(ch); + nat64lsn_destroy_instance(cfg); + return (ENOSPC); + } + ipfw_objhash_add(CHAIN_TO_SRV(ch), &cfg->no); + + /* Okay, let's link data */ + IPFW_WLOCK(ch); + SRV_OBJECT(ch, cfg->no.kidx) = cfg; + IPFW_WUNLOCK(ch); + + nat64lsn_start_instance(cfg); + + IPFW_UH_WUNLOCK(ch); + return (0); +} + +static void +nat64lsn_detach_config(struct ip_fw_chain *ch, struct nat64lsn_cfg *cfg) +{ + + IPFW_UH_WLOCK_ASSERT(ch); + + ipfw_objhash_del(CHAIN_TO_SRV(ch), &cfg->no); + ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), cfg->no.kidx); +} + +/* + * Destroys nat64 instance. 
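nat64lsn_create() above rejects an IPv4 prefix with host bits set: it derives the netmask from plen4 and requires (addr4 & mask4) == addr4. A standalone version of that check follows; the sketch restricts plen to 1..32, since shifting a 32-bit value by 32 bits (the plen == 0 case, which the handler's plen4 > 32 test alone would not reject) is undefined behavior in C.

    #include <stdint.h>
    #include <stdio.h>

    static int
    prefix4_aligned(uint32_t addr, unsigned int plen)
    {
    	uint32_t mask;

    	if (plen < 1 || plen > 32)
    		return (0);
    	mask = ~((UINT32_C(1) << (32 - plen)) - 1);
    	return ((addr & mask) == addr);
    }

    int
    main(void)
    {
    	/* 198.51.100.0/24 is aligned, 198.51.100.1/24 is not. */
    	printf("%d %d\n", prefix4_aligned(0xC6336400, 24),
    	    prefix4_aligned(0xC6336401, 24));
    	return (0);
    }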
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_header ] + * + * Returns 0 on success + */ +static int +nat64lsn_destroy(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct nat64lsn_cfg *cfg; + ipfw_obj_header *oh; + + if (sd->valsize != sizeof(*oh)) + return (EINVAL); + + oh = (ipfw_obj_header *)op3; + + IPFW_UH_WLOCK(ch); + cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } + + if (cfg->no.refcnt > 0) { + IPFW_UH_WUNLOCK(ch); + return (EBUSY); + } + + IPFW_WLOCK(ch); + SRV_OBJECT(ch, cfg->no.kidx) = NULL; + IPFW_WUNLOCK(ch); + + nat64lsn_detach_config(ch, cfg); + IPFW_UH_WUNLOCK(ch); + + nat64lsn_destroy_instance(cfg); + return (0); +} + +#define __COPY_STAT_FIELD(_cfg, _stats, _field) \ + (_stats)->_field = NAT64STAT_FETCH(&(_cfg)->stats, _field) +static void +export_stats(struct ip_fw_chain *ch, struct nat64lsn_cfg *cfg, + struct ipfw_nat64lsn_stats *stats) +{ + + __COPY_STAT_FIELD(cfg, stats, opcnt64); + __COPY_STAT_FIELD(cfg, stats, opcnt46); + __COPY_STAT_FIELD(cfg, stats, ofrags); + __COPY_STAT_FIELD(cfg, stats, ifrags); + __COPY_STAT_FIELD(cfg, stats, oerrors); + __COPY_STAT_FIELD(cfg, stats, noroute4); + __COPY_STAT_FIELD(cfg, stats, noroute6); + __COPY_STAT_FIELD(cfg, stats, nomatch4); + __COPY_STAT_FIELD(cfg, stats, noproto); + __COPY_STAT_FIELD(cfg, stats, nomem); + __COPY_STAT_FIELD(cfg, stats, dropped); + + __COPY_STAT_FIELD(cfg, stats, jcalls); + __COPY_STAT_FIELD(cfg, stats, jrequests); + __COPY_STAT_FIELD(cfg, stats, jhostsreq); + __COPY_STAT_FIELD(cfg, stats, jportreq); + __COPY_STAT_FIELD(cfg, stats, jhostfails); + __COPY_STAT_FIELD(cfg, stats, jportfails); + __COPY_STAT_FIELD(cfg, stats, jmaxlen); + __COPY_STAT_FIELD(cfg, stats, jnomem); + __COPY_STAT_FIELD(cfg, stats, jreinjected); + __COPY_STAT_FIELD(cfg, stats, screated); + __COPY_STAT_FIELD(cfg, stats, sdeleted); + __COPY_STAT_FIELD(cfg, stats, spgcreated); + __COPY_STAT_FIELD(cfg, stats, spgdeleted); + + stats->hostcount = cfg->ihcount; + stats->tcpchunks = cfg->protochunks[NAT_PROTO_TCP]; + stats->udpchunks = cfg->protochunks[NAT_PROTO_UDP]; + stats->icmpchunks = cfg->protochunks[NAT_PROTO_ICMP]; +} +#undef __COPY_STAT_FIELD + +static void +nat64lsn_export_config(struct ip_fw_chain *ch, struct nat64lsn_cfg *cfg, + ipfw_nat64lsn_cfg *uc) +{ + + uc->flags = cfg->flags & NAT64LSN_FLAGSMASK; + uc->max_ports = cfg->max_chunks * NAT64_CHUNK_SIZE; + uc->agg_prefix_len = cfg->agg_prefix_len; + uc->agg_prefix_max = cfg->agg_prefix_max; + + uc->jmaxlen = cfg->jmaxlen; + uc->nh_delete_delay = cfg->nh_delete_delay; + uc->pg_delete_delay = cfg->pg_delete_delay; + uc->st_syn_ttl = cfg->st_syn_ttl; + uc->st_close_ttl = cfg->st_close_ttl; + uc->st_estab_ttl = cfg->st_estab_ttl; + uc->st_udp_ttl = cfg->st_udp_ttl; + uc->st_icmp_ttl = cfg->st_icmp_ttl; + uc->prefix4.s_addr = htonl(cfg->prefix4); + uc->prefix6 = cfg->prefix6; + uc->plen4 = cfg->plen4; + uc->plen6 = cfg->plen6; + uc->set = cfg->no.set; + strlcpy(uc->name, cfg->no.name, sizeof(uc->name)); +} + +struct nat64_dump_arg { + struct ip_fw_chain *ch; + struct sockopt_data *sd; +}; + +static int +export_config_cb(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + struct nat64_dump_arg *da = (struct nat64_dump_arg *)arg; + ipfw_nat64lsn_cfg *uc; + + uc = (struct _ipfw_nat64lsn_cfg *)ipfw_get_sopt_space(da->sd, + sizeof(*uc)); + nat64lsn_export_config(da->ch, (struct nat64lsn_cfg *)no, uc); + return (0); +} + +/* + * Lists all nat64 
lsn instances currently available in kernel. + * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ] + * Reply: [ ipfw_obj_lheader ipfw_nat64lsn_cfg x N ] + * + * Returns 0 on success + */ +static int +nat64lsn_list(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_lheader *olh; + struct nat64_dump_arg da; + + /* Check minimum header size */ + if (sd->valsize < sizeof(ipfw_obj_lheader)) + return (EINVAL); + + olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh)); + + IPFW_UH_RLOCK(ch); + olh->count = ipfw_objhash_count_type(CHAIN_TO_SRV(ch), + IPFW_TLV_NAT64LSN_NAME); + olh->objsize = sizeof(ipfw_nat64lsn_cfg); + olh->size = sizeof(*olh) + olh->count * olh->objsize; + + if (sd->valsize < olh->size) { + IPFW_UH_RUNLOCK(ch); + return (ENOMEM); + } + memset(&da, 0, sizeof(da)); + da.ch = ch; + da.sd = sd; + ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), export_config_cb, &da, + IPFW_TLV_NAT64LSN_NAME); + IPFW_UH_RUNLOCK(ch); + + return (0); +} + +/* + * Change existing nat64lsn instance configuration. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_nat64lsn_cfg ] + * Reply: [ ipfw_obj_header ipfw_nat64lsn_cfg ] + * + * Returns 0 on success + */ +static int +nat64lsn_config(struct ip_fw_chain *ch, ip_fw3_opheader *op, + struct sockopt_data *sd) +{ + ipfw_obj_header *oh; + ipfw_nat64lsn_cfg *uc; + struct nat64lsn_cfg *cfg; + struct namedobj_instance *ni; + + if (sd->valsize != sizeof(*oh) + sizeof(*uc)) + return (EINVAL); + + oh = (ipfw_obj_header *)ipfw_get_sopt_space(sd, + sizeof(*oh) + sizeof(*uc)); + uc = (ipfw_nat64lsn_cfg *)(oh + 1); + + if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || + oh->ntlv.set >= IPFW_MAX_SETS) + return (EINVAL); + + ni = CHAIN_TO_SRV(ch); + if (sd->sopt->sopt_dir == SOPT_GET) { + IPFW_UH_RLOCK(ch); + cfg = nat64lsn_find(ni, oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_RUNLOCK(ch); + return (EEXIST); + } + nat64lsn_export_config(ch, cfg, uc); + IPFW_UH_RUNLOCK(ch); + return (0); + } + + nat64lsn_default_config(uc); + + IPFW_UH_WLOCK(ch); + cfg = nat64lsn_find(ni, oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_WUNLOCK(ch); + return (EEXIST); + } + + /* + * For now allow to change only following values: + * jmaxlen, nh_del_age, pg_del_age, tcp_syn_age, tcp_close_age, + * tcp_est_age, udp_age, icmp_age, flags, max_ports. + */ + + cfg->max_chunks = uc->max_ports / NAT64_CHUNK_SIZE; + cfg->jmaxlen = uc->jmaxlen; + cfg->nh_delete_delay = uc->nh_delete_delay; + cfg->pg_delete_delay = uc->pg_delete_delay; + cfg->st_syn_ttl = uc->st_syn_ttl; + cfg->st_close_ttl = uc->st_close_ttl; + cfg->st_estab_ttl = uc->st_estab_ttl; + cfg->st_udp_ttl = uc->st_udp_ttl; + cfg->st_icmp_ttl = uc->st_icmp_ttl; + cfg->flags = uc->flags & NAT64LSN_FLAGSMASK; + + IPFW_UH_WUNLOCK(ch); + + return (0); +} + +/* + * Get nat64lsn statistics. 
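nat64lsn_list() above uses the standard two-pass ipfw sizing handshake: the list header (count, objsize, size) is always filled in, and ENOMEM tells the caller its buffer was too small. Userland then retries with the size the kernel reported. A sketch of the consumer side of that loop; do_get3() stands in for the IP_FW3 sockopt transport that the ipfw(8) utility uses, its exact signature here is an assumption, and allocation-failure handling is elided.

    size_t len = sizeof(ipfw_obj_lheader);
    ipfw_obj_lheader *olh = calloc(1, len);

    for (;;) {
    	if (do_get3(IP_FW_NAT64LSN_LIST, olh, &len) == 0)
    		break;			/* olh->count configs follow olh */
    	if (errno != ENOMEM)
    		err(1, "nat64lsn list");
    	len = olh->size;		/* required size, as reported above */
    	olh = realloc(olh, len);	/* hypothetical: check for NULL */
    	memset(olh, 0, len);
    }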
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_header ] + * Reply: [ ipfw_obj_header ipfw_counter_tlv ] + * + * Returns 0 on success + */ +static int +nat64lsn_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, + struct sockopt_data *sd) +{ + struct ipfw_nat64lsn_stats stats; + struct nat64lsn_cfg *cfg; + ipfw_obj_header *oh; + ipfw_obj_ctlv *ctlv; + size_t sz; + + sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) + sizeof(stats); + if (sd->valsize % sizeof(uint64_t)) + return (EINVAL); + if (sd->valsize < sz) + return (ENOMEM); + oh = (ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); + if (oh == NULL) + return (EINVAL); + memset(&stats, 0, sizeof(stats)); + + IPFW_UH_RLOCK(ch); + cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_RUNLOCK(ch); + return (ESRCH); + } + + export_stats(ch, cfg, &stats); + IPFW_UH_RUNLOCK(ch); + + ctlv = (ipfw_obj_ctlv *)(oh + 1); + memset(ctlv, 0, sizeof(*ctlv)); + ctlv->head.type = IPFW_TLV_COUNTERS; + ctlv->head.length = sz - sizeof(ipfw_obj_header); + ctlv->count = sizeof(stats) / sizeof(uint64_t); + ctlv->objsize = sizeof(uint64_t); + ctlv->version = IPFW_NAT64_VERSION; + memcpy(ctlv + 1, &stats, sizeof(stats)); + return (0); +} + +/* + * Reset nat64lsn statistics. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ] + * + * Returns 0 on success + */ +static int +nat64lsn_reset_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, + struct sockopt_data *sd) +{ + struct nat64lsn_cfg *cfg; + ipfw_obj_header *oh; + + if (sd->valsize != sizeof(*oh)) + return (EINVAL); + oh = (ipfw_obj_header *)sd->kbuf; + if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || + oh->ntlv.set >= IPFW_MAX_SETS) + return (EINVAL); + + IPFW_UH_WLOCK(ch); + cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } + COUNTER_ARRAY_ZERO(cfg->stats.stats, NAT64STATS); + IPFW_UH_WUNLOCK(ch); + return (0); +} + +/* + * Reply: [ ipfw_obj_header ipfw_obj_data [ ipfw_nat64lsn_stg + * ipfw_nat64lsn_state x count, ... 
] ] + */ +static int +export_pg_states(struct nat64lsn_cfg *cfg, struct nat64lsn_portgroup *pg, + ipfw_nat64lsn_stg *stg, struct sockopt_data *sd) +{ + ipfw_nat64lsn_state *ste; + struct nat64lsn_state *st; + int i, count; + + NAT64_LOCK(pg->host); + count = 0; + for (i = 0; i < 64; i++) { + if (PG_IS_BUSY_IDX(pg, i)) + count++; + } + DPRINTF(DP_STATE, "EXPORT PG %d, count %d", pg->idx, count); + + if (count == 0) { + stg->count = 0; + NAT64_UNLOCK(pg->host); + return (0); + } + ste = (ipfw_nat64lsn_state *)ipfw_get_sopt_space(sd, + count * sizeof(ipfw_nat64lsn_state)); + if (ste == NULL) { + NAT64_UNLOCK(pg->host); + return (1); + } + + stg->alias4.s_addr = pg->aaddr; + stg->proto = nat64lsn_rproto_map[pg->nat_proto]; + stg->flags = 0; + stg->host6 = pg->host->addr; + stg->count = count; + for (i = 0; i < 64; i++) { + if (PG_IS_FREE_IDX(pg, i)) + continue; + st = &pg->states[i]; + ste->daddr.s_addr = st->u.s.faddr; + ste->dport = st->u.s.fport; + ste->aport = pg->aport + i; + ste->sport = st->u.s.lport; + ste->flags = st->flags; /* XXX filter flags */ + ste->idle = GET_AGE(st->timestamp); + ste++; + } + NAT64_UNLOCK(pg->host); + + return (0); +} + +static int +get_next_idx(struct nat64lsn_cfg *cfg, uint32_t *addr, uint8_t *nat_proto, + uint16_t *port) +{ + + if (*port < 65536 - NAT64_CHUNK_SIZE) { + *port += NAT64_CHUNK_SIZE; + return (0); + } + *port = 0; + + if (*nat_proto < NAT_MAX_PROTO - 1) { + *nat_proto += 1; + return (0); + } + *nat_proto = 1; + + if (*addr < cfg->pmask4) { + *addr += 1; + return (0); + } + + /* End of space. */ + return (1); +} + +#define PACK_IDX(addr, proto, port) \ + ((uint64_t)addr << 32) | ((uint32_t)port << 16) | (proto << 8) +#define UNPACK_IDX(idx, addr, proto, port) \ + (addr) = (uint32_t)((idx) >> 32); \ + (port) = (uint16_t)(((idx) >> 16) & 0xFFFF); \ + (proto) = (uint8_t)(((idx) >> 8) & 0xFF) + +static struct nat64lsn_portgroup * +get_next_pg(struct nat64lsn_cfg *cfg, uint32_t *addr, uint8_t *nat_proto, + uint16_t *port) +{ + struct nat64lsn_portgroup *pg; + uint64_t pre_pack, post_pack; + + pg = NULL; + pre_pack = PACK_IDX(*addr, *nat_proto, *port); + for (;;) { + if (get_next_idx(cfg, addr, nat_proto, port) != 0) { + /* End of states */ + return (pg); + } + + pg = GET_PORTGROUP(cfg, *addr, *nat_proto, *port); + if (pg != NULL) + break; + } + + post_pack = PACK_IDX(*addr, *nat_proto, *port); + if (pre_pack == post_pack) + DPRINTF(DP_STATE, "XXX: PACK_IDX %u %d %d", + *addr, *nat_proto, *port); + return (pg); +} + +static NAT64NOINLINE struct nat64lsn_portgroup * +get_first_pg(struct nat64lsn_cfg *cfg, uint32_t *addr, uint8_t *nat_proto, + uint16_t *port) +{ + struct nat64lsn_portgroup *pg; + + pg = GET_PORTGROUP(cfg, *addr, *nat_proto, *port); + if (pg == NULL) + pg = get_next_pg(cfg, addr, nat_proto, port); + + return (pg); +} + +/* + * Lists nat64lsn states. 
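PACK_IDX()/UNPACK_IDX() above encode the iterator position (alias address, internal protocol, base port) into the single 64-bit resume cookie that nat64lsn_states() hands back to userland. A standalone round-trip check of the encoding; the local PACK() copy is fully parenthesized, whereas the header macros are written for the simple statement contexts they are used in.

    #include <assert.h>
    #include <stdint.h>

    #define	PACK(addr, proto, port)					\
    	(((uint64_t)(addr) << 32) | ((uint32_t)(port) << 16) |	\
    	    ((uint64_t)(proto) << 8))

    int
    main(void)
    {
    	uint32_t addr = 0xC0000200;	/* 192.0.2.0 */
    	uint8_t proto = 1;		/* NAT_PROTO_TCP */
    	uint16_t port = 1024;
    	uint64_t idx = PACK(addr, proto, port);

    	/* The same extractions UNPACK_IDX() performs: */
    	assert((uint32_t)(idx >> 32) == addr);
    	assert((uint16_t)((idx >> 16) & 0xFFFF) == port);
    	assert((uint8_t)((idx >> 8) & 0xFF) == proto);
    	return (0);
    }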
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_obj_data [ uint64_t ]] + * Reply: [ ipfw_obj_header ipfw_obj_data [ + * ipfw_nat64lsn_stg ipfw_nat64lsn_state x N] ] + * + * Returns 0 on success + */ +static int +nat64lsn_states(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_header *oh; + ipfw_obj_data *od; + ipfw_nat64lsn_stg *stg; + struct nat64lsn_cfg *cfg; + struct nat64lsn_portgroup *pg, *pg_next; + uint64_t next_idx; + size_t sz; + uint32_t addr, states; + uint16_t port; + uint8_t nat_proto; + + sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_data) + + sizeof(uint64_t); + /* Check minimum header size */ + if (sd->valsize < sz) + return (EINVAL); + + oh = (ipfw_obj_header *)sd->kbuf; + od = (ipfw_obj_data *)(oh + 1); + if (od->head.type != IPFW_TLV_OBJDATA || + od->head.length != sz - sizeof(ipfw_obj_header)) + return (EINVAL); + + next_idx = *(uint64_t *)(od + 1); + /* Translate index to the request position to start from */ + UNPACK_IDX(next_idx, addr, nat_proto, port); + if (nat_proto >= NAT_MAX_PROTO) + return (EINVAL); + if (nat_proto == 0 && addr != 0) + return (EINVAL); + + IPFW_UH_RLOCK(ch); + cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_RUNLOCK(ch); + return (ESRCH); + } + /* Fill in starting point */ + if (addr == 0) { + addr = cfg->prefix4; + nat_proto = 1; + port = 0; + } + if (addr < cfg->prefix4 || addr > cfg->pmask4) { + IPFW_UH_RUNLOCK(ch); + DPRINTF(DP_GENERIC | DP_STATE, "XXX: %ju %u %u", + (uintmax_t)next_idx, addr, cfg->pmask4); + return (EINVAL); + } + + sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_data) + + sizeof(ipfw_nat64lsn_stg); + if (sd->valsize < sz) + return (ENOMEM); + oh = (ipfw_obj_header *)ipfw_get_sopt_space(sd, sz); + od = (ipfw_obj_data *)(oh + 1); + od->head.type = IPFW_TLV_OBJDATA; + od->head.length = sz - sizeof(ipfw_obj_header); + stg = (ipfw_nat64lsn_stg *)(od + 1); + + pg = get_first_pg(cfg, &addr, &nat_proto, &port); + if (pg == NULL) { + /* No states */ + stg->next_idx = 0xFF; + stg->count = 0; + IPFW_UH_RUNLOCK(ch); + return (0); + } + states = 0; + pg_next = NULL; + while (pg != NULL) { + pg_next = get_next_pg(cfg, &addr, &nat_proto, &port); + if (pg_next == NULL) + stg->next_idx = 0xFF; + else + stg->next_idx = PACK_IDX(addr, nat_proto, port); + + if (export_pg_states(cfg, pg, stg, sd) != 0) { + IPFW_UH_RUNLOCK(ch); + return (states == 0 ? 
ENOMEM: 0); + } + states += stg->count; + od->head.length += stg->count * sizeof(ipfw_nat64lsn_state); + sz += stg->count * sizeof(ipfw_nat64lsn_state); + if (pg_next != NULL) { + sz += sizeof(ipfw_nat64lsn_stg); + if (sd->valsize < sz) + break; + stg = (ipfw_nat64lsn_stg *)ipfw_get_sopt_space(sd, + sizeof(ipfw_nat64lsn_stg)); + } + pg = pg_next; + } + IPFW_UH_RUNLOCK(ch); + return (0); +} + +static struct ipfw_sopt_handler scodes[] = { + { IP_FW_NAT64LSN_CREATE, 0, HDIR_BOTH, nat64lsn_create }, + { IP_FW_NAT64LSN_DESTROY,0, HDIR_SET, nat64lsn_destroy }, + { IP_FW_NAT64LSN_CONFIG, 0, HDIR_BOTH, nat64lsn_config }, + { IP_FW_NAT64LSN_LIST, 0, HDIR_GET, nat64lsn_list }, + { IP_FW_NAT64LSN_STATS, 0, HDIR_GET, nat64lsn_stats }, + { IP_FW_NAT64LSN_RESET_STATS,0, HDIR_SET, nat64lsn_reset_stats }, + { IP_FW_NAT64LSN_LIST_STATES,0, HDIR_GET, nat64lsn_states }, +}; + +static int +nat64lsn_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +{ + ipfw_insn *icmd; + + icmd = cmd - 1; + if (icmd->opcode != O_EXTERNAL_ACTION || + icmd->arg1 != V_nat64lsn_eid) + return (1); + + *puidx = cmd->arg1; + *ptype = 0; + return (0); +} + +static void +nat64lsn_update_arg1(ipfw_insn *cmd, uint16_t idx) +{ + + cmd->arg1 = idx; +} + +static int +nat64lsn_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, + struct named_object **pno) +{ + int err; + + err = ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti, + IPFW_TLV_NAT64LSN_NAME, pno); + return (err); +} + +static struct named_object * +nat64lsn_findbykidx(struct ip_fw_chain *ch, uint16_t idx) +{ + struct namedobj_instance *ni; + struct named_object *no; + + IPFW_UH_WLOCK_ASSERT(ch); + ni = CHAIN_TO_SRV(ch); + no = ipfw_objhash_lookup_kidx(ni, idx); + KASSERT(no != NULL, ("NAT64LSN with index %d not found", idx)); + + return (no); +} + +static int +nat64lsn_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set, + enum ipfw_sets_cmd cmd) +{ + + return (ipfw_obj_manage_sets(CHAIN_TO_SRV(ch), IPFW_TLV_NAT64LSN_NAME, + set, new_set, cmd)); +} + +static struct opcode_obj_rewrite opcodes[] = { + { + .opcode = O_EXTERNAL_INSTANCE, + .etlv = IPFW_TLV_EACTION /* just show it isn't table */, + .classifier = nat64lsn_classify, + .update = nat64lsn_update_arg1, + .find_byname = nat64lsn_findbyname, + .find_bykidx = nat64lsn_findbykidx, + .manage_sets = nat64lsn_manage_sets, + }, +}; + +static int +destroy_config_cb(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + struct nat64lsn_cfg *cfg; + struct ip_fw_chain *ch; + + ch = (struct ip_fw_chain *)arg; + cfg = (struct nat64lsn_cfg *)SRV_OBJECT(ch, no->kidx); + SRV_OBJECT(ch, no->kidx) = NULL; + nat64lsn_detach_config(ch, cfg); + nat64lsn_destroy_instance(cfg); + return (0); +} + +int +nat64lsn_init(struct ip_fw_chain *ch, int first) +{ + + if (first != 0) + nat64lsn_init_internal(); + V_nat64lsn_eid = ipfw_add_eaction(ch, ipfw_nat64lsn, "nat64lsn"); + if (V_nat64lsn_eid == 0) + return (ENXIO); + IPFW_ADD_SOPT_HANDLER(first, scodes); + IPFW_ADD_OBJ_REWRITER(first, opcodes); + return (0); +} + +void +nat64lsn_uninit(struct ip_fw_chain *ch, int last) +{ + + IPFW_DEL_OBJ_REWRITER(last, opcodes); + IPFW_DEL_SOPT_HANDLER(last, scodes); + ipfw_del_eaction(ch, V_nat64lsn_eid); + /* + * Since we already have deregistered external action, + * our named objects become unaccessible via rules, because + * all rules were truncated by ipfw_del_eaction(). + * So, we can unlink and destroy our named objects without holding + * IPFW_WLOCK(). 
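nat64lsn_classify() above and the dataplane entry ipfw_nat64lsn() are two views of the same two-slot opcode layout: a rule stores O_EXTERNAL_ACTION immediately followed by O_EXTERNAL_INSTANCE, so the dataplane steps forward (icmd = cmd + 1) while the classifier, which is handed the instance slot, steps back (icmd = cmd - 1). A simplified standalone model of that pairing; the real ipfw_insn carries more fields, but only opcode and arg1 matter for the adjacency.

    #include <assert.h>
    #include <stdint.h>

    struct insn { uint8_t opcode; uint16_t arg1; };
    enum { O_EXT_ACTION = 1, O_EXT_INSTANCE = 2 };

    int
    main(void)
    {
    	struct insn rule[2] = {
    		{ O_EXT_ACTION,   7 },	/* eaction id of "nat64lsn" */
    		{ O_EXT_INSTANCE, 3 },	/* kidx of the named instance */
    	};
    	struct insn *cmd = &rule[0];

    	/* dataplane: step forward to find the instance */
    	assert((cmd + 1)->opcode == O_EXT_INSTANCE);
    	/* classifier: handed the instance slot, step back to the action */
    	assert((&rule[1] - 1)->opcode == O_EXT_ACTION);
    	return (0);
    }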
+ */ + IPFW_UH_WLOCK(ch); + ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), destroy_config_cb, ch, + IPFW_TLV_NAT64LSN_NAME); + V_nat64lsn_eid = 0; + IPFW_UH_WUNLOCK(ch); + if (last != 0) + nat64lsn_uninit_internal(); +} + diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64stl.c b/freebsd/sys/netpfil/ipfw/nat64/nat64stl.c new file mode 100644 index 00000000..36e6e268 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/nat64/nat64stl.c @@ -0,0 +1,262 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2015-2016 Yandex LLC + * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_pflog.h>
+#include <net/pfil.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/ip_fw_nat64.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/nat64/ip_fw_nat64.h>
+#include <netpfil/ipfw/nat64/nat64_translate.h>
+#include <netpfil/ipfw/nat64/nat64stl.h>
+#include <netpfil/pf/pf.h>
+
+#define	NAT64_LOOKUP(chain, cmd)	\
+	(struct nat64stl_cfg *)SRV_OBJECT((chain), (cmd)->arg1)
+
+static void
+nat64stl_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family,
+    uint32_t kidx)
+{
+	static uint32_t pktid = 0;
+
+	memset(plog, 0, sizeof(*plog));
+	plog->length = PFLOG_REAL_HDRLEN;
+	plog->af = family;
+	plog->action = PF_NAT;
+	plog->dir = PF_IN;
+	plog->rulenr = htonl(kidx);
+	plog->subrulenr = htonl(++pktid);
+	plog->ruleset[0] = '\0';
+	strlcpy(plog->ifname, "NAT64STL", sizeof(plog->ifname));
+	ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m);
+}
+
+static int
+nat64stl_handle_ip4(struct ip_fw_chain *chain, struct nat64stl_cfg *cfg,
+    struct mbuf *m, uint32_t tablearg)
+{
+	struct pfloghdr loghdr, *logdata;
+	struct in6_addr saddr, daddr;
+	struct ip *ip;
+
+	ip = mtod(m, struct ip*);
+	if (nat64_check_ip4(ip->ip_src.s_addr) != 0 ||
+	    nat64_check_ip4(ip->ip_dst.s_addr) != 0 ||
+	    nat64_check_private_ip4(ip->ip_src.s_addr) != 0 ||
+	    nat64_check_private_ip4(ip->ip_dst.s_addr) != 0)
+		return (NAT64SKIP);
+
+	daddr = TARG_VAL(chain, tablearg, nh6);
+	if (nat64_check_ip6(&daddr) != 0)
+		return (NAT64MFREE);
+	saddr = cfg->prefix6;
+	nat64_set_ip4(&saddr, ip->ip_src.s_addr);
+
+	if (cfg->flags & NAT64_LOG) {
+		logdata = &loghdr;
+		nat64stl_log(logdata, m, AF_INET, cfg->no.kidx);
+	} else
+		logdata = NULL;
+	return (nat64_do_handle_ip4(m, &saddr, &daddr, 0, &cfg->stats,
+	    logdata));
+}
+
+static int
+nat64stl_handle_ip6(struct ip_fw_chain *chain, struct nat64stl_cfg *cfg,
+    struct mbuf *m, uint32_t tablearg)
+{
+	struct pfloghdr loghdr, *logdata;
+	struct ip6_hdr *ip6;
+	uint32_t aaddr;
+
+	aaddr = htonl(TARG_VAL(chain, tablearg, nh4));
+
+	/*
+	 * NOTE: we expect that ipfw_chk() did m_pullup() up to the upper
+	 * level protocol's headers. We also skip some checks that
+	 * ip6_input(), ip6_forward(), ip6_fastfwd() and ipfw_chk() already did.
+ */ + ip6 = mtod(m, struct ip6_hdr *); + /* Check ip6_dst matches configured prefix */ + if (bcmp(&ip6->ip6_dst, &cfg->prefix6, cfg->plen6 / 8) != 0) + return (NAT64SKIP); + + if (cfg->flags & NAT64_LOG) { + logdata = &loghdr; + nat64stl_log(logdata, m, AF_INET6, cfg->no.kidx); + } else + logdata = NULL; + return (nat64_do_handle_ip6(m, aaddr, 0, &cfg->stats, logdata)); +} + +static int +nat64stl_handle_icmp6(struct ip_fw_chain *chain, struct nat64stl_cfg *cfg, + struct mbuf *m) +{ + struct pfloghdr loghdr, *logdata; + nat64_stats_block *stats; + struct ip6_hdr *ip6i; + struct icmp6_hdr *icmp6; + uint32_t tablearg; + int hlen, proto; + + hlen = 0; + stats = &cfg->stats; + proto = nat64_getlasthdr(m, &hlen); + if (proto != IPPROTO_ICMPV6) { + NAT64STAT_INC(stats, dropped); + return (NAT64MFREE); + } + icmp6 = mtodo(m, hlen); + switch (icmp6->icmp6_type) { + case ICMP6_DST_UNREACH: + case ICMP6_PACKET_TOO_BIG: + case ICMP6_TIME_EXCEED_TRANSIT: + case ICMP6_PARAM_PROB: + break; + default: + NAT64STAT_INC(stats, dropped); + return (NAT64MFREE); + } + hlen += sizeof(struct icmp6_hdr); + if (m->m_pkthdr.len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) { + NAT64STAT_INC(stats, dropped); + return (NAT64MFREE); + } + if (m->m_len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) + m = m_pullup(m, hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN); + if (m == NULL) { + NAT64STAT_INC(stats, nomem); + return (NAT64RETURN); + } + /* + * Use destination address from inner IPv6 header to determine + * IPv4 mapped address. + */ + ip6i = mtodo(m, hlen); + if (ipfw_lookup_table_extended(chain, cfg->map64, + sizeof(struct in6_addr), &ip6i->ip6_dst, &tablearg) == 0) { + m_freem(m); + return (NAT64RETURN); + } + if (cfg->flags & NAT64_LOG) { + logdata = &loghdr; + nat64stl_log(logdata, m, AF_INET6, cfg->no.kidx); + } else + logdata = NULL; + return (nat64_handle_icmp6(m, 0, + htonl(TARG_VAL(chain, tablearg, nh4)), 0, stats, logdata)); +} + +int +ipfw_nat64stl(struct ip_fw_chain *chain, struct ip_fw_args *args, + ipfw_insn *cmd, int *done) +{ + ipfw_insn *icmd; + struct nat64stl_cfg *cfg; + uint32_t tablearg; + int ret; + + IPFW_RLOCK_ASSERT(chain); + + *done = 0; /* try next rule if not matched */ + icmd = cmd + 1; + if (cmd->opcode != O_EXTERNAL_ACTION || + cmd->arg1 != V_nat64stl_eid || + icmd->opcode != O_EXTERNAL_INSTANCE || + (cfg = NAT64_LOOKUP(chain, icmd)) == NULL) + return (0); + + switch (args->f_id.addr_type) { + case 4: + ret = ipfw_lookup_table(chain, cfg->map46, + htonl(args->f_id.dst_ip), &tablearg); + break; + case 6: + ret = ipfw_lookup_table_extended(chain, cfg->map64, + sizeof(struct in6_addr), &args->f_id.src_ip6, &tablearg); + break; + default: + return (0); + } + if (ret == 0) { + /* + * In case when packet is ICMPv6 message from an intermediate + * router, the source address of message will not match the + * addresses from our map64 table. 
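nat64stl_handle_ip6() above matches the packet's destination against the configured prefix by comparing the first plen6 / 8 bytes; plen6 is fixed at 96 for nat64stl, so exactly the 12 non-IPv4 bytes are compared. A standalone equivalent of that check:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static int
    prefix6_match(const uint8_t dst[16], const uint8_t pfx[16],
        unsigned int plen6)
    {
    	return (memcmp(dst, pfx, plen6 / 8) == 0);
    }

    int
    main(void)
    {
    	/* 64:ff9b::/96, the well-known prefix, with an embedded IPv4. */
    	uint8_t pfx[16] = { 0x00, 0x64, 0xff, 0x9b };
    	uint8_t dst[16] = { 0x00, 0x64, 0xff, 0x9b, [12] = 192, 0, 2, 1 };

    	printf("%d\n", prefix6_match(dst, pfx, 96));	/* prints 1 */
    	return (0);
    }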
+ */ + if (args->f_id.proto != IPPROTO_ICMPV6) + return (0); + + ret = nat64stl_handle_icmp6(chain, cfg, args->m); + } else { + if (args->f_id.addr_type == 4) + ret = nat64stl_handle_ip4(chain, cfg, args->m, + tablearg); + else + ret = nat64stl_handle_ip6(chain, cfg, args->m, + tablearg); + } + if (ret == NAT64SKIP) + return (0); + + *done = 1; /* terminate the search */ + if (ret == NAT64MFREE) + m_freem(args->m); + args->m = NULL; + return (IP_FW_DENY); +} + + diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64stl.h b/freebsd/sys/netpfil/ipfw/nat64/nat64stl.h new file mode 100644 index 00000000..42ec20ea --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/nat64/nat64stl.h @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 2015-2016 Yandex LLC + * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IP_FW_NAT64STL_H_ +#define _IP_FW_NAT64STL_H_ + +struct nat64stl_cfg { + struct named_object no; + + uint16_t map64; /* table with 6to4 mapping */ + uint16_t map46; /* table with 4to6 mapping */ + + struct in6_addr prefix6;/* IPv6 prefix */ + uint8_t plen6; /* prefix length */ + uint8_t flags; /* flags for internal use */ +#define NAT64STL_KIDX 0x0100 +#define NAT64STL_46T 0x0200 +#define NAT64STL_64T 0x0400 +#define NAT64STL_FLAGSMASK (NAT64_LOG) /* flags to pass to userland */ + char name[64]; + nat64_stats_block stats; +}; + +VNET_DECLARE(uint16_t, nat64stl_eid); +#define V_nat64stl_eid VNET(nat64stl_eid) +#define IPFW_TLV_NAT64STL_NAME IPFW_TLV_EACTION_NAME(V_nat64stl_eid) + +int ipfw_nat64stl(struct ip_fw_chain *chain, struct ip_fw_args *args, + ipfw_insn *cmd, int *done); + +#endif + diff --git a/freebsd/sys/netpfil/ipfw/nat64/nat64stl_control.c b/freebsd/sys/netpfil/ipfw/nat64/nat64stl_control.c new file mode 100644 index 00000000..6ee04867 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/nat64/nat64stl_control.c @@ -0,0 +1,623 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2015-2016 Yandex LLC + * Copyright (c) 2015-2016 Andrey V. Elsukov <ae@FreeBSD.org> + * Copyright (c) 2015 Alexander V. Chernikov <melifaro@FreeBSD.org> + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/counter.h> +#include <rtems/bsd/sys/errno.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/rmlock.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/sockopt.h> +#include <sys/queue.h> +#include <sys/syslog.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/pfil.h> +#include <net/route.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip_var.h> +#include <netinet/ip_fw.h> +#include <netinet6/in6_var.h> +#include <netinet6/ip6_var.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/nat64/ip_fw_nat64.h> +#include <netpfil/ipfw/nat64/nat64stl.h> +#include <netinet6/ip_fw_nat64.h> + +VNET_DEFINE(uint16_t, nat64stl_eid) = 0; + +static struct nat64stl_cfg *nat64stl_alloc_config(const char *name, uint8_t set); +static void nat64stl_free_config(struct nat64stl_cfg *cfg); +static struct nat64stl_cfg *nat64stl_find(struct namedobj_instance *ni, + const char *name, uint8_t set); + +static struct nat64stl_cfg * +nat64stl_alloc_config(const char *name, uint8_t set) +{ + struct nat64stl_cfg *cfg; + + cfg = malloc(sizeof(struct nat64stl_cfg), M_IPFW, M_WAITOK | M_ZERO); + COUNTER_ARRAY_ALLOC(cfg->stats.stats, NAT64STATS, M_WAITOK); + cfg->no.name = cfg->name; + cfg->no.etlv = IPFW_TLV_NAT64STL_NAME; + cfg->no.set = set; + strlcpy(cfg->name, name, sizeof(cfg->name)); + return (cfg); +} + +static void +nat64stl_free_config(struct nat64stl_cfg *cfg) +{ + + COUNTER_ARRAY_FREE(cfg->stats.stats, NAT64STATS); + free(cfg, M_IPFW); +} + +static void +nat64stl_export_config(struct ip_fw_chain *ch, struct nat64stl_cfg *cfg, + ipfw_nat64stl_cfg *uc) +{ + struct named_object *no; + + uc->prefix6 = cfg->prefix6; + uc->plen6 = cfg->plen6; + uc->flags = cfg->flags & NAT64STL_FLAGSMASK; + uc->set = cfg->no.set; + strlcpy(uc->name, cfg->no.name, sizeof(uc->name)); + + no = ipfw_objhash_lookup_table_kidx(ch, cfg->map64); + ipfw_export_obj_ntlv(no, &uc->ntlv6); + no = ipfw_objhash_lookup_table_kidx(ch, cfg->map46); + ipfw_export_obj_ntlv(no, &uc->ntlv4); +} + 
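Note how nat64stl_alloc_config() above points cfg->no.name into the config's own name[] buffer. The sketch below shows why that self-reference matters: a byte-wise struct copy leaves the copy's named_object still pointing at the original's buffer, so such objects must stay in place or have the pointer re-linked after any copy. The toy_* types are simplified stand-ins for named_object and nat64stl_cfg.

    #include <assert.h>
    #include <string.h>

    struct toy_no { const char *name; };
    struct toy_cfg {
    	struct toy_no	no;
    	char		name[64];
    };

    int
    main(void)
    {
    	struct toy_cfg a, b;

    	strlcpy(a.name, "stl1", sizeof(a.name));
    	a.no.name = a.name;		/* as in nat64stl_alloc_config() */

    	b = a;				/* byte-wise copy */
    	assert(b.no.name == a.name);	/* still the original's buffer */
    	b.no.name = b.name;		/* must be re-linked after a copy */
    	return (0);
    }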
+struct nat64stl_dump_arg { + struct ip_fw_chain *ch; + struct sockopt_data *sd; +}; + +static int +export_config_cb(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + struct nat64stl_dump_arg *da = (struct nat64stl_dump_arg *)arg; + ipfw_nat64stl_cfg *uc; + + uc = (ipfw_nat64stl_cfg *)ipfw_get_sopt_space(da->sd, sizeof(*uc)); + nat64stl_export_config(da->ch, (struct nat64stl_cfg *)no, uc); + return (0); +} + +static struct nat64stl_cfg * +nat64stl_find(struct namedobj_instance *ni, const char *name, uint8_t set) +{ + struct nat64stl_cfg *cfg; + + cfg = (struct nat64stl_cfg *)ipfw_objhash_lookup_name_type(ni, set, + IPFW_TLV_NAT64STL_NAME, name); + + return (cfg); +} + + +static int +nat64stl_create_internal(struct ip_fw_chain *ch, struct nat64stl_cfg *cfg, + ipfw_nat64stl_cfg *i) +{ + + IPFW_UH_WLOCK_ASSERT(ch); + + if (ipfw_objhash_alloc_idx(CHAIN_TO_SRV(ch), &cfg->no.kidx) != 0) + return (ENOSPC); + cfg->flags |= NAT64STL_KIDX; + + if (ipfw_ref_table(ch, &i->ntlv4, &cfg->map46) != 0) + return (EINVAL); + cfg->flags |= NAT64STL_46T; + + if (ipfw_ref_table(ch, &i->ntlv6, &cfg->map64) != 0) + return (EINVAL); + cfg->flags |= NAT64STL_64T; + + ipfw_objhash_add(CHAIN_TO_SRV(ch), &cfg->no); + + return (0); +} + +/* + * Creates new nat64 instance. + * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ipfw_nat64stl_cfg ] + * + * Returns 0 on success + */ +static int +nat64stl_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_lheader *olh; + ipfw_nat64stl_cfg *uc; + struct namedobj_instance *ni; + struct nat64stl_cfg *cfg; + int error; + + if (sd->valsize != sizeof(*olh) + sizeof(*uc)) + return (EINVAL); + + olh = (ipfw_obj_lheader *)sd->kbuf; + uc = (ipfw_nat64stl_cfg *)(olh + 1); + + if (ipfw_check_object_name_generic(uc->name) != 0) + return (EINVAL); + if (!IN6_IS_ADDR_WKPFX(&uc->prefix6)) + return (EINVAL); + if (uc->plen6 != 96 || uc->set >= IPFW_MAX_SETS) + return (EINVAL); + + /* XXX: check types of tables */ + + ni = CHAIN_TO_SRV(ch); + error = 0; + + IPFW_UH_RLOCK(ch); + if (nat64stl_find(ni, uc->name, uc->set) != NULL) { + IPFW_UH_RUNLOCK(ch); + return (EEXIST); + } + IPFW_UH_RUNLOCK(ch); + + cfg = nat64stl_alloc_config(uc->name, uc->set); + cfg->prefix6 = uc->prefix6; + cfg->plen6 = uc->plen6; + cfg->flags = uc->flags & NAT64STL_FLAGSMASK; + + IPFW_UH_WLOCK(ch); + + if (nat64stl_find(ni, uc->name, uc->set) != NULL) { + IPFW_UH_WUNLOCK(ch); + nat64stl_free_config(cfg); + return (EEXIST); + } + error = nat64stl_create_internal(ch, cfg, uc); + if (error == 0) { + /* Okay, let's link data */ + IPFW_WLOCK(ch); + SRV_OBJECT(ch, cfg->no.kidx) = cfg; + IPFW_WUNLOCK(ch); + + IPFW_UH_WUNLOCK(ch); + return (0); + } + + if (cfg->flags & NAT64STL_KIDX) + ipfw_objhash_free_idx(ni, cfg->no.kidx); + if (cfg->flags & NAT64STL_46T) + ipfw_unref_table(ch, cfg->map46); + if (cfg->flags & NAT64STL_64T) + ipfw_unref_table(ch, cfg->map64); + + IPFW_UH_WUNLOCK(ch); + nat64stl_free_config(cfg); + return (error); +} + +/* + * Change existing nat64stl instance configuration. 
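nat64stl_create_internal() above records each successfully acquired resource as a flag bit (NAT64STL_KIDX, NAT64STL_46T, NAT64STL_64T), and on failure its caller, nat64stl_create(), releases exactly the recorded ones. The generic shape of that unwind, with stub acquire/release functions standing in for the object-index and table references:

    #include <stdint.h>

    #define	GOT_A	0x01
    #define	GOT_B	0x02

    static int acquire_a(void) { return (0); }
    static int acquire_b(void) { return (-1); }	/* simulate failure */
    static void release_a(void) { }
    static void release_b(void) { }

    static int
    setup(void)
    {
    	uint32_t flags = 0;

    	if (acquire_a() != 0)
    		goto fail;
    	flags |= GOT_A;
    	if (acquire_b() != 0)
    		goto fail;
    	flags |= GOT_B;
    	return (0);
    fail:
    	/* Release only what the flag bits say was acquired. */
    	if (flags & GOT_B)
    		release_b();
    	if (flags & GOT_A)
    		release_a();
    	return (-1);
    }

    int
    main(void)
    {
    	return (setup() == 0 ? 0 : 1);
    }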
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_nat64stl_cfg ] + * Reply: [ ipfw_obj_header ipfw_nat64stl_cfg ] + * + * Returns 0 on success + */ +static int +nat64stl_config(struct ip_fw_chain *ch, ip_fw3_opheader *op, + struct sockopt_data *sd) +{ + ipfw_obj_header *oh; + ipfw_nat64stl_cfg *uc; + struct nat64stl_cfg *cfg; + struct namedobj_instance *ni; + + if (sd->valsize != sizeof(*oh) + sizeof(*uc)) + return (EINVAL); + + oh = (ipfw_obj_header *)ipfw_get_sopt_space(sd, + sizeof(*oh) + sizeof(*uc)); + uc = (ipfw_nat64stl_cfg *)(oh + 1); + + if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || + oh->ntlv.set >= IPFW_MAX_SETS) + return (EINVAL); + + ni = CHAIN_TO_SRV(ch); + if (sd->sopt->sopt_dir == SOPT_GET) { + IPFW_UH_RLOCK(ch); + cfg = nat64stl_find(ni, oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_RUNLOCK(ch); + return (EEXIST); + } + nat64stl_export_config(ch, cfg, uc); + IPFW_UH_RUNLOCK(ch); + return (0); + } + + IPFW_UH_WLOCK(ch); + cfg = nat64stl_find(ni, oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_WUNLOCK(ch); + return (EEXIST); + } + + /* + * For now allow to change only following values: + * flags. + */ + + cfg->flags = uc->flags & NAT64STL_FLAGSMASK; + IPFW_UH_WUNLOCK(ch); + return (0); +} + +static void +nat64stl_detach_config(struct ip_fw_chain *ch, struct nat64stl_cfg *cfg) +{ + + IPFW_UH_WLOCK_ASSERT(ch); + + ipfw_objhash_del(CHAIN_TO_SRV(ch), &cfg->no); + ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), cfg->no.kidx); + ipfw_unref_table(ch, cfg->map46); + ipfw_unref_table(ch, cfg->map64); +} + +/* + * Destroys nat64 instance. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ] + * + * Returns 0 on success + */ +static int +nat64stl_destroy(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_header *oh; + struct nat64stl_cfg *cfg; + + if (sd->valsize != sizeof(*oh)) + return (EINVAL); + + oh = (ipfw_obj_header *)sd->kbuf; + if (ipfw_check_object_name_generic(oh->ntlv.name) != 0) + return (EINVAL); + + IPFW_UH_WLOCK(ch); + cfg = nat64stl_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } + if (cfg->no.refcnt > 0) { + IPFW_UH_WUNLOCK(ch); + return (EBUSY); + } + + IPFW_WLOCK(ch); + SRV_OBJECT(ch, cfg->no.kidx) = NULL; + IPFW_WUNLOCK(ch); + + nat64stl_detach_config(ch, cfg); + IPFW_UH_WUNLOCK(ch); + + nat64stl_free_config(cfg); + return (0); +} + +/* + * Lists all nat64stl instances currently available in kernel. 
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ] + * Reply: [ ipfw_obj_lheader ipfw_nat64stl_cfg x N ] + * + * Returns 0 on success + */ +static int +nat64stl_list(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_lheader *olh; + struct nat64stl_dump_arg da; + + /* Check minimum header size */ + if (sd->valsize < sizeof(ipfw_obj_lheader)) + return (EINVAL); + + olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh)); + + IPFW_UH_RLOCK(ch); + olh->count = ipfw_objhash_count_type(CHAIN_TO_SRV(ch), + IPFW_TLV_NAT64STL_NAME); + olh->objsize = sizeof(ipfw_nat64stl_cfg); + olh->size = sizeof(*olh) + olh->count * olh->objsize; + + if (sd->valsize < olh->size) { + IPFW_UH_RUNLOCK(ch); + return (ENOMEM); + } + memset(&da, 0, sizeof(da)); + da.ch = ch; + da.sd = sd; + ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), export_config_cb, + &da, IPFW_TLV_NAT64STL_NAME); + IPFW_UH_RUNLOCK(ch); + + return (0); +} + +#define __COPY_STAT_FIELD(_cfg, _stats, _field) \ + (_stats)->_field = NAT64STAT_FETCH(&(_cfg)->stats, _field) +static void +export_stats(struct ip_fw_chain *ch, struct nat64stl_cfg *cfg, + struct ipfw_nat64stl_stats *stats) +{ + + __COPY_STAT_FIELD(cfg, stats, opcnt64); + __COPY_STAT_FIELD(cfg, stats, opcnt46); + __COPY_STAT_FIELD(cfg, stats, ofrags); + __COPY_STAT_FIELD(cfg, stats, ifrags); + __COPY_STAT_FIELD(cfg, stats, oerrors); + __COPY_STAT_FIELD(cfg, stats, noroute4); + __COPY_STAT_FIELD(cfg, stats, noroute6); + __COPY_STAT_FIELD(cfg, stats, noproto); + __COPY_STAT_FIELD(cfg, stats, nomem); + __COPY_STAT_FIELD(cfg, stats, dropped); +} + +/* + * Get nat64stl statistics. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ] + * Reply: [ ipfw_obj_header ipfw_obj_ctlv [ uint64_t x N ]] + * + * Returns 0 on success + */ +static int +nat64stl_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, + struct sockopt_data *sd) +{ + struct ipfw_nat64stl_stats stats; + struct nat64stl_cfg *cfg; + ipfw_obj_header *oh; + ipfw_obj_ctlv *ctlv; + size_t sz; + + sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) + sizeof(stats); + if (sd->valsize % sizeof(uint64_t)) + return (EINVAL); + if (sd->valsize < sz) + return (ENOMEM); + oh = (ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); + if (oh == NULL) + return (EINVAL); + memset(&stats, 0, sizeof(stats)); + + IPFW_UH_RLOCK(ch); + cfg = nat64stl_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_RUNLOCK(ch); + return (ESRCH); + } + export_stats(ch, cfg, &stats); + IPFW_UH_RUNLOCK(ch); + + ctlv = (ipfw_obj_ctlv *)(oh + 1); + memset(ctlv, 0, sizeof(*ctlv)); + ctlv->head.type = IPFW_TLV_COUNTERS; + ctlv->head.length = sz - sizeof(ipfw_obj_header); + ctlv->count = sizeof(stats) / sizeof(uint64_t); + ctlv->objsize = sizeof(uint64_t); + ctlv->version = IPFW_NAT64_VERSION; + memcpy(ctlv + 1, &stats, sizeof(stats)); + return (0); +} + +/* + * Reset nat64stl statistics. 
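The __COPY_STAT_FIELD() macro above splices its argument in as a member name on both sides of the assignment, so each exported counter costs one readable, typo-resistant line. A standalone model of the same trick, with plain structs standing in for the per-CPU counter block:

    #include <assert.h>
    #include <stdint.h>

    struct kstats { uint64_t dropped, nomem; };	/* kernel-side counters */
    struct ustats { uint64_t dropped, nomem; };	/* exported snapshot */

    #define	COPY_FIELD(dst, src, field)	(dst)->field = (src)->field

    int
    main(void)
    {
    	struct kstats k = { .dropped = 5, .nomem = 1 };
    	struct ustats u = { 0 };

    	COPY_FIELD(&u, &k, dropped);
    	COPY_FIELD(&u, &k, nomem);
    	assert(u.dropped == 5 && u.nomem == 1);
    	return (0);
    }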
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_header ] + * + * Returns 0 on success + */ +static int +nat64stl_reset_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, + struct sockopt_data *sd) +{ + struct nat64stl_cfg *cfg; + ipfw_obj_header *oh; + + if (sd->valsize != sizeof(*oh)) + return (EINVAL); + oh = (ipfw_obj_header *)sd->kbuf; + if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || + oh->ntlv.set >= IPFW_MAX_SETS) + return (EINVAL); + + IPFW_UH_WLOCK(ch); + cfg = nat64stl_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } + COUNTER_ARRAY_ZERO(cfg->stats.stats, NAT64STATS); + IPFW_UH_WUNLOCK(ch); + return (0); +} + +static struct ipfw_sopt_handler scodes[] = { + + { IP_FW_NAT64STL_CREATE, 0, HDIR_SET, nat64stl_create }, + { IP_FW_NAT64STL_DESTROY,0, HDIR_SET, nat64stl_destroy }, + { IP_FW_NAT64STL_CONFIG, 0, HDIR_BOTH, nat64stl_config }, + { IP_FW_NAT64STL_LIST, 0, HDIR_GET, nat64stl_list }, + { IP_FW_NAT64STL_STATS, 0, HDIR_GET, nat64stl_stats }, + { IP_FW_NAT64STL_RESET_STATS,0, HDIR_SET, nat64stl_reset_stats }, +}; + +static int +nat64stl_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +{ + ipfw_insn *icmd; + + icmd = cmd - 1; + if (icmd->opcode != O_EXTERNAL_ACTION || + icmd->arg1 != V_nat64stl_eid) + return (1); + + *puidx = cmd->arg1; + *ptype = 0; + return (0); +} + +static void +nat64stl_update_arg1(ipfw_insn *cmd, uint16_t idx) +{ + + cmd->arg1 = idx; +} + +static int +nat64stl_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, + struct named_object **pno) +{ + int err; + + err = ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti, + IPFW_TLV_NAT64STL_NAME, pno); + return (err); +} + +static struct named_object * +nat64stl_findbykidx(struct ip_fw_chain *ch, uint16_t idx) +{ + struct namedobj_instance *ni; + struct named_object *no; + + IPFW_UH_WLOCK_ASSERT(ch); + ni = CHAIN_TO_SRV(ch); + no = ipfw_objhash_lookup_kidx(ni, idx); + KASSERT(no != NULL, ("NAT with index %d not found", idx)); + + return (no); +} + +static int +nat64stl_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set, + enum ipfw_sets_cmd cmd) +{ + + return (ipfw_obj_manage_sets(CHAIN_TO_SRV(ch), IPFW_TLV_NAT64STL_NAME, + set, new_set, cmd)); +} + +static struct opcode_obj_rewrite opcodes[] = { + { + .opcode = O_EXTERNAL_INSTANCE, + .etlv = IPFW_TLV_EACTION /* just show it isn't table */, + .classifier = nat64stl_classify, + .update = nat64stl_update_arg1, + .find_byname = nat64stl_findbyname, + .find_bykidx = nat64stl_findbykidx, + .manage_sets = nat64stl_manage_sets, + }, +}; + +static int +destroy_config_cb(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + struct nat64stl_cfg *cfg; + struct ip_fw_chain *ch; + + ch = (struct ip_fw_chain *)arg; + cfg = (struct nat64stl_cfg *)SRV_OBJECT(ch, no->kidx); + SRV_OBJECT(ch, no->kidx) = NULL; + nat64stl_detach_config(ch, cfg); + nat64stl_free_config(cfg); + return (0); +} + +int +nat64stl_init(struct ip_fw_chain *ch, int first) +{ + + V_nat64stl_eid = ipfw_add_eaction(ch, ipfw_nat64stl, "nat64stl"); + if (V_nat64stl_eid == 0) + return (ENXIO); + IPFW_ADD_SOPT_HANDLER(first, scodes); + IPFW_ADD_OBJ_REWRITER(first, opcodes); + return (0); +} + +void +nat64stl_uninit(struct ip_fw_chain *ch, int last) +{ + + IPFW_DEL_OBJ_REWRITER(last, opcodes); + IPFW_DEL_SOPT_HANDLER(last, scodes); + ipfw_del_eaction(ch, V_nat64stl_eid); + /* + * Since we already have deregistered external action, + * our named objects become unaccessible via rules, because + 
* all rules were truncated by ipfw_del_eaction(). + * So, we can unlink and destroy our named objects without holding + * IPFW_WLOCK(). + */ + IPFW_UH_WLOCK(ch); + ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), destroy_config_cb, ch, + IPFW_TLV_NAT64STL_NAME); + V_nat64stl_eid = 0; + IPFW_UH_WUNLOCK(ch); +} + diff --git a/freebsd/sys/netpfil/ipfw/nptv6/ip_fw_nptv6.c b/freebsd/sys/netpfil/ipfw/nptv6/ip_fw_nptv6.c new file mode 100644 index 00000000..92a2c7a3 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/nptv6/ip_fw_nptv6.c @@ -0,0 +1,101 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2016 Yandex LLC + * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/rwlock.h> +#include <sys/socket.h> + +#include <net/if.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip_var.h> +#include <netinet/ip_fw.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/nptv6/nptv6.h> + +static int +vnet_ipfw_nptv6_init(const void *arg __unused) +{ + + return (nptv6_init(&V_layer3_chain, IS_DEFAULT_VNET(curvnet))); +} + +static int +vnet_ipfw_nptv6_uninit(const void *arg __unused) +{ + + nptv6_uninit(&V_layer3_chain, IS_DEFAULT_VNET(curvnet)); + return (0); +} + +static int +ipfw_nptv6_modevent(module_t mod, int type, void *unused) +{ + + switch (type) { + case MOD_LOAD: + case MOD_UNLOAD: + break; + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t ipfw_nptv6_mod = { + "ipfw_nptv6", + ipfw_nptv6_modevent, + 0 +}; + +/* Define startup order. 
*/ +#define IPFW_NPTV6_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN +#define IPFW_NPTV6_MODEVENT_ORDER (SI_ORDER_ANY - 128) /* after ipfw */ +#define IPFW_NPTV6_MODULE_ORDER (IPFW_NPTV6_MODEVENT_ORDER + 1) +#define IPFW_NPTV6_VNET_ORDER (IPFW_NPTV6_MODEVENT_ORDER + 2) + +DECLARE_MODULE(ipfw_nptv6, ipfw_nptv6_mod, IPFW_NPTV6_SI_SUB_FIREWALL, + IPFW_NPTV6_MODULE_ORDER); +MODULE_DEPEND(ipfw_nptv6, ipfw, 3, 3, 3); +MODULE_VERSION(ipfw_nptv6, 1); + +VNET_SYSINIT(vnet_ipfw_nptv6_init, IPFW_NPTV6_SI_SUB_FIREWALL, + IPFW_NPTV6_VNET_ORDER, vnet_ipfw_nptv6_init, NULL); +VNET_SYSUNINIT(vnet_ipfw_nptv6_uninit, IPFW_NPTV6_SI_SUB_FIREWALL, + IPFW_NPTV6_VNET_ORDER, vnet_ipfw_nptv6_uninit, NULL); diff --git a/freebsd/sys/netpfil/ipfw/nptv6/nptv6.c b/freebsd/sys/netpfil/ipfw/nptv6/nptv6.c new file mode 100644 index 00000000..4256d028 --- /dev/null +++ b/freebsd/sys/netpfil/ipfw/nptv6/nptv6.c @@ -0,0 +1,894 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2016 Yandex LLC + * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/counter.h> +#include <rtems/bsd/sys/errno.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/rmlock.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/queue.h> +#include <sys/syslog.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/netisr.h> +#include <net/pfil.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip_var.h> +#include <netinet/ip_fw.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet6/in6_var.h> +#include <netinet6/ip6_var.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/nptv6/nptv6.h> + +static VNET_DEFINE(uint16_t, nptv6_eid) = 0; +#define V_nptv6_eid VNET(nptv6_eid) +#define IPFW_TLV_NPTV6_NAME IPFW_TLV_EACTION_NAME(V_nptv6_eid) + +static struct nptv6_cfg *nptv6_alloc_config(const char *name, uint8_t set); +static void nptv6_free_config(struct nptv6_cfg *cfg); +static struct nptv6_cfg *nptv6_find(struct namedobj_instance *ni, + const char *name, uint8_t set); +static int nptv6_rewrite_internal(struct nptv6_cfg *cfg, struct mbuf **mp, + int offset); +static int nptv6_rewrite_external(struct nptv6_cfg *cfg, struct mbuf **mp, + int offset); + +#define NPTV6_LOOKUP(chain, cmd) \ + (struct nptv6_cfg *)SRV_OBJECT((chain), (cmd)->arg1) + +#ifndef IN6_MASK_ADDR +#define IN6_MASK_ADDR(a, m) do { \ + (a)->s6_addr32[0] &= (m)->s6_addr32[0]; \ + (a)->s6_addr32[1] &= (m)->s6_addr32[1]; \ + (a)->s6_addr32[2] &= (m)->s6_addr32[2]; \ + (a)->s6_addr32[3] &= (m)->s6_addr32[3]; \ +} while (0) +#endif +#ifndef IN6_ARE_MASKED_ADDR_EQUAL +#define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ + (((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \ + (((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \ + (((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \ + (((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 ) +#endif + +#if 0 +#define NPTV6_DEBUG(fmt, ...) do { \ + printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \ +} while (0) +#define NPTV6_IPDEBUG(fmt, ...) do { \ + char _s[INET6_ADDRSTRLEN], _d[INET6_ADDRSTRLEN]; \ + printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \ +} while (0) +#else +#define NPTV6_DEBUG(fmt, ...) +#define NPTV6_IPDEBUG(fmt, ...) +#endif + +static int +nptv6_getlasthdr(struct nptv6_cfg *cfg, struct mbuf *m, int *offset) +{ + struct ip6_hdr *ip6; + struct ip6_hbh *hbh; + int proto, hlen; + + hlen = (offset == NULL) ? 
0: *offset;
+	if (m->m_len < hlen)
+		return (-1);
+	ip6 = mtodo(m, hlen);
+	hlen += sizeof(*ip6);
+	proto = ip6->ip6_nxt;
+	while (proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING ||
+	    proto == IPPROTO_DSTOPTS) {
+		hbh = mtodo(m, hlen);
+		if (m->m_len < hlen)
+			return (-1);
+		proto = hbh->ip6h_nxt;
+		/*
+		 * ip6h_len counts 8-octet units beyond the first eight
+		 * octets of the extension header (RFC 2460).
+		 */
+		hlen += (hbh->ip6h_len + 1) << 3;
+	}
+	if (offset != NULL)
+		*offset = hlen;
+	return (proto);
+}
+
+static int
+nptv6_translate_icmpv6(struct nptv6_cfg *cfg, struct mbuf **mp, int offset)
+{
+	struct icmp6_hdr *icmp6;
+	struct ip6_hdr *ip6;
+	struct mbuf *m;
+
+	m = *mp;
+	if (offset > m->m_len)
+		return (-1);
+	icmp6 = mtodo(m, offset);
+	NPTV6_DEBUG("ICMPv6 type %d", icmp6->icmp6_type);
+	switch (icmp6->icmp6_type) {
+	case ICMP6_DST_UNREACH:
+	case ICMP6_PACKET_TOO_BIG:
+	case ICMP6_TIME_EXCEEDED:
+	case ICMP6_PARAM_PROB:
+		break;
+	case ICMP6_ECHO_REQUEST:
+	case ICMP6_ECHO_REPLY:
+		/* nothing to translate */
+		return (0);
+	default:
+		/*
+		 * XXX: We could add some checks so as not to translate NDP
+		 * and MLD messages. Currently the user must explicitly allow
+		 * these message types; otherwise packets will be dropped.
+		 */
+		return (-1);
+	}
+	offset += sizeof(*icmp6);
+	if (offset + sizeof(*ip6) > m->m_pkthdr.len)
+		return (-1);
+	if (offset + sizeof(*ip6) > m->m_len)
+		*mp = m = m_pullup(m, offset + sizeof(*ip6));
+	if (m == NULL)
+		return (-1);
+	ip6 = mtodo(m, offset);
+	NPTV6_IPDEBUG("offset %d, %s -> %s %d", offset,
+	    inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)),
+	    inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)),
+	    ip6->ip6_nxt);
+	if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_src,
+	    &cfg->external, &cfg->mask))
+		return (nptv6_rewrite_external(cfg, mp, offset));
+	else if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_dst,
+	    &cfg->internal, &cfg->mask))
+		return (nptv6_rewrite_internal(cfg, mp, offset));
+	/*
+	 * The addresses in the inner IPv6 header match neither of
+	 * our prefixes.
+	 */
+	return (-1);
+}
+
+static int
+nptv6_search_index(struct nptv6_cfg *cfg, struct in6_addr *a)
+{
+	int idx;
+
+	if (cfg->flags & NPTV6_48PLEN)
+		return (3);
+
+	/* Search for a suitable word index for the adjustment */
+	for (idx = 4; idx < 8; idx++)
+		if (a->s6_addr16[idx] != 0xffff)
+			break;
+	/*
+	 * RFC 6296 p3.7: If an NPTv6 Translator discovers a datagram with
+	 * an IID of all-zeros while performing address mapping, that
+	 * datagram MUST be dropped, and an ICMPv6 Parameter Problem error
+	 * SHOULD be generated.
+	 */
+	if (idx == 8 ||
+	    (a->s6_addr32[2] == 0 && a->s6_addr32[3] == 0))
+		return (-1);
+	return (idx);
+}
+
+static void
+nptv6_copy_addr(struct in6_addr *src, struct in6_addr *dst,
+    struct in6_addr *mask)
+{
+	int i;
+
+	for (i = 0; i < 8 && mask->s6_addr8[i] != 0; i++) {
+		dst->s6_addr8[i] &= ~mask->s6_addr8[i];
+		dst->s6_addr8[i] |= src->s6_addr8[i] & mask->s6_addr8[i];
+	}
+}
+
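+/*
+ * The rewrite functions below implement the RFC 6296 checksum-neutral
+ * mapping: one 16-bit word of the address absorbs the difference
+ * between the one's-complement sums of the two prefixes, so transport
+ * checksums covering the IPv6 pseudo-header remain valid without being
+ * rewritten.  A minimal userland sketch of the arithmetic, with
+ * example prefixes fd01:203:405::/48 -> 2001:db8:1::/48; cksum_add()
+ * is a local stand-in with the kernel helper's one's-complement
+ * semantics, the 0xffff test maps one's-complement -0 to +0 exactly as
+ * the rewrite code does, and the reverse direction simply adds the
+ * complement, ~adjustment:
+ *
+ *	#include <stdint.h>
+ *	#include <stdio.h>
+ *
+ *	static uint16_t
+ *	cksum_add(uint16_t sum, uint16_t a)
+ *	{
+ *		uint32_t t = sum + a;
+ *
+ *		return ((t & 0xffff) + (t >> 16));
+ *	}
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		uint16_t in[3] = { 0xfd01, 0x0203, 0x0405 };
+ *		uint16_t ex[3] = { 0x2001, 0x0db8, 0x0001 };
+ *		uint16_t i = 0, e = 0, adj, word = 0x1234;
+ *		int n;
+ *
+ *		for (n = 0; n < 3; n++) {
+ *			i = cksum_add(i, in[n]);
+ *			e = cksum_add(e, ex[n]);
+ *		}
+ *		adj = cksum_add(~e, i);
+ *		word = cksum_add(word, adj);
+ *		if (word == 0xffff)
+ *			word = 0;
+ *		printf("adjustment %#x, word 0x1234 -> %#x\n", adj, word);
+ *		return (0);
+ *	}
+ */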
+static int
+nptv6_rewrite_internal(struct nptv6_cfg *cfg, struct mbuf **mp, int offset)
+{
+	struct in6_addr *addr;
+	struct ip6_hdr *ip6;
+	int idx, proto;
+	uint16_t adj;
+
+	ip6 = mtodo(*mp, offset);
+	NPTV6_IPDEBUG("offset %d, %s -> %s %d", offset,
+	    inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)),
+	    inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)),
+	    ip6->ip6_nxt);
+	if (offset == 0)
+		addr = &ip6->ip6_src;
+	else {
+		/*
+		 * When we are rewriting the inner IPv6 header, we need to
+		 * rewrite the destination address back to the external
+		 * prefix. The datagram in the ICMPv6 payload should look
+		 * like it was sent from the external prefix.
+		 */
+		addr = &ip6->ip6_dst;
+	}
+	idx = nptv6_search_index(cfg, addr);
+	if (idx < 0) {
+		/*
+		 * Do not send an ICMPv6 error when offset isn't zero:
+		 * in that case we are rewriting an inner IPv6 header
+		 * inside an ICMPv6 error message.
+		 */
+		if (offset == 0) {
+			icmp6_error2(*mp, ICMP6_DST_UNREACH,
+			    ICMP6_DST_UNREACH_ADDR, 0, (*mp)->m_pkthdr.rcvif);
+			*mp = NULL;
+		}
+		return (IP_FW_DENY);
+	}
+	adj = addr->s6_addr16[idx];
+	nptv6_copy_addr(&cfg->external, addr, &cfg->mask);
+	adj = cksum_add(adj, cfg->adjustment);
+	if (adj == 0xffff)
+		adj = 0;
+	addr->s6_addr16[idx] = adj;
+	if (offset == 0) {
+		/*
+		 * We may need to translate addresses in the inner IPv6
+		 * header for ICMPv6 error messages.
+		 */
+		proto = nptv6_getlasthdr(cfg, *mp, &offset);
+		if (proto < 0 || (proto == IPPROTO_ICMPV6 &&
+		    nptv6_translate_icmpv6(cfg, mp, offset) != 0))
+			return (IP_FW_DENY);
+		NPTV6STAT_INC(cfg, in2ex);
+	}
+	return (0);
+}
+
+static int
+nptv6_rewrite_external(struct nptv6_cfg *cfg, struct mbuf **mp, int offset)
+{
+	struct in6_addr *addr;
+	struct ip6_hdr *ip6;
+	int idx, proto;
+	uint16_t adj;
+
+	ip6 = mtodo(*mp, offset);
+	NPTV6_IPDEBUG("offset %d, %s -> %s %d", offset,
+	    inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)),
+	    inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)),
+	    ip6->ip6_nxt);
+	if (offset == 0)
+		addr = &ip6->ip6_dst;
+	else {
+		/*
+		 * When we are rewriting the inner IPv6 header, we need to
+		 * rewrite the source address back to the internal prefix.
+		 * The datagram in the ICMPv6 payload should look like it
+		 * was sent from the internal prefix.
+		 */
+		addr = &ip6->ip6_src;
+	}
+	idx = nptv6_search_index(cfg, addr);
+	if (idx < 0) {
+		/*
+		 * Do not send an ICMPv6 error when offset isn't zero:
+		 * in that case we are rewriting an inner IPv6 header
+		 * inside an ICMPv6 error message.
+		 */
+		if (offset == 0) {
+			icmp6_error2(*mp, ICMP6_DST_UNREACH,
+			    ICMP6_DST_UNREACH_ADDR, 0, (*mp)->m_pkthdr.rcvif);
+			*mp = NULL;
+		}
+		return (IP_FW_DENY);
+	}
+	adj = addr->s6_addr16[idx];
+	nptv6_copy_addr(&cfg->internal, addr, &cfg->mask);
+	adj = cksum_add(adj, ~cfg->adjustment);
+	if (adj == 0xffff)
+		adj = 0;
+	addr->s6_addr16[idx] = adj;
+	if (offset == 0) {
+		/*
+		 * We may need to translate addresses in the inner IPv6
+		 * header for ICMPv6 error messages.
+		 */
+		proto = nptv6_getlasthdr(cfg, *mp, &offset);
+		if (proto < 0 || (proto == IPPROTO_ICMPV6 &&
+		    nptv6_translate_icmpv6(cfg, mp, offset) != 0))
+			return (IP_FW_DENY);
+		NPTV6STAT_INC(cfg, ex2in);
+	}
+	return (0);
+}
+
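+/*
+ * A sketch of how an nptv6 instance is referenced from a rule: the
+ * compiled rule carries an O_EXTERNAL_ACTION opcode whose arg1 is the
+ * nptv6 external action id, immediately followed by an
+ * O_EXTERNAL_INSTANCE opcode whose arg1 is the kernel index (kidx) of
+ * one configured instance:
+ *
+ *	[ O_EXTERNAL_ACTION   arg1 = V_nptv6_eid ]
+ *	[ O_EXTERNAL_INSTANCE arg1 = kidx        ]
+ *
+ * NPTV6_LOOKUP() resolves kidx through the chain's SRV_OBJECT() array,
+ * which is why the handler below re-checks both opcodes before using
+ * the configuration pointer.
+ */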
+/*
+ * ipfw external action handler.
+ */
+static int
+ipfw_nptv6(struct ip_fw_chain *chain, struct ip_fw_args *args,
+    ipfw_insn *cmd, int *done)
+{
+	struct ip6_hdr *ip6;
+	struct nptv6_cfg *cfg;
+	ipfw_insn *icmd;
+	int ret;
+
+	*done = 0; /* try next rule if not matched */
+	icmd = cmd + 1;
+	if (cmd->opcode != O_EXTERNAL_ACTION ||
+	    cmd->arg1 != V_nptv6_eid ||
+	    icmd->opcode != O_EXTERNAL_INSTANCE ||
+	    (cfg = NPTV6_LOOKUP(chain, icmd)) == NULL)
+		return (0);
+	/*
+	 * We need to act as a router, so do nothing when forwarding
+	 * is disabled.
+	 */
+	if (V_ip6_forwarding == 0 || args->f_id.addr_type != 6)
+		return (0);
+	/*
+	 * NOTE: we expect that ipfw_chk() did m_pullup() up to the upper
+	 * level protocol's headers. Also we skip some checks that
+	 * ip6_input(), ip6_forward(), ip6_fastfwd() and ipfw_chk() have
+	 * already done.
+	 */
+	ret = IP_FW_DENY;
+	ip6 = mtod(args->m, struct ip6_hdr *);
+	NPTV6_IPDEBUG("eid %u, oid %u, %s -> %s %d",
+	    cmd->arg1, icmd->arg1,
+	    inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)),
+	    inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)),
+	    ip6->ip6_nxt);
+	if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_src,
+	    &cfg->internal, &cfg->mask)) {
+		/*
+		 * XXX: Do not translate packets when both src and dst
+		 * are from the internal prefix.
+		 */
+		if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_dst,
+		    &cfg->internal, &cfg->mask))
+			return (0);
+		ret = nptv6_rewrite_internal(cfg, &args->m, 0);
+	} else if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_dst,
+	    &cfg->external, &cfg->mask))
+		ret = nptv6_rewrite_external(cfg, &args->m, 0);
+	else
+		return (0);
+	/*
+	 * If the address was not rewritten, free the mbuf.
+	 */
+	if (ret != 0) {
+		if (args->m != NULL) {
+			m_freem(args->m);
+			args->m = NULL; /* mark mbuf as consumed */
+		}
+		NPTV6STAT_INC(cfg, dropped);
+	}
+	/* Terminate the search if one_pass is set */
+	*done = V_fw_one_pass;
+	/* Update args->f_id when one_pass is off */
+	if (*done == 0 && ret == 0) {
+		ip6 = mtod(args->m, struct ip6_hdr *);
+		args->f_id.src_ip6 = ip6->ip6_src;
+		args->f_id.dst_ip6 = ip6->ip6_dst;
+	}
+	return (ret);
+}
+
+static struct nptv6_cfg *
+nptv6_alloc_config(const char *name, uint8_t set)
+{
+	struct nptv6_cfg *cfg;
+
+	cfg = malloc(sizeof(struct nptv6_cfg), M_IPFW, M_WAITOK | M_ZERO);
+	COUNTER_ARRAY_ALLOC(cfg->stats, NPTV6STATS, M_WAITOK);
+	cfg->no.name = cfg->name;
+	cfg->no.etlv = IPFW_TLV_NPTV6_NAME;
+	cfg->no.set = set;
+	strlcpy(cfg->name, name, sizeof(cfg->name));
+	return (cfg);
+}
+
+static void
+nptv6_free_config(struct nptv6_cfg *cfg)
+{
+
+	COUNTER_ARRAY_FREE(cfg->stats, NPTV6STATS);
+	free(cfg, M_IPFW);
+}
+
+static void
+nptv6_export_config(struct ip_fw_chain *ch, struct nptv6_cfg *cfg,
+    ipfw_nptv6_cfg *uc)
+{
+
+	uc->internal = cfg->internal;
+	uc->external = cfg->external;
+	uc->plen = cfg->plen;
+	uc->flags = cfg->flags & NPTV6_FLAGSMASK;
+	uc->set = cfg->no.set;
+	strlcpy(uc->name, cfg->no.name, sizeof(uc->name));
+}
+
+struct nptv6_dump_arg {
+	struct ip_fw_chain *ch;
+	struct sockopt_data *sd;
+};
+
+static int
+export_config_cb(struct namedobj_instance *ni, struct named_object *no,
+    void *arg)
+{
+	struct nptv6_dump_arg *da = (struct nptv6_dump_arg *)arg;
+	ipfw_nptv6_cfg *uc;
+
+	uc = (ipfw_nptv6_cfg *)ipfw_get_sopt_space(da->sd, sizeof(*uc));
+	nptv6_export_config(da->ch, (struct nptv6_cfg *)no, uc);
+	return (0);
+}
+
+static struct nptv6_cfg *
+nptv6_find(struct namedobj_instance *ni, const char *name, uint8_t set)
+{
+	struct nptv6_cfg *cfg;
+
+	cfg = (struct nptv6_cfg *)ipfw_objhash_lookup_name_type(ni, set,
+	    IPFW_TLV_NPTV6_NAME, name);
+
+	return (cfg);
+}
+
+static void
+nptv6_calculate_adjustment(struct nptv6_cfg *cfg)
+{
+	uint16_t i, e;
+	uint16_t *p;
+
+	/* Calculate checksum of internal prefix */
+	for (i = 0, p = (uint16_t *)&cfg->internal;
+	    p < (uint16_t *)(&cfg->internal + 1); p++)
+		i = cksum_add(i, *p);
+
+	/* Calculate checksum of external prefix */
+	for (e = 0, p = (uint16_t *)&cfg->external;
+	    p < (uint16_t *)(&cfg->external + 1); p++)
+		e = cksum_add(e, *p);
+
+	/* Adjustment value for Int->Ext direction */
+	cfg->adjustment = cksum_add(~e, i);
+}
+
+/*
+ * Creates new NPTv6 instance.
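+ * The prefixes supplied by userland must be neither multicast,
+ * unspecified nor link-local, must share the single prefix length
+ * plen (8..64), and must differ once masked to plen bits.  A pair of
+ * /48s (or shorter), e.g. the hypothetical fd01:203:405::/48 and
+ * 2001:db8:1::/48, additionally sets NPTV6_48PLEN so that the
+ * checksum adjustment always lands in address word 3, the subnet id.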
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ipfw_nptv6_cfg ] + * + * Returns 0 on success + */ +static int +nptv6_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct in6_addr mask; + ipfw_obj_lheader *olh; + ipfw_nptv6_cfg *uc; + struct namedobj_instance *ni; + struct nptv6_cfg *cfg; + + if (sd->valsize != sizeof(*olh) + sizeof(*uc)) + return (EINVAL); + + olh = (ipfw_obj_lheader *)sd->kbuf; + uc = (ipfw_nptv6_cfg *)(olh + 1); + if (ipfw_check_object_name_generic(uc->name) != 0) + return (EINVAL); + if (uc->plen < 8 || uc->plen > 64 || uc->set >= IPFW_MAX_SETS) + return (EINVAL); + if (IN6_IS_ADDR_MULTICAST(&uc->internal) || + IN6_IS_ADDR_MULTICAST(&uc->external) || + IN6_IS_ADDR_UNSPECIFIED(&uc->internal) || + IN6_IS_ADDR_UNSPECIFIED(&uc->external) || + IN6_IS_ADDR_LINKLOCAL(&uc->internal) || + IN6_IS_ADDR_LINKLOCAL(&uc->external)) + return (EINVAL); + in6_prefixlen2mask(&mask, uc->plen); + if (IN6_ARE_MASKED_ADDR_EQUAL(&uc->internal, &uc->external, &mask)) + return (EINVAL); + + ni = CHAIN_TO_SRV(ch); + IPFW_UH_RLOCK(ch); + if (nptv6_find(ni, uc->name, uc->set) != NULL) { + IPFW_UH_RUNLOCK(ch); + return (EEXIST); + } + IPFW_UH_RUNLOCK(ch); + + cfg = nptv6_alloc_config(uc->name, uc->set); + cfg->plen = uc->plen; + if (cfg->plen <= 48) + cfg->flags |= NPTV6_48PLEN; + cfg->internal = uc->internal; + cfg->external = uc->external; + cfg->mask = mask; + IN6_MASK_ADDR(&cfg->internal, &mask); + IN6_MASK_ADDR(&cfg->external, &mask); + nptv6_calculate_adjustment(cfg); + + IPFW_UH_WLOCK(ch); + if (ipfw_objhash_alloc_idx(ni, &cfg->no.kidx) != 0) { + IPFW_UH_WUNLOCK(ch); + nptv6_free_config(cfg); + return (ENOSPC); + } + ipfw_objhash_add(ni, &cfg->no); + IPFW_WLOCK(ch); + SRV_OBJECT(ch, cfg->no.kidx) = cfg; + IPFW_WUNLOCK(ch); + IPFW_UH_WUNLOCK(ch); + return (0); +} + +/* + * Destroys NPTv6 instance. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ] + * + * Returns 0 on success + */ +static int +nptv6_destroy(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_header *oh; + struct nptv6_cfg *cfg; + + if (sd->valsize != sizeof(*oh)) + return (EINVAL); + + oh = (ipfw_obj_header *)sd->kbuf; + if (ipfw_check_object_name_generic(oh->ntlv.name) != 0) + return (EINVAL); + + IPFW_UH_WLOCK(ch); + cfg = nptv6_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } + if (cfg->no.refcnt > 0) { + IPFW_UH_WUNLOCK(ch); + return (EBUSY); + } + + IPFW_WLOCK(ch); + SRV_OBJECT(ch, cfg->no.kidx) = NULL; + IPFW_WUNLOCK(ch); + + ipfw_objhash_del(CHAIN_TO_SRV(ch), &cfg->no); + ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), cfg->no.kidx); + IPFW_UH_WUNLOCK(ch); + + nptv6_free_config(cfg); + return (0); +} + +/* + * Get or change nptv6 instance config. + * Request: [ ipfw_obj_header [ ipfw_nptv6_cfg ] ] + */ +static int +nptv6_config(struct ip_fw_chain *chain, ip_fw3_opheader *op, + struct sockopt_data *sd) +{ + + return (EOPNOTSUPP); +} + +/* + * Lists all NPTv6 instances currently available in kernel. 
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ] + * Reply: [ ipfw_obj_lheader ipfw_nptv6_cfg x N ] + * + * Returns 0 on success + */ +static int +nptv6_list(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_lheader *olh; + struct nptv6_dump_arg da; + + /* Check minimum header size */ + if (sd->valsize < sizeof(ipfw_obj_lheader)) + return (EINVAL); + + olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh)); + + IPFW_UH_RLOCK(ch); + olh->count = ipfw_objhash_count_type(CHAIN_TO_SRV(ch), + IPFW_TLV_NPTV6_NAME); + olh->objsize = sizeof(ipfw_nptv6_cfg); + olh->size = sizeof(*olh) + olh->count * olh->objsize; + + if (sd->valsize < olh->size) { + IPFW_UH_RUNLOCK(ch); + return (ENOMEM); + } + memset(&da, 0, sizeof(da)); + da.ch = ch; + da.sd = sd; + ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), export_config_cb, + &da, IPFW_TLV_NPTV6_NAME); + IPFW_UH_RUNLOCK(ch); + + return (0); +} + +#define __COPY_STAT_FIELD(_cfg, _stats, _field) \ + (_stats)->_field = NPTV6STAT_FETCH(_cfg, _field) +static void +export_stats(struct ip_fw_chain *ch, struct nptv6_cfg *cfg, + struct ipfw_nptv6_stats *stats) +{ + + __COPY_STAT_FIELD(cfg, stats, in2ex); + __COPY_STAT_FIELD(cfg, stats, ex2in); + __COPY_STAT_FIELD(cfg, stats, dropped); +} + +/* + * Get NPTv6 statistics. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ] + * Reply: [ ipfw_obj_header ipfw_obj_ctlv [ uint64_t x N ]] + * + * Returns 0 on success + */ +static int +nptv6_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, + struct sockopt_data *sd) +{ + struct ipfw_nptv6_stats stats; + struct nptv6_cfg *cfg; + ipfw_obj_header *oh; + ipfw_obj_ctlv *ctlv; + size_t sz; + + sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) + sizeof(stats); + if (sd->valsize % sizeof(uint64_t)) + return (EINVAL); + if (sd->valsize < sz) + return (ENOMEM); + oh = (ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); + if (oh == NULL) + return (EINVAL); + if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || + oh->ntlv.set >= IPFW_MAX_SETS) + return (EINVAL); + memset(&stats, 0, sizeof(stats)); + + IPFW_UH_RLOCK(ch); + cfg = nptv6_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_RUNLOCK(ch); + return (ESRCH); + } + export_stats(ch, cfg, &stats); + IPFW_UH_RUNLOCK(ch); + + ctlv = (ipfw_obj_ctlv *)(oh + 1); + memset(ctlv, 0, sizeof(*ctlv)); + ctlv->head.type = IPFW_TLV_COUNTERS; + ctlv->head.length = sz - sizeof(ipfw_obj_header); + ctlv->count = sizeof(stats) / sizeof(uint64_t); + ctlv->objsize = sizeof(uint64_t); + ctlv->version = 1; + memcpy(ctlv + 1, &stats, sizeof(stats)); + return (0); +} + +/* + * Reset NPTv6 statistics. 
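+ * The statistics are counter(9) per-CPU counters: NPTV6STAT_FETCH
+ * sums the per-CPU slots while traffic keeps flowing, and the reset
+ * below zeroes all slots of every counter with COUNTER_ARRAY_ZERO()
+ * under the UH write lock.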
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_header ] + * + * Returns 0 on success + */ +static int +nptv6_reset_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, + struct sockopt_data *sd) +{ + struct nptv6_cfg *cfg; + ipfw_obj_header *oh; + + if (sd->valsize != sizeof(*oh)) + return (EINVAL); + oh = (ipfw_obj_header *)sd->kbuf; + if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || + oh->ntlv.set >= IPFW_MAX_SETS) + return (EINVAL); + + IPFW_UH_WLOCK(ch); + cfg = nptv6_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (cfg == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } + COUNTER_ARRAY_ZERO(cfg->stats, NPTV6STATS); + IPFW_UH_WUNLOCK(ch); + return (0); +} + +static struct ipfw_sopt_handler scodes[] = { + { IP_FW_NPTV6_CREATE, 0, HDIR_SET, nptv6_create }, + { IP_FW_NPTV6_DESTROY,0, HDIR_SET, nptv6_destroy }, + { IP_FW_NPTV6_CONFIG, 0, HDIR_BOTH, nptv6_config }, + { IP_FW_NPTV6_LIST, 0, HDIR_GET, nptv6_list }, + { IP_FW_NPTV6_STATS, 0, HDIR_GET, nptv6_stats }, + { IP_FW_NPTV6_RESET_STATS,0, HDIR_SET, nptv6_reset_stats }, +}; + +static int +nptv6_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +{ + ipfw_insn *icmd; + + icmd = cmd - 1; + NPTV6_DEBUG("opcode %d, arg1 %d, opcode0 %d, arg1 %d", + cmd->opcode, cmd->arg1, icmd->opcode, icmd->arg1); + if (icmd->opcode != O_EXTERNAL_ACTION || + icmd->arg1 != V_nptv6_eid) + return (1); + + *puidx = cmd->arg1; + *ptype = 0; + return (0); +} + +static void +nptv6_update_arg1(ipfw_insn *cmd, uint16_t idx) +{ + + cmd->arg1 = idx; + NPTV6_DEBUG("opcode %d, arg1 -> %d", cmd->opcode, cmd->arg1); +} + +static int +nptv6_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, + struct named_object **pno) +{ + int err; + + err = ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti, + IPFW_TLV_NPTV6_NAME, pno); + NPTV6_DEBUG("uidx %u, type %u, err %d", ti->uidx, ti->type, err); + return (err); +} + +static struct named_object * +nptv6_findbykidx(struct ip_fw_chain *ch, uint16_t idx) +{ + struct namedobj_instance *ni; + struct named_object *no; + + IPFW_UH_WLOCK_ASSERT(ch); + ni = CHAIN_TO_SRV(ch); + no = ipfw_objhash_lookup_kidx(ni, idx); + KASSERT(no != NULL, ("NPT with index %d not found", idx)); + + NPTV6_DEBUG("kidx %u -> %s", idx, no->name); + return (no); +} + +static int +nptv6_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set, + enum ipfw_sets_cmd cmd) +{ + + return (ipfw_obj_manage_sets(CHAIN_TO_SRV(ch), IPFW_TLV_NPTV6_NAME, + set, new_set, cmd)); +} + +static struct opcode_obj_rewrite opcodes[] = { + { + .opcode = O_EXTERNAL_INSTANCE, + .etlv = IPFW_TLV_EACTION /* just show it isn't table */, + .classifier = nptv6_classify, + .update = nptv6_update_arg1, + .find_byname = nptv6_findbyname, + .find_bykidx = nptv6_findbykidx, + .manage_sets = nptv6_manage_sets, + }, +}; + +static int +destroy_config_cb(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + struct nptv6_cfg *cfg; + struct ip_fw_chain *ch; + + ch = (struct ip_fw_chain *)arg; + IPFW_UH_WLOCK_ASSERT(ch); + + cfg = (struct nptv6_cfg *)SRV_OBJECT(ch, no->kidx); + SRV_OBJECT(ch, no->kidx) = NULL; + ipfw_objhash_del(ni, &cfg->no); + ipfw_objhash_free_idx(ni, cfg->no.kidx); + nptv6_free_config(cfg); + return (0); +} + +int +nptv6_init(struct ip_fw_chain *ch, int first) +{ + + V_nptv6_eid = ipfw_add_eaction(ch, ipfw_nptv6, "nptv6"); + if (V_nptv6_eid == 0) + return (ENXIO); + IPFW_ADD_SOPT_HANDLER(first, scodes); + IPFW_ADD_OBJ_REWRITER(first, opcodes); + return (0); +} + +void +nptv6_uninit(struct ip_fw_chain *ch, int last) +{ + + 
IPFW_DEL_OBJ_REWRITER(last, opcodes);
+	IPFW_DEL_SOPT_HANDLER(last, scodes);
+	ipfw_del_eaction(ch, V_nptv6_eid);
+	/*
+	 * Since we have already deregistered the external action, our
+	 * named objects have become inaccessible via rules, because all
+	 * rules were truncated by ipfw_del_eaction().
+	 * So we can unlink and destroy our named objects without holding
+	 * IPFW_WLOCK().
+	 */
+	IPFW_UH_WLOCK(ch);
+	ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), destroy_config_cb, ch,
+	    IPFW_TLV_NPTV6_NAME);
+	V_nptv6_eid = 0;
+	IPFW_UH_WUNLOCK(ch);
+}
+
diff --git a/freebsd/sys/netpfil/ipfw/nptv6/nptv6.h b/freebsd/sys/netpfil/ipfw/nptv6/nptv6.h
new file mode 100644
index 00000000..95b04bfe
--- /dev/null
+++ b/freebsd/sys/netpfil/ipfw/nptv6/nptv6.h
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2016 Yandex LLC
+ * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + * $FreeBSD$ + */ + +#ifndef _IP_FW_NPTV6_H_ +#define _IP_FW_NPTV6_H_ + +#include <netinet6/ip_fw_nptv6.h> + +#ifdef _KERNEL +#define NPTV6STATS (sizeof(struct ipfw_nptv6_stats) / sizeof(uint64_t)) +#define NPTV6STAT_ADD(c, f, v) \ + counter_u64_add((c)->stats[ \ + offsetof(struct ipfw_nptv6_stats, f) / sizeof(uint64_t)], (v)) +#define NPTV6STAT_INC(c, f) NPTV6STAT_ADD(c, f, 1) +#define NPTV6STAT_FETCH(c, f) \ + counter_u64_fetch((c)->stats[ \ + offsetof(struct ipfw_nptv6_stats, f) / sizeof(uint64_t)]) + +struct nptv6_cfg { + struct named_object no; + + struct in6_addr internal; /* Internal IPv6 prefix */ + struct in6_addr external; /* External IPv6 prefix */ + struct in6_addr mask; /* IPv6 prefix mask */ + uint16_t adjustment; /* Checksum adjustment value */ + uint8_t plen; /* Prefix length */ + uint8_t flags; /* Flags for internal use */ +#define NPTV6_48PLEN 0x0001 + char name[64]; /* Instance name */ + counter_u64_t stats[NPTV6STATS]; /* Statistics counters */ +}; +#define NPTV6_FLAGSMASK 0 + +int nptv6_init(struct ip_fw_chain *ch, int first); +void nptv6_uninit(struct ip_fw_chain *ch, int last); +#endif /* _KERNEL */ + +#endif /* _IP_FW_NPTV6_H_ */ + diff --git a/freebsd/sys/netpfil/pf/if_pflog.c b/freebsd/sys/netpfil/pf/if_pflog.c new file mode 100644 index 00000000..3a364abc --- /dev/null +++ b/freebsd/sys/netpfil/pf/if_pflog.c @@ -0,0 +1,320 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * The authors of this code are John Ioannidis (ji@tla.org), + * Angelos D. Keromytis (kermit@csd.uch.gr) and + * Niels Provos (provos@physnet.uni-hamburg.de). + * + * This code was written by John Ioannidis for BSD/OS in Athens, Greece, + * in November 1995. + * + * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996, + * by Angelos D. Keromytis. + * + * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis + * and Niels Provos. + * + * Copyright (C) 1995, 1996, 1997, 1998 by John Ioannidis, Angelos D. Keromytis + * and Niels Provos. + * Copyright (c) 2001, Angelos D. Keromytis, Niels Provos. + * + * Permission to use, copy, and modify this software with or without fee + * is hereby granted, provided that this entire notice is included in + * all copies of any software which is or includes a copy or + * modification of this software. + * You may use this code under the GNU public license if you so wish. Please + * contribute changes back to the authors under this freer than GPL license + * so that we may further the use of strong encryption without limitations to + * all. + * + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE + * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR + * PURPOSE. 
+ * + * $OpenBSD: if_pflog.c,v 1.26 2007/10/18 21:58:18 mpf Exp $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> +#include <rtems/bsd/local/opt_bpf.h> +#include <rtems/bsd/local/opt_pf.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/sockio.h> + +#include <net/bpf.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_clone.h> +#include <net/if_pflog.h> +#include <net/if_types.h> +#include <net/vnet.h> +#include <net/pfvar.h> + +#if defined(INET) || defined(INET6) +#include <netinet/in.h> +#endif +#ifdef INET +#include <netinet/in_var.h> +#include <netinet/ip.h> +#endif + +#ifdef INET6 +#include <netinet6/in6_var.h> +#include <netinet6/nd6.h> +#endif /* INET6 */ + +#ifdef INET +#include <machine/in_cksum.h> +#endif /* INET */ + +#define PFLOGMTU (32768 + MHLEN + MLEN) + +#ifdef PFLOGDEBUG +#define DPRINTF(x) do { if (pflogdebug) printf x ; } while (0) +#else +#define DPRINTF(x) +#endif + +static int pflogoutput(struct ifnet *, struct mbuf *, + const struct sockaddr *, struct route *); +static void pflogattach(int); +static int pflogioctl(struct ifnet *, u_long, caddr_t); +static void pflogstart(struct ifnet *); +static int pflog_clone_create(struct if_clone *, int, caddr_t); +static void pflog_clone_destroy(struct ifnet *); + +static const char pflogname[] = "pflog"; + +static VNET_DEFINE(struct if_clone *, pflog_cloner); +#define V_pflog_cloner VNET(pflog_cloner) + +VNET_DEFINE(struct ifnet *, pflogifs[PFLOGIFS_MAX]); /* for fast access */ +#define V_pflogifs VNET(pflogifs) + +static void +pflogattach(int npflog __unused) +{ + int i; + for (i = 0; i < PFLOGIFS_MAX; i++) + V_pflogifs[i] = NULL; + V_pflog_cloner = if_clone_simple(pflogname, pflog_clone_create, + pflog_clone_destroy, 1); +} + +static int +pflog_clone_create(struct if_clone *ifc, int unit, caddr_t param) +{ + struct ifnet *ifp; + + if (unit >= PFLOGIFS_MAX) + return (EINVAL); + + ifp = if_alloc(IFT_PFLOG); + if (ifp == NULL) { + return (ENOSPC); + } + if_initname(ifp, pflogname, unit); + ifp->if_mtu = PFLOGMTU; + ifp->if_ioctl = pflogioctl; + ifp->if_output = pflogoutput; + ifp->if_start = pflogstart; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifp->if_hdrlen = PFLOG_HDRLEN; + if_attach(ifp); + + bpfattach(ifp, DLT_PFLOG, PFLOG_HDRLEN); + + V_pflogifs[unit] = ifp; + + return (0); +} + +static void +pflog_clone_destroy(struct ifnet *ifp) +{ + int i; + + for (i = 0; i < PFLOGIFS_MAX; i++) + if (V_pflogifs[i] == ifp) + V_pflogifs[i] = NULL; + + bpfdetach(ifp); + if_detach(ifp); + if_free(ifp); +} + +/* + * Start output on the pflog interface. 
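+ * Nothing is ever really transmitted here: pflogstart() below merely
+ * drains and frees whatever was queued, since pflog frames reach
+ * userland only through bpf(4) listeners, e.g. a typical
+ * "tcpdump -n -e -ttt -i pflog0".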
+ */ +static void +pflogstart(struct ifnet *ifp) +{ + struct mbuf *m; + + for (;;) { + IF_LOCK(&ifp->if_snd); + _IF_DEQUEUE(&ifp->if_snd, m); + IF_UNLOCK(&ifp->if_snd); + + if (m == NULL) + return; + else + m_freem(m); + } +} + +static int +pflogoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, + struct route *rt) +{ + m_freem(m); + return (0); +} + +/* ARGSUSED */ +static int +pflogioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + switch (cmd) { + case SIOCSIFFLAGS: + if (ifp->if_flags & IFF_UP) + ifp->if_drv_flags |= IFF_DRV_RUNNING; + else + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + break; + default: + return (ENOTTY); + } + + return (0); +} + +static int +pflog_packet(struct pfi_kif *kif, struct mbuf *m, sa_family_t af, u_int8_t dir, + u_int8_t reason, struct pf_rule *rm, struct pf_rule *am, + struct pf_ruleset *ruleset, struct pf_pdesc *pd, int lookupsafe) +{ + struct ifnet *ifn; + struct pfloghdr hdr; + + if (kif == NULL || m == NULL || rm == NULL || pd == NULL) + return ( 1); + + if ((ifn = V_pflogifs[rm->logif]) == NULL || !ifn->if_bpf) + return (0); + + bzero(&hdr, sizeof(hdr)); + hdr.length = PFLOG_REAL_HDRLEN; + hdr.af = af; + hdr.action = rm->action; + hdr.reason = reason; + memcpy(hdr.ifname, kif->pfik_name, sizeof(hdr.ifname)); + + if (am == NULL) { + hdr.rulenr = htonl(rm->nr); + hdr.subrulenr = 1; + } else { + hdr.rulenr = htonl(am->nr); + hdr.subrulenr = htonl(rm->nr); + if (ruleset != NULL && ruleset->anchor != NULL) + strlcpy(hdr.ruleset, ruleset->anchor->name, + sizeof(hdr.ruleset)); + } + /* + * XXXGL: we avoid pf_socket_lookup() when we are holding + * state lock, since this leads to unsafe LOR. + * These conditions are very very rare, however. + */ + if (rm->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done && lookupsafe) + pd->lookup.done = pf_socket_lookup(dir, pd, m); + if (pd->lookup.done > 0) + hdr.uid = pd->lookup.uid; + else + hdr.uid = UID_MAX; + hdr.pid = NO_PID; + hdr.rule_uid = rm->cuid; + hdr.rule_pid = rm->cpid; + hdr.dir = dir; + +#ifdef INET + if (af == AF_INET && dir == PF_OUT) { + struct ip *ip; + + ip = mtod(m, struct ip *); + ip->ip_sum = 0; + ip->ip_sum = in_cksum(m, ip->ip_hl << 2); + } +#endif /* INET */ + + if_inc_counter(ifn, IFCOUNTER_OPACKETS, 1); + if_inc_counter(ifn, IFCOUNTER_OBYTES, m->m_pkthdr.len); + BPF_MTAP2(ifn, &hdr, PFLOG_HDRLEN, m); + + return (0); +} + +static void +vnet_pflog_init(const void *unused __unused) +{ + + pflogattach(1); +} +VNET_SYSINIT(vnet_pflog_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY, + vnet_pflog_init, NULL); + +static void +vnet_pflog_uninit(const void *unused __unused) +{ + + if_clone_detach(V_pflog_cloner); +} +/* + * Detach after pf is gone; otherwise we might touch pflog memory + * from within pf after freeing pflog. + */ +VNET_SYSUNINIT(vnet_pflog_uninit, SI_SUB_INIT_IF, SI_ORDER_SECOND, + vnet_pflog_uninit, NULL); + +static int +pflog_modevent(module_t mod, int type, void *data) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + PF_RULES_WLOCK(); + pflog_packet_ptr = pflog_packet; + PF_RULES_WUNLOCK(); + break; + case MOD_UNLOAD: + PF_RULES_WLOCK(); + pflog_packet_ptr = NULL; + PF_RULES_WUNLOCK(); + break; + default: + error = EOPNOTSUPP; + break; + } + + return error; +} + +static moduledata_t pflog_mod = { pflogname, pflog_modevent, 0 }; + +#define PFLOG_MODVER 1 + +/* Do not run before pf is initialized as we depend on its locks. 
*/ +DECLARE_MODULE(pflog, pflog_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY); +MODULE_VERSION(pflog, PFLOG_MODVER); +MODULE_DEPEND(pflog, pf, PF_MODVER, PF_MODVER, PF_MODVER); diff --git a/freebsd/sys/netpfil/pf/if_pfsync.c b/freebsd/sys/netpfil/pf/if_pfsync.c new file mode 100644 index 00000000..d6a0dfc0 --- /dev/null +++ b/freebsd/sys/netpfil/pf/if_pfsync.c @@ -0,0 +1,2421 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2002 Michael Shalayeff + * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 2009 David Gwynne <dlg@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +/* + * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $ + * + * Revisions picked from OpenBSD after revision 1.110 import: + * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input() + * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates + * 1.120, 1.175 - use monotonic time_uptime + * 1.122 - reduce number of updates for non-TCP sessions + * 1.125, 1.127 - rewrite merge or stale processing + * 1.128 - cleanups + * 1.146 - bzero() mbuf before sparsely filling it with data + * 1.170 - SIOCSIFMTU checks + * 1.126, 1.142 - deferred packets processing + * 1.173 - correct expire time processing + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> +#include <rtems/bsd/local/opt_pf.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/bus.h> +#include <sys/endian.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/priv.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> + +#include <net/bpf.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_clone.h> +#include <net/if_types.h> +#include <net/vnet.h> +#include <net/pfvar.h> +#include <net/if_pfsync.h> + +#include <netinet/if_ether.h> +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_carp.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> + +#define PFSYNC_MINPKT ( \ + sizeof(struct ip) + \ + sizeof(struct pfsync_header) + \ + sizeof(struct pfsync_subheader) ) + +struct pfsync_pkt { + struct ip *ip; + struct in_addr src; + u_int8_t flags; +}; + +static int pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *, + struct pfsync_state_peer *); +static int pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int); +static int pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int); +static int pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int); +static int pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int); +static int pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int); +static int pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int); +static int pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int); +static int pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int); +static int pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int); +static int pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int); +static int pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int); +static int pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int); + +static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = { + pfsync_in_clr, /* PFSYNC_ACT_CLR */ + pfsync_in_ins, /* PFSYNC_ACT_INS */ + pfsync_in_iack, /* PFSYNC_ACT_INS_ACK */ + pfsync_in_upd, /* PFSYNC_ACT_UPD */ + pfsync_in_upd_c, /* PFSYNC_ACT_UPD_C */ + pfsync_in_ureq, /* PFSYNC_ACT_UPD_REQ */ + pfsync_in_del, /* PFSYNC_ACT_DEL */ + pfsync_in_del_c, /* PFSYNC_ACT_DEL_C */ + pfsync_in_error, /* PFSYNC_ACT_INS_F */ + pfsync_in_error, /* PFSYNC_ACT_DEL_F */ + pfsync_in_bus, /* PFSYNC_ACT_BUS */ + pfsync_in_tdb, /* PFSYNC_ACT_TDB */ + pfsync_in_eof /* PFSYNC_ACT_EOF */ +}; + +struct pfsync_q { + void (*write)(struct pf_state *, void *); + size_t len; + u_int8_t 
action; +}; + +/* we have one of these for every PFSYNC_S_ */ +static void pfsync_out_state(struct pf_state *, void *); +static void pfsync_out_iack(struct pf_state *, void *); +static void pfsync_out_upd_c(struct pf_state *, void *); +static void pfsync_out_del(struct pf_state *, void *); + +static struct pfsync_q pfsync_qs[] = { + { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_INS }, + { pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK }, + { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_UPD }, + { pfsync_out_upd_c, sizeof(struct pfsync_upd_c), PFSYNC_ACT_UPD_C }, + { pfsync_out_del, sizeof(struct pfsync_del_c), PFSYNC_ACT_DEL_C } +}; + +static void pfsync_q_ins(struct pf_state *, int); +static void pfsync_q_del(struct pf_state *); + +static void pfsync_update_state(struct pf_state *); + +struct pfsync_upd_req_item { + TAILQ_ENTRY(pfsync_upd_req_item) ur_entry; + struct pfsync_upd_req ur_msg; +}; + +struct pfsync_deferral { + struct pfsync_softc *pd_sc; + TAILQ_ENTRY(pfsync_deferral) pd_entry; + u_int pd_refs; + struct callout pd_tmo; + + struct pf_state *pd_st; + struct mbuf *pd_m; +}; + +struct pfsync_softc { + /* Configuration */ + struct ifnet *sc_ifp; + struct ifnet *sc_sync_if; + struct ip_moptions sc_imo; + struct in_addr sc_sync_peer; + uint32_t sc_flags; +#define PFSYNCF_OK 0x00000001 +#define PFSYNCF_DEFER 0x00000002 +#define PFSYNCF_PUSH 0x00000004 + uint8_t sc_maxupdates; + struct ip sc_template; + struct callout sc_tmo; + struct mtx sc_mtx; + + /* Queued data */ + size_t sc_len; + TAILQ_HEAD(, pf_state) sc_qs[PFSYNC_S_COUNT]; + TAILQ_HEAD(, pfsync_upd_req_item) sc_upd_req_list; + TAILQ_HEAD(, pfsync_deferral) sc_deferrals; + u_int sc_deferred; + void *sc_plus; + size_t sc_pluslen; + + /* Bulk update info */ + struct mtx sc_bulk_mtx; + uint32_t sc_ureq_sent; + int sc_bulk_tries; + uint32_t sc_ureq_received; + int sc_bulk_hashid; + uint64_t sc_bulk_stateid; + uint32_t sc_bulk_creatorid; + struct callout sc_bulk_tmo; + struct callout sc_bulkfail_tmo; +}; + +#define PFSYNC_LOCK(sc) mtx_lock(&(sc)->sc_mtx) +#define PFSYNC_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) +#define PFSYNC_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) + +#define PFSYNC_BLOCK(sc) mtx_lock(&(sc)->sc_bulk_mtx) +#define PFSYNC_BUNLOCK(sc) mtx_unlock(&(sc)->sc_bulk_mtx) +#define PFSYNC_BLOCK_ASSERT(sc) mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED) + +static const char pfsyncname[] = "pfsync"; +static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data"); +static VNET_DEFINE(struct pfsync_softc *, pfsyncif) = NULL; +#define V_pfsyncif VNET(pfsyncif) +static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL; +#define V_pfsync_swi_cookie VNET(pfsync_swi_cookie) +static VNET_DEFINE(struct pfsyncstats, pfsyncstats); +#define V_pfsyncstats VNET(pfsyncstats) +static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW; +#define V_pfsync_carp_adj VNET(pfsync_carp_adj) + +static void pfsync_timeout(void *); +static void pfsync_push(struct pfsync_softc *); +static void pfsyncintr(void *); +static int pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *, + void *); +static void pfsync_multicast_cleanup(struct pfsync_softc *); +static void pfsync_pointers_init(void); +static void pfsync_pointers_uninit(void); +static int pfsync_init(void); +static void pfsync_uninit(void); + +SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC"); +SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(pfsyncstats), pfsyncstats, + "PFSYNC statistics (struct 
pfsyncstats, net/if_pfsync.h)"); +SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW, + &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment"); + +static int pfsync_clone_create(struct if_clone *, int, caddr_t); +static void pfsync_clone_destroy(struct ifnet *); +static int pfsync_alloc_scrub_memory(struct pfsync_state_peer *, + struct pf_state_peer *); +static int pfsyncoutput(struct ifnet *, struct mbuf *, + const struct sockaddr *, struct route *); +static int pfsyncioctl(struct ifnet *, u_long, caddr_t); + +static int pfsync_defer(struct pf_state *, struct mbuf *); +static void pfsync_undefer(struct pfsync_deferral *, int); +static void pfsync_undefer_state(struct pf_state *, int); +static void pfsync_defer_tmo(void *); + +static void pfsync_request_update(u_int32_t, u_int64_t); +static void pfsync_update_state_req(struct pf_state *); + +static void pfsync_drop(struct pfsync_softc *); +static void pfsync_sendout(int); +static void pfsync_send_plus(void *, size_t); + +static void pfsync_bulk_start(void); +static void pfsync_bulk_status(u_int8_t); +static void pfsync_bulk_update(void *); +static void pfsync_bulk_fail(void *); + +#ifdef IPSEC +static void pfsync_update_net_tdb(struct pfsync_tdb *); +#endif + +#define PFSYNC_MAX_BULKTRIES 12 + +VNET_DEFINE(struct if_clone *, pfsync_cloner); +#define V_pfsync_cloner VNET(pfsync_cloner) + +static int +pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param) +{ + struct pfsync_softc *sc; + struct ifnet *ifp; + int q; + + if (unit != 0) + return (EINVAL); + + sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO); + sc->sc_flags |= PFSYNCF_OK; + + for (q = 0; q < PFSYNC_S_COUNT; q++) + TAILQ_INIT(&sc->sc_qs[q]); + + TAILQ_INIT(&sc->sc_upd_req_list); + TAILQ_INIT(&sc->sc_deferrals); + + sc->sc_len = PFSYNC_MINPKT; + sc->sc_maxupdates = 128; + + ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC); + if (ifp == NULL) { + free(sc, M_PFSYNC); + return (ENOSPC); + } + if_initname(ifp, pfsyncname, unit); + ifp->if_softc = sc; + ifp->if_ioctl = pfsyncioctl; + ifp->if_output = pfsyncoutput; + ifp->if_type = IFT_PFSYNC; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifp->if_hdrlen = sizeof(struct pfsync_header); + ifp->if_mtu = ETHERMTU; + mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF); + mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF); + callout_init(&sc->sc_tmo, 1); + callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0); + callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0); + + if_attach(ifp); + + bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN); + + V_pfsyncif = sc; + + return (0); +} + +static void +pfsync_clone_destroy(struct ifnet *ifp) +{ + struct pfsync_softc *sc = ifp->if_softc; + + /* + * At this stage, everything should have already been + * cleared by pfsync_uninit(), and we have only to + * drain callouts. 
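+ * For every still-pending deferral, a callout_stop() that returns
+ * nonzero means the timeout had not fired yet, so we still own the
+ * state reference and the mbuf and must free them here; otherwise
+ * pfsync_defer_tmo() is already running and callout_drain() waits for
+ * it to finish before the descriptor is freed.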
+ */ + while (sc->sc_deferred > 0) { + struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals); + + TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); + sc->sc_deferred--; + if (callout_stop(&pd->pd_tmo) > 0) { + pf_release_state(pd->pd_st); + m_freem(pd->pd_m); + free(pd, M_PFSYNC); + } else { + pd->pd_refs++; + callout_drain(&pd->pd_tmo); + free(pd, M_PFSYNC); + } + } + + callout_drain(&sc->sc_tmo); + callout_drain(&sc->sc_bulkfail_tmo); + callout_drain(&sc->sc_bulk_tmo); + + if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) + (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy"); + bpfdetach(ifp); + if_detach(ifp); + + pfsync_drop(sc); + + if_free(ifp); + if (sc->sc_imo.imo_membership) + pfsync_multicast_cleanup(sc); + mtx_destroy(&sc->sc_mtx); + mtx_destroy(&sc->sc_bulk_mtx); + free(sc, M_PFSYNC); + + V_pfsyncif = NULL; +} + +static int +pfsync_alloc_scrub_memory(struct pfsync_state_peer *s, + struct pf_state_peer *d) +{ + if (s->scrub.scrub_flag && d->scrub == NULL) { + d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO); + if (d->scrub == NULL) + return (ENOMEM); + } + + return (0); +} + + +static int +pfsync_state_import(struct pfsync_state *sp, u_int8_t flags) +{ + struct pfsync_softc *sc = V_pfsyncif; +#ifndef __NO_STRICT_ALIGNMENT + struct pfsync_state_key key[2]; +#endif + struct pfsync_state_key *kw, *ks; + struct pf_state *st = NULL; + struct pf_state_key *skw = NULL, *sks = NULL; + struct pf_rule *r = NULL; + struct pfi_kif *kif; + int error; + + PF_RULES_RASSERT(); + + if (sp->creatorid == 0) { + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("%s: invalid creator id: %08x\n", __func__, + ntohl(sp->creatorid)); + return (EINVAL); + } + + if ((kif = pfi_kif_find(sp->ifname)) == NULL) { + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("%s: unknown interface: %s\n", __func__, + sp->ifname); + if (flags & PFSYNC_SI_IOCTL) + return (EINVAL); + return (0); /* skip this state */ + } + + /* + * If the ruleset checksums match or the state is coming from the ioctl, + * it's safe to associate the state with the rule of that number. + */ + if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) && + (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) < + pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount) + r = pf_main_ruleset.rules[ + PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)]; + else + r = &V_pf_default_rule; + + if ((r->max_states && + counter_u64_fetch(r->states_cur) >= r->max_states)) + goto cleanup; + + /* + * XXXGL: consider M_WAITOK in ioctl path after. + */ + if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL) + goto cleanup; + + if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL) + goto cleanup; + +#ifndef __NO_STRICT_ALIGNMENT + bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2); + kw = &key[PF_SK_WIRE]; + ks = &key[PF_SK_STACK]; +#else + kw = &sp->key[PF_SK_WIRE]; + ks = &sp->key[PF_SK_STACK]; +#endif + + if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) || + PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) || + kw->port[0] != ks->port[0] || + kw->port[1] != ks->port[1]) { + sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT); + if (sks == NULL) + goto cleanup; + } else + sks = skw; + + /* allocate memory for scrub info */ + if (pfsync_alloc_scrub_memory(&sp->src, &st->src) || + pfsync_alloc_scrub_memory(&sp->dst, &st->dst)) + goto cleanup; + + /* Copy to state key(s). 
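+	 * PF_SK_WIRE is the key as the addresses appear on the wire,
+	 * PF_SK_STACK the key after any NAT rewriting; when the two
+	 * compared equal above, a single pf_state_key serves both
+	 * roles (sks == skw).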
*/ + skw->addr[0] = kw->addr[0]; + skw->addr[1] = kw->addr[1]; + skw->port[0] = kw->port[0]; + skw->port[1] = kw->port[1]; + skw->proto = sp->proto; + skw->af = sp->af; + if (sks != skw) { + sks->addr[0] = ks->addr[0]; + sks->addr[1] = ks->addr[1]; + sks->port[0] = ks->port[0]; + sks->port[1] = ks->port[1]; + sks->proto = sp->proto; + sks->af = sp->af; + } + + /* copy to state */ + bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr)); + st->creation = time_uptime - ntohl(sp->creation); + st->expire = time_uptime; + if (sp->expire) { + uint32_t timeout; + + timeout = r->timeout[sp->timeout]; + if (!timeout) + timeout = V_pf_default_rule.timeout[sp->timeout]; + + /* sp->expire may have been adaptively scaled by export. */ + st->expire -= timeout - ntohl(sp->expire); + } + + st->direction = sp->direction; + st->log = sp->log; + st->timeout = sp->timeout; + st->state_flags = sp->state_flags; + + st->id = sp->id; + st->creatorid = sp->creatorid; + pf_state_peer_ntoh(&sp->src, &st->src); + pf_state_peer_ntoh(&sp->dst, &st->dst); + + st->rule.ptr = r; + st->nat_rule.ptr = NULL; + st->anchor.ptr = NULL; + st->rt_kif = NULL; + + st->pfsync_time = time_uptime; + st->sync_state = PFSYNC_S_NONE; + + if (!(flags & PFSYNC_SI_IOCTL)) + st->state_flags |= PFSTATE_NOSYNC; + + if ((error = pf_state_insert(kif, skw, sks, st)) != 0) + goto cleanup_state; + + /* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */ + counter_u64_add(r->states_cur, 1); + counter_u64_add(r->states_tot, 1); + + if (!(flags & PFSYNC_SI_IOCTL)) { + st->state_flags &= ~PFSTATE_NOSYNC; + if (st->state_flags & PFSTATE_ACK) { + pfsync_q_ins(st, PFSYNC_S_IACK); + pfsync_push(sc); + } + } + st->state_flags &= ~PFSTATE_ACK; + PF_STATE_UNLOCK(st); + + return (0); + +cleanup: + error = ENOMEM; + if (skw == sks) + sks = NULL; + if (skw != NULL) + uma_zfree(V_pf_state_key_z, skw); + if (sks != NULL) + uma_zfree(V_pf_state_key_z, sks); + +cleanup_state: /* pf_state_insert() frees the state keys. */ + if (st) { + if (st->dst.scrub) + uma_zfree(V_pf_state_scrub_z, st->dst.scrub); + if (st->src.scrub) + uma_zfree(V_pf_state_scrub_z, st->src.scrub); + uma_zfree(V_pf_state_z, st); + } + return (error); +} + +static int +pfsync_input(struct mbuf **mp, int *offp __unused, int proto __unused) +{ + struct pfsync_softc *sc = V_pfsyncif; + struct pfsync_pkt pkt; + struct mbuf *m = *mp; + struct ip *ip = mtod(m, struct ip *); + struct pfsync_header *ph; + struct pfsync_subheader subh; + + int offset, len; + int rv; + uint16_t count; + + *mp = NULL; + V_pfsyncstats.pfsyncs_ipackets++; + + /* Verify that we have a sync interface configured. */ + if (!sc || !sc->sc_sync_if || !V_pf_status.running || + (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + goto done; + + /* verify that the packet came in on the right interface */ + if (sc->sc_sync_if != m->m_pkthdr.rcvif) { + V_pfsyncstats.pfsyncs_badif++; + goto done; + } + + if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1); + if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); + /* verify that the IP TTL is 255. 
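+	 * pfsync always transmits with a TTL of PFSYNC_DFLTTL (255), so a
+	 * packet arriving with a smaller TTL passed through a router and
+	 * cannot originate from a directly attached peer.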
+	 */
+	if (ip->ip_ttl != PFSYNC_DFLTTL) {
+		V_pfsyncstats.pfsyncs_badttl++;
+		goto done;
+	}
+
+	offset = ip->ip_hl << 2;
+	if (m->m_pkthdr.len < offset + sizeof(*ph)) {
+		V_pfsyncstats.pfsyncs_hdrops++;
+		goto done;
+	}
+
+	if (offset + sizeof(*ph) > m->m_len) {
+		if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
+			V_pfsyncstats.pfsyncs_hdrops++;
+			return (IPPROTO_DONE);
+		}
+		ip = mtod(m, struct ip *);
+	}
+	ph = (struct pfsync_header *)((char *)ip + offset);
+
+	/* verify the version */
+	if (ph->version != PFSYNC_VERSION) {
+		V_pfsyncstats.pfsyncs_badver++;
+		goto done;
+	}
+
+	len = ntohs(ph->len) + offset;
+	if (m->m_pkthdr.len < len) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		goto done;
+	}
+
+	/* Cheaper to grab this now than having to mess with mbufs later */
+	pkt.ip = ip;
+	pkt.src = ip->ip_src;
+	pkt.flags = 0;
+
+	/*
+	 * Trusting pf_chksum during packet processing, as well as looking
+	 * up entries in the interface name tree, requires holding
+	 * PF_RULES_RLOCK().
+	 */
+	PF_RULES_RLOCK();
+	if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
+		pkt.flags |= PFSYNC_SI_CKSUM;
+
+	offset += sizeof(*ph);
+	while (offset <= len - sizeof(subh)) {
+		m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
+		offset += sizeof(subh);
+
+		if (subh.action >= PFSYNC_ACT_MAX) {
+			V_pfsyncstats.pfsyncs_badact++;
+			PF_RULES_RUNLOCK();
+			goto done;
+		}
+
+		count = ntohs(subh.count);
+		V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
+		rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
+		if (rv == -1) {
+			PF_RULES_RUNLOCK();
+			return (IPPROTO_DONE);
+		}
+
+		offset += rv;
+	}
+	PF_RULES_RUNLOCK();
+
+done:
+	m_freem(m);
+	return (IPPROTO_DONE);
+}
+
+static int
+pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct pfsync_clr *clr;
+	struct mbuf *mp;
+	int len = sizeof(*clr) * count;
+	int i, offp;
+	u_int32_t creatorid;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	clr = (struct pfsync_clr *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		creatorid = clr[i].creatorid;
+
+		if (clr[i].ifname[0] != '\0' &&
+		    pfi_kif_find(clr[i].ifname) == NULL)
+			continue;
+
+		for (int i = 0; i <= pf_hashmask; i++) {
+			struct pf_idhash *ih = &V_pf_idhash[i];
+			struct pf_state *s;
+relock:
+			PF_HASHROW_LOCK(ih);
+			LIST_FOREACH(s, &ih->states, entry) {
+				if (s->creatorid == creatorid) {
+					s->state_flags |= PFSTATE_NOSYNC;
+					pf_unlink_state(s, PF_ENTER_LOCKED);
+					goto relock;
+				}
+			}
+			PF_HASHROW_UNLOCK(ih);
+		}
+	}
+
+	return (len);
+}
+
+static int
+pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct mbuf *mp;
+	struct pfsync_state *sa, *sp;
+	int len = sizeof(*sp) * count;
+	int i, offp;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	sa = (struct pfsync_state *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		sp = &sa[i];
+
+		/* Check for invalid values. */
+		if (sp->timeout >= PFTM_MAX ||
+		    sp->src.state > PF_TCPS_PROXY_DST ||
+		    sp->dst.state > PF_TCPS_PROXY_DST ||
+		    sp->direction > PF_OUT ||
+		    (sp->af != AF_INET && sp->af != AF_INET6)) {
+			if (V_pf_status.debug >= PF_DEBUG_MISC)
+				printf("%s: invalid value\n", __func__);
+			V_pfsyncstats.pfsyncs_badval++;
+			continue;
+		}
+
+		if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
+			/* Drop out, but process the rest of the actions.
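+			 * Only ENOMEM aborts this loop; any other import
+			 * error affects a single state, so the remaining
+			 * entries are still examined.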
+			 */
+			break;
+	}
+
+	return (len);
+}
+
+static int
+pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct pfsync_ins_ack *ia, *iaa;
+	struct pf_state *st;
+
+	struct mbuf *mp;
+	int len = count * sizeof(*ia);
+	int offp, i;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		ia = &iaa[i];
+
+		st = pf_find_state_byid(ia->id, ia->creatorid);
+		if (st == NULL)
+			continue;
+
+		if (st->state_flags & PFSTATE_ACK) {
+			PFSYNC_LOCK(V_pfsyncif);
+			pfsync_undefer_state(st, 0);
+			PFSYNC_UNLOCK(V_pfsyncif);
+		}
+		PF_STATE_UNLOCK(st);
+	}
+	/*
+	 * XXX this is not yet implemented, but we know the size of the
+	 * message so we can skip it.
+	 */
+
+	return (count * sizeof(struct pfsync_ins_ack));
+}
+
+static int
+pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
+    struct pfsync_state_peer *dst)
+{
+	int sync = 0;
+
+	PF_STATE_LOCK_ASSERT(st);
+
+	/*
+	 * The state should never go backwards except
+	 * for syn-proxy states.  Neither should the
+	 * sequence window slide backwards.
+	 */
+	if ((st->src.state > src->state &&
+	    (st->src.state < PF_TCPS_PROXY_SRC ||
+	    src->state >= PF_TCPS_PROXY_SRC)) ||
+
+	    (st->src.state == src->state &&
+	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
+		sync++;
+	else
+		pf_state_peer_ntoh(src, &st->src);
+
+	if ((st->dst.state > dst->state) ||
+
+	    (st->dst.state >= TCPS_SYN_SENT &&
+	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
+		sync++;
+	else
+		pf_state_peer_ntoh(dst, &st->dst);
+
+	return (sync);
+}
+
+static int
+pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_state *sa, *sp;
+	struct pf_state *st;
+	int sync;
+
+	struct mbuf *mp;
+	int len = count * sizeof(*sp);
+	int offp, i;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	sa = (struct pfsync_state *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		sp = &sa[i];
+
+		/* check for invalid values */
+		if (sp->timeout >= PFTM_MAX ||
+		    sp->src.state > PF_TCPS_PROXY_DST ||
+		    sp->dst.state > PF_TCPS_PROXY_DST) {
+			if (V_pf_status.debug >= PF_DEBUG_MISC) {
+				printf("pfsync_input: PFSYNC_ACT_UPD: "
+				    "invalid value\n");
+			}
+			V_pfsyncstats.pfsyncs_badval++;
+			continue;
+		}
+
+		st = pf_find_state_byid(sp->id, sp->creatorid);
+		if (st == NULL) {
+			/* insert the update */
+			if (pfsync_state_import(sp, 0))
+				V_pfsyncstats.pfsyncs_badstate++;
+			continue;
+		}
+
+		if (st->state_flags & PFSTATE_ACK) {
+			PFSYNC_LOCK(sc);
+			pfsync_undefer_state(st, 1);
+			PFSYNC_UNLOCK(sc);
+		}
+
+		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
+			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
+		else {
+			sync = 0;
+
+			/*
+			 * Non-TCP protocol state machines always go
+			 * forward.
+			 */
+			if (st->src.state > sp->src.state)
+				sync++;
+			else
+				pf_state_peer_ntoh(&sp->src, &st->src);
+			if (st->dst.state > sp->dst.state)
+				sync++;
+			else
+				pf_state_peer_ntoh(&sp->dst, &st->dst);
+		}
+		if (sync < 2) {
+			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
+			pf_state_peer_ntoh(&sp->dst, &st->dst);
+			st->expire = time_uptime;
+			st->timeout = sp->timeout;
+		}
+		st->pfsync_time = time_uptime;
+
+		if (sync) {
+			V_pfsyncstats.pfsyncs_stale++;
+
+			pfsync_update_state(st);
+			PF_STATE_UNLOCK(st);
+			PFSYNC_LOCK(sc);
+			pfsync_push(sc);
+			PFSYNC_UNLOCK(sc);
+			continue;
+		}
+		PF_STATE_UNLOCK(st);
+	}
+
+	return (len);
+}
+
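+/*
+ * PFSYNC_ACT_UPD_C is the compressed form of an update: it carries only
+ * the state id, creator id, peer data and timeout instead of a full
+ * struct pfsync_state.  The handler below therefore mirrors
+ * pfsync_in_upd(), except that an unknown id makes us ask the peer for
+ * the full state via pfsync_request_update().
+ */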
+static int
+pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_upd_c *ua, *up;
+	struct pf_state *st;
+	int len = count * sizeof(*up);
+	int sync;
+	struct mbuf *mp;
+	int offp, i;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	ua = (struct pfsync_upd_c *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		up = &ua[i];
+
+		/* check for invalid values */
+		if (up->timeout >= PFTM_MAX ||
+		    up->src.state > PF_TCPS_PROXY_DST ||
+		    up->dst.state > PF_TCPS_PROXY_DST) {
+			if (V_pf_status.debug >= PF_DEBUG_MISC) {
+				printf("pfsync_input: "
+				    "PFSYNC_ACT_UPD_C: "
+				    "invalid value\n");
+			}
+			V_pfsyncstats.pfsyncs_badval++;
+			continue;
+		}
+
+		st = pf_find_state_byid(up->id, up->creatorid);
+		if (st == NULL) {
+			/* We don't have this state. Ask for it. */
+			PFSYNC_LOCK(sc);
+			pfsync_request_update(up->creatorid, up->id);
+			PFSYNC_UNLOCK(sc);
+			continue;
+		}
+
+		if (st->state_flags & PFSTATE_ACK) {
+			PFSYNC_LOCK(sc);
+			pfsync_undefer_state(st, 1);
+			PFSYNC_UNLOCK(sc);
+		}
+
+		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
+			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
+		else {
+			sync = 0;
+
+			/*
+			 * Non-TCP protocol state machines always go
+			 * forward.
+			 */
+			if (st->src.state > up->src.state)
+				sync++;
+			else
+				pf_state_peer_ntoh(&up->src, &st->src);
+			if (st->dst.state > up->dst.state)
+				sync++;
+			else
+				pf_state_peer_ntoh(&up->dst, &st->dst);
+		}
+		if (sync < 2) {
+			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
+			pf_state_peer_ntoh(&up->dst, &st->dst);
+			st->expire = time_uptime;
+			st->timeout = up->timeout;
+		}
+		st->pfsync_time = time_uptime;
+
+		if (sync) {
+			V_pfsyncstats.pfsyncs_stale++;
+
+			pfsync_update_state(st);
+			PF_STATE_UNLOCK(st);
+			PFSYNC_LOCK(sc);
+			pfsync_push(sc);
+			PFSYNC_UNLOCK(sc);
+			continue;
+		}
+		PF_STATE_UNLOCK(st);
+	}
+
+	return (len);
+}
+
+static int
+pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct pfsync_upd_req *ur, *ura;
+	struct mbuf *mp;
+	int len = count * sizeof(*ur);
+	int i, offp;
+
+	struct pf_state *st;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	ura = (struct pfsync_upd_req *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		ur = &ura[i];
+
+		if (ur->id == 0 && ur->creatorid == 0)
+			pfsync_bulk_start();
+		else {
+			st = pf_find_state_byid(ur->id, ur->creatorid);
+			if (st == NULL) {
+				V_pfsyncstats.pfsyncs_badstate++;
+				continue;
+			}
+			if (st->state_flags & PFSTATE_NOSYNC) {
+				PF_STATE_UNLOCK(st);
+				continue;
+			}
+
+			pfsync_update_state_req(st);
+			PF_STATE_UNLOCK(st);
+		}
+	}
+
+	return (len);
+}
+
+static int
+pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct mbuf *mp;
+	struct pfsync_state *sa, *sp;
+	struct pf_state *st;
+	int len = count * sizeof(*sp);
+	int offp, i;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	sa = (struct pfsync_state *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		sp = &sa[i];
+
+		st = pf_find_state_byid(sp->id, sp->creatorid);
+		if (st == NULL) {
+			V_pfsyncstats.pfsyncs_badstate++;
+			continue;
+		}
+		st->state_flags |= PFSTATE_NOSYNC;
+		pf_unlink_state(st, PF_ENTER_LOCKED);
+	}
+
+	return (len);
+}
+
+static int
+pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct mbuf *mp;
+	struct
pfsync_del_c *sa, *sp; + struct pf_state *st; + int len = count * sizeof(*sp); + int offp, i; + + mp = m_pulldown(m, offset, len, &offp); + if (mp == NULL) { + V_pfsyncstats.pfsyncs_badlen++; + return (-1); + } + sa = (struct pfsync_del_c *)(mp->m_data + offp); + + for (i = 0; i < count; i++) { + sp = &sa[i]; + + st = pf_find_state_byid(sp->id, sp->creatorid); + if (st == NULL) { + V_pfsyncstats.pfsyncs_badstate++; + continue; + } + + st->state_flags |= PFSTATE_NOSYNC; + pf_unlink_state(st, PF_ENTER_LOCKED); + } + + return (len); +} + +static int +pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) +{ + struct pfsync_softc *sc = V_pfsyncif; + struct pfsync_bus *bus; + struct mbuf *mp; + int len = count * sizeof(*bus); + int offp; + + PFSYNC_BLOCK(sc); + + /* If we're not waiting for a bulk update, who cares. */ + if (sc->sc_ureq_sent == 0) { + PFSYNC_BUNLOCK(sc); + return (len); + } + + mp = m_pulldown(m, offset, len, &offp); + if (mp == NULL) { + PFSYNC_BUNLOCK(sc); + V_pfsyncstats.pfsyncs_badlen++; + return (-1); + } + bus = (struct pfsync_bus *)(mp->m_data + offp); + + switch (bus->status) { + case PFSYNC_BUS_START: + callout_reset(&sc->sc_bulkfail_tmo, 4 * hz + + V_pf_limits[PF_LIMIT_STATES].limit / + ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) / + sizeof(struct pfsync_state)), + pfsync_bulk_fail, sc); + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: received bulk update start\n"); + break; + + case PFSYNC_BUS_END: + if (time_uptime - ntohl(bus->endtime) >= + sc->sc_ureq_sent) { + /* that's it, we're happy */ + sc->sc_ureq_sent = 0; + sc->sc_bulk_tries = 0; + callout_stop(&sc->sc_bulkfail_tmo); + if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) + (*carp_demote_adj_p)(-V_pfsync_carp_adj, + "pfsync bulk done"); + sc->sc_flags |= PFSYNCF_OK; + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: received valid " + "bulk update end\n"); + } else { + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: received invalid " + "bulk update end: bad timestamp\n"); + } + break; + } + PFSYNC_BUNLOCK(sc); + + return (len); +} + +static int +pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) +{ + int len = count * sizeof(struct pfsync_tdb); + +#if defined(IPSEC) + struct pfsync_tdb *tp; + struct mbuf *mp; + int offp; + int i; + int s; + + mp = m_pulldown(m, offset, len, &offp); + if (mp == NULL) { + V_pfsyncstats.pfsyncs_badlen++; + return (-1); + } + tp = (struct pfsync_tdb *)(mp->m_data + offp); + + for (i = 0; i < count; i++) + pfsync_update_net_tdb(&tp[i]); +#endif + + return (len); +} + +#if defined(IPSEC) +/* Update an in-kernel tdb. Silently fail if no tdb is found. */ +static void +pfsync_update_net_tdb(struct pfsync_tdb *pt) +{ + struct tdb *tdb; + int s; + + /* check for invalid values */ + if (ntohl(pt->spi) <= SPI_RESERVED_MAX || + (pt->dst.sa.sa_family != AF_INET && + pt->dst.sa.sa_family != AF_INET6)) + goto bad; + + tdb = gettdb(pt->spi, &pt->dst, pt->sproto); + if (tdb) { + pt->rpl = ntohl(pt->rpl); + pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes); + + /* Neither replay nor byte counter should ever decrease. 
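+		 * A decrease would mean the update is stale or replayed,
+		 * so it is rejected instead of applied.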
*/ + if (pt->rpl < tdb->tdb_rpl || + pt->cur_bytes < tdb->tdb_cur_bytes) { + goto bad; + } + + tdb->tdb_rpl = pt->rpl; + tdb->tdb_cur_bytes = pt->cur_bytes; + } + return; + +bad: + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: " + "invalid value\n"); + V_pfsyncstats.pfsyncs_badstate++; + return; +} +#endif + + +static int +pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) +{ + /* check if we are at the right place in the packet */ + if (offset != m->m_pkthdr.len) + V_pfsyncstats.pfsyncs_badlen++; + + /* we're done. free and let the caller return */ + m_freem(m); + return (-1); +} + +static int +pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) +{ + V_pfsyncstats.pfsyncs_badact++; + + m_freem(m); + return (-1); +} + +static int +pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, + struct route *rt) +{ + m_freem(m); + return (0); +} + +/* ARGSUSED */ +static int +pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct pfsync_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *)data; + struct pfsyncreq pfsyncr; + int error; + + switch (cmd) { + case SIOCSIFFLAGS: + PFSYNC_LOCK(sc); + if (ifp->if_flags & IFF_UP) { + ifp->if_drv_flags |= IFF_DRV_RUNNING; + PFSYNC_UNLOCK(sc); + pfsync_pointers_init(); + } else { + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + PFSYNC_UNLOCK(sc); + pfsync_pointers_uninit(); + } + break; + case SIOCSIFMTU: + if (!sc->sc_sync_if || + ifr->ifr_mtu <= PFSYNC_MINPKT || + ifr->ifr_mtu > sc->sc_sync_if->if_mtu) + return (EINVAL); + if (ifr->ifr_mtu < ifp->if_mtu) { + PFSYNC_LOCK(sc); + if (sc->sc_len > PFSYNC_MINPKT) + pfsync_sendout(1); + PFSYNC_UNLOCK(sc); + } + ifp->if_mtu = ifr->ifr_mtu; + break; + case SIOCGETPFSYNC: + bzero(&pfsyncr, sizeof(pfsyncr)); + PFSYNC_LOCK(sc); + if (sc->sc_sync_if) { + strlcpy(pfsyncr.pfsyncr_syncdev, + sc->sc_sync_if->if_xname, IFNAMSIZ); + } + pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer; + pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates; + pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER == + (sc->sc_flags & PFSYNCF_DEFER)); + PFSYNC_UNLOCK(sc); + return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr))); + + case SIOCSETPFSYNC: + { + struct ip_moptions *imo = &sc->sc_imo; + struct ifnet *sifp; + struct ip *ip; + void *mship = NULL; + + if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0) + return (error); + if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr)))) + return (error); + + if (pfsyncr.pfsyncr_maxupdates > 255) + return (EINVAL); + + if (pfsyncr.pfsyncr_syncdev[0] == 0) + sifp = NULL; + else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL) + return (EINVAL); + + if (sifp != NULL && ( + pfsyncr.pfsyncr_syncpeer.s_addr == 0 || + pfsyncr.pfsyncr_syncpeer.s_addr == + htonl(INADDR_PFSYNC_GROUP))) + mship = malloc((sizeof(struct in_multi *) * + IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO); + + PFSYNC_LOCK(sc); + if (pfsyncr.pfsyncr_syncpeer.s_addr == 0) + sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP); + else + sc->sc_sync_peer.s_addr = + pfsyncr.pfsyncr_syncpeer.s_addr; + + sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates; + if (pfsyncr.pfsyncr_defer) { + sc->sc_flags |= PFSYNCF_DEFER; + pfsync_defer_ptr = pfsync_defer; + } else { + sc->sc_flags &= ~PFSYNCF_DEFER; + pfsync_defer_ptr = NULL; + } + + if (sifp == NULL) { + if (sc->sc_sync_if) + if_rele(sc->sc_sync_if); + sc->sc_sync_if = NULL; + if (imo->imo_membership) + pfsync_multicast_cleanup(sc); + PFSYNC_UNLOCK(sc); + 
break; + } + + if (sc->sc_len > PFSYNC_MINPKT && + (sifp->if_mtu < sc->sc_ifp->if_mtu || + (sc->sc_sync_if != NULL && + sifp->if_mtu < sc->sc_sync_if->if_mtu) || + sifp->if_mtu < MCLBYTES - sizeof(struct ip))) + pfsync_sendout(1); + + if (imo->imo_membership) + pfsync_multicast_cleanup(sc); + + if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) { + error = pfsync_multicast_setup(sc, sifp, mship); + if (error) { + if_rele(sifp); + free(mship, M_PFSYNC); + return (error); + } + } + if (sc->sc_sync_if) + if_rele(sc->sc_sync_if); + sc->sc_sync_if = sifp; + + ip = &sc->sc_template; + bzero(ip, sizeof(*ip)); + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(sc->sc_template) >> 2; + ip->ip_tos = IPTOS_LOWDELAY; + /* len and id are set later. */ + ip->ip_off = htons(IP_DF); + ip->ip_ttl = PFSYNC_DFLTTL; + ip->ip_p = IPPROTO_PFSYNC; + ip->ip_src.s_addr = INADDR_ANY; + ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr; + + /* Request a full state table update. */ + if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) + (*carp_demote_adj_p)(V_pfsync_carp_adj, + "pfsync bulk start"); + sc->sc_flags &= ~PFSYNCF_OK; + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: requesting bulk update\n"); + pfsync_request_update(0, 0); + PFSYNC_UNLOCK(sc); + PFSYNC_BLOCK(sc); + sc->sc_ureq_sent = time_uptime; + callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail, + sc); + PFSYNC_BUNLOCK(sc); + + break; + } + default: + return (ENOTTY); + } + + return (0); +} + +static void +pfsync_out_state(struct pf_state *st, void *buf) +{ + struct pfsync_state *sp = buf; + + pfsync_state_export(sp, st); +} + +static void +pfsync_out_iack(struct pf_state *st, void *buf) +{ + struct pfsync_ins_ack *iack = buf; + + iack->id = st->id; + iack->creatorid = st->creatorid; +} + +static void +pfsync_out_upd_c(struct pf_state *st, void *buf) +{ + struct pfsync_upd_c *up = buf; + + bzero(up, sizeof(*up)); + up->id = st->id; + pf_state_peer_hton(&st->src, &up->src); + pf_state_peer_hton(&st->dst, &up->dst); + up->creatorid = st->creatorid; + up->timeout = st->timeout; +} + +static void +pfsync_out_del(struct pf_state *st, void *buf) +{ + struct pfsync_del_c *dp = buf; + + dp->id = st->id; + dp->creatorid = st->creatorid; + st->state_flags |= PFSTATE_NOSYNC; +} + +static void +pfsync_drop(struct pfsync_softc *sc) +{ + struct pf_state *st, *next; + struct pfsync_upd_req_item *ur; + int q; + + for (q = 0; q < PFSYNC_S_COUNT; q++) { + if (TAILQ_EMPTY(&sc->sc_qs[q])) + continue; + + TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) { + KASSERT(st->sync_state == q, + ("%s: st->sync_state == q", + __func__)); + st->sync_state = PFSYNC_S_NONE; + pf_release_state(st); + } + TAILQ_INIT(&sc->sc_qs[q]); + } + + while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { + TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); + free(ur, M_PFSYNC); + } + + sc->sc_plus = NULL; + sc->sc_len = PFSYNC_MINPKT; +} + +static void +pfsync_sendout(int schedswi) +{ + struct pfsync_softc *sc = V_pfsyncif; + struct ifnet *ifp = sc->sc_ifp; + struct mbuf *m; + struct ip *ip; + struct pfsync_header *ph; + struct pfsync_subheader *subh; + struct pf_state *st; + struct pfsync_upd_req_item *ur; + int offset; + int q, count = 0; + + KASSERT(sc != NULL, ("%s: null sc", __func__)); + KASSERT(sc->sc_len > PFSYNC_MINPKT, + ("%s: sc_len %zu", __func__, sc->sc_len)); + PFSYNC_LOCK_ASSERT(sc); + + if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) { + pfsync_drop(sc); + return; + } + + m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR); + if (m == 
NULL) { + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); + V_pfsyncstats.pfsyncs_onomem++; + return; + } + m->m_data += max_linkhdr; + m->m_len = m->m_pkthdr.len = sc->sc_len; + + /* build the ip header */ + ip = (struct ip *)m->m_data; + bcopy(&sc->sc_template, ip, sizeof(*ip)); + offset = sizeof(*ip); + + ip->ip_len = htons(m->m_pkthdr.len); + ip_fillid(ip); + + /* build the pfsync header */ + ph = (struct pfsync_header *)(m->m_data + offset); + bzero(ph, sizeof(*ph)); + offset += sizeof(*ph); + + ph->version = PFSYNC_VERSION; + ph->len = htons(sc->sc_len - sizeof(*ip)); + bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH); + + /* walk the queues */ + for (q = 0; q < PFSYNC_S_COUNT; q++) { + if (TAILQ_EMPTY(&sc->sc_qs[q])) + continue; + + subh = (struct pfsync_subheader *)(m->m_data + offset); + offset += sizeof(*subh); + + count = 0; + TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) { + KASSERT(st->sync_state == q, + ("%s: st->sync_state == q", + __func__)); + /* + * XXXGL: some of write methods do unlocked reads + * of state data :( + */ + pfsync_qs[q].write(st, m->m_data + offset); + offset += pfsync_qs[q].len; + st->sync_state = PFSYNC_S_NONE; + pf_release_state(st); + count++; + } + TAILQ_INIT(&sc->sc_qs[q]); + + bzero(subh, sizeof(*subh)); + subh->action = pfsync_qs[q].action; + subh->count = htons(count); + V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count; + } + + if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) { + subh = (struct pfsync_subheader *)(m->m_data + offset); + offset += sizeof(*subh); + + count = 0; + while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { + TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); + + bcopy(&ur->ur_msg, m->m_data + offset, + sizeof(ur->ur_msg)); + offset += sizeof(ur->ur_msg); + free(ur, M_PFSYNC); + count++; + } + + bzero(subh, sizeof(*subh)); + subh->action = PFSYNC_ACT_UPD_REQ; + subh->count = htons(count); + V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count; + } + + /* has someone built a custom region for us to add? 
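+	 * sc_plus is staged by pfsync_send_plus() and already contains a
+	 * preformatted subheader and payload (used for the clear and bulk
+	 * status messages), so it is copied in verbatim.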
*/ + if (sc->sc_plus != NULL) { + bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen); + offset += sc->sc_pluslen; + + sc->sc_plus = NULL; + } + + subh = (struct pfsync_subheader *)(m->m_data + offset); + offset += sizeof(*subh); + + bzero(subh, sizeof(*subh)); + subh->action = PFSYNC_ACT_EOF; + subh->count = htons(1); + V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++; + + /* we're done, let's put it on the wire */ + if (ifp->if_bpf) { + m->m_data += sizeof(*ip); + m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip); + BPF_MTAP(ifp, m); + m->m_data -= sizeof(*ip); + m->m_len = m->m_pkthdr.len = sc->sc_len; + } + + if (sc->sc_sync_if == NULL) { + sc->sc_len = PFSYNC_MINPKT; + m_freem(m); + return; + } + + if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1); + if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); + sc->sc_len = PFSYNC_MINPKT; + + if (!_IF_QFULL(&sc->sc_ifp->if_snd)) + _IF_ENQUEUE(&sc->sc_ifp->if_snd, m); + else { + m_freem(m); + if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1); + } + if (schedswi) + swi_sched(V_pfsync_swi_cookie, 0); +} + +static void +pfsync_insert_state(struct pf_state *st) +{ + struct pfsync_softc *sc = V_pfsyncif; + + if (st->state_flags & PFSTATE_NOSYNC) + return; + + if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) || + st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) { + st->state_flags |= PFSTATE_NOSYNC; + return; + } + + KASSERT(st->sync_state == PFSYNC_S_NONE, + ("%s: st->sync_state %u", __func__, st->sync_state)); + + PFSYNC_LOCK(sc); + if (sc->sc_len == PFSYNC_MINPKT) + callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif); + + pfsync_q_ins(st, PFSYNC_S_INS); + PFSYNC_UNLOCK(sc); + + st->sync_updates = 0; +} + +static int +pfsync_defer(struct pf_state *st, struct mbuf *m) +{ + struct pfsync_softc *sc = V_pfsyncif; + struct pfsync_deferral *pd; + + if (m->m_flags & (M_BCAST|M_MCAST)) + return (0); + + PFSYNC_LOCK(sc); + + if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) || + !(sc->sc_flags & PFSYNCF_DEFER)) { + PFSYNC_UNLOCK(sc); + return (0); + } + + if (sc->sc_deferred >= 128) + pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0); + + pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT); + if (pd == NULL) + return (0); + sc->sc_deferred++; + + m->m_flags |= M_SKIP_FIREWALL; + st->state_flags |= PFSTATE_ACK; + + pd->pd_sc = sc; + pd->pd_refs = 0; + pd->pd_st = st; + pf_ref_state(st); + pd->pd_m = m; + + TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry); + callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); + callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd); + + pfsync_push(sc); + + return (1); +} + +static void +pfsync_undefer(struct pfsync_deferral *pd, int drop) +{ + struct pfsync_softc *sc = pd->pd_sc; + struct mbuf *m = pd->pd_m; + struct pf_state *st = pd->pd_st; + + PFSYNC_LOCK_ASSERT(sc); + + TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); + sc->sc_deferred--; + pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */ + free(pd, M_PFSYNC); + pf_release_state(st); + + if (drop) + m_freem(m); + else { + _IF_ENQUEUE(&sc->sc_ifp->if_snd, m); + pfsync_push(sc); + } +} + +static void +pfsync_defer_tmo(void *arg) +{ + struct pfsync_deferral *pd = arg; + struct pfsync_softc *sc = pd->pd_sc; + struct mbuf *m = pd->pd_m; + struct pf_state *st = pd->pd_st; + + PFSYNC_LOCK_ASSERT(sc); + + CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); + + TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); + sc->sc_deferred--; + pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! 
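+	 * (the state's flags are written here without the state lock held)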
+	 */
+	if (pd->pd_refs == 0)
+		free(pd, M_PFSYNC);
+	PFSYNC_UNLOCK(sc);
+
+	ip_output(m, NULL, NULL, 0, NULL, NULL);
+
+	pf_release_state(st);
+
+	CURVNET_RESTORE();
+}
+
+static void
+pfsync_undefer_state(struct pf_state *st, int drop)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_deferral *pd;
+
+	PFSYNC_LOCK_ASSERT(sc);
+
+	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
+		if (pd->pd_st == st) {
+			if (callout_stop(&pd->pd_tmo) > 0)
+				pfsync_undefer(pd, drop);
+			return;
+		}
+	}
+
+	panic("%s: unable to find deferred state", __func__);
+}
+
+static void
+pfsync_update_state(struct pf_state *st)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	int sync = 0;
+
+	PF_STATE_LOCK_ASSERT(st);
+	PFSYNC_LOCK(sc);
+
+	if (st->state_flags & PFSTATE_ACK)
+		pfsync_undefer_state(st, 0);
+	if (st->state_flags & PFSTATE_NOSYNC) {
+		if (st->sync_state != PFSYNC_S_NONE)
+			pfsync_q_del(st);
+		PFSYNC_UNLOCK(sc);
+		return;
+	}
+
+	if (sc->sc_len == PFSYNC_MINPKT)
+		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+
+	switch (st->sync_state) {
+	case PFSYNC_S_UPD_C:
+	case PFSYNC_S_UPD:
+	case PFSYNC_S_INS:
+		/* we're already handling it */
+
+		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
+			st->sync_updates++;
+			if (st->sync_updates >= sc->sc_maxupdates)
+				sync = 1;
+		}
+		break;
+
+	case PFSYNC_S_IACK:
+		pfsync_q_del(st);
+	case PFSYNC_S_NONE:
+		pfsync_q_ins(st, PFSYNC_S_UPD_C);
+		st->sync_updates = 0;
+		break;
+
+	default:
+		panic("%s: unexpected sync state %d", __func__, st->sync_state);
+	}
+
+	if (sync || (time_uptime - st->pfsync_time) < 2)
+		pfsync_push(sc);
+
+	PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_request_update(u_int32_t creatorid, u_int64_t id)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_upd_req_item *item;
+	size_t nlen = sizeof(struct pfsync_upd_req);
+
+	PFSYNC_LOCK_ASSERT(sc);
+
+	/*
+	 * This code does some work to avoid generating multiple update
+	 * requests for the same state.  It searches the current subheader
+	 * queue, but does not look into the queue of already-packed
+	 * datagrams.
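+	 * A duplicate request that already went into a packed datagram
+	 * is merely redundant: the peer just sends the state twice.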
+ */ + TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry) + if (item->ur_msg.id == id && + item->ur_msg.creatorid == creatorid) + return; + + item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT); + if (item == NULL) + return; /* XXX stats */ + + item->ur_msg.id = id; + item->ur_msg.creatorid = creatorid; + + if (TAILQ_EMPTY(&sc->sc_upd_req_list)) + nlen += sizeof(struct pfsync_subheader); + + if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) { + pfsync_sendout(1); + + nlen = sizeof(struct pfsync_subheader) + + sizeof(struct pfsync_upd_req); + } + + TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry); + sc->sc_len += nlen; +} + +static void +pfsync_update_state_req(struct pf_state *st) +{ + struct pfsync_softc *sc = V_pfsyncif; + + PF_STATE_LOCK_ASSERT(st); + PFSYNC_LOCK(sc); + + if (st->state_flags & PFSTATE_NOSYNC) { + if (st->sync_state != PFSYNC_S_NONE) + pfsync_q_del(st); + PFSYNC_UNLOCK(sc); + return; + } + + switch (st->sync_state) { + case PFSYNC_S_UPD_C: + case PFSYNC_S_IACK: + pfsync_q_del(st); + case PFSYNC_S_NONE: + pfsync_q_ins(st, PFSYNC_S_UPD); + pfsync_push(sc); + break; + + case PFSYNC_S_INS: + case PFSYNC_S_UPD: + case PFSYNC_S_DEL: + /* we're already handling it */ + break; + + default: + panic("%s: unexpected sync state %d", __func__, st->sync_state); + } + + PFSYNC_UNLOCK(sc); +} + +static void +pfsync_delete_state(struct pf_state *st) +{ + struct pfsync_softc *sc = V_pfsyncif; + + PFSYNC_LOCK(sc); + if (st->state_flags & PFSTATE_ACK) + pfsync_undefer_state(st, 1); + if (st->state_flags & PFSTATE_NOSYNC) { + if (st->sync_state != PFSYNC_S_NONE) + pfsync_q_del(st); + PFSYNC_UNLOCK(sc); + return; + } + + if (sc->sc_len == PFSYNC_MINPKT) + callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif); + + switch (st->sync_state) { + case PFSYNC_S_INS: + /* We never got to tell the world so just forget about it. 
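+		 * The insert message was still queued locally, so no peer
+		 * has seen this state and no delete needs to be sent.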
+		 */
+		pfsync_q_del(st);
+		break;
+
+	case PFSYNC_S_UPD_C:
+	case PFSYNC_S_UPD:
+	case PFSYNC_S_IACK:
+		pfsync_q_del(st);
+		/* FALLTHROUGH to putting it on the del list */
+
+	case PFSYNC_S_NONE:
+		pfsync_q_ins(st, PFSYNC_S_DEL);
+		break;
+
+	default:
+		panic("%s: unexpected sync state %d", __func__, st->sync_state);
+	}
+	PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_clear_states(u_int32_t creatorid, const char *ifname)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct {
+		struct pfsync_subheader subh;
+		struct pfsync_clr clr;
+	} __packed r;
+
+	bzero(&r, sizeof(r));
+
+	r.subh.action = PFSYNC_ACT_CLR;
+	r.subh.count = htons(1);
+	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
+
+	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
+	r.clr.creatorid = creatorid;
+
+	PFSYNC_LOCK(sc);
+	pfsync_send_plus(&r, sizeof(r));
+	PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_q_ins(struct pf_state *st, int q)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	size_t nlen = pfsync_qs[q].len;
+
+	PFSYNC_LOCK_ASSERT(sc);
+
+	KASSERT(st->sync_state == PFSYNC_S_NONE,
+		("%s: st->sync_state %u", __func__, st->sync_state));
+	KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
+	    sc->sc_len));
+
+	if (TAILQ_EMPTY(&sc->sc_qs[q]))
+		nlen += sizeof(struct pfsync_subheader);
+
+	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
+		pfsync_sendout(1);
+
+		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
+	}
+
+	sc->sc_len += nlen;
+	TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
+	st->sync_state = q;
+	pf_ref_state(st);
+}
+
+static void
+pfsync_q_del(struct pf_state *st)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	int q = st->sync_state;
+
+	PFSYNC_LOCK_ASSERT(sc);
+	KASSERT(st->sync_state != PFSYNC_S_NONE,
+		("%s: st->sync_state != PFSYNC_S_NONE", __func__));
+
+	sc->sc_len -= pfsync_qs[q].len;
+	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
+	st->sync_state = PFSYNC_S_NONE;
+	pf_release_state(st);
+
+	if (TAILQ_EMPTY(&sc->sc_qs[q]))
+		sc->sc_len -= sizeof(struct pfsync_subheader);
+}
+
+static void
+pfsync_bulk_start(void)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+
+	if (V_pf_status.debug >= PF_DEBUG_MISC)
+		printf("pfsync: received bulk update request\n");
+
+	PFSYNC_BLOCK(sc);
+
+	sc->sc_ureq_received = time_uptime;
+	sc->sc_bulk_hashid = 0;
+	sc->sc_bulk_stateid = 0;
+	pfsync_bulk_status(PFSYNC_BUS_START);
+	callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
+	PFSYNC_BUNLOCK(sc);
+}
+
+static void
+pfsync_bulk_update(void *arg)
+{
+	struct pfsync_softc *sc = arg;
+	struct pf_state *s;
+	int i, sent = 0;
+
+	PFSYNC_BLOCK_ASSERT(sc);
+	CURVNET_SET(sc->sc_ifp->if_vnet);
+
+	/*
+	 * Start with the last state from the previous invocation.
+	 * It may have gone away, in which case we start from the
+	 * hash slot.
+	 */
+	s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);
+
+	if (s != NULL)
+		i = PF_IDHASH(s);
+	else
+		i = sc->sc_bulk_hashid;
+
+	for (; i <= pf_hashmask; i++) {
+		struct pf_idhash *ih = &V_pf_idhash[i];
+
+		if (s != NULL)
+			PF_HASHROW_ASSERT(ih);
+		else {
+			PF_HASHROW_LOCK(ih);
+			s = LIST_FIRST(&ih->states);
+		}
+
+		for (; s; s = LIST_NEXT(s, entry)) {
+
+			if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
+			    sizeof(struct pfsync_state)) {
+				/* We've filled a packet.
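+				 * Remember where the walk stopped (hash
+				 * bucket plus state and creator id) and
+				 * reschedule ourselves to resume there.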
*/ + sc->sc_bulk_hashid = i; + sc->sc_bulk_stateid = s->id; + sc->sc_bulk_creatorid = s->creatorid; + PF_HASHROW_UNLOCK(ih); + callout_reset(&sc->sc_bulk_tmo, 1, + pfsync_bulk_update, sc); + goto full; + } + + if (s->sync_state == PFSYNC_S_NONE && + s->timeout < PFTM_MAX && + s->pfsync_time <= sc->sc_ureq_received) { + pfsync_update_state_req(s); + sent++; + } + } + PF_HASHROW_UNLOCK(ih); + } + + /* We're done. */ + pfsync_bulk_status(PFSYNC_BUS_END); + +full: + CURVNET_RESTORE(); +} + +static void +pfsync_bulk_status(u_int8_t status) +{ + struct { + struct pfsync_subheader subh; + struct pfsync_bus bus; + } __packed r; + + struct pfsync_softc *sc = V_pfsyncif; + + bzero(&r, sizeof(r)); + + r.subh.action = PFSYNC_ACT_BUS; + r.subh.count = htons(1); + V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++; + + r.bus.creatorid = V_pf_status.hostid; + r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received); + r.bus.status = status; + + PFSYNC_LOCK(sc); + pfsync_send_plus(&r, sizeof(r)); + PFSYNC_UNLOCK(sc); +} + +static void +pfsync_bulk_fail(void *arg) +{ + struct pfsync_softc *sc = arg; + + CURVNET_SET(sc->sc_ifp->if_vnet); + + PFSYNC_BLOCK_ASSERT(sc); + + if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) { + /* Try again */ + callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, + pfsync_bulk_fail, V_pfsyncif); + PFSYNC_LOCK(sc); + pfsync_request_update(0, 0); + PFSYNC_UNLOCK(sc); + } else { + /* Pretend like the transfer was ok. */ + sc->sc_ureq_sent = 0; + sc->sc_bulk_tries = 0; + PFSYNC_LOCK(sc); + if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) + (*carp_demote_adj_p)(-V_pfsync_carp_adj, + "pfsync bulk fail"); + sc->sc_flags |= PFSYNCF_OK; + PFSYNC_UNLOCK(sc); + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: failed to receive bulk update\n"); + } + + CURVNET_RESTORE(); +} + +static void +pfsync_send_plus(void *plus, size_t pluslen) +{ + struct pfsync_softc *sc = V_pfsyncif; + + PFSYNC_LOCK_ASSERT(sc); + + if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu) + pfsync_sendout(1); + + sc->sc_plus = plus; + sc->sc_len += (sc->sc_pluslen = pluslen); + + pfsync_sendout(1); +} + +static void +pfsync_timeout(void *arg) +{ + struct pfsync_softc *sc = arg; + + CURVNET_SET(sc->sc_ifp->if_vnet); + PFSYNC_LOCK(sc); + pfsync_push(sc); + PFSYNC_UNLOCK(sc); + CURVNET_RESTORE(); +} + +static void +pfsync_push(struct pfsync_softc *sc) +{ + + PFSYNC_LOCK_ASSERT(sc); + + sc->sc_flags |= PFSYNCF_PUSH; + swi_sched(V_pfsync_swi_cookie, 0); +} + +static void +pfsyncintr(void *arg) +{ + struct pfsync_softc *sc = arg; + struct mbuf *m, *n; + + CURVNET_SET(sc->sc_ifp->if_vnet); + + PFSYNC_LOCK(sc); + if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) { + pfsync_sendout(0); + sc->sc_flags &= ~PFSYNCF_PUSH; + } + _IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m); + PFSYNC_UNLOCK(sc); + + for (; m != NULL; m = n) { + + n = m->m_nextpkt; + m->m_nextpkt = NULL; + + /* + * We distinguish between a deferral packet and our + * own pfsync packet based on M_SKIP_FIREWALL + * flag. This is XXX. 
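+		 * Deferred data packets were tagged M_SKIP_FIREWALL in
+		 * pfsync_defer() so pf(4) will not inspect them again;
+		 * anything else is one of our own pfsync packets and is
+		 * sent with the configured multicast options.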
+ */ + if (m->m_flags & M_SKIP_FIREWALL) + ip_output(m, NULL, NULL, 0, NULL, NULL); + else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, + NULL) == 0) + V_pfsyncstats.pfsyncs_opackets++; + else + V_pfsyncstats.pfsyncs_oerrors++; + } + CURVNET_RESTORE(); +} + +static int +pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship) +{ + struct ip_moptions *imo = &sc->sc_imo; + int error; + + if (!(ifp->if_flags & IFF_MULTICAST)) + return (EADDRNOTAVAIL); + + imo->imo_membership = (struct in_multi **)mship; + imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; + imo->imo_multicast_vif = -1; + + if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL, + &imo->imo_membership[0])) != 0) { + imo->imo_membership = NULL; + return (error); + } + imo->imo_num_memberships++; + imo->imo_multicast_ifp = ifp; + imo->imo_multicast_ttl = PFSYNC_DFLTTL; + imo->imo_multicast_loop = 0; + + return (0); +} + +static void +pfsync_multicast_cleanup(struct pfsync_softc *sc) +{ + struct ip_moptions *imo = &sc->sc_imo; + + in_leavegroup(imo->imo_membership[0], NULL); + free(imo->imo_membership, M_PFSYNC); + imo->imo_membership = NULL; + imo->imo_multicast_ifp = NULL; +} + +#ifdef INET +extern struct domain inetdomain; +static struct protosw in_pfsync_protosw = { + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_PFSYNC, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = pfsync_input, + .pr_output = rip_output, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs +}; +#endif + +static void +pfsync_pointers_init() +{ + + PF_RULES_WLOCK(); + pfsync_state_import_ptr = pfsync_state_import; + pfsync_insert_state_ptr = pfsync_insert_state; + pfsync_update_state_ptr = pfsync_update_state; + pfsync_delete_state_ptr = pfsync_delete_state; + pfsync_clear_states_ptr = pfsync_clear_states; + pfsync_defer_ptr = pfsync_defer; + PF_RULES_WUNLOCK(); +} + +static void +pfsync_pointers_uninit() +{ + + PF_RULES_WLOCK(); + pfsync_state_import_ptr = NULL; + pfsync_insert_state_ptr = NULL; + pfsync_update_state_ptr = NULL; + pfsync_delete_state_ptr = NULL; + pfsync_clear_states_ptr = NULL; + pfsync_defer_ptr = NULL; + PF_RULES_WUNLOCK(); +} + +static void +vnet_pfsync_init(const void *unused __unused) +{ + int error; + + V_pfsync_cloner = if_clone_simple(pfsyncname, + pfsync_clone_create, pfsync_clone_destroy, 1); + error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif, + SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie); + if (error) { + if_clone_detach(V_pfsync_cloner); + log(LOG_INFO, "swi_add() failed in %s\n", __func__); + } +} +VNET_SYSINIT(vnet_pfsync_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY, + vnet_pfsync_init, NULL); + +static void +vnet_pfsync_uninit(const void *unused __unused) +{ + + if_clone_detach(V_pfsync_cloner); + swi_remove(V_pfsync_swi_cookie); +} +/* + * Detach after pf is gone; otherwise we might touch pfsync memory + * from within pf after freeing pfsync. 
+ */ +VNET_SYSUNINIT(vnet_pfsync_uninit, SI_SUB_INIT_IF, SI_ORDER_SECOND, + vnet_pfsync_uninit, NULL); + +static int +pfsync_init() +{ +#ifdef INET + int error; + + error = pf_proto_register(PF_INET, &in_pfsync_protosw); + if (error) + return (error); + error = ipproto_register(IPPROTO_PFSYNC); + if (error) { + pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW); + return (error); + } +#endif + pfsync_pointers_init(); + + return (0); +} + +static void +pfsync_uninit() +{ + + pfsync_pointers_uninit(); + +#ifdef INET + ipproto_unregister(IPPROTO_PFSYNC); + pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW); +#endif +} + +static int +pfsync_modevent(module_t mod, int type, void *data) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + error = pfsync_init(); + break; + case MOD_QUIESCE: + /* + * Module should not be unloaded due to race conditions. + */ + error = EBUSY; + break; + case MOD_UNLOAD: + pfsync_uninit(); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +static moduledata_t pfsync_mod = { + pfsyncname, + pfsync_modevent, + 0 +}; + +#define PFSYNC_MODVER 1 + +/* Stay on FIREWALL as we depend on pf being initialized and on inetdomain. */ +DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY); +MODULE_VERSION(pfsync, PFSYNC_MODVER); +MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER); diff --git a/freebsd/sys/netpfil/pf/in4_cksum.c b/freebsd/sys/netpfil/pf/in4_cksum.c new file mode 100644 index 00000000..19cc8ac4 --- /dev/null +++ b/freebsd/sys/netpfil/pf/in4_cksum.c @@ -0,0 +1,122 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/* $FreeBSD$ */ +/* $OpenBSD: in4_cksum.c,v 1.7 2003/06/02 23:28:13 millert Exp $ */ +/* $KAME: in4_cksum.c,v 1.10 2001/11/30 10:06:15 itojun Exp $ */ +/* $NetBSD: in_cksum.c,v 1.13 1996/10/13 02:03:03 christos Exp $ */ + +/* + * Copyright (C) 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 1988, 1992, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 + */ + +#include <rtems/bsd/sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> + +#include <machine/in_cksum.h> + +#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) +#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; (void)ADDCARRY(sum);} + +int in4_cksum(struct mbuf *, u_int8_t, int, int); + +int +in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len) +{ + union { + struct ipovly ipov; + u_int16_t w[10]; + } u; + union { + u_int16_t s[2]; + u_int32_t l; + } l_util; + + u_int16_t *w; + int psum; + int sum = 0; + + if (nxt != 0) { + /* pseudo header */ + if (off < sizeof(struct ipovly)) + panic("in4_cksum: offset too short"); + if (m->m_len < sizeof(struct ip)) + panic("in4_cksum: bad mbuf chain"); + bzero(&u.ipov, sizeof(u.ipov)); + u.ipov.ih_len = htons(len); + u.ipov.ih_pr = nxt; + u.ipov.ih_src = mtod(m, struct ip *)->ip_src; + u.ipov.ih_dst = mtod(m, struct ip *)->ip_dst; + w = u.w; + /* assumes sizeof(ipov) == 20 */ + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4]; + sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9]; + } + + psum = in_cksum_skip(m, len + off, off); + psum = ~psum & 0xffff; + sum += psum; + REDUCE; + return (~sum & 0xffff); +} diff --git a/freebsd/sys/netpfil/pf/pf.c b/freebsd/sys/netpfil/pf/pf.c new file mode 100644 index 00000000..7ac181b5 --- /dev/null +++ b/freebsd/sys/netpfil/pf/pf.c @@ -0,0 +1,6657 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002 - 2008 Henning Brauer + * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org> + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. + * + * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> +#include <rtems/bsd/local/opt_bpf.h> +#include <rtems/bsd/local/opt_pf.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/bus.h> +#include <sys/endian.h> +#include <sys/hash.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/limits.h> +#include <sys/mbuf.h> +#include <sys/md5.h> +#include <sys/random.h> +#include <sys/refcount.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/taskqueue.h> +#include <sys/ucred.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_types.h> +#include <net/if_vlan_var.h> +#include <net/route.h> +#include <net/radix_mpath.h> +#include <net/vnet.h> + +#include <net/pfvar.h> +#include <net/if_pflog.h> +#include <net/if_pfsync.h> + +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/in_fib.h> +#include <netinet/ip.h> +#include <netinet/ip_fw.h> +#include <netinet/ip_icmp.h> +#include <netinet/icmp_var.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> + +#include <netpfil/ipfw/ip_fw_private.h> /* XXX: only for DIR_IN/DIR_OUT */ + +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet6/nd6.h> +#include <netinet6/ip6_var.h> +#include <netinet6/in6_pcb.h> +#include <netinet6/in6_fib.h> +#include <netinet6/scope6_var.h> +#endif /* INET6 */ + +#include <machine/in_cksum.h> +#include <security/mac/mac_framework.h> + +#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x + +/* + * Global variables + */ + +/* state tables */ +VNET_DEFINE(struct pf_altqqueue, pf_altqs[2]); +VNET_DEFINE(struct pf_palist, pf_pabuf); +VNET_DEFINE(struct 
pf_altqqueue *, pf_altqs_active); +VNET_DEFINE(struct pf_altqqueue *, pf_altqs_inactive); +VNET_DEFINE(struct pf_kstatus, pf_status); + +VNET_DEFINE(u_int32_t, ticket_altqs_active); +VNET_DEFINE(u_int32_t, ticket_altqs_inactive); +VNET_DEFINE(int, altqs_inactive_open); +VNET_DEFINE(u_int32_t, ticket_pabuf); + +VNET_DEFINE(MD5_CTX, pf_tcp_secret_ctx); +#define V_pf_tcp_secret_ctx VNET(pf_tcp_secret_ctx) +VNET_DEFINE(u_char, pf_tcp_secret[16]); +#define V_pf_tcp_secret VNET(pf_tcp_secret) +VNET_DEFINE(int, pf_tcp_secret_init); +#define V_pf_tcp_secret_init VNET(pf_tcp_secret_init) +VNET_DEFINE(int, pf_tcp_iss_off); +#define V_pf_tcp_iss_off VNET(pf_tcp_iss_off) + +/* + * Queue for pf_intr() sends. + */ +static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations"); +struct pf_send_entry { + STAILQ_ENTRY(pf_send_entry) pfse_next; + struct mbuf *pfse_m; + enum { + PFSE_IP, + PFSE_IP6, + PFSE_ICMP, + PFSE_ICMP6, + } pfse_type; + struct { + int type; + int code; + int mtu; + } icmpopts; +}; + +STAILQ_HEAD(pf_send_head, pf_send_entry); +static VNET_DEFINE(struct pf_send_head, pf_sendqueue); +#define V_pf_sendqueue VNET(pf_sendqueue) + +static struct mtx pf_sendqueue_mtx; +MTX_SYSINIT(pf_sendqueue_mtx, &pf_sendqueue_mtx, "pf send queue", MTX_DEF); +#define PF_SENDQ_LOCK() mtx_lock(&pf_sendqueue_mtx) +#define PF_SENDQ_UNLOCK() mtx_unlock(&pf_sendqueue_mtx) + +/* + * Queue for pf_overload_task() tasks. + */ +struct pf_overload_entry { + SLIST_ENTRY(pf_overload_entry) next; + struct pf_addr addr; + sa_family_t af; + uint8_t dir; + struct pf_rule *rule; +}; + +SLIST_HEAD(pf_overload_head, pf_overload_entry); +static VNET_DEFINE(struct pf_overload_head, pf_overloadqueue); +#define V_pf_overloadqueue VNET(pf_overloadqueue) +static VNET_DEFINE(struct task, pf_overloadtask); +#define V_pf_overloadtask VNET(pf_overloadtask) + +static struct mtx pf_overloadqueue_mtx; +MTX_SYSINIT(pf_overloadqueue_mtx, &pf_overloadqueue_mtx, + "pf overload/flush queue", MTX_DEF); +#define PF_OVERLOADQ_LOCK() mtx_lock(&pf_overloadqueue_mtx) +#define PF_OVERLOADQ_UNLOCK() mtx_unlock(&pf_overloadqueue_mtx) + +VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules); +struct mtx pf_unlnkdrules_mtx; +MTX_SYSINIT(pf_unlnkdrules_mtx, &pf_unlnkdrules_mtx, "pf unlinked rules", + MTX_DEF); + +static VNET_DEFINE(uma_zone_t, pf_sources_z); +#define V_pf_sources_z VNET(pf_sources_z) +uma_zone_t pf_mtag_z; +VNET_DEFINE(uma_zone_t, pf_state_z); +VNET_DEFINE(uma_zone_t, pf_state_key_z); + +VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]); +#define PFID_CPUBITS 8 +#define PFID_CPUSHIFT (sizeof(uint64_t) * NBBY - PFID_CPUBITS) +#define PFID_CPUMASK ((uint64_t)((1 << PFID_CPUBITS) - 1) << PFID_CPUSHIFT) +#define PFID_MAXID (~PFID_CPUMASK) +CTASSERT((1 << PFID_CPUBITS) >= MAXCPU); + +static void pf_src_tree_remove_state(struct pf_state *); +static void pf_init_threshold(struct pf_threshold *, u_int32_t, + u_int32_t); +static void pf_add_threshold(struct pf_threshold *); +static int pf_check_threshold(struct pf_threshold *); + +static void pf_change_ap(struct mbuf *, struct pf_addr *, u_int16_t *, + u_int16_t *, u_int16_t *, struct pf_addr *, + u_int16_t, u_int8_t, sa_family_t); +static int pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *, + struct tcphdr *, struct pf_state_peer *); +static void pf_change_icmp(struct pf_addr *, u_int16_t *, + struct pf_addr *, struct pf_addr *, u_int16_t, + u_int16_t *, u_int16_t *, u_int16_t *, + u_int16_t *, u_int8_t, sa_family_t); +static void pf_send_tcp(struct mbuf *, + const struct pf_rule *, 
sa_family_t, + const struct pf_addr *, const struct pf_addr *, + u_int16_t, u_int16_t, u_int32_t, u_int32_t, + u_int8_t, u_int16_t, u_int16_t, u_int8_t, int, + u_int16_t, struct ifnet *); +static void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, + sa_family_t, struct pf_rule *); +static void pf_detach_state(struct pf_state *); +static int pf_state_key_attach(struct pf_state_key *, + struct pf_state_key *, struct pf_state *); +static void pf_state_key_detach(struct pf_state *, int); +static int pf_state_key_ctor(void *, int, void *, int); +static u_int32_t pf_tcp_iss(struct pf_pdesc *); +static int pf_test_rule(struct pf_rule **, struct pf_state **, + int, struct pfi_kif *, struct mbuf *, int, + struct pf_pdesc *, struct pf_rule **, + struct pf_ruleset **, struct inpcb *); +static int pf_create_state(struct pf_rule *, struct pf_rule *, + struct pf_rule *, struct pf_pdesc *, + struct pf_src_node *, struct pf_state_key *, + struct pf_state_key *, struct mbuf *, int, + u_int16_t, u_int16_t, int *, struct pfi_kif *, + struct pf_state **, int, u_int16_t, u_int16_t, + int); +static int pf_test_fragment(struct pf_rule **, int, + struct pfi_kif *, struct mbuf *, void *, + struct pf_pdesc *, struct pf_rule **, + struct pf_ruleset **); +static int pf_tcp_track_full(struct pf_state_peer *, + struct pf_state_peer *, struct pf_state **, + struct pfi_kif *, struct mbuf *, int, + struct pf_pdesc *, u_short *, int *); +static int pf_tcp_track_sloppy(struct pf_state_peer *, + struct pf_state_peer *, struct pf_state **, + struct pf_pdesc *, u_short *); +static int pf_test_state_tcp(struct pf_state **, int, + struct pfi_kif *, struct mbuf *, int, + void *, struct pf_pdesc *, u_short *); +static int pf_test_state_udp(struct pf_state **, int, + struct pfi_kif *, struct mbuf *, int, + void *, struct pf_pdesc *); +static int pf_test_state_icmp(struct pf_state **, int, + struct pfi_kif *, struct mbuf *, int, + void *, struct pf_pdesc *, u_short *); +static int pf_test_state_other(struct pf_state **, int, + struct pfi_kif *, struct mbuf *, struct pf_pdesc *); +static u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t, + sa_family_t); +static u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t, + sa_family_t); +static u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t, + int, u_int16_t); +static int pf_check_proto_cksum(struct mbuf *, int, int, + u_int8_t, sa_family_t); +static void pf_print_state_parts(struct pf_state *, + struct pf_state_key *, struct pf_state_key *); +static int pf_addr_wrap_neq(struct pf_addr_wrap *, + struct pf_addr_wrap *); +static struct pf_state *pf_find_state(struct pfi_kif *, + struct pf_state_key_cmp *, u_int); +static int pf_src_connlimit(struct pf_state **); +static void pf_overload_task(void *v, int pending); +static int pf_insert_src_node(struct pf_src_node **, + struct pf_rule *, struct pf_addr *, sa_family_t); +static u_int pf_purge_expired_states(u_int, int); +static void pf_purge_unlinked_rules(void); +static int pf_mtag_uminit(void *, int, int); +static void pf_mtag_free(struct m_tag *); +#ifdef INET +static void pf_route(struct mbuf **, struct pf_rule *, int, + struct ifnet *, struct pf_state *, + struct pf_pdesc *); +#endif /* INET */ +#ifdef INET6 +static void pf_change_a6(struct pf_addr *, u_int16_t *, + struct pf_addr *, u_int8_t); +static void pf_route6(struct mbuf **, struct pf_rule *, int, + struct ifnet *, struct pf_state *, + struct pf_pdesc *); +#endif /* INET6 */ + +int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len); + +extern int pf_end_threads; + 
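+/*
+ * Memory limits for states, source nodes, fragments and table entries;
+ * these are the knobs behind pf.conf's "set limit".
+ */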
+VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]); + +#define PACKET_LOOPED(pd) ((pd)->pf_mtag && \ + (pd)->pf_mtag->flags & PF_PACKET_LOOPED) + +#define STATE_LOOKUP(i, k, d, s, pd) \ + do { \ + (s) = pf_find_state((i), (k), (d)); \ + if ((s) == NULL) \ + return (PF_DROP); \ + if (PACKET_LOOPED(pd)) \ + return (PF_PASS); \ + if ((d) == PF_OUT && \ + (((s)->rule.ptr->rt == PF_ROUTETO && \ + (s)->rule.ptr->direction == PF_OUT) || \ + ((s)->rule.ptr->rt == PF_REPLYTO && \ + (s)->rule.ptr->direction == PF_IN)) && \ + (s)->rt_kif != NULL && \ + (s)->rt_kif != (i)) \ + return (PF_PASS); \ + } while (0) + +#define BOUND_IFACE(r, k) \ + ((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all + +#define STATE_INC_COUNTERS(s) \ + do { \ + counter_u64_add(s->rule.ptr->states_cur, 1); \ + counter_u64_add(s->rule.ptr->states_tot, 1); \ + if (s->anchor.ptr != NULL) { \ + counter_u64_add(s->anchor.ptr->states_cur, 1); \ + counter_u64_add(s->anchor.ptr->states_tot, 1); \ + } \ + if (s->nat_rule.ptr != NULL) { \ + counter_u64_add(s->nat_rule.ptr->states_cur, 1);\ + counter_u64_add(s->nat_rule.ptr->states_tot, 1);\ + } \ + } while (0) + +#define STATE_DEC_COUNTERS(s) \ + do { \ + if (s->nat_rule.ptr != NULL) \ + counter_u64_add(s->nat_rule.ptr->states_cur, -1);\ + if (s->anchor.ptr != NULL) \ + counter_u64_add(s->anchor.ptr->states_cur, -1); \ + counter_u64_add(s->rule.ptr->states_cur, -1); \ + } while (0) + +static MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures"); +VNET_DEFINE(struct pf_keyhash *, pf_keyhash); +VNET_DEFINE(struct pf_idhash *, pf_idhash); +VNET_DEFINE(struct pf_srchash *, pf_srchash); + +SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)"); + +u_long pf_hashmask; +u_long pf_srchashmask; +static u_long pf_hashsize; +static u_long pf_srchashsize; + +SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN, + &pf_hashsize, 0, "Size of pf(4) states hashtable"); +SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN, + &pf_srchashsize, 0, "Size of pf(4) source nodes hashtable"); + +VNET_DEFINE(void *, pf_swi_cookie); + +VNET_DEFINE(uint32_t, pf_hashseed); +#define V_pf_hashseed VNET(pf_hashseed) + +int +pf_addr_cmp(struct pf_addr *a, struct pf_addr *b, sa_family_t af) +{ + + switch (af) { +#ifdef INET + case AF_INET: + if (a->addr32[0] > b->addr32[0]) + return (1); + if (a->addr32[0] < b->addr32[0]) + return (-1); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (a->addr32[3] > b->addr32[3]) + return (1); + if (a->addr32[3] < b->addr32[3]) + return (-1); + if (a->addr32[2] > b->addr32[2]) + return (1); + if (a->addr32[2] < b->addr32[2]) + return (-1); + if (a->addr32[1] > b->addr32[1]) + return (1); + if (a->addr32[1] < b->addr32[1]) + return (-1); + if (a->addr32[0] > b->addr32[0]) + return (1); + if (a->addr32[0] < b->addr32[0]) + return (-1); + break; +#endif /* INET6 */ + default: + panic("%s: unknown address family %u", __func__, af); + } + return (0); +} + +static __inline uint32_t +pf_hashkey(struct pf_state_key *sk) +{ + uint32_t h; + + h = murmur3_32_hash32((uint32_t *)sk, + sizeof(struct pf_state_key_cmp)/sizeof(uint32_t), + V_pf_hashseed); + + return (h & pf_hashmask); +} + +static __inline uint32_t +pf_hashsrc(struct pf_addr *addr, sa_family_t af) +{ + uint32_t h; + + switch (af) { + case AF_INET: + h = murmur3_32_hash32((uint32_t *)&addr->v4, + sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed); + break; + case AF_INET6: + h = murmur3_32_hash32((uint32_t *)&addr->v6, + sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed); + 
break; + default: + panic("%s: unknown address family %u", __func__, af); + } + + return (h & pf_srchashmask); +} + +#ifdef ALTQ +static int +pf_state_hash(struct pf_state *s) +{ + u_int32_t hv = (intptr_t)s / sizeof(*s); + + hv ^= crc32(&s->src, sizeof(s->src)); + hv ^= crc32(&s->dst, sizeof(s->dst)); + if (hv == 0) + hv = 1; + return (hv); +} +#endif + +#ifdef INET6 +void +pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + dst->addr32[0] = src->addr32[0]; + break; +#endif /* INET */ + case AF_INET6: + dst->addr32[0] = src->addr32[0]; + dst->addr32[1] = src->addr32[1]; + dst->addr32[2] = src->addr32[2]; + dst->addr32[3] = src->addr32[3]; + break; + } +} +#endif /* INET6 */ + +static void +pf_init_threshold(struct pf_threshold *threshold, + u_int32_t limit, u_int32_t seconds) +{ + threshold->limit = limit * PF_THRESHOLD_MULT; + threshold->seconds = seconds; + threshold->count = 0; + threshold->last = time_uptime; +} + +static void +pf_add_threshold(struct pf_threshold *threshold) +{ + u_int32_t t = time_uptime, diff = t - threshold->last; + + if (diff >= threshold->seconds) + threshold->count = 0; + else + threshold->count -= threshold->count * diff / + threshold->seconds; + threshold->count += PF_THRESHOLD_MULT; + threshold->last = t; +} + +static int +pf_check_threshold(struct pf_threshold *threshold) +{ + return (threshold->count > threshold->limit); +} + +static int +pf_src_connlimit(struct pf_state **state) +{ + struct pf_overload_entry *pfoe; + int bad = 0; + + PF_STATE_LOCK_ASSERT(*state); + + (*state)->src_node->conn++; + (*state)->src.tcp_est = 1; + pf_add_threshold(&(*state)->src_node->conn_rate); + + if ((*state)->rule.ptr->max_src_conn && + (*state)->rule.ptr->max_src_conn < + (*state)->src_node->conn) { + counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1); + bad++; + } + + if ((*state)->rule.ptr->max_src_conn_rate.limit && + pf_check_threshold(&(*state)->src_node->conn_rate)) { + counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1); + bad++; + } + + if (!bad) + return (0); + + /* Kill this state. */ + (*state)->timeout = PFTM_PURGE; + (*state)->src.state = (*state)->dst.state = TCPS_CLOSED; + + if ((*state)->rule.ptr->overload_tbl == NULL) + return (1); + + /* Schedule overloading and flushing task. 
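+ * The entry is queued under pf_overloadqueue_mtx and handed to + * taskqueue_swi; pf_overload_task() below then inserts the offending + * address into the rule's overload table and, if the rule has the + * flush flag set, kills the matching states.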
*/ + pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT); + if (pfoe == NULL) + return (1); /* too bad :( */ + + bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr)); + pfoe->af = (*state)->key[PF_SK_WIRE]->af; + pfoe->rule = (*state)->rule.ptr; + pfoe->dir = (*state)->direction; + PF_OVERLOADQ_LOCK(); + SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next); + PF_OVERLOADQ_UNLOCK(); + taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask); + + return (1); +} + +static void +pf_overload_task(void *v, int pending) +{ + struct pf_overload_head queue; + struct pfr_addr p; + struct pf_overload_entry *pfoe, *pfoe1; + uint32_t killed = 0; + + CURVNET_SET((struct vnet *)v); + + PF_OVERLOADQ_LOCK(); + queue = V_pf_overloadqueue; + SLIST_INIT(&V_pf_overloadqueue); + PF_OVERLOADQ_UNLOCK(); + + bzero(&p, sizeof(p)); + SLIST_FOREACH(pfoe, &queue, next) { + counter_u64_add(V_pf_status.lcounters[LCNT_OVERLOAD_TABLE], 1); + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("%s: blocking address ", __func__); + pf_print_host(&pfoe->addr, 0, pfoe->af); + printf("\n"); + } + + p.pfra_af = pfoe->af; + switch (pfoe->af) { +#ifdef INET + case AF_INET: + p.pfra_net = 32; + p.pfra_ip4addr = pfoe->addr.v4; + break; +#endif +#ifdef INET6 + case AF_INET6: + p.pfra_net = 128; + p.pfra_ip6addr = pfoe->addr.v6; + break; +#endif + } + + PF_RULES_WLOCK(); + pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second); + PF_RULES_WUNLOCK(); + } + + /* + * Remove those entries that don't need flushing. + */ + SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1) + if (pfoe->rule->flush == 0) { + SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next); + free(pfoe, M_PFTEMP); + } else + counter_u64_add( + V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH], 1); + + /* If nothing to flush, return. */ + if (SLIST_EMPTY(&queue)) { + CURVNET_RESTORE(); + return; + } + + for (int i = 0; i <= pf_hashmask; i++) { + struct pf_idhash *ih = &V_pf_idhash[i]; + struct pf_state_key *sk; + struct pf_state *s; + + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) { + sk = s->key[PF_SK_WIRE]; + SLIST_FOREACH(pfoe, &queue, next) + if (sk->af == pfoe->af && + ((pfoe->rule->flush & PF_FLUSH_GLOBAL) || + pfoe->rule == s->rule.ptr) && + ((pfoe->dir == PF_OUT && + PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) || + (pfoe->dir == PF_IN && + PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) { + s->timeout = PFTM_PURGE; + s->src.state = s->dst.state = TCPS_CLOSED; + killed++; + } + } + PF_HASHROW_UNLOCK(ih); + } + SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1) + free(pfoe, M_PFTEMP); + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("%s: %u states killed\n", __func__, killed); + + CURVNET_RESTORE(); +} + +/* + * Can return locked on failure, so that we can consistently + * allocate and insert a new one. 
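+ * "Locked" refers to the source node hash row: when returnlocked is + * set and the lookup misses, the row is left locked, so that + * pf_insert_src_node() can allocate and link the new node without an + * intervening race.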
+ */ +struct pf_src_node * +pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af, + int returnlocked) +{ + struct pf_srchash *sh; + struct pf_src_node *n; + + counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1); + + sh = &V_pf_srchash[pf_hashsrc(src, af)]; + PF_HASHROW_LOCK(sh); + LIST_FOREACH(n, &sh->nodes, entry) + if (n->rule.ptr == rule && n->af == af && + ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) || + (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0))) + break; + if (n != NULL) { + n->states++; + PF_HASHROW_UNLOCK(sh); + } else if (returnlocked == 0) + PF_HASHROW_UNLOCK(sh); + + return (n); +} + +static int +pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule, + struct pf_addr *src, sa_family_t af) +{ + + KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK || + rule->rpool.opts & PF_POOL_STICKYADDR), + ("%s for non-tracking rule %p", __func__, rule)); + + if (*sn == NULL) + *sn = pf_find_src_node(src, rule, af, 1); + + if (*sn == NULL) { + struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)]; + + PF_HASHROW_ASSERT(sh); + + if (!rule->max_src_nodes || + counter_u64_fetch(rule->src_nodes) < rule->max_src_nodes) + (*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO); + else + counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES], + 1); + if ((*sn) == NULL) { + PF_HASHROW_UNLOCK(sh); + return (-1); + } + + pf_init_threshold(&(*sn)->conn_rate, + rule->max_src_conn_rate.limit, + rule->max_src_conn_rate.seconds); + + (*sn)->af = af; + (*sn)->rule.ptr = rule; + PF_ACPY(&(*sn)->addr, src, af); + LIST_INSERT_HEAD(&sh->nodes, *sn, entry); + (*sn)->creation = time_uptime; + (*sn)->ruletype = rule->action; + (*sn)->states = 1; + if ((*sn)->rule.ptr != NULL) + counter_u64_add((*sn)->rule.ptr->src_nodes, 1); + PF_HASHROW_UNLOCK(sh); + counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1); + } else { + if (rule->max_src_states && + (*sn)->states >= rule->max_src_states) { + counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES], + 1); + return (-1); + } + } + return (0); +} + +void +pf_unlink_src_node(struct pf_src_node *src) +{ + + PF_HASHROW_ASSERT(&V_pf_srchash[pf_hashsrc(&src->addr, src->af)]); + LIST_REMOVE(src, entry); + if (src->rule.ptr) + counter_u64_add(src->rule.ptr->src_nodes, -1); +} + +u_int +pf_free_src_nodes(struct pf_src_node_list *head) +{ + struct pf_src_node *sn, *tmp; + u_int count = 0; + + LIST_FOREACH_SAFE(sn, head, entry, tmp) { + uma_zfree(V_pf_sources_z, sn); + count++; + } + + counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], count); + + return (count); +} + +void +pf_mtag_initialize() +{ + + pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) + + sizeof(struct pf_mtag), NULL, NULL, pf_mtag_uminit, NULL, + UMA_ALIGN_PTR, 0); +} + +/* Per-vnet data storage structures initialization. */ +void +pf_initialize() +{ + struct pf_keyhash *kh; + struct pf_idhash *ih; + struct pf_srchash *sh; + u_int i; + + if (pf_hashsize == 0 || !powerof2(pf_hashsize)) + pf_hashsize = PF_HASHSIZ; + if (pf_srchashsize == 0 || !powerof2(pf_srchashsize)) + pf_srchashsize = PF_HASHSIZ / 4; + + V_pf_hashseed = arc4random(); + + /* States and state keys storage. 
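+ * Both table sizes were forced to a power of two above, so the masks + * derived here (pf_hashmask, pf_srchashmask) reduce a 32-bit hash to + * a slot index with a single AND.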
*/ + V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z; + uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT); + uma_zone_set_warning(V_pf_state_z, "PF states limit reached"); + + V_pf_state_key_z = uma_zcreate("pf state keys", + sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + V_pf_keyhash = malloc(pf_hashsize * sizeof(struct pf_keyhash), + M_PFHASH, M_WAITOK | M_ZERO); + V_pf_idhash = malloc(pf_hashsize * sizeof(struct pf_idhash), + M_PFHASH, M_WAITOK | M_ZERO); + pf_hashmask = pf_hashsize - 1; + for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask; + i++, kh++, ih++) { + mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK); + mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF); + } + + /* Source nodes. */ + V_pf_sources_z = uma_zcreate("pf source nodes", + sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, + 0); + V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z; + uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT); + uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached"); + V_pf_srchash = malloc(pf_srchashsize * sizeof(struct pf_srchash), + M_PFHASH, M_WAITOK|M_ZERO); + pf_srchashmask = pf_srchashsize - 1; + for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) + mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF); + + /* ALTQ */ + TAILQ_INIT(&V_pf_altqs[0]); + TAILQ_INIT(&V_pf_altqs[1]); + TAILQ_INIT(&V_pf_pabuf); + V_pf_altqs_active = &V_pf_altqs[0]; + V_pf_altqs_inactive = &V_pf_altqs[1]; + + /* Send & overload+flush queues. */ + STAILQ_INIT(&V_pf_sendqueue); + SLIST_INIT(&V_pf_overloadqueue); + TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet); + + /* Unlinked, but possibly still referenced, rules. 
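+ * Rules removed from the active ruleset are parked on this queue + * until pf_purge_unlinked_rules() can prove that no state or source + * node still references them.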
*/ + TAILQ_INIT(&V_pf_unlinked_rules); +} + +void +pf_mtag_cleanup() +{ + + uma_zdestroy(pf_mtag_z); +} + +void +pf_cleanup() +{ + struct pf_keyhash *kh; + struct pf_idhash *ih; + struct pf_srchash *sh; + struct pf_send_entry *pfse, *next; + u_int i; + + for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask; + i++, kh++, ih++) { + KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty", + __func__)); + KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty", + __func__)); + mtx_destroy(&kh->lock); + mtx_destroy(&ih->lock); + } + free(V_pf_keyhash, M_PFHASH); + free(V_pf_idhash, M_PFHASH); + + for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) { + KASSERT(LIST_EMPTY(&sh->nodes), + ("%s: source node hash not empty", __func__)); + mtx_destroy(&sh->lock); + } + free(V_pf_srchash, M_PFHASH); + + STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) { + m_freem(pfse->pfse_m); + free(pfse, M_PFTEMP); + } + + uma_zdestroy(V_pf_sources_z); + uma_zdestroy(V_pf_state_z); + uma_zdestroy(V_pf_state_key_z); +} + +static int +pf_mtag_uminit(void *mem, int size, int how) +{ + struct m_tag *t; + + t = (struct m_tag *)mem; + t->m_tag_cookie = MTAG_ABI_COMPAT; + t->m_tag_id = PACKET_TAG_PF; + t->m_tag_len = sizeof(struct pf_mtag); + t->m_tag_free = pf_mtag_free; + + return (0); +} + +static void +pf_mtag_free(struct m_tag *t) +{ + + uma_zfree(pf_mtag_z, t); +} + +struct pf_mtag * +pf_get_mtag(struct mbuf *m) +{ + struct m_tag *mtag; + + if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL) + return ((struct pf_mtag *)(mtag + 1)); + + mtag = uma_zalloc(pf_mtag_z, M_NOWAIT); + if (mtag == NULL) + return (NULL); + bzero(mtag + 1, sizeof(struct pf_mtag)); + m_tag_prepend(m, mtag); + + return ((struct pf_mtag *)(mtag + 1)); +} + +static int +pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks, + struct pf_state *s) +{ + struct pf_keyhash *khs, *khw, *kh; + struct pf_state_key *sk, *cur; + struct pf_state *si, *olds = NULL; + int idx; + + KASSERT(s->refs == 0, ("%s: state not pristine", __func__)); + KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__)); + KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__)); + + /* + * We need to lock hash slots of both keys. To avoid deadlock + * we always lock the slot with lower address first. Unlock order + * isn't important. + * + * We also need to lock ID hash slot before dropping key + * locks. On success we return with ID hash slot locked. + */ + + if (skw == sks) { + khs = khw = &V_pf_keyhash[pf_hashkey(skw)]; + PF_HASHROW_LOCK(khs); + } else { + khs = &V_pf_keyhash[pf_hashkey(sks)]; + khw = &V_pf_keyhash[pf_hashkey(skw)]; + if (khs == khw) { + PF_HASHROW_LOCK(khs); + } else if (khs < khw) { + PF_HASHROW_LOCK(khs); + PF_HASHROW_LOCK(khw); + } else { + PF_HASHROW_LOCK(khw); + PF_HASHROW_LOCK(khs); + } + } + +#define KEYS_UNLOCK() do { \ + if (khs != khw) { \ + PF_HASHROW_UNLOCK(khs); \ + PF_HASHROW_UNLOCK(khw); \ + } else \ + PF_HASHROW_UNLOCK(khs); \ +} while (0) + + /* + * First run: start with wire key. + */ + sk = skw; + kh = khw; + idx = PF_SK_WIRE; + +keyattach: + LIST_FOREACH(cur, &kh->keys, entry) + if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0) + break; + + if (cur != NULL) { + /* Key exists. Check for same kif, if none, add to key. 
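+ * A state on the same kif and direction is normally a collision and + * fails with EEXIST; the one exception, handled below, is an old + * state past FIN_WAIT_2, which is forced to TCPS_CLOSED and scheduled + * for immediate purge so that the new state can take its place.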
*/ + TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) { + struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)]; + + PF_HASHROW_LOCK(ih); + if (si->kif == s->kif && + si->direction == s->direction) { + if (sk->proto == IPPROTO_TCP && + si->src.state >= TCPS_FIN_WAIT_2 && + si->dst.state >= TCPS_FIN_WAIT_2) { + /* + * New state matches an old >FIN_WAIT_2 + * state. We can't drop key hash locks, + * thus we can't unlink it properly. + * + * As a workaround we drop it into + * TCPS_CLOSED state, schedule purge + * ASAP and push it into the very end + * of the slot TAILQ, so that it won't + * conflict with our new state. + */ + si->src.state = si->dst.state = + TCPS_CLOSED; + si->timeout = PFTM_PURGE; + olds = si; + } else { + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: %s key attach " + "failed on %s: ", + (idx == PF_SK_WIRE) ? + "wire" : "stack", + s->kif->pfik_name); + pf_print_state_parts(s, + (idx == PF_SK_WIRE) ? + sk : NULL, + (idx == PF_SK_STACK) ? + sk : NULL); + printf(", existing: "); + pf_print_state_parts(si, + (idx == PF_SK_WIRE) ? + sk : NULL, + (idx == PF_SK_STACK) ? + sk : NULL); + printf("\n"); + } + PF_HASHROW_UNLOCK(ih); + KEYS_UNLOCK(); + uma_zfree(V_pf_state_key_z, sk); + if (idx == PF_SK_STACK) + pf_detach_state(s); + return (EEXIST); /* collision! */ + } + } + PF_HASHROW_UNLOCK(ih); + } + uma_zfree(V_pf_state_key_z, sk); + s->key[idx] = cur; + } else { + LIST_INSERT_HEAD(&kh->keys, sk, entry); + s->key[idx] = sk; + } + +stateattach: + /* List is sorted, if-bound states before floating. */ + if (s->kif == V_pfi_all) + TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]); + else + TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]); + + if (olds) { + TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]); + TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds, + key_list[idx]); + olds = NULL; + } + + /* + * Attach done. Now decide whether (and how) we should + * attach a second key. + */ + if (sks == skw) { + s->key[PF_SK_STACK] = s->key[PF_SK_WIRE]; + idx = PF_SK_STACK; + sks = NULL; + goto stateattach; + } else if (sks != NULL) { + /* + * Continue attaching with stack key. + */ + sk = sks; + kh = khs; + idx = PF_SK_STACK; + sks = NULL; + goto keyattach; + } + + PF_STATE_LOCK(s); + KEYS_UNLOCK(); + + KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL, + ("%s failure", __func__)); + + return (0); +#undef KEYS_UNLOCK +} + +static void +pf_detach_state(struct pf_state *s) +{ + struct pf_state_key *sks = s->key[PF_SK_STACK]; + struct pf_keyhash *kh; + + if (sks != NULL) { + kh = &V_pf_keyhash[pf_hashkey(sks)]; + PF_HASHROW_LOCK(kh); + if (s->key[PF_SK_STACK] != NULL) + pf_state_key_detach(s, PF_SK_STACK); + /* + * If both point to same key, then we are done. 
+ */ + if (sks == s->key[PF_SK_WIRE]) { + pf_state_key_detach(s, PF_SK_WIRE); + PF_HASHROW_UNLOCK(kh); + return; + } + PF_HASHROW_UNLOCK(kh); + } + + if (s->key[PF_SK_WIRE] != NULL) { + kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])]; + PF_HASHROW_LOCK(kh); + if (s->key[PF_SK_WIRE] != NULL) + pf_state_key_detach(s, PF_SK_WIRE); + PF_HASHROW_UNLOCK(kh); + } +} + +static void +pf_state_key_detach(struct pf_state *s, int idx) +{ + struct pf_state_key *sk = s->key[idx]; +#ifdef INVARIANTS + struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)]; + + PF_HASHROW_ASSERT(kh); +#endif + TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]); + s->key[idx] = NULL; + + if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) { + LIST_REMOVE(sk, entry); + uma_zfree(V_pf_state_key_z, sk); + } +} + +static int +pf_state_key_ctor(void *mem, int size, void *arg, int flags) +{ + struct pf_state_key *sk = mem; + + bzero(sk, sizeof(struct pf_state_key_cmp)); + TAILQ_INIT(&sk->states[PF_SK_WIRE]); + TAILQ_INIT(&sk->states[PF_SK_STACK]); + + return (0); +} + +struct pf_state_key * +pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr, + struct pf_addr *daddr, u_int16_t sport, u_int16_t dport) +{ + struct pf_state_key *sk; + + sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT); + if (sk == NULL) + return (NULL); + + PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af); + PF_ACPY(&sk->addr[pd->didx], daddr, pd->af); + sk->port[pd->sidx] = sport; + sk->port[pd->didx] = dport; + sk->proto = pd->proto; + sk->af = pd->af; + + return (sk); +} + +struct pf_state_key * +pf_state_key_clone(struct pf_state_key *orig) +{ + struct pf_state_key *sk; + + sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT); + if (sk == NULL) + return (NULL); + + bcopy(orig, sk, sizeof(struct pf_state_key_cmp)); + + return (sk); +} + +int +pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw, + struct pf_state_key *sks, struct pf_state *s) +{ + struct pf_idhash *ih; + struct pf_state *cur; + int error; + + KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]), + ("%s: sks not pristine", __func__)); + KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]), + ("%s: skw not pristine", __func__)); + KASSERT(s->refs == 0, ("%s: state not pristine", __func__)); + + s->kif = kif; + + if (s->id == 0 && s->creatorid == 0) { + /* XXX: should be atomic, but probability of collision low */ + if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID) + V_pf_stateid[curcpu] = 1; + s->id |= (uint64_t )curcpu << PFID_CPUSHIFT; + s->id = htobe64(s->id); + s->creatorid = V_pf_status.hostid; + } + + /* Returns with ID locked on success. */ + if ((error = pf_state_key_attach(skw, sks, s)) != 0) + return (error); + + ih = &V_pf_idhash[PF_IDHASH(s)]; + PF_HASHROW_ASSERT(ih); + LIST_FOREACH(cur, &ih->states, entry) + if (cur->id == s->id && cur->creatorid == s->creatorid) + break; + + if (cur != NULL) { + PF_HASHROW_UNLOCK(ih); + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: state ID collision: " + "id: %016llx creatorid: %08x\n", + (unsigned long long)be64toh(s->id), + ntohl(s->creatorid)); + } + pf_detach_state(s); + return (EEXIST); + } + LIST_INSERT_HEAD(&ih->states, s, entry); + /* One for keys, one for ID hash. */ + refcount_init(&s->refs, 2); + + counter_u64_add(V_pf_status.fcounters[FCNT_STATE_INSERT], 1); + if (pfsync_insert_state_ptr != NULL) + pfsync_insert_state_ptr(s); + + /* Returns locked. */ + return (0); +} + +/* + * Find state by ID: returns with locked row on success. 
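+ * The row is selected by reducing the 64-bit ID modulo the table + * size; on a miss the row is unlocked again and NULL is returned.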
+ */ +struct pf_state * +pf_find_state_byid(uint64_t id, uint32_t creatorid) +{ + struct pf_idhash *ih; + struct pf_state *s; + + counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1); + + ih = &V_pf_idhash[(be64toh(id) % (pf_hashmask + 1))]; + + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) + if (s->id == id && s->creatorid == creatorid) + break; + + if (s == NULL) + PF_HASHROW_UNLOCK(ih); + + return (s); +} + +/* + * Find state by key. + * Returns with ID hash slot locked on success. + */ +static struct pf_state * +pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir) +{ + struct pf_keyhash *kh; + struct pf_state_key *sk; + struct pf_state *s; + int idx; + + counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1); + + kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)]; + + PF_HASHROW_LOCK(kh); + LIST_FOREACH(sk, &kh->keys, entry) + if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0) + break; + if (sk == NULL) { + PF_HASHROW_UNLOCK(kh); + return (NULL); + } + + idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK); + + /* List is sorted, if-bound states before floating ones. */ + TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) + if (s->kif == V_pfi_all || s->kif == kif) { + PF_STATE_LOCK(s); + PF_HASHROW_UNLOCK(kh); + if (s->timeout >= PFTM_MAX) { + /* + * State is either being processed by + * pf_unlink_state() in another thread, or + * is scheduled for immediate expiry. + */ + PF_STATE_UNLOCK(s); + return (NULL); + } + return (s); + } + PF_HASHROW_UNLOCK(kh); + + return (NULL); +} + +struct pf_state * +pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more) +{ + struct pf_keyhash *kh; + struct pf_state_key *sk; + struct pf_state *s, *ret = NULL; + int idx, inout = 0; + + counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1); + + kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)]; + + PF_HASHROW_LOCK(kh); + LIST_FOREACH(sk, &kh->keys, entry) + if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0) + break; + if (sk == NULL) { + PF_HASHROW_UNLOCK(kh); + return (NULL); + } + switch (dir) { + case PF_IN: + idx = PF_SK_WIRE; + break; + case PF_OUT: + idx = PF_SK_STACK; + break; + case PF_INOUT: + idx = PF_SK_WIRE; + inout = 1; + break; + default: + panic("%s: dir %u", __func__, dir); + } +second_run: + TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) { + if (more == NULL) { + PF_HASHROW_UNLOCK(kh); + return (s); + } + + if (ret) + (*more)++; + else + ret = s; + } + if (inout == 1) { + inout = 0; + idx = PF_SK_STACK; + goto second_run; + } + PF_HASHROW_UNLOCK(kh); + + return (ret); +} + +/* END state table stuff */ + +static void +pf_send(struct pf_send_entry *pfse) +{ + + PF_SENDQ_LOCK(); + STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next); + PF_SENDQ_UNLOCK(); + swi_sched(V_pf_swi_cookie, 0); +} + +void +pf_intr(void *v) +{ + struct pf_send_head queue; + struct pf_send_entry *pfse, *next; + + CURVNET_SET((struct vnet *)v); + + PF_SENDQ_LOCK(); + queue = V_pf_sendqueue; + STAILQ_INIT(&V_pf_sendqueue); + PF_SENDQ_UNLOCK(); + + STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) { + switch (pfse->pfse_type) { +#ifdef INET + case PFSE_IP: + ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL); + break; + case PFSE_ICMP: + icmp_error(pfse->pfse_m, pfse->icmpopts.type, + pfse->icmpopts.code, 0, pfse->icmpopts.mtu); + break; +#endif /* INET */ +#ifdef INET6 + case PFSE_IP6: + ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL, + NULL); + break; + case PFSE_ICMP6: + icmp6_error(pfse->pfse_m, 
pfse->icmpopts.type, + pfse->icmpopts.code, pfse->icmpopts.mtu); + break; +#endif /* INET6 */ + default: + panic("%s: unknown type", __func__); + } + free(pfse, M_PFTEMP); + } + CURVNET_RESTORE(); +} + +void +pf_purge_thread(void *unused __unused) +{ + VNET_ITERATOR_DECL(vnet_iter); + u_int idx = 0; + + for (;;) { + PF_RULES_RLOCK(); + rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftm", hz / 10); + PF_RULES_RUNLOCK(); + + VNET_LIST_RLOCK(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + + if (pf_end_threads) { + pf_end_threads++; + wakeup(pf_purge_thread); + kproc_exit(0); + } + + /* Process 1/interval fraction of the state table every run. */ + idx = pf_purge_expired_states(idx, pf_hashmask / + (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10)); + + /* Purge other expired types every PFTM_INTERVAL seconds. */ + if (idx == 0) { + /* + * Order is important: + * - states and src nodes reference rules + * - states and rules reference kifs + */ + pf_purge_expired_fragments(); + pf_purge_expired_src_nodes(); + pf_purge_unlinked_rules(); + pfi_kif_purge(); + } + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK(); + } + /* not reached */ +} + +void +pf_unload_vnet_purge(void) +{ + + /* + * To clean up all kifs and rules we need + * two runs: the first one clears the reference + * flags, so that pf_purge_expired_states() no + * longer raises them, and the second run then + * frees. + */ + pf_purge_unlinked_rules(); + pfi_kif_purge(); + + /* + * Now purge everything. + */ + pf_purge_expired_states(0, pf_hashmask); + pf_purge_expired_fragments(); + pf_purge_expired_src_nodes(); + + /* + * Now all kifs & rules should be unreferenced, + * thus should be successfully freed. + */ + pf_purge_unlinked_rules(); + pfi_kif_purge(); +} + + +u_int32_t +pf_state_expires(const struct pf_state *state) +{ + u_int32_t timeout; + u_int32_t start; + u_int32_t end; + u_int32_t states; + + /* handle all PFTM_* > PFTM_MAX here */ + if (state->timeout == PFTM_PURGE) + return (time_uptime); + KASSERT(state->timeout != PFTM_UNLINKED, + ("pf_state_expires: timeout == PFTM_UNLINKED")); + KASSERT((state->timeout < PFTM_MAX), + ("pf_state_expires: timeout > PFTM_MAX")); + timeout = state->rule.ptr->timeout[state->timeout]; + if (!timeout) + timeout = V_pf_default_rule.timeout[state->timeout]; + start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START]; + if (start) { + end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END]; + states = counter_u64_fetch(state->rule.ptr->states_cur); + } else { + start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START]; + end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END]; + states = V_pf_status.states; + } + if (end && states > start && start < end) { + if (states < end) + return (state->expire + timeout * (end - states) / + (end - start)); + else + return (time_uptime); + } + return (state->expire + timeout); +} + +void +pf_purge_expired_src_nodes() +{ + struct pf_src_node_list freelist; + struct pf_srchash *sh; + struct pf_src_node *cur, *next; + int i; + + LIST_INIT(&freelist); + for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) { + PF_HASHROW_LOCK(sh); + LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next) + if (cur->states == 0 && cur->expire <= time_uptime) { + pf_unlink_src_node(cur); + LIST_INSERT_HEAD(&freelist, cur, entry); + } else if (cur->rule.ptr != NULL) + cur->rule.ptr->rule_flag |= PFRULE_REFS; + PF_HASHROW_UNLOCK(sh); + } + + pf_free_src_nodes(&freelist); + + V_pf_status.src_nodes = uma_zone_get_cur(V_pf_sources_z); +} + +static void +pf_src_tree_remove_state(struct pf_state *s) +{ + struct 
pf_src_node *sn; + struct pf_srchash *sh; + uint32_t timeout; + + timeout = s->rule.ptr->timeout[PFTM_SRC_NODE] ? + s->rule.ptr->timeout[PFTM_SRC_NODE] : + V_pf_default_rule.timeout[PFTM_SRC_NODE]; + + if (s->src_node != NULL) { + sn = s->src_node; + sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)]; + PF_HASHROW_LOCK(sh); + if (s->src.tcp_est) + --sn->conn; + if (--sn->states == 0) + sn->expire = time_uptime + timeout; + PF_HASHROW_UNLOCK(sh); + } + if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) { + sn = s->nat_src_node; + sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)]; + PF_HASHROW_LOCK(sh); + if (--sn->states == 0) + sn->expire = time_uptime + timeout; + PF_HASHROW_UNLOCK(sh); + } + s->src_node = s->nat_src_node = NULL; +} + +/* + * Unlink and potentially free a state. Function may be + * called with ID hash row locked, but always returns + * unlocked, since it needs to go through key hash locking. + */ +int +pf_unlink_state(struct pf_state *s, u_int flags) +{ + struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)]; + + if ((flags & PF_ENTER_LOCKED) == 0) + PF_HASHROW_LOCK(ih); + else + PF_HASHROW_ASSERT(ih); + + if (s->timeout == PFTM_UNLINKED) { + /* + * State is being processed + * by pf_unlink_state() in + * another thread. + */ + PF_HASHROW_UNLOCK(ih); + return (0); /* XXXGL: undefined actually */ + } + + if (s->src.state == PF_TCPS_PROXY_DST) { + /* XXX wire key the right one? */ + pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af, + &s->key[PF_SK_WIRE]->addr[1], + &s->key[PF_SK_WIRE]->addr[0], + s->key[PF_SK_WIRE]->port[1], + s->key[PF_SK_WIRE]->port[0], + s->src.seqhi, s->src.seqlo + 1, + TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL); + } + + LIST_REMOVE(s, entry); + pf_src_tree_remove_state(s); + + if (pfsync_delete_state_ptr != NULL) + pfsync_delete_state_ptr(s); + + STATE_DEC_COUNTERS(s); + + s->timeout = PFTM_UNLINKED; + + PF_HASHROW_UNLOCK(ih); + + pf_detach_state(s); + refcount_release(&s->refs); + + return (pf_release_state(s)); +} + +void +pf_free_state(struct pf_state *cur) +{ + + KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur)); + KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__, + cur->timeout)); + + pf_normalize_tcp_cleanup(cur); + uma_zfree(V_pf_state_z, cur); + counter_u64_add(V_pf_status.fcounters[FCNT_STATE_REMOVALS], 1); +} + +/* + * Called only from pf_purge_thread(), thus serialized. + */ +static u_int +pf_purge_expired_states(u_int i, int maxcheck) +{ + struct pf_idhash *ih; + struct pf_state *s; + + V_pf_status.states = uma_zone_get_cur(V_pf_state_z); + + /* + * Go through hash and unlink states that expire now. + */ + while (maxcheck > 0) { + + ih = &V_pf_idhash[i]; +relock: + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) { + if (pf_state_expires(s) <= time_uptime) { + V_pf_status.states -= + pf_unlink_state(s, PF_ENTER_LOCKED); + goto relock; + } + s->rule.ptr->rule_flag |= PFRULE_REFS; + if (s->nat_rule.ptr != NULL) + s->nat_rule.ptr->rule_flag |= PFRULE_REFS; + if (s->anchor.ptr != NULL) + s->anchor.ptr->rule_flag |= PFRULE_REFS; + s->kif->pfik_flags |= PFI_IFLAG_REFS; + if (s->rt_kif) + s->rt_kif->pfik_flags |= PFI_IFLAG_REFS; + } + PF_HASHROW_UNLOCK(ih); + + /* Return when we hit end of hash. 
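+ * Returning 0 signals a completed sweep; pf_purge_thread() uses that + * as the cue to run the less frequent fragment, source node, rule and + * kif purges before starting over at row 0.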
*/ + if (++i > pf_hashmask) { + V_pf_status.states = uma_zone_get_cur(V_pf_state_z); + return (0); + } + + maxcheck--; + } + + V_pf_status.states = uma_zone_get_cur(V_pf_state_z); + + return (i); +} + +static void +pf_purge_unlinked_rules() +{ + struct pf_rulequeue tmpq; + struct pf_rule *r, *r1; + + /* + * If we have an overloading task pending, then we'd + * better skip purging this time. There is a tiny + * probability that the overloading task references + * an already unlinked rule. + */ + PF_OVERLOADQ_LOCK(); + if (!SLIST_EMPTY(&V_pf_overloadqueue)) { + PF_OVERLOADQ_UNLOCK(); + return; + } + PF_OVERLOADQ_UNLOCK(); + + /* + * Do naive mark-and-sweep garbage collecting of old rules. + * Reference flag is raised by pf_purge_expired_states() + * and pf_purge_expired_src_nodes(). + * + * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK, + * use a temporary queue. + */ + TAILQ_INIT(&tmpq); + PF_UNLNKDRULES_LOCK(); + TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) { + if (!(r->rule_flag & PFRULE_REFS)) { + TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries); + TAILQ_INSERT_TAIL(&tmpq, r, entries); + } else + r->rule_flag &= ~PFRULE_REFS; + } + PF_UNLNKDRULES_UNLOCK(); + + if (!TAILQ_EMPTY(&tmpq)) { + PF_RULES_WLOCK(); + TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) { + TAILQ_REMOVE(&tmpq, r, entries); + pf_free_rule(r); + } + PF_RULES_WUNLOCK(); + } +} + +void +pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: { + u_int32_t a = ntohl(addr->addr32[0]); + printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255, + (a>>8)&255, a&255); + if (p) { + p = ntohs(p); + printf(":%u", p); + } + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + u_int16_t b; + u_int8_t i, curstart, curend, maxstart, maxend; + curstart = curend = maxstart = maxend = 255; + for (i = 0; i < 8; i++) { + if (!addr->addr16[i]) { + if (curstart == 255) + curstart = i; + curend = i; + } else { + if ((curend - curstart) > + (maxend - maxstart)) { + maxstart = curstart; + maxend = curend; + } + curstart = curend = 255; + } + } + if ((curend - curstart) > + (maxend - maxstart)) { + maxstart = curstart; + maxend = curend; + } + for (i = 0; i < 8; i++) { + if (i >= maxstart && i <= maxend) { + if (i == 0) + printf(":"); + if (i == maxend) + printf(":"); + } else { + b = ntohs(addr->addr16[i]); + printf("%x", b); + if (i < 7) + printf(":"); + } + } + if (p) { + p = ntohs(p); + printf("[%u]", p); + } + break; + } +#endif /* INET6 */ + } +} + +void +pf_print_state(struct pf_state *s) +{ + pf_print_state_parts(s, NULL, NULL); +} + +static void +pf_print_state_parts(struct pf_state *s, + struct pf_state_key *skwp, struct pf_state_key *sksp) +{ + struct pf_state_key *skw, *sks; + u_int8_t proto, dir; + + /* Do our best to fill these, but they're skipped if NULL */ + skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL); + sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL); + proto = skw ? skw->proto : (sks ? sks->proto : 0); + dir = s ? 
s->direction : 0; + + switch (proto) { + case IPPROTO_IPV4: + printf("IPv4"); + break; + case IPPROTO_IPV6: + printf("IPv6"); + break; + case IPPROTO_TCP: + printf("TCP"); + break; + case IPPROTO_UDP: + printf("UDP"); + break; + case IPPROTO_ICMP: + printf("ICMP"); + break; + case IPPROTO_ICMPV6: + printf("ICMPv6"); + break; + default: + printf("%u", proto); + break; + } + switch (dir) { + case PF_IN: + printf(" in"); + break; + case PF_OUT: + printf(" out"); + break; + } + if (skw) { + printf(" wire: "); + pf_print_host(&skw->addr[0], skw->port[0], skw->af); + printf(" "); + pf_print_host(&skw->addr[1], skw->port[1], skw->af); + } + if (sks) { + printf(" stack: "); + if (sks != skw) { + pf_print_host(&sks->addr[0], sks->port[0], sks->af); + printf(" "); + pf_print_host(&sks->addr[1], sks->port[1], sks->af); + } else + printf("-"); + } + if (s) { + if (proto == IPPROTO_TCP) { + printf(" [lo=%u high=%u win=%u modulator=%u", + s->src.seqlo, s->src.seqhi, + s->src.max_win, s->src.seqdiff); + if (s->src.wscale && s->dst.wscale) + printf(" wscale=%u", + s->src.wscale & PF_WSCALE_MASK); + printf("]"); + printf(" [lo=%u high=%u win=%u modulator=%u", + s->dst.seqlo, s->dst.seqhi, + s->dst.max_win, s->dst.seqdiff); + if (s->src.wscale && s->dst.wscale) + printf(" wscale=%u", + s->dst.wscale & PF_WSCALE_MASK); + printf("]"); + } + printf(" %u:%u", s->src.state, s->dst.state); + } +} + +void +pf_print_flags(u_int8_t f) +{ + if (f) + printf(" "); + if (f & TH_FIN) + printf("F"); + if (f & TH_SYN) + printf("S"); + if (f & TH_RST) + printf("R"); + if (f & TH_PUSH) + printf("P"); + if (f & TH_ACK) + printf("A"); + if (f & TH_URG) + printf("U"); + if (f & TH_ECE) + printf("E"); + if (f & TH_CWR) + printf("W"); +} + +#define PF_SET_SKIP_STEPS(i) \ + do { \ + while (head[i] != cur) { \ + head[i]->skip[i].ptr = cur; \ + head[i] = TAILQ_NEXT(head[i], entries); \ + } \ + } while (0) + +void +pf_calc_skip_steps(struct pf_rulequeue *rules) +{ + struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT]; + int i; + + cur = TAILQ_FIRST(rules); + prev = cur; + for (i = 0; i < PF_SKIP_COUNT; ++i) + head[i] = cur; + while (cur != NULL) { + + if (cur->kif != prev->kif || cur->ifnot != prev->ifnot) + PF_SET_SKIP_STEPS(PF_SKIP_IFP); + if (cur->direction != prev->direction) + PF_SET_SKIP_STEPS(PF_SKIP_DIR); + if (cur->af != prev->af) + PF_SET_SKIP_STEPS(PF_SKIP_AF); + if (cur->proto != prev->proto) + PF_SET_SKIP_STEPS(PF_SKIP_PROTO); + if (cur->src.neg != prev->src.neg || + pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr)) + PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR); + if (cur->src.port[0] != prev->src.port[0] || + cur->src.port[1] != prev->src.port[1] || + cur->src.port_op != prev->src.port_op) + PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT); + if (cur->dst.neg != prev->dst.neg || + pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr)) + PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR); + if (cur->dst.port[0] != prev->dst.port[0] || + cur->dst.port[1] != prev->dst.port[1] || + cur->dst.port_op != prev->dst.port_op) + PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT); + + prev = cur; + cur = TAILQ_NEXT(cur, entries); + } + for (i = 0; i < PF_SKIP_COUNT; ++i) + PF_SET_SKIP_STEPS(i); +} + +static int +pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2) +{ + if (aw1->type != aw2->type) + return (1); + switch (aw1->type) { + case PF_ADDR_ADDRMASK: + case PF_ADDR_RANGE: + if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6)) + return (1); + if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6)) + return (1); + return (0); + case PF_ADDR_DYNIFTL: + return 
(aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt); + case PF_ADDR_NOROUTE: + case PF_ADDR_URPFFAILED: + return (0); + case PF_ADDR_TABLE: + return (aw1->p.tbl != aw2->p.tbl); + default: + printf("invalid address type: %d\n", aw1->type); + return (1); + } +} + +/** + * Checksum updates are a little complicated because the checksum in the TCP/UDP + * header isn't always a full checksum. In some cases (i.e. output) it's a + * pseudo-header checksum, which is a partial checksum over src/dst IP + * addresses, protocol number and length. + * + * That means we have the following cases: + * * Input or forwarding: we don't have TSO, the checksum fields are full + * checksums, we need to update the checksum whenever we change anything. + * * Output (i.e. the checksum is a pseudo-header checksum): + * x The field being updated is src/dst address or affects the length of + * the packet. We need to update the pseudo-header checksum (note that this + * checksum is not ones' complement). + * x Some other field is being modified (e.g. src/dst port numbers): We + * don't have to update anything. + **/ +u_int16_t +pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp) +{ + u_int32_t l; + + if (udp && !cksum) + return (0x0000); + l = cksum + old - new; + l = (l >> 16) + (l & 65535); + l = l & 65535; + if (udp && !l) + return (0xFFFF); + return (l); +} + +u_int16_t +pf_proto_cksum_fixup(struct mbuf *m, u_int16_t cksum, u_int16_t old, + u_int16_t new, u_int8_t udp) +{ + if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) + return (cksum); + + return (pf_cksum_fixup(cksum, old, new, udp)); +} + +static void +pf_change_ap(struct mbuf *m, struct pf_addr *a, u_int16_t *p, u_int16_t *ic, + u_int16_t *pc, struct pf_addr *an, u_int16_t pn, u_int8_t u, + sa_family_t af) +{ + struct pf_addr ao; + u_int16_t po = *p; + + PF_ACPY(&ao, a, af); + PF_ACPY(a, an, af); + + if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) + *pc = ~*pc; + + *p = pn; + + switch (af) { +#ifdef INET + case AF_INET: + *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, + ao.addr16[0], an->addr16[0], 0), + ao.addr16[1], an->addr16[1], 0); + *p = pn; + + *pc = pf_cksum_fixup(pf_cksum_fixup(*pc, + ao.addr16[0], an->addr16[0], u), + ao.addr16[1], an->addr16[1], u); + + *pc = pf_proto_cksum_fixup(m, *pc, po, pn, u); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(*pc, + ao.addr16[0], an->addr16[0], u), + ao.addr16[1], an->addr16[1], u), + ao.addr16[2], an->addr16[2], u), + ao.addr16[3], an->addr16[3], u), + ao.addr16[4], an->addr16[4], u), + ao.addr16[5], an->addr16[5], u), + ao.addr16[6], an->addr16[6], u), + ao.addr16[7], an->addr16[7], u); + + *pc = pf_proto_cksum_fixup(m, *pc, po, pn, u); + break; +#endif /* INET6 */ + } + + if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | + CSUM_DELAY_DATA_IPV6)) { + *pc = ~*pc; + if (! *pc) + *pc = 0xffff; + } +} + +/* Changes a u_int32_t. 
Uses a void * so there are no align restrictions */ +void +pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u) +{ + u_int32_t ao; + + memcpy(&ao, a, sizeof(ao)); + memcpy(a, &an, sizeof(u_int32_t)); + *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u), + ao % 65536, an % 65536, u); +} + +void +pf_change_proto_a(struct mbuf *m, void *a, u_int16_t *c, u_int32_t an, u_int8_t udp) +{ + u_int32_t ao; + + memcpy(&ao, a, sizeof(ao)); + memcpy(a, &an, sizeof(u_int32_t)); + + *c = pf_proto_cksum_fixup(m, + pf_proto_cksum_fixup(m, *c, ao / 65536, an / 65536, udp), + ao % 65536, an % 65536, udp); +} + +#ifdef INET6 +static void +pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u) +{ + struct pf_addr ao; + + PF_ACPY(&ao, a, AF_INET6); + PF_ACPY(a, an, AF_INET6); + + *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(*c, + ao.addr16[0], an->addr16[0], u), + ao.addr16[1], an->addr16[1], u), + ao.addr16[2], an->addr16[2], u), + ao.addr16[3], an->addr16[3], u), + ao.addr16[4], an->addr16[4], u), + ao.addr16[5], an->addr16[5], u), + ao.addr16[6], an->addr16[6], u), + ao.addr16[7], an->addr16[7], u); +} +#endif /* INET6 */ + +static void +pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa, + struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c, + u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af) +{ + struct pf_addr oia, ooa; + + PF_ACPY(&oia, ia, af); + if (oa) + PF_ACPY(&ooa, oa, af); + + /* Change inner protocol port, fix inner protocol checksum. */ + if (ip != NULL) { + u_int16_t oip = *ip; + u_int32_t opc; + + if (pc != NULL) + opc = *pc; + *ip = np; + if (pc != NULL) + *pc = pf_cksum_fixup(*pc, oip, *ip, u); + *ic = pf_cksum_fixup(*ic, oip, *ip, 0); + if (pc != NULL) + *ic = pf_cksum_fixup(*ic, opc, *pc, 0); + } + /* Change inner ip address, fix inner ip and icmp checksums. */ + PF_ACPY(ia, na, af); + switch (af) { +#ifdef INET + case AF_INET: { + u_int32_t oh2c = *h2c; + + *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c, + oia.addr16[0], ia->addr16[0], 0), + oia.addr16[1], ia->addr16[1], 0); + *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, + oia.addr16[0], ia->addr16[0], 0), + oia.addr16[1], ia->addr16[1], 0); + *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0); + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(*ic, + oia.addr16[0], ia->addr16[0], u), + oia.addr16[1], ia->addr16[1], u), + oia.addr16[2], ia->addr16[2], u), + oia.addr16[3], ia->addr16[3], u), + oia.addr16[4], ia->addr16[4], u), + oia.addr16[5], ia->addr16[5], u), + oia.addr16[6], ia->addr16[6], u), + oia.addr16[7], ia->addr16[7], u); + break; +#endif /* INET6 */ + } + /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. 
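+ * For IPv4 the outer address is covered only by the IP header + * checksum (*hc), while for IPv6 it enters the ICMPv6 pseudo-header, + * so there the ICMPv6 checksum (*ic) is the one that needs fixing.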
*/ + if (oa) { + PF_ACPY(oa, na, af); + switch (af) { +#ifdef INET + case AF_INET: + *hc = pf_cksum_fixup(pf_cksum_fixup(*hc, + ooa.addr16[0], oa->addr16[0], 0), + ooa.addr16[1], oa->addr16[1], 0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(*ic, + ooa.addr16[0], oa->addr16[0], u), + ooa.addr16[1], oa->addr16[1], u), + ooa.addr16[2], oa->addr16[2], u), + ooa.addr16[3], oa->addr16[3], u), + ooa.addr16[4], oa->addr16[4], u), + ooa.addr16[5], oa->addr16[5], u), + ooa.addr16[6], oa->addr16[6], u), + ooa.addr16[7], oa->addr16[7], u); + break; +#endif /* INET6 */ + } + } +} + + +/* + * Need to modulate the sequence numbers in the TCP SACK option + * (credits to Krzysztof Pfaff for report and patch) + */ +static int +pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd, + struct tcphdr *th, struct pf_state_peer *dst) +{ + int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen; + u_int8_t opts[TCP_MAXOLEN], *opt = opts; + int copyback = 0, i, olen; + struct sackblk sack; + +#define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2) + if (hlen < TCPOLEN_SACKLEN || + !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af)) + return 0; + + while (hlen >= TCPOLEN_SACKLEN) { + olen = opt[1]; + switch (*opt) { + case TCPOPT_EOL: /* FALLTHROUGH */ + case TCPOPT_NOP: + opt++; + hlen--; + break; + case TCPOPT_SACK: + if (olen > hlen) + olen = hlen; + if (olen >= TCPOLEN_SACKLEN) { + for (i = 2; i + TCPOLEN_SACK <= olen; + i += TCPOLEN_SACK) { + memcpy(&sack, &opt[i], sizeof(sack)); + pf_change_proto_a(m, &sack.start, &th->th_sum, + htonl(ntohl(sack.start) - dst->seqdiff), 0); + pf_change_proto_a(m, &sack.end, &th->th_sum, + htonl(ntohl(sack.end) - dst->seqdiff), 0); + memcpy(&opt[i], &sack, sizeof(sack)); + } + copyback = 1; + } + /* FALLTHROUGH */ + default: + if (olen < 2) + olen = 2; + hlen -= olen; + opt += olen; + } + } + + if (copyback) + m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts); + return (copyback); +} + +static void +pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af, + const struct pf_addr *saddr, const struct pf_addr *daddr, + u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack, + u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag, + u_int16_t rtag, struct ifnet *ifp) +{ + struct pf_send_entry *pfse; + struct mbuf *m; + int len, tlen; +#ifdef INET + struct ip *h = NULL; +#endif /* INET */ +#ifdef INET6 + struct ip6_hdr *h6 = NULL; +#endif /* INET6 */ + struct tcphdr *th; + char *opt; + struct pf_mtag *pf_mtag; + + len = 0; + th = NULL; + + /* maximum segment size tcp option */ + tlen = sizeof(struct tcphdr); + if (mss) + tlen += 4; + + switch (af) { +#ifdef INET + case AF_INET: + len = sizeof(struct ip) + tlen; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + len = sizeof(struct ip6_hdr) + tlen; + break; +#endif /* INET6 */ + default: + panic("%s: unsupported af %d", __func__, af); + } + + /* Allocate outgoing queue entry, mbuf and mbuf tag. 
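+ * Everything is allocated M_NOWAIT because we may be deep in the + * packet processing path; on any failure the reply is silently + * skipped.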
*/ + pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT); + if (pfse == NULL) + return; + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + free(pfse, M_PFTEMP); + return; + } +#ifdef MAC + mac_netinet_firewall_send(m); +#endif + if ((pf_mtag = pf_get_mtag(m)) == NULL) { + free(pfse, M_PFTEMP); + m_freem(m); + return; + } + if (tag) + m->m_flags |= M_SKIP_FIREWALL; + pf_mtag->tag = rtag; + + if (r != NULL && r->rtableid >= 0) + M_SETFIB(m, r->rtableid); + +#ifdef ALTQ + if (r != NULL && r->qid) { + pf_mtag->qid = r->qid; + + /* add hints for ecn */ + pf_mtag->hdr = mtod(m, struct ip *); + } +#endif /* ALTQ */ + m->m_data += max_linkhdr; + m->m_pkthdr.len = m->m_len = len; + m->m_pkthdr.rcvif = NULL; + bzero(m->m_data, len); + switch (af) { +#ifdef INET + case AF_INET: + h = mtod(m, struct ip *); + + /* IP header fields included in the TCP checksum */ + h->ip_p = IPPROTO_TCP; + h->ip_len = htons(tlen); + h->ip_src.s_addr = saddr->v4.s_addr; + h->ip_dst.s_addr = daddr->v4.s_addr; + + th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip)); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + h6 = mtod(m, struct ip6_hdr *); + + /* IP header fields included in the TCP checksum */ + h6->ip6_nxt = IPPROTO_TCP; + h6->ip6_plen = htons(tlen); + memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr)); + memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr)); + + th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr)); + break; +#endif /* INET6 */ + } + + /* TCP header */ + th->th_sport = sport; + th->th_dport = dport; + th->th_seq = htonl(seq); + th->th_ack = htonl(ack); + th->th_off = tlen >> 2; + th->th_flags = flags; + th->th_win = htons(win); + + if (mss) { + opt = (char *)(th + 1); + opt[0] = TCPOPT_MAXSEG; + opt[1] = 4; + HTONS(mss); + bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2); + } + + switch (af) { +#ifdef INET + case AF_INET: + /* TCP checksum */ + th->th_sum = in_cksum(m, len); + + /* Finish the IP header */ + h->ip_v = 4; + h->ip_hl = sizeof(*h) >> 2; + h->ip_tos = IPTOS_LOWDELAY; + h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0); + h->ip_len = htons(len); + h->ip_ttl = ttl ? ttl : V_ip_defttl; + h->ip_sum = 0; + + pfse->pfse_type = PFSE_IP; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + /* TCP checksum */ + th->th_sum = in6_cksum(m, IPPROTO_TCP, + sizeof(struct ip6_hdr), tlen); + + h6->ip6_vfc |= IPV6_VERSION; + h6->ip6_hlim = IPV6_DEFHLIM; + + pfse->pfse_type = PFSE_IP6; + break; +#endif /* INET6 */ + } + pfse->pfse_m = m; + pf_send(pfse); +} + +static int +pf_ieee8021q_setpcp(struct mbuf *m, u_int8_t prio) +{ + struct m_tag *mtag; + + KASSERT(prio <= PF_PRIO_MAX, + ("%s with invalid pcp", __func__)); + + mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL); + if (mtag == NULL) { + mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_OUT, + sizeof(uint8_t), M_NOWAIT); + if (mtag == NULL) + return (ENOMEM); + m_tag_prepend(m, mtag); + } + + *(uint8_t *)(mtag + 1) = prio; + return (0); +} + +static int +pf_match_ieee8021q_pcp(u_int8_t prio, struct mbuf *m) +{ + struct m_tag *mtag; + u_int8_t mpcp; + + mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL); + if (mtag == NULL) + return (0); + + if (prio == PF_PRIO_ZERO) + prio = 0; + + mpcp = *(uint8_t *)(mtag + 1); + + return (mpcp == prio); +} + +static void +pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af, + struct pf_rule *r) +{ + struct pf_send_entry *pfse; + struct mbuf *m0; + struct pf_mtag *pf_mtag; + + /* Allocate outgoing queue entry, mbuf and mbuf tag. 
*/ + pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT); + if (pfse == NULL) + return; + + if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) { + free(pfse, M_PFTEMP); + return; + } + + if ((pf_mtag = pf_get_mtag(m0)) == NULL) { + free(pfse, M_PFTEMP); + return; + } + /* XXX: revisit */ + m0->m_flags |= M_SKIP_FIREWALL; + + if (r->rtableid >= 0) + M_SETFIB(m0, r->rtableid); + +#ifdef ALTQ + if (r->qid) { + pf_mtag->qid = r->qid; + /* add hints for ecn */ + pf_mtag->hdr = mtod(m0, struct ip *); + } +#endif /* ALTQ */ + + switch (af) { +#ifdef INET + case AF_INET: + pfse->pfse_type = PFSE_ICMP; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + pfse->pfse_type = PFSE_ICMP6; + break; +#endif /* INET6 */ + } + pfse->pfse_m = m0; + pfse->icmpopts.type = type; + pfse->icmpopts.code = code; + pf_send(pfse); +} + +/* + * Return 1 if the addresses a and b match (with mask m), otherwise return 0. + * If n is 0, they match if they are equal. If n is != 0, they match if they + * are different. + */ +int +pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m, + struct pf_addr *b, sa_family_t af) +{ + int match = 0; + + switch (af) { +#ifdef INET + case AF_INET: + if ((a->addr32[0] & m->addr32[0]) == + (b->addr32[0] & m->addr32[0])) + match++; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (((a->addr32[0] & m->addr32[0]) == + (b->addr32[0] & m->addr32[0])) && + ((a->addr32[1] & m->addr32[1]) == + (b->addr32[1] & m->addr32[1])) && + ((a->addr32[2] & m->addr32[2]) == + (b->addr32[2] & m->addr32[2])) && + ((a->addr32[3] & m->addr32[3]) == + (b->addr32[3] & m->addr32[3]))) + match++; + break; +#endif /* INET6 */ + } + if (match) { + if (n) + return (0); + else + return (1); + } else { + if (n) + return (1); + else + return (0); + } +} + +/* + * Return 1 if b <= a <= e, otherwise return 0. 
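+ * For IPv6 the two loops below walk the four 32-bit words from most + * to least significant, which amounts to an unsigned 128-bit + * comparison of a against each bound.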
+ */ +int +pf_match_addr_range(struct pf_addr *b, struct pf_addr *e, + struct pf_addr *a, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) || + (ntohl(a->addr32[0]) > ntohl(e->addr32[0]))) + return (0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + int i; + + /* check a >= b */ + for (i = 0; i < 4; ++i) + if (ntohl(a->addr32[i]) > ntohl(b->addr32[i])) + break; + else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i])) + return (0); + /* check a <= e */ + for (i = 0; i < 4; ++i) + if (ntohl(a->addr32[i]) < ntohl(e->addr32[i])) + break; + else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i])) + return (0); + break; + } +#endif /* INET6 */ + } + return (1); +} + +static int +pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p) +{ + switch (op) { + case PF_OP_IRG: + return ((p > a1) && (p < a2)); + case PF_OP_XRG: + return ((p < a1) || (p > a2)); + case PF_OP_RRG: + return ((p >= a1) && (p <= a2)); + case PF_OP_EQ: + return (p == a1); + case PF_OP_NE: + return (p != a1); + case PF_OP_LT: + return (p < a1); + case PF_OP_LE: + return (p <= a1); + case PF_OP_GT: + return (p > a1); + case PF_OP_GE: + return (p >= a1); + } + return (0); /* never reached */ +} + +int +pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p) +{ + NTOHS(a1); + NTOHS(a2); + NTOHS(p); + return (pf_match(op, a1, a2, p)); +} + +static int +pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u) +{ + if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE) + return (0); + return (pf_match(op, a1, a2, u)); +} + +static int +pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g) +{ + if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE) + return (0); + return (pf_match(op, a1, a2, g)); +} + +int +pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag) +{ + if (*tag == -1) + *tag = mtag; + + return ((!r->match_tag_not && r->match_tag == *tag) || + (r->match_tag_not && r->match_tag != *tag)); +} + +int +pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag) +{ + + KASSERT(tag > 0, ("%s: tag %d", __func__, tag)); + + if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL)) + return (ENOMEM); + + pd->pf_mtag->tag = tag; + + return (0); +} + +#define PF_ANCHOR_STACKSIZE 32 +struct pf_anchor_stackframe { + struct pf_ruleset *rs; + struct pf_rule *r; /* XXX: + match bit */ + struct pf_anchor *child; +}; + +/* + * XXX: We rely on malloc(9) returning pointer aligned addresses. 
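+ * That alignment keeps the low bits of the rule pointer zero, so + * PF_ANCHORSTACK_MATCH can be stored in bit 0 and PF_ANCHOR_RULE() + * recovers the real pointer by masking it off.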
+ */ +#define PF_ANCHORSTACK_MATCH 0x00000001 +#define PF_ANCHORSTACK_MASK (PF_ANCHORSTACK_MATCH) + +#define PF_ANCHOR_MATCH(f) ((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH) +#define PF_ANCHOR_RULE(f) (struct pf_rule *) \ + ((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK) +#define PF_ANCHOR_SET_MATCH(f) do { (f)->r = (void *) \ + ((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH); \ +} while (0) + +void +pf_step_into_anchor(struct pf_anchor_stackframe *stack, int *depth, + struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a, + int *match) +{ + struct pf_anchor_stackframe *f; + + PF_RULES_RASSERT(); + + if (match) + *match = 0; + if (*depth >= PF_ANCHOR_STACKSIZE) { + printf("%s: anchor stack overflow on %s\n", + __func__, (*r)->anchor->name); + *r = TAILQ_NEXT(*r, entries); + return; + } else if (*depth == 0 && a != NULL) + *a = *r; + f = stack + (*depth)++; + f->rs = *rs; + f->r = *r; + if ((*r)->anchor_wildcard) { + struct pf_anchor_node *parent = &(*r)->anchor->children; + + if ((f->child = RB_MIN(pf_anchor_node, parent)) == NULL) { + *r = NULL; + return; + } + *rs = &f->child->ruleset; + } else { + f->child = NULL; + *rs = &(*r)->anchor->ruleset; + } + *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); +} + +int +pf_step_out_of_anchor(struct pf_anchor_stackframe *stack, int *depth, + struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a, + int *match) +{ + struct pf_anchor_stackframe *f; + struct pf_rule *fr; + int quick = 0; + + PF_RULES_RASSERT(); + + do { + if (*depth <= 0) + break; + f = stack + *depth - 1; + fr = PF_ANCHOR_RULE(f); + if (f->child != NULL) { + struct pf_anchor_node *parent; + + /* + * This block traverses through + * a wildcard anchor. + */ + parent = &fr->anchor->children; + if (match != NULL && *match) { + /* + * If any of "*" matched, then + * "foo/ *" matched, mark frame + * appropriately. 
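+ * The result is latched into the frame's rule pointer tag because + * *match is cleared again before the next child ruleset is walked.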
+ */ + PF_ANCHOR_SET_MATCH(f); + *match = 0; + } + f->child = RB_NEXT(pf_anchor_node, parent, f->child); + if (f->child != NULL) { + *rs = &f->child->ruleset; + *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); + if (*r == NULL) + continue; + else + break; + } + } + (*depth)--; + if (*depth == 0 && a != NULL) + *a = NULL; + *rs = f->rs; + if (PF_ANCHOR_MATCH(f) || (match != NULL && *match)) + quick = fr->quick; + *r = TAILQ_NEXT(fr, entries); + } while (*r == NULL); + + return (quick); +} + +#ifdef INET6 +void +pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr, + struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | + ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); + break; +#endif /* INET */ + case AF_INET6: + naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | + ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); + naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) | + ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]); + naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) | + ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]); + naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) | + ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]); + break; + } +} + +void +pf_addr_inc(struct pf_addr *addr, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); + break; +#endif /* INET */ + case AF_INET6: + if (addr->addr32[3] == 0xffffffff) { + addr->addr32[3] = 0; + if (addr->addr32[2] == 0xffffffff) { + addr->addr32[2] = 0; + if (addr->addr32[1] == 0xffffffff) { + addr->addr32[1] = 0; + addr->addr32[0] = + htonl(ntohl(addr->addr32[0]) + 1); + } else + addr->addr32[1] = + htonl(ntohl(addr->addr32[1]) + 1); + } else + addr->addr32[2] = + htonl(ntohl(addr->addr32[2]) + 1); + } else + addr->addr32[3] = + htonl(ntohl(addr->addr32[3]) + 1); + break; + } +} +#endif /* INET6 */ + +int +pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m) +{ + struct pf_addr *saddr, *daddr; + u_int16_t sport, dport; + struct inpcbinfo *pi; + struct inpcb *inp; + + pd->lookup.uid = UID_MAX; + pd->lookup.gid = GID_MAX; + + switch (pd->proto) { + case IPPROTO_TCP: + if (pd->hdr.tcp == NULL) + return (-1); + sport = pd->hdr.tcp->th_sport; + dport = pd->hdr.tcp->th_dport; + pi = &V_tcbinfo; + break; + case IPPROTO_UDP: + if (pd->hdr.udp == NULL) + return (-1); + sport = pd->hdr.udp->uh_sport; + dport = pd->hdr.udp->uh_dport; + pi = &V_udbinfo; + break; + default: + return (-1); + } + if (direction == PF_IN) { + saddr = pd->src; + daddr = pd->dst; + } else { + u_int16_t p; + + p = sport; + sport = dport; + dport = p; + saddr = pd->dst; + daddr = pd->src; + } + switch (pd->af) { +#ifdef INET + case AF_INET: + inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4, + dport, INPLOOKUP_RLOCKPCB, NULL, m); + if (inp == NULL) { + inp = in_pcblookup_mbuf(pi, saddr->v4, sport, + daddr->v4, dport, INPLOOKUP_WILDCARD | + INPLOOKUP_RLOCKPCB, NULL, m); + if (inp == NULL) + return (-1); + } + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6, + dport, INPLOOKUP_RLOCKPCB, NULL, m); + if (inp == NULL) { + inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, + &daddr->v6, dport, INPLOOKUP_WILDCARD | + INPLOOKUP_RLOCKPCB, NULL, m); + if (inp == NULL) + return (-1); + } + break; +#endif /* INET6 */ + + default: + return (-1); + } + 
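+ /* + * Both lookups above try an exact four-tuple PCB match first and fall + * back to a wildcard (listening socket) match; on success the inpcb is + * returned read-locked, as asserted below. + */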
INP_RLOCK_ASSERT(inp); +#ifndef __rtems__ + pd->lookup.uid = inp->inp_cred->cr_uid; + pd->lookup.gid = inp->inp_cred->cr_groups[0]; +#else /* __rtems__ */ + pd->lookup.uid = BSD_DEFAULT_UID; + pd->lookup.gid = BSD_DEFAULT_GID; +#endif /* __rtems__ */ + INP_RUNLOCK(inp); + + return (1); +} + +static u_int8_t +pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) +{ + int hlen; + u_int8_t hdr[60]; + u_int8_t *opt, optlen; + u_int8_t wscale = 0; + + hlen = th_off << 2; /* hlen <= sizeof(hdr) */ + if (hlen <= sizeof(struct tcphdr)) + return (0); + if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) + return (0); + opt = hdr + sizeof(struct tcphdr); + hlen -= sizeof(struct tcphdr); + while (hlen >= 3) { + switch (*opt) { + case TCPOPT_EOL: + case TCPOPT_NOP: + ++opt; + --hlen; + break; + case TCPOPT_WINDOW: + wscale = opt[2]; + if (wscale > TCP_MAX_WINSHIFT) + wscale = TCP_MAX_WINSHIFT; + wscale |= PF_WSCALE_FLAG; + /* FALLTHROUGH */ + default: + optlen = opt[1]; + if (optlen < 2) + optlen = 2; + hlen -= optlen; + opt += optlen; + break; + } + } + return (wscale); +} + +static u_int16_t +pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) +{ + int hlen; + u_int8_t hdr[60]; + u_int8_t *opt, optlen; + u_int16_t mss = V_tcp_mssdflt; + + hlen = th_off << 2; /* hlen <= sizeof(hdr) */ + if (hlen <= sizeof(struct tcphdr)) + return (0); + if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) + return (0); + opt = hdr + sizeof(struct tcphdr); + hlen -= sizeof(struct tcphdr); + while (hlen >= TCPOLEN_MAXSEG) { + switch (*opt) { + case TCPOPT_EOL: + case TCPOPT_NOP: + ++opt; + --hlen; + break; + case TCPOPT_MAXSEG: + bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2); + NTOHS(mss); + /* FALLTHROUGH */ + default: + optlen = opt[1]; + if (optlen < 2) + optlen = 2; + hlen -= optlen; + opt += optlen; + break; + } + } + return (mss); +} + +static u_int16_t +pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer) +{ +#ifdef INET + struct nhop4_basic nh4; +#endif /* INET */ +#ifdef INET6 + struct nhop6_basic nh6; + struct in6_addr dst6; + uint32_t scopeid; +#endif /* INET6 */ + int hlen = 0; + uint16_t mss = 0; + + switch (af) { +#ifdef INET + case AF_INET: + hlen = sizeof(struct ip); + if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) == 0) + mss = nh4.nh_mtu - hlen - sizeof(struct tcphdr); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + hlen = sizeof(struct ip6_hdr); + in6_splitscope(&addr->v6, &dst6, &scopeid); + if (fib6_lookup_nh_basic(rtableid, &dst6, scopeid, 0,0,&nh6)==0) + mss = nh6.nh_mtu - hlen - sizeof(struct tcphdr); + break; +#endif /* INET6 */ + } + + mss = max(V_tcp_mssdflt, mss); + mss = min(mss, offer); + mss = max(mss, 64); /* sanity - at least max opt space */ + return (mss); +} + +static u_int32_t +pf_tcp_iss(struct pf_pdesc *pd) +{ + MD5_CTX ctx; + u_int32_t digest[4]; + + if (V_pf_tcp_secret_init == 0) { + read_random(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret)); + MD5Init(&V_pf_tcp_secret_ctx); + MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret, + sizeof(V_pf_tcp_secret)); + V_pf_tcp_secret_init = 1; + } + + ctx = V_pf_tcp_secret_ctx; + + MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short)); + MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short)); + if (pd->af == AF_INET6) { + MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr)); + MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr)); + } else { + MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr)); + 
MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr)); + } + MD5Final((u_char *)digest, &ctx); + V_pf_tcp_iss_off += 4096; +#define ISN_RANDOM_INCREMENT (4096 - 1) + return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) + + V_pf_tcp_iss_off); +#undef ISN_RANDOM_INCREMENT +} + +static int +pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, + struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd, + struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp) +{ + struct pf_rule *nr = NULL; + struct pf_addr * const saddr = pd->src; + struct pf_addr * const daddr = pd->dst; + sa_family_t af = pd->af; + struct pf_rule *r, *a = NULL; + struct pf_ruleset *ruleset = NULL; + struct pf_src_node *nsn = NULL; + struct tcphdr *th = pd->hdr.tcp; + struct pf_state_key *sk = NULL, *nk = NULL; + u_short reason; + int rewrite = 0, hdrlen = 0; + int tag = -1, rtableid = -1; + int asd = 0; + int match = 0; + int state_icmp = 0; + u_int16_t sport = 0, dport = 0; + u_int16_t bproto_sum = 0, bip_sum = 0; + u_int8_t icmptype = 0, icmpcode = 0; + struct pf_anchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE]; + + PF_RULES_RASSERT(); + + if (inp != NULL) { + INP_LOCK_ASSERT(inp); +#ifndef __rtems__ + pd->lookup.uid = inp->inp_cred->cr_uid; + pd->lookup.gid = inp->inp_cred->cr_groups[0]; +#else /* __rtems__ */ + pd->lookup.uid = BSD_DEFAULT_UID; + pd->lookup.gid = BSD_DEFAULT_GID; +#endif /* __rtems__ */ + pd->lookup.done = 1; + } + + switch (pd->proto) { + case IPPROTO_TCP: + sport = th->th_sport; + dport = th->th_dport; + hdrlen = sizeof(*th); + break; + case IPPROTO_UDP: + sport = pd->hdr.udp->uh_sport; + dport = pd->hdr.udp->uh_dport; + hdrlen = sizeof(*pd->hdr.udp); + break; +#ifdef INET + case IPPROTO_ICMP: + if (pd->af != AF_INET) + break; + sport = dport = pd->hdr.icmp->icmp_id; + hdrlen = sizeof(*pd->hdr.icmp); + icmptype = pd->hdr.icmp->icmp_type; + icmpcode = pd->hdr.icmp->icmp_code; + + if (icmptype == ICMP_UNREACH || + icmptype == ICMP_SOURCEQUENCH || + icmptype == ICMP_REDIRECT || + icmptype == ICMP_TIMXCEED || + icmptype == ICMP_PARAMPROB) + state_icmp++; + break; +#endif /* INET */ +#ifdef INET6 + case IPPROTO_ICMPV6: + if (af != AF_INET6) + break; + sport = dport = pd->hdr.icmp6->icmp6_id; + hdrlen = sizeof(*pd->hdr.icmp6); + icmptype = pd->hdr.icmp6->icmp6_type; + icmpcode = pd->hdr.icmp6->icmp6_code; + + if (icmptype == ICMP6_DST_UNREACH || + icmptype == ICMP6_PACKET_TOO_BIG || + icmptype == ICMP6_TIME_EXCEEDED || + icmptype == ICMP6_PARAM_PROB) + state_icmp++; + break; +#endif /* INET6 */ + default: + sport = dport = hdrlen = 0; + break; + } + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); + + /* check packet for BINAT/NAT/RDR */ + if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk, + &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) { + KASSERT(sk != NULL, ("%s: null sk", __func__)); + KASSERT(nk != NULL, ("%s: null nk", __func__)); + + if (pd->ip_sum) + bip_sum = *pd->ip_sum; + + switch (pd->proto) { + case IPPROTO_TCP: + bproto_sum = th->th_sum; + pd->proto_sum = &th->th_sum; + + if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) || + nk->port[pd->sidx] != sport) { + pf_change_ap(m, saddr, &th->th_sport, pd->ip_sum, + &th->th_sum, &nk->addr[pd->sidx], + nk->port[pd->sidx], 0, af); + pd->sport = &th->th_sport; + sport = th->th_sport; + } + + if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) || + nk->port[pd->didx] != dport) { + pf_change_ap(m, daddr, &th->th_dport, pd->ip_sum, + &th->th_sum, &nk->addr[pd->didx], + 
nk->port[pd->didx], 0, af); + dport = th->th_dport; + pd->dport = &th->th_dport; + } + rewrite++; + break; + case IPPROTO_UDP: + bproto_sum = pd->hdr.udp->uh_sum; + pd->proto_sum = &pd->hdr.udp->uh_sum; + + if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) || + nk->port[pd->sidx] != sport) { + pf_change_ap(m, saddr, &pd->hdr.udp->uh_sport, + pd->ip_sum, &pd->hdr.udp->uh_sum, + &nk->addr[pd->sidx], + nk->port[pd->sidx], 1, af); + sport = pd->hdr.udp->uh_sport; + pd->sport = &pd->hdr.udp->uh_sport; + } + + if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) || + nk->port[pd->didx] != dport) { + pf_change_ap(m, daddr, &pd->hdr.udp->uh_dport, + pd->ip_sum, &pd->hdr.udp->uh_sum, + &nk->addr[pd->didx], + nk->port[pd->didx], 1, af); + dport = pd->hdr.udp->uh_dport; + pd->dport = &pd->hdr.udp->uh_dport; + } + rewrite++; + break; +#ifdef INET + case IPPROTO_ICMP: + nk->port[0] = nk->port[1]; + if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET)) + pf_change_a(&saddr->v4.s_addr, pd->ip_sum, + nk->addr[pd->sidx].v4.s_addr, 0); + + if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET)) + pf_change_a(&daddr->v4.s_addr, pd->ip_sum, + nk->addr[pd->didx].v4.s_addr, 0); + + if (nk->port[1] != pd->hdr.icmp->icmp_id) { + pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( + pd->hdr.icmp->icmp_cksum, sport, + nk->port[1], 0); + pd->hdr.icmp->icmp_id = nk->port[1]; + pd->sport = &pd->hdr.icmp->icmp_id; + } + m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); + break; +#endif /* INET */ +#ifdef INET6 + case IPPROTO_ICMPV6: + nk->port[0] = nk->port[1]; + if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6)) + pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, + &nk->addr[pd->sidx], 0); + + if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6)) + pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, + &nk->addr[pd->didx], 0); + rewrite++; + break; +#endif /* INET6 */ + default: + switch (af) { +#ifdef INET + case AF_INET: + if (PF_ANEQ(saddr, + &nk->addr[pd->sidx], AF_INET)) + pf_change_a(&saddr->v4.s_addr, + pd->ip_sum, + nk->addr[pd->sidx].v4.s_addr, 0); + + if (PF_ANEQ(daddr, + &nk->addr[pd->didx], AF_INET)) + pf_change_a(&daddr->v4.s_addr, + pd->ip_sum, + nk->addr[pd->didx].v4.s_addr, 0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (PF_ANEQ(saddr, + &nk->addr[pd->sidx], AF_INET6)) + PF_ACPY(saddr, &nk->addr[pd->sidx], af); + + if (PF_ANEQ(daddr, + &nk->addr[pd->didx], AF_INET6)) + PF_ACPY(daddr, &nk->addr[pd->didx], af); + break; +#endif /* INET6 */ + } + break; + } + if (nr->natpass) + r = NULL; + pd->nat_rule = nr; + } + + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, saddr, af, + r->src.neg, kif, M_GETFIB(m))) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + /* tcp/udp only. port_op always 0 in other cases */ + else if (r->src.port_op && !pf_match_port(r->src.port_op, + r->src.port[0], r->src.port[1], sport)) + r = r->skip[PF_SKIP_SRC_PORT].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, + r->dst.neg, NULL, M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + /* tcp/udp only. port_op always 0 in other cases */ + else if (r->dst.port_op && !pf_match_port(r->dst.port_op, + r->dst.port[0], r->dst.port[1], dport)) + r = r->skip[PF_SKIP_DST_PORT].ptr; + /* icmp only. 
type always 0 in other cases */ + else if (r->type && r->type != icmptype + 1) + r = TAILQ_NEXT(r, entries); + /* icmp only. code always 0 in other cases */ + else if (r->code && r->code != icmpcode + 1) + r = TAILQ_NEXT(r, entries); + else if (r->tos && !(r->tos == pd->tos)) + r = TAILQ_NEXT(r, entries); + else if (r->rule_flag & PFRULE_FRAGMENT) + r = TAILQ_NEXT(r, entries); + else if (pd->proto == IPPROTO_TCP && + (r->flagset & th->th_flags) != r->flags) + r = TAILQ_NEXT(r, entries); + /* tcp/udp only. uid.op always 0 in other cases */ + else if (r->uid.op && (pd->lookup.done || (pd->lookup.done = + pf_socket_lookup(direction, pd, m), 1)) && + !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1], + pd->lookup.uid)) + r = TAILQ_NEXT(r, entries); + /* tcp/udp only. gid.op always 0 in other cases */ + else if (r->gid.op && (pd->lookup.done || (pd->lookup.done = + pf_socket_lookup(direction, pd, m), 1)) && + !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1], + pd->lookup.gid)) + r = TAILQ_NEXT(r, entries); + else if (r->prio && + !pf_match_ieee8021q_pcp(r->prio, m)) + r = TAILQ_NEXT(r, entries); + else if (r->prob && + r->prob <= arc4random()) + r = TAILQ_NEXT(r, entries); + else if (r->match_tag && !pf_match_tag(m, r, &tag, + pd->pf_mtag ? pd->pf_mtag->tag : 0)) + r = TAILQ_NEXT(r, entries); + else if (r->os_fingerprint != PF_OSFP_ANY && + (pd->proto != IPPROTO_TCP || !pf_osfp_match( + pf_osfp_fingerprint(pd, m, off, th), + r->os_fingerprint))) + r = TAILQ_NEXT(r, entries); + else { + if (r->tag) + tag = r->tag; + if (r->rtableid >= 0) + rtableid = r->rtableid; + if (r->anchor == NULL) { + match = 1; + *rm = r; + *am = a; + *rsm = ruleset; + if ((*rm)->quick) + break; + r = TAILQ_NEXT(r, entries); + } else + pf_step_into_anchor(anchor_stack, &asd, + &ruleset, PF_RULESET_FILTER, &r, &a, + &match); + } + if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd, + &ruleset, PF_RULESET_FILTER, &r, &a, &match)) + break; + } + r = *rm; + a = *am; + ruleset = *rsm; + + REASON_SET(&reason, PFRES_MATCH); + + if (r->log || (nr != NULL && nr->log)) { + if (rewrite) + m_copyback(m, off, hdrlen, pd->hdr.any); + PFLOG_PACKET(kif, m, af, direction, reason, r->log ? 
r : nr, a, + ruleset, pd, 1); + } + + if ((r->action == PF_DROP) && + ((r->rule_flag & PFRULE_RETURNRST) || + (r->rule_flag & PFRULE_RETURNICMP) || + (r->rule_flag & PFRULE_RETURN))) { + /* undo NAT changes, if they have taken place */ + if (nr != NULL) { + PF_ACPY(saddr, &sk->addr[pd->sidx], af); + PF_ACPY(daddr, &sk->addr[pd->didx], af); + if (pd->sport) + *pd->sport = sk->port[pd->sidx]; + if (pd->dport) + *pd->dport = sk->port[pd->didx]; + if (pd->proto_sum) + *pd->proto_sum = bproto_sum; + if (pd->ip_sum) + *pd->ip_sum = bip_sum; + m_copyback(m, off, hdrlen, pd->hdr.any); + } + if (pd->proto == IPPROTO_TCP && + ((r->rule_flag & PFRULE_RETURNRST) || + (r->rule_flag & PFRULE_RETURN)) && + !(th->th_flags & TH_RST)) { + u_int32_t ack = ntohl(th->th_seq) + pd->p_len; + int len = 0; +#ifdef INET + struct ip *h4; +#endif +#ifdef INET6 + struct ip6_hdr *h6; +#endif + + switch (af) { +#ifdef INET + case AF_INET: + h4 = mtod(m, struct ip *); + len = ntohs(h4->ip_len) - off; + break; +#endif +#ifdef INET6 + case AF_INET6: + h6 = mtod(m, struct ip6_hdr *); + len = ntohs(h6->ip6_plen) - (off - sizeof(*h6)); + break; +#endif + } + + if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af)) + REASON_SET(&reason, PFRES_PROTCKSUM); + else { + if (th->th_flags & TH_SYN) + ack++; + if (th->th_flags & TH_FIN) + ack++; + pf_send_tcp(m, r, af, pd->dst, + pd->src, th->th_dport, th->th_sport, + ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0, + r->return_ttl, 1, 0, kif->pfik_ifp); + } + } else if (pd->proto != IPPROTO_ICMP && af == AF_INET && + r->return_icmp) + pf_send_icmp(m, r->return_icmp >> 8, + r->return_icmp & 255, af, r); + else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 && + r->return_icmp6) + pf_send_icmp(m, r->return_icmp6 >> 8, + r->return_icmp6 & 255, af, r); + } + + if (r->action == PF_DROP) + goto cleanup; + + if (tag > 0 && pf_tag_packet(m, pd, tag)) { + REASON_SET(&reason, PFRES_MEMORY); + goto cleanup; + } + if (rtableid >= 0) + M_SETFIB(m, rtableid); + + if (!state_icmp && (r->keep_state || nr != NULL || + (pd->flags & PFDESC_TCP_NORM))) { + int action; + action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off, + sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum, + hdrlen); + if (action != PF_PASS) + return (action); + } else { + if (sk != NULL) + uma_zfree(V_pf_state_key_z, sk); + if (nk != NULL) + uma_zfree(V_pf_state_key_z, nk); + } + + /* copy back packet headers if we performed NAT operations */ + if (rewrite) + m_copyback(m, off, hdrlen, pd->hdr.any); + + if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) && + direction == PF_OUT && + pfsync_defer_ptr != NULL && pfsync_defer_ptr(*sm, m)) + /* + * We want the state created, but we don't + * want to send this in case a partner + * firewall has to know about it to allow + * replies through it. 
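+ * (pfsync holds the deferred packet until the state update has gone + * out to the peer or a short timeout fires, then releases it.)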
+ */ + return (PF_DEFER); + + return (PF_PASS); + +cleanup: + if (sk != NULL) + uma_zfree(V_pf_state_key_z, sk); + if (nk != NULL) + uma_zfree(V_pf_state_key_z, nk); + return (PF_DROP); +} + +static int +pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a, + struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk, + struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport, + u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm, + int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen) +{ + struct pf_state *s = NULL; + struct pf_src_node *sn = NULL; + struct tcphdr *th = pd->hdr.tcp; + u_int16_t mss = V_tcp_mssdflt; + u_short reason; + + /* check maximums */ + if (r->max_states && + (counter_u64_fetch(r->states_cur) >= r->max_states)) { + counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1); + REASON_SET(&reason, PFRES_MAXSTATES); + return (PF_DROP); + } + /* src node for filter rule */ + if ((r->rule_flag & PFRULE_SRCTRACK || + r->rpool.opts & PF_POOL_STICKYADDR) && + pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) { + REASON_SET(&reason, PFRES_SRCLIMIT); + goto csfailed; + } + /* src node for translation rule */ + if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && + pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) { + REASON_SET(&reason, PFRES_SRCLIMIT); + goto csfailed; + } + s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO); + if (s == NULL) { + REASON_SET(&reason, PFRES_MEMORY); + goto csfailed; + } + s->rule.ptr = r; + s->nat_rule.ptr = nr; + s->anchor.ptr = a; + STATE_INC_COUNTERS(s); + if (r->allow_opts) + s->state_flags |= PFSTATE_ALLOWOPTS; + if (r->rule_flag & PFRULE_STATESLOPPY) + s->state_flags |= PFSTATE_SLOPPY; + s->log = r->log & PF_LOG_ALL; + s->sync_state = PFSYNC_S_NONE; + if (nr != NULL) + s->log |= nr->log & PF_LOG_ALL; + switch (pd->proto) { + case IPPROTO_TCP: + s->src.seqlo = ntohl(th->th_seq); + s->src.seqhi = s->src.seqlo + pd->p_len + 1; + if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && + r->keep_state == PF_STATE_MODULATE) { + /* Generate sequence number modulator */ + if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) == + 0) + s->src.seqdiff = 1; + pf_change_proto_a(m, &th->th_seq, &th->th_sum, + htonl(s->src.seqlo + s->src.seqdiff), 0); + *rewrite = 1; + } else + s->src.seqdiff = 0; + if (th->th_flags & TH_SYN) { + s->src.seqhi++; + s->src.wscale = pf_get_wscale(m, off, + th->th_off, pd->af); + } + s->src.max_win = MAX(ntohs(th->th_win), 1); + if (s->src.wscale & PF_WSCALE_MASK) { + /* Remove scale factor from initial window */ + int win = s->src.max_win; + win += 1 << (s->src.wscale & PF_WSCALE_MASK); + s->src.max_win = (win - 1) >> + (s->src.wscale & PF_WSCALE_MASK); + } + if (th->th_flags & TH_FIN) + s->src.seqhi++; + s->dst.seqhi = 1; + s->dst.max_win = 1; + s->src.state = TCPS_SYN_SENT; + s->dst.state = TCPS_CLOSED; + s->timeout = PFTM_TCP_FIRST_PACKET; + break; + case IPPROTO_UDP: + s->src.state = PFUDPS_SINGLE; + s->dst.state = PFUDPS_NO_TRAFFIC; + s->timeout = PFTM_UDP_FIRST_PACKET; + break; + case IPPROTO_ICMP: +#ifdef INET6 + case IPPROTO_ICMPV6: +#endif + s->timeout = PFTM_ICMP_FIRST_PACKET; + break; + default: + s->src.state = PFOTHERS_SINGLE; + s->dst.state = PFOTHERS_NO_TRAFFIC; + s->timeout = PFTM_OTHER_FIRST_PACKET; + } + + if (r->rt && r->rt != PF_FASTROUTE) { + if (pf_map_addr(pd->af, r, pd->src, &s->rt_addr, NULL, &sn)) { + REASON_SET(&reason, PFRES_MAPFAILED); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + uma_zfree(V_pf_state_z, s); + 
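+ /* csfailed below also releases sk/nk and the source node refs taken above */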
goto csfailed; + } + s->rt_kif = r->rpool.cur->kif; + } + + s->creation = time_uptime; + s->expire = time_uptime; + + if (sn != NULL) + s->src_node = sn; + if (nsn != NULL) { + /* XXX We only modify one side for now. */ + PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af); + s->nat_src_node = nsn; + } + if (pd->proto == IPPROTO_TCP) { + if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m, + off, pd, th, &s->src, &s->dst)) { + REASON_SET(&reason, PFRES_MEMORY); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + uma_zfree(V_pf_state_z, s); + return (PF_DROP); + } + if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub && + pf_normalize_tcp_stateful(m, off, pd, &reason, th, s, + &s->src, &s->dst, rewrite)) { + /* This really shouldn't happen!!! */ + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_normalize_tcp_stateful failed on first pkt")); + pf_normalize_tcp_cleanup(s); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + uma_zfree(V_pf_state_z, s); + return (PF_DROP); + } + } + s->direction = pd->dir; + + /* + * sk/nk may already have been set up by pf_get_translation(). + */ + if (nr == NULL) { + KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p", + __func__, nr, sk, nk)); + sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport); + if (sk == NULL) + goto csfailed; + nk = sk; + } else + KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p", + __func__, nr, sk, nk)); + + /* Swap sk/nk for PF_OUT. */ + if (pf_state_insert(BOUND_IFACE(r, kif), + (pd->dir == PF_IN) ? sk : nk, + (pd->dir == PF_IN) ? nk : sk, s)) { + if (pd->proto == IPPROTO_TCP) + pf_normalize_tcp_cleanup(s); + REASON_SET(&reason, PFRES_STATEINS); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + uma_zfree(V_pf_state_z, s); + return (PF_DROP); + } else + *sm = s; + + if (tag > 0) + s->tag = tag; + if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) == + TH_SYN && r->keep_state == PF_STATE_SYNPROXY) { + s->src.state = PF_TCPS_PROXY_SRC; + /* undo NAT changes, if they have taken place */ + if (nr != NULL) { + struct pf_state_key *skt = s->key[PF_SK_WIRE]; + if (pd->dir == PF_OUT) + skt = s->key[PF_SK_STACK]; + PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af); + PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af); + if (pd->sport) + *pd->sport = skt->port[pd->sidx]; + if (pd->dport) + *pd->dport = skt->port[pd->didx]; + if (pd->proto_sum) + *pd->proto_sum = bproto_sum; + if (pd->ip_sum) + *pd->ip_sum = bip_sum; + m_copyback(m, off, hdrlen, pd->hdr.any); + } + s->src.seqhi = htonl(arc4random()); + /* Find mss option */ + int rtid = M_GETFIB(m); + mss = pf_get_mss(m, off, th->th_off, pd->af); + mss = pf_calc_mss(pd->src, pd->af, rtid, mss); + mss = pf_calc_mss(pd->dst, pd->af, rtid, mss); + s->src.mss = mss; + pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport, + th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1, + TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL); + REASON_SET(&reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } + + return (PF_PASS); + +csfailed: + if (sk != NULL) + uma_zfree(V_pf_state_key_z, sk); + if (nk != NULL) + uma_zfree(V_pf_state_key_z, nk); + + if (sn != NULL) { + struct pf_srchash *sh; + + sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)]; + PF_HASHROW_LOCK(sh); + if (--sn->states == 0 && sn->expire == 0) { + pf_unlink_src_node(sn); + uma_zfree(V_pf_sources_z, sn); + counter_u64_add( + V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1); + } + PF_HASHROW_UNLOCK(sh); + } + + if (nsn != sn && nsn != NULL) { + struct pf_srchash *sh; + + sh = 
&V_pf_srchash[pf_hashsrc(&nsn->addr, nsn->af)]; + PF_HASHROW_LOCK(sh); + if (--nsn->states == 0 && nsn->expire == 0) { + pf_unlink_src_node(nsn); + uma_zfree(V_pf_sources_z, nsn); + counter_u64_add( + V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1); + } + PF_HASHROW_UNLOCK(sh); + } + + return (PF_DROP); +} + +static int +pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif, + struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am, + struct pf_ruleset **rsm) +{ + struct pf_rule *r, *a = NULL; + struct pf_ruleset *ruleset = NULL; + sa_family_t af = pd->af; + u_short reason; + int tag = -1; + int asd = 0; + int match = 0; + struct pf_anchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE]; + + PF_RULES_RASSERT(); + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, + r->src.neg, kif, M_GETFIB(m))) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, + r->dst.neg, NULL, M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->tos && !(r->tos == pd->tos)) + r = TAILQ_NEXT(r, entries); + else if (r->os_fingerprint != PF_OSFP_ANY) + r = TAILQ_NEXT(r, entries); + else if (pd->proto == IPPROTO_UDP && + (r->src.port_op || r->dst.port_op)) + r = TAILQ_NEXT(r, entries); + else if (pd->proto == IPPROTO_TCP && + (r->src.port_op || r->dst.port_op || r->flagset)) + r = TAILQ_NEXT(r, entries); + else if ((pd->proto == IPPROTO_ICMP || + pd->proto == IPPROTO_ICMPV6) && + (r->type || r->code)) + r = TAILQ_NEXT(r, entries); + else if (r->prio && + !pf_match_ieee8021q_pcp(r->prio, m)) + r = TAILQ_NEXT(r, entries); + else if (r->prob && r->prob <= + (arc4random() % (UINT_MAX - 1) + 1)) + r = TAILQ_NEXT(r, entries); + else if (r->match_tag && !pf_match_tag(m, r, &tag, + pd->pf_mtag ? 
pd->pf_mtag->tag : 0)) + r = TAILQ_NEXT(r, entries); + else { + if (r->anchor == NULL) { + match = 1; + *rm = r; + *am = a; + *rsm = ruleset; + if ((*rm)->quick) + break; + r = TAILQ_NEXT(r, entries); + } else + pf_step_into_anchor(anchor_stack, &asd, + &ruleset, PF_RULESET_FILTER, &r, &a, + &match); + } + if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd, + &ruleset, PF_RULESET_FILTER, &r, &a, &match)) + break; + } + r = *rm; + a = *am; + ruleset = *rsm; + + REASON_SET(&reason, PFRES_MATCH); + + if (r->log) + PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd, + 1); + + if (r->action != PF_PASS) + return (PF_DROP); + + if (tag > 0 && pf_tag_packet(m, pd, tag)) { + REASON_SET(&reason, PFRES_MEMORY); + return (PF_DROP); + } + + return (PF_PASS); +} + +static int +pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst, + struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off, + struct pf_pdesc *pd, u_short *reason, int *copyback) +{ + struct tcphdr *th = pd->hdr.tcp; + u_int16_t win = ntohs(th->th_win); + u_int32_t ack, end, seq, orig_seq; + u_int8_t sws, dws; + int ackskew; + + if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) { + sws = src->wscale & PF_WSCALE_MASK; + dws = dst->wscale & PF_WSCALE_MASK; + } else + sws = dws = 0; + + /* + * Sequence tracking algorithm from Guido van Rooij's paper: + * http://www.madison-gurkha.com/publications/tcp_filtering/ + * tcp_filtering.ps + */ + + orig_seq = seq = ntohl(th->th_seq); + if (src->seqlo == 0) { + /* First packet from this end. Set its state */ + + if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) && + src->scrub == NULL) { + if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) { + REASON_SET(reason, PFRES_MEMORY); + return (PF_DROP); + } + } + + /* Deferred generation of sequence number modulator */ + if (dst->seqdiff && !src->seqdiff) { + /* use random iss for the TCP server */ + while ((src->seqdiff = arc4random() - seq) == 0) + ; + ack = ntohl(th->th_ack) - dst->seqdiff; + pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq + + src->seqdiff), 0); + pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0); + *copyback = 1; + } else { + ack = ntohl(th->th_ack); + } + + end = seq + pd->p_len; + if (th->th_flags & TH_SYN) { + end++; + if (dst->wscale & PF_WSCALE_FLAG) { + src->wscale = pf_get_wscale(m, off, th->th_off, + pd->af); + if (src->wscale & PF_WSCALE_FLAG) { + /* Remove scale factor from initial + * window */ + sws = src->wscale & PF_WSCALE_MASK; + win = ((u_int32_t)win + (1 << sws) - 1) + >> sws; + dws = dst->wscale & PF_WSCALE_MASK; + } else { + /* fixup other window */ + dst->max_win <<= dst->wscale & + PF_WSCALE_MASK; + /* in case of a retrans SYN|ACK */ + dst->wscale = 0; + } + } + } + if (th->th_flags & TH_FIN) + end++; + + src->seqlo = seq; + if (src->state < TCPS_SYN_SENT) + src->state = TCPS_SYN_SENT; + + /* + * May need to slide the window (seqhi may have been set by + * the crappy stack check or if we picked up the connection + * after establishment) + */ + if (src->seqhi == 1 || + SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) + src->seqhi = end + MAX(1, dst->max_win << dws); + if (win > src->max_win) + src->max_win = win; + + } else { + ack = ntohl(th->th_ack) - dst->seqdiff; + if (src->seqdiff) { + /* Modulate sequence numbers */ + pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq + + src->seqdiff), 0); + pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0); + *copyback = 1; + } + end = seq + pd->p_len; + if (th->th_flags & 
TH_SYN) + end++; + if (th->th_flags & TH_FIN) + end++; + } + + if ((th->th_flags & TH_ACK) == 0) { + /* Let it pass through the ack skew check */ + ack = dst->seqlo; + } else if ((ack == 0 && + (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) || + /* broken tcp stacks do not set ack */ + (dst->state < TCPS_SYN_SENT)) { + /* + * Many stacks (ours included) will set the ACK number in an + * FIN|ACK if the SYN times out -- no sequence to ACK. + */ + ack = dst->seqlo; + } + + if (seq == end) { + /* Ease sequencing restrictions on no data packets */ + seq = src->seqlo; + end = seq; + } + + ackskew = dst->seqlo - ack; + + + /* + * Need to demodulate the sequence numbers in any TCP SACK options + * (Selective ACK). We could optionally validate the SACK values + * against the current ACK window, either forwards or backwards, but + * I'm not confident that SACK has been implemented properly + * everywhere. It wouldn't surprise me if several stacks accidentally + * SACK too far backwards of previously ACKed data. There really aren't + * any security implications of bad SACKing unless the target stack + * doesn't validate the option length correctly. Someone trying to + * spoof into a TCP connection won't bother blindly sending SACK + * options anyway. + */ + if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) { + if (pf_modulate_sack(m, off, pd, th, dst)) + *copyback = 1; + } + + +#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ + if (SEQ_GEQ(src->seqhi, end) && + /* Last octet inside other's window space */ + SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) && + /* Retrans: not more than one window back */ + (ackskew >= -MAXACKWINDOW) && + /* Acking not more than one reassembled fragment backwards */ + (ackskew <= (MAXACKWINDOW << sws)) && + /* Acking not more than one window forward */ + ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo || + (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) || + (pd->flags & PFDESC_IP_REAS) == 0)) { + /* Require an exact/+1 sequence match on resets when possible */ + + if (dst->scrub || src->scrub) { + if (pf_normalize_tcp_stateful(m, off, pd, reason, th, + *state, src, dst, copyback)) + return (PF_DROP); + } + + /* update max window */ + if (src->max_win < win) + src->max_win = win; + /* synchronize sequencing */ + if (SEQ_GT(end, src->seqlo)) + src->seqlo = end; + /* slide the window of what the other end can send */ + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) + dst->seqhi = ack + MAX((win << sws), 1); + + + /* update states */ + if (th->th_flags & TH_SYN) + if (src->state < TCPS_SYN_SENT) + src->state = TCPS_SYN_SENT; + if (th->th_flags & TH_FIN) + if (src->state < TCPS_CLOSING) + src->state = TCPS_CLOSING; + if (th->th_flags & TH_ACK) { + if (dst->state == TCPS_SYN_SENT) { + dst->state = TCPS_ESTABLISHED; + if (src->state == TCPS_ESTABLISHED && + (*state)->src_node != NULL && + pf_src_connlimit(state)) { + REASON_SET(reason, PFRES_SRCLIMIT); + return (PF_DROP); + } + } else if (dst->state == TCPS_CLOSING) + dst->state = TCPS_FIN_WAIT_2; + } + if (th->th_flags & TH_RST) + src->state = dst->state = TCPS_TIME_WAIT; + + /* update expire time */ + (*state)->expire = time_uptime; + if (src->state >= TCPS_FIN_WAIT_2 && + dst->state >= TCPS_FIN_WAIT_2) + (*state)->timeout = PFTM_TCP_CLOSED; + else if (src->state >= TCPS_CLOSING && + dst->state >= TCPS_CLOSING) + (*state)->timeout = PFTM_TCP_FIN_WAIT; + else if (src->state < TCPS_ESTABLISHED || + dst->state < TCPS_ESTABLISHED) + (*state)->timeout = 
PFTM_TCP_OPENING; + else if (src->state >= TCPS_CLOSING || + dst->state >= TCPS_CLOSING) + (*state)->timeout = PFTM_TCP_CLOSING; + else + (*state)->timeout = PFTM_TCP_ESTABLISHED; + + /* Fall through to PASS packet */ + + } else if ((dst->state < TCPS_SYN_SENT || + dst->state >= TCPS_FIN_WAIT_2 || + src->state >= TCPS_FIN_WAIT_2) && + SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) && + /* Within a window forward of the originating packet */ + SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { + /* Within a window backward of the originating packet */ + + /* + * This currently handles three situations: + * 1) Stupid stacks will shotgun SYNs before their peer + * replies. + * 2) When PF catches an already established stream (the + * firewall rebooted, the state table was flushed, routes + * changed...) + * 3) Packets get funky immediately after the connection + * closes (this should catch Solaris spurious ACK|FINs + * that web servers like to spew after a close) + * + * This must be a little more careful than the above code + * since packet floods will also be caught here. We don't + * update the TTL here to mitigate the damage of a packet + * flood and so the same code can handle awkward establishment + * and a loosened connection close. + * In the establishment case, a correct peer response will + * validate the connection, go through the normal state code + * and keep updating the state TTL. + */ + + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: loose state match: "); + pf_print_state(*state); + pf_print_flags(th->th_flags); + printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " + "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, + pd->p_len, ackskew, (unsigned long long)(*state)->packets[0], + (unsigned long long)(*state)->packets[1], + pd->dir == PF_IN ? "in" : "out", + pd->dir == (*state)->direction ? "fwd" : "rev"); + } + + if (dst->scrub || src->scrub) { + if (pf_normalize_tcp_stateful(m, off, pd, reason, th, + *state, src, dst, copyback)) + return (PF_DROP); + } + + /* update max window */ + if (src->max_win < win) + src->max_win = win; + /* synchronize sequencing */ + if (SEQ_GT(end, src->seqlo)) + src->seqlo = end; + /* slide the window of what the other end can send */ + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) + dst->seqhi = ack + MAX((win << sws), 1); + + /* + * Cannot set dst->seqhi here since this could be a shotgunned + * SYN and not an already established connection. + */ + + if (th->th_flags & TH_FIN) + if (src->state < TCPS_CLOSING) + src->state = TCPS_CLOSING; + if (th->th_flags & TH_RST) + src->state = dst->state = TCPS_TIME_WAIT; + + /* Fall through to PASS packet */ + + } else { + if ((*state)->dst.state == TCPS_SYN_SENT && + (*state)->src.state == TCPS_SYN_SENT) { + /* Send RST for state mismatches during handshake */ + if (!(th->th_flags & TH_RST)) + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, + pd->dst, pd->src, th->th_dport, + th->th_sport, ntohl(th->th_ack), 0, + TH_RST, 0, 0, + (*state)->rule.ptr->return_ttl, 1, 0, + kif->pfik_ifp); + src->seqlo = 0; + src->seqhi = 1; + src->max_win = 1; + } else if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: BAD state: "); + pf_print_state(*state); + pf_print_flags(th->th_flags); + printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " + "pkts=%llu:%llu dir=%s,%s\n", + seq, orig_seq, ack, pd->p_len, ackskew, + (unsigned long long)(*state)->packets[0], + (unsigned long long)(*state)->packets[1], + pd->dir == PF_IN ? "in" : "out", + pd->dir == (*state)->direction ? 
"fwd" : "rev"); + printf("pf: State failure on: %c %c %c %c | %c %c\n", + SEQ_GEQ(src->seqhi, end) ? ' ' : '1', + SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ? + ' ': '2', + (ackskew >= -MAXACKWINDOW) ? ' ' : '3', + (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4', + SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5', + SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6'); + } + REASON_SET(reason, PFRES_BADSTATE); + return (PF_DROP); + } + + return (PF_PASS); +} + +static int +pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst, + struct pf_state **state, struct pf_pdesc *pd, u_short *reason) +{ + struct tcphdr *th = pd->hdr.tcp; + + if (th->th_flags & TH_SYN) + if (src->state < TCPS_SYN_SENT) + src->state = TCPS_SYN_SENT; + if (th->th_flags & TH_FIN) + if (src->state < TCPS_CLOSING) + src->state = TCPS_CLOSING; + if (th->th_flags & TH_ACK) { + if (dst->state == TCPS_SYN_SENT) { + dst->state = TCPS_ESTABLISHED; + if (src->state == TCPS_ESTABLISHED && + (*state)->src_node != NULL && + pf_src_connlimit(state)) { + REASON_SET(reason, PFRES_SRCLIMIT); + return (PF_DROP); + } + } else if (dst->state == TCPS_CLOSING) { + dst->state = TCPS_FIN_WAIT_2; + } else if (src->state == TCPS_SYN_SENT && + dst->state < TCPS_SYN_SENT) { + /* + * Handle a special sloppy case where we only see one + * half of the connection. If there is a ACK after + * the initial SYN without ever seeing a packet from + * the destination, set the connection to established. + */ + dst->state = src->state = TCPS_ESTABLISHED; + if ((*state)->src_node != NULL && + pf_src_connlimit(state)) { + REASON_SET(reason, PFRES_SRCLIMIT); + return (PF_DROP); + } + } else if (src->state == TCPS_CLOSING && + dst->state == TCPS_ESTABLISHED && + dst->seqlo == 0) { + /* + * Handle the closing of half connections where we + * don't see the full bidirectional FIN/ACK+ACK + * handshake. 
+ */ + dst->state = TCPS_CLOSING; + } + } + if (th->th_flags & TH_RST) + src->state = dst->state = TCPS_TIME_WAIT; + + /* update expire time */ + (*state)->expire = time_uptime; + if (src->state >= TCPS_FIN_WAIT_2 && + dst->state >= TCPS_FIN_WAIT_2) + (*state)->timeout = PFTM_TCP_CLOSED; + else if (src->state >= TCPS_CLOSING && + dst->state >= TCPS_CLOSING) + (*state)->timeout = PFTM_TCP_FIN_WAIT; + else if (src->state < TCPS_ESTABLISHED || + dst->state < TCPS_ESTABLISHED) + (*state)->timeout = PFTM_TCP_OPENING; + else if (src->state >= TCPS_CLOSING || + dst->state >= TCPS_CLOSING) + (*state)->timeout = PFTM_TCP_CLOSING; + else + (*state)->timeout = PFTM_TCP_ESTABLISHED; + + return (PF_PASS); +} + +static int +pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, + struct mbuf *m, int off, void *h, struct pf_pdesc *pd, + u_short *reason) +{ + struct pf_state_key_cmp key; + struct tcphdr *th = pd->hdr.tcp; + int copyback = 0; + struct pf_state_peer *src, *dst; + struct pf_state_key *sk; + + bzero(&key, sizeof(key)); + key.af = pd->af; + key.proto = IPPROTO_TCP; + if (direction == PF_IN) { /* wire side, straight */ + PF_ACPY(&key.addr[0], pd->src, key.af); + PF_ACPY(&key.addr[1], pd->dst, key.af); + key.port[0] = th->th_sport; + key.port[1] = th->th_dport; + } else { /* stack side, reverse */ + PF_ACPY(&key.addr[1], pd->src, key.af); + PF_ACPY(&key.addr[0], pd->dst, key.af); + key.port[1] = th->th_sport; + key.port[0] = th->th_dport; + } + + STATE_LOOKUP(kif, &key, direction, *state, pd); + + if (direction == (*state)->direction) { + src = &(*state)->src; + dst = &(*state)->dst; + } else { + src = &(*state)->dst; + dst = &(*state)->src; + } + + sk = (*state)->key[pd->didx]; + + if ((*state)->src.state == PF_TCPS_PROXY_SRC) { + if (direction != (*state)->direction) { + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } + if (th->th_flags & TH_SYN) { + if (ntohl(th->th_seq) != (*state)->src.seqlo) { + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_DROP); + } + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, + pd->src, th->th_dport, th->th_sport, + (*state)->src.seqhi, ntohl(th->th_seq) + 1, + TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL); + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } else if (!(th->th_flags & TH_ACK) || + (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || + (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_DROP); + } else if ((*state)->src_node != NULL && + pf_src_connlimit(state)) { + REASON_SET(reason, PFRES_SRCLIMIT); + return (PF_DROP); + } else + (*state)->src.state = PF_TCPS_PROXY_DST; + } + if ((*state)->src.state == PF_TCPS_PROXY_DST) { + if (direction == (*state)->direction) { + if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) || + (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || + (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_DROP); + } + (*state)->src.max_win = MAX(ntohs(th->th_win), 1); + if ((*state)->dst.seqhi == 1) + (*state)->dst.seqhi = htonl(arc4random()); + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, + &sk->addr[pd->sidx], &sk->addr[pd->didx], + sk->port[pd->sidx], sk->port[pd->didx], + (*state)->dst.seqhi, 0, TH_SYN, 0, + (*state)->src.mss, 0, 0, (*state)->tag, NULL); + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } else if (((th->th_flags & (TH_SYN|TH_ACK)) != + (TH_SYN|TH_ACK)) || + (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) { + 
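+ /* not the SYN|ACK the proxied handshake expects from the server */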
REASON_SET(reason, PFRES_SYNPROXY); + return (PF_DROP); + } else { + (*state)->dst.max_win = MAX(ntohs(th->th_win), 1); + (*state)->dst.seqlo = ntohl(th->th_seq); + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, + pd->src, th->th_dport, th->th_sport, + ntohl(th->th_ack), ntohl(th->th_seq) + 1, + TH_ACK, (*state)->src.max_win, 0, 0, 0, + (*state)->tag, NULL); + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, + &sk->addr[pd->sidx], &sk->addr[pd->didx], + sk->port[pd->sidx], sk->port[pd->didx], + (*state)->src.seqhi + 1, (*state)->src.seqlo + 1, + TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL); + (*state)->src.seqdiff = (*state)->dst.seqhi - + (*state)->src.seqlo; + (*state)->dst.seqdiff = (*state)->src.seqhi - + (*state)->dst.seqlo; + (*state)->src.seqhi = (*state)->src.seqlo + + (*state)->dst.max_win; + (*state)->dst.seqhi = (*state)->dst.seqlo + + (*state)->src.max_win; + (*state)->src.wscale = (*state)->dst.wscale = 0; + (*state)->src.state = (*state)->dst.state = + TCPS_ESTABLISHED; + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } + } + + if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) && + dst->state >= TCPS_FIN_WAIT_2 && + src->state >= TCPS_FIN_WAIT_2) { + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: state reuse "); + pf_print_state(*state); + pf_print_flags(th->th_flags); + printf("\n"); + } + /* XXX make sure it's the same direction ?? */ + (*state)->src.state = (*state)->dst.state = TCPS_CLOSED; + pf_unlink_state(*state, PF_ENTER_LOCKED); + *state = NULL; + return (PF_DROP); + } + + if ((*state)->state_flags & PFSTATE_SLOPPY) { + if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP) + return (PF_DROP); + } else { + if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason, + ©back) == PF_DROP) + return (PF_DROP); + } + + /* translate source/destination address, if necessary */ + if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { + struct pf_state_key *nk = (*state)->key[pd->didx]; + + if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) || + nk->port[pd->sidx] != th->th_sport) + pf_change_ap(m, pd->src, &th->th_sport, + pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx], + nk->port[pd->sidx], 0, pd->af); + + if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) || + nk->port[pd->didx] != th->th_dport) + pf_change_ap(m, pd->dst, &th->th_dport, + pd->ip_sum, &th->th_sum, &nk->addr[pd->didx], + nk->port[pd->didx], 0, pd->af); + copyback = 1; + } + + /* Copyback sequence modulation or stateful scrub changes if needed */ + if (copyback) + m_copyback(m, off, sizeof(*th), (caddr_t)th); + + return (PF_PASS); +} + +static int +pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, + struct mbuf *m, int off, void *h, struct pf_pdesc *pd) +{ + struct pf_state_peer *src, *dst; + struct pf_state_key_cmp key; + struct udphdr *uh = pd->hdr.udp; + + bzero(&key, sizeof(key)); + key.af = pd->af; + key.proto = IPPROTO_UDP; + if (direction == PF_IN) { /* wire side, straight */ + PF_ACPY(&key.addr[0], pd->src, key.af); + PF_ACPY(&key.addr[1], pd->dst, key.af); + key.port[0] = uh->uh_sport; + key.port[1] = uh->uh_dport; + } else { /* stack side, reverse */ + PF_ACPY(&key.addr[1], pd->src, key.af); + PF_ACPY(&key.addr[0], pd->dst, key.af); + key.port[1] = uh->uh_sport; + key.port[0] = uh->uh_dport; + } + + STATE_LOOKUP(kif, &key, direction, *state, pd); + + if (direction == (*state)->direction) { + src = &(*state)->src; + dst = &(*state)->dst; + } else { + src = &(*state)->dst; + dst = &(*state)->src; + } + + /* update 
states */ + if (src->state < PFUDPS_SINGLE) + src->state = PFUDPS_SINGLE; + if (dst->state == PFUDPS_SINGLE) + dst->state = PFUDPS_MULTIPLE; + + /* update expire time */ + (*state)->expire = time_uptime; + if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE) + (*state)->timeout = PFTM_UDP_MULTIPLE; + else + (*state)->timeout = PFTM_UDP_SINGLE; + + /* translate source/destination address, if necessary */ + if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { + struct pf_state_key *nk = (*state)->key[pd->didx]; + + if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) || + nk->port[pd->sidx] != uh->uh_sport) + pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum, + &uh->uh_sum, &nk->addr[pd->sidx], + nk->port[pd->sidx], 1, pd->af); + + if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) || + nk->port[pd->didx] != uh->uh_dport) + pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum, + &uh->uh_sum, &nk->addr[pd->didx], + nk->port[pd->didx], 1, pd->af); + m_copyback(m, off, sizeof(*uh), (caddr_t)uh); + } + + return (PF_PASS); +} + +static int +pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, + struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) +{ + struct pf_addr *saddr = pd->src, *daddr = pd->dst; + u_int16_t icmpid = 0, *icmpsum; + u_int8_t icmptype; + int state_icmp = 0; + struct pf_state_key_cmp key; + + bzero(&key, sizeof(key)); + switch (pd->proto) { +#ifdef INET + case IPPROTO_ICMP: + icmptype = pd->hdr.icmp->icmp_type; + icmpid = pd->hdr.icmp->icmp_id; + icmpsum = &pd->hdr.icmp->icmp_cksum; + + if (icmptype == ICMP_UNREACH || + icmptype == ICMP_SOURCEQUENCH || + icmptype == ICMP_REDIRECT || + icmptype == ICMP_TIMXCEED || + icmptype == ICMP_PARAMPROB) + state_icmp++; + break; +#endif /* INET */ +#ifdef INET6 + case IPPROTO_ICMPV6: + icmptype = pd->hdr.icmp6->icmp6_type; + icmpid = pd->hdr.icmp6->icmp6_id; + icmpsum = &pd->hdr.icmp6->icmp6_cksum; + + if (icmptype == ICMP6_DST_UNREACH || + icmptype == ICMP6_PACKET_TOO_BIG || + icmptype == ICMP6_TIME_EXCEEDED || + icmptype == ICMP6_PARAM_PROB) + state_icmp++; + break; +#endif /* INET6 */ + } + + if (!state_icmp) { + + /* + * ICMP query/reply message not related to a TCP/UDP packet. + * Search for an ICMP state. 
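+ * The ICMP id is stored in both port slots of the state key, so a + * query and its reply hash to the same state entry.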
+ */ + key.af = pd->af; + key.proto = pd->proto; + key.port[0] = key.port[1] = icmpid; + if (direction == PF_IN) { /* wire side, straight */ + PF_ACPY(&key.addr[0], pd->src, key.af); + PF_ACPY(&key.addr[1], pd->dst, key.af); + } else { /* stack side, reverse */ + PF_ACPY(&key.addr[1], pd->src, key.af); + PF_ACPY(&key.addr[0], pd->dst, key.af); + } + + STATE_LOOKUP(kif, &key, direction, *state, pd); + + (*state)->expire = time_uptime; + (*state)->timeout = PFTM_ICMP_ERROR_REPLY; + + /* translate source/destination address, if necessary */ + if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { + struct pf_state_key *nk = (*state)->key[pd->didx]; + + switch (pd->af) { +#ifdef INET + case AF_INET: + if (PF_ANEQ(pd->src, + &nk->addr[pd->sidx], AF_INET)) + pf_change_a(&saddr->v4.s_addr, + pd->ip_sum, + nk->addr[pd->sidx].v4.s_addr, 0); + + if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], + AF_INET)) + pf_change_a(&daddr->v4.s_addr, + pd->ip_sum, + nk->addr[pd->didx].v4.s_addr, 0); + + if (nk->port[0] != + pd->hdr.icmp->icmp_id) { + pd->hdr.icmp->icmp_cksum = + pf_cksum_fixup( + pd->hdr.icmp->icmp_cksum, icmpid, + nk->port[pd->sidx], 0); + pd->hdr.icmp->icmp_id = + nk->port[pd->sidx]; + } + + m_copyback(m, off, ICMP_MINLEN, + (caddr_t )pd->hdr.icmp); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (PF_ANEQ(pd->src, + &nk->addr[pd->sidx], AF_INET6)) + pf_change_a6(saddr, + &pd->hdr.icmp6->icmp6_cksum, + &nk->addr[pd->sidx], 0); + + if (PF_ANEQ(pd->dst, + &nk->addr[pd->didx], AF_INET6)) + pf_change_a6(daddr, + &pd->hdr.icmp6->icmp6_cksum, + &nk->addr[pd->didx], 0); + + m_copyback(m, off, sizeof(struct icmp6_hdr), + (caddr_t )pd->hdr.icmp6); + break; +#endif /* INET6 */ + } + } + return (PF_PASS); + + } else { + /* + * ICMP error message in response to a TCP/UDP packet. + * Extract the inner TCP/UDP header and search for that state. + */ + + struct pf_pdesc pd2; + bzero(&pd2, sizeof pd2); +#ifdef INET + struct ip h2; +#endif /* INET */ +#ifdef INET6 + struct ip6_hdr h2_6; + int terminal = 0; +#endif /* INET6 */ + int ipoff2 = 0; + int off2 = 0; + + pd2.af = pd->af; + /* Payload packet is from the opposite direction. */ + pd2.sidx = (direction == PF_IN) ? 1 : 0; + pd2.didx = (direction == PF_IN) ? 
0 : 1; + switch (pd->af) { +#ifdef INET + case AF_INET: + /* offset of h2 in mbuf chain */ + ipoff2 = off + ICMP_MINLEN; + + if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2), + NULL, reason, pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(ip)\n")); + return (PF_DROP); + } + /* + * ICMP error messages don't refer to non-first + * fragments + */ + if (h2.ip_off & htons(IP_OFFMASK)) { + REASON_SET(reason, PFRES_FRAG); + return (PF_DROP); + } + + /* offset of protocol header that follows h2 */ + off2 = ipoff2 + (h2.ip_hl << 2); + + pd2.proto = h2.ip_p; + pd2.src = (struct pf_addr *)&h2.ip_src; + pd2.dst = (struct pf_addr *)&h2.ip_dst; + pd2.ip_sum = &h2.ip_sum; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + ipoff2 = off + sizeof(struct icmp6_hdr); + + if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6), + NULL, reason, pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(ip6)\n")); + return (PF_DROP); + } + pd2.proto = h2_6.ip6_nxt; + pd2.src = (struct pf_addr *)&h2_6.ip6_src; + pd2.dst = (struct pf_addr *)&h2_6.ip6_dst; + pd2.ip_sum = NULL; + off2 = ipoff2 + sizeof(h2_6); + do { + switch (pd2.proto) { + case IPPROTO_FRAGMENT: + /* + * ICMPv6 error messages for + * non-first fragments + */ + REASON_SET(reason, PFRES_FRAG); + return (PF_DROP); + case IPPROTO_AH: + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: { + /* get next header and header length */ + struct ip6_ext opt6; + + if (!pf_pull_hdr(m, off2, &opt6, + sizeof(opt6), NULL, reason, + pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMPv6 short opt\n")); + return (PF_DROP); + } + if (pd2.proto == IPPROTO_AH) + off2 += (opt6.ip6e_len + 2) * 4; + else + off2 += (opt6.ip6e_len + 1) * 8; + pd2.proto = opt6.ip6e_nxt; + /* goto the next header */ + break; + } + default: + terminal++; + break; + } + } while (!terminal); + break; +#endif /* INET6 */ + } + + switch (pd2.proto) { + case IPPROTO_TCP: { + struct tcphdr th; + u_int32_t seq; + struct pf_state_peer *src, *dst; + u_int8_t dws; + int copyback = 0; + + /* + * Only the first 8 bytes of the TCP header can be + * expected. Don't access any TCP header fields after + * th_seq, an ackskew test is not possible. 
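+ * (RFC 792 only requires an ICMP error to quote the IP header plus + * the first 8 octets of the offending datagram.)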
+ */ + if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason, + pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(tcp)\n")); + return (PF_DROP); + } + + key.af = pd2.af; + key.proto = IPPROTO_TCP; + PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); + PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); + key.port[pd2.sidx] = th.th_sport; + key.port[pd2.didx] = th.th_dport; + + STATE_LOOKUP(kif, &key, direction, *state, pd); + + if (direction == (*state)->direction) { + src = &(*state)->dst; + dst = &(*state)->src; + } else { + src = &(*state)->src; + dst = &(*state)->dst; + } + + if (src->wscale && dst->wscale) + dws = dst->wscale & PF_WSCALE_MASK; + else + dws = 0; + + /* Demodulate sequence number */ + seq = ntohl(th.th_seq) - src->seqdiff; + if (src->seqdiff) { + pf_change_a(&th.th_seq, icmpsum, + htonl(seq), 0); + copyback = 1; + } + + if (!((*state)->state_flags & PFSTATE_SLOPPY) && + (!SEQ_GEQ(src->seqhi, seq) || + !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) { + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: BAD ICMP %d:%d ", + icmptype, pd->hdr.icmp->icmp_code); + pf_print_host(pd->src, 0, pd->af); + printf(" -> "); + pf_print_host(pd->dst, 0, pd->af); + printf(" state: "); + pf_print_state(*state); + printf(" seq=%u\n", seq); + } + REASON_SET(reason, PFRES_BADSTATE); + return (PF_DROP); + } else { + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: OK ICMP %d:%d ", + icmptype, pd->hdr.icmp->icmp_code); + pf_print_host(pd->src, 0, pd->af); + printf(" -> "); + pf_print_host(pd->dst, 0, pd->af); + printf(" state: "); + pf_print_state(*state); + printf(" seq=%u\n", seq); + } + } + + /* translate source/destination address, if necessary */ + if ((*state)->key[PF_SK_WIRE] != + (*state)->key[PF_SK_STACK]) { + struct pf_state_key *nk = + (*state)->key[pd->didx]; + + if (PF_ANEQ(pd2.src, + &nk->addr[pd2.sidx], pd2.af) || + nk->port[pd2.sidx] != th.th_sport) + pf_change_icmp(pd2.src, &th.th_sport, + daddr, &nk->addr[pd2.sidx], + nk->port[pd2.sidx], NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, pd2.af); + + if (PF_ANEQ(pd2.dst, + &nk->addr[pd2.didx], pd2.af) || + nk->port[pd2.didx] != th.th_dport) + pf_change_icmp(pd2.dst, &th.th_dport, + saddr, &nk->addr[pd2.didx], + nk->port[pd2.didx], NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, pd2.af); + copyback = 1; + } + + if (copyback) { + switch (pd2.af) { +#ifdef INET + case AF_INET: + m_copyback(m, off, ICMP_MINLEN, + (caddr_t )pd->hdr.icmp); + m_copyback(m, ipoff2, sizeof(h2), + (caddr_t )&h2); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + m_copyback(m, off, + sizeof(struct icmp6_hdr), + (caddr_t )pd->hdr.icmp6); + m_copyback(m, ipoff2, sizeof(h2_6), + (caddr_t )&h2_6); + break; +#endif /* INET6 */ + } + m_copyback(m, off2, 8, (caddr_t)&th); + } + + return (PF_PASS); + break; + } + case IPPROTO_UDP: { + struct udphdr uh; + + if (!pf_pull_hdr(m, off2, &uh, sizeof(uh), + NULL, reason, pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(udp)\n")); + return (PF_DROP); + } + + key.af = pd2.af; + key.proto = IPPROTO_UDP; + PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); + PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); + key.port[pd2.sidx] = uh.uh_sport; + key.port[pd2.didx] = uh.uh_dport; + + STATE_LOOKUP(kif, &key, direction, *state, pd); + + /* translate source/destination address, if necessary */ + if ((*state)->key[PF_SK_WIRE] != + (*state)->key[PF_SK_STACK]) { + struct pf_state_key *nk = + (*state)->key[pd->didx]; + + if (PF_ANEQ(pd2.src, + &nk->addr[pd2.sidx], 
pd2.af) ||
+ nk->port[pd2.sidx] != uh.uh_sport)
+ pf_change_icmp(pd2.src, &uh.uh_sport,
+ daddr, &nk->addr[pd2.sidx],
+ nk->port[pd2.sidx], &uh.uh_sum,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 1, pd2.af);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af) ||
+ nk->port[pd2.didx] != uh.uh_dport)
+ pf_change_icmp(pd2.dst, &uh.uh_dport,
+ saddr, &nk->addr[pd2.didx],
+ nk->port[pd2.didx], &uh.uh_sum,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 1, pd2.af);
+
+ switch (pd2.af) {
+#ifdef INET
+ case AF_INET:
+ m_copyback(m, off, ICMP_MINLEN,
+ (caddr_t )pd->hdr.icmp);
+ m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ m_copyback(m, off,
+ sizeof(struct icmp6_hdr),
+ (caddr_t )pd->hdr.icmp6);
+ m_copyback(m, ipoff2, sizeof(h2_6),
+ (caddr_t )&h2_6);
+ break;
+#endif /* INET6 */
+ }
+ m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
+ }
+ return (PF_PASS);
+ break;
+ }
+#ifdef INET
+ case IPPROTO_ICMP: {
+ struct icmp iih;
+
+ if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
+ NULL, reason, pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short "
+ "(icmp)\n"));
+ return (PF_DROP);
+ }
+
+ key.af = pd2.af;
+ key.proto = IPPROTO_ICMP;
+ PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+ PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ key.port[0] = key.port[1] = iih.icmp_id;
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] !=
+ (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk =
+ (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd2.src,
+ &nk->addr[pd2.sidx], pd2.af) ||
+ nk->port[pd2.sidx] != iih.icmp_id)
+ pf_change_icmp(pd2.src, &iih.icmp_id,
+ daddr, &nk->addr[pd2.sidx],
+ nk->port[pd2.sidx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, AF_INET);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af) ||
+ nk->port[pd2.didx] != iih.icmp_id)
+ pf_change_icmp(pd2.dst, &iih.icmp_id,
+ saddr, &nk->addr[pd2.didx],
+ nk->port[pd2.didx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, AF_INET);
+
+ m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
+ m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
+ m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
+ }
+ return (PF_PASS);
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case IPPROTO_ICMPV6: {
+ struct icmp6_hdr iih;
+
+ if (!pf_pull_hdr(m, off2, &iih,
+ sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short "
+ "(icmp6)\n"));
+ return (PF_DROP);
+ }
+
+ key.af = pd2.af;
+ key.proto = IPPROTO_ICMPV6;
+ PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+ PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ key.port[0] = key.port[1] = iih.icmp6_id;
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] !=
+ (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk =
+ (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd2.src,
+ &nk->addr[pd2.sidx], pd2.af) ||
+ nk->port[pd2.sidx] != iih.icmp6_id)
+ pf_change_icmp(pd2.src, &iih.icmp6_id,
+ daddr, &nk->addr[pd2.sidx],
+ nk->port[pd2.sidx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, AF_INET6);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af) ||
+ nk->port[pd2.didx] != iih.icmp6_id)
+ pf_change_icmp(pd2.dst, &iih.icmp6_id,
+ saddr, &nk->addr[pd2.didx],
+ nk->port[pd2.didx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, AF_INET6);
+
+ m_copyback(m, off, sizeof(struct icmp6_hdr),
+ (caddr_t)pd->hdr.icmp6);
+
m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6); + m_copyback(m, off2, sizeof(struct icmp6_hdr), + (caddr_t)&iih); + } + return (PF_PASS); + break; + } +#endif /* INET6 */ + default: { + key.af = pd2.af; + key.proto = pd2.proto; + PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); + PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); + key.port[0] = key.port[1] = 0; + + STATE_LOOKUP(kif, &key, direction, *state, pd); + + /* translate source/destination address, if necessary */ + if ((*state)->key[PF_SK_WIRE] != + (*state)->key[PF_SK_STACK]) { + struct pf_state_key *nk = + (*state)->key[pd->didx]; + + if (PF_ANEQ(pd2.src, + &nk->addr[pd2.sidx], pd2.af)) + pf_change_icmp(pd2.src, NULL, daddr, + &nk->addr[pd2.sidx], 0, NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, pd2.af); + + if (PF_ANEQ(pd2.dst, + &nk->addr[pd2.didx], pd2.af)) + pf_change_icmp(pd2.dst, NULL, saddr, + &nk->addr[pd2.didx], 0, NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, pd2.af); + + switch (pd2.af) { +#ifdef INET + case AF_INET: + m_copyback(m, off, ICMP_MINLEN, + (caddr_t)pd->hdr.icmp); + m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + m_copyback(m, off, + sizeof(struct icmp6_hdr), + (caddr_t )pd->hdr.icmp6); + m_copyback(m, ipoff2, sizeof(h2_6), + (caddr_t )&h2_6); + break; +#endif /* INET6 */ + } + } + return (PF_PASS); + break; + } + } + } +} + +static int +pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif, + struct mbuf *m, struct pf_pdesc *pd) +{ + struct pf_state_peer *src, *dst; + struct pf_state_key_cmp key; + + bzero(&key, sizeof(key)); + key.af = pd->af; + key.proto = pd->proto; + if (direction == PF_IN) { + PF_ACPY(&key.addr[0], pd->src, key.af); + PF_ACPY(&key.addr[1], pd->dst, key.af); + key.port[0] = key.port[1] = 0; + } else { + PF_ACPY(&key.addr[1], pd->src, key.af); + PF_ACPY(&key.addr[0], pd->dst, key.af); + key.port[1] = key.port[0] = 0; + } + + STATE_LOOKUP(kif, &key, direction, *state, pd); + + if (direction == (*state)->direction) { + src = &(*state)->src; + dst = &(*state)->dst; + } else { + src = &(*state)->dst; + dst = &(*state)->src; + } + + /* update states */ + if (src->state < PFOTHERS_SINGLE) + src->state = PFOTHERS_SINGLE; + if (dst->state == PFOTHERS_SINGLE) + dst->state = PFOTHERS_MULTIPLE; + + /* update expire time */ + (*state)->expire = time_uptime; + if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE) + (*state)->timeout = PFTM_OTHER_MULTIPLE; + else + (*state)->timeout = PFTM_OTHER_SINGLE; + + /* translate source/destination address, if necessary */ + if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { + struct pf_state_key *nk = (*state)->key[pd->didx]; + + KASSERT(nk, ("%s: nk is null", __func__)); + KASSERT(pd, ("%s: pd is null", __func__)); + KASSERT(pd->src, ("%s: pd->src is null", __func__)); + KASSERT(pd->dst, ("%s: pd->dst is null", __func__)); + switch (pd->af) { +#ifdef INET + case AF_INET: + if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET)) + pf_change_a(&pd->src->v4.s_addr, + pd->ip_sum, + nk->addr[pd->sidx].v4.s_addr, + 0); + + + if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET)) + pf_change_a(&pd->dst->v4.s_addr, + pd->ip_sum, + nk->addr[pd->didx].v4.s_addr, + 0); + + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET)) + PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af); + + if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET)) + PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af); +#endif /* INET6 */ + } + 
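/* XXX: note the AF_INET6 case above tests PF_ANEQ() with AF_INET,
+ * which compares only the first 32 bits of an IPv6 address; this
+ * looks like a copy-and-paste slip from the IPv4 branch. */
+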
} + return (PF_PASS); +} + +/* + * ipoff and off are measured from the start of the mbuf chain. + * h must be at "ipoff" on the mbuf chain. + */ +void * +pf_pull_hdr(struct mbuf *m, int off, void *p, int len, + u_short *actionp, u_short *reasonp, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: { + struct ip *h = mtod(m, struct ip *); + u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; + + if (fragoff) { + if (fragoff >= len) + ACTION_SET(actionp, PF_PASS); + else { + ACTION_SET(actionp, PF_DROP); + REASON_SET(reasonp, PFRES_FRAG); + } + return (NULL); + } + if (m->m_pkthdr.len < off + len || + ntohs(h->ip_len) < off + len) { + ACTION_SET(actionp, PF_DROP); + REASON_SET(reasonp, PFRES_SHORT); + return (NULL); + } + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + + if (m->m_pkthdr.len < off + len || + (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) < + (unsigned)(off + len)) { + ACTION_SET(actionp, PF_DROP); + REASON_SET(reasonp, PFRES_SHORT); + return (NULL); + } + break; + } +#endif /* INET6 */ + } + m_copydata(m, off, len, p); + return (p); +} + +#ifdef RADIX_MPATH +static int +pf_routable_oldmpath(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, + int rtableid) +{ + struct radix_node_head *rnh; + struct sockaddr_in *dst; + int ret = 1; + int check_mpath; +#ifdef INET6 + struct sockaddr_in6 *dst6; + struct route_in6 ro; +#else + struct route ro; +#endif + struct radix_node *rn; + struct rtentry *rt; + struct ifnet *ifp; + + check_mpath = 0; + /* XXX: stick to table 0 for now */ + rnh = rt_tables_get_rnh(0, af); + if (rnh != NULL && rn_mpath_capable(rnh)) + check_mpath = 1; + bzero(&ro, sizeof(ro)); + switch (af) { + case AF_INET: + dst = satosin(&ro.ro_dst); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = addr->v4; + break; +#ifdef INET6 + case AF_INET6: + /* + * Skip check for addresses with embedded interface scope, + * as they would always match anyway. 
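+ * (KAME-derived stacks embed the scope zone id in the address
+ * itself for link-local addresses, so such an address already
+ * names its interface.)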
+ */ + if (IN6_IS_SCOPE_EMBED(&addr->v6)) + goto out; + dst6 = (struct sockaddr_in6 *)&ro.ro_dst; + dst6->sin6_family = AF_INET6; + dst6->sin6_len = sizeof(*dst6); + dst6->sin6_addr = addr->v6; + break; +#endif /* INET6 */ + default: + return (0); + } + + /* Skip checks for ipsec interfaces */ + if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) + goto out; + + switch (af) { +#ifdef INET6 + case AF_INET6: + in6_rtalloc_ign(&ro, 0, rtableid); + break; +#endif +#ifdef INET + case AF_INET: + in_rtalloc_ign((struct route *)&ro, 0, rtableid); + break; +#endif + } + + if (ro.ro_rt != NULL) { + /* No interface given, this is a no-route check */ + if (kif == NULL) + goto out; + + if (kif->pfik_ifp == NULL) { + ret = 0; + goto out; + } + + /* Perform uRPF check if passed input interface */ + ret = 0; + rn = (struct radix_node *)ro.ro_rt; + do { + rt = (struct rtentry *)rn; + ifp = rt->rt_ifp; + + if (kif->pfik_ifp == ifp) + ret = 1; + rn = rn_mpath_next(rn); + } while (check_mpath == 1 && rn != NULL && ret == 0); + } else + ret = 0; +out: + if (ro.ro_rt != NULL) + RTFREE(ro.ro_rt); + return (ret); +} +#endif + +int +pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, + int rtableid) +{ +#ifdef INET + struct nhop4_basic nh4; +#endif +#ifdef INET6 + struct nhop6_basic nh6; +#endif + struct ifnet *ifp; +#ifdef RADIX_MPATH + struct radix_node_head *rnh; + + /* XXX: stick to table 0 for now */ + rnh = rt_tables_get_rnh(0, af); + if (rnh != NULL && rn_mpath_capable(rnh)) + return (pf_routable_oldmpath(addr, af, kif, rtableid)); +#endif + /* + * Skip check for addresses with embedded interface scope, + * as they would always match anyway. + */ + if (af == AF_INET6 && IN6_IS_SCOPE_EMBED(&addr->v6)) + return (1); + + if (af != AF_INET && af != AF_INET6) + return (0); + + /* Skip checks for ipsec interfaces */ + if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) + return (1); + + ifp = NULL; + + switch (af) { +#ifdef INET6 + case AF_INET6: + if (fib6_lookup_nh_basic(rtableid, &addr->v6, 0, 0, 0, &nh6)!=0) + return (0); + ifp = nh6.nh_ifp; + break; +#endif +#ifdef INET + case AF_INET: + if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) != 0) + return (0); + ifp = nh4.nh_ifp; + break; +#endif + } + + /* No interface given, this is a no-route check */ + if (kif == NULL) + return (1); + + if (kif->pfik_ifp == NULL) + return (0); + + /* Perform uRPF check if passed input interface */ + if (kif->pfik_ifp == ifp) + return (1); + return (0); +} + +#ifdef INET +static void +pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, + struct pf_state *s, struct pf_pdesc *pd) +{ + struct mbuf *m0, *m1; + struct sockaddr_in dst; + struct ip *ip; + struct ifnet *ifp = NULL; + struct pf_addr naddr; + struct pf_src_node *sn = NULL; + int error = 0; + uint16_t ip_len, ip_off; + + KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__)); + KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction", + __func__)); + + if ((pd->pf_mtag == NULL && + ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) || + pd->pf_mtag->routed++ > 3) { + m0 = *m; + *m = NULL; + goto bad_locked; + } + + if (r->rt == PF_DUPTO) { + if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) { + if (s) + PF_STATE_UNLOCK(s); + return; + } + } else { + if ((r->rt == PF_REPLYTO) == (r->direction == dir)) { + if (s) + PF_STATE_UNLOCK(s); + return; + } + m0 = *m; + } + + ip = mtod(m0, struct ip *); + + bzero(&dst, sizeof(dst)); + dst.sin_family = AF_INET; + dst.sin_len = sizeof(dst); + dst.sin_addr = ip->ip_dst; + + if 
(r->rt == PF_FASTROUTE) {
+ struct nhop4_basic nh4;
+
+ if (s)
+ PF_STATE_UNLOCK(s);
+
+ if (fib4_lookup_nh_basic(M_GETFIB(m0), ip->ip_dst, 0,
+ m0->m_pkthdr.flowid, &nh4) != 0) {
+ KMOD_IPSTAT_INC(ips_noroute);
+ error = EHOSTUNREACH;
+ goto bad;
+ }
+
+ ifp = nh4.nh_ifp;
+ dst.sin_addr = nh4.nh_addr;
+ } else {
+ if (TAILQ_EMPTY(&r->rpool.list)) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
+ goto bad_locked;
+ }
+ if (s == NULL) {
+ pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
+ &naddr, NULL, &sn);
+ if (!PF_AZERO(&naddr, AF_INET))
+ dst.sin_addr.s_addr = naddr.v4.s_addr;
+ ifp = r->rpool.cur->kif ?
+ r->rpool.cur->kif->pfik_ifp : NULL;
+ } else {
+ if (!PF_AZERO(&s->rt_addr, AF_INET))
+ dst.sin_addr.s_addr =
+ s->rt_addr.v4.s_addr;
+ ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
+ PF_STATE_UNLOCK(s);
+ }
+ }
+ if (ifp == NULL)
+ goto bad;
+
+ if (oifp != ifp) {
+ if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS)
+ goto bad;
+ else if (m0 == NULL)
+ goto done;
+ if (m0->m_len < sizeof(struct ip)) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
+ goto bad;
+ }
+ ip = mtod(m0, struct ip *);
+ }
+
+ if (ifp->if_flags & IFF_LOOPBACK)
+ m0->m_flags |= M_SKIP_FIREWALL;
+
+ ip_len = ntohs(ip->ip_len);
+ ip_off = ntohs(ip->ip_off);
+
+ /* Copied from FreeBSD 10.0-CURRENT ip_output. */
+ m0->m_pkthdr.csum_flags |= CSUM_IP;
+ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
+ in_delayed_cksum(m0);
+ m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+#ifdef SCTP
+ if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
+ sctp_delayed_cksum(m0, (uint32_t)(ip->ip_hl << 2));
+ m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
+ }
+#endif
+
+ /*
+ * If small enough for interface, or the interface will take
+ * care of the fragmentation for us, we can just send directly.
+ */
+ if (ip_len <= ifp->if_mtu ||
+ (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
+ ip->ip_sum = 0;
+ if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
+ ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
+ m0->m_pkthdr.csum_flags &= ~CSUM_IP;
+ }
+ m_clrprotoflags(m0); /* Avoid confusing lower layers. */
+ error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
+ goto done;
+ }
+
+ /* Balk when DF bit is set or the interface doesn't support TSO. 
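Fragmenting is not an option in either case, so hand back EMSGSIZE and, unless this is a dup-to copy, send an ICMP need-frag error carrying the interface MTU so path MTU discovery can recover.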
*/ + if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) { + error = EMSGSIZE; + KMOD_IPSTAT_INC(ips_cantfrag); + if (r->rt != PF_DUPTO) { + icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, + ifp->if_mtu); + goto done; + } else + goto bad; + } + + error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist); + if (error) + goto bad; + + for (; m0; m0 = m1) { + m1 = m0->m_nextpkt; + m0->m_nextpkt = NULL; + if (error == 0) { + m_clrprotoflags(m0); + error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL); + } else + m_freem(m0); + } + + if (error == 0) + KMOD_IPSTAT_INC(ips_fragmented); + +done: + if (r->rt != PF_DUPTO) + *m = NULL; + return; + +bad_locked: + if (s) + PF_STATE_UNLOCK(s); +bad: + m_freem(m0); + goto done; +} +#endif /* INET */ + +#ifdef INET6 +static void +pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, + struct pf_state *s, struct pf_pdesc *pd) +{ + struct mbuf *m0; + struct sockaddr_in6 dst; + struct ip6_hdr *ip6; + struct ifnet *ifp = NULL; + struct pf_addr naddr; + struct pf_src_node *sn = NULL; + + KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__)); + KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction", + __func__)); + + if ((pd->pf_mtag == NULL && + ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) || + pd->pf_mtag->routed++ > 3) { + m0 = *m; + *m = NULL; + goto bad_locked; + } + + if (r->rt == PF_DUPTO) { + if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) { + if (s) + PF_STATE_UNLOCK(s); + return; + } + } else { + if ((r->rt == PF_REPLYTO) == (r->direction == dir)) { + if (s) + PF_STATE_UNLOCK(s); + return; + } + m0 = *m; + } + + ip6 = mtod(m0, struct ip6_hdr *); + + bzero(&dst, sizeof(dst)); + dst.sin6_family = AF_INET6; + dst.sin6_len = sizeof(dst); + dst.sin6_addr = ip6->ip6_dst; + + /* Cheat. XXX why only in the v6 case??? */ + if (r->rt == PF_FASTROUTE) { + if (s) + PF_STATE_UNLOCK(s); + m0->m_flags |= M_SKIP_FIREWALL; + ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); + *m = NULL; + return; + } + + if (TAILQ_EMPTY(&r->rpool.list)) { + DPFPRINTF(PF_DEBUG_URGENT, + ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__)); + goto bad_locked; + } + if (s == NULL) { + pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src, + &naddr, NULL, &sn); + if (!PF_AZERO(&naddr, AF_INET6)) + PF_ACPY((struct pf_addr *)&dst.sin6_addr, + &naddr, AF_INET6); + ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; + } else { + if (!PF_AZERO(&s->rt_addr, AF_INET6)) + PF_ACPY((struct pf_addr *)&dst.sin6_addr, + &s->rt_addr, AF_INET6); + ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; + } + + if (s) + PF_STATE_UNLOCK(s); + + if (ifp == NULL) + goto bad; + + if (oifp != ifp) { + if (pf_test6(PF_FWD, ifp, &m0, NULL) != PF_PASS) + goto bad; + else if (m0 == NULL) + goto done; + if (m0->m_len < sizeof(struct ip6_hdr)) { + DPFPRINTF(PF_DEBUG_URGENT, + ("%s: m0->m_len < sizeof(struct ip6_hdr)\n", + __func__)); + goto bad; + } + ip6 = mtod(m0, struct ip6_hdr *); + } + + if (ifp->if_flags & IFF_LOOPBACK) + m0->m_flags |= M_SKIP_FIREWALL; + + if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 & + ~ifp->if_hwassist) { + uint32_t plen = m0->m_pkthdr.len - sizeof(*ip6); + in6_delayed_cksum(m0, plen, sizeof(struct ip6_hdr)); + m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; + } + + /* + * If the packet is too large for the outgoing interface, + * send back an icmp6 error. 
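+ * (IPv6 routers never fragment packets in transit; the
+ * ICMP6_PACKET_TOO_BIG sent below carries the link MTU so the
+ * sender's path MTU discovery can adjust.)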
+ */ + if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr)) + dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index); + if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) + nd6_output_ifp(ifp, ifp, m0, &dst, NULL); + else { + in6_ifstat_inc(ifp, ifs6_in_toobig); + if (r->rt != PF_DUPTO) + icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); + else + goto bad; + } + +done: + if (r->rt != PF_DUPTO) + *m = NULL; + return; + +bad_locked: + if (s) + PF_STATE_UNLOCK(s); +bad: + m_freem(m0); + goto done; +} +#endif /* INET6 */ + +/* + * FreeBSD supports cksum offloads for the following drivers. + * em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4), + * ti(4), txp(4), xl(4) + * + * CSUM_DATA_VALID | CSUM_PSEUDO_HDR : + * network driver performed cksum including pseudo header, need to verify + * csum_data + * CSUM_DATA_VALID : + * network driver performed cksum, needs to additional pseudo header + * cksum computation with partial csum_data(i.e. lack of H/W support for + * pseudo header, for instance hme(4), sk(4) and possibly gem(4)) + * + * After validating the cksum of packet, set both flag CSUM_DATA_VALID and + * CSUM_PSEUDO_HDR in order to avoid recomputation of the cksum in upper + * TCP/UDP layer. + * Also, set csum_data to 0xffff to force cksum validation. + */ +static int +pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af) +{ + u_int16_t sum = 0; + int hw_assist = 0; + struct ip *ip; + + if (off < sizeof(struct ip) || len < sizeof(struct udphdr)) + return (1); + if (m->m_pkthdr.len < off + len) + return (1); + + switch (p) { + case IPPROTO_TCP: + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { + sum = m->m_pkthdr.csum_data; + } else { + ip = mtod(m, struct ip *); + sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl((u_short)len + + m->m_pkthdr.csum_data + IPPROTO_TCP)); + } + sum ^= 0xffff; + ++hw_assist; + } + break; + case IPPROTO_UDP: + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { + sum = m->m_pkthdr.csum_data; + } else { + ip = mtod(m, struct ip *); + sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl((u_short)len + + m->m_pkthdr.csum_data + IPPROTO_UDP)); + } + sum ^= 0xffff; + ++hw_assist; + } + break; + case IPPROTO_ICMP: +#ifdef INET6 + case IPPROTO_ICMPV6: +#endif /* INET6 */ + break; + default: + return (1); + } + + if (!hw_assist) { + switch (af) { + case AF_INET: + if (p == IPPROTO_ICMP) { + if (m->m_len < off) + return (1); + m->m_data += off; + m->m_len -= off; + sum = in_cksum(m, len); + m->m_data -= off; + m->m_len += off; + } else { + if (m->m_len < sizeof(struct ip)) + return (1); + sum = in4_cksum(m, p, off, len); + } + break; +#ifdef INET6 + case AF_INET6: + if (m->m_len < sizeof(struct ip6_hdr)) + return (1); + sum = in6_cksum(m, p, off, len); + break; +#endif /* INET6 */ + default: + return (1); + } + } + if (sum) { + switch (p) { + case IPPROTO_TCP: + { + KMOD_TCPSTAT_INC(tcps_rcvbadsum); + break; + } + case IPPROTO_UDP: + { + KMOD_UDPSTAT_INC(udps_badsum); + break; + } +#ifdef INET + case IPPROTO_ICMP: + { + KMOD_ICMPSTAT_INC(icps_checksum); + break; + } +#endif +#ifdef INET6 + case IPPROTO_ICMPV6: + { + KMOD_ICMP6STAT_INC(icp6s_checksum); + break; + } +#endif /* INET6 */ + } + return (1); + } else { + if (p == IPPROTO_TCP || p == IPPROTO_UDP) { + m->m_pkthdr.csum_flags |= + (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + m->m_pkthdr.csum_data = 0xffff; + } + } + return (0); +} + + +#ifdef INET +int +pf_test(int dir, struct ifnet *ifp, 
struct mbuf **m0, struct inpcb *inp) +{ + struct pfi_kif *kif; + u_short action, reason = 0, log = 0; + struct mbuf *m = *m0; + struct ip *h = NULL; + struct m_tag *ipfwtag; + struct pf_rule *a = NULL, *r = &V_pf_default_rule, *tr, *nr; + struct pf_state *s = NULL; + struct pf_ruleset *ruleset = NULL; + struct pf_pdesc pd; + int off, dirndx, pqid = 0; + + M_ASSERTPKTHDR(m); + + if (!V_pf_status.running) + return (PF_PASS); + + memset(&pd, 0, sizeof(pd)); + + kif = (struct pfi_kif *)ifp->if_pf_kif; + + if (kif == NULL) { + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname)); + return (PF_DROP); + } + if (kif->pfik_flags & PFI_IFLAG_SKIP) + return (PF_PASS); + + if (m->m_flags & M_SKIP_FIREWALL) + return (PF_PASS); + + pd.pf_mtag = pf_find_mtag(m); + + PF_RULES_RLOCK(); + + if (ip_divert_ptr != NULL && + ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) { + struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1); + if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) { + if (pd.pf_mtag == NULL && + ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { + action = PF_DROP; + goto done; + } + pd.pf_mtag->flags |= PF_PACKET_LOOPED; + m_tag_delete(m, ipfwtag); + } + if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) { + m->m_flags |= M_FASTFWD_OURS; + pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT; + } + } else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) { + /* We do IP header normalization and packet reassembly here */ + action = PF_DROP; + goto done; + } + m = *m0; /* pf_normalize messes with m0 */ + h = mtod(m, struct ip *); + + off = h->ip_hl << 2; + if (off < (int)sizeof(struct ip)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + log = 1; + goto done; + } + + pd.src = (struct pf_addr *)&h->ip_src; + pd.dst = (struct pf_addr *)&h->ip_dst; + pd.sport = pd.dport = NULL; + pd.ip_sum = &h->ip_sum; + pd.proto_sum = NULL; + pd.proto = h->ip_p; + pd.dir = dir; + pd.sidx = (dir == PF_IN) ? 0 : 1; + pd.didx = (dir == PF_IN) ? 
1 : 0; + pd.af = AF_INET; + pd.tos = h->ip_tos; + pd.tot_len = ntohs(h->ip_len); + + /* handle fragments that didn't get reassembled by normalization */ + if (h->ip_off & htons(IP_MF | IP_OFFMASK)) { + action = pf_test_fragment(&r, dir, kif, m, h, + &pd, &a, &ruleset); + goto done; + } + + switch (h->ip_p) { + + case IPPROTO_TCP: { + struct tcphdr th; + + pd.hdr.tcp = &th; + if (!pf_pull_hdr(m, off, &th, sizeof(th), + &action, &reason, AF_INET)) { + log = action != PF_PASS; + goto done; + } + pd.p_len = pd.tot_len - off - (th.th_off << 2); + if ((th.th_flags & TH_ACK) && pd.p_len == 0) + pqid = 1; + action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); + if (action == PF_DROP) + goto done; + action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, + &reason); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + + case IPPROTO_UDP: { + struct udphdr uh; + + pd.hdr.udp = &uh; + if (!pf_pull_hdr(m, off, &uh, sizeof(uh), + &action, &reason, AF_INET)) { + log = action != PF_PASS; + goto done; + } + if (uh.uh_dport == 0 || + ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || + ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + goto done; + } + action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + + case IPPROTO_ICMP: { + struct icmp ih; + + pd.hdr.icmp = &ih; + if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN, + &action, &reason, AF_INET)) { + log = action != PF_PASS; + goto done; + } + action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, + &reason); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + +#ifdef INET6 + case IPPROTO_ICMPV6: { + action = PF_DROP; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: dropping IPv4 packet with ICMPv6 payload\n")); + goto done; + } +#endif + + default: + action = pf_test_state_other(&s, dir, kif, m, &pd); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + +done: + PF_RULES_RUNLOCK(); + if (action == PF_PASS && h->ip_hl > 5 && + !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_IPOPTIONS); + log = r->log; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: dropping packet with ip options\n")); + } + + if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + } + if (r->rtableid >= 0) + M_SETFIB(m, r->rtableid); + + if (r->scrub_flags & PFSTATE_SETPRIO) { + if (pd.tos & IPTOS_LOWDELAY) + pqid = 1; + if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) { + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + log = 1; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: failed to allocate 802.1q mtag\n")); + } + } + +#ifdef ALTQ + if (action == 
PF_PASS && r->qid) { + if (pd.pf_mtag == NULL && + ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + } else { + if (s != NULL) + pd.pf_mtag->qid_hash = pf_state_hash(s); + if (pqid || (pd.tos & IPTOS_LOWDELAY)) + pd.pf_mtag->qid = r->pqid; + else + pd.pf_mtag->qid = r->qid; + /* Add hints for ecn. */ + pd.pf_mtag->hdr = h; + } + + } +#endif /* ALTQ */ + + /* + * connections redirected to loopback should not match sockets + * bound specifically to loopback due to security implications, + * see tcp_input() and in_pcblookup_listen(). + */ + if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || + pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && + (s->nat_rule.ptr->action == PF_RDR || + s->nat_rule.ptr->action == PF_BINAT) && + (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) + m->m_flags |= M_SKIP_FIREWALL; + + if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL && + !PACKET_LOOPED(&pd)) { + + ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0, + sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); + if (ipfwtag != NULL) { + ((struct ipfw_rule_ref *)(ipfwtag+1))->info = + ntohs(r->divert.port); + ((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir; + + if (s) + PF_STATE_UNLOCK(s); + + m_tag_prepend(m, ipfwtag); + if (m->m_flags & M_FASTFWD_OURS) { + if (pd.pf_mtag == NULL && + ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + log = 1; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: failed to allocate tag\n")); + } else { + pd.pf_mtag->flags |= + PF_FASTFWD_OURS_PRESENT; + m->m_flags &= ~M_FASTFWD_OURS; + } + } + ip_divert_ptr(*m0, dir == PF_IN ? DIR_IN : DIR_OUT); + *m0 = NULL; + + return (action); + } else { + /* XXX: ipfw has the same behaviour! */ + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + log = 1; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: failed to allocate divert tag\n")); + } + } + + if (log) { + struct pf_rule *lr; + + if (s != NULL && s->nat_rule.ptr != NULL && + s->nat_rule.ptr->log & PF_LOG_ALL) + lr = s->nat_rule.ptr; + else + lr = r; + PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd, + (s == NULL)); + } + + kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len; + kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++; + + if (action == PF_PASS || r->action == PF_DROP) { + dirndx = (dir == PF_OUT); + r->packets[dirndx]++; + r->bytes[dirndx] += pd.tot_len; + if (a != NULL) { + a->packets[dirndx]++; + a->bytes[dirndx] += pd.tot_len; + } + if (s != NULL) { + if (s->nat_rule.ptr != NULL) { + s->nat_rule.ptr->packets[dirndx]++; + s->nat_rule.ptr->bytes[dirndx] += pd.tot_len; + } + if (s->src_node != NULL) { + s->src_node->packets[dirndx]++; + s->src_node->bytes[dirndx] += pd.tot_len; + } + if (s->nat_src_node != NULL) { + s->nat_src_node->packets[dirndx]++; + s->nat_src_node->bytes[dirndx] += pd.tot_len; + } + dirndx = (dir == s->direction) ? 0 : 1; + s->packets[dirndx]++; + s->bytes[dirndx] += pd.tot_len; + } + tr = r; + nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; + if (nr != NULL && r == &V_pf_default_rule) + tr = nr; + if (tr->src.addr.type == PF_ADDR_TABLE) + pfr_update_stats(tr->src.addr.p.tbl, + (s == NULL) ? pd.src : + &s->key[(s->direction == PF_IN)]-> + addr[(s->direction == PF_OUT)], + pd.af, pd.tot_len, dir == PF_OUT, + r->action == PF_PASS, tr->src.neg); + if (tr->dst.addr.type == PF_ADDR_TABLE) + pfr_update_stats(tr->dst.addr.p.tbl, + (s == NULL) ? 
pd.dst : + &s->key[(s->direction == PF_IN)]-> + addr[(s->direction == PF_IN)], + pd.af, pd.tot_len, dir == PF_OUT, + r->action == PF_PASS, tr->dst.neg); + } + + switch (action) { + case PF_SYNPROXY_DROP: + m_freem(*m0); + case PF_DEFER: + *m0 = NULL; + action = PF_PASS; + break; + case PF_DROP: + m_freem(*m0); + *m0 = NULL; + break; + default: + /* pf_route() returns unlocked. */ + if (r->rt) { + pf_route(m0, r, dir, kif->pfik_ifp, s, &pd); + return (action); + } + break; + } + if (s) + PF_STATE_UNLOCK(s); + + return (action); +} +#endif /* INET */ + +#ifdef INET6 +int +pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp) +{ + struct pfi_kif *kif; + u_short action, reason = 0, log = 0; + struct mbuf *m = *m0, *n = NULL; + struct m_tag *mtag; + struct ip6_hdr *h = NULL; + struct pf_rule *a = NULL, *r = &V_pf_default_rule, *tr, *nr; + struct pf_state *s = NULL; + struct pf_ruleset *ruleset = NULL; + struct pf_pdesc pd; + int off, terminal = 0, dirndx, rh_cnt = 0, pqid = 0; + int fwdir = dir; + + M_ASSERTPKTHDR(m); + + /* Detect packet forwarding. + * If the input interface is different from the output interface we're + * forwarding. + * We do need to be careful about bridges. If the + * net.link.bridge.pfil_bridge sysctl is set we can be filtering on a + * bridge, so if the input interface is a bridge member and the output + * interface is its bridge or a member of the same bridge we're not + * actually forwarding but bridging. + */ + if (dir == PF_OUT && m->m_pkthdr.rcvif && ifp != m->m_pkthdr.rcvif && + (m->m_pkthdr.rcvif->if_bridge == NULL || + (m->m_pkthdr.rcvif->if_bridge != ifp->if_softc && + m->m_pkthdr.rcvif->if_bridge != ifp->if_bridge))) + fwdir = PF_FWD; + + if (!V_pf_status.running) + return (PF_PASS); + + memset(&pd, 0, sizeof(pd)); + pd.pf_mtag = pf_find_mtag(m); + + if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED) + return (PF_PASS); + + kif = (struct pfi_kif *)ifp->if_pf_kif; + if (kif == NULL) { + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname)); + return (PF_DROP); + } + if (kif->pfik_flags & PFI_IFLAG_SKIP) + return (PF_PASS); + + if (m->m_flags & M_SKIP_FIREWALL) + return (PF_PASS); + + PF_RULES_RLOCK(); + + /* We do IP header normalization and packet reassembly here */ + if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) { + action = PF_DROP; + goto done; + } + m = *m0; /* pf_normalize messes with m0 */ + h = mtod(m, struct ip6_hdr *); + +#if 1 + /* + * we do not support jumbogram yet. if we keep going, zero ip6_plen + * will do something bad, so drop the packet for now. + */ + if (htons(h->ip6_plen) == 0) { + action = PF_DROP; + REASON_SET(&reason, PFRES_NORM); /*XXX*/ + goto done; + } +#endif + + pd.src = (struct pf_addr *)&h->ip6_src; + pd.dst = (struct pf_addr *)&h->ip6_dst; + pd.sport = pd.dport = NULL; + pd.ip_sum = NULL; + pd.proto_sum = NULL; + pd.dir = dir; + pd.sidx = (dir == PF_IN) ? 0 : 1; + pd.didx = (dir == PF_IN) ? 
1 : 0; + pd.af = AF_INET6; + pd.tos = 0; + pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr); + + off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr); + pd.proto = h->ip6_nxt; + do { + switch (pd.proto) { + case IPPROTO_FRAGMENT: + action = pf_test_fragment(&r, dir, kif, m, h, + &pd, &a, &ruleset); + if (action == PF_DROP) + REASON_SET(&reason, PFRES_FRAG); + goto done; + case IPPROTO_ROUTING: { + struct ip6_rthdr rthdr; + + if (rh_cnt++) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: IPv6 more than one rthdr\n")); + action = PF_DROP; + REASON_SET(&reason, PFRES_IPOPTIONS); + log = 1; + goto done; + } + if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL, + &reason, pd.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: IPv6 short rthdr\n")); + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + log = 1; + goto done; + } + if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: IPv6 rthdr0\n")); + action = PF_DROP; + REASON_SET(&reason, PFRES_IPOPTIONS); + log = 1; + goto done; + } + /* FALLTHROUGH */ + } + case IPPROTO_AH: + case IPPROTO_HOPOPTS: + case IPPROTO_DSTOPTS: { + /* get next header and header length */ + struct ip6_ext opt6; + + if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6), + NULL, &reason, pd.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: IPv6 short opt\n")); + action = PF_DROP; + log = 1; + goto done; + } + if (pd.proto == IPPROTO_AH) + off += (opt6.ip6e_len + 2) * 4; + else + off += (opt6.ip6e_len + 1) * 8; + pd.proto = opt6.ip6e_nxt; + /* goto the next header */ + break; + } + default: + terminal++; + break; + } + } while (!terminal); + + /* if there's no routing header, use unmodified mbuf for checksumming */ + if (!n) + n = m; + + switch (pd.proto) { + + case IPPROTO_TCP: { + struct tcphdr th; + + pd.hdr.tcp = &th; + if (!pf_pull_hdr(m, off, &th, sizeof(th), + &action, &reason, AF_INET6)) { + log = action != PF_PASS; + goto done; + } + pd.p_len = pd.tot_len - off - (th.th_off << 2); + action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); + if (action == PF_DROP) + goto done; + action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, + &reason); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + + case IPPROTO_UDP: { + struct udphdr uh; + + pd.hdr.udp = &uh; + if (!pf_pull_hdr(m, off, &uh, sizeof(uh), + &action, &reason, AF_INET6)) { + log = action != PF_PASS; + goto done; + } + if (uh.uh_dport == 0 || + ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || + ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + goto done; + } + action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + + case IPPROTO_ICMP: { + action = PF_DROP; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: dropping IPv6 packet with ICMPv4 payload\n")); + goto done; + } + + case IPPROTO_ICMPV6: { + struct icmp6_hdr ih; + + pd.hdr.icmp6 = &ih; + if (!pf_pull_hdr(m, off, &ih, sizeof(ih), + &action, &reason, AF_INET6)) { + log = action != PF_PASS; + goto done; + } + action = pf_test_state_icmp(&s, dir, kif, + m, off, h, &pd, &reason); + if (action == PF_PASS) { + if (pfsync_update_state_ptr 
!= NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + + default: + action = pf_test_state_other(&s, dir, kif, m, &pd); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + +done: + PF_RULES_RUNLOCK(); + if (n != m) { + m_freem(n); + n = NULL; + } + + /* handle dangerous IPv6 extension headers. */ + if (action == PF_PASS && rh_cnt && + !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_IPOPTIONS); + log = r->log; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: dropping packet with dangerous v6 headers\n")); + } + + if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + } + if (r->rtableid >= 0) + M_SETFIB(m, r->rtableid); + + if (r->scrub_flags & PFSTATE_SETPRIO) { + if (pd.tos & IPTOS_LOWDELAY) + pqid = 1; + if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) { + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + log = 1; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: failed to allocate 802.1q mtag\n")); + } + } + +#ifdef ALTQ + if (action == PF_PASS && r->qid) { + if (pd.pf_mtag == NULL && + ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + } else { + if (s != NULL) + pd.pf_mtag->qid_hash = pf_state_hash(s); + if (pd.tos & IPTOS_LOWDELAY) + pd.pf_mtag->qid = r->pqid; + else + pd.pf_mtag->qid = r->qid; + /* Add hints for ecn. */ + pd.pf_mtag->hdr = h; + } + } +#endif /* ALTQ */ + + if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || + pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && + (s->nat_rule.ptr->action == PF_RDR || + s->nat_rule.ptr->action == PF_BINAT) && + IN6_IS_ADDR_LOOPBACK(&pd.dst->v6)) + m->m_flags |= M_SKIP_FIREWALL; + + /* XXX: Anybody working on it?! */ + if (r->divert.port) + printf("pf: divert(9) is not supported for IPv6\n"); + + if (log) { + struct pf_rule *lr; + + if (s != NULL && s->nat_rule.ptr != NULL && + s->nat_rule.ptr->log & PF_LOG_ALL) + lr = s->nat_rule.ptr; + else + lr = r; + PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset, + &pd, (s == NULL)); + } + + kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len; + kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++; + + if (action == PF_PASS || r->action == PF_DROP) { + dirndx = (dir == PF_OUT); + r->packets[dirndx]++; + r->bytes[dirndx] += pd.tot_len; + if (a != NULL) { + a->packets[dirndx]++; + a->bytes[dirndx] += pd.tot_len; + } + if (s != NULL) { + if (s->nat_rule.ptr != NULL) { + s->nat_rule.ptr->packets[dirndx]++; + s->nat_rule.ptr->bytes[dirndx] += pd.tot_len; + } + if (s->src_node != NULL) { + s->src_node->packets[dirndx]++; + s->src_node->bytes[dirndx] += pd.tot_len; + } + if (s->nat_src_node != NULL) { + s->nat_src_node->packets[dirndx]++; + s->nat_src_node->bytes[dirndx] += pd.tot_len; + } + dirndx = (dir == s->direction) ? 0 : 1; + s->packets[dirndx]++; + s->bytes[dirndx] += pd.tot_len; + } + tr = r; + nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; + if (nr != NULL && r == &V_pf_default_rule) + tr = nr; + if (tr->src.addr.type == PF_ADDR_TABLE) + pfr_update_stats(tr->src.addr.p.tbl, + (s == NULL) ? 
pd.src : + &s->key[(s->direction == PF_IN)]->addr[0], + pd.af, pd.tot_len, dir == PF_OUT, + r->action == PF_PASS, tr->src.neg); + if (tr->dst.addr.type == PF_ADDR_TABLE) + pfr_update_stats(tr->dst.addr.p.tbl, + (s == NULL) ? pd.dst : + &s->key[(s->direction == PF_IN)]->addr[1], + pd.af, pd.tot_len, dir == PF_OUT, + r->action == PF_PASS, tr->dst.neg); + } + + switch (action) { + case PF_SYNPROXY_DROP: + m_freem(*m0); + case PF_DEFER: + *m0 = NULL; + action = PF_PASS; + break; + case PF_DROP: + m_freem(*m0); + *m0 = NULL; + break; + default: + /* pf_route6() returns unlocked. */ + if (r->rt) { + pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd); + return (action); + } + break; + } + + if (s) + PF_STATE_UNLOCK(s); + + /* If reassembled packet passed, create new fragments. */ + if (action == PF_PASS && *m0 && fwdir == PF_FWD && + (mtag = m_tag_find(m, PF_REASSEMBLED, NULL)) != NULL) + action = pf_refragment6(ifp, m0, mtag); + + return (action); +} +#endif /* INET6 */ diff --git a/freebsd/sys/netpfil/pf/pf.h b/freebsd/sys/netpfil/pf/pf.h new file mode 100644 index 00000000..ac0e0fb9 --- /dev/null +++ b/freebsd/sys/netpfil/pf/pf.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2001 Daniel Hartmeier + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * $OpenBSD: pfvar.h,v 1.282 2009/01/29 15:12:28 pyr Exp $ + * $FreeBSD$ + */ + +#ifndef _NET_PF_H_ +#define _NET_PF_H_ + +#define PF_TCPS_PROXY_SRC ((TCP_NSTATES)+0) +#define PF_TCPS_PROXY_DST ((TCP_NSTATES)+1) + +#define PF_MD5_DIGEST_LENGTH 16 +#ifdef MD5_DIGEST_LENGTH +#if PF_MD5_DIGEST_LENGTH != MD5_DIGEST_LENGTH +#error +#endif +#endif + +enum { PF_INOUT, PF_IN, PF_OUT, PF_FWD }; +enum { PF_PASS, PF_DROP, PF_SCRUB, PF_NOSCRUB, PF_NAT, PF_NONAT, + PF_BINAT, PF_NOBINAT, PF_RDR, PF_NORDR, PF_SYNPROXY_DROP, PF_DEFER }; +enum { PF_RULESET_SCRUB, PF_RULESET_FILTER, PF_RULESET_NAT, + PF_RULESET_BINAT, PF_RULESET_RDR, PF_RULESET_MAX }; +enum { PF_OP_NONE, PF_OP_IRG, PF_OP_EQ, PF_OP_NE, PF_OP_LT, + PF_OP_LE, PF_OP_GT, PF_OP_GE, PF_OP_XRG, PF_OP_RRG }; +enum { PF_DEBUG_NONE, PF_DEBUG_URGENT, PF_DEBUG_MISC, PF_DEBUG_NOISY }; +enum { PF_CHANGE_NONE, PF_CHANGE_ADD_HEAD, PF_CHANGE_ADD_TAIL, + PF_CHANGE_ADD_BEFORE, PF_CHANGE_ADD_AFTER, + PF_CHANGE_REMOVE, PF_CHANGE_GET_TICKET }; +enum { PF_GET_NONE, PF_GET_CLR_CNTR }; +enum { PF_SK_WIRE, PF_SK_STACK, PF_SK_BOTH }; + +/* + * Note about PFTM_*: real indices into pf_rule.timeout[] come before + * PFTM_MAX, special cases afterwards. See pf_state_expires(). + */ +enum { PFTM_TCP_FIRST_PACKET, PFTM_TCP_OPENING, PFTM_TCP_ESTABLISHED, + PFTM_TCP_CLOSING, PFTM_TCP_FIN_WAIT, PFTM_TCP_CLOSED, + PFTM_UDP_FIRST_PACKET, PFTM_UDP_SINGLE, PFTM_UDP_MULTIPLE, + PFTM_ICMP_FIRST_PACKET, PFTM_ICMP_ERROR_REPLY, + PFTM_OTHER_FIRST_PACKET, PFTM_OTHER_SINGLE, + PFTM_OTHER_MULTIPLE, PFTM_FRAG, PFTM_INTERVAL, + PFTM_ADAPTIVE_START, PFTM_ADAPTIVE_END, PFTM_SRC_NODE, + PFTM_TS_DIFF, PFTM_MAX, PFTM_PURGE, PFTM_UNLINKED }; + +/* PFTM default values */ +#define PFTM_TCP_FIRST_PACKET_VAL 120 /* First TCP packet */ +#define PFTM_TCP_OPENING_VAL 30 /* No response yet */ +#define PFTM_TCP_ESTABLISHED_VAL 24*60*60/* Established */ +#define PFTM_TCP_CLOSING_VAL 15 * 60 /* Half closed */ +#define PFTM_TCP_FIN_WAIT_VAL 45 /* Got both FINs */ +#define PFTM_TCP_CLOSED_VAL 90 /* Got a RST */ +#define PFTM_UDP_FIRST_PACKET_VAL 60 /* First UDP packet */ +#define PFTM_UDP_SINGLE_VAL 30 /* Unidirectional */ +#define PFTM_UDP_MULTIPLE_VAL 60 /* Bidirectional */ +#define PFTM_ICMP_FIRST_PACKET_VAL 20 /* First ICMP packet */ +#define PFTM_ICMP_ERROR_REPLY_VAL 10 /* Got error response */ +#define PFTM_OTHER_FIRST_PACKET_VAL 60 /* First packet */ +#define PFTM_OTHER_SINGLE_VAL 30 /* Unidirectional */ +#define PFTM_OTHER_MULTIPLE_VAL 60 /* Bidirectional */ +#define PFTM_FRAG_VAL 30 /* Fragment expire */ +#define PFTM_INTERVAL_VAL 10 /* Expire interval */ +#define PFTM_SRC_NODE_VAL 0 /* Source tracking */ +#define PFTM_TS_DIFF_VAL 30 /* Allowed TS diff */ + +enum { PF_NOPFROUTE, PF_FASTROUTE, PF_ROUTETO, PF_DUPTO, PF_REPLYTO }; +enum { PF_LIMIT_STATES, PF_LIMIT_SRC_NODES, PF_LIMIT_FRAGS, + PF_LIMIT_TABLE_ENTRIES, PF_LIMIT_MAX }; +#define PF_POOL_IDMASK 0x0f +enum { PF_POOL_NONE, PF_POOL_BITMASK, PF_POOL_RANDOM, + PF_POOL_SRCHASH, PF_POOL_ROUNDROBIN }; +enum { PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, PF_ADDR_DYNIFTL, + PF_ADDR_TABLE, PF_ADDR_URPFFAILED, + PF_ADDR_RANGE }; +#define PF_POOL_TYPEMASK 0x0f +#define PF_POOL_STICKYADDR 0x20 +#define PF_WSCALE_FLAG 0x80 +#define PF_WSCALE_MASK 0x0f + +#define PF_LOG 0x01 +#define PF_LOG_ALL 0x02 +#define PF_LOG_SOCKET_LOOKUP 0x04 + +/* Reasons code for passing/dropping a packet */ +#define PFRES_MATCH 0 /* Explicit match of a rule */ +#define PFRES_BADOFF 1 /* Bad offset for pull_hdr */ +#define PFRES_FRAG 2 /* Dropping following fragment */ +#define 
PFRES_SHORT 3 /* Dropping short packet */ +#define PFRES_NORM 4 /* Dropping by normalizer */ +#define PFRES_MEMORY 5 /* Dropped due to lacking mem */ +#define PFRES_TS 6 /* Bad TCP Timestamp (RFC1323) */ +#define PFRES_CONGEST 7 /* Congestion (of ipintrq) */ +#define PFRES_IPOPTIONS 8 /* IP option */ +#define PFRES_PROTCKSUM 9 /* Protocol checksum invalid */ +#define PFRES_BADSTATE 10 /* State mismatch */ +#define PFRES_STATEINS 11 /* State insertion failure */ +#define PFRES_MAXSTATES 12 /* State limit */ +#define PFRES_SRCLIMIT 13 /* Source node/conn limit */ +#define PFRES_SYNPROXY 14 /* SYN proxy */ +#define PFRES_MAPFAILED 15 /* pf_map_addr() failed */ +#define PFRES_MAX 16 /* total+1 */ + +#define PFRES_NAMES { \ + "match", \ + "bad-offset", \ + "fragment", \ + "short", \ + "normalize", \ + "memory", \ + "bad-timestamp", \ + "congestion", \ + "ip-option", \ + "proto-cksum", \ + "state-mismatch", \ + "state-insert", \ + "state-limit", \ + "src-limit", \ + "synproxy", \ + "map-failed", \ + NULL \ +} + +/* Counters for other things we want to keep track of */ +#define LCNT_STATES 0 /* states */ +#define LCNT_SRCSTATES 1 /* max-src-states */ +#define LCNT_SRCNODES 2 /* max-src-nodes */ +#define LCNT_SRCCONN 3 /* max-src-conn */ +#define LCNT_SRCCONNRATE 4 /* max-src-conn-rate */ +#define LCNT_OVERLOAD_TABLE 5 /* entry added to overload table */ +#define LCNT_OVERLOAD_FLUSH 6 /* state entries flushed */ +#define LCNT_MAX 7 /* total+1 */ + +#define LCNT_NAMES { \ + "max states per rule", \ + "max-src-states", \ + "max-src-nodes", \ + "max-src-conn", \ + "max-src-conn-rate", \ + "overload table insertion", \ + "overload flush states", \ + NULL \ +} + +/* state operation counters */ +#define FCNT_STATE_SEARCH 0 +#define FCNT_STATE_INSERT 1 +#define FCNT_STATE_REMOVALS 2 +#define FCNT_MAX 3 + +/* src_node operation counters */ +#define SCNT_SRC_NODE_SEARCH 0 +#define SCNT_SRC_NODE_INSERT 1 +#define SCNT_SRC_NODE_REMOVALS 2 +#define SCNT_MAX 3 + +#define PF_TABLE_NAME_SIZE 32 +#define PF_QNAME_SIZE 64 + +struct pf_status { + uint64_t counters[PFRES_MAX]; + uint64_t lcounters[LCNT_MAX]; + uint64_t fcounters[FCNT_MAX]; + uint64_t scounters[SCNT_MAX]; + uint64_t pcounters[2][2][3]; + uint64_t bcounters[2][2]; + uint32_t running; + uint32_t states; + uint32_t src_nodes; + uint32_t since; + uint32_t debug; + uint32_t hostid; + char ifname[IFNAMSIZ]; + uint8_t pf_chksum[PF_MD5_DIGEST_LENGTH]; +}; + +#endif /* _NET_PF_H_ */ diff --git a/freebsd/sys/netpfil/pf/pf_altq.h b/freebsd/sys/netpfil/pf/pf_altq.h new file mode 100644 index 00000000..3efd4ff7 --- /dev/null +++ b/freebsd/sys/netpfil/pf/pf_altq.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2001 Daniel Hartmeier + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $OpenBSD: pfvar.h,v 1.282 2009/01/29 15:12:28 pyr Exp $ + * $FreeBSD$ + */ + +#ifndef _NET_PF_ALTQ_H_ +#define _NET_PF_ALTQ_H_ + +struct cbq_opts { + u_int minburst; + u_int maxburst; + u_int pktsize; + u_int maxpktsize; + u_int ns_per_byte; + u_int maxidle; + int minidle; + u_int offtime; + int flags; +}; + +struct codel_opts { + u_int target; + u_int interval; + int ecn; +}; + +struct priq_opts { + int flags; +}; + +struct hfsc_opts { + /* real-time service curve */ + u_int rtsc_m1; /* slope of the 1st segment in bps */ + u_int rtsc_d; /* the x-projection of m1 in msec */ + u_int rtsc_m2; /* slope of the 2nd segment in bps */ + /* link-sharing service curve */ + u_int lssc_m1; + u_int lssc_d; + u_int lssc_m2; + /* upper-limit service curve */ + u_int ulsc_m1; + u_int ulsc_d; + u_int ulsc_m2; + int flags; +}; + +/* + * XXX this needs some work + */ +struct fairq_opts { + u_int nbuckets; + u_int hogs_m1; + int flags; + + /* link sharing service curve */ + u_int lssc_m1; + u_int lssc_d; + u_int lssc_m2; +}; + +struct pf_altq { + char ifname[IFNAMSIZ]; + + void *altq_disc; /* discipline-specific state */ + TAILQ_ENTRY(pf_altq) entries; + + /* scheduler spec */ + uint8_t scheduler; /* scheduler type */ + uint16_t tbrsize; /* tokenbucket regulator size */ + uint32_t ifbandwidth; /* interface bandwidth */ + + /* queue spec */ + char qname[PF_QNAME_SIZE]; /* queue name */ + char parent[PF_QNAME_SIZE]; /* parent name */ + uint32_t parent_qid; /* parent queue id */ + uint32_t bandwidth; /* queue bandwidth */ + uint8_t priority; /* priority */ + uint8_t local_flags; /* dynamic interface */ +#define PFALTQ_FLAG_IF_REMOVED 0x01 + + uint16_t qlimit; /* queue size limit */ + uint16_t flags; /* misc flags */ + union { + struct cbq_opts cbq_opts; + struct codel_opts codel_opts; + struct priq_opts priq_opts; + struct hfsc_opts hfsc_opts; + struct fairq_opts fairq_opts; + } pq_u; + + uint32_t qid; /* return value */ +}; + +#endif /* _NET_PF_ALTQ_H_ */ diff --git a/freebsd/sys/netpfil/pf/pf_if.c b/freebsd/sys/netpfil/pf/pf_if.c new file mode 100644 index 00000000..d1c54b22 --- /dev/null +++ b/freebsd/sys/netpfil/pf/pf_if.c @@ -0,0 +1,924 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2003 Cedric Berger + * Copyright (c) 2005 Henning Brauer <henning@openbsd.org> + * Copyright (c) 2005 Ryan McBride <mcbride@openbsd.org> + * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $OpenBSD: pf_if.c,v 1.54 2008/06/14 16:55:28 mk Exp $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/kernel.h> +#include <sys/eventhandler.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/mbuf.h> +#include <sys/rwlock.h> +#include <sys/socket.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/vnet.h> +#include <net/pfvar.h> +#include <net/route.h> + +VNET_DEFINE(struct pfi_kif *, pfi_all); +static VNET_DEFINE(long, pfi_update); +#define V_pfi_update VNET(pfi_update) +#define PFI_BUFFER_MAX 0x10000 + +VNET_DECLARE(int, pf_vnet_active); +#define V_pf_vnet_active VNET(pf_vnet_active) + +static VNET_DEFINE(struct pfr_addr *, pfi_buffer); +static VNET_DEFINE(int, pfi_buffer_cnt); +static VNET_DEFINE(int, pfi_buffer_max); +#define V_pfi_buffer VNET(pfi_buffer) +#define V_pfi_buffer_cnt VNET(pfi_buffer_cnt) +#define V_pfi_buffer_max VNET(pfi_buffer_max) + +eventhandler_tag pfi_attach_cookie; +eventhandler_tag pfi_detach_cookie; +eventhandler_tag pfi_attach_group_cookie; +eventhandler_tag pfi_change_group_cookie; +eventhandler_tag pfi_detach_group_cookie; +eventhandler_tag pfi_ifaddr_event_cookie; + +static void pfi_attach_ifnet(struct ifnet *); +static void pfi_attach_ifgroup(struct ifg_group *); + +static void pfi_kif_update(struct pfi_kif *); +static void pfi_dynaddr_update(struct pfi_dynaddr *dyn); +static void pfi_table_update(struct pfr_ktable *, struct pfi_kif *, int, + int); +static void pfi_instance_add(struct ifnet *, int, int); +static void pfi_address_add(struct sockaddr *, int, int); +static int pfi_if_compare(struct pfi_kif *, struct pfi_kif *); +static int pfi_skip_if(const char *, struct pfi_kif *); +static int pfi_unmask(void *); +static void pfi_attach_ifnet_event(void * __unused, struct ifnet *); +static void pfi_detach_ifnet_event(void * __unused, struct ifnet *); +static void pfi_attach_group_event(void *, struct ifg_group *); +static void pfi_change_group_event(void *, char *); +static void pfi_detach_group_event(void *, struct ifg_group *); +static void pfi_ifaddr_event(void * __unused, struct ifnet *); + +RB_HEAD(pfi_ifhead, pfi_kif); +static RB_PROTOTYPE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare); +static RB_GENERATE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare); +static VNET_DEFINE(struct pfi_ifhead, pfi_ifs); +#define V_pfi_ifs VNET(pfi_ifs) + +#define PFI_BUFFER_MAX 0x10000 
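+/*
+ * Background: the VNET_DEFINE()/VNET() idiom used above gives every
+ * virtual network stack (VIMAGE jail) a private instance of these
+ * globals; on kernels built without VIMAGE the macros collapse into
+ * plain file-scope variables.  A minimal sketch of the idiom, with a
+ * hypothetical name:
+ *
+ *	static VNET_DEFINE(int, example_cnt);
+ *	#define	V_example_cnt	VNET(example_cnt)
+ *
+ * after which V_example_cnt reads and writes the instance belonging
+ * to the current vnet (curvnet).
+ */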
+MALLOC_DEFINE(PFI_MTYPE, "pf_ifnet", "pf(4) interface database");
+
+LIST_HEAD(pfi_list, pfi_kif);
+static VNET_DEFINE(struct pfi_list, pfi_unlinked_kifs);
+#define	V_pfi_unlinked_kifs	VNET(pfi_unlinked_kifs)
+static struct mtx pfi_unlnkdkifs_mtx;
+MTX_SYSINIT(pfi_unlnkdkifs_mtx, &pfi_unlnkdkifs_mtx, "pf unlinked interfaces",
+    MTX_DEF);
+
+void
+pfi_initialize_vnet(void)
+{
+	struct ifg_group *ifg;
+	struct ifnet *ifp;
+	struct pfi_kif *kif;
+
+	V_pfi_buffer_max = 64;
+	V_pfi_buffer = malloc(V_pfi_buffer_max * sizeof(*V_pfi_buffer),
+	    PFI_MTYPE, M_WAITOK);
+
+	kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+	PF_RULES_WLOCK();
+	V_pfi_all = pfi_kif_attach(kif, IFG_ALL);
+	PF_RULES_WUNLOCK();
+
+	IFNET_RLOCK();
+	TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
+		pfi_attach_ifgroup(ifg);
+	TAILQ_FOREACH(ifp, &V_ifnet, if_link)
+		pfi_attach_ifnet(ifp);
+	IFNET_RUNLOCK();
+}
+
+void
+pfi_initialize(void)
+{
+
+	pfi_attach_cookie = EVENTHANDLER_REGISTER(ifnet_arrival_event,
+	    pfi_attach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
+	pfi_detach_cookie = EVENTHANDLER_REGISTER(ifnet_departure_event,
+	    pfi_detach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
+	pfi_attach_group_cookie = EVENTHANDLER_REGISTER(group_attach_event,
+	    pfi_attach_group_event, curvnet, EVENTHANDLER_PRI_ANY);
+	pfi_change_group_cookie = EVENTHANDLER_REGISTER(group_change_event,
+	    pfi_change_group_event, curvnet, EVENTHANDLER_PRI_ANY);
+	pfi_detach_group_cookie = EVENTHANDLER_REGISTER(group_detach_event,
+	    pfi_detach_group_event, curvnet, EVENTHANDLER_PRI_ANY);
+	pfi_ifaddr_event_cookie = EVENTHANDLER_REGISTER(ifaddr_event,
+	    pfi_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY);
+}
+
+void
+pfi_cleanup_vnet(void)
+{
+	struct pfi_kif *kif;
+
+	PF_RULES_WASSERT();
+
+	V_pfi_all = NULL;
+	while ((kif = RB_MIN(pfi_ifhead, &V_pfi_ifs))) {
+		RB_REMOVE(pfi_ifhead, &V_pfi_ifs, kif);
+		if (kif->pfik_group)
+			kif->pfik_group->ifg_pf_kif = NULL;
+		if (kif->pfik_ifp)
+			kif->pfik_ifp->if_pf_kif = NULL;
+		free(kif, PFI_MTYPE);
+	}
+
+	mtx_lock(&pfi_unlnkdkifs_mtx);
+	while ((kif = LIST_FIRST(&V_pfi_unlinked_kifs))) {
+		LIST_REMOVE(kif, pfik_list);
+		free(kif, PFI_MTYPE);
+	}
+	mtx_unlock(&pfi_unlnkdkifs_mtx);
+
+	free(V_pfi_buffer, PFI_MTYPE);
+}
+
+void
+pfi_cleanup(void)
+{
+
+	EVENTHANDLER_DEREGISTER(ifnet_arrival_event, pfi_attach_cookie);
+	EVENTHANDLER_DEREGISTER(ifnet_departure_event, pfi_detach_cookie);
+	EVENTHANDLER_DEREGISTER(group_attach_event, pfi_attach_group_cookie);
+	EVENTHANDLER_DEREGISTER(group_change_event, pfi_change_group_cookie);
+	EVENTHANDLER_DEREGISTER(group_detach_event, pfi_detach_group_cookie);
+	EVENTHANDLER_DEREGISTER(ifaddr_event, pfi_ifaddr_event_cookie);
+}
+
+struct pfi_kif *
+pfi_kif_find(const char *kif_name)
+{
+	struct pfi_kif_cmp s;
+
+	PF_RULES_ASSERT();
+
+	bzero(&s, sizeof(s));
+	strlcpy(s.pfik_name, kif_name, sizeof(s.pfik_name));
+
+	return (RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kif *)&s));
+}
+
+struct pfi_kif *
+pfi_kif_attach(struct pfi_kif *kif, const char *kif_name)
+{
+	struct pfi_kif *kif1;
+
+	PF_RULES_WASSERT();
+	KASSERT(kif != NULL, ("%s: null kif", __func__));
+
+	kif1 = pfi_kif_find(kif_name);
+	if (kif1 != NULL) {
+		free(kif, PFI_MTYPE);
+		return (kif1);
+	}
+
+	bzero(kif, sizeof(*kif));
+	strlcpy(kif->pfik_name, kif_name, sizeof(kif->pfik_name));
+	/*
+	 * It seems that the value of time_second is in an uninitialized
+	 * state when pf sets the interface statistics clear time during
+	 * the boot phase if pf was statically linked into the kernel.
Instead of setting the bogus + * time value have pfi_get_ifaces handle this case. In + * pfi_get_ifaces it uses time_second if it sees the time is 0. + */ + kif->pfik_tzero = time_second > 1 ? time_second : 0; + TAILQ_INIT(&kif->pfik_dynaddrs); + + RB_INSERT(pfi_ifhead, &V_pfi_ifs, kif); + + return (kif); +} + +void +pfi_kif_ref(struct pfi_kif *kif) +{ + + PF_RULES_WASSERT(); + kif->pfik_rulerefs++; +} + +void +pfi_kif_unref(struct pfi_kif *kif) +{ + + PF_RULES_WASSERT(); + KASSERT(kif->pfik_rulerefs > 0, ("%s: %p has zero refs", __func__, kif)); + + kif->pfik_rulerefs--; + + if (kif->pfik_rulerefs > 0) + return; + + /* kif referencing an existing ifnet or group should exist. */ + if (kif->pfik_ifp != NULL || kif->pfik_group != NULL || kif == V_pfi_all) + return; + + RB_REMOVE(pfi_ifhead, &V_pfi_ifs, kif); + + kif->pfik_flags |= PFI_IFLAG_REFS; + + mtx_lock(&pfi_unlnkdkifs_mtx); + LIST_INSERT_HEAD(&V_pfi_unlinked_kifs, kif, pfik_list); + mtx_unlock(&pfi_unlnkdkifs_mtx); +} + +void +pfi_kif_purge(void) +{ + struct pfi_kif *kif, *kif1; + + /* + * Do naive mark-and-sweep garbage collecting of old kifs. + * Reference flag is raised by pf_purge_expired_states(). + */ + mtx_lock(&pfi_unlnkdkifs_mtx); + LIST_FOREACH_SAFE(kif, &V_pfi_unlinked_kifs, pfik_list, kif1) { + if (!(kif->pfik_flags & PFI_IFLAG_REFS)) { + LIST_REMOVE(kif, pfik_list); + free(kif, PFI_MTYPE); + } else + kif->pfik_flags &= ~PFI_IFLAG_REFS; + } + mtx_unlock(&pfi_unlnkdkifs_mtx); +} + +int +pfi_kif_match(struct pfi_kif *rule_kif, struct pfi_kif *packet_kif) +{ + struct ifg_list *p; + + if (rule_kif == NULL || rule_kif == packet_kif) + return (1); + + if (rule_kif->pfik_group != NULL) + /* XXXGL: locking? */ + TAILQ_FOREACH(p, &packet_kif->pfik_ifp->if_groups, ifgl_next) + if (p->ifgl_group == rule_kif->pfik_group) + return (1); + + return (0); +} + +static void +pfi_attach_ifnet(struct ifnet *ifp) +{ + struct pfi_kif *kif; + + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + + PF_RULES_WLOCK(); + V_pfi_update++; + kif = pfi_kif_attach(kif, ifp->if_xname); + + kif->pfik_ifp = ifp; + ifp->if_pf_kif = kif; + + pfi_kif_update(kif); + PF_RULES_WUNLOCK(); +} + +static void +pfi_attach_ifgroup(struct ifg_group *ifg) +{ + struct pfi_kif *kif; + + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + + PF_RULES_WLOCK(); + V_pfi_update++; + kif = pfi_kif_attach(kif, ifg->ifg_group); + + kif->pfik_group = ifg; + ifg->ifg_pf_kif = kif; + PF_RULES_WUNLOCK(); +} + +int +pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + switch (dyn->pfid_acnt4) { + case 0: + return (0); + case 1: + return (PF_MATCHA(0, &dyn->pfid_addr4, + &dyn->pfid_mask4, a, AF_INET)); + default: + return (pfr_match_addr(dyn->pfid_kt, a, AF_INET)); + } + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + switch (dyn->pfid_acnt6) { + case 0: + return (0); + case 1: + return (PF_MATCHA(0, &dyn->pfid_addr6, + &dyn->pfid_mask6, a, AF_INET6)); + default: + return (pfr_match_addr(dyn->pfid_kt, a, AF_INET6)); + } + break; +#endif /* INET6 */ + default: + return (0); + } +} + +int +pfi_dynaddr_setup(struct pf_addr_wrap *aw, sa_family_t af) +{ + struct pfi_dynaddr *dyn; + char tblname[PF_TABLE_NAME_SIZE]; + struct pf_ruleset *ruleset = NULL; + struct pfi_kif *kif; + int rv = 0; + + PF_RULES_WASSERT(); + KASSERT(aw->type == PF_ADDR_DYNIFTL, ("%s: type %u", + __func__, aw->type)); + KASSERT(aw->p.dyn == NULL, ("%s: dyn is %p", __func__, aw->p.dyn)); + + if ((dyn = malloc(sizeof(*dyn), PFI_MTYPE, M_NOWAIT 
| M_ZERO)) == NULL) + return (ENOMEM); + + if ((kif = malloc(sizeof(*kif), PFI_MTYPE, M_NOWAIT)) == NULL) { + free(dyn, PFI_MTYPE); + return (ENOMEM); + } + + if (!strcmp(aw->v.ifname, "self")) + dyn->pfid_kif = pfi_kif_attach(kif, IFG_ALL); + else + dyn->pfid_kif = pfi_kif_attach(kif, aw->v.ifname); + pfi_kif_ref(dyn->pfid_kif); + + dyn->pfid_net = pfi_unmask(&aw->v.a.mask); + if (af == AF_INET && dyn->pfid_net == 32) + dyn->pfid_net = 128; + strlcpy(tblname, aw->v.ifname, sizeof(tblname)); + if (aw->iflags & PFI_AFLAG_NETWORK) + strlcat(tblname, ":network", sizeof(tblname)); + if (aw->iflags & PFI_AFLAG_BROADCAST) + strlcat(tblname, ":broadcast", sizeof(tblname)); + if (aw->iflags & PFI_AFLAG_PEER) + strlcat(tblname, ":peer", sizeof(tblname)); + if (aw->iflags & PFI_AFLAG_NOALIAS) + strlcat(tblname, ":0", sizeof(tblname)); + if (dyn->pfid_net != 128) + snprintf(tblname + strlen(tblname), + sizeof(tblname) - strlen(tblname), "/%d", dyn->pfid_net); + if ((ruleset = pf_find_or_create_ruleset(PF_RESERVED_ANCHOR)) == NULL) { + rv = ENOMEM; + goto _bad; + } + + if ((dyn->pfid_kt = pfr_attach_table(ruleset, tblname)) == NULL) { + rv = ENOMEM; + goto _bad; + } + + dyn->pfid_kt->pfrkt_flags |= PFR_TFLAG_ACTIVE; + dyn->pfid_iflags = aw->iflags; + dyn->pfid_af = af; + + TAILQ_INSERT_TAIL(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry); + aw->p.dyn = dyn; + pfi_kif_update(dyn->pfid_kif); + + return (0); + +_bad: + if (dyn->pfid_kt != NULL) + pfr_detach_table(dyn->pfid_kt); + if (ruleset != NULL) + pf_remove_if_empty_ruleset(ruleset); + if (dyn->pfid_kif != NULL) + pfi_kif_unref(dyn->pfid_kif); + free(dyn, PFI_MTYPE); + + return (rv); +} + +static void +pfi_kif_update(struct pfi_kif *kif) +{ + struct ifg_list *ifgl; + struct pfi_dynaddr *p; + + PF_RULES_WASSERT(); + + /* update all dynaddr */ + TAILQ_FOREACH(p, &kif->pfik_dynaddrs, entry) + pfi_dynaddr_update(p); + + /* again for all groups kif is member of */ + if (kif->pfik_ifp != NULL) { + IF_ADDR_RLOCK(kif->pfik_ifp); + TAILQ_FOREACH(ifgl, &kif->pfik_ifp->if_groups, ifgl_next) + pfi_kif_update((struct pfi_kif *) + ifgl->ifgl_group->ifg_pf_kif); + IF_ADDR_RUNLOCK(kif->pfik_ifp); + } +} + +static void +pfi_dynaddr_update(struct pfi_dynaddr *dyn) +{ + struct pfi_kif *kif; + struct pfr_ktable *kt; + + PF_RULES_WASSERT(); + KASSERT(dyn && dyn->pfid_kif && dyn->pfid_kt, + ("%s: bad argument", __func__)); + + kif = dyn->pfid_kif; + kt = dyn->pfid_kt; + + if (kt->pfrkt_larg != V_pfi_update) { + /* this table needs to be brought up-to-date */ + pfi_table_update(kt, kif, dyn->pfid_net, dyn->pfid_iflags); + kt->pfrkt_larg = V_pfi_update; + } + pfr_dynaddr_update(kt, dyn); +} + +static void +pfi_table_update(struct pfr_ktable *kt, struct pfi_kif *kif, int net, int flags) +{ + int e, size2 = 0; + struct ifg_member *ifgm; + + V_pfi_buffer_cnt = 0; + + if (kif->pfik_ifp != NULL) + pfi_instance_add(kif->pfik_ifp, net, flags); + else if (kif->pfik_group != NULL) { + IFNET_RLOCK_NOSLEEP(); + TAILQ_FOREACH(ifgm, &kif->pfik_group->ifg_members, ifgm_next) + pfi_instance_add(ifgm->ifgm_ifp, net, flags); + IFNET_RUNLOCK_NOSLEEP(); + } + + if ((e = pfr_set_addrs(&kt->pfrkt_t, V_pfi_buffer, V_pfi_buffer_cnt, &size2, + NULL, NULL, NULL, 0, PFR_TFLAG_ALLMASK))) + printf("%s: cannot set %d new addresses into table %s: %d\n", + __func__, V_pfi_buffer_cnt, kt->pfrkt_name, e); +} + +static void +pfi_instance_add(struct ifnet *ifp, int net, int flags) +{ + struct ifaddr *ia; + int got4 = 0, got6 = 0; + int net2, af; + + IF_ADDR_RLOCK(ifp); + TAILQ_FOREACH(ia, &ifp->if_addrhead, 
ifa_list) { + if (ia->ifa_addr == NULL) + continue; + af = ia->ifa_addr->sa_family; + if (af != AF_INET && af != AF_INET6) + continue; + /* + * XXX: For point-to-point interfaces, (ifname:0) and IPv4, + * jump over addresses without a proper route to work + * around a problem with ppp not fully removing the + * address used during IPCP. + */ + if ((ifp->if_flags & IFF_POINTOPOINT) && + !(ia->ifa_flags & IFA_ROUTE) && + (flags & PFI_AFLAG_NOALIAS) && (af == AF_INET)) + continue; + if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6) + continue; + if ((flags & PFI_AFLAG_BROADCAST) && + !(ifp->if_flags & IFF_BROADCAST)) + continue; + if ((flags & PFI_AFLAG_PEER) && + !(ifp->if_flags & IFF_POINTOPOINT)) + continue; + if ((flags & PFI_AFLAG_NETWORK) && af == AF_INET6 && + IN6_IS_ADDR_LINKLOCAL( + &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr)) + continue; + if (flags & PFI_AFLAG_NOALIAS) { + if (af == AF_INET && got4) + continue; + if (af == AF_INET6 && got6) + continue; + } + if (af == AF_INET) + got4 = 1; + else if (af == AF_INET6) + got6 = 1; + net2 = net; + if (net2 == 128 && (flags & PFI_AFLAG_NETWORK)) { + if (af == AF_INET) + net2 = pfi_unmask(&((struct sockaddr_in *) + ia->ifa_netmask)->sin_addr); + else if (af == AF_INET6) + net2 = pfi_unmask(&((struct sockaddr_in6 *) + ia->ifa_netmask)->sin6_addr); + } + if (af == AF_INET && net2 > 32) + net2 = 32; + if (flags & PFI_AFLAG_BROADCAST) + pfi_address_add(ia->ifa_broadaddr, af, net2); + else if (flags & PFI_AFLAG_PEER) + pfi_address_add(ia->ifa_dstaddr, af, net2); + else + pfi_address_add(ia->ifa_addr, af, net2); + } + IF_ADDR_RUNLOCK(ifp); +} + +static void +pfi_address_add(struct sockaddr *sa, int af, int net) +{ + struct pfr_addr *p; + int i; + + if (V_pfi_buffer_cnt >= V_pfi_buffer_max) { + int new_max = V_pfi_buffer_max * 2; + + if (new_max > PFI_BUFFER_MAX) { + printf("%s: address buffer full (%d/%d)\n", __func__, + V_pfi_buffer_cnt, PFI_BUFFER_MAX); + return; + } + p = malloc(new_max * sizeof(*V_pfi_buffer), PFI_MTYPE, + M_NOWAIT); + if (p == NULL) { + printf("%s: no memory to grow buffer (%d/%d)\n", + __func__, V_pfi_buffer_cnt, PFI_BUFFER_MAX); + return; + } + memcpy(p, V_pfi_buffer, V_pfi_buffer_max * sizeof(*V_pfi_buffer)); + /* no need to zero buffer */ + free(V_pfi_buffer, PFI_MTYPE); + V_pfi_buffer = p; + V_pfi_buffer_max = new_max; + } + if (af == AF_INET && net > 32) + net = 128; + p = V_pfi_buffer + V_pfi_buffer_cnt++; + bzero(p, sizeof(*p)); + p->pfra_af = af; + p->pfra_net = net; + if (af == AF_INET) + p->pfra_ip4addr = ((struct sockaddr_in *)sa)->sin_addr; + else if (af == AF_INET6) { + p->pfra_ip6addr = ((struct sockaddr_in6 *)sa)->sin6_addr; + if (IN6_IS_SCOPE_EMBED(&p->pfra_ip6addr)) + p->pfra_ip6addr.s6_addr16[1] = 0; + } + /* mask network address bits */ + if (net < 128) + ((caddr_t)p)[p->pfra_net/8] &= ~(0xFF >> (p->pfra_net%8)); + for (i = (p->pfra_net+7)/8; i < sizeof(p->pfra_u); i++) + ((caddr_t)p)[i] = 0; +} + +void +pfi_dynaddr_remove(struct pfi_dynaddr *dyn) +{ + + KASSERT(dyn->pfid_kif != NULL, ("%s: null pfid_kif", __func__)); + KASSERT(dyn->pfid_kt != NULL, ("%s: null pfid_kt", __func__)); + + TAILQ_REMOVE(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry); + pfi_kif_unref(dyn->pfid_kif); + pfr_detach_table(dyn->pfid_kt); + free(dyn, PFI_MTYPE); +} + +void +pfi_dynaddr_copyout(struct pf_addr_wrap *aw) +{ + + KASSERT(aw->type == PF_ADDR_DYNIFTL, + ("%s: type %u", __func__, aw->type)); + + if (aw->p.dyn == NULL || aw->p.dyn->pfid_kif == NULL) + return; + aw->p.dyncnt = aw->p.dyn->pfid_acnt4 + 
aw->p.dyn->pfid_acnt6; +} + +static int +pfi_if_compare(struct pfi_kif *p, struct pfi_kif *q) +{ + return (strncmp(p->pfik_name, q->pfik_name, IFNAMSIZ)); +} + +void +pfi_update_status(const char *name, struct pf_status *pfs) +{ + struct pfi_kif *p; + struct pfi_kif_cmp key; + struct ifg_member p_member, *ifgm; + TAILQ_HEAD(, ifg_member) ifg_members; + int i, j, k; + + strlcpy(key.pfik_name, name, sizeof(key.pfik_name)); + p = RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kif *)&key); + if (p == NULL) + return; + + if (p->pfik_group != NULL) { + bcopy(&p->pfik_group->ifg_members, &ifg_members, + sizeof(ifg_members)); + } else { + /* build a temporary list for p only */ + bzero(&p_member, sizeof(p_member)); + p_member.ifgm_ifp = p->pfik_ifp; + TAILQ_INIT(&ifg_members); + TAILQ_INSERT_TAIL(&ifg_members, &p_member, ifgm_next); + } + if (pfs) { + bzero(pfs->pcounters, sizeof(pfs->pcounters)); + bzero(pfs->bcounters, sizeof(pfs->bcounters)); + } + TAILQ_FOREACH(ifgm, &ifg_members, ifgm_next) { + if (ifgm->ifgm_ifp == NULL || ifgm->ifgm_ifp->if_pf_kif == NULL) + continue; + p = (struct pfi_kif *)ifgm->ifgm_ifp->if_pf_kif; + + /* just clear statistics */ + if (pfs == NULL) { + bzero(p->pfik_packets, sizeof(p->pfik_packets)); + bzero(p->pfik_bytes, sizeof(p->pfik_bytes)); + p->pfik_tzero = time_second; + continue; + } + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) { + pfs->pcounters[i][j][k] += + p->pfik_packets[i][j][k]; + pfs->bcounters[i][j] += + p->pfik_bytes[i][j][k]; + } + } +} + +void +pfi_get_ifaces(const char *name, struct pfi_kif *buf, int *size) +{ + struct pfi_kif *p, *nextp; + int n = 0; + + for (p = RB_MIN(pfi_ifhead, &V_pfi_ifs); p; p = nextp) { + nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p); + if (pfi_skip_if(name, p)) + continue; + if (*size <= n++) + break; + if (!p->pfik_tzero) + p->pfik_tzero = time_second; + bcopy(p, buf++, sizeof(*buf)); + nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p); + } + *size = n; +} + +static int +pfi_skip_if(const char *filter, struct pfi_kif *p) +{ + int n; + + if (filter == NULL || !*filter) + return (0); + if (!strcmp(p->pfik_name, filter)) + return (0); /* exact match */ + n = strlen(filter); + if (n < 1 || n >= IFNAMSIZ) + return (1); /* sanity check */ + if (filter[n-1] >= '0' && filter[n-1] <= '9') + return (1); /* only do exact match in that case */ + if (strncmp(p->pfik_name, filter, n)) + return (1); /* prefix doesn't match */ + return (p->pfik_name[n] < '0' || p->pfik_name[n] > '9'); +} + +int +pfi_set_flags(const char *name, int flags) +{ + struct pfi_kif *p; + + RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) { + if (pfi_skip_if(name, p)) + continue; + p->pfik_flags |= flags; + } + return (0); +} + +int +pfi_clear_flags(const char *name, int flags) +{ + struct pfi_kif *p; + + RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) { + if (pfi_skip_if(name, p)) + continue; + p->pfik_flags &= ~flags; + } + return (0); +} + +/* from pf_print_state.c */ +static int +pfi_unmask(void *addr) +{ + struct pf_addr *m = addr; + int i = 31, j = 0, b = 0; + u_int32_t tmp; + + while (j < 4 && m->addr32[j] == 0xffffffff) { + b += 32; + j++; + } + if (j < 4) { + tmp = ntohl(m->addr32[j]); + for (i = 31; tmp & (1 << i); --i) + b++; + } + return (b); +} + +static void +pfi_attach_ifnet_event(void *arg __unused, struct ifnet *ifp) +{ + + CURVNET_SET(ifp->if_vnet); + if (V_pf_vnet_active == 0) { + /* Avoid teardown race in the least expensive way. 
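+	 * V_pf_vnet_active is set only after pf for this vnet is fully
+	 * initialized and is cleared before teardown starts, so bail out
+	 * here while it is zero.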
*/ + CURVNET_RESTORE(); + return; + } + pfi_attach_ifnet(ifp); +#ifdef ALTQ + PF_RULES_WLOCK(); + pf_altq_ifnet_event(ifp, 0); + PF_RULES_WUNLOCK(); +#endif + CURVNET_RESTORE(); +} + +static void +pfi_detach_ifnet_event(void *arg __unused, struct ifnet *ifp) +{ + struct pfi_kif *kif = (struct pfi_kif *)ifp->if_pf_kif; + + if (kif == NULL) + return; + + CURVNET_SET(ifp->if_vnet); + if (V_pf_vnet_active == 0) { + /* Avoid teardown race in the least expensive way. */ + CURVNET_RESTORE(); + return; + } + PF_RULES_WLOCK(); + V_pfi_update++; + pfi_kif_update(kif); + + kif->pfik_ifp = NULL; + ifp->if_pf_kif = NULL; +#ifdef ALTQ + pf_altq_ifnet_event(ifp, 1); +#endif + PF_RULES_WUNLOCK(); + CURVNET_RESTORE(); +} + +static void +pfi_attach_group_event(void *arg , struct ifg_group *ifg) +{ + + CURVNET_SET((struct vnet *)arg); + if (V_pf_vnet_active == 0) { + /* Avoid teardown race in the least expensive way. */ + CURVNET_RESTORE(); + return; + } + pfi_attach_ifgroup(ifg); + CURVNET_RESTORE(); +} + +static void +pfi_change_group_event(void *arg, char *gname) +{ + struct pfi_kif *kif; + + CURVNET_SET((struct vnet *)arg); + if (V_pf_vnet_active == 0) { + /* Avoid teardown race in the least expensive way. */ + CURVNET_RESTORE(); + return; + } + + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + PF_RULES_WLOCK(); + V_pfi_update++; + kif = pfi_kif_attach(kif, gname); + pfi_kif_update(kif); + PF_RULES_WUNLOCK(); + CURVNET_RESTORE(); +} + +static void +pfi_detach_group_event(void *arg, struct ifg_group *ifg) +{ + struct pfi_kif *kif = (struct pfi_kif *)ifg->ifg_pf_kif; + + if (kif == NULL) + return; + + CURVNET_SET((struct vnet *)arg); + if (V_pf_vnet_active == 0) { + /* Avoid teardown race in the least expensive way. */ + CURVNET_RESTORE(); + return; + } + PF_RULES_WLOCK(); + V_pfi_update++; + + kif->pfik_group = NULL; + ifg->ifg_pf_kif = NULL; + PF_RULES_WUNLOCK(); + CURVNET_RESTORE(); +} + +static void +pfi_ifaddr_event(void *arg __unused, struct ifnet *ifp) +{ + if (ifp->if_pf_kif == NULL) + return; + + CURVNET_SET(ifp->if_vnet); + if (V_pf_vnet_active == 0) { + /* Avoid teardown race in the least expensive way. */ + CURVNET_RESTORE(); + return; + } + PF_RULES_WLOCK(); + if (ifp && ifp->if_pf_kif) { + V_pfi_update++; + pfi_kif_update(ifp->if_pf_kif); + } + PF_RULES_WUNLOCK(); + CURVNET_RESTORE(); +} diff --git a/freebsd/sys/netpfil/pf/pf_ioctl.c b/freebsd/sys/netpfil/pf/pf_ioctl.c new file mode 100644 index 00000000..9c1523ca --- /dev/null +++ b/freebsd/sys/netpfil/pf/pf_ioctl.c @@ -0,0 +1,3872 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002,2003 Henning Brauer + * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. + * + * $OpenBSD: pf_ioctl.c,v 1.213 2009/02/15 21:46:12 mbalmer Exp $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> +#include <rtems/bsd/local/opt_bpf.h> +#include <rtems/bsd/local/opt_pf.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/endian.h> +#include <sys/fcntl.h> +#include <sys/filio.h> +#include <sys/interrupt.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/smp.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/md5.h> +#include <sys/ucred.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/vnet.h> +#include <net/route.h> +#include <net/pfil.h> +#include <net/pfvar.h> +#include <net/if_pfsync.h> +#include <net/if_pflog.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet6/ip6_var.h> +#include <netinet/ip_icmp.h> + +#ifdef INET6 +#include <netinet/ip6.h> +#endif /* INET6 */ + +#ifdef ALTQ +#include <net/altq/altq.h> +#endif + +static struct pf_pool *pf_get_pool(char *, u_int32_t, u_int8_t, u_int32_t, + u_int8_t, u_int8_t, u_int8_t); + +static void pf_mv_pool(struct pf_palist *, struct pf_palist *); +static void pf_empty_pool(struct pf_palist *); +static int pfioctl(struct cdev *, u_long, caddr_t, int, + struct thread *); +#ifdef ALTQ +static int pf_begin_altq(u_int32_t *); +static int pf_rollback_altq(u_int32_t); +static int pf_commit_altq(u_int32_t); +static int pf_enable_altq(struct pf_altq *); +static int pf_disable_altq(struct pf_altq *); +static u_int32_t pf_qname2qid(char *); +static void pf_qid_unref(u_int32_t); +#endif /* ALTQ */ +static int pf_begin_rules(u_int32_t *, int, const char *); +static int pf_rollback_rules(u_int32_t, int, char *); +static int pf_setup_pfsync_matching(struct pf_ruleset *); +static void pf_hash_rule(MD5_CTX *, struct pf_rule *); +static void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *); +static int pf_commit_rules(u_int32_t, int, char *); +static int pf_addr_setup(struct pf_ruleset *, + struct pf_addr_wrap *, sa_family_t); +static void pf_addr_copyout(struct pf_addr_wrap *); + +VNET_DEFINE(struct pf_rule, pf_default_rule); + +#ifdef ALTQ +static VNET_DEFINE(int, pf_altq_running); +#define V_pf_altq_running VNET(pf_altq_running) +#endif + +#define TAGID_MAX 50000 +struct pf_tagname { + TAILQ_ENTRY(pf_tagname) entries; + char name[PF_TAG_NAME_SIZE]; + uint16_t tag; + int ref; +}; + +TAILQ_HEAD(pf_tags, pf_tagname); +#define V_pf_tags VNET(pf_tags) +VNET_DEFINE(struct pf_tags, pf_tags); +#define V_pf_qids 
VNET(pf_qids)
+VNET_DEFINE(struct pf_tags, pf_qids);
+static MALLOC_DEFINE(M_PFTAG, "pf_tag", "pf(4) tag names");
+static MALLOC_DEFINE(M_PFALTQ, "pf_altq", "pf(4) altq configuration db");
+static MALLOC_DEFINE(M_PFRULE, "pf_rule", "pf(4) rules");
+
+#if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE)
+#error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE
+#endif
+
+static u_int16_t	 tagname2tag(struct pf_tags *, char *);
+static u_int16_t	 pf_tagname2tag(char *);
+static void		 tag_unref(struct pf_tags *, u_int16_t);
+
+#define	DPFPRINTF(n, x)	if (V_pf_status.debug >= (n)) printf x
+
+struct cdev *pf_dev;
+
+/*
+ * XXX - These are new and need to be checked when moving to a new version
+ */
+static void		 pf_clear_states(void);
+static int		 pf_clear_tables(void);
+static void		 pf_clear_srcnodes(struct pf_src_node *);
+static void		 pf_kill_srcnodes(struct pfioc_src_node_kill *);
+static void		 pf_tbladdr_copyout(struct pf_addr_wrap *);
+
+/*
+ * Wrapper functions for pfil(9) hooks
+ */
+#ifdef INET
+static int pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp,
+    int dir, struct inpcb *inp);
+static int pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp,
+    int dir, struct inpcb *inp);
+#endif
+#ifdef INET6
+static int pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp,
+    int dir, struct inpcb *inp);
+static int pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp,
+    int dir, struct inpcb *inp);
+#endif
+
+static int		 hook_pf(void);
+static int		 dehook_pf(void);
+static int		 shutdown_pf(void);
+static int		 pf_load(void);
+static int		 pf_unload(void);
+
+static struct cdevsw pf_cdevsw = {
+	.d_ioctl =	pfioctl,
+	.d_name =	PF_NAME,
+	.d_version =	D_VERSION,
+};
+
+static volatile VNET_DEFINE(int, pf_pfil_hooked);
+#define V_pf_pfil_hooked	VNET(pf_pfil_hooked)
+
+/*
+ * We need a flag that is neither hooked nor running to know when
+ * the VNET is "valid".  We primarily need this to control (global)
+ * external events, e.g., eventhandlers.
+ */ +VNET_DEFINE(int, pf_vnet_active); +#define V_pf_vnet_active VNET(pf_vnet_active) + +int pf_end_threads; + +struct rwlock pf_rules_lock; +struct sx pf_ioctl_lock; + +/* pfsync */ +pfsync_state_import_t *pfsync_state_import_ptr = NULL; +pfsync_insert_state_t *pfsync_insert_state_ptr = NULL; +pfsync_update_state_t *pfsync_update_state_ptr = NULL; +pfsync_delete_state_t *pfsync_delete_state_ptr = NULL; +pfsync_clear_states_t *pfsync_clear_states_ptr = NULL; +pfsync_defer_t *pfsync_defer_ptr = NULL; +/* pflog */ +pflog_packet_t *pflog_packet_ptr = NULL; + +static void +pfattach_vnet(void) +{ + u_int32_t *my_timeout = V_pf_default_rule.timeout; + + pf_initialize(); + pfr_initialize(); + pfi_initialize_vnet(); + pf_normalize_init(); + + V_pf_limits[PF_LIMIT_STATES].limit = PFSTATE_HIWAT; + V_pf_limits[PF_LIMIT_SRC_NODES].limit = PFSNODE_HIWAT; + + RB_INIT(&V_pf_anchors); + pf_init_ruleset(&pf_main_ruleset); + + /* default rule should never be garbage collected */ + V_pf_default_rule.entries.tqe_prev = &V_pf_default_rule.entries.tqe_next; +#ifdef PF_DEFAULT_TO_DROP + V_pf_default_rule.action = PF_DROP; +#else + V_pf_default_rule.action = PF_PASS; +#endif + V_pf_default_rule.nr = -1; + V_pf_default_rule.rtableid = -1; + + V_pf_default_rule.states_cur = counter_u64_alloc(M_WAITOK); + V_pf_default_rule.states_tot = counter_u64_alloc(M_WAITOK); + V_pf_default_rule.src_nodes = counter_u64_alloc(M_WAITOK); + + /* initialize default timeouts */ + my_timeout[PFTM_TCP_FIRST_PACKET] = PFTM_TCP_FIRST_PACKET_VAL; + my_timeout[PFTM_TCP_OPENING] = PFTM_TCP_OPENING_VAL; + my_timeout[PFTM_TCP_ESTABLISHED] = PFTM_TCP_ESTABLISHED_VAL; + my_timeout[PFTM_TCP_CLOSING] = PFTM_TCP_CLOSING_VAL; + my_timeout[PFTM_TCP_FIN_WAIT] = PFTM_TCP_FIN_WAIT_VAL; + my_timeout[PFTM_TCP_CLOSED] = PFTM_TCP_CLOSED_VAL; + my_timeout[PFTM_UDP_FIRST_PACKET] = PFTM_UDP_FIRST_PACKET_VAL; + my_timeout[PFTM_UDP_SINGLE] = PFTM_UDP_SINGLE_VAL; + my_timeout[PFTM_UDP_MULTIPLE] = PFTM_UDP_MULTIPLE_VAL; + my_timeout[PFTM_ICMP_FIRST_PACKET] = PFTM_ICMP_FIRST_PACKET_VAL; + my_timeout[PFTM_ICMP_ERROR_REPLY] = PFTM_ICMP_ERROR_REPLY_VAL; + my_timeout[PFTM_OTHER_FIRST_PACKET] = PFTM_OTHER_FIRST_PACKET_VAL; + my_timeout[PFTM_OTHER_SINGLE] = PFTM_OTHER_SINGLE_VAL; + my_timeout[PFTM_OTHER_MULTIPLE] = PFTM_OTHER_MULTIPLE_VAL; + my_timeout[PFTM_FRAG] = PFTM_FRAG_VAL; + my_timeout[PFTM_INTERVAL] = PFTM_INTERVAL_VAL; + my_timeout[PFTM_SRC_NODE] = PFTM_SRC_NODE_VAL; + my_timeout[PFTM_TS_DIFF] = PFTM_TS_DIFF_VAL; + my_timeout[PFTM_ADAPTIVE_START] = PFSTATE_ADAPT_START; + my_timeout[PFTM_ADAPTIVE_END] = PFSTATE_ADAPT_END; + + bzero(&V_pf_status, sizeof(V_pf_status)); + V_pf_status.debug = PF_DEBUG_URGENT; + + V_pf_pfil_hooked = 0; + + /* XXX do our best to avoid a conflict */ + V_pf_status.hostid = arc4random(); + + for (int i = 0; i < PFRES_MAX; i++) + V_pf_status.counters[i] = counter_u64_alloc(M_WAITOK); + for (int i = 0; i < LCNT_MAX; i++) + V_pf_status.lcounters[i] = counter_u64_alloc(M_WAITOK); + for (int i = 0; i < FCNT_MAX; i++) + V_pf_status.fcounters[i] = counter_u64_alloc(M_WAITOK); + for (int i = 0; i < SCNT_MAX; i++) + V_pf_status.scounters[i] = counter_u64_alloc(M_WAITOK); + + if (swi_add(NULL, "pf send", pf_intr, curvnet, SWI_NET, + INTR_MPSAFE, &V_pf_swi_cookie) != 0) + /* XXXGL: leaked all above. 
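+	 * swi_add() failing here leaves everything allocated earlier in
+	 * this function (counters, default-rule state, hash tables) in
+	 * place.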
*/ + return; +} + + +static struct pf_pool * +pf_get_pool(char *anchor, u_int32_t ticket, u_int8_t rule_action, + u_int32_t rule_number, u_int8_t r_last, u_int8_t active, + u_int8_t check_ticket) +{ + struct pf_ruleset *ruleset; + struct pf_rule *rule; + int rs_num; + + ruleset = pf_find_ruleset(anchor); + if (ruleset == NULL) + return (NULL); + rs_num = pf_get_ruleset_number(rule_action); + if (rs_num >= PF_RULESET_MAX) + return (NULL); + if (active) { + if (check_ticket && ticket != + ruleset->rules[rs_num].active.ticket) + return (NULL); + if (r_last) + rule = TAILQ_LAST(ruleset->rules[rs_num].active.ptr, + pf_rulequeue); + else + rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); + } else { + if (check_ticket && ticket != + ruleset->rules[rs_num].inactive.ticket) + return (NULL); + if (r_last) + rule = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr, + pf_rulequeue); + else + rule = TAILQ_FIRST(ruleset->rules[rs_num].inactive.ptr); + } + if (!r_last) { + while ((rule != NULL) && (rule->nr != rule_number)) + rule = TAILQ_NEXT(rule, entries); + } + if (rule == NULL) + return (NULL); + + return (&rule->rpool); +} + +static void +pf_mv_pool(struct pf_palist *poola, struct pf_palist *poolb) +{ + struct pf_pooladdr *mv_pool_pa; + + while ((mv_pool_pa = TAILQ_FIRST(poola)) != NULL) { + TAILQ_REMOVE(poola, mv_pool_pa, entries); + TAILQ_INSERT_TAIL(poolb, mv_pool_pa, entries); + } +} + +static void +pf_empty_pool(struct pf_palist *poola) +{ + struct pf_pooladdr *pa; + + while ((pa = TAILQ_FIRST(poola)) != NULL) { + switch (pa->addr.type) { + case PF_ADDR_DYNIFTL: + pfi_dynaddr_remove(pa->addr.p.dyn); + break; + case PF_ADDR_TABLE: + /* XXX: this could be unfinished pooladdr on pabuf */ + if (pa->addr.p.tbl != NULL) + pfr_detach_table(pa->addr.p.tbl); + break; + } + if (pa->kif) + pfi_kif_unref(pa->kif); + TAILQ_REMOVE(poola, pa, entries); + free(pa, M_PFRULE); + } +} + +static void +pf_unlink_rule(struct pf_rulequeue *rulequeue, struct pf_rule *rule) +{ + + PF_RULES_WASSERT(); + + TAILQ_REMOVE(rulequeue, rule, entries); + + PF_UNLNKDRULES_LOCK(); + rule->rule_flag |= PFRULE_REFS; + TAILQ_INSERT_TAIL(&V_pf_unlinked_rules, rule, entries); + PF_UNLNKDRULES_UNLOCK(); +} + +void +pf_free_rule(struct pf_rule *rule) +{ + + PF_RULES_WASSERT(); + + if (rule->tag) + tag_unref(&V_pf_tags, rule->tag); + if (rule->match_tag) + tag_unref(&V_pf_tags, rule->match_tag); +#ifdef ALTQ + if (rule->pqid != rule->qid) + pf_qid_unref(rule->pqid); + pf_qid_unref(rule->qid); +#endif + switch (rule->src.addr.type) { + case PF_ADDR_DYNIFTL: + pfi_dynaddr_remove(rule->src.addr.p.dyn); + break; + case PF_ADDR_TABLE: + pfr_detach_table(rule->src.addr.p.tbl); + break; + } + switch (rule->dst.addr.type) { + case PF_ADDR_DYNIFTL: + pfi_dynaddr_remove(rule->dst.addr.p.dyn); + break; + case PF_ADDR_TABLE: + pfr_detach_table(rule->dst.addr.p.tbl); + break; + } + if (rule->overload_tbl) + pfr_detach_table(rule->overload_tbl); + if (rule->kif) + pfi_kif_unref(rule->kif); + pf_anchor_remove(rule); + pf_empty_pool(&rule->rpool.list); + counter_u64_free(rule->states_cur); + counter_u64_free(rule->states_tot); + counter_u64_free(rule->src_nodes); + free(rule, M_PFRULE); +} + +static u_int16_t +tagname2tag(struct pf_tags *head, char *tagname) +{ + struct pf_tagname *tag, *p = NULL; + u_int16_t new_tagid = 1; + + PF_RULES_WASSERT(); + + TAILQ_FOREACH(tag, head, entries) + if (strcmp(tagname, tag->name) == 0) { + tag->ref++; + return (tag->tag); + } + + /* + * to avoid fragmentation, we do a linear search from the beginning + * and 
take the first free slot we find. if there is none or the list + * is empty, append a new entry at the end. + */ + + /* new entry */ + if (!TAILQ_EMPTY(head)) + for (p = TAILQ_FIRST(head); p != NULL && + p->tag == new_tagid; p = TAILQ_NEXT(p, entries)) + new_tagid = p->tag + 1; + + if (new_tagid > TAGID_MAX) + return (0); + + /* allocate and fill new struct pf_tagname */ + tag = malloc(sizeof(*tag), M_PFTAG, M_NOWAIT|M_ZERO); + if (tag == NULL) + return (0); + strlcpy(tag->name, tagname, sizeof(tag->name)); + tag->tag = new_tagid; + tag->ref++; + + if (p != NULL) /* insert new entry before p */ + TAILQ_INSERT_BEFORE(p, tag, entries); + else /* either list empty or no free slot in between */ + TAILQ_INSERT_TAIL(head, tag, entries); + + return (tag->tag); +} + +static void +tag_unref(struct pf_tags *head, u_int16_t tag) +{ + struct pf_tagname *p, *next; + + PF_RULES_WASSERT(); + + for (p = TAILQ_FIRST(head); p != NULL; p = next) { + next = TAILQ_NEXT(p, entries); + if (tag == p->tag) { + if (--p->ref == 0) { + TAILQ_REMOVE(head, p, entries); + free(p, M_PFTAG); + } + break; + } + } +} + +static u_int16_t +pf_tagname2tag(char *tagname) +{ + return (tagname2tag(&V_pf_tags, tagname)); +} + +#ifdef ALTQ +static u_int32_t +pf_qname2qid(char *qname) +{ + return ((u_int32_t)tagname2tag(&V_pf_qids, qname)); +} + +static void +pf_qid_unref(u_int32_t qid) +{ + tag_unref(&V_pf_qids, (u_int16_t)qid); +} + +static int +pf_begin_altq(u_int32_t *ticket) +{ + struct pf_altq *altq; + int error = 0; + + PF_RULES_WASSERT(); + + /* Purge the old altq list */ + while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) { + TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries); + if (altq->qname[0] == 0 && + (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { + /* detach and destroy the discipline */ + error = altq_remove(altq); + } else + pf_qid_unref(altq->qid); + free(altq, M_PFALTQ); + } + if (error) + return (error); + *ticket = ++V_ticket_altqs_inactive; + V_altqs_inactive_open = 1; + return (0); +} + +static int +pf_rollback_altq(u_int32_t ticket) +{ + struct pf_altq *altq; + int error = 0; + + PF_RULES_WASSERT(); + + if (!V_altqs_inactive_open || ticket != V_ticket_altqs_inactive) + return (0); + /* Purge the old altq list */ + while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) { + TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries); + if (altq->qname[0] == 0 && + (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { + /* detach and destroy the discipline */ + error = altq_remove(altq); + } else + pf_qid_unref(altq->qid); + free(altq, M_PFALTQ); + } + V_altqs_inactive_open = 0; + return (error); +} + +static int +pf_commit_altq(u_int32_t ticket) +{ + struct pf_altqqueue *old_altqs; + struct pf_altq *altq; + int err, error = 0; + + PF_RULES_WASSERT(); + + if (!V_altqs_inactive_open || ticket != V_ticket_altqs_inactive) + return (EBUSY); + + /* swap altqs, keep the old. 
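+	 * The old queues stay on V_pf_altqs_inactive after the swap; they
+	 * are detached and freed once the new set has been attached below.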
*/ + old_altqs = V_pf_altqs_active; + V_pf_altqs_active = V_pf_altqs_inactive; + V_pf_altqs_inactive = old_altqs; + V_ticket_altqs_active = V_ticket_altqs_inactive; + + /* Attach new disciplines */ + TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { + if (altq->qname[0] == 0 && + (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { + /* attach the discipline */ + error = altq_pfattach(altq); + if (error == 0 && V_pf_altq_running) + error = pf_enable_altq(altq); + if (error != 0) + return (error); + } + } + + /* Purge the old altq list */ + while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) { + TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries); + if (altq->qname[0] == 0 && + (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { + /* detach and destroy the discipline */ + if (V_pf_altq_running) + error = pf_disable_altq(altq); + err = altq_pfdetach(altq); + if (err != 0 && error == 0) + error = err; + err = altq_remove(altq); + if (err != 0 && error == 0) + error = err; + } else + pf_qid_unref(altq->qid); + free(altq, M_PFALTQ); + } + + V_altqs_inactive_open = 0; + return (error); +} + +static int +pf_enable_altq(struct pf_altq *altq) +{ + struct ifnet *ifp; + struct tb_profile tb; + int error = 0; + + if ((ifp = ifunit(altq->ifname)) == NULL) + return (EINVAL); + + if (ifp->if_snd.altq_type != ALTQT_NONE) + error = altq_enable(&ifp->if_snd); + + /* set tokenbucket regulator */ + if (error == 0 && ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { + tb.rate = altq->ifbandwidth; + tb.depth = altq->tbrsize; + error = tbr_set(&ifp->if_snd, &tb); + } + + return (error); +} + +static int +pf_disable_altq(struct pf_altq *altq) +{ + struct ifnet *ifp; + struct tb_profile tb; + int error; + + if ((ifp = ifunit(altq->ifname)) == NULL) + return (EINVAL); + + /* + * when the discipline is no longer referenced, it was overridden + * by a new one. if so, just return. 
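+	 * (detected by altq->altq_disc no longer matching the ifnet's
+	 * current send queue discipline)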
+ */ + if (altq->altq_disc != ifp->if_snd.altq_disc) + return (0); + + error = altq_disable(&ifp->if_snd); + + if (error == 0) { + /* clear tokenbucket regulator */ + tb.rate = 0; + error = tbr_set(&ifp->if_snd, &tb); + } + + return (error); +} + +void +pf_altq_ifnet_event(struct ifnet *ifp, int remove) +{ + struct ifnet *ifp1; + struct pf_altq *a1, *a2, *a3; + u_int32_t ticket; + int error = 0; + + /* Interrupt userland queue modifications */ + if (V_altqs_inactive_open) + pf_rollback_altq(V_ticket_altqs_inactive); + + /* Start new altq ruleset */ + if (pf_begin_altq(&ticket)) + return; + + /* Copy the current active set */ + TAILQ_FOREACH(a1, V_pf_altqs_active, entries) { + a2 = malloc(sizeof(*a2), M_PFALTQ, M_NOWAIT); + if (a2 == NULL) { + error = ENOMEM; + break; + } + bcopy(a1, a2, sizeof(struct pf_altq)); + + if (a2->qname[0] != 0) { + if ((a2->qid = pf_qname2qid(a2->qname)) == 0) { + error = EBUSY; + free(a2, M_PFALTQ); + break; + } + a2->altq_disc = NULL; + TAILQ_FOREACH(a3, V_pf_altqs_inactive, entries) { + if (strncmp(a3->ifname, a2->ifname, + IFNAMSIZ) == 0 && a3->qname[0] == 0) { + a2->altq_disc = a3->altq_disc; + break; + } + } + } + /* Deactivate the interface in question */ + a2->local_flags &= ~PFALTQ_FLAG_IF_REMOVED; + if ((ifp1 = ifunit(a2->ifname)) == NULL || + (remove && ifp1 == ifp)) { + a2->local_flags |= PFALTQ_FLAG_IF_REMOVED; + } else { + error = altq_add(a2); + + if (ticket != V_ticket_altqs_inactive) + error = EBUSY; + + if (error) { + free(a2, M_PFALTQ); + break; + } + } + + TAILQ_INSERT_TAIL(V_pf_altqs_inactive, a2, entries); + } + + if (error != 0) + pf_rollback_altq(ticket); + else + pf_commit_altq(ticket); +} +#endif /* ALTQ */ + +static int +pf_begin_rules(u_int32_t *ticket, int rs_num, const char *anchor) +{ + struct pf_ruleset *rs; + struct pf_rule *rule; + + PF_RULES_WASSERT(); + + if (rs_num < 0 || rs_num >= PF_RULESET_MAX) + return (EINVAL); + rs = pf_find_or_create_ruleset(anchor); + if (rs == NULL) + return (EINVAL); + while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) { + pf_unlink_rule(rs->rules[rs_num].inactive.ptr, rule); + rs->rules[rs_num].inactive.rcount--; + } + *ticket = ++rs->rules[rs_num].inactive.ticket; + rs->rules[rs_num].inactive.open = 1; + return (0); +} + +static int +pf_rollback_rules(u_int32_t ticket, int rs_num, char *anchor) +{ + struct pf_ruleset *rs; + struct pf_rule *rule; + + PF_RULES_WASSERT(); + + if (rs_num < 0 || rs_num >= PF_RULESET_MAX) + return (EINVAL); + rs = pf_find_ruleset(anchor); + if (rs == NULL || !rs->rules[rs_num].inactive.open || + rs->rules[rs_num].inactive.ticket != ticket) + return (0); + while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) { + pf_unlink_rule(rs->rules[rs_num].inactive.ptr, rule); + rs->rules[rs_num].inactive.rcount--; + } + rs->rules[rs_num].inactive.open = 0; + return (0); +} + +#define PF_MD5_UPD(st, elm) \ + MD5Update(ctx, (u_int8_t *) &(st)->elm, sizeof((st)->elm)) + +#define PF_MD5_UPD_STR(st, elm) \ + MD5Update(ctx, (u_int8_t *) (st)->elm, strlen((st)->elm)) + +#define PF_MD5_UPD_HTONL(st, elm, stor) do { \ + (stor) = htonl((st)->elm); \ + MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int32_t));\ +} while (0) + +#define PF_MD5_UPD_HTONS(st, elm, stor) do { \ + (stor) = htons((st)->elm); \ + MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int16_t));\ +} while (0) + +static void +pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr) +{ + PF_MD5_UPD(pfr, addr.type); + switch (pfr->addr.type) { + case PF_ADDR_DYNIFTL: + PF_MD5_UPD(pfr, 
addr.v.ifname); + PF_MD5_UPD(pfr, addr.iflags); + break; + case PF_ADDR_TABLE: + PF_MD5_UPD(pfr, addr.v.tblname); + break; + case PF_ADDR_ADDRMASK: + /* XXX ignore af? */ + PF_MD5_UPD(pfr, addr.v.a.addr.addr32); + PF_MD5_UPD(pfr, addr.v.a.mask.addr32); + break; + } + + PF_MD5_UPD(pfr, port[0]); + PF_MD5_UPD(pfr, port[1]); + PF_MD5_UPD(pfr, neg); + PF_MD5_UPD(pfr, port_op); +} + +static void +pf_hash_rule(MD5_CTX *ctx, struct pf_rule *rule) +{ + u_int16_t x; + u_int32_t y; + + pf_hash_rule_addr(ctx, &rule->src); + pf_hash_rule_addr(ctx, &rule->dst); + PF_MD5_UPD_STR(rule, label); + PF_MD5_UPD_STR(rule, ifname); + PF_MD5_UPD_STR(rule, match_tagname); + PF_MD5_UPD_HTONS(rule, match_tag, x); /* dup? */ + PF_MD5_UPD_HTONL(rule, os_fingerprint, y); + PF_MD5_UPD_HTONL(rule, prob, y); + PF_MD5_UPD_HTONL(rule, uid.uid[0], y); + PF_MD5_UPD_HTONL(rule, uid.uid[1], y); + PF_MD5_UPD(rule, uid.op); + PF_MD5_UPD_HTONL(rule, gid.gid[0], y); + PF_MD5_UPD_HTONL(rule, gid.gid[1], y); + PF_MD5_UPD(rule, gid.op); + PF_MD5_UPD_HTONL(rule, rule_flag, y); + PF_MD5_UPD(rule, action); + PF_MD5_UPD(rule, direction); + PF_MD5_UPD(rule, af); + PF_MD5_UPD(rule, quick); + PF_MD5_UPD(rule, ifnot); + PF_MD5_UPD(rule, match_tag_not); + PF_MD5_UPD(rule, natpass); + PF_MD5_UPD(rule, keep_state); + PF_MD5_UPD(rule, proto); + PF_MD5_UPD(rule, type); + PF_MD5_UPD(rule, code); + PF_MD5_UPD(rule, flags); + PF_MD5_UPD(rule, flagset); + PF_MD5_UPD(rule, allow_opts); + PF_MD5_UPD(rule, rt); + PF_MD5_UPD(rule, tos); +} + +static int +pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) +{ + struct pf_ruleset *rs; + struct pf_rule *rule, **old_array; + struct pf_rulequeue *old_rules; + int error; + u_int32_t old_rcount; + + PF_RULES_WASSERT(); + + if (rs_num < 0 || rs_num >= PF_RULESET_MAX) + return (EINVAL); + rs = pf_find_ruleset(anchor); + if (rs == NULL || !rs->rules[rs_num].inactive.open || + ticket != rs->rules[rs_num].inactive.ticket) + return (EBUSY); + + /* Calculate checksum for the main ruleset */ + if (rs == &pf_main_ruleset) { + error = pf_setup_pfsync_matching(rs); + if (error != 0) + return (error); + } + + /* Swap rules, keep the old. */ + old_rules = rs->rules[rs_num].active.ptr; + old_rcount = rs->rules[rs_num].active.rcount; + old_array = rs->rules[rs_num].active.ptr_array; + + rs->rules[rs_num].active.ptr = + rs->rules[rs_num].inactive.ptr; + rs->rules[rs_num].active.ptr_array = + rs->rules[rs_num].inactive.ptr_array; + rs->rules[rs_num].active.rcount = + rs->rules[rs_num].inactive.rcount; + rs->rules[rs_num].inactive.ptr = old_rules; + rs->rules[rs_num].inactive.ptr_array = old_array; + rs->rules[rs_num].inactive.rcount = old_rcount; + + rs->rules[rs_num].active.ticket = + rs->rules[rs_num].inactive.ticket; + pf_calc_skip_steps(rs->rules[rs_num].active.ptr); + + + /* Purge the old rule list. */ + while ((rule = TAILQ_FIRST(old_rules)) != NULL) + pf_unlink_rule(old_rules, rule); + if (rs->rules[rs_num].inactive.ptr_array) + free(rs->rules[rs_num].inactive.ptr_array, M_TEMP); + rs->rules[rs_num].inactive.ptr_array = NULL; + rs->rules[rs_num].inactive.rcount = 0; + rs->rules[rs_num].inactive.open = 0; + pf_remove_if_empty_ruleset(rs); + + return (0); +} + +static int +pf_setup_pfsync_matching(struct pf_ruleset *rs) +{ + MD5_CTX ctx; + struct pf_rule *rule; + int rs_cnt; + u_int8_t digest[PF_MD5_DIGEST_LENGTH]; + + MD5Init(&ctx); + for (rs_cnt = 0; rs_cnt < PF_RULESET_MAX; rs_cnt++) { + /* XXX PF_RULESET_SCRUB as well? 
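+	 * (scrub rulesets are skipped below, so they do not contribute to
+	 * the checksum pfsync uses to compare rulesets)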
*/ + if (rs_cnt == PF_RULESET_SCRUB) + continue; + + if (rs->rules[rs_cnt].inactive.ptr_array) + free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP); + rs->rules[rs_cnt].inactive.ptr_array = NULL; + + if (rs->rules[rs_cnt].inactive.rcount) { + rs->rules[rs_cnt].inactive.ptr_array = + malloc(sizeof(caddr_t) * + rs->rules[rs_cnt].inactive.rcount, + M_TEMP, M_NOWAIT); + + if (!rs->rules[rs_cnt].inactive.ptr_array) + return (ENOMEM); + } + + TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr, + entries) { + pf_hash_rule(&ctx, rule); + (rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule; + } + } + + MD5Final(digest, &ctx); + memcpy(V_pf_status.pf_chksum, digest, sizeof(V_pf_status.pf_chksum)); + return (0); +} + +static int +pf_addr_setup(struct pf_ruleset *ruleset, struct pf_addr_wrap *addr, + sa_family_t af) +{ + int error = 0; + + switch (addr->type) { + case PF_ADDR_TABLE: + addr->p.tbl = pfr_attach_table(ruleset, addr->v.tblname); + if (addr->p.tbl == NULL) + error = ENOMEM; + break; + case PF_ADDR_DYNIFTL: + error = pfi_dynaddr_setup(addr, af); + break; + } + + return (error); +} + +static void +pf_addr_copyout(struct pf_addr_wrap *addr) +{ + + switch (addr->type) { + case PF_ADDR_DYNIFTL: + pfi_dynaddr_copyout(addr); + break; + case PF_ADDR_TABLE: + pf_tbladdr_copyout(addr); + break; + } +} + +static int +pfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) +{ + int error = 0; + + /* XXX keep in sync with switch() below */ + if (securelevel_gt(td->td_ucred, 2)) + switch (cmd) { + case DIOCGETRULES: + case DIOCGETRULE: + case DIOCGETADDRS: + case DIOCGETADDR: + case DIOCGETSTATE: + case DIOCSETSTATUSIF: + case DIOCGETSTATUS: + case DIOCCLRSTATUS: + case DIOCNATLOOK: + case DIOCSETDEBUG: + case DIOCGETSTATES: + case DIOCGETTIMEOUT: + case DIOCCLRRULECTRS: + case DIOCGETLIMIT: + case DIOCGETALTQS: + case DIOCGETALTQ: + case DIOCGETQSTATS: + case DIOCGETRULESETS: + case DIOCGETRULESET: + case DIOCRGETTABLES: + case DIOCRGETTSTATS: + case DIOCRCLRTSTATS: + case DIOCRCLRADDRS: + case DIOCRADDADDRS: + case DIOCRDELADDRS: + case DIOCRSETADDRS: + case DIOCRGETADDRS: + case DIOCRGETASTATS: + case DIOCRCLRASTATS: + case DIOCRTSTADDRS: + case DIOCOSFPGET: + case DIOCGETSRCNODES: + case DIOCCLRSRCNODES: + case DIOCIGETIFACES: + case DIOCGIFSPEED: + case DIOCSETIFFLAG: + case DIOCCLRIFFLAG: + break; + case DIOCRCLRTABLES: + case DIOCRADDTABLES: + case DIOCRDELTABLES: + case DIOCRSETTFLAGS: + if (((struct pfioc_table *)addr)->pfrio_flags & + PFR_FLAG_DUMMY) + break; /* dummy operation ok */ + return (EPERM); + default: + return (EPERM); + } + + if (!(flags & FWRITE)) + switch (cmd) { + case DIOCGETRULES: + case DIOCGETADDRS: + case DIOCGETADDR: + case DIOCGETSTATE: + case DIOCGETSTATUS: + case DIOCGETSTATES: + case DIOCGETTIMEOUT: + case DIOCGETLIMIT: + case DIOCGETALTQS: + case DIOCGETALTQ: + case DIOCGETQSTATS: + case DIOCGETRULESETS: + case DIOCGETRULESET: + case DIOCNATLOOK: + case DIOCRGETTABLES: + case DIOCRGETTSTATS: + case DIOCRGETADDRS: + case DIOCRGETASTATS: + case DIOCRTSTADDRS: + case DIOCOSFPGET: + case DIOCGETSRCNODES: + case DIOCIGETIFACES: + case DIOCGIFSPEED: + break; + case DIOCRCLRTABLES: + case DIOCRADDTABLES: + case DIOCRDELTABLES: + case DIOCRCLRTSTATS: + case DIOCRCLRADDRS: + case DIOCRADDADDRS: + case DIOCRDELADDRS: + case DIOCRSETADDRS: + case DIOCRSETTFLAGS: + if (((struct pfioc_table *)addr)->pfrio_flags & + PFR_FLAG_DUMMY) { + flags |= FWRITE; /* need write lock for dummy */ + break; /* dummy operation ok */ + } + return (EACCES); + case 
DIOCGETRULE: + if (((struct pfioc_rule *)addr)->action == + PF_GET_CLR_CNTR) + return (EACCES); + break; + default: + return (EACCES); + } + + CURVNET_SET(TD_TO_VNET(td)); + + switch (cmd) { + case DIOCSTART: + sx_xlock(&pf_ioctl_lock); + if (V_pf_status.running) + error = EEXIST; + else { + int cpu; + + error = hook_pf(); + if (error) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: pfil registration failed\n")); + break; + } + V_pf_status.running = 1; + V_pf_status.since = time_second; + + CPU_FOREACH(cpu) + V_pf_stateid[cpu] = time_second; + + DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n")); + } + break; + + case DIOCSTOP: + sx_xlock(&pf_ioctl_lock); + if (!V_pf_status.running) + error = ENOENT; + else { + V_pf_status.running = 0; + error = dehook_pf(); + if (error) { + V_pf_status.running = 1; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: pfil unregistration failed\n")); + } + V_pf_status.since = time_second; + DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n")); + } + break; + + case DIOCADDRULE: { + struct pfioc_rule *pr = (struct pfioc_rule *)addr; + struct pf_ruleset *ruleset; + struct pf_rule *rule, *tail; + struct pf_pooladdr *pa; + struct pfi_kif *kif = NULL; + int rs_num; + + if (pr->rule.return_icmp >> 8 > ICMP_MAXTYPE) { + error = EINVAL; + break; + } +#ifndef INET + if (pr->rule.af == AF_INET) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#ifndef INET6 + if (pr->rule.af == AF_INET6) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + + rule = malloc(sizeof(*rule), M_PFRULE, M_WAITOK); + bcopy(&pr->rule, rule, sizeof(struct pf_rule)); + if (rule->ifname[0]) + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + rule->states_cur = counter_u64_alloc(M_WAITOK); + rule->states_tot = counter_u64_alloc(M_WAITOK); + rule->src_nodes = counter_u64_alloc(M_WAITOK); +#ifndef __rtems__ + rule->cuid = td->td_ucred->cr_ruid; + rule->cpid = td->td_proc ? 
td->td_proc->p_pid : 0; +#else /* __rtems__ */ + rule->cuid = BSD_DEFAULT_UID; + rule->cpid = BSD_DEFAULT_PID; +#endif /* __rtems__ */ + TAILQ_INIT(&rule->rpool.list); + +#define ERROUT(x) { error = (x); goto DIOCADDRULE_error; } + + PF_RULES_WLOCK(); + pr->anchor[sizeof(pr->anchor) - 1] = 0; + ruleset = pf_find_ruleset(pr->anchor); + if (ruleset == NULL) + ERROUT(EINVAL); + rs_num = pf_get_ruleset_number(pr->rule.action); + if (rs_num >= PF_RULESET_MAX) + ERROUT(EINVAL); + if (pr->ticket != ruleset->rules[rs_num].inactive.ticket) { + DPFPRINTF(PF_DEBUG_MISC, + ("ticket: %d != [%d]%d\n", pr->ticket, rs_num, + ruleset->rules[rs_num].inactive.ticket)); + ERROUT(EBUSY); + } + if (pr->pool_ticket != V_ticket_pabuf) { + DPFPRINTF(PF_DEBUG_MISC, + ("pool_ticket: %d != %d\n", pr->pool_ticket, + V_ticket_pabuf)); + ERROUT(EBUSY); + } + + tail = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr, + pf_rulequeue); + if (tail) + rule->nr = tail->nr + 1; + else + rule->nr = 0; + if (rule->ifname[0]) { + rule->kif = pfi_kif_attach(kif, rule->ifname); + pfi_kif_ref(rule->kif); + } else + rule->kif = NULL; + + if (rule->rtableid > 0 && rule->rtableid >= rt_numfibs) + error = EBUSY; + +#ifdef ALTQ + /* set queue IDs */ + if (rule->qname[0] != 0) { + if ((rule->qid = pf_qname2qid(rule->qname)) == 0) + error = EBUSY; + else if (rule->pqname[0] != 0) { + if ((rule->pqid = + pf_qname2qid(rule->pqname)) == 0) + error = EBUSY; + } else + rule->pqid = rule->qid; + } +#endif + if (rule->tagname[0]) + if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0) + error = EBUSY; + if (rule->match_tagname[0]) + if ((rule->match_tag = + pf_tagname2tag(rule->match_tagname)) == 0) + error = EBUSY; + if (rule->rt && !rule->direction) + error = EINVAL; + if (!rule->log) + rule->logif = 0; + if (rule->logif >= PFLOGIFS_MAX) + error = EINVAL; + if (pf_addr_setup(ruleset, &rule->src.addr, rule->af)) + error = ENOMEM; + if (pf_addr_setup(ruleset, &rule->dst.addr, rule->af)) + error = ENOMEM; + if (pf_anchor_setup(rule, ruleset, pr->anchor_call)) + error = EINVAL; + if (rule->scrub_flags & PFSTATE_SETPRIO && + (rule->set_prio[0] > PF_PRIO_MAX || + rule->set_prio[1] > PF_PRIO_MAX)) + error = EINVAL; + TAILQ_FOREACH(pa, &V_pf_pabuf, entries) + if (pa->addr.type == PF_ADDR_TABLE) { + pa->addr.p.tbl = pfr_attach_table(ruleset, + pa->addr.v.tblname); + if (pa->addr.p.tbl == NULL) + error = ENOMEM; + } + + rule->overload_tbl = NULL; + if (rule->overload_tblname[0]) { + if ((rule->overload_tbl = pfr_attach_table(ruleset, + rule->overload_tblname)) == NULL) + error = EINVAL; + else + rule->overload_tbl->pfrkt_flags |= + PFR_TFLAG_ACTIVE; + } + + pf_mv_pool(&V_pf_pabuf, &rule->rpool.list); + if (((((rule->action == PF_NAT) || (rule->action == PF_RDR) || + (rule->action == PF_BINAT)) && rule->anchor == NULL) || + (rule->rt > PF_FASTROUTE)) && + (TAILQ_FIRST(&rule->rpool.list) == NULL)) + error = EINVAL; + + if (error) { + pf_free_rule(rule); + PF_RULES_WUNLOCK(); + break; + } + + rule->rpool.cur = TAILQ_FIRST(&rule->rpool.list); + rule->evaluations = rule->packets[0] = rule->packets[1] = + rule->bytes[0] = rule->bytes[1] = 0; + TAILQ_INSERT_TAIL(ruleset->rules[rs_num].inactive.ptr, + rule, entries); + ruleset->rules[rs_num].inactive.rcount++; + PF_RULES_WUNLOCK(); + break; + +#undef ERROUT +DIOCADDRULE_error: + PF_RULES_WUNLOCK(); + counter_u64_free(rule->states_cur); + counter_u64_free(rule->states_tot); + counter_u64_free(rule->src_nodes); + free(rule, M_PFRULE); + if (kif) + free(kif, PFI_MTYPE); + break; + } + + case DIOCGETRULES: { + struct 
pfioc_rule *pr = (struct pfioc_rule *)addr; + struct pf_ruleset *ruleset; + struct pf_rule *tail; + int rs_num; + + PF_RULES_WLOCK(); + pr->anchor[sizeof(pr->anchor) - 1] = 0; + ruleset = pf_find_ruleset(pr->anchor); + if (ruleset == NULL) { + PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + rs_num = pf_get_ruleset_number(pr->rule.action); + if (rs_num >= PF_RULESET_MAX) { + PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + tail = TAILQ_LAST(ruleset->rules[rs_num].active.ptr, + pf_rulequeue); + if (tail) + pr->nr = tail->nr + 1; + else + pr->nr = 0; + pr->ticket = ruleset->rules[rs_num].active.ticket; + PF_RULES_WUNLOCK(); + break; + } + + case DIOCGETRULE: { + struct pfioc_rule *pr = (struct pfioc_rule *)addr; + struct pf_ruleset *ruleset; + struct pf_rule *rule; + int rs_num, i; + + PF_RULES_WLOCK(); + pr->anchor[sizeof(pr->anchor) - 1] = 0; + ruleset = pf_find_ruleset(pr->anchor); + if (ruleset == NULL) { + PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + rs_num = pf_get_ruleset_number(pr->rule.action); + if (rs_num >= PF_RULESET_MAX) { + PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + if (pr->ticket != ruleset->rules[rs_num].active.ticket) { + PF_RULES_WUNLOCK(); + error = EBUSY; + break; + } + rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); + while ((rule != NULL) && (rule->nr != pr->nr)) + rule = TAILQ_NEXT(rule, entries); + if (rule == NULL) { + PF_RULES_WUNLOCK(); + error = EBUSY; + break; + } + bcopy(rule, &pr->rule, sizeof(struct pf_rule)); + pr->rule.u_states_cur = counter_u64_fetch(rule->states_cur); + pr->rule.u_states_tot = counter_u64_fetch(rule->states_tot); + pr->rule.u_src_nodes = counter_u64_fetch(rule->src_nodes); + if (pf_anchor_copyout(ruleset, rule, pr)) { + PF_RULES_WUNLOCK(); + error = EBUSY; + break; + } + pf_addr_copyout(&pr->rule.src.addr); + pf_addr_copyout(&pr->rule.dst.addr); + for (i = 0; i < PF_SKIP_COUNT; ++i) + if (rule->skip[i].ptr == NULL) + pr->rule.skip[i].nr = -1; + else + pr->rule.skip[i].nr = + rule->skip[i].ptr->nr; + + if (pr->action == PF_GET_CLR_CNTR) { + rule->evaluations = 0; + rule->packets[0] = rule->packets[1] = 0; + rule->bytes[0] = rule->bytes[1] = 0; + counter_u64_zero(rule->states_tot); + } + PF_RULES_WUNLOCK(); + break; + } + + case DIOCCHANGERULE: { + struct pfioc_rule *pcr = (struct pfioc_rule *)addr; + struct pf_ruleset *ruleset; + struct pf_rule *oldrule = NULL, *newrule = NULL; + struct pfi_kif *kif = NULL; + struct pf_pooladdr *pa; + u_int32_t nr = 0; + int rs_num; + + if (pcr->action < PF_CHANGE_ADD_HEAD || + pcr->action > PF_CHANGE_GET_TICKET) { + error = EINVAL; + break; + } + if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) { + error = EINVAL; + break; + } + + if (pcr->action != PF_CHANGE_REMOVE) { +#ifndef INET + if (pcr->rule.af == AF_INET) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#ifndef INET6 + if (pcr->rule.af == AF_INET6) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + newrule = malloc(sizeof(*newrule), M_PFRULE, M_WAITOK); + bcopy(&pcr->rule, newrule, sizeof(struct pf_rule)); + if (newrule->ifname[0]) + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + newrule->states_cur = counter_u64_alloc(M_WAITOK); + newrule->states_tot = counter_u64_alloc(M_WAITOK); + newrule->src_nodes = counter_u64_alloc(M_WAITOK); +#ifndef __rtems__ + newrule->cuid = td->td_ucred->cr_ruid; + newrule->cpid = td->td_proc ? 
td->td_proc->p_pid : 0; +#else /* __rtems__ */ + newrule->cuid = BSD_DEFAULT_UID; + newrule->cpid = BSD_DEFAULT_PID; +#endif /* __rtems__ */ + TAILQ_INIT(&newrule->rpool.list); + } + +#define ERROUT(x) { error = (x); goto DIOCCHANGERULE_error; } + + PF_RULES_WLOCK(); + if (!(pcr->action == PF_CHANGE_REMOVE || + pcr->action == PF_CHANGE_GET_TICKET) && + pcr->pool_ticket != V_ticket_pabuf) + ERROUT(EBUSY); + + ruleset = pf_find_ruleset(pcr->anchor); + if (ruleset == NULL) + ERROUT(EINVAL); + + rs_num = pf_get_ruleset_number(pcr->rule.action); + if (rs_num >= PF_RULESET_MAX) + ERROUT(EINVAL); + + if (pcr->action == PF_CHANGE_GET_TICKET) { + pcr->ticket = ++ruleset->rules[rs_num].active.ticket; + ERROUT(0); + } else if (pcr->ticket != + ruleset->rules[rs_num].active.ticket) + ERROUT(EINVAL); + + if (pcr->action != PF_CHANGE_REMOVE) { + if (newrule->ifname[0]) { + newrule->kif = pfi_kif_attach(kif, + newrule->ifname); + pfi_kif_ref(newrule->kif); + } else + newrule->kif = NULL; + + if (newrule->rtableid > 0 && + newrule->rtableid >= rt_numfibs) + error = EBUSY; + +#ifdef ALTQ + /* set queue IDs */ + if (newrule->qname[0] != 0) { + if ((newrule->qid = + pf_qname2qid(newrule->qname)) == 0) + error = EBUSY; + else if (newrule->pqname[0] != 0) { + if ((newrule->pqid = + pf_qname2qid(newrule->pqname)) == 0) + error = EBUSY; + } else + newrule->pqid = newrule->qid; + } +#endif /* ALTQ */ + if (newrule->tagname[0]) + if ((newrule->tag = + pf_tagname2tag(newrule->tagname)) == 0) + error = EBUSY; + if (newrule->match_tagname[0]) + if ((newrule->match_tag = pf_tagname2tag( + newrule->match_tagname)) == 0) + error = EBUSY; + if (newrule->rt && !newrule->direction) + error = EINVAL; + if (!newrule->log) + newrule->logif = 0; + if (newrule->logif >= PFLOGIFS_MAX) + error = EINVAL; + if (pf_addr_setup(ruleset, &newrule->src.addr, newrule->af)) + error = ENOMEM; + if (pf_addr_setup(ruleset, &newrule->dst.addr, newrule->af)) + error = ENOMEM; + if (pf_anchor_setup(newrule, ruleset, pcr->anchor_call)) + error = EINVAL; + TAILQ_FOREACH(pa, &V_pf_pabuf, entries) + if (pa->addr.type == PF_ADDR_TABLE) { + pa->addr.p.tbl = + pfr_attach_table(ruleset, + pa->addr.v.tblname); + if (pa->addr.p.tbl == NULL) + error = ENOMEM; + } + + newrule->overload_tbl = NULL; + if (newrule->overload_tblname[0]) { + if ((newrule->overload_tbl = pfr_attach_table( + ruleset, newrule->overload_tblname)) == + NULL) + error = EINVAL; + else + newrule->overload_tbl->pfrkt_flags |= + PFR_TFLAG_ACTIVE; + } + + pf_mv_pool(&V_pf_pabuf, &newrule->rpool.list); + if (((((newrule->action == PF_NAT) || + (newrule->action == PF_RDR) || + (newrule->action == PF_BINAT) || + (newrule->rt > PF_FASTROUTE)) && + !newrule->anchor)) && + (TAILQ_FIRST(&newrule->rpool.list) == NULL)) + error = EINVAL; + + if (error) { + pf_free_rule(newrule); + PF_RULES_WUNLOCK(); + break; + } + + newrule->rpool.cur = TAILQ_FIRST(&newrule->rpool.list); + newrule->evaluations = 0; + newrule->packets[0] = newrule->packets[1] = 0; + newrule->bytes[0] = newrule->bytes[1] = 0; + } + pf_empty_pool(&V_pf_pabuf); + + if (pcr->action == PF_CHANGE_ADD_HEAD) + oldrule = TAILQ_FIRST( + ruleset->rules[rs_num].active.ptr); + else if (pcr->action == PF_CHANGE_ADD_TAIL) + oldrule = TAILQ_LAST( + ruleset->rules[rs_num].active.ptr, pf_rulequeue); + else { + oldrule = TAILQ_FIRST( + ruleset->rules[rs_num].active.ptr); + while ((oldrule != NULL) && (oldrule->nr != pcr->nr)) + oldrule = TAILQ_NEXT(oldrule, entries); + if (oldrule == NULL) { + if (newrule != NULL) + pf_free_rule(newrule); + 
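/* no rule with the requested number exists; undo the allocation and fail */
+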
PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + } + + if (pcr->action == PF_CHANGE_REMOVE) { + pf_unlink_rule(ruleset->rules[rs_num].active.ptr, + oldrule); + ruleset->rules[rs_num].active.rcount--; + } else { + if (oldrule == NULL) + TAILQ_INSERT_TAIL( + ruleset->rules[rs_num].active.ptr, + newrule, entries); + else if (pcr->action == PF_CHANGE_ADD_HEAD || + pcr->action == PF_CHANGE_ADD_BEFORE) + TAILQ_INSERT_BEFORE(oldrule, newrule, entries); + else + TAILQ_INSERT_AFTER( + ruleset->rules[rs_num].active.ptr, + oldrule, newrule, entries); + ruleset->rules[rs_num].active.rcount++; + } + + nr = 0; + TAILQ_FOREACH(oldrule, + ruleset->rules[rs_num].active.ptr, entries) + oldrule->nr = nr++; + + ruleset->rules[rs_num].active.ticket++; + + pf_calc_skip_steps(ruleset->rules[rs_num].active.ptr); + pf_remove_if_empty_ruleset(ruleset); + + PF_RULES_WUNLOCK(); + break; + +#undef ERROUT +DIOCCHANGERULE_error: + PF_RULES_WUNLOCK(); + if (newrule != NULL) { + counter_u64_free(newrule->states_cur); + counter_u64_free(newrule->states_tot); + counter_u64_free(newrule->src_nodes); + free(newrule, M_PFRULE); + } + if (kif != NULL) + free(kif, PFI_MTYPE); + break; + } + + case DIOCCLRSTATES: { + struct pf_state *s; + struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr; + u_int i, killed = 0; + + for (i = 0; i <= pf_hashmask; i++) { + struct pf_idhash *ih = &V_pf_idhash[i]; + +relock_DIOCCLRSTATES: + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) + if (!psk->psk_ifname[0] || + !strcmp(psk->psk_ifname, + s->kif->pfik_name)) { + /* + * Don't send out individual + * delete messages. + */ + s->state_flags |= PFSTATE_NOSYNC; + pf_unlink_state(s, PF_ENTER_LOCKED); + killed++; + goto relock_DIOCCLRSTATES; + } + PF_HASHROW_UNLOCK(ih); + } + psk->psk_killed = killed; + if (pfsync_clear_states_ptr != NULL) + pfsync_clear_states_ptr(V_pf_status.hostid, psk->psk_ifname); + break; + } + + case DIOCKILLSTATES: { + struct pf_state *s; + struct pf_state_key *sk; + struct pf_addr *srcaddr, *dstaddr; + u_int16_t srcport, dstport; + struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr; + u_int i, killed = 0; + + if (psk->psk_pfcmp.id) { + if (psk->psk_pfcmp.creatorid == 0) + psk->psk_pfcmp.creatorid = V_pf_status.hostid; + if ((s = pf_find_state_byid(psk->psk_pfcmp.id, + psk->psk_pfcmp.creatorid))) { + pf_unlink_state(s, PF_ENTER_LOCKED); + psk->psk_killed = 1; + } + break; + } + + for (i = 0; i <= pf_hashmask; i++) { + struct pf_idhash *ih = &V_pf_idhash[i]; + +relock_DIOCKILLSTATES: + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) { + sk = s->key[PF_SK_WIRE]; + if (s->direction == PF_OUT) { + srcaddr = &sk->addr[1]; + dstaddr = &sk->addr[0]; + srcport = sk->port[1]; + dstport = sk->port[0]; + } else { + srcaddr = &sk->addr[0]; + dstaddr = &sk->addr[1]; + srcport = sk->port[0]; + dstport = sk->port[1]; + } + + if ((!psk->psk_af || sk->af == psk->psk_af) + && (!psk->psk_proto || psk->psk_proto == + sk->proto) && + PF_MATCHA(psk->psk_src.neg, + &psk->psk_src.addr.v.a.addr, + &psk->psk_src.addr.v.a.mask, + srcaddr, sk->af) && + PF_MATCHA(psk->psk_dst.neg, + &psk->psk_dst.addr.v.a.addr, + &psk->psk_dst.addr.v.a.mask, + dstaddr, sk->af) && + (psk->psk_src.port_op == 0 || + pf_match_port(psk->psk_src.port_op, + psk->psk_src.port[0], psk->psk_src.port[1], + srcport)) && + (psk->psk_dst.port_op == 0 || + pf_match_port(psk->psk_dst.port_op, + psk->psk_dst.port[0], psk->psk_dst.port[1], + dstport)) && + (!psk->psk_label[0] || + (s->rule.ptr->label[0] && + !strcmp(psk->psk_label, + 
s->rule.ptr->label))) && + (!psk->psk_ifname[0] || + !strcmp(psk->psk_ifname, + s->kif->pfik_name))) { + pf_unlink_state(s, PF_ENTER_LOCKED); + killed++; + goto relock_DIOCKILLSTATES; + } + } + PF_HASHROW_UNLOCK(ih); + } + psk->psk_killed = killed; + break; + } + + case DIOCADDSTATE: { + struct pfioc_state *ps = (struct pfioc_state *)addr; + struct pfsync_state *sp = &ps->state; + + if (sp->timeout >= PFTM_MAX) { + error = EINVAL; + break; + } + if (pfsync_state_import_ptr != NULL) { + PF_RULES_RLOCK(); + error = pfsync_state_import_ptr(sp, PFSYNC_SI_IOCTL); + PF_RULES_RUNLOCK(); + } else + error = EOPNOTSUPP; + break; + } + + case DIOCGETSTATE: { + struct pfioc_state *ps = (struct pfioc_state *)addr; + struct pf_state *s; + + s = pf_find_state_byid(ps->state.id, ps->state.creatorid); + if (s == NULL) { + error = ENOENT; + break; + } + + pfsync_state_export(&ps->state, s); + PF_STATE_UNLOCK(s); + break; + } + + case DIOCGETSTATES: { + struct pfioc_states *ps = (struct pfioc_states *)addr; + struct pf_state *s; + struct pfsync_state *pstore, *p; + int i, nr; + + if (ps->ps_len == 0) { + nr = uma_zone_get_cur(V_pf_state_z); + ps->ps_len = sizeof(struct pfsync_state) * nr; + break; + } + + p = pstore = malloc(ps->ps_len, M_TEMP, M_WAITOK); + nr = 0; + + for (i = 0; i <= pf_hashmask; i++) { + struct pf_idhash *ih = &V_pf_idhash[i]; + + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) { + + if (s->timeout == PFTM_UNLINKED) + continue; + + if ((nr+1) * sizeof(*p) > ps->ps_len) { + PF_HASHROW_UNLOCK(ih); + goto DIOCGETSTATES_full; + } + pfsync_state_export(p, s); + p++; + nr++; + } + PF_HASHROW_UNLOCK(ih); + } +DIOCGETSTATES_full: + error = copyout(pstore, ps->ps_states, + sizeof(struct pfsync_state) * nr); + if (error) { + free(pstore, M_TEMP); + break; + } + ps->ps_len = sizeof(struct pfsync_state) * nr; + free(pstore, M_TEMP); + + break; + } + + case DIOCGETSTATUS: { + struct pf_status *s = (struct pf_status *)addr; + + PF_RULES_RLOCK(); + s->running = V_pf_status.running; + s->since = V_pf_status.since; + s->debug = V_pf_status.debug; + s->hostid = V_pf_status.hostid; + s->states = V_pf_status.states; + s->src_nodes = V_pf_status.src_nodes; + + for (int i = 0; i < PFRES_MAX; i++) + s->counters[i] = + counter_u64_fetch(V_pf_status.counters[i]); + for (int i = 0; i < LCNT_MAX; i++) + s->lcounters[i] = + counter_u64_fetch(V_pf_status.lcounters[i]); + for (int i = 0; i < FCNT_MAX; i++) + s->fcounters[i] = + counter_u64_fetch(V_pf_status.fcounters[i]); + for (int i = 0; i < SCNT_MAX; i++) + s->scounters[i] = + counter_u64_fetch(V_pf_status.scounters[i]); + + bcopy(V_pf_status.ifname, s->ifname, IFNAMSIZ); + bcopy(V_pf_status.pf_chksum, s->pf_chksum, + PF_MD5_DIGEST_LENGTH); + + pfi_update_status(s->ifname, s); + PF_RULES_RUNLOCK(); + break; + } + + case DIOCSETSTATUSIF: { + struct pfioc_if *pi = (struct pfioc_if *)addr; + + if (pi->ifname[0] == 0) { + bzero(V_pf_status.ifname, IFNAMSIZ); + break; + } + PF_RULES_WLOCK(); + strlcpy(V_pf_status.ifname, pi->ifname, IFNAMSIZ); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCCLRSTATUS: { + PF_RULES_WLOCK(); + for (int i = 0; i < PFRES_MAX; i++) + counter_u64_zero(V_pf_status.counters[i]); + for (int i = 0; i < FCNT_MAX; i++) + counter_u64_zero(V_pf_status.fcounters[i]); + for (int i = 0; i < SCNT_MAX; i++) + counter_u64_zero(V_pf_status.scounters[i]); + V_pf_status.since = time_second; + if (*V_pf_status.ifname) + pfi_update_status(V_pf_status.ifname, NULL); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCNATLOOK: { + struct pfioc_natlook *pnl 
= (struct pfioc_natlook *)addr; + struct pf_state_key *sk; + struct pf_state *state; + struct pf_state_key_cmp key; + int m = 0, direction = pnl->direction; + int sidx, didx; + + /* NATLOOK src and dst are reversed, so reverse sidx/didx */ + sidx = (direction == PF_IN) ? 1 : 0; + didx = (direction == PF_IN) ? 0 : 1; + + if (!pnl->proto || + PF_AZERO(&pnl->saddr, pnl->af) || + PF_AZERO(&pnl->daddr, pnl->af) || + ((pnl->proto == IPPROTO_TCP || + pnl->proto == IPPROTO_UDP) && + (!pnl->dport || !pnl->sport))) + error = EINVAL; + else { + bzero(&key, sizeof(key)); + key.af = pnl->af; + key.proto = pnl->proto; + PF_ACPY(&key.addr[sidx], &pnl->saddr, pnl->af); + key.port[sidx] = pnl->sport; + PF_ACPY(&key.addr[didx], &pnl->daddr, pnl->af); + key.port[didx] = pnl->dport; + + state = pf_find_state_all(&key, direction, &m); + + if (m > 1) + error = E2BIG; /* more than one state */ + else if (state != NULL) { + /* XXXGL: not locked read */ + sk = state->key[sidx]; + PF_ACPY(&pnl->rsaddr, &sk->addr[sidx], sk->af); + pnl->rsport = sk->port[sidx]; + PF_ACPY(&pnl->rdaddr, &sk->addr[didx], sk->af); + pnl->rdport = sk->port[didx]; + } else + error = ENOENT; + } + break; + } + + case DIOCSETTIMEOUT: { + struct pfioc_tm *pt = (struct pfioc_tm *)addr; + int old; + + if (pt->timeout < 0 || pt->timeout >= PFTM_MAX || + pt->seconds < 0) { + error = EINVAL; + break; + } + PF_RULES_WLOCK(); + old = V_pf_default_rule.timeout[pt->timeout]; + if (pt->timeout == PFTM_INTERVAL && pt->seconds == 0) + pt->seconds = 1; + V_pf_default_rule.timeout[pt->timeout] = pt->seconds; + if (pt->timeout == PFTM_INTERVAL && pt->seconds < old) + wakeup(pf_purge_thread); + pt->seconds = old; + PF_RULES_WUNLOCK(); + break; + } + + case DIOCGETTIMEOUT: { + struct pfioc_tm *pt = (struct pfioc_tm *)addr; + + if (pt->timeout < 0 || pt->timeout >= PFTM_MAX) { + error = EINVAL; + break; + } + PF_RULES_RLOCK(); + pt->seconds = V_pf_default_rule.timeout[pt->timeout]; + PF_RULES_RUNLOCK(); + break; + } + + case DIOCGETLIMIT: { + struct pfioc_limit *pl = (struct pfioc_limit *)addr; + + if (pl->index < 0 || pl->index >= PF_LIMIT_MAX) { + error = EINVAL; + break; + } + PF_RULES_RLOCK(); + pl->limit = V_pf_limits[pl->index].limit; + PF_RULES_RUNLOCK(); + break; + } + + case DIOCSETLIMIT: { + struct pfioc_limit *pl = (struct pfioc_limit *)addr; + int old_limit; + + PF_RULES_WLOCK(); + if (pl->index < 0 || pl->index >= PF_LIMIT_MAX || + V_pf_limits[pl->index].zone == NULL) { + PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + uma_zone_set_max(V_pf_limits[pl->index].zone, pl->limit); + old_limit = V_pf_limits[pl->index].limit; + V_pf_limits[pl->index].limit = pl->limit; + pl->limit = old_limit; + PF_RULES_WUNLOCK(); + break; + } + + case DIOCSETDEBUG: { + u_int32_t *level = (u_int32_t *)addr; + + PF_RULES_WLOCK(); + V_pf_status.debug = *level; + PF_RULES_WUNLOCK(); + break; + } + + case DIOCCLRRULECTRS: { + /* obsoleted by DIOCGETRULE with action=PF_GET_CLR_CNTR */ + struct pf_ruleset *ruleset = &pf_main_ruleset; + struct pf_rule *rule; + + PF_RULES_WLOCK(); + TAILQ_FOREACH(rule, + ruleset->rules[PF_RULESET_FILTER].active.ptr, entries) { + rule->evaluations = 0; + rule->packets[0] = rule->packets[1] = 0; + rule->bytes[0] = rule->bytes[1] = 0; + } + PF_RULES_WUNLOCK(); + break; + } + + case DIOCGIFSPEED: { + struct pf_ifspeed *psp = (struct pf_ifspeed *)addr; + struct pf_ifspeed ps; + struct ifnet *ifp; + + if (psp->ifname[0] != 0) { + /* Can we completely trust user-land? 
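No: the name is copied into a kernel-side buffer and resolved with ifunit() before use.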
*/ + strlcpy(ps.ifname, psp->ifname, IFNAMSIZ); + ifp = ifunit(ps.ifname); + if (ifp != NULL) + psp->baudrate = ifp->if_baudrate; + else + error = EINVAL; + } else + error = EINVAL; + break; + } + +#ifdef ALTQ + case DIOCSTARTALTQ: { + struct pf_altq *altq; + + PF_RULES_WLOCK(); + /* enable all altq interfaces on active list */ + TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { + if (altq->qname[0] == 0 && (altq->local_flags & + PFALTQ_FLAG_IF_REMOVED) == 0) { + error = pf_enable_altq(altq); + if (error != 0) + break; + } + } + if (error == 0) + V_pf_altq_running = 1; + PF_RULES_WUNLOCK(); + DPFPRINTF(PF_DEBUG_MISC, ("altq: started\n")); + break; + } + + case DIOCSTOPALTQ: { + struct pf_altq *altq; + + PF_RULES_WLOCK(); + /* disable all altq interfaces on active list */ + TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { + if (altq->qname[0] == 0 && (altq->local_flags & + PFALTQ_FLAG_IF_REMOVED) == 0) { + error = pf_disable_altq(altq); + if (error != 0) + break; + } + } + if (error == 0) + V_pf_altq_running = 0; + PF_RULES_WUNLOCK(); + DPFPRINTF(PF_DEBUG_MISC, ("altq: stopped\n")); + break; + } + + case DIOCADDALTQ: { + struct pfioc_altq *pa = (struct pfioc_altq *)addr; + struct pf_altq *altq, *a; + struct ifnet *ifp; + + altq = malloc(sizeof(*altq), M_PFALTQ, M_WAITOK); + bcopy(&pa->altq, altq, sizeof(struct pf_altq)); + altq->local_flags = 0; + + PF_RULES_WLOCK(); + if (pa->ticket != V_ticket_altqs_inactive) { + PF_RULES_WUNLOCK(); + free(altq, M_PFALTQ); + error = EBUSY; + break; + } + + /* + * if this is for a queue, find the discipline and + * copy the necessary fields + */ + if (altq->qname[0] != 0) { + if ((altq->qid = pf_qname2qid(altq->qname)) == 0) { + PF_RULES_WUNLOCK(); + error = EBUSY; + free(altq, M_PFALTQ); + break; + } + altq->altq_disc = NULL; + TAILQ_FOREACH(a, V_pf_altqs_inactive, entries) { + if (strncmp(a->ifname, altq->ifname, + IFNAMSIZ) == 0 && a->qname[0] == 0) { + altq->altq_disc = a->altq_disc; + break; + } + } + } + + if ((ifp = ifunit(altq->ifname)) == NULL) + altq->local_flags |= PFALTQ_FLAG_IF_REMOVED; + else + error = altq_add(altq); + + if (error) { + PF_RULES_WUNLOCK(); + free(altq, M_PFALTQ); + break; + } + + TAILQ_INSERT_TAIL(V_pf_altqs_inactive, altq, entries); + bcopy(altq, &pa->altq, sizeof(struct pf_altq)); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCGETALTQS: { + struct pfioc_altq *pa = (struct pfioc_altq *)addr; + struct pf_altq *altq; + + PF_RULES_RLOCK(); + pa->nr = 0; + TAILQ_FOREACH(altq, V_pf_altqs_active, entries) + pa->nr++; + pa->ticket = V_ticket_altqs_active; + PF_RULES_RUNLOCK(); + break; + } + + case DIOCGETALTQ: { + struct pfioc_altq *pa = (struct pfioc_altq *)addr; + struct pf_altq *altq; + u_int32_t nr; + + PF_RULES_RLOCK(); + if (pa->ticket != V_ticket_altqs_active) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + nr = 0; + altq = TAILQ_FIRST(V_pf_altqs_active); + while ((altq != NULL) && (nr < pa->nr)) { + altq = TAILQ_NEXT(altq, entries); + nr++; + } + if (altq == NULL) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + bcopy(altq, &pa->altq, sizeof(struct pf_altq)); + PF_RULES_RUNLOCK(); + break; + } + + case DIOCCHANGEALTQ: + /* CHANGEALTQ not supported yet! 
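The request is recognized but always fails with ENODEV.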
*/ + error = ENODEV; + break; + + case DIOCGETQSTATS: { + struct pfioc_qstats *pq = (struct pfioc_qstats *)addr; + struct pf_altq *altq; + u_int32_t nr; + int nbytes; + + PF_RULES_RLOCK(); + if (pq->ticket != V_ticket_altqs_active) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + nbytes = pq->nbytes; + nr = 0; + altq = TAILQ_FIRST(V_pf_altqs_active); + while ((altq != NULL) && (nr < pq->nr)) { + altq = TAILQ_NEXT(altq, entries); + nr++; + } + if (altq == NULL) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + + if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) { + PF_RULES_RUNLOCK(); + error = ENXIO; + break; + } + PF_RULES_RUNLOCK(); + error = altq_getqstats(altq, pq->buf, &nbytes); + if (error == 0) { + pq->scheduler = altq->scheduler; + pq->nbytes = nbytes; + } + break; + } +#endif /* ALTQ */ + + case DIOCBEGINADDRS: { + struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + + PF_RULES_WLOCK(); + pf_empty_pool(&V_pf_pabuf); + pp->ticket = ++V_ticket_pabuf; + PF_RULES_WUNLOCK(); + break; + } + + case DIOCADDADDR: { + struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + struct pf_pooladdr *pa; + struct pfi_kif *kif = NULL; + +#ifndef INET + if (pp->af == AF_INET) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#ifndef INET6 + if (pp->af == AF_INET6) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + if (pp->addr.addr.type != PF_ADDR_ADDRMASK && + pp->addr.addr.type != PF_ADDR_DYNIFTL && + pp->addr.addr.type != PF_ADDR_TABLE) { + error = EINVAL; + break; + } + pa = malloc(sizeof(*pa), M_PFRULE, M_WAITOK); + bcopy(&pp->addr, pa, sizeof(struct pf_pooladdr)); + if (pa->ifname[0]) + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + PF_RULES_WLOCK(); + if (pp->ticket != V_ticket_pabuf) { + PF_RULES_WUNLOCK(); + if (pa->ifname[0]) + free(kif, PFI_MTYPE); + free(pa, M_PFRULE); + error = EBUSY; + break; + } + if (pa->ifname[0]) { + pa->kif = pfi_kif_attach(kif, pa->ifname); + pfi_kif_ref(pa->kif); + } else + pa->kif = NULL; + if (pa->addr.type == PF_ADDR_DYNIFTL && ((error = + pfi_dynaddr_setup(&pa->addr, pp->af)) != 0)) { + if (pa->ifname[0]) + pfi_kif_unref(pa->kif); + PF_RULES_WUNLOCK(); + free(pa, M_PFRULE); + break; + } + TAILQ_INSERT_TAIL(&V_pf_pabuf, pa, entries); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCGETADDRS: { + struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + struct pf_pool *pool; + struct pf_pooladdr *pa; + + PF_RULES_RLOCK(); + pp->nr = 0; + pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, + pp->r_num, 0, 1, 0); + if (pool == NULL) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + TAILQ_FOREACH(pa, &pool->list, entries) + pp->nr++; + PF_RULES_RUNLOCK(); + break; + } + + case DIOCGETADDR: { + struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + struct pf_pool *pool; + struct pf_pooladdr *pa; + u_int32_t nr = 0; + + PF_RULES_RLOCK(); + pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, + pp->r_num, 0, 1, 1); + if (pool == NULL) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + pa = TAILQ_FIRST(&pool->list); + while ((pa != NULL) && (nr < pp->nr)) { + pa = TAILQ_NEXT(pa, entries); + nr++; + } + if (pa == NULL) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + bcopy(pa, &pp->addr, sizeof(struct pf_pooladdr)); + pf_addr_copyout(&pp->addr.addr); + PF_RULES_RUNLOCK(); + break; + } + + case DIOCCHANGEADDR: { + struct pfioc_pooladdr *pca = (struct pfioc_pooladdr *)addr; + struct pf_pool *pool; + struct pf_pooladdr *oldpa = NULL, *newpa = NULL; + struct pf_ruleset *ruleset; + 
struct pfi_kif *kif = NULL; + + if (pca->action < PF_CHANGE_ADD_HEAD || + pca->action > PF_CHANGE_REMOVE) { + error = EINVAL; + break; + } + if (pca->addr.addr.type != PF_ADDR_ADDRMASK && + pca->addr.addr.type != PF_ADDR_DYNIFTL && + pca->addr.addr.type != PF_ADDR_TABLE) { + error = EINVAL; + break; + } + + if (pca->action != PF_CHANGE_REMOVE) { +#ifndef INET + if (pca->af == AF_INET) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#ifndef INET6 + if (pca->af == AF_INET6) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + newpa = malloc(sizeof(*newpa), M_PFRULE, M_WAITOK); + bcopy(&pca->addr, newpa, sizeof(struct pf_pooladdr)); + if (newpa->ifname[0]) + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + newpa->kif = NULL; + } + +#define ERROUT(x) { error = (x); goto DIOCCHANGEADDR_error; } + PF_RULES_WLOCK(); + ruleset = pf_find_ruleset(pca->anchor); + if (ruleset == NULL) + ERROUT(EBUSY); + + pool = pf_get_pool(pca->anchor, pca->ticket, pca->r_action, + pca->r_num, pca->r_last, 1, 1); + if (pool == NULL) + ERROUT(EBUSY); + + if (pca->action != PF_CHANGE_REMOVE) { + if (newpa->ifname[0]) { + newpa->kif = pfi_kif_attach(kif, newpa->ifname); + pfi_kif_ref(newpa->kif); + kif = NULL; + } + + switch (newpa->addr.type) { + case PF_ADDR_DYNIFTL: + error = pfi_dynaddr_setup(&newpa->addr, + pca->af); + break; + case PF_ADDR_TABLE: + newpa->addr.p.tbl = pfr_attach_table(ruleset, + newpa->addr.v.tblname); + if (newpa->addr.p.tbl == NULL) + error = ENOMEM; + break; + } + if (error) + goto DIOCCHANGEADDR_error; + } + + switch (pca->action) { + case PF_CHANGE_ADD_HEAD: + oldpa = TAILQ_FIRST(&pool->list); + break; + case PF_CHANGE_ADD_TAIL: + oldpa = TAILQ_LAST(&pool->list, pf_palist); + break; + default: + oldpa = TAILQ_FIRST(&pool->list); + for (int i = 0; oldpa && i < pca->nr; i++) + oldpa = TAILQ_NEXT(oldpa, entries); + + if (oldpa == NULL) + ERROUT(EINVAL); + } + + if (pca->action == PF_CHANGE_REMOVE) { + TAILQ_REMOVE(&pool->list, oldpa, entries); + switch (oldpa->addr.type) { + case PF_ADDR_DYNIFTL: + pfi_dynaddr_remove(oldpa->addr.p.dyn); + break; + case PF_ADDR_TABLE: + pfr_detach_table(oldpa->addr.p.tbl); + break; + } + if (oldpa->kif) + pfi_kif_unref(oldpa->kif); + free(oldpa, M_PFRULE); + } else { + if (oldpa == NULL) + TAILQ_INSERT_TAIL(&pool->list, newpa, entries); + else if (pca->action == PF_CHANGE_ADD_HEAD || + pca->action == PF_CHANGE_ADD_BEFORE) + TAILQ_INSERT_BEFORE(oldpa, newpa, entries); + else + TAILQ_INSERT_AFTER(&pool->list, oldpa, + newpa, entries); + } + + pool->cur = TAILQ_FIRST(&pool->list); + PF_ACPY(&pool->counter, &pool->cur->addr.v.a.addr, pca->af); + PF_RULES_WUNLOCK(); + break; + +#undef ERROUT +DIOCCHANGEADDR_error: + if (newpa->kif) + pfi_kif_unref(newpa->kif); + PF_RULES_WUNLOCK(); + if (newpa != NULL) + free(newpa, M_PFRULE); + if (kif != NULL) + free(kif, PFI_MTYPE); + break; + } + + case DIOCGETRULESETS: { + struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr; + struct pf_ruleset *ruleset; + struct pf_anchor *anchor; + + PF_RULES_RLOCK(); + pr->path[sizeof(pr->path) - 1] = 0; + if ((ruleset = pf_find_ruleset(pr->path)) == NULL) { + PF_RULES_RUNLOCK(); + error = ENOENT; + break; + } + pr->nr = 0; + if (ruleset->anchor == NULL) { + /* XXX kludge for pf_main_ruleset */ + RB_FOREACH(anchor, pf_anchor_global, &V_pf_anchors) + if (anchor->parent == NULL) + pr->nr++; + } else { + RB_FOREACH(anchor, pf_anchor_node, + &ruleset->anchor->children) + pr->nr++; + } + PF_RULES_RUNLOCK(); + break; + } + + case DIOCGETRULESET: { + struct pfioc_ruleset *pr 
= (struct pfioc_ruleset *)addr; + struct pf_ruleset *ruleset; + struct pf_anchor *anchor; + u_int32_t nr = 0; + + PF_RULES_RLOCK(); + pr->path[sizeof(pr->path) - 1] = 0; + if ((ruleset = pf_find_ruleset(pr->path)) == NULL) { + PF_RULES_RUNLOCK(); + error = ENOENT; + break; + } + pr->name[0] = 0; + if (ruleset->anchor == NULL) { + /* XXX kludge for pf_main_ruleset */ + RB_FOREACH(anchor, pf_anchor_global, &V_pf_anchors) + if (anchor->parent == NULL && nr++ == pr->nr) { + strlcpy(pr->name, anchor->name, + sizeof(pr->name)); + break; + } + } else { + RB_FOREACH(anchor, pf_anchor_node, + &ruleset->anchor->children) + if (nr++ == pr->nr) { + strlcpy(pr->name, anchor->name, + sizeof(pr->name)); + break; + } + } + if (!pr->name[0]) + error = EBUSY; + PF_RULES_RUNLOCK(); + break; + } + + case DIOCRCLRTABLES: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != 0) { + error = ENODEV; + break; + } + PF_RULES_WLOCK(); + error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel, + io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCRADDTABLES: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_table *pfrts; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_table); + pfrts = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfrts, totlen); + if (error) { + free(pfrts, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_add_tables(pfrts, io->pfrio_size, + &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + free(pfrts, M_TEMP); + break; + } + + case DIOCRDELTABLES: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_table *pfrts; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_table); + pfrts = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfrts, totlen); + if (error) { + free(pfrts, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_del_tables(pfrts, io->pfrio_size, + &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + free(pfrts, M_TEMP); + break; + } + + case DIOCRGETTABLES: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_table *pfrts; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_table); + pfrts = malloc(totlen, M_TEMP, M_WAITOK); + PF_RULES_RLOCK(); + error = pfr_get_tables(&io->pfrio_table, pfrts, + &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_RUNLOCK(); + if (error == 0) + error = copyout(pfrts, io->pfrio_buffer, totlen); + free(pfrts, M_TEMP); + break; + } + + case DIOCRGETTSTATS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_tstats *pfrtstats; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_tstats)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_tstats); + pfrtstats = malloc(totlen, M_TEMP, M_WAITOK); + PF_RULES_WLOCK(); + error = pfr_get_tstats(&io->pfrio_table, pfrtstats, + &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + if (error == 0) + error = copyout(pfrtstats, io->pfrio_buffer, totlen); + free(pfrtstats, M_TEMP); + break; + } + + case DIOCRCLRTSTATS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_table *pfrts; + size_t 
totlen; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_table); + pfrts = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfrts, totlen); + if (error) { + free(pfrts, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_clr_tstats(pfrts, io->pfrio_size, + &io->pfrio_nzero, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + free(pfrts, M_TEMP); + break; + } + + case DIOCRSETTFLAGS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_table *pfrts; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_table); + pfrts = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfrts, totlen); + if (error) { + free(pfrts, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_set_tflags(pfrts, io->pfrio_size, + io->pfrio_setflag, io->pfrio_clrflag, &io->pfrio_nchange, + &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + free(pfrts, M_TEMP); + break; + } + + case DIOCRCLRADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != 0) { + error = ENODEV; + break; + } + PF_RULES_WLOCK(); + error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel, + io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCRADDADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfras, totlen); + if (error) { + free(pfras, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_add_addrs(&io->pfrio_table, pfras, + io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags | + PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK) + error = copyout(pfras, io->pfrio_buffer, totlen); + free(pfras, M_TEMP); + break; + } + + case DIOCRDELADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfras, totlen); + if (error) { + free(pfras, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_del_addrs(&io->pfrio_table, pfras, + io->pfrio_size, &io->pfrio_ndel, io->pfrio_flags | + PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK) + error = copyout(pfras, io->pfrio_buffer, totlen); + free(pfras, M_TEMP); + break; + } + + case DIOCRSETADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen, count; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + count = max(io->pfrio_size, io->pfrio_size2); + totlen = count * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfras, totlen); + if (error) { + free(pfras, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_set_addrs(&io->pfrio_table, pfras, + io->pfrio_size, &io->pfrio_size2, &io->pfrio_nadd, + &io->pfrio_ndel, &io->pfrio_nchange, io->pfrio_flags | + PFR_FLAG_USERIOCTL, 0); + PF_RULES_WUNLOCK(); 
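+ /* with PFR_FLAG_FEEDBACK, per-address results are copied back to the caller */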
+ if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK) + error = copyout(pfras, io->pfrio_buffer, totlen); + free(pfras, M_TEMP); + break; + } + + case DIOCRGETADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + PF_RULES_RLOCK(); + error = pfr_get_addrs(&io->pfrio_table, pfras, + &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_RUNLOCK(); + if (error == 0) + error = copyout(pfras, io->pfrio_buffer, totlen); + free(pfras, M_TEMP); + break; + } + + case DIOCRGETASTATS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_astats *pfrastats; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_astats)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_astats); + pfrastats = malloc(totlen, M_TEMP, M_WAITOK); + PF_RULES_RLOCK(); + error = pfr_get_astats(&io->pfrio_table, pfrastats, + &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_RUNLOCK(); + if (error == 0) + error = copyout(pfrastats, io->pfrio_buffer, totlen); + free(pfrastats, M_TEMP); + break; + } + + case DIOCRCLRASTATS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfras, totlen); + if (error) { + free(pfras, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_clr_astats(&io->pfrio_table, pfras, + io->pfrio_size, &io->pfrio_nzero, io->pfrio_flags | + PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK) + error = copyout(pfras, io->pfrio_buffer, totlen); + free(pfras, M_TEMP); + break; + } + + case DIOCRTSTADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfras, totlen); + if (error) { + free(pfras, M_TEMP); + break; + } + PF_RULES_RLOCK(); + error = pfr_tst_addrs(&io->pfrio_table, pfras, + io->pfrio_size, &io->pfrio_nmatch, io->pfrio_flags | + PFR_FLAG_USERIOCTL); + PF_RULES_RUNLOCK(); + if (error == 0) + error = copyout(pfras, io->pfrio_buffer, totlen); + free(pfras, M_TEMP); + break; + } + + case DIOCRINADEFINE: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfras, totlen); + if (error) { + free(pfras, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_ina_define(&io->pfrio_table, pfras, + io->pfrio_size, &io->pfrio_nadd, &io->pfrio_naddr, + io->pfrio_ticket, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + free(pfras, M_TEMP); + break; + } + + case DIOCOSFPADD: { + struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr; + PF_RULES_WLOCK(); + error = pf_osfp_add(io); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCOSFPGET: { + struct 
pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr; + PF_RULES_RLOCK(); + error = pf_osfp_get(io); + PF_RULES_RUNLOCK(); + break; + } + + case DIOCXBEGIN: { + struct pfioc_trans *io = (struct pfioc_trans *)addr; + struct pfioc_trans_e *ioes, *ioe; + size_t totlen; + int i; + + if (io->esize != sizeof(*ioe)) { + error = ENODEV; + break; + } + totlen = sizeof(struct pfioc_trans_e) * io->size; + ioes = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->array, ioes, totlen); + if (error) { + free(ioes, M_TEMP); + break; + } + PF_RULES_WLOCK(); + for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { + switch (ioe->rs_num) { +#ifdef ALTQ + case PF_RULESET_ALTQ: + if (ioe->anchor[0]) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EINVAL; + goto fail; + } + if ((error = pf_begin_altq(&ioe->ticket))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; + } + break; +#endif /* ALTQ */ + case PF_RULESET_TABLE: + { + struct pfr_table table; + + bzero(&table, sizeof(table)); + strlcpy(table.pfrt_anchor, ioe->anchor, + sizeof(table.pfrt_anchor)); + if ((error = pfr_ina_begin(&table, + &ioe->ticket, NULL, 0))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; + } + break; + } + default: + if ((error = pf_begin_rules(&ioe->ticket, + ioe->rs_num, ioe->anchor))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; + } + break; + } + } + PF_RULES_WUNLOCK(); + error = copyout(ioes, io->array, totlen); + free(ioes, M_TEMP); + break; + } + + case DIOCXROLLBACK: { + struct pfioc_trans *io = (struct pfioc_trans *)addr; + struct pfioc_trans_e *ioe, *ioes; + size_t totlen; + int i; + + if (io->esize != sizeof(*ioe)) { + error = ENODEV; + break; + } + totlen = sizeof(struct pfioc_trans_e) * io->size; + ioes = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->array, ioes, totlen); + if (error) { + free(ioes, M_TEMP); + break; + } + PF_RULES_WLOCK(); + for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { + switch (ioe->rs_num) { +#ifdef ALTQ + case PF_RULESET_ALTQ: + if (ioe->anchor[0]) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EINVAL; + goto fail; + } + if ((error = pf_rollback_altq(ioe->ticket))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; /* really bad */ + } + break; +#endif /* ALTQ */ + case PF_RULESET_TABLE: + { + struct pfr_table table; + + bzero(&table, sizeof(table)); + strlcpy(table.pfrt_anchor, ioe->anchor, + sizeof(table.pfrt_anchor)); + if ((error = pfr_ina_rollback(&table, + ioe->ticket, NULL, 0))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; /* really bad */ + } + break; + } + default: + if ((error = pf_rollback_rules(ioe->ticket, + ioe->rs_num, ioe->anchor))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; /* really bad */ + } + break; + } + } + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + break; + } + + case DIOCXCOMMIT: { + struct pfioc_trans *io = (struct pfioc_trans *)addr; + struct pfioc_trans_e *ioe, *ioes; + struct pf_ruleset *rs; + size_t totlen; + int i; + + if (io->esize != sizeof(*ioe)) { + error = ENODEV; + break; + } + totlen = sizeof(struct pfioc_trans_e) * io->size; + ioes = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->array, ioes, totlen); + if (error) { + free(ioes, M_TEMP); + break; + } + PF_RULES_WLOCK(); + /* First makes sure everything will succeed. 
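Every ticket and ruleset is validated before anything is committed, so the commit loop below cannot fail halfway through and leave a partial commit behind.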
*/ + for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { + switch (ioe->rs_num) { +#ifdef ALTQ + case PF_RULESET_ALTQ: + if (ioe->anchor[0]) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EINVAL; + goto fail; + } + if (!V_altqs_inactive_open || ioe->ticket != + V_ticket_altqs_inactive) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EBUSY; + goto fail; + } + break; +#endif /* ALTQ */ + case PF_RULESET_TABLE: + rs = pf_find_ruleset(ioe->anchor); + if (rs == NULL || !rs->topen || ioe->ticket != + rs->tticket) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EBUSY; + goto fail; + } + break; + default: + if (ioe->rs_num < 0 || ioe->rs_num >= + PF_RULESET_MAX) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EINVAL; + goto fail; + } + rs = pf_find_ruleset(ioe->anchor); + if (rs == NULL || + !rs->rules[ioe->rs_num].inactive.open || + rs->rules[ioe->rs_num].inactive.ticket != + ioe->ticket) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EBUSY; + goto fail; + } + break; + } + } + /* Now do the commit - no errors should happen here. */ + for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { + switch (ioe->rs_num) { +#ifdef ALTQ + case PF_RULESET_ALTQ: + if ((error = pf_commit_altq(ioe->ticket))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; /* really bad */ + } + break; +#endif /* ALTQ */ + case PF_RULESET_TABLE: + { + struct pfr_table table; + + bzero(&table, sizeof(table)); + strlcpy(table.pfrt_anchor, ioe->anchor, + sizeof(table.pfrt_anchor)); + if ((error = pfr_ina_commit(&table, + ioe->ticket, NULL, NULL, 0))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; /* really bad */ + } + break; + } + default: + if ((error = pf_commit_rules(ioe->ticket, + ioe->rs_num, ioe->anchor))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; /* really bad */ + } + break; + } + } + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + break; + } + + case DIOCGETSRCNODES: { + struct pfioc_src_nodes *psn = (struct pfioc_src_nodes *)addr; + struct pf_srchash *sh; + struct pf_src_node *n, *p, *pstore; + uint32_t i, nr = 0; + + if (psn->psn_len == 0) { + for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; + i++, sh++) { + PF_HASHROW_LOCK(sh); + LIST_FOREACH(n, &sh->nodes, entry) + nr++; + PF_HASHROW_UNLOCK(sh); + } + psn->psn_len = sizeof(struct pf_src_node) * nr; + break; + } + + p = pstore = malloc(psn->psn_len, M_TEMP, M_WAITOK); + for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; + i++, sh++) { + PF_HASHROW_LOCK(sh); + LIST_FOREACH(n, &sh->nodes, entry) { + int secs = time_uptime, diff; + + if ((nr + 1) * sizeof(*p) > (unsigned)psn->psn_len) + break; + + bcopy(n, p, sizeof(struct pf_src_node)); + if (n->rule.ptr != NULL) + p->rule.nr = n->rule.ptr->nr; + p->creation = secs - p->creation; + if (p->expire > secs) + p->expire -= secs; + else + p->expire = 0; + + /* Adjust the connection rate estimate. 
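The count is aged linearly over the rate window and reset to zero once the window has fully elapsed.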
*/ + diff = secs - n->conn_rate.last; + if (diff >= n->conn_rate.seconds) + p->conn_rate.count = 0; + else + p->conn_rate.count -= + n->conn_rate.count * diff / + n->conn_rate.seconds; + p++; + nr++; + } + PF_HASHROW_UNLOCK(sh); + } + error = copyout(pstore, psn->psn_src_nodes, + sizeof(struct pf_src_node) * nr); + if (error) { + free(pstore, M_TEMP); + break; + } + psn->psn_len = sizeof(struct pf_src_node) * nr; + free(pstore, M_TEMP); + break; + } + + case DIOCCLRSRCNODES: { + + pf_clear_srcnodes(NULL); + pf_purge_expired_src_nodes(); + break; + } + + case DIOCKILLSRCNODES: + pf_kill_srcnodes((struct pfioc_src_node_kill *)addr); + break; + + case DIOCSETHOSTID: { + u_int32_t *hostid = (u_int32_t *)addr; + + PF_RULES_WLOCK(); + if (*hostid == 0) + V_pf_status.hostid = arc4random(); + else + V_pf_status.hostid = *hostid; + PF_RULES_WUNLOCK(); + break; + } + + case DIOCOSFPFLUSH: + PF_RULES_WLOCK(); + pf_osfp_flush(); + PF_RULES_WUNLOCK(); + break; + + case DIOCIGETIFACES: { + struct pfioc_iface *io = (struct pfioc_iface *)addr; + struct pfi_kif *ifstore; + size_t bufsiz; + + if (io->pfiio_esize != sizeof(struct pfi_kif)) { + error = ENODEV; + break; + } + + bufsiz = io->pfiio_size * sizeof(struct pfi_kif); + ifstore = malloc(bufsiz, M_TEMP, M_WAITOK); + PF_RULES_RLOCK(); + pfi_get_ifaces(io->pfiio_name, ifstore, &io->pfiio_size); + PF_RULES_RUNLOCK(); + error = copyout(ifstore, io->pfiio_buffer, bufsiz); + free(ifstore, M_TEMP); + break; + } + + case DIOCSETIFFLAG: { + struct pfioc_iface *io = (struct pfioc_iface *)addr; + + PF_RULES_WLOCK(); + error = pfi_set_flags(io->pfiio_name, io->pfiio_flags); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCCLRIFFLAG: { + struct pfioc_iface *io = (struct pfioc_iface *)addr; + + PF_RULES_WLOCK(); + error = pfi_clear_flags(io->pfiio_name, io->pfiio_flags); + PF_RULES_WUNLOCK(); + break; + } + + default: + error = ENODEV; + break; + } +fail: + if (sx_xlocked(&pf_ioctl_lock)) + sx_xunlock(&pf_ioctl_lock); + CURVNET_RESTORE(); + + return (error); +} + +void +pfsync_state_export(struct pfsync_state *sp, struct pf_state *st) +{ + bzero(sp, sizeof(struct pfsync_state)); + + /* copy from state key */ + sp->key[PF_SK_WIRE].addr[0] = st->key[PF_SK_WIRE]->addr[0]; + sp->key[PF_SK_WIRE].addr[1] = st->key[PF_SK_WIRE]->addr[1]; + sp->key[PF_SK_WIRE].port[0] = st->key[PF_SK_WIRE]->port[0]; + sp->key[PF_SK_WIRE].port[1] = st->key[PF_SK_WIRE]->port[1]; + sp->key[PF_SK_STACK].addr[0] = st->key[PF_SK_STACK]->addr[0]; + sp->key[PF_SK_STACK].addr[1] = st->key[PF_SK_STACK]->addr[1]; + sp->key[PF_SK_STACK].port[0] = st->key[PF_SK_STACK]->port[0]; + sp->key[PF_SK_STACK].port[1] = st->key[PF_SK_STACK]->port[1]; + sp->proto = st->key[PF_SK_WIRE]->proto; + sp->af = st->key[PF_SK_WIRE]->af; + + /* copy from state */ + strlcpy(sp->ifname, st->kif->pfik_name, sizeof(sp->ifname)); + bcopy(&st->rt_addr, &sp->rt_addr, sizeof(sp->rt_addr)); + sp->creation = htonl(time_uptime - st->creation); + sp->expire = pf_state_expires(st); + if (sp->expire <= time_uptime) + sp->expire = htonl(0); + else + sp->expire = htonl(sp->expire - time_uptime); + + sp->direction = st->direction; + sp->log = st->log; + sp->timeout = st->timeout; + sp->state_flags = st->state_flags; + if (st->src_node) + sp->sync_flags |= PFSYNC_FLAG_SRCNODE; + if (st->nat_src_node) + sp->sync_flags |= PFSYNC_FLAG_NATSRCNODE; + + sp->id = st->id; + sp->creatorid = st->creatorid; + pf_state_peer_hton(&st->src, &sp->src); + pf_state_peer_hton(&st->dst, &sp->dst); + + if (st->rule.ptr == NULL) + sp->rule = htonl(-1); + else + 
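/* the matched rule's number, in network byte order */
+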
sp->rule = htonl(st->rule.ptr->nr);
+ if (st->anchor.ptr == NULL)
+ sp->anchor = htonl(-1);
+ else
+ sp->anchor = htonl(st->anchor.ptr->nr);
+ if (st->nat_rule.ptr == NULL)
+ sp->nat_rule = htonl(-1);
+ else
+ sp->nat_rule = htonl(st->nat_rule.ptr->nr);
+
+ pf_state_counter_hton(st->packets[0], sp->packets[0]);
+ pf_state_counter_hton(st->packets[1], sp->packets[1]);
+ pf_state_counter_hton(st->bytes[0], sp->bytes[0]);
+ pf_state_counter_hton(st->bytes[1], sp->bytes[1]);
+
+}
+
+static void
+pf_tbladdr_copyout(struct pf_addr_wrap *aw)
+{
+ struct pfr_ktable *kt;
+
+ KASSERT(aw->type == PF_ADDR_TABLE, ("%s: type %u", __func__, aw->type));
+
+ kt = aw->p.tbl;
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
+ kt = kt->pfrkt_root;
+ aw->p.tbl = NULL;
+ aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ?
+ kt->pfrkt_cnt : -1;
+}
+
+/*
+ * XXX - Check for version mismatch!!!
+ */
+static void
+pf_clear_states(void)
+{
+ struct pf_state *s;
+ u_int i;
+
+ for (i = 0; i <= pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+relock:
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ s->timeout = PFTM_PURGE;
+ /* Don't send out individual delete messages. */
+ s->state_flags |= PFSTATE_NOSYNC;
+ pf_unlink_state(s, PF_ENTER_LOCKED);
+ goto relock;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+}
+
+static int
+pf_clear_tables(void)
+{
+ struct pfioc_table io;
+ int error;
+
+ bzero(&io, sizeof(io));
+
+ error = pfr_clr_tables(&io.pfrio_table, &io.pfrio_ndel,
+ io.pfrio_flags);
+
+ return (error);
+}
+
+static void
+pf_clear_srcnodes(struct pf_src_node *n)
+{
+ struct pf_state *s;
+ int i;
+
+ for (i = 0; i <= pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ if (n == NULL || n == s->src_node)
+ s->src_node = NULL;
+ if (n == NULL || n == s->nat_src_node)
+ s->nat_src_node = NULL;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+
+ if (n == NULL) {
+ struct pf_srchash *sh;
+
+ for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask;
+ i++, sh++) {
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH(n, &sh->nodes, entry) {
+ n->expire = 1;
+ n->states = 0;
+ }
+ PF_HASHROW_UNLOCK(sh);
+ }
+ } else {
+ /* XXX: hash slot should already be locked here. */
+ n->expire = 1;
+ n->states = 0;
+ }
+}
+
+static void
+pf_kill_srcnodes(struct pfioc_src_node_kill *psnk)
+{
+ struct pf_src_node_list kill;
+
+ LIST_INIT(&kill);
+ for (int i = 0; i <= pf_srchashmask; i++) {
+ struct pf_srchash *sh = &V_pf_srchash[i];
+ struct pf_src_node *sn, *tmp;
+
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH_SAFE(sn, &sh->nodes, entry, tmp)
+ if (PF_MATCHA(psnk->psnk_src.neg,
+ &psnk->psnk_src.addr.v.a.addr,
+ &psnk->psnk_src.addr.v.a.mask,
+ &sn->addr, sn->af) &&
+ PF_MATCHA(psnk->psnk_dst.neg,
+ &psnk->psnk_dst.addr.v.a.addr,
+ &psnk->psnk_dst.addr.v.a.mask,
+ &sn->raddr, sn->af)) {
+ pf_unlink_src_node(sn);
+ LIST_INSERT_HEAD(&kill, sn, entry);
+ sn->expire = 1;
+ }
+ PF_HASHROW_UNLOCK(sh);
+ }
+
+ for (int i = 0; i <= pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+ struct pf_state *s;
+
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ if (s->src_node && s->src_node->expire == 1)
+ s->src_node = NULL;
+ if (s->nat_src_node && s->nat_src_node->expire == 1)
+ s->nat_src_node = NULL;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+
+ psnk->psnk_killed = pf_free_src_nodes(&kill);
+}
+
+/*
+ * XXX - Check for version mismatch!!!
+ */
+
+/*
+ * Duplicate pfctl -Fa operation to get rid of as much as we can.
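+ * Each ruleset type is flushed by opening and immediately committing an
+ * empty transaction; tables, states and source nodes are cleared afterwards,
+ * and the per-VNET counters are freed last.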
+ */
+static int
+shutdown_pf(void)
+{
+ int error = 0;
+ u_int32_t t[5];
+ char nn = '\0';
+
+ do {
+ if ((error = pf_begin_rules(&t[0], PF_RULESET_SCRUB, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: SCRUB\n"));
+ break;
+ }
+ if ((error = pf_begin_rules(&t[1], PF_RULESET_FILTER, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: FILTER\n"));
+ break; /* XXX: rollback? */
+ }
+ if ((error = pf_begin_rules(&t[2], PF_RULESET_NAT, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: NAT\n"));
+ break; /* XXX: rollback? */
+ }
+ if ((error = pf_begin_rules(&t[3], PF_RULESET_BINAT, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: BINAT\n"));
+ break; /* XXX: rollback? */
+ }
+ if ((error = pf_begin_rules(&t[4], PF_RULESET_RDR, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: RDR\n"));
+ break; /* XXX: rollback? */
+ }
+
+ /* XXX: these should always succeed here */
+ pf_commit_rules(t[0], PF_RULESET_SCRUB, &nn);
+ pf_commit_rules(t[1], PF_RULESET_FILTER, &nn);
+ pf_commit_rules(t[2], PF_RULESET_NAT, &nn);
+ pf_commit_rules(t[3], PF_RULESET_BINAT, &nn);
+ pf_commit_rules(t[4], PF_RULESET_RDR, &nn);
+
+ if ((error = pf_clear_tables()) != 0)
+ break;
+
+#ifdef ALTQ
+ if ((error = pf_begin_altq(&t[0])) != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: ALTQ\n"));
+ break;
+ }
+ pf_commit_altq(t[0]);
+#endif
+
+ pf_clear_states();
+
+ pf_clear_srcnodes(NULL);
+
+ /* status does not use malloced mem so no need to cleanup */
+ /* fingerprints and interfaces have their own cleanup code */
+
+ /* Free counters last as we updated them during shutdown. */
+ counter_u64_free(V_pf_default_rule.states_cur);
+ counter_u64_free(V_pf_default_rule.states_tot);
+ counter_u64_free(V_pf_default_rule.src_nodes);
+
+ for (int i = 0; i < PFRES_MAX; i++)
+ counter_u64_free(V_pf_status.counters[i]);
+ for (int i = 0; i < LCNT_MAX; i++)
+ counter_u64_free(V_pf_status.lcounters[i]);
+ for (int i = 0; i < FCNT_MAX; i++)
+ counter_u64_free(V_pf_status.fcounters[i]);
+ for (int i = 0; i < SCNT_MAX; i++)
+ counter_u64_free(V_pf_status.scounters[i]);
+ } while (0);
+
+ return (error);
+}
+
+#ifdef INET
+static int
+pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+ int chk;
+
+ chk = pf_test(PF_IN, ifp, m, inp);
+ if (chk && *m) {
+ m_freem(*m);
+ *m = NULL;
+ }
+
+ if (chk != PF_PASS)
+ return (EACCES);
+ return (0);
+}
+
+static int
+pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+ int chk;
+
+ chk = pf_test(PF_OUT, ifp, m, inp);
+ if (chk && *m) {
+ m_freem(*m);
+ *m = NULL;
+ }
+
+ if (chk != PF_PASS)
+ return (EACCES);
+ return (0);
+}
+#endif
+
+#ifdef INET6
+static int
+pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+ int chk;
+
+ /*
+ * In case of loopback traffic IPv6 uses the real interface in
+ * order to support scoped addresses. In order to support stateful
+ * filtering we have changed this to lo0 as it is the case in IPv4.
+ */
+ CURVNET_SET(ifp->if_vnet);
+ chk = pf_test6(PF_IN, (*m)->m_flags & M_LOOP ?
V_loif : ifp, m, inp); + CURVNET_RESTORE(); + if (chk && *m) { + m_freem(*m); + *m = NULL; + } + if (chk != PF_PASS) + return (EACCES); + return (0); +} + +static int +pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, + struct inpcb *inp) +{ + int chk; + + CURVNET_SET(ifp->if_vnet); + chk = pf_test6(PF_OUT, ifp, m, inp); + CURVNET_RESTORE(); + if (chk && *m) { + m_freem(*m); + *m = NULL; + } + if (chk != PF_PASS) + return (EACCES); + return (0); +} +#endif /* INET6 */ + +static int +hook_pf(void) +{ +#ifdef INET + struct pfil_head *pfh_inet; +#endif +#ifdef INET6 + struct pfil_head *pfh_inet6; +#endif + + if (V_pf_pfil_hooked) + return (0); + +#ifdef INET + pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); + if (pfh_inet == NULL) + return (ESRCH); /* XXX */ + pfil_add_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet); + pfil_add_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet); +#endif +#ifdef INET6 + pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); + if (pfh_inet6 == NULL) { +#ifdef INET + pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, + pfh_inet); + pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, + pfh_inet); +#endif + return (ESRCH); /* XXX */ + } + pfil_add_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet6); + pfil_add_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet6); +#endif + + V_pf_pfil_hooked = 1; + return (0); +} + +static int +dehook_pf(void) +{ +#ifdef INET + struct pfil_head *pfh_inet; +#endif +#ifdef INET6 + struct pfil_head *pfh_inet6; +#endif + + if (V_pf_pfil_hooked == 0) + return (0); + +#ifdef INET + pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); + if (pfh_inet == NULL) + return (ESRCH); /* XXX */ + pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, + pfh_inet); + pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, + pfh_inet); +#endif +#ifdef INET6 + pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); + if (pfh_inet6 == NULL) + return (ESRCH); /* XXX */ + pfil_remove_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK, + pfh_inet6); + pfil_remove_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK, + pfh_inet6); +#endif + + V_pf_pfil_hooked = 0; + return (0); +} + +static void +pf_load_vnet(void) +{ + VNET_ITERATOR_DECL(vnet_iter); + + VNET_LIST_RLOCK(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + V_pf_pfil_hooked = 0; + TAILQ_INIT(&V_pf_tags); + TAILQ_INIT(&V_pf_qids); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK(); + + pfattach_vnet(); + V_pf_vnet_active = 1; +} + +static int +pf_load(void) +{ + int error; + + rw_init(&pf_rules_lock, "pf rulesets"); + sx_init(&pf_ioctl_lock, "pf ioctl"); + + pf_mtag_initialize(); + + pf_dev = make_dev(&pf_cdevsw, 0, 0, 0, 0600, PF_NAME); + if (pf_dev == NULL) + return (ENOMEM); + + pf_end_threads = 0; + error = kproc_create(pf_purge_thread, NULL, NULL, 0, 0, "pf purge"); + if (error != 0) + return (error); + + pfi_initialize(); + + return (0); +} + +static void +pf_unload_vnet(void) +{ + int error; + + V_pf_vnet_active = 0; + V_pf_status.running = 0; + swi_remove(V_pf_swi_cookie); + error = dehook_pf(); + if (error) { + /* + * Should not happen! + * XXX Due to error code ESRCH, kldunload will show + * a message like 'No such process'. 
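+ * dehook_pf() can only fail when a pfil head cannot be looked up, so this
+ * path should be unreachable in practice.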
+ */
+ printf("%s: pfil unregistration failed\n", __FUNCTION__);
+ return;
+ }
+
+ pf_unload_vnet_purge();
+
+ PF_RULES_WLOCK();
+ shutdown_pf();
+ PF_RULES_WUNLOCK();
+
+ pf_normalize_cleanup();
+ PF_RULES_WLOCK();
+ pfi_cleanup_vnet();
+ PF_RULES_WUNLOCK();
+ pfr_cleanup();
+ pf_osfp_flush();
+ pf_cleanup();
+ if (IS_DEFAULT_VNET(curvnet))
+ pf_mtag_cleanup();
+}
+
+static int
+pf_unload(void)
+{
+ int error = 0;
+
+ pf_end_threads = 1;
+ while (pf_end_threads < 2) {
+ wakeup_one(pf_purge_thread);
+ rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftmo", 0);
+ }
+
+ if (pf_dev != NULL)
+ destroy_dev(pf_dev);
+
+ pfi_cleanup();
+
+ rw_destroy(&pf_rules_lock);
+ sx_destroy(&pf_ioctl_lock);
+
+ return (error);
+}
+
+static void
+vnet_pf_init(void *unused __unused)
+{
+
+ pf_load_vnet();
+}
+VNET_SYSINIT(vnet_pf_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_THIRD, 
+ vnet_pf_init, NULL);
+
+static void
+vnet_pf_uninit(const void *unused __unused)
+{
+
+ pf_unload_vnet();
+}
+VNET_SYSUNINIT(vnet_pf_uninit, SI_SUB_PROTO_FIREWALL, SI_ORDER_THIRD,
+ vnet_pf_uninit, NULL);
+
+
+static int
+pf_modevent(module_t mod, int type, void *data)
+{
+ int error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = pf_load();
+ break;
+ case MOD_QUIESCE:
+ /*
+ * Module should not be unloaded due to race conditions.
+ */
+ error = EBUSY;
+ break;
+ case MOD_UNLOAD:
+ error = pf_unload();
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+static moduledata_t pf_mod = {
+ "pf",
+ pf_modevent,
+ 0
+};
+
+DECLARE_MODULE(pf, pf_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_SECOND);
+MODULE_VERSION(pf, PF_MODVER);
diff --git a/freebsd/sys/netpfil/pf/pf_lb.c b/freebsd/sys/netpfil/pf/pf_lb.c
new file mode 100644
index 00000000..033c3879
--- /dev/null
+++ b/freebsd/sys/netpfil/pf/pf_lb.c
@@ -0,0 +1,681 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2008 Henning Brauer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ * + * $OpenBSD: pf_lb.c,v 1.2 2009/02/12 02:13:15 sthen Exp $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/local/opt_pf.h> +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + +#include <rtems/bsd/sys/param.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/mbuf.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/vnet.h> +#include <net/pfvar.h> +#include <net/if_pflog.h> + +#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x + +static void pf_hash(struct pf_addr *, struct pf_addr *, + struct pf_poolhashkey *, sa_family_t); +static struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *, + int, int, struct pfi_kif *, + struct pf_addr *, u_int16_t, struct pf_addr *, + uint16_t, int, struct pf_anchor_stackframe *); +static int pf_get_sport(sa_family_t, uint8_t, struct pf_rule *, + struct pf_addr *, uint16_t, struct pf_addr *, uint16_t, struct pf_addr *, + uint16_t *, uint16_t, uint16_t, struct pf_src_node **); + +#define mix(a,b,c) \ + do { \ + a -= b; a -= c; a ^= (c >> 13); \ + b -= c; b -= a; b ^= (a << 8); \ + c -= a; c -= b; c ^= (b >> 13); \ + a -= b; a -= c; a ^= (c >> 12); \ + b -= c; b -= a; b ^= (a << 16); \ + c -= a; c -= b; c ^= (b >> 5); \ + a -= b; a -= c; a ^= (c >> 3); \ + b -= c; b -= a; b ^= (a << 10); \ + c -= a; c -= b; c ^= (b >> 15); \ + } while (0) + +/* + * hash function based on bridge_hash in if_bridge.c + */ +static void +pf_hash(struct pf_addr *inaddr, struct pf_addr *hash, + struct pf_poolhashkey *key, sa_family_t af) +{ + u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0]; + + switch (af) { +#ifdef INET + case AF_INET: + a += inaddr->addr32[0]; + b += key->key32[1]; + mix(a, b, c); + hash->addr32[0] = c + key->key32[2]; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + a += inaddr->addr32[0]; + b += inaddr->addr32[2]; + mix(a, b, c); + hash->addr32[0] = c; + a += inaddr->addr32[1]; + b += inaddr->addr32[3]; + c += key->key32[1]; + mix(a, b, c); + hash->addr32[1] = c; + a += inaddr->addr32[2]; + b += inaddr->addr32[1]; + c += key->key32[2]; + mix(a, b, c); + hash->addr32[2] = c; + a += inaddr->addr32[3]; + b += inaddr->addr32[0]; + c += key->key32[3]; + mix(a, b, c); + hash->addr32[3] = c; + break; +#endif /* INET6 */ + } +} + +static struct pf_rule * +pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, + int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport, + struct pf_addr *daddr, uint16_t dport, int rs_num, + struct pf_anchor_stackframe *anchor_stack) +{ + struct pf_rule *r, *rm = NULL; + struct pf_ruleset *ruleset = NULL; + int tag = -1; + int rtableid = -1; + int asd = 0; + + r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr); + while (r && rm == NULL) { + struct pf_rule_addr *src = NULL, *dst = NULL; + struct pf_addr_wrap *xdst = NULL; + + if (r->action == PF_BINAT && direction == PF_IN) { + src = &r->dst; + if (r->rpool.cur != NULL) + xdst = &r->rpool.cur->addr; + } else { + src = &r->src; + dst = &r->dst; + } + + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != pd->af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&src->addr, saddr, pd->af, + src->neg, kif, M_GETFIB(m))) + r = r->skip[src == 
&r->src ? PF_SKIP_SRC_ADDR : + PF_SKIP_DST_ADDR].ptr; + else if (src->port_op && !pf_match_port(src->port_op, + src->port[0], src->port[1], sport)) + r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT : + PF_SKIP_DST_PORT].ptr; + else if (dst != NULL && + PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL, + M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af, + 0, NULL, M_GETFIB(m))) + r = TAILQ_NEXT(r, entries); + else if (dst != NULL && dst->port_op && + !pf_match_port(dst->port_op, dst->port[0], + dst->port[1], dport)) + r = r->skip[PF_SKIP_DST_PORT].ptr; + else if (r->match_tag && !pf_match_tag(m, r, &tag, + pd->pf_mtag ? pd->pf_mtag->tag : 0)) + r = TAILQ_NEXT(r, entries); + else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto != + IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m, + off, pd->hdr.tcp), r->os_fingerprint))) + r = TAILQ_NEXT(r, entries); + else { + if (r->tag) + tag = r->tag; + if (r->rtableid >= 0) + rtableid = r->rtableid; + if (r->anchor == NULL) { + rm = r; + } else + pf_step_into_anchor(anchor_stack, &asd, + &ruleset, rs_num, &r, NULL, NULL); + } + if (r == NULL) + pf_step_out_of_anchor(anchor_stack, &asd, &ruleset, + rs_num, &r, NULL, NULL); + } + + if (tag > 0 && pf_tag_packet(m, pd, tag)) + return (NULL); + if (rtableid >= 0) + M_SETFIB(m, rtableid); + + if (rm != NULL && (rm->action == PF_NONAT || + rm->action == PF_NORDR || rm->action == PF_NOBINAT)) + return (NULL); + return (rm); +} + +static int +pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, + struct pf_addr *saddr, uint16_t sport, struct pf_addr *daddr, + uint16_t dport, struct pf_addr *naddr, uint16_t *nport, uint16_t low, + uint16_t high, struct pf_src_node **sn) +{ + struct pf_state_key_cmp key; + struct pf_addr init_addr; + + bzero(&init_addr, sizeof(init_addr)); + if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn)) + return (1); + + if (proto == IPPROTO_ICMP) { + low = 1; + high = 65535; + } + + bzero(&key, sizeof(key)); + key.af = af; + key.proto = proto; + key.port[0] = dport; + PF_ACPY(&key.addr[0], daddr, key.af); + + do { + PF_ACPY(&key.addr[1], naddr, key.af); + + /* + * port search; start random, step; + * similar 2 portloop in in_pcbbind + */ + if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP || + proto == IPPROTO_ICMP) || (low == 0 && high == 0)) { + /* + * XXX bug: icmp states don't use the id on both sides. 
+ * (traceroute -I through nat) + */ + key.port[1] = sport; + if (pf_find_state_all(&key, PF_IN, NULL) == NULL) { + *nport = sport; + return (0); + } + } else if (low == high) { + key.port[1] = htons(low); + if (pf_find_state_all(&key, PF_IN, NULL) == NULL) { + *nport = htons(low); + return (0); + } + } else { + uint16_t tmp, cut; + + if (low > high) { + tmp = low; + low = high; + high = tmp; + } + /* low < high */ + cut = arc4random() % (1 + high - low) + low; + /* low <= cut <= high */ + for (tmp = cut; tmp <= high; ++(tmp)) { + key.port[1] = htons(tmp); + if (pf_find_state_all(&key, PF_IN, NULL) == + NULL) { + *nport = htons(tmp); + return (0); + } + } + for (tmp = cut - 1; tmp >= low; --(tmp)) { + key.port[1] = htons(tmp); + if (pf_find_state_all(&key, PF_IN, NULL) == + NULL) { + *nport = htons(tmp); + return (0); + } + } + } + + switch (r->rpool.opts & PF_POOL_TYPEMASK) { + case PF_POOL_RANDOM: + case PF_POOL_ROUNDROBIN: + if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn)) + return (1); + break; + case PF_POOL_NONE: + case PF_POOL_SRCHASH: + case PF_POOL_BITMASK: + default: + return (1); + } + } while (! PF_AEQ(&init_addr, naddr, af) ); + return (1); /* none available */ +} + +int +pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr, + struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn) +{ + struct pf_pool *rpool = &r->rpool; + struct pf_addr *raddr = NULL, *rmask = NULL; + + /* Try to find a src_node if none was given and this + is a sticky-address rule. */ + if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR && + (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) + *sn = pf_find_src_node(saddr, r, af, 0); + + /* If a src_node was found or explicitly given and it has a non-zero + route address, use this address. A zeroed address is found if the + src node was created just a moment ago in pf_create_state and it + needs to be filled in with routing decision calculated here. */ + if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) { + PF_ACPY(naddr, &(*sn)->raddr, af); + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf_map_addr: src tracking maps "); + pf_print_host(saddr, 0, af); + printf(" to "); + pf_print_host(naddr, 0, af); + printf("\n"); + } + return (0); + } + + /* Find the route using chosen algorithm. Store the found route + in src_node if it was given or found. 
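+	   As an illustration of the PF_POOL_BITMASK case below:
+	   PF_POOLMASK() keeps the network bits of the pool address and
+	   the host bits of the source, so a hypothetical pool of
+	   10.0.0.0/24 maps source a.b.c.d to 10.0.0.d.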
*/ + if (rpool->cur->addr.type == PF_ADDR_NOROUTE) + return (1); + if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + switch (af) { +#ifdef INET + case AF_INET: + if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 && + (rpool->opts & PF_POOL_TYPEMASK) != + PF_POOL_ROUNDROBIN) + return (1); + raddr = &rpool->cur->addr.p.dyn->pfid_addr4; + rmask = &rpool->cur->addr.p.dyn->pfid_mask4; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 && + (rpool->opts & PF_POOL_TYPEMASK) != + PF_POOL_ROUNDROBIN) + return (1); + raddr = &rpool->cur->addr.p.dyn->pfid_addr6; + rmask = &rpool->cur->addr.p.dyn->pfid_mask6; + break; +#endif /* INET6 */ + } + } else if (rpool->cur->addr.type == PF_ADDR_TABLE) { + if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN) + return (1); /* unsupported */ + } else { + raddr = &rpool->cur->addr.v.a.addr; + rmask = &rpool->cur->addr.v.a.mask; + } + + switch (rpool->opts & PF_POOL_TYPEMASK) { + case PF_POOL_NONE: + PF_ACPY(naddr, raddr, af); + break; + case PF_POOL_BITMASK: + PF_POOLMASK(naddr, raddr, rmask, saddr, af); + break; + case PF_POOL_RANDOM: + if (init_addr != NULL && PF_AZERO(init_addr, af)) { + switch (af) { +#ifdef INET + case AF_INET: + rpool->counter.addr32[0] = htonl(arc4random()); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (rmask->addr32[3] != 0xffffffff) + rpool->counter.addr32[3] = + htonl(arc4random()); + else + break; + if (rmask->addr32[2] != 0xffffffff) + rpool->counter.addr32[2] = + htonl(arc4random()); + else + break; + if (rmask->addr32[1] != 0xffffffff) + rpool->counter.addr32[1] = + htonl(arc4random()); + else + break; + if (rmask->addr32[0] != 0xffffffff) + rpool->counter.addr32[0] = + htonl(arc4random()); + break; +#endif /* INET6 */ + } + PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); + PF_ACPY(init_addr, naddr, af); + + } else { + PF_AINC(&rpool->counter, af); + PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); + } + break; + case PF_POOL_SRCHASH: + { + unsigned char hash[16]; + + pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af); + PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af); + break; + } + case PF_POOL_ROUNDROBIN: + { + struct pf_pooladdr *acur = rpool->cur; + + /* + * XXXGL: in the round-robin case we need to store + * the round-robin machine state in the rule, thus + * forwarding thread needs to modify rule. + * + * This is done w/o locking, because performance is assumed + * more important than round-robin precision. + * + * In the simpliest case we just update the "rpool->cur" + * pointer. However, if pool contains tables or dynamic + * addresses, then "tblidx" is also used to store machine + * state. Since "tblidx" is int, concurrent access to it can't + * lead to inconsistence, only to lost of precision. + * + * Things get worse, if table contains not hosts, but + * prefixes. In this case counter also stores machine state, + * and for IPv6 address, counter can't be updated atomically. + * Probably, using round-robin on a table containing IPv6 + * prefixes (or even IPv4) would cause a panic. 
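+	 * (Concretely, the shared round-robin state is rpool->cur,
+	 * rpool->tblidx and rpool->counter, all updated below
+	 * without a lock.)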
+ */ + + if (rpool->cur->addr.type == PF_ADDR_TABLE) { + if (!pfr_pool_get(rpool->cur->addr.p.tbl, + &rpool->tblidx, &rpool->counter, af)) + goto get_addr; + } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, + &rpool->tblidx, &rpool->counter, af)) + goto get_addr; + } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af)) + goto get_addr; + + try_next: + if (TAILQ_NEXT(rpool->cur, entries) == NULL) + rpool->cur = TAILQ_FIRST(&rpool->list); + else + rpool->cur = TAILQ_NEXT(rpool->cur, entries); + if (rpool->cur->addr.type == PF_ADDR_TABLE) { + rpool->tblidx = -1; + if (pfr_pool_get(rpool->cur->addr.p.tbl, + &rpool->tblidx, &rpool->counter, af)) { + /* table contains no address of type 'af' */ + if (rpool->cur != acur) + goto try_next; + return (1); + } + } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + rpool->tblidx = -1; + if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, + &rpool->tblidx, &rpool->counter, af)) { + /* table contains no address of type 'af' */ + if (rpool->cur != acur) + goto try_next; + return (1); + } + } else { + raddr = &rpool->cur->addr.v.a.addr; + rmask = &rpool->cur->addr.v.a.mask; + PF_ACPY(&rpool->counter, raddr, af); + } + + get_addr: + PF_ACPY(naddr, &rpool->counter, af); + if (init_addr != NULL && PF_AZERO(init_addr, af)) + PF_ACPY(init_addr, naddr, af); + PF_AINC(&rpool->counter, af); + break; + } + } + if (*sn != NULL) + PF_ACPY(&(*sn)->raddr, naddr, af); + + if (V_pf_status.debug >= PF_DEBUG_MISC && + (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) { + printf("pf_map_addr: selected address "); + pf_print_host(naddr, 0, af); + printf("\n"); + } + + return (0); +} + +struct pf_rule * +pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, + struct pfi_kif *kif, struct pf_src_node **sn, + struct pf_state_key **skp, struct pf_state_key **nkp, + struct pf_addr *saddr, struct pf_addr *daddr, + uint16_t sport, uint16_t dport, struct pf_anchor_stackframe *anchor_stack) +{ + struct pf_rule *r = NULL; + struct pf_addr *naddr; + uint16_t *nport; + + PF_RULES_RASSERT(); + KASSERT(*skp == NULL, ("*skp not NULL")); + KASSERT(*nkp == NULL, ("*nkp not NULL")); + + if (direction == PF_OUT) { + r = pf_match_translation(pd, m, off, direction, kif, saddr, + sport, daddr, dport, PF_RULESET_BINAT, anchor_stack); + if (r == NULL) + r = pf_match_translation(pd, m, off, direction, kif, + saddr, sport, daddr, dport, PF_RULESET_NAT, + anchor_stack); + } else { + r = pf_match_translation(pd, m, off, direction, kif, saddr, + sport, daddr, dport, PF_RULESET_RDR, anchor_stack); + if (r == NULL) + r = pf_match_translation(pd, m, off, direction, kif, + saddr, sport, daddr, dport, PF_RULESET_BINAT, + anchor_stack); + } + + if (r == NULL) + return (NULL); + + switch (r->action) { + case PF_NONAT: + case PF_NOBINAT: + case PF_NORDR: + return (NULL); + } + + *skp = pf_state_key_setup(pd, saddr, daddr, sport, dport); + if (*skp == NULL) + return (NULL); + *nkp = pf_state_key_clone(*skp); + if (*nkp == NULL) { + uma_zfree(V_pf_state_key_z, skp); + *skp = NULL; + return (NULL); + } + + /* XXX We only modify one side for now. 
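+	 * Every translation case below writes address/port slot [1]
+	 * of the new state key.  For the PF_RDR port range math, a
+	 * hypothetical "rdr ... port 2000:2009 -> ... port 5000:5004"
+	 * maps dport 2003 to 5000 + (3 % 5) = 5003.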
*/ + naddr = &(*nkp)->addr[1]; + nport = &(*nkp)->port[1]; + + switch (r->action) { + case PF_NAT: + if (pf_get_sport(pd->af, pd->proto, r, saddr, sport, daddr, + dport, naddr, nport, r->rpool.proxy_port[0], + r->rpool.proxy_port[1], sn)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: NAT proxy port allocation (%u-%u) failed\n", + r->rpool.proxy_port[0], r->rpool.proxy_port[1])); + goto notrans; + } + break; + case PF_BINAT: + switch (direction) { + case PF_OUT: + if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){ + switch (pd->af) { +#ifdef INET + case AF_INET: + if (r->rpool.cur->addr.p.dyn-> + pfid_acnt4 < 1) + goto notrans; + PF_POOLMASK(naddr, + &r->rpool.cur->addr.p.dyn-> + pfid_addr4, + &r->rpool.cur->addr.p.dyn-> + pfid_mask4, saddr, AF_INET); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (r->rpool.cur->addr.p.dyn-> + pfid_acnt6 < 1) + goto notrans; + PF_POOLMASK(naddr, + &r->rpool.cur->addr.p.dyn-> + pfid_addr6, + &r->rpool.cur->addr.p.dyn-> + pfid_mask6, saddr, AF_INET6); + break; +#endif /* INET6 */ + } + } else + PF_POOLMASK(naddr, + &r->rpool.cur->addr.v.a.addr, + &r->rpool.cur->addr.v.a.mask, saddr, + pd->af); + break; + case PF_IN: + if (r->src.addr.type == PF_ADDR_DYNIFTL) { + switch (pd->af) { +#ifdef INET + case AF_INET: + if (r->src.addr.p.dyn-> pfid_acnt4 < 1) + goto notrans; + PF_POOLMASK(naddr, + &r->src.addr.p.dyn->pfid_addr4, + &r->src.addr.p.dyn->pfid_mask4, + daddr, AF_INET); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (r->src.addr.p.dyn->pfid_acnt6 < 1) + goto notrans; + PF_POOLMASK(naddr, + &r->src.addr.p.dyn->pfid_addr6, + &r->src.addr.p.dyn->pfid_mask6, + daddr, AF_INET6); + break; +#endif /* INET6 */ + } + } else + PF_POOLMASK(naddr, &r->src.addr.v.a.addr, + &r->src.addr.v.a.mask, daddr, pd->af); + break; + } + break; + case PF_RDR: { + if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn)) + goto notrans; + if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK) + PF_POOLMASK(naddr, naddr, &r->rpool.cur->addr.v.a.mask, + daddr, pd->af); + + if (r->rpool.proxy_port[1]) { + uint32_t tmp_nport; + + tmp_nport = ((ntohs(dport) - ntohs(r->dst.port[0])) % + (r->rpool.proxy_port[1] - r->rpool.proxy_port[0] + + 1)) + r->rpool.proxy_port[0]; + + /* Wrap around if necessary. */ + if (tmp_nport > 65535) + tmp_nport -= 65535; + *nport = htons((uint16_t)tmp_nport); + } else if (r->rpool.proxy_port[0]) + *nport = htons(r->rpool.proxy_port[0]); + break; + } + default: + panic("%s: unknown action %u", __func__, r->action); + } + + /* Return success only if translation really happened. */ + if (bcmp(*skp, *nkp, sizeof(struct pf_state_key_cmp))) + return (r); + +notrans: + uma_zfree(V_pf_state_key_z, *nkp); + uma_zfree(V_pf_state_key_z, *skp); + *skp = *nkp = NULL; + *sn = NULL; + + return (NULL); +} diff --git a/freebsd/sys/netpfil/pf/pf_mtag.h b/freebsd/sys/netpfil/pf/pf_mtag.h new file mode 100644 index 00000000..fd8554ae --- /dev/null +++ b/freebsd/sys/netpfil/pf/pf_mtag.h @@ -0,0 +1,64 @@ +/* $FreeBSD$ */ +/* + * Copyright (c) 2001 Daniel Hartmeier + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _NET_PF_MTAG_H_ +#define _NET_PF_MTAG_H_ + +#ifdef _KERNEL + +#define PF_TAG_GENERATED 0x01 +#define PF_TAG_FRAGCACHE 0x02 +#define PF_TAG_TRANSLATE_LOCALHOST 0x04 +#define PF_PACKET_LOOPED 0x08 +#define PF_FASTFWD_OURS_PRESENT 0x10 +#define PF_REASSEMBLED 0x20 + +struct pf_mtag { + void *hdr; /* saved hdr pos in mbuf, for ECN */ + u_int32_t qid; /* queue id */ + u_int32_t qid_hash; /* queue hashid used by WFQ like algos */ + u_int16_t tag; /* tag id */ + u_int8_t flags; + u_int8_t routed; +}; + +static __inline struct pf_mtag * +pf_find_mtag(struct mbuf *m) +{ + struct m_tag *mtag; + + if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) == NULL) + return (NULL); + + return ((struct pf_mtag *)(mtag + 1)); +} +#endif /* _KERNEL */ +#endif /* _NET_PF_MTAG_H_ */ diff --git a/freebsd/sys/netpfil/pf/pf_norm.c b/freebsd/sys/netpfil/pf/pf_norm.c new file mode 100644 index 00000000..86d2c8eb --- /dev/null +++ b/freebsd/sys/netpfil/pf/pf_norm.c @@ -0,0 +1,1843 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright 2001 Niels Provos <provos@citi.umich.edu> + * Copyright 2011 Alexander Bluhm <bluhm@openbsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> +#include <rtems/bsd/local/opt_pf.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/refcount.h> +#include <sys/rwlock.h> +#include <sys/socket.h> + +#include <net/if.h> +#include <net/vnet.h> +#include <net/pfvar.h> +#include <net/if_pflog.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet6/ip6_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> + +#ifdef INET6 +#include <netinet/ip6.h> +#endif /* INET6 */ + +struct pf_frent { + TAILQ_ENTRY(pf_frent) fr_next; + struct mbuf *fe_m; + uint16_t fe_hdrlen; /* ipv4 header length with ip options + ipv6, extension, fragment header */ + uint16_t fe_extoff; /* last extension header offset or 0 */ + uint16_t fe_len; /* fragment length */ + uint16_t fe_off; /* fragment offset */ + uint16_t fe_mff; /* more fragment flag */ +}; + +struct pf_fragment_cmp { + struct pf_addr frc_src; + struct pf_addr frc_dst; + uint32_t frc_id; + sa_family_t frc_af; + uint8_t frc_proto; +}; + +struct pf_fragment { + struct pf_fragment_cmp fr_key; +#define fr_src fr_key.frc_src +#define fr_dst fr_key.frc_dst +#define fr_id fr_key.frc_id +#define fr_af fr_key.frc_af +#define fr_proto fr_key.frc_proto + + RB_ENTRY(pf_fragment) fr_entry; + TAILQ_ENTRY(pf_fragment) frag_next; + uint32_t fr_timeout; + uint16_t fr_maxlen; /* maximum length of single fragment */ + TAILQ_HEAD(pf_fragq, pf_frent) fr_queue; +}; + +struct pf_fragment_tag { + uint16_t ft_hdrlen; /* header length of reassembled pkt */ + uint16_t ft_extoff; /* last extension header offset or 0 */ + uint16_t ft_maxlen; /* maximum fragment payload length */ + uint32_t ft_id; /* fragment id */ +}; + +static struct mtx pf_frag_mtx; +MTX_SYSINIT(pf_frag_mtx, &pf_frag_mtx, "pf fragments", MTX_DEF); +#define PF_FRAG_LOCK() mtx_lock(&pf_frag_mtx) +#define PF_FRAG_UNLOCK() mtx_unlock(&pf_frag_mtx) +#define PF_FRAG_ASSERT() mtx_assert(&pf_frag_mtx, MA_OWNED) + +VNET_DEFINE(uma_zone_t, pf_state_scrub_z); /* XXX: shared with pfsync */ + +static VNET_DEFINE(uma_zone_t, pf_frent_z); +#define V_pf_frent_z VNET(pf_frent_z) +static VNET_DEFINE(uma_zone_t, pf_frag_z); +#define V_pf_frag_z VNET(pf_frag_z) + +TAILQ_HEAD(pf_fragqueue, pf_fragment); +TAILQ_HEAD(pf_cachequeue, pf_fragment); +static VNET_DEFINE(struct pf_fragqueue, pf_fragqueue); +#define V_pf_fragqueue VNET(pf_fragqueue) +RB_HEAD(pf_frag_tree, pf_fragment); +static VNET_DEFINE(struct pf_frag_tree, pf_frag_tree); +#define V_pf_frag_tree VNET(pf_frag_tree) +static int pf_frag_compare(struct pf_fragment *, + struct pf_fragment *); +static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); +static RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); + +static void pf_flush_fragments(void); +static void pf_free_fragment(struct pf_fragment *); +static void pf_remove_fragment(struct pf_fragment *); +static int pf_normalize_tcpopt(struct pf_rule *, struct mbuf *, + struct tcphdr *, int, sa_family_t); +static struct pf_frent *pf_create_fragment(u_short *); +static struct pf_fragment *pf_find_fragment(struct pf_fragment_cmp *key, + struct pf_frag_tree *tree); +static struct pf_fragment *pf_fillup_fragment(struct pf_fragment_cmp *, + struct pf_frent *, 
u_short *); +static int pf_isfull_fragment(struct pf_fragment *); +static struct mbuf *pf_join_fragment(struct pf_fragment *); +#ifdef INET +static void pf_scrub_ip(struct mbuf **, uint32_t, uint8_t, uint8_t); +static int pf_reassemble(struct mbuf **, struct ip *, int, u_short *); +#endif /* INET */ +#ifdef INET6 +static int pf_reassemble6(struct mbuf **, struct ip6_hdr *, + struct ip6_frag *, uint16_t, uint16_t, u_short *); +static void pf_scrub_ip6(struct mbuf **, uint8_t); +#endif /* INET6 */ + +#define DPFPRINTF(x) do { \ + if (V_pf_status.debug >= PF_DEBUG_MISC) { \ + printf("%s: ", __func__); \ + printf x ; \ + } \ +} while(0) + +#ifdef INET +static void +pf_ip2key(struct ip *ip, int dir, struct pf_fragment_cmp *key) +{ + + key->frc_src.v4 = ip->ip_src; + key->frc_dst.v4 = ip->ip_dst; + key->frc_af = AF_INET; + key->frc_proto = ip->ip_p; + key->frc_id = ip->ip_id; +} +#endif /* INET */ + +void +pf_normalize_init(void) +{ + + V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + V_pf_state_scrub_z = uma_zcreate("pf state scrubs", + sizeof(struct pf_state_scrub), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + + V_pf_limits[PF_LIMIT_FRAGS].zone = V_pf_frent_z; + V_pf_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT; + uma_zone_set_max(V_pf_frent_z, PFFRAG_FRENT_HIWAT); + uma_zone_set_warning(V_pf_frent_z, "PF frag entries limit reached"); + + TAILQ_INIT(&V_pf_fragqueue); +} + +void +pf_normalize_cleanup(void) +{ + + uma_zdestroy(V_pf_state_scrub_z); + uma_zdestroy(V_pf_frent_z); + uma_zdestroy(V_pf_frag_z); +} + +static int +pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) +{ + int diff; + + if ((diff = a->fr_id - b->fr_id) != 0) + return (diff); + if ((diff = a->fr_proto - b->fr_proto) != 0) + return (diff); + if ((diff = a->fr_af - b->fr_af) != 0) + return (diff); + if ((diff = pf_addr_cmp(&a->fr_src, &b->fr_src, a->fr_af)) != 0) + return (diff); + if ((diff = pf_addr_cmp(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0) + return (diff); + return (0); +} + +void +pf_purge_expired_fragments(void) +{ + struct pf_fragment *frag; + u_int32_t expire = time_uptime - + V_pf_default_rule.timeout[PFTM_FRAG]; + + PF_FRAG_LOCK(); + while ((frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue)) != NULL) { + if (frag->fr_timeout > expire) + break; + + DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); + pf_free_fragment(frag); + } + + PF_FRAG_UNLOCK(); +} + +/* + * Try to flush old fragments to make space for new ones + */ +static void +pf_flush_fragments(void) +{ + struct pf_fragment *frag; + int goal; + + PF_FRAG_ASSERT(); + + goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10; + DPFPRINTF(("trying to free %d frag entriess\n", goal)); + while (goal < uma_zone_get_cur(V_pf_frent_z)) { + frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue); + if (frag) + pf_free_fragment(frag); + else + break; + } +} + +/* Frees the fragments and all associated entries */ +static void +pf_free_fragment(struct pf_fragment *frag) +{ + struct pf_frent *frent; + + PF_FRAG_ASSERT(); + + /* Free all fragments */ + for (frent = TAILQ_FIRST(&frag->fr_queue); frent; + frent = TAILQ_FIRST(&frag->fr_queue)) { + TAILQ_REMOVE(&frag->fr_queue, frent, fr_next); + + m_freem(frent->fe_m); + uma_zfree(V_pf_frent_z, frent); + } + + pf_remove_fragment(frag); +} + +static struct pf_fragment * +pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree) 
+{ + struct pf_fragment *frag; + + PF_FRAG_ASSERT(); + + frag = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key); + if (frag != NULL) { + /* XXX Are we sure we want to update the timeout? */ + frag->fr_timeout = time_uptime; + TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); + TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); + } + + return (frag); +} + +/* Removes a fragment from the fragment queue and frees the fragment */ +static void +pf_remove_fragment(struct pf_fragment *frag) +{ + + PF_FRAG_ASSERT(); + + RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag); + TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); + uma_zfree(V_pf_frag_z, frag); +} + +static struct pf_frent * +pf_create_fragment(u_short *reason) +{ + struct pf_frent *frent; + + PF_FRAG_ASSERT(); + + frent = uma_zalloc(V_pf_frent_z, M_NOWAIT); + if (frent == NULL) { + pf_flush_fragments(); + frent = uma_zalloc(V_pf_frent_z, M_NOWAIT); + if (frent == NULL) { + REASON_SET(reason, PFRES_MEMORY); + return (NULL); + } + } + + return (frent); +} + +static struct pf_fragment * +pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, + u_short *reason) +{ + struct pf_frent *after, *next, *prev; + struct pf_fragment *frag; + uint16_t total; + + PF_FRAG_ASSERT(); + + /* No empty fragments. */ + if (frent->fe_len == 0) { + DPFPRINTF(("bad fragment: len 0")); + goto bad_fragment; + } + + /* All fragments are 8 byte aligned. */ + if (frent->fe_mff && (frent->fe_len & 0x7)) { + DPFPRINTF(("bad fragment: mff and len %d", frent->fe_len)); + goto bad_fragment; + } + + /* Respect maximum length, IP_MAXPACKET == IPV6_MAXPACKET. */ + if (frent->fe_off + frent->fe_len > IP_MAXPACKET) { + DPFPRINTF(("bad fragment: max packet %d", + frent->fe_off + frent->fe_len)); + goto bad_fragment; + } + + DPFPRINTF((key->frc_af == AF_INET ? + "reass frag %d @ %d-%d" : "reass frag %#08x @ %d-%d", + key->frc_id, frent->fe_off, frent->fe_off + frent->fe_len)); + + /* Fully buffer all of the fragments in this fragment queue. */ + frag = pf_find_fragment(key, &V_pf_frag_tree); + + /* Create a new reassembly queue for this packet. */ + if (frag == NULL) { + frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); + if (frag == NULL) { + pf_flush_fragments(); + frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); + if (frag == NULL) { + REASON_SET(reason, PFRES_MEMORY); + goto drop_fragment; + } + } + + *(struct pf_fragment_cmp *)frag = *key; + frag->fr_timeout = time_uptime; + frag->fr_maxlen = frent->fe_len; + TAILQ_INIT(&frag->fr_queue); + + RB_INSERT(pf_frag_tree, &V_pf_frag_tree, frag); + TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); + + /* We do not have a previous fragment. */ + TAILQ_INSERT_HEAD(&frag->fr_queue, frent, fr_next); + + return (frag); + } + + KASSERT(!TAILQ_EMPTY(&frag->fr_queue), ("!TAILQ_EMPTY()->fr_queue")); + + /* Remember maximum fragment len for refragmentation. */ + if (frent->fe_len > frag->fr_maxlen) + frag->fr_maxlen = frent->fe_len; + + /* Maximum data we have seen already. */ + total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; + + /* Non terminal fragments must have more fragments flag. */ + if (frent->fe_off + frent->fe_len < total && !frent->fe_mff) + goto bad_fragment; + + /* Check if we saw the last fragment already. 
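+	 * If so, no new fragment may extend past the recorded total
+	 * length, and one ending exactly there must not carry the
+	 * more-fragments flag.  If not, a terminal fragment must not
+	 * end exactly at the current maximum, since the fragment
+	 * already queued there promises more data.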
*/ + if (!TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_mff) { + if (frent->fe_off + frent->fe_len > total || + (frent->fe_off + frent->fe_len == total && frent->fe_mff)) + goto bad_fragment; + } else { + if (frent->fe_off + frent->fe_len == total && !frent->fe_mff) + goto bad_fragment; + } + + /* Find a fragment after the current one. */ + prev = NULL; + TAILQ_FOREACH(after, &frag->fr_queue, fr_next) { + if (after->fe_off > frent->fe_off) + break; + prev = after; + } + + KASSERT(prev != NULL || after != NULL, + ("prev != NULL || after != NULL")); + + if (prev != NULL && prev->fe_off + prev->fe_len > frent->fe_off) { + uint16_t precut; + + precut = prev->fe_off + prev->fe_len - frent->fe_off; + if (precut >= frent->fe_len) + goto bad_fragment; + DPFPRINTF(("overlap -%d", precut)); + m_adj(frent->fe_m, precut); + frent->fe_off += precut; + frent->fe_len -= precut; + } + + for (; after != NULL && frent->fe_off + frent->fe_len > after->fe_off; + after = next) { + uint16_t aftercut; + + aftercut = frent->fe_off + frent->fe_len - after->fe_off; + DPFPRINTF(("adjust overlap %d", aftercut)); + if (aftercut < after->fe_len) { + m_adj(after->fe_m, aftercut); + after->fe_off += aftercut; + after->fe_len -= aftercut; + break; + } + + /* This fragment is completely overlapped, lose it. */ + next = TAILQ_NEXT(after, fr_next); + m_freem(after->fe_m); + TAILQ_REMOVE(&frag->fr_queue, after, fr_next); + uma_zfree(V_pf_frent_z, after); + } + + if (prev == NULL) + TAILQ_INSERT_HEAD(&frag->fr_queue, frent, fr_next); + else + TAILQ_INSERT_AFTER(&frag->fr_queue, prev, frent, fr_next); + + return (frag); + +bad_fragment: + REASON_SET(reason, PFRES_FRAG); +drop_fragment: + uma_zfree(V_pf_frent_z, frent); + return (NULL); +} + +static int +pf_isfull_fragment(struct pf_fragment *frag) +{ + struct pf_frent *frent, *next; + uint16_t off, total; + + /* Check if we are completely reassembled */ + if (TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_mff) + return (0); + + /* Maximum data we have seen already */ + total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; + + /* Check if we have all the data */ + off = 0; + for (frent = TAILQ_FIRST(&frag->fr_queue); frent; frent = next) { + next = TAILQ_NEXT(frent, fr_next); + + off += frent->fe_len; + if (off < total && (next == NULL || next->fe_off != off)) { + DPFPRINTF(("missing fragment at %d, next %d, total %d", + off, next == NULL ? -1 : next->fe_off, total)); + return (0); + } + } + DPFPRINTF(("%d < %d?", off, total)); + if (off < total) + return (0); + KASSERT(off == total, ("off == total")); + + return (1); +} + +static struct mbuf * +pf_join_fragment(struct pf_fragment *frag) +{ + struct mbuf *m, *m2; + struct pf_frent *frent, *next; + + frent = TAILQ_FIRST(&frag->fr_queue); + next = TAILQ_NEXT(frent, fr_next); + + m = frent->fe_m; + m_adj(m, (frent->fe_hdrlen + frent->fe_len) - m->m_pkthdr.len); + uma_zfree(V_pf_frent_z, frent); + for (frent = next; frent != NULL; frent = next) { + next = TAILQ_NEXT(frent, fr_next); + + m2 = frent->fe_m; + /* Strip off ip header. */ + m_adj(m2, frent->fe_hdrlen); + /* Strip off any trailing bytes. */ + m_adj(m2, frent->fe_len - m2->m_pkthdr.len); + + uma_zfree(V_pf_frent_z, frent); + m_cat(m, m2); + } + + /* Remove from fragment queue. 
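+	 * and free the pf_fragment itself; the reassembled mbuf
+	 * chain in 'm' is all that survives.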
*/ + pf_remove_fragment(frag); + + return (m); +} + +#ifdef INET +static int +pf_reassemble(struct mbuf **m0, struct ip *ip, int dir, u_short *reason) +{ + struct mbuf *m = *m0; + struct pf_frent *frent; + struct pf_fragment *frag; + struct pf_fragment_cmp key; + uint16_t total, hdrlen; + + /* Get an entry for the fragment queue */ + if ((frent = pf_create_fragment(reason)) == NULL) + return (PF_DROP); + + frent->fe_m = m; + frent->fe_hdrlen = ip->ip_hl << 2; + frent->fe_extoff = 0; + frent->fe_len = ntohs(ip->ip_len) - (ip->ip_hl << 2); + frent->fe_off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; + frent->fe_mff = ntohs(ip->ip_off) & IP_MF; + + pf_ip2key(ip, dir, &key); + + if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) + return (PF_DROP); + + /* The mbuf is part of the fragment entry, no direct free or access */ + m = *m0 = NULL; + + if (!pf_isfull_fragment(frag)) + return (PF_PASS); /* drop because *m0 is NULL, no error */ + + /* We have all the data */ + frent = TAILQ_FIRST(&frag->fr_queue); + KASSERT(frent != NULL, ("frent != NULL")); + total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; + hdrlen = frent->fe_hdrlen; + + m = *m0 = pf_join_fragment(frag); + frag = NULL; + + if (m->m_flags & M_PKTHDR) { + int plen = 0; + for (m = *m0; m; m = m->m_next) + plen += m->m_len; + m = *m0; + m->m_pkthdr.len = plen; + } + + ip = mtod(m, struct ip *); + ip->ip_len = htons(hdrlen + total); + ip->ip_off &= ~(IP_MF|IP_OFFMASK); + + if (hdrlen + total > IP_MAXPACKET) { + DPFPRINTF(("drop: too big: %d", total)); + ip->ip_len = 0; + REASON_SET(reason, PFRES_SHORT); + /* PF_DROP requires a valid mbuf *m0 in pf_test() */ + return (PF_DROP); + } + + DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len))); + return (PF_PASS); +} +#endif /* INET */ + +#ifdef INET6 +static int +pf_reassemble6(struct mbuf **m0, struct ip6_hdr *ip6, struct ip6_frag *fraghdr, + uint16_t hdrlen, uint16_t extoff, u_short *reason) +{ + struct mbuf *m = *m0; + struct pf_frent *frent; + struct pf_fragment *frag; + struct pf_fragment_cmp key; + struct m_tag *mtag; + struct pf_fragment_tag *ftag; + int off; + uint32_t frag_id; + uint16_t total, maxlen; + uint8_t proto; + + PF_FRAG_LOCK(); + + /* Get an entry for the fragment queue. */ + if ((frent = pf_create_fragment(reason)) == NULL) { + PF_FRAG_UNLOCK(); + return (PF_DROP); + } + + frent->fe_m = m; + frent->fe_hdrlen = hdrlen; + frent->fe_extoff = extoff; + frent->fe_len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - hdrlen; + frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK); + frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG; + + key.frc_src.v6 = ip6->ip6_src; + key.frc_dst.v6 = ip6->ip6_dst; + key.frc_af = AF_INET6; + /* Only the first fragment's protocol is relevant. */ + key.frc_proto = 0; + key.frc_id = fraghdr->ip6f_ident; + + if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) { + PF_FRAG_UNLOCK(); + return (PF_DROP); + } + + /* The mbuf is part of the fragment entry, no direct free or access. */ + m = *m0 = NULL; + + if (!pf_isfull_fragment(frag)) { + PF_FRAG_UNLOCK(); + return (PF_PASS); /* Drop because *m0 is NULL, no error. */ + } + + /* We have all the data. 
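+	 * Save the per-queue values first, since pf_join_fragment()
+	 * frees the pf_fragment they are stored in.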
*/ + extoff = frent->fe_extoff; + maxlen = frag->fr_maxlen; + frag_id = frag->fr_id; + frent = TAILQ_FIRST(&frag->fr_queue); + KASSERT(frent != NULL, ("frent != NULL")); + total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; + hdrlen = frent->fe_hdrlen - sizeof(struct ip6_frag); + + m = *m0 = pf_join_fragment(frag); + frag = NULL; + + PF_FRAG_UNLOCK(); + + /* Take protocol from first fragment header. */ + m = m_getptr(m, hdrlen + offsetof(struct ip6_frag, ip6f_nxt), &off); + KASSERT(m, ("%s: short mbuf chain", __func__)); + proto = *(mtod(m, caddr_t) + off); + m = *m0; + + /* Delete frag6 header */ + if (ip6_deletefraghdr(m, hdrlen, M_NOWAIT) != 0) + goto fail; + + if (m->m_flags & M_PKTHDR) { + int plen = 0; + for (m = *m0; m; m = m->m_next) + plen += m->m_len; + m = *m0; + m->m_pkthdr.len = plen; + } + + if ((mtag = m_tag_get(PF_REASSEMBLED, sizeof(struct pf_fragment_tag), + M_NOWAIT)) == NULL) + goto fail; + ftag = (struct pf_fragment_tag *)(mtag + 1); + ftag->ft_hdrlen = hdrlen; + ftag->ft_extoff = extoff; + ftag->ft_maxlen = maxlen; + ftag->ft_id = frag_id; + m_tag_prepend(m, mtag); + + ip6 = mtod(m, struct ip6_hdr *); + ip6->ip6_plen = htons(hdrlen - sizeof(struct ip6_hdr) + total); + if (extoff) { + /* Write protocol into next field of last extension header. */ + m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt), + &off); + KASSERT(m, ("%s: short mbuf chain", __func__)); + *(mtod(m, char *) + off) = proto; + m = *m0; + } else + ip6->ip6_nxt = proto; + + if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) { + DPFPRINTF(("drop: too big: %d", total)); + ip6->ip6_plen = 0; + REASON_SET(reason, PFRES_SHORT); + /* PF_DROP requires a valid mbuf *m0 in pf_test6(). */ + return (PF_DROP); + } + + DPFPRINTF(("complete: %p(%d)", m, ntohs(ip6->ip6_plen))); + return (PF_PASS); + +fail: + REASON_SET(reason, PFRES_MEMORY); + /* PF_DROP requires a valid mbuf *m0 in pf_test6(), will free later. */ + return (PF_DROP); +} +#endif /* INET6 */ + +#ifdef INET6 +int +pf_refragment6(struct ifnet *ifp, struct mbuf **m0, struct m_tag *mtag) +{ + struct mbuf *m = *m0, *t; + struct pf_fragment_tag *ftag = (struct pf_fragment_tag *)(mtag + 1); + struct pf_pdesc pd; + uint32_t frag_id; + uint16_t hdrlen, extoff, maxlen; + uint8_t proto; + int error, action; + + hdrlen = ftag->ft_hdrlen; + extoff = ftag->ft_extoff; + maxlen = ftag->ft_maxlen; + frag_id = ftag->ft_id; + m_tag_delete(m, mtag); + mtag = NULL; + ftag = NULL; + + if (extoff) { + int off; + + /* Use protocol from next field of last extension header */ + m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt), + &off); + KASSERT((m != NULL), ("pf_refragment6: short mbuf chain")); + proto = *(mtod(m, caddr_t) + off); + *(mtod(m, char *) + off) = IPPROTO_FRAGMENT; + m = *m0; + } else { + struct ip6_hdr *hdr; + + hdr = mtod(m, struct ip6_hdr *); + proto = hdr->ip6_nxt; + hdr->ip6_nxt = IPPROTO_FRAGMENT; + } + + /* + * Maxlen may be less than 8 if there was only a single + * fragment. As it was fragmented before, add a fragment + * header also for a single fragment. If total or maxlen + * is less than 8, ip6_fragment() will return EMSGSIZE and + * we drop the packet. + */ + error = ip6_fragment(ifp, m, hdrlen, proto, maxlen, frag_id); + m = (*m0)->m_nextpkt; + (*m0)->m_nextpkt = NULL; + if (error == 0) { + /* The first mbuf contains the unfragmented packet. */ + m_freem(*m0); + *m0 = NULL; + action = PF_PASS; + } else { + /* Drop expects an mbuf to free. 
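+		 * *m0 is therefore left intact for the caller, while
+		 * any fragments already chained off it are freed in
+		 * the loop below.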
*/ + DPFPRINTF(("refragment error %d", error)); + action = PF_DROP; + } + for (t = m; m; m = t) { + t = m->m_nextpkt; + m->m_nextpkt = NULL; + m->m_flags |= M_SKIP_FIREWALL; + memset(&pd, 0, sizeof(pd)); + pd.pf_mtag = pf_find_mtag(m); + if (error == 0) + ip6_forward(m, 0); + else + m_freem(m); + } + + return (action); +} +#endif /* INET6 */ + +#ifdef INET +int +pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, + struct pf_pdesc *pd) +{ + struct mbuf *m = *m0; + struct pf_rule *r; + struct ip *h = mtod(m, struct ip *); + int mff = (ntohs(h->ip_off) & IP_MF); + int hlen = h->ip_hl << 2; + u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; + u_int16_t max; + int ip_len; + int ip_off; + int tag = -1; + int verdict; + + PF_RULES_RASSERT(); + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != dir) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != AF_INET) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != h->ip_p) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, + (struct pf_addr *)&h->ip_src.s_addr, AF_INET, + r->src.neg, kif, M_GETFIB(m))) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, + (struct pf_addr *)&h->ip_dst.s_addr, AF_INET, + r->dst.neg, NULL, M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->match_tag && !pf_match_tag(m, r, &tag, + pd->pf_mtag ? pd->pf_mtag->tag : 0)) + r = TAILQ_NEXT(r, entries); + else + break; + } + + if (r == NULL || r->action == PF_NOSCRUB) + return (PF_PASS); + else { + r->packets[dir == PF_OUT]++; + r->bytes[dir == PF_OUT] += pd->tot_len; + } + + /* Check for illegal packets */ + if (hlen < (int)sizeof(struct ip)) { + REASON_SET(reason, PFRES_NORM); + goto drop; + } + + if (hlen > ntohs(h->ip_len)) { + REASON_SET(reason, PFRES_NORM); + goto drop; + } + + /* Clear IP_DF if the rule uses the no-df option */ + if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) { + u_int16_t ip_off = h->ip_off; + + h->ip_off &= htons(~IP_DF); + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); + } + + /* We will need other tests here */ + if (!fragoff && !mff) + goto no_fragment; + + /* We're dealing with a fragment now. Don't allow fragments + * with IP_DF to enter the cache. If the flag was cleared by + * no-df above, fine. Otherwise drop it. 
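+	 * (A packet that is already a fragment yet carries IP_DF
+	 * is self-contradictory, so it is treated as bogus.)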
+ */ + if (h->ip_off & htons(IP_DF)) { + DPFPRINTF(("IP_DF\n")); + goto bad; + } + + ip_len = ntohs(h->ip_len) - hlen; + ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3; + + /* All fragments are 8 byte aligned */ + if (mff && (ip_len & 0x7)) { + DPFPRINTF(("mff and %d\n", ip_len)); + goto bad; + } + + /* Respect maximum length */ + if (fragoff + ip_len > IP_MAXPACKET) { + DPFPRINTF(("max packet %d\n", fragoff + ip_len)); + goto bad; + } + max = fragoff + ip_len; + + /* Fully buffer all of the fragments + * Might return a completely reassembled mbuf, or NULL */ + PF_FRAG_LOCK(); + DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max)); + verdict = pf_reassemble(m0, h, dir, reason); + PF_FRAG_UNLOCK(); + + if (verdict != PF_PASS) + return (PF_DROP); + + m = *m0; + if (m == NULL) + return (PF_DROP); + + h = mtod(m, struct ip *); + + no_fragment: + /* At this point, only IP_DF is allowed in ip_off */ + if (h->ip_off & ~htons(IP_DF)) { + u_int16_t ip_off = h->ip_off; + + h->ip_off &= htons(IP_DF); + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); + } + + pf_scrub_ip(&m, r->rule_flag, r->min_ttl, r->set_tos); + + return (PF_PASS); + + bad: + DPFPRINTF(("dropping bad fragment\n")); + REASON_SET(reason, PFRES_FRAG); + drop: + if (r != NULL && r->log) + PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd, + 1); + + return (PF_DROP); +} +#endif + +#ifdef INET6 +int +pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, + u_short *reason, struct pf_pdesc *pd) +{ + struct mbuf *m = *m0; + struct pf_rule *r; + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + int extoff; + int off; + struct ip6_ext ext; + struct ip6_opt opt; + struct ip6_opt_jumbo jumbo; + struct ip6_frag frag; + u_int32_t jumbolen = 0, plen; + int optend; + int ooff; + u_int8_t proto; + int terminal; + + PF_RULES_RASSERT(); + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != dir) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != AF_INET6) + r = r->skip[PF_SKIP_AF].ptr; +#if 0 /* header chain! 
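+	 * Matching r->proto against ip6_nxt here would miss any
+	 * protocol behind extension headers; the header chain is
+	 * walked further down instead.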
*/ + else if (r->proto && r->proto != h->ip6_nxt) + r = r->skip[PF_SKIP_PROTO].ptr; +#endif + else if (PF_MISMATCHAW(&r->src.addr, + (struct pf_addr *)&h->ip6_src, AF_INET6, + r->src.neg, kif, M_GETFIB(m))) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, + (struct pf_addr *)&h->ip6_dst, AF_INET6, + r->dst.neg, NULL, M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else + break; + } + + if (r == NULL || r->action == PF_NOSCRUB) + return (PF_PASS); + else { + r->packets[dir == PF_OUT]++; + r->bytes[dir == PF_OUT] += pd->tot_len; + } + + /* Check for illegal packets */ + if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len) + goto drop; + + extoff = 0; + off = sizeof(struct ip6_hdr); + proto = h->ip6_nxt; + terminal = 0; + do { + switch (proto) { + case IPPROTO_FRAGMENT: + goto fragment; + break; + case IPPROTO_AH: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: + if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, + NULL, AF_INET6)) + goto shortpkt; + extoff = off; + if (proto == IPPROTO_AH) + off += (ext.ip6e_len + 2) * 4; + else + off += (ext.ip6e_len + 1) * 8; + proto = ext.ip6e_nxt; + break; + case IPPROTO_HOPOPTS: + if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, + NULL, AF_INET6)) + goto shortpkt; + extoff = off; + optend = off + (ext.ip6e_len + 1) * 8; + ooff = off + sizeof(ext); + do { + if (!pf_pull_hdr(m, ooff, &opt.ip6o_type, + sizeof(opt.ip6o_type), NULL, NULL, + AF_INET6)) + goto shortpkt; + if (opt.ip6o_type == IP6OPT_PAD1) { + ooff++; + continue; + } + if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt), + NULL, NULL, AF_INET6)) + goto shortpkt; + if (ooff + sizeof(opt) + opt.ip6o_len > optend) + goto drop; + switch (opt.ip6o_type) { + case IP6OPT_JUMBO: + if (h->ip6_plen != 0) + goto drop; + if (!pf_pull_hdr(m, ooff, &jumbo, + sizeof(jumbo), NULL, NULL, + AF_INET6)) + goto shortpkt; + memcpy(&jumbolen, jumbo.ip6oj_jumbo_len, + sizeof(jumbolen)); + jumbolen = ntohl(jumbolen); + if (jumbolen <= IPV6_MAXPACKET) + goto drop; + if (sizeof(struct ip6_hdr) + jumbolen != + m->m_pkthdr.len) + goto drop; + break; + default: + break; + } + ooff += sizeof(opt) + opt.ip6o_len; + } while (ooff < optend); + + off = optend; + proto = ext.ip6e_nxt; + break; + default: + terminal = 1; + break; + } + } while (!terminal); + + /* jumbo payload option must be present, or plen > 0 */ + if (ntohs(h->ip6_plen) == 0) + plen = jumbolen; + else + plen = ntohs(h->ip6_plen); + if (plen == 0) + goto drop; + if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) + goto shortpkt; + + pf_scrub_ip6(&m, r->min_ttl); + + return (PF_PASS); + + fragment: + /* Jumbo payload packets cannot be fragmented. */ + plen = ntohs(h->ip6_plen); + if (plen == 0 || jumbolen) + goto drop; + if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) + goto shortpkt; + + if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6)) + goto shortpkt; + + /* Offset now points to data portion. */ + off += sizeof(frag); + + /* Returns PF_DROP or *m0 is NULL or completely reassembled mbuf. 
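+	 * That is: on PF_PASS, *m0 is either NULL (the fragment was
+	 * consumed into the reassembly queue) or points to the fully
+	 * reassembled packet.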
*/ + if (pf_reassemble6(m0, h, &frag, off, extoff, reason) != PF_PASS) + return (PF_DROP); + m = *m0; + if (m == NULL) + return (PF_DROP); + + pd->flags |= PFDESC_IP_REAS; + return (PF_PASS); + + shortpkt: + REASON_SET(reason, PFRES_SHORT); + if (r != NULL && r->log) + PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd, + 1); + return (PF_DROP); + + drop: + REASON_SET(reason, PFRES_NORM); + if (r != NULL && r->log) + PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd, + 1); + return (PF_DROP); +} +#endif /* INET6 */ + +int +pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff, + int off, void *h, struct pf_pdesc *pd) +{ + struct pf_rule *r, *rm = NULL; + struct tcphdr *th = pd->hdr.tcp; + int rewrite = 0; + u_short reason; + u_int8_t flags; + sa_family_t af = pd->af; + + PF_RULES_RASSERT(); + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != dir) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, + r->src.neg, kif, M_GETFIB(m))) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (r->src.port_op && !pf_match_port(r->src.port_op, + r->src.port[0], r->src.port[1], th->th_sport)) + r = r->skip[PF_SKIP_SRC_PORT].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, + r->dst.neg, NULL, M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->dst.port_op && !pf_match_port(r->dst.port_op, + r->dst.port[0], r->dst.port[1], th->th_dport)) + r = r->skip[PF_SKIP_DST_PORT].ptr; + else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match( + pf_osfp_fingerprint(pd, m, off, th), + r->os_fingerprint)) + r = TAILQ_NEXT(r, entries); + else { + rm = r; + break; + } + } + + if (rm == NULL || rm->action == PF_NOSCRUB) + return (PF_PASS); + else { + r->packets[dir == PF_OUT]++; + r->bytes[dir == PF_OUT] += pd->tot_len; + } + + if (rm->rule_flag & PFRULE_REASSEMBLE_TCP) + pd->flags |= PFDESC_TCP_NORM; + + flags = th->th_flags; + if (flags & TH_SYN) { + /* Illegal packet */ + if (flags & TH_RST) + goto tcp_drop; + + if (flags & TH_FIN) + goto tcp_drop; + } else { + /* Illegal packet */ + if (!(flags & (TH_ACK|TH_RST))) + goto tcp_drop; + } + + if (!(flags & TH_ACK)) { + /* These flags are only valid if ACK is set */ + if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG)) + goto tcp_drop; + } + + /* Check for illegal header length */ + if (th->th_off < (sizeof(struct tcphdr) >> 2)) + goto tcp_drop; + + /* If flags changed, or reserved data set, then adjust */ + if (flags != th->th_flags || th->th_x2 != 0) { + u_int16_t ov, nv; + + ov = *(u_int16_t *)(&th->th_ack + 1); + th->th_flags = flags; + th->th_x2 = 0; + nv = *(u_int16_t *)(&th->th_ack + 1); + + th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, ov, nv, 0); + rewrite = 1; + } + + /* Remove urgent pointer, if TH_URG is not set */ + if (!(flags & TH_URG) && th->th_urp) { + th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, th->th_urp, + 0, 0); + th->th_urp = 0; + rewrite = 1; + } + + /* Process options */ + if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af)) + rewrite = 1; + + /* copy back packet headers if we sanitized */ + if (rewrite) + m_copyback(m, off, sizeof(*th), (caddr_t)th); + + return (PF_PASS); + + tcp_drop: + 
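/* All illegal flag combinations and header lengths end up here. */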
REASON_SET(&reason, PFRES_NORM); + if (rm != NULL && r->log) + PFLOG_PACKET(kif, m, AF_INET, dir, reason, r, NULL, NULL, pd, + 1); + return (PF_DROP); +} + +int +pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd, + struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst) +{ + u_int32_t tsval, tsecr; + u_int8_t hdr[60]; + u_int8_t *opt; + + KASSERT((src->scrub == NULL), + ("pf_normalize_tcp_init: src->scrub != NULL")); + + src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT); + if (src->scrub == NULL) + return (1); + + switch (pd->af) { +#ifdef INET + case AF_INET: { + struct ip *h = mtod(m, struct ip *); + src->scrub->pfss_ttl = h->ip_ttl; + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + src->scrub->pfss_ttl = h->ip6_hlim; + break; + } +#endif /* INET6 */ + } + + + /* + * All normalizations below are only begun if we see the start of + * the connections. They must all set an enabled bit in pfss_flags + */ + if ((th->th_flags & TH_SYN) == 0) + return (0); + + + if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub && + pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { + /* Diddle with TCP options */ + int hlen; + opt = hdr + sizeof(struct tcphdr); + hlen = (th->th_off << 2) - sizeof(struct tcphdr); + while (hlen >= TCPOLEN_TIMESTAMP) { + switch (*opt) { + case TCPOPT_EOL: /* FALLTHROUGH */ + case TCPOPT_NOP: + opt++; + hlen--; + break; + case TCPOPT_TIMESTAMP: + if (opt[1] >= TCPOLEN_TIMESTAMP) { + src->scrub->pfss_flags |= + PFSS_TIMESTAMP; + src->scrub->pfss_ts_mod = + htonl(arc4random()); + + /* note PFSS_PAWS not set yet */ + memcpy(&tsval, &opt[2], + sizeof(u_int32_t)); + memcpy(&tsecr, &opt[6], + sizeof(u_int32_t)); + src->scrub->pfss_tsval0 = ntohl(tsval); + src->scrub->pfss_tsval = ntohl(tsval); + src->scrub->pfss_tsecr = ntohl(tsecr); + getmicrouptime(&src->scrub->pfss_last); + } + /* FALLTHROUGH */ + default: + hlen -= MAX(opt[1], 2); + opt += MAX(opt[1], 2); + break; + } + } + } + + return (0); +} + +void +pf_normalize_tcp_cleanup(struct pf_state *state) +{ + if (state->src.scrub) + uma_zfree(V_pf_state_scrub_z, state->src.scrub); + if (state->dst.scrub) + uma_zfree(V_pf_state_scrub_z, state->dst.scrub); + + /* Someday... flush the TCP segment reassembly descriptors. */ +} + +int +pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd, + u_short *reason, struct tcphdr *th, struct pf_state *state, + struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback) +{ + struct timeval uptime; + u_int32_t tsval, tsecr; + u_int tsval_from_last; + u_int8_t hdr[60]; + u_int8_t *opt; + int copyback = 0; + int got_ts = 0; + + KASSERT((src->scrub || dst->scrub), + ("%s: src->scrub && dst->scrub!", __func__)); + + /* + * Enforce the minimum TTL seen for this connection. Negate a common + * technique to evade an intrusion detection system and confuse + * firewall state code. 
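+	 * (e.g., a segment whose TTL expires between the firewall
+	 * and the destination updates the firewall's state but is
+	 * never seen by the endpoint.)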
+ */ + switch (pd->af) { +#ifdef INET + case AF_INET: { + if (src->scrub) { + struct ip *h = mtod(m, struct ip *); + if (h->ip_ttl > src->scrub->pfss_ttl) + src->scrub->pfss_ttl = h->ip_ttl; + h->ip_ttl = src->scrub->pfss_ttl; + } + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + if (src->scrub) { + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + if (h->ip6_hlim > src->scrub->pfss_ttl) + src->scrub->pfss_ttl = h->ip6_hlim; + h->ip6_hlim = src->scrub->pfss_ttl; + } + break; + } +#endif /* INET6 */ + } + + if (th->th_off > (sizeof(struct tcphdr) >> 2) && + ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) || + (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) && + pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { + /* Diddle with TCP options */ + int hlen; + opt = hdr + sizeof(struct tcphdr); + hlen = (th->th_off << 2) - sizeof(struct tcphdr); + while (hlen >= TCPOLEN_TIMESTAMP) { + switch (*opt) { + case TCPOPT_EOL: /* FALLTHROUGH */ + case TCPOPT_NOP: + opt++; + hlen--; + break; + case TCPOPT_TIMESTAMP: + /* Modulate the timestamps. Can be used for + * NAT detection, OS uptime determination or + * reboot detection. + */ + + if (got_ts) { + /* Huh? Multiple timestamps!? */ + if (V_pf_status.debug >= PF_DEBUG_MISC) { + DPFPRINTF(("multiple TS??")); + pf_print_state(state); + printf("\n"); + } + REASON_SET(reason, PFRES_TS); + return (PF_DROP); + } + if (opt[1] >= TCPOLEN_TIMESTAMP) { + memcpy(&tsval, &opt[2], + sizeof(u_int32_t)); + if (tsval && src->scrub && + (src->scrub->pfss_flags & + PFSS_TIMESTAMP)) { + tsval = ntohl(tsval); + pf_change_proto_a(m, &opt[2], + &th->th_sum, + htonl(tsval + + src->scrub->pfss_ts_mod), + 0); + copyback = 1; + } + + /* Modulate TS reply iff valid (!0) */ + memcpy(&tsecr, &opt[6], + sizeof(u_int32_t)); + if (tsecr && dst->scrub && + (dst->scrub->pfss_flags & + PFSS_TIMESTAMP)) { + tsecr = ntohl(tsecr) + - dst->scrub->pfss_ts_mod; + pf_change_proto_a(m, &opt[6], + &th->th_sum, htonl(tsecr), + 0); + copyback = 1; + } + got_ts = 1; + } + /* FALLTHROUGH */ + default: + hlen -= MAX(opt[1], 2); + opt += MAX(opt[1], 2); + break; + } + } + if (copyback) { + /* Copyback the options, caller copys back header */ + *writeback = 1; + m_copyback(m, off + sizeof(struct tcphdr), + (th->th_off << 2) - sizeof(struct tcphdr), hdr + + sizeof(struct tcphdr)); + } + } + + + /* + * Must invalidate PAWS checks on connections idle for too long. + * The fastest allowed timestamp clock is 1ms. That turns out to + * be about 24 days before it wraps. 
+ * XXX Right now our lowerbound
+ * TS echo check only works for the first 12 days of a connection
+ * when the TS has exhausted half its 32bit space.
+ */
+#define TS_MAX_IDLE (24*24*60*60)
+#define TS_MAX_CONN (12*24*60*60) /* XXX remove when better tsecr check */
+
+ getmicrouptime(&uptime);
+ if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
+ (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
+ time_uptime - state->creation > TS_MAX_CONN)) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ DPFPRINTF(("src idled out of PAWS\n"));
+ pf_print_state(state);
+ printf("\n");
+ }
+ src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
+ | PFSS_PAWS_IDLED;
+ }
+ if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
+ uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ DPFPRINTF(("dst idled out of PAWS\n"));
+ pf_print_state(state);
+ printf("\n");
+ }
+ dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
+ | PFSS_PAWS_IDLED;
+ }
+
+ if (got_ts && src->scrub && dst->scrub &&
+ (src->scrub->pfss_flags & PFSS_PAWS) &&
+ (dst->scrub->pfss_flags & PFSS_PAWS)) {
+ /* Validate that the timestamps are "in-window".
+ * RFC1323 describes TCP Timestamp options that allow
+ * measurement of RTT (round trip time) and PAWS
+ * (protection against wrapped sequence numbers). PAWS
+ * gives us a set of rules for rejecting packets on
+ * long fat pipes (packets that were somehow delayed
+ * in transit longer than the time it took to send the
+ * full TCP sequence space of 4Gb). We can use these
+ * rules and infer a few others that will let us treat
+ * the 32bit timestamp and the 32bit echoed timestamp
+ * as sequence numbers to prevent a blind attacker from
+ * inserting packets into a connection.
+ *
+ * RFC1323 tells us:
+ * - The timestamp on this packet must be greater than
+ * or equal to the last value echoed by the other
+ * endpoint. The RFC says those will be discarded
+ * since it is a dup that has already been acked.
+ * This gives us a lowerbound on the timestamp.
+ * timestamp >= other last echoed timestamp
+ * - The timestamp will be less than or equal to
+ * the last timestamp plus the time between the
+ * last packet and now. The RFC defines the max
+ * clock rate as 1ms. We will allow clocks to be
+ * up to 10% fast and will allow a total difference
+ * of 30 seconds due to a route change. And this
+ * gives us an upperbound on the timestamp.
+ * timestamp <= last timestamp + max ticks
+ * We have to be careful here. Windows will send an
+ * initial timestamp of zero and then initialize it
+ * to a random value after the 3whs; presumably to
+ * avoid a DoS by having to call an expensive RNG
+ * during a SYN flood. Proof that MS has at least one
+ * good security geek.
+ *
+ * - The TCP timestamp option must also echo the other
+ * endpoint's timestamp. The timestamp echoed is the
+ * one carried on the earliest unacknowledged segment
+ * on the left edge of the sequence window. The RFC
+ * states that the host will reject any echoed
+ * timestamps that were larger than any ever sent.
+ * This gives us an upperbound on the TS echo.
+ * tsecr <= largest_tsval
+ * - The lowerbound on the TS echo is a little more
+ * tricky to determine. The other endpoint's echoed
+ * values will not decrease. But there may be
+ * network conditions that re-order packets and
+ * cause our view of them to decrease. For now the
+ * only lowerbound we can safely determine is that
+ * the TS echo will never be less than the original
+ * TS. XXX There is probably a better lowerbound.
+ * Remove TS_MAX_CONN with better lowerbound check.
+ * tsecr >= other original TS
+ *
+ * It is also important to note that the fastest
+ * timestamp clock of 1ms will wrap its 32bit space in
+ * 24 days. So we just disable TS checking after 24
+ * days of idle time. We actually must use a 12d
+ * connection limit until we can come up with a better
+ * lowerbound to the TS echo check.
+ */
+ struct timeval delta_ts;
+ int ts_fudge;
+
+
+ /*
+ * PFTM_TS_DIFF is how many seconds of leeway to allow
+ * a host's timestamp. This can happen if the previous
+ * packet got delayed in transit for much longer than
+ * this packet.
+ */
+ if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
+ ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF];
+
+ /* Calculate max ticks since the last timestamp */
+#define TS_MAXFREQ 1100 /* RFC max TS freq of 1 kHz + 10% skew */
+#define TS_MICROSECS 1000000 /* microseconds per second */
+ delta_ts = uptime;
+ timevalsub(&delta_ts, &src->scrub->pfss_last);
+ tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
+ tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);
+
+ if ((src->state >= TCPS_ESTABLISHED &&
+ dst->state >= TCPS_ESTABLISHED) &&
+ (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
+ SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
+ (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
+ SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
+ /* Bad RFC1323 implementation or an insertion attack.
+ *
+ * - Solaris 2.6 and 2.7 are known to send another ACK
+ * after the FIN,FIN|ACK,ACK closing that carries
+ * an old timestamp.
+ */
+
+ DPFPRINTF(("Timestamp failed %c%c%c%c\n",
+ SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
+ SEQ_GT(tsval, src->scrub->pfss_tsval +
+ tsval_from_last) ? '1' : ' ',
+ SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
+ SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
+ DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u "
+ "idle: %jus %lums\n",
+ tsval, tsecr, tsval_from_last,
+ (uintmax_t)delta_ts.tv_sec,
+ delta_ts.tv_usec / 1000));
+ DPFPRINTF((" src->tsval: %u tsecr: %u\n",
+ src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
+ DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u"
+ "\n", dst->scrub->pfss_tsval,
+ dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ pf_print_state(state);
+ pf_print_flags(th->th_flags);
+ printf("\n");
+ }
+ REASON_SET(reason, PFRES_TS);
+ return (PF_DROP);
+ }
+
+ /* XXX I'd really like to require tsecr but it's optional */
+
+ } else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
+ ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
+ || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
+ src->scrub && dst->scrub &&
+ (src->scrub->pfss_flags & PFSS_PAWS) &&
+ (dst->scrub->pfss_flags & PFSS_PAWS)) {
+ /* Didn't send a timestamp. Timestamps aren't really useful
+ * when:
+ * - connection opening or closing (often not even sent).
+ * But we must not let an attacker put a FIN on a
+ * data packet to sneak it through our ESTABLISHED check.
+ * - on a TCP reset. RFC suggests not even looking at TS.
+ * - on an empty ACK. The TS will not be echoed so it will
+ * probably not help keep the RTT calculation in sync and
+ * there isn't as much danger when the sequence numbers
+ * got wrapped. 
So some stacks don't include TS on empty + * ACKs :-( + * + * To minimize the disruption to mostly RFC1323 conformant + * stacks, we will only require timestamps on data packets. + * + * And what do ya know, we cannot require timestamps on data + * packets. There appear to be devices that do legitimate + * TCP connection hijacking. There are HTTP devices that allow + * a 3whs (with timestamps) and then buffer the HTTP request. + * If the intermediate device has the HTTP response cache, it + * will spoof the response but not bother timestamping its + * packets. So we can look for the presence of a timestamp in + * the first data packet and if there, require it in all future + * packets. + */ + + if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) { + /* + * Hey! Someone tried to sneak a packet in. Or the + * stack changed its RFC1323 behavior?!?! + */ + if (V_pf_status.debug >= PF_DEBUG_MISC) { + DPFPRINTF(("Did not receive expected RFC1323 " + "timestamp\n")); + pf_print_state(state); + pf_print_flags(th->th_flags); + printf("\n"); + } + REASON_SET(reason, PFRES_TS); + return (PF_DROP); + } + } + + + /* + * We will note if a host sends his data packets with or without + * timestamps. And require all data packets to contain a timestamp + * if the first does. PAWS implicitly requires that all data packets be + * timestamped. But I think there are middle-man devices that hijack + * TCP streams immediately after the 3whs and don't timestamp their + * packets (seen in a WWW accelerator or cache). + */ + if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags & + (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) { + if (got_ts) + src->scrub->pfss_flags |= PFSS_DATA_TS; + else { + src->scrub->pfss_flags |= PFSS_DATA_NOTS; + if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub && + (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { + /* Don't warn if other host rejected RFC1323 */ + DPFPRINTF(("Broken RFC1323 stack did not " + "timestamp data packet. Disabled PAWS " + "security.\n")); + pf_print_state(state); + pf_print_flags(th->th_flags); + printf("\n"); + } + } + } + + + /* + * Update PAWS values + */ + if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags & + (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) { + getmicrouptime(&src->scrub->pfss_last); + if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) || + (src->scrub->pfss_flags & PFSS_PAWS) == 0) + src->scrub->pfss_tsval = tsval; + + if (tsecr) { + if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) || + (src->scrub->pfss_flags & PFSS_PAWS) == 0) + src->scrub->pfss_tsecr = tsecr; + + if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 && + (SEQ_LT(tsval, src->scrub->pfss_tsval0) || + src->scrub->pfss_tsval0 == 0)) { + /* tsval0 MUST be the lowest timestamp */ + src->scrub->pfss_tsval0 = tsval; + } + + /* Only fully initialized after a TS gets echoed */ + if ((src->scrub->pfss_flags & PFSS_PAWS) == 0) + src->scrub->pfss_flags |= PFSS_PAWS; + } + } + + /* I have a dream.... TCP segment reassembly.... 
*/ + return (0); +} + +static int +pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th, + int off, sa_family_t af) +{ + u_int16_t *mss; + int thoff; + int opt, cnt, optlen = 0; + int rewrite = 0; + u_char opts[TCP_MAXOLEN]; + u_char *optp = opts; + + thoff = th->th_off << 2; + cnt = thoff - sizeof(struct tcphdr); + + if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt, + NULL, NULL, af)) + return (rewrite); + + for (; cnt > 0; cnt -= optlen, optp += optlen) { + opt = optp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + if (cnt < 2) + break; + optlen = optp[1]; + if (optlen < 2 || optlen > cnt) + break; + } + switch (opt) { + case TCPOPT_MAXSEG: + mss = (u_int16_t *)(optp + 2); + if ((ntohs(*mss)) > r->max_mss) { + th->th_sum = pf_proto_cksum_fixup(m, + th->th_sum, *mss, htons(r->max_mss), 0); + *mss = htons(r->max_mss); + rewrite = 1; + } + break; + default: + break; + } + } + + if (rewrite) + m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts); + + return (rewrite); +} + +#ifdef INET +static void +pf_scrub_ip(struct mbuf **m0, u_int32_t flags, u_int8_t min_ttl, u_int8_t tos) +{ + struct mbuf *m = *m0; + struct ip *h = mtod(m, struct ip *); + + /* Clear IP_DF if no-df was requested */ + if (flags & PFRULE_NODF && h->ip_off & htons(IP_DF)) { + u_int16_t ip_off = h->ip_off; + + h->ip_off &= htons(~IP_DF); + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); + } + + /* Enforce a minimum ttl, may cause endless packet loops */ + if (min_ttl && h->ip_ttl < min_ttl) { + u_int16_t ip_ttl = h->ip_ttl; + + h->ip_ttl = min_ttl; + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0); + } + + /* Enforce tos */ + if (flags & PFRULE_SET_TOS) { + u_int16_t ov, nv; + + ov = *(u_int16_t *)h; + h->ip_tos = tos; + nv = *(u_int16_t *)h; + + h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0); + } + + /* random-id, but not for fragments */ + if (flags & PFRULE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) { + uint16_t ip_id = h->ip_id; + + ip_fillid(h); + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0); + } +} +#endif /* INET */ + +#ifdef INET6 +static void +pf_scrub_ip6(struct mbuf **m0, u_int8_t min_ttl) +{ + struct mbuf *m = *m0; + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + + /* Enforce a minimum ttl, may cause endless packet loops */ + if (min_ttl && h->ip6_hlim < min_ttl) + h->ip6_hlim = min_ttl; +} +#endif diff --git a/freebsd/sys/netpfil/pf/pf_osfp.c b/freebsd/sys/netpfil/pf/pf_osfp.c new file mode 100644 index 00000000..33bef4c8 --- /dev/null +++ b/freebsd/sys/netpfil/pf/pf_osfp.c @@ -0,0 +1,530 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2003 Mike Frantzen <frantzen@w4g.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ * + * $OpenBSD: pf_osfp.c,v 1.14 2008/06/12 18:17:01 henning Exp $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/sys/param.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/mbuf.h> +#include <sys/rwlock.h> +#include <sys/socket.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include <net/if.h> +#include <net/vnet.h> +#include <net/pfvar.h> + +#include <netinet/ip6.h> + +static MALLOC_DEFINE(M_PFOSFP, "pf_osfp", "pf(4) operating system fingerprints"); +#define DPFPRINTF(format, x...) \ + if (V_pf_status.debug >= PF_DEBUG_NOISY) \ + printf(format , ##x) + +SLIST_HEAD(pf_osfp_list, pf_os_fingerprint); +static VNET_DEFINE(struct pf_osfp_list, pf_osfp_list) = + SLIST_HEAD_INITIALIZER(); +#define V_pf_osfp_list VNET(pf_osfp_list) + +static struct pf_osfp_enlist *pf_osfp_fingerprint_hdr(const struct ip *, + const struct ip6_hdr *, + const struct tcphdr *); +static struct pf_os_fingerprint *pf_osfp_find(struct pf_osfp_list *, + struct pf_os_fingerprint *, u_int8_t); +static struct pf_os_fingerprint *pf_osfp_find_exact(struct pf_osfp_list *, + struct pf_os_fingerprint *); +static void pf_osfp_insert(struct pf_osfp_list *, + struct pf_os_fingerprint *); +#ifdef PFDEBUG +static struct pf_os_fingerprint *pf_osfp_validate(void); +#endif + +/* + * Passively fingerprint the OS of the host (IPv4 TCP SYN packets only) + * Returns the list of possible OSes. + */ +struct pf_osfp_enlist * +pf_osfp_fingerprint(struct pf_pdesc *pd, struct mbuf *m, int off, + const struct tcphdr *tcp) +{ + struct ip *ip; + struct ip6_hdr *ip6; + char hdr[60]; + + if ((pd->af != PF_INET && pd->af != PF_INET6) || + pd->proto != IPPROTO_TCP || (tcp->th_off << 2) < sizeof(*tcp)) + return (NULL); + + if (pd->af == PF_INET) { + ip = mtod(m, struct ip *); + ip6 = (struct ip6_hdr *)NULL; + } else { + ip = (struct ip *)NULL; + ip6 = mtod(m, struct ip6_hdr *); + } + if (!pf_pull_hdr(m, off, hdr, tcp->th_off << 2, NULL, NULL, + pd->af)) return (NULL); + + return (pf_osfp_fingerprint_hdr(ip, ip6, (struct tcphdr *)hdr)); +} + +static struct pf_osfp_enlist * +pf_osfp_fingerprint_hdr(const struct ip *ip, const struct ip6_hdr *ip6, const struct tcphdr *tcp) +{ + struct pf_os_fingerprint fp, *fpresult; + int cnt, optlen = 0; + const u_int8_t *optp; + char srcname[128]; + + if ((tcp->th_flags & (TH_SYN|TH_ACK)) != TH_SYN) + return (NULL); + if (ip) { + if ((ip->ip_off & htons(IP_OFFMASK)) != 0) + return (NULL); + } + + memset(&fp, 0, sizeof(fp)); + + if (ip) { + fp.fp_psize = ntohs(ip->ip_len); + fp.fp_ttl = ip->ip_ttl; + if (ip->ip_off & htons(IP_DF)) + fp.fp_flags |= PF_OSFP_DF; + strlcpy(srcname, inet_ntoa(ip->ip_src), sizeof(srcname)); + } +#ifdef INET6 + else if (ip6) { + /* jumbo payload? 
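+ * (Editor's note on this XXX, an assumption about its intent: an
+ * RFC 2675 jumbogram carries ip6_plen == 0 with the real length in a
+ * hop-by-hop option, so the fp_psize computed on the next line would
+ * be wrong for such packets.)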
*/ + fp.fp_psize = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); + fp.fp_ttl = ip6->ip6_hlim; + fp.fp_flags |= PF_OSFP_DF; + fp.fp_flags |= PF_OSFP_INET6; + strlcpy(srcname, ip6_sprintf((struct in6_addr *)&ip6->ip6_src), + sizeof(srcname)); + } +#endif + else + return (NULL); + fp.fp_wsize = ntohs(tcp->th_win); + + + cnt = (tcp->th_off << 2) - sizeof(*tcp); + optp = (const u_int8_t *)((const char *)tcp + sizeof(*tcp)); + for (; cnt > 0; cnt -= optlen, optp += optlen) { + if (*optp == TCPOPT_EOL) + break; + + fp.fp_optcnt++; + if (*optp == TCPOPT_NOP) { + fp.fp_tcpopts = (fp.fp_tcpopts << PF_OSFP_TCPOPT_BITS) | + PF_OSFP_TCPOPT_NOP; + optlen = 1; + } else { + if (cnt < 2) + return (NULL); + optlen = optp[1]; + if (optlen > cnt || optlen < 2) + return (NULL); + switch (*optp) { + case TCPOPT_MAXSEG: + if (optlen >= TCPOLEN_MAXSEG) + memcpy(&fp.fp_mss, &optp[2], + sizeof(fp.fp_mss)); + fp.fp_tcpopts = (fp.fp_tcpopts << + PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_MSS; + NTOHS(fp.fp_mss); + break; + case TCPOPT_WINDOW: + if (optlen >= TCPOLEN_WINDOW) + memcpy(&fp.fp_wscale, &optp[2], + sizeof(fp.fp_wscale)); + NTOHS(fp.fp_wscale); + fp.fp_tcpopts = (fp.fp_tcpopts << + PF_OSFP_TCPOPT_BITS) | + PF_OSFP_TCPOPT_WSCALE; + break; + case TCPOPT_SACK_PERMITTED: + fp.fp_tcpopts = (fp.fp_tcpopts << + PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_SACK; + break; + case TCPOPT_TIMESTAMP: + if (optlen >= TCPOLEN_TIMESTAMP) { + u_int32_t ts; + memcpy(&ts, &optp[2], sizeof(ts)); + if (ts == 0) + fp.fp_flags |= PF_OSFP_TS0; + + } + fp.fp_tcpopts = (fp.fp_tcpopts << + PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_TS; + break; + default: + return (NULL); + } + } + optlen = MAX(optlen, 1); /* paranoia */ + } + + DPFPRINTF("fingerprinted %s:%d %d:%d:%d:%d:%llx (%d) " + "(TS=%s,M=%s%d,W=%s%d)\n", + srcname, ntohs(tcp->th_sport), + fp.fp_wsize, fp.fp_ttl, (fp.fp_flags & PF_OSFP_DF) != 0, + fp.fp_psize, (long long int)fp.fp_tcpopts, fp.fp_optcnt, + (fp.fp_flags & PF_OSFP_TS0) ? "0" : "", + (fp.fp_flags & PF_OSFP_MSS_MOD) ? "%" : + (fp.fp_flags & PF_OSFP_MSS_DC) ? "*" : "", + fp.fp_mss, + (fp.fp_flags & PF_OSFP_WSCALE_MOD) ? "%" : + (fp.fp_flags & PF_OSFP_WSCALE_DC) ? 
"*" : "", + fp.fp_wscale); + + if ((fpresult = pf_osfp_find(&V_pf_osfp_list, &fp, + PF_OSFP_MAXTTL_OFFSET))) + return (&fpresult->fp_oses); + return (NULL); +} + +/* Match a fingerprint ID against a list of OSes */ +int +pf_osfp_match(struct pf_osfp_enlist *list, pf_osfp_t os) +{ + struct pf_osfp_entry *entry; + int os_class, os_version, os_subtype; + int en_class, en_version, en_subtype; + + if (os == PF_OSFP_ANY) + return (1); + if (list == NULL) { + DPFPRINTF("osfp no match against %x\n", os); + return (os == PF_OSFP_UNKNOWN); + } + PF_OSFP_UNPACK(os, os_class, os_version, os_subtype); + SLIST_FOREACH(entry, list, fp_entry) { + PF_OSFP_UNPACK(entry->fp_os, en_class, en_version, en_subtype); + if ((os_class == PF_OSFP_ANY || en_class == os_class) && + (os_version == PF_OSFP_ANY || en_version == os_version) && + (os_subtype == PF_OSFP_ANY || en_subtype == os_subtype)) { + DPFPRINTF("osfp matched %s %s %s %x==%x\n", + entry->fp_class_nm, entry->fp_version_nm, + entry->fp_subtype_nm, os, entry->fp_os); + return (1); + } + } + DPFPRINTF("fingerprint 0x%x didn't match\n", os); + return (0); +} + +/* Flush the fingerprint list */ +void +pf_osfp_flush(void) +{ + struct pf_os_fingerprint *fp; + struct pf_osfp_entry *entry; + + while ((fp = SLIST_FIRST(&V_pf_osfp_list))) { + SLIST_REMOVE_HEAD(&V_pf_osfp_list, fp_next); + while ((entry = SLIST_FIRST(&fp->fp_oses))) { + SLIST_REMOVE_HEAD(&fp->fp_oses, fp_entry); + free(entry, M_PFOSFP); + } + free(fp, M_PFOSFP); + } +} + + +/* Add a fingerprint */ +int +pf_osfp_add(struct pf_osfp_ioctl *fpioc) +{ + struct pf_os_fingerprint *fp, fpadd; + struct pf_osfp_entry *entry; + + PF_RULES_WASSERT(); + + memset(&fpadd, 0, sizeof(fpadd)); + fpadd.fp_tcpopts = fpioc->fp_tcpopts; + fpadd.fp_wsize = fpioc->fp_wsize; + fpadd.fp_psize = fpioc->fp_psize; + fpadd.fp_mss = fpioc->fp_mss; + fpadd.fp_flags = fpioc->fp_flags; + fpadd.fp_optcnt = fpioc->fp_optcnt; + fpadd.fp_wscale = fpioc->fp_wscale; + fpadd.fp_ttl = fpioc->fp_ttl; + +#if 0 /* XXX RYAN wants to fix logging */ + DPFPRINTF("adding osfp %s %s %s = %s%d:%d:%d:%s%d:0x%llx %d " + "(TS=%s,M=%s%d,W=%s%d) %x\n", + fpioc->fp_os.fp_class_nm, fpioc->fp_os.fp_version_nm, + fpioc->fp_os.fp_subtype_nm, + (fpadd.fp_flags & PF_OSFP_WSIZE_MOD) ? "%" : + (fpadd.fp_flags & PF_OSFP_WSIZE_MSS) ? "S" : + (fpadd.fp_flags & PF_OSFP_WSIZE_MTU) ? "T" : + (fpadd.fp_flags & PF_OSFP_WSIZE_DC) ? "*" : "", + fpadd.fp_wsize, + fpadd.fp_ttl, + (fpadd.fp_flags & PF_OSFP_DF) ? 1 : 0, + (fpadd.fp_flags & PF_OSFP_PSIZE_MOD) ? "%" : + (fpadd.fp_flags & PF_OSFP_PSIZE_DC) ? "*" : "", + fpadd.fp_psize, + (long long int)fpadd.fp_tcpopts, fpadd.fp_optcnt, + (fpadd.fp_flags & PF_OSFP_TS0) ? "0" : "", + (fpadd.fp_flags & PF_OSFP_MSS_MOD) ? "%" : + (fpadd.fp_flags & PF_OSFP_MSS_DC) ? "*" : "", + fpadd.fp_mss, + (fpadd.fp_flags & PF_OSFP_WSCALE_MOD) ? "%" : + (fpadd.fp_flags & PF_OSFP_WSCALE_DC) ? 
"*" : "", + fpadd.fp_wscale, + fpioc->fp_os.fp_os); +#endif + + if ((fp = pf_osfp_find_exact(&V_pf_osfp_list, &fpadd))) { + SLIST_FOREACH(entry, &fp->fp_oses, fp_entry) { + if (PF_OSFP_ENTRY_EQ(entry, &fpioc->fp_os)) + return (EEXIST); + } + if ((entry = malloc(sizeof(*entry), M_PFOSFP, M_NOWAIT)) + == NULL) + return (ENOMEM); + } else { + if ((fp = malloc(sizeof(*fp), M_PFOSFP, M_ZERO | M_NOWAIT)) + == NULL) + return (ENOMEM); + fp->fp_tcpopts = fpioc->fp_tcpopts; + fp->fp_wsize = fpioc->fp_wsize; + fp->fp_psize = fpioc->fp_psize; + fp->fp_mss = fpioc->fp_mss; + fp->fp_flags = fpioc->fp_flags; + fp->fp_optcnt = fpioc->fp_optcnt; + fp->fp_wscale = fpioc->fp_wscale; + fp->fp_ttl = fpioc->fp_ttl; + SLIST_INIT(&fp->fp_oses); + if ((entry = malloc(sizeof(*entry), M_PFOSFP, M_NOWAIT)) + == NULL) { + free(fp, M_PFOSFP); + return (ENOMEM); + } + pf_osfp_insert(&V_pf_osfp_list, fp); + } + memcpy(entry, &fpioc->fp_os, sizeof(*entry)); + + /* Make sure the strings are NUL terminated */ + entry->fp_class_nm[sizeof(entry->fp_class_nm)-1] = '\0'; + entry->fp_version_nm[sizeof(entry->fp_version_nm)-1] = '\0'; + entry->fp_subtype_nm[sizeof(entry->fp_subtype_nm)-1] = '\0'; + + SLIST_INSERT_HEAD(&fp->fp_oses, entry, fp_entry); + +#ifdef PFDEBUG + if ((fp = pf_osfp_validate())) + printf("Invalid fingerprint list\n"); +#endif /* PFDEBUG */ + return (0); +} + + +/* Find a fingerprint in the list */ +static struct pf_os_fingerprint * +pf_osfp_find(struct pf_osfp_list *list, struct pf_os_fingerprint *find, + u_int8_t ttldiff) +{ + struct pf_os_fingerprint *f; + +#define MATCH_INT(_MOD, _DC, _field) \ + if ((f->fp_flags & _DC) == 0) { \ + if ((f->fp_flags & _MOD) == 0) { \ + if (f->_field != find->_field) \ + continue; \ + } else { \ + if (f->_field == 0 || find->_field % f->_field) \ + continue; \ + } \ + } + + SLIST_FOREACH(f, list, fp_next) { + if (f->fp_tcpopts != find->fp_tcpopts || + f->fp_optcnt != find->fp_optcnt || + f->fp_ttl < find->fp_ttl || + f->fp_ttl - find->fp_ttl > ttldiff || + (f->fp_flags & (PF_OSFP_DF|PF_OSFP_TS0)) != + (find->fp_flags & (PF_OSFP_DF|PF_OSFP_TS0))) + continue; + + MATCH_INT(PF_OSFP_PSIZE_MOD, PF_OSFP_PSIZE_DC, fp_psize) + MATCH_INT(PF_OSFP_MSS_MOD, PF_OSFP_MSS_DC, fp_mss) + MATCH_INT(PF_OSFP_WSCALE_MOD, PF_OSFP_WSCALE_DC, fp_wscale) + if ((f->fp_flags & PF_OSFP_WSIZE_DC) == 0) { + if (f->fp_flags & PF_OSFP_WSIZE_MSS) { + if (find->fp_mss == 0) + continue; + +/* + * Some "smart" NAT devices and DSL routers will tweak the MSS size and + * will set it to whatever is suitable for the link type. 
+ */ +#define SMART_MSS 1460 + if ((find->fp_wsize % find->fp_mss || + find->fp_wsize / find->fp_mss != + f->fp_wsize) && + (find->fp_wsize % SMART_MSS || + find->fp_wsize / SMART_MSS != + f->fp_wsize)) + continue; + } else if (f->fp_flags & PF_OSFP_WSIZE_MTU) { + if (find->fp_mss == 0) + continue; + +#define MTUOFF (sizeof(struct ip) + sizeof(struct tcphdr)) +#define SMART_MTU (SMART_MSS + MTUOFF) + if ((find->fp_wsize % (find->fp_mss + MTUOFF) || + find->fp_wsize / (find->fp_mss + MTUOFF) != + f->fp_wsize) && + (find->fp_wsize % SMART_MTU || + find->fp_wsize / SMART_MTU != + f->fp_wsize)) + continue; + } else if (f->fp_flags & PF_OSFP_WSIZE_MOD) { + if (f->fp_wsize == 0 || find->fp_wsize % + f->fp_wsize) + continue; + } else { + if (f->fp_wsize != find->fp_wsize) + continue; + } + } + return (f); + } + + return (NULL); +} + +/* Find an exact fingerprint in the list */ +static struct pf_os_fingerprint * +pf_osfp_find_exact(struct pf_osfp_list *list, struct pf_os_fingerprint *find) +{ + struct pf_os_fingerprint *f; + + SLIST_FOREACH(f, list, fp_next) { + if (f->fp_tcpopts == find->fp_tcpopts && + f->fp_wsize == find->fp_wsize && + f->fp_psize == find->fp_psize && + f->fp_mss == find->fp_mss && + f->fp_flags == find->fp_flags && + f->fp_optcnt == find->fp_optcnt && + f->fp_wscale == find->fp_wscale && + f->fp_ttl == find->fp_ttl) + return (f); + } + + return (NULL); +} + +/* Insert a fingerprint into the list */ +static void +pf_osfp_insert(struct pf_osfp_list *list, struct pf_os_fingerprint *ins) +{ + struct pf_os_fingerprint *f, *prev = NULL; + + /* XXX need to go semi tree based. can key on tcp options */ + + SLIST_FOREACH(f, list, fp_next) + prev = f; + if (prev) + SLIST_INSERT_AFTER(prev, ins, fp_next); + else + SLIST_INSERT_HEAD(list, ins, fp_next); +} + +/* Fill a fingerprint by its number (from an ioctl) */ +int +pf_osfp_get(struct pf_osfp_ioctl *fpioc) +{ + struct pf_os_fingerprint *fp; + struct pf_osfp_entry *entry; + int num = fpioc->fp_getnum; + int i = 0; + + + memset(fpioc, 0, sizeof(*fpioc)); + SLIST_FOREACH(fp, &V_pf_osfp_list, fp_next) { + SLIST_FOREACH(entry, &fp->fp_oses, fp_entry) { + if (i++ == num) { + fpioc->fp_mss = fp->fp_mss; + fpioc->fp_wsize = fp->fp_wsize; + fpioc->fp_flags = fp->fp_flags; + fpioc->fp_psize = fp->fp_psize; + fpioc->fp_ttl = fp->fp_ttl; + fpioc->fp_wscale = fp->fp_wscale; + fpioc->fp_getnum = num; + memcpy(&fpioc->fp_os, entry, + sizeof(fpioc->fp_os)); + return (0); + } + } + } + + return (EBUSY); +} + + +#ifdef PFDEBUG +/* Validate that each signature is reachable */ +static struct pf_os_fingerprint * +pf_osfp_validate(void) +{ + struct pf_os_fingerprint *f, *f2, find; + + SLIST_FOREACH(f, &V_pf_osfp_list, fp_next) { + memcpy(&find, f, sizeof(find)); + + /* We do a few MSS/th_win percolations to make things unique */ + if (find.fp_mss == 0) + find.fp_mss = 128; + if (f->fp_flags & PF_OSFP_WSIZE_MSS) + find.fp_wsize *= find.fp_mss; + else if (f->fp_flags & PF_OSFP_WSIZE_MTU) + find.fp_wsize *= (find.fp_mss + 40); + else if (f->fp_flags & PF_OSFP_WSIZE_MOD) + find.fp_wsize *= 2; + if (f != (f2 = pf_osfp_find(&V_pf_osfp_list, &find, 0))) { + if (f2) + printf("Found \"%s %s %s\" instead of " + "\"%s %s %s\"\n", + SLIST_FIRST(&f2->fp_oses)->fp_class_nm, + SLIST_FIRST(&f2->fp_oses)->fp_version_nm, + SLIST_FIRST(&f2->fp_oses)->fp_subtype_nm, + SLIST_FIRST(&f->fp_oses)->fp_class_nm, + SLIST_FIRST(&f->fp_oses)->fp_version_nm, + SLIST_FIRST(&f->fp_oses)->fp_subtype_nm); + else + printf("Couldn't find \"%s %s %s\"\n", + 
SLIST_FIRST(&f->fp_oses)->fp_class_nm, + SLIST_FIRST(&f->fp_oses)->fp_version_nm, + SLIST_FIRST(&f->fp_oses)->fp_subtype_nm); + return (f); + } + } + return (NULL); +} +#endif /* PFDEBUG */ diff --git a/freebsd/sys/netpfil/pf/pf_ruleset.c b/freebsd/sys/netpfil/pf/pf_ruleset.c new file mode 100644 index 00000000..e16643aa --- /dev/null +++ b/freebsd/sys/netpfil/pf/pf_ruleset.c @@ -0,0 +1,426 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002,2003 Henning Brauer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. + * + * $OpenBSD: pf_ruleset.c,v 1.2 2008/12/18 15:31:37 dhill Exp $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/sys/param.h> +#include <sys/socket.h> +#ifdef _KERNEL +# include <sys/systm.h> +# include <sys/refcount.h> +#endif /* _KERNEL */ +#include <sys/mbuf.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include <net/if.h> +#include <net/vnet.h> +#include <net/pfvar.h> + +#ifdef INET6 +#include <netinet/ip6.h> +#endif /* INET6 */ + + +#ifdef _KERNEL +#define DPFPRINTF(format, x...) \ + if (V_pf_status.debug >= PF_DEBUG_NOISY) \ + printf(format , ##x) +#define rs_malloc(x) malloc(x, M_TEMP, M_NOWAIT|M_ZERO) +#define rs_free(x) free(x, M_TEMP) + +#else +/* Userland equivalents so we can lend code to pfctl et al. */ + +#include <arpa/inet.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#define rs_malloc(x) calloc(1, x) +#define rs_free(x) free(x) + +#ifdef PFDEBUG +#include <sys/stdarg.h> +#define DPFPRINTF(format, x...) fprintf(stderr, format , ##x) +#else +#define DPFPRINTF(format, x...) ((void)0) +#endif /* PFDEBUG */ +#endif /* _KERNEL */ + +#ifdef _KERNEL +VNET_DEFINE(struct pf_anchor_global, pf_anchors); +VNET_DEFINE(struct pf_anchor, pf_main_anchor); +#else /* ! 
_KERNEL */ +struct pf_anchor_global pf_anchors; +struct pf_anchor pf_main_anchor; +#undef V_pf_anchors +#define V_pf_anchors pf_anchors +#undef pf_main_ruleset +#define pf_main_ruleset pf_main_anchor.ruleset +#endif /* _KERNEL */ + +static __inline int pf_anchor_compare(struct pf_anchor *, struct pf_anchor *); + +static struct pf_anchor *pf_find_anchor(const char *); + +RB_GENERATE(pf_anchor_global, pf_anchor, entry_global, pf_anchor_compare); +RB_GENERATE(pf_anchor_node, pf_anchor, entry_node, pf_anchor_compare); + +static __inline int +pf_anchor_compare(struct pf_anchor *a, struct pf_anchor *b) +{ + int c = strcmp(a->path, b->path); + + return (c ? (c < 0 ? -1 : 1) : 0); +} + +int +pf_get_ruleset_number(u_int8_t action) +{ + switch (action) { + case PF_SCRUB: + case PF_NOSCRUB: + return (PF_RULESET_SCRUB); + break; + case PF_PASS: + case PF_DROP: + return (PF_RULESET_FILTER); + break; + case PF_NAT: + case PF_NONAT: + return (PF_RULESET_NAT); + break; + case PF_BINAT: + case PF_NOBINAT: + return (PF_RULESET_BINAT); + break; + case PF_RDR: + case PF_NORDR: + return (PF_RULESET_RDR); + break; + default: + return (PF_RULESET_MAX); + break; + } +} + +void +pf_init_ruleset(struct pf_ruleset *ruleset) +{ + int i; + + memset(ruleset, 0, sizeof(struct pf_ruleset)); + for (i = 0; i < PF_RULESET_MAX; i++) { + TAILQ_INIT(&ruleset->rules[i].queues[0]); + TAILQ_INIT(&ruleset->rules[i].queues[1]); + ruleset->rules[i].active.ptr = &ruleset->rules[i].queues[0]; + ruleset->rules[i].inactive.ptr = &ruleset->rules[i].queues[1]; + } +} + +static struct pf_anchor * +pf_find_anchor(const char *path) +{ + struct pf_anchor *key, *found; + + key = (struct pf_anchor *)rs_malloc(sizeof(*key)); + if (key == NULL) + return (NULL); + strlcpy(key->path, path, sizeof(key->path)); + found = RB_FIND(pf_anchor_global, &V_pf_anchors, key); + rs_free(key); + return (found); +} + +struct pf_ruleset * +pf_find_ruleset(const char *path) +{ + struct pf_anchor *anchor; + + while (*path == '/') + path++; + if (!*path) + return (&pf_main_ruleset); + anchor = pf_find_anchor(path); + if (anchor == NULL) + return (NULL); + else + return (&anchor->ruleset); +} + +struct pf_ruleset * +pf_find_or_create_ruleset(const char *path) +{ + char *p, *q, *r; + struct pf_ruleset *ruleset; + struct pf_anchor *anchor = NULL, *dup, *parent = NULL; + + if (path[0] == 0) + return (&pf_main_ruleset); + while (*path == '/') + path++; + ruleset = pf_find_ruleset(path); + if (ruleset != NULL) + return (ruleset); + p = (char *)rs_malloc(MAXPATHLEN); + if (p == NULL) + return (NULL); + strlcpy(p, path, MAXPATHLEN); + while (parent == NULL && (q = strrchr(p, '/')) != NULL) { + *q = 0; + if ((ruleset = pf_find_ruleset(p)) != NULL) { + parent = ruleset->anchor; + break; + } + } + if (q == NULL) + q = p; + else + q++; + strlcpy(p, path, MAXPATHLEN); + if (!*q) { + rs_free(p); + return (NULL); + } + while ((r = strchr(q, '/')) != NULL || *q) { + if (r != NULL) + *r = 0; + if (!*q || strlen(q) >= PF_ANCHOR_NAME_SIZE || + (parent != NULL && strlen(parent->path) >= + MAXPATHLEN - PF_ANCHOR_NAME_SIZE - 1)) { + rs_free(p); + return (NULL); + } + anchor = (struct pf_anchor *)rs_malloc(sizeof(*anchor)); + if (anchor == NULL) { + rs_free(p); + return (NULL); + } + RB_INIT(&anchor->children); + strlcpy(anchor->name, q, sizeof(anchor->name)); + if (parent != NULL) { + strlcpy(anchor->path, parent->path, + sizeof(anchor->path)); + strlcat(anchor->path, "/", sizeof(anchor->path)); + } + strlcat(anchor->path, anchor->name, sizeof(anchor->path)); + if ((dup = 
RB_INSERT(pf_anchor_global, &V_pf_anchors, anchor)) != + NULL) { + printf("pf_find_or_create_ruleset: RB_INSERT1 " + "'%s' '%s' collides with '%s' '%s'\n", + anchor->path, anchor->name, dup->path, dup->name); + rs_free(anchor); + rs_free(p); + return (NULL); + } + if (parent != NULL) { + anchor->parent = parent; + if ((dup = RB_INSERT(pf_anchor_node, &parent->children, + anchor)) != NULL) { + printf("pf_find_or_create_ruleset: " + "RB_INSERT2 '%s' '%s' collides with " + "'%s' '%s'\n", anchor->path, anchor->name, + dup->path, dup->name); + RB_REMOVE(pf_anchor_global, &V_pf_anchors, + anchor); + rs_free(anchor); + rs_free(p); + return (NULL); + } + } + pf_init_ruleset(&anchor->ruleset); + anchor->ruleset.anchor = anchor; + parent = anchor; + if (r != NULL) + q = r + 1; + else + *q = 0; + } + rs_free(p); + return (&anchor->ruleset); +} + +void +pf_remove_if_empty_ruleset(struct pf_ruleset *ruleset) +{ + struct pf_anchor *parent; + int i; + + while (ruleset != NULL) { + if (ruleset == &pf_main_ruleset || ruleset->anchor == NULL || + !RB_EMPTY(&ruleset->anchor->children) || + ruleset->anchor->refcnt > 0 || ruleset->tables > 0 || + ruleset->topen) + return; + for (i = 0; i < PF_RULESET_MAX; ++i) + if (!TAILQ_EMPTY(ruleset->rules[i].active.ptr) || + !TAILQ_EMPTY(ruleset->rules[i].inactive.ptr) || + ruleset->rules[i].inactive.open) + return; + RB_REMOVE(pf_anchor_global, &V_pf_anchors, ruleset->anchor); + if ((parent = ruleset->anchor->parent) != NULL) + RB_REMOVE(pf_anchor_node, &parent->children, + ruleset->anchor); + rs_free(ruleset->anchor); + if (parent == NULL) + return; + ruleset = &parent->ruleset; + } +} + +int +pf_anchor_setup(struct pf_rule *r, const struct pf_ruleset *s, + const char *name) +{ + char *p, *path; + struct pf_ruleset *ruleset; + + r->anchor = NULL; + r->anchor_relative = 0; + r->anchor_wildcard = 0; + if (!name[0]) + return (0); + path = (char *)rs_malloc(MAXPATHLEN); + if (path == NULL) + return (1); + if (name[0] == '/') + strlcpy(path, name + 1, MAXPATHLEN); + else { + /* relative path */ + r->anchor_relative = 1; + if (s->anchor == NULL || !s->anchor->path[0]) + path[0] = 0; + else + strlcpy(path, s->anchor->path, MAXPATHLEN); + while (name[0] == '.' && name[1] == '.' && name[2] == '/') { + if (!path[0]) { + printf("pf_anchor_setup: .. 
beyond root\n"); + rs_free(path); + return (1); + } + if ((p = strrchr(path, '/')) != NULL) + *p = 0; + else + path[0] = 0; + r->anchor_relative++; + name += 3; + } + if (path[0]) + strlcat(path, "/", MAXPATHLEN); + strlcat(path, name, MAXPATHLEN); + } + if ((p = strrchr(path, '/')) != NULL && !strcmp(p, "/*")) { + r->anchor_wildcard = 1; + *p = 0; + } + ruleset = pf_find_or_create_ruleset(path); + rs_free(path); + if (ruleset == NULL || ruleset->anchor == NULL) { + printf("pf_anchor_setup: ruleset\n"); + return (1); + } + r->anchor = ruleset->anchor; + r->anchor->refcnt++; + return (0); +} + +int +pf_anchor_copyout(const struct pf_ruleset *rs, const struct pf_rule *r, + struct pfioc_rule *pr) +{ + pr->anchor_call[0] = 0; + if (r->anchor == NULL) + return (0); + if (!r->anchor_relative) { + strlcpy(pr->anchor_call, "/", sizeof(pr->anchor_call)); + strlcat(pr->anchor_call, r->anchor->path, + sizeof(pr->anchor_call)); + } else { + char *a, *p; + int i; + + a = (char *)rs_malloc(MAXPATHLEN); + if (a == NULL) + return (1); + if (rs->anchor == NULL) + a[0] = 0; + else + strlcpy(a, rs->anchor->path, MAXPATHLEN); + for (i = 1; i < r->anchor_relative; ++i) { + if ((p = strrchr(a, '/')) == NULL) + p = a; + *p = 0; + strlcat(pr->anchor_call, "../", + sizeof(pr->anchor_call)); + } + if (strncmp(a, r->anchor->path, strlen(a))) { + printf("pf_anchor_copyout: '%s' '%s'\n", a, + r->anchor->path); + rs_free(a); + return (1); + } + if (strlen(r->anchor->path) > strlen(a)) + strlcat(pr->anchor_call, r->anchor->path + (a[0] ? + strlen(a) + 1 : 0), sizeof(pr->anchor_call)); + rs_free(a); + } + if (r->anchor_wildcard) + strlcat(pr->anchor_call, pr->anchor_call[0] ? "/*" : "*", + sizeof(pr->anchor_call)); + return (0); +} + +void +pf_anchor_remove(struct pf_rule *r) +{ + if (r->anchor == NULL) + return; + if (r->anchor->refcnt <= 0) { + printf("pf_anchor_remove: broken refcount\n"); + r->anchor = NULL; + return; + } + if (!--r->anchor->refcnt) + pf_remove_if_empty_ruleset(&r->anchor->ruleset); + r->anchor = NULL; +} diff --git a/freebsd/sys/netpfil/pf/pf_table.c b/freebsd/sys/netpfil/pf/pf_table.c new file mode 100644 index 00000000..26b6f4e9 --- /dev/null +++ b/freebsd/sys/netpfil/pf/pf_table.c @@ -0,0 +1,2195 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 2002 Cedric Berger + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $OpenBSD: pf_table.c,v 1.79 2008/10/08 06:24:50 mcbride Exp $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/refcount.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <vm/uma.h> + +#include <net/if.h> +#include <net/vnet.h> +#include <net/pfvar.h> + +#define ACCEPT_FLAGS(flags, oklist) \ + do { \ + if ((flags & ~(oklist)) & \ + PFR_FLAG_ALLMASK) \ + return (EINVAL); \ + } while (0) + +#define FILLIN_SIN(sin, addr) \ + do { \ + (sin).sin_len = sizeof(sin); \ + (sin).sin_family = AF_INET; \ + (sin).sin_addr = (addr); \ + } while (0) + +#define FILLIN_SIN6(sin6, addr) \ + do { \ + (sin6).sin6_len = sizeof(sin6); \ + (sin6).sin6_family = AF_INET6; \ + (sin6).sin6_addr = (addr); \ + } while (0) + +#define SWAP(type, a1, a2) \ + do { \ + type tmp = a1; \ + a1 = a2; \ + a2 = tmp; \ + } while (0) + +#define SUNION2PF(su, af) (((af)==AF_INET) ? \ + (struct pf_addr *)&(su)->sin.sin_addr : \ + (struct pf_addr *)&(su)->sin6.sin6_addr) + +#define AF_BITS(af) (((af)==AF_INET)?32:128) +#define ADDR_NETWORK(ad) ((ad)->pfra_net < AF_BITS((ad)->pfra_af)) +#define KENTRY_NETWORK(ke) ((ke)->pfrke_net < AF_BITS((ke)->pfrke_af)) +#define KENTRY_RNF_ROOT(ke) \ + ((((struct radix_node *)(ke))->rn_flags & RNF_ROOT) != 0) + +#define NO_ADDRESSES (-1) +#define ENQUEUE_UNMARKED_ONLY (1) +#define INVERT_NEG_FLAG (1) + +struct pfr_walktree { + enum pfrw_op { + PFRW_MARK, + PFRW_SWEEP, + PFRW_ENQUEUE, + PFRW_GET_ADDRS, + PFRW_GET_ASTATS, + PFRW_POOL_GET, + PFRW_DYNADDR_UPDATE + } pfrw_op; + union { + struct pfr_addr *pfrw1_addr; + struct pfr_astats *pfrw1_astats; + struct pfr_kentryworkq *pfrw1_workq; + struct pfr_kentry *pfrw1_kentry; + struct pfi_dynaddr *pfrw1_dyn; + } pfrw_1; + int pfrw_free; +}; +#define pfrw_addr pfrw_1.pfrw1_addr +#define pfrw_astats pfrw_1.pfrw1_astats +#define pfrw_workq pfrw_1.pfrw1_workq +#define pfrw_kentry pfrw_1.pfrw1_kentry +#define pfrw_dyn pfrw_1.pfrw1_dyn +#define pfrw_cnt pfrw_free + +#define senderr(e) do { rv = (e); goto _bad; } while (0) + +static MALLOC_DEFINE(M_PFTABLE, "pf_table", "pf(4) tables structures"); +static VNET_DEFINE(uma_zone_t, pfr_kentry_z); +#define V_pfr_kentry_z VNET(pfr_kentry_z) +static VNET_DEFINE(uma_zone_t, pfr_kcounters_z); +#define V_pfr_kcounters_z VNET(pfr_kcounters_z) + +static struct pf_addr pfr_ffaddr = { + .addr32 = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff } +}; + +static void pfr_copyout_addr(struct pfr_addr *, + struct pfr_kentry *ke); +static int pfr_validate_addr(struct pfr_addr *); +static void pfr_enqueue_addrs(struct pfr_ktable *, + struct pfr_kentryworkq *, int *, int); +static void pfr_mark_addrs(struct pfr_ktable *); +static struct pfr_kentry + *pfr_lookup_addr(struct pfr_ktable *, + struct pfr_addr *, int); 
+static struct pfr_kentry *pfr_create_kentry(struct pfr_addr *); +static void pfr_destroy_kentries(struct pfr_kentryworkq *); +static void pfr_destroy_kentry(struct pfr_kentry *); +static void pfr_insert_kentries(struct pfr_ktable *, + struct pfr_kentryworkq *, long); +static void pfr_remove_kentries(struct pfr_ktable *, + struct pfr_kentryworkq *); +static void pfr_clstats_kentries(struct pfr_kentryworkq *, long, + int); +static void pfr_reset_feedback(struct pfr_addr *, int); +static void pfr_prepare_network(union sockaddr_union *, int, int); +static int pfr_route_kentry(struct pfr_ktable *, + struct pfr_kentry *); +static int pfr_unroute_kentry(struct pfr_ktable *, + struct pfr_kentry *); +static int pfr_walktree(struct radix_node *, void *); +static int pfr_validate_table(struct pfr_table *, int, int); +static int pfr_fix_anchor(char *); +static void pfr_commit_ktable(struct pfr_ktable *, long); +static void pfr_insert_ktables(struct pfr_ktableworkq *); +static void pfr_insert_ktable(struct pfr_ktable *); +static void pfr_setflags_ktables(struct pfr_ktableworkq *); +static void pfr_setflags_ktable(struct pfr_ktable *, int); +static void pfr_clstats_ktables(struct pfr_ktableworkq *, long, + int); +static void pfr_clstats_ktable(struct pfr_ktable *, long, int); +static struct pfr_ktable + *pfr_create_ktable(struct pfr_table *, long, int); +static void pfr_destroy_ktables(struct pfr_ktableworkq *, int); +static void pfr_destroy_ktable(struct pfr_ktable *, int); +static int pfr_ktable_compare(struct pfr_ktable *, + struct pfr_ktable *); +static struct pfr_ktable + *pfr_lookup_table(struct pfr_table *); +static void pfr_clean_node_mask(struct pfr_ktable *, + struct pfr_kentryworkq *); +static int pfr_table_count(struct pfr_table *, int); +static int pfr_skip_table(struct pfr_table *, + struct pfr_ktable *, int); +static struct pfr_kentry + *pfr_kentry_byidx(struct pfr_ktable *, int, int); + +static RB_PROTOTYPE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare); +static RB_GENERATE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare); + +struct pfr_ktablehead pfr_ktables; +struct pfr_table pfr_nulltable; +int pfr_ktable_cnt; + +void +pfr_initialize(void) +{ + + V_pfr_kentry_z = uma_zcreate("pf table entries", + sizeof(struct pfr_kentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, + 0); + V_pfr_kcounters_z = uma_zcreate("pf table counters", + sizeof(struct pfr_kcounters), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + V_pf_limits[PF_LIMIT_TABLE_ENTRIES].zone = V_pfr_kentry_z; + V_pf_limits[PF_LIMIT_TABLE_ENTRIES].limit = PFR_KENTRY_HIWAT; +} + +void +pfr_cleanup(void) +{ + + uma_zdestroy(V_pfr_kentry_z); + uma_zdestroy(V_pfr_kcounters_z); +} + +int +pfr_clr_addrs(struct pfr_table *tbl, int *ndel, int flags) +{ + struct pfr_ktable *kt; + struct pfr_kentryworkq workq; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_flags & PFR_TFLAG_CONST) + return (EPERM); + pfr_enqueue_addrs(kt, &workq, ndel, 0); + + if (!(flags & PFR_FLAG_DUMMY)) { + pfr_remove_kentries(kt, &workq); + KASSERT(kt->pfrkt_cnt == 0, ("%s: non-null pfrkt_cnt", __func__)); + } + return (0); +} + +int +pfr_add_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *nadd, int flags) +{ + struct pfr_ktable *kt, *tmpkt; + struct pfr_kentryworkq workq; + struct pfr_kentry *p, *q; + struct 
pfr_addr *ad;
+ int i, rv, xadd = 0;
+ long tzero = time_second;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK);
+ if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_flags & PFR_TFLAG_CONST)
+ return (EPERM);
+ tmpkt = pfr_create_ktable(&pfr_nulltable, 0, 0);
+ if (tmpkt == NULL)
+ return (ENOMEM);
+ SLIST_INIT(&workq);
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ senderr(EINVAL);
+ p = pfr_lookup_addr(kt, ad, 1);
+ q = pfr_lookup_addr(tmpkt, ad, 1);
+ if (flags & PFR_FLAG_FEEDBACK) {
+ if (q != NULL)
+ ad->pfra_fback = PFR_FB_DUPLICATE;
+ else if (p == NULL)
+ ad->pfra_fback = PFR_FB_ADDED;
+ else if (p->pfrke_not != ad->pfra_not)
+ ad->pfra_fback = PFR_FB_CONFLICT;
+ else
+ ad->pfra_fback = PFR_FB_NONE;
+ }
+ if (p == NULL && q == NULL) {
+ p = pfr_create_kentry(ad);
+ if (p == NULL)
+ senderr(ENOMEM);
+ if (pfr_route_kentry(tmpkt, p)) {
+ pfr_destroy_kentry(p);
+ ad->pfra_fback = PFR_FB_NONE;
+ } else {
+ SLIST_INSERT_HEAD(&workq, p, pfrke_workq);
+ xadd++;
+ }
+ }
+ }
+ pfr_clean_node_mask(tmpkt, &workq);
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_insert_kentries(kt, &workq, tzero);
+ else
+ pfr_destroy_kentries(&workq);
+ if (nadd != NULL)
+ *nadd = xadd;
+ pfr_destroy_ktable(tmpkt, 0);
+ return (0);
+_bad:
+ pfr_clean_node_mask(tmpkt, &workq);
+ pfr_destroy_kentries(&workq);
+ if (flags & PFR_FLAG_FEEDBACK)
+ pfr_reset_feedback(addr, size);
+ pfr_destroy_ktable(tmpkt, 0);
+ return (rv);
+}
+
+int
+pfr_del_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *ndel, int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_kentryworkq workq;
+ struct pfr_kentry *p;
+ struct pfr_addr *ad;
+ int i, rv, xdel = 0, log = 1;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK);
+ if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_flags & PFR_TFLAG_CONST)
+ return (EPERM);
+ /*
+ * There are two algorithms to choose from here.
+ * With:
+ * n: number of addresses to delete
+ * N: number of addresses in the table
+ *
+ * One is O(N) and is better for large 'n'
+ * One is O(n*LOG(N)) and is better for small 'n'
+ *
+ * The following code tries to decide which one is best. 
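+ *
+ * (Editor's worked example, illustrative numbers only: the loop below
+ * sets log to one plus the bit length of N, so N = 65536 gives
+ * log = 18 and a threshold of N/log = 3640. A request deleting more
+ * than 3640 addresses takes the O(N) full-table mark; a smaller one
+ * does per-address O(LOG(N)) lookups instead.)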
+ */ + for (i = kt->pfrkt_cnt; i > 0; i >>= 1) + log++; + if (size > kt->pfrkt_cnt/log) { + /* full table scan */ + pfr_mark_addrs(kt); + } else { + /* iterate over addresses to delete */ + for (i = 0, ad = addr; i < size; i++, ad++) { + if (pfr_validate_addr(ad)) + return (EINVAL); + p = pfr_lookup_addr(kt, ad, 1); + if (p != NULL) + p->pfrke_mark = 0; + } + } + SLIST_INIT(&workq); + for (i = 0, ad = addr; i < size; i++, ad++) { + if (pfr_validate_addr(ad)) + senderr(EINVAL); + p = pfr_lookup_addr(kt, ad, 1); + if (flags & PFR_FLAG_FEEDBACK) { + if (p == NULL) + ad->pfra_fback = PFR_FB_NONE; + else if (p->pfrke_not != ad->pfra_not) + ad->pfra_fback = PFR_FB_CONFLICT; + else if (p->pfrke_mark) + ad->pfra_fback = PFR_FB_DUPLICATE; + else + ad->pfra_fback = PFR_FB_DELETED; + } + if (p != NULL && p->pfrke_not == ad->pfra_not && + !p->pfrke_mark) { + p->pfrke_mark = 1; + SLIST_INSERT_HEAD(&workq, p, pfrke_workq); + xdel++; + } + } + if (!(flags & PFR_FLAG_DUMMY)) + pfr_remove_kentries(kt, &workq); + if (ndel != NULL) + *ndel = xdel; + return (0); +_bad: + if (flags & PFR_FLAG_FEEDBACK) + pfr_reset_feedback(addr, size); + return (rv); +} + +int +pfr_set_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *size2, int *nadd, int *ndel, int *nchange, int flags, + u_int32_t ignore_pfrt_flags) +{ + struct pfr_ktable *kt, *tmpkt; + struct pfr_kentryworkq addq, delq, changeq; + struct pfr_kentry *p, *q; + struct pfr_addr ad; + int i, rv, xadd = 0, xdel = 0, xchange = 0; + long tzero = time_second; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK); + if (pfr_validate_table(tbl, ignore_pfrt_flags, flags & + PFR_FLAG_USERIOCTL)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_flags & PFR_TFLAG_CONST) + return (EPERM); + tmpkt = pfr_create_ktable(&pfr_nulltable, 0, 0); + if (tmpkt == NULL) + return (ENOMEM); + pfr_mark_addrs(kt); + SLIST_INIT(&addq); + SLIST_INIT(&delq); + SLIST_INIT(&changeq); + for (i = 0; i < size; i++) { + /* + * XXXGL: undertand pf_if usage of this function + * and make ad a moving pointer + */ + bcopy(addr + i, &ad, sizeof(ad)); + if (pfr_validate_addr(&ad)) + senderr(EINVAL); + ad.pfra_fback = PFR_FB_NONE; + p = pfr_lookup_addr(kt, &ad, 1); + if (p != NULL) { + if (p->pfrke_mark) { + ad.pfra_fback = PFR_FB_DUPLICATE; + goto _skip; + } + p->pfrke_mark = 1; + if (p->pfrke_not != ad.pfra_not) { + SLIST_INSERT_HEAD(&changeq, p, pfrke_workq); + ad.pfra_fback = PFR_FB_CHANGED; + xchange++; + } + } else { + q = pfr_lookup_addr(tmpkt, &ad, 1); + if (q != NULL) { + ad.pfra_fback = PFR_FB_DUPLICATE; + goto _skip; + } + p = pfr_create_kentry(&ad); + if (p == NULL) + senderr(ENOMEM); + if (pfr_route_kentry(tmpkt, p)) { + pfr_destroy_kentry(p); + ad.pfra_fback = PFR_FB_NONE; + } else { + SLIST_INSERT_HEAD(&addq, p, pfrke_workq); + ad.pfra_fback = PFR_FB_ADDED; + xadd++; + } + } +_skip: + if (flags & PFR_FLAG_FEEDBACK) + bcopy(&ad, addr + i, sizeof(ad)); + } + pfr_enqueue_addrs(kt, &delq, &xdel, ENQUEUE_UNMARKED_ONLY); + if ((flags & PFR_FLAG_FEEDBACK) && *size2) { + if (*size2 < size+xdel) { + *size2 = size+xdel; + senderr(0); + } + i = 0; + SLIST_FOREACH(p, &delq, pfrke_workq) { + pfr_copyout_addr(&ad, p); + ad.pfra_fback = PFR_FB_DELETED; + bcopy(&ad, addr + size + i, sizeof(ad)); + i++; + } + } + pfr_clean_node_mask(tmpkt, &addq); + if (!(flags & PFR_FLAG_DUMMY)) { + pfr_insert_kentries(kt, &addq, tzero); + pfr_remove_kentries(kt, &delq); + 
pfr_clstats_kentries(&changeq, tzero, INVERT_NEG_FLAG); + } else + pfr_destroy_kentries(&addq); + if (nadd != NULL) + *nadd = xadd; + if (ndel != NULL) + *ndel = xdel; + if (nchange != NULL) + *nchange = xchange; + if ((flags & PFR_FLAG_FEEDBACK) && size2) + *size2 = size+xdel; + pfr_destroy_ktable(tmpkt, 0); + return (0); +_bad: + pfr_clean_node_mask(tmpkt, &addq); + pfr_destroy_kentries(&addq); + if (flags & PFR_FLAG_FEEDBACK) + pfr_reset_feedback(addr, size); + pfr_destroy_ktable(tmpkt, 0); + return (rv); +} + +int +pfr_tst_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *nmatch, int flags) +{ + struct pfr_ktable *kt; + struct pfr_kentry *p; + struct pfr_addr *ad; + int i, xmatch = 0; + + PF_RULES_RASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_REPLACE); + if (pfr_validate_table(tbl, 0, 0)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + + for (i = 0, ad = addr; i < size; i++, ad++) { + if (pfr_validate_addr(ad)) + return (EINVAL); + if (ADDR_NETWORK(ad)) + return (EINVAL); + p = pfr_lookup_addr(kt, ad, 0); + if (flags & PFR_FLAG_REPLACE) + pfr_copyout_addr(ad, p); + ad->pfra_fback = (p == NULL) ? PFR_FB_NONE : + (p->pfrke_not ? PFR_FB_NOTMATCH : PFR_FB_MATCH); + if (p != NULL && !p->pfrke_not) + xmatch++; + } + if (nmatch != NULL) + *nmatch = xmatch; + return (0); +} + +int +pfr_get_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int *size, + int flags) +{ + struct pfr_ktable *kt; + struct pfr_walktree w; + int rv; + + PF_RULES_RASSERT(); + + ACCEPT_FLAGS(flags, 0); + if (pfr_validate_table(tbl, 0, 0)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_cnt > *size) { + *size = kt->pfrkt_cnt; + return (0); + } + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_GET_ADDRS; + w.pfrw_addr = addr; + w.pfrw_free = kt->pfrkt_cnt; + rv = kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh, pfr_walktree, &w); + if (!rv) + rv = kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh, + pfr_walktree, &w); + if (rv) + return (rv); + + KASSERT(w.pfrw_free == 0, ("%s: corruption detected (%d)", __func__, + w.pfrw_free)); + + *size = kt->pfrkt_cnt; + return (0); +} + +int +pfr_get_astats(struct pfr_table *tbl, struct pfr_astats *addr, int *size, + int flags) +{ + struct pfr_ktable *kt; + struct pfr_walktree w; + struct pfr_kentryworkq workq; + int rv; + long tzero = time_second; + + PF_RULES_RASSERT(); + + /* XXX PFR_FLAG_CLSTATS disabled */ + ACCEPT_FLAGS(flags, 0); + if (pfr_validate_table(tbl, 0, 0)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_cnt > *size) { + *size = kt->pfrkt_cnt; + return (0); + } + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_GET_ASTATS; + w.pfrw_astats = addr; + w.pfrw_free = kt->pfrkt_cnt; + rv = kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh, pfr_walktree, &w); + if (!rv) + rv = kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh, + pfr_walktree, &w); + if (!rv && (flags & PFR_FLAG_CLSTATS)) { + pfr_enqueue_addrs(kt, &workq, NULL, 0); + pfr_clstats_kentries(&workq, tzero, 0); + } + if (rv) + return (rv); + + if (w.pfrw_free) { + printf("pfr_get_astats: corruption detected (%d).\n", + w.pfrw_free); + return (ENOTTY); + } + *size = kt->pfrkt_cnt; + return (0); +} + +int +pfr_clr_astats(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *nzero, int flags) +{ + struct pfr_ktable *kt; + struct pfr_kentryworkq 
workq; + struct pfr_kentry *p; + struct pfr_addr *ad; + int i, rv, xzero = 0; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK); + if (pfr_validate_table(tbl, 0, 0)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + SLIST_INIT(&workq); + for (i = 0, ad = addr; i < size; i++, ad++) { + if (pfr_validate_addr(ad)) + senderr(EINVAL); + p = pfr_lookup_addr(kt, ad, 1); + if (flags & PFR_FLAG_FEEDBACK) { + ad->pfra_fback = (p != NULL) ? + PFR_FB_CLEARED : PFR_FB_NONE; + } + if (p != NULL) { + SLIST_INSERT_HEAD(&workq, p, pfrke_workq); + xzero++; + } + } + + if (!(flags & PFR_FLAG_DUMMY)) + pfr_clstats_kentries(&workq, 0, 0); + if (nzero != NULL) + *nzero = xzero; + return (0); +_bad: + if (flags & PFR_FLAG_FEEDBACK) + pfr_reset_feedback(addr, size); + return (rv); +} + +static int +pfr_validate_addr(struct pfr_addr *ad) +{ + int i; + + switch (ad->pfra_af) { +#ifdef INET + case AF_INET: + if (ad->pfra_net > 32) + return (-1); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (ad->pfra_net > 128) + return (-1); + break; +#endif /* INET6 */ + default: + return (-1); + } + if (ad->pfra_net < 128 && + (((caddr_t)ad)[ad->pfra_net/8] & (0xFF >> (ad->pfra_net%8)))) + return (-1); + for (i = (ad->pfra_net+7)/8; i < sizeof(ad->pfra_u); i++) + if (((caddr_t)ad)[i]) + return (-1); + if (ad->pfra_not && ad->pfra_not != 1) + return (-1); + if (ad->pfra_fback) + return (-1); + return (0); +} + +static void +pfr_enqueue_addrs(struct pfr_ktable *kt, struct pfr_kentryworkq *workq, + int *naddr, int sweep) +{ + struct pfr_walktree w; + + SLIST_INIT(workq); + bzero(&w, sizeof(w)); + w.pfrw_op = sweep ? PFRW_SWEEP : PFRW_ENQUEUE; + w.pfrw_workq = workq; + if (kt->pfrkt_ip4 != NULL) + if (kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh, + pfr_walktree, &w)) + printf("pfr_enqueue_addrs: IPv4 walktree failed.\n"); + if (kt->pfrkt_ip6 != NULL) + if (kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh, + pfr_walktree, &w)) + printf("pfr_enqueue_addrs: IPv6 walktree failed.\n"); + if (naddr != NULL) + *naddr = w.pfrw_cnt; +} + +static void +pfr_mark_addrs(struct pfr_ktable *kt) +{ + struct pfr_walktree w; + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_MARK; + if (kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh, pfr_walktree, &w)) + printf("pfr_mark_addrs: IPv4 walktree failed.\n"); + if (kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh, pfr_walktree, &w)) + printf("pfr_mark_addrs: IPv6 walktree failed.\n"); +} + + +static struct pfr_kentry * +pfr_lookup_addr(struct pfr_ktable *kt, struct pfr_addr *ad, int exact) +{ + union sockaddr_union sa, mask; + struct radix_head *head = NULL; + struct pfr_kentry *ke; + + PF_RULES_ASSERT(); + + bzero(&sa, sizeof(sa)); + if (ad->pfra_af == AF_INET) { + FILLIN_SIN(sa.sin, ad->pfra_ip4addr); + head = &kt->pfrkt_ip4->rh; + } else if ( ad->pfra_af == AF_INET6 ) { + FILLIN_SIN6(sa.sin6, ad->pfra_ip6addr); + head = &kt->pfrkt_ip6->rh; + } + if (ADDR_NETWORK(ad)) { + pfr_prepare_network(&mask, ad->pfra_af, ad->pfra_net); + ke = (struct pfr_kentry *)rn_lookup(&sa, &mask, head); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + } else { + ke = (struct pfr_kentry *)rn_match(&sa, head); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + if (exact && ke && KENTRY_NETWORK(ke)) + ke = NULL; + } + return (ke); +} + +static struct pfr_kentry * +pfr_create_kentry(struct pfr_addr *ad) +{ + struct pfr_kentry *ke; + + ke = uma_zalloc(V_pfr_kentry_z, M_NOWAIT | M_ZERO); + if (ke == NULL) + return 
(NULL); + + if (ad->pfra_af == AF_INET) + FILLIN_SIN(ke->pfrke_sa.sin, ad->pfra_ip4addr); + else if (ad->pfra_af == AF_INET6) + FILLIN_SIN6(ke->pfrke_sa.sin6, ad->pfra_ip6addr); + ke->pfrke_af = ad->pfra_af; + ke->pfrke_net = ad->pfra_net; + ke->pfrke_not = ad->pfra_not; + return (ke); +} + +static void +pfr_destroy_kentries(struct pfr_kentryworkq *workq) +{ + struct pfr_kentry *p, *q; + + for (p = SLIST_FIRST(workq); p != NULL; p = q) { + q = SLIST_NEXT(p, pfrke_workq); + pfr_destroy_kentry(p); + } +} + +static void +pfr_destroy_kentry(struct pfr_kentry *ke) +{ + if (ke->pfrke_counters) + uma_zfree(V_pfr_kcounters_z, ke->pfrke_counters); + uma_zfree(V_pfr_kentry_z, ke); +} + +static void +pfr_insert_kentries(struct pfr_ktable *kt, + struct pfr_kentryworkq *workq, long tzero) +{ + struct pfr_kentry *p; + int rv, n = 0; + + SLIST_FOREACH(p, workq, pfrke_workq) { + rv = pfr_route_kentry(kt, p); + if (rv) { + printf("pfr_insert_kentries: cannot route entry " + "(code=%d).\n", rv); + break; + } + p->pfrke_tzero = tzero; + n++; + } + kt->pfrkt_cnt += n; +} + +int +pfr_insert_kentry(struct pfr_ktable *kt, struct pfr_addr *ad, long tzero) +{ + struct pfr_kentry *p; + int rv; + + p = pfr_lookup_addr(kt, ad, 1); + if (p != NULL) + return (0); + p = pfr_create_kentry(ad); + if (p == NULL) + return (ENOMEM); + + rv = pfr_route_kentry(kt, p); + if (rv) + return (rv); + + p->pfrke_tzero = tzero; + kt->pfrkt_cnt++; + + return (0); +} + +static void +pfr_remove_kentries(struct pfr_ktable *kt, + struct pfr_kentryworkq *workq) +{ + struct pfr_kentry *p; + int n = 0; + + SLIST_FOREACH(p, workq, pfrke_workq) { + pfr_unroute_kentry(kt, p); + n++; + } + kt->pfrkt_cnt -= n; + pfr_destroy_kentries(workq); +} + +static void +pfr_clean_node_mask(struct pfr_ktable *kt, + struct pfr_kentryworkq *workq) +{ + struct pfr_kentry *p; + + SLIST_FOREACH(p, workq, pfrke_workq) + pfr_unroute_kentry(kt, p); +} + +static void +pfr_clstats_kentries(struct pfr_kentryworkq *workq, long tzero, int negchange) +{ + struct pfr_kentry *p; + + SLIST_FOREACH(p, workq, pfrke_workq) { + if (negchange) + p->pfrke_not = !p->pfrke_not; + if (p->pfrke_counters) { + uma_zfree(V_pfr_kcounters_z, p->pfrke_counters); + p->pfrke_counters = NULL; + } + p->pfrke_tzero = tzero; + } +} + +static void +pfr_reset_feedback(struct pfr_addr *addr, int size) +{ + struct pfr_addr *ad; + int i; + + for (i = 0, ad = addr; i < size; i++, ad++) + ad->pfra_fback = PFR_FB_NONE; +} + +static void +pfr_prepare_network(union sockaddr_union *sa, int af, int net) +{ + int i; + + bzero(sa, sizeof(*sa)); + if (af == AF_INET) { + sa->sin.sin_len = sizeof(sa->sin); + sa->sin.sin_family = AF_INET; + sa->sin.sin_addr.s_addr = net ? htonl(-1 << (32-net)) : 0; + } else if (af == AF_INET6) { + sa->sin6.sin6_len = sizeof(sa->sin6); + sa->sin6.sin6_family = AF_INET6; + for (i = 0; i < 4; i++) { + if (net <= 32) { + sa->sin6.sin6_addr.s6_addr32[i] = + net ? 
htonl(-1 << (32-net)) : 0; + break; + } + sa->sin6.sin6_addr.s6_addr32[i] = 0xFFFFFFFF; + net -= 32; + } + } +} + +static int +pfr_route_kentry(struct pfr_ktable *kt, struct pfr_kentry *ke) +{ + union sockaddr_union mask; + struct radix_node *rn; + struct radix_head *head = NULL; + + PF_RULES_WASSERT(); + + bzero(ke->pfrke_node, sizeof(ke->pfrke_node)); + if (ke->pfrke_af == AF_INET) + head = &kt->pfrkt_ip4->rh; + else if (ke->pfrke_af == AF_INET6) + head = &kt->pfrkt_ip6->rh; + + if (KENTRY_NETWORK(ke)) { + pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net); + rn = rn_addroute(&ke->pfrke_sa, &mask, head, ke->pfrke_node); + } else + rn = rn_addroute(&ke->pfrke_sa, NULL, head, ke->pfrke_node); + + return (rn == NULL ? -1 : 0); +} + +static int +pfr_unroute_kentry(struct pfr_ktable *kt, struct pfr_kentry *ke) +{ + union sockaddr_union mask; + struct radix_node *rn; + struct radix_head *head = NULL; + + if (ke->pfrke_af == AF_INET) + head = &kt->pfrkt_ip4->rh; + else if (ke->pfrke_af == AF_INET6) + head = &kt->pfrkt_ip6->rh; + + if (KENTRY_NETWORK(ke)) { + pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net); + rn = rn_delete(&ke->pfrke_sa, &mask, head); + } else + rn = rn_delete(&ke->pfrke_sa, NULL, head); + + if (rn == NULL) { + printf("pfr_unroute_kentry: delete failed.\n"); + return (-1); + } + return (0); +} + +static void +pfr_copyout_addr(struct pfr_addr *ad, struct pfr_kentry *ke) +{ + bzero(ad, sizeof(*ad)); + if (ke == NULL) + return; + ad->pfra_af = ke->pfrke_af; + ad->pfra_net = ke->pfrke_net; + ad->pfra_not = ke->pfrke_not; + if (ad->pfra_af == AF_INET) + ad->pfra_ip4addr = ke->pfrke_sa.sin.sin_addr; + else if (ad->pfra_af == AF_INET6) + ad->pfra_ip6addr = ke->pfrke_sa.sin6.sin6_addr; +} + +static int +pfr_walktree(struct radix_node *rn, void *arg) +{ + struct pfr_kentry *ke = (struct pfr_kentry *)rn; + struct pfr_walktree *w = arg; + + switch (w->pfrw_op) { + case PFRW_MARK: + ke->pfrke_mark = 0; + break; + case PFRW_SWEEP: + if (ke->pfrke_mark) + break; + /* FALLTHROUGH */ + case PFRW_ENQUEUE: + SLIST_INSERT_HEAD(w->pfrw_workq, ke, pfrke_workq); + w->pfrw_cnt++; + break; + case PFRW_GET_ADDRS: + if (w->pfrw_free-- > 0) { + pfr_copyout_addr(w->pfrw_addr, ke); + w->pfrw_addr++; + } + break; + case PFRW_GET_ASTATS: + if (w->pfrw_free-- > 0) { + struct pfr_astats as; + + pfr_copyout_addr(&as.pfras_a, ke); + + if (ke->pfrke_counters) { + bcopy(ke->pfrke_counters->pfrkc_packets, + as.pfras_packets, sizeof(as.pfras_packets)); + bcopy(ke->pfrke_counters->pfrkc_bytes, + as.pfras_bytes, sizeof(as.pfras_bytes)); + } else { + bzero(as.pfras_packets, sizeof(as.pfras_packets)); + bzero(as.pfras_bytes, sizeof(as.pfras_bytes)); + as.pfras_a.pfra_fback = PFR_FB_NOCOUNT; + } + as.pfras_tzero = ke->pfrke_tzero; + + bcopy(&as, w->pfrw_astats, sizeof(as)); + w->pfrw_astats++; + } + break; + case PFRW_POOL_GET: + if (ke->pfrke_not) + break; /* negative entries are ignored */ + if (!w->pfrw_cnt--) { + w->pfrw_kentry = ke; + return (1); /* finish search */ + } + break; + case PFRW_DYNADDR_UPDATE: + { + union sockaddr_union pfr_mask; + + if (ke->pfrke_af == AF_INET) { + if (w->pfrw_dyn->pfid_acnt4++ > 0) + break; + pfr_prepare_network(&pfr_mask, AF_INET, ke->pfrke_net); + w->pfrw_dyn->pfid_addr4 = *SUNION2PF(&ke->pfrke_sa, + AF_INET); + w->pfrw_dyn->pfid_mask4 = *SUNION2PF(&pfr_mask, + AF_INET); + } else if (ke->pfrke_af == AF_INET6){ + if (w->pfrw_dyn->pfid_acnt6++ > 0) + break; + pfr_prepare_network(&pfr_mask, AF_INET6, ke->pfrke_net); + w->pfrw_dyn->pfid_addr6 = *SUNION2PF(&ke->pfrke_sa, + 
AF_INET6); + w->pfrw_dyn->pfid_mask6 = *SUNION2PF(&pfr_mask, + AF_INET6); + } + break; + } + } + return (0); +} + +int +pfr_clr_tables(struct pfr_table *filter, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p; + int xdel = 0; + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_ALLRSETS); + if (pfr_fix_anchor(filter->pfrt_anchor)) + return (EINVAL); + if (pfr_table_count(filter, flags) < 0) + return (ENOENT); + + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (pfr_skip_table(filter, p, flags)) + continue; + if (!strcmp(p->pfrkt_anchor, PF_RESERVED_ANCHOR)) + continue; + if (!(p->pfrkt_flags & PFR_TFLAG_ACTIVE)) + continue; + p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_ACTIVE; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xdel++; + } + if (!(flags & PFR_FLAG_DUMMY)) + pfr_setflags_ktables(&workq); + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_add_tables(struct pfr_table *tbl, int size, int *nadd, int flags) +{ + struct pfr_ktableworkq addq, changeq; + struct pfr_ktable *p, *q, *r, key; + int i, rv, xadd = 0; + long tzero = time_second; + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + SLIST_INIT(&addq); + SLIST_INIT(&changeq); + for (i = 0; i < size; i++) { + bcopy(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t)); + if (pfr_validate_table(&key.pfrkt_t, PFR_TFLAG_USRMASK, + flags & PFR_FLAG_USERIOCTL)) + senderr(EINVAL); + key.pfrkt_flags |= PFR_TFLAG_ACTIVE; + p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (p == NULL) { + p = pfr_create_ktable(&key.pfrkt_t, tzero, 1); + if (p == NULL) + senderr(ENOMEM); + SLIST_FOREACH(q, &addq, pfrkt_workq) { + if (!pfr_ktable_compare(p, q)) + goto _skip; + } + SLIST_INSERT_HEAD(&addq, p, pfrkt_workq); + xadd++; + if (!key.pfrkt_anchor[0]) + goto _skip; + + /* find or create root table */ + bzero(key.pfrkt_anchor, sizeof(key.pfrkt_anchor)); + r = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (r != NULL) { + p->pfrkt_root = r; + goto _skip; + } + SLIST_FOREACH(q, &addq, pfrkt_workq) { + if (!pfr_ktable_compare(&key, q)) { + p->pfrkt_root = q; + goto _skip; + } + } + key.pfrkt_flags = 0; + r = pfr_create_ktable(&key.pfrkt_t, 0, 1); + if (r == NULL) + senderr(ENOMEM); + SLIST_INSERT_HEAD(&addq, r, pfrkt_workq); + p->pfrkt_root = r; + } else if (!(p->pfrkt_flags & PFR_TFLAG_ACTIVE)) { + SLIST_FOREACH(q, &changeq, pfrkt_workq) + if (!pfr_ktable_compare(&key, q)) + goto _skip; + p->pfrkt_nflags = (p->pfrkt_flags & + ~PFR_TFLAG_USRMASK) | key.pfrkt_flags; + SLIST_INSERT_HEAD(&changeq, p, pfrkt_workq); + xadd++; + } +_skip: + ; + } + if (!(flags & PFR_FLAG_DUMMY)) { + pfr_insert_ktables(&addq); + pfr_setflags_ktables(&changeq); + } else + pfr_destroy_ktables(&addq, 0); + if (nadd != NULL) + *nadd = xadd; + return (0); +_bad: + pfr_destroy_ktables(&addq, 0); + return (rv); +} + +int +pfr_del_tables(struct pfr_table *tbl, int size, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p, *q, key; + int i, xdel = 0; + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + SLIST_INIT(&workq); + for (i = 0; i < size; i++) { + bcopy(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t)); + if (pfr_validate_table(&key.pfrkt_t, 0, + flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (p != NULL && (p->pfrkt_flags & PFR_TFLAG_ACTIVE)) { + SLIST_FOREACH(q, &workq, pfrkt_workq) + if (!pfr_ktable_compare(p, q)) + goto _skip; + p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_ACTIVE; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xdel++; + } +_skip: + ; + } + 
+ if (!(flags & PFR_FLAG_DUMMY)) + pfr_setflags_ktables(&workq); + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_get_tables(struct pfr_table *filter, struct pfr_table *tbl, int *size, + int flags) +{ + struct pfr_ktable *p; + int n, nn; + + PF_RULES_RASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_ALLRSETS); + if (pfr_fix_anchor(filter->pfrt_anchor)) + return (EINVAL); + n = nn = pfr_table_count(filter, flags); + if (n < 0) + return (ENOENT); + if (n > *size) { + *size = n; + return (0); + } + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (pfr_skip_table(filter, p, flags)) + continue; + if (n-- <= 0) + continue; + bcopy(&p->pfrkt_t, tbl++, sizeof(*tbl)); + } + + KASSERT(n == 0, ("%s: corruption detected (%d)", __func__, n)); + + *size = nn; + return (0); +} + +int +pfr_get_tstats(struct pfr_table *filter, struct pfr_tstats *tbl, int *size, + int flags) +{ + struct pfr_ktable *p; + struct pfr_ktableworkq workq; + int n, nn; + long tzero = time_second; + + /* XXX PFR_FLAG_CLSTATS disabled */ + ACCEPT_FLAGS(flags, PFR_FLAG_ALLRSETS); + if (pfr_fix_anchor(filter->pfrt_anchor)) + return (EINVAL); + n = nn = pfr_table_count(filter, flags); + if (n < 0) + return (ENOENT); + if (n > *size) { + *size = n; + return (0); + } + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (pfr_skip_table(filter, p, flags)) + continue; + if (n-- <= 0) + continue; + bcopy(&p->pfrkt_ts, tbl++, sizeof(*tbl)); + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + } + if (flags & PFR_FLAG_CLSTATS) + pfr_clstats_ktables(&workq, tzero, + flags & PFR_FLAG_ADDRSTOO); + + KASSERT(n == 0, ("%s: corruption detected (%d)", __func__, n)); + + *size = nn; + return (0); +} + +int +pfr_clr_tstats(struct pfr_table *tbl, int size, int *nzero, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p, key; + int i, xzero = 0; + long tzero = time_second; + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_ADDRSTOO); + SLIST_INIT(&workq); + for (i = 0; i < size; i++) { + bcopy(tbl + i, &key.pfrkt_t, sizeof(key.pfrkt_t)); + if (pfr_validate_table(&key.pfrkt_t, 0, 0)) + return (EINVAL); + p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (p != NULL) { + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xzero++; + } + } + if (!(flags & PFR_FLAG_DUMMY)) + pfr_clstats_ktables(&workq, tzero, flags & PFR_FLAG_ADDRSTOO); + if (nzero != NULL) + *nzero = xzero; + return (0); +} + +int +pfr_set_tflags(struct pfr_table *tbl, int size, int setflag, int clrflag, + int *nchange, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p, *q, key; + int i, xchange = 0, xdel = 0; + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + if ((setflag & ~PFR_TFLAG_USRMASK) || + (clrflag & ~PFR_TFLAG_USRMASK) || + (setflag & clrflag)) + return (EINVAL); + SLIST_INIT(&workq); + for (i = 0; i < size; i++) { + bcopy(tbl + i, &key.pfrkt_t, sizeof(key.pfrkt_t)); + if (pfr_validate_table(&key.pfrkt_t, 0, + flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (p != NULL && (p->pfrkt_flags & PFR_TFLAG_ACTIVE)) { + p->pfrkt_nflags = (p->pfrkt_flags | setflag) & + ~clrflag; + if (p->pfrkt_nflags == p->pfrkt_flags) + goto _skip; + SLIST_FOREACH(q, &workq, pfrkt_workq) + if (!pfr_ktable_compare(p, q)) + goto _skip; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + if ((p->pfrkt_flags & PFR_TFLAG_PERSIST) && + (clrflag & PFR_TFLAG_PERSIST) && + !(p->pfrkt_flags & PFR_TFLAG_REFERENCED)) + xdel++; + else + xchange++; + } +_skip: + ; + } + if (!(flags & PFR_FLAG_DUMMY)) + 
pfr_setflags_ktables(&workq); + if (nchange != NULL) + *nchange = xchange; + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_ina_begin(struct pfr_table *trs, u_int32_t *ticket, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p; + struct pf_ruleset *rs; + int xdel = 0; + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + rs = pf_find_or_create_ruleset(trs->pfrt_anchor); + if (rs == NULL) + return (ENOMEM); + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) || + pfr_skip_table(trs, p, 0)) + continue; + p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_INACTIVE; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xdel++; + } + if (!(flags & PFR_FLAG_DUMMY)) { + pfr_setflags_ktables(&workq); + if (ticket != NULL) + *ticket = ++rs->tticket; + rs->topen = 1; + } else + pf_remove_if_empty_ruleset(rs); + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_ina_define(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *nadd, int *naddr, u_int32_t ticket, int flags) +{ + struct pfr_ktableworkq tableq; + struct pfr_kentryworkq addrq; + struct pfr_ktable *kt, *rt, *shadow, key; + struct pfr_kentry *p; + struct pfr_addr *ad; + struct pf_ruleset *rs; + int i, rv, xadd = 0, xaddr = 0; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_ADDRSTOO); + if (size && !(flags & PFR_FLAG_ADDRSTOO)) + return (EINVAL); + if (pfr_validate_table(tbl, PFR_TFLAG_USRMASK, + flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + rs = pf_find_ruleset(tbl->pfrt_anchor); + if (rs == NULL || !rs->topen || ticket != rs->tticket) + return (EBUSY); + tbl->pfrt_flags |= PFR_TFLAG_INACTIVE; + SLIST_INIT(&tableq); + kt = RB_FIND(pfr_ktablehead, &pfr_ktables, (struct pfr_ktable *)tbl); + if (kt == NULL) { + kt = pfr_create_ktable(tbl, 0, 1); + if (kt == NULL) + return (ENOMEM); + SLIST_INSERT_HEAD(&tableq, kt, pfrkt_workq); + xadd++; + if (!tbl->pfrt_anchor[0]) + goto _skip; + + /* find or create root table */ + bzero(&key, sizeof(key)); + strlcpy(key.pfrkt_name, tbl->pfrt_name, sizeof(key.pfrkt_name)); + rt = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (rt != NULL) { + kt->pfrkt_root = rt; + goto _skip; + } + rt = pfr_create_ktable(&key.pfrkt_t, 0, 1); + if (rt == NULL) { + pfr_destroy_ktables(&tableq, 0); + return (ENOMEM); + } + SLIST_INSERT_HEAD(&tableq, rt, pfrkt_workq); + kt->pfrkt_root = rt; + } else if (!(kt->pfrkt_flags & PFR_TFLAG_INACTIVE)) + xadd++; +_skip: + shadow = pfr_create_ktable(tbl, 0, 0); + if (shadow == NULL) { + pfr_destroy_ktables(&tableq, 0); + return (ENOMEM); + } + SLIST_INIT(&addrq); + for (i = 0, ad = addr; i < size; i++, ad++) { + if (pfr_validate_addr(ad)) + senderr(EINVAL); + if (pfr_lookup_addr(shadow, ad, 1) != NULL) + continue; + p = pfr_create_kentry(ad); + if (p == NULL) + senderr(ENOMEM); + if (pfr_route_kentry(shadow, p)) { + pfr_destroy_kentry(p); + continue; + } + SLIST_INSERT_HEAD(&addrq, p, pfrke_workq); + xaddr++; + } + if (!(flags & PFR_FLAG_DUMMY)) { + if (kt->pfrkt_shadow != NULL) + pfr_destroy_ktable(kt->pfrkt_shadow, 1); + kt->pfrkt_flags |= PFR_TFLAG_INACTIVE; + pfr_insert_ktables(&tableq); + shadow->pfrkt_cnt = (flags & PFR_FLAG_ADDRSTOO) ? 
+ xaddr : NO_ADDRESSES; + kt->pfrkt_shadow = shadow; + } else { + pfr_clean_node_mask(shadow, &addrq); + pfr_destroy_ktable(shadow, 0); + pfr_destroy_ktables(&tableq, 0); + pfr_destroy_kentries(&addrq); + } + if (nadd != NULL) + *nadd = xadd; + if (naddr != NULL) + *naddr = xaddr; + return (0); +_bad: + pfr_destroy_ktable(shadow, 0); + pfr_destroy_ktables(&tableq, 0); + pfr_destroy_kentries(&addrq); + return (rv); +} + +int +pfr_ina_rollback(struct pfr_table *trs, u_int32_t ticket, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p; + struct pf_ruleset *rs; + int xdel = 0; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + rs = pf_find_ruleset(trs->pfrt_anchor); + if (rs == NULL || !rs->topen || ticket != rs->tticket) + return (0); + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) || + pfr_skip_table(trs, p, 0)) + continue; + p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_INACTIVE; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xdel++; + } + if (!(flags & PFR_FLAG_DUMMY)) { + pfr_setflags_ktables(&workq); + rs->topen = 0; + pf_remove_if_empty_ruleset(rs); + } + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_ina_commit(struct pfr_table *trs, u_int32_t ticket, int *nadd, + int *nchange, int flags) +{ + struct pfr_ktable *p, *q; + struct pfr_ktableworkq workq; + struct pf_ruleset *rs; + int xadd = 0, xchange = 0; + long tzero = time_second; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + rs = pf_find_ruleset(trs->pfrt_anchor); + if (rs == NULL || !rs->topen || ticket != rs->tticket) + return (EBUSY); + + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) || + pfr_skip_table(trs, p, 0)) + continue; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + if (p->pfrkt_flags & PFR_TFLAG_ACTIVE) + xchange++; + else + xadd++; + } + + if (!(flags & PFR_FLAG_DUMMY)) { + for (p = SLIST_FIRST(&workq); p != NULL; p = q) { + q = SLIST_NEXT(p, pfrkt_workq); + pfr_commit_ktable(p, tzero); + } + rs->topen = 0; + pf_remove_if_empty_ruleset(rs); + } + if (nadd != NULL) + *nadd = xadd; + if (nchange != NULL) + *nchange = xchange; + + return (0); +} + +static void +pfr_commit_ktable(struct pfr_ktable *kt, long tzero) +{ + struct pfr_ktable *shadow = kt->pfrkt_shadow; + int nflags; + + PF_RULES_WASSERT(); + + if (shadow->pfrkt_cnt == NO_ADDRESSES) { + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + pfr_clstats_ktable(kt, tzero, 1); + } else if (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) { + /* kt might contain addresses */ + struct pfr_kentryworkq addrq, addq, changeq, delq, garbageq; + struct pfr_kentry *p, *q, *next; + struct pfr_addr ad; + + pfr_enqueue_addrs(shadow, &addrq, NULL, 0); + pfr_mark_addrs(kt); + SLIST_INIT(&addq); + SLIST_INIT(&changeq); + SLIST_INIT(&delq); + SLIST_INIT(&garbageq); + pfr_clean_node_mask(shadow, &addrq); + for (p = SLIST_FIRST(&addrq); p != NULL; p = next) { + next = SLIST_NEXT(p, pfrke_workq); /* XXX */ + pfr_copyout_addr(&ad, p); + q = pfr_lookup_addr(kt, &ad, 1); + if (q != NULL) { + if (q->pfrke_not != p->pfrke_not) + SLIST_INSERT_HEAD(&changeq, q, + pfrke_workq); + q->pfrke_mark = 1; + SLIST_INSERT_HEAD(&garbageq, p, pfrke_workq); + } else { + p->pfrke_tzero = tzero; + SLIST_INSERT_HEAD(&addq, p, pfrke_workq); + } + } + pfr_enqueue_addrs(kt, &delq, NULL, ENQUEUE_UNMARKED_ONLY); + pfr_insert_kentries(kt, &addq, tzero); + pfr_remove_kentries(kt, &delq); + pfr_clstats_kentries(&changeq, tzero, 
INVERT_NEG_FLAG); + pfr_destroy_kentries(&garbageq); + } else { + /* kt cannot contain addresses */ + SWAP(struct radix_node_head *, kt->pfrkt_ip4, + shadow->pfrkt_ip4); + SWAP(struct radix_node_head *, kt->pfrkt_ip6, + shadow->pfrkt_ip6); + SWAP(int, kt->pfrkt_cnt, shadow->pfrkt_cnt); + pfr_clstats_ktable(kt, tzero, 1); + } + nflags = ((shadow->pfrkt_flags & PFR_TFLAG_USRMASK) | + (kt->pfrkt_flags & PFR_TFLAG_SETMASK) | PFR_TFLAG_ACTIVE) + & ~PFR_TFLAG_INACTIVE; + pfr_destroy_ktable(shadow, 0); + kt->pfrkt_shadow = NULL; + pfr_setflags_ktable(kt, nflags); +} + +static int +pfr_validate_table(struct pfr_table *tbl, int allowedflags, int no_reserved) +{ + int i; + + if (!tbl->pfrt_name[0]) + return (-1); + if (no_reserved && !strcmp(tbl->pfrt_anchor, PF_RESERVED_ANCHOR)) + return (-1); + if (tbl->pfrt_name[PF_TABLE_NAME_SIZE-1]) + return (-1); + for (i = strlen(tbl->pfrt_name); i < PF_TABLE_NAME_SIZE; i++) + if (tbl->pfrt_name[i]) + return (-1); + if (pfr_fix_anchor(tbl->pfrt_anchor)) + return (-1); + if (tbl->pfrt_flags & ~allowedflags) + return (-1); + return (0); +} + +/* + * Rewrite anchors referenced by tables to remove slashes + * and check for validity. + */ +static int +pfr_fix_anchor(char *anchor) +{ + size_t siz = MAXPATHLEN; + int i; + + if (anchor[0] == '/') { + char *path; + int off; + + path = anchor; + off = 1; + while (*++path == '/') + off++; + bcopy(path, anchor, siz - off); + memset(anchor + siz - off, 0, off); + } + if (anchor[siz - 1]) + return (-1); + for (i = strlen(anchor); i < siz; i++) + if (anchor[i]) + return (-1); + return (0); +} + +static int +pfr_table_count(struct pfr_table *filter, int flags) +{ + struct pf_ruleset *rs; + + PF_RULES_ASSERT(); + + if (flags & PFR_FLAG_ALLRSETS) + return (pfr_ktable_cnt); + if (filter->pfrt_anchor[0]) { + rs = pf_find_ruleset(filter->pfrt_anchor); + return ((rs != NULL) ? 
rs->tables : -1); + } + return (pf_main_ruleset.tables); +} + +static int +pfr_skip_table(struct pfr_table *filter, struct pfr_ktable *kt, int flags) +{ + if (flags & PFR_FLAG_ALLRSETS) + return (0); + if (strcmp(filter->pfrt_anchor, kt->pfrkt_anchor)) + return (1); + return (0); +} + +static void +pfr_insert_ktables(struct pfr_ktableworkq *workq) +{ + struct pfr_ktable *p; + + SLIST_FOREACH(p, workq, pfrkt_workq) + pfr_insert_ktable(p); +} + +static void +pfr_insert_ktable(struct pfr_ktable *kt) +{ + + PF_RULES_WASSERT(); + + RB_INSERT(pfr_ktablehead, &pfr_ktables, kt); + pfr_ktable_cnt++; + if (kt->pfrkt_root != NULL) + if (!kt->pfrkt_root->pfrkt_refcnt[PFR_REFCNT_ANCHOR]++) + pfr_setflags_ktable(kt->pfrkt_root, + kt->pfrkt_root->pfrkt_flags|PFR_TFLAG_REFDANCHOR); +} + +static void +pfr_setflags_ktables(struct pfr_ktableworkq *workq) +{ + struct pfr_ktable *p, *q; + + for (p = SLIST_FIRST(workq); p; p = q) { + q = SLIST_NEXT(p, pfrkt_workq); + pfr_setflags_ktable(p, p->pfrkt_nflags); + } +} + +static void +pfr_setflags_ktable(struct pfr_ktable *kt, int newf) +{ + struct pfr_kentryworkq addrq; + + PF_RULES_WASSERT(); + + if (!(newf & PFR_TFLAG_REFERENCED) && + !(newf & PFR_TFLAG_PERSIST)) + newf &= ~PFR_TFLAG_ACTIVE; + if (!(newf & PFR_TFLAG_ACTIVE)) + newf &= ~PFR_TFLAG_USRMASK; + if (!(newf & PFR_TFLAG_SETMASK)) { + RB_REMOVE(pfr_ktablehead, &pfr_ktables, kt); + if (kt->pfrkt_root != NULL) + if (!--kt->pfrkt_root->pfrkt_refcnt[PFR_REFCNT_ANCHOR]) + pfr_setflags_ktable(kt->pfrkt_root, + kt->pfrkt_root->pfrkt_flags & + ~PFR_TFLAG_REFDANCHOR); + pfr_destroy_ktable(kt, 1); + pfr_ktable_cnt--; + return; + } + if (!(newf & PFR_TFLAG_ACTIVE) && kt->pfrkt_cnt) { + pfr_enqueue_addrs(kt, &addrq, NULL, 0); + pfr_remove_kentries(kt, &addrq); + } + if (!(newf & PFR_TFLAG_INACTIVE) && kt->pfrkt_shadow != NULL) { + pfr_destroy_ktable(kt->pfrkt_shadow, 1); + kt->pfrkt_shadow = NULL; + } + kt->pfrkt_flags = newf; +} + +static void +pfr_clstats_ktables(struct pfr_ktableworkq *workq, long tzero, int recurse) +{ + struct pfr_ktable *p; + + SLIST_FOREACH(p, workq, pfrkt_workq) + pfr_clstats_ktable(p, tzero, recurse); +} + +static void +pfr_clstats_ktable(struct pfr_ktable *kt, long tzero, int recurse) +{ + struct pfr_kentryworkq addrq; + + if (recurse) { + pfr_enqueue_addrs(kt, &addrq, NULL, 0); + pfr_clstats_kentries(&addrq, tzero, 0); + } + bzero(kt->pfrkt_packets, sizeof(kt->pfrkt_packets)); + bzero(kt->pfrkt_bytes, sizeof(kt->pfrkt_bytes)); + kt->pfrkt_match = kt->pfrkt_nomatch = 0; + kt->pfrkt_tzero = tzero; +} + +static struct pfr_ktable * +pfr_create_ktable(struct pfr_table *tbl, long tzero, int attachruleset) +{ + struct pfr_ktable *kt; + struct pf_ruleset *rs; + + PF_RULES_WASSERT(); + + kt = malloc(sizeof(*kt), M_PFTABLE, M_NOWAIT|M_ZERO); + if (kt == NULL) + return (NULL); + kt->pfrkt_t = *tbl; + + if (attachruleset) { + rs = pf_find_or_create_ruleset(tbl->pfrt_anchor); + if (!rs) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + kt->pfrkt_rs = rs; + rs->tables++; + } + + if (!rn_inithead((void **)&kt->pfrkt_ip4, + offsetof(struct sockaddr_in, sin_addr) * 8) || + !rn_inithead((void **)&kt->pfrkt_ip6, + offsetof(struct sockaddr_in6, sin6_addr) * 8)) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + kt->pfrkt_tzero = tzero; + + return (kt); +} + +static void +pfr_destroy_ktables(struct pfr_ktableworkq *workq, int flushaddr) +{ + struct pfr_ktable *p, *q; + + for (p = SLIST_FIRST(workq); p; p = q) { + q = SLIST_NEXT(p, pfrkt_workq); + pfr_destroy_ktable(p, flushaddr); + } +} + +static void 
+pfr_destroy_ktable(struct pfr_ktable *kt, int flushaddr) +{ + struct pfr_kentryworkq addrq; + + if (flushaddr) { + pfr_enqueue_addrs(kt, &addrq, NULL, 0); + pfr_clean_node_mask(kt, &addrq); + pfr_destroy_kentries(&addrq); + } + if (kt->pfrkt_ip4 != NULL) + rn_detachhead((void **)&kt->pfrkt_ip4); + if (kt->pfrkt_ip6 != NULL) + rn_detachhead((void **)&kt->pfrkt_ip6); + if (kt->pfrkt_shadow != NULL) + pfr_destroy_ktable(kt->pfrkt_shadow, flushaddr); + if (kt->pfrkt_rs != NULL) { + kt->pfrkt_rs->tables--; + pf_remove_if_empty_ruleset(kt->pfrkt_rs); + } + free(kt, M_PFTABLE); +} + +static int +pfr_ktable_compare(struct pfr_ktable *p, struct pfr_ktable *q) +{ + int d; + + if ((d = strncmp(p->pfrkt_name, q->pfrkt_name, PF_TABLE_NAME_SIZE))) + return (d); + return (strcmp(p->pfrkt_anchor, q->pfrkt_anchor)); +} + +static struct pfr_ktable * +pfr_lookup_table(struct pfr_table *tbl) +{ + /* struct pfr_ktable starts like a struct pfr_table */ + return (RB_FIND(pfr_ktablehead, &pfr_ktables, + (struct pfr_ktable *)tbl)); +} + +int +pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af) +{ + struct pfr_kentry *ke = NULL; + int match; + + PF_RULES_RASSERT(); + + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (0); + + switch (af) { +#ifdef INET + case AF_INET: + { + struct sockaddr_in sin; + + bzero(&sin, sizeof(sin)); + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = a->addr32[0]; + ke = (struct pfr_kentry *)rn_match(&sin, &kt->pfrkt_ip4->rh); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 sin6; + + bzero(&sin6, sizeof(sin6)); + sin6.sin6_len = sizeof(sin6); + sin6.sin6_family = AF_INET6; + bcopy(a, &sin6.sin6_addr, sizeof(sin6.sin6_addr)); + ke = (struct pfr_kentry *)rn_match(&sin6, &kt->pfrkt_ip6->rh); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + break; + } +#endif /* INET6 */ + } + match = (ke && !ke->pfrke_not); + if (match) + kt->pfrkt_match++; + else + kt->pfrkt_nomatch++; + return (match); +} + +void +pfr_update_stats(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af, + u_int64_t len, int dir_out, int op_pass, int notrule) +{ + struct pfr_kentry *ke = NULL; + + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return; + + switch (af) { +#ifdef INET + case AF_INET: + { + struct sockaddr_in sin; + + bzero(&sin, sizeof(sin)); + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = a->addr32[0]; + ke = (struct pfr_kentry *)rn_match(&sin, &kt->pfrkt_ip4->rh); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 sin6; + + bzero(&sin6, sizeof(sin6)); + sin6.sin6_len = sizeof(sin6); + sin6.sin6_family = AF_INET6; + bcopy(a, &sin6.sin6_addr, sizeof(sin6.sin6_addr)); + ke = (struct pfr_kentry *)rn_match(&sin6, &kt->pfrkt_ip6->rh); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + break; + } +#endif /* INET6 */ + default: + panic("%s: unknown address family %u", __func__, af); + } + if ((ke == NULL || ke->pfrke_not) != notrule) { + if (op_pass != PFR_OP_PASS) + printf("pfr_update_stats: assertion failed.\n"); + op_pass = PFR_OP_XPASS; + } + kt->pfrkt_packets[dir_out][op_pass]++; + kt->pfrkt_bytes[dir_out][op_pass] += len; + if (ke != NULL && op_pass != PFR_OP_XPASS && +
(kt->pfrkt_flags & PFR_TFLAG_COUNTERS)) { + if (ke->pfrke_counters == NULL) + ke->pfrke_counters = uma_zalloc(V_pfr_kcounters_z, + M_NOWAIT | M_ZERO); + if (ke->pfrke_counters != NULL) { + ke->pfrke_counters->pfrkc_packets[dir_out][op_pass]++; + ke->pfrke_counters->pfrkc_bytes[dir_out][op_pass] += len; + } + } +} + +struct pfr_ktable * +pfr_attach_table(struct pf_ruleset *rs, char *name) +{ + struct pfr_ktable *kt, *rt; + struct pfr_table tbl; + struct pf_anchor *ac = rs->anchor; + + PF_RULES_WASSERT(); + + bzero(&tbl, sizeof(tbl)); + strlcpy(tbl.pfrt_name, name, sizeof(tbl.pfrt_name)); + if (ac != NULL) + strlcpy(tbl.pfrt_anchor, ac->path, sizeof(tbl.pfrt_anchor)); + kt = pfr_lookup_table(&tbl); + if (kt == NULL) { + kt = pfr_create_ktable(&tbl, time_second, 1); + if (kt == NULL) + return (NULL); + if (ac != NULL) { + bzero(tbl.pfrt_anchor, sizeof(tbl.pfrt_anchor)); + rt = pfr_lookup_table(&tbl); + if (rt == NULL) { + rt = pfr_create_ktable(&tbl, 0, 1); + if (rt == NULL) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + pfr_insert_ktable(rt); + } + kt->pfrkt_root = rt; + } + pfr_insert_ktable(kt); + } + if (!kt->pfrkt_refcnt[PFR_REFCNT_RULE]++) + pfr_setflags_ktable(kt, kt->pfrkt_flags|PFR_TFLAG_REFERENCED); + return (kt); +} + +void +pfr_detach_table(struct pfr_ktable *kt) +{ + + PF_RULES_WASSERT(); + KASSERT(kt->pfrkt_refcnt[PFR_REFCNT_RULE] > 0, ("%s: refcount %d\n", + __func__, kt->pfrkt_refcnt[PFR_REFCNT_RULE])); + + if (!--kt->pfrkt_refcnt[PFR_REFCNT_RULE]) + pfr_setflags_ktable(kt, kt->pfrkt_flags&~PFR_TFLAG_REFERENCED); +} + +int +pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter, + sa_family_t af) +{ + struct pf_addr *addr, *cur, *mask; + union sockaddr_union uaddr, umask; + struct pfr_kentry *ke, *ke2 = NULL; + int idx = -1, use_counter = 0; + + switch (af) { + case AF_INET: + uaddr.sin.sin_len = sizeof(struct sockaddr_in); + uaddr.sin.sin_family = AF_INET; + break; + case AF_INET6: + uaddr.sin6.sin6_len = sizeof(struct sockaddr_in6); + uaddr.sin6.sin6_family = AF_INET6; + break; + } + addr = SUNION2PF(&uaddr, af); + + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (-1); + + if (pidx != NULL) + idx = *pidx; + if (counter != NULL && idx >= 0) + use_counter = 1; + if (idx < 0) + idx = 0; + +_next_block: + ke = pfr_kentry_byidx(kt, idx, af); + if (ke == NULL) { + kt->pfrkt_nomatch++; + return (1); + } + pfr_prepare_network(&umask, af, ke->pfrke_net); + cur = SUNION2PF(&ke->pfrke_sa, af); + mask = SUNION2PF(&umask, af); + + if (use_counter) { + /* is supplied address within block? 
*/ + if (!PF_MATCHA(0, cur, mask, counter, af)) { + /* no, go to next block in table */ + idx++; + use_counter = 0; + goto _next_block; + } + PF_ACPY(addr, counter, af); + } else { + /* use first address of block */ + PF_ACPY(addr, cur, af); + } + + if (!KENTRY_NETWORK(ke)) { + /* this is a single IP address - no possible nested block */ + PF_ACPY(counter, addr, af); + *pidx = idx; + kt->pfrkt_match++; + return (0); + } + for (;;) { + /* we don't want to use a nested block */ + switch (af) { + case AF_INET: + ke2 = (struct pfr_kentry *)rn_match(&uaddr, + &kt->pfrkt_ip4->rh); + break; + case AF_INET6: + ke2 = (struct pfr_kentry *)rn_match(&uaddr, + &kt->pfrkt_ip6->rh); + break; + } + /* no need to check KENTRY_RNF_ROOT() here */ + if (ke2 == ke) { + /* lookup returned the same block - perfect */ + PF_ACPY(counter, addr, af); + *pidx = idx; + kt->pfrkt_match++; + return (0); + } + + /* we need to increase the counter past the nested block */ + pfr_prepare_network(&umask, AF_INET, ke2->pfrke_net); + PF_POOLMASK(addr, addr, SUNION2PF(&umask, af), &pfr_ffaddr, af); + PF_AINC(addr, af); + if (!PF_MATCHA(0, cur, mask, addr, af)) { + /* ok, we reached the end of our main block */ + /* go to next block in table */ + idx++; + use_counter = 0; + goto _next_block; + } + } +} + +static struct pfr_kentry * +pfr_kentry_byidx(struct pfr_ktable *kt, int idx, int af) +{ + struct pfr_walktree w; + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_POOL_GET; + w.pfrw_cnt = idx; + + switch (af) { +#ifdef INET + case AF_INET: + kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh, pfr_walktree, &w); + return (w.pfrw_kentry); +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh, pfr_walktree, &w); + return (w.pfrw_kentry); +#endif /* INET6 */ + default: + return (NULL); + } +} + +void +pfr_dynaddr_update(struct pfr_ktable *kt, struct pfi_dynaddr *dyn) +{ + struct pfr_walktree w; + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_DYNADDR_UPDATE; + w.pfrw_dyn = dyn; + + dyn->pfid_acnt4 = 0; + dyn->pfid_acnt6 = 0; + if (!dyn->pfid_af || dyn->pfid_af == AF_INET) + kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh, pfr_walktree, &w); + if (!dyn->pfid_af || dyn->pfid_af == AF_INET6) + kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh, pfr_walktree, &w); +}
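
Editor's note: the sketches below are illustrative additions and not part of the commit. pfr_validate_addr() in the diff above rejects an address whose host bits extend past its prefix length. This is a minimal, self-contained userspace sketch of that check over a plain byte-array address; the helper name check_host_bits() is hypothetical and not part of pf.

#include <stdint.h>
#include <stdio.h>

/* Return 0 iff no bit is set beyond the first 'net' prefix bits. */
static int
check_host_bits(const uint8_t *addr, int alen, int net)
{
        int i;

        if (net % 8 && (addr[net / 8] & (0xFF >> (net % 8))))
                return (-1);    /* host bits in the partial byte */
        for (i = (net + 7) / 8; i < alen; i++)
                if (addr[i])
                        return (-1);    /* host bits in a full byte */
        return (0);
}

int
main(void)
{
        uint8_t good[4] = { 192, 0, 2, 0 };     /* 192.0.2.0/24 */
        uint8_t bad[4] = { 192, 0, 2, 1 };      /* 192.0.2.1/24 */

        /* Prints "0 -1": a clean /24 network passes, a host address fails. */
        printf("%d %d\n", check_host_bits(good, 4, 24),
            check_host_bits(bad, 4, 24));
        return (0);
}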
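Editor's note: pfr_prepare_network() converts a prefix length into a radix netmask, filling 32-bit words one at a time for IPv6. Below is a standalone IPv4-only sketch of the same computation; it uses the unsigned form ~0u << n where the kernel source writes htonl(-1 << (32-net)), which yields the same bit pattern. make_v4_mask() is a hypothetical name used for illustration.

#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
make_v4_mask(int net)   /* net in [0, 32] */
{

        return (net ? htonl(~0u << (32 - net)) : 0);
}

int
main(void)
{
        struct in_addr m;

        m.s_addr = make_v4_mask(24);
        printf("/24 -> %s\n", inet_ntoa(m));    /* 255.255.255.0 */
        m.s_addr = make_v4_mask(1);
        printf("/1  -> %s\n", inet_ntoa(m));    /* 128.0.0.0 */
        return (0);
}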
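Editor's note: every radix traversal in this file funnels through one callback, pfr_walktree(), parameterized by an operation code and scratch fields in struct pfr_walktree; a nonzero return aborts the walk (PFRW_POOL_GET uses this to stop at the selected entry). The toy program below illustrates that walker pattern over a plain array; the names walk_op, walk_ctx, visit, and walk are all hypothetical.

#include <stdio.h>

enum walk_op { W_COUNT, W_SUM };

struct walk_ctx {
        enum walk_op    op;
        int             acc;    /* scratch, like pfrw_cnt/pfrw_free */
};

static int
visit(int v, void *arg)
{
        struct walk_ctx *w = arg;

        switch (w->op) {
        case W_COUNT:
                w->acc++;
                break;
        case W_SUM:
                w->acc += v;
                break;
        }
        return (0);     /* nonzero would abort the walk */
}

static int
walk(const int *a, int n, int (*f)(int, void *), void *arg)
{
        int i, rv;

        for (i = 0; i < n; i++)
                if ((rv = f(a[i], arg)) != 0)
                        return (rv);
        return (0);
}

int
main(void)
{
        int a[] = { 1, 2, 3 };
        struct walk_ctx w = { W_SUM, 0 };

        walk(a, 3, visit, &w);
        printf("%d\n", w.acc);  /* prints 6 */
        return (0);
}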
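Editor's note: the slash handling in pfr_fix_anchor() collapses any run of leading '/' characters and zero-fills the tail of the fixed-size anchor buffer. A userspace sketch follows, assuming a caller-supplied buffer size in place of MAXPATHLEN; strip_slashes() is a hypothetical name, and memmove() stands in for the kernel's bcopy() since source and destination overlap.

#include <stdio.h>
#include <string.h>

static void
strip_slashes(char *anchor, size_t siz)
{
        if (anchor[0] == '/') {
                char *path = anchor;
                size_t off = 1;

                while (*++path == '/')
                        off++;
                memmove(anchor, path, siz - off);
                memset(anchor + siz - off, 0, off);
        }
}

int
main(void)
{
        char anchor[32] = "///ftp/allowed";

        strip_slashes(anchor, sizeof(anchor));
        printf("%s\n", anchor);         /* prints "ftp/allowed" */
        return (0);
}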
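Editor's note: in pfr_pool_get() above, once the candidate address falls inside a nested (more specific) block, the code jumps to the first address after that block: PF_POOLMASK() ORs the host bits of the nested mask into the address and PF_AINC() increments the result. This is a host-byte-order IPv4 sketch of that arithmetic; skip_nested_v4() is a hypothetical helper. If the incremented address also leaves the enclosing block, the loop in the diff advances idx to the next table entry instead.

#include <stdint.h>
#include <stdio.h>

/* First address after the 'nested_net' block containing addr_h. */
static uint32_t
skip_nested_v4(uint32_t addr_h, int nested_net)
{
        uint32_t mask_h = nested_net ? ~0u << (32 - nested_net) : 0;

        /* addr | ~mask is the block's last address; add one to leave it. */
        return ((addr_h | ~mask_h) + 1);
}

int
main(void)
{
        uint32_t a = (10u << 24) | (1 << 8) | 5;        /* 10.0.1.5 */
        uint32_t n = skip_nested_v4(a, 24);             /* past 10.0.1.0/24 */

        /* Prints "10.0.2.0". */
        printf("%u.%u.%u.%u\n", n >> 24, (n >> 16) & 0xff,
            (n >> 8) & 0xff, n & 0xff);
        return (0);
}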