author     Sebastian Huber <sebastian.huber@embedded-brains.de>  2013-10-09 22:42:09 +0200
committer  Sebastian Huber <sebastian.huber@embedded-brains.de>  2013-10-10 09:06:58 +0200
commit     bceabc95c1c85d793200446fa85f1ddc6313ea29 (patch)
tree       973c8bd8deca9fd69913f2895cc91e0e6114d46c /freebsd/sys/netinet
parent     Add FreeBSD sources as a submodule (diff)
download   rtems-libbsd-bceabc95c1c85d793200446fa85f1ddc6313ea29.tar.bz2
Move files to match FreeBSD layout
Diffstat (limited to 'freebsd/sys/netinet')
-rw-r--r--  freebsd/sys/netinet/accf_data.c  68
-rw-r--r--  freebsd/sys/netinet/accf_dns.c  134
-rw-r--r--  freebsd/sys/netinet/accf_http.c  351
-rw-r--r--  freebsd/sys/netinet/icmp6.h  2
-rw-r--r--  freebsd/sys/netinet/icmp_var.h  108
-rw-r--r--  freebsd/sys/netinet/if_atm.c  366
-rw-r--r--  freebsd/sys/netinet/if_atm.h  47
-rw-r--r--  freebsd/sys/netinet/if_ether.c  859
-rw-r--r--  freebsd/sys/netinet/if_ether.h  2
-rw-r--r--  freebsd/sys/netinet/igmp.c  3655
-rw-r--r--  freebsd/sys/netinet/igmp.h  2
-rw-r--r--  freebsd/sys/netinet/igmp_var.h  225
-rw-r--r--  freebsd/sys/netinet/in.c  1601
-rw-r--r--  freebsd/sys/netinet/in.h  2
-rw-r--r--  freebsd/sys/netinet/in_gif.c  469
-rw-r--r--  freebsd/sys/netinet/in_gif.h  45
-rw-r--r--  freebsd/sys/netinet/in_mcast.c  2902
-rw-r--r--  freebsd/sys/netinet/in_pcb.c  1958
-rw-r--r--  freebsd/sys/netinet/in_pcb.h  525
-rw-r--r--  freebsd/sys/netinet/in_proto.c  400
-rw-r--r--  freebsd/sys/netinet/in_rmx.c  516
-rw-r--r--  freebsd/sys/netinet/in_systm.h  2
-rw-r--r--  freebsd/sys/netinet/in_var.h  475
-rw-r--r--  freebsd/sys/netinet/ip.h  2
-rw-r--r--  freebsd/sys/netinet/ip6.h  2
-rw-r--r--  freebsd/sys/netinet/ip_carp.c  2427
-rw-r--r--  freebsd/sys/netinet/ip_carp.h  191
-rw-r--r--  freebsd/sys/netinet/ip_divert.c  818
-rw-r--r--  freebsd/sys/netinet/ip_divert.h  55
-rw-r--r--  freebsd/sys/netinet/ip_dummynet.h  263
-rw-r--r--  freebsd/sys/netinet/ip_ecn.c  194
-rw-r--r--  freebsd/sys/netinet/ip_ecn.h  53
-rw-r--r--  freebsd/sys/netinet/ip_encap.c  465
-rw-r--r--  freebsd/sys/netinet/ip_encap.h  64
-rw-r--r--  freebsd/sys/netinet/ip_fastfwd.c  619
-rw-r--r--  freebsd/sys/netinet/ip_fw.h  579
-rw-r--r--  freebsd/sys/netinet/ip_gre.c  336
-rw-r--r--  freebsd/sys/netinet/ip_gre.h  43
-rw-r--r--  freebsd/sys/netinet/ip_icmp.c  986
-rw-r--r--  freebsd/sys/netinet/ip_icmp.h  2
-rw-r--r--  freebsd/sys/netinet/ip_id.c  211
-rw-r--r--  freebsd/sys/netinet/ip_input.c  1794
-rw-r--r--  freebsd/sys/netinet/ip_ipsec.c  424
-rw-r--r--  freebsd/sys/netinet/ip_ipsec.h  41
-rw-r--r--  freebsd/sys/netinet/ip_mroute.c  2952
-rw-r--r--  freebsd/sys/netinet/ip_mroute.h  359
-rw-r--r--  freebsd/sys/netinet/ip_options.c  747
-rw-r--r--  freebsd/sys/netinet/ip_options.h  60
-rw-r--r--  freebsd/sys/netinet/ip_output.c  1284
-rw-r--r--  freebsd/sys/netinet/ip_var.h  315
-rw-r--r--  freebsd/sys/netinet/ipfw/dn_heap.c  552
-rw-r--r--  freebsd/sys/netinet/ipfw/dn_heap.h  191
-rw-r--r--  freebsd/sys/netinet/ipfw/dn_sched.h  189
-rw-r--r--  freebsd/sys/netinet/ipfw/dn_sched_fifo.c  122
-rw-r--r--  freebsd/sys/netinet/ipfw/dn_sched_prio.c  231
-rw-r--r--  freebsd/sys/netinet/ipfw/dn_sched_qfq.c  866
-rw-r--r--  freebsd/sys/netinet/ipfw/dn_sched_rr.c  309
-rw-r--r--  freebsd/sys/netinet/ipfw/dn_sched_wf2q.c  375
-rw-r--r--  freebsd/sys/netinet/ipfw/ip_dn_glue.c  847
-rw-r--r--  freebsd/sys/netinet/ipfw/ip_dn_io.c  796
-rw-r--r--  freebsd/sys/netinet/ipfw/ip_dn_private.h  402
-rw-r--r--  freebsd/sys/netinet/ipfw/ip_dummynet.c  2297
-rw-r--r--  freebsd/sys/netinet/ipfw/ip_fw2.c  2495
-rw-r--r--  freebsd/sys/netinet/ipfw/ip_fw_log.c  451
-rw-r--r--  freebsd/sys/netinet/ipfw/ip_fw_nat.c  606
-rw-r--r--  freebsd/sys/netinet/ipfw/ip_fw_pfil.c  417
-rw-r--r--  freebsd/sys/netinet/ipfw/ip_fw_private.h  301
-rw-r--r--  freebsd/sys/netinet/ipfw/ip_fw_sockopt.c  1345
-rw-r--r--  freebsd/sys/netinet/ipfw/ip_fw_table.c  288
-rw-r--r--  freebsd/sys/netinet/libalias/alias.c  1793
-rw-r--r--  freebsd/sys/netinet/libalias/alias.h  232
-rw-r--r--  freebsd/sys/netinet/libalias/alias_cuseeme.c  230
-rw-r--r--  freebsd/sys/netinet/libalias/alias_db.c  2940
-rw-r--r--  freebsd/sys/netinet/libalias/alias_dummy.c  155
-rw-r--r--  freebsd/sys/netinet/libalias/alias_ftp.c  696
-rw-r--r--  freebsd/sys/netinet/libalias/alias_irc.c  490
-rw-r--r--  freebsd/sys/netinet/libalias/alias_local.h  397
-rw-r--r--  freebsd/sys/netinet/libalias/alias_mod.c  292
-rw-r--r--  freebsd/sys/netinet/libalias/alias_mod.h  163
-rw-r--r--  freebsd/sys/netinet/libalias/alias_nbt.c  855
-rw-r--r--  freebsd/sys/netinet/libalias/alias_pptp.c  525
-rw-r--r--  freebsd/sys/netinet/libalias/alias_proxy.c  870
-rw-r--r--  freebsd/sys/netinet/libalias/alias_sctp.c  2700
-rw-r--r--  freebsd/sys/netinet/libalias/alias_sctp.h  201
-rw-r--r--  freebsd/sys/netinet/libalias/alias_skinny.c  449
-rw-r--r--  freebsd/sys/netinet/libalias/alias_smedia.c  551
-rw-r--r--  freebsd/sys/netinet/libalias/alias_util.c  178
-rw-r--r--  freebsd/sys/netinet/pim.h  119
-rw-r--r--  freebsd/sys/netinet/pim_var.h  84
-rw-r--r--  freebsd/sys/netinet/raw_ip.c  1116
-rw-r--r--  freebsd/sys/netinet/sctp.h  549
-rw-r--r--  freebsd/sys/netinet/sctp_asconf.c  3397
-rw-r--r--  freebsd/sys/netinet/sctp_asconf.h  96
-rw-r--r--  freebsd/sys/netinet/sctp_auth.c  2128
-rw-r--r--  freebsd/sys/netinet/sctp_auth.h  235
-rw-r--r--  freebsd/sys/netinet/sctp_bsd_addr.c  562
-rw-r--r--  freebsd/sys/netinet/sctp_bsd_addr.h  63
-rw-r--r--  freebsd/sys/netinet/sctp_cc_functions.c  1565
-rw-r--r--  freebsd/sys/netinet/sctp_cc_functions.h  116
-rw-r--r--  freebsd/sys/netinet/sctp_constants.h  1051
-rw-r--r--  freebsd/sys/netinet/sctp_crc32.c  148
-rw-r--r--  freebsd/sys/netinet/sctp_crc32.h  47
-rw-r--r--  freebsd/sys/netinet/sctp_header.h  624
-rw-r--r--  freebsd/sys/netinet/sctp_indata.c  5800
-rw-r--r--  freebsd/sys/netinet/sctp_indata.h  129
-rw-r--r--  freebsd/sys/netinet/sctp_input.c  5965
-rw-r--r--  freebsd/sys/netinet/sctp_input.h  57
-rw-r--r--  freebsd/sys/netinet/sctp_lock_bsd.h  430
-rw-r--r--  freebsd/sys/netinet/sctp_os.h  72
-rw-r--r--  freebsd/sys/netinet/sctp_os_bsd.h  503
-rw-r--r--  freebsd/sys/netinet/sctp_output.c  13539
-rw-r--r--  freebsd/sys/netinet/sctp_output.h  229
-rw-r--r--  freebsd/sys/netinet/sctp_pcb.c  6810
-rw-r--r--  freebsd/sys/netinet/sctp_pcb.h  632
-rw-r--r--  freebsd/sys/netinet/sctp_peeloff.c  240
-rw-r--r--  freebsd/sys/netinet/sctp_peeloff.h  52
-rw-r--r--  freebsd/sys/netinet/sctp_structs.h  1094
-rw-r--r--  freebsd/sys/netinet/sctp_sysctl.c  1108
-rw-r--r--  freebsd/sys/netinet/sctp_sysctl.h  532
-rw-r--r--  freebsd/sys/netinet/sctp_timer.c  1804
-rw-r--r--  freebsd/sys/netinet/sctp_timer.h  101
-rw-r--r--  freebsd/sys/netinet/sctp_uio.h  1166
-rw-r--r--  freebsd/sys/netinet/sctp_usrreq.c  4918
-rw-r--r--  freebsd/sys/netinet/sctp_var.h  336
-rw-r--r--  freebsd/sys/netinet/sctputil.c  6977
-rw-r--r--  freebsd/sys/netinet/sctputil.h  392
-rw-r--r--  freebsd/sys/netinet/tcp.h  2
-rw-r--r--  freebsd/sys/netinet/tcp_debug.c  226
-rw-r--r--  freebsd/sys/netinet/tcp_debug.h  80
-rw-r--r--  freebsd/sys/netinet/tcp_fsm.h  112
-rw-r--r--  freebsd/sys/netinet/tcp_hostcache.c  693
-rw-r--r--  freebsd/sys/netinet/tcp_hostcache.h  82
-rw-r--r--  freebsd/sys/netinet/tcp_input.c  3453
-rw-r--r--  freebsd/sys/netinet/tcp_lro.c  389
-rw-r--r--  freebsd/sys/netinet/tcp_lro.h  85
-rw-r--r--  freebsd/sys/netinet/tcp_offload.c  147
-rw-r--r--  freebsd/sys/netinet/tcp_offload.h  354
-rw-r--r--  freebsd/sys/netinet/tcp_output.c  1485
-rw-r--r--  freebsd/sys/netinet/tcp_reass.c  335
-rw-r--r--  freebsd/sys/netinet/tcp_sack.c  687
-rw-r--r--  freebsd/sys/netinet/tcp_seq.h  68
-rw-r--r--  freebsd/sys/netinet/tcp_subr.c  2315
-rw-r--r--  freebsd/sys/netinet/tcp_syncache.c  1823
-rw-r--r--  freebsd/sys/netinet/tcp_syncache.h  127
-rw-r--r--  freebsd/sys/netinet/tcp_timer.c  660
-rw-r--r--  freebsd/sys/netinet/tcp_timer.h  183
-rw-r--r--  freebsd/sys/netinet/tcp_timewait.c  618
-rw-r--r--  freebsd/sys/netinet/tcp_usrreq.c  1886
-rw-r--r--  freebsd/sys/netinet/tcp_var.h  687
-rw-r--r--  freebsd/sys/netinet/tcpip.h  59
-rw-r--r--  freebsd/sys/netinet/toedev.h  162
-rw-r--r--  freebsd/sys/netinet/udp.h  2
-rw-r--r--  freebsd/sys/netinet/udp_usrreq.c  1633
-rw-r--r--  freebsd/sys/netinet/udp_var.h  161
154 files changed, 140097 insertions, 0 deletions
diff --git a/freebsd/sys/netinet/accf_data.c b/freebsd/sys/netinet/accf_data.c
new file mode 100644
index 00000000..15696daf
--- /dev/null
+++ b/freebsd/sys/netinet/accf_data.c
@@ -0,0 +1,68 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2000 Alfred Perlstein <alfred@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define ACCEPT_FILTER_MOD
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/signalvar.h>
+#include <freebsd/sys/socketvar.h>
+
+/* accept filter that holds a socket until data arrives */
+
+static int sohasdata(struct socket *so, void *arg, int waitflag);
+
+static struct accept_filter accf_data_filter = {
+ "dataready",
+ sohasdata,
+ NULL,
+ NULL
+};
+
+static moduledata_t accf_data_mod = {
+ "accf_data",
+ accept_filt_generic_mod_event,
+ &accf_data_filter
+};
+
+DECLARE_MODULE(accf_data, accf_data_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+
+static int
+sohasdata(struct socket *so, void *arg, int waitflag)
+{
+
+ if (!soreadable(so))
+ return (SU_OK);
+
+ return (SU_ISCONNECTED);
+}
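
For context, an application attaches the "dataready" filter defined above to a
listening TCP socket with the SO_ACCEPTFILTER socket option, so that accept()
only returns connections that already have data queued. A minimal userland
sketch (assuming the accf_data module is loaded; error handling omitted):

  #include <sys/types.h>
  #include <sys/socket.h>
  #include <string.h>

  /* s is a bound, listening TCP socket. */
  static int
  attach_dataready(int s)
  {
          struct accept_filter_arg afa;

          memset(&afa, 0, sizeof(afa));
          strcpy(afa.af_name, "dataready");
          return (setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER,
              &afa, sizeof(afa)));
  }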
diff --git a/freebsd/sys/netinet/accf_dns.c b/freebsd/sys/netinet/accf_dns.c
new file mode 100644
index 00000000..f91cbb08
--- /dev/null
+++ b/freebsd/sys/netinet/accf_dns.c
@@ -0,0 +1,134 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*
+ * Copyright (C) 2007 David Malone <dwmalone@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define ACCEPT_FILTER_MOD
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/signalvar.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/socketvar.h>
+
+/* check for full DNS request */
+static int sohasdns(struct socket *so, void *arg, int waitflag);
+
+struct packet {
+ struct mbuf *m; /* Current mbuf. */
+ struct mbuf *n; /* nextpkt mbuf. */
+ unsigned long moff; /* Offset of the beginning of m. */
+ unsigned long offset; /* Which offset we are working at. */
+ unsigned long len; /* The number of bytes we have to play with. */
+};
+
+#define DNS_OK 0
+#define DNS_WAIT -1
+#define DNS_RUN -2
+
+/* check we can skip over various parts of DNS request */
+static int skippacket(struct sockbuf *sb);
+
+static struct accept_filter accf_dns_filter = {
+ "dnsready",
+ sohasdns,
+ NULL,
+ NULL
+};
+
+static moduledata_t accf_dns_mod = {
+ "accf_dns",
+ accept_filt_generic_mod_event,
+ &accf_dns_filter
+};
+
+DECLARE_MODULE(accf_dns, accf_dns_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+
+static int
+sohasdns(struct socket *so, void *arg, int waitflag)
+{
+ struct sockbuf *sb = &so->so_rcv;
+
+ /* If the socket is full, we're ready. */
+ if (sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax)
+ goto ready;
+
+ /* Check to see if we have a request. */
+ if (skippacket(sb) == DNS_WAIT)
+ return (SU_OK);
+
+ready:
+ return (SU_ISCONNECTED);
+}
+
+#define GET8(p, val) do { \
+ if (p->offset < p->moff) \
+ return DNS_RUN; \
+ while (p->offset >= p->moff + p->m->m_len) { \
+ p->moff += p->m->m_len; \
+ p->m = p->m->m_next; \
+ if (p->m == NULL) { \
+ p->m = p->n; \
+ p->n = p->m->m_nextpkt; \
+ } \
+ if (p->m == NULL) \
+ return DNS_WAIT; \
+ } \
+ val = *(mtod(p->m, unsigned char *) + (p->offset - p->moff)); \
+ p->offset++; \
+ } while (0)
+
+#define GET16(p, val) do { \
+ unsigned int v0, v1; \
+ GET8(p, v0); \
+ GET8(p, v1); \
+ val = v0 * 0x100 + v1; \
+ } while (0)
+
+static int
+skippacket(struct sockbuf *sb) {
+ unsigned long packlen;
+ struct packet q, *p = &q;
+
+ if (sb->sb_cc < 2)
+ return DNS_WAIT;
+
+ q.m = sb->sb_mb;
+ q.n = q.m->m_nextpkt;
+ q.moff = 0;
+ q.offset = 0;
+ q.len = sb->sb_cc;
+
+ GET16(p, packlen);
+ if (packlen + 2 > q.len)
+ return DNS_WAIT;
+
+ return DNS_OK;
+}
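
The checks above follow DNS-over-TCP framing: every request is prefixed by a
16-bit big-endian length, and the filter only wakes the listener once at least
one whole request is buffered. What skippacket() computes, restated on a flat
buffer (hypothetical helper for illustration, not part of the module):

  #include <stddef.h>

  /* Nonzero when buf holds a complete request: a 2-byte big-endian
   * length followed by that many payload bytes (cf. DNS_OK/DNS_WAIT). */
  static int
  dns_request_complete(const unsigned char *buf, size_t len)
  {
          size_t packlen;

          if (len < 2)
                  return (0);
          packlen = ((size_t)buf[0] << 8) | buf[1];
          return (packlen + 2 <= len);
  }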
diff --git a/freebsd/sys/netinet/accf_http.c b/freebsd/sys/netinet/accf_http.c
new file mode 100644
index 00000000..ce21b1d1
--- /dev/null
+++ b/freebsd/sys/netinet/accf_http.c
@@ -0,0 +1,351 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2000 Paycounter, Inc.
+ * Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define ACCEPT_FILTER_MOD
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/signalvar.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/socketvar.h>
+
+/* check for GET/HEAD */
+static int sohashttpget(struct socket *so, void *arg, int waitflag);
+/* check for HTTP/1.0 or HTTP/1.1 */
+static int soparsehttpvers(struct socket *so, void *arg, int waitflag);
+/* check for end of HTTP/1.x request */
+static int soishttpconnected(struct socket *so, void *arg, int waitflag);
+/* strcmp on an mbuf chain */
+static int mbufstrcmp(struct mbuf *m, struct mbuf *npkt, int offset, char *cmp);
+/* strncmp on an mbuf chain */
+static int mbufstrncmp(struct mbuf *m, struct mbuf *npkt, int offset,
+ int max, char *cmp);
+/* socketbuffer is full */
+static int sbfull(struct sockbuf *sb);
+
+static struct accept_filter accf_http_filter = {
+ "httpready",
+ sohashttpget,
+ NULL,
+ NULL
+};
+
+static moduledata_t accf_http_mod = {
+ "accf_http",
+ accept_filt_generic_mod_event,
+ &accf_http_filter
+};
+
+DECLARE_MODULE(accf_http, accf_http_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+
+static int parse_http_version = 1;
+
+SYSCTL_NODE(_net_inet_accf, OID_AUTO, http, CTLFLAG_RW, 0,
+"HTTP accept filter");
+SYSCTL_INT(_net_inet_accf_http, OID_AUTO, parsehttpversion, CTLFLAG_RW,
+&parse_http_version, 1,
+"Parse http version so that non 1.x requests work");
+
+#ifdef ACCF_HTTP_DEBUG
+#define DPRINT(fmt, args...) \
+ do { \
+ printf("%s:%d: " fmt "\n", __func__, __LINE__, ##args); \
+ } while (0)
+#else
+#define DPRINT(fmt, args...)
+#endif
+
+static int
+sbfull(struct sockbuf *sb)
+{
+
+ DPRINT("sbfull, cc(%ld) >= hiwat(%ld): %d, "
+ "mbcnt(%ld) >= mbmax(%ld): %d",
+ sb->sb_cc, sb->sb_hiwat, sb->sb_cc >= sb->sb_hiwat,
+ sb->sb_mbcnt, sb->sb_mbmax, sb->sb_mbcnt >= sb->sb_mbmax);
+ return (sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax);
+}
+
+/*
+ * start at mbuf m, (must provide npkt if exists)
+ * starting at offset in m compare characters in mbuf chain for 'cmp'
+ */
+static int
+mbufstrcmp(struct mbuf *m, struct mbuf *npkt, int offset, char *cmp)
+{
+ struct mbuf *n;
+
+ for (; m != NULL; m = n) {
+ n = npkt;
+ if (npkt)
+ npkt = npkt->m_nextpkt;
+ for (; m; m = m->m_next) {
+ for (; offset < m->m_len; offset++, cmp++) {
+ if (*cmp == '\0')
+ return (1);
+ else if (*cmp != *(mtod(m, char *) + offset))
+ return (0);
+ }
+ if (*cmp == '\0')
+ return (1);
+ offset = 0;
+ }
+ }
+ return (0);
+}
+
+/*
+ * start at mbuf m, (must provide npkt if exists)
+ * starting at offset in m compare characters in mbuf chain for 'cmp'
+ * stop at 'max' characters
+ */
+static int
+mbufstrncmp(struct mbuf *m, struct mbuf *npkt, int offset, int max, char *cmp)
+{
+ struct mbuf *n;
+
+ for (; m != NULL; m = n) {
+ n = npkt;
+ if (npkt)
+ npkt = npkt->m_nextpkt;
+ for (; m; m = m->m_next) {
+ for (; offset < m->m_len; offset++, cmp++, max--) {
+ if (max == 0 || *cmp == '\0')
+ return (1);
+ else if (*cmp != *(mtod(m, char *) + offset))
+ return (0);
+ }
+ if (max == 0 || *cmp == '\0')
+ return (1);
+ offset = 0;
+ }
+ }
+ return (0);
+}
+
+#define STRSETUP(sptr, slen, str) \
+ do { \
+ sptr = str; \
+ slen = sizeof(str) - 1; \
+ } while(0)
+
+static int
+sohashttpget(struct socket *so, void *arg, int waitflag)
+{
+
+ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) {
+ struct mbuf *m;
+ char *cmp;
+ int cmplen, cc;
+
+ m = so->so_rcv.sb_mb;
+ cc = so->so_rcv.sb_cc - 1;
+ if (cc < 1)
+ return (SU_OK);
+ switch (*mtod(m, char *)) {
+ case 'G':
+ STRSETUP(cmp, cmplen, "ET ");
+ break;
+ case 'H':
+ STRSETUP(cmp, cmplen, "EAD ");
+ break;
+ default:
+ goto fallout;
+ }
+ if (cc < cmplen) {
+ if (mbufstrncmp(m, m->m_nextpkt, 1, cc, cmp) == 1) {
+ DPRINT("short cc (%d) but mbufstrncmp ok", cc);
+ return (SU_OK);
+ } else {
+ DPRINT("short cc (%d) mbufstrncmp failed", cc);
+ goto fallout;
+ }
+ }
+ if (mbufstrcmp(m, m->m_nextpkt, 1, cmp) == 1) {
+ DPRINT("mbufstrcmp ok");
+ if (parse_http_version == 0)
+ return (soishttpconnected(so, arg, waitflag));
+ else
+ return (soparsehttpvers(so, arg, waitflag));
+ }
+ DPRINT("mbufstrcmp bad");
+ }
+
+fallout:
+ DPRINT("fallout");
+ return (SU_ISCONNECTED);
+}
+
+static int
+soparsehttpvers(struct socket *so, void *arg, int waitflag)
+{
+ struct mbuf *m, *n;
+ int i, cc, spaces, inspaces;
+
+ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv))
+ goto fallout;
+
+ m = so->so_rcv.sb_mb;
+ cc = so->so_rcv.sb_cc;
+ inspaces = spaces = 0;
+ for (m = so->so_rcv.sb_mb; m; m = n) {
+ n = m->m_nextpkt;
+ for (; m; m = m->m_next) {
+ for (i = 0; i < m->m_len; i++, cc--) {
+ switch (*(mtod(m, char *) + i)) {
+ case ' ':
+ /* tabs? '\t' */
+ if (!inspaces) {
+ spaces++;
+ inspaces = 1;
+ }
+ break;
+ case '\r':
+ case '\n':
+ DPRINT("newline");
+ goto fallout;
+ default:
+ if (spaces != 2) {
+ inspaces = 0;
+ break;
+ }
+
+ /*
+ * if we don't have enough characters
+ * left (cc < sizeof("HTTP/1.0") - 1)
+ * then see if the remaining ones
+ * are a request we can parse.
+ */
+ if (cc < sizeof("HTTP/1.0") - 1) {
+ if (mbufstrncmp(m, n, i, cc,
+ "HTTP/1.") == 1) {
+ DPRINT("ok");
+ goto readmore;
+ } else {
+ DPRINT("bad");
+ goto fallout;
+ }
+ } else if (
+ mbufstrcmp(m, n, i, "HTTP/1.0") ||
+ mbufstrcmp(m, n, i, "HTTP/1.1")) {
+ DPRINT("ok");
+ return (soishttpconnected(so,
+ arg, waitflag));
+ } else {
+ DPRINT("bad");
+ goto fallout;
+ }
+ }
+ }
+ }
+ }
+readmore:
+ DPRINT("readmore");
+ /*
+ * if we hit here we haven't hit something
+ * we don't understand or a newline, so try again
+ */
+ soupcall_set(so, SO_RCV, soparsehttpvers, arg);
+ return (SU_OK);
+
+fallout:
+ DPRINT("fallout");
+ return (SU_ISCONNECTED);
+}
+
+
+#define NCHRS 3
+
+static int
+soishttpconnected(struct socket *so, void *arg, int waitflag)
+{
+ char a, b, c;
+ struct mbuf *m, *n;
+ int ccleft, copied;
+
+ DPRINT("start");
+ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv))
+ goto gotit;
+
+ /*
+ * Walk the socketbuffer and copy the last NCHRS (3) into a, b, and c
+ * copied - how much we've copied so far
+ * ccleft - how many bytes remaining in the socketbuffer
+ * just loop over the mbufs subtracting from 'ccleft' until we only
+ * have NCHRS left
+ */
+ copied = 0;
+ ccleft = so->so_rcv.sb_cc;
+ if (ccleft < NCHRS)
+ goto readmore;
+ a = b = c = '\0';
+ for (m = so->so_rcv.sb_mb; m; m = n) {
+ n = m->m_nextpkt;
+ for (; m; m = m->m_next) {
+ ccleft -= m->m_len;
+ if (ccleft <= NCHRS) {
+ char *src;
+ int tocopy;
+
+ tocopy = (NCHRS - ccleft) - copied;
+ src = mtod(m, char *) + (m->m_len - tocopy);
+
+ while (tocopy--) {
+ switch (copied++) {
+ case 0:
+ a = *src++;
+ break;
+ case 1:
+ b = *src++;
+ break;
+ case 2:
+ c = *src++;
+ break;
+ }
+ }
+ }
+ }
+ }
+ if (c == '\n' && (b == '\n' || (b == '\r' && a == '\n'))) {
+ /* we have all request headers */
+ goto gotit;
+ }
+
+readmore:
+ soupcall_set(so, SO_RCV, soishttpconnected, arg);
+ return (SU_OK);
+
+gotit:
+ return (SU_ISCONNECTED);
+}
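
soishttpconnected() above declares the request complete once the last three
bytes received close an empty line, i.e. the buffer ends in "\n\n" or
"\n\r\n". The same test, restated on a contiguous buffer (hypothetical helper,
for illustration only):

  #include <stddef.h>

  /* Mirrors the a/b/c check above: have we seen the blank line that
   * terminates the HTTP request headers? */
  static int
  http_headers_complete(const char *buf, size_t len)
  {
          char a, b, c;

          if (len < 3)
                  return (0);
          a = buf[len - 3];
          b = buf[len - 2];
          c = buf[len - 1];
          return (c == '\n' && (b == '\n' || (b == '\r' && a == '\n')));
  }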
diff --git a/freebsd/sys/netinet/icmp6.h b/freebsd/sys/netinet/icmp6.h
new file mode 100644
index 00000000..bf61ac5b
--- /dev/null
+++ b/freebsd/sys/netinet/icmp6.h
@@ -0,0 +1,2 @@
+#include <freebsd/bsd.h>
+#include <freebsd/netinet/icmp6.h>
diff --git a/freebsd/sys/netinet/icmp_var.h b/freebsd/sys/netinet/icmp_var.h
new file mode 100644
index 00000000..d55fc4d3
--- /dev/null
+++ b/freebsd/sys/netinet/icmp_var.h
@@ -0,0 +1,108 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)icmp_var.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_ICMP_VAR_HH_
+#define _NETINET_ICMP_VAR_HH_
+
+
+/*
+ * Variables related to this implementation
+ * of the internet control message protocol.
+ */
+struct icmpstat {
+/* statistics related to icmp packets generated */
+ u_long icps_error; /* # of calls to icmp_error */
+ u_long icps_oldshort; /* no error 'cuz old ip too short */
+ u_long icps_oldicmp; /* no error 'cuz old was icmp */
+ u_long icps_outhist[ICMP_MAXTYPE + 1];
+/* statistics related to input messages processed */
+ u_long icps_badcode; /* icmp_code out of range */
+ u_long icps_tooshort; /* packet < ICMP_MINLEN */
+ u_long icps_checksum; /* bad checksum */
+ u_long icps_badlen; /* calculated bound mismatch */
+ u_long icps_reflect; /* number of responses */
+ u_long icps_inhist[ICMP_MAXTYPE + 1];
+ u_long icps_bmcastecho; /* b/mcast echo requests dropped */
+ u_long icps_bmcasttstamp; /* b/mcast tstamp requests dropped */
+ u_long icps_badaddr; /* bad return address */
+ u_long icps_noroute; /* no route back */
+};
+
+#ifdef _KERNEL
+/*
+ * In-kernel consumers can use these accessor macros directly to update
+ * stats.
+ */
+#define ICMPSTAT_ADD(name, val) V_icmpstat.name += (val)
+#define ICMPSTAT_INC(name) ICMPSTAT_ADD(name, 1)
+
+/*
+ * Kernel module consumers must use this accessor macro.
+ */
+void kmod_icmpstat_inc(int statnum);
+#define KMOD_ICMPSTAT_INC(name) \
+ kmod_icmpstat_inc(offsetof(struct icmpstat, name) / sizeof(u_long))
+#endif
+
+/*
+ * Names for ICMP sysctl objects
+ */
+#define ICMPCTL_MASKREPL 1 /* allow replies to netmask requests */
+#define ICMPCTL_STATS 2 /* statistics (read-only) */
+#define ICMPCTL_ICMPLIM 3
+#define ICMPCTL_MAXID 4
+
+#define ICMPCTL_NAMES { \
+ { 0, 0 }, \
+ { "maskrepl", CTLTYPE_INT }, \
+ { "stats", CTLTYPE_STRUCT }, \
+ { "icmplim", CTLTYPE_INT }, \
+}
+
+#ifdef _KERNEL
+SYSCTL_DECL(_net_inet_icmp);
+
+VNET_DECLARE(struct icmpstat, icmpstat); /* icmp statistics. */
+#define V_icmpstat VNET(icmpstat)
+
+extern int badport_bandlim(int);
+#define BANDLIM_UNLIMITED -1
+#define BANDLIM_ICMP_UNREACH 0
+#define BANDLIM_ICMP_ECHO 1
+#define BANDLIM_ICMP_TSTAMP 2
+#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */
+#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */
+#define BANDLIM_ICMP6_UNREACH 5
+#define BANDLIM_MAX 5
+#endif
+
+#endif
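
KMOD_ICMPSTAT_INC() above works because struct icmpstat is nothing but
consecutive u_long counters, so a field's byte offset divided by
sizeof(u_long) is a stable slot number that kmod_icmpstat_inc() can apply to
the per-VNET copy. A toy userland illustration of that arithmetic (local
struct for the sketch, not the kernel one):

  #include <stddef.h>
  #include <stdio.h>

  struct toy_stat {
          unsigned long toy_error;        /* slot 0 */
          unsigned long toy_tooshort;     /* slot 1 */
          unsigned long toy_checksum;     /* slot 2 */
  };

  int
  main(void)
  {
          printf("toy_checksum -> slot %zu\n",
              offsetof(struct toy_stat, toy_checksum) /
              sizeof(unsigned long));
          return (0);
  }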
diff --git a/freebsd/sys/netinet/if_atm.c b/freebsd/sys/netinet/if_atm.c
new file mode 100644
index 00000000..ea6c567d
--- /dev/null
+++ b/freebsd/sys/netinet/if_atm.c
@@ -0,0 +1,366 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/* $NetBSD: if_atm.c,v 1.6 1996/10/13 02:03:01 christos Exp $ */
+
+/*-
+ *
+ * Copyright (c) 1996 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Charles D. Cranor and
+ * Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * IP <=> ATM address resolution.
+ */
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_natm.h>
+
+#if defined(INET) || defined(INET6)
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/queue.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/sockio.h>
+#include <freebsd/sys/syslog.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_dl.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/if_atm.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/if_atm.h>
+
+#ifdef NATM
+#include <freebsd/netnatm/natm.h>
+#endif
+
+#define SDL(s) ((struct sockaddr_dl *)s)
+
+#define GET3BYTE(V, A, L) do { \
+ (V) = ((A)[0] << 16) | ((A)[1] << 8) | (A)[2]; \
+ (A) += 3; \
+ (L) -= 3; \
+ } while (0)
+
+#define GET2BYTE(V, A, L) do { \
+ (V) = ((A)[0] << 8) | (A)[1]; \
+ (A) += 2; \
+ (L) -= 2; \
+ } while (0)
+
+#define GET1BYTE(V, A, L) do { \
+ (V) = *(A)++; \
+ (L)--; \
+ } while (0)
+
+
+/*
+ * atm_rtrequest: handle ATM rt request (in support of generic code)
+ * inputs: "req" = request code
+ * "rt" = route entry
+ * "info" = rt_addrinfo
+ */
+void
+atm_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info)
+{
+ struct sockaddr *gate = rt->rt_gateway;
+ struct atmio_openvcc op;
+ struct atmio_closevcc cl;
+ u_char *addr;
+ u_int alen;
+#ifdef NATM
+ struct sockaddr_in *sin;
+ struct natmpcb *npcb = NULL;
+#endif
+ static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
+
+ if (rt->rt_flags & RTF_GATEWAY) /* link level requests only */
+ return;
+
+ switch (req) {
+
+ case RTM_RESOLVE: /* resolve: only happens when cloning */
+ printf("atm_rtrequest: RTM_RESOLVE request detected?\n");
+ break;
+
+ case RTM_ADD:
+ /*
+ * route added by a command (e.g. ifconfig, route, arp...).
+ *
+ * first check to see if this is not a host route, in which
+ * case we are being called via "ifconfig" to set the address.
+ */
+ if ((rt->rt_flags & RTF_HOST) == 0) {
+ rt_setgate(rt,rt_key(rt),(struct sockaddr *)&null_sdl);
+ gate = rt->rt_gateway;
+ SDL(gate)->sdl_type = rt->rt_ifp->if_type;
+ SDL(gate)->sdl_index = rt->rt_ifp->if_index;
+ break;
+ }
+
+ if (gate->sa_family != AF_LINK ||
+ gate->sa_len < sizeof(null_sdl)) {
+ log(LOG_DEBUG, "atm_rtrequest: bad gateway value");
+ break;
+ }
+
+ KASSERT(rt->rt_ifp->if_ioctl != NULL,
+ ("atm_rtrequest: null ioctl"));
+
+ /*
+ * Parse and verify the link level address as
+ * an open request
+ */
+#ifdef NATM
+ NATM_LOCK();
+#endif
+ bzero(&op, sizeof(op));
+ addr = LLADDR(SDL(gate));
+ alen = SDL(gate)->sdl_alen;
+ if (alen < 4) {
+ printf("%s: bad link-level address\n", __func__);
+ goto failed;
+ }
+
+ if (alen == 4) {
+ /* old type address */
+ GET1BYTE(op.param.flags, addr, alen);
+ GET1BYTE(op.param.vpi, addr, alen);
+ GET2BYTE(op.param.vci, addr, alen);
+ op.param.traffic = ATMIO_TRAFFIC_UBR;
+ op.param.aal = (op.param.flags & ATM_PH_AAL5) ?
+ ATMIO_AAL_5 : ATMIO_AAL_0;
+ } else {
+ /* new address */
+ op.param.aal = ATMIO_AAL_5;
+
+ GET1BYTE(op.param.flags, addr, alen);
+ op.param.flags &= ATM_PH_LLCSNAP;
+
+ GET1BYTE(op.param.vpi, addr, alen);
+ GET2BYTE(op.param.vci, addr, alen);
+
+ GET1BYTE(op.param.traffic, addr, alen);
+
+ switch (op.param.traffic) {
+
+ case ATMIO_TRAFFIC_UBR:
+ if (alen >= 3)
+ GET3BYTE(op.param.tparam.pcr,
+ addr, alen);
+ break;
+
+ case ATMIO_TRAFFIC_CBR:
+ if (alen < 3)
+ goto bad_param;
+ GET3BYTE(op.param.tparam.pcr, addr, alen);
+ break;
+
+ case ATMIO_TRAFFIC_VBR:
+ if (alen < 3 * 3)
+ goto bad_param;
+ GET3BYTE(op.param.tparam.pcr, addr, alen);
+ GET3BYTE(op.param.tparam.scr, addr, alen);
+ GET3BYTE(op.param.tparam.mbs, addr, alen);
+ break;
+
+ case ATMIO_TRAFFIC_ABR:
+ if (alen < 4 * 3 + 2 + 1 * 2 + 3)
+ goto bad_param;
+ GET3BYTE(op.param.tparam.pcr, addr, alen);
+ GET3BYTE(op.param.tparam.mcr, addr, alen);
+ GET3BYTE(op.param.tparam.icr, addr, alen);
+ GET3BYTE(op.param.tparam.tbe, addr, alen);
+ GET1BYTE(op.param.tparam.nrm, addr, alen);
+ GET1BYTE(op.param.tparam.trm, addr, alen);
+ GET2BYTE(op.param.tparam.adtf, addr, alen);
+ GET1BYTE(op.param.tparam.rif, addr, alen);
+ GET1BYTE(op.param.tparam.rdf, addr, alen);
+ GET1BYTE(op.param.tparam.cdf, addr, alen);
+ break;
+
+ default:
+ bad_param:
+ printf("%s: bad traffic params\n", __func__);
+ goto failed;
+ }
+ }
+ op.param.rmtu = op.param.tmtu = rt->rt_ifp->if_mtu;
+#ifdef NATM
+ /*
+ * let native ATM know we are using this VCI/VPI
+ * (i.e. reserve it)
+ */
+ sin = (struct sockaddr_in *) rt_key(rt);
+ if (sin->sin_family != AF_INET)
+ goto failed;
+ npcb = npcb_add(NULL, rt->rt_ifp, op.param.vci, op.param.vpi);
+ if (npcb == NULL)
+ goto failed;
+ npcb->npcb_flags |= NPCB_IP;
+ npcb->ipaddr.s_addr = sin->sin_addr.s_addr;
+ /* XXX: move npcb to llinfo when ATM ARP is ready */
+ rt->rt_llinfo = (caddr_t) npcb;
+ rt->rt_flags |= RTF_LLINFO;
+#endif
+ /*
+ * let the lower level know this circuit is active
+ */
+ op.rxhand = NULL;
+ op.param.flags |= ATMIO_FLAG_ASYNC;
+ if (rt->rt_ifp->if_ioctl(rt->rt_ifp, SIOCATMOPENVCC,
+ (caddr_t)&op) != 0) {
+ printf("atm: couldn't add VC\n");
+ goto failed;
+ }
+
+ SDL(gate)->sdl_type = rt->rt_ifp->if_type;
+ SDL(gate)->sdl_index = rt->rt_ifp->if_index;
+
+#ifdef NATM
+ NATM_UNLOCK();
+#endif
+ break;
+
+failed:
+#ifdef NATM
+ if (npcb) {
+ npcb_free(npcb, NPCB_DESTROY);
+ rt->rt_llinfo = NULL;
+ rt->rt_flags &= ~RTF_LLINFO;
+ }
+ NATM_UNLOCK();
+#endif
+ /* mark as invalid. We cannot RTM_DELETE the route from
+ * here, because the recursive call to rtrequest1 does
+ * not really work. */
+ rt->rt_flags |= RTF_REJECT;
+ break;
+
+ case RTM_DELETE:
+#ifdef NATM
+ /*
+ * tell native ATM we are done with this VC
+ */
+ if (rt->rt_flags & RTF_LLINFO) {
+ NATM_LOCK();
+ npcb_free((struct natmpcb *)rt->rt_llinfo,
+ NPCB_DESTROY);
+ rt->rt_llinfo = NULL;
+ rt->rt_flags &= ~RTF_LLINFO;
+ NATM_UNLOCK();
+ }
+#endif
+ /*
+ * tell the lower layer to disable this circuit
+ */
+ bzero(&op, sizeof(op));
+ addr = LLADDR(SDL(gate));
+ addr++;
+ cl.vpi = *addr++;
+ cl.vci = *addr++ << 8;
+ cl.vci |= *addr++;
+ (void)rt->rt_ifp->if_ioctl(rt->rt_ifp, SIOCATMCLOSEVCC,
+ (caddr_t)&cl);
+ break;
+ }
+}
+
+/*
+ * atmresolve:
+ * inputs:
+ * [1] "rt" = the link level route to use (or null if need to look one up)
+ * [2] "m" = mbuf containing the data to be sent
+ * [3] "dst" = sockaddr_in (IP) address of dest.
+ * output:
+ * [4] "desten" = ATM pseudo header which we will fill in VPI/VCI info
+ * return:
+ * 0 == resolve FAILED; note that "m" gets m_freem'd in this case
+ * 1 == resolve OK; desten contains result
+ *
+ * XXX: will need more work if we wish to support ATMARP in the kernel,
+ * but this is enough for PVCs entered via the "route" command.
+ */
+int
+atmresolve(struct rtentry *rt, struct mbuf *m, struct sockaddr *dst,
+ struct atm_pseudohdr *desten)
+{
+ struct sockaddr_dl *sdl;
+
+ if (m->m_flags & (M_BCAST | M_MCAST)) {
+ log(LOG_INFO,
+ "atmresolve: BCAST/MCAST packet detected/dumped\n");
+ goto bad;
+ }
+
+ if (rt == NULL) {
+ rt = RTALLOC1(dst, 0); /* link level on table 0 XXX MRT */
+ if (rt == NULL)
+ goto bad; /* failed */
+ RT_REMREF(rt); /* don't keep LL references */
+ if ((rt->rt_flags & RTF_GATEWAY) != 0 ||
+ rt->rt_gateway->sa_family != AF_LINK) {
+ RT_UNLOCK(rt);
+ goto bad;
+ }
+ RT_UNLOCK(rt);
+ }
+
+ /*
+ * note that rt_gateway is a sockaddr_dl which contains the
+ * atm_pseudohdr data structure for this route. we currently
+ * don't need any rt_llinfo info (but will if we want to support
+ * ATM ARP [c.f. if_ether.c]).
+ */
+ sdl = SDL(rt->rt_gateway);
+
+ /*
+ * Check the address family and length is valid, the address
+ * is resolved; otherwise, try to resolve.
+ */
+ if (sdl->sdl_family == AF_LINK && sdl->sdl_alen >= sizeof(*desten)) {
+ bcopy(LLADDR(sdl), desten, sizeof(*desten));
+ return (1); /* ok, go for it! */
+ }
+
+ /*
+ * we got an entry, but it doesn't have valid link address
+ * info in it (it is prob. the interface route, which has
+ * sdl_alen == 0). dump packet. (fall through to "bad").
+ */
+bad:
+ m_freem(m);
+ return (0);
+}
+#endif /* INET */
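
The RTM_ADD case above recovers the PVC from the route's AF_LINK gateway
address: the legacy 4-byte form carries a flags byte, the VPI, and the VCI as
a big-endian 16-bit value, while the longer form appends AAL and traffic
parameters consumed by GET1BYTE/GET2BYTE/GET3BYTE. A minimal encoder for the
legacy form (sketch only; the helper name is not part of the source above):

  /* Pack the legacy 4-byte ATM link-level address that the RTM_ADD
   * handler unpacks with GET1BYTE/GET2BYTE. */
  static void
  atm_pack_lladdr(unsigned char out[4], unsigned char flags,
      unsigned char vpi, unsigned short vci)
  {
          out[0] = flags;                 /* e.g. ATM_PH_AAL5 */
          out[1] = vpi;
          out[2] = (vci >> 8) & 0xff;     /* VCI, high byte first */
          out[3] = vci & 0xff;
  }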
diff --git a/freebsd/sys/netinet/if_atm.h b/freebsd/sys/netinet/if_atm.h
new file mode 100644
index 00000000..bd8b5143
--- /dev/null
+++ b/freebsd/sys/netinet/if_atm.h
@@ -0,0 +1,47 @@
+/* $FreeBSD$ */
+/* $NetBSD: if_atm.h,v 1.2 1996/07/03 17:17:17 chuck Exp $ */
+
+/*-
+ *
+ * Copyright (c) 1996 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Charles D. Cranor and
+ * Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * if_atm.h
+ */
+
+struct atm_pseudohdr;
+struct mbuf;
+struct rtentry;
+struct sockaddr;
+
+void atm_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
+int atmresolve(struct rtentry *, struct mbuf *, struct sockaddr *,
+ struct atm_pseudohdr *);
diff --git a/freebsd/sys/netinet/if_ether.c b/freebsd/sys/netinet/if_ether.c
new file mode 100644
index 00000000..2e40c0d2
--- /dev/null
+++ b/freebsd/sys/netinet/if_ether.c
@@ -0,0 +1,859 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)if_ether.c 8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Ethernet address resolution protocol.
+ * TODO:
+ * add "inuse/lock" bit (or ref. count) along with valid bit
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/queue.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/syslog.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_dl.h>
+#include <freebsd/net/if_types.h>
+#include <freebsd/net/netisr.h>
+#include <freebsd/net/if_llc.h>
+#include <freebsd/net/ethernet.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/net/if_llatbl.h>
+#include <freebsd/netinet/if_ether.h>
+#if defined(INET) || defined(INET6)
+#include <freebsd/netinet/ip_carp.h>
+#endif
+
+#include <freebsd/net/if_arc.h>
+#include <freebsd/net/iso88025.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+#define SIN(s) ((struct sockaddr_in *)s)
+#define SDL(s) ((struct sockaddr_dl *)s)
+
+SYSCTL_DECL(_net_link_ether);
+SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, "");
+SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, "");
+
+/* timer values */
+static VNET_DEFINE(int, arpt_keep) = (20*60); /* once resolved, good for 20
+ * minutes */
+static VNET_DEFINE(int, arp_maxtries) = 5;
+VNET_DEFINE(int, useloopback) = 1; /* use loopback interface for
+ * local traffic */
+static VNET_DEFINE(int, arp_proxyall) = 0;
+static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for
+ * 20 seconds */
+static VNET_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */
+
+#define V_arpt_keep VNET(arpt_keep)
+#define V_arpt_down VNET(arpt_down)
+#define V_arp_maxtries VNET(arp_maxtries)
+#define V_arp_proxyall VNET(arp_proxyall)
+#define V_arpstat VNET(arpstat)
+
+SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW,
+ &VNET_NAME(arpt_keep), 0,
+ "ARP entry lifetime in seconds");
+SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW,
+ &VNET_NAME(arp_maxtries), 0,
+ "ARP resolution attempts before returning error");
+SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW,
+ &VNET_NAME(useloopback), 0,
+ "Use the loopback interface for local traffic");
+SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW,
+ &VNET_NAME(arp_proxyall), 0,
+ "Enable proxy ARP for all suitable requests");
+SYSCTL_VNET_STRUCT(_net_link_ether_arp, OID_AUTO, stats, CTLFLAG_RW,
+ &VNET_NAME(arpstat), arpstat,
+ "ARP statistics (struct arpstat, net/if_arp.h)");
+
+static void arp_init(void);
+void arprequest(struct ifnet *,
+ struct in_addr *, struct in_addr *, u_char *);
+static void arpintr(struct mbuf *);
+static void arptimer(void *);
+#ifdef INET
+static void in_arpinput(struct mbuf *);
+#endif
+
+static const struct netisr_handler arp_nh = {
+ .nh_name = "arp",
+ .nh_handler = arpintr,
+ .nh_proto = NETISR_ARP,
+ .nh_policy = NETISR_POLICY_SOURCE,
+};
+
+#ifdef AF_INET
+void arp_ifscrub(struct ifnet *ifp, uint32_t addr);
+
+/*
+ * called by in_ifscrub to remove entry from the table when
+ * the interface goes away
+ */
+void
+arp_ifscrub(struct ifnet *ifp, uint32_t addr)
+{
+ struct sockaddr_in addr4;
+
+ bzero((void *)&addr4, sizeof(addr4));
+ addr4.sin_len = sizeof(addr4);
+ addr4.sin_family = AF_INET;
+ addr4.sin_addr.s_addr = addr;
+ IF_AFDATA_LOCK(ifp);
+ lla_lookup(LLTABLE(ifp), (LLE_DELETE | LLE_IFADDR),
+ (struct sockaddr *)&addr4);
+ IF_AFDATA_UNLOCK(ifp);
+}
+#endif
+
+/*
+ * Timeout routine. Age arp_tab entries periodically.
+ */
+static void
+arptimer(void *arg)
+{
+ struct ifnet *ifp;
+ struct llentry *lle;
+
+ KASSERT(arg != NULL, ("%s: arg NULL", __func__));
+ lle = (struct llentry *)arg;
+ ifp = lle->lle_tbl->llt_ifp;
+ CURVNET_SET(ifp->if_vnet);
+ IF_AFDATA_LOCK(ifp);
+ LLE_WLOCK(lle);
+ if (lle->la_flags & LLE_STATIC)
+ LLE_WUNLOCK(lle);
+ else {
+ if (!callout_pending(&lle->la_timer) &&
+ callout_active(&lle->la_timer)) {
+ callout_stop(&lle->la_timer);
+ LLE_REMREF(lle);
+ (void) llentry_free(lle);
+ ARPSTAT_INC(timeouts);
+ }
+#ifdef DIAGNOSTIC
+ else {
+ struct sockaddr *l3addr = L3_ADDR(lle);
+ log(LOG_INFO,
+ "arptimer issue: %p, IPv4 address: \"%s\"\n", lle,
+ inet_ntoa(
+ ((const struct sockaddr_in *)l3addr)->sin_addr));
+ }
+#endif
+ }
+ IF_AFDATA_UNLOCK(ifp);
+ CURVNET_RESTORE();
+}
+
+/*
+ * Broadcast an ARP request. Caller specifies:
+ * - arp header source ip address
+ * - arp header target ip address
+ * - arp header source ethernet address
+ */
+void
+arprequest(struct ifnet *ifp, struct in_addr *sip, struct in_addr *tip,
+ u_char *enaddr)
+{
+ struct mbuf *m;
+ struct arphdr *ah;
+ struct sockaddr sa;
+
+ if (sip == NULL) {
+ /* XXX don't believe this can happen (or explain why) */
+ /*
+ * The caller did not supply a source address, try to find
+ * a compatible one among those assigned to this interface.
+ */
+ struct ifaddr *ifa;
+
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (!ifa->ifa_addr ||
+ ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ sip = &SIN(ifa->ifa_addr)->sin_addr;
+ if (0 == ((sip->s_addr ^ tip->s_addr) &
+ SIN(ifa->ifa_netmask)->sin_addr.s_addr) )
+ break; /* found it. */
+ }
+ if (sip == NULL) {
+ printf("%s: cannot find matching address\n", __func__);
+ return;
+ }
+ }
+
+ if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
+ return;
+ m->m_len = sizeof(*ah) + 2*sizeof(struct in_addr) +
+ 2*ifp->if_data.ifi_addrlen;
+ m->m_pkthdr.len = m->m_len;
+ MH_ALIGN(m, m->m_len);
+ ah = mtod(m, struct arphdr *);
+ bzero((caddr_t)ah, m->m_len);
+#ifdef MAC
+ mac_netinet_arp_send(ifp, m);
+#endif
+ ah->ar_pro = htons(ETHERTYPE_IP);
+ ah->ar_hln = ifp->if_addrlen; /* hardware address length */
+ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */
+ ah->ar_op = htons(ARPOP_REQUEST);
+ bcopy((caddr_t)enaddr, (caddr_t)ar_sha(ah), ah->ar_hln);
+ bcopy((caddr_t)sip, (caddr_t)ar_spa(ah), ah->ar_pln);
+ bcopy((caddr_t)tip, (caddr_t)ar_tpa(ah), ah->ar_pln);
+ sa.sa_family = AF_ARP;
+ sa.sa_len = 2;
+ m->m_flags |= M_BCAST;
+ (*ifp->if_output)(ifp, m, &sa, NULL);
+ ARPSTAT_INC(txrequests);
+}
+
+/*
+ * Resolve an IP address into an ethernet address.
+ * On input:
+ * ifp is the interface we use
+ * rt0 is the route to the final destination (possibly useless)
+ * m is the mbuf. May be NULL if we don't have a packet.
+ * dst is the next hop,
+ * desten is where we want the address.
+ *
+ * On success, desten is filled in and the function returns 0;
+ * If the packet must be held pending resolution, we return EWOULDBLOCK
+ * On other errors, we return the corresponding error code.
+ * Note that m_freem() handles NULL.
+ */
+int
+arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m,
+ struct sockaddr *dst, u_char *desten, struct llentry **lle)
+{
+ struct llentry *la = 0;
+ u_int flags = 0;
+ int error, renew;
+
+ *lle = NULL;
+ if (m != NULL) {
+ if (m->m_flags & M_BCAST) {
+ /* broadcast */
+ (void)memcpy(desten,
+ ifp->if_broadcastaddr, ifp->if_addrlen);
+ return (0);
+ }
+ if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) {
+ /* multicast */
+ ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
+ return (0);
+ }
+ }
+ /* XXXXX
+ */
+retry:
+ IF_AFDATA_RLOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), flags, dst);
+ IF_AFDATA_RUNLOCK(ifp);
+ if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0)
+ && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) {
+ flags |= (LLE_CREATE | LLE_EXCLUSIVE);
+ IF_AFDATA_WLOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), flags, dst);
+ IF_AFDATA_WUNLOCK(ifp);
+ }
+ if (la == NULL) {
+ if (flags & LLE_CREATE)
+ log(LOG_DEBUG,
+ "arpresolve: can't allocate llinfo for %s\n",
+ inet_ntoa(SIN(dst)->sin_addr));
+ m_freem(m);
+ return (EINVAL);
+ }
+
+ if ((la->la_flags & LLE_VALID) &&
+ ((la->la_flags & LLE_STATIC) || la->la_expire > time_second)) {
+ bcopy(&la->ll_addr, desten, ifp->if_addrlen);
+ /*
+ * If entry has an expiry time and it is approaching,
+ * see if we need to send an ARP request within this
+ * arpt_down interval.
+ */
+ if (!(la->la_flags & LLE_STATIC) &&
+ time_second + la->la_preempt > la->la_expire) {
+ arprequest(ifp, NULL,
+ &SIN(dst)->sin_addr, IF_LLADDR(ifp));
+
+ la->la_preempt--;
+ }
+
+ *lle = la;
+ error = 0;
+ goto done;
+ }
+
+ if (la->la_flags & LLE_STATIC) { /* should not happen! */
+ log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n",
+ inet_ntoa(SIN(dst)->sin_addr));
+ m_freem(m);
+ error = EINVAL;
+ goto done;
+ }
+
+ renew = (la->la_asked == 0 || la->la_expire != time_second);
+ if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) {
+ flags |= LLE_EXCLUSIVE;
+ LLE_RUNLOCK(la);
+ goto retry;
+ }
+ /*
+ * There is an arptab entry, but no ethernet address
+ * response yet. Replace the held mbuf with this
+ * latest one.
+ */
+ if (m != NULL) {
+ if (la->la_hold != NULL) {
+ m_freem(la->la_hold);
+ ARPSTAT_INC(dropped);
+ }
+ la->la_hold = m;
+ if (renew == 0 && (flags & LLE_EXCLUSIVE)) {
+ flags &= ~LLE_EXCLUSIVE;
+ LLE_DOWNGRADE(la);
+ }
+
+ }
+ /*
+ * Return EWOULDBLOCK if we have tried less than arp_maxtries. It
+ * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH
+ * if we have already sent arp_maxtries ARP requests. Retransmit the
+ * ARP request, but not faster than one request per second.
+ */
+ if (la->la_asked < V_arp_maxtries)
+ error = EWOULDBLOCK; /* First request. */
+ else
+ error = rt0 != NULL && (rt0->rt_flags & RTF_GATEWAY) ?
+ EHOSTUNREACH : EHOSTDOWN;
+
+ if (renew) {
+ int canceled;
+
+ LLE_ADDREF(la);
+ la->la_expire = time_second;
+ canceled = callout_reset(&la->la_timer, hz * V_arpt_down,
+ arptimer, la);
+ if (canceled)
+ LLE_REMREF(la);
+ la->la_asked++;
+ LLE_WUNLOCK(la);
+ arprequest(ifp, NULL, &SIN(dst)->sin_addr,
+ IF_LLADDR(ifp));
+ return (error);
+ }
+done:
+ if (flags & LLE_EXCLUSIVE)
+ LLE_WUNLOCK(la);
+ else
+ LLE_RUNLOCK(la);
+ return (error);
+}
+
+/*
+ * Common length and type checks are done here,
+ * then the protocol-specific routine is called.
+ */
+static void
+arpintr(struct mbuf *m)
+{
+ struct arphdr *ar;
+
+ if (m->m_len < sizeof(struct arphdr) &&
+ ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
+ log(LOG_ERR, "arp: runt packet -- m_pullup failed\n");
+ return;
+ }
+ ar = mtod(m, struct arphdr *);
+
+ if (ntohs(ar->ar_hrd) != ARPHRD_ETHER &&
+ ntohs(ar->ar_hrd) != ARPHRD_IEEE802 &&
+ ntohs(ar->ar_hrd) != ARPHRD_ARCNET &&
+ ntohs(ar->ar_hrd) != ARPHRD_IEEE1394) {
+ log(LOG_ERR, "arp: unknown hardware address format (0x%2D)\n",
+ (unsigned char *)&ar->ar_hrd, "");
+ m_freem(m);
+ return;
+ }
+
+ if (m->m_len < arphdr_len(ar)) {
+ if ((m = m_pullup(m, arphdr_len(ar))) == NULL) {
+ log(LOG_ERR, "arp: runt packet\n");
+ m_freem(m);
+ return;
+ }
+ ar = mtod(m, struct arphdr *);
+ }
+
+ ARPSTAT_INC(received);
+ switch (ntohs(ar->ar_pro)) {
+#ifdef INET
+ case ETHERTYPE_IP:
+ in_arpinput(m);
+ return;
+#endif
+ }
+ m_freem(m);
+}
+
+#ifdef INET
+/*
+ * ARP for Internet protocols on 10 Mb/s Ethernet.
+ * Algorithm is that given in RFC 826.
+ * In addition, a sanity check is performed on the sender
+ * protocol address, to catch impersonators.
+ * We no longer handle negotiations for use of trailer protocol:
+ * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
+ * along with IP replies if we wanted trailers sent to us,
+ * and also sent them in response to IP replies.
+ * This allowed either end to announce the desire to receive
+ * trailer packets.
+ * We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
+ * but formerly didn't normally send requests.
+ */
+static int log_arp_wrong_iface = 1;
+static int log_arp_movements = 1;
+static int log_arp_permanent_modify = 1;
+
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
+ &log_arp_wrong_iface, 0,
+ "log arp packets arriving on the wrong interface");
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
+ &log_arp_movements, 0,
+ "log arp replies from MACs different than the one in the cache");
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
+ &log_arp_permanent_modify, 0,
+ "log arp replies from MACs different than the one in the permanent arp entry");
+
+
+static void
+in_arpinput(struct mbuf *m)
+{
+ struct arphdr *ah;
+ struct ifnet *ifp = m->m_pkthdr.rcvif;
+ struct llentry *la = NULL;
+ struct rtentry *rt;
+ struct ifaddr *ifa;
+ struct in_ifaddr *ia;
+ struct mbuf *hold;
+ struct sockaddr sa;
+ struct in_addr isaddr, itaddr, myaddr;
+ u_int8_t *enaddr = NULL;
+ int op, flags;
+ int req_len;
+ int bridged = 0, is_bridge = 0;
+ int carp_match = 0;
+ struct sockaddr_in sin;
+ sin.sin_len = sizeof(struct sockaddr_in);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = 0;
+
+ if (ifp->if_bridge)
+ bridged = 1;
+ if (ifp->if_type == IFT_BRIDGE)
+ is_bridge = 1;
+
+ req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr));
+ if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) {
+ log(LOG_ERR, "in_arp: runt packet -- m_pullup failed\n");
+ return;
+ }
+
+ ah = mtod(m, struct arphdr *);
+ op = ntohs(ah->ar_op);
+ (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
+ (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
+
+ if (op == ARPOP_REPLY)
+ ARPSTAT_INC(rxreplies);
+
+ /*
+ * For a bridge, we want to check the address irrespective
+ * of the receive interface. (This will change slightly
+ * when we have clusters of interfaces).
+ * If the interface does not match, but the receiving interface
+ * is part of carp, we call carp_iamatch to see if this is a
+ * request for the virtual host ip.
+ * XXX: This is really ugly!
+ */
+ IN_IFADDR_RLOCK();
+ LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
+ if (((bridged && ia->ia_ifp->if_bridge != NULL) ||
+ ia->ia_ifp == ifp) &&
+ itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
+ ifa_ref(&ia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+ goto match;
+ }
+ if (ifp->if_carp != NULL &&
+ (*carp_iamatch_p)(ifp, ia, &isaddr, &enaddr) &&
+ itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
+ carp_match = 1;
+ ifa_ref(&ia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+ goto match;
+ }
+ }
+ LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
+ if (((bridged && ia->ia_ifp->if_bridge != NULL) ||
+ ia->ia_ifp == ifp) &&
+ isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
+ ifa_ref(&ia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+ goto match;
+ }
+
+#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \
+ (ia->ia_ifp->if_bridge == ifp->if_softc && \
+ !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \
+ addr == ia->ia_addr.sin_addr.s_addr)
+ /*
+ * Check the case when bridge shares its MAC address with
+ * some of its children, so packets are claimed by bridge
+ * itself (bridge_input() does it first), but they are really
+ * meant to be destined to the bridge member.
+ */
+ if (is_bridge) {
+ LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
+ if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
+ ifa_ref(&ia->ia_ifa);
+ ifp = ia->ia_ifp;
+ IN_IFADDR_RUNLOCK();
+ goto match;
+ }
+ }
+ }
+#undef BDG_MEMBER_MATCHES_ARP
+ IN_IFADDR_RUNLOCK();
+
+ /*
+ * No match, use the first inet address on the receive interface
+ * as a dummy address for the rest of the function.
+ */
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+ if (ifa->ifa_addr->sa_family == AF_INET) {
+ ia = ifatoia(ifa);
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(ifp);
+ goto match;
+ }
+ IF_ADDR_UNLOCK(ifp);
+
+ /*
+ * If bridging, fall back to using any inet address.
+ */
+ IN_IFADDR_RLOCK();
+ if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
+ IN_IFADDR_RUNLOCK();
+ goto drop;
+ }
+ ifa_ref(&ia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+match:
+ if (!enaddr)
+ enaddr = (u_int8_t *)IF_LLADDR(ifp);
+ myaddr = ia->ia_addr.sin_addr;
+ ifa_free(&ia->ia_ifa);
+ if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
+ goto drop; /* it's from me, ignore it. */
+ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
+ log(LOG_ERR,
+ "arp: link address is broadcast for IP address %s!\n",
+ inet_ntoa(isaddr));
+ goto drop;
+ }
+ /*
+ * Warn if another host is using the same IP address, but only if the
+ * IP address isn't 0.0.0.0, which is used for DHCP only, in which
+ * case we suppress the warning to avoid false positive complaints of
+ * potential misconfiguration.
+ */
+ if (!bridged && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) {
+ log(LOG_ERR,
+ "arp: %*D is using my IP address %s on %s!\n",
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ inet_ntoa(isaddr), ifp->if_xname);
+ itaddr = myaddr;
+ ARPSTAT_INC(dupips);
+ goto reply;
+ }
+ if (ifp->if_flags & IFF_STATICARP)
+ goto reply;
+
+ bzero(&sin, sizeof(sin));
+ sin.sin_len = sizeof(struct sockaddr_in);
+ sin.sin_family = AF_INET;
+ sin.sin_addr = isaddr;
+ flags = (itaddr.s_addr == myaddr.s_addr) ? LLE_CREATE : 0;
+ flags |= LLE_EXCLUSIVE;
+ IF_AFDATA_LOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin);
+ IF_AFDATA_UNLOCK(ifp);
+ if (la != NULL) {
+ /* the following is not an error when doing bridging */
+ if (!bridged && la->lle_tbl->llt_ifp != ifp && !carp_match) {
+ if (log_arp_wrong_iface)
+ log(LOG_ERR, "arp: %s is on %s "
+ "but got reply from %*D on %s\n",
+ inet_ntoa(isaddr),
+ la->lle_tbl->llt_ifp->if_xname,
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ ifp->if_xname);
+ LLE_WUNLOCK(la);
+ goto reply;
+ }
+ if ((la->la_flags & LLE_VALID) &&
+ bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) {
+ if (la->la_flags & LLE_STATIC) {
+ LLE_WUNLOCK(la);
+ log(LOG_ERR,
+ "arp: %*D attempts to modify permanent "
+ "entry for %s on %s\n",
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ inet_ntoa(isaddr), ifp->if_xname);
+ goto reply;
+ }
+ if (log_arp_movements) {
+ log(LOG_INFO, "arp: %s moved from %*D "
+ "to %*D on %s\n",
+ inet_ntoa(isaddr),
+ ifp->if_addrlen,
+ (u_char *)&la->ll_addr, ":",
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ ifp->if_xname);
+ }
+ }
+
+ if (ifp->if_addrlen != ah->ar_hln) {
+ LLE_WUNLOCK(la);
+ log(LOG_WARNING,
+ "arp from %*D: addr len: new %d, i/f %d (ignored)",
+ ifp->if_addrlen, (u_char *) ar_sha(ah), ":",
+ ah->ar_hln, ifp->if_addrlen);
+ goto reply;
+ }
+ (void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
+ la->la_flags |= LLE_VALID;
+
+ if (!(la->la_flags & LLE_STATIC)) {
+ int canceled;
+
+ LLE_ADDREF(la);
+ la->la_expire = time_second + V_arpt_keep;
+ canceled = callout_reset(&la->la_timer,
+ hz * V_arpt_keep, arptimer, la);
+ if (canceled)
+ LLE_REMREF(la);
+ }
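+		/*
+		 * Worked example (assuming the stock default): arpt_keep,
+		 * exposed as the net.link.ether.inet.max_age sysctl, defaults
+		 * to 20 minutes, so the expiry and callout above push a
+		 * confirmed entry out roughly 1200 seconds from now.
+		 */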
+ la->la_asked = 0;
+ la->la_preempt = V_arp_maxtries;
+ hold = la->la_hold;
+ if (hold != NULL) {
+ la->la_hold = NULL;
+ memcpy(&sa, L3_ADDR(la), sizeof(sa));
+ }
+ LLE_WUNLOCK(la);
+ if (hold != NULL)
+ (*ifp->if_output)(ifp, hold, &sa, NULL);
+ }
+reply:
+ if (op != ARPOP_REQUEST)
+ goto drop;
+ ARPSTAT_INC(rxrequests);
+
+ if (itaddr.s_addr == myaddr.s_addr) {
+ /* Shortcut.. the receiving interface is the target. */
+ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
+ (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
+ } else {
+ struct llentry *lle = NULL;
+
+ sin.sin_addr = itaddr;
+ IF_AFDATA_LOCK(ifp);
+ lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
+ IF_AFDATA_UNLOCK(ifp);
+
+ if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
+ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
+ (void)memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln);
+ LLE_RUNLOCK(lle);
+ } else {
+
+ if (lle != NULL)
+ LLE_RUNLOCK(lle);
+
+ if (!V_arp_proxyall)
+ goto drop;
+
+ sin.sin_addr = itaddr;
+ /* XXX MRT use table 0 for arp reply */
+ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
+ if (!rt)
+ goto drop;
+
+ /*
+ * Don't send proxies for nodes on the same interface
+ * as this one came out of, or we'll get into a fight
+ * over who claims what Ether address.
+ */
+ if (!rt->rt_ifp || rt->rt_ifp == ifp) {
+ RTFREE_LOCKED(rt);
+ goto drop;
+ }
+ RTFREE_LOCKED(rt);
+
+ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
+ (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
+
+ /*
+ * Also check that the node which sent the ARP packet
+			 * is on the interface we expect it to be on. This
+ * avoids ARP chaos if an interface is connected to the
+ * wrong network.
+ */
+ sin.sin_addr = isaddr;
+
+ /* XXX MRT use table 0 for arp checks */
+ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
+ if (!rt)
+ goto drop;
+ if (rt->rt_ifp != ifp) {
+ log(LOG_INFO, "arp_proxy: ignoring request"
+ " from %s via %s, expecting %s\n",
+ inet_ntoa(isaddr), ifp->if_xname,
+ rt->rt_ifp->if_xname);
+ RTFREE_LOCKED(rt);
+ goto drop;
+ }
+ RTFREE_LOCKED(rt);
+
+#ifdef DEBUG_PROXY
+ printf("arp: proxying for %s\n",
+ inet_ntoa(itaddr));
+#endif
+ }
+ }
+
+ if (itaddr.s_addr == myaddr.s_addr &&
+ IN_LINKLOCAL(ntohl(itaddr.s_addr))) {
+ /* RFC 3927 link-local IPv4; always reply by broadcast. */
+#ifdef DEBUG_LINKLOCAL
+ printf("arp: sending reply for link-local addr %s\n",
+ inet_ntoa(itaddr));
+#endif
+ m->m_flags |= M_BCAST;
+ m->m_flags &= ~M_MCAST;
+ } else {
+ /* default behaviour; never reply by broadcast. */
+ m->m_flags &= ~(M_BCAST|M_MCAST);
+ }
+ (void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
+ (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
+ ah->ar_op = htons(ARPOP_REPLY);
+ ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
+ m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
+ m->m_pkthdr.len = m->m_len;
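+	/*
+	 * For reference, the reply now held in the mbuf carries: ar_sha,
+	 * our (or the proxied) link-layer address; ar_tha, the requester's
+	 * link-layer address; ar_spa, the address being answered for
+	 * (itaddr); and ar_tpa, the requester's protocol address.
+	 */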
+ sa.sa_family = AF_ARP;
+ sa.sa_len = 2;
+ (*ifp->if_output)(ifp, m, &sa, NULL);
+ ARPSTAT_INC(txreplies);
+ return;
+
+drop:
+ m_freem(m);
+}
+#endif
+
+void
+arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
+{
+ struct llentry *lle;
+
+ if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) {
+ arprequest(ifp, &IA_SIN(ifa)->sin_addr,
+ &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp));
+		/*
+		 * The interface address is installed as a static entry
+		 * so that the output of the arp(8) utility shows the
+		 * corresponding L2 entry as permanent.
+		 */
+ IF_AFDATA_LOCK(ifp);
+ lle = lla_lookup(LLTABLE(ifp), (LLE_CREATE | LLE_IFADDR | LLE_STATIC),
+ (struct sockaddr *)IA_SIN(ifa));
+ IF_AFDATA_UNLOCK(ifp);
+ if (lle == NULL)
+ log(LOG_INFO, "arp_ifinit: cannot create arp "
+ "entry for interface address\n");
+ else
+ LLE_RUNLOCK(lle);
+ }
+ ifa->ifa_rtrequest = NULL;
+}
+
+void
+arp_ifinit2(struct ifnet *ifp, struct ifaddr *ifa, u_char *enaddr)
+{
+ if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY)
+ arprequest(ifp, &IA_SIN(ifa)->sin_addr,
+ &IA_SIN(ifa)->sin_addr, enaddr);
+ ifa->ifa_rtrequest = NULL;
+}
+
+static void
+arp_init(void)
+{
+
+ netisr_register(&arp_nh);
+}
+SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0);
diff --git a/freebsd/sys/netinet/if_ether.h b/freebsd/sys/netinet/if_ether.h
new file mode 100644
index 00000000..e3c8d009
--- /dev/null
+++ b/freebsd/sys/netinet/if_ether.h
@@ -0,0 +1,2 @@
+#include <freebsd/bsd.h>
+#include <freebsd/netinet/if_ether.h>
diff --git a/freebsd/sys/netinet/igmp.c b/freebsd/sys/netinet/igmp.c
new file mode 100644
index 00000000..5f8893d7
--- /dev/null
+++ b/freebsd/sys/netinet/igmp.c
@@ -0,0 +1,3655 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2007-2009 Bruce Simpson.
+ * Copyright (c) 1988 Stephen Deering.
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Stephen Deering of Stanford University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)igmp.c 8.1 (Berkeley) 7/19/93
+ */
+
+/*
+ * Internet Group Management Protocol (IGMP) routines.
+ * [RFC1112, RFC2236, RFC3376]
+ *
+ * Written by Steve Deering, Stanford, May 1988.
+ * Modified by Rosen Sharma, Stanford, Aug 1994.
+ * Modified by Bill Fenner, Xerox PARC, Feb 1995.
+ * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995.
+ * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson.
+ *
+ * MULTICAST Revision: 3.5.1.4
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/ktr.h>
+#include <freebsd/sys/condvar.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/netisr.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_options.h>
+#include <freebsd/netinet/igmp.h>
+#include <freebsd/netinet/igmp_var.h>
+
+#include <freebsd/machine/in_cksum.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+#ifndef KTR_IGMPV3
+#define KTR_IGMPV3 KTR_INET
+#endif
+
+static struct igmp_ifinfo *
+ igi_alloc_locked(struct ifnet *);
+static void igi_delete_locked(const struct ifnet *);
+static void igmp_dispatch_queue(struct ifqueue *, int, const int);
+static void igmp_fasttimo_vnet(void);
+static void igmp_final_leave(struct in_multi *, struct igmp_ifinfo *);
+static int igmp_handle_state_change(struct in_multi *,
+ struct igmp_ifinfo *);
+static int igmp_initial_join(struct in_multi *, struct igmp_ifinfo *);
+static int igmp_input_v1_query(struct ifnet *, const struct ip *,
+ const struct igmp *);
+static int igmp_input_v2_query(struct ifnet *, const struct ip *,
+ const struct igmp *);
+static int igmp_input_v3_query(struct ifnet *, const struct ip *,
+ /*const*/ struct igmpv3 *);
+static int igmp_input_v3_group_query(struct in_multi *,
+ struct igmp_ifinfo *, int, /*const*/ struct igmpv3 *);
+static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *,
+ /*const*/ struct igmp *);
+static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *,
+ /*const*/ struct igmp *);
+static void igmp_intr(struct mbuf *);
+static int igmp_isgroupreported(const struct in_addr);
+static struct mbuf *
+ igmp_ra_alloc(void);
+#ifdef KTR
+static char * igmp_rec_type_to_str(const int);
+#endif
+static void igmp_set_version(struct igmp_ifinfo *, const int);
+static void igmp_slowtimo_vnet(void);
+static int igmp_v1v2_queue_report(struct in_multi *, const int);
+static void igmp_v1v2_process_group_timer(struct in_multi *, const int);
+static void igmp_v1v2_process_querier_timers(struct igmp_ifinfo *);
+static void igmp_v2_update_group(struct in_multi *, const int);
+static void igmp_v3_cancel_link_timers(struct igmp_ifinfo *);
+static void igmp_v3_dispatch_general_query(struct igmp_ifinfo *);
+static struct mbuf *
+ igmp_v3_encap_report(struct ifnet *, struct mbuf *);
+static int igmp_v3_enqueue_group_record(struct ifqueue *,
+ struct in_multi *, const int, const int, const int);
+static int igmp_v3_enqueue_filter_change(struct ifqueue *,
+ struct in_multi *);
+static void igmp_v3_process_group_timers(struct igmp_ifinfo *,
+ struct ifqueue *, struct ifqueue *, struct in_multi *,
+ const int);
+static int igmp_v3_merge_state_changes(struct in_multi *,
+ struct ifqueue *);
+static void igmp_v3_suppress_group_record(struct in_multi *);
+static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS);
+static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS);
+static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS);
+
+static const struct netisr_handler igmp_nh = {
+ .nh_name = "igmp",
+ .nh_handler = igmp_intr,
+ .nh_proto = NETISR_IGMP,
+ .nh_policy = NETISR_POLICY_SOURCE,
+};
+
+/*
+ * System-wide globals.
+ *
+ * Unlocked access to these is OK, except for the global IGMP output
+ * queue. The IGMP subsystem lock ends up being system-wide for the moment,
+ * because all VIMAGEs have to share a global output queue, as netisrs
+ * themselves are not virtualized.
+ *
+ * Locking:
+ * * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
+ * Any may be taken independently; if any are held at the same
+ * time, the above lock order must be followed.
+ * * All output is delegated to the netisr.
+ * Now that Giant has been eliminated, the netisr may be inlined.
+ * * IN_MULTI_LOCK covers in_multi.
+ * * IGMP_LOCK covers igmp_ifinfo and any global variables in this file,
+ * including the output queue.
+ * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
+ * per-link state iterators.
+ * * igmp_ifinfo is valid as long as PF_INET is attached to the interface,
+ * therefore it is not refcounted.
+ * We allow unlocked reads of igmp_ifinfo when accessed via in_multi.
+ *
+ * Reference counting
+ * * IGMP acquires its own reference every time an in_multi is passed to
+ * it and the group is being joined for the first time.
+ * * IGMP releases its reference(s) on in_multi in a deferred way,
+ * because the operations which process the release run as part of
+ * a loop whose control variables are directly affected by the release
+ * (that, and not recursing on the IF_ADDR_LOCK).
+ *
+ * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds
+ * to a vnet in ifp->if_vnet.
+ *
+ * SMPng: XXX We may potentially race operations on ifma_protospec.
+ * The problem is that we currently lack a clean way of taking the
+ * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing,
+ * as anything which modifies ifma needs to be covered by that lock.
+ * So check for ifma_protospec being NULL before proceeding.
+ */
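+/*
+ * Illustrative sketch only (not part of the original code): a routine
+ * that needed all three locks would take them in the documented order.
+ * The lock macros are the ones used throughout this file; the function
+ * itself is hypothetical and compiled out.
+ */
+#if 0
+static void
+igmp_lock_order_example(struct ifnet *ifp)
+{
+
+	IN_MULTI_LOCK();	/* 1st: in_multi memberships */
+	IGMP_LOCK();		/* 2nd: igmp_ifinfo and the output queue */
+	IF_ADDR_LOCK(ifp);	/* 3rd: ifp->if_multiaddrs */
+	/* ... work that requires all three locks ... */
+	IF_ADDR_UNLOCK(ifp);
+	IGMP_UNLOCK();
+	IN_MULTI_UNLOCK();
+}
+#endif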
+struct mtx igmp_mtx;
+
+struct mbuf *m_raopt; /* Router Alert option */
+MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");
+
+/*
+ * VIMAGE-wide globals.
+ *
+ * The IGMPv3 timers themselves need to run per-image, however,
+ * protosw timers run globally (see tcp).
+ * An ifnet can only be in one vimage at a time, and the loopback
+ * ifnet, loif, is itself virtualized.
+ * It would otherwise be possible to seriously hose IGMP state,
+ * and create inconsistencies in upstream multicast routing, if you have
+ * multiple VIMAGEs running on the same link joining different multicast
+ * groups, UNLESS the "primary IP address" is different. This is because
+ * IGMP for IPv4 does not force link-local addresses to be used for each
+ * node, unlike MLD for IPv6.
+ * Obviously the IGMPv3 per-interface state has per-vimage granularity
+ * also as a result.
+ *
+ * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection
+ * policy to control the address used by IGMP on the link.
+ */
+static VNET_DEFINE(int, interface_timers_running); /* IGMPv3 general
+ * query response */
+static VNET_DEFINE(int, state_change_timers_running); /* IGMPv3 state-change
+ * retransmit */
+static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host
+ * report; IGMPv3 g/sg
+ * query response */
+
+#define V_interface_timers_running VNET(interface_timers_running)
+#define V_state_change_timers_running VNET(state_change_timers_running)
+#define V_current_state_timers_running VNET(current_state_timers_running)
+
+static VNET_DEFINE(LIST_HEAD(, igmp_ifinfo), igi_head);
+static VNET_DEFINE(struct igmpstat, igmpstat) = {
+ .igps_version = IGPS_VERSION_3,
+ .igps_len = sizeof(struct igmpstat),
+};
+static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0};
+
+#define V_igi_head VNET(igi_head)
+#define V_igmpstat VNET(igmpstat)
+#define V_igmp_gsrdelay VNET(igmp_gsrdelay)
+
+static VNET_DEFINE(int, igmp_recvifkludge) = 1;
+static VNET_DEFINE(int, igmp_sendra) = 1;
+static VNET_DEFINE(int, igmp_sendlocal) = 1;
+static VNET_DEFINE(int, igmp_v1enable) = 1;
+static VNET_DEFINE(int, igmp_v2enable) = 1;
+static VNET_DEFINE(int, igmp_legacysupp);
+static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3;
+
+#define V_igmp_recvifkludge VNET(igmp_recvifkludge)
+#define V_igmp_sendra VNET(igmp_sendra)
+#define V_igmp_sendlocal VNET(igmp_sendlocal)
+#define V_igmp_v1enable VNET(igmp_v1enable)
+#define V_igmp_v2enable VNET(igmp_v2enable)
+#define V_igmp_legacysupp VNET(igmp_legacysupp)
+#define V_igmp_default_version VNET(igmp_default_version)
+
+/*
+ * Virtualized sysctls.
+ */
+SYSCTL_VNET_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW,
+ &VNET_NAME(igmpstat), igmpstat, "");
+SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_RW,
+ &VNET_NAME(igmp_recvifkludge), 0,
+ "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address");
+SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_RW,
+ &VNET_NAME(igmp_sendra), 0,
+ "Send IP Router Alert option in IGMPv2/v3 messages");
+SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_RW,
+ &VNET_NAME(igmp_sendlocal), 0,
+ "Send IGMP membership reports for 224.0.0.0/24 groups");
+SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_RW,
+ &VNET_NAME(igmp_v1enable), 0,
+ "Enable backwards compatibility with IGMPv1");
+SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_RW,
+ &VNET_NAME(igmp_v2enable), 0,
+ "Enable backwards compatibility with IGMPv2");
+SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_RW,
+ &VNET_NAME(igmp_legacysupp), 0,
+ "Allow v1/v2 reports to suppress v3 group responses");
+SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, default_version,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I",
+ "Default version of IGMP to run on each interface");
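+/*
+ * Example (illustrative only): a host behind switches that only do
+ * IGMPv2 snooping could be pinned to the older protocol with
+ * "sysctl net.inet.igmp.default_version=2"; the handler below rejects
+ * values outside the range 1..3.
+ */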
+SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I",
+ "Rate limit for IGMPv3 Group-and-Source queries in seconds");
+
+/*
+ * Non-virtualized sysctls.
+ */
+SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_igmp_ifinfo, "Per-interface IGMPv3 state");
+
+static __inline void
+igmp_save_context(struct mbuf *m, struct ifnet *ifp)
+{
+
+#ifdef VIMAGE
+ m->m_pkthdr.header = ifp->if_vnet;
+#endif /* VIMAGE */
+ m->m_pkthdr.flowid = ifp->if_index;
+}
+
+static __inline void
+igmp_scrub_context(struct mbuf *m)
+{
+
+ m->m_pkthdr.header = NULL;
+ m->m_pkthdr.flowid = 0;
+}
+
+#ifdef KTR
+static __inline char *
+inet_ntoa_haddr(in_addr_t haddr)
+{
+ struct in_addr ia;
+
+ ia.s_addr = htonl(haddr);
+ return (inet_ntoa(ia));
+}
+#endif
+
+/*
+ * Restore context from a queued IGMP output chain.
+ * Return saved ifindex.
+ *
+ * VIMAGE: The assertion is there to make sure that we
+ * actually called CURVNET_SET() with what's in the mbuf chain.
+ */
+static __inline uint32_t
+igmp_restore_context(struct mbuf *m)
+{
+
+#ifdef notyet
+#if defined(VIMAGE) && defined(INVARIANTS)
+ KASSERT(curvnet == (m->m_pkthdr.header),
+ ("%s: called when curvnet was not restored", __func__));
+#endif
+#endif
+ return (m->m_pkthdr.flowid);
+}
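+/*
+ * Taken together, igmp_save_context() and igmp_restore_context() let a
+ * queued report remember its transmitting interface: the if_index is
+ * stashed in the mbuf packet header on enqueue and read back later so
+ * the handler can look the ifnet up again by index; igmp_scrub_context()
+ * clears that stashed state again.
+ */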
+
+/*
+ * Retrieve or set default IGMP version.
+ *
+ * VIMAGE: Assume curvnet set by caller.
+ * SMPng: NOTE: Serialized by IGMP lock.
+ */
+static int
+sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int new;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error)
+ return (error);
+
+ IGMP_LOCK();
+
+ new = V_igmp_default_version;
+
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error || !req->newptr)
+ goto out_locked;
+
+ if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) {
+ error = EINVAL;
+ goto out_locked;
+ }
+
+ CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d",
+ V_igmp_default_version, new);
+
+ V_igmp_default_version = new;
+
+out_locked:
+ IGMP_UNLOCK();
+ return (error);
+}
+
+/*
+ * Retrieve or set threshold between group-source queries in seconds.
+ *
+ * VIMAGE: Assume curvnet set by caller.
+ * SMPng: NOTE: Serialized by IGMP lock.
+ */
+static int
+sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int i;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error)
+ return (error);
+
+ IGMP_LOCK();
+
+ i = V_igmp_gsrdelay.tv_sec;
+
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ if (error || !req->newptr)
+ goto out_locked;
+
+ if (i < -1 || i >= 60) {
+ error = EINVAL;
+ goto out_locked;
+ }
+
+ CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d",
+ V_igmp_gsrdelay.tv_sec, i);
+ V_igmp_gsrdelay.tv_sec = i;
+
+out_locked:
+ IGMP_UNLOCK();
+ return (error);
+}
+
+/*
+ * Expose struct igmp_ifinfo to userland, keyed by ifindex.
+ * For use by ifmcstat(8).
+ *
+ * SMPng: NOTE: Does an unlocked ifindex space read.
+ * VIMAGE: Assume curvnet set by caller. The node handler itself
+ * is not directly virtualized.
+ */
+static int
+sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
+{
+ int *name;
+ int error;
+ u_int namelen;
+ struct ifnet *ifp;
+ struct igmp_ifinfo *igi;
+
+ name = (int *)arg1;
+ namelen = arg2;
+
+ if (req->newptr != NULL)
+ return (EPERM);
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo));
+ if (error)
+ return (error);
+
+ IN_MULTI_LOCK();
+ IGMP_LOCK();
+
+ if (name[0] <= 0 || name[0] > V_if_index) {
+ error = ENOENT;
+ goto out_locked;
+ }
+
+ error = ENOENT;
+
+ ifp = ifnet_byindex(name[0]);
+ if (ifp == NULL)
+ goto out_locked;
+
+ LIST_FOREACH(igi, &V_igi_head, igi_link) {
+ if (ifp == igi->igi_ifp) {
+ error = SYSCTL_OUT(req, igi,
+ sizeof(struct igmp_ifinfo));
+ break;
+ }
+ }
+
+out_locked:
+ IGMP_UNLOCK();
+ IN_MULTI_UNLOCK();
+ return (error);
+}
+
+/*
+ * Dispatch an entire queue of pending packet chains
+ * using the netisr.
+ * VIMAGE: Assumes the vnet pointer has been set.
+ */
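+/*
+ * Note on the limit argument: because of the "if (--limit == 0)" test
+ * below, a limit of 0 never triggers the early break (the counter just
+ * goes negative), so callers such as the fast-timeout path pass 0 to
+ * mean "drain the whole queue".
+ */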
+static void
+igmp_dispatch_queue(struct ifqueue *ifq, int limit, const int loop)
+{
+ struct mbuf *m;
+
+ for (;;) {
+ _IF_DEQUEUE(ifq, m);
+ if (m == NULL)
+ break;
+ CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, ifq, m);
+ if (loop)
+ m->m_flags |= M_IGMP_LOOP;
+ netisr_dispatch(NETISR_IGMP, m);
+ if (--limit == 0)
+ break;
+ }
+}
+
+/*
+ * Filter outgoing IGMP report state by group.
+ *
+ * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1).
+ * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are
+ * disabled for all groups in the 224.0.0.0/24 link-local scope. However,
+ * this may break certain IGMP snooping switches which rely on the old
+ * report behaviour.
+ *
+ * Return zero if the given group is one for which IGMP reports
+ * should be suppressed, or non-zero if reports should be issued.
+ */
+static __inline int
+igmp_isgroupreported(const struct in_addr addr)
+{
+
+ if (in_allhosts(addr) ||
+ ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr)))))
+ return (0);
+
+ return (1);
+}
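+/*
+ * For example (illustrative): with the default net.inet.igmp.sendlocal=1,
+ * only 224.0.0.1 is suppressed above; setting sendlocal to 0 also
+ * suppresses reports for the rest of 224.0.0.0/24, e.g. 224.0.0.251
+ * (mDNS).
+ */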
+
+/*
+ * Construct a Router Alert option to use in outgoing packets.
+ */
+static struct mbuf *
+igmp_ra_alloc(void)
+{
+ struct mbuf *m;
+ struct ipoption *p;
+
+ MGET(m, M_DONTWAIT, MT_DATA);
+ p = mtod(m, struct ipoption *);
+ p->ipopt_dst.s_addr = INADDR_ANY;
+ p->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */
+ p->ipopt_list[1] = 0x04; /* 4 bytes long */
+ p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */
+ p->ipopt_list[3] = 0x00; /* pad byte */
+ m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1];
+
+ return (m);
+}
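+/*
+ * For reference, the option bytes built above are 0x94 0x04 0x00 0x00:
+ * IPOPT_RA (148), a length of 4, and a two-octet value of zero, which is
+ * the Router Alert encoding required by RFC 2113 (the value-zero bytes
+ * here come from IPOPT_EOL and the explicit pad byte).
+ */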
+
+/*
+ * Attach IGMP when PF_INET is attached to an interface.
+ */
+struct igmp_ifinfo *
+igmp_domifattach(struct ifnet *ifp)
+{
+ struct igmp_ifinfo *igi;
+
+ CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
+ __func__, ifp, ifp->if_xname);
+
+ IGMP_LOCK();
+
+ igi = igi_alloc_locked(ifp);
+ if (!(ifp->if_flags & IFF_MULTICAST))
+ igi->igi_flags |= IGIF_SILENT;
+
+ IGMP_UNLOCK();
+
+ return (igi);
+}
+
+/*
+ * VIMAGE: assume curvnet set by caller.
+ */
+static struct igmp_ifinfo *
+igi_alloc_locked(/*const*/ struct ifnet *ifp)
+{
+ struct igmp_ifinfo *igi;
+
+ IGMP_LOCK_ASSERT();
+
+ igi = malloc(sizeof(struct igmp_ifinfo), M_IGMP, M_NOWAIT|M_ZERO);
+ if (igi == NULL)
+ goto out;
+
+ igi->igi_ifp = ifp;
+ igi->igi_version = V_igmp_default_version;
+ igi->igi_flags = 0;
+ igi->igi_rv = IGMP_RV_INIT;
+ igi->igi_qi = IGMP_QI_INIT;
+ igi->igi_qri = IGMP_QRI_INIT;
+ igi->igi_uri = IGMP_URI_INIT;
+
+ SLIST_INIT(&igi->igi_relinmhead);
+
+ /*
+ * Responses to general queries are subject to bounds.
+ */
+ IFQ_SET_MAXLEN(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS);
+
+ LIST_INSERT_HEAD(&V_igi_head, igi, igi_link);
+
+ CTR2(KTR_IGMPV3, "allocate igmp_ifinfo for ifp %p(%s)",
+ ifp, ifp->if_xname);
+
+out:
+ return (igi);
+}
+
+/*
+ * Hook for ifdetach.
+ *
+ * NOTE: Some finalization tasks need to run before the protocol domain
+ * is detached, but also before the link layer does its cleanup.
+ *
+ * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK().
+ * XXX This is also bitten by unlocked ifma_protospec access.
+ */
+void
+igmp_ifdetach(struct ifnet *ifp)
+{
+ struct igmp_ifinfo *igi;
+ struct ifmultiaddr *ifma;
+ struct in_multi *inm, *tinm;
+
+ CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp,
+ ifp->if_xname);
+
+ IGMP_LOCK();
+
+ igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
+ if (igi->igi_version == IGMP_VERSION_3) {
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ if (ifma->ifma_addr->sa_family != AF_INET ||
+ ifma->ifma_protospec == NULL)
+ continue;
+#if 0
+ KASSERT(ifma->ifma_protospec != NULL,
+ ("%s: ifma_protospec is NULL", __func__));
+#endif
+ inm = (struct in_multi *)ifma->ifma_protospec;
+ if (inm->inm_state == IGMP_LEAVING_MEMBER) {
+ SLIST_INSERT_HEAD(&igi->igi_relinmhead,
+ inm, inm_nrele);
+ }
+ inm_clear_recorded(inm);
+ }
+ IF_ADDR_UNLOCK(ifp);
+ /*
+ * Free the in_multi reference(s) for this IGMP lifecycle.
+ */
+ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele,
+ tinm) {
+ SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
+ inm_release_locked(inm);
+ }
+ }
+
+ IGMP_UNLOCK();
+}
+
+/*
+ * Hook for domifdetach.
+ */
+void
+igmp_domifdetach(struct ifnet *ifp)
+{
+ struct igmp_ifinfo *igi;
+
+ CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
+ __func__, ifp, ifp->if_xname);
+
+ IGMP_LOCK();
+
+ igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
+ igi_delete_locked(ifp);
+
+ IGMP_UNLOCK();
+}
+
+static void
+igi_delete_locked(const struct ifnet *ifp)
+{
+ struct igmp_ifinfo *igi, *tigi;
+
+ CTR3(KTR_IGMPV3, "%s: freeing igmp_ifinfo for ifp %p(%s)",
+ __func__, ifp, ifp->if_xname);
+
+ IGMP_LOCK_ASSERT();
+
+ LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) {
+ if (igi->igi_ifp == ifp) {
+ /*
+ * Free deferred General Query responses.
+ */
+ _IF_DRAIN(&igi->igi_gq);
+
+ LIST_REMOVE(igi, igi_link);
+
+ KASSERT(SLIST_EMPTY(&igi->igi_relinmhead),
+ ("%s: there are dangling in_multi references",
+ __func__));
+
+ free(igi, M_IGMP);
+ return;
+ }
+ }
+
+#ifdef INVARIANTS
+ panic("%s: igmp_ifinfo not found for ifp %p\n", __func__, ifp);
+#endif
+}
+
+/*
+ * Process a received IGMPv1 query.
+ * Return non-zero if the message should be dropped.
+ *
+ * VIMAGE: The curvnet pointer is derived from the input ifp.
+ */
+static int
+igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
+ const struct igmp *igmp)
+{
+ struct ifmultiaddr *ifma;
+ struct igmp_ifinfo *igi;
+ struct in_multi *inm;
+
+ /*
+	 * IGMPv1 Host Membership Queries SHOULD always be addressed to
+ * 224.0.0.1. They are always treated as General Queries.
+ * igmp_group is always ignored. Do not drop it as a userland
+ * daemon may wish to see it.
+ * XXX SMPng: unlocked increments in igmpstat assumed atomic.
+ */
+ if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) {
+ IGMPSTAT_INC(igps_rcv_badqueries);
+ return (0);
+ }
+ IGMPSTAT_INC(igps_rcv_gen_queries);
+
+ IN_MULTI_LOCK();
+ IGMP_LOCK();
+
+ igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
+ KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
+
+ if (igi->igi_flags & IGIF_LOOPBACK) {
+ CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)",
+ ifp, ifp->if_xname);
+ goto out_locked;
+ }
+
+ /*
+ * Switch to IGMPv1 host compatibility mode.
+ */
+ igmp_set_version(igi, IGMP_VERSION_1);
+
+ CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname);
+
+ /*
+ * Start the timers in all of our group records
+ * for the interface on which the query arrived,
+ * except those which are already running.
+ */
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ if (ifma->ifma_addr->sa_family != AF_INET ||
+ ifma->ifma_protospec == NULL)
+ continue;
+ inm = (struct in_multi *)ifma->ifma_protospec;
+ if (inm->inm_timer != 0)
+ continue;
+ switch (inm->inm_state) {
+ case IGMP_NOT_MEMBER:
+ case IGMP_SILENT_MEMBER:
+ break;
+ case IGMP_G_QUERY_PENDING_MEMBER:
+ case IGMP_SG_QUERY_PENDING_MEMBER:
+ case IGMP_REPORTING_MEMBER:
+ case IGMP_IDLE_MEMBER:
+ case IGMP_LAZY_MEMBER:
+ case IGMP_SLEEPING_MEMBER:
+ case IGMP_AWAKENING_MEMBER:
+ inm->inm_state = IGMP_REPORTING_MEMBER;
+ inm->inm_timer = IGMP_RANDOM_DELAY(
+ IGMP_V1V2_MAX_RI * PR_FASTHZ);
+ V_current_state_timers_running = 1;
+ break;
+ case IGMP_LEAVING_MEMBER:
+ break;
+ }
+ }
+ IF_ADDR_UNLOCK(ifp);
+
+out_locked:
+ IGMP_UNLOCK();
+ IN_MULTI_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Process a received IGMPv2 general or group-specific query.
+ */
+static int
+igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
+ const struct igmp *igmp)
+{
+ struct ifmultiaddr *ifma;
+ struct igmp_ifinfo *igi;
+ struct in_multi *inm;
+ int is_general_query;
+ uint16_t timer;
+
+ is_general_query = 0;
+
+ /*
+ * Validate address fields upfront.
+ * XXX SMPng: unlocked increments in igmpstat assumed atomic.
+ */
+ if (in_nullhost(igmp->igmp_group)) {
+ /*
+ * IGMPv2 General Query.
+ * If this was not sent to the all-hosts group, ignore it.
+ */
+ if (!in_allhosts(ip->ip_dst))
+ return (0);
+ IGMPSTAT_INC(igps_rcv_gen_queries);
+ is_general_query = 1;
+ } else {
+ /* IGMPv2 Group-Specific Query. */
+ IGMPSTAT_INC(igps_rcv_group_queries);
+ }
+
+ IN_MULTI_LOCK();
+ IGMP_LOCK();
+
+ igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
+ KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
+
+ if (igi->igi_flags & IGIF_LOOPBACK) {
+ CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)",
+ ifp, ifp->if_xname);
+ goto out_locked;
+ }
+
+ /*
+ * Ignore v2 query if in v1 Compatibility Mode.
+ */
+ if (igi->igi_version == IGMP_VERSION_1)
+ goto out_locked;
+
+ igmp_set_version(igi, IGMP_VERSION_2);
+
+ timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
+ if (timer == 0)
+ timer = 1;
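+	/*
+	 * Worked example (assuming the stock PR_FASTHZ of 5 and
+	 * IGMP_TIMER_SCALE of 10): a query advertising a Max Resp Time of
+	 * 10 seconds carries igmp_code == 100, which maps to
+	 * 100 * 5 / 10 == 50 fast-timeout ticks here; the per-group jitter
+	 * is then picked from that range below.
+	 */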
+
+ if (is_general_query) {
+ /*
+ * For each reporting group joined on this
+ * interface, kick the report timer.
+ */
+ CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)",
+ ifp, ifp->if_xname);
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ if (ifma->ifma_addr->sa_family != AF_INET ||
+ ifma->ifma_protospec == NULL)
+ continue;
+ inm = (struct in_multi *)ifma->ifma_protospec;
+ igmp_v2_update_group(inm, timer);
+ }
+ IF_ADDR_UNLOCK(ifp);
+ } else {
+ /*
+ * Group-specific IGMPv2 query, we need only
+ * look up the single group to process it.
+ */
+ inm = inm_lookup(ifp, igmp->igmp_group);
+ if (inm != NULL) {
+ CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)",
+ inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
+ igmp_v2_update_group(inm, timer);
+ }
+ }
+
+out_locked:
+ IGMP_UNLOCK();
+ IN_MULTI_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Update the report timer on a group in response to an IGMPv2 query.
+ *
+ * If we are becoming the reporting member for this group, start the timer.
+ * If we already are the reporting member for this group, and timer is
+ * below the threshold, reset it.
+ *
+ * We may be updating the group for the first time since we switched
+ * to IGMPv3. If we are, then we must clear any recorded source lists,
+ * and transition to REPORTING state; the group timer is overloaded
+ * for group and group-source query responses.
+ *
+ * Unlike IGMPv3, the delay per group should be jittered
+ * to avoid bursts of IGMPv2 reports.
+ */
+static void
+igmp_v2_update_group(struct in_multi *inm, const int timer)
+{
+
+ CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__,
+ inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer);
+
+ IN_MULTI_LOCK_ASSERT();
+
+ switch (inm->inm_state) {
+ case IGMP_NOT_MEMBER:
+ case IGMP_SILENT_MEMBER:
+ break;
+ case IGMP_REPORTING_MEMBER:
+ if (inm->inm_timer != 0 &&
+ inm->inm_timer <= timer) {
+ CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, "
+ "skipping.", __func__);
+ break;
+ }
+ /* FALLTHROUGH */
+ case IGMP_SG_QUERY_PENDING_MEMBER:
+ case IGMP_G_QUERY_PENDING_MEMBER:
+ case IGMP_IDLE_MEMBER:
+ case IGMP_LAZY_MEMBER:
+ case IGMP_AWAKENING_MEMBER:
+ CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__);
+ inm->inm_state = IGMP_REPORTING_MEMBER;
+ inm->inm_timer = IGMP_RANDOM_DELAY(timer);
+ V_current_state_timers_running = 1;
+ break;
+ case IGMP_SLEEPING_MEMBER:
+ CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__);
+ inm->inm_state = IGMP_AWAKENING_MEMBER;
+ break;
+ case IGMP_LEAVING_MEMBER:
+ break;
+ }
+}
+
+/*
+ * Process a received IGMPv3 general, group-specific or
+ * group-and-source-specific query.
+ * Assumes m has already been pulled up to the full IGMP message length.
+ * Return 0 if successful, otherwise an appropriate error code is returned.
+ */
+static int
+igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
+ /*const*/ struct igmpv3 *igmpv3)
+{
+ struct igmp_ifinfo *igi;
+ struct in_multi *inm;
+ int is_general_query;
+ uint32_t maxresp, nsrc, qqi;
+ uint16_t timer;
+ uint8_t qrv;
+
+ is_general_query = 0;
+
+ CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname);
+
+ maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */
+ if (maxresp >= 128) {
+ maxresp = IGMP_MANT(igmpv3->igmp_code) <<
+ (IGMP_EXP(igmpv3->igmp_code) + 3);
+ }
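+	/*
+	 * Background (RFC 3376, section 4.1.1): codes of 128 and above are
+	 * floating point, decoded as (mant | 0x10) << (exp + 3).  For
+	 * example, a Max Resp Code of 0x8A (exp 0, mant 0xA) stands for
+	 * 208 tenths of a second, i.e. 20.8 seconds.  The QQIC field a few
+	 * lines below uses the same encoding.
+	 */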
+
+ /*
+ * Robustness must never be less than 2 for on-wire IGMPv3.
+ * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make
+ * an exception for interfaces whose IGMPv3 state changes
+ * are redirected to loopback (e.g. MANET).
+ */
+ qrv = IGMP_QRV(igmpv3->igmp_misc);
+ if (qrv < 2) {
+ CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__,
+ qrv, IGMP_RV_INIT);
+ qrv = IGMP_RV_INIT;
+ }
+
+ qqi = igmpv3->igmp_qqi;
+ if (qqi >= 128) {
+ qqi = IGMP_MANT(igmpv3->igmp_qqi) <<
+ (IGMP_EXP(igmpv3->igmp_qqi) + 3);
+ }
+
+ timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE;
+ if (timer == 0)
+ timer = 1;
+
+ nsrc = ntohs(igmpv3->igmp_numsrc);
+
+ /*
+ * Validate address fields and versions upfront before
+ * accepting v3 query.
+ * XXX SMPng: Unlocked access to igmpstat counters here.
+ */
+ if (in_nullhost(igmpv3->igmp_group)) {
+ /*
+ * IGMPv3 General Query.
+ *
+ * General Queries SHOULD be directed to 224.0.0.1.
+ * A general query with a source list has undefined
+ * behaviour; discard it.
+ */
+ IGMPSTAT_INC(igps_rcv_gen_queries);
+ if (!in_allhosts(ip->ip_dst) || nsrc > 0) {
+ IGMPSTAT_INC(igps_rcv_badqueries);
+ return (0);
+ }
+ is_general_query = 1;
+ } else {
+ /* Group or group-source specific query. */
+ if (nsrc == 0)
+ IGMPSTAT_INC(igps_rcv_group_queries);
+ else
+ IGMPSTAT_INC(igps_rcv_gsr_queries);
+ }
+
+ IN_MULTI_LOCK();
+ IGMP_LOCK();
+
+ igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
+ KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
+
+ if (igi->igi_flags & IGIF_LOOPBACK) {
+ CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)",
+ ifp, ifp->if_xname);
+ goto out_locked;
+ }
+
+ /*
+ * Discard the v3 query if we're in Compatibility Mode.
+	 * The RFC is not clearly worded on whether hosts need to stay in
+ * compatibility mode until the Old Version Querier Present
+ * timer expires.
+ */
+ if (igi->igi_version != IGMP_VERSION_3) {
+ CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)",
+ igi->igi_version, ifp, ifp->if_xname);
+ goto out_locked;
+ }
+
+ igmp_set_version(igi, IGMP_VERSION_3);
+ igi->igi_rv = qrv;
+ igi->igi_qi = qqi;
+ igi->igi_qri = maxresp;
+
+ CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi,
+ maxresp);
+
+ if (is_general_query) {
+ /*
+ * Schedule a current-state report on this ifp for
+ * all groups, possibly containing source lists.
+ * If there is a pending General Query response
+ * scheduled earlier than the selected delay, do
+ * not schedule any other reports.
+ * Otherwise, reset the interface timer.
+ */
+ CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)",
+ ifp, ifp->if_xname);
+ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) {
+ igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer);
+ V_interface_timers_running = 1;
+ }
+ } else {
+ /*
+ * Group-source-specific queries are throttled on
+ * a per-group basis to defeat denial-of-service attempts.
+ * Queries for groups we are not a member of on this
+ * link are simply ignored.
+ */
+ inm = inm_lookup(ifp, igmpv3->igmp_group);
+ if (inm == NULL)
+ goto out_locked;
+ if (nsrc > 0) {
+ if (!ratecheck(&inm->inm_lastgsrtv,
+ &V_igmp_gsrdelay)) {
+ CTR1(KTR_IGMPV3, "%s: GS query throttled.",
+ __func__);
+ IGMPSTAT_INC(igps_drop_gsr_queries);
+ goto out_locked;
+ }
+ }
+ CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)",
+ inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname);
+ /*
+ * If there is a pending General Query response
+ * scheduled sooner than the selected delay, no
+ * further report need be scheduled.
+ * Otherwise, prepare to respond to the
+ * group-specific or group-and-source query.
+ */
+ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer)
+ igmp_input_v3_group_query(inm, igi, timer, igmpv3);
+ }
+
+out_locked:
+ IGMP_UNLOCK();
+ IN_MULTI_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Process a received IGMPv3 group-specific or group-and-source-specific
+ * query.
+ * Return <0 if any error occurred. Currently this is ignored.
+ */
+static int
+igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifinfo *igi,
+ int timer, /*const*/ struct igmpv3 *igmpv3)
+{
+ int retval;
+ uint16_t nsrc;
+
+ IN_MULTI_LOCK_ASSERT();
+ IGMP_LOCK_ASSERT();
+
+ retval = 0;
+
+ switch (inm->inm_state) {
+ case IGMP_NOT_MEMBER:
+ case IGMP_SILENT_MEMBER:
+ case IGMP_SLEEPING_MEMBER:
+ case IGMP_LAZY_MEMBER:
+ case IGMP_AWAKENING_MEMBER:
+ case IGMP_IDLE_MEMBER:
+ case IGMP_LEAVING_MEMBER:
+ return (retval);
+ break;
+ case IGMP_REPORTING_MEMBER:
+ case IGMP_G_QUERY_PENDING_MEMBER:
+ case IGMP_SG_QUERY_PENDING_MEMBER:
+ break;
+ }
+
+ nsrc = ntohs(igmpv3->igmp_numsrc);
+
+ /*
+ * Deal with group-specific queries upfront.
+ * If any group query is already pending, purge any recorded
+ * source-list state if it exists, and schedule a query response
+ * for this group-specific query.
+ */
+ if (nsrc == 0) {
+ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
+ inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) {
+ inm_clear_recorded(inm);
+ timer = min(inm->inm_timer, timer);
+ }
+ inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER;
+ inm->inm_timer = IGMP_RANDOM_DELAY(timer);
+ V_current_state_timers_running = 1;
+ return (retval);
+ }
+
+ /*
+ * Deal with the case where a group-and-source-specific query has
+ * been received but a group-specific query is already pending.
+ */
+ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) {
+ timer = min(inm->inm_timer, timer);
+ inm->inm_timer = IGMP_RANDOM_DELAY(timer);
+ V_current_state_timers_running = 1;
+ return (retval);
+ }
+
+ /*
+ * Finally, deal with the case where a group-and-source-specific
+ * query has been received, where a response to a previous g-s-r
+ * query exists, or none exists.
+ * In this case, we need to parse the source-list which the Querier
+ * has provided us with and check if we have any source list filter
+ * entries at T1 for these sources. If we do not, there is no need
+	 * to schedule a report and the query may be dropped.
+ * If we do, we must record them and schedule a current-state
+ * report for those sources.
+ * FIXME: Handling source lists larger than 1 mbuf requires that
+ * we pass the mbuf chain pointer down to this function, and use
+ * m_getptr() to walk the chain.
+ */
+ if (inm->inm_nsrc > 0) {
+ const struct in_addr *ap;
+ int i, nrecorded;
+
+ ap = (const struct in_addr *)(igmpv3 + 1);
+ nrecorded = 0;
+ for (i = 0; i < nsrc; i++, ap++) {
+ retval = inm_record_source(inm, ap->s_addr);
+ if (retval < 0)
+ break;
+ nrecorded += retval;
+ }
+ if (nrecorded > 0) {
+ CTR1(KTR_IGMPV3,
+ "%s: schedule response to SG query", __func__);
+ inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER;
+ inm->inm_timer = IGMP_RANDOM_DELAY(timer);
+ V_current_state_timers_running = 1;
+ }
+ }
+
+ return (retval);
+}
+
+/*
+ * Process a received IGMPv1 host membership report.
+ *
+ * NOTE: 0.0.0.0 workaround breaks const correctness.
+ */
+static int
+igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
+ /*const*/ struct igmp *igmp)
+{
+ struct in_ifaddr *ia;
+ struct in_multi *inm;
+
+ IGMPSTAT_INC(igps_rcv_reports);
+
+ if (ifp->if_flags & IFF_LOOPBACK)
+ return (0);
+
+ if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
+ !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
+ IGMPSTAT_INC(igps_rcv_badreports);
+ return (EINVAL);
+ }
+
+ /*
+ * RFC 3376, Section 4.2.13, 9.2, 9.3:
+ * Booting clients may use the source address 0.0.0.0. Some
+ * IGMP daemons may not know how to use IP_RECVIF to determine
+ * the interface upon which this message was received.
+ * Replace 0.0.0.0 with the subnet address if told to do so.
+ */
+ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
+ IFP_TO_IA(ifp, ia);
+ if (ia != NULL) {
+ ip->ip_src.s_addr = htonl(ia->ia_subnet);
+ ifa_free(&ia->ia_ifa);
+ }
+ }
+
+ CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)",
+ inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
+
+ /*
+ * IGMPv1 report suppression.
+ * If we are a member of this group, and our membership should be
+ * reported, stop our group timer and transition to the 'lazy' state.
+ */
+ IN_MULTI_LOCK();
+ inm = inm_lookup(ifp, igmp->igmp_group);
+ if (inm != NULL) {
+ struct igmp_ifinfo *igi;
+
+ igi = inm->inm_igi;
+ if (igi == NULL) {
+ KASSERT(igi != NULL,
+ ("%s: no igi for ifp %p", __func__, ifp));
+ goto out_locked;
+ }
+
+ IGMPSTAT_INC(igps_rcv_ourreports);
+
+ /*
+ * If we are in IGMPv3 host mode, do not allow the
+ * other host's IGMPv1 report to suppress our reports
+ * unless explicitly configured to do so.
+ */
+ if (igi->igi_version == IGMP_VERSION_3) {
+ if (V_igmp_legacysupp)
+ igmp_v3_suppress_group_record(inm);
+ goto out_locked;
+ }
+
+ inm->inm_timer = 0;
+
+ switch (inm->inm_state) {
+ case IGMP_NOT_MEMBER:
+ case IGMP_SILENT_MEMBER:
+ break;
+ case IGMP_IDLE_MEMBER:
+ case IGMP_LAZY_MEMBER:
+ case IGMP_AWAKENING_MEMBER:
+ CTR3(KTR_IGMPV3,
+ "report suppressed for %s on ifp %p(%s)",
+ inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
+ case IGMP_SLEEPING_MEMBER:
+ inm->inm_state = IGMP_SLEEPING_MEMBER;
+ break;
+ case IGMP_REPORTING_MEMBER:
+ CTR3(KTR_IGMPV3,
+ "report suppressed for %s on ifp %p(%s)",
+ inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
+ if (igi->igi_version == IGMP_VERSION_1)
+ inm->inm_state = IGMP_LAZY_MEMBER;
+ else if (igi->igi_version == IGMP_VERSION_2)
+ inm->inm_state = IGMP_SLEEPING_MEMBER;
+ break;
+ case IGMP_G_QUERY_PENDING_MEMBER:
+ case IGMP_SG_QUERY_PENDING_MEMBER:
+ case IGMP_LEAVING_MEMBER:
+ break;
+ }
+ }
+
+out_locked:
+ IN_MULTI_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Process a received IGMPv2 host membership report.
+ *
+ * NOTE: 0.0.0.0 workaround breaks const correctness.
+ */
+static int
+igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
+ /*const*/ struct igmp *igmp)
+{
+ struct in_ifaddr *ia;
+ struct in_multi *inm;
+
+ /*
+ * Make sure we don't hear our own membership report. Fast
+ * leave requires knowing that we are the only member of a
+ * group.
+ */
+ IFP_TO_IA(ifp, ia);
+ if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) {
+ ifa_free(&ia->ia_ifa);
+ return (0);
+ }
+
+ IGMPSTAT_INC(igps_rcv_reports);
+
+ if (ifp->if_flags & IFF_LOOPBACK) {
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
+ return (0);
+ }
+
+ if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
+ !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
+ IGMPSTAT_INC(igps_rcv_badreports);
+ return (EINVAL);
+ }
+
+ /*
+ * RFC 3376, Section 4.2.13, 9.2, 9.3:
+ * Booting clients may use the source address 0.0.0.0. Some
+ * IGMP daemons may not know how to use IP_RECVIF to determine
+ * the interface upon which this message was received.
+ * Replace 0.0.0.0 with the subnet address if told to do so.
+ */
+ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
+ if (ia != NULL)
+ ip->ip_src.s_addr = htonl(ia->ia_subnet);
+ }
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
+
+ CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)",
+ inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
+
+ /*
+ * IGMPv2 report suppression.
+ * If we are a member of this group, and our membership should be
+ * reported, and our group timer is pending or about to be reset,
+ * stop our group timer by transitioning to the 'lazy' state.
+ */
+ IN_MULTI_LOCK();
+ inm = inm_lookup(ifp, igmp->igmp_group);
+ if (inm != NULL) {
+ struct igmp_ifinfo *igi;
+
+ igi = inm->inm_igi;
+ KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp));
+
+ IGMPSTAT_INC(igps_rcv_ourreports);
+
+ /*
+ * If we are in IGMPv3 host mode, do not allow the
+		 * other host's IGMPv2 report to suppress our reports
+ * unless explicitly configured to do so.
+ */
+ if (igi->igi_version == IGMP_VERSION_3) {
+ if (V_igmp_legacysupp)
+ igmp_v3_suppress_group_record(inm);
+ goto out_locked;
+ }
+
+ inm->inm_timer = 0;
+
+ switch (inm->inm_state) {
+ case IGMP_NOT_MEMBER:
+ case IGMP_SILENT_MEMBER:
+ case IGMP_SLEEPING_MEMBER:
+ break;
+ case IGMP_REPORTING_MEMBER:
+ case IGMP_IDLE_MEMBER:
+ case IGMP_AWAKENING_MEMBER:
+ CTR3(KTR_IGMPV3,
+ "report suppressed for %s on ifp %p(%s)",
+ inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
+ case IGMP_LAZY_MEMBER:
+ inm->inm_state = IGMP_LAZY_MEMBER;
+ break;
+ case IGMP_G_QUERY_PENDING_MEMBER:
+ case IGMP_SG_QUERY_PENDING_MEMBER:
+ case IGMP_LEAVING_MEMBER:
+ break;
+ }
+ }
+
+out_locked:
+ IN_MULTI_UNLOCK();
+
+ return (0);
+}
+
+void
+igmp_input(struct mbuf *m, int off)
+{
+ int iphlen;
+ struct ifnet *ifp;
+ struct igmp *igmp;
+ struct ip *ip;
+ int igmplen;
+ int minlen;
+ int queryver;
+
+ CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, m, off);
+
+ ifp = m->m_pkthdr.rcvif;
+
+ IGMPSTAT_INC(igps_rcv_total);
+
+ ip = mtod(m, struct ip *);
+ iphlen = off;
+ igmplen = ip->ip_len;
+
+ /*
+ * Validate lengths.
+ */
+ if (igmplen < IGMP_MINLEN) {
+ IGMPSTAT_INC(igps_rcv_tooshort);
+ m_freem(m);
+ return;
+ }
+
+ /*
+ * Always pullup to the minimum size for v1/v2 or v3
+ * to amortize calls to m_pullup().
+ */
+ minlen = iphlen;
+ if (igmplen >= IGMP_V3_QUERY_MINLEN)
+ minlen += IGMP_V3_QUERY_MINLEN;
+ else
+ minlen += IGMP_MINLEN;
+ if ((m->m_flags & M_EXT || m->m_len < minlen) &&
+ (m = m_pullup(m, minlen)) == 0) {
+ IGMPSTAT_INC(igps_rcv_tooshort);
+ return;
+ }
+ ip = mtod(m, struct ip *);
+
+ /*
+ * Validate checksum.
+ */
+ m->m_data += iphlen;
+ m->m_len -= iphlen;
+ igmp = mtod(m, struct igmp *);
+ if (in_cksum(m, igmplen)) {
+ IGMPSTAT_INC(igps_rcv_badsum);
+ m_freem(m);
+ return;
+ }
+ m->m_data -= iphlen;
+ m->m_len += iphlen;
+
+ /*
+ * IGMP control traffic is link-scope, and must have a TTL of 1.
+ * DVMRP traffic (e.g. mrinfo, mtrace) is an exception;
+ * probe packets may come from beyond the LAN.
+ */
+ if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) {
+ IGMPSTAT_INC(igps_rcv_badttl);
+ m_freem(m);
+ return;
+ }
+
+ switch (igmp->igmp_type) {
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ if (igmplen == IGMP_MINLEN) {
+ if (igmp->igmp_code == 0)
+ queryver = IGMP_VERSION_1;
+ else
+ queryver = IGMP_VERSION_2;
+ } else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
+ queryver = IGMP_VERSION_3;
+ } else {
+ IGMPSTAT_INC(igps_rcv_tooshort);
+ m_freem(m);
+ return;
+ }
+
+ switch (queryver) {
+ case IGMP_VERSION_1:
+ IGMPSTAT_INC(igps_rcv_v1v2_queries);
+ if (!V_igmp_v1enable)
+ break;
+ if (igmp_input_v1_query(ifp, ip, igmp) != 0) {
+ m_freem(m);
+ return;
+ }
+ break;
+
+ case IGMP_VERSION_2:
+ IGMPSTAT_INC(igps_rcv_v1v2_queries);
+ if (!V_igmp_v2enable)
+ break;
+ if (igmp_input_v2_query(ifp, ip, igmp) != 0) {
+ m_freem(m);
+ return;
+ }
+ break;
+
+ case IGMP_VERSION_3: {
+ struct igmpv3 *igmpv3;
+ uint16_t igmpv3len;
+ uint16_t srclen;
+ int nsrc;
+
+ IGMPSTAT_INC(igps_rcv_v3_queries);
+ igmpv3 = (struct igmpv3 *)igmp;
+ /*
+ * Validate length based on source count.
+ */
+ nsrc = ntohs(igmpv3->igmp_numsrc);
+ srclen = sizeof(struct in_addr) * nsrc;
+ if (nsrc * sizeof(in_addr_t) > srclen) {
+ IGMPSTAT_INC(igps_rcv_tooshort);
+ return;
+ }
+ /*
+ * m_pullup() may modify m, so pullup in
+ * this scope.
+ */
+ igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN +
+ srclen;
+ if ((m->m_flags & M_EXT ||
+ m->m_len < igmpv3len) &&
+ (m = m_pullup(m, igmpv3len)) == NULL) {
+ IGMPSTAT_INC(igps_rcv_tooshort);
+ return;
+ }
+ igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *)
+ + iphlen);
+ if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
+ m_freem(m);
+ return;
+ }
+ }
+ break;
+ }
+ break;
+
+ case IGMP_v1_HOST_MEMBERSHIP_REPORT:
+ if (!V_igmp_v1enable)
+ break;
+ if (igmp_input_v1_report(ifp, ip, igmp) != 0) {
+ m_freem(m);
+ return;
+ }
+ break;
+
+ case IGMP_v2_HOST_MEMBERSHIP_REPORT:
+ if (!V_igmp_v2enable)
+ break;
+ if (!ip_checkrouteralert(m))
+ IGMPSTAT_INC(igps_rcv_nora);
+ if (igmp_input_v2_report(ifp, ip, igmp) != 0) {
+ m_freem(m);
+ return;
+ }
+ break;
+
+ case IGMP_v3_HOST_MEMBERSHIP_REPORT:
+ /*
+ * Hosts do not need to process IGMPv3 membership reports,
+ * as report suppression is no longer required.
+ */
+ if (!ip_checkrouteralert(m))
+ IGMPSTAT_INC(igps_rcv_nora);
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * Pass all valid IGMP packets up to any process(es) listening on a
+ * raw IGMP socket.
+ */
+ rip_input(m, off);
+}
+
+
+/*
+ * Fast timeout handler (global).
+ * VIMAGE: Timeout handlers are expected to service all vimages.
+ */
+void
+igmp_fasttimo(void)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ VNET_LIST_RLOCK_NOSLEEP();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ igmp_fasttimo_vnet();
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK_NOSLEEP();
+}
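+/*
+ * The protocol fast timeout is expected to run PR_FASTHZ (normally 5)
+ * times per second, so the per-group and per-interface counters
+ * decremented below tick in units of roughly 200 ms.
+ */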
+
+/*
+ * Fast timeout handler (per-vnet).
+ * Sends are shuffled off to a netisr to deal with Giant.
+ *
+ * VIMAGE: Assume caller has set up our curvnet.
+ */
+static void
+igmp_fasttimo_vnet(void)
+{
+ struct ifqueue scq; /* State-change packets */
+ struct ifqueue qrq; /* Query response packets */
+ struct ifnet *ifp;
+ struct igmp_ifinfo *igi;
+ struct ifmultiaddr *ifma, *tifma;
+ struct in_multi *inm;
+ int loop, uri_fasthz;
+
+ loop = 0;
+ uri_fasthz = 0;
+
+ /*
+ * Quick check to see if any work needs to be done, in order to
+ * minimize the overhead of fasttimo processing.
+ * SMPng: XXX Unlocked reads.
+ */
+ if (!V_current_state_timers_running &&
+ !V_interface_timers_running &&
+ !V_state_change_timers_running)
+ return;
+
+ IN_MULTI_LOCK();
+ IGMP_LOCK();
+
+ /*
+ * IGMPv3 General Query response timer processing.
+ */
+ if (V_interface_timers_running) {
+ CTR1(KTR_IGMPV3, "%s: interface timers running", __func__);
+
+ V_interface_timers_running = 0;
+ LIST_FOREACH(igi, &V_igi_head, igi_link) {
+ if (igi->igi_v3_timer == 0) {
+ /* Do nothing. */
+ } else if (--igi->igi_v3_timer == 0) {
+ igmp_v3_dispatch_general_query(igi);
+ } else {
+ V_interface_timers_running = 1;
+ }
+ }
+ }
+
+ if (!V_current_state_timers_running &&
+ !V_state_change_timers_running)
+ goto out_locked;
+
+ V_current_state_timers_running = 0;
+ V_state_change_timers_running = 0;
+
+ CTR1(KTR_IGMPV3, "%s: state change timers running", __func__);
+
+ /*
+ * IGMPv1/v2/v3 host report and state-change timer processing.
+ * Note: Processing a v3 group timer may remove a node.
+ */
+ LIST_FOREACH(igi, &V_igi_head, igi_link) {
+ ifp = igi->igi_ifp;
+
+ if (igi->igi_version == IGMP_VERSION_3) {
+ loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
+ uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri *
+ PR_FASTHZ);
+
+ memset(&qrq, 0, sizeof(struct ifqueue));
+ IFQ_SET_MAXLEN(&qrq, IGMP_MAX_G_GS_PACKETS);
+
+ memset(&scq, 0, sizeof(struct ifqueue));
+ IFQ_SET_MAXLEN(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
+ }
+
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link,
+ tifma) {
+ if (ifma->ifma_addr->sa_family != AF_INET ||
+ ifma->ifma_protospec == NULL)
+ continue;
+ inm = (struct in_multi *)ifma->ifma_protospec;
+ switch (igi->igi_version) {
+ case IGMP_VERSION_1:
+ case IGMP_VERSION_2:
+ igmp_v1v2_process_group_timer(inm,
+ igi->igi_version);
+ break;
+ case IGMP_VERSION_3:
+ igmp_v3_process_group_timers(igi, &qrq,
+ &scq, inm, uri_fasthz);
+ break;
+ }
+ }
+ IF_ADDR_UNLOCK(ifp);
+
+ if (igi->igi_version == IGMP_VERSION_3) {
+ struct in_multi *tinm;
+
+ igmp_dispatch_queue(&qrq, 0, loop);
+ igmp_dispatch_queue(&scq, 0, loop);
+
+ /*
+ * Free the in_multi reference(s) for this
+ * IGMP lifecycle.
+ */
+ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead,
+ inm_nrele, tinm) {
+ SLIST_REMOVE_HEAD(&igi->igi_relinmhead,
+ inm_nrele);
+ inm_release_locked(inm);
+ }
+ }
+ }
+
+out_locked:
+ IGMP_UNLOCK();
+ IN_MULTI_UNLOCK();
+}
+
+/*
+ * Update host report group timer for IGMPv1/v2.
+ * Will update the global pending timer flags.
+ */
+static void
+igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
+{
+ int report_timer_expired;
+
+ IN_MULTI_LOCK_ASSERT();
+ IGMP_LOCK_ASSERT();
+
+ if (inm->inm_timer == 0) {
+ report_timer_expired = 0;
+ } else if (--inm->inm_timer == 0) {
+ report_timer_expired = 1;
+ } else {
+ V_current_state_timers_running = 1;
+ return;
+ }
+
+ switch (inm->inm_state) {
+ case IGMP_NOT_MEMBER:
+ case IGMP_SILENT_MEMBER:
+ case IGMP_IDLE_MEMBER:
+ case IGMP_LAZY_MEMBER:
+ case IGMP_SLEEPING_MEMBER:
+ case IGMP_AWAKENING_MEMBER:
+ break;
+ case IGMP_REPORTING_MEMBER:
+ if (report_timer_expired) {
+ inm->inm_state = IGMP_IDLE_MEMBER;
+ (void)igmp_v1v2_queue_report(inm,
+ (version == IGMP_VERSION_2) ?
+ IGMP_v2_HOST_MEMBERSHIP_REPORT :
+ IGMP_v1_HOST_MEMBERSHIP_REPORT);
+ }
+ break;
+ case IGMP_G_QUERY_PENDING_MEMBER:
+ case IGMP_SG_QUERY_PENDING_MEMBER:
+ case IGMP_LEAVING_MEMBER:
+ break;
+ }
+}
+
+/*
+ * Update a group's timers for IGMPv3.
+ * Will update the global pending timer flags.
+ * Note: Unlocked read from igi.
+ */
+static void
+igmp_v3_process_group_timers(struct igmp_ifinfo *igi,
+ struct ifqueue *qrq, struct ifqueue *scq,
+ struct in_multi *inm, const int uri_fasthz)
+{
+ int query_response_timer_expired;
+ int state_change_retransmit_timer_expired;
+
+ IN_MULTI_LOCK_ASSERT();
+ IGMP_LOCK_ASSERT();
+
+ query_response_timer_expired = 0;
+ state_change_retransmit_timer_expired = 0;
+
+ /*
+ * During a transition from v1/v2 compatibility mode back to v3,
+ * a group record in REPORTING state may still have its group
+ * timer active. This is a no-op in this function; it is easier
+ * to deal with it here than to complicate the slow-timeout path.
+ */
+ if (inm->inm_timer == 0) {
+ query_response_timer_expired = 0;
+ } else if (--inm->inm_timer == 0) {
+ query_response_timer_expired = 1;
+ } else {
+ V_current_state_timers_running = 1;
+ }
+
+ if (inm->inm_sctimer == 0) {
+ state_change_retransmit_timer_expired = 0;
+ } else if (--inm->inm_sctimer == 0) {
+ state_change_retransmit_timer_expired = 1;
+ } else {
+ V_state_change_timers_running = 1;
+ }
+
+ /* We are in fasttimo, so be quick about it. */
+ if (!state_change_retransmit_timer_expired &&
+ !query_response_timer_expired)
+ return;
+
+ switch (inm->inm_state) {
+ case IGMP_NOT_MEMBER:
+ case IGMP_SILENT_MEMBER:
+ case IGMP_SLEEPING_MEMBER:
+ case IGMP_LAZY_MEMBER:
+ case IGMP_AWAKENING_MEMBER:
+ case IGMP_IDLE_MEMBER:
+ break;
+ case IGMP_G_QUERY_PENDING_MEMBER:
+ case IGMP_SG_QUERY_PENDING_MEMBER:
+ /*
+ * Respond to a previously pending Group-Specific
+ * or Group-and-Source-Specific query by enqueueing
+ * the appropriate Current-State report for
+ * immediate transmission.
+ */
+ if (query_response_timer_expired) {
+ int retval;
+
+ retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1,
+ (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER));
+ CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
+ __func__, retval);
+ inm->inm_state = IGMP_REPORTING_MEMBER;
+ /* XXX Clear recorded sources for next time. */
+ inm_clear_recorded(inm);
+ }
+ /* FALLTHROUGH */
+ case IGMP_REPORTING_MEMBER:
+ case IGMP_LEAVING_MEMBER:
+ if (state_change_retransmit_timer_expired) {
+ /*
+ * State-change retransmission timer fired.
+ * If there are any further pending retransmissions,
+ * set the global pending state-change flag, and
+ * reset the timer.
+ */
+ if (--inm->inm_scrv > 0) {
+ inm->inm_sctimer = uri_fasthz;
+ V_state_change_timers_running = 1;
+ }
+ /*
+ * Retransmit the previously computed state-change
+ * report. If there are no further pending
+ * retransmissions, the mbuf queue will be consumed.
+ * Update T0 state to T1 as we have now sent
+ * a state-change.
+ */
+ (void)igmp_v3_merge_state_changes(inm, scq);
+
+ inm_commit(inm);
+ CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
+ inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
+
+ /*
+ * If we are leaving the group for good, make sure
+ * we release IGMP's reference to it.
+ * This release must be deferred using a SLIST,
+ * as we are called from a loop which traverses
+ * the in_ifmultiaddr TAILQ.
+ */
+ if (inm->inm_state == IGMP_LEAVING_MEMBER &&
+ inm->inm_scrv == 0) {
+ inm->inm_state = IGMP_NOT_MEMBER;
+ SLIST_INSERT_HEAD(&igi->igi_relinmhead,
+ inm, inm_nrele);
+ }
+ }
+ break;
+ }
+}
+
+
+/*
+ * Suppress a group's pending response to a group or source/group query.
+ *
+ * Do NOT suppress state changes. This leads to IGMPv3 inconsistency.
+ * Do NOT update ST1/ST0 as this operation merely suppresses
+ * the currently pending group record.
+ * Do NOT suppress the response to a general query. It is possible but
+ * it would require adding another state or flag.
+ */
+static void
+igmp_v3_suppress_group_record(struct in_multi *inm)
+{
+
+ IN_MULTI_LOCK_ASSERT();
+
+ KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3,
+ ("%s: not IGMPv3 mode on link", __func__));
+
+	if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER &&
+	    inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER)
+ return;
+
+ if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
+ inm_clear_recorded(inm);
+
+ inm->inm_timer = 0;
+ inm->inm_state = IGMP_REPORTING_MEMBER;
+}
+
+/*
+ * Switch to a different IGMP version on the given interface,
+ * as per Section 7.2.1.
+ */
+static void
+igmp_set_version(struct igmp_ifinfo *igi, const int version)
+{
+ int old_version_timer;
+
+ IGMP_LOCK_ASSERT();
+
+ CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__,
+ version, igi->igi_ifp, igi->igi_ifp->if_xname);
+
+ if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) {
+ /*
+ * Compute the "Older Version Querier Present" timer as per
+ * Section 8.12.
+ */
+ old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri;
+ old_version_timer *= PR_SLOWHZ;
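+		/*
+		 * Worked example, assuming the defaults used elsewhere in
+		 * this file (Robustness Variable 2, Query Interval 125s,
+		 * Query Response Interval 10s): 2 * 125 + 10 = 260 seconds,
+		 * or 520 slowtimo ticks if PR_SLOWHZ is the usual 2 slow
+		 * ticks per second.
+		 */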
+
+ if (version == IGMP_VERSION_1) {
+ igi->igi_v1_timer = old_version_timer;
+ igi->igi_v2_timer = 0;
+ } else if (version == IGMP_VERSION_2) {
+ igi->igi_v1_timer = 0;
+ igi->igi_v2_timer = old_version_timer;
+ }
+ }
+
+ if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
+ if (igi->igi_version != IGMP_VERSION_2) {
+ igi->igi_version = IGMP_VERSION_2;
+ igmp_v3_cancel_link_timers(igi);
+ }
+ } else if (igi->igi_v1_timer > 0) {
+ if (igi->igi_version != IGMP_VERSION_1) {
+ igi->igi_version = IGMP_VERSION_1;
+ igmp_v3_cancel_link_timers(igi);
+ }
+ }
+}
+
+/*
+ * Cancel pending IGMPv3 timers for the given link and all groups
+ * joined on it; state-change, general-query, and group-query timers.
+ *
+ * Only ever called on a transition from v3 to Compatibility mode. Kill
+ * the timers stone dead (this may be expensive for large N groups), they
+ * will be restarted if Compatibility Mode deems that they must be due to
+ * query processing.
+ */
+static void
+igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi)
+{
+ struct ifmultiaddr *ifma;
+ struct ifnet *ifp;
+ struct in_multi *inm;
+
+ CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__,
+ igi->igi_ifp, igi->igi_ifp->if_xname);
+
+ IN_MULTI_LOCK_ASSERT();
+ IGMP_LOCK_ASSERT();
+
+ /*
+ * Stop the v3 General Query Response on this link stone dead.
+ * If fasttimo is woken up due to V_interface_timers_running,
+ * the flag will be cleared if there are no pending link timers.
+ */
+ igi->igi_v3_timer = 0;
+
+ /*
+ * Now clear the current-state and state-change report timers
+ * for all memberships scoped to this link.
+ */
+ ifp = igi->igi_ifp;
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ if (ifma->ifma_addr->sa_family != AF_INET ||
+ ifma->ifma_protospec == NULL)
+ continue;
+ inm = (struct in_multi *)ifma->ifma_protospec;
+ switch (inm->inm_state) {
+ case IGMP_NOT_MEMBER:
+ case IGMP_SILENT_MEMBER:
+ case IGMP_IDLE_MEMBER:
+ case IGMP_LAZY_MEMBER:
+ case IGMP_SLEEPING_MEMBER:
+ case IGMP_AWAKENING_MEMBER:
+ /*
+ * These states are either not relevant in v3 mode,
+ * or are unreported. Do nothing.
+ */
+ break;
+ case IGMP_LEAVING_MEMBER:
+ /*
+ * If we are leaving the group and switching to
+ * compatibility mode, we need to release the final
+ * reference held for issuing the INCLUDE {}, and
+ * transition to REPORTING to ensure the host leave
+ * message is sent upstream to the old querier --
+ * transition to NOT would lose the leave and race.
+ *
+ * SMPNG: Must drop and re-acquire IF_ADDR_LOCK
+ * around inm_release_locked(), as it is not
+ * a recursive mutex.
+ */
+ IF_ADDR_UNLOCK(ifp);
+ inm_release_locked(inm);
+ IF_ADDR_LOCK(ifp);
+ /* FALLTHROUGH */
+ case IGMP_G_QUERY_PENDING_MEMBER:
+ case IGMP_SG_QUERY_PENDING_MEMBER:
+ inm_clear_recorded(inm);
+ /* FALLTHROUGH */
+ case IGMP_REPORTING_MEMBER:
+ inm->inm_state = IGMP_REPORTING_MEMBER;
+ break;
+ }
+ /*
+ * Always clear state-change and group report timers.
+ * Free any pending IGMPv3 state-change records.
+ */
+ inm->inm_sctimer = 0;
+ inm->inm_timer = 0;
+ _IF_DRAIN(&inm->inm_scq);
+ }
+ IF_ADDR_UNLOCK(ifp);
+}
+
+/*
+ * Update the Older Version Querier Present timers for a link.
+ * See Section 7.2.1 of RFC 3376.
+ */
+static void
+igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi)
+{
+
+ IGMP_LOCK_ASSERT();
+
+ if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) {
+ /*
+ * IGMPv1 and IGMPv2 Querier Present timers expired.
+ *
+ * Revert to IGMPv3.
+ */
+ if (igi->igi_version != IGMP_VERSION_3) {
+ CTR5(KTR_IGMPV3,
+ "%s: transition from v%d -> v%d on %p(%s)",
+ __func__, igi->igi_version, IGMP_VERSION_3,
+ igi->igi_ifp, igi->igi_ifp->if_xname);
+ igi->igi_version = IGMP_VERSION_3;
+ }
+ } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
+ /*
+ * IGMPv1 Querier Present timer expired,
+ * IGMPv2 Querier Present timer running.
+ * If IGMPv2 was disabled since last timeout,
+ * revert to IGMPv3.
+ * If IGMPv2 is enabled, revert to IGMPv2.
+ */
+ if (!V_igmp_v2enable) {
+ CTR5(KTR_IGMPV3,
+ "%s: transition from v%d -> v%d on %p(%s)",
+ __func__, igi->igi_version, IGMP_VERSION_3,
+ igi->igi_ifp, igi->igi_ifp->if_xname);
+ igi->igi_v2_timer = 0;
+ igi->igi_version = IGMP_VERSION_3;
+ } else {
+ --igi->igi_v2_timer;
+ if (igi->igi_version != IGMP_VERSION_2) {
+ CTR5(KTR_IGMPV3,
+ "%s: transition from v%d -> v%d on %p(%s)",
+ __func__, igi->igi_version, IGMP_VERSION_2,
+ igi->igi_ifp, igi->igi_ifp->if_xname);
+ igi->igi_version = IGMP_VERSION_2;
+ }
+ }
+ } else if (igi->igi_v1_timer > 0) {
+ /*
+ * IGMPv1 Querier Present timer running.
+ * Stop IGMPv2 timer if running.
+ *
+ * If IGMPv1 was disabled since last timeout,
+ * revert to IGMPv3.
+ * If IGMPv1 is enabled, reset IGMPv2 timer if running.
+ */
+ if (!V_igmp_v1enable) {
+ CTR5(KTR_IGMPV3,
+ "%s: transition from v%d -> v%d on %p(%s)",
+ __func__, igi->igi_version, IGMP_VERSION_3,
+ igi->igi_ifp, igi->igi_ifp->if_xname);
+ igi->igi_v1_timer = 0;
+ igi->igi_version = IGMP_VERSION_3;
+ } else {
+ --igi->igi_v1_timer;
+ }
+ if (igi->igi_v2_timer > 0) {
+ CTR3(KTR_IGMPV3,
+ "%s: cancel v2 timer on %p(%s)",
+ __func__, igi->igi_ifp, igi->igi_ifp->if_xname);
+ igi->igi_v2_timer = 0;
+ }
+ }
+}
+
+/*
+ * Global slowtimo handler.
+ * VIMAGE: Timeout handlers are expected to service all vimages.
+ */
+void
+igmp_slowtimo(void)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ VNET_LIST_RLOCK_NOSLEEP();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ igmp_slowtimo_vnet();
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK_NOSLEEP();
+}
+
+/*
+ * Per-vnet slowtimo handler.
+ */
+static void
+igmp_slowtimo_vnet(void)
+{
+ struct igmp_ifinfo *igi;
+
+ IGMP_LOCK();
+
+ LIST_FOREACH(igi, &V_igi_head, igi_link) {
+ igmp_v1v2_process_querier_timers(igi);
+ }
+
+ IGMP_UNLOCK();
+}
+
+/*
+ * Dispatch an IGMPv1/v2 host report or leave message.
+ * These are always small enough to fit inside a single mbuf.
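+ * (The message built here is a fixed 20-byte IPv4 header plus an
+ * 8-byte IGMP message, 28 bytes in total, so one header mbuf suffices.)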
+ */
+static int
+igmp_v1v2_queue_report(struct in_multi *inm, const int type)
+{
+ struct ifnet *ifp;
+ struct igmp *igmp;
+ struct ip *ip;
+ struct mbuf *m;
+
+ IN_MULTI_LOCK_ASSERT();
+ IGMP_LOCK_ASSERT();
+
+ ifp = inm->inm_ifp;
+
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (m == NULL)
+ return (ENOMEM);
+ MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
+
+ m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp);
+
+ m->m_data += sizeof(struct ip);
+ m->m_len = sizeof(struct igmp);
+
+ igmp = mtod(m, struct igmp *);
+ igmp->igmp_type = type;
+ igmp->igmp_code = 0;
+ igmp->igmp_group = inm->inm_addr;
+ igmp->igmp_cksum = 0;
+ igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp));
+
+ m->m_data -= sizeof(struct ip);
+ m->m_len += sizeof(struct ip);
+
+ ip = mtod(m, struct ip *);
+ ip->ip_tos = 0;
+ ip->ip_len = sizeof(struct ip) + sizeof(struct igmp);
+ ip->ip_off = 0;
+ ip->ip_p = IPPROTO_IGMP;
+ ip->ip_src.s_addr = INADDR_ANY;
+
+ if (type == IGMP_HOST_LEAVE_MESSAGE)
+ ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP);
+ else
+ ip->ip_dst = inm->inm_addr;
+
+ igmp_save_context(m, ifp);
+
+ m->m_flags |= M_IGMPV2;
+ if (inm->inm_igi->igi_flags & IGIF_LOOPBACK)
+ m->m_flags |= M_IGMP_LOOP;
+
+ CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m);
+ netisr_dispatch(NETISR_IGMP, m);
+
+ return (0);
+}
+
+/*
+ * Process a state change from the upper layer for the given IPv4 group.
+ *
+ * Each socket holds a reference on the in_multi in its own ip_moptions.
+ * The socket layer will have made the necessary updates to the group
+ * state, it is now up to IGMP to issue a state change report if there
+ * has been any change between T0 (when the last state-change was issued)
+ * and T1 (now).
+ *
+ * We use the IGMPv3 state machine at group level. The IGMP module
+ * however makes the decision as to which IGMP protocol version to speak.
+ * A state change *from* INCLUDE {} always means an initial join.
+ * A state change *to* INCLUDE {} always means a final leave.
+ *
+ * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can
+ * save ourselves a bunch of work; any exclusive mode groups need not
+ * compute source filter lists.
+ *
+ * VIMAGE: curvnet should have been set by caller, as this routine
+ * is called from the socket option handlers.
+ */
+int
+igmp_change_state(struct in_multi *inm)
+{
+ struct igmp_ifinfo *igi;
+ struct ifnet *ifp;
+ int error;
+
+ IN_MULTI_LOCK_ASSERT();
+
+ error = 0;
+
+ /*
+ * Try to detect if the upper layer just asked us to change state
+ * for an interface which has now gone away.
+ */
+ KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
+ ifp = inm->inm_ifma->ifma_ifp;
+ if (ifp != NULL) {
+ /*
+ * Sanity check that netinet's notion of ifp is the
+ * same as net's.
+ */
+ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
+ }
+
+ IGMP_LOCK();
+
+ igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
+ KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
+
+ /*
+ * If we detect a state transition to or from MCAST_UNDEFINED
+ * for this group, then we are starting or finishing an IGMP
+ * life cycle for this group.
+ */
+ if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) {
+ CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__,
+ inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode);
+ if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) {
+ CTR1(KTR_IGMPV3, "%s: initial join", __func__);
+ error = igmp_initial_join(inm, igi);
+ goto out_locked;
+ } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) {
+ CTR1(KTR_IGMPV3, "%s: final leave", __func__);
+ igmp_final_leave(inm, igi);
+ goto out_locked;
+ }
+ } else {
+ CTR1(KTR_IGMPV3, "%s: filter set change", __func__);
+ }
+
+ error = igmp_handle_state_change(inm, igi);
+
+out_locked:
+ IGMP_UNLOCK();
+ return (error);
+}
+
+/*
+ * Perform the initial join for an IGMP group.
+ *
+ * When joining a group:
+ * If the group should have its IGMP traffic suppressed, do nothing.
+ * IGMPv1 starts sending IGMPv1 host membership reports.
+ * IGMPv2 starts sending IGMPv2 host membership reports.
+ * IGMPv3 will schedule an IGMPv3 state-change report containing the
+ * initial state of the membership.
+ */
+static int
+igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi)
+{
+ struct ifnet *ifp;
+ struct ifqueue *ifq;
+ int error, retval, syncstates;
+
+ CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)",
+ __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
+ inm->inm_ifp->if_xname);
+
+ error = 0;
+ syncstates = 1;
+
+ ifp = inm->inm_ifp;
+
+ IN_MULTI_LOCK_ASSERT();
+ IGMP_LOCK_ASSERT();
+
+ KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
+
+ /*
+ * Groups joined on loopback or marked as 'not reported',
+ * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and
+ * are never reported in any IGMP protocol exchanges.
+ * All other groups enter the appropriate IGMP state machine
+ * for the version in use on this link.
+ * A link marked as IGIF_SILENT causes IGMP to be completely
+ * disabled for the link.
+ */
+ if ((ifp->if_flags & IFF_LOOPBACK) ||
+ (igi->igi_flags & IGIF_SILENT) ||
+ !igmp_isgroupreported(inm->inm_addr)) {
+ CTR1(KTR_IGMPV3,
+"%s: not kicking state machine for silent group", __func__);
+ inm->inm_state = IGMP_SILENT_MEMBER;
+ inm->inm_timer = 0;
+ } else {
+ /*
+ * Deal with overlapping in_multi lifecycle.
+ * If this group was LEAVING, then make sure
+ * we drop the reference we picked up to keep the
+ * group around for the final INCLUDE {} enqueue.
+ */
+ if (igi->igi_version == IGMP_VERSION_3 &&
+ inm->inm_state == IGMP_LEAVING_MEMBER)
+ inm_release_locked(inm);
+
+ inm->inm_state = IGMP_REPORTING_MEMBER;
+
+ switch (igi->igi_version) {
+ case IGMP_VERSION_1:
+ case IGMP_VERSION_2:
+ inm->inm_state = IGMP_IDLE_MEMBER;
+ error = igmp_v1v2_queue_report(inm,
+ (igi->igi_version == IGMP_VERSION_2) ?
+ IGMP_v2_HOST_MEMBERSHIP_REPORT :
+ IGMP_v1_HOST_MEMBERSHIP_REPORT);
+ if (error == 0) {
+ inm->inm_timer = IGMP_RANDOM_DELAY(
+ IGMP_V1V2_MAX_RI * PR_FASTHZ);
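+			/*
+			 * Assuming the customary IGMP_V1V2_MAX_RI of 10
+			 * seconds and PR_FASTHZ of 5 fast ticks per second,
+			 * this schedules the report a random 1..50 ticks
+			 * (up to roughly 10 seconds) from now.
+			 */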
+ V_current_state_timers_running = 1;
+ }
+ break;
+
+ case IGMP_VERSION_3:
+ /*
+ * Defer update of T0 to T1, until the first copy
+ * of the state change has been transmitted.
+ */
+ syncstates = 0;
+
+ /*
+ * Immediately enqueue a State-Change Report for
+ * this interface, freeing any previous reports.
+ * Don't kick the timers if there is nothing to do,
+ * or if an error occurred.
+ */
+ ifq = &inm->inm_scq;
+ _IF_DRAIN(ifq);
+ retval = igmp_v3_enqueue_group_record(ifq, inm, 1,
+ 0, 0);
+ CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
+ __func__, retval);
+ if (retval <= 0) {
+ error = retval * -1;
+ break;
+ }
+
+ /*
+ * Schedule transmission of pending state-change
+ * report up to RV times for this link. The timer
+ * will fire at the next igmp_fasttimo (~200ms),
+ * giving us an opportunity to merge the reports.
+ */
+ if (igi->igi_flags & IGIF_LOOPBACK) {
+ inm->inm_scrv = 1;
+ } else {
+ KASSERT(igi->igi_rv > 1,
+ ("%s: invalid robustness %d", __func__,
+ igi->igi_rv));
+ inm->inm_scrv = igi->igi_rv;
+ }
+ inm->inm_sctimer = 1;
+ V_state_change_timers_running = 1;
+
+ error = 0;
+ break;
+ }
+ }
+
+ /*
+ * Only update the T0 state if state change is atomic,
+ * i.e. we don't need to wait for a timer to fire before we
+ * can consider the state change to have been communicated.
+ */
+ if (syncstates) {
+ inm_commit(inm);
+ CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
+ inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
+ }
+
+ return (error);
+}
+
+/*
+ * Issue an intermediate state change during the IGMP life-cycle.
+ */
+static int
+igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi)
+{
+ struct ifnet *ifp;
+ int retval;
+
+ CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)",
+ __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
+ inm->inm_ifp->if_xname);
+
+ ifp = inm->inm_ifp;
+
+ IN_MULTI_LOCK_ASSERT();
+ IGMP_LOCK_ASSERT();
+
+ KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
+
+ if ((ifp->if_flags & IFF_LOOPBACK) ||
+ (igi->igi_flags & IGIF_SILENT) ||
+ !igmp_isgroupreported(inm->inm_addr) ||
+ (igi->igi_version != IGMP_VERSION_3)) {
+ if (!igmp_isgroupreported(inm->inm_addr)) {
+ CTR1(KTR_IGMPV3,
+"%s: not kicking state machine for silent group", __func__);
+ }
+ CTR1(KTR_IGMPV3, "%s: nothing to do", __func__);
+ inm_commit(inm);
+ CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
+ inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
+ return (0);
+ }
+
+ _IF_DRAIN(&inm->inm_scq);
+
+ retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0);
+ CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval);
+ if (retval <= 0)
+ return (-retval);
+
+ /*
+ * If record(s) were enqueued, start the state-change
+ * report timer for this group.
+ */
+ inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv);
+ inm->inm_sctimer = 1;
+ V_state_change_timers_running = 1;
+
+ return (0);
+}
+
+/*
+ * Perform the final leave for an IGMP group.
+ *
+ * When leaving a group:
+ * IGMPv1 does nothing.
+ * IGMPv2 sends a host leave message, if and only if we are the reporter.
+ * IGMPv3 enqueues a state-change report containing a transition
+ * to INCLUDE {} for immediate transmission.
+ */
+static void
+igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi)
+{
+ int syncstates;
+
+ syncstates = 1;
+
+ CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)",
+ __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
+ inm->inm_ifp->if_xname);
+
+ IN_MULTI_LOCK_ASSERT();
+ IGMP_LOCK_ASSERT();
+
+ switch (inm->inm_state) {
+ case IGMP_NOT_MEMBER:
+ case IGMP_SILENT_MEMBER:
+ case IGMP_LEAVING_MEMBER:
+ /* Already leaving or left; do nothing. */
+ CTR1(KTR_IGMPV3,
+"%s: not kicking state machine for silent group", __func__);
+ break;
+ case IGMP_REPORTING_MEMBER:
+ case IGMP_IDLE_MEMBER:
+ case IGMP_G_QUERY_PENDING_MEMBER:
+ case IGMP_SG_QUERY_PENDING_MEMBER:
+ if (igi->igi_version == IGMP_VERSION_2) {
+#ifdef INVARIANTS
+ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
+ inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
+ panic("%s: IGMPv3 state reached, not IGMPv3 mode",
+ __func__);
+#endif
+ igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE);
+ inm->inm_state = IGMP_NOT_MEMBER;
+ } else if (igi->igi_version == IGMP_VERSION_3) {
+ /*
+ * Stop group timer and all pending reports.
+ * Immediately enqueue a state-change report
+ * TO_IN {} to be sent on the next fast timeout,
+ * giving us an opportunity to merge reports.
+ */
+ _IF_DRAIN(&inm->inm_scq);
+ inm->inm_timer = 0;
+ if (igi->igi_flags & IGIF_LOOPBACK) {
+ inm->inm_scrv = 1;
+ } else {
+ inm->inm_scrv = igi->igi_rv;
+ }
+ CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d "
+ "pending retransmissions.", __func__,
+ inet_ntoa(inm->inm_addr),
+ inm->inm_ifp->if_xname, inm->inm_scrv);
+ if (inm->inm_scrv == 0) {
+ inm->inm_state = IGMP_NOT_MEMBER;
+ inm->inm_sctimer = 0;
+ } else {
+ int retval;
+
+ inm_acquire_locked(inm);
+
+ retval = igmp_v3_enqueue_group_record(
+ &inm->inm_scq, inm, 1, 0, 0);
+ KASSERT(retval != 0,
+ ("%s: enqueue record = %d", __func__,
+ retval));
+
+ inm->inm_state = IGMP_LEAVING_MEMBER;
+ inm->inm_sctimer = 1;
+ V_state_change_timers_running = 1;
+ syncstates = 0;
+ }
+ break;
+ }
+ break;
+ case IGMP_LAZY_MEMBER:
+ case IGMP_SLEEPING_MEMBER:
+ case IGMP_AWAKENING_MEMBER:
+ /* Our reports are suppressed; do nothing. */
+ break;
+ }
+
+ if (syncstates) {
+ inm_commit(inm);
+ CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
+ inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
+ inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
+ CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s",
+ __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
+ }
+}
+
+/*
+ * Enqueue an IGMPv3 group record to the given output queue.
+ *
+ * XXX This function could do with having the allocation code
+ * split out, and the multiple-tree-walks coalesced into a single
+ * routine as has been done in igmp_v3_enqueue_filter_change().
+ *
+ * If is_state_change is zero, a current-state record is appended.
+ * If is_state_change is non-zero, a state-change report is appended.
+ *
+ * If is_group_query is non-zero, an mbuf packet chain is allocated.
+ * If is_group_query is zero, and there is a packet with free space
+ * at the tail of the queue, the record will be appended to that
+ * packet, provided there is enough free space.
+ * Otherwise a new mbuf packet chain is allocated.
+ *
+ * If is_source_query is non-zero, each source is checked to see if
+ * it was recorded for a Group-Source query, and will be omitted if
+ * it is not both in-mode and recorded.
+ *
+ * The function will attempt to allocate leading space in the packet
+ * for the IP/IGMP header to be prepended without fragmenting the chain.
+ *
+ * If successful the size of all data appended to the queue is returned,
+ * otherwise an error code less than zero is returned, or zero if
+ * no record(s) were appended.
+ */
+static int
+igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
+ const int is_state_change, const int is_group_query,
+ const int is_source_query)
+{
+ struct igmp_grouprec ig;
+ struct igmp_grouprec *pig;
+ struct ifnet *ifp;
+ struct ip_msource *ims, *nims;
+ struct mbuf *m0, *m, *md;
+ int error, is_filter_list_change;
+ int minrec0len, m0srcs, msrcs, nbytes, off;
+ int record_has_sources;
+ int now;
+ int type;
+ in_addr_t naddr;
+ uint8_t mode;
+
+ IN_MULTI_LOCK_ASSERT();
+
+ error = 0;
+ ifp = inm->inm_ifp;
+ is_filter_list_change = 0;
+ m = NULL;
+ m0 = NULL;
+ m0srcs = 0;
+ msrcs = 0;
+ nbytes = 0;
+ nims = NULL;
+ record_has_sources = 1;
+ pig = NULL;
+ type = IGMP_DO_NOTHING;
+ mode = inm->inm_st[1].iss_fmode;
+
+ /*
+ * If we did not transition out of ASM mode during t0->t1,
+ * and there are no source nodes to process, we can skip
+ * the generation of source records.
+ */
+ if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 &&
+ inm->inm_nsrc == 0)
+ record_has_sources = 0;
+
+ if (is_state_change) {
+ /*
+ * Queue a state change record.
+ * If the mode did not change, and there are non-ASM
+ * listeners or source filters present,
+ * we potentially need to issue two records for the group.
+ * If we are transitioning to MCAST_UNDEFINED, we need
+ * not send any sources.
+ * If there are ASM listeners, and there was no filter
+ * mode transition of any kind, do nothing.
+ */
+ if (mode != inm->inm_st[0].iss_fmode) {
+ if (mode == MCAST_EXCLUDE) {
+ CTR1(KTR_IGMPV3, "%s: change to EXCLUDE",
+ __func__);
+ type = IGMP_CHANGE_TO_EXCLUDE_MODE;
+ } else {
+ CTR1(KTR_IGMPV3, "%s: change to INCLUDE",
+ __func__);
+ type = IGMP_CHANGE_TO_INCLUDE_MODE;
+ if (mode == MCAST_UNDEFINED)
+ record_has_sources = 0;
+ }
+ } else {
+ if (record_has_sources) {
+ is_filter_list_change = 1;
+ } else {
+ type = IGMP_DO_NOTHING;
+ }
+ }
+ } else {
+ /*
+ * Queue a current state record.
+ */
+ if (mode == MCAST_EXCLUDE) {
+ type = IGMP_MODE_IS_EXCLUDE;
+ } else if (mode == MCAST_INCLUDE) {
+ type = IGMP_MODE_IS_INCLUDE;
+ KASSERT(inm->inm_st[1].iss_asm == 0,
+ ("%s: inm %p is INCLUDE but ASM count is %d",
+ __func__, inm, inm->inm_st[1].iss_asm));
+ }
+ }
+
+ /*
+ * Generate the filter list changes using a separate function.
+ */
+ if (is_filter_list_change)
+ return (igmp_v3_enqueue_filter_change(ifq, inm));
+
+ if (type == IGMP_DO_NOTHING) {
+ CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s",
+ __func__, inet_ntoa(inm->inm_addr),
+ inm->inm_ifp->if_xname);
+ return (0);
+ }
+
+ /*
+ * If any sources are present, we must be able to fit at least
+ * one in the trailing space of the tail packet's mbuf,
+ * ideally more.
+ */
+ minrec0len = sizeof(struct igmp_grouprec);
+ if (record_has_sources)
+ minrec0len += sizeof(in_addr_t);
+
+ CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__,
+ igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr),
+ inm->inm_ifp->if_xname);
+
+ /*
+ * Check if we have a packet in the tail of the queue for this
+ * group into which the first group record for this group will fit.
+ * Otherwise allocate a new packet.
+ * Always allocate leading space for IP+RA_OPT+IGMP+REPORT.
+ * Note: Group records for G/GSR query responses MUST be sent
+ * in their own packet.
+ */
+ m0 = ifq->ifq_tail;
+ if (!is_group_query &&
+ m0 != NULL &&
+ (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) &&
+ (m0->m_pkthdr.len + minrec0len) <
+ (ifp->if_mtu - IGMP_LEADINGSPACE)) {
+ m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
+ sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
+ m = m0;
+ CTR1(KTR_IGMPV3, "%s: use existing packet", __func__);
+ } else {
+ if (_IF_QFULL(ifq)) {
+ CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
+ return (-ENOMEM);
+ }
+ m = NULL;
+ m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
+ sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
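+		/*
+		 * For example, on a standard 1500-byte Ethernet MTU,
+		 * IGMP_LEADINGSPACE is 20 (IP) + 4 (Router Alert) + 8
+		 * (IGMPv3 report header) = 32 bytes and a group record
+		 * header is 8 bytes, so a fresh packet has room for
+		 * (1500 - 32 - 8) / 4 = 365 source addresses.
+		 */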
+ if (!is_state_change && !is_group_query) {
+ m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+ if (m)
+ m->m_data += IGMP_LEADINGSPACE;
+ }
+ if (m == NULL) {
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
+ if (m)
+ MH_ALIGN(m, IGMP_LEADINGSPACE);
+ }
+ if (m == NULL)
+ return (-ENOMEM);
+
+ igmp_save_context(m, ifp);
+
+ CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__);
+ }
+
+ /*
+ * Append group record.
+ * If we have sources, we don't know how many yet.
+ */
+ ig.ig_type = type;
+ ig.ig_datalen = 0;
+ ig.ig_numsrc = 0;
+ ig.ig_group = inm->inm_addr;
+ if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
+ if (m != m0)
+ m_freem(m);
+ CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
+ return (-ENOMEM);
+ }
+ nbytes += sizeof(struct igmp_grouprec);
+
+ /*
+ * Append as many sources as will fit in the first packet.
+ * If we are appending to a new packet, the chain allocation
+ * may potentially use clusters; use m_getptr() in this case.
+ * If we are appending to an existing packet, we need to obtain
+ * a pointer to the group record after m_append(), in case a new
+ * mbuf was allocated.
+ * Only append sources which are in-mode at t1. If we are
+ * transitioning to MCAST_UNDEFINED state on the group, do not
+ * include source entries.
+ * Only report recorded sources in our filter set when responding
+ * to a group-source query.
+ */
+ if (record_has_sources) {
+ if (m == m0) {
+ md = m_last(m);
+ pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
+ md->m_len - nbytes);
+ } else {
+ md = m_getptr(m, 0, &off);
+ pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
+ off);
+ }
+ msrcs = 0;
+ RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) {
+ CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
+ inet_ntoa_haddr(ims->ims_haddr));
+ now = ims_get_mode(inm, ims, 1);
+ CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now);
+ if ((now != mode) ||
+ (now == mode && mode == MCAST_UNDEFINED)) {
+ CTR1(KTR_IGMPV3, "%s: skip node", __func__);
+ continue;
+ }
+ if (is_source_query && ims->ims_stp == 0) {
+ CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
+ __func__);
+ continue;
+ }
+ CTR1(KTR_IGMPV3, "%s: append node", __func__);
+ naddr = htonl(ims->ims_haddr);
+ if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
+ if (m != m0)
+ m_freem(m);
+ CTR1(KTR_IGMPV3, "%s: m_append() failed.",
+ __func__);
+ return (-ENOMEM);
+ }
+ nbytes += sizeof(in_addr_t);
+ ++msrcs;
+ if (msrcs == m0srcs)
+ break;
+ }
+ CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__,
+ msrcs);
+ pig->ig_numsrc = htons(msrcs);
+ nbytes += (msrcs * sizeof(in_addr_t));
+ }
+
+ if (is_source_query && msrcs == 0) {
+ CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__);
+ if (m != m0)
+ m_freem(m);
+ return (0);
+ }
+
+ /*
+ * We are good to go with first packet.
+ */
+ if (m != m0) {
+ CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__);
+ m->m_pkthdr.PH_vt.vt_nrecs = 1;
+ _IF_ENQUEUE(ifq, m);
+ } else
+ m->m_pkthdr.PH_vt.vt_nrecs++;
+
+ /*
+ * No further work needed if no source list in packet(s).
+ */
+ if (!record_has_sources)
+ return (nbytes);
+
+ /*
+ * Whilst sources remain to be announced, we need to allocate
+ * a new packet and fill out as many sources as will fit.
+ * Always try for a cluster first.
+ */
+ while (nims != NULL) {
+ if (_IF_QFULL(ifq)) {
+ CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
+ return (-ENOMEM);
+ }
+ m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+ if (m)
+ m->m_data += IGMP_LEADINGSPACE;
+ if (m == NULL) {
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
+ if (m)
+ MH_ALIGN(m, IGMP_LEADINGSPACE);
+ }
+ if (m == NULL)
+ return (-ENOMEM);
+ igmp_save_context(m, ifp);
+ md = m_getptr(m, 0, &off);
+ pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off);
+ CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__);
+
+ if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
+ if (m != m0)
+ m_freem(m);
+ CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
+ return (-ENOMEM);
+ }
+ m->m_pkthdr.PH_vt.vt_nrecs = 1;
+ nbytes += sizeof(struct igmp_grouprec);
+
+ m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
+ sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
+
+ msrcs = 0;
+ RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
+ CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
+ inet_ntoa_haddr(ims->ims_haddr));
+ now = ims_get_mode(inm, ims, 1);
+ if ((now != mode) ||
+ (now == mode && mode == MCAST_UNDEFINED)) {
+ CTR1(KTR_IGMPV3, "%s: skip node", __func__);
+ continue;
+ }
+ if (is_source_query && ims->ims_stp == 0) {
+ CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
+ __func__);
+ continue;
+ }
+ CTR1(KTR_IGMPV3, "%s: append node", __func__);
+ naddr = htonl(ims->ims_haddr);
+ if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
+ if (m != m0)
+ m_freem(m);
+ CTR1(KTR_IGMPV3, "%s: m_append() failed.",
+ __func__);
+ return (-ENOMEM);
+ }
+ ++msrcs;
+ if (msrcs == m0srcs)
+ break;
+ }
+ pig->ig_numsrc = htons(msrcs);
+ nbytes += (msrcs * sizeof(in_addr_t));
+
+ CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__);
+ _IF_ENQUEUE(ifq, m);
+ }
+
+ return (nbytes);
+}
+
+/*
+ * Type used to mark record pass completion.
+ * We exploit the fact we can cast to this easily from the
+ * current filter modes on each ip_msource node.
+ */
+typedef enum {
+ REC_NONE = 0x00, /* MCAST_UNDEFINED */
+ REC_ALLOW = 0x01, /* MCAST_INCLUDE */
+ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */
+ REC_FULL = REC_ALLOW | REC_BLOCK
+} rectype_t;
+
+/*
+ * Enqueue an IGMPv3 filter list change to the given output queue.
+ *
+ * Source list filter state is held in an RB-tree. When the filter list
+ * for a group is changed without changing its mode, we need to compute
+ * the deltas between T0 and T1 for each source in the filter set,
+ * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
+ *
+ * As we may potentially queue two record types, and the entire R-B tree
+ * needs to be walked at once, we break this out into its own function
+ * so we can generate a tightly packed queue of packets.
+ *
+ * XXX This could be written to only use one tree walk, although that makes
+ * serializing into the mbuf chains a bit harder. For now we do two walks
+ * which makes things easier on us, and it may or may not be harder on
+ * the L2 cache.
+ *
+ * If successful the size of all data appended to the queue is returned,
+ * otherwise an error code less than zero is returned, or zero if
+ * no record(s) were appended.
+ */
+static int
+igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm)
+{
+ static const int MINRECLEN =
+ sizeof(struct igmp_grouprec) + sizeof(in_addr_t);
+ struct ifnet *ifp;
+ struct igmp_grouprec ig;
+ struct igmp_grouprec *pig;
+ struct ip_msource *ims, *nims;
+ struct mbuf *m, *m0, *md;
+ in_addr_t naddr;
+ int m0srcs, nbytes, npbytes, off, rsrcs, schanged;
+ int nallow, nblock;
+ uint8_t mode, now, then;
+ rectype_t crt, drt, nrt;
+
+ IN_MULTI_LOCK_ASSERT();
+
+ if (inm->inm_nsrc == 0 ||
+ (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0))
+ return (0);
+
+ ifp = inm->inm_ifp; /* interface */
+ mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */
+ crt = REC_NONE; /* current group record type */
+ drt = REC_NONE; /* mask of completed group record types */
+ nrt = REC_NONE; /* record type for current node */
+ m0srcs = 0; /* # source which will fit in current mbuf chain */
+ nbytes = 0; /* # of bytes appended to group's state-change queue */
+ npbytes = 0; /* # of bytes appended this packet */
+ rsrcs = 0; /* # sources encoded in current record */
+ schanged = 0; /* # nodes encoded in overall filter change */
+ nallow = 0; /* # of source entries in ALLOW_NEW */
+ nblock = 0; /* # of source entries in BLOCK_OLD */
+ nims = NULL; /* next tree node pointer */
+
+ /*
+ * For each possible filter record mode.
+ * The first kind of source we encounter tells us which
+ * is the first kind of record we start appending.
+ * If a node transitioned to UNDEFINED at t1, its mode is treated
+ * as the inverse of the group's filter mode.
+ */
+ while (drt != REC_FULL) {
+ do {
+ m0 = ifq->ifq_tail;
+ if (m0 != NULL &&
+ (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
+ IGMP_V3_REPORT_MAXRECS) &&
+ (m0->m_pkthdr.len + MINRECLEN) <
+ (ifp->if_mtu - IGMP_LEADINGSPACE)) {
+ m = m0;
+ m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
+ sizeof(struct igmp_grouprec)) /
+ sizeof(in_addr_t);
+ CTR1(KTR_IGMPV3,
+ "%s: use previous packet", __func__);
+ } else {
+ m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+ if (m)
+ m->m_data += IGMP_LEADINGSPACE;
+ if (m == NULL) {
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
+ if (m)
+ MH_ALIGN(m, IGMP_LEADINGSPACE);
+ }
+ if (m == NULL) {
+ CTR1(KTR_IGMPV3,
+ "%s: m_get*() failed", __func__);
+ return (-ENOMEM);
+ }
+ m->m_pkthdr.PH_vt.vt_nrecs = 0;
+ igmp_save_context(m, ifp);
+ m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
+ sizeof(struct igmp_grouprec)) /
+ sizeof(in_addr_t);
+ npbytes = 0;
+ CTR1(KTR_IGMPV3,
+ "%s: allocated new packet", __func__);
+ }
+ /*
+ * Append the IGMP group record header to the
+ * current packet's data area.
+ * Recalculate pointer to free space for next
+ * group record, in case m_append() allocated
+ * a new mbuf or cluster.
+ */
+ memset(&ig, 0, sizeof(ig));
+ ig.ig_group = inm->inm_addr;
+ if (!m_append(m, sizeof(ig), (void *)&ig)) {
+ if (m != m0)
+ m_freem(m);
+ CTR1(KTR_IGMPV3,
+ "%s: m_append() failed", __func__);
+ return (-ENOMEM);
+ }
+ npbytes += sizeof(struct igmp_grouprec);
+ if (m != m0) {
+ /* new packet; offset in chain */
+ md = m_getptr(m, npbytes -
+ sizeof(struct igmp_grouprec), &off);
+ pig = (struct igmp_grouprec *)(mtod(md,
+ uint8_t *) + off);
+ } else {
+ /* current packet; offset from last append */
+ md = m_last(m);
+ pig = (struct igmp_grouprec *)(mtod(md,
+ uint8_t *) + md->m_len -
+ sizeof(struct igmp_grouprec));
+ }
+ /*
+ * Begin walking the tree for this record type
+ * pass, or continue from where we left off
+ * previously if we had to allocate a new packet.
+ * Only report deltas in-mode at t1.
+ * We need not report included sources as allowed
+ * if we are in inclusive mode on the group,
+ * however the converse is not true.
+ */
+ rsrcs = 0;
+ if (nims == NULL)
+ nims = RB_MIN(ip_msource_tree, &inm->inm_srcs);
+ RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
+ CTR2(KTR_IGMPV3, "%s: visit node %s",
+ __func__, inet_ntoa_haddr(ims->ims_haddr));
+ now = ims_get_mode(inm, ims, 1);
+ then = ims_get_mode(inm, ims, 0);
+ CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d",
+ __func__, then, now);
+ if (now == then) {
+ CTR1(KTR_IGMPV3,
+ "%s: skip unchanged", __func__);
+ continue;
+ }
+ if (mode == MCAST_EXCLUDE &&
+ now == MCAST_INCLUDE) {
+ CTR1(KTR_IGMPV3,
+ "%s: skip IN src on EX group",
+ __func__);
+ continue;
+ }
+ nrt = (rectype_t)now;
+ if (nrt == REC_NONE)
+ nrt = (rectype_t)(~mode & REC_FULL);
+ if (schanged++ == 0) {
+ crt = nrt;
+ } else if (crt != nrt)
+ continue;
+ naddr = htonl(ims->ims_haddr);
+ if (!m_append(m, sizeof(in_addr_t),
+ (void *)&naddr)) {
+ if (m != m0)
+ m_freem(m);
+ CTR1(KTR_IGMPV3,
+ "%s: m_append() failed", __func__);
+ return (-ENOMEM);
+ }
+ nallow += !!(crt == REC_ALLOW);
+ nblock += !!(crt == REC_BLOCK);
+ if (++rsrcs == m0srcs)
+ break;
+ }
+ /*
+ * If we did not append any tree nodes on this
+ * pass, back out of allocations.
+ */
+ if (rsrcs == 0) {
+ npbytes -= sizeof(struct igmp_grouprec);
+ if (m != m0) {
+ CTR1(KTR_IGMPV3,
+ "%s: m_free(m)", __func__);
+ m_freem(m);
+ } else {
+ CTR1(KTR_IGMPV3,
+ "%s: m_adj(m, -ig)", __func__);
+ m_adj(m, -((int)sizeof(
+ struct igmp_grouprec)));
+ }
+ continue;
+ }
+ npbytes += (rsrcs * sizeof(in_addr_t));
+ if (crt == REC_ALLOW)
+ pig->ig_type = IGMP_ALLOW_NEW_SOURCES;
+ else if (crt == REC_BLOCK)
+ pig->ig_type = IGMP_BLOCK_OLD_SOURCES;
+ pig->ig_numsrc = htons(rsrcs);
+ /*
+ * Count the new group record, and enqueue this
+ * packet if it wasn't already queued.
+ */
+ m->m_pkthdr.PH_vt.vt_nrecs++;
+ if (m != m0)
+ _IF_ENQUEUE(ifq, m);
+ nbytes += npbytes;
+ } while (nims != NULL);
+ drt |= crt;
+ crt = (~crt & REC_FULL);
+ }
+
+ CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
+ nallow, nblock);
+
+ return (nbytes);
+}
+
+static int
+igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
+{
+ struct ifqueue *gq;
+ struct mbuf *m; /* pending state-change */
+ struct mbuf *m0; /* copy of pending state-change */
+ struct mbuf *mt; /* last state-change in packet */
+ int docopy, domerge;
+ u_int recslen;
+
+ docopy = 0;
+ domerge = 0;
+ recslen = 0;
+
+ IN_MULTI_LOCK_ASSERT();
+ IGMP_LOCK_ASSERT();
+
+ /*
+ * If there are further pending retransmissions, make a writable
+ * copy of each queued state-change message before merging.
+ */
+ if (inm->inm_scrv > 0)
+ docopy = 1;
+
+ gq = &inm->inm_scq;
+#ifdef KTR
+ if (gq->ifq_head == NULL) {
+ CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty",
+ __func__, inm);
+ }
+#endif
+
+ m = gq->ifq_head;
+ while (m != NULL) {
+ /*
+ * Only merge the report into the current packet if
+ * there is sufficient space to do so; an IGMPv3 report
+ * packet may only contain 65,535 group records.
+ * Always use a simple mbuf chain concatenation to do this,
+ * as large state changes for single groups may have
+ * allocated clusters.
+ */
+ domerge = 0;
+ mt = ifscq->ifq_tail;
+ if (mt != NULL) {
+ recslen = m_length(m, NULL);
+
+ if ((mt->m_pkthdr.PH_vt.vt_nrecs +
+ m->m_pkthdr.PH_vt.vt_nrecs <=
+ IGMP_V3_REPORT_MAXRECS) &&
+ (mt->m_pkthdr.len + recslen <=
+ (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE)))
+ domerge = 1;
+ }
+
+ if (!domerge && _IF_QFULL(gq)) {
+ CTR2(KTR_IGMPV3,
+ "%s: outbound queue full, skipping whole packet %p",
+ __func__, m);
+ mt = m->m_nextpkt;
+ if (!docopy)
+ m_freem(m);
+ m = mt;
+ continue;
+ }
+
+ if (!docopy) {
+ CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m);
+ _IF_DEQUEUE(gq, m0);
+ m = m0->m_nextpkt;
+ } else {
+ CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m);
+ m0 = m_dup(m, M_NOWAIT);
+ if (m0 == NULL)
+ return (ENOMEM);
+ m0->m_nextpkt = NULL;
+ m = m->m_nextpkt;
+ }
+
+ if (!domerge) {
+ CTR3(KTR_IGMPV3, "%s: queueing %p to ifscq %p)",
+ __func__, m0, ifscq);
+ _IF_ENQUEUE(ifscq, m0);
+ } else {
+ struct mbuf *mtl; /* last mbuf of packet mt */
+
+ CTR3(KTR_IGMPV3, "%s: merging %p with ifscq tail %p)",
+ __func__, m0, mt);
+
+ mtl = m_last(mt);
+ m0->m_flags &= ~M_PKTHDR;
+ mt->m_pkthdr.len += recslen;
+ mt->m_pkthdr.PH_vt.vt_nrecs +=
+ m0->m_pkthdr.PH_vt.vt_nrecs;
+
+ mtl->m_next = m0;
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Respond to a pending IGMPv3 General Query.
+ */
+static void
+igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi)
+{
+ struct ifmultiaddr *ifma, *tifma;
+ struct ifnet *ifp;
+ struct in_multi *inm;
+ int retval, loop;
+
+ IN_MULTI_LOCK_ASSERT();
+ IGMP_LOCK_ASSERT();
+
+ KASSERT(igi->igi_version == IGMP_VERSION_3,
+ ("%s: called when version %d", __func__, igi->igi_version));
+
+ ifp = igi->igi_ifp;
+
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, tifma) {
+ if (ifma->ifma_addr->sa_family != AF_INET ||
+ ifma->ifma_protospec == NULL)
+ continue;
+
+ inm = (struct in_multi *)ifma->ifma_protospec;
+ KASSERT(ifp == inm->inm_ifp,
+ ("%s: inconsistent ifp", __func__));
+
+ switch (inm->inm_state) {
+ case IGMP_NOT_MEMBER:
+ case IGMP_SILENT_MEMBER:
+ break;
+ case IGMP_REPORTING_MEMBER:
+ case IGMP_IDLE_MEMBER:
+ case IGMP_LAZY_MEMBER:
+ case IGMP_SLEEPING_MEMBER:
+ case IGMP_AWAKENING_MEMBER:
+ inm->inm_state = IGMP_REPORTING_MEMBER;
+ retval = igmp_v3_enqueue_group_record(&igi->igi_gq,
+ inm, 0, 0, 0);
+ CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
+ __func__, retval);
+ break;
+ case IGMP_G_QUERY_PENDING_MEMBER:
+ case IGMP_SG_QUERY_PENDING_MEMBER:
+ case IGMP_LEAVING_MEMBER:
+ break;
+ }
+ }
+ IF_ADDR_UNLOCK(ifp);
+
+ loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
+ igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop);
+
+ /*
+ * Slew transmission of bursts over 500ms intervals.
+ */
+ if (igi->igi_gq.ifq_head != NULL) {
+ igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY(
+ IGMP_RESPONSE_BURST_INTERVAL);
+ V_interface_timers_running = 1;
+ }
+}
+
+/*
+ * Transmit the next pending IGMP message in the output queue.
+ *
+ * We get called from netisr_processqueue(). A mutex private to igmpoq
+ * will be acquired and released around this routine.
+ *
+ * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
+ * MRT: Nothing needs to be done, as IGMP traffic is always local to
+ * a link and uses a link-scope multicast address.
+ */
+static void
+igmp_intr(struct mbuf *m)
+{
+ struct ip_moptions imo;
+ struct ifnet *ifp;
+ struct mbuf *ipopts, *m0;
+ int error;
+ uint32_t ifindex;
+
+ CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m);
+
+ /*
+ * Set VNET image pointer from enqueued mbuf chain
+ * before doing anything else. Whilst we use interface
+ * indexes to guard against interface detach, they are
+ * unique to each VIMAGE and must be retrieved.
+ */
+ CURVNET_SET((struct vnet *)(m->m_pkthdr.header));
+ ifindex = igmp_restore_context(m);
+
+ /*
+ * Check if the ifnet still exists. This limits the scope of
+ * any race in the absence of a global ifp lock for low cost
+ * (an array lookup).
+ */
+ ifp = ifnet_byindex(ifindex);
+ if (ifp == NULL) {
+ CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.",
+ __func__, m, ifindex);
+ m_freem(m);
+ IPSTAT_INC(ips_noroute);
+ goto out;
+ }
+
+ ipopts = V_igmp_sendra ? m_raopt : NULL;
+
+ imo.imo_multicast_ttl = 1;
+ imo.imo_multicast_vif = -1;
+ imo.imo_multicast_loop = (V_ip_mrouter != NULL);
+
+ /*
+ * If the user requested that IGMP traffic be explicitly
+ * redirected to the loopback interface (e.g. they are running a
+ * MANET interface and the routing protocol needs to see the
+ * updates), handle this now.
+ */
+ if (m->m_flags & M_IGMP_LOOP)
+ imo.imo_multicast_ifp = V_loif;
+ else
+ imo.imo_multicast_ifp = ifp;
+
+ if (m->m_flags & M_IGMPV2) {
+ m0 = m;
+ } else {
+ m0 = igmp_v3_encap_report(ifp, m);
+ if (m0 == NULL) {
+ CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m);
+ m_freem(m);
+ IPSTAT_INC(ips_odropped);
+ goto out;
+ }
+ }
+
+ igmp_scrub_context(m0);
+ m->m_flags &= ~(M_PROTOFLAGS);
+ m0->m_pkthdr.rcvif = V_loif;
+#ifdef MAC
+ mac_netinet_igmp_send(ifp, m0);
+#endif
+ error = ip_output(m0, ipopts, NULL, 0, &imo, NULL);
+ if (error) {
+ CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error);
+ goto out;
+ }
+
+ IGMPSTAT_INC(igps_snd_reports);
+
+out:
+ /*
+ * We must restore the existing vnet pointer before
+ * continuing as we are run from netisr context.
+ */
+ CURVNET_RESTORE();
+}
+
+/*
+ * Encapsulate an IGMPv3 report.
+ *
+ * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf
+ * chain has already had its IP/IGMPv3 header prepended. In this case
+ * the function will not attempt to prepend; the lengths and checksums
+ * will however be re-computed.
+ *
+ * Returns a pointer to the new mbuf chain head, or NULL if the
+ * allocation failed.
+ */
+static struct mbuf *
+igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
+{
+ struct igmp_report *igmp;
+ struct ip *ip;
+ int hdrlen, igmpreclen;
+
+ KASSERT((m->m_flags & M_PKTHDR),
+ ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
+
+ igmpreclen = m_length(m, NULL);
+ hdrlen = sizeof(struct ip) + sizeof(struct igmp_report);
+
+ if (m->m_flags & M_IGMPV3_HDR) {
+ igmpreclen -= hdrlen;
+ } else {
+ M_PREPEND(m, hdrlen, M_DONTWAIT);
+ if (m == NULL)
+ return (NULL);
+ m->m_flags |= M_IGMPV3_HDR;
+ }
+
+ CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen);
+
+ m->m_data += sizeof(struct ip);
+ m->m_len -= sizeof(struct ip);
+
+ igmp = mtod(m, struct igmp_report *);
+ igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT;
+ igmp->ir_rsv1 = 0;
+ igmp->ir_rsv2 = 0;
+ igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs);
+ igmp->ir_cksum = 0;
+ igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen);
+ m->m_pkthdr.PH_vt.vt_nrecs = 0;
+
+ m->m_data -= sizeof(struct ip);
+ m->m_len += sizeof(struct ip);
+
+ ip = mtod(m, struct ip *);
+ ip->ip_tos = IPTOS_PREC_INTERNETCONTROL;
+ ip->ip_len = hdrlen + igmpreclen;
+ ip->ip_off = IP_DF;
+ ip->ip_p = IPPROTO_IGMP;
+ ip->ip_sum = 0;
+
+ ip->ip_src.s_addr = INADDR_ANY;
+
+ if (m->m_flags & M_IGMP_LOOP) {
+ struct in_ifaddr *ia;
+
+ IFP_TO_IA(ifp, ia);
+ if (ia != NULL) {
+ ip->ip_src = ia->ia_addr.sin_addr;
+ ifa_free(&ia->ia_ifa);
+ }
+ }
+
+ ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP);
+
+ return (m);
+}
+
+#ifdef KTR
+static char *
+igmp_rec_type_to_str(const int type)
+{
+
+ switch (type) {
+ case IGMP_CHANGE_TO_EXCLUDE_MODE:
+ return "TO_EX";
+ break;
+ case IGMP_CHANGE_TO_INCLUDE_MODE:
+ return "TO_IN";
+ break;
+ case IGMP_MODE_IS_EXCLUDE:
+ return "MODE_EX";
+ break;
+ case IGMP_MODE_IS_INCLUDE:
+ return "MODE_IN";
+ break;
+ case IGMP_ALLOW_NEW_SOURCES:
+ return "ALLOW_NEW";
+ break;
+ case IGMP_BLOCK_OLD_SOURCES:
+ return "BLOCK_OLD";
+ break;
+ default:
+ break;
+ }
+ return "unknown";
+}
+#endif
+
+static void
+igmp_init(void *unused __unused)
+{
+
+ CTR1(KTR_IGMPV3, "%s: initializing", __func__);
+
+ IGMP_LOCK_INIT();
+
+ m_raopt = igmp_ra_alloc();
+
+ netisr_register(&igmp_nh);
+}
+SYSINIT(igmp_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_init, NULL);
+
+static void
+igmp_uninit(void *unused __unused)
+{
+
+ CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
+
+ netisr_unregister(&igmp_nh);
+
+ m_free(m_raopt);
+ m_raopt = NULL;
+
+ IGMP_LOCK_DESTROY();
+}
+SYSUNINIT(igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_uninit, NULL);
+
+static void
+vnet_igmp_init(const void *unused __unused)
+{
+
+ CTR1(KTR_IGMPV3, "%s: initializing", __func__);
+
+ LIST_INIT(&V_igi_head);
+}
+VNET_SYSINIT(vnet_igmp_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_init,
+ NULL);
+
+static void
+vnet_igmp_uninit(const void *unused __unused)
+{
+
+ CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
+
+ KASSERT(LIST_EMPTY(&V_igi_head),
+ ("%s: igi list not empty; ifnets not detached?", __func__));
+}
+VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
+ vnet_igmp_uninit, NULL);
+
+static int
+igmp_modevent(module_t mod, int type, void *unused __unused)
+{
+
+ switch (type) {
+ case MOD_LOAD:
+ case MOD_UNLOAD:
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+ return (0);
+}
+
+static moduledata_t igmp_mod = {
+ "igmp",
+ igmp_modevent,
+ 0
+};
+DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
diff --git a/freebsd/sys/netinet/igmp.h b/freebsd/sys/netinet/igmp.h
new file mode 100644
index 00000000..f328d21f
--- /dev/null
+++ b/freebsd/sys/netinet/igmp.h
@@ -0,0 +1,2 @@
+#include <freebsd/bsd.h>
+#include <freebsd/netinet/igmp.h>
diff --git a/freebsd/sys/netinet/igmp_var.h b/freebsd/sys/netinet/igmp_var.h
new file mode 100644
index 00000000..e1abe6ab
--- /dev/null
+++ b/freebsd/sys/netinet/igmp_var.h
@@ -0,0 +1,225 @@
+/*-
+ * Copyright (c) 1988 Stephen Deering.
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Stephen Deering of Stanford University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)igmp_var.h 8.1 (Berkeley) 7/19/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IGMP_VAR_HH_
+#define _NETINET_IGMP_VAR_HH_
+
+/*
+ * Internet Group Management Protocol (IGMP),
+ * implementation-specific definitions.
+ *
+ * Written by Steve Deering, Stanford, May 1988.
+ *
+ * MULTICAST Revision: 3.5.1.3
+ */
+
+#ifndef BURN_BRIDGES
+/*
+ * Pre-IGMPV3 igmpstat structure.
+ */
+struct oigmpstat {
+ u_int igps_rcv_total; /* total IGMP messages received */
+ u_int igps_rcv_tooshort; /* received with too few bytes */
+ u_int igps_rcv_badsum; /* received with bad checksum */
+ u_int igps_rcv_queries; /* received membership queries */
+ u_int igps_rcv_badqueries; /* received invalid queries */
+ u_int igps_rcv_reports; /* received membership reports */
+ u_int igps_rcv_badreports; /* received invalid reports */
+ u_int igps_rcv_ourreports; /* received reports for our groups */
+ u_int igps_snd_reports; /* sent membership reports */
+ u_int igps_rcv_toolong; /* received with too many bytes */
+};
+#endif
+
+/*
+ * IGMPv3 protocol statistics.
+ */
+struct igmpstat {
+ /*
+ * Structure header (to insulate ABI changes).
+ */
+ uint32_t igps_version; /* version of this structure */
+ uint32_t igps_len; /* length of this structure */
+ /*
+ * Message statistics.
+ */
+ uint64_t igps_rcv_total; /* total IGMP messages received */
+ uint64_t igps_rcv_tooshort; /* received with too few bytes */
+ uint64_t igps_rcv_badttl; /* received with ttl other than 1 */
+ uint64_t igps_rcv_badsum; /* received with bad checksum */
+ /*
+ * Query statistics.
+ */
+ uint64_t igps_rcv_v1v2_queries; /* received IGMPv1/IGMPv2 queries */
+ uint64_t igps_rcv_v3_queries; /* received IGMPv3 queries */
+ uint64_t igps_rcv_badqueries; /* received invalid queries */
+ uint64_t igps_rcv_gen_queries; /* received general queries */
+ uint64_t igps_rcv_group_queries;/* received group queries */
+ uint64_t igps_rcv_gsr_queries; /* received group-source queries */
+ uint64_t igps_drop_gsr_queries; /* dropped group-source queries */
+ /*
+ * Report statistics.
+ */
+ uint64_t igps_rcv_reports; /* received membership reports */
+ uint64_t igps_rcv_badreports; /* received invalid reports */
+ uint64_t igps_rcv_ourreports; /* received reports for our groups */
+ uint64_t igps_rcv_nora; /* received w/o Router Alert option */
+ uint64_t igps_snd_reports; /* sent membership reports */
+ /*
+ * Padding for future additions.
+ */
+ uint64_t __igps_pad[4];
+};
+#define IGPS_VERSION_3 3 /* as of FreeBSD 8.x */
+#define IGPS_VERSION3_LEN 168
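+/*
+ * 168 bytes = two 32-bit header fields plus twenty 64-bit counters
+ * (sixteen statistics and the four-entry pad).
+ */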
+
+#ifdef _KERNEL
+#define IGMPSTAT_ADD(name, val) V_igmpstat.name += (val)
+#define IGMPSTAT_INC(name) IGMPSTAT_ADD(name, 1)
+#endif
+
+#ifdef CTASSERT
+CTASSERT(sizeof(struct igmpstat) == 168);
+#endif
+
+#ifdef _KERNEL
+#define IGMP_RANDOM_DELAY(X) (random() % (X) + 1)
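+/* IGMP_RANDOM_DELAY(X) picks a pseudo-random delay in the range [1, X] ticks. */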
+
+#define IGMP_MAX_STATE_CHANGES 24 /* Max pending changes per group */
+
+/*
+ * IGMP per-group states.
+ */
+#define IGMP_NOT_MEMBER 0 /* Can garbage collect in_multi */
+#define IGMP_SILENT_MEMBER 1 /* Do not perform IGMP for group */
+#define IGMP_REPORTING_MEMBER 2 /* IGMPv1/2/3 we are reporter */
+#define IGMP_IDLE_MEMBER 3 /* IGMPv1/2 we reported last */
+#define IGMP_LAZY_MEMBER 4 /* IGMPv1/2 other member reporting */
+#define IGMP_SLEEPING_MEMBER 5 /* IGMPv1/2 start query response */
+#define IGMP_AWAKENING_MEMBER 6 /* IGMPv1/2 group timer will start */
+#define IGMP_G_QUERY_PENDING_MEMBER 7 /* IGMPv3 group query pending */
+#define IGMP_SG_QUERY_PENDING_MEMBER 8 /* IGMPv3 source query pending */
+#define IGMP_LEAVING_MEMBER 9 /* IGMPv3 dying gasp (pending last */
+ /* retransmission of INCLUDE {}) */
+
+/*
+ * IGMP version tag.
+ */
+#define IGMP_VERSION_NONE 0 /* Invalid */
+#define IGMP_VERSION_1 1
+#define IGMP_VERSION_2 2
+#define IGMP_VERSION_3 3 /* Default */
+
+/*
+ * IGMPv3 protocol control variables.
+ */
+#define IGMP_RV_INIT 2 /* Robustness Variable */
+#define IGMP_RV_MIN 1
+#define IGMP_RV_MAX 7
+
+#define IGMP_QI_INIT 125 /* Query Interval (s) */
+#define IGMP_QI_MIN 1
+#define IGMP_QI_MAX 255
+
+#define IGMP_QRI_INIT 10 /* Query Response Interval (s) */
+#define IGMP_QRI_MIN 1
+#define IGMP_QRI_MAX 255
+
+#define IGMP_URI_INIT 3 /* Unsolicited Report Interval (s) */
+#define IGMP_URI_MIN 0
+#define IGMP_URI_MAX 10
+
+#define IGMP_MAX_G_GS_PACKETS 8 /* # of packets to answer G/GS */
+#define IGMP_MAX_STATE_CHANGE_PACKETS 8 /* # of packets per state change */
+#define IGMP_MAX_RESPONSE_PACKETS 16 /* # of packets for general query */
+#define IGMP_MAX_RESPONSE_BURST 4 /* # of responses to send at once */
+#define IGMP_RESPONSE_BURST_INTERVAL (PR_FASTHZ / 2) /* 500ms */
+
+/*
+ * IGMP-specific mbuf flags.
+ */
+#define M_IGMPV2 M_PROTO1 /* Packet is IGMPv2 */
+#define M_IGMPV3_HDR M_PROTO2 /* Packet has IGMPv3 headers */
+#define M_GROUPREC M_PROTO3 /* mbuf chain is a group record */
+#define M_IGMP_LOOP M_PROTO4 /* transmit on loif, not real ifp */
+
+/*
+ * Default amount of leading space for IGMPv3 to allocate at the
+ * beginning of its mbuf packet chains, to avoid fragmentation and
+ * unnecessary allocation of leading mbufs.
+ */
+#define RAOPT_LEN 4 /* Length of IP Router Alert option */
+#define IGMP_LEADINGSPACE \
+ (sizeof(struct ip) + RAOPT_LEN + sizeof(struct igmp_report))
+
+/*
+ * Subsystem lock macros.
+ * The IGMP lock is only taken with IGMP. Currently it is system-wide.
+ * VIMAGE: The lock could be pushed to per-VIMAGE granularity in future.
+ */
+#define IGMP_LOCK_INIT() mtx_init(&igmp_mtx, "igmp_mtx", NULL, MTX_DEF)
+#define IGMP_LOCK_DESTROY() mtx_destroy(&igmp_mtx)
+#define IGMP_LOCK() mtx_lock(&igmp_mtx)
+#define IGMP_LOCK_ASSERT() mtx_assert(&igmp_mtx, MA_OWNED)
+#define IGMP_UNLOCK() mtx_unlock(&igmp_mtx)
+#define IGMP_UNLOCK_ASSERT() mtx_assert(&igmp_mtx, MA_NOTOWNED)
+
+struct igmp_ifinfo;
+
+int igmp_change_state(struct in_multi *);
+void igmp_fasttimo(void);
+struct igmp_ifinfo *
+ igmp_domifattach(struct ifnet *);
+void igmp_domifdetach(struct ifnet *);
+void igmp_ifdetach(struct ifnet *);
+void igmp_input(struct mbuf *, int);
+void igmp_slowtimo(void);
+
+SYSCTL_DECL(_net_inet_igmp);
+
+#endif /* _KERNEL */
+
+/*
+ * Names for IGMP sysctl objects
+ */
+#define IGMPCTL_STATS 1 /* statistics (read-only) */
+#define IGMPCTL_MAXID 2
+
+#define IGMPCTL_NAMES { \
+ { 0, 0 }, \
+ { "stats", CTLTYPE_STRUCT } \
+}
+#endif
diff --git a/freebsd/sys/netinet/in.c b/freebsd/sys/netinet/in.c
new file mode 100644
index 00000000..64e5d329
--- /dev/null
+++ b/freebsd/sys/netinet/in.c
@@ -0,0 +1,1601 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (C) 2001 WIDE Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in.c 8.4 (Berkeley) 1/9/95
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_mpath.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/sockio.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/jail.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/syslog.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_var.h>
+#include <freebsd/net/if_dl.h>
+#include <freebsd/net/if_llatbl.h>
+#include <freebsd/net/if_types.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/igmp_var.h>
+#include <freebsd/netinet/udp.h>
+#include <freebsd/netinet/udp_var.h>
+
+static int in_mask2len(struct in_addr *);
+static void in_len2mask(struct in_addr *, int);
+static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t,
+ struct ifnet *, struct thread *);
+
+static int in_addprefix(struct in_ifaddr *, int);
+static int in_scrubprefix(struct in_ifaddr *);
+static void in_socktrim(struct sockaddr_in *);
+static int in_ifinit(struct ifnet *,
+ struct in_ifaddr *, struct sockaddr_in *, int);
+static void in_purgemaddrs(struct ifnet *);
+
+static VNET_DEFINE(int, subnetsarelocal);
+#define V_subnetsarelocal VNET(subnetsarelocal)
+SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW,
+ &VNET_NAME(subnetsarelocal), 0,
+ "Treat all subnets as directly connected");
+static VNET_DEFINE(int, sameprefixcarponly);
+#define V_sameprefixcarponly VNET(sameprefixcarponly)
+SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW,
+ &VNET_NAME(sameprefixcarponly), 0,
+ "Refuse to create same prefixes on different interfaces");
+
+VNET_DECLARE(struct inpcbinfo, ripcbinfo);
+#define V_ripcbinfo VNET(ripcbinfo)
+
+/*
+ * Return 1 if an internet address is for a ``local'' host
+ * (one to which we have a connection). If subnetsarelocal
+ * is true, this includes other subnets of the local net.
+ * Otherwise, it includes only the directly-connected (sub)nets.
+ */
+int
+in_localaddr(struct in_addr in)
+{
+ register u_long i = ntohl(in.s_addr);
+ register struct in_ifaddr *ia;
+
+ IN_IFADDR_RLOCK();
+ if (V_subnetsarelocal) {
+ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ if ((i & ia->ia_netmask) == ia->ia_net) {
+ IN_IFADDR_RUNLOCK();
+ return (1);
+ }
+ }
+ } else {
+ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ if ((i & ia->ia_subnetmask) == ia->ia_subnet) {
+ IN_IFADDR_RUNLOCK();
+ return (1);
+ }
+ }
+ }
+ IN_IFADDR_RUNLOCK();
+ return (0);
+}
+
+/*
+ * Return 1 if an internet address is for the local host and configured
+ * on one of its interfaces.
+ */
+int
+in_localip(struct in_addr in)
+{
+ struct in_ifaddr *ia;
+
+ IN_IFADDR_RLOCK();
+ LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) {
+ if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) {
+ IN_IFADDR_RUNLOCK();
+ return (1);
+ }
+ }
+ IN_IFADDR_RUNLOCK();
+ return (0);
+}
+
+/*
+ * Determine whether an IP address is in a reserved set of addresses
+ * that may not be forwarded, or whether datagrams to that destination
+ * may be forwarded.
+ */
+int
+in_canforward(struct in_addr in)
+{
+ register u_long i = ntohl(in.s_addr);
+ register u_long net;
+
+ if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i))
+ return (0);
+ if (IN_CLASSA(i)) {
+ net = i & IN_CLASSA_NET;
+ if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))
+ return (0);
+ }
+ return (1);
+}
+
+/*
+ * Trim a mask in a sockaddr
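+ * (shorten sin_len so that it covers only the significant, non-zero
+ * bytes of the mask)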
+ */
+static void
+in_socktrim(struct sockaddr_in *ap)
+{
+ register char *cplim = (char *) &ap->sin_addr;
+ register char *cp = (char *) (&ap->sin_addr + 1);
+
+ ap->sin_len = 0;
+ while (--cp >= cplim)
+ if (*cp) {
+ (ap)->sin_len = cp - (char *) (ap) + 1;
+ break;
+ }
+}
+
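+/*
+ * Count the leading one bits of a (contiguous) netmask, i.e. return its
+ * CIDR prefix length.
+ */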
+static int
+in_mask2len(struct in_addr *mask)
+{
+ int x, y;
+ u_char *p;
+
+ p = (u_char *)mask;
+ for (x = 0; x < sizeof(*mask); x++) {
+ if (p[x] != 0xff)
+ break;
+ }
+ y = 0;
+ if (x < sizeof(*mask)) {
+ for (y = 0; y < 8; y++) {
+ if ((p[x] & (0x80 >> y)) == 0)
+ break;
+ }
+ }
+ return (x * 8 + y);
+}
+
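+/*
+ * Build a netmask from a CIDR prefix length; the inverse of in_mask2len().
+ */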
+static void
+in_len2mask(struct in_addr *mask, int len)
+{
+ int i;
+ u_char *p;
+
+ p = (u_char *)mask;
+ bzero(mask, sizeof(*mask));
+ for (i = 0; i < len / 8; i++)
+ p[i] = 0xff;
+ if (len % 8)
+ p[i] = (0xff00 >> (len % 8)) & 0xff;
+}
+
+/*
+ * Generic internet control operations (ioctl's).
+ *
+ * ifp is NULL if not an interface-specific ioctl.
+ */
+/* ARGSUSED */
+int
+in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
+ struct thread *td)
+{
+ register struct ifreq *ifr = (struct ifreq *)data;
+ register struct in_ifaddr *ia, *iap;
+ register struct ifaddr *ifa;
+ struct in_addr allhosts_addr;
+ struct in_addr dst;
+ struct in_ifinfo *ii;
+ struct in_aliasreq *ifra = (struct in_aliasreq *)data;
+ struct sockaddr_in oldaddr;
+ int error, hostIsNew, iaIsNew, maskIsNew;
+ int iaIsFirst;
+
+ ia = NULL;
+ iaIsFirst = 0;
+ iaIsNew = 0;
+ allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP);
+
+ /*
+ * Filter out ioctls we implement directly; forward the rest on to
+ * in_lifaddr_ioctl() and ifp->if_ioctl().
+ */
+ switch (cmd) {
+ case SIOCAIFADDR:
+ case SIOCDIFADDR:
+ case SIOCGIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCGIFDSTADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCSIFNETMASK:
+ break;
+
+ case SIOCALIFADDR:
+ if (td != NULL) {
+ error = priv_check(td, PRIV_NET_ADDIFADDR);
+ if (error)
+ return (error);
+ }
+ if (ifp == NULL)
+ return (EINVAL);
+ return in_lifaddr_ioctl(so, cmd, data, ifp, td);
+
+ case SIOCDLIFADDR:
+ if (td != NULL) {
+ error = priv_check(td, PRIV_NET_DELIFADDR);
+ if (error)
+ return (error);
+ }
+ if (ifp == NULL)
+ return (EINVAL);
+ return in_lifaddr_ioctl(so, cmd, data, ifp, td);
+
+ case SIOCGLIFADDR:
+ if (ifp == NULL)
+ return (EINVAL);
+ return in_lifaddr_ioctl(so, cmd, data, ifp, td);
+
+ default:
+ if (ifp == NULL || ifp->if_ioctl == NULL)
+ return (EOPNOTSUPP);
+ return ((*ifp->if_ioctl)(ifp, cmd, data));
+ }
+
+ if (ifp == NULL)
+ return (EADDRNOTAVAIL);
+
+ /*
+ * Security checks before we get involved in any work.
+ */
+ switch (cmd) {
+ case SIOCAIFADDR:
+ case SIOCSIFADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCSIFNETMASK:
+ case SIOCSIFDSTADDR:
+ if (td != NULL) {
+ error = priv_check(td, PRIV_NET_ADDIFADDR);
+ if (error)
+ return (error);
+ }
+ break;
+
+ case SIOCDIFADDR:
+ if (td != NULL) {
+ error = priv_check(td, PRIV_NET_DELIFADDR);
+ if (error)
+ return (error);
+ }
+ break;
+ }
+
+ /*
+ * Find address for this interface, if it exists.
+ *
+ * If an alias address was specified, find that one instead of the
+ * first one on the interface, if possible.
+ */
+ dst = ((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr;
+ IN_IFADDR_RLOCK();
+ LIST_FOREACH(iap, INADDR_HASH(dst.s_addr), ia_hash) {
+ if (iap->ia_ifp == ifp &&
+ iap->ia_addr.sin_addr.s_addr == dst.s_addr) {
+ if (td == NULL || prison_check_ip4(td->td_ucred,
+ &dst) == 0)
+ ia = iap;
+ break;
+ }
+ }
+ if (ia != NULL)
+ ifa_ref(&ia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+ if (ia == NULL) {
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ iap = ifatoia(ifa);
+ if (iap->ia_addr.sin_family == AF_INET) {
+ if (td != NULL &&
+ prison_check_ip4(td->td_ucred,
+ &iap->ia_addr.sin_addr) != 0)
+ continue;
+ ia = iap;
+ break;
+ }
+ }
+ if (ia != NULL)
+ ifa_ref(&ia->ia_ifa);
+ IF_ADDR_UNLOCK(ifp);
+ }
+ if (ia == NULL)
+ iaIsFirst = 1;
+
+ error = 0;
+ switch (cmd) {
+ case SIOCAIFADDR:
+ case SIOCDIFADDR:
+ if (ifra->ifra_addr.sin_family == AF_INET) {
+ struct in_ifaddr *oia;
+
+ IN_IFADDR_RLOCK();
+ for (oia = ia; ia; ia = TAILQ_NEXT(ia, ia_link)) {
+ if (ia->ia_ifp == ifp &&
+ ia->ia_addr.sin_addr.s_addr ==
+ ifra->ifra_addr.sin_addr.s_addr)
+ break;
+ }
+ if (ia != NULL && ia != oia)
+ ifa_ref(&ia->ia_ifa);
+ if (oia != NULL && ia != oia)
+ ifa_free(&oia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+ if ((ifp->if_flags & IFF_POINTOPOINT)
+ && (cmd == SIOCAIFADDR)
+ && (ifra->ifra_dstaddr.sin_addr.s_addr
+ == INADDR_ANY)) {
+ error = EDESTADDRREQ;
+ goto out;
+ }
+ }
+ if (cmd == SIOCDIFADDR && ia == NULL) {
+ error = EADDRNOTAVAIL;
+ goto out;
+ }
+ /* FALLTHROUGH */
+ case SIOCSIFADDR:
+ case SIOCSIFNETMASK:
+ case SIOCSIFDSTADDR:
+ if (ia == NULL) {
+ ia = (struct in_ifaddr *)
+ malloc(sizeof *ia, M_IFADDR, M_NOWAIT |
+ M_ZERO);
+ if (ia == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+
+ ifa = &ia->ia_ifa;
+ ifa_init(ifa);
+ ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr;
+ ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr;
+ ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask;
+
+ ia->ia_sockmask.sin_len = 8;
+ ia->ia_sockmask.sin_family = AF_INET;
+ if (ifp->if_flags & IFF_BROADCAST) {
+ ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr);
+ ia->ia_broadaddr.sin_family = AF_INET;
+ }
+ ia->ia_ifp = ifp;
+
+ ifa_ref(ifa); /* if_addrhead */
+ IF_ADDR_LOCK(ifp);
+ TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link);
+ IF_ADDR_UNLOCK(ifp);
+ ifa_ref(ifa); /* in_ifaddrhead */
+ IN_IFADDR_WLOCK();
+ TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link);
+ IN_IFADDR_WUNLOCK();
+ iaIsNew = 1;
+ }
+ break;
+
+ case SIOCSIFBRDADDR:
+ case SIOCGIFADDR:
+ case SIOCGIFNETMASK:
+ case SIOCGIFDSTADDR:
+ case SIOCGIFBRDADDR:
+ if (ia == NULL) {
+ error = EADDRNOTAVAIL;
+ goto out;
+ }
+ break;
+ }
+
+ /*
+ * Most paths in this switch return directly or via out. Only paths
+ * that remove the address break in order to hit common removal code.
+ */
+ switch (cmd) {
+ case SIOCGIFADDR:
+ *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr;
+ goto out;
+
+ case SIOCGIFBRDADDR:
+ if ((ifp->if_flags & IFF_BROADCAST) == 0) {
+ error = EINVAL;
+ goto out;
+ }
+ *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr;
+ goto out;
+
+ case SIOCGIFDSTADDR:
+ if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
+ error = EINVAL;
+ goto out;
+ }
+ *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr;
+ goto out;
+
+ case SIOCGIFNETMASK:
+ *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask;
+ goto out;
+
+ case SIOCSIFDSTADDR:
+ if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
+ error = EINVAL;
+ goto out;
+ }
+ oldaddr = ia->ia_dstaddr;
+ ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr;
+ if (ifp->if_ioctl != NULL) {
+ error = (*ifp->if_ioctl)(ifp, SIOCSIFDSTADDR,
+ (caddr_t)ia);
+ if (error) {
+ ia->ia_dstaddr = oldaddr;
+ goto out;
+ }
+ }
+ if (ia->ia_flags & IFA_ROUTE) {
+ ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr;
+ rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST);
+ ia->ia_ifa.ifa_dstaddr =
+ (struct sockaddr *)&ia->ia_dstaddr;
+ rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP);
+ }
+ goto out;
+
+ case SIOCSIFBRDADDR:
+ if ((ifp->if_flags & IFF_BROADCAST) == 0) {
+ error = EINVAL;
+ goto out;
+ }
+ ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr;
+ goto out;
+
+ case SIOCSIFADDR:
+ error = in_ifinit(ifp, ia,
+ (struct sockaddr_in *) &ifr->ifr_addr, 1);
+ if (error != 0 && iaIsNew)
+ break;
+ if (error == 0) {
+ ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
+ if (iaIsFirst &&
+ (ifp->if_flags & IFF_MULTICAST) != 0) {
+ error = in_joingroup(ifp, &allhosts_addr,
+ NULL, &ii->ii_allhosts);
+ }
+ EVENTHANDLER_INVOKE(ifaddr_event, ifp);
+ }
+ error = 0;
+ goto out;
+
+ case SIOCSIFNETMASK:
+ ia->ia_sockmask.sin_addr = ifra->ifra_addr.sin_addr;
+ ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr);
+ goto out;
+
+ case SIOCAIFADDR:
+ maskIsNew = 0;
+ hostIsNew = 1;
+ error = 0;
+ if (ia->ia_addr.sin_family == AF_INET) {
+ if (ifra->ifra_addr.sin_len == 0) {
+ ifra->ifra_addr = ia->ia_addr;
+ hostIsNew = 0;
+ } else if (ifra->ifra_addr.sin_addr.s_addr ==
+ ia->ia_addr.sin_addr.s_addr)
+ hostIsNew = 0;
+ }
+ if (ifra->ifra_mask.sin_len) {
+ /*
+ * QL: XXX
+ * Need to scrub the prefix here in case
+ * the issued command is SIOCAIFADDR with
+ * the same address, but with a different
+ * prefix length. And if the prefix length
+ * is the same as before, then the call is
+			 * unnecessarily executed here.
+ */
+ in_ifscrub(ifp, ia);
+ ia->ia_sockmask = ifra->ifra_mask;
+ ia->ia_sockmask.sin_family = AF_INET;
+ ia->ia_subnetmask =
+ ntohl(ia->ia_sockmask.sin_addr.s_addr);
+ maskIsNew = 1;
+ }
+ if ((ifp->if_flags & IFF_POINTOPOINT) &&
+ (ifra->ifra_dstaddr.sin_family == AF_INET)) {
+ in_ifscrub(ifp, ia);
+ ia->ia_dstaddr = ifra->ifra_dstaddr;
+ maskIsNew = 1; /* We lie; but the effect's the same */
+ }
+ if (ifra->ifra_addr.sin_family == AF_INET &&
+ (hostIsNew || maskIsNew))
+ error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0);
+ if (error != 0 && iaIsNew)
+ goto out;
+
+ if ((ifp->if_flags & IFF_BROADCAST) &&
+ (ifra->ifra_broadaddr.sin_family == AF_INET))
+ ia->ia_broadaddr = ifra->ifra_broadaddr;
+ if (error == 0) {
+ ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
+ if (iaIsFirst &&
+ (ifp->if_flags & IFF_MULTICAST) != 0) {
+ error = in_joingroup(ifp, &allhosts_addr,
+ NULL, &ii->ii_allhosts);
+ }
+ EVENTHANDLER_INVOKE(ifaddr_event, ifp);
+ }
+ goto out;
+
+ case SIOCDIFADDR:
+ /*
+ * in_ifscrub kills the interface route.
+ */
+ in_ifscrub(ifp, ia);
+
+ /*
+ * in_ifadown gets rid of all the rest of
+ * the routes. This is not quite the right
+ * thing to do, but at least if we are running
+ * a routing process they will come back.
+ */
+ in_ifadown(&ia->ia_ifa, 1);
+ EVENTHANDLER_INVOKE(ifaddr_event, ifp);
+ error = 0;
+ break;
+
+ default:
+ panic("in_control: unsupported ioctl");
+ }
+
+ IF_ADDR_LOCK(ifp);
+ /* Re-check that ia is still part of the list. */
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa == &ia->ia_ifa)
+ break;
+ }
+ if (ifa == NULL) {
+ /*
+ * If we lost the race with another thread, there is no need to
+ * try it again for the next loop as there is no other exit
+ * path between here and out.
+ */
+ IF_ADDR_UNLOCK(ifp);
+ error = EADDRNOTAVAIL;
+ goto out;
+ }
+ TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
+ IF_ADDR_UNLOCK(ifp);
+ ifa_free(&ia->ia_ifa); /* if_addrhead */
+
+ IN_IFADDR_WLOCK();
+ TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link);
+ if (ia->ia_addr.sin_family == AF_INET) {
+ struct in_ifaddr *if_ia;
+
+ LIST_REMOVE(ia, ia_hash);
+ IN_IFADDR_WUNLOCK();
+ /*
+ * If this is the last IPv4 address configured on this
+ * interface, leave the all-hosts group.
+ * No state-change report need be transmitted.
+ */
+ if_ia = NULL;
+ IFP_TO_IA(ifp, if_ia);
+ if (if_ia == NULL) {
+ ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
+ IN_MULTI_LOCK();
+ if (ii->ii_allhosts) {
+ (void)in_leavegroup_locked(ii->ii_allhosts,
+ NULL);
+ ii->ii_allhosts = NULL;
+ }
+ IN_MULTI_UNLOCK();
+ } else
+ ifa_free(&if_ia->ia_ifa);
+ } else
+ IN_IFADDR_WUNLOCK();
+ ifa_free(&ia->ia_ifa); /* in_ifaddrhead */
+out:
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
+ return (error);
+}
+
+/*
+ * SIOC[GAD]LIFADDR.
+ * SIOCGLIFADDR: get first address. (?!?)
+ * SIOCGLIFADDR with IFLR_PREFIX:
+ * get first address that matches the specified prefix.
+ * SIOCALIFADDR: add the specified address.
+ * SIOCALIFADDR with IFLR_PREFIX:
+ * EINVAL since we can't deduce hostid part of the address.
+ * SIOCDLIFADDR: delete the specified address.
+ * SIOCDLIFADDR with IFLR_PREFIX:
+ * delete the first address that matches the specified prefix.
+ * return values:
+ * EINVAL on invalid parameters
+ * EADDRNOTAVAIL on prefix match failed/specified address not found
+ * other values may be returned from in_ioctl()
+ */
+static int
+in_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data,
+ struct ifnet *ifp, struct thread *td)
+{
+ struct if_laddrreq *iflr = (struct if_laddrreq *)data;
+ struct ifaddr *ifa;
+
+ /* sanity checks */
+ if (data == NULL || ifp == NULL) {
+ panic("invalid argument to in_lifaddr_ioctl");
+		/*NOTREACHED*/
+ }
+
+ switch (cmd) {
+ case SIOCGLIFADDR:
+ /* address must be specified on GET with IFLR_PREFIX */
+ if ((iflr->flags & IFLR_PREFIX) == 0)
+ break;
+ /*FALLTHROUGH*/
+ case SIOCALIFADDR:
+ case SIOCDLIFADDR:
+ /* address must be specified on ADD and DELETE */
+ if (iflr->addr.ss_family != AF_INET)
+ return (EINVAL);
+ if (iflr->addr.ss_len != sizeof(struct sockaddr_in))
+ return (EINVAL);
+ /* XXX need improvement */
+ if (iflr->dstaddr.ss_family
+ && iflr->dstaddr.ss_family != AF_INET)
+ return (EINVAL);
+ if (iflr->dstaddr.ss_family
+ && iflr->dstaddr.ss_len != sizeof(struct sockaddr_in))
+ return (EINVAL);
+ break;
+ default: /*shouldn't happen*/
+ return (EOPNOTSUPP);
+ }
+ if (sizeof(struct in_addr) * 8 < iflr->prefixlen)
+ return (EINVAL);
+
+ switch (cmd) {
+ case SIOCALIFADDR:
+ {
+ struct in_aliasreq ifra;
+
+ if (iflr->flags & IFLR_PREFIX)
+ return (EINVAL);
+
+ /* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR_IN6). */
+ bzero(&ifra, sizeof(ifra));
+ bcopy(iflr->iflr_name, ifra.ifra_name,
+ sizeof(ifra.ifra_name));
+
+ bcopy(&iflr->addr, &ifra.ifra_addr, iflr->addr.ss_len);
+
+ if (iflr->dstaddr.ss_family) { /*XXX*/
+ bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr,
+ iflr->dstaddr.ss_len);
+ }
+
+ ifra.ifra_mask.sin_family = AF_INET;
+ ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in);
+ in_len2mask(&ifra.ifra_mask.sin_addr, iflr->prefixlen);
+
+ return (in_control(so, SIOCAIFADDR, (caddr_t)&ifra, ifp, td));
+ }
+ case SIOCGLIFADDR:
+ case SIOCDLIFADDR:
+ {
+ struct in_ifaddr *ia;
+ struct in_addr mask, candidate, match;
+ struct sockaddr_in *sin;
+
+ bzero(&mask, sizeof(mask));
+ bzero(&match, sizeof(match));
+ if (iflr->flags & IFLR_PREFIX) {
+ /* lookup a prefix rather than address. */
+ in_len2mask(&mask, iflr->prefixlen);
+
+ sin = (struct sockaddr_in *)&iflr->addr;
+ match.s_addr = sin->sin_addr.s_addr;
+ match.s_addr &= mask.s_addr;
+
+ /* if you set extra bits, that's wrong */
+ if (match.s_addr != sin->sin_addr.s_addr)
+ return (EINVAL);
+
+ } else {
+ /* on getting an address, take the 1st match */
+ /* on deleting an address, do exact match */
+ if (cmd != SIOCGLIFADDR) {
+ in_len2mask(&mask, 32);
+ sin = (struct sockaddr_in *)&iflr->addr;
+ match.s_addr = sin->sin_addr.s_addr;
+ }
+ }
+
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+			if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ if (match.s_addr == 0)
+ break;
+			candidate.s_addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr;
+ candidate.s_addr &= mask.s_addr;
+ if (candidate.s_addr == match.s_addr)
+ break;
+ }
+ if (ifa == NULL)
+ return (EADDRNOTAVAIL);
+ ia = (struct in_ifaddr *)ifa;
+
+ if (cmd == SIOCGLIFADDR) {
+ /* fill in the if_laddrreq structure */
+ bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len);
+
+ if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
+ bcopy(&ia->ia_dstaddr, &iflr->dstaddr,
+ ia->ia_dstaddr.sin_len);
+ } else
+ bzero(&iflr->dstaddr, sizeof(iflr->dstaddr));
+
+ iflr->prefixlen =
+ in_mask2len(&ia->ia_sockmask.sin_addr);
+
+ iflr->flags = 0; /*XXX*/
+
+ return (0);
+ } else {
+ struct in_aliasreq ifra;
+
+ /* fill in_aliasreq and do ioctl(SIOCDIFADDR_IN6) */
+ bzero(&ifra, sizeof(ifra));
+ bcopy(iflr->iflr_name, ifra.ifra_name,
+ sizeof(ifra.ifra_name));
+
+ bcopy(&ia->ia_addr, &ifra.ifra_addr,
+ ia->ia_addr.sin_len);
+ if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
+ bcopy(&ia->ia_dstaddr, &ifra.ifra_dstaddr,
+ ia->ia_dstaddr.sin_len);
+ }
+			bcopy(&ia->ia_sockmask, &ifra.ifra_mask,
+ ia->ia_sockmask.sin_len);
+
+ return (in_control(so, SIOCDIFADDR, (caddr_t)&ifra,
+ ifp, td));
+ }
+ }
+ }
+
+ return (EOPNOTSUPP); /*just for safety*/
+}
+
+/*
+ * Delete any existing route for an interface.
+ */
+void
+in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia)
+{
+
+ in_scrubprefix(ia);
+}
+
+/*
+ * Initialize an interface's internet address
+ * and routing table entry.
+ */
+static int
+in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin,
+ int scrub)
+{
+ register u_long i = ntohl(sin->sin_addr.s_addr);
+ struct sockaddr_in oldaddr;
+ int s = splimp(), flags = RTF_UP, error = 0;
+
+ oldaddr = ia->ia_addr;
+ if (oldaddr.sin_family == AF_INET)
+ LIST_REMOVE(ia, ia_hash);
+ ia->ia_addr = *sin;
+ if (ia->ia_addr.sin_family == AF_INET) {
+ IN_IFADDR_WLOCK();
+ LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr),
+ ia, ia_hash);
+ IN_IFADDR_WUNLOCK();
+ }
+ /*
+ * Give the interface a chance to initialize
+ * if this is its first address,
+ * and to validate the address if necessary.
+ */
+ if (ifp->if_ioctl != NULL) {
+ error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
+ if (error) {
+ splx(s);
+ /* LIST_REMOVE(ia, ia_hash) is done in in_control */
+ ia->ia_addr = oldaddr;
+ IN_IFADDR_WLOCK();
+ if (ia->ia_addr.sin_family == AF_INET)
+ LIST_INSERT_HEAD(INADDR_HASH(
+ ia->ia_addr.sin_addr.s_addr), ia, ia_hash);
+ else
+ /*
+ * If oldaddr family is not AF_INET (e.g.
+			 * the interface has just been created), in_control
+			 * does not call LIST_REMOVE, and we end up
+			 * with bogus ia entries in the hash.
+ */
+ LIST_REMOVE(ia, ia_hash);
+ IN_IFADDR_WUNLOCK();
+ return (error);
+ }
+ }
+ splx(s);
+ if (scrub) {
+ ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr;
+ in_ifscrub(ifp, ia);
+ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
+ }
+ if (IN_CLASSA(i))
+ ia->ia_netmask = IN_CLASSA_NET;
+ else if (IN_CLASSB(i))
+ ia->ia_netmask = IN_CLASSB_NET;
+ else
+ ia->ia_netmask = IN_CLASSC_NET;
+ /*
+ * The subnet mask usually includes at least the standard network part,
+	 * but may be smaller in the case of supernetting.
+ * If it is set, we believe it.
+ */
+ if (ia->ia_subnetmask == 0) {
+ ia->ia_subnetmask = ia->ia_netmask;
+ ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask);
+ } else
+ ia->ia_netmask &= ia->ia_subnetmask;
+ ia->ia_net = i & ia->ia_netmask;
+ ia->ia_subnet = i & ia->ia_subnetmask;
+ in_socktrim(&ia->ia_sockmask);
+ /*
+ * XXX: carp(4) does not have interface route
+ */
+ if (ifp->if_type == IFT_CARP)
+ return (0);
+ /*
+ * Add route for the network.
+ */
+ ia->ia_ifa.ifa_metric = ifp->if_metric;
+ if (ifp->if_flags & IFF_BROADCAST) {
+ ia->ia_broadaddr.sin_addr.s_addr =
+ htonl(ia->ia_subnet | ~ia->ia_subnetmask);
+ ia->ia_netbroadcast.s_addr =
+ htonl(ia->ia_net | ~ ia->ia_netmask);
+ } else if (ifp->if_flags & IFF_LOOPBACK) {
+ ia->ia_dstaddr = ia->ia_addr;
+ flags |= RTF_HOST;
+ } else if (ifp->if_flags & IFF_POINTOPOINT) {
+ if (ia->ia_dstaddr.sin_family != AF_INET)
+ return (0);
+ flags |= RTF_HOST;
+ }
+ if ((error = in_addprefix(ia, flags)) != 0)
+ return (error);
+
+ if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY)
+ return (0);
+
+ if (ifp->if_flags & IFF_POINTOPOINT) {
+ if (ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)
+ return (0);
+ }
+
+ /*
+ * add a loopback route to self
+ */
+ if (V_useloopback && !(ifp->if_flags & IFF_LOOPBACK)) {
+ struct route ia_ro;
+
+ bzero(&ia_ro, sizeof(ia_ro));
+ *((struct sockaddr_in *)(&ia_ro.ro_dst)) = ia->ia_addr;
+ rtalloc_ign_fib(&ia_ro, 0, 0);
+ if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) &&
+ (ia_ro.ro_rt->rt_ifp == V_loif)) {
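+			/*
+			 * A route via the loopback interface already
+			 * covers this address; keep it and just mark
+			 * the address with IFA_RTSELF below.
+			 */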
+ RT_LOCK(ia_ro.ro_rt);
+ RT_ADDREF(ia_ro.ro_rt);
+ RTFREE_LOCKED(ia_ro.ro_rt);
+ } else
+ error = ifa_add_loopback_route((struct ifaddr *)ia,
+ (struct sockaddr *)&ia->ia_addr);
+ if (error == 0)
+ ia->ia_flags |= IFA_RTSELF;
+ if (ia_ro.ro_rt != NULL)
+ RTFREE(ia_ro.ro_rt);
+ }
+
+ return (error);
+}
+
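+/*
+ * Loopback and point-to-point interfaces get host routes (RTF_HOST)
+ * instead of prefix routes when their addresses are installed.
+ */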
+#define rtinitflags(x) \
+ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \
+ ? RTF_HOST : 0)
+
+/*
+ * Generate a routing message when inserting or deleting
+ * an interface address alias.
+ */
+static void
+in_addralias_rtmsg(int cmd, struct in_addr *prefix, struct in_ifaddr *target)
+{
+ struct route pfx_ro;
+ struct sockaddr_in *pfx_addr;
+ struct rtentry msg_rt;
+
+ /* QL: XXX
+ * This is a bit questionable because there is no
+ * additional route entry added/deleted for an address
+ * alias. Therefore this route report is inaccurate.
+ */
+ bzero(&pfx_ro, sizeof(pfx_ro));
+ pfx_addr = (struct sockaddr_in *)(&pfx_ro.ro_dst);
+ pfx_addr->sin_len = sizeof(*pfx_addr);
+ pfx_addr->sin_family = AF_INET;
+ pfx_addr->sin_addr = *prefix;
+ rtalloc_ign_fib(&pfx_ro, 0, 0);
+ if (pfx_ro.ro_rt != NULL) {
+ msg_rt = *pfx_ro.ro_rt;
+
+ /* QL: XXX
+ * Point the gateway to the new interface
+ * address as if a new prefix route entry has
+ * been added through the new address alias.
+		 * All other parts of the rtentry are accurate,
+ * e.g., rt_key, rt_mask, rt_ifp etc.
+ */
+ msg_rt.rt_gateway =
+ (struct sockaddr *)&target->ia_addr;
+ rt_newaddrmsg(cmd,
+ (struct ifaddr *)target,
+ 0, &msg_rt);
+ RTFREE(pfx_ro.ro_rt);
+ }
+ return;
+}
+
+/*
+ * Check if we have a route for the given prefix already or add one accordingly.
+ */
+static int
+in_addprefix(struct in_ifaddr *target, int flags)
+{
+ struct in_ifaddr *ia;
+ struct in_addr prefix, mask, p, m;
+ int error;
+
+ if ((flags & RTF_HOST) != 0) {
+ prefix = target->ia_dstaddr.sin_addr;
+ mask.s_addr = 0;
+ } else {
+ prefix = target->ia_addr.sin_addr;
+ mask = target->ia_sockmask.sin_addr;
+ prefix.s_addr &= mask.s_addr;
+ }
+
+ IN_IFADDR_RLOCK();
+ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ if (rtinitflags(ia)) {
+ p = ia->ia_addr.sin_addr;
+
+ if (prefix.s_addr != p.s_addr)
+ continue;
+ } else {
+ p = ia->ia_addr.sin_addr;
+ m = ia->ia_sockmask.sin_addr;
+ p.s_addr &= m.s_addr;
+
+ if (prefix.s_addr != p.s_addr ||
+ mask.s_addr != m.s_addr)
+ continue;
+ }
+
+ /*
+ * If we got a matching prefix route inserted by other
+ * interface address, we are done here.
+ */
+ if (ia->ia_flags & IFA_ROUTE) {
+#ifdef RADIX_MPATH
+ if (ia->ia_addr.sin_addr.s_addr ==
+ target->ia_addr.sin_addr.s_addr) {
+ IN_IFADDR_RUNLOCK();
+ return (EEXIST);
+ } else
+ break;
+#endif
+ if (V_sameprefixcarponly &&
+ target->ia_ifp->if_type != IFT_CARP &&
+ ia->ia_ifp->if_type != IFT_CARP) {
+ IN_IFADDR_RUNLOCK();
+ return (EEXIST);
+ } else {
+ in_addralias_rtmsg(RTM_ADD, &prefix, target);
+ IN_IFADDR_RUNLOCK();
+ return (0);
+ }
+ }
+ }
+ IN_IFADDR_RUNLOCK();
+
+ /*
+	 * No one seems to have this prefix route, so we try to insert it.
+ */
+ error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags);
+ if (!error)
+ target->ia_flags |= IFA_ROUTE;
+ return (error);
+}
+
+extern void arp_ifscrub(struct ifnet *ifp, uint32_t addr);
+
+/*
+ * If there is no other address in the system that can serve a route to the
+ * same prefix, remove the route. Hand over the route to the new address
+ * otherwise.
+ */
+static int
+in_scrubprefix(struct in_ifaddr *target)
+{
+ struct in_ifaddr *ia;
+ struct in_addr prefix, mask, p;
+ int error = 0;
+ struct sockaddr_in prefix0, mask0;
+
+ /*
+ * Remove the loopback route to the interface address.
+ * The "useloopback" setting is not consulted because if the
+ * user configures an interface address, turns off this
+ * setting, and then tries to delete that interface address,
+ * checking the current setting of "useloopback" would leave
+ * that interface address loopback route untouched, which
+ * would be wrong. Therefore the interface address loopback route
+ * deletion is unconditional.
+ */
+ if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) &&
+ !(target->ia_ifp->if_flags & IFF_LOOPBACK) &&
+ (target->ia_flags & IFA_RTSELF)) {
+ struct route ia_ro;
+ int freeit = 0;
+
+ bzero(&ia_ro, sizeof(ia_ro));
+ *((struct sockaddr_in *)(&ia_ro.ro_dst)) = target->ia_addr;
+ rtalloc_ign_fib(&ia_ro, 0, 0);
+ if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) &&
+ (ia_ro.ro_rt->rt_ifp == V_loif)) {
+ RT_LOCK(ia_ro.ro_rt);
+ if (ia_ro.ro_rt->rt_refcnt <= 1)
+ freeit = 1;
+ else
+ RT_REMREF(ia_ro.ro_rt);
+ RTFREE_LOCKED(ia_ro.ro_rt);
+ }
+ if (freeit)
+ error = ifa_del_loopback_route((struct ifaddr *)target,
+ (struct sockaddr *)&target->ia_addr);
+ if (error == 0)
+ target->ia_flags &= ~IFA_RTSELF;
+ /* remove arp cache */
+ arp_ifscrub(target->ia_ifp, IA_SIN(target)->sin_addr.s_addr);
+ }
+
+ if (rtinitflags(target))
+ prefix = target->ia_dstaddr.sin_addr;
+ else {
+ prefix = target->ia_addr.sin_addr;
+ mask = target->ia_sockmask.sin_addr;
+ prefix.s_addr &= mask.s_addr;
+ }
+
+ if ((target->ia_flags & IFA_ROUTE) == 0) {
+ in_addralias_rtmsg(RTM_DELETE, &prefix, target);
+ return (0);
+ }
+
+ IN_IFADDR_RLOCK();
+ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ if (rtinitflags(ia))
+ p = ia->ia_dstaddr.sin_addr;
+ else {
+ p = ia->ia_addr.sin_addr;
+ p.s_addr &= ia->ia_sockmask.sin_addr.s_addr;
+ }
+
+ if (prefix.s_addr != p.s_addr)
+ continue;
+
+ /*
+ * If we got a matching prefix address, move IFA_ROUTE and
+ * the route itself to it. Make sure that routing daemons
+ * get a heads-up.
+ *
+ * XXX: a special case for carp(4) interface - this should
+ * be more generally specified as an interface that
+ * doesn't support such action.
+ */
+ if ((ia->ia_flags & IFA_ROUTE) == 0
+ && (ia->ia_ifp->if_type != IFT_CARP)
+ ) {
+ IN_IFADDR_RUNLOCK();
+ rtinit(&(target->ia_ifa), (int)RTM_DELETE,
+ rtinitflags(target));
+ target->ia_flags &= ~IFA_ROUTE;
+
+ error = rtinit(&ia->ia_ifa, (int)RTM_ADD,
+ rtinitflags(ia) | RTF_UP);
+ if (error == 0)
+ ia->ia_flags |= IFA_ROUTE;
+ return (error);
+ }
+ }
+ IN_IFADDR_RUNLOCK();
+
+ /*
+ * remove all L2 entries on the given prefix
+ */
+ bzero(&prefix0, sizeof(prefix0));
+ prefix0.sin_len = sizeof(prefix0);
+ prefix0.sin_family = AF_INET;
+ prefix0.sin_addr.s_addr = target->ia_subnet;
+ bzero(&mask0, sizeof(mask0));
+ mask0.sin_len = sizeof(mask0);
+ mask0.sin_family = AF_INET;
+ mask0.sin_addr.s_addr = target->ia_subnetmask;
+ lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0,
+ (struct sockaddr *)&mask0);
+
+ /*
+	 * As no one seems to have this prefix, we can remove the route.
+ */
+ rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target));
+ target->ia_flags &= ~IFA_ROUTE;
+ return (0);
+}
+
+#undef rtinitflags
+
+/*
+ * Return 1 if the address might be a local broadcast address.
+ */
+int
+in_broadcast(struct in_addr in, struct ifnet *ifp)
+{
+ register struct ifaddr *ifa;
+ u_long t;
+
+ if (in.s_addr == INADDR_BROADCAST ||
+ in.s_addr == INADDR_ANY)
+ return (1);
+ if ((ifp->if_flags & IFF_BROADCAST) == 0)
+ return (0);
+ t = ntohl(in.s_addr);
+ /*
+ * Look through the list of addresses for a match
+ * with a broadcast address.
+ */
+#define ia ((struct in_ifaddr *)ifa)
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+ if (ifa->ifa_addr->sa_family == AF_INET &&
+ (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
+ in.s_addr == ia->ia_netbroadcast.s_addr ||
+ /*
+ * Check for old-style (host 0) broadcast.
+ */
+ t == ia->ia_subnet || t == ia->ia_net) &&
+ /*
+ * Check for an all one subnetmask. These
+ * only exist when an interface gets a secondary
+ * address.
+ */
+ ia->ia_subnetmask != (u_long)0xffffffff)
+ return (1);
+ return (0);
+#undef ia
+}
+
+/*
+ * On interface removal, clean up IPv4 data structures hung off of the ifnet.
+ */
+void
+in_ifdetach(struct ifnet *ifp)
+{
+
+ in_pcbpurgeif0(&V_ripcbinfo, ifp);
+ in_pcbpurgeif0(&V_udbinfo, ifp);
+ in_purgemaddrs(ifp);
+}
+
+/*
+ * Delete all IPv4 multicast address records, and associated link-layer
+ * multicast address records, associated with ifp.
+ * XXX It looks like domifdetach runs AFTER the link layer cleanup.
+ * XXX This should not race with ifma_protospec being set during
+ * a new allocation, if it does, we have bigger problems.
+ */
+static void
+in_purgemaddrs(struct ifnet *ifp)
+{
+ LIST_HEAD(,in_multi) purgeinms;
+ struct in_multi *inm, *tinm;
+ struct ifmultiaddr *ifma;
+
+ LIST_INIT(&purgeinms);
+ IN_MULTI_LOCK();
+
+ /*
+ * Extract list of in_multi associated with the detaching ifp
+ * which the PF_INET layer is about to release.
+ * We need to do this as IF_ADDR_LOCK() may be re-acquired
+ * by code further down.
+ */
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ if (ifma->ifma_addr->sa_family != AF_INET ||
+ ifma->ifma_protospec == NULL)
+ continue;
+#if 0
+ KASSERT(ifma->ifma_protospec != NULL,
+ ("%s: ifma_protospec is NULL", __func__));
+#endif
+ inm = (struct in_multi *)ifma->ifma_protospec;
+ LIST_INSERT_HEAD(&purgeinms, inm, inm_link);
+ }
+ IF_ADDR_UNLOCK(ifp);
+
+ LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) {
+ LIST_REMOVE(inm, inm_link);
+ inm_release_locked(inm);
+ }
+ igmp_ifdetach(ifp);
+
+ IN_MULTI_UNLOCK();
+}
+
+#include <freebsd/net/if_dl.h>
+#include <freebsd/netinet/if_ether.h>
+
+struct in_llentry {
+ struct llentry base;
+ struct sockaddr_in l3_addr4;
+};
+
+static struct llentry *
+in_lltable_new(const struct sockaddr *l3addr, u_int flags)
+{
+ struct in_llentry *lle;
+
+ lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_DONTWAIT | M_ZERO);
+ if (lle == NULL) /* NB: caller generates msg */
+ return NULL;
+
+ callout_init(&lle->base.la_timer, CALLOUT_MPSAFE);
+ /*
+ * For IPv4 this will trigger "arpresolve" to generate
+ * an ARP request.
+ */
+ lle->base.la_expire = time_second; /* mark expired */
+ lle->l3_addr4 = *(const struct sockaddr_in *)l3addr;
+ lle->base.lle_refcnt = 1;
+ LLE_LOCK_INIT(&lle->base);
+ return &lle->base;
+}
+
+/*
+ * Deletes an address from the address table.
+ * This function is called by the timer functions
+ * such as arptimer() and nd6_llinfo_timer(), and
+ * the caller does the locking.
+ */
+static void
+in_lltable_free(struct lltable *llt, struct llentry *lle)
+{
+ LLE_WUNLOCK(lle);
+ LLE_LOCK_DESTROY(lle);
+ free(lle, M_LLTABLE);
+}
+
+
+#define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \
+ (((ntohl((d)->sin_addr.s_addr) ^ (a)->sin_addr.s_addr) & (m)->sin_addr.s_addr)) == 0 )
+
+static void
+in_lltable_prefix_free(struct lltable *llt,
+ const struct sockaddr *prefix,
+ const struct sockaddr *mask)
+{
+ const struct sockaddr_in *pfx = (const struct sockaddr_in *)prefix;
+ const struct sockaddr_in *msk = (const struct sockaddr_in *)mask;
+ struct llentry *lle, *next;
+ register int i;
+
+	for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) {
+ LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) {
+
+ if (IN_ARE_MASKED_ADDR_EQUAL((struct sockaddr_in *)L3_ADDR(lle),
+ pfx, msk)) {
+ int canceled;
+
+ canceled = callout_drain(&lle->la_timer);
+ LLE_WLOCK(lle);
+ if (canceled)
+ LLE_REMREF(lle);
+ llentry_free(lle);
+ }
+ }
+ }
+}
+
+
+static int
+in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr)
+{
+ struct rtentry *rt;
+
+ KASSERT(l3addr->sa_family == AF_INET,
+ ("sin_family %d", l3addr->sa_family));
+
+ /* XXX rtalloc1 should take a const param */
+ rt = rtalloc1(__DECONST(struct sockaddr *, l3addr), 0, 0);
+ if (rt == NULL || (!(flags & LLE_PUB) &&
+ ((rt->rt_flags & RTF_GATEWAY) ||
+ (rt->rt_ifp != ifp)))) {
+#ifdef DIAGNOSTIC
+ log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n",
+ inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr));
+#endif
+ if (rt != NULL)
+ RTFREE_LOCKED(rt);
+ return (EINVAL);
+ }
+ RTFREE_LOCKED(rt);
+ return 0;
+}
+
+/*
+ * Return NULL if not found or marked for deletion.
+ * If found return lle read locked.
+ */
+static struct llentry *
+in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr)
+{
+ const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr;
+ struct ifnet *ifp = llt->llt_ifp;
+ struct llentry *lle;
+ struct llentries *lleh;
+ u_int hashkey;
+
+ IF_AFDATA_LOCK_ASSERT(ifp);
+ KASSERT(l3addr->sa_family == AF_INET,
+ ("sin_family %d", l3addr->sa_family));
+
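+	/*
+	 * Hash on the IPv4 address to select a bucket; collisions are
+	 * resolved by walking the bucket's list below.
+	 */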
+ hashkey = sin->sin_addr.s_addr;
+ lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)];
+ LIST_FOREACH(lle, lleh, lle_next) {
+ struct sockaddr_in *sa2 = (struct sockaddr_in *)L3_ADDR(lle);
+ if (lle->la_flags & LLE_DELETED)
+ continue;
+ if (sa2->sin_addr.s_addr == sin->sin_addr.s_addr)
+ break;
+ }
+ if (lle == NULL) {
+#ifdef DIAGNOSTIC
+ if (flags & LLE_DELETE)
+ log(LOG_INFO, "interface address is missing from cache = %p in delete\n", lle);
+#endif
+ if (!(flags & LLE_CREATE))
+ return (NULL);
+ /*
+		 * A route that covers the given address must have been
+		 * installed first, because we are doing a resolution;
+		 * verify this.
+ */
+ if (!(flags & LLE_IFADDR) &&
+ in_lltable_rtcheck(ifp, flags, l3addr) != 0)
+ goto done;
+
+ lle = in_lltable_new(l3addr, flags);
+ if (lle == NULL) {
+ log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
+ goto done;
+ }
+ lle->la_flags = flags & ~LLE_CREATE;
+ if ((flags & (LLE_CREATE | LLE_IFADDR)) == (LLE_CREATE | LLE_IFADDR)) {
+ bcopy(IF_LLADDR(ifp), &lle->ll_addr, ifp->if_addrlen);
+ lle->la_flags |= (LLE_VALID | LLE_STATIC);
+ }
+
+ lle->lle_tbl = llt;
+ lle->lle_head = lleh;
+ LIST_INSERT_HEAD(lleh, lle, lle_next);
+ } else if (flags & LLE_DELETE) {
+ if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) {
+ LLE_WLOCK(lle);
+ lle->la_flags = LLE_DELETED;
+ LLE_WUNLOCK(lle);
+#ifdef DIAGNOSTIC
+ log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle);
+#endif
+ }
+ lle = (void *)-1;
+
+ }
+ if (LLE_IS_VALID(lle)) {
+ if (flags & LLE_EXCLUSIVE)
+ LLE_WLOCK(lle);
+ else
+ LLE_RLOCK(lle);
+ }
+done:
+ return (lle);
+}
+
+static int
+in_lltable_dump(struct lltable *llt, struct sysctl_req *wr)
+{
+#define SIN(lle) ((struct sockaddr_in *) L3_ADDR(lle))
+ struct ifnet *ifp = llt->llt_ifp;
+ struct llentry *lle;
+ /* XXX stack use */
+ struct {
+ struct rt_msghdr rtm;
+ struct sockaddr_inarp sin;
+ struct sockaddr_dl sdl;
+ } arpc;
+ int error, i;
+
+ LLTABLE_LOCK_ASSERT();
+
+ error = 0;
+ for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) {
+ LIST_FOREACH(lle, &llt->lle_head[i], lle_next) {
+ struct sockaddr_dl *sdl;
+
+ /* skip deleted entries */
+ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED)
+ continue;
+ /* Skip if jailed and not a valid IP of the prison. */
+ if (prison_if(wr->td->td_ucred, L3_ADDR(lle)) != 0)
+ continue;
+ /*
+ * produce a msg made of:
+ * struct rt_msghdr;
+ * struct sockaddr_inarp; (IPv4)
+ * struct sockaddr_dl;
+ */
+ bzero(&arpc, sizeof(arpc));
+ arpc.rtm.rtm_msglen = sizeof(arpc);
+ arpc.rtm.rtm_version = RTM_VERSION;
+ arpc.rtm.rtm_type = RTM_GET;
+ arpc.rtm.rtm_flags = RTF_UP;
+ arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY;
+ arpc.sin.sin_family = AF_INET;
+ arpc.sin.sin_len = sizeof(arpc.sin);
+ arpc.sin.sin_addr.s_addr = SIN(lle)->sin_addr.s_addr;
+
+ /* publish */
+ if (lle->la_flags & LLE_PUB) {
+ arpc.rtm.rtm_flags |= RTF_ANNOUNCE;
+ /* proxy only */
+ if (lle->la_flags & LLE_PROXY)
+ arpc.sin.sin_other = SIN_PROXY;
+ }
+
+ sdl = &arpc.sdl;
+ sdl->sdl_family = AF_LINK;
+ sdl->sdl_len = sizeof(*sdl);
+ sdl->sdl_index = ifp->if_index;
+ sdl->sdl_type = ifp->if_type;
+ if ((lle->la_flags & LLE_VALID) == LLE_VALID) {
+ sdl->sdl_alen = ifp->if_addrlen;
+ bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen);
+ } else {
+ sdl->sdl_alen = 0;
+ bzero(LLADDR(sdl), ifp->if_addrlen);
+ }
+
+ arpc.rtm.rtm_rmx.rmx_expire =
+ lle->la_flags & LLE_STATIC ? 0 : lle->la_expire;
+ arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA);
+ if (lle->la_flags & LLE_STATIC)
+ arpc.rtm.rtm_flags |= RTF_STATIC;
+ arpc.rtm.rtm_index = ifp->if_index;
+ error = SYSCTL_OUT(wr, &arpc, sizeof(arpc));
+ if (error)
+ break;
+ }
+ }
+ return error;
+#undef SIN
+}
+
+void *
+in_domifattach(struct ifnet *ifp)
+{
+ struct in_ifinfo *ii;
+ struct lltable *llt;
+
+ ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO);
+
+ llt = lltable_init(ifp, AF_INET);
+ if (llt != NULL) {
+ llt->llt_new = in_lltable_new;
+ llt->llt_free = in_lltable_free;
+ llt->llt_prefix_free = in_lltable_prefix_free;
+ llt->llt_rtcheck = in_lltable_rtcheck;
+ llt->llt_lookup = in_lltable_lookup;
+ llt->llt_dump = in_lltable_dump;
+ }
+ ii->ii_llt = llt;
+
+ ii->ii_igmp = igmp_domifattach(ifp);
+
+ return ii;
+}
+
+void
+in_domifdetach(struct ifnet *ifp, void *aux)
+{
+ struct in_ifinfo *ii = (struct in_ifinfo *)aux;
+
+ igmp_domifdetach(ifp);
+ lltable_free(ii->ii_llt);
+ free(ii, M_IFADDR);
+}
diff --git a/freebsd/sys/netinet/in.h b/freebsd/sys/netinet/in.h
new file mode 100644
index 00000000..73c7ca1a
--- /dev/null
+++ b/freebsd/sys/netinet/in.h
@@ -0,0 +1,2 @@
+#include <freebsd/bsd.h>
+#include <freebsd/netinet/in.h>
diff --git a/freebsd/sys/netinet/in_gif.c b/freebsd/sys/netinet/in_gif.c
new file mode 100644
index 00000000..3613e214
--- /dev/null
+++ b/freebsd/sys/netinet/in_gif.c
@@ -0,0 +1,469 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/* $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */
+
+/*-
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_mrouting.h>
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/sockio.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/errno.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/malloc.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/in_gif.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip_encap.h>
+#include <freebsd/netinet/ip_ecn.h>
+
+#ifdef INET6
+#include <freebsd/netinet/ip6.h>
+#endif
+
+#ifdef MROUTING
+#include <freebsd/netinet/ip_mroute.h>
+#endif /* MROUTING */
+
+#include <freebsd/net/if_gif.h>
+
+static int gif_validate4(const struct ip *, struct gif_softc *,
+ struct ifnet *);
+
+extern struct domain inetdomain;
+struct protosw in_gif_protosw = {
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = 0/* IPPROTO_IPV[46] */,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = in_gif_input,
+ .pr_output = (pr_output_t*)rip_output,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_usrreqs = &rip_usrreqs
+};
+
+VNET_DEFINE(int, ip_gif_ttl) = GIF_TTL;
+#define V_ip_gif_ttl VNET(ip_gif_ttl)
+SYSCTL_VNET_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW,
+ &VNET_NAME(ip_gif_ttl), 0, "");
+
+int
+in_gif_output(struct ifnet *ifp, int family, struct mbuf *m)
+{
+ struct gif_softc *sc = ifp->if_softc;
+ struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst;
+ struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc;
+ struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst;
+ struct ip iphdr; /* capsule IP header, host byte ordered */
+ struct etherip_header eiphdr;
+ int error, len, proto;
+ u_int8_t tos;
+
+ GIF_LOCK_ASSERT(sc);
+
+ if (sin_src == NULL || sin_dst == NULL ||
+ sin_src->sin_family != AF_INET ||
+ sin_dst->sin_family != AF_INET) {
+ m_freem(m);
+ return EAFNOSUPPORT;
+ }
+
+ switch (family) {
+#ifdef INET
+ case AF_INET:
+ {
+ struct ip *ip;
+
+ proto = IPPROTO_IPV4;
+ if (m->m_len < sizeof(*ip)) {
+ m = m_pullup(m, sizeof(*ip));
+ if (!m)
+ return ENOBUFS;
+ }
+ ip = mtod(m, struct ip *);
+ tos = ip->ip_tos;
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct ip6_hdr *ip6;
+ proto = IPPROTO_IPV6;
+ if (m->m_len < sizeof(*ip6)) {
+ m = m_pullup(m, sizeof(*ip6));
+ if (!m)
+ return ENOBUFS;
+ }
+ ip6 = mtod(m, struct ip6_hdr *);
+ tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ break;
+ }
+#endif /* INET6 */
+ case AF_LINK:
+ proto = IPPROTO_ETHERIP;
+
+ /*
+ * GIF_SEND_REVETHIP (disabled by default) intentionally
+		 * sends an EtherIP packet with a reversed version field in
+ * the header. This is a knob for backward compatibility
+ * with FreeBSD 7.2R or prior.
+ */
+ if ((sc->gif_options & GIF_SEND_REVETHIP)) {
+ eiphdr.eip_ver = 0;
+ eiphdr.eip_resvl = ETHERIP_VERSION;
+ eiphdr.eip_resvh = 0;
+ } else {
+ eiphdr.eip_ver = ETHERIP_VERSION;
+ eiphdr.eip_resvl = 0;
+ eiphdr.eip_resvh = 0;
+ }
+ /* prepend Ethernet-in-IP header */
+ M_PREPEND(m, sizeof(struct etherip_header), M_DONTWAIT);
+ if (m && m->m_len < sizeof(struct etherip_header))
+ m = m_pullup(m, sizeof(struct etherip_header));
+ if (m == NULL)
+ return ENOBUFS;
+ bcopy(&eiphdr, mtod(m, struct etherip_header *),
+ sizeof(struct etherip_header));
+ break;
+
+ default:
+#ifdef DEBUG
+ printf("in_gif_output: warning: unknown family %d passed\n",
+ family);
+#endif
+ m_freem(m);
+ return EAFNOSUPPORT;
+ }
+
+ bzero(&iphdr, sizeof(iphdr));
+ iphdr.ip_src = sin_src->sin_addr;
+ /* bidirectional configured tunnel mode */
+ if (sin_dst->sin_addr.s_addr != INADDR_ANY)
+ iphdr.ip_dst = sin_dst->sin_addr;
+ else {
+ m_freem(m);
+ return ENETUNREACH;
+ }
+ iphdr.ip_p = proto;
+ /* version will be set in ip_output() */
+ iphdr.ip_ttl = V_ip_gif_ttl;
+ iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip);
+ ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED : ECN_NOCARE,
+ &iphdr.ip_tos, &tos);
+
+ /* prepend new IP header */
+ len = sizeof(struct ip);
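+	/*
+	 * On strict-alignment machines, reserve ETHERIP_ALIGN extra bytes
+	 * so that the encapsulated Ethernet frame can be re-aligned; the
+	 * slack is trimmed again after the headers are prepended.
+	 */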
+#ifndef __NO_STRICT_ALIGNMENT
+ if (family == AF_LINK)
+ len += ETHERIP_ALIGN;
+#endif
+ M_PREPEND(m, len, M_DONTWAIT);
+ if (m != NULL && m->m_len < len)
+ m = m_pullup(m, len);
+ if (m == NULL) {
+ printf("ENOBUFS in in_gif_output %d\n", __LINE__);
+ return ENOBUFS;
+ }
+#ifndef __NO_STRICT_ALIGNMENT
+ if (family == AF_LINK) {
+ len = mtod(m, vm_offset_t) & 3;
+ KASSERT(len == 0 || len == ETHERIP_ALIGN,
+ ("in_gif_output: unexpected misalignment"));
+ m->m_data += len;
+ m->m_len -= ETHERIP_ALIGN;
+ }
+#endif
+ bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip));
+
+ M_SETFIB(m, sc->gif_fibnum);
+
+ if (dst->sin_family != sin_dst->sin_family ||
+ dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) {
+ /* cache route doesn't match */
+ bzero(dst, sizeof(*dst));
+ dst->sin_family = sin_dst->sin_family;
+ dst->sin_len = sizeof(struct sockaddr_in);
+ dst->sin_addr = sin_dst->sin_addr;
+ if (sc->gif_ro.ro_rt) {
+ RTFREE(sc->gif_ro.ro_rt);
+ sc->gif_ro.ro_rt = NULL;
+ }
+#if 0
+ GIF2IFP(sc)->if_mtu = GIF_MTU;
+#endif
+ }
+
+ if (sc->gif_ro.ro_rt == NULL) {
+ in_rtalloc_ign(&sc->gif_ro, 0, sc->gif_fibnum);
+ if (sc->gif_ro.ro_rt == NULL) {
+ m_freem(m);
+ return ENETUNREACH;
+ }
+
+ /* if it constitutes infinite encapsulation, punt. */
+ if (sc->gif_ro.ro_rt->rt_ifp == ifp) {
+ m_freem(m);
+ return ENETUNREACH; /* XXX */
+ }
+#if 0
+ ifp->if_mtu = sc->gif_ro.ro_rt->rt_ifp->if_mtu
+ - sizeof(struct ip);
+#endif
+ }
+
+ error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL);
+
+ if (!(GIF2IFP(sc)->if_flags & IFF_LINK0) &&
+ sc->gif_ro.ro_rt != NULL) {
+ RTFREE(sc->gif_ro.ro_rt);
+ sc->gif_ro.ro_rt = NULL;
+ }
+
+ return (error);
+}
+
+void
+in_gif_input(struct mbuf *m, int off)
+{
+ struct ifnet *gifp = NULL;
+ struct gif_softc *sc;
+ struct ip *ip;
+ int af;
+ u_int8_t otos;
+ int proto;
+
+ ip = mtod(m, struct ip *);
+ proto = ip->ip_p;
+
+ sc = (struct gif_softc *)encap_getarg(m);
+ if (sc == NULL) {
+ m_freem(m);
+ KMOD_IPSTAT_INC(ips_nogif);
+ return;
+ }
+
+ gifp = GIF2IFP(sc);
+ if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) {
+ m_freem(m);
+ KMOD_IPSTAT_INC(ips_nogif);
+ return;
+ }
+
+ otos = ip->ip_tos;
+ m_adj(m, off);
+
+ switch (proto) {
+#ifdef INET
+ case IPPROTO_IPV4:
+ {
+ struct ip *ip;
+ af = AF_INET;
+ if (m->m_len < sizeof(*ip)) {
+ m = m_pullup(m, sizeof(*ip));
+ if (!m)
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ?
+ ECN_ALLOWED : ECN_NOCARE,
+ &otos, &ip->ip_tos) == 0) {
+ m_freem(m);
+ return;
+ }
+ break;
+ }
+#endif
+#ifdef INET6
+ case IPPROTO_IPV6:
+ {
+ struct ip6_hdr *ip6;
+ u_int8_t itos, oitos;
+
+ af = AF_INET6;
+ if (m->m_len < sizeof(*ip6)) {
+ m = m_pullup(m, sizeof(*ip6));
+ if (!m)
+ return;
+ }
+ ip6 = mtod(m, struct ip6_hdr *);
+ itos = oitos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ?
+ ECN_ALLOWED : ECN_NOCARE,
+ &otos, &itos) == 0) {
+ m_freem(m);
+ return;
+ }
+ if (itos != oitos) {
+ ip6->ip6_flow &= ~htonl(0xff << 20);
+ ip6->ip6_flow |= htonl((u_int32_t)itos << 20);
+ }
+ break;
+ }
+#endif /* INET6 */
+ case IPPROTO_ETHERIP:
+ af = AF_LINK;
+ break;
+
+ default:
+ KMOD_IPSTAT_INC(ips_nogif);
+ m_freem(m);
+ return;
+ }
+ gif_input(m, af, gifp);
+ return;
+}
+
+/*
+ * validate outer address.
+ */
+static int
+gif_validate4(const struct ip *ip, struct gif_softc *sc, struct ifnet *ifp)
+{
+ struct sockaddr_in *src, *dst;
+ struct in_ifaddr *ia4;
+
+ src = (struct sockaddr_in *)sc->gif_psrc;
+ dst = (struct sockaddr_in *)sc->gif_pdst;
+
+ /* check for address match */
+ if (src->sin_addr.s_addr != ip->ip_dst.s_addr ||
+ dst->sin_addr.s_addr != ip->ip_src.s_addr)
+ return 0;
+
+ /* martian filters on outer source - NOT done in ip_input! */
+ if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)))
+ return 0;
+ switch ((ntohl(ip->ip_src.s_addr) & 0xff000000) >> 24) {
+ case 0: case 127: case 255:
+ return 0;
+ }
+
+ /* reject packets with broadcast on source */
+ /* XXXRW: should use hash lists? */
+ IN_IFADDR_RLOCK();
+ TAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) {
+ if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0)
+ continue;
+ if (ip->ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) {
+ IN_IFADDR_RUNLOCK();
+ return 0;
+ }
+ }
+ IN_IFADDR_RUNLOCK();
+
+ /* ingress filters on outer source */
+ if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0 && ifp) {
+ struct sockaddr_in sin;
+ struct rtentry *rt;
+
+ bzero(&sin, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_len = sizeof(struct sockaddr_in);
+ sin.sin_addr = ip->ip_src;
+ /* XXX MRT check for the interface we would use on output */
+ rt = in_rtalloc1((struct sockaddr *)&sin, 0,
+ 0UL, sc->gif_fibnum);
+ if (!rt || rt->rt_ifp != ifp) {
+#if 0
+ log(LOG_WARNING, "%s: packet from 0x%x dropped "
+ "due to ingress filter\n", if_name(GIF2IFP(sc)),
+ (u_int32_t)ntohl(sin.sin_addr.s_addr));
+#endif
+ if (rt)
+ RTFREE_LOCKED(rt);
+ return 0;
+ }
+ RTFREE_LOCKED(rt);
+ }
+
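+	/*
+	 * Both outer addresses matched exactly; report 32 bits for each
+	 * so that the encapsulation framework prefers this tunnel over
+	 * less specific matches (see gif_encapcheck()).
+	 */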
+ return 32 * 2;
+}
+
+/*
+ * we know that we are in IFF_UP, outer address available, and outer family
+ * matched the physical addr family. see gif_encapcheck().
+ */
+int
+gif_encapcheck4(const struct mbuf *m, int off, int proto, void *arg)
+{
+ struct ip ip;
+ struct gif_softc *sc;
+ struct ifnet *ifp;
+
+ /* sanity check done in caller */
+ sc = (struct gif_softc *)arg;
+
+ /* LINTED const cast */
+ m_copydata(m, 0, sizeof(ip), (caddr_t)&ip);
+ ifp = ((m->m_flags & M_PKTHDR) != 0) ? m->m_pkthdr.rcvif : NULL;
+
+ return gif_validate4(&ip, sc, ifp);
+}
+
+int
+in_gif_attach(struct gif_softc *sc)
+{
+ sc->encap_cookie4 = encap_attach_func(AF_INET, -1, gif_encapcheck,
+ &in_gif_protosw, sc);
+ if (sc->encap_cookie4 == NULL)
+ return EEXIST;
+ return 0;
+}
+
+int
+in_gif_detach(struct gif_softc *sc)
+{
+ int error;
+
+ error = encap_detach(sc->encap_cookie4);
+ if (error == 0)
+ sc->encap_cookie4 = NULL;
+ return error;
+}
diff --git a/freebsd/sys/netinet/in_gif.h b/freebsd/sys/netinet/in_gif.h
new file mode 100644
index 00000000..1e42b01f
--- /dev/null
+++ b/freebsd/sys/netinet/in_gif.h
@@ -0,0 +1,45 @@
+/* $FreeBSD$ */
+/* $KAME: in_gif.h,v 1.5 2000/04/14 08:36:02 itojun Exp $ */
+
+/*-
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETINET_IN_GIF_HH_
+#define _NETINET_IN_GIF_HH_
+
+#define GIF_TTL 30
+
+struct gif_softc;
+void in_gif_input(struct mbuf *, int);
+int in_gif_output(struct ifnet *, int, struct mbuf *);
+int gif_encapcheck4(const struct mbuf *, int, int, void *);
+int in_gif_attach(struct gif_softc *);
+int in_gif_detach(struct gif_softc *);
+
+#endif /*_NETINET_IN_GIF_HH_*/
diff --git a/freebsd/sys/netinet/in_mcast.c b/freebsd/sys/netinet/in_mcast.c
new file mode 100644
index 00000000..ed2bcc12
--- /dev/null
+++ b/freebsd/sys/netinet/in_mcast.c
@@ -0,0 +1,2902 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2007-2009 Bruce Simpson.
+ * Copyright (c) 2005 Robert N. M. Watson.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * IPv4 multicast socket, group, and socket option processing module.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/ktr.h>
+#include <freebsd/sys/tree.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_dl.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/igmp_var.h>
+
+#ifndef KTR_IGMPV3
+#define KTR_IGMPV3 KTR_INET
+#endif
+
+#ifndef __SOCKUNION_DECLARED
+union sockunion {
+ struct sockaddr_storage ss;
+ struct sockaddr sa;
+ struct sockaddr_dl sdl;
+ struct sockaddr_in sin;
+};
+typedef union sockunion sockunion_t;
+#define __SOCKUNION_DECLARED
+#endif /* __SOCKUNION_DECLARED */
+
+static MALLOC_DEFINE(M_INMFILTER, "in_mfilter",
+ "IPv4 multicast PCB-layer source filter");
+static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group");
+static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options");
+static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource",
+ "IPv4 multicast IGMP-layer source filter");
+
+/*
+ * Locking:
+ * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
+ * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however
+ * it can be taken by code in net/if.c also.
+ * - ip_moptions and in_mfilter are covered by the INP_WLOCK.
+ *
+ * struct in_multi is covered by IN_MULTI_LOCK. There isn't strictly
+ * any need for in_multi itself to be virtualized -- it is bound to an ifp
+ * anyway no matter what happens.
+ */
+struct mtx in_multi_mtx;
+MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF);
+
+/*
+ * Functions with non-static linkage defined in this file should be
+ * declared in in_var.h:
+ * imo_multi_filter()
+ * in_addmulti()
+ * in_delmulti()
+ * in_joingroup()
+ * in_joingroup_locked()
+ * in_leavegroup()
+ * in_leavegroup_locked()
+ * and ip_var.h:
+ * inp_freemoptions()
+ * inp_getmoptions()
+ * inp_setmoptions()
+ *
+ * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti()
+ * and in_delmulti().
+ */
+static void imf_commit(struct in_mfilter *);
+static int imf_get_source(struct in_mfilter *imf,
+ const struct sockaddr_in *psin,
+ struct in_msource **);
+static struct in_msource *
+ imf_graft(struct in_mfilter *, const uint8_t,
+ const struct sockaddr_in *);
+static void imf_leave(struct in_mfilter *);
+static int imf_prune(struct in_mfilter *, const struct sockaddr_in *);
+static void imf_purge(struct in_mfilter *);
+static void imf_rollback(struct in_mfilter *);
+static void imf_reap(struct in_mfilter *);
+static int imo_grow(struct ip_moptions *);
+static size_t imo_match_group(const struct ip_moptions *,
+ const struct ifnet *, const struct sockaddr *);
+static struct in_msource *
+ imo_match_source(const struct ip_moptions *, const size_t,
+ const struct sockaddr *);
+static void ims_merge(struct ip_msource *ims,
+ const struct in_msource *lims, const int rollback);
+static int in_getmulti(struct ifnet *, const struct in_addr *,
+ struct in_multi **);
+static int inm_get_source(struct in_multi *inm, const in_addr_t haddr,
+ const int noalloc, struct ip_msource **pims);
+static int inm_is_ifp_detached(const struct in_multi *);
+static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *);
+static void inm_purge(struct in_multi *);
+static void inm_reap(struct in_multi *);
+static struct ip_moptions *
+ inp_findmoptions(struct inpcb *);
+static int inp_get_source_filters(struct inpcb *, struct sockopt *);
+static int inp_join_group(struct inpcb *, struct sockopt *);
+static int inp_leave_group(struct inpcb *, struct sockopt *);
+static struct ifnet *
+ inp_lookup_mcast_ifp(const struct inpcb *,
+ const struct sockaddr_in *, const struct in_addr);
+static int inp_block_unblock_source(struct inpcb *, struct sockopt *);
+static int inp_set_multicast_if(struct inpcb *, struct sockopt *);
+static int inp_set_source_filters(struct inpcb *, struct sockopt *);
+static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS);
+
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv4 multicast");
+
+static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER;
+SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc,
+ CTLFLAG_RW | CTLFLAG_TUN, &in_mcast_maxgrpsrc, 0,
+ "Max source filters per group");
+TUNABLE_ULONG("net.inet.ip.mcast.maxgrpsrc", &in_mcast_maxgrpsrc);
+
+static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER;
+SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc,
+ CTLFLAG_RW | CTLFLAG_TUN, &in_mcast_maxsocksrc, 0,
+ "Max source filters per socket");
+TUNABLE_ULONG("net.inet.ip.mcast.maxsocksrc", &in_mcast_maxsocksrc);
+
+int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP;
+SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RW | CTLFLAG_TUN,
+ &in_mcast_loop, 0, "Loopback multicast datagrams by default");
+TUNABLE_INT("net.inet.ip.mcast.loop", &in_mcast_loop);
+
+SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters,
+ "Per-interface stack-wide source filters");
+
+/*
+ * Inline function which wraps assertions for a valid ifp.
+ * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
+ * is detached.
+ */
+static int __inline
+inm_is_ifp_detached(const struct in_multi *inm)
+{
+ struct ifnet *ifp;
+
+ KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
+ ifp = inm->inm_ifma->ifma_ifp;
+ if (ifp != NULL) {
+ /*
+ * Sanity check that netinet's notion of ifp is the
+ * same as net's.
+ */
+ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
+ }
+
+ return (ifp == NULL);
+}
+
+/*
+ * Initialize an in_mfilter structure to a known state at t0, t1
+ * with an empty source filter list.
+ */
+static __inline void
+imf_init(struct in_mfilter *imf, const int st0, const int st1)
+{
+ memset(imf, 0, sizeof(struct in_mfilter));
+ RB_INIT(&imf->imf_sources);
+ imf->imf_st[0] = st0;
+ imf->imf_st[1] = st1;
+}
+
+/*
+ * Resize the ip_moptions vector to the next power-of-two minus 1.
+ * May be called with locks held; do not sleep.
+ */
+static int
+imo_grow(struct ip_moptions *imo)
+{
+ struct in_multi **nmships;
+ struct in_multi **omships;
+ struct in_mfilter *nmfilters;
+ struct in_mfilter *omfilters;
+ size_t idx;
+ size_t newmax;
+ size_t oldmax;
+
+ nmships = NULL;
+ nmfilters = NULL;
+ omships = imo->imo_membership;
+ omfilters = imo->imo_mfilters;
+ oldmax = imo->imo_max_memberships;
+ newmax = ((oldmax + 1) * 2) - 1;
+
+ if (newmax <= IP_MAX_MEMBERSHIPS) {
+ nmships = (struct in_multi **)realloc(omships,
+ sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT);
+ nmfilters = (struct in_mfilter *)realloc(omfilters,
+ sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT);
+ if (nmships != NULL && nmfilters != NULL) {
+ /* Initialize newly allocated source filter heads. */
+ for (idx = oldmax; idx < newmax; idx++) {
+ imf_init(&nmfilters[idx], MCAST_UNDEFINED,
+ MCAST_EXCLUDE);
+ }
+ imo->imo_max_memberships = newmax;
+ imo->imo_membership = nmships;
+ imo->imo_mfilters = nmfilters;
+ }
+ }
+
+ if (nmships == NULL || nmfilters == NULL) {
+ if (nmships != NULL)
+ free(nmships, M_IPMOPTS);
+ if (nmfilters != NULL)
+ free(nmfilters, M_INMFILTER);
+ return (ETOOMANYREFS);
+ }
+
+ return (0);
+}
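+
+/*
+ * The resize above keeps the vector length one less than a power of two;
+ * starting from IP_MIN_MEMBERSHIPS slots, successive calls follow
+ *
+ *	newmax = ((oldmax + 1) * 2) - 1		(e.g. 31 -> 63 -> 127 -> ...)
+ *
+ * until newmax would exceed IP_MAX_MEMBERSHIPS, at which point
+ * ETOOMANYREFS is returned and the old vectors are left in place.
+ * (The example numbers assume IP_MIN_MEMBERSHIPS is 31.)
+ */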
+
+/*
+ * Find an IPv4 multicast group entry for this ip_moptions instance
+ * which matches the specified group, and optionally an interface.
+ * Return its index into the array, or -1 if not found.
+ */
+static size_t
+imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp,
+ const struct sockaddr *group)
+{
+ const struct sockaddr_in *gsin;
+ struct in_multi **pinm;
+ int idx;
+ int nmships;
+
+ gsin = (const struct sockaddr_in *)group;
+
+ /* The imo_membership array may be lazy allocated. */
+ if (imo->imo_membership == NULL || imo->imo_num_memberships == 0)
+ return (-1);
+
+ nmships = imo->imo_num_memberships;
+ pinm = &imo->imo_membership[0];
+ for (idx = 0; idx < nmships; idx++, pinm++) {
+ if (*pinm == NULL)
+ continue;
+ if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) &&
+ in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) {
+ break;
+ }
+ }
+ if (idx >= nmships)
+ idx = -1;
+
+ return (idx);
+}
+
+/*
+ * Find an IPv4 multicast source entry for this imo which matches
+ * the given group index for this socket, and source address.
+ *
+ * NOTE: This does not check if the entry is in-mode, merely if
+ * it exists, which may not be the desired behaviour.
+ */
+static struct in_msource *
+imo_match_source(const struct ip_moptions *imo, const size_t gidx,
+ const struct sockaddr *src)
+{
+ struct ip_msource find;
+ struct in_mfilter *imf;
+ struct ip_msource *ims;
+ const sockunion_t *psa;
+
+ KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__));
+ KASSERT(gidx != -1 && gidx < imo->imo_num_memberships,
+ ("%s: invalid index %d\n", __func__, (int)gidx));
+
+ /* The imo_mfilters array may be lazy allocated. */
+ if (imo->imo_mfilters == NULL)
+ return (NULL);
+ imf = &imo->imo_mfilters[gidx];
+
+ /* Source trees are keyed in host byte order. */
+ psa = (const sockunion_t *)src;
+ find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr);
+ ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
+
+ return ((struct in_msource *)ims);
+}
+
+/*
+ * Perform filtering for multicast datagrams on a socket by group and source.
+ *
+ * Returns 0 if a datagram should be allowed through, or various error codes
+ * if the socket was not a member of the group, or the source was muted, etc.
+ */
+int
+imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp,
+ const struct sockaddr *group, const struct sockaddr *src)
+{
+ size_t gidx;
+ struct in_msource *ims;
+ int mode;
+
+ KASSERT(ifp != NULL, ("%s: null ifp", __func__));
+
+ gidx = imo_match_group(imo, ifp, group);
+ if (gidx == -1)
+ return (MCAST_NOTGMEMBER);
+
+ /*
+ * Check if the source was included in an (S,G) join.
+ * Allow reception on exclusive memberships by default,
+ * reject reception on inclusive memberships by default.
+ * Exclude source only if an in-mode exclude filter exists.
+ * Include source only if an in-mode include filter exists.
+ * NOTE: We are comparing group state here at IGMP t1 (now)
+ * with socket-layer t0 (since last downcall).
+ */
+ mode = imo->imo_mfilters[gidx].imf_st[1];
+ ims = imo_match_source(imo, gidx, src);
+
+ if ((ims == NULL && mode == MCAST_INCLUDE) ||
+ (ims != NULL && ims->imsl_st[0] != mode))
+ return (MCAST_NOTSMEMBER);
+
+ return (MCAST_PASS);
+}
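+
+/*
+ * A minimal sketch of a consumer, assuming a hypothetical transport input
+ * path that already holds the matching inpcb and its ip_moptions:
+ *
+ *	struct sockaddr_in group, src;	// filled from the IP header
+ *	...
+ *	if (imo != NULL &&
+ *	    imo_multi_filter(imo, ifp, (struct sockaddr *)&group,
+ *	    (struct sockaddr *)&src) != MCAST_PASS) {
+ *		m_freem(m);		// group/source filtered for this socket
+ *		return;
+ *	}
+ */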
+
+/*
+ * Find and return a reference to an in_multi record for (ifp, group),
+ * and bump its reference count.
+ * If one does not exist, try to allocate it, and update link-layer multicast
+ * filters on ifp to listen for group.
+ * Assumes the IN_MULTI lock is held across the call.
+ * Return 0 if successful, otherwise return an appropriate error code.
+ */
+static int
+in_getmulti(struct ifnet *ifp, const struct in_addr *group,
+ struct in_multi **pinm)
+{
+ struct sockaddr_in gsin;
+ struct ifmultiaddr *ifma;
+ struct in_ifinfo *ii;
+ struct in_multi *inm;
+ int error;
+
+ IN_MULTI_LOCK_ASSERT();
+
+ ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET];
+
+ inm = inm_lookup(ifp, *group);
+ if (inm != NULL) {
+ /*
+ * If we already joined this group, just bump the
+ * refcount and return it.
+ */
+ KASSERT(inm->inm_refcount >= 1,
+ ("%s: bad refcount %d", __func__, inm->inm_refcount));
+ ++inm->inm_refcount;
+ *pinm = inm;
+ return (0);
+ }
+
+ memset(&gsin, 0, sizeof(gsin));
+ gsin.sin_family = AF_INET;
+ gsin.sin_len = sizeof(struct sockaddr_in);
+ gsin.sin_addr = *group;
+
+ /*
+ * Check if a link-layer group is already associated
+ * with this network-layer group on the given ifnet.
+ */
+ error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma);
+ if (error != 0)
+ return (error);
+
+ /* XXX ifma_protospec must be covered by IF_ADDR_LOCK */
+ IF_ADDR_LOCK(ifp);
+
+ /*
+ * If something other than netinet is occupying the link-layer
+ * group, print a meaningful error message and back out of
+ * the allocation.
+ * Otherwise, bump the refcount on the existing network-layer
+ * group association and return it.
+ */
+ if (ifma->ifma_protospec != NULL) {
+ inm = (struct in_multi *)ifma->ifma_protospec;
+#ifdef INVARIANTS
+ KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
+ __func__));
+ KASSERT(ifma->ifma_addr->sa_family == AF_INET,
+ ("%s: ifma not AF_INET", __func__));
+ KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
+ if (inm->inm_ifma != ifma || inm->inm_ifp != ifp ||
+ !in_hosteq(inm->inm_addr, *group))
+ panic("%s: ifma %p is inconsistent with %p (%s)",
+ __func__, ifma, inm, inet_ntoa(*group));
+#endif
+ ++inm->inm_refcount;
+ *pinm = inm;
+ IF_ADDR_UNLOCK(ifp);
+ return (0);
+ }
+
+ IF_ADDR_LOCK_ASSERT(ifp);
+
+ /*
+ * A new in_multi record is needed; allocate and initialize it.
+ * We DO NOT perform an IGMP join as the in_ layer may need to
+ * push an initial source list down to IGMP to support SSM.
+ *
+ * The initial source filter state is INCLUDE, {} as per the RFC.
+ */
+ inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO);
+ if (inm == NULL) {
+ if_delmulti_ifma(ifma);
+ IF_ADDR_UNLOCK(ifp);
+ return (ENOMEM);
+ }
+ inm->inm_addr = *group;
+ inm->inm_ifp = ifp;
+ inm->inm_igi = ii->ii_igmp;
+ inm->inm_ifma = ifma;
+ inm->inm_refcount = 1;
+ inm->inm_state = IGMP_NOT_MEMBER;
+
+ /*
+ * Pending state-changes per group are subject to a bounds check.
+ */
+ IFQ_SET_MAXLEN(&inm->inm_scq, IGMP_MAX_STATE_CHANGES);
+
+ inm->inm_st[0].iss_fmode = MCAST_UNDEFINED;
+ inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
+ RB_INIT(&inm->inm_srcs);
+
+ ifma->ifma_protospec = inm;
+
+ *pinm = inm;
+
+ IF_ADDR_UNLOCK(ifp);
+ return (0);
+}
+
+/*
+ * Drop a reference to an in_multi record.
+ *
+ * If the refcount drops to 0, free the in_multi record and
+ * delete the underlying link-layer membership.
+ */
+void
+inm_release_locked(struct in_multi *inm)
+{
+ struct ifmultiaddr *ifma;
+
+ IN_MULTI_LOCK_ASSERT();
+
+ CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount);
+
+ if (--inm->inm_refcount > 0) {
+ CTR2(KTR_IGMPV3, "%s: refcount is now %d", __func__,
+ inm->inm_refcount);
+ return;
+ }
+
+ CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm);
+
+ ifma = inm->inm_ifma;
+
+ /* XXX this access is not covered by IF_ADDR_LOCK */
+ CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma);
+ KASSERT(ifma->ifma_protospec == inm,
+ ("%s: ifma_protospec != inm", __func__));
+ ifma->ifma_protospec = NULL;
+
+ inm_purge(inm);
+
+ free(inm, M_IPMADDR);
+
+ if_delmulti_ifma(ifma);
+}
+
+/*
+ * Clear recorded source entries for a group.
+ * Used by the IGMP code. Caller must hold the IN_MULTI lock.
+ * FIXME: Should reap.
+ */
+void
+inm_clear_recorded(struct in_multi *inm)
+{
+ struct ip_msource *ims;
+
+ IN_MULTI_LOCK_ASSERT();
+
+ RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
+ if (ims->ims_stp) {
+ ims->ims_stp = 0;
+ --inm->inm_st[1].iss_rec;
+ }
+ }
+ KASSERT(inm->inm_st[1].iss_rec == 0,
+ ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec));
+}
+
+/*
+ * Record a source as pending for a Source-Group IGMPv3 query.
+ * This lives here as it modifies the shared tree.
+ *
+ * inm is the group descriptor.
+ * naddr is the address of the source to record in network-byte order.
+ *
+ * If the net.inet.igmp.sgalloc sysctl is non-zero, we will
+ * lazy-allocate a source node in response to an SG query.
+ * Otherwise, no allocation is performed. This saves some memory
+ * with the trade-off that the source will not be reported to the
+ * router if joined in the window between the query response and
+ * the group actually being joined on the local host.
+ *
+ * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed.
+ * This turns off the allocation of a recorded source entry if
+ * the group has not been joined.
+ *
+ * Return 0 if the source didn't exist or was already marked as recorded.
+ * Return 1 if the source was marked as recorded by this function.
+ * Return <0 if any error occurred (negated errno code).
+ */
+int
+inm_record_source(struct in_multi *inm, const in_addr_t naddr)
+{
+ struct ip_msource find;
+ struct ip_msource *ims, *nims;
+
+ IN_MULTI_LOCK_ASSERT();
+
+ find.ims_haddr = ntohl(naddr);
+ ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
+ if (ims && ims->ims_stp)
+ return (0);
+ if (ims == NULL) {
+ if (inm->inm_nsrc == in_mcast_maxgrpsrc)
+ return (-ENOSPC);
+ nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
+ M_NOWAIT | M_ZERO);
+ if (nims == NULL)
+ return (-ENOMEM);
+ nims->ims_haddr = find.ims_haddr;
+ RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
+ ++inm->inm_nsrc;
+ ims = nims;
+ }
+
+ /*
+ * Mark the source as recorded and update the recorded
+ * source count.
+ */
+ ++ims->ims_stp;
+ ++inm->inm_st[1].iss_rec;
+
+ return (1);
+}
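+
+/*
+ * A minimal sketch of a caller, assuming a hypothetical group-and-source
+ * query handler that walks the address list of an IGMPv3 query:
+ *
+ *	retval = inm_record_source(inm, ap->s_addr);
+ *	if (retval < 0)
+ *		break;		// -ENOSPC or -ENOMEM: stop recording sources
+ *	// retval == 0: already recorded or not allocated; 1: newly recorded
+ */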
+
+/*
+ * Return a pointer to an in_msource owned by an in_mfilter,
+ * given its source address.
+ * Lazy-allocate if needed. If this is a new entry its filter state is
+ * undefined at t0.
+ *
+ * imf is the filter set being modified.
+ * psin is the source address; the filter tree is keyed by its host
+ * byte-order form.
+ *
+ * SMPng: May be called with locks held; malloc must not block.
+ */
+static int
+imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin,
+ struct in_msource **plims)
+{
+ struct ip_msource find;
+ struct ip_msource *ims, *nims;
+ struct in_msource *lims;
+ int error;
+
+ error = 0;
+ ims = NULL;
+ lims = NULL;
+
+ /* key is host byte order */
+ find.ims_haddr = ntohl(psin->sin_addr.s_addr);
+ ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
+ lims = (struct in_msource *)ims;
+ if (lims == NULL) {
+ if (imf->imf_nsrc == in_mcast_maxsocksrc)
+ return (ENOSPC);
+ nims = malloc(sizeof(struct in_msource), M_INMFILTER,
+ M_NOWAIT | M_ZERO);
+ if (nims == NULL)
+ return (ENOMEM);
+ lims = (struct in_msource *)nims;
+ lims->ims_haddr = find.ims_haddr;
+ lims->imsl_st[0] = MCAST_UNDEFINED;
+ RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
+ ++imf->imf_nsrc;
+ }
+
+ *plims = lims;
+
+ return (error);
+}
+
+/*
+ * Graft a source entry into an existing socket-layer filter set,
+ * maintaining any required invariants and checking allocations.
+ *
+ * The source is marked as being in the new filter mode at t1.
+ *
+ * Return the pointer to the new node, otherwise return NULL.
+ */
+static struct in_msource *
+imf_graft(struct in_mfilter *imf, const uint8_t st1,
+ const struct sockaddr_in *psin)
+{
+ struct ip_msource *nims;
+ struct in_msource *lims;
+
+ nims = malloc(sizeof(struct in_msource), M_INMFILTER,
+ M_NOWAIT | M_ZERO);
+ if (nims == NULL)
+ return (NULL);
+ lims = (struct in_msource *)nims;
+ lims->ims_haddr = ntohl(psin->sin_addr.s_addr);
+ lims->imsl_st[0] = MCAST_UNDEFINED;
+ lims->imsl_st[1] = st1;
+ RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
+ ++imf->imf_nsrc;
+
+ return (lims);
+}
+
+/*
+ * Prune a source entry from an existing socket-layer filter set,
+ * maintaining any required invariants and checking allocations.
+ *
+ * The source is marked as being left at t1, it is not freed.
+ *
+ * Return 0 if no error occurred, otherwise return an errno value.
+ */
+static int
+imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin)
+{
+ struct ip_msource find;
+ struct ip_msource *ims;
+ struct in_msource *lims;
+
+ /* key is host byte order */
+ find.ims_haddr = ntohl(psin->sin_addr.s_addr);
+ ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
+ if (ims == NULL)
+ return (ENOENT);
+ lims = (struct in_msource *)ims;
+ lims->imsl_st[1] = MCAST_UNDEFINED;
+ return (0);
+}
+
+/*
+ * Revert socket-layer filter set deltas at t1 to t0 state.
+ */
+static void
+imf_rollback(struct in_mfilter *imf)
+{
+ struct ip_msource *ims, *tims;
+ struct in_msource *lims;
+
+ RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
+ lims = (struct in_msource *)ims;
+ if (lims->imsl_st[0] == lims->imsl_st[1]) {
+ /* no change at t1 */
+ continue;
+ } else if (lims->imsl_st[0] != MCAST_UNDEFINED) {
+ /* revert change to existing source at t1 */
+ lims->imsl_st[1] = lims->imsl_st[0];
+ } else {
+ /* revert source added t1 */
+ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
+ RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
+ free(ims, M_INMFILTER);
+ imf->imf_nsrc--;
+ }
+ }
+ imf->imf_st[1] = imf->imf_st[0];
+}
+
+/*
+ * Mark socket-layer filter set as INCLUDE {} at t1.
+ */
+static void
+imf_leave(struct in_mfilter *imf)
+{
+ struct ip_msource *ims;
+ struct in_msource *lims;
+
+ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
+ lims = (struct in_msource *)ims;
+ lims->imsl_st[1] = MCAST_UNDEFINED;
+ }
+ imf->imf_st[1] = MCAST_INCLUDE;
+}
+
+/*
+ * Mark socket-layer filter set deltas as committed.
+ */
+static void
+imf_commit(struct in_mfilter *imf)
+{
+ struct ip_msource *ims;
+ struct in_msource *lims;
+
+ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
+ lims = (struct in_msource *)ims;
+ lims->imsl_st[0] = lims->imsl_st[1];
+ }
+ imf->imf_st[0] = imf->imf_st[1];
+}
+
+/*
+ * Reap unreferenced sources from socket-layer filter set.
+ */
+static void
+imf_reap(struct in_mfilter *imf)
+{
+ struct ip_msource *ims, *tims;
+ struct in_msource *lims;
+
+ RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
+ lims = (struct in_msource *)ims;
+ if ((lims->imsl_st[0] == MCAST_UNDEFINED) &&
+ (lims->imsl_st[1] == MCAST_UNDEFINED)) {
+ CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims);
+ RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
+ free(ims, M_INMFILTER);
+ imf->imf_nsrc--;
+ }
+ }
+}
+
+/*
+ * Purge socket-layer filter set.
+ */
+static void
+imf_purge(struct in_mfilter *imf)
+{
+ struct ip_msource *ims, *tims;
+
+ RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
+ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
+ RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
+ free(ims, M_INMFILTER);
+ imf->imf_nsrc--;
+ }
+ imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED;
+ KASSERT(RB_EMPTY(&imf->imf_sources),
+ ("%s: imf_sources not empty", __func__));
+}
+
+/*
+ * Look up a source filter entry for a multicast group.
+ *
+ * inm is the group descriptor to work with.
+ * haddr is the host-byte-order IPv4 address to look up.
+ * noalloc may be non-zero to suppress allocation of sources.
+ * *pims will be set to the address of the retrieved or allocated source.
+ *
+ * SMPng: NOTE: may be called with locks held.
+ * Return 0 if successful, otherwise return a non-zero error code.
+ */
+static int
+inm_get_source(struct in_multi *inm, const in_addr_t haddr,
+ const int noalloc, struct ip_msource **pims)
+{
+ struct ip_msource find;
+ struct ip_msource *ims, *nims;
+#ifdef KTR
+ struct in_addr ia;
+#endif
+
+ find.ims_haddr = haddr;
+ ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
+ if (ims == NULL && !noalloc) {
+ if (inm->inm_nsrc == in_mcast_maxgrpsrc)
+ return (ENOSPC);
+ nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
+ M_NOWAIT | M_ZERO);
+ if (nims == NULL)
+ return (ENOMEM);
+ nims->ims_haddr = haddr;
+ RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
+ ++inm->inm_nsrc;
+ ims = nims;
+#ifdef KTR
+ ia.s_addr = htonl(haddr);
+ CTR3(KTR_IGMPV3, "%s: allocated %s as %p", __func__,
+ inet_ntoa(ia), ims);
+#endif
+ }
+
+ *pims = ims;
+ return (0);
+}
+
+/*
+ * Merge socket-layer source into IGMP-layer source.
+ * If rollback is non-zero, perform the inverse of the merge.
+ */
+static void
+ims_merge(struct ip_msource *ims, const struct in_msource *lims,
+ const int rollback)
+{
+ int n = rollback ? -1 : 1;
+#ifdef KTR
+ struct in_addr ia;
+
+ ia.s_addr = htonl(ims->ims_haddr);
+#endif
+
+ if (lims->imsl_st[0] == MCAST_EXCLUDE) {
+ CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on %s",
+ __func__, n, inet_ntoa(ia));
+ ims->ims_st[1].ex -= n;
+ } else if (lims->imsl_st[0] == MCAST_INCLUDE) {
+ CTR3(KTR_IGMPV3, "%s: t1 in -= %d on %s",
+ __func__, n, inet_ntoa(ia));
+ ims->ims_st[1].in -= n;
+ }
+
+ if (lims->imsl_st[1] == MCAST_EXCLUDE) {
+ CTR3(KTR_IGMPV3, "%s: t1 ex += %d on %s",
+ __func__, n, inet_ntoa(ia));
+ ims->ims_st[1].ex += n;
+ } else if (lims->imsl_st[1] == MCAST_INCLUDE) {
+ CTR3(KTR_IGMPV3, "%s: t1 in += %d on %s",
+ __func__, n, inet_ntoa(ia));
+ ims->ims_st[1].in += n;
+ }
+}
+
+/*
+ * Atomically update the global in_multi state, when a membership's
+ * filter list is being updated in any way.
+ *
+ * imf is the per-inpcb-membership group filter pointer.
+ * A fake imf may be passed for in-kernel consumers.
+ *
+ * XXX This is a candidate for a set-symmetric-difference style loop
+ * which would eliminate the repeated lookup from root of ims nodes,
+ * as they share the same key space.
+ *
+ * If any error occurred this function will back out of refcounts
+ * and return a non-zero value.
+ */
+static int
+inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
+{
+ struct ip_msource *ims, *nims;
+ struct in_msource *lims;
+ int schanged, error;
+ int nsrc0, nsrc1;
+
+ schanged = 0;
+ error = 0;
+ nsrc1 = nsrc0 = 0;
+
+ /*
+ * Update the source filters first, as this may fail.
+ * Maintain count of in-mode filters at t0, t1. These are
+ * used to work out if we transition into ASM mode or not.
+ * Maintain a count of source filters whose state was
+ * actually modified by this operation.
+ */
+ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
+ lims = (struct in_msource *)ims;
+ if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++;
+ if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++;
+ if (lims->imsl_st[0] == lims->imsl_st[1]) continue;
+ error = inm_get_source(inm, lims->ims_haddr, 0, &nims);
+ ++schanged;
+ if (error)
+ break;
+ ims_merge(nims, lims, 0);
+ }
+ if (error) {
+ struct ip_msource *bims;
+
+ RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) {
+ lims = (struct in_msource *)ims;
+ if (lims->imsl_st[0] == lims->imsl_st[1])
+ continue;
+ (void)inm_get_source(inm, lims->ims_haddr, 1, &bims);
+ if (bims == NULL)
+ continue;
+ ims_merge(bims, lims, 1);
+ }
+ goto out_reap;
+ }
+
+ CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1",
+ __func__, nsrc0, nsrc1);
+
+ /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
+ if (imf->imf_st[0] == imf->imf_st[1] &&
+ imf->imf_st[1] == MCAST_INCLUDE) {
+ if (nsrc1 == 0) {
+ CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
+ --inm->inm_st[1].iss_in;
+ }
+ }
+
+ /* Handle filter mode transition on socket. */
+ if (imf->imf_st[0] != imf->imf_st[1]) {
+ CTR3(KTR_IGMPV3, "%s: imf transition %d to %d",
+ __func__, imf->imf_st[0], imf->imf_st[1]);
+
+ if (imf->imf_st[0] == MCAST_EXCLUDE) {
+ CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__);
+ --inm->inm_st[1].iss_ex;
+ } else if (imf->imf_st[0] == MCAST_INCLUDE) {
+ CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
+ --inm->inm_st[1].iss_in;
+ }
+
+ if (imf->imf_st[1] == MCAST_EXCLUDE) {
+ CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__);
+ inm->inm_st[1].iss_ex++;
+ } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
+ CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__);
+ inm->inm_st[1].iss_in++;
+ }
+ }
+
+ /*
+ * Track inm filter state in terms of listener counts.
+ * If there are any exclusive listeners, stack-wide
+ * membership is exclusive.
+ * Otherwise, if only inclusive listeners, stack-wide is inclusive.
+ * If no listeners remain, state is undefined at t1,
+ * and the IGMP lifecycle for this group should finish.
+ */
+ if (inm->inm_st[1].iss_ex > 0) {
+ CTR1(KTR_IGMPV3, "%s: transition to EX", __func__);
+ inm->inm_st[1].iss_fmode = MCAST_EXCLUDE;
+ } else if (inm->inm_st[1].iss_in > 0) {
+ CTR1(KTR_IGMPV3, "%s: transition to IN", __func__);
+ inm->inm_st[1].iss_fmode = MCAST_INCLUDE;
+ } else {
+ CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__);
+ inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
+ }
+
+ /* Decrement ASM listener count on transition out of ASM mode. */
+ if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
+ if ((imf->imf_st[1] != MCAST_EXCLUDE) ||
+ (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0))
+ CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__);
+ --inm->inm_st[1].iss_asm;
+ }
+
+ /* Increment ASM listener count on transition to ASM mode. */
+ if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
+ CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__);
+ inm->inm_st[1].iss_asm++;
+ }
+
+ CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm);
+ inm_print(inm);
+
+out_reap:
+ if (schanged > 0) {
+ CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__);
+ inm_reap(inm);
+ }
+ return (error);
+}
+
+/*
+ * Mark an in_multi's filter set deltas as committed.
+ * Called by IGMP after a state change has been enqueued.
+ */
+void
+inm_commit(struct in_multi *inm)
+{
+ struct ip_msource *ims;
+
+ CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm);
+ CTR1(KTR_IGMPV3, "%s: pre commit:", __func__);
+ inm_print(inm);
+
+ RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
+ ims->ims_st[0] = ims->ims_st[1];
+ }
+ inm->inm_st[0] = inm->inm_st[1];
+}
+
+/*
+ * Reap unreferenced nodes from an in_multi's filter set.
+ */
+static void
+inm_reap(struct in_multi *inm)
+{
+ struct ip_msource *ims, *tims;
+
+ RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
+ if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 ||
+ ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 ||
+ ims->ims_stp != 0)
+ continue;
+ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
+ RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
+ free(ims, M_IPMSOURCE);
+ inm->inm_nsrc--;
+ }
+}
+
+/*
+ * Purge all source nodes from an in_multi's filter set.
+ */
+static void
+inm_purge(struct in_multi *inm)
+{
+ struct ip_msource *ims, *tims;
+
+ RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
+ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
+ RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
+ free(ims, M_IPMSOURCE);
+ inm->inm_nsrc--;
+ }
+}
+
+/*
+ * Join a multicast group; unlocked entry point.
+ *
+ * SMPng: XXX: in_joingroup() is called from in_control() when Giant
+ * is not held. Fortunately, ifp is unlikely to have been detached
+ * at this point, so we assume it's OK to recurse.
+ */
+int
+in_joingroup(struct ifnet *ifp, const struct in_addr *gina,
+ /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
+{
+ int error;
+
+ IN_MULTI_LOCK();
+ error = in_joingroup_locked(ifp, gina, imf, pinm);
+ IN_MULTI_UNLOCK();
+
+ return (error);
+}
+
+/*
+ * Join a multicast group; real entry point.
+ *
+ * Only preserves atomicity at inm level.
+ * NOTE: imf argument cannot be const due to sys/tree.h limitations.
+ *
+ * If the IGMP downcall fails, the group is not joined, and an error
+ * code is returned.
+ */
+int
+in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina,
+ /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
+{
+ struct in_mfilter timf;
+ struct in_multi *inm;
+ int error;
+
+ IN_MULTI_LOCK_ASSERT();
+
+ CTR4(KTR_IGMPV3, "%s: join %s on %p(%s))", __func__,
+ inet_ntoa(*gina), ifp, ifp->if_xname);
+
+ error = 0;
+ inm = NULL;
+
+ /*
+ * If no imf was specified (i.e. kernel consumer),
+ * fake one up and assume it is an ASM join.
+ */
+ if (imf == NULL) {
+ imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
+ imf = &timf;
+ }
+
+ error = in_getmulti(ifp, gina, &inm);
+ if (error) {
+ CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__);
+ return (error);
+ }
+
+ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
+ error = inm_merge(inm, imf);
+ if (error) {
+ CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
+ goto out_inm_release;
+ }
+
+ CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
+ error = igmp_change_state(inm);
+ if (error) {
+ CTR1(KTR_IGMPV3, "%s: failed to update source", __func__);
+ goto out_inm_release;
+ }
+
+out_inm_release:
+ if (error) {
+ CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
+ inm_release_locked(inm);
+ } else {
+ *pinm = inm;
+ }
+
+ return (error);
+}
+
+/*
+ * Leave a multicast group; unlocked entry point.
+ */
+int
+in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
+{
+ struct ifnet *ifp;
+ int error;
+
+ ifp = inm->inm_ifp;
+
+ IN_MULTI_LOCK();
+ error = in_leavegroup_locked(inm, imf);
+ IN_MULTI_UNLOCK();
+
+ return (error);
+}
+
+/*
+ * Leave a multicast group; real entry point.
+ * All source filters will be expunged.
+ *
+ * Only preserves atomicity at inm level.
+ *
+ * Holding the write lock for the INP which contains imf
+ * is highly advisable. We can't assert for it as imf does not
+ * contain a back-pointer to the owning inp.
+ *
+ * Note: This is not the same as inm_release(*) as this function also
+ * makes a state change downcall into IGMP.
+ */
+int
+in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
+{
+ struct in_mfilter timf;
+ int error;
+
+ error = 0;
+
+ IN_MULTI_LOCK_ASSERT();
+
+ CTR5(KTR_IGMPV3, "%s: leave inm %p, %s/%s, imf %p", __func__,
+ inm, inet_ntoa(inm->inm_addr),
+ (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname),
+ imf);
+
+ /*
+ * If no imf was specified (i.e. kernel consumer),
+	 * fake one up and assume the membership being left was an ASM join.
+ */
+ if (imf == NULL) {
+ imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
+ imf = &timf;
+ }
+
+ /*
+ * Begin state merge transaction at IGMP layer.
+ *
+ * As this particular invocation should not cause any memory
+ * to be allocated, and there is no opportunity to roll back
+ * the transaction, it MUST NOT fail.
+ */
+ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
+ error = inm_merge(inm, imf);
+ KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
+
+ CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
+ error = igmp_change_state(inm);
+ if (error)
+ CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
+
+ CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
+ inm_release_locked(inm);
+
+ return (error);
+}
+
+/*#ifndef BURN_BRIDGES*/
+/*
+ * Join an IPv4 multicast group in (*,G) exclusive mode.
+ * The group must be a 224.0.0.0/24 link-scope group.
+ * This KPI is for legacy kernel consumers only.
+ */
+struct in_multi *
+in_addmulti(struct in_addr *ap, struct ifnet *ifp)
+{
+ struct in_multi *pinm;
+ int error;
+
+ KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)),
+ ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa(*ap)));
+
+ error = in_joingroup(ifp, ap, NULL, &pinm);
+ if (error != 0)
+ pinm = NULL;
+
+ return (pinm);
+}
+
+/*
+ * Leave an IPv4 multicast group, assumed to be in exclusive (*,G) mode.
+ * This KPI is for legacy kernel consumers only.
+ */
+void
+in_delmulti(struct in_multi *inm)
+{
+
+ (void)in_leavegroup(inm, NULL);
+}
+/*#endif*/
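+
+/*
+ * A minimal sketch of a legacy (*,G) consumer, assuming a hypothetical
+ * in-kernel caller joining a link-scope group on ifp:
+ *
+ *	struct in_addr allhosts;
+ *	struct in_multi *inm;
+ *
+ *	allhosts.s_addr = htonl(INADDR_ALLHOSTS_GROUP);	// 224.0.0.1
+ *	inm = in_addmulti(&allhosts, ifp);
+ *	if (inm == NULL)
+ *		return (ENOBUFS);
+ *	...
+ *	in_delmulti(inm);
+ */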
+
+/*
+ * Block or unblock an ASM multicast source on an inpcb.
+ * This implements the delta-based API described in RFC 3678.
+ *
+ * The delta-based API applies only to exclusive-mode memberships.
+ * An IGMP downcall will be performed.
+ *
+ * SMPng: NOTE: Must take Giant as a join may create a new ifma.
+ *
+ * Return 0 if successful, otherwise return an appropriate error code.
+ */
+static int
+inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
+{
+ struct group_source_req gsr;
+ sockunion_t *gsa, *ssa;
+ struct ifnet *ifp;
+ struct in_mfilter *imf;
+ struct ip_moptions *imo;
+ struct in_msource *ims;
+ struct in_multi *inm;
+ size_t idx;
+ uint16_t fmode;
+ int error, doblock;
+
+ ifp = NULL;
+ error = 0;
+ doblock = 0;
+
+ memset(&gsr, 0, sizeof(struct group_source_req));
+ gsa = (sockunion_t *)&gsr.gsr_group;
+ ssa = (sockunion_t *)&gsr.gsr_source;
+
+ switch (sopt->sopt_name) {
+ case IP_BLOCK_SOURCE:
+ case IP_UNBLOCK_SOURCE: {
+ struct ip_mreq_source mreqs;
+
+ error = sooptcopyin(sopt, &mreqs,
+ sizeof(struct ip_mreq_source),
+ sizeof(struct ip_mreq_source));
+ if (error)
+ return (error);
+
+ gsa->sin.sin_family = AF_INET;
+ gsa->sin.sin_len = sizeof(struct sockaddr_in);
+ gsa->sin.sin_addr = mreqs.imr_multiaddr;
+
+ ssa->sin.sin_family = AF_INET;
+ ssa->sin.sin_len = sizeof(struct sockaddr_in);
+ ssa->sin.sin_addr = mreqs.imr_sourceaddr;
+
+ if (!in_nullhost(mreqs.imr_interface))
+ INADDR_TO_IFP(mreqs.imr_interface, ifp);
+
+ if (sopt->sopt_name == IP_BLOCK_SOURCE)
+ doblock = 1;
+
+ CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p",
+ __func__, inet_ntoa(mreqs.imr_interface), ifp);
+ break;
+ }
+
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ error = sooptcopyin(sopt, &gsr,
+ sizeof(struct group_source_req),
+ sizeof(struct group_source_req));
+ if (error)
+ return (error);
+
+ if (gsa->sin.sin_family != AF_INET ||
+ gsa->sin.sin_len != sizeof(struct sockaddr_in))
+ return (EINVAL);
+
+ if (ssa->sin.sin_family != AF_INET ||
+ ssa->sin.sin_len != sizeof(struct sockaddr_in))
+ return (EINVAL);
+
+ if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
+ return (EADDRNOTAVAIL);
+
+ ifp = ifnet_byindex(gsr.gsr_interface);
+
+ if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
+ doblock = 1;
+ break;
+
+ default:
+ CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
+ __func__, sopt->sopt_name);
+ return (EOPNOTSUPP);
+ break;
+ }
+
+ if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
+ return (EINVAL);
+
+ /*
+ * Check if we are actually a member of this group.
+ */
+ imo = inp_findmoptions(inp);
+ idx = imo_match_group(imo, ifp, &gsa->sa);
+ if (idx == -1 || imo->imo_mfilters == NULL) {
+ error = EADDRNOTAVAIL;
+ goto out_inp_locked;
+ }
+
+ KASSERT(imo->imo_mfilters != NULL,
+ ("%s: imo_mfilters not allocated", __func__));
+ imf = &imo->imo_mfilters[idx];
+ inm = imo->imo_membership[idx];
+
+ /*
+ * Attempting to use the delta-based API on an
+ * non exclusive-mode membership is an error.
+ */
+ fmode = imf->imf_st[0];
+ if (fmode != MCAST_EXCLUDE) {
+ error = EINVAL;
+ goto out_inp_locked;
+ }
+
+ /*
+ * Deal with error cases up-front:
+ * Asked to block, but already blocked; or
+ * Asked to unblock, but nothing to unblock.
+ * If adding a new block entry, allocate it.
+ */
+ ims = imo_match_source(imo, idx, &ssa->sa);
+ if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
+ CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__,
+ inet_ntoa(ssa->sin.sin_addr), doblock ? "" : "not ");
+ error = EADDRNOTAVAIL;
+ goto out_inp_locked;
+ }
+
+ INP_WLOCK_ASSERT(inp);
+
+ /*
+ * Begin state merge transaction at socket layer.
+ */
+ if (doblock) {
+ CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
+ ims = imf_graft(imf, fmode, &ssa->sin);
+ if (ims == NULL)
+ error = ENOMEM;
+ } else {
+ CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
+ error = imf_prune(imf, &ssa->sin);
+ }
+
+ if (error) {
+ CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__);
+ goto out_imf_rollback;
+ }
+
+ /*
+ * Begin state merge transaction at IGMP layer.
+ */
+ IN_MULTI_LOCK();
+
+ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
+ error = inm_merge(inm, imf);
+ if (error) {
+ CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
+ goto out_imf_rollback;
+ }
+
+ CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
+ error = igmp_change_state(inm);
+ if (error)
+ CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
+
+ IN_MULTI_UNLOCK();
+
+out_imf_rollback:
+ if (error)
+ imf_rollback(imf);
+ else
+ imf_commit(imf);
+
+ imf_reap(imf);
+
+out_inp_locked:
+ INP_WUNLOCK(inp);
+ return (error);
+}
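+
+/*
+ * A minimal userland sketch of the delta-based API handled above, assuming
+ * a UDP socket s that has already joined 239.1.1.1 in exclusive mode
+ * (addresses are placeholders):
+ *
+ *	#include <sys/socket.h>
+ *	#include <netinet/in.h>
+ *	#include <arpa/inet.h>
+ *	#include <stdio.h>
+ *
+ *	struct ip_mreq_source mr;
+ *
+ *	mr.imr_multiaddr.s_addr = inet_addr("239.1.1.1");
+ *	mr.imr_sourceaddr.s_addr = inet_addr("192.0.2.1");
+ *	mr.imr_interface.s_addr = INADDR_ANY;
+ *	if (setsockopt(s, IPPROTO_IP, IP_BLOCK_SOURCE, &mr, sizeof(mr)) == -1)
+ *		perror("IP_BLOCK_SOURCE");
+ */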
+
+/*
+ * Given an inpcb, return its multicast options structure pointer. Accepts
+ * an unlocked inpcb pointer, but will return it locked. May sleep.
+ *
+ * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
+ * SMPng: NOTE: Returns with the INP write lock held.
+ */
+static struct ip_moptions *
+inp_findmoptions(struct inpcb *inp)
+{
+ struct ip_moptions *imo;
+ struct in_multi **immp;
+ struct in_mfilter *imfp;
+ size_t idx;
+
+ INP_WLOCK(inp);
+ if (inp->inp_moptions != NULL)
+ return (inp->inp_moptions);
+
+ INP_WUNLOCK(inp);
+
+ imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
+ immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS,
+ M_WAITOK | M_ZERO);
+ imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS,
+ M_INMFILTER, M_WAITOK);
+
+ imo->imo_multicast_ifp = NULL;
+ imo->imo_multicast_addr.s_addr = INADDR_ANY;
+ imo->imo_multicast_vif = -1;
+ imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+ imo->imo_multicast_loop = in_mcast_loop;
+ imo->imo_num_memberships = 0;
+ imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
+ imo->imo_membership = immp;
+
+ /* Initialize per-group source filters. */
+ for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++)
+ imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE);
+ imo->imo_mfilters = imfp;
+
+ INP_WLOCK(inp);
+ if (inp->inp_moptions != NULL) {
+ free(imfp, M_INMFILTER);
+ free(immp, M_IPMOPTS);
+ free(imo, M_IPMOPTS);
+ return (inp->inp_moptions);
+ }
+ inp->inp_moptions = imo;
+ return (imo);
+}
+
+/*
+ * Discard the IP multicast options (and source filters).
+ *
+ * SMPng: NOTE: assumes INP write lock is held.
+ */
+void
+inp_freemoptions(struct ip_moptions *imo)
+{
+ struct in_mfilter *imf;
+ size_t idx, nmships;
+
+ KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__));
+
+ nmships = imo->imo_num_memberships;
+ for (idx = 0; idx < nmships; ++idx) {
+ imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL;
+ if (imf)
+ imf_leave(imf);
+ (void)in_leavegroup(imo->imo_membership[idx], imf);
+ if (imf)
+ imf_purge(imf);
+ }
+
+ if (imo->imo_mfilters)
+ free(imo->imo_mfilters, M_INMFILTER);
+ free(imo->imo_membership, M_IPMOPTS);
+ free(imo, M_IPMOPTS);
+}
+
+/*
+ * Atomically get source filters on a socket for an IPv4 multicast group.
+ * Called with INP lock held; returns with lock released.
+ */
+static int
+inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
+{
+ struct __msfilterreq msfr;
+ sockunion_t *gsa;
+ struct ifnet *ifp;
+ struct ip_moptions *imo;
+ struct in_mfilter *imf;
+ struct ip_msource *ims;
+ struct in_msource *lims;
+ struct sockaddr_in *psin;
+ struct sockaddr_storage *ptss;
+ struct sockaddr_storage *tss;
+ int error;
+ size_t idx, nsrcs, ncsrcs;
+
+ INP_WLOCK_ASSERT(inp);
+
+ imo = inp->inp_moptions;
+ KASSERT(imo != NULL, ("%s: null ip_moptions", __func__));
+
+ INP_WUNLOCK(inp);
+
+ error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
+ sizeof(struct __msfilterreq));
+ if (error)
+ return (error);
+
+ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
+ return (EINVAL);
+
+ ifp = ifnet_byindex(msfr.msfr_ifindex);
+ if (ifp == NULL)
+ return (EINVAL);
+
+ INP_WLOCK(inp);
+
+ /*
+ * Lookup group on the socket.
+ */
+ gsa = (sockunion_t *)&msfr.msfr_group;
+ idx = imo_match_group(imo, ifp, &gsa->sa);
+ if (idx == -1 || imo->imo_mfilters == NULL) {
+ INP_WUNLOCK(inp);
+ return (EADDRNOTAVAIL);
+ }
+ imf = &imo->imo_mfilters[idx];
+
+ /*
+ * Ignore memberships which are in limbo.
+ */
+ if (imf->imf_st[1] == MCAST_UNDEFINED) {
+ INP_WUNLOCK(inp);
+ return (EAGAIN);
+ }
+ msfr.msfr_fmode = imf->imf_st[1];
+
+ /*
+ * If the user specified a buffer, copy out the source filter
+ * entries to userland gracefully.
+ * We only copy out the number of entries which userland
+ * has asked for, but we always tell userland how big the
+ * buffer really needs to be.
+ */
+ tss = NULL;
+ if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
+ tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
+ M_TEMP, M_NOWAIT | M_ZERO);
+ if (tss == NULL) {
+ INP_WUNLOCK(inp);
+ return (ENOBUFS);
+ }
+ }
+
+ /*
+ * Count number of sources in-mode at t0.
+ * If buffer space exists and remains, copy out source entries.
+ */
+ nsrcs = msfr.msfr_nsrcs;
+ ncsrcs = 0;
+ ptss = tss;
+ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
+ lims = (struct in_msource *)ims;
+ if (lims->imsl_st[0] == MCAST_UNDEFINED ||
+ lims->imsl_st[0] != imf->imf_st[0])
+ continue;
+ ++ncsrcs;
+ if (tss != NULL && nsrcs > 0) {
+ psin = (struct sockaddr_in *)ptss;
+ psin->sin_family = AF_INET;
+ psin->sin_len = sizeof(struct sockaddr_in);
+ psin->sin_addr.s_addr = htonl(lims->ims_haddr);
+ psin->sin_port = 0;
+ ++ptss;
+ --nsrcs;
+ }
+ }
+
+ INP_WUNLOCK(inp);
+
+ if (tss != NULL) {
+ error = copyout(tss, msfr.msfr_srcs,
+ sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
+ free(tss, M_TEMP);
+ if (error)
+ return (error);
+ }
+
+ msfr.msfr_nsrcs = ncsrcs;
+ error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
+
+ return (error);
+}
+
+/*
+ * Return the IP multicast options in response to user getsockopt().
+ */
+int
+inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
+{
+ struct ip_mreqn mreqn;
+ struct ip_moptions *imo;
+ struct ifnet *ifp;
+ struct in_ifaddr *ia;
+ int error, optval;
+ u_char coptval;
+
+ INP_WLOCK(inp);
+ imo = inp->inp_moptions;
+ /*
+	 * If socket is neither of type SOCK_RAW nor SOCK_DGRAM,
+ * or is a divert socket, reject it.
+ */
+ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
+ (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+ inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
+ INP_WUNLOCK(inp);
+ return (EOPNOTSUPP);
+ }
+
+ error = 0;
+ switch (sopt->sopt_name) {
+ case IP_MULTICAST_VIF:
+ if (imo != NULL)
+ optval = imo->imo_multicast_vif;
+ else
+ optval = -1;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof(int));
+ break;
+
+ case IP_MULTICAST_IF:
+ memset(&mreqn, 0, sizeof(struct ip_mreqn));
+ if (imo != NULL) {
+ ifp = imo->imo_multicast_ifp;
+ if (!in_nullhost(imo->imo_multicast_addr)) {
+ mreqn.imr_address = imo->imo_multicast_addr;
+ } else if (ifp != NULL) {
+ mreqn.imr_ifindex = ifp->if_index;
+ IFP_TO_IA(ifp, ia);
+ if (ia != NULL) {
+ mreqn.imr_address =
+ IA_SIN(ia)->sin_addr;
+ ifa_free(&ia->ia_ifa);
+ }
+ }
+ }
+ INP_WUNLOCK(inp);
+ if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
+ error = sooptcopyout(sopt, &mreqn,
+ sizeof(struct ip_mreqn));
+ } else {
+ error = sooptcopyout(sopt, &mreqn.imr_address,
+ sizeof(struct in_addr));
+ }
+ break;
+
+ case IP_MULTICAST_TTL:
+ if (imo == 0)
+ optval = coptval = IP_DEFAULT_MULTICAST_TTL;
+ else
+ optval = coptval = imo->imo_multicast_ttl;
+ INP_WUNLOCK(inp);
+ if (sopt->sopt_valsize == sizeof(u_char))
+ error = sooptcopyout(sopt, &coptval, sizeof(u_char));
+ else
+ error = sooptcopyout(sopt, &optval, sizeof(int));
+ break;
+
+ case IP_MULTICAST_LOOP:
+ if (imo == 0)
+ optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
+ else
+ optval = coptval = imo->imo_multicast_loop;
+ INP_WUNLOCK(inp);
+ if (sopt->sopt_valsize == sizeof(u_char))
+ error = sooptcopyout(sopt, &coptval, sizeof(u_char));
+ else
+ error = sooptcopyout(sopt, &optval, sizeof(int));
+ break;
+
+ case IP_MSFILTER:
+ if (imo == NULL) {
+ error = EADDRNOTAVAIL;
+ INP_WUNLOCK(inp);
+ } else {
+ error = inp_get_source_filters(inp, sopt);
+ }
+ break;
+
+ default:
+ INP_WUNLOCK(inp);
+ error = ENOPROTOOPT;
+ break;
+ }
+
+ INP_UNLOCK_ASSERT(inp);
+
+ return (error);
+}
+
+/*
+ * Look up the ifnet to use for a multicast group membership,
+ * given the IPv4 address of an interface, and the IPv4 group address.
+ *
+ * This routine exists to support legacy multicast applications
+ * which do not understand that multicast memberships are scoped to
+ * specific physical links in the networking stack, or which need
+ * to join link-scope groups before IPv4 addresses are configured.
+ *
+ * If inp is non-NULL, use this socket's current FIB number for any
+ * required FIB lookup.
+ * If ina is INADDR_ANY, look up the group address in the unicast FIB,
+ * and use its ifp; usually, this points to the default next-hop.
+ *
+ * If the FIB lookup fails, attempt to use the first non-loopback
+ * interface with multicast capability in the system as a
+ * last resort. The legacy IPv4 ASM API requires that we do
+ * this in order to allow groups to be joined when the routing
+ * table has not yet been populated during boot.
+ *
+ * Returns NULL if no ifp could be found.
+ *
+ * SMPng: TODO: Acquire the appropriate locks for INADDR_TO_IFP.
+ * FUTURE: Implement IPv4 source-address selection.
+ */
+static struct ifnet *
+inp_lookup_mcast_ifp(const struct inpcb *inp,
+ const struct sockaddr_in *gsin, const struct in_addr ina)
+{
+ struct ifnet *ifp;
+
+ KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__));
+ KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)),
+ ("%s: not multicast", __func__));
+
+ ifp = NULL;
+ if (!in_nullhost(ina)) {
+ INADDR_TO_IFP(ina, ifp);
+ } else {
+ struct route ro;
+
+ ro.ro_rt = NULL;
+ memcpy(&ro.ro_dst, gsin, sizeof(struct sockaddr_in));
+ in_rtalloc_ign(&ro, 0, inp ? inp->inp_inc.inc_fibnum : 0);
+ if (ro.ro_rt != NULL) {
+ ifp = ro.ro_rt->rt_ifp;
+ KASSERT(ifp != NULL, ("%s: null ifp", __func__));
+ RTFREE(ro.ro_rt);
+ } else {
+ struct in_ifaddr *ia;
+ struct ifnet *mifp;
+
+ mifp = NULL;
+ IN_IFADDR_RLOCK();
+ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ mifp = ia->ia_ifp;
+ if (!(mifp->if_flags & IFF_LOOPBACK) &&
+ (mifp->if_flags & IFF_MULTICAST)) {
+ ifp = mifp;
+ break;
+ }
+ }
+ IN_IFADDR_RUNLOCK();
+ }
+ }
+
+ return (ifp);
+}
+
+/*
+ * Join an IPv4 multicast group, possibly with a source.
+ */
+static int
+inp_join_group(struct inpcb *inp, struct sockopt *sopt)
+{
+ struct group_source_req gsr;
+ sockunion_t *gsa, *ssa;
+ struct ifnet *ifp;
+ struct in_mfilter *imf;
+ struct ip_moptions *imo;
+ struct in_multi *inm;
+ struct in_msource *lims;
+ size_t idx;
+ int error, is_new;
+
+ ifp = NULL;
+ imf = NULL;
+ error = 0;
+ is_new = 0;
+
+ memset(&gsr, 0, sizeof(struct group_source_req));
+ gsa = (sockunion_t *)&gsr.gsr_group;
+ gsa->ss.ss_family = AF_UNSPEC;
+ ssa = (sockunion_t *)&gsr.gsr_source;
+ ssa->ss.ss_family = AF_UNSPEC;
+
+ switch (sopt->sopt_name) {
+ case IP_ADD_MEMBERSHIP:
+ case IP_ADD_SOURCE_MEMBERSHIP: {
+ struct ip_mreq_source mreqs;
+
+ if (sopt->sopt_name == IP_ADD_MEMBERSHIP) {
+ error = sooptcopyin(sopt, &mreqs,
+ sizeof(struct ip_mreq),
+ sizeof(struct ip_mreq));
+ /*
+ * Do argument switcharoo from ip_mreq into
+ * ip_mreq_source to avoid using two instances.
+ */
+ mreqs.imr_interface = mreqs.imr_sourceaddr;
+ mreqs.imr_sourceaddr.s_addr = INADDR_ANY;
+ } else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) {
+ error = sooptcopyin(sopt, &mreqs,
+ sizeof(struct ip_mreq_source),
+ sizeof(struct ip_mreq_source));
+ }
+ if (error)
+ return (error);
+
+ gsa->sin.sin_family = AF_INET;
+ gsa->sin.sin_len = sizeof(struct sockaddr_in);
+ gsa->sin.sin_addr = mreqs.imr_multiaddr;
+
+ if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) {
+ ssa->sin.sin_family = AF_INET;
+ ssa->sin.sin_len = sizeof(struct sockaddr_in);
+ ssa->sin.sin_addr = mreqs.imr_sourceaddr;
+ }
+
+ if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
+ return (EINVAL);
+
+ ifp = inp_lookup_mcast_ifp(inp, &gsa->sin,
+ mreqs.imr_interface);
+ CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p",
+ __func__, inet_ntoa(mreqs.imr_interface), ifp);
+ break;
+ }
+
+ case MCAST_JOIN_GROUP:
+ case MCAST_JOIN_SOURCE_GROUP:
+ if (sopt->sopt_name == MCAST_JOIN_GROUP) {
+ error = sooptcopyin(sopt, &gsr,
+ sizeof(struct group_req),
+ sizeof(struct group_req));
+ } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
+ error = sooptcopyin(sopt, &gsr,
+ sizeof(struct group_source_req),
+ sizeof(struct group_source_req));
+ }
+ if (error)
+ return (error);
+
+ if (gsa->sin.sin_family != AF_INET ||
+ gsa->sin.sin_len != sizeof(struct sockaddr_in))
+ return (EINVAL);
+
+ /*
+ * Overwrite the port field if present, as the sockaddr
+ * being copied in may be matched with a binary comparison.
+ */
+ gsa->sin.sin_port = 0;
+ if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
+ if (ssa->sin.sin_family != AF_INET ||
+ ssa->sin.sin_len != sizeof(struct sockaddr_in))
+ return (EINVAL);
+ ssa->sin.sin_port = 0;
+ }
+
+ if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
+ return (EINVAL);
+
+ if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
+ return (EADDRNOTAVAIL);
+ ifp = ifnet_byindex(gsr.gsr_interface);
+ break;
+
+ default:
+ CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
+ __func__, sopt->sopt_name);
+ return (EOPNOTSUPP);
+ break;
+ }
+
+ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
+ return (EADDRNOTAVAIL);
+
+ imo = inp_findmoptions(inp);
+ idx = imo_match_group(imo, ifp, &gsa->sa);
+ if (idx == -1) {
+ is_new = 1;
+ } else {
+ inm = imo->imo_membership[idx];
+ imf = &imo->imo_mfilters[idx];
+ if (ssa->ss.ss_family != AF_UNSPEC) {
+ /*
+ * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
+ * is an error. On an existing inclusive membership,
+ * it just adds the source to the filter list.
+ */
+ if (imf->imf_st[1] != MCAST_INCLUDE) {
+ error = EINVAL;
+ goto out_inp_locked;
+ }
+ /* Throw out duplicates. */
+ lims = imo_match_source(imo, idx, &ssa->sa);
+ if (lims != NULL) {
+ error = EADDRNOTAVAIL;
+ goto out_inp_locked;
+ }
+ } else {
+ /*
+ * MCAST_JOIN_GROUP on an existing inclusive
+ * membership is an error; if you want to change
+ * filter mode, you must use the userland API
+ * setsourcefilter().
+ */
+ if (imf->imf_st[1] == MCAST_INCLUDE) {
+ error = EINVAL;
+ goto out_inp_locked;
+ }
+ /*
+ * MCAST_JOIN_GROUP on an existing exclusive
+ * membership is an error; return EADDRINUSE
+ * to preserve 4.4BSD API idempotence, and
+ * avoid tedious detour to code below.
+ * NOTE: This is bending RFC 3678 a bit.
+ */
+ if (imf->imf_st[1] == MCAST_EXCLUDE) {
+ error = EADDRINUSE;
+ goto out_inp_locked;
+ }
+ }
+ }
+
+ /*
+ * Begin state merge transaction at socket layer.
+ */
+ INP_WLOCK_ASSERT(inp);
+
+ if (is_new) {
+ if (imo->imo_num_memberships == imo->imo_max_memberships) {
+ error = imo_grow(imo);
+ if (error)
+ goto out_inp_locked;
+ }
+ /*
+ * Allocate the new slot upfront so we can deal with
+ * grafting the new source filter in same code path
+ * as for join-source on existing membership.
+ */
+ idx = imo->imo_num_memberships;
+ imo->imo_membership[idx] = NULL;
+ imo->imo_num_memberships++;
+ KASSERT(imo->imo_mfilters != NULL,
+ ("%s: imf_mfilters vector was not allocated", __func__));
+ imf = &imo->imo_mfilters[idx];
+ KASSERT(RB_EMPTY(&imf->imf_sources),
+ ("%s: imf_sources not empty", __func__));
+ }
+
+ /*
+ * Graft new source into filter list for this inpcb's
+ * membership of the group. The in_multi may not have
+ * been allocated yet if this is a new membership, however,
+ * the in_mfilter slot will be allocated and must be initialized.
+ */
+ if (ssa->ss.ss_family != AF_UNSPEC) {
+ /* Membership starts in IN mode */
+ if (is_new) {
+ CTR1(KTR_IGMPV3, "%s: new join w/source", __func__);
+ imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE);
+ } else {
+ CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
+ }
+ lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin);
+ if (lims == NULL) {
+ CTR1(KTR_IGMPV3, "%s: merge imf state failed",
+ __func__);
+ error = ENOMEM;
+ goto out_imo_free;
+ }
+ } else {
+ /* No address specified; Membership starts in EX mode */
+ if (is_new) {
+ CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__);
+ imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE);
+ }
+ }
+
+ /*
+ * Begin state merge transaction at IGMP layer.
+ */
+ IN_MULTI_LOCK();
+
+ if (is_new) {
+ error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf,
+ &inm);
+ if (error)
+ goto out_imo_free;
+ imo->imo_membership[idx] = inm;
+ } else {
+ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
+ error = inm_merge(inm, imf);
+ if (error) {
+ CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
+ __func__);
+ goto out_imf_rollback;
+ }
+ CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
+ error = igmp_change_state(inm);
+ if (error) {
+ CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
+ __func__);
+ goto out_imf_rollback;
+ }
+ }
+
+ IN_MULTI_UNLOCK();
+
+out_imf_rollback:
+ INP_WLOCK_ASSERT(inp);
+ if (error) {
+ imf_rollback(imf);
+ if (is_new)
+ imf_purge(imf);
+ else
+ imf_reap(imf);
+ } else {
+ imf_commit(imf);
+ }
+
+out_imo_free:
+ if (error && is_new) {
+ imo->imo_membership[idx] = NULL;
+ --imo->imo_num_memberships;
+ }
+
+out_inp_locked:
+ INP_WUNLOCK(inp);
+ return (error);
+}
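+
+/*
+ * Illustrative userland sketch (not part of the original code): joining an
+ * IPv4 group through the RFC 3678 MCAST_JOIN_GROUP option handled above.
+ * The interface name "em0" and the group 239.1.1.1 are placeholders.
+ */
+#if 0
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <string.h>
+
+static int
+join_group_example(int s)
+{
+	struct group_req gr;
+	struct sockaddr_in *sin;
+
+	memset(&gr, 0, sizeof(gr));
+	gr.gr_interface = if_nametoindex("em0");
+	sin = (struct sockaddr_in *)&gr.gr_group;
+	sin->sin_family = AF_INET;
+	sin->sin_len = sizeof(*sin);
+	sin->sin_addr.s_addr = inet_addr("239.1.1.1");
+
+	/* EADDRINUSE here means the exclusive membership already exists. */
+	return (setsockopt(s, IPPROTO_IP, MCAST_JOIN_GROUP, &gr, sizeof(gr)));
+}
+#endif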
+
+/*
+ * Leave an IPv4 multicast group on an inpcb, possibly with a source.
+ */
+static int
+inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
+{
+ struct group_source_req gsr;
+ struct ip_mreq_source mreqs;
+ sockunion_t *gsa, *ssa;
+ struct ifnet *ifp;
+ struct in_mfilter *imf;
+ struct ip_moptions *imo;
+ struct in_msource *ims;
+ struct in_multi *inm;
+ size_t idx;
+ int error, is_final;
+
+ ifp = NULL;
+ error = 0;
+ is_final = 1;
+
+ memset(&gsr, 0, sizeof(struct group_source_req));
+ gsa = (sockunion_t *)&gsr.gsr_group;
+ gsa->ss.ss_family = AF_UNSPEC;
+ ssa = (sockunion_t *)&gsr.gsr_source;
+ ssa->ss.ss_family = AF_UNSPEC;
+
+ switch (sopt->sopt_name) {
+ case IP_DROP_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ if (sopt->sopt_name == IP_DROP_MEMBERSHIP) {
+ error = sooptcopyin(sopt, &mreqs,
+ sizeof(struct ip_mreq),
+ sizeof(struct ip_mreq));
+ /*
+ * Swap interface and sourceaddr arguments,
+ * as ip_mreq and ip_mreq_source are laid
+ * out differently.
+ */
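+			/*
+			 * (struct ip_mreq is { imr_multiaddr, imr_interface };
+			 * struct ip_mreq_source is { imr_multiaddr,
+			 * imr_sourceaddr, imr_interface }, so the second
+			 * in_addr of the ip_mreq has landed in
+			 * imr_sourceaddr here.)
+			 */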
+ mreqs.imr_interface = mreqs.imr_sourceaddr;
+ mreqs.imr_sourceaddr.s_addr = INADDR_ANY;
+ } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
+ error = sooptcopyin(sopt, &mreqs,
+ sizeof(struct ip_mreq_source),
+ sizeof(struct ip_mreq_source));
+ }
+ if (error)
+ return (error);
+
+ gsa->sin.sin_family = AF_INET;
+ gsa->sin.sin_len = sizeof(struct sockaddr_in);
+ gsa->sin.sin_addr = mreqs.imr_multiaddr;
+
+ if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
+ ssa->sin.sin_family = AF_INET;
+ ssa->sin.sin_len = sizeof(struct sockaddr_in);
+ ssa->sin.sin_addr = mreqs.imr_sourceaddr;
+ }
+
+ /*
+ * Attempt to look up hinted ifp from interface address.
+ * Fallthrough with null ifp iff lookup fails, to
+ * preserve 4.4BSD mcast API idempotence.
+ * XXX NOTE WELL: The RFC 3678 API is preferred because
+ * using an IPv4 address as a key is racy.
+ */
+ if (!in_nullhost(mreqs.imr_interface))
+ INADDR_TO_IFP(mreqs.imr_interface, ifp);
+
+ CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p",
+ __func__, inet_ntoa(mreqs.imr_interface), ifp);
+
+ break;
+
+ case MCAST_LEAVE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
+ error = sooptcopyin(sopt, &gsr,
+ sizeof(struct group_req),
+ sizeof(struct group_req));
+ } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
+ error = sooptcopyin(sopt, &gsr,
+ sizeof(struct group_source_req),
+ sizeof(struct group_source_req));
+ }
+ if (error)
+ return (error);
+
+ if (gsa->sin.sin_family != AF_INET ||
+ gsa->sin.sin_len != sizeof(struct sockaddr_in))
+ return (EINVAL);
+
+ if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
+ if (ssa->sin.sin_family != AF_INET ||
+ ssa->sin.sin_len != sizeof(struct sockaddr_in))
+ return (EINVAL);
+ }
+
+ if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
+ return (EADDRNOTAVAIL);
+
+ ifp = ifnet_byindex(gsr.gsr_interface);
+
+ if (ifp == NULL)
+ return (EADDRNOTAVAIL);
+ break;
+
+ default:
+ CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
+ __func__, sopt->sopt_name);
+ return (EOPNOTSUPP);
+ break;
+ }
+
+ if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
+ return (EINVAL);
+
+ /*
+ * Find the membership in the membership array.
+ */
+ imo = inp_findmoptions(inp);
+ idx = imo_match_group(imo, ifp, &gsa->sa);
+ if (idx == -1) {
+ error = EADDRNOTAVAIL;
+ goto out_inp_locked;
+ }
+ inm = imo->imo_membership[idx];
+ imf = &imo->imo_mfilters[idx];
+
+ if (ssa->ss.ss_family != AF_UNSPEC)
+ is_final = 0;
+
+ /*
+ * Begin state merge transaction at socket layer.
+ */
+ INP_WLOCK_ASSERT(inp);
+
+ /*
+ * If we were instructed only to leave a given source, do so.
+ * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
+ */
+ if (is_final) {
+ imf_leave(imf);
+ } else {
+ if (imf->imf_st[0] == MCAST_EXCLUDE) {
+ error = EADDRNOTAVAIL;
+ goto out_inp_locked;
+ }
+ ims = imo_match_source(imo, idx, &ssa->sa);
+ if (ims == NULL) {
+ CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__,
+ inet_ntoa(ssa->sin.sin_addr), "not ");
+ error = EADDRNOTAVAIL;
+ goto out_inp_locked;
+ }
+ CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
+ error = imf_prune(imf, &ssa->sin);
+ if (error) {
+ CTR1(KTR_IGMPV3, "%s: merge imf state failed",
+ __func__);
+ goto out_inp_locked;
+ }
+ }
+
+ /*
+ * Begin state merge transaction at IGMP layer.
+ */
+ IN_MULTI_LOCK();
+
+ if (is_final) {
+ /*
+ * Give up the multicast address record to which
+ * the membership points.
+ */
+ (void)in_leavegroup_locked(inm, imf);
+ } else {
+ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
+ error = inm_merge(inm, imf);
+ if (error) {
+ CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
+ __func__);
+ goto out_imf_rollback;
+ }
+
+ CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
+ error = igmp_change_state(inm);
+ if (error) {
+ CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
+ __func__);
+ }
+ }
+
+ IN_MULTI_UNLOCK();
+
+out_imf_rollback:
+ if (error)
+ imf_rollback(imf);
+ else
+ imf_commit(imf);
+
+ imf_reap(imf);
+
+ if (is_final) {
+ /* Remove the gap in the membership and filter array. */
+ for (++idx; idx < imo->imo_num_memberships; ++idx) {
+ imo->imo_membership[idx-1] = imo->imo_membership[idx];
+ imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx];
+ }
+ imo->imo_num_memberships--;
+ }
+
+out_inp_locked:
+ INP_WUNLOCK(inp);
+ return (error);
+}
+
+/*
+ * Select the interface for transmitting IPv4 multicast datagrams.
+ *
+ * Either an instance of struct in_addr or an instance of struct ip_mreqn
+ * may be passed to this socket option. An address of INADDR_ANY or an
+ * interface index of 0 is used to remove a previous selection.
+ * When no interface is selected, one is chosen for every send.
+ */
+static int
+inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
+{
+ struct in_addr addr;
+ struct ip_mreqn mreqn;
+ struct ifnet *ifp;
+ struct ip_moptions *imo;
+ int error;
+
+ if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
+ /*
+ * An interface index was specified using the
+ * Linux-derived ip_mreqn structure.
+ */
+ error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn),
+ sizeof(struct ip_mreqn));
+ if (error)
+ return (error);
+
+ if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex)
+ return (EINVAL);
+
+ if (mreqn.imr_ifindex == 0) {
+ ifp = NULL;
+ } else {
+ ifp = ifnet_byindex(mreqn.imr_ifindex);
+ if (ifp == NULL)
+ return (EADDRNOTAVAIL);
+ }
+ } else {
+ /*
+ * An interface was specified by IPv4 address.
+ * This is the traditional BSD usage.
+ */
+ error = sooptcopyin(sopt, &addr, sizeof(struct in_addr),
+ sizeof(struct in_addr));
+ if (error)
+ return (error);
+ if (in_nullhost(addr)) {
+ ifp = NULL;
+ } else {
+ INADDR_TO_IFP(addr, ifp);
+ if (ifp == NULL)
+ return (EADDRNOTAVAIL);
+ }
+ CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = %s", __func__, ifp,
+ inet_ntoa(addr));
+ }
+
+ /* Reject interfaces which do not support multicast. */
+ if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0)
+ return (EOPNOTSUPP);
+
+ imo = inp_findmoptions(inp);
+ imo->imo_multicast_ifp = ifp;
+ imo->imo_multicast_addr.s_addr = INADDR_ANY;
+ INP_WUNLOCK(inp);
+
+ return (0);
+}
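+
+/*
+ * Illustrative sketch (not part of the original code): the two payload
+ * shapes accepted by IP_MULTICAST_IF above. The interface name "em0" and
+ * the address 192.0.2.1 are placeholders.
+ */
+#if 0
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <string.h>
+
+static int
+select_multicast_if_example(int s)
+{
+	struct ip_mreqn mreqn;
+	struct in_addr addr;
+
+	/* Preferred form: select the interface by index via ip_mreqn. */
+	memset(&mreqn, 0, sizeof(mreqn));
+	mreqn.imr_ifindex = if_nametoindex("em0");
+	if (setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &mreqn,
+	    sizeof(mreqn)) == 0)
+		return (0);
+
+	/* Traditional BSD form: select the interface by local address. */
+	addr.s_addr = inet_addr("192.0.2.1");
+	return (setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &addr,
+	    sizeof(addr)));
+}
+#endif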
+
+/*
+ * Atomically set source filters on a socket for an IPv4 multicast group.
+ *
+ * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
+ */
+static int
+inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
+{
+ struct __msfilterreq msfr;
+ sockunion_t *gsa;
+ struct ifnet *ifp;
+ struct in_mfilter *imf;
+ struct ip_moptions *imo;
+ struct in_multi *inm;
+ size_t idx;
+ int error;
+
+ error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
+ sizeof(struct __msfilterreq));
+ if (error)
+ return (error);
+
+ if (msfr.msfr_nsrcs > in_mcast_maxsocksrc ||
+ (msfr.msfr_fmode != MCAST_EXCLUDE &&
+ msfr.msfr_fmode != MCAST_INCLUDE))
+ return (EINVAL);
+
+ if (msfr.msfr_group.ss_family != AF_INET ||
+ msfr.msfr_group.ss_len != sizeof(struct sockaddr_in))
+ return (EINVAL);
+
+ gsa = (sockunion_t *)&msfr.msfr_group;
+ if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
+ return (EINVAL);
+
+ gsa->sin.sin_port = 0; /* ignore port */
+
+ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
+ return (EADDRNOTAVAIL);
+
+ ifp = ifnet_byindex(msfr.msfr_ifindex);
+ if (ifp == NULL)
+ return (EADDRNOTAVAIL);
+
+ /*
+ * Take the INP write lock.
+ * Check if this socket is a member of this group.
+ */
+ imo = inp_findmoptions(inp);
+ idx = imo_match_group(imo, ifp, &gsa->sa);
+ if (idx == -1 || imo->imo_mfilters == NULL) {
+ error = EADDRNOTAVAIL;
+ goto out_inp_locked;
+ }
+ inm = imo->imo_membership[idx];
+ imf = &imo->imo_mfilters[idx];
+
+ /*
+ * Begin state merge transaction at socket layer.
+ */
+ INP_WLOCK_ASSERT(inp);
+
+ imf->imf_st[1] = msfr.msfr_fmode;
+
+ /*
+ * Apply any new source filters, if present.
+ * Make a copy of the user-space source vector so
+ * that we may copy them with a single copyin. This
+ * allows us to deal with page faults up-front.
+ */
+ if (msfr.msfr_nsrcs > 0) {
+ struct in_msource *lims;
+ struct sockaddr_in *psin;
+ struct sockaddr_storage *kss, *pkss;
+ int i;
+
+ INP_WUNLOCK(inp);
+
+ CTR2(KTR_IGMPV3, "%s: loading %lu source list entries",
+ __func__, (unsigned long)msfr.msfr_nsrcs);
+ kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
+ M_TEMP, M_WAITOK);
+ error = copyin(msfr.msfr_srcs, kss,
+ sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
+ if (error) {
+ free(kss, M_TEMP);
+ return (error);
+ }
+
+ INP_WLOCK(inp);
+
+ /*
+ * Mark all source filters as UNDEFINED at t1.
+ * Restore new group filter mode, as imf_leave()
+ * will set it to INCLUDE.
+ */
+ imf_leave(imf);
+ imf->imf_st[1] = msfr.msfr_fmode;
+
+ /*
+ * Update socket layer filters at t1, lazy-allocating
+ * new entries. This saves a bunch of memory at the
+ * cost of one RB_FIND() per source entry; duplicate
+ * entries in the msfr_nsrcs vector are ignored.
+ * If we encounter an error, rollback transaction.
+ *
+ * XXX This too could be replaced with a set-symmetric
+ * difference like loop to avoid walking from root
+ * every time, as the key space is common.
+ */
+ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
+ psin = (struct sockaddr_in *)pkss;
+ if (psin->sin_family != AF_INET) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+ if (psin->sin_len != sizeof(struct sockaddr_in)) {
+ error = EINVAL;
+ break;
+ }
+ error = imf_get_source(imf, psin, &lims);
+ if (error)
+ break;
+ lims->imsl_st[1] = imf->imf_st[1];
+ }
+ free(kss, M_TEMP);
+ }
+
+ if (error)
+ goto out_imf_rollback;
+
+ INP_WLOCK_ASSERT(inp);
+ IN_MULTI_LOCK();
+
+ /*
+ * Begin state merge transaction at IGMP layer.
+ */
+ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
+ error = inm_merge(inm, imf);
+ if (error) {
+ CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
+ goto out_imf_rollback;
+ }
+
+ CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
+ error = igmp_change_state(inm);
+ if (error)
+ CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
+
+ IN_MULTI_UNLOCK();
+
+out_imf_rollback:
+ if (error)
+ imf_rollback(imf);
+ else
+ imf_commit(imf);
+
+ imf_reap(imf);
+
+out_inp_locked:
+ INP_WUNLOCK(inp);
+ return (error);
+}
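+
+/*
+ * Illustrative userland sketch (not part of the original code): the
+ * setsourcefilter(3) wrapper builds the struct __msfilterreq consumed
+ * above. The interface "em0", group 239.1.1.1 and source 192.0.2.10 are
+ * placeholders.
+ */
+#if 0
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <string.h>
+
+static int
+set_filter_example(int s)
+{
+	struct sockaddr_in grp;
+	struct sockaddr_storage src;
+	struct sockaddr_in *psin;
+
+	memset(&grp, 0, sizeof(grp));
+	grp.sin_family = AF_INET;
+	grp.sin_len = sizeof(grp);
+	grp.sin_addr.s_addr = inet_addr("239.1.1.1");
+
+	memset(&src, 0, sizeof(src));
+	psin = (struct sockaddr_in *)&src;
+	psin->sin_family = AF_INET;
+	psin->sin_len = sizeof(*psin);
+	psin->sin_addr.s_addr = inet_addr("192.0.2.10");
+
+	/* Accept only the listed source on the given group and interface. */
+	return (setsourcefilter(s, if_nametoindex("em0"),
+	    (struct sockaddr *)&grp, sizeof(grp), MCAST_INCLUDE, 1, &src));
+}
+#endif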
+
+/*
+ * Set the IP multicast options in response to user setsockopt().
+ *
+ * Many of the socket options handled in this function duplicate the
+ * functionality of socket options in the regular unicast API. However,
+ * it is not possible to merge the duplicate code, because the idempotence
+ * of the IPv4 multicast part of the BSD Sockets API must be preserved;
+ * the effects of these options must be treated as separate and distinct.
+ *
+ * SMPng: XXX: Unlocked read of inp_socket believed OK.
+ * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING
+ * is refactored to no longer use vifs.
+ */
+int
+inp_setmoptions(struct inpcb *inp, struct sockopt *sopt)
+{
+ struct ip_moptions *imo;
+ int error;
+
+ error = 0;
+
+ /*
+	 * If the socket is neither of type SOCK_RAW nor SOCK_DGRAM,
+ * or is a divert socket, reject it.
+ */
+ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
+ (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+ inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
+ return (EOPNOTSUPP);
+
+ switch (sopt->sopt_name) {
+ case IP_MULTICAST_VIF: {
+ int vifi;
+ /*
+ * Select a multicast VIF for transmission.
+ * Only useful if multicast forwarding is active.
+ */
+ if (legal_vif_num == NULL) {
+ error = EOPNOTSUPP;
+ break;
+ }
+ error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int));
+ if (error)
+ break;
+ if (!legal_vif_num(vifi) && (vifi != -1)) {
+ error = EINVAL;
+ break;
+ }
+ imo = inp_findmoptions(inp);
+ imo->imo_multicast_vif = vifi;
+ INP_WUNLOCK(inp);
+ break;
+ }
+
+ case IP_MULTICAST_IF:
+ error = inp_set_multicast_if(inp, sopt);
+ break;
+
+ case IP_MULTICAST_TTL: {
+ u_char ttl;
+
+ /*
+ * Set the IP time-to-live for outgoing multicast packets.
+ * The original multicast API required a char argument,
+ * which is inconsistent with the rest of the socket API.
+ * We allow either a char or an int.
+ */
+ if (sopt->sopt_valsize == sizeof(u_char)) {
+ error = sooptcopyin(sopt, &ttl, sizeof(u_char),
+ sizeof(u_char));
+ if (error)
+ break;
+ } else {
+ u_int ittl;
+
+ error = sooptcopyin(sopt, &ittl, sizeof(u_int),
+ sizeof(u_int));
+ if (error)
+ break;
+ if (ittl > 255) {
+ error = EINVAL;
+ break;
+ }
+ ttl = (u_char)ittl;
+ }
+ imo = inp_findmoptions(inp);
+ imo->imo_multicast_ttl = ttl;
+ INP_WUNLOCK(inp);
+ break;
+ }
+
+ case IP_MULTICAST_LOOP: {
+ u_char loop;
+
+ /*
+ * Set the loopback flag for outgoing multicast packets.
+ * Must be zero or one. The original multicast API required a
+ * char argument, which is inconsistent with the rest
+ * of the socket API. We allow either a char or an int.
+ */
+ if (sopt->sopt_valsize == sizeof(u_char)) {
+ error = sooptcopyin(sopt, &loop, sizeof(u_char),
+ sizeof(u_char));
+ if (error)
+ break;
+ } else {
+ u_int iloop;
+
+ error = sooptcopyin(sopt, &iloop, sizeof(u_int),
+ sizeof(u_int));
+ if (error)
+ break;
+ loop = (u_char)iloop;
+ }
+ imo = inp_findmoptions(inp);
+ imo->imo_multicast_loop = !!loop;
+ INP_WUNLOCK(inp);
+ break;
+ }
+
+ case IP_ADD_MEMBERSHIP:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case MCAST_JOIN_GROUP:
+ case MCAST_JOIN_SOURCE_GROUP:
+ error = inp_join_group(inp, sopt);
+ break;
+
+ case IP_DROP_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ case MCAST_LEAVE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ error = inp_leave_group(inp, sopt);
+ break;
+
+ case IP_BLOCK_SOURCE:
+ case IP_UNBLOCK_SOURCE:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ error = inp_block_unblock_source(inp, sopt);
+ break;
+
+ case IP_MSFILTER:
+ error = inp_set_source_filters(inp, sopt);
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ INP_UNLOCK_ASSERT(inp);
+
+ return (error);
+}
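+
+/*
+ * Illustrative sketch (not part of the original code): IP_MULTICAST_TTL
+ * accepts either payload width, as noted in the option handler above.
+ */
+#if 0
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+static int
+set_mcast_ttl_example(int s)
+{
+	u_char ttl8 = 32;
+	u_int ttl32 = 32;
+
+	/* The original char-sized form... */
+	if (setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ttl8,
+	    sizeof(ttl8)) == 0)
+		return (0);
+	/* ...and the int-sized form are both accepted. */
+	return (setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ttl32,
+	    sizeof(ttl32)));
+}
+#endif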
+
+/*
+ * Expose IGMP's multicast filter mode and source list(s) to userland,
+ * keyed by (ifindex, group).
+ * The filter mode is written out as a uint32_t, followed by
+ * 0..n of struct in_addr.
+ * For use by ifmcstat(8).
+ * SMPng: NOTE: unlocked read of ifindex space.
+ */
+static int
+sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS)
+{
+ struct in_addr src, group;
+ struct ifnet *ifp;
+ struct ifmultiaddr *ifma;
+ struct in_multi *inm;
+ struct ip_msource *ims;
+ int *name;
+ int retval;
+ u_int namelen;
+ uint32_t fmode, ifindex;
+
+ name = (int *)arg1;
+ namelen = arg2;
+
+ if (req->newptr != NULL)
+ return (EPERM);
+
+ if (namelen != 2)
+ return (EINVAL);
+
+ ifindex = name[0];
+ if (ifindex <= 0 || ifindex > V_if_index) {
+ CTR2(KTR_IGMPV3, "%s: ifindex %u out of range",
+ __func__, ifindex);
+ return (ENOENT);
+ }
+
+ group.s_addr = name[1];
+ if (!IN_MULTICAST(ntohl(group.s_addr))) {
+ CTR2(KTR_IGMPV3, "%s: group %s is not multicast",
+ __func__, inet_ntoa(group));
+ return (EINVAL);
+ }
+
+ ifp = ifnet_byindex(ifindex);
+ if (ifp == NULL) {
+ CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u",
+ __func__, ifindex);
+ return (ENOENT);
+ }
+
+ retval = sysctl_wire_old_buffer(req,
+ sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr)));
+ if (retval)
+ return (retval);
+
+ IN_MULTI_LOCK();
+
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ if (ifma->ifma_addr->sa_family != AF_INET ||
+ ifma->ifma_protospec == NULL)
+ continue;
+ inm = (struct in_multi *)ifma->ifma_protospec;
+ if (!in_hosteq(inm->inm_addr, group))
+ continue;
+ fmode = inm->inm_st[1].iss_fmode;
+ retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
+ if (retval != 0)
+ break;
+ RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
+#ifdef KTR
+ struct in_addr ina;
+ ina.s_addr = htonl(ims->ims_haddr);
+ CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
+ inet_ntoa(ina));
+#endif
+ /*
+ * Only copy-out sources which are in-mode.
+ */
+ if (fmode != ims_get_mode(inm, ims, 1)) {
+ CTR1(KTR_IGMPV3, "%s: skip non-in-mode",
+ __func__);
+ continue;
+ }
+ src.s_addr = htonl(ims->ims_haddr);
+ retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr));
+ if (retval != 0)
+ break;
+ }
+ }
+ IF_ADDR_UNLOCK(ifp);
+
+ IN_MULTI_UNLOCK();
+
+ return (retval);
+}
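+
+/*
+ * Illustrative sketch (not part of the original code): decoding the buffer
+ * produced above as a consumer such as ifmcstat(8) would, i.e. a uint32_t
+ * filter mode followed by the in-mode sources. "buf" and "len" are assumed
+ * to hold the raw sysctl output for one (ifindex, group) pair.
+ */
+#if 0
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+static void
+decode_filters_example(const char *buf, size_t len)
+{
+	uint32_t fmode;
+	struct in_addr src;
+	size_t off;
+
+	if (len < sizeof(fmode))
+		return;
+	memcpy(&fmode, buf, sizeof(fmode));
+	printf("mode %s\n", fmode == MCAST_EXCLUDE ? "exclude" : "include");
+	for (off = sizeof(fmode); off + sizeof(src) <= len;
+	    off += sizeof(src)) {
+		memcpy(&src, buf + off, sizeof(src));
+		printf("  source %s\n", inet_ntoa(src));
+	}
+}
+#endif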
+
+#ifdef KTR
+
+static const char *inm_modestrs[] = { "un", "in", "ex" };
+
+static const char *
+inm_mode_str(const int mode)
+{
+
+ if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
+ return (inm_modestrs[mode]);
+ return ("??");
+}
+
+static const char *inm_statestrs[] = {
+ "not-member",
+ "silent",
+ "idle",
+ "lazy",
+ "sleeping",
+ "awakening",
+ "query-pending",
+ "sg-query-pending",
+ "leaving"
+};
+
+static const char *
+inm_state_str(const int state)
+{
+
+ if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER)
+ return (inm_statestrs[state]);
+ return ("??");
+}
+
+/*
+ * Dump an in_multi structure to the console.
+ */
+void
+inm_print(const struct in_multi *inm)
+{
+ int t;
+
+ if ((ktr_mask & KTR_IGMPV3) == 0)
+ return;
+
+ printf("%s: --- begin inm %p ---\n", __func__, inm);
+ printf("addr %s ifp %p(%s) ifma %p\n",
+ inet_ntoa(inm->inm_addr),
+ inm->inm_ifp,
+ inm->inm_ifp->if_xname,
+ inm->inm_ifma);
+ printf("timer %u state %s refcount %u scq.len %u\n",
+ inm->inm_timer,
+ inm_state_str(inm->inm_state),
+ inm->inm_refcount,
+ inm->inm_scq.ifq_len);
+ printf("igi %p nsrc %lu sctimer %u scrv %u\n",
+ inm->inm_igi,
+ inm->inm_nsrc,
+ inm->inm_sctimer,
+ inm->inm_scrv);
+ for (t = 0; t < 2; t++) {
+ printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t,
+ inm_mode_str(inm->inm_st[t].iss_fmode),
+ inm->inm_st[t].iss_asm,
+ inm->inm_st[t].iss_ex,
+ inm->inm_st[t].iss_in,
+ inm->inm_st[t].iss_rec);
+ }
+ printf("%s: --- end inm %p ---\n", __func__, inm);
+}
+
+#else /* !KTR */
+
+void
+inm_print(const struct in_multi *inm)
+{
+
+}
+
+#endif /* KTR */
+
+RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp);
diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c
new file mode 100644
index 00000000..186a0f0a
--- /dev/null
+++ b/freebsd/sys/netinet/in_pcb.c
@@ -0,0 +1,1958 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993, 1995
+ * The Regents of the University of California.
+ * Copyright (c) 2007-2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_ddb.h>
+#include <freebsd/local/opt_ipsec.h>
+#include <freebsd/local/opt_inet6.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/domain.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/jail.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/sysctl.h>
+
+#ifdef DDB
+#include <freebsd/ddb/ddb.h>
+#endif
+
+#include <freebsd/vm/uma.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_types.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/udp.h>
+#include <freebsd/netinet/udp_var.h>
+#ifdef INET6
+#include <freebsd/netinet/ip6.h>
+#include <freebsd/netinet6/ip6_var.h>
+#endif /* INET6 */
+
+
+#ifdef IPSEC
+#include <freebsd/netipsec/ipsec.h>
+#include <freebsd/netipsec/key.h>
+#endif /* IPSEC */
+
+#include <freebsd/security/mac/mac_framework.h>
+
+/*
+ * These configure the range of local port addresses assigned to
+ * "unspecified" outgoing connections/packets/whatever.
+ */
+VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */
+VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */
+VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */
+VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */
+VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */
+VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */
+
+/*
+ * Reserved ports accessible only to root. There are significant
+ * security considerations that must be accounted for when changing these,
+ * but the security benefits can be great. Please be careful.
+ */
+VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */
+VNET_DEFINE(int, ipport_reservedlow);
+
+/* Variables dealing with random ephemeral port allocation. */
+VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */
+VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */
+VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */
+VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */
+VNET_DEFINE(int, ipport_tcpallocs);
+static VNET_DEFINE(int, ipport_tcplastcount);
+
+#define V_ipport_tcplastcount VNET(ipport_tcplastcount)
+
+#define RANGECHK(var, min, max) \
+ if ((var) < (min)) { (var) = (min); } \
+ else if ((var) > (max)) { (var) = (max); }
+
+static void in_pcbremlists(struct inpcb *inp);
+
+static int
+sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+#ifdef VIMAGE
+ error = vnet_sysctl_handle_int(oidp, arg1, arg2, req);
+#else
+ error = sysctl_handle_int(oidp, arg1, arg2, req);
+#endif
+ if (error == 0) {
+ RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
+ RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
+ RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
+ RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
+ RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
+ RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
+ }
+ return (error);
+}
+
+#undef RANGECHK
+
+SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
+
+SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
+ CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0,
+ &sysctl_net_ipport_check, "I", "");
+SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
+ CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0,
+ &sysctl_net_ipport_check, "I", "");
+SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, first,
+ CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0,
+ &sysctl_net_ipport_check, "I", "");
+SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, last,
+ CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0,
+ &sysctl_net_ipport_check, "I", "");
+SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
+ CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0,
+ &sysctl_net_ipport_check, "I", "");
+SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
+ CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0,
+ &sysctl_net_ipport_check, "I", "");
+SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
+ CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, "");
+SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
+ CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
+SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW,
+ &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
+SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW,
+ &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
+	"allocations before switching to a sequential one");
+SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
+ &VNET_NAME(ipport_randomtime), 0,
+	"Minimum time to keep sequential port "
+ "allocation before switching to a random one");
+
+/*
+ * in_pcb.c: manage the Protocol Control Blocks.
+ *
+ * NOTE: It is assumed that most of these functions will be called with
+ * the pcbinfo lock held, and often, the inpcb lock held, as these utility
+ * functions often modify hash chains or addresses in pcbs.
+ */
+
+/*
+ * Allocate a PCB and associate it with the socket.
+ * On success return with the PCB locked.
+ */
+int
+in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
+{
+ struct inpcb *inp;
+ int error;
+
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ error = 0;
+ inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
+ if (inp == NULL)
+ return (ENOBUFS);
+ bzero(inp, inp_zero_size);
+ inp->inp_pcbinfo = pcbinfo;
+ inp->inp_socket = so;
+ inp->inp_cred = crhold(so->so_cred);
+ inp->inp_inc.inc_fibnum = so->so_fibnum;
+#ifdef MAC
+ error = mac_inpcb_init(inp, M_NOWAIT);
+ if (error != 0)
+ goto out;
+ mac_inpcb_create(so, inp);
+#endif
+#ifdef IPSEC
+ error = ipsec_init_policy(so, &inp->inp_sp);
+ if (error != 0) {
+#ifdef MAC
+ mac_inpcb_destroy(inp);
+#endif
+ goto out;
+ }
+#endif /*IPSEC*/
+#ifdef INET6
+ if (INP_SOCKAF(so) == AF_INET6) {
+ inp->inp_vflag |= INP_IPV6PROTO;
+ if (V_ip6_v6only)
+ inp->inp_flags |= IN6P_IPV6_V6ONLY;
+ }
+#endif
+ LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
+ pcbinfo->ipi_count++;
+ so->so_pcb = (caddr_t)inp;
+#ifdef INET6
+ if (V_ip6_auto_flowlabel)
+ inp->inp_flags |= IN6P_AUTOFLOWLABEL;
+#endif
+ INP_WLOCK(inp);
+ inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+ inp->inp_refcount = 1; /* Reference from the inpcbinfo */
+#if defined(IPSEC) || defined(MAC)
+out:
+ if (error != 0) {
+ crfree(inp->inp_cred);
+ uma_zfree(pcbinfo->ipi_zone, inp);
+ }
+#endif
+ return (error);
+}
+
+int
+in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
+{
+ int anonport, error;
+
+ INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
+ return (EINVAL);
+ anonport = inp->inp_lport == 0 && (nam == NULL ||
+ ((struct sockaddr_in *)nam)->sin_port == 0);
+ error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
+ &inp->inp_lport, cred);
+ if (error)
+ return (error);
+ if (in_pcbinshash(inp) != 0) {
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ inp->inp_lport = 0;
+ return (EAGAIN);
+ }
+ if (anonport)
+ inp->inp_flags |= INP_ANONPORT;
+ return (0);
+}
+
+/*
+ * Set up a bind operation on a PCB, performing port allocation
+ * as required, but do not actually modify the PCB. Callers can
+ * either complete the bind by setting inp_laddr/inp_lport and
+ * calling in_pcbinshash(), or they can just use the resulting
+ * port and address to authorise the sending of a once-off packet.
+ *
+ * On error, the values of *laddrp and *lportp are not changed.
+ */
+int
+in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
+ u_short *lportp, struct ucred *cred)
+{
+ struct socket *so = inp->inp_socket;
+ unsigned short *lastport;
+ struct sockaddr_in *sin;
+ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+ struct in_addr laddr;
+ u_short lport = 0;
+ int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
+ int error;
+ int dorandom;
+
+ /*
+ * Because no actual state changes occur here, a global write lock on
+ * the pcbinfo isn't required.
+ */
+ INP_INFO_LOCK_ASSERT(pcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
+ return (EADDRNOTAVAIL);
+ laddr.s_addr = *laddrp;
+ if (nam != NULL && laddr.s_addr != INADDR_ANY)
+ return (EINVAL);
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+ wild = INPLOOKUP_WILDCARD;
+ if (nam == NULL) {
+ if ((error = prison_local_ip4(cred, &laddr)) != 0)
+ return (error);
+ } else {
+ sin = (struct sockaddr_in *)nam;
+ if (nam->sa_len != sizeof (*sin))
+ return (EINVAL);
+#ifdef notdef
+ /*
+ * We should check the family, but old programs
+ * incorrectly fail to initialize it.
+ */
+ if (sin->sin_family != AF_INET)
+ return (EAFNOSUPPORT);
+#endif
+ error = prison_local_ip4(cred, &sin->sin_addr);
+ if (error)
+ return (error);
+ if (sin->sin_port != *lportp) {
+ /* Don't allow the port to change. */
+ if (*lportp != 0)
+ return (EINVAL);
+ lport = sin->sin_port;
+ }
+ /* NB: lport is left as 0 if the port isn't being changed. */
+ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
+ /*
+ * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
+ * allow complete duplication of binding if
+ * SO_REUSEPORT is set, or if SO_REUSEADDR is set
+ * and a multicast address is bound on both
+ * new and duplicated sockets.
+ */
+ if (so->so_options & SO_REUSEADDR)
+ reuseport = SO_REUSEADDR|SO_REUSEPORT;
+ } else if (sin->sin_addr.s_addr != INADDR_ANY) {
+ sin->sin_port = 0; /* yech... */
+ bzero(&sin->sin_zero, sizeof(sin->sin_zero));
+ /*
+ * Is the address a local IP address?
+ * If INP_BINDANY is set, then the socket may be bound
+ * to any endpoint address, local or not.
+ */
+ if ((inp->inp_flags & INP_BINDANY) == 0 &&
+ ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
+ return (EADDRNOTAVAIL);
+ }
+ laddr = sin->sin_addr;
+ if (lport) {
+ struct inpcb *t;
+ struct tcptw *tw;
+
+ /* GROSS */
+ if (ntohs(lport) <= V_ipport_reservedhigh &&
+ ntohs(lport) >= V_ipport_reservedlow &&
+ priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
+ 0))
+ return (EACCES);
+ if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
+ priv_check_cred(inp->inp_cred,
+ PRIV_NETINET_REUSEPORT, 0) != 0) {
+ t = in_pcblookup_local(pcbinfo, sin->sin_addr,
+ lport, INPLOOKUP_WILDCARD, cred);
+ /*
+ * XXX
+ * This entire block sorely needs a rewrite.
+ */
+ if (t &&
+ ((t->inp_flags & INP_TIMEWAIT) == 0) &&
+ (so->so_type != SOCK_STREAM ||
+ ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
+ (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
+ ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
+ (t->inp_socket->so_options &
+ SO_REUSEPORT) == 0) &&
+ (inp->inp_cred->cr_uid !=
+ t->inp_cred->cr_uid))
+ return (EADDRINUSE);
+ }
+ t = in_pcblookup_local(pcbinfo, sin->sin_addr,
+ lport, wild, cred);
+ if (t && (t->inp_flags & INP_TIMEWAIT)) {
+ /*
+				 * XXXRW: If an inpcb has had its timewait
+ * state recycled, we treat the address as
+ * being in use (for now). This is better
+ * than a panic, but not desirable.
+ */
+ tw = intotw(inp);
+ if (tw == NULL ||
+ (reuseport & tw->tw_so_options) == 0)
+ return (EADDRINUSE);
+ } else if (t &&
+ (reuseport & t->inp_socket->so_options) == 0) {
+#ifdef INET6
+ if (ntohl(sin->sin_addr.s_addr) !=
+ INADDR_ANY ||
+ ntohl(t->inp_laddr.s_addr) !=
+ INADDR_ANY ||
+ INP_SOCKAF(so) ==
+ INP_SOCKAF(t->inp_socket))
+#endif
+ return (EADDRINUSE);
+ }
+ }
+ }
+ if (*lportp != 0)
+ lport = *lportp;
+ if (lport == 0) {
+ u_short first, last, aux;
+ int count;
+
+ if (inp->inp_flags & INP_HIGHPORT) {
+ first = V_ipport_hifirstauto; /* sysctl */
+ last = V_ipport_hilastauto;
+ lastport = &pcbinfo->ipi_lasthi;
+ } else if (inp->inp_flags & INP_LOWPORT) {
+ error = priv_check_cred(cred,
+ PRIV_NETINET_RESERVEDPORT, 0);
+ if (error)
+ return error;
+ first = V_ipport_lowfirstauto; /* 1023 */
+ last = V_ipport_lowlastauto; /* 600 */
+ lastport = &pcbinfo->ipi_lastlow;
+ } else {
+ first = V_ipport_firstauto; /* sysctl */
+ last = V_ipport_lastauto;
+ lastport = &pcbinfo->ipi_lastport;
+ }
+ /*
+ * For UDP, use random port allocation as long as the user
+ * allows it. For TCP (and as of yet unknown) connections,
+ * use random port allocation only if the user allows it AND
+ * ipport_tick() allows it.
+ */
+ if (V_ipport_randomized &&
+ (!V_ipport_stoprandom || pcbinfo == &V_udbinfo))
+ dorandom = 1;
+ else
+ dorandom = 0;
+ /*
+ * It makes no sense to do random port allocation if
+ * we have the only port available.
+ */
+ if (first == last)
+ dorandom = 0;
+ /* Make sure to not include UDP packets in the count. */
+ if (pcbinfo != &V_udbinfo)
+ V_ipport_tcpallocs++;
+ /*
+ * Instead of having two loops further down counting up or down
+ * make sure that first is always <= last and go with only one
+ * code path implementing all logic.
+ */
+ if (first > last) {
+ aux = first;
+ first = last;
+ last = aux;
+ }
+
+ if (dorandom)
+ *lastport = first +
+ (arc4random() % (last - first));
+
+ count = last - first;
+
+ do {
+ if (count-- < 0) /* completely used? */
+ return (EADDRNOTAVAIL);
+ ++*lastport;
+ if (*lastport < first || *lastport > last)
+ *lastport = first;
+ lport = htons(*lastport);
+ } while (in_pcblookup_local(pcbinfo, laddr,
+ lport, wild, cred));
+ }
+ *laddrp = laddr.s_addr;
+ *lportp = lport;
+ return (0);
+}
+
+/*
+ * Connect from a socket to a specified address.
+ * Both address and port must be specified in argument sin.
+ * If we don't have a local address for this socket yet,
+ * then pick one.
+ */
+int
+in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
+{
+ u_short lport, fport;
+ in_addr_t laddr, faddr;
+ int anonport, error;
+
+ INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ lport = inp->inp_lport;
+ laddr = inp->inp_laddr.s_addr;
+ anonport = (lport == 0);
+ error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
+ NULL, cred);
+ if (error)
+ return (error);
+
+ /* Do the initial binding of the local address if required. */
+ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
+ inp->inp_lport = lport;
+ inp->inp_laddr.s_addr = laddr;
+ if (in_pcbinshash(inp) != 0) {
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ inp->inp_lport = 0;
+ return (EAGAIN);
+ }
+ }
+
+ /* Commit the remaining changes. */
+ inp->inp_lport = lport;
+ inp->inp_laddr.s_addr = laddr;
+ inp->inp_faddr.s_addr = faddr;
+ inp->inp_fport = fport;
+ in_pcbrehash(inp);
+
+ if (anonport)
+ inp->inp_flags |= INP_ANONPORT;
+ return (0);
+}
+
+/*
+ * Do proper source address selection on an unbound socket in case
+ * of connect. Take jails into account as well.
+ */
+static int
+in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
+ struct ucred *cred)
+{
+ struct ifaddr *ifa;
+ struct sockaddr *sa;
+ struct sockaddr_in *sin;
+ struct route sro;
+ int error;
+
+ KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
+
+ /*
+ * Bypass source address selection and use the primary jail IP
+ * if requested.
+ */
+ if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
+ return (0);
+
+ error = 0;
+ bzero(&sro, sizeof(sro));
+
+ sin = (struct sockaddr_in *)&sro.ro_dst;
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(struct sockaddr_in);
+ sin->sin_addr.s_addr = faddr->s_addr;
+
+ /*
+ * If route is known our src addr is taken from the i/f,
+ * else punt.
+ *
+ * Find out route to destination.
+ */
+ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
+ in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
+
+ /*
+ * If we found a route, use the address corresponding to
+ * the outgoing interface.
+ *
+ * Otherwise assume faddr is reachable on a directly connected
+ * network and try to find a corresponding interface to take
+ * the source address from.
+ */
+ if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
+ struct in_ifaddr *ia;
+ struct ifnet *ifp;
+
+ ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin));
+ if (ia == NULL)
+ ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0));
+ if (ia == NULL) {
+ error = ENETUNREACH;
+ goto done;
+ }
+
+ if (cred == NULL || !prison_flag(cred, PR_IP4)) {
+ laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
+ ifa_free(&ia->ia_ifa);
+ goto done;
+ }
+
+ ifp = ia->ia_ifp;
+ ifa_free(&ia->ia_ifa);
+ ia = NULL;
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+
+ sa = ifa->ifa_addr;
+ if (sa->sa_family != AF_INET)
+ continue;
+ sin = (struct sockaddr_in *)sa;
+ if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
+ ia = (struct in_ifaddr *)ifa;
+ break;
+ }
+ }
+ if (ia != NULL) {
+ laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
+ IF_ADDR_UNLOCK(ifp);
+ goto done;
+ }
+ IF_ADDR_UNLOCK(ifp);
+
+ /* 3. As a last resort return the 'default' jail address. */
+ error = prison_get_ip4(cred, laddr);
+ goto done;
+ }
+
+ /*
+ * If the outgoing interface on the route found is not
+ * a loopback interface, use the address from that interface.
+ * In case of jails do those three steps:
+ * 1. check if the interface address belongs to the jail. If so use it.
+ * 2. check if we have any address on the outgoing interface
+ * belonging to this jail. If so use it.
+ * 3. as a last resort return the 'default' jail address.
+ */
+ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
+ struct in_ifaddr *ia;
+ struct ifnet *ifp;
+
+ /* If not jailed, use the default returned. */
+ if (cred == NULL || !prison_flag(cred, PR_IP4)) {
+ ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
+ laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
+ goto done;
+ }
+
+ /* Jailed. */
+ /* 1. Check if the iface address belongs to the jail. */
+ sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
+ if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
+ ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
+ laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
+ goto done;
+ }
+
+ /*
+ * 2. Check if we have any address on the outgoing interface
+ * belonging to this jail.
+ */
+ ia = NULL;
+ ifp = sro.ro_rt->rt_ifp;
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ sa = ifa->ifa_addr;
+ if (sa->sa_family != AF_INET)
+ continue;
+ sin = (struct sockaddr_in *)sa;
+ if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
+ ia = (struct in_ifaddr *)ifa;
+ break;
+ }
+ }
+ if (ia != NULL) {
+ laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
+ IF_ADDR_UNLOCK(ifp);
+ goto done;
+ }
+ IF_ADDR_UNLOCK(ifp);
+
+ /* 3. As a last resort return the 'default' jail address. */
+ error = prison_get_ip4(cred, laddr);
+ goto done;
+ }
+
+ /*
+ * The outgoing interface is marked with 'loopback net', so a route
+ * to ourselves is here.
+ * Try to find the interface of the destination address and then
+ * take the address from there. That interface is not necessarily
+ * a loopback interface.
+ * In case of jails, check that it is an address of the jail
+ * and if we cannot find, fall back to the 'default' jail address.
+ */
+ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
+ struct sockaddr_in sain;
+ struct in_ifaddr *ia;
+
+ bzero(&sain, sizeof(struct sockaddr_in));
+ sain.sin_family = AF_INET;
+ sain.sin_len = sizeof(struct sockaddr_in);
+ sain.sin_addr.s_addr = faddr->s_addr;
+
+ ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain)));
+ if (ia == NULL)
+ ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0));
+ if (ia == NULL)
+ ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
+
+ if (cred == NULL || !prison_flag(cred, PR_IP4)) {
+ if (ia == NULL) {
+ error = ENETUNREACH;
+ goto done;
+ }
+ laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
+ ifa_free(&ia->ia_ifa);
+ goto done;
+ }
+
+ /* Jailed. */
+ if (ia != NULL) {
+ struct ifnet *ifp;
+
+ ifp = ia->ia_ifp;
+ ifa_free(&ia->ia_ifa);
+ ia = NULL;
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+
+ sa = ifa->ifa_addr;
+ if (sa->sa_family != AF_INET)
+ continue;
+ sin = (struct sockaddr_in *)sa;
+ if (prison_check_ip4(cred,
+ &sin->sin_addr) == 0) {
+ ia = (struct in_ifaddr *)ifa;
+ break;
+ }
+ }
+ if (ia != NULL) {
+ laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
+ IF_ADDR_UNLOCK(ifp);
+ goto done;
+ }
+ IF_ADDR_UNLOCK(ifp);
+ }
+
+ /* 3. As a last resort return the 'default' jail address. */
+ error = prison_get_ip4(cred, laddr);
+ goto done;
+ }
+
+done:
+ if (sro.ro_rt != NULL)
+ RTFREE(sro.ro_rt);
+ return (error);
+}
+
+/*
+ * Set up for a connect from a socket to the specified address.
+ * On entry, *laddrp and *lportp should contain the current local
+ * address and port for the PCB; these are updated to the values
+ * that should be placed in inp_laddr and inp_lport to complete
+ * the connect.
+ *
+ * On success, *faddrp and *fportp will be set to the remote address
+ * and port. These are not updated in the error case.
+ *
+ * If the operation fails because the connection already exists,
+ * *oinpp will be set to the PCB of that connection so that the
+ * caller can decide to override it. In all other cases, *oinpp
+ * is set to NULL.
+ */
+int
+in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
+ in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
+ struct inpcb **oinpp, struct ucred *cred)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)nam;
+ struct in_ifaddr *ia;
+ struct inpcb *oinp;
+ struct in_addr laddr, faddr;
+ u_short lport, fport;
+ int error;
+
+ /*
+ * Because a global state change doesn't actually occur here, a read
+ * lock is sufficient.
+ */
+ INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ if (oinpp != NULL)
+ *oinpp = NULL;
+ if (nam->sa_len != sizeof (*sin))
+ return (EINVAL);
+ if (sin->sin_family != AF_INET)
+ return (EAFNOSUPPORT);
+ if (sin->sin_port == 0)
+ return (EADDRNOTAVAIL);
+ laddr.s_addr = *laddrp;
+ lport = *lportp;
+ faddr = sin->sin_addr;
+ fport = sin->sin_port;
+
+ if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
+ /*
+ * If the destination address is INADDR_ANY,
+ * use the primary local address.
+ * If the supplied address is INADDR_BROADCAST,
+ * and the primary interface supports broadcast,
+ * choose the broadcast address for that interface.
+ */
+ if (faddr.s_addr == INADDR_ANY) {
+ IN_IFADDR_RLOCK();
+ faddr =
+ IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
+ IN_IFADDR_RUNLOCK();
+ if (cred != NULL &&
+ (error = prison_get_ip4(cred, &faddr)) != 0)
+ return (error);
+ } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
+ IN_IFADDR_RLOCK();
+ if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
+ IFF_BROADCAST)
+ faddr = satosin(&TAILQ_FIRST(
+ &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
+ IN_IFADDR_RUNLOCK();
+ }
+ }
+ if (laddr.s_addr == INADDR_ANY) {
+ error = in_pcbladdr(inp, &faddr, &laddr, cred);
+ /*
+ * If the destination address is multicast and an outgoing
+ * interface has been set as a multicast option, prefer the
+ * address of that interface as our source address.
+ */
+ if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
+ inp->inp_moptions != NULL) {
+ struct ip_moptions *imo;
+ struct ifnet *ifp;
+
+ imo = inp->inp_moptions;
+ if (imo->imo_multicast_ifp != NULL) {
+ ifp = imo->imo_multicast_ifp;
+ IN_IFADDR_RLOCK();
+ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link)
+ if (ia->ia_ifp == ifp)
+ break;
+ if (ia == NULL) {
+ IN_IFADDR_RUNLOCK();
+ error = EADDRNOTAVAIL;
+ } else {
+ laddr = ia->ia_addr.sin_addr;
+ IN_IFADDR_RUNLOCK();
+ error = 0;
+ }
+ }
+ }
+ if (error)
+ return (error);
+ }
+ oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
+ 0, NULL);
+ if (oinp != NULL) {
+ if (oinpp != NULL)
+ *oinpp = oinp;
+ return (EADDRINUSE);
+ }
+ if (lport == 0) {
+ error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
+ cred);
+ if (error)
+ return (error);
+ }
+ *laddrp = laddr.s_addr;
+ *lportp = lport;
+ *faddrp = faddr.s_addr;
+ *fportp = fport;
+ return (0);
+}
+
+void
+in_pcbdisconnect(struct inpcb *inp)
+{
+
+ INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ inp->inp_faddr.s_addr = INADDR_ANY;
+ inp->inp_fport = 0;
+ in_pcbrehash(inp);
+}
+
+/*
+ * in_pcbdetach() is responsible for disassociating a socket from an inpcb.
+ * For most protocols, this will be invoked immediately prior to calling
+ * in_pcbfree(). However, with TCP the inpcb may significantly outlive the
+ * socket, in which case in_pcbfree() is deferred.
+ */
+void
+in_pcbdetach(struct inpcb *inp)
+{
+
+ KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
+
+ inp->inp_socket->so_pcb = NULL;
+ inp->inp_socket = NULL;
+}
+
+/*
+ * in_pcbfree_internal() frees an inpcb that has been detached from its
+ * socket, and whose reference count has reached 0. It will also remove the
+ * inpcb from any global lists it might remain on.
+ */
+static void
+in_pcbfree_internal(struct inpcb *inp)
+{
+ struct inpcbinfo *ipi = inp->inp_pcbinfo;
+
+ KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
+ KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__));
+
+ INP_INFO_WLOCK_ASSERT(ipi);
+ INP_WLOCK_ASSERT(inp);
+
+#ifdef IPSEC
+ if (inp->inp_sp != NULL)
+ ipsec_delete_pcbpolicy(inp);
+#endif /* IPSEC */
+ inp->inp_gencnt = ++ipi->ipi_gencnt;
+ in_pcbremlists(inp);
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO) {
+ ip6_freepcbopts(inp->in6p_outputopts);
+ if (inp->in6p_moptions != NULL)
+ ip6_freemoptions(inp->in6p_moptions);
+ }
+#endif
+ if (inp->inp_options)
+ (void)m_free(inp->inp_options);
+ if (inp->inp_moptions != NULL)
+ inp_freemoptions(inp->inp_moptions);
+ inp->inp_vflag = 0;
+ crfree(inp->inp_cred);
+
+#ifdef MAC
+ mac_inpcb_destroy(inp);
+#endif
+ INP_WUNLOCK(inp);
+ uma_zfree(ipi->ipi_zone, inp);
+}
+
+/*
+ * in_pcbref() bumps the reference count on an inpcb in order to maintain
+ * stability of an inpcb pointer despite the inpcb lock being released. This
+ * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
+ * but where the inpcb lock is already held.
+ *
+ * While the inpcb will not be freed, releasing the inpcb lock means that the
+ * connection's state may change, so the caller should be careful to
+ * revalidate any cached state on reacquiring the lock. Drop the reference
+ * using in_pcbrele().
+ */
+void
+in_pcbref(struct inpcb *inp)
+{
+
+ INP_WLOCK_ASSERT(inp);
+
+ KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+
+ inp->inp_refcount++;
+}
+
+/*
+ * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
+ * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
+ * return a flag indicating whether or not the inpcb remains valid. If it is
+ * valid, we return with the inpcb lock held.
+ */
+int
+in_pcbrele(struct inpcb *inp)
+{
+#ifdef INVARIANTS
+ struct inpcbinfo *ipi = inp->inp_pcbinfo;
+#endif
+
+ KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+
+ INP_INFO_WLOCK_ASSERT(ipi);
+ INP_WLOCK_ASSERT(inp);
+
+ inp->inp_refcount--;
+ if (inp->inp_refcount > 0)
+ return (0);
+ in_pcbfree_internal(inp);
+ return (1);
+}
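+
+/*
+ * Sketch of the usage pattern described above (the caller context is
+ * hypothetical): take a reference, drop the inpcb lock so the pcbinfo lock
+ * can be acquired first, then relock and check whether the inpcb survived.
+ *
+ *	in_pcbref(inp);
+ *	INP_WUNLOCK(inp);
+ *	INP_INFO_WLOCK(inp->inp_pcbinfo);
+ *	INP_WLOCK(inp);
+ *	if (in_pcbrele(inp))
+ *		return;			(the inpcb has been freed)
+ *	(otherwise revalidate any cached connection state)
+ */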
+
+/*
+ * Unconditionally schedule an inpcb to be freed by decrementing its
+ * reference count, which should occur only after the inpcb has been detached
+ * from its socket. If another thread holds a temporary reference (acquired
+ * using in_pcbref()) then the free is deferred until that reference is
+ * released using in_pcbrele(), but the inpcb is still unlocked.
+ */
+void
+in_pcbfree(struct inpcb *inp)
+{
+#ifdef INVARIANTS
+ struct inpcbinfo *ipi = inp->inp_pcbinfo;
+#endif
+
+ KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL",
+ __func__));
+
+ INP_INFO_WLOCK_ASSERT(ipi);
+ INP_WLOCK_ASSERT(inp);
+
+ if (!in_pcbrele(inp))
+ INP_WUNLOCK(inp);
+}
+
+/*
+ * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
+ * port reservation, and preventing it from being returned by inpcb lookups.
+ *
+ * It is used by TCP to mark an inpcb as unused and avoid future packet
+ * delivery or event notification when a socket remains open but TCP has
+ * closed. This might occur as a result of a shutdown()-initiated TCP close
+ * or a RST on the wire, and allows the port binding to be reused while still
+ * maintaining the invariant that so_pcb always points to a valid inpcb until
+ * in_pcbdetach().
+ *
+ * XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash
+ * lists, but can lead to confusing netstat output, as open sockets with
+ * closed TCP connections will no longer appear to have their bound port
+ * number. An explicit flag would be better, as it would allow us to leave
+ * the port number intact after the connection is dropped.
+ *
+ * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
+ * in_pcbnotifyall() and in_pcbpurgeif0()?
+ */
+void
+in_pcbdrop(struct inpcb *inp)
+{
+
+ INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ inp->inp_flags |= INP_DROPPED;
+ if (inp->inp_flags & INP_INHASHLIST) {
+ struct inpcbport *phd = inp->inp_phd;
+
+ LIST_REMOVE(inp, inp_hash);
+ LIST_REMOVE(inp, inp_portlist);
+ if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
+ LIST_REMOVE(phd, phd_hash);
+ free(phd, M_PCB);
+ }
+ inp->inp_flags &= ~INP_INHASHLIST;
+ }
+}
+
+/*
+ * Common routines to return the socket addresses associated with inpcbs.
+ */
+struct sockaddr *
+in_sockaddr(in_port_t port, struct in_addr *addr_p)
+{
+ struct sockaddr_in *sin;
+
+ sin = malloc(sizeof *sin, M_SONAME,
+ M_WAITOK | M_ZERO);
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = *addr_p;
+ sin->sin_port = port;
+
+ return (struct sockaddr *)sin;
+}
+
+int
+in_getsockaddr(struct socket *so, struct sockaddr **nam)
+{
+ struct inpcb *inp;
+ struct in_addr addr;
+ in_port_t port;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
+
+ INP_RLOCK(inp);
+ port = inp->inp_lport;
+ addr = inp->inp_laddr;
+ INP_RUNLOCK(inp);
+
+ *nam = in_sockaddr(port, &addr);
+ return 0;
+}
+
+int
+in_getpeeraddr(struct socket *so, struct sockaddr **nam)
+{
+ struct inpcb *inp;
+ struct in_addr addr;
+ in_port_t port;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
+
+ INP_RLOCK(inp);
+ port = inp->inp_fport;
+ addr = inp->inp_faddr;
+ INP_RUNLOCK(inp);
+
+ *nam = in_sockaddr(port, &addr);
+ return 0;
+}
+
+void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
+ struct inpcb *(*notify)(struct inpcb *, int))
+{
+ struct inpcb *inp, *inp_temp;
+
+ INP_INFO_WLOCK(pcbinfo);
+ LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
+ INP_WLOCK(inp);
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV4) == 0) {
+ INP_WUNLOCK(inp);
+ continue;
+ }
+#endif
+ if (inp->inp_faddr.s_addr != faddr.s_addr ||
+ inp->inp_socket == NULL) {
+ INP_WUNLOCK(inp);
+ continue;
+ }
+ if ((*notify)(inp, errno))
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_WUNLOCK(pcbinfo);
+}
+
+void
+in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
+{
+ struct inpcb *inp;
+ struct ip_moptions *imo;
+ int i, gap;
+
+ INP_INFO_RLOCK(pcbinfo);
+ LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
+ INP_WLOCK(inp);
+ imo = inp->inp_moptions;
+ if ((inp->inp_vflag & INP_IPV4) &&
+ imo != NULL) {
+ /*
+ * Unselect the outgoing interface if it is being
+ * detached.
+ */
+ if (imo->imo_multicast_ifp == ifp)
+ imo->imo_multicast_ifp = NULL;
+
+ /*
+ * Drop multicast group membership if we joined
+ * through the interface being detached.
+ */
+ for (i = 0, gap = 0; i < imo->imo_num_memberships;
+ i++) {
+ if (imo->imo_membership[i]->inm_ifp == ifp) {
+ in_delmulti(imo->imo_membership[i]);
+ gap++;
+ } else if (gap != 0)
+ imo->imo_membership[i - gap] =
+ imo->imo_membership[i];
+ }
+ imo->imo_num_memberships -= gap;
+ }
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(pcbinfo);
+}
+
+/*
+ * Lookup a PCB based on the local address and port.
+ */
+#define INP_LOOKUP_MAPPED_PCB_COST 3
+struct inpcb *
+in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
+ u_short lport, int wild_okay, struct ucred *cred)
+{
+ struct inpcb *inp;
+#ifdef INET6
+ int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
+#else
+ int matchwild = 3;
+#endif
+ int wildcard;
+
+ INP_INFO_LOCK_ASSERT(pcbinfo);
+
+ if (!wild_okay) {
+ struct inpcbhead *head;
+ /*
+ * Look for an unconnected (wildcard foreign addr) PCB that
+ * matches the local address and port we're looking for.
+ */
+ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
+ 0, pcbinfo->ipi_hashmask)];
+ LIST_FOREACH(inp, head, inp_hash) {
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_faddr.s_addr == INADDR_ANY &&
+ inp->inp_laddr.s_addr == laddr.s_addr &&
+ inp->inp_lport == lport) {
+ /*
+ * Found?
+ */
+ if (cred == NULL ||
+ prison_equal_ip4(cred->cr_prison,
+ inp->inp_cred->cr_prison))
+ return (inp);
+ }
+ }
+ /*
+ * Not found.
+ */
+ return (NULL);
+ } else {
+ struct inpcbporthead *porthash;
+ struct inpcbport *phd;
+ struct inpcb *match = NULL;
+ /*
+ * Best fit PCB lookup.
+ *
+ * First see if this local port is in use by looking on the
+ * port hash list.
+ */
+ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
+ pcbinfo->ipi_porthashmask)];
+ LIST_FOREACH(phd, porthash, phd_hash) {
+ if (phd->phd_port == lport)
+ break;
+ }
+ if (phd != NULL) {
+ /*
+ * Port is in use by one or more PCBs. Look for best
+ * fit.
+ */
+ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
+ wildcard = 0;
+ if (cred != NULL &&
+ !prison_equal_ip4(inp->inp_cred->cr_prison,
+ cred->cr_prison))
+ continue;
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+ /*
+ * We never select the PCB that has
+ * INP_IPV6 flag and is bound to :: if
+ * we have another PCB which is bound
+ * to 0.0.0.0. If a PCB has the
+ * INP_IPV6 flag, then we set its cost
+ * higher than IPv4 only PCBs.
+ *
+ * Note that the case only happens
+ * when a socket is bound to ::, under
+ * the condition that the use of the
+ * mapped address is allowed.
+ */
+ if ((inp->inp_vflag & INP_IPV6) != 0)
+ wildcard += INP_LOOKUP_MAPPED_PCB_COST;
+#endif
+ if (inp->inp_faddr.s_addr != INADDR_ANY)
+ wildcard++;
+ if (inp->inp_laddr.s_addr != INADDR_ANY) {
+ if (laddr.s_addr == INADDR_ANY)
+ wildcard++;
+ else if (inp->inp_laddr.s_addr != laddr.s_addr)
+ continue;
+ } else {
+ if (laddr.s_addr != INADDR_ANY)
+ wildcard++;
+ }
+ if (wildcard < matchwild) {
+ match = inp;
+ matchwild = wildcard;
+ if (matchwild == 0)
+ break;
+ }
+ }
+ }
+ return (match);
+ }
+}
+#undef INP_LOOKUP_MAPPED_PCB_COST
+
+/*
+ * Lookup PCB in hash list.
+ */
+struct inpcb *
+in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
+ u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
+ struct ifnet *ifp)
+{
+ struct inpcbhead *head;
+ struct inpcb *inp, *tmpinp;
+ u_short fport = fport_arg, lport = lport_arg;
+
+ INP_INFO_LOCK_ASSERT(pcbinfo);
+
+ /*
+ * First look for an exact match.
+ */
+ tmpinp = NULL;
+ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
+ pcbinfo->ipi_hashmask)];
+ LIST_FOREACH(inp, head, inp_hash) {
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_faddr.s_addr == faddr.s_addr &&
+ inp->inp_laddr.s_addr == laddr.s_addr &&
+ inp->inp_fport == fport &&
+ inp->inp_lport == lport) {
+ /*
+ * XXX We should be able to directly return
+ * the inp here, without any checks.
+ * Well unless both bound with SO_REUSEPORT?
+ */
+ if (prison_flag(inp->inp_cred, PR_IP4))
+ return (inp);
+ if (tmpinp == NULL)
+ tmpinp = inp;
+ }
+ }
+ if (tmpinp != NULL)
+ return (tmpinp);
+
+ /*
+ * Then look for a wildcard match, if requested.
+ */
+ if (wildcard == INPLOOKUP_WILDCARD) {
+ struct inpcb *local_wild = NULL, *local_exact = NULL;
+#ifdef INET6
+ struct inpcb *local_wild_mapped = NULL;
+#endif
+ struct inpcb *jail_wild = NULL;
+ int injail;
+
+ /*
+ * Order of socket selection - we always prefer jails.
+ * 1. jailed, non-wild.
+ * 2. jailed, wild.
+ * 3. non-jailed, non-wild.
+ * 4. non-jailed, wild.
+ */
+
+ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
+ 0, pcbinfo->ipi_hashmask)];
+ LIST_FOREACH(inp, head, inp_hash) {
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_faddr.s_addr != INADDR_ANY ||
+ inp->inp_lport != lport)
+ continue;
+
+ /* XXX inp locking */
+ if (ifp && ifp->if_type == IFT_FAITH &&
+ (inp->inp_flags & INP_FAITH) == 0)
+ continue;
+
+ injail = prison_flag(inp->inp_cred, PR_IP4);
+ if (injail) {
+ if (prison_check_ip4(inp->inp_cred,
+ &laddr) != 0)
+ continue;
+ } else {
+ if (local_exact != NULL)
+ continue;
+ }
+
+ if (inp->inp_laddr.s_addr == laddr.s_addr) {
+ if (injail)
+ return (inp);
+ else
+ local_exact = inp;
+ } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
+#ifdef INET6
+ /* XXX inp locking, NULL check */
+ if (inp->inp_vflag & INP_IPV6PROTO)
+ local_wild_mapped = inp;
+ else
+#endif /* INET6 */
+ if (injail)
+ jail_wild = inp;
+ else
+ local_wild = inp;
+ }
+ } /* LIST_FOREACH */
+ if (jail_wild != NULL)
+ return (jail_wild);
+ if (local_exact != NULL)
+ return (local_exact);
+ if (local_wild != NULL)
+ return (local_wild);
+#ifdef INET6
+ if (local_wild_mapped != NULL)
+ return (local_wild_mapped);
+#endif /* defined(INET6) */
+ } /* if (wildcard == INPLOOKUP_WILDCARD) */
+
+ return (NULL);
+}
+
+/*
+ * Insert PCB onto various hash lists.
+ */
+int
+in_pcbinshash(struct inpcb *inp)
+{
+ struct inpcbhead *pcbhash;
+ struct inpcbporthead *pcbporthash;
+ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+ struct inpcbport *phd;
+ u_int32_t hashkey_faddr;
+
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ INP_WLOCK_ASSERT(inp);
+ KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
+ ("in_pcbinshash: INP_INHASHLIST"));
+
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6)
+ hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
+ else
+#endif /* INET6 */
+ hashkey_faddr = inp->inp_faddr.s_addr;
+
+ pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
+ inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
+
+ pcbporthash = &pcbinfo->ipi_porthashbase[
+ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
+
+ /*
+ * Go through port list and look for a head for this lport.
+ */
+ LIST_FOREACH(phd, pcbporthash, phd_hash) {
+ if (phd->phd_port == inp->inp_lport)
+ break;
+ }
+ /*
+ * If none exists, malloc one and tack it on.
+ */
+ if (phd == NULL) {
+ phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
+ if (phd == NULL) {
+ return (ENOBUFS); /* XXX */
+ }
+ phd->phd_port = inp->inp_lport;
+ LIST_INIT(&phd->phd_pcblist);
+ LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
+ }
+ inp->inp_phd = phd;
+ LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
+ LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
+ inp->inp_flags |= INP_INHASHLIST;
+ return (0);
+}
+
+/*
+ * Move PCB to the proper hash bucket when { faddr, fport } have been
+ * changed. NOTE: This does not handle the case of the lport changing (the
+ * hashed port list would have to be updated as well), so the lport must
+ * not change after in_pcbinshash() has been called.
+ */
+void
+in_pcbrehash(struct inpcb *inp)
+{
+ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+ struct inpcbhead *head;
+ u_int32_t hashkey_faddr;
+
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ INP_WLOCK_ASSERT(inp);
+ KASSERT(inp->inp_flags & INP_INHASHLIST,
+ ("in_pcbrehash: !INP_INHASHLIST"));
+
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6)
+ hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
+ else
+#endif /* INET6 */
+ hashkey_faddr = inp->inp_faddr.s_addr;
+
+ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
+ inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
+
+ LIST_REMOVE(inp, inp_hash);
+ LIST_INSERT_HEAD(head, inp, inp_hash);
+}
+
+/*
+ * Remove PCB from various lists.
+ */
+static void
+in_pcbremlists(struct inpcb *inp)
+{
+ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+ if (inp->inp_flags & INP_INHASHLIST) {
+ struct inpcbport *phd = inp->inp_phd;
+
+ LIST_REMOVE(inp, inp_hash);
+ LIST_REMOVE(inp, inp_portlist);
+ if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
+ LIST_REMOVE(phd, phd_hash);
+ free(phd, M_PCB);
+ }
+ inp->inp_flags &= ~INP_INHASHLIST;
+ }
+ LIST_REMOVE(inp, inp_list);
+ pcbinfo->ipi_count--;
+}
+
+/*
+ * A set label operation has occurred at the socket layer, propagate the
+ * label change into the in_pcb for the socket.
+ */
+void
+in_pcbsosetlabel(struct socket *so)
+{
+#ifdef MAC
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
+
+ INP_WLOCK(inp);
+ SOCK_LOCK(so);
+ mac_inpcb_sosetlabel(so, inp);
+ SOCK_UNLOCK(so);
+ INP_WUNLOCK(inp);
+#endif
+}
+
+/*
+ * ipport_tick runs once per second, determining if random port allocation
+ * should be continued. If more than ipport_randomcps ports have been
+ * allocated in the last second, then we return to sequential port
+ * allocation. We return to random allocation only once we drop below
+ * ipport_randomcps for at least ipport_randomtime seconds.
+ */
+void
+ipport_tick(void *xtp)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ VNET_LIST_RLOCK_NOSLEEP();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
+ if (V_ipport_tcpallocs <=
+ V_ipport_tcplastcount + V_ipport_randomcps) {
+ if (V_ipport_stoprandom > 0)
+ V_ipport_stoprandom--;
+ } else
+ V_ipport_stoprandom = V_ipport_randomtime;
+ V_ipport_tcplastcount = V_ipport_tcpallocs;
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK_NOSLEEP();
+ callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
+}
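+
+/*
+ * Worked example of the throttle above (a sketch only; the figures assume
+ * stock defaults on the order of ipport_randomcps = 10 and
+ * ipport_randomtime = 45): if a burst allocates, say, 500 ephemeral TCP
+ * ports within one second, the next tick sees V_ipport_tcpallocs exceed
+ * V_ipport_tcplastcount + V_ipport_randomcps and sets V_ipport_stoprandom
+ * to V_ipport_randomtime, so ports are handed out sequentially; only after
+ * roughly 45 consecutive below-threshold seconds does stoprandom decay back
+ * to zero and random allocation resume.
+ */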
+
+void
+inp_wlock(struct inpcb *inp)
+{
+
+ INP_WLOCK(inp);
+}
+
+void
+inp_wunlock(struct inpcb *inp)
+{
+
+ INP_WUNLOCK(inp);
+}
+
+void
+inp_rlock(struct inpcb *inp)
+{
+
+ INP_RLOCK(inp);
+}
+
+void
+inp_runlock(struct inpcb *inp)
+{
+
+ INP_RUNLOCK(inp);
+}
+
+#ifdef INVARIANTS
+void
+inp_lock_assert(struct inpcb *inp)
+{
+
+ INP_WLOCK_ASSERT(inp);
+}
+
+void
+inp_unlock_assert(struct inpcb *inp)
+{
+
+ INP_UNLOCK_ASSERT(inp);
+}
+#endif
+
+void
+inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
+{
+ struct inpcb *inp;
+
+ INP_INFO_RLOCK(&V_tcbinfo);
+ LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
+ INP_WLOCK(inp);
+ func(inp, arg);
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+}
+
+struct socket *
+inp_inpcbtosocket(struct inpcb *inp)
+{
+
+ INP_WLOCK_ASSERT(inp);
+ return (inp->inp_socket);
+}
+
+struct tcpcb *
+inp_inpcbtotcpcb(struct inpcb *inp)
+{
+
+ INP_WLOCK_ASSERT(inp);
+ return ((struct tcpcb *)inp->inp_ppcb);
+}
+
+int
+inp_ip_tos_get(const struct inpcb *inp)
+{
+
+ return (inp->inp_ip_tos);
+}
+
+void
+inp_ip_tos_set(struct inpcb *inp, int val)
+{
+
+ inp->inp_ip_tos = val;
+}
+
+void
+inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
+ uint32_t *faddr, uint16_t *fp)
+{
+
+ INP_LOCK_ASSERT(inp);
+ *laddr = inp->inp_laddr.s_addr;
+ *faddr = inp->inp_faddr.s_addr;
+ *lp = inp->inp_lport;
+ *fp = inp->inp_fport;
+}
+
+struct inpcb *
+so_sotoinpcb(struct socket *so)
+{
+
+ return (sotoinpcb(so));
+}
+
+struct tcpcb *
+so_sototcpcb(struct socket *so)
+{
+
+ return (sototcpcb(so));
+}
+
+#ifdef DDB
+static void
+db_print_indent(int indent)
+{
+ int i;
+
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+}
+
+static void
+db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
+{
+ char faddr_str[48], laddr_str[48];
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", name, inc);
+
+ indent += 2;
+
+#ifdef INET6
+ if (inc->inc_flags & INC_ISIPV6) {
+ /* IPv6. */
+ ip6_sprintf(laddr_str, &inc->inc6_laddr);
+ ip6_sprintf(faddr_str, &inc->inc6_faddr);
+ } else {
+#endif
+ /* IPv4. */
+ inet_ntoa_r(inc->inc_laddr, laddr_str);
+ inet_ntoa_r(inc->inc_faddr, faddr_str);
+#ifdef INET6
+ }
+#endif
+ db_print_indent(indent);
+ db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
+ ntohs(inc->inc_lport));
+ db_print_indent(indent);
+ db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
+ ntohs(inc->inc_fport));
+}
+
+static void
+db_print_inpflags(int inp_flags)
+{
+ int comma;
+
+ comma = 0;
+ if (inp_flags & INP_RECVOPTS) {
+ db_printf("%sINP_RECVOPTS", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_RECVRETOPTS) {
+ db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_RECVDSTADDR) {
+ db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_HDRINCL) {
+ db_printf("%sINP_HDRINCL", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_HIGHPORT) {
+ db_printf("%sINP_HIGHPORT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_LOWPORT) {
+ db_printf("%sINP_LOWPORT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_ANONPORT) {
+ db_printf("%sINP_ANONPORT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_RECVIF) {
+ db_printf("%sINP_RECVIF", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_MTUDISC) {
+ db_printf("%sINP_MTUDISC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_FAITH) {
+ db_printf("%sINP_FAITH", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_RECVTTL) {
+ db_printf("%sINP_RECVTTL", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_DONTFRAG) {
+ db_printf("%sINP_DONTFRAG", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & IN6P_IPV6_V6ONLY) {
+ db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & IN6P_PKTINFO) {
+ db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & IN6P_HOPLIMIT) {
+ db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & IN6P_HOPOPTS) {
+ db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & IN6P_DSTOPTS) {
+ db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & IN6P_RTHDR) {
+ db_printf("%sIN6P_RTHDR", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & IN6P_RTHDRDSTOPTS) {
+ db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & IN6P_TCLASS) {
+ db_printf("%sIN6P_TCLASS", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & IN6P_AUTOFLOWLABEL) {
+ db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_TIMEWAIT) {
+ db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_ONESBCAST) {
+ db_printf("%sINP_ONESBCAST", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_DROPPED) {
+ db_printf("%sINP_DROPPED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & INP_SOCKREF) {
+ db_printf("%sINP_SOCKREF", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & IN6P_RFC2292) {
+ db_printf("%sIN6P_RFC2292", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_flags & IN6P_MTU) {
+ db_printf("%sIN6P_MTU", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_inpvflag(u_char inp_vflag)
+{
+ int comma;
+
+ comma = 0;
+ if (inp_vflag & INP_IPV4) {
+ db_printf("%sINP_IPV4", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_vflag & INP_IPV6) {
+ db_printf("%sINP_IPV6", comma ? ", " : "");
+ comma = 1;
+ }
+ if (inp_vflag & INP_IPV6PROTO) {
+ db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_inpcb(struct inpcb *inp, const char *name, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", name, inp);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("inp_flow: 0x%x\n", inp->inp_flow);
+
+ db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
+
+ db_print_indent(indent);
+ db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n",
+ inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
+
+ db_print_indent(indent);
+ db_printf("inp_label: %p inp_flags: 0x%x (",
+ inp->inp_label, inp->inp_flags);
+ db_print_inpflags(inp->inp_flags);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp,
+ inp->inp_vflag);
+ db_print_inpvflag(inp->inp_vflag);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
+ inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
+
+ db_print_indent(indent);
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6) {
+ db_printf("in6p_options: %p in6p_outputopts: %p "
+ "in6p_moptions: %p\n", inp->in6p_options,
+ inp->in6p_outputopts, inp->in6p_moptions);
+ db_printf("in6p_icmp6filt: %p in6p_cksum %d "
+ "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
+ inp->in6p_hops);
+ } else
+#endif
+ {
+ db_printf("inp_ip_tos: %d inp_ip_options: %p "
+ "inp_ip_moptions: %p\n", inp->inp_ip_tos,
+ inp->inp_options, inp->inp_moptions);
+ }
+
+ db_print_indent(indent);
+ db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd,
+ (uintmax_t)inp->inp_gencnt);
+}
+
+DB_SHOW_COMMAND(inpcb, db_show_inpcb)
+{
+ struct inpcb *inp;
+
+ if (!have_addr) {
+ db_printf("usage: show inpcb <addr>\n");
+ return;
+ }
+ inp = (struct inpcb *)addr;
+
+ db_print_inpcb(inp, "inpcb", 0);
+}
+#endif
diff --git a/freebsd/sys/netinet/in_pcb.h b/freebsd/sys/netinet/in_pcb.h
new file mode 100644
index 00000000..8cd4a5f8
--- /dev/null
+++ b/freebsd/sys/netinet/in_pcb.h
@@ -0,0 +1,525 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1993
+ * The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IN_PCB_HH_
+#define _NETINET_IN_PCB_HH_
+
+#include <freebsd/sys/queue.h>
+#include <freebsd/sys/_lock.h>
+#include <freebsd/sys/_mutex.h>
+#include <freebsd/sys/_rwlock.h>
+
+#ifdef _KERNEL
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/net/vnet.h>
+#endif
+
+#define in6pcb inpcb /* for KAME src sync over BSD*'s */
+#define in6p_sp inp_sp /* for KAME src sync over BSD*'s */
+struct inpcbpolicy;
+
+/*
+ * struct inpcb is the common protocol control block structure used in most
+ * IP transport protocols.
+ *
+ * Pointers to local and foreign host table entries, local and foreign socket
+ * numbers, and pointers up (to a socket structure) and down (to a
+ * protocol-specific control block) are stored here.
+ */
+LIST_HEAD(inpcbhead, inpcb);
+LIST_HEAD(inpcbporthead, inpcbport);
+typedef u_quad_t inp_gen_t;
+
+/*
+ * A PCB bound to the AF_INET6 null (unspecified) laddr can also receive
+ * AF_INET input packets, so the AF_INET6 null laddr doubles as the AF_INET
+ * null laddr, by way of the following structure.
+ */
+struct in_addr_4in6 {
+ u_int32_t ia46_pad32[3];
+ struct in_addr ia46_addr4;
+};
+
+/*
+ * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has
+ * some extra padding to accomplish this.
+ */
+struct in_endpoints {
+ u_int16_t ie_fport; /* foreign port */
+ u_int16_t ie_lport; /* local port */
+ /* protocol dependent part, local and foreign addr */
+ union {
+ /* foreign host table entry */
+ struct in_addr_4in6 ie46_foreign;
+ struct in6_addr ie6_foreign;
+ } ie_dependfaddr;
+ union {
+ /* local host table entry */
+ struct in_addr_4in6 ie46_local;
+ struct in6_addr ie6_local;
+ } ie_dependladdr;
+};
+#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4
+#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4
+#define ie6_faddr ie_dependfaddr.ie6_foreign
+#define ie6_laddr ie_dependladdr.ie6_local
+
+/*
+ * XXX The defines for inc_* are hacks and should be changed to direct
+ * references.
+ */
+struct in_conninfo {
+ u_int8_t inc_flags;
+ u_int8_t inc_len;
+ u_int16_t inc_fibnum; /* XXX was pad, 16 bits is plenty */
+ /* protocol dependent part */
+ struct in_endpoints inc_ie;
+};
+
+/*
+ * Flags for inc_flags.
+ */
+#define INC_ISIPV6 0x01
+
+#define inc_isipv6 inc_flags /* temp compatibility */
+#define inc_fport inc_ie.ie_fport
+#define inc_lport inc_ie.ie_lport
+#define inc_faddr inc_ie.ie_faddr
+#define inc_laddr inc_ie.ie_laddr
+#define inc6_faddr inc_ie.ie6_faddr
+#define inc6_laddr inc_ie.ie6_laddr
+
+struct icmp6_filter;
+
+/*-
+ * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4
+ * and IPv6 sockets. In the case of TCP, further per-connection state is
+ * hung off of inp_ppcb most of the time. Almost all fields of struct inpcb
+ * are static after creation or protected by a per-inpcb rwlock, inp_lock. A
+ * few fields also require the global pcbinfo lock for the inpcb to be held,
+ * when modified, such as the global connection lists and hashes, as well as
+ * binding information (which affects which hash a connection is on). This
+ * model means that connections can be looked up without holding the
+ * per-connection lock, which is important for performance when attempting to
+ * find the connection for a packet given its IP and port tuple. Writing to
+ * these fields requires that write locks be held on both the inpcb and the
+ * global locks.
+ *
+ * Key:
+ * (c) - Constant after initialization
+ * (i) - Protected by the inpcb lock
+ * (p) - Protected by the pcbinfo lock for the inpcb
+ * (s) - Protected by another subsystem's locks
+ * (x) - Undefined locking
+ *
+ * A few other notes:
+ *
+ * When a read lock is held, stability of the field is guaranteed; to write
+ * to a field, a write lock must generally be held.
+ *
+ * netinet/netinet6-layer code should not assume that the inp_socket pointer
+ * is safe to dereference without inp_lock being held, even for protocols
+ * other than TCP (where the inpcb persists during TIMEWAIT even after the
+ * socket has been freed), or there may be close(2)-related races.
+ *
+ * The inp_vflag field is overloaded, and would otherwise ideally be (c).
+ */
+struct inpcb {
+ LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */
+ LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */
+ void *inp_ppcb; /* (i) pointer to per-protocol pcb */
+ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
+ struct socket *inp_socket; /* (i) back pointer to socket */
+ struct ucred *inp_cred; /* (c) cache of socket cred */
+ u_int32_t inp_flow; /* (i) IPv6 flow information */
+ int inp_flags; /* (i) generic IP/datagram flags */
+ int inp_flags2; /* (i) generic IP/datagram flags #2*/
+ u_char inp_vflag; /* (i) IP version flag (v4/v6) */
+ u_char inp_ip_ttl; /* (i) time to live proto */
+ u_char inp_ip_p; /* (c) protocol proto */
+ u_char inp_ip_minttl; /* (i) minimum TTL or drop */
+ uint32_t inp_flowid; /* (x) flow id / queue id */
+ u_int inp_refcount; /* (i) refcount */
+ void *inp_pspare[4]; /* (x) rtentry / general use */
+ u_int inp_ispare[4]; /* general use */
+
+ /* Local and foreign ports, local and foreign addr. */
+ struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */
+
+ /* MAC and IPSEC policy information. */
+ struct label *inp_label; /* (i) MAC label */
+ struct inpcbpolicy *inp_sp; /* (s) for IPSEC */
+
+ /* Protocol-dependent part; options. */
+ struct {
+ u_char inp4_ip_tos; /* (i) type of service proto */
+ struct mbuf *inp4_options; /* (i) IP options */
+ struct ip_moptions *inp4_moptions; /* (i) IP mcast options */
+ } inp_depend4;
+ struct {
+ /* (i) IP options */
+ struct mbuf *inp6_options;
+ /* (i) IP6 options for outgoing packets */
+ struct ip6_pktopts *inp6_outputopts;
+ /* (i) IP multicast options */
+ struct ip6_moptions *inp6_moptions;
+ /* (i) ICMPv6 code type filter */
+ struct icmp6_filter *inp6_icmp6filt;
+ /* (i) IPV6_CHECKSUM setsockopt */
+ int inp6_cksum;
+ short inp6_hops;
+ } inp_depend6;
+ LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */
+ struct inpcbport *inp_phd; /* (i/p) head of this list */
+#define inp_zero_size offsetof(struct inpcb, inp_gencnt)
+ inp_gen_t inp_gencnt; /* (c) generation count */
+ struct llentry *inp_lle; /* cached L2 information */
+ struct rtentry *inp_rt; /* cached L3 information */
+ struct rwlock inp_lock;
+};
+#define inp_fport inp_inc.inc_fport
+#define inp_lport inp_inc.inc_lport
+#define inp_faddr inp_inc.inc_faddr
+#define inp_laddr inp_inc.inc_laddr
+#define inp_ip_tos inp_depend4.inp4_ip_tos
+#define inp_options inp_depend4.inp4_options
+#define inp_moptions inp_depend4.inp4_moptions
+
+#define in6p_faddr inp_inc.inc6_faddr
+#define in6p_laddr inp_inc.inc6_laddr
+#define in6p_hops inp_depend6.inp6_hops /* default hop limit */
+#define in6p_flowinfo inp_flow
+#define in6p_options inp_depend6.inp6_options
+#define in6p_outputopts inp_depend6.inp6_outputopts
+#define in6p_moptions inp_depend6.inp6_moptions
+#define in6p_icmp6filt inp_depend6.inp6_icmp6filt
+#define in6p_cksum inp_depend6.inp6_cksum
+
+#define inp_vnet inp_pcbinfo->ipi_vnet
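+
+/*
+ * Illustrative use of the locking key above (a minimal sketch, modelled on
+ * in_getpeeraddr() in in_pcb.c): fields marked (i) are stable under a read
+ * lock, so snapshotting the foreign endpoint looks like
+ *
+ *	INP_RLOCK(inp);
+ *	port = inp->inp_fport;
+ *	addr = inp->inp_faddr;
+ *	INP_RUNLOCK(inp);
+ *
+ * whereas operations that rebind or rehash the PCB additionally take
+ * INP_WLOCK() together with the pcbinfo write lock (see in_pcbinshash() and
+ * in_pcbrehash() in in_pcb.c).
+ */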
+
+/*
+ * The range of the generation count, as used in this implementation, is 9e19.
+ * We would have to create 300 billion connections per second for this number
+ * to roll over in a year. This seems sufficiently unlikely that we simply
+ * don't concern ourselves with that possibility.
+ */
+
+/*
+ * Interface exported to userland by various protocols which use inpcbs. Hack
+ * alert -- only define if struct xsocket is in scope.
+ */
+#ifdef _SYS_SOCKETVAR_HH_
+struct xinpcb {
+ size_t xi_len; /* length of this structure */
+ struct inpcb xi_inp;
+ struct xsocket xi_socket;
+ u_quad_t xi_alignment_hack;
+};
+
+struct xinpgen {
+ size_t xig_len; /* length of this structure */
+ u_int xig_count; /* number of PCBs at this time */
+ inp_gen_t xig_gen; /* generation count at this time */
+ so_gen_t xig_sogen; /* socket generation count at this time */
+};
+#endif /* _SYS_SOCKETVAR_HH_ */
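+
+/*
+ * A hedged sketch of how userland consumes these records (netstat does
+ * something similar through the per-protocol "pcblist" sysctls, e.g.
+ * net.inet.udp.pcblist): the returned buffer starts with a struct xinpgen,
+ * is followed by one struct xinpcb per PCB, and ends with a trailing
+ * struct xinpgen whose xig_gen can be compared against the leading one to
+ * detect changes made while the list was being copied out, roughly:
+ *
+ *	sysctlbyname("net.inet.udp.pcblist", buf, &len, NULL, 0);
+ */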
+
+struct inpcbport {
+ LIST_ENTRY(inpcbport) phd_hash;
+ struct inpcbhead phd_pcblist;
+ u_short phd_port;
+};
+
+/*
+ * Global data structure for each high-level protocol (UDP, TCP, ...) in both
+ * IPv4 and IPv6. Holds inpcb lists and information for managing them.
+ */
+struct inpcbinfo {
+ /*
+ * Global list of inpcbs on the protocol.
+ */
+ struct inpcbhead *ipi_listhead;
+ u_int ipi_count;
+
+ /*
+ * Global hash of inpcbs, hashed by local and foreign addresses and
+ * port numbers.
+ */
+ struct inpcbhead *ipi_hashbase;
+ u_long ipi_hashmask;
+
+ /*
+ * Global hash of inpcbs, hashed by only local port number.
+ */
+ struct inpcbporthead *ipi_porthashbase;
+ u_long ipi_porthashmask;
+
+ /*
+ * Fields associated with port lookup and allocation.
+ */
+ u_short ipi_lastport;
+ u_short ipi_lastlow;
+ u_short ipi_lasthi;
+
+ /*
+ * UMA zone from which inpcbs are allocated for this protocol.
+ */
+ struct uma_zone *ipi_zone;
+
+ /*
+ * Generation count--incremented each time a connection is allocated
+ * or freed.
+ */
+ u_quad_t ipi_gencnt;
+ struct rwlock ipi_lock;
+
+ /*
+ * Pointer to network stack instance
+ */
+ struct vnet *ipi_vnet;
+
+ /*
+ * general use 2
+ */
+ void *ipi_pspare[2];
+};
+
+#define INP_LOCK_INIT(inp, d, t) \
+ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK)
+#define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock)
+#define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock)
+#define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock)
+#define INP_TRY_RLOCK(inp) rw_try_rlock(&(inp)->inp_lock)
+#define INP_TRY_WLOCK(inp) rw_try_wlock(&(inp)->inp_lock)
+#define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock)
+#define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock)
+#define INP_TRY_UPGRADE(inp) rw_try_upgrade(&(inp)->inp_lock)
+#define INP_DOWNGRADE(inp) rw_downgrade(&(inp)->inp_lock)
+#define INP_WLOCKED(inp) rw_wowned(&(inp)->inp_lock)
+#define INP_LOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_LOCKED)
+#define INP_RLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_RLOCKED)
+#define INP_WLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_WLOCKED)
+#define INP_UNLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_UNLOCKED)
+
+#ifdef _KERNEL
+/*
+ * These locking functions are for inpcb consumers outside of sys/netinet,
+ * more specifically, they were added for the benefit of TOE drivers. The
+ * macros are reserved for use by the stack.
+ */
+void inp_wlock(struct inpcb *);
+void inp_wunlock(struct inpcb *);
+void inp_rlock(struct inpcb *);
+void inp_runlock(struct inpcb *);
+
+#ifdef INVARIANTS
+void inp_lock_assert(struct inpcb *);
+void inp_unlock_assert(struct inpcb *);
+#else
+static __inline void
+inp_lock_assert(struct inpcb *inp __unused)
+{
+}
+
+static __inline void
+inp_unlock_assert(struct inpcb *inp __unused)
+{
+}
+
+#endif
+
+void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg);
+int inp_ip_tos_get(const struct inpcb *inp);
+void inp_ip_tos_set(struct inpcb *inp, int val);
+struct socket *
+ inp_inpcbtosocket(struct inpcb *inp);
+struct tcpcb *
+ inp_inpcbtotcpcb(struct inpcb *inp);
+void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
+ uint32_t *faddr, uint16_t *fp);
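+
+/*
+ * Typical use by such an out-of-stack consumer (illustrative only; the
+ * local variable names are hypothetical):
+ *
+ *	inp = so_sotoinpcb(so);
+ *	inp_wlock(inp);
+ *	tp = inp_inpcbtotcpcb(inp);
+ *	... operate on the connection ...
+ *	inp_wunlock(inp);
+ */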
+
+#endif /* _KERNEL */
+
+#define INP_INFO_LOCK_INIT(ipi, d) \
+ rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE)
+#define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock)
+#define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock)
+#define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock)
+#define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock)
+#define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock)
+#define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock)
+#define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock)
+#define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock)
+#define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED)
+#define INP_INFO_RLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_RLOCKED)
+#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
+#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
+
+#define INP_PCBHASH(faddr, lport, fport, mask) \
+ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
+#define INP_PCBPORTHASH(lport, mask) \
+ (ntohs((lport)) & (mask))
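+
+/*
+ * Usage sketch (mirrors in_pcblookup_hash() in in_pcb.c): the connection
+ * hash chain for a given 4-tuple is selected as
+ *
+ *	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
+ *	    pcbinfo->ipi_hashmask)];
+ *
+ * while the port-only table used by in_pcblookup_local() and in_pcbinshash()
+ * is indexed with INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask).
+ */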
+
+/*
+ * Flags for inp_vflag -- historically version flags only
+ */
+#define INP_IPV4 0x1
+#define INP_IPV6 0x2
+#define INP_IPV6PROTO 0x4 /* opened under IPv6 protocol */
+
+/*
+ * Flags for inp_flags.
+ */
+#define INP_RECVOPTS 0x00000001 /* receive incoming IP options */
+#define INP_RECVRETOPTS 0x00000002 /* receive IP options for reply */
+#define INP_RECVDSTADDR 0x00000004 /* receive IP dst address */
+#define INP_HDRINCL 0x00000008 /* user supplies entire IP header */
+#define INP_HIGHPORT 0x00000010 /* user wants "high" port binding */
+#define INP_LOWPORT 0x00000020 /* user wants "low" port binding */
+#define INP_ANONPORT 0x00000040 /* port chosen for user */
+#define INP_RECVIF 0x00000080 /* receive incoming interface */
+#define INP_MTUDISC 0x00000100 /* user can do MTU discovery */
+#define INP_FAITH 0x00000200 /* accept FAITH'ed connections */
+#define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */
+#define INP_DONTFRAG 0x00000800 /* don't fragment packet */
+#define INP_BINDANY 0x00001000 /* allow bind to any address */
+#define INP_INHASHLIST 0x00002000 /* in_pcbinshash() has been called */
+#define IN6P_IPV6_V6ONLY 0x00008000 /* restrict AF_INET6 socket for v6 */
+#define IN6P_PKTINFO 0x00010000 /* receive IP6 dst and I/F */
+#define IN6P_HOPLIMIT 0x00020000 /* receive hoplimit */
+#define IN6P_HOPOPTS 0x00040000 /* receive hop-by-hop options */
+#define IN6P_DSTOPTS 0x00080000 /* receive dst options after rthdr */
+#define IN6P_RTHDR 0x00100000 /* receive routing header */
+#define IN6P_RTHDRDSTOPTS 0x00200000 /* receive dstoptions before rthdr */
+#define IN6P_TCLASS 0x00400000 /* receive traffic class value */
+#define IN6P_AUTOFLOWLABEL 0x00800000 /* attach flowlabel automatically */
+#define INP_TIMEWAIT 0x01000000 /* in TIMEWAIT, ppcb is tcptw */
+#define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */
+#define INP_DROPPED 0x04000000 /* protocol drop flag */
+#define INP_SOCKREF 0x08000000 /* strong socket reference */
+#define INP_SW_FLOWID 0x10000000 /* software generated flow id */
+#define INP_HW_FLOWID 0x20000000 /* hardware generated flow id */
+#define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */
+#define IN6P_MTU 0x80000000 /* receive path MTU */
+
+#define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\
+ INP_RECVIF|INP_RECVTTL|\
+ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\
+ IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\
+ IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\
+ IN6P_MTU)
+
+/*
+ * Flags for inp_flags2.
+ */
+#define INP_LLE_VALID 0x00000001 /* cached lle is valid */
+#define INP_RT_VALID 0x00000002 /* cached rtentry is valid */
+
+#define INPLOOKUP_WILDCARD 1
+#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb)
+#define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */
+
+#define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family
+
+#define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af)
+
+#ifdef _KERNEL
+VNET_DECLARE(int, ipport_reservedhigh);
+VNET_DECLARE(int, ipport_reservedlow);
+VNET_DECLARE(int, ipport_lowfirstauto);
+VNET_DECLARE(int, ipport_lowlastauto);
+VNET_DECLARE(int, ipport_firstauto);
+VNET_DECLARE(int, ipport_lastauto);
+VNET_DECLARE(int, ipport_hifirstauto);
+VNET_DECLARE(int, ipport_hilastauto);
+VNET_DECLARE(int, ipport_randomized);
+VNET_DECLARE(int, ipport_randomcps);
+VNET_DECLARE(int, ipport_randomtime);
+VNET_DECLARE(int, ipport_stoprandom);
+VNET_DECLARE(int, ipport_tcpallocs);
+
+#define V_ipport_reservedhigh VNET(ipport_reservedhigh)
+#define V_ipport_reservedlow VNET(ipport_reservedlow)
+#define V_ipport_lowfirstauto VNET(ipport_lowfirstauto)
+#define V_ipport_lowlastauto VNET(ipport_lowlastauto)
+#define V_ipport_firstauto VNET(ipport_firstauto)
+#define V_ipport_lastauto VNET(ipport_lastauto)
+#define V_ipport_hifirstauto VNET(ipport_hifirstauto)
+#define V_ipport_hilastauto VNET(ipport_hilastauto)
+#define V_ipport_randomized VNET(ipport_randomized)
+#define V_ipport_randomcps VNET(ipport_randomcps)
+#define V_ipport_randomtime VNET(ipport_randomtime)
+#define V_ipport_stoprandom VNET(ipport_stoprandom)
+#define V_ipport_tcpallocs VNET(ipport_tcpallocs)
+
+extern struct callout ipport_tick_callout;
+
+void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
+int in_pcballoc(struct socket *, struct inpcbinfo *);
+int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *);
+int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
+ u_short *, struct ucred *);
+int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *);
+int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
+ u_short *, in_addr_t *, u_short *, struct inpcb **,
+ struct ucred *);
+void in_pcbdetach(struct inpcb *);
+void in_pcbdisconnect(struct inpcb *);
+void in_pcbdrop(struct inpcb *);
+void in_pcbfree(struct inpcb *);
+int in_pcbinshash(struct inpcb *);
+struct inpcb *
+ in_pcblookup_local(struct inpcbinfo *,
+ struct in_addr, u_short, int, struct ucred *);
+struct inpcb *
+ in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int,
+ struct in_addr, u_int, int, struct ifnet *);
+#ifndef __rtems__
+void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
+ int, struct inpcb *(*)(struct inpcb *, int));
+#else
+void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
+ struct inpcb *(*notify)(struct inpcb *, int));
+#endif
+void in_pcbref(struct inpcb *);
+void in_pcbrehash(struct inpcb *);
+int in_pcbrele(struct inpcb *);
+void in_pcbsetsolabel(struct socket *so);
+int in_getpeeraddr(struct socket *so, struct sockaddr **nam);
+int in_getsockaddr(struct socket *so, struct sockaddr **nam);
+struct sockaddr *
+ in_sockaddr(in_port_t port, struct in_addr *addr);
+void in_pcbsosetlabel(struct socket *so);
+void ipport_tick(void *xtp);
+#endif /* _KERNEL */
+
+#endif /* !_NETINET_IN_PCB_HH_ */
diff --git a/freebsd/sys/netinet/in_proto.c b/freebsd/sys/netinet/in_proto.c
new file mode 100644
index 00000000..9be0b626
--- /dev/null
+++ b/freebsd/sys/netinet/in_proto.c
@@ -0,0 +1,400 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_proto.c 8.2 (Berkeley) 2/9/95
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_ipx.h>
+#include <freebsd/local/opt_mrouting.h>
+#include <freebsd/local/opt_ipsec.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_pf.h>
+#include <freebsd/local/opt_sctp.h>
+#include <freebsd/local/opt_mpath.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/domain.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/queue.h>
+#include <freebsd/sys/sysctl.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#ifdef RADIX_MPATH
+#include <freebsd/net/radix_mpath.h>
+#endif
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_icmp.h>
+#include <freebsd/netinet/igmp_var.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_timer.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/udp.h>
+#include <freebsd/netinet/udp_var.h>
+#include <freebsd/netinet/ip_encap.h>
+
+/*
+ * TCP/IP protocol family: IP, ICMP, UDP, TCP.
+ */
+
+static struct pr_usrreqs nousrreqs;
+
+#ifdef IPSEC
+#include <freebsd/netipsec/ipsec.h>
+#endif /* IPSEC */
+
+#ifdef SCTP
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctp.h>
+#include <freebsd/netinet/sctp_var.h>
+#endif /* SCTP */
+
+#ifdef DEV_PFSYNC
+#include <freebsd/net/pfvar.h>
+#include <freebsd/net/if_pfsync.h>
+#endif
+
+extern struct domain inetdomain;
+
+/* Spacer for loadable protocols. */
+#define IPPROTOSPACER \
+{ \
+ .pr_domain = &inetdomain, \
+ .pr_protocol = PROTO_SPACER, \
+ .pr_usrreqs = &nousrreqs \
+}
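+
+/*
+ * A spacer slot is claimed at run time when a loadable protocol registers
+ * itself, roughly (my_protosw below is a hypothetical, fully initialized
+ * struct protosw supplied by the module):
+ *
+ *	error = pf_proto_register(PF_INET, &my_protosw);
+ */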
+
+struct protosw inetsw[] = {
+{
+ .pr_type = 0,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_IP,
+ .pr_init = ip_init,
+#ifdef VIMAGE
+ .pr_destroy = ip_destroy,
+#endif
+ .pr_slowtimo = ip_slowtimo,
+ .pr_drain = ip_drain,
+ .pr_usrreqs = &nousrreqs
+},
+{
+ .pr_type = SOCK_DGRAM,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_UDP,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = udp_input,
+ .pr_ctlinput = udp_ctlinput,
+ .pr_ctloutput = udp_ctloutput,
+ .pr_init = udp_init,
+#ifdef VIMAGE
+ .pr_destroy = udp_destroy,
+#endif
+ .pr_usrreqs = &udp_usrreqs
+},
+{
+ .pr_type = SOCK_STREAM,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_TCP,
+ .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
+ .pr_input = tcp_input,
+ .pr_ctlinput = tcp_ctlinput,
+ .pr_ctloutput = tcp_ctloutput,
+ .pr_init = tcp_init,
+#ifdef VIMAGE
+ .pr_destroy = tcp_destroy,
+#endif
+ .pr_slowtimo = tcp_slowtimo,
+ .pr_drain = tcp_drain,
+ .pr_usrreqs = &tcp_usrreqs
+},
+#ifdef SCTP
+{
+ .pr_type = SOCK_DGRAM,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_SCTP,
+ .pr_flags = PR_WANTRCVD,
+ .pr_input = sctp_input,
+ .pr_ctlinput = sctp_ctlinput,
+ .pr_ctloutput = sctp_ctloutput,
+ .pr_init = sctp_init,
+#ifdef VIMAGE
+ .pr_destroy = sctp_finish,
+#endif
+ .pr_drain = sctp_drain,
+ .pr_usrreqs = &sctp_usrreqs
+},
+{
+ .pr_type = SOCK_SEQPACKET,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_SCTP,
+ .pr_flags = PR_WANTRCVD,
+ .pr_input = sctp_input,
+ .pr_ctlinput = sctp_ctlinput,
+ .pr_ctloutput = sctp_ctloutput,
+ .pr_drain = sctp_drain,
+ .pr_usrreqs = &sctp_usrreqs
+},
+
+{
+ .pr_type = SOCK_STREAM,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_SCTP,
+ .pr_flags = PR_WANTRCVD,
+ .pr_input = sctp_input,
+ .pr_ctlinput = sctp_ctlinput,
+ .pr_ctloutput = sctp_ctloutput,
+ .pr_drain = sctp_drain,
+ .pr_usrreqs = &sctp_usrreqs
+},
+#endif /* SCTP */
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_RAW,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = rip_input,
+ .pr_ctlinput = rip_ctlinput,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_usrreqs = &rip_usrreqs
+},
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_ICMP,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ .pr_input = icmp_input,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_usrreqs = &rip_usrreqs
+},
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_IGMP,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ .pr_input = igmp_input,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_fasttimo = igmp_fasttimo,
+ .pr_slowtimo = igmp_slowtimo,
+ .pr_usrreqs = &rip_usrreqs
+},
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_RSVP,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ .pr_input = rsvp_input,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_usrreqs = &rip_usrreqs
+},
+#ifdef IPSEC
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_AH,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = ah4_input,
+ .pr_ctlinput = ah4_ctlinput,
+ .pr_usrreqs = &nousrreqs
+},
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_ESP,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = esp4_input,
+ .pr_ctlinput = esp4_ctlinput,
+ .pr_usrreqs = &nousrreqs
+},
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_IPCOMP,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = ipcomp4_input,
+ .pr_usrreqs = &nousrreqs
+},
+#endif /* IPSEC */
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_IPV4,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ .pr_input = encap4_input,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_init = encap_init,
+ .pr_usrreqs = &rip_usrreqs
+},
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_MOBILE,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ .pr_input = encap4_input,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_init = encap_init,
+ .pr_usrreqs = &rip_usrreqs
+},
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_ETHERIP,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ .pr_input = encap4_input,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_init = encap_init,
+ .pr_usrreqs = &rip_usrreqs
+},
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_GRE,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ .pr_input = encap4_input,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_init = encap_init,
+ .pr_usrreqs = &rip_usrreqs
+},
+# ifdef INET6
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_IPV6,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ .pr_input = encap4_input,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_init = encap_init,
+ .pr_usrreqs = &rip_usrreqs
+},
+#endif
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_PIM,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ .pr_input = encap4_input,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_usrreqs = &rip_usrreqs
+},
+#ifdef DEV_PFSYNC
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_PFSYNC,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = pfsync_input,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_usrreqs = &rip_usrreqs
+},
+#endif /* DEV_PFSYNC */
+/* Spacer n-times for loadable protocols. */
+IPPROTOSPACER,
+IPPROTOSPACER,
+IPPROTOSPACER,
+IPPROTOSPACER,
+IPPROTOSPACER,
+IPPROTOSPACER,
+IPPROTOSPACER,
+IPPROTOSPACER,
+/* raw wildcard */
+{
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = rip_input,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_init = rip_init,
+#ifdef VIMAGE
+ .pr_destroy = rip_destroy,
+#endif
+ .pr_usrreqs = &rip_usrreqs
+},
+};
+
+extern int in_inithead(void **, int);
+extern int in_detachhead(void **, int);
+
+struct domain inetdomain = {
+ .dom_family = AF_INET,
+ .dom_name = "internet",
+ .dom_protosw = inetsw,
+ .dom_protoswNPROTOSW = &inetsw[sizeof(inetsw)/sizeof(inetsw[0])],
+#ifdef RADIX_MPATH
+ .dom_rtattach = rn4_mpath_inithead,
+#else
+ .dom_rtattach = in_inithead,
+#endif
+#ifdef VIMAGE
+ .dom_rtdetach = in_detachhead,
+#endif
+ .dom_rtoffset = 32,
+ .dom_maxrtkey = sizeof(struct sockaddr_in),
+ .dom_ifattach = in_domifattach,
+ .dom_ifdetach = in_domifdetach
+};
+
+VNET_DOMAIN_SET(inet);
+
+SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW, 0,
+ "Internet Family");
+
+SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW, 0, "IP");
+SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW, 0, "ICMP");
+SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW, 0, "UDP");
+SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW, 0, "TCP");
+#ifdef SCTP
+SYSCTL_NODE(_net_inet, IPPROTO_SCTP, sctp, CTLFLAG_RW, 0, "SCTP");
+#endif
+SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW, 0, "IGMP");
+#ifdef IPSEC
+/* XXX no protocol # to use, pick something "reserved" */
+SYSCTL_NODE(_net_inet, 253, ipsec, CTLFLAG_RW, 0, "IPSEC");
+SYSCTL_NODE(_net_inet, IPPROTO_AH, ah, CTLFLAG_RW, 0, "AH");
+SYSCTL_NODE(_net_inet, IPPROTO_ESP, esp, CTLFLAG_RW, 0, "ESP");
+SYSCTL_NODE(_net_inet, IPPROTO_IPCOMP, ipcomp, CTLFLAG_RW, 0, "IPCOMP");
+SYSCTL_NODE(_net_inet, IPPROTO_IPIP, ipip, CTLFLAG_RW, 0, "IPIP");
+#endif /* IPSEC */
+SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW");
+#ifdef DEV_PFSYNC
+SYSCTL_NODE(_net_inet, IPPROTO_PFSYNC, pfsync, CTLFLAG_RW, 0, "PFSYNC");
+#endif
diff --git a/freebsd/sys/netinet/in_rmx.c b/freebsd/sys/netinet/in_rmx.c
new file mode 100644
index 00000000..25f99ea0
--- /dev/null
+++ b/freebsd/sys/netinet/in_rmx.c
@@ -0,0 +1,516 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright 1994, 1995 Massachusetts Institute of Technology
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation for any purpose and without fee is hereby
+ * granted, provided that both the above copyright notice and this
+ * permission notice appear in all copies, that both the above
+ * copyright notice and this permission notice appear in all
+ * supporting documentation, and that the name of M.I.T. not be used
+ * in advertising or publicity pertaining to distribution of the
+ * software without specific, written prior permission. M.I.T. makes
+ * no representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied
+ * warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
+ * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
+ * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This code does two things necessary for the enhanced TCP metrics to
+ * function in a useful manner:
+ * 1) It marks all non-host routes as `cloning', thus ensuring that
+ * every actual reference to such a route actually gets turned
+ * into a reference to a host route to the specific destination
+ * requested.
+ * 2) When such routes lose all their references, it arranges for them
+ * to be deleted in some random collection of circumstances, so that
+ * a large quantity of stale routing data is not kept in kernel memory
+ * indefinitely. See in_rtqtimo() below for the exact mechanism.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/sys/callout.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip_var.h>
+
+extern int in_inithead(void **head, int off);
+#ifdef VIMAGE
+extern int in_detachhead(void **head, int off);
+#endif
+
+#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */
+
+/*
+ * Do what we need to do when inserting a route.
+ */
+static struct radix_node *
+in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
+ struct radix_node *treenodes)
+{
+ struct rtentry *rt = (struct rtentry *)treenodes;
+ struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt);
+
+ RADIX_NODE_HEAD_WLOCK_ASSERT(head);
+ /*
+ * A little bit of help for both IP output and input:
+ * For host routes, we make sure that RTF_BROADCAST
+ * is set for anything that looks like a broadcast address.
+ * This way, we can avoid an expensive call to in_broadcast()
+ * in ip_output() most of the time (because the route passed
+ * to ip_output() is almost always a host route).
+ *
+ * We also do the same for local addresses, with the thought
+ * that this might one day be used to speed up ip_input().
+ *
+ * We also mark routes to multicast addresses as such, because
+ * it's easy to do and might be useful (but this is much more
+ * dubious since it's so easy to inspect the address).
+ */
+ if (rt->rt_flags & RTF_HOST) {
+ if (in_broadcast(sin->sin_addr, rt->rt_ifp)) {
+ rt->rt_flags |= RTF_BROADCAST;
+ } else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr ==
+ sin->sin_addr.s_addr) {
+ rt->rt_flags |= RTF_LOCAL;
+ }
+ }
+ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+ rt->rt_flags |= RTF_MULTICAST;
+
+ if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp)
+ rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
+
+ return (rn_addroute(v_arg, n_arg, head, treenodes));
+}
+
+/*
+ * This code is the inverse of in_clsroute: on first reference, if we
+ * were managing the route, stop doing so and set the expiration timer
+ * back off again.
+ */
+static struct radix_node *
+in_matroute(void *v_arg, struct radix_node_head *head)
+{
+ struct radix_node *rn = rn_match(v_arg, head);
+ struct rtentry *rt = (struct rtentry *)rn;
+
+ if (rt) {
+ RT_LOCK(rt);
+ if (rt->rt_flags & RTPRF_OURS) {
+ rt->rt_flags &= ~RTPRF_OURS;
+ rt->rt_rmx.rmx_expire = 0;
+ }
+ RT_UNLOCK(rt);
+ }
+ return rn;
+}
+
+static VNET_DEFINE(int, rtq_reallyold) = 60*60; /* one hour is "really old" */
+#define V_rtq_reallyold VNET(rtq_reallyold)
+SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW,
+ &VNET_NAME(rtq_reallyold), 0,
+ "Default expiration time on dynamically learned routes");
+
+/* never automatically crank down to less */
+static VNET_DEFINE(int, rtq_minreallyold) = 10;
+#define V_rtq_minreallyold VNET(rtq_minreallyold)
+SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW,
+ &VNET_NAME(rtq_minreallyold), 0,
+ "Minimum time to attempt to hold onto dynamically learned routes");
+
+/* 128 cached routes is "too many" */
+static VNET_DEFINE(int, rtq_toomany) = 128;
+#define V_rtq_toomany VNET(rtq_toomany)
+SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW,
+ &VNET_NAME(rtq_toomany), 0,
+ "Upper limit on dynamically learned routes");
+
+/*
+ * On last reference drop, mark the route as belonging to us so that it can
+ * be timed out.
+ */
+static void
+in_clsroute(struct radix_node *rn, struct radix_node_head *head)
+{
+ struct rtentry *rt = (struct rtentry *)rn;
+
+ RT_LOCK_ASSERT(rt);
+
+ if (!(rt->rt_flags & RTF_UP))
+ return; /* prophylactic measures */
+
+ if (rt->rt_flags & RTPRF_OURS)
+ return;
+
+ if (!(rt->rt_flags & RTF_DYNAMIC))
+ return;
+
+ /*
+ * If rtq_reallyold is 0, just delete the route without
+ * waiting for a timeout cycle to kill it.
+ */
+ if (V_rtq_reallyold != 0) {
+ rt->rt_flags |= RTPRF_OURS;
+ rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold;
+ } else {
+ rtexpunge(rt);
+ }
+}
+
+struct rtqk_arg {
+ struct radix_node_head *rnh;
+ int draining;
+ int killed;
+ int found;
+ int updating;
+ time_t nextstop;
+};
+
+/*
+ * Get rid of old routes. When draining, this deletes everything, even when
+ * the timeout is not expired yet. When updating, this makes sure that
+ * nothing has a timeout longer than the current value of rtq_reallyold.
+ */
+static int
+in_rtqkill(struct radix_node *rn, void *rock)
+{
+ struct rtqk_arg *ap = rock;
+ struct rtentry *rt = (struct rtentry *)rn;
+ int err;
+
+ RADIX_NODE_HEAD_WLOCK_ASSERT(ap->rnh);
+
+ if (rt->rt_flags & RTPRF_OURS) {
+ ap->found++;
+
+ if (ap->draining || rt->rt_rmx.rmx_expire <= time_uptime) {
+ if (rt->rt_refcnt > 0)
+ panic("rtqkill route really not free");
+
+ err = in_rtrequest(RTM_DELETE,
+ (struct sockaddr *)rt_key(rt),
+ rt->rt_gateway, rt_mask(rt),
+ rt->rt_flags | RTF_RNH_LOCKED, 0,
+ rt->rt_fibnum);
+ if (err) {
+ log(LOG_WARNING, "in_rtqkill: error %d\n", err);
+ } else {
+ ap->killed++;
+ }
+ } else {
+ if (ap->updating &&
+ (rt->rt_rmx.rmx_expire - time_uptime >
+ V_rtq_reallyold)) {
+ rt->rt_rmx.rmx_expire =
+ time_uptime + V_rtq_reallyold;
+ }
+ ap->nextstop = lmin(ap->nextstop,
+ rt->rt_rmx.rmx_expire);
+ }
+ }
+
+ return 0;
+}
+
+#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */
+static VNET_DEFINE(int, rtq_timeout) = RTQ_TIMEOUT;
+static VNET_DEFINE(struct callout, rtq_timer);
+
+#define V_rtq_timeout VNET(rtq_timeout)
+#define V_rtq_timer VNET(rtq_timer)
+
+static void in_rtqtimo_one(void *rock);
+
+static void
+in_rtqtimo(void *rock)
+{
+ CURVNET_SET((struct vnet *) rock);
+ int fibnum;
+ void *newrock;
+ struct timeval atv;
+
+ for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
+ newrock = rt_tables_get_rnh(fibnum, AF_INET);
+ if (newrock != NULL)
+ in_rtqtimo_one(newrock);
+ }
+ atv.tv_usec = 0;
+ atv.tv_sec = V_rtq_timeout;
+ callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock);
+ CURVNET_RESTORE();
+}
+
+static void
+in_rtqtimo_one(void *rock)
+{
+ struct radix_node_head *rnh = rock;
+ struct rtqk_arg arg;
+ static time_t last_adjusted_timeout = 0;
+
+ arg.found = arg.killed = 0;
+ arg.rnh = rnh;
+ arg.nextstop = time_uptime + V_rtq_timeout;
+ arg.draining = arg.updating = 0;
+ RADIX_NODE_HEAD_LOCK(rnh);
+ rnh->rnh_walktree(rnh, in_rtqkill, &arg);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+
+ /*
+ * Attempt to be somewhat dynamic about this:
+ * If there are ``too many'' routes sitting around taking up space,
+ * then crank down the timeout, and see if we can't make some more
+ * go away. However, we make sure that we will never adjust more
+ * than once in rtq_timeout seconds, to keep from cranking down too
+ * hard.
+ */
+ if ((arg.found - arg.killed > V_rtq_toomany) &&
+ (time_uptime - last_adjusted_timeout >= V_rtq_timeout) &&
+ V_rtq_reallyold > V_rtq_minreallyold) {
+ V_rtq_reallyold = 2 * V_rtq_reallyold / 3;
+ if (V_rtq_reallyold < V_rtq_minreallyold) {
+ V_rtq_reallyold = V_rtq_minreallyold;
+ }
+
+ last_adjusted_timeout = time_uptime;
+#ifdef DIAGNOSTIC
+ log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n",
+ V_rtq_reallyold);
+#endif
+ arg.found = arg.killed = 0;
+ arg.updating = 1;
+ RADIX_NODE_HEAD_LOCK(rnh);
+ rnh->rnh_walktree(rnh, in_rtqkill, &arg);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ }
+
+}
+
+void
+in_rtqdrain(void)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+ struct radix_node_head *rnh;
+ struct rtqk_arg arg;
+ int fibnum;
+
+ VNET_LIST_RLOCK_NOSLEEP();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+
+ for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
+ rnh = rt_tables_get_rnh(fibnum, AF_INET);
+ arg.found = arg.killed = 0;
+ arg.rnh = rnh;
+ arg.nextstop = 0;
+ arg.draining = 1;
+ arg.updating = 0;
+ RADIX_NODE_HEAD_LOCK(rnh);
+ rnh->rnh_walktree(rnh, in_rtqkill, &arg);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ }
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK_NOSLEEP();
+}
+
+static int _in_rt_was_here;
+/*
+ * Initialize our routing tree.
+ */
+int
+in_inithead(void **head, int off)
+{
+ struct radix_node_head *rnh;
+
+ /* XXX MRT
+ * This can be called from vfs_export.c too in which case 'off'
+ * will be 0. We know the correct value so just use that and
+ * return directly if it was 0.
+ * This is a hack that replaces an even worse hack on a bad hack
+ * on a bad design. After RELENG_7 this should be fixed but that
+ * will change the ABI, so for now do it this way.
+ */
+ if (!rn_inithead(head, 32))
+ return 0;
+
+ if (off == 0) /* XXX MRT see above */
+ return 1; /* only do the rest for a real routing table */
+
+ rnh = *head;
+ rnh->rnh_addaddr = in_addroute;
+ rnh->rnh_matchaddr = in_matroute;
+ rnh->rnh_close = in_clsroute;
+ if (_in_rt_was_here == 0 ) {
+ callout_init(&V_rtq_timer, CALLOUT_MPSAFE);
+ callout_reset(&V_rtq_timer, 1, in_rtqtimo, curvnet);
+ _in_rt_was_here = 1;
+ }
+ return 1;
+}
+
+#ifdef VIMAGE
+int
+in_detachhead(void **head, int off)
+{
+
+ callout_drain(&V_rtq_timer);
+ return (1);
+}
+#endif
+
+/*
+ * This zaps old routes when the interface goes down or interface
+ * address is deleted. In the latter case, it deletes static routes
+ * that point to this address. If we don't do this, we may end up
+ * using the old address in the future. The ones we always want to
+ * get rid of are things like ARP entries, since the user might down
+ * the interface, walk over to a completely different network, and
+ * plug back in.
+ */
+struct in_ifadown_arg {
+ struct ifaddr *ifa;
+ int del;
+};
+
+static int
+in_ifadownkill(struct radix_node *rn, void *xap)
+{
+ struct in_ifadown_arg *ap = xap;
+ struct rtentry *rt = (struct rtentry *)rn;
+
+ RT_LOCK(rt);
+ if (rt->rt_ifa == ap->ifa &&
+ (ap->del || !(rt->rt_flags & RTF_STATIC))) {
+ /*
+ * Acquire a reference so that the entry can later be
+ * freed; the refcount may already be 0 here, at least
+ * in the ap->del case.
+ */
+ RT_ADDREF(rt);
+ /*
+ * Disconnect it from the tree and permit protocols
+ * to cleanup.
+ */
+ rtexpunge(rt);
+ /*
+ * At this point it is an rttrash node, and if the
+ * reference taken above is the only one left, we must
+ * free it; otherwise no one will hold a pointer to it
+ * and the rtentry will be leaked forever.
+ * If someone else still holds a reference we are fine,
+ * since we only drop our own reference. If that other
+ * entity merely calls RT_REMREF, the entry still leaks,
+ * but at least we tried.
+ */
+ RTFREE_LOCKED(rt);
+ return (0);
+ }
+ RT_UNLOCK(rt);
+ return 0;
+}
+
+int
+in_ifadown(struct ifaddr *ifa, int delete)
+{
+ struct in_ifadown_arg arg;
+ struct radix_node_head *rnh;
+ int fibnum;
+
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ return 1;
+
+ for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
+ rnh = rt_tables_get_rnh(fibnum, AF_INET);
+ arg.ifa = ifa;
+ arg.del = delete;
+ RADIX_NODE_HEAD_LOCK(rnh);
+ rnh->rnh_walktree(rnh, in_ifadownkill, &arg);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */
+ }
+ return 0;
+}
+
+/*
+ * inet versions of the rt functions. These take a FIB argument and,
+ * for now, simply call the _fib variants; eventually this order
+ * will be reversed.
+ */
+void
+in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum)
+{
+ rtalloc_ign_fib(ro, ignflags, fibnum);
+}
+
+int
+in_rtrequest( int req,
+ struct sockaddr *dst,
+ struct sockaddr *gateway,
+ struct sockaddr *netmask,
+ int flags,
+ struct rtentry **ret_nrt,
+ u_int fibnum)
+{
+ return (rtrequest_fib(req, dst, gateway, netmask,
+ flags, ret_nrt, fibnum));
+}
+
+struct rtentry *
+in_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum)
+{
+ return (rtalloc1_fib(dst, report, ignflags, fibnum));
+}
+
+void
+in_rtredirect(struct sockaddr *dst,
+ struct sockaddr *gateway,
+ struct sockaddr *netmask,
+ int flags,
+ struct sockaddr *src,
+ u_int fibnum)
+{
+ rtredirect_fib(dst, gateway, netmask, flags, src, fibnum);
+}
+
+void
+in_rtalloc(struct route *ro, u_int fibnum)
+{
+ rtalloc_ign_fib(ro, 0UL, fibnum);
+}
+
+#if 0
+int in_rt_getifa(struct rt_addrinfo *, u_int fibnum);
+int in_rtioctl(u_long, caddr_t, u_int);
+int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int);
+#endif
+
+
diff --git a/freebsd/sys/netinet/in_systm.h b/freebsd/sys/netinet/in_systm.h
new file mode 100644
index 00000000..68bb190e
--- /dev/null
+++ b/freebsd/sys/netinet/in_systm.h
@@ -0,0 +1,2 @@
+#include <freebsd/bsd.h>
+#include <freebsd/netinet/in_systm.h>
diff --git a/freebsd/sys/netinet/in_var.h b/freebsd/sys/netinet/in_var.h
new file mode 100644
index 00000000..c921ad31
--- /dev/null
+++ b/freebsd/sys/netinet/in_var.h
@@ -0,0 +1,475 @@
+/*-
+ * Copyright (c) 1985, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_var.h 8.2 (Berkeley) 1/9/95
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IN_VAR_HH_
+#define _NETINET_IN_VAR_HH_
+
+#include <freebsd/sys/queue.h>
+#include <freebsd/sys/fnv_hash.h>
+#include <freebsd/sys/tree.h>
+
+struct igmp_ifinfo;
+struct in_multi;
+struct lltable;
+
+/*
+ * IPv4 per-interface state.
+ */
+struct in_ifinfo {
+ struct lltable *ii_llt; /* ARP state */
+ struct igmp_ifinfo *ii_igmp; /* IGMP state */
+ struct in_multi *ii_allhosts; /* 224.0.0.1 membership */
+};
+
+/*
+ * Interface address, Internet version. One of these structures
+ * is allocated for each Internet address on an interface.
+ * The ifaddr structure contains the protocol-independent part
+ * of the structure and is assumed to be first.
+ */
+struct in_ifaddr {
+ struct ifaddr ia_ifa; /* protocol-independent info */
+#define ia_ifp ia_ifa.ifa_ifp
+#define ia_flags ia_ifa.ifa_flags
+ /* ia_{,sub}net{,mask} in host order */
+ u_long ia_net; /* network number of interface */
+ u_long ia_netmask; /* mask of net part */
+ u_long ia_subnet; /* subnet number, including net */
+ u_long ia_subnetmask; /* mask of subnet part */
+ struct in_addr ia_netbroadcast; /* to recognize net broadcasts */
+ LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */
+ TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */
+ struct sockaddr_in ia_addr; /* reserve space for interface name */
+ struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */
+#define ia_broadaddr ia_dstaddr
+ struct sockaddr_in ia_sockmask; /* reserve space for general netmask */
+};
+
+struct in_aliasreq {
+ char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */
+ struct sockaddr_in ifra_addr;
+ struct sockaddr_in ifra_broadaddr;
+#define ifra_dstaddr ifra_broadaddr
+ struct sockaddr_in ifra_mask;
+};
+/*
+ * Given a pointer to an in_ifaddr (ifaddr),
+ * return a pointer to the addr as a sockaddr_in.
+ */
+#define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr))
+#define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr))
+
+#define IN_LNAOF(in, ifa) \
+ (ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa))->ia_subnetmask)
+
+
+#ifdef _KERNEL
+extern u_char inetctlerrmap[];
+
+#define LLTABLE(ifp) \
+ ((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt
+/*
+ * Hash table for IP addresses.
+ */
+TAILQ_HEAD(in_ifaddrhead, in_ifaddr);
+LIST_HEAD(in_ifaddrhashhead, in_ifaddr);
+
+VNET_DECLARE(struct in_ifaddrhashhead *, in_ifaddrhashtbl);
+VNET_DECLARE(struct in_ifaddrhead, in_ifaddrhead);
+VNET_DECLARE(u_long, in_ifaddrhmask); /* mask for hash table */
+
+#define V_in_ifaddrhashtbl VNET(in_ifaddrhashtbl)
+#define V_in_ifaddrhead VNET(in_ifaddrhead)
+#define V_in_ifaddrhmask VNET(in_ifaddrhmask)
+
+#define INADDR_NHASH_LOG2 9
+#define INADDR_NHASH (1 << INADDR_NHASH_LOG2)
+#define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT)
+#define INADDR_HASH(x) \
+ (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask])
+
+extern struct rwlock in_ifaddr_lock;
+
+#define IN_IFADDR_LOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_LOCKED)
+#define IN_IFADDR_RLOCK() rw_rlock(&in_ifaddr_lock)
+#define IN_IFADDR_RLOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_RLOCKED)
+#define IN_IFADDR_RUNLOCK() rw_runlock(&in_ifaddr_lock)
+#define IN_IFADDR_WLOCK() rw_wlock(&in_ifaddr_lock)
+#define IN_IFADDR_WLOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_WLOCKED)
+#define IN_IFADDR_WUNLOCK() rw_wunlock(&in_ifaddr_lock)
+
+/*
+ * Macro for finding the internet address structure (in_ifaddr)
+ * corresponding to one of our IP addresses (in_addr).
+ */
+#define INADDR_TO_IFADDR(addr, ia) \
+ /* struct in_addr addr; */ \
+ /* struct in_ifaddr *ia; */ \
+do { \
+\
+ LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \
+ if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \
+ break; \
+} while (0)
+
+/*
+ * Macro for finding the interface (ifnet structure) corresponding to one
+ * of our IP addresses.
+ */
+#define INADDR_TO_IFP(addr, ifp) \
+ /* struct in_addr addr; */ \
+ /* struct ifnet *ifp; */ \
+{ \
+ struct in_ifaddr *ia; \
+\
+ INADDR_TO_IFADDR(addr, ia); \
+ (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \
+}
+
+/*
+ * Macro for finding the internet address structure (in_ifaddr) corresponding
+ * to a given interface (ifnet structure).
+ */
+#define IFP_TO_IA(ifp, ia) \
+ /* struct ifnet *ifp; */ \
+ /* struct in_ifaddr *ia; */ \
+{ \
+ for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \
+ (ia) != NULL && (ia)->ia_ifp != (ifp); \
+ (ia) = TAILQ_NEXT((ia), ia_link)) \
+ continue; \
+ if ((ia) != NULL) \
+ ifa_ref(&(ia)->ia_ifa); \
+}
+#endif
+
+/*
+ * IP datagram reassembly.
+ */
+#define IPREASS_NHASH_LOG2 6
+#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
+#define IPREASS_HMASK (IPREASS_NHASH - 1)
+#define IPREASS_HASH(x,y) \
+ (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
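
The bucket selection above mixes the low bits of the address with the IP id and masks into a power-of-two bucket count. A small standalone illustration of the same expression (the sample address and id are arbitrary):

#include <stdio.h>
#include <stdint.h>

/* Same definitions as above, repeated so this compiles on its own. */
#define IPREASS_NHASH_LOG2	6
#define IPREASS_NHASH		(1 << IPREASS_NHASH_LOG2)
#define IPREASS_HMASK		(IPREASS_NHASH - 1)
#define IPREASS_HASH(x, y) \
	(((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)

int
main(void)
{
	uint32_t src = 0xc0000201;	/* 192.0.2.1 in host order, for illustration */
	uint16_t ip_id = 0x1234;	/* arbitrary datagram id */

	printf("fragment bucket %u of %d\n",
	    (unsigned)IPREASS_HASH(src, ip_id), IPREASS_NHASH);
	return (0);
}
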
+
+/*
+ * Legacy IPv4 IGMP per-link structure.
+ */
+struct router_info {
+ struct ifnet *rti_ifp;
+ int rti_type; /* type of router which is querier on this interface */
+ int rti_time; /* # of slow timeouts since last old query */
+ SLIST_ENTRY(router_info) rti_list;
+};
+
+/*
+ * Per-interface IGMP router version information.
+ */
+struct igmp_ifinfo {
+ LIST_ENTRY(igmp_ifinfo) igi_link;
+ struct ifnet *igi_ifp; /* interface this instance belongs to */
+ uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */
+ uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */
+ uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */
+ uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/
+ uint32_t igi_flags; /* IGMP per-interface flags */
+ uint32_t igi_rv; /* IGMPv3 Robustness Variable */
+ uint32_t igi_qi; /* IGMPv3 Query Interval (s) */
+ uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */
+ uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */
+ SLIST_HEAD(,in_multi) igi_relinmhead; /* released groups */
+ struct ifqueue igi_gq; /* queue of general query responses */
+};
+
+#define IGIF_SILENT 0x00000001 /* Do not use IGMP on this ifp */
+#define IGIF_LOOPBACK 0x00000002 /* Send IGMP reports to loopback */
+
+/*
+ * IPv4 multicast IGMP-layer source entry.
+ */
+struct ip_msource {
+ RB_ENTRY(ip_msource) ims_link; /* RB tree links */
+ in_addr_t ims_haddr; /* host byte order */
+ struct ims_st {
+ uint16_t ex; /* # of exclusive members */
+ uint16_t in; /* # of inclusive members */
+ } ims_st[2]; /* state at t0, t1 */
+ uint8_t ims_stp; /* pending query */
+};
+
+/*
+ * IPv4 multicast PCB-layer source entry.
+ */
+struct in_msource {
+ RB_ENTRY(ip_msource) ims_link; /* RB tree links */
+ in_addr_t ims_haddr; /* host byte order */
+ uint8_t imsl_st[2]; /* state before/at commit */
+};
+
+RB_HEAD(ip_msource_tree, ip_msource); /* define struct ip_msource_tree */
+
+static __inline int
+ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b)
+{
+
+ if (a->ims_haddr < b->ims_haddr)
+ return (-1);
+ if (a->ims_haddr == b->ims_haddr)
+ return (0);
+ return (1);
+}
+RB_PROTOTYPE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp);
+
+/*
+ * IPv4 multicast PCB-layer group filter descriptor.
+ */
+struct in_mfilter {
+ struct ip_msource_tree imf_sources; /* source list for (S,G) */
+ u_long imf_nsrc; /* # of source entries */
+ uint8_t imf_st[2]; /* state before/at commit */
+};
+
+/*
+ * IPv4 group descriptor.
+ *
+ * For every entry on an ifnet's if_multiaddrs list which represents
+ * an IP multicast group, there is one of these structures.
+ *
+ * If any source filters are present, then a node will exist in the RB-tree
+ * to permit fast lookup by source whenever an operation takes place.
+ * This permits pre-order traversal when we issue reports.
+ * Source filter trees are kept separately from the socket layer to
+ * greatly simplify locking.
+ *
+ * When IGMPv3 is active, inm_timer is the response to group query timer.
+ * The state-change timer inm_sctimer is separate; whenever state changes
+ * for the group the state change record is generated and transmitted,
+ * and kept if retransmissions are necessary.
+ *
+ * FUTURE: inm_link is now only used when groups are being purged
+ * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but
+ * because it is at the very start of the struct, we can't do this
+ * w/o breaking the ABI for ifmcstat.
+ */
+struct in_multi {
+ LIST_ENTRY(in_multi) inm_link; /* to-be-released by in_ifdetach */
+ struct in_addr inm_addr; /* IP multicast address, convenience */
+ struct ifnet *inm_ifp; /* back pointer to ifnet */
+ struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */
+ u_int inm_timer; /* IGMPv1/v2 group / v3 query timer */
+ u_int inm_state; /* state of the membership */
+ void *inm_rti; /* unused, legacy field */
+ u_int inm_refcount; /* reference count */
+
+ /* New fields for IGMPv3 follow. */
+ struct igmp_ifinfo *inm_igi; /* IGMP info */
+ SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */
+ struct ip_msource_tree inm_srcs; /* tree of sources */
+ u_long inm_nsrc; /* # of tree entries */
+
+ struct ifqueue inm_scq; /* queue of pending
+ * state-change packets */
+ struct timeval inm_lastgsrtv; /* Time of last G-S-R query */
+ uint16_t inm_sctimer; /* state-change timer */
+ uint16_t inm_scrv; /* state-change rexmit count */
+
+ /*
+ * SSM state counters which track state at T0 (the time the last
+ * state-change report's RV timer went to zero) and T1
+ * (time of pending report, i.e. now).
+ * Used for computing IGMPv3 state-change reports. Several refcounts
+ * are maintained here to optimize for common use-cases.
+ */
+ struct inm_st {
+ uint16_t iss_fmode; /* IGMP filter mode */
+ uint16_t iss_asm; /* # of ASM listeners */
+ uint16_t iss_ex; /* # of exclusive members */
+ uint16_t iss_in; /* # of inclusive members */
+ uint16_t iss_rec; /* # of recorded sources */
+ } inm_st[2]; /* state at t0, t1 */
+};
+
+/*
+ * Helper function to derive the filter mode on a source entry
+ * from its internal counters. Predicates are:
+ * A source is only excluded if all listeners exclude it.
+ * A source is only included if no listeners exclude it,
+ * and at least one listener includes it.
+ * May be used by ifmcstat(8).
+ */
+static __inline uint8_t
+ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims,
+ uint8_t t)
+{
+
+ t = !!t;
+ if (inm->inm_st[t].iss_ex > 0 &&
+ inm->inm_st[t].iss_ex == ims->ims_st[t].ex)
+ return (MCAST_EXCLUDE);
+ else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0)
+ return (MCAST_INCLUDE);
+ return (MCAST_UNDEFINED);
+}
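
A simplified, self-contained illustration of the same predicates, using stand-in counters and mode values rather than the kernel structures and MCAST_* constants:

#include <stdio.h>
#include <stdint.h>

/*
 * group_ex mirrors inm_st[t].iss_ex and struct src_counts mirrors
 * ims_st[t]; the MODE_* values are illustrative only.
 */
enum { MODE_UNDEFINED, MODE_INCLUDE, MODE_EXCLUDE };

struct src_counts {
	uint16_t ex;	/* listeners excluding this source */
	uint16_t in;	/* listeners including this source */
};

static int
filter_mode(uint16_t group_ex, struct src_counts s)
{
	/* Excluded only if every excluding listener excludes this source. */
	if (group_ex > 0 && group_ex == s.ex)
		return (MODE_EXCLUDE);
	/* Included only if nobody excludes it and at least one includes it. */
	if (s.in > 0 && s.ex == 0)
		return (MODE_INCLUDE);
	return (MODE_UNDEFINED);
}

int
main(void)
{
	struct src_counts both = { 2, 0 };	/* both listeners exclude */
	struct src_counts one  = { 1, 0 };	/* only one of two excludes */
	struct src_counts incl = { 0, 1 };	/* one includes, none exclude */

	printf("%d %d %d\n", filter_mode(2, both), filter_mode(2, one),
	    filter_mode(2, incl));		/* prints: 2 0 1 */
	return (0);
}
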
+
+#ifdef _KERNEL
+
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_DECL(_net_inet_raw);
+#endif
+
+/*
+ * Lock macros for IPv4 layer multicast address lists. IPv4 lock goes
+ * before link layer multicast locks in the lock order. In most cases,
+ * consumers of IN_*_MULTI() macros should acquire the locks before
+ * calling them; users of the in_{add,del}multi() functions should not.
+ */
+extern struct mtx in_multi_mtx;
+#define IN_MULTI_LOCK() mtx_lock(&in_multi_mtx)
+#define IN_MULTI_UNLOCK() mtx_unlock(&in_multi_mtx)
+#define IN_MULTI_LOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_OWNED)
+#define IN_MULTI_UNLOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_NOTOWNED)
+
+/*
+ * Function for looking up an in_multi record for an IPv4 multicast address
+ * on a given interface. ifp must be valid. If no record found, return NULL.
+ * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held.
+ */
+static __inline struct in_multi *
+inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina)
+{
+ struct ifmultiaddr *ifma;
+ struct in_multi *inm;
+
+ IN_MULTI_LOCK_ASSERT();
+ IF_ADDR_LOCK_ASSERT(ifp);
+
+ inm = NULL;
+ TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) {
+ if (ifma->ifma_addr->sa_family == AF_INET) {
+ inm = (struct in_multi *)ifma->ifma_protospec;
+ if (inm->inm_addr.s_addr == ina.s_addr)
+ break;
+ inm = NULL;
+ }
+ }
+ return (inm);
+}
+
+/*
+ * Wrapper for inm_lookup_locked().
+ * The IF_ADDR_LOCK will be taken on ifp and released on return.
+ */
+static __inline struct in_multi *
+inm_lookup(struct ifnet *ifp, const struct in_addr ina)
+{
+ struct in_multi *inm;
+
+ IN_MULTI_LOCK_ASSERT();
+ IF_ADDR_LOCK(ifp);
+ inm = inm_lookup_locked(ifp, ina);
+ IF_ADDR_UNLOCK(ifp);
+
+ return (inm);
+}
+
+/* Acquire an in_multi record. */
+static __inline void
+inm_acquire_locked(struct in_multi *inm)
+{
+
+ IN_MULTI_LOCK_ASSERT();
+ ++inm->inm_refcount;
+}
+
+/*
+ * Return values for imo_multi_filter().
+ */
+#define MCAST_PASS 0 /* Pass */
+#define MCAST_NOTGMEMBER 1 /* This host not a member of group */
+#define MCAST_NOTSMEMBER 2 /* This host excluded source */
+#define MCAST_MUTED 3 /* [deprecated] */
+
+struct rtentry;
+struct route;
+struct ip_moptions;
+
+int imo_multi_filter(const struct ip_moptions *, const struct ifnet *,
+ const struct sockaddr *, const struct sockaddr *);
+void inm_commit(struct in_multi *);
+void inm_clear_recorded(struct in_multi *);
+void inm_print(const struct in_multi *);
+int inm_record_source(struct in_multi *inm, const in_addr_t);
+void inm_release(struct in_multi *);
+void inm_release_locked(struct in_multi *);
+struct in_multi *
+ in_addmulti(struct in_addr *, struct ifnet *);
+void in_delmulti(struct in_multi *);
+int in_joingroup(struct ifnet *, const struct in_addr *,
+ /*const*/ struct in_mfilter *, struct in_multi **);
+int in_joingroup_locked(struct ifnet *, const struct in_addr *,
+ /*const*/ struct in_mfilter *, struct in_multi **);
+int in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *);
+int in_leavegroup_locked(struct in_multi *,
+ /*const*/ struct in_mfilter *);
+int in_control(struct socket *, u_long, caddr_t, struct ifnet *,
+ struct thread *);
+void in_rtqdrain(void);
+void ip_input(struct mbuf *);
+int in_ifadown(struct ifaddr *ifa, int);
+void in_ifscrub(struct ifnet *, struct in_ifaddr *);
+struct mbuf *ip_fastforward(struct mbuf *);
+void *in_domifattach(struct ifnet *);
+void in_domifdetach(struct ifnet *, void *);
+
+
+/* XXX */
+void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum);
+void in_rtalloc(struct route *ro, u_int fibnum);
+struct rtentry *in_rtalloc1(struct sockaddr *, int, u_long, u_int);
+void in_rtredirect(struct sockaddr *, struct sockaddr *,
+ struct sockaddr *, int, struct sockaddr *, u_int);
+int in_rtrequest(int, struct sockaddr *,
+ struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int);
+
+#if 0
+int in_rt_getifa(struct rt_addrinfo *, u_int fibnum);
+int in_rtioctl(u_long, caddr_t, u_int);
+int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int);
+#endif
+#endif /* _KERNEL */
+
+/* INET6 stuff */
+#include <freebsd/netinet6/in6_var.h>
+
+#endif /* _NETINET_IN_VAR_HH_ */
diff --git a/freebsd/sys/netinet/ip.h b/freebsd/sys/netinet/ip.h
new file mode 100644
index 00000000..9d5d8a9c
--- /dev/null
+++ b/freebsd/sys/netinet/ip.h
@@ -0,0 +1,2 @@
+#include <freebsd/bsd.h>
+#include <freebsd/netinet/ip.h>
diff --git a/freebsd/sys/netinet/ip6.h b/freebsd/sys/netinet/ip6.h
new file mode 100644
index 00000000..f30da6d1
--- /dev/null
+++ b/freebsd/sys/netinet/ip6.h
@@ -0,0 +1,2 @@
+#include <freebsd/bsd.h>
+#include <freebsd/netinet/ip6.h>
diff --git a/freebsd/sys/netinet/ip_carp.c b/freebsd/sys/netinet/ip_carp.c
new file mode 100644
index 00000000..25b20895
--- /dev/null
+++ b/freebsd/sys/netinet/ip_carp.c
@@ -0,0 +1,2427 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*
+ * Copyright (c) 2002 Michael Shalayeff. All rights reserved.
+ * Copyright (c) 2003 Ryan McBride. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_bpf.h>
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+
+#include <freebsd/sys/types.h>
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/conf.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/limits.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/time.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/sys/signalvar.h>
+#include <freebsd/sys/filio.h>
+#include <freebsd/sys/sockio.h>
+
+#include <freebsd/sys/socket.h>
+#ifndef __rtems__
+#include <freebsd/sys/vnode.h>
+#endif
+
+#include <freebsd/machine/stdarg.h>
+
+#include <freebsd/net/bpf.h>
+#include <freebsd/net/ethernet.h>
+#include <freebsd/net/fddi.h>
+#include <freebsd/net/iso88025.h>
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_clone.h>
+#include <freebsd/net/if_dl.h>
+#include <freebsd/net/if_types.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#ifdef INET
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/if_ether.h>
+#include <freebsd/machine/in_cksum.h>
+#endif
+
+#ifdef INET6
+#include <freebsd/netinet/icmp6.h>
+#include <freebsd/netinet/ip6.h>
+#include <freebsd/netinet6/ip6protosw.h>
+#include <freebsd/netinet6/ip6_var.h>
+#include <freebsd/netinet6/scope6_var.h>
+#include <freebsd/netinet6/nd6.h>
+#endif
+
+#include <freebsd/crypto/sha1.h>
+#include <freebsd/netinet/ip_carp.h>
+
+#define CARP_IFNAME "carp"
+static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces");
+SYSCTL_DECL(_net_inet_carp);
+
+struct carp_softc {
+ struct ifnet *sc_ifp; /* Interface clue */
+ struct ifnet *sc_carpdev; /* Pointer to parent interface */
+ struct in_ifaddr *sc_ia; /* primary iface address */
+ struct ip_moptions sc_imo;
+#ifdef INET6
+ struct in6_ifaddr *sc_ia6; /* primary iface address v6 */
+ struct ip6_moptions sc_im6o;
+#endif /* INET6 */
+ TAILQ_ENTRY(carp_softc) sc_list;
+
+ enum { INIT = 0, BACKUP, MASTER } sc_state;
+
+ int sc_flags_backup;
+ int sc_suppress;
+
+ int sc_sendad_errors;
+#define CARP_SENDAD_MAX_ERRORS 3
+ int sc_sendad_success;
+#define CARP_SENDAD_MIN_SUCCESS 3
+
+ int sc_vhid;
+ int sc_advskew;
+ int sc_naddrs;
+ int sc_naddrs6;
+ int sc_advbase; /* seconds */
+ int sc_init_counter;
+ u_int64_t sc_counter;
+
+ /* authentication */
+#define CARP_HMAC_PAD 64
+ unsigned char sc_key[CARP_KEY_LEN];
+ unsigned char sc_pad[CARP_HMAC_PAD];
+ SHA1_CTX sc_sha1;
+
+ struct callout sc_ad_tmo; /* advertisement timeout */
+ struct callout sc_md_tmo; /* master down timeout */
+ struct callout sc_md6_tmo; /* master down timeout */
+
+ LIST_ENTRY(carp_softc) sc_next; /* Interface clue */
+};
+#define SC2IFP(sc) ((sc)->sc_ifp)
+
+int carp_suppress_preempt = 0;
+int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 }; /* XXX for now */
+SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP");
+SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW,
+ &carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets");
+SYSCTL_INT(_net_inet_carp, CARPCTL_PREEMPT, preempt, CTLFLAG_RW,
+ &carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode");
+SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW,
+ &carp_opts[CARPCTL_LOG], 0, "log bad carp packets");
+SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW,
+ &carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses");
+SYSCTL_INT(_net_inet_carp, OID_AUTO, suppress_preempt, CTLFLAG_RD,
+ &carp_suppress_preempt, 0, "Preemption is suppressed");
+
+struct carpstats carpstats;
+SYSCTL_STRUCT(_net_inet_carp, CARPCTL_STATS, stats, CTLFLAG_RW,
+ &carpstats, carpstats,
+ "CARP statistics (struct carpstats, netinet/ip_carp.h)");
+
+struct carp_if {
+ TAILQ_HEAD(, carp_softc) vhif_vrs;
+ int vhif_nvrs;
+
+ struct ifnet *vhif_ifp;
+ struct mtx vhif_mtx;
+};
+
+#define CARP_INET 0
+#define CARP_INET6 1
+static int proto_reg[] = {-1, -1};
+
+/* Get carp_if from softc. Valid after carp_set_addr{,6}. */
+#define SC2CIF(sc) ((struct carp_if *)(sc)->sc_carpdev->if_carp)
+
+/* lock per carp_if queue */
+#define CARP_LOCK_INIT(cif) mtx_init(&(cif)->vhif_mtx, "carp_if", \
+ NULL, MTX_DEF)
+#define CARP_LOCK_DESTROY(cif) mtx_destroy(&(cif)->vhif_mtx)
+#define CARP_LOCK_ASSERT(cif) mtx_assert(&(cif)->vhif_mtx, MA_OWNED)
+#define CARP_LOCK(cif) mtx_lock(&(cif)->vhif_mtx)
+#define CARP_UNLOCK(cif) mtx_unlock(&(cif)->vhif_mtx)
+
+#define CARP_SCLOCK(sc) mtx_lock(&SC2CIF(sc)->vhif_mtx)
+#define CARP_SCUNLOCK(sc) mtx_unlock(&SC2CIF(sc)->vhif_mtx)
+#define CARP_SCLOCK_ASSERT(sc) mtx_assert(&SC2CIF(sc)->vhif_mtx, MA_OWNED)
+
+#define CARP_LOG(...) do { \
+ if (carp_opts[CARPCTL_LOG] > 0) \
+ log(LOG_INFO, __VA_ARGS__); \
+} while (0)
+
+#define CARP_DEBUG(...) do { \
+ if (carp_opts[CARPCTL_LOG] > 1) \
+ log(LOG_DEBUG, __VA_ARGS__); \
+} while (0)
+
+static void carp_hmac_prepare(struct carp_softc *);
+static void carp_hmac_generate(struct carp_softc *, u_int32_t *,
+ unsigned char *);
+static int carp_hmac_verify(struct carp_softc *, u_int32_t *,
+ unsigned char *);
+static void carp_setroute(struct carp_softc *, int);
+static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
+static int carp_clone_create(struct if_clone *, int, caddr_t);
+static void carp_clone_destroy(struct ifnet *);
+static void carpdetach(struct carp_softc *, int);
+static int carp_prepare_ad(struct mbuf *, struct carp_softc *,
+ struct carp_header *);
+static void carp_send_ad_all(void);
+static void carp_send_ad(void *);
+static void carp_send_ad_locked(struct carp_softc *);
+static void carp_send_arp(struct carp_softc *);
+static void carp_master_down(void *);
+static void carp_master_down_locked(struct carp_softc *);
+static int carp_ioctl(struct ifnet *, u_long, caddr_t);
+static int carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *,
+ struct route *);
+static void carp_start(struct ifnet *);
+static void carp_setrun(struct carp_softc *, sa_family_t);
+static void carp_set_state(struct carp_softc *, int);
+static int carp_addrcount(struct carp_if *, struct in_ifaddr *, int);
+enum { CARP_COUNT_MASTER, CARP_COUNT_RUNNING };
+
+static void carp_multicast_cleanup(struct carp_softc *);
+static int carp_set_addr(struct carp_softc *, struct sockaddr_in *);
+static int carp_del_addr(struct carp_softc *, struct sockaddr_in *);
+static void carp_carpdev_state_locked(struct carp_if *);
+static void carp_sc_state_locked(struct carp_softc *);
+#ifdef INET6
+static void carp_send_na(struct carp_softc *);
+static int carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *);
+static int carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *);
+static void carp_multicast6_cleanup(struct carp_softc *);
+#endif
+
+static LIST_HEAD(, carp_softc) carpif_list;
+static struct mtx carp_mtx;
+IFC_SIMPLE_DECLARE(carp, 0);
+
+static eventhandler_tag if_detach_event_tag;
+
+static __inline u_int16_t
+carp_cksum(struct mbuf *m, int len)
+{
+ return (in_cksum(m, len));
+}
+
+static void
+carp_hmac_prepare(struct carp_softc *sc)
+{
+ u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
+ u_int8_t vhid = sc->sc_vhid & 0xff;
+ struct ifaddr *ifa;
+ int i, found;
+#ifdef INET
+ struct in_addr last, cur, in;
+#endif
+#ifdef INET6
+ struct in6_addr last6, cur6, in6;
+#endif
+
+ if (sc->sc_carpdev)
+ CARP_SCLOCK(sc);
+
+ /* XXX: possible race here */
+
+ /* compute ipad from key */
+ bzero(sc->sc_pad, sizeof(sc->sc_pad));
+ bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
+ for (i = 0; i < sizeof(sc->sc_pad); i++)
+ sc->sc_pad[i] ^= 0x36;
+
+ /* precompute first part of inner hash */
+ SHA1Init(&sc->sc_sha1);
+ SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
+ SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
+ SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
+ SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
+#ifdef INET
+ cur.s_addr = 0;
+ do {
+ found = 0;
+ last = cur;
+ cur.s_addr = 0xffffffff;
+ IF_ADDR_LOCK(SC2IFP(sc));
+ TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
+ in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
+ if (ifa->ifa_addr->sa_family == AF_INET &&
+ ntohl(in.s_addr) > ntohl(last.s_addr) &&
+ ntohl(in.s_addr) < ntohl(cur.s_addr)) {
+ cur.s_addr = in.s_addr;
+ found++;
+ }
+ }
+ IF_ADDR_UNLOCK(SC2IFP(sc));
+ if (found)
+ SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
+ } while (found);
+#endif /* INET */
+#ifdef INET6
+ memset(&cur6, 0, sizeof(cur6));
+ do {
+ found = 0;
+ last6 = cur6;
+ memset(&cur6, 0xff, sizeof(cur6));
+ IF_ADDR_LOCK(SC2IFP(sc));
+ TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
+ in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
+ if (IN6_IS_SCOPE_EMBED(&in6))
+ in6.s6_addr16[1] = 0;
+ if (ifa->ifa_addr->sa_family == AF_INET6 &&
+ memcmp(&in6, &last6, sizeof(in6)) > 0 &&
+ memcmp(&in6, &cur6, sizeof(in6)) < 0) {
+ cur6 = in6;
+ found++;
+ }
+ }
+ IF_ADDR_UNLOCK(SC2IFP(sc));
+ if (found)
+ SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
+ } while (found);
+#endif /* INET6 */
+
+ /* convert ipad to opad */
+ for (i = 0; i < sizeof(sc->sc_pad); i++)
+ sc->sc_pad[i] ^= 0x36 ^ 0x5c;
+
+ if (sc->sc_carpdev)
+ CARP_SCUNLOCK(sc);
+}
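
The `^= 0x36 ^ 0x5c` step above relies on XOR cancellation to turn the inner pad into the outer pad in place. A one-liner sketch of that identity (the key byte is chosen arbitrarily):

#include <assert.h>
#include <stdio.h>

int
main(void)
{
	unsigned char key_byte = 0xab;			/* arbitrary key byte */
	unsigned char ipad = key_byte ^ 0x36;		/* inner pad, as built above */
	unsigned char opad = ipad ^ 0x36 ^ 0x5c;	/* "convert ipad to opad" */

	/* The in-place conversion yields the standard HMAC outer pad. */
	assert(opad == (key_byte ^ 0x5c));
	printf("ipad=%#x opad=%#x\n", ipad, opad);
	return (0);
}
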
+
+static void
+carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2],
+ unsigned char md[20])
+{
+ SHA1_CTX sha1ctx;
+
+ /* fetch first half of inner hash */
+ bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));
+
+ SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter));
+ SHA1Final(md, &sha1ctx);
+
+ /* outer hash */
+ SHA1Init(&sha1ctx);
+ SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad));
+ SHA1Update(&sha1ctx, md, 20);
+ SHA1Final(md, &sha1ctx);
+}
+
+static int
+carp_hmac_verify(struct carp_softc *sc, u_int32_t counter[2],
+ unsigned char md[20])
+{
+ unsigned char md2[20];
+
+ CARP_SCLOCK_ASSERT(sc);
+
+ carp_hmac_generate(sc, counter, md2);
+
+ return (bcmp(md, md2, sizeof(md2)));
+}
+
+static void
+carp_setroute(struct carp_softc *sc, int cmd)
+{
+ struct ifaddr *ifa;
+ int s;
+
+ if (sc->sc_carpdev)
+ CARP_SCLOCK_ASSERT(sc);
+
+ s = splnet();
+ TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
+ if (ifa->ifa_addr->sa_family == AF_INET &&
+ sc->sc_carpdev != NULL) {
+ int count = carp_addrcount(
+ (struct carp_if *)sc->sc_carpdev->if_carp,
+ ifatoia(ifa), CARP_COUNT_MASTER);
+
+ if ((cmd == RTM_ADD && count == 1) ||
+ (cmd == RTM_DELETE && count == 0))
+ rtinit(ifa, cmd, RTF_UP | RTF_HOST);
+ }
+ }
+ splx(s);
+}
+
+static int
+carp_clone_create(struct if_clone *ifc, int unit, caddr_t params)
+{
+
+ struct carp_softc *sc;
+ struct ifnet *ifp;
+
+ sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO);
+ ifp = SC2IFP(sc) = if_alloc(IFT_ETHER);
+ if (ifp == NULL) {
+ free(sc, M_CARP);
+ return (ENOSPC);
+ }
+
+ sc->sc_flags_backup = 0;
+ sc->sc_suppress = 0;
+ sc->sc_advbase = CARP_DFLTINTV;
+ sc->sc_vhid = -1; /* required setting */
+ sc->sc_advskew = 0;
+ sc->sc_init_counter = 1;
+ sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? */
+ sc->sc_imo.imo_membership = (struct in_multi **)malloc(
+ (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
+ M_WAITOK);
+ sc->sc_imo.imo_mfilters = NULL;
+ sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
+ sc->sc_imo.imo_multicast_vif = -1;
+#ifdef INET6
+ sc->sc_im6o.im6o_membership = (struct in6_multi **)malloc(
+ (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP,
+ M_WAITOK);
+ sc->sc_im6o.im6o_mfilters = NULL;
+ sc->sc_im6o.im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
+ sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL;
+#endif
+
+ callout_init(&sc->sc_ad_tmo, CALLOUT_MPSAFE);
+ callout_init(&sc->sc_md_tmo, CALLOUT_MPSAFE);
+ callout_init(&sc->sc_md6_tmo, CALLOUT_MPSAFE);
+
+ ifp->if_softc = sc;
+ if_initname(ifp, CARP_IFNAME, unit);
+ ifp->if_mtu = ETHERMTU;
+ ifp->if_flags = IFF_LOOPBACK;
+ ifp->if_ioctl = carp_ioctl;
+ ifp->if_output = carp_looutput;
+ ifp->if_start = carp_start;
+ ifp->if_type = IFT_CARP;
+ ifp->if_snd.ifq_maxlen = ifqmaxlen;
+ ifp->if_hdrlen = 0;
+ if_attach(ifp);
+ bpfattach(SC2IFP(sc), DLT_NULL, sizeof(u_int32_t));
+ mtx_lock(&carp_mtx);
+ LIST_INSERT_HEAD(&carpif_list, sc, sc_next);
+ mtx_unlock(&carp_mtx);
+ return (0);
+}
+
+static void
+carp_clone_destroy(struct ifnet *ifp)
+{
+ struct carp_softc *sc = ifp->if_softc;
+
+ if (sc->sc_carpdev)
+ CARP_SCLOCK(sc);
+ carpdetach(sc, 1); /* Returns unlocked. */
+
+ mtx_lock(&carp_mtx);
+ LIST_REMOVE(sc, sc_next);
+ mtx_unlock(&carp_mtx);
+ bpfdetach(ifp);
+ if_detach(ifp);
+ if_free_type(ifp, IFT_ETHER);
+ free(sc->sc_imo.imo_membership, M_CARP);
+#ifdef INET6
+ free(sc->sc_im6o.im6o_membership, M_CARP);
+#endif
+ free(sc, M_CARP);
+}
+
+/*
+ * This function can be called on CARP interface destroy path,
+ * and in case of the removal of the underlying interface as
+ * well. We differentiate these two cases. In the latter case
+ * we do not cleanup our multicast memberships, since they
+ * are already freed. Also, in the latter case we do not
+ * release the lock on return, because the function will be
+ * called once more, for another CARP instance on the same
+ * interface.
+ */
+static void
+carpdetach(struct carp_softc *sc, int unlock)
+{
+ struct carp_if *cif;
+
+ callout_stop(&sc->sc_ad_tmo);
+ callout_stop(&sc->sc_md_tmo);
+ callout_stop(&sc->sc_md6_tmo);
+
+ if (sc->sc_suppress)
+ carp_suppress_preempt--;
+ sc->sc_suppress = 0;
+
+ if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS)
+ carp_suppress_preempt--;
+ sc->sc_sendad_errors = 0;
+
+ carp_set_state(sc, INIT);
+ SC2IFP(sc)->if_flags &= ~IFF_UP;
+ carp_setrun(sc, 0);
+ if (unlock)
+ carp_multicast_cleanup(sc);
+#ifdef INET6
+ carp_multicast6_cleanup(sc);
+#endif
+
+ if (sc->sc_carpdev != NULL) {
+ cif = (struct carp_if *)sc->sc_carpdev->if_carp;
+ CARP_LOCK_ASSERT(cif);
+ TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
+ if (!--cif->vhif_nvrs) {
+ ifpromisc(sc->sc_carpdev, 0);
+ sc->sc_carpdev->if_carp = NULL;
+ CARP_LOCK_DESTROY(cif);
+ free(cif, M_CARP);
+ } else if (unlock)
+ CARP_UNLOCK(cif);
+ sc->sc_carpdev = NULL;
+ }
+}
+
+/* Detach an interface from the carp. */
+static void
+carp_ifdetach(void *arg __unused, struct ifnet *ifp)
+{
+ struct carp_if *cif = (struct carp_if *)ifp->if_carp;
+ struct carp_softc *sc, *nextsc;
+
+ if (cif == NULL)
+ return;
+
+ /*
+ * XXX: At the end of for() cycle the lock will be destroyed.
+ */
+ CARP_LOCK(cif);
+ for (sc = TAILQ_FIRST(&cif->vhif_vrs); sc; sc = nextsc) {
+ nextsc = TAILQ_NEXT(sc, sc_list);
+ carpdetach(sc, 0);
+ }
+}
+
+/*
+ * Process an input packet.
+ * The order of the checks has been rearranged compared to the RFC;
+ * it is either more efficient this way or not possible otherwise.
+ */
+void
+carp_input(struct mbuf *m, int hlen)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ struct carp_header *ch;
+ int iplen, len;
+
+ CARPSTATS_INC(carps_ipackets);
+
+ if (!carp_opts[CARPCTL_ALLOW]) {
+ m_freem(m);
+ return;
+ }
+
+ /* check if received on a valid carp interface */
+ if (m->m_pkthdr.rcvif->if_carp == NULL) {
+ CARPSTATS_INC(carps_badif);
+ CARP_DEBUG("carp_input: packet received on non-carp "
+ "interface: %s\n",
+ m->m_pkthdr.rcvif->if_xname);
+ m_freem(m);
+ return;
+ }
+
+ /* verify that the IP TTL is 255. */
+ if (ip->ip_ttl != CARP_DFLTTL) {
+ CARPSTATS_INC(carps_badttl);
+ CARP_DEBUG("carp_input: received ttl %d != 255 on %s\n",
+ ip->ip_ttl,
+ m->m_pkthdr.rcvif->if_xname);
+ m_freem(m);
+ return;
+ }
+
+ iplen = ip->ip_hl << 2;
+
+ if (m->m_pkthdr.len < iplen + sizeof(*ch)) {
+ CARPSTATS_INC(carps_badlen);
+ CARP_DEBUG("carp_input: received len %zd < "
+ "sizeof(struct carp_header) on %s\n",
+ m->m_len - sizeof(struct ip),
+ m->m_pkthdr.rcvif->if_xname);
+ m_freem(m);
+ return;
+ }
+
+ if (iplen + sizeof(*ch) < m->m_len) {
+ if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) {
+ CARPSTATS_INC(carps_hdrops);
+ CARP_DEBUG("carp_input: pullup failed\n");
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ }
+ ch = (struct carp_header *)((char *)ip + iplen);
+
+ /*
+ * Verify that the received packet is long enough
+ * to hold a complete CARP header.
+ */
+ len = iplen + sizeof(*ch);
+ if (len > m->m_pkthdr.len) {
+ CARPSTATS_INC(carps_badlen);
+ CARP_DEBUG("carp_input: packet too short %d on %s\n",
+ m->m_pkthdr.len,
+ m->m_pkthdr.rcvif->if_xname);
+ m_freem(m);
+ return;
+ }
+
+ if ((m = m_pullup(m, len)) == NULL) {
+ CARPSTATS_INC(carps_hdrops);
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ ch = (struct carp_header *)((char *)ip + iplen);
+
+ /* verify the CARP checksum */
+ m->m_data += iplen;
+ if (carp_cksum(m, len - iplen)) {
+ CARPSTATS_INC(carps_badsum);
+ CARP_DEBUG("carp_input: checksum failed on %s\n",
+ m->m_pkthdr.rcvif->if_xname);
+ m_freem(m);
+ return;
+ }
+ m->m_data -= iplen;
+
+ carp_input_c(m, ch, AF_INET);
+}
+
+#ifdef INET6
+int
+carp6_input(struct mbuf **mp, int *offp, int proto)
+{
+ struct mbuf *m = *mp;
+ struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
+ struct carp_header *ch;
+ u_int len;
+
+ CARPSTATS_INC(carps_ipackets6);
+
+ if (!carp_opts[CARPCTL_ALLOW]) {
+ m_freem(m);
+ return (IPPROTO_DONE);
+ }
+
+ /* check if received on a valid carp interface */
+ if (m->m_pkthdr.rcvif->if_carp == NULL) {
+ CARPSTATS_INC(carps_badif);
+ CARP_DEBUG("carp6_input: packet received on non-carp "
+ "interface: %s\n",
+ m->m_pkthdr.rcvif->if_xname);
+ m_freem(m);
+ return (IPPROTO_DONE);
+ }
+
+ /* verify that the IP TTL is 255 */
+ if (ip6->ip6_hlim != CARP_DFLTTL) {
+ CARPSTATS_INC(carps_badttl);
+ CARP_DEBUG("carp6_input: received ttl %d != 255 on %s\n",
+ ip6->ip6_hlim,
+ m->m_pkthdr.rcvif->if_xname);
+ m_freem(m);
+ return (IPPROTO_DONE);
+ }
+
+ /* verify that we have a complete carp packet */
+ len = m->m_len;
+ IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
+ if (ch == NULL) {
+ CARPSTATS_INC(carps_badlen);
+ CARP_DEBUG("carp6_input: packet size %u too small\n", len);
+ return (IPPROTO_DONE);
+ }
+
+
+ /* verify the CARP checksum */
+ m->m_data += *offp;
+ if (carp_cksum(m, sizeof(*ch))) {
+ CARPSTATS_INC(carps_badsum);
+ CARP_DEBUG("carp6_input: checksum failed, on %s\n",
+ m->m_pkthdr.rcvif->if_xname);
+ m_freem(m);
+ return (IPPROTO_DONE);
+ }
+ m->m_data -= *offp;
+
+ carp_input_c(m, ch, AF_INET6);
+ return (IPPROTO_DONE);
+}
+#endif /* INET6 */
+
+static void
+carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
+{
+ struct ifnet *ifp = m->m_pkthdr.rcvif;
+ struct carp_softc *sc;
+ u_int64_t tmp_counter;
+ struct timeval sc_tv, ch_tv;
+
+ /* verify that the VHID is valid on the receiving interface */
+ CARP_LOCK(ifp->if_carp);
+ TAILQ_FOREACH(sc, &((struct carp_if *)ifp->if_carp)->vhif_vrs, sc_list)
+ if (sc->sc_vhid == ch->carp_vhid)
+ break;
+
+ if (!sc || !((SC2IFP(sc)->if_flags & IFF_UP) &&
+ (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) {
+ CARPSTATS_INC(carps_badvhid);
+ CARP_UNLOCK(ifp->if_carp);
+ m_freem(m);
+ return;
+ }
+
+ getmicrotime(&SC2IFP(sc)->if_lastchange);
+ SC2IFP(sc)->if_ipackets++;
+ SC2IFP(sc)->if_ibytes += m->m_pkthdr.len;
+
+ if (bpf_peers_present(SC2IFP(sc)->if_bpf)) {
+ struct ip *ip = mtod(m, struct ip *);
+ uint32_t af1 = af;
+
+ /* BPF wants net byte order */
+ ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2));
+ ip->ip_off = htons(ip->ip_off);
+ bpf_mtap2(SC2IFP(sc)->if_bpf, &af1, sizeof(af1), m);
+ }
+
+ /* verify the CARP version. */
+ if (ch->carp_version != CARP_VERSION) {
+ CARPSTATS_INC(carps_badver);
+ SC2IFP(sc)->if_ierrors++;
+ CARP_UNLOCK(ifp->if_carp);
+ CARP_DEBUG("%s; invalid version %d\n",
+ SC2IFP(sc)->if_xname,
+ ch->carp_version);
+ m_freem(m);
+ return;
+ }
+
+ /* verify the hash */
+ if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
+ CARPSTATS_INC(carps_badauth);
+ SC2IFP(sc)->if_ierrors++;
+ CARP_UNLOCK(ifp->if_carp);
+ CARP_DEBUG("%s: incorrect hash\n", SC2IFP(sc)->if_xname);
+ m_freem(m);
+ return;
+ }
+
+ tmp_counter = ntohl(ch->carp_counter[0]);
+ tmp_counter = tmp_counter<<32;
+ tmp_counter += ntohl(ch->carp_counter[1]);
+
+ /* XXX Replay protection goes here */
+
+ sc->sc_init_counter = 0;
+ sc->sc_counter = tmp_counter;
+
+ sc_tv.tv_sec = sc->sc_advbase;
+ if (carp_suppress_preempt && sc->sc_advskew < 240)
+ sc_tv.tv_usec = 240 * 1000000 / 256;
+ else
+ sc_tv.tv_usec = sc->sc_advskew * 1000000 / 256;
+ ch_tv.tv_sec = ch->carp_advbase;
+ ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
+
+ switch (sc->sc_state) {
+ case INIT:
+ break;
+ case MASTER:
+ /*
+ * If we receive an advertisement from a master who's going to
+ * be more frequent than us, go into BACKUP state.
+ */
+ if (timevalcmp(&sc_tv, &ch_tv, >) ||
+ timevalcmp(&sc_tv, &ch_tv, ==)) {
+ callout_stop(&sc->sc_ad_tmo);
+ CARP_LOG("%s: MASTER -> BACKUP "
+ "(more frequent advertisement received)\n",
+ SC2IFP(sc)->if_xname);
+ carp_set_state(sc, BACKUP);
+ carp_setrun(sc, 0);
+ carp_setroute(sc, RTM_DELETE);
+ }
+ break;
+ case BACKUP:
+ /*
+ * If we're pre-empting masters who advertise slower than us,
+ * and this one claims to be slower, treat him as down.
+ */
+ if (carp_opts[CARPCTL_PREEMPT] &&
+ timevalcmp(&sc_tv, &ch_tv, <)) {
+ CARP_LOG("%s: BACKUP -> MASTER "
+ "(preempting a slower master)\n",
+ SC2IFP(sc)->if_xname);
+ carp_master_down_locked(sc);
+ break;
+ }
+
+ /*
+ * If the master is going to advertise at such a low frequency
+ * that he is guaranteed to time out, we might as well just
+ * treat him as timed out now.
+ */
+ sc_tv.tv_sec = sc->sc_advbase * 3;
+ if (timevalcmp(&sc_tv, &ch_tv, <)) {
+ CARP_LOG("%s: BACKUP -> MASTER "
+ "(master timed out)\n",
+ SC2IFP(sc)->if_xname);
+ carp_master_down_locked(sc);
+ break;
+ }
+
+ /*
+ * Otherwise, we reset the counter and wait for the next
+ * advertisement.
+ */
+ carp_setrun(sc, af);
+ break;
+ }
+
+ CARP_UNLOCK(ifp->if_carp);
+
+ m_freem(m);
+ return;
+}
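
The sc_tv/ch_tv comparison above boils down to comparing effective advertisement intervals of advbase seconds plus advskew/256 of a second. A standalone sketch of that arithmetic (the advbase/advskew values are made up):

#include <stdio.h>

/*
 * Effective advertisement interval in microseconds, mirroring the
 * sc_tv/ch_tv computation above.
 */
static long long
adv_interval_us(int advbase, int advskew)
{
	return ((long long)advbase * 1000000 + advskew * 1000000LL / 256);
}

int
main(void)
{
	/* Hypothetical peer with skew 0 versus a local vhid with skew 100. */
	long long peer = adv_interval_us(1, 0);
	long long mine = adv_interval_us(1, 100);

	printf("peer %lld us, mine %lld us -> %s\n", peer, mine,
	    peer <= mine ? "peer advertises at least as often" :
	    "we advertise more often");
	return (0);
}
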
+
+static int
+carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
+{
+ struct m_tag *mtag;
+ struct ifnet *ifp = SC2IFP(sc);
+
+ if (sc->sc_init_counter) {
+ /* this could also be seconds since unix epoch */
+ sc->sc_counter = arc4random();
+ sc->sc_counter = sc->sc_counter << 32;
+ sc->sc_counter += arc4random();
+ } else
+ sc->sc_counter++;
+
+ ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
+ ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);
+
+ carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
+
+ /* Tag packet for carp_output */
+ mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), M_NOWAIT);
+ if (mtag == NULL) {
+ m_freem(m);
+ SC2IFP(sc)->if_oerrors++;
+ return (ENOMEM);
+ }
+ bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *));
+ m_tag_prepend(m, mtag);
+
+ return (0);
+}
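
A userland sketch of how the 64-bit counter is split into the two network-order words above and reassembled on input (the counter value is arbitrary):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>		/* htonl()/ntohl() */

int
main(void)
{
	uint64_t counter = 0x0123456789abcdefULL;	/* arbitrary counter value */
	uint32_t wire[2];
	uint64_t back;

	/* Split into two 32-bit network-order words, as above. */
	wire[0] = htonl((uint32_t)((counter >> 32) & 0xffffffff));
	wire[1] = htonl((uint32_t)(counter & 0xffffffff));

	/* Reassemble the way carp_input_c() rebuilds tmp_counter. */
	back = (uint64_t)ntohl(wire[0]) << 32;
	back += ntohl(wire[1]);

	printf("%s\n", back == counter ? "round-trips" : "mismatch");
	return (0);
}
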
+
+static void
+carp_send_ad_all(void)
+{
+ struct carp_softc *sc;
+
+ mtx_lock(&carp_mtx);
+ LIST_FOREACH(sc, &carpif_list, sc_next) {
+ if (sc->sc_carpdev == NULL)
+ continue;
+ CARP_SCLOCK(sc);
+ if ((SC2IFP(sc)->if_flags & IFF_UP) &&
+ (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING) &&
+ sc->sc_state == MASTER)
+ carp_send_ad_locked(sc);
+ CARP_SCUNLOCK(sc);
+ }
+ mtx_unlock(&carp_mtx);
+}
+
+static void
+carp_send_ad(void *v)
+{
+ struct carp_softc *sc = v;
+
+ CARP_SCLOCK(sc);
+ carp_send_ad_locked(sc);
+ CARP_SCUNLOCK(sc);
+}
+
+static void
+carp_send_ad_locked(struct carp_softc *sc)
+{
+ struct carp_header ch;
+ struct timeval tv;
+ struct carp_header *ch_ptr;
+ struct mbuf *m;
+ int len, advbase, advskew;
+
+ CARP_SCLOCK_ASSERT(sc);
+
+ /* bow out if we've lost our UPness or RUNNINGuiness */
+ if (!((SC2IFP(sc)->if_flags & IFF_UP) &&
+ (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) {
+ advbase = 255;
+ advskew = 255;
+ } else {
+ advbase = sc->sc_advbase;
+ if (!carp_suppress_preempt || sc->sc_advskew > 240)
+ advskew = sc->sc_advskew;
+ else
+ advskew = 240;
+ tv.tv_sec = advbase;
+ tv.tv_usec = advskew * 1000000 / 256;
+ }
+
+ ch.carp_version = CARP_VERSION;
+ ch.carp_type = CARP_ADVERTISEMENT;
+ ch.carp_vhid = sc->sc_vhid;
+ ch.carp_advbase = advbase;
+ ch.carp_advskew = advskew;
+ ch.carp_authlen = 7; /* XXX DEFINE */
+ ch.carp_pad1 = 0; /* must be zero */
+ ch.carp_cksum = 0;
+
+#ifdef INET
+ if (sc->sc_ia) {
+ struct ip *ip;
+
+ MGETHDR(m, M_DONTWAIT, MT_HEADER);
+ if (m == NULL) {
+ SC2IFP(sc)->if_oerrors++;
+ CARPSTATS_INC(carps_onomem);
+ /* XXX maybe less ? */
+ if (advbase != 255 || advskew != 255)
+ callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
+ carp_send_ad, sc);
+ return;
+ }
+ len = sizeof(*ip) + sizeof(ch);
+ m->m_pkthdr.len = len;
+ m->m_pkthdr.rcvif = NULL;
+ m->m_len = len;
+ MH_ALIGN(m, m->m_len);
+ m->m_flags |= M_MCAST;
+ ip = mtod(m, struct ip *);
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = sizeof(*ip) >> 2;
+ ip->ip_tos = IPTOS_LOWDELAY;
+ ip->ip_len = len;
+ ip->ip_id = ip_newid();
+ ip->ip_off = IP_DF;
+ ip->ip_ttl = CARP_DFLTTL;
+ ip->ip_p = IPPROTO_CARP;
+ ip->ip_sum = 0;
+ ip->ip_src.s_addr = sc->sc_ia->ia_addr.sin_addr.s_addr;
+ ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
+
+ ch_ptr = (struct carp_header *)(&ip[1]);
+ bcopy(&ch, ch_ptr, sizeof(ch));
+ if (carp_prepare_ad(m, sc, ch_ptr))
+ return;
+
+ m->m_data += sizeof(*ip);
+ ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip));
+ m->m_data -= sizeof(*ip);
+
+ getmicrotime(&SC2IFP(sc)->if_lastchange);
+ SC2IFP(sc)->if_opackets++;
+ SC2IFP(sc)->if_obytes += len;
+ CARPSTATS_INC(carps_opackets);
+
+ if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) {
+ SC2IFP(sc)->if_oerrors++;
+ if (sc->sc_sendad_errors < INT_MAX)
+ sc->sc_sendad_errors++;
+ if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
+ carp_suppress_preempt++;
+ if (carp_suppress_preempt == 1) {
+ CARP_SCUNLOCK(sc);
+ carp_send_ad_all();
+ CARP_SCLOCK(sc);
+ }
+ }
+ sc->sc_sendad_success = 0;
+ } else {
+ if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
+ if (++sc->sc_sendad_success >=
+ CARP_SENDAD_MIN_SUCCESS) {
+ carp_suppress_preempt--;
+ sc->sc_sendad_errors = 0;
+ }
+ } else
+ sc->sc_sendad_errors = 0;
+ }
+ }
+#endif /* INET */
+#ifdef INET6
+ if (sc->sc_ia6) {
+ struct ip6_hdr *ip6;
+
+ MGETHDR(m, M_DONTWAIT, MT_HEADER);
+ if (m == NULL) {
+ SC2IFP(sc)->if_oerrors++;
+ CARPSTATS_INC(carps_onomem);
+ /* XXX maybe less ? */
+ if (advbase != 255 || advskew != 255)
+ callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
+ carp_send_ad, sc);
+ return;
+ }
+ len = sizeof(*ip6) + sizeof(ch);
+ m->m_pkthdr.len = len;
+ m->m_pkthdr.rcvif = NULL;
+ m->m_len = len;
+ MH_ALIGN(m, m->m_len);
+ m->m_flags |= M_MCAST;
+ ip6 = mtod(m, struct ip6_hdr *);
+ bzero(ip6, sizeof(*ip6));
+ ip6->ip6_vfc |= IPV6_VERSION;
+ ip6->ip6_hlim = CARP_DFLTTL;
+ ip6->ip6_nxt = IPPROTO_CARP;
+ bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src,
+ sizeof(struct in6_addr));
+ /* set the multicast destination */
+
+ ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
+ ip6->ip6_dst.s6_addr8[15] = 0x12;
+ if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
+ SC2IFP(sc)->if_oerrors++;
+ m_freem(m);
+ CARP_DEBUG("%s: in6_setscope failed\n", __func__);
+ return;
+ }
+
+ ch_ptr = (struct carp_header *)(&ip6[1]);
+ bcopy(&ch, ch_ptr, sizeof(ch));
+ if (carp_prepare_ad(m, sc, ch_ptr))
+ return;
+
+ m->m_data += sizeof(*ip6);
+ ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6));
+ m->m_data -= sizeof(*ip6);
+
+ getmicrotime(&SC2IFP(sc)->if_lastchange);
+ SC2IFP(sc)->if_opackets++;
+ SC2IFP(sc)->if_obytes += len;
+ CARPSTATS_INC(carps_opackets6);
+
+ if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) {
+ SC2IFP(sc)->if_oerrors++;
+ if (sc->sc_sendad_errors < INT_MAX)
+ sc->sc_sendad_errors++;
+ if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
+ carp_suppress_preempt++;
+ if (carp_suppress_preempt == 1) {
+ CARP_SCUNLOCK(sc);
+ carp_send_ad_all();
+ CARP_SCLOCK(sc);
+ }
+ }
+ sc->sc_sendad_success = 0;
+ } else {
+ if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
+ if (++sc->sc_sendad_success >=
+ CARP_SENDAD_MIN_SUCCESS) {
+ carp_suppress_preempt--;
+ sc->sc_sendad_errors = 0;
+ }
+ } else
+ sc->sc_sendad_errors = 0;
+ }
+ }
+#endif /* INET6 */
+
+ if (advbase != 255 || advskew != 255)
+ callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
+ carp_send_ad, sc);
+
+}
+
+/*
+ * Broadcast a gratuitous ARP request containing
+ * the virtual router MAC address for each IP address
+ * associated with the virtual router.
+ */
+static void
+carp_send_arp(struct carp_softc *sc)
+{
+ struct ifaddr *ifa;
+
+ TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
+
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+
+/* arprequest(sc->sc_carpdev, &in, &in, IF_LLADDR(sc->sc_ifp)); */
+ arp_ifinit2(sc->sc_carpdev, ifa, IF_LLADDR(sc->sc_ifp));
+
+ DELAY(1000); /* XXX */
+ }
+}
+
+#ifdef INET6
+static void
+carp_send_na(struct carp_softc *sc)
+{
+ struct ifaddr *ifa;
+ struct in6_addr *in6;
+ static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
+
+ TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
+
+ if (ifa->ifa_addr->sa_family != AF_INET6)
+ continue;
+
+ in6 = &ifatoia6(ifa)->ia_addr.sin6_addr;
+ nd6_na_output(sc->sc_carpdev, &mcast, in6,
+ ND_NA_FLAG_OVERRIDE, 1, NULL);
+ DELAY(1000); /* XXX */
+ }
+}
+#endif /* INET6 */
+
+static int
+carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type)
+{
+ struct carp_softc *vh;
+ struct ifaddr *ifa;
+ int count = 0;
+
+ CARP_LOCK_ASSERT(cif);
+
+ TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
+ if ((type == CARP_COUNT_RUNNING &&
+ (SC2IFP(vh)->if_flags & IFF_UP) &&
+ (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) ||
+ (type == CARP_COUNT_MASTER && vh->sc_state == MASTER)) {
+ IF_ADDR_LOCK(SC2IFP(vh));
+ TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist,
+ ifa_list) {
+ if (ifa->ifa_addr->sa_family == AF_INET &&
+ ia->ia_addr.sin_addr.s_addr ==
+ ifatoia(ifa)->ia_addr.sin_addr.s_addr)
+ count++;
+ }
+ IF_ADDR_UNLOCK(SC2IFP(vh));
+ }
+ }
+ return (count);
+}
+
+int
+carp_iamatch(struct ifnet *ifp, struct in_ifaddr *ia,
+ struct in_addr *isaddr, u_int8_t **enaddr)
+{
+ struct carp_if *cif;
+ struct carp_softc *vh;
+ int index, count = 0;
+ struct ifaddr *ifa;
+
+ cif = ifp->if_carp;
+ CARP_LOCK(cif);
+
+ if (carp_opts[CARPCTL_ARPBALANCE]) {
+ /*
+ * XXX proof of concept implementation.
+ * We use the source ip to decide which virtual host should
+ * handle the request. If we're master of that virtual host,
+ * then we respond, otherwise, just drop the arp packet on
+ * the floor.
+ */
+ count = carp_addrcount(cif, ia, CARP_COUNT_RUNNING);
+ if (count == 0) {
+ /* should never reach this */
+ CARP_UNLOCK(cif);
+ return (0);
+ }
+
+ /* this should be a hash, like pf_hash() */
+ index = ntohl(isaddr->s_addr) % count;
+ count = 0;
+
+ TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
+ if ((SC2IFP(vh)->if_flags & IFF_UP) &&
+ (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) {
+ IF_ADDR_LOCK(SC2IFP(vh));
+ TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist,
+ ifa_list) {
+ if (ifa->ifa_addr->sa_family ==
+ AF_INET &&
+ ia->ia_addr.sin_addr.s_addr ==
+ ifatoia(ifa)->ia_addr.sin_addr.s_addr) {
+ if (count == index) {
+ if (vh->sc_state ==
+ MASTER) {
+ *enaddr = IF_LLADDR(vh->sc_ifp);
+ IF_ADDR_UNLOCK(SC2IFP(vh));
+ CARP_UNLOCK(cif);
+ return (1);
+ } else {
+ IF_ADDR_UNLOCK(SC2IFP(vh));
+ CARP_UNLOCK(cif);
+ return (0);
+ }
+ }
+ count++;
+ }
+ }
+ IF_ADDR_UNLOCK(SC2IFP(vh));
+ }
+ }
+ } else {
+ TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
+ if ((SC2IFP(vh)->if_flags & IFF_UP) &&
+ (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
+ ia->ia_ifp == SC2IFP(vh) &&
+ vh->sc_state == MASTER) {
+ *enaddr = IF_LLADDR(vh->sc_ifp);
+ CARP_UNLOCK(cif);
+ return (1);
+ }
+ }
+ }
+ CARP_UNLOCK(cif);
+ return (0);
+}
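
When arpbalance is enabled, the code above selects the answering vhost with a plain modulo of the requester's source address over the number of running vhosts. A minimal sketch of that selection (the address and vhost count are hypothetical):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>		/* inet_pton(), ntohl() */

int
main(void)
{
	int count = 3;			/* hypothetical number of running vhosts */
	struct in_addr src;

	/* Example requester address; the kernel uses the ARP source IP. */
	inet_pton(AF_INET, "192.0.2.55", &src);
	printf("vhost index %u answers\n",
	    (unsigned)(ntohl(src.s_addr) % (unsigned)count));
	return (0);
}
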
+
+#ifdef INET6
+struct ifaddr *
+carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr)
+{
+ struct carp_if *cif;
+ struct carp_softc *vh;
+ struct ifaddr *ifa;
+
+ cif = ifp->if_carp;
+ CARP_LOCK(cif);
+ TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
+ IF_ADDR_LOCK(SC2IFP(vh));
+ TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) {
+ if (IN6_ARE_ADDR_EQUAL(taddr,
+ &ifatoia6(ifa)->ia_addr.sin6_addr) &&
+ (SC2IFP(vh)->if_flags & IFF_UP) &&
+ (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
+ vh->sc_state == MASTER) {
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(SC2IFP(vh));
+ CARP_UNLOCK(cif);
+ return (ifa);
+ }
+ }
+ IF_ADDR_UNLOCK(SC2IFP(vh));
+ }
+ CARP_UNLOCK(cif);
+
+ return (NULL);
+}
+
+caddr_t
+carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
+{
+ struct m_tag *mtag;
+ struct carp_if *cif;
+ struct carp_softc *sc;
+ struct ifaddr *ifa;
+
+ cif = ifp->if_carp;
+ CARP_LOCK(cif);
+ TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) {
+ IF_ADDR_LOCK(SC2IFP(sc));
+ TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
+ if (IN6_ARE_ADDR_EQUAL(taddr,
+ &ifatoia6(ifa)->ia_addr.sin6_addr) &&
+ (SC2IFP(sc)->if_flags & IFF_UP) &&
+ (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING)) {
+ struct ifnet *ifp = SC2IFP(sc);
+ mtag = m_tag_get(PACKET_TAG_CARP,
+ sizeof(struct ifnet *), M_NOWAIT);
+ if (mtag == NULL) {
+ /* better a bit than nothing */
+ IF_ADDR_UNLOCK(SC2IFP(sc));
+ CARP_UNLOCK(cif);
+ return (IF_LLADDR(sc->sc_ifp));
+ }
+ bcopy(&ifp, (caddr_t)(mtag + 1),
+ sizeof(struct ifnet *));
+ m_tag_prepend(m, mtag);
+
+ IF_ADDR_UNLOCK(SC2IFP(sc));
+ CARP_UNLOCK(cif);
+ return (IF_LLADDR(sc->sc_ifp));
+ }
+ }
+ IF_ADDR_UNLOCK(SC2IFP(sc));
+ }
+ CARP_UNLOCK(cif);
+
+ return (NULL);
+}
+#endif
+
+struct ifnet *
+carp_forus(struct ifnet *ifp, u_char *dhost)
+{
+ struct carp_if *cif;
+ struct carp_softc *vh;
+ u_int8_t *ena = dhost;
+
+ if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
+ return (NULL);
+
+ cif = ifp->if_carp;
+ CARP_LOCK(cif);
+ TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list)
+ if ((SC2IFP(vh)->if_flags & IFF_UP) &&
+ (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
+ vh->sc_state == MASTER &&
+ !bcmp(dhost, IF_LLADDR(vh->sc_ifp), ETHER_ADDR_LEN)) {
+ CARP_UNLOCK(cif);
+ return (SC2IFP(vh));
+ }
+
+ CARP_UNLOCK(cif);
+ return (NULL);
+}
+
+static void
+carp_master_down(void *v)
+{
+ struct carp_softc *sc = v;
+
+ CARP_SCLOCK(sc);
+ carp_master_down_locked(sc);
+ CARP_SCUNLOCK(sc);
+}
+
+static void
+carp_master_down_locked(struct carp_softc *sc)
+{
+ if (sc->sc_carpdev)
+ CARP_SCLOCK_ASSERT(sc);
+
+ switch (sc->sc_state) {
+ case INIT:
+ printf("%s: master_down event in INIT state\n",
+ SC2IFP(sc)->if_xname);
+ break;
+ case MASTER:
+ break;
+ case BACKUP:
+ carp_set_state(sc, MASTER);
+ carp_send_ad_locked(sc);
+ carp_send_arp(sc);
+#ifdef INET6
+ carp_send_na(sc);
+#endif /* INET6 */
+ carp_setrun(sc, 0);
+ carp_setroute(sc, RTM_ADD);
+ break;
+ }
+}
+
+/*
+ * When in backup state, af indicates whether to reset the master down timer
+ * for v4 or v6. If it's set to zero, reset the ones which are already pending.
+ */
+static void
+carp_setrun(struct carp_softc *sc, sa_family_t af)
+{
+ struct timeval tv;
+
+ if (sc->sc_carpdev == NULL) {
+ SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
+ carp_set_state(sc, INIT);
+ return;
+ } else
+ CARP_SCLOCK_ASSERT(sc);
+
+ if (SC2IFP(sc)->if_flags & IFF_UP &&
+ sc->sc_vhid > 0 && (sc->sc_naddrs || sc->sc_naddrs6) &&
+ sc->sc_carpdev->if_link_state == LINK_STATE_UP)
+ SC2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING;
+ else {
+ SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
+ carp_setroute(sc, RTM_DELETE);
+ return;
+ }
+
+ switch (sc->sc_state) {
+ case INIT:
+ if (carp_opts[CARPCTL_PREEMPT] && !carp_suppress_preempt) {
+ carp_send_ad_locked(sc);
+ carp_send_arp(sc);
+#ifdef INET6
+ carp_send_na(sc);
+#endif /* INET6 */
+ CARP_LOG("%s: INIT -> MASTER (preempting)\n",
+ SC2IFP(sc)->if_xname);
+ carp_set_state(sc, MASTER);
+ carp_setroute(sc, RTM_ADD);
+ } else {
+ CARP_LOG("%s: INIT -> BACKUP\n", SC2IFP(sc)->if_xname);
+ carp_set_state(sc, BACKUP);
+ carp_setroute(sc, RTM_DELETE);
+ carp_setrun(sc, 0);
+ }
+ break;
+ case BACKUP:
+ callout_stop(&sc->sc_ad_tmo);
+ tv.tv_sec = 3 * sc->sc_advbase;
+ tv.tv_usec = sc->sc_advskew * 1000000 / 256;
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
+ carp_master_down, sc);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
+ carp_master_down, sc);
+ break;
+#endif /* INET6 */
+ default:
+ if (sc->sc_naddrs)
+ callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
+ carp_master_down, sc);
+ if (sc->sc_naddrs6)
+ callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
+ carp_master_down, sc);
+ break;
+ }
+ break;
+ case MASTER:
+ tv.tv_sec = sc->sc_advbase;
+ tv.tv_usec = sc->sc_advskew * 1000000 / 256;
+ callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
+ carp_send_ad, sc);
+ break;
+ }
+}
+
+static void
+carp_multicast_cleanup(struct carp_softc *sc)
+{
+ struct ip_moptions *imo = &sc->sc_imo;
+ u_int16_t n = imo->imo_num_memberships;
+
+ /* Clean up our own multicast memberships */
+ while (n-- > 0) {
+ if (imo->imo_membership[n] != NULL) {
+ in_delmulti(imo->imo_membership[n]);
+ imo->imo_membership[n] = NULL;
+ }
+ }
+ KASSERT(imo->imo_mfilters == NULL,
+ ("%s: imo_mfilters != NULL", __func__));
+ imo->imo_num_memberships = 0;
+ imo->imo_multicast_ifp = NULL;
+}
+
+#ifdef INET6
+static void
+carp_multicast6_cleanup(struct carp_softc *sc)
+{
+ struct ip6_moptions *im6o = &sc->sc_im6o;
+ u_int16_t n = im6o->im6o_num_memberships;
+
+ while (n-- > 0) {
+ if (im6o->im6o_membership[n] != NULL) {
+ in6_mc_leave(im6o->im6o_membership[n], NULL);
+ im6o->im6o_membership[n] = NULL;
+ }
+ }
+ KASSERT(im6o->im6o_mfilters == NULL,
+ ("%s: im6o_mfilters != NULL", __func__));
+ im6o->im6o_num_memberships = 0;
+ im6o->im6o_multicast_ifp = NULL;
+}
+#endif
+
+static int
+carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin)
+{
+ struct ifnet *ifp;
+ struct carp_if *cif;
+ struct in_ifaddr *ia, *ia_if;
+ struct ip_moptions *imo = &sc->sc_imo;
+ struct in_addr addr;
+ u_long iaddr = htonl(sin->sin_addr.s_addr);
+ int own, error;
+
+ if (sin->sin_addr.s_addr == 0) {
+ if (!(SC2IFP(sc)->if_flags & IFF_UP))
+ carp_set_state(sc, INIT);
+ if (sc->sc_naddrs)
+ SC2IFP(sc)->if_flags |= IFF_UP;
+ if (sc->sc_carpdev)
+ CARP_SCLOCK(sc);
+ carp_setrun(sc, 0);
+ if (sc->sc_carpdev)
+ CARP_SCUNLOCK(sc);
+ return (0);
+ }
+
+ /* we have to do it by hand to make sure we won't match on ourselves */
+ ia_if = NULL; own = 0;
+ IN_IFADDR_RLOCK();
+ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ /* and, yeah, we need a multicast-capable iface too */
+ if (ia->ia_ifp != SC2IFP(sc) &&
+ (ia->ia_ifp->if_flags & IFF_MULTICAST) &&
+ (iaddr & ia->ia_subnetmask) == ia->ia_subnet) {
+ if (!ia_if)
+ ia_if = ia;
+ if (sin->sin_addr.s_addr ==
+ ia->ia_addr.sin_addr.s_addr)
+ own++;
+ }
+ }
+
+ if (!ia_if) {
+ IN_IFADDR_RUNLOCK();
+ return (EADDRNOTAVAIL);
+ }
+
+ ia = ia_if;
+ ifa_ref(&ia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+
+ ifp = ia->ia_ifp;
+
+ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
+ (imo->imo_multicast_ifp && imo->imo_multicast_ifp != ifp)) {
+ ifa_free(&ia->ia_ifa);
+ return (EADDRNOTAVAIL);
+ }
+
+ if (imo->imo_num_memberships == 0) {
+ addr.s_addr = htonl(INADDR_CARP_GROUP);
+ if ((imo->imo_membership[0] = in_addmulti(&addr, ifp)) ==
+ NULL) {
+ ifa_free(&ia->ia_ifa);
+ return (ENOBUFS);
+ }
+ imo->imo_num_memberships++;
+ imo->imo_multicast_ifp = ifp;
+ imo->imo_multicast_ttl = CARP_DFLTTL;
+ imo->imo_multicast_loop = 0;
+ }
+
+ if (!ifp->if_carp) {
+
+ cif = malloc(sizeof(*cif), M_CARP,
+ M_WAITOK|M_ZERO);
+ if (!cif) {
+ error = ENOBUFS;
+ goto cleanup;
+ }
+ if ((error = ifpromisc(ifp, 1))) {
+ free(cif, M_CARP);
+ goto cleanup;
+ }
+
+ CARP_LOCK_INIT(cif);
+ CARP_LOCK(cif);
+ cif->vhif_ifp = ifp;
+ TAILQ_INIT(&cif->vhif_vrs);
+ ifp->if_carp = cif;
+
+ } else {
+ struct carp_softc *vr;
+
+ cif = (struct carp_if *)ifp->if_carp;
+ CARP_LOCK(cif);
+ TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
+ if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
+ CARP_UNLOCK(cif);
+ error = EEXIST;
+ goto cleanup;
+ }
+ }
+ sc->sc_ia = ia;
+ sc->sc_carpdev = ifp;
+
+ { /* XXX prevent endless loop if already in queue */
+ struct carp_softc *vr, *after = NULL;
+ int myself = 0;
+ cif = (struct carp_if *)ifp->if_carp;
+
+ /* XXX: cif should not change, right? So we still hold the lock */
+ CARP_LOCK_ASSERT(cif);
+
+ TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
+ if (vr == sc)
+ myself = 1;
+ if (vr->sc_vhid < sc->sc_vhid)
+ after = vr;
+ }
+
+ if (!myself) {
+ /* We're trying to keep things in order */
+ if (after == NULL) {
+ TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
+ } else {
+ TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
+ }
+ cif->vhif_nvrs++;
+ }
+ }
+
+ sc->sc_naddrs++;
+ SC2IFP(sc)->if_flags |= IFF_UP;
+ if (own)
+ sc->sc_advskew = 0;
+ carp_sc_state_locked(sc);
+ carp_setrun(sc, 0);
+
+ CARP_UNLOCK(cif);
+ ifa_free(&ia->ia_ifa); /* XXXRW: should hold reference for softc. */
+
+ return (0);
+
+cleanup:
+ in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
+ ifa_free(&ia->ia_ifa);
+ return (error);
+}
+
+static int
+carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin)
+{
+ int error = 0;
+
+ if (!--sc->sc_naddrs) {
+ struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp;
+ struct ip_moptions *imo = &sc->sc_imo;
+
+ CARP_LOCK(cif);
+ callout_stop(&sc->sc_ad_tmo);
+ SC2IFP(sc)->if_flags &= ~IFF_UP;
+ SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
+ sc->sc_vhid = -1;
+ in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
+ imo->imo_multicast_ifp = NULL;
+ TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
+ if (!--cif->vhif_nvrs) {
+ sc->sc_carpdev->if_carp = NULL;
+ CARP_LOCK_DESTROY(cif);
+ free(cif, M_CARP);
+ } else {
+ CARP_UNLOCK(cif);
+ }
+ }
+
+ return (error);
+}
+
+#ifdef INET6
+static int
+carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
+{
+ struct ifnet *ifp;
+ struct carp_if *cif;
+ struct in6_ifaddr *ia, *ia_if;
+ struct ip6_moptions *im6o = &sc->sc_im6o;
+ struct in6_addr in6;
+ int own, error;
+
+ error = 0;
+
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+ if (!(SC2IFP(sc)->if_flags & IFF_UP))
+ carp_set_state(sc, INIT);
+ if (sc->sc_naddrs6)
+ SC2IFP(sc)->if_flags |= IFF_UP;
+ if (sc->sc_carpdev)
+ CARP_SCLOCK(sc);
+ carp_setrun(sc, 0);
+ if (sc->sc_carpdev)
+ CARP_SCUNLOCK(sc);
+ return (0);
+ }
+
+ /* we have to do it by hand to make sure we won't match on ourselves */
+ ia_if = NULL; own = 0;
+ IN6_IFADDR_RLOCK();
+ TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if ((sin6->sin6_addr.s6_addr32[i] &
+ ia->ia_prefixmask.sin6_addr.s6_addr32[i]) !=
+ (ia->ia_addr.sin6_addr.s6_addr32[i] &
+ ia->ia_prefixmask.sin6_addr.s6_addr32[i]))
+ break;
+ }
+ /* and, yeah, we need a multicast-capable iface too */
+ if (ia->ia_ifp != SC2IFP(sc) &&
+ (ia->ia_ifp->if_flags & IFF_MULTICAST) &&
+ (i == 4)) {
+ if (!ia_if)
+ ia_if = ia;
+ if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
+ &ia->ia_addr.sin6_addr))
+ own++;
+ }
+ }
+
+ if (!ia_if) {
+ IN6_IFADDR_RUNLOCK();
+ return (EADDRNOTAVAIL);
+ }
+ ia = ia_if;
+ ifa_ref(&ia->ia_ifa);
+ IN6_IFADDR_RUNLOCK();
+ ifp = ia->ia_ifp;
+
+ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
+ (im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp)) {
+ ifa_free(&ia->ia_ifa);
+ return (EADDRNOTAVAIL);
+ }
+
+ if (!sc->sc_naddrs6) {
+ struct in6_multi *in6m;
+
+ im6o->im6o_multicast_ifp = ifp;
+
+ /* join CARP multicast address */
+ bzero(&in6, sizeof(in6));
+ in6.s6_addr16[0] = htons(0xff02);
+ in6.s6_addr8[15] = 0x12;
+ if (in6_setscope(&in6, ifp, NULL) != 0)
+ goto cleanup;
+ in6m = NULL;
+ error = in6_mc_join(ifp, &in6, NULL, &in6m, 0);
+ if (error)
+ goto cleanup;
+ im6o->im6o_membership[0] = in6m;
+ im6o->im6o_num_memberships++;
+
+ /* join solicited multicast address */
+ bzero(&in6, sizeof(in6));
+ in6.s6_addr16[0] = htons(0xff02);
+ in6.s6_addr32[1] = 0;
+ in6.s6_addr32[2] = htonl(1);
+ in6.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3];
+ in6.s6_addr8[12] = 0xff;
+ if (in6_setscope(&in6, ifp, NULL) != 0)
+ goto cleanup;
+ in6m = NULL;
+ error = in6_mc_join(ifp, &in6, NULL, &in6m, 0);
+ if (error)
+ goto cleanup;
+ im6o->im6o_membership[1] = in6m;
+ im6o->im6o_num_memberships++;
+ }
+
+ if (!ifp->if_carp) {
+ cif = malloc(sizeof(*cif), M_CARP,
+ M_WAITOK|M_ZERO);
+ if (!cif) {
+ error = ENOBUFS;
+ goto cleanup;
+ }
+ if ((error = ifpromisc(ifp, 1))) {
+ free(cif, M_CARP);
+ goto cleanup;
+ }
+
+ CARP_LOCK_INIT(cif);
+ CARP_LOCK(cif);
+ cif->vhif_ifp = ifp;
+ TAILQ_INIT(&cif->vhif_vrs);
+ ifp->if_carp = cif;
+
+ } else {
+ struct carp_softc *vr;
+
+ cif = (struct carp_if *)ifp->if_carp;
+ CARP_LOCK(cif);
+ TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
+ if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
+ CARP_UNLOCK(cif);
+ error = EINVAL;
+ goto cleanup;
+ }
+ }
+ sc->sc_ia6 = ia;
+ sc->sc_carpdev = ifp;
+
+ { /* XXX prevent endless loop if already in queue */
+ struct carp_softc *vr, *after = NULL;
+ int myself = 0;
+ cif = (struct carp_if *)ifp->if_carp;
+ CARP_LOCK_ASSERT(cif);
+
+ TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
+ if (vr == sc)
+ myself = 1;
+ if (vr->sc_vhid < sc->sc_vhid)
+ after = vr;
+ }
+
+ if (!myself) {
+ /* We're trying to keep things in order */
+ if (after == NULL) {
+ TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
+ } else {
+ TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
+ }
+ cif->vhif_nvrs++;
+ }
+ }
+
+ sc->sc_naddrs6++;
+ SC2IFP(sc)->if_flags |= IFF_UP;
+ if (own)
+ sc->sc_advskew = 0;
+ carp_sc_state_locked(sc);
+ carp_setrun(sc, 0);
+
+ CARP_UNLOCK(cif);
+ ifa_free(&ia->ia_ifa); /* XXXRW: should hold reference for softc. */
+
+ return (0);
+
+cleanup:
+ if (!sc->sc_naddrs6)
+ carp_multicast6_cleanup(sc);
+ ifa_free(&ia->ia_ifa);
+ return (error);
+}
+
+static int
+carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
+{
+ int error = 0;
+
+ if (!--sc->sc_naddrs6) {
+ struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp;
+
+ CARP_LOCK(cif);
+ callout_stop(&sc->sc_ad_tmo);
+ SC2IFP(sc)->if_flags &= ~IFF_UP;
+ SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
+ sc->sc_vhid = -1;
+ carp_multicast6_cleanup(sc);
+ TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
+ if (!--cif->vhif_nvrs) {
+ CARP_LOCK_DESTROY(cif);
+ sc->sc_carpdev->if_carp = NULL;
+ free(cif, M_CARP);
+ } else
+ CARP_UNLOCK(cif);
+ }
+
+ return (error);
+}
+#endif /* INET6 */
+
+static int
+carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
+{
+ struct carp_softc *sc = ifp->if_softc, *vr;
+ struct carpreq carpr;
+ struct ifaddr *ifa;
+ struct ifreq *ifr;
+ struct ifaliasreq *ifra;
+ int locked = 0, error = 0;
+
+ ifa = (struct ifaddr *)addr;
+ ifra = (struct ifaliasreq *)addr;
+ ifr = (struct ifreq *)addr;
+
+ switch (cmd) {
+ case SIOCSIFADDR:
+ switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+ case AF_INET:
+ SC2IFP(sc)->if_flags |= IFF_UP;
+ bcopy(ifa->ifa_addr, ifa->ifa_dstaddr,
+ sizeof(struct sockaddr));
+ error = carp_set_addr(sc, satosin(ifa->ifa_addr));
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ SC2IFP(sc)->if_flags |= IFF_UP;
+ error = carp_set_addr6(sc, satosin6(ifa->ifa_addr));
+ break;
+#endif /* INET6 */
+ default:
+ error = EAFNOSUPPORT;
+ break;
+ }
+ break;
+
+ case SIOCAIFADDR:
+ switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+ case AF_INET:
+ SC2IFP(sc)->if_flags |= IFF_UP;
+ bcopy(ifa->ifa_addr, ifa->ifa_dstaddr,
+ sizeof(struct sockaddr));
+ error = carp_set_addr(sc, satosin(&ifra->ifra_addr));
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ SC2IFP(sc)->if_flags |= IFF_UP;
+ error = carp_set_addr6(sc, satosin6(&ifra->ifra_addr));
+ break;
+#endif /* INET6 */
+ default:
+ error = EAFNOSUPPORT;
+ break;
+ }
+ break;
+
+ case SIOCDIFADDR:
+ switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+ case AF_INET:
+ error = carp_del_addr(sc, satosin(&ifra->ifra_addr));
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ error = carp_del_addr6(sc, satosin6(&ifra->ifra_addr));
+ break;
+#endif /* INET6 */
+ default:
+ error = EAFNOSUPPORT;
+ break;
+ }
+ break;
+
+ case SIOCSIFFLAGS:
+ if (sc->sc_carpdev) {
+ locked = 1;
+ CARP_SCLOCK(sc);
+ }
+ if (sc->sc_state != INIT && !(ifr->ifr_flags & IFF_UP)) {
+ callout_stop(&sc->sc_ad_tmo);
+ callout_stop(&sc->sc_md_tmo);
+ callout_stop(&sc->sc_md6_tmo);
+ if (sc->sc_state == MASTER)
+ carp_send_ad_locked(sc);
+ carp_set_state(sc, INIT);
+ carp_setrun(sc, 0);
+ } else if (sc->sc_state == INIT && (ifr->ifr_flags & IFF_UP)) {
+ SC2IFP(sc)->if_flags |= IFF_UP;
+ carp_setrun(sc, 0);
+ }
+ break;
+
+ case SIOCSVH:
+ error = priv_check(curthread, PRIV_NETINET_CARP);
+ if (error)
+ break;
+ if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr)))
+ break;
+ error = 1;
+ if (sc->sc_carpdev) {
+ locked = 1;
+ CARP_SCLOCK(sc);
+ }
+ if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) {
+ switch (carpr.carpr_state) {
+ case BACKUP:
+ callout_stop(&sc->sc_ad_tmo);
+ carp_set_state(sc, BACKUP);
+ carp_setrun(sc, 0);
+ carp_setroute(sc, RTM_DELETE);
+ break;
+ case MASTER:
+ carp_master_down_locked(sc);
+ break;
+ default:
+ break;
+ }
+ }
+ if (carpr.carpr_vhid > 0) {
+ if (carpr.carpr_vhid > 255) {
+ error = EINVAL;
+ break;
+ }
+ if (sc->sc_carpdev) {
+ struct carp_if *cif;
+ cif = (struct carp_if *)sc->sc_carpdev->if_carp;
+ TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
+ if (vr != sc &&
+ vr->sc_vhid == carpr.carpr_vhid) {
+ error = EEXIST;
+ break;
+ }
+ if (error == EEXIST)
+ break;
+ }
+ sc->sc_vhid = carpr.carpr_vhid;
+ IF_LLADDR(sc->sc_ifp)[0] = 0;
+ IF_LLADDR(sc->sc_ifp)[1] = 0;
+ IF_LLADDR(sc->sc_ifp)[2] = 0x5e;
+ IF_LLADDR(sc->sc_ifp)[3] = 0;
+ IF_LLADDR(sc->sc_ifp)[4] = 1;
+ IF_LLADDR(sc->sc_ifp)[5] = sc->sc_vhid;
+ error--;
+ }
+ if (carpr.carpr_advbase > 0 || carpr.carpr_advskew > 0) {
+ if (carpr.carpr_advskew >= 255) {
+ error = EINVAL;
+ break;
+ }
+ if (carpr.carpr_advbase > 255) {
+ error = EINVAL;
+ break;
+ }
+ sc->sc_advbase = carpr.carpr_advbase;
+ sc->sc_advskew = carpr.carpr_advskew;
+ error--;
+ }
+ bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
+ if (error > 0)
+ error = EINVAL;
+ else {
+ error = 0;
+ carp_setrun(sc, 0);
+ }
+ break;
+
+ case SIOCGVH:
+ /* XXX: lockless read */
+ bzero(&carpr, sizeof(carpr));
+ carpr.carpr_state = sc->sc_state;
+ carpr.carpr_vhid = sc->sc_vhid;
+ carpr.carpr_advbase = sc->sc_advbase;
+ carpr.carpr_advskew = sc->sc_advskew;
+ error = priv_check(curthread, PRIV_NETINET_CARP);
+ if (error == 0)
+ bcopy(sc->sc_key, carpr.carpr_key,
+ sizeof(carpr.carpr_key));
+ error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
+ break;
+
+ default:
+ error = EINVAL;
+ }
+
+ if (locked)
+ CARP_SCUNLOCK(sc);
+
+ carp_hmac_prepare(sc);
+
+ return (error);
+}
+
+/*
+ * XXX: this is looutput. We should eventually use it from there.
+ */
+static int
+carp_looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
+ struct route *ro)
+{
+ u_int32_t af;
+ struct rtentry *rt = NULL;
+
+ M_ASSERTPKTHDR(m); /* check if we have the packet header */
+
+ if (ro != NULL)
+ rt = ro->ro_rt;
+ if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ m_freem(m);
+ return (rt->rt_flags & RTF_BLACKHOLE ? 0 :
+ rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
+ }
+
+ ifp->if_opackets++;
+ ifp->if_obytes += m->m_pkthdr.len;
+
+ /* BPF writes need to be handled specially. */
+ if (dst->sa_family == AF_UNSPEC) {
+ bcopy(dst->sa_data, &af, sizeof(af));
+ dst->sa_family = af;
+ }
+
+#if 1 /* XXX */
+ switch (dst->sa_family) {
+ case AF_INET:
+ case AF_INET6:
+ case AF_IPX:
+ case AF_APPLETALK:
+ break;
+ default:
+ printf("carp_looutput: af=%d unexpected\n", dst->sa_family);
+ m_freem(m);
+ return (EAFNOSUPPORT);
+ }
+#endif
+ return(if_simloop(ifp, m, dst->sa_family, 0));
+}
+
+/*
+ * Start output on carp interface. This function should never be called.
+ */
+static void
+carp_start(struct ifnet *ifp)
+{
+#ifdef DEBUG
+ printf("%s: start called\n", ifp->if_xname);
+#endif
+}
+
+int
+carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
+ struct rtentry *rt)
+{
+ struct m_tag *mtag;
+ struct carp_softc *sc;
+ struct ifnet *carp_ifp;
+
+ if (!sa)
+ return (0);
+
+ switch (sa->sa_family) {
+#ifdef INET
+ case AF_INET:
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ break;
+#endif /* INET6 */
+ default:
+ return (0);
+ }
+
+ mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
+ if (mtag == NULL)
+ return (0);
+
+ bcopy(mtag + 1, &carp_ifp, sizeof(struct ifnet *));
+ sc = carp_ifp->if_softc;
+
+ /* Set the source MAC address to Virtual Router MAC Address */
+ switch (ifp->if_type) {
+ case IFT_ETHER:
+ case IFT_L2VLAN: {
+ struct ether_header *eh;
+
+ eh = mtod(m, struct ether_header *);
+ eh->ether_shost[0] = 0;
+ eh->ether_shost[1] = 0;
+ eh->ether_shost[2] = 0x5e;
+ eh->ether_shost[3] = 0;
+ eh->ether_shost[4] = 1;
+ eh->ether_shost[5] = sc->sc_vhid;
+ }
+ break;
+ case IFT_FDDI: {
+ struct fddi_header *fh;
+
+ fh = mtod(m, struct fddi_header *);
+ fh->fddi_shost[0] = 0;
+ fh->fddi_shost[1] = 0;
+ fh->fddi_shost[2] = 0x5e;
+ fh->fddi_shost[3] = 0;
+ fh->fddi_shost[4] = 1;
+ fh->fddi_shost[5] = sc->sc_vhid;
+ }
+ break;
+ case IFT_ISO88025: {
+ struct iso88025_header *th;
+ th = mtod(m, struct iso88025_header *);
+ th->iso88025_shost[0] = 3;
+ th->iso88025_shost[1] = 0;
+ th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1);
+ th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1);
+ th->iso88025_shost[4] = 0;
+ th->iso88025_shost[5] = 0;
+ }
+ break;
+ default:
+ printf("%s: carp is not supported for this interface type\n",
+ ifp->if_xname);
+ return (EOPNOTSUPP);
+ }
+
+ return (0);
+}
+
+static void
+carp_set_state(struct carp_softc *sc, int state)
+{
+ int link_state;
+
+ if (sc->sc_carpdev)
+ CARP_SCLOCK_ASSERT(sc);
+
+ if (sc->sc_state == state)
+ return;
+
+ sc->sc_state = state;
+ switch (state) {
+ case BACKUP:
+ link_state = LINK_STATE_DOWN;
+ break;
+ case MASTER:
+ link_state = LINK_STATE_UP;
+ break;
+ default:
+ link_state = LINK_STATE_UNKNOWN;
+ break;
+ }
+ if_link_state_change(SC2IFP(sc), link_state);
+}
+
+void
+carp_carpdev_state(struct ifnet *ifp)
+{
+ struct carp_if *cif;
+
+ cif = ifp->if_carp;
+ CARP_LOCK(cif);
+ carp_carpdev_state_locked(cif);
+ CARP_UNLOCK(cif);
+}
+
+static void
+carp_carpdev_state_locked(struct carp_if *cif)
+{
+ struct carp_softc *sc;
+
+ TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list)
+ carp_sc_state_locked(sc);
+}
+
+static void
+carp_sc_state_locked(struct carp_softc *sc)
+{
+ CARP_SCLOCK_ASSERT(sc);
+
+ if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
+ !(sc->sc_carpdev->if_flags & IFF_UP)) {
+ sc->sc_flags_backup = SC2IFP(sc)->if_flags;
+ SC2IFP(sc)->if_flags &= ~IFF_UP;
+ SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
+ callout_stop(&sc->sc_ad_tmo);
+ callout_stop(&sc->sc_md_tmo);
+ callout_stop(&sc->sc_md6_tmo);
+ carp_set_state(sc, INIT);
+ carp_setrun(sc, 0);
+ if (!sc->sc_suppress) {
+ carp_suppress_preempt++;
+ if (carp_suppress_preempt == 1) {
+ CARP_SCUNLOCK(sc);
+ carp_send_ad_all();
+ CARP_SCLOCK(sc);
+ }
+ }
+ sc->sc_suppress = 1;
+ } else {
+ SC2IFP(sc)->if_flags |= sc->sc_flags_backup;
+ carp_set_state(sc, INIT);
+ carp_setrun(sc, 0);
+ if (sc->sc_suppress)
+ carp_suppress_preempt--;
+ sc->sc_suppress = 0;
+ }
+
+ return;
+}
+
+#ifdef INET
+extern struct domain inetdomain;
+static struct protosw in_carp_protosw = {
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_CARP,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = carp_input,
+ .pr_output = (pr_output_t *)rip_output,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_usrreqs = &rip_usrreqs
+};
+#endif
+
+#ifdef INET6
+extern struct domain inet6domain;
+static struct ip6protosw in6_carp_protosw = {
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inet6domain,
+ .pr_protocol = IPPROTO_CARP,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = carp6_input,
+ .pr_output = rip6_output,
+ .pr_ctloutput = rip6_ctloutput,
+ .pr_usrreqs = &rip6_usrreqs
+};
+#endif
+
+static void
+carp_mod_cleanup(void)
+{
+
+ if (if_detach_event_tag == NULL)
+ return;
+ EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);
+ if_clone_detach(&carp_cloner);
+#ifdef INET
+ if (proto_reg[CARP_INET] == 0) {
+ (void)ipproto_unregister(IPPROTO_CARP);
+ pf_proto_unregister(PF_INET, IPPROTO_CARP, SOCK_RAW);
+ proto_reg[CARP_INET] = -1;
+ }
+ carp_iamatch_p = NULL;
+#endif
+#ifdef INET6
+ if (proto_reg[CARP_INET6] == 0) {
+ (void)ip6proto_unregister(IPPROTO_CARP);
+ pf_proto_unregister(PF_INET6, IPPROTO_CARP, SOCK_RAW);
+ proto_reg[CARP_INET6] = -1;
+ }
+ carp_iamatch6_p = NULL;
+ carp_macmatch6_p = NULL;
+#endif
+ carp_linkstate_p = NULL;
+ carp_forus_p = NULL;
+ carp_output_p = NULL;
+ mtx_destroy(&carp_mtx);
+}
+
+static int
+carp_mod_load(void)
+{
+ int err;
+
+ if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
+ carp_ifdetach, NULL, EVENTHANDLER_PRI_ANY);
+ if (if_detach_event_tag == NULL)
+ return (ENOMEM);
+ mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
+ LIST_INIT(&carpif_list);
+ if_clone_attach(&carp_cloner);
+ carp_linkstate_p = carp_carpdev_state;
+ carp_forus_p = carp_forus;
+ carp_output_p = carp_output;
+#ifdef INET6
+ carp_iamatch6_p = carp_iamatch6;
+ carp_macmatch6_p = carp_macmatch6;
+ proto_reg[CARP_INET6] = pf_proto_register(PF_INET6,
+ (struct protosw *)&in6_carp_protosw);
+ if (proto_reg[CARP_INET6] != 0) {
+ printf("carp: error %d attaching to PF_INET6\n",
+ proto_reg[CARP_INET6]);
+ carp_mod_cleanup();
+ return (EINVAL);
+ }
+ err = ip6proto_register(IPPROTO_CARP);
+ if (err) {
+ printf("carp: error %d registering with INET6\n", err);
+ carp_mod_cleanup();
+ return (EINVAL);
+ }
+#endif
+#ifdef INET
+ carp_iamatch_p = carp_iamatch;
+ proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw);
+ if (proto_reg[CARP_INET] != 0) {
+ printf("carp: error %d attaching to PF_INET\n",
+ proto_reg[CARP_INET]);
+ carp_mod_cleanup();
+ return (EINVAL);
+ }
+ err = ipproto_register(IPPROTO_CARP);
+ if (err) {
+ printf("carp: error %d registering with INET\n", err);
+ carp_mod_cleanup();
+ return (EINVAL);
+ }
+#endif
+ return 0;
+}
+
+static int
+carp_modevent(module_t mod, int type, void *data)
+{
+ switch (type) {
+ case MOD_LOAD:
+ return carp_mod_load();
+ /* NOTREACHED */
+ case MOD_UNLOAD:
+ /*
+ * XXX: For now, disallow module unloading by default due to
+ * a race condition where a thread may dereference one of the
+ * function pointer hooks after the module has been
+ * unloaded, during processing of a packet, causing a panic.
+ */
+#ifdef CARPMOD_CAN_UNLOAD
+ carp_mod_cleanup();
+#else
+ return (EBUSY);
+#endif
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static moduledata_t carp_mod = {
+ "carp",
+ carp_modevent,
+ 0
+};
+
+DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
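The scheduling above reduces to simple arithmetic on the advbase and advskew values carried in the advertisements. The standalone sketch below is illustrative only (the helper names are not part of the sources): it reproduces the intervals carp_setrun() programs through tvtohz(), i.e. a MASTER advertises every advbase + advskew/256 seconds and a BACKUP waits three such skewed intervals before carp_master_down() promotes it.

#include <stdio.h>

/* Illustrative helper, not part of the FreeBSD sources. */
static void
carp_intervals(int advbase, int advskew, double *advert, double *master_down)
{
	double skew = (double)advskew / 256.0;	/* advskew is an 8-bit fraction of a second */

	*advert = advbase + skew;		/* MASTER case in carp_setrun() */
	*master_down = 3 * advbase + skew;	/* BACKUP case in carp_setrun() */
}

int
main(void)
{
	double advert, down;

	/* with advbase = CARP_DFLTINTV (1) and advskew = 100 */
	carp_intervals(1, 100, &advert, &down);
	printf("advertise every %.3fs, declare master down after %.3fs\n",
	    advert, down);
	return (0);
}

With the defaults (advbase 1, advskew 0) a backup therefore takes over roughly three seconds after the last advertisement it hears.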
diff --git a/freebsd/sys/netinet/ip_carp.h b/freebsd/sys/netinet/ip_carp.h
new file mode 100644
index 00000000..2f2b4f28
--- /dev/null
+++ b/freebsd/sys/netinet/ip_carp.h
@@ -0,0 +1,191 @@
+/* $FreeBSD$ */
+/* $OpenBSD: ip_carp.h,v 1.8 2004/07/29 22:12:15 mcbride Exp $ */
+
+/*
+ * Copyright (c) 2002 Michael Shalayeff. All rights reserved.
+ * Copyright (c) 2003 Ryan McBride. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _IP_CARP_H
+#define _IP_CARP_H
+
+/*
+ * The CARP header layout is as follows:
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |Version| Type | VirtualHostID | AdvSkew | Auth Len |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Reserved | AdvBase | Checksum |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Counter (1) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Counter (2) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | SHA-1 HMAC (1) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | SHA-1 HMAC (2) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | SHA-1 HMAC (3) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | SHA-1 HMAC (4) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | SHA-1 HMAC (5) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ */
+
+struct carp_header {
+#if BYTE_ORDER == LITTLE_ENDIAN
+ u_int8_t carp_type:4,
+ carp_version:4;
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+ u_int8_t carp_version:4,
+ carp_type:4;
+#endif
+ u_int8_t carp_vhid; /* virtual host id */
+ u_int8_t carp_advskew; /* advertisement skew */
+ u_int8_t carp_authlen; /* size of counter+md, 32bit chunks */
+ u_int8_t carp_pad1; /* reserved */
+ u_int8_t carp_advbase; /* advertisement interval */
+ u_int16_t carp_cksum;
+ u_int32_t carp_counter[2];
+ unsigned char carp_md[20]; /* SHA1 HMAC */
+} __packed;
+
+#ifdef CTASSERT
+CTASSERT(sizeof(struct carp_header) == 36);
+#endif
+
+#define CARP_DFLTTL 255
+
+/* carp_version */
+#define CARP_VERSION 2
+
+/* carp_type */
+#define CARP_ADVERTISEMENT 0x01
+
+#define CARP_KEY_LEN 20 /* a sha1 hash of a passphrase */
+
+/* carp_advbase */
+#define CARP_DFLTINTV 1
+
+/*
+ * Statistics.
+ */
+struct carpstats {
+ uint64_t carps_ipackets; /* total input packets, IPv4 */
+ uint64_t carps_ipackets6; /* total input packets, IPv6 */
+ uint64_t carps_badif; /* wrong interface */
+ uint64_t carps_badttl; /* TTL is not CARP_DFLTTL */
+ uint64_t carps_hdrops; /* packets shorter than hdr */
+ uint64_t carps_badsum; /* bad checksum */
+ uint64_t carps_badver; /* bad (incl unsupp) version */
+ uint64_t carps_badlen; /* data length does not match */
+ uint64_t carps_badauth; /* bad authentication */
+ uint64_t carps_badvhid; /* bad VHID */
+ uint64_t carps_badaddrs; /* bad address list */
+
+ uint64_t carps_opackets; /* total output packets, IPv4 */
+ uint64_t carps_opackets6; /* total output packets, IPv6 */
+ uint64_t carps_onomem; /* no memory for an mbuf */
+ uint64_t carps_ostates; /* total state updates sent */
+
+ uint64_t carps_preempt; /* if enabled, preemptions */
+};
+
+#ifdef _KERNEL
+#define CARPSTATS_ADD(name, val) carpstats.name += (val)
+#define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1)
+#endif
+
+/*
+ * Configuration structure for SIOCSVH and SIOCGVH
+ */
+struct carpreq {
+ int carpr_state;
+#define CARP_STATES "INIT", "BACKUP", "MASTER"
+#define CARP_MAXSTATE 2
+ int carpr_vhid;
+ int carpr_advskew;
+ int carpr_advbase;
+ unsigned char carpr_key[CARP_KEY_LEN];
+};
+#define SIOCSVH _IOWR('i', 245, struct ifreq)
+#define SIOCGVH _IOWR('i', 246, struct ifreq)
+
+/*
+ * Names for CARP sysctl objects
+ */
+#define CARPCTL_ALLOW 1 /* accept incoming CARP packets */
+#define CARPCTL_PREEMPT 2 /* high-pri backup preemption mode */
+#define CARPCTL_LOG 3 /* log bad packets */
+#define CARPCTL_STATS 4 /* statistics (read-only) */
+#define CARPCTL_ARPBALANCE 5 /* balance arp responses */
+#define CARPCTL_MAXID 6
+
+#define CARPCTL_NAMES { \
+ { 0, 0 }, \
+ { "allow", CTLTYPE_INT }, \
+ { "preempt", CTLTYPE_INT }, \
+ { "log", CTLTYPE_INT }, \
+ { "stats", CTLTYPE_STRUCT }, \
+ { "arpbalance", CTLTYPE_INT }, \
+}
+
+#ifdef _KERNEL
+void carp_carpdev_state(struct ifnet *);
+void carp_input (struct mbuf *, int);
+int carp6_input (struct mbuf **, int *, int);
+int carp_output (struct ifnet *, struct mbuf *, struct sockaddr *,
+ struct rtentry *);
+int carp_iamatch (struct ifnet *, struct in_ifaddr *, struct in_addr *,
+ u_int8_t **);
+struct ifaddr *carp_iamatch6(struct ifnet *, struct in6_addr *);
+caddr_t carp_macmatch6(struct ifnet *, struct mbuf *, const struct in6_addr *);
+struct ifnet *carp_forus (struct ifnet *, u_char *);
+
+/* These are external networking stack hooks for CARP */
+/* net/if.c */
+extern void (*carp_linkstate_p)(struct ifnet *);
+/* net/if_bridge.c net/if_ethersubr.c */
+extern struct ifnet *(*carp_forus_p)(struct ifnet *, u_char *);
+/* net/if_ethersubr.c */
+extern int (*carp_output_p)(struct ifnet *, struct mbuf *,
+ struct sockaddr *, struct rtentry *);
+#ifdef INET
+/* netinet/if_ether.c */
+extern int (*carp_iamatch_p)(struct ifnet *, struct in_ifaddr *,
+ struct in_addr *, u_int8_t **);
+#endif
+#ifdef INET6
+/* netinet6/nd6_nbr.c */
+extern struct ifaddr *(*carp_iamatch6_p)(struct ifnet *, struct in6_addr *);
+extern caddr_t (*carp_macmatch6_p)(struct ifnet *, struct mbuf *,
+ const struct in6_addr *);
+#endif
+#endif
+#endif /* _IP_CARP_H */
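The header above is also the userland-facing API: a carpreq is passed through ifr_data with the SIOCSVH/SIOCGVH ioctls that carp_ioctl() handles. The sketch below is a hedged illustration of that path, roughly what ifconfig(8) does; it assumes a carp0 interface already exists, SIOCSVH requires the PRIV_NETINET_CARP privilege, and the key and skew values are arbitrary examples.

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ioctl.h>

#include <net/if.h>
#include <netinet/ip_carp.h>

#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct carpreq carpr;
	struct ifreq ifr;
	int s;

	s = socket(AF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		err(1, "socket");

	memset(&ifr, 0, sizeof(ifr));
	memset(&carpr, 0, sizeof(carpr));
	strlcpy(ifr.ifr_name, "carp0", sizeof(ifr.ifr_name));
	ifr.ifr_data = (caddr_t)&carpr;

	/* Fetch the current settings; the key is only copied out with privilege. */
	if (ioctl(s, SIOCGVH, &ifr) < 0)
		err(1, "SIOCGVH");

	carpr.carpr_vhid = 1;			/* virtual host ID, 1..255 */
	carpr.carpr_advbase = CARP_DFLTINTV;	/* advertise every second */
	carpr.carpr_advskew = 100;		/* bias this host toward BACKUP */
	strlcpy((char *)carpr.carpr_key, "examplekey", sizeof(carpr.carpr_key));

	if (ioctl(s, SIOCSVH, &ifr) < 0)	/* needs PRIV_NETINET_CARP */
		err(1, "SIOCSVH");

	close(s);
	return (0);
}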
diff --git a/freebsd/sys/netinet/ip_divert.c b/freebsd/sys/netinet/ip_divert.c
new file mode 100644
index 00000000..13999825
--- /dev/null
+++ b/freebsd/sys/netinet/ip_divert.c
@@ -0,0 +1,818 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#if !defined(KLD_MODULE)
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_sctp.h>
+#ifndef INET
+#error "IPDIVERT requires INET."
+#endif
+#endif
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/sys/signalvar.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sx.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/systm.h>
+
+#include <freebsd/vm/uma.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/netisr.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#ifdef SCTP
+#include <freebsd/netinet/sctp_crc32.h>
+#endif
+
+#include <freebsd/security/mac/mac_framework.h>
+
+/*
+ * Divert sockets
+ */
+
+/*
+ * Allocate enough space to hold a full IP packet
+ */
+#define DIVSNDQ (65536 + 100)
+#define DIVRCVQ (65536 + 100)
+
+/*
+ * Divert sockets work in conjunction with ipfw or other packet filters;
+ * see the divert(4) manpage for features.
+ * Packets are selected by the packet filter and tagged with an
+ * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by
+ * the packet filter) and information on the matching filter rule for
+ * subsequent reinjection. The divert_port is used to put the packet
+ * on the corresponding divert socket, while the rule number is passed
+ * up (at least partially) as the sin_port in the struct sockaddr.
+ *
+ * Packets written to the divert socket carry in sin_addr a
+ * destination address, and in sin_port the number of the filter rule
+ * after which to continue processing.
+ * If the destination address is INADDR_ANY, the packet is treated
+ * as outgoing and sent to ip_output(); otherwise it is treated as
+ * incoming and sent to ip_input().
+ * Further, sin_zero carries some information on the interface,
+ * which can be used in the reinject -- see comments in the code.
+ *
+ * On reinjection, processing in ip_input() and ip_output()
+ * will be exactly the same as for the original packet, except that
+ * packet filter processing will start at the rule number after the one
+ * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0
+ * will apply the entire ruleset to the packet).
+ */
+
+/* Internal variables. */
+static VNET_DEFINE(struct inpcbhead, divcb);
+static VNET_DEFINE(struct inpcbinfo, divcbinfo);
+
+#define V_divcb VNET(divcb)
+#define V_divcbinfo VNET(divcbinfo)
+
+static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */
+static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */
+
+static eventhandler_tag ip_divert_event_tag;
+
+/*
+ * Initialize divert connection block queue.
+ */
+static void
+div_zone_change(void *tag)
+{
+
+ uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets);
+}
+
+static int
+div_inpcb_init(void *mem, int size, int flags)
+{
+ struct inpcb *inp = mem;
+
+ INP_LOCK_INIT(inp, "inp", "divinp");
+ return (0);
+}
+
+static void
+div_inpcb_fini(void *mem, int size)
+{
+ struct inpcb *inp = mem;
+
+ INP_LOCK_DESTROY(inp);
+}
+
+static void
+div_init(void)
+{
+
+ INP_INFO_LOCK_INIT(&V_divcbinfo, "div");
+ LIST_INIT(&V_divcb);
+ V_divcbinfo.ipi_listhead = &V_divcb;
+#ifdef VIMAGE
+ V_divcbinfo.ipi_vnet = curvnet;
+#endif
+ /*
+ * XXX We don't use the hash list for divert IP, but it's easier
+ * to allocate a one entry hash list than it is to check all
+ * over the place for hashbase == NULL.
+ */
+ V_divcbinfo.ipi_hashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_hashmask);
+ V_divcbinfo.ipi_porthashbase = hashinit(1, M_PCB,
+ &V_divcbinfo.ipi_porthashmask);
+ V_divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb),
+ NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR,
+ UMA_ZONE_NOFREE);
+ uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets);
+}
+
+static void
+div_destroy(void)
+{
+
+ INP_INFO_LOCK_DESTROY(&V_divcbinfo);
+ uma_zdestroy(V_divcbinfo.ipi_zone);
+ hashdestroy(V_divcbinfo.ipi_hashbase, M_PCB, V_divcbinfo.ipi_hashmask);
+ hashdestroy(V_divcbinfo.ipi_porthashbase, M_PCB,
+ V_divcbinfo.ipi_porthashmask);
+}
+
+/*
+ * IPPROTO_DIVERT is not in the real IP protocol number space; this
+ * function should never be called. Just in case, drop any packets.
+ */
+static void
+div_input(struct mbuf *m, int off)
+{
+
+ KMOD_IPSTAT_INC(ips_noproto);
+ m_freem(m);
+}
+
+/*
+ * Divert a packet by passing it up to the divert socket at port 'port'.
+ *
+ * Setup generic address and protocol structures for div_input routine,
+ * then pass them along with mbuf chain.
+ */
+static void
+divert_packet(struct mbuf *m, int incoming)
+{
+ struct ip *ip;
+ struct inpcb *inp;
+ struct socket *sa;
+ u_int16_t nport;
+ struct sockaddr_in divsrc;
+ struct m_tag *mtag;
+
+ mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
+ if (mtag == NULL) {
+ m_freem(m);
+ return;
+ }
+ /* Assure header */
+ if (m->m_len < sizeof(struct ip) &&
+ (m = m_pullup(m, sizeof(struct ip))) == 0)
+ return;
+ ip = mtod(m, struct ip *);
+
+ /* Delayed checksums are currently not compatible with divert. */
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ ip->ip_len = ntohs(ip->ip_len);
+ in_delayed_cksum(m);
+ m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ ip->ip_len = htons(ip->ip_len);
+ }
+#ifdef SCTP
+ if (m->m_pkthdr.csum_flags & CSUM_SCTP) {
+ ip->ip_len = ntohs(ip->ip_len);
+ sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
+ m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
+ ip->ip_len = htons(ip->ip_len);
+ }
+#endif
+ bzero(&divsrc, sizeof(divsrc));
+ divsrc.sin_len = sizeof(divsrc);
+ divsrc.sin_family = AF_INET;
+ /* record matching rule, in host format */
+ divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum;
+ /*
+ * Record receive interface address, if any.
+ * But only for incoming packets.
+ */
+ if (incoming) {
+ struct ifaddr *ifa;
+ struct ifnet *ifp;
+
+ /* Sanity check */
+ M_ASSERTPKTHDR(m);
+
+ /* Find IP address for receive interface */
+ ifp = m->m_pkthdr.rcvif;
+ if_addr_rlock(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ divsrc.sin_addr =
+ ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr;
+ break;
+ }
+ if_addr_runlock(ifp);
+ }
+ /*
+ * Record the incoming interface name whenever we have one.
+ */
+ if (m->m_pkthdr.rcvif) {
+ /*
+ * Hide the actual interface name in the
+ * sin_zero array. XXX This needs to be moved to a
+ * different sockaddr type for divert, e.g.
+ * sockaddr_div with multiple fields like
+ * sockaddr_dl. Presently we have only 7 bytes
+ * but that will do for now as most interfaces
+ * are 4 or less + 2 or less bytes for unit.
+ * There is probably a faster way of doing this,
+ * possibly taking it from the sockaddr_dl on the iface.
+ * This solves the problem of a P2P link and a LAN interface
+ * having the same address, which can result in the wrong
+ * interface being assigned to the packet when fed back
+ * into the divert socket. Theoretically if the daemon saves
+ * and re-uses the sockaddr_in as suggested in the man pages,
+ * this iface name will come along for the ride.
+ * (see div_output for the other half of this.)
+ */
+ strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname,
+ sizeof(divsrc.sin_zero));
+ }
+
+ /* Put packet on socket queue, if any */
+ sa = NULL;
+ nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
+ INP_INFO_RLOCK(&V_divcbinfo);
+ LIST_FOREACH(inp, &V_divcb, inp_list) {
+ /* XXX why does only one socket match? */
+ if (inp->inp_lport == nport) {
+ INP_RLOCK(inp);
+ sa = inp->inp_socket;
+ SOCKBUF_LOCK(&sa->so_rcv);
+ if (sbappendaddr_locked(&sa->so_rcv,
+ (struct sockaddr *)&divsrc, m,
+ (struct mbuf *)0) == 0) {
+ SOCKBUF_UNLOCK(&sa->so_rcv);
+ sa = NULL; /* force mbuf reclaim below */
+ } else
+ sorwakeup_locked(sa);
+ INP_RUNLOCK(inp);
+ break;
+ }
+ }
+ INP_INFO_RUNLOCK(&V_divcbinfo);
+ if (sa == NULL) {
+ m_freem(m);
+ KMOD_IPSTAT_INC(ips_noproto);
+ KMOD_IPSTAT_DEC(ips_delivered);
+ }
+}
+
+/*
+ * Deliver packet back into the IP processing machinery.
+ *
+ * If no address specified, or address is 0.0.0.0, send to ip_output();
+ * otherwise, send to ip_input() and mark as having been received on
+ * the interface with that address.
+ */
+static int
+div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
+ struct mbuf *control)
+{
+ struct m_tag *mtag;
+ struct ipfw_rule_ref *dt;
+ int error = 0;
+ struct mbuf *options;
+
+ /*
+ * An mbuf may not have come from userland, but we pretend
+ * that it has.
+ */
+ m->m_pkthdr.rcvif = NULL;
+ m->m_nextpkt = NULL;
+ M_SETFIB(m, so->so_fibnum);
+
+ if (control)
+ m_freem(control); /* XXX */
+
+ mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
+ if (mtag == NULL) {
+ /* this should be normal */
+ mtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
+ sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
+ if (mtag == NULL) {
+ error = ENOBUFS;
+ goto cantsend;
+ }
+ m_tag_prepend(m, mtag);
+ }
+ dt = (struct ipfw_rule_ref *)(mtag+1);
+
+ /* Loopback avoidance and state recovery */
+ if (sin) {
+ int i;
+
+ /* set the starting point. We provide a non-zero slot,
+ * but a non-matching chain_id to skip that info and use
+ * the rulenum/rule_id.
+ */
+ dt->slot = 1; /* dummy, chain_id is invalid */
+ dt->chain_id = 0;
+ dt->rulenum = sin->sin_port+1; /* host format ? */
+ dt->rule_id = 0;
+ /*
+ * Find receive interface with the given name, stuffed
+ * (if it exists) in the sin_zero[] field.
+ * The name is user supplied data so don't trust its size
+ * or that it is zero terminated.
+ */
+ for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++)
+ ;
+ if ( i > 0 && i < sizeof(sin->sin_zero))
+ m->m_pkthdr.rcvif = ifunit(sin->sin_zero);
+ }
+
+ /* Reinject packet into the system as incoming or outgoing */
+ if (!sin || sin->sin_addr.s_addr == 0) {
+ struct ip *const ip = mtod(m, struct ip *);
+ struct inpcb *inp;
+
+ dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT;
+ inp = sotoinpcb(so);
+ INP_RLOCK(inp);
+ /*
+ * Don't allow both user specified and setsockopt options,
+ * and don't allow packet length sizes that will crash
+ */
+ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) ||
+ ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
+ error = EINVAL;
+ INP_RUNLOCK(inp);
+ m_freem(m);
+ } else {
+ /* Convert fields to host order for ip_output() */
+ ip->ip_len = ntohs(ip->ip_len);
+ ip->ip_off = ntohs(ip->ip_off);
+
+ /* Send packet to output processing */
+ KMOD_IPSTAT_INC(ips_rawout); /* XXX */
+
+#ifdef MAC
+ mac_inpcb_create_mbuf(inp, m);
+#endif
+ /*
+ * Get ready to inject the packet into ip_output().
+ * Just in case socket options were specified on the
+ * divert socket, we duplicate them. This is done
+ * to avoid having to hold the PCB locks over the call
+ * to ip_output(), as doing this results in a number of
+ * lock ordering complexities.
+ *
+ * Note that we set the multicast options argument for
+ * ip_output() to NULL since it should be invariant that
+ * they are not present.
+ */
+ KASSERT(inp->inp_moptions == NULL,
+ ("multicast options set on a divert socket"));
+ options = NULL;
+ /*
+ * XXXCSJP: It is unclear to me whether or not it makes
+ * sense for divert sockets to have options. However,
+ * for now we will duplicate them with the INP locks
+ * held so we can use them in ip_output() without
+ * requiring a reference to the pcb.
+ */
+ if (inp->inp_options != NULL) {
+ options = m_dup(inp->inp_options, M_DONTWAIT);
+ if (options == NULL)
+ error = ENOBUFS;
+ }
+ INP_RUNLOCK(inp);
+ if (error == ENOBUFS) {
+ m_freem(m);
+ return (error);
+ }
+ error = ip_output(m, options, NULL,
+ ((so->so_options & SO_DONTROUTE) ?
+ IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST |
+ IP_RAWOUTPUT, NULL, NULL);
+ if (options != NULL)
+ m_freem(options);
+ }
+ } else {
+ dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN;
+ if (m->m_pkthdr.rcvif == NULL) {
+ /*
+ * No luck with the name, check by IP address.
+ * Clear the port and the ifname to make sure
+ * there are no distractions for ifa_ifwithaddr.
+ */
+ struct ifaddr *ifa;
+
+ bzero(sin->sin_zero, sizeof(sin->sin_zero));
+ sin->sin_port = 0;
+ ifa = ifa_ifwithaddr((struct sockaddr *) sin);
+ if (ifa == NULL) {
+ error = EADDRNOTAVAIL;
+ goto cantsend;
+ }
+ m->m_pkthdr.rcvif = ifa->ifa_ifp;
+ ifa_free(ifa);
+ }
+#ifdef MAC
+ mac_socket_create_mbuf(so, m);
+#endif
+ /* Send packet to input processing via netisr */
+ netisr_queue_src(NETISR_IP, (uintptr_t)so, m);
+ }
+
+ return error;
+
+cantsend:
+ m_freem(m);
+ return error;
+}
+
+static int
+div_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct inpcb *inp;
+ int error;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp == NULL, ("div_attach: inp != NULL"));
+ if (td != NULL) {
+ error = priv_check(td, PRIV_NETINET_DIVERT);
+ if (error)
+ return (error);
+ }
+ error = soreserve(so, div_sendspace, div_recvspace);
+ if (error)
+ return error;
+ INP_INFO_WLOCK(&V_divcbinfo);
+ error = in_pcballoc(so, &V_divcbinfo);
+ if (error) {
+ INP_INFO_WUNLOCK(&V_divcbinfo);
+ return error;
+ }
+ inp = (struct inpcb *)so->so_pcb;
+ INP_INFO_WUNLOCK(&V_divcbinfo);
+ inp->inp_ip_p = proto;
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_flags |= INP_HDRINCL;
+ INP_WUNLOCK(inp);
+ return 0;
+}
+
+static void
+div_detach(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("div_detach: inp == NULL"));
+ INP_INFO_WLOCK(&V_divcbinfo);
+ INP_WLOCK(inp);
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+ INP_INFO_WUNLOCK(&V_divcbinfo);
+}
+
+static int
+div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp;
+ int error;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("div_bind: inp == NULL"));
+ /* in_pcbbind assumes that nam is a sockaddr_in
+ * and in_pcbbind requires a valid address. Since divert
+ * sockets don't supply one, we need to make sure the address
+ * is filled in properly.
+ * XXX -- divert should not be abusing in_pcbbind
+ * and should probably have its own family.
+ */
+ if (nam->sa_family != AF_INET)
+ return EAFNOSUPPORT;
+ ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
+ INP_INFO_WLOCK(&V_divcbinfo);
+ INP_WLOCK(inp);
+ error = in_pcbbind(inp, nam, td->td_ucred);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_divcbinfo);
+ return error;
+}
+
+static int
+div_shutdown(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("div_shutdown: inp == NULL"));
+ INP_WLOCK(inp);
+ socantsendmore(so);
+ INP_WUNLOCK(inp);
+ return 0;
+}
+
+static int
+div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
+ struct mbuf *control, struct thread *td)
+{
+
+ /* Packet must have a header (but that's about it) */
+ if (m->m_len < sizeof (struct ip) &&
+ (m = m_pullup(m, sizeof (struct ip))) == 0) {
+ KMOD_IPSTAT_INC(ips_toosmall);
+ m_freem(m);
+ return EINVAL;
+ }
+
+ /* Send packet */
+ return div_output(so, m, (struct sockaddr_in *)nam, control);
+}
+
+static void
+div_ctlinput(int cmd, struct sockaddr *sa, void *vip)
+{
+ struct in_addr faddr;
+
+ faddr = ((struct sockaddr_in *)sa)->sin_addr;
+ if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
+ return;
+ if (PRC_IS_REDIRECT(cmd))
+ return;
+}
+
+static int
+div_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, i, n;
+ struct inpcb *inp, **inp_list;
+ inp_gen_t gencnt;
+ struct xinpgen xig;
+
+ /*
+ * The process of preparing the TCB list is too time-consuming and
+ * resource-intensive to repeat twice on every request.
+ */
+ if (req->oldptr == 0) {
+ n = V_divcbinfo.ipi_count;
+ n += imax(n / 8, 10);
+ req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
+ return 0;
+ }
+
+ if (req->newptr != 0)
+ return EPERM;
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ INP_INFO_RLOCK(&V_divcbinfo);
+ gencnt = V_divcbinfo.ipi_gencnt;
+ n = V_divcbinfo.ipi_count;
+ INP_INFO_RUNLOCK(&V_divcbinfo);
+
+ error = sysctl_wire_old_buffer(req,
+ 2 * sizeof(xig) + n*sizeof(struct xinpcb));
+ if (error != 0)
+ return (error);
+
+ xig.xig_len = sizeof xig;
+ xig.xig_count = n;
+ xig.xig_gen = gencnt;
+ xig.xig_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ if (error)
+ return error;
+
+ inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
+ if (inp_list == 0)
+ return ENOMEM;
+
+ INP_INFO_RLOCK(&V_divcbinfo);
+ for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n;
+ inp = LIST_NEXT(inp, inp_list)) {
+ INP_WLOCK(inp);
+ if (inp->inp_gencnt <= gencnt &&
+ cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
+ in_pcbref(inp);
+ inp_list[i++] = inp;
+ }
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(&V_divcbinfo);
+ n = i;
+
+ error = 0;
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ INP_RLOCK(inp);
+ if (inp->inp_gencnt <= gencnt) {
+ struct xinpcb xi;
+ bzero(&xi, sizeof(xi));
+ xi.xi_len = sizeof xi;
+ /* XXX should avoid extra copy */
+ bcopy(inp, &xi.xi_inp, sizeof *inp);
+ if (inp->inp_socket)
+ sotoxsocket(inp->inp_socket, &xi.xi_socket);
+ INP_RUNLOCK(inp);
+ error = SYSCTL_OUT(req, &xi, sizeof xi);
+ } else
+ INP_RUNLOCK(inp);
+ }
+ INP_INFO_WLOCK(&V_divcbinfo);
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ INP_WLOCK(inp);
+ if (!in_pcbrele(inp))
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_WUNLOCK(&V_divcbinfo);
+
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state.
+ * If the generation differs from what we told
+ * her before, she knows that something happened
+ * while we were processing this request, and it
+ * might be necessary to retry.
+ */
+ INP_INFO_RLOCK(&V_divcbinfo);
+ xig.xig_gen = V_divcbinfo.ipi_gencnt;
+ xig.xig_sogen = so_gencnt;
+ xig.xig_count = V_divcbinfo.ipi_count;
+ INP_INFO_RUNLOCK(&V_divcbinfo);
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ }
+ free(inp_list, M_TEMP);
+ return error;
+}
+
+#ifdef SYSCTL_NODE
+SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT");
+SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0,
+ div_pcblist, "S,xinpcb", "List of active divert sockets");
+#endif
+
+struct pr_usrreqs div_usrreqs = {
+ .pru_attach = div_attach,
+ .pru_bind = div_bind,
+ .pru_control = in_control,
+ .pru_detach = div_detach,
+ .pru_peeraddr = in_getpeeraddr,
+ .pru_send = div_send,
+ .pru_shutdown = div_shutdown,
+ .pru_sockaddr = in_getsockaddr,
+ .pru_sosetlabel = in_pcbsosetlabel
+};
+
+struct protosw div_protosw = {
+ .pr_type = SOCK_RAW,
+ .pr_protocol = IPPROTO_DIVERT,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = div_input,
+ .pr_ctlinput = div_ctlinput,
+ .pr_ctloutput = ip_ctloutput,
+ .pr_init = div_init,
+#ifdef VIMAGE
+ .pr_destroy = div_destroy,
+#endif
+ .pr_usrreqs = &div_usrreqs
+};
+
+static int
+div_modevent(module_t mod, int type, void *unused)
+{
+ int err = 0;
+#ifndef VIMAGE
+ int n;
+#endif
+
+ switch (type) {
+ case MOD_LOAD:
+ /*
+ * Protocol will be initialized by pf_proto_register().
+ * We don't have to register ip_protox because we are not
+ * a true IP protocol that goes over the wire.
+ */
+ err = pf_proto_register(PF_INET, &div_protosw);
+ if (err != 0)
+ return (err);
+ ip_divert_ptr = divert_packet;
+ ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change,
+ div_zone_change, NULL, EVENTHANDLER_PRI_ANY);
+ break;
+ case MOD_QUIESCE:
+ /*
+ * IPDIVERT may normally not be unloaded because of the
+ * potential race conditions. Tell kldunload we can't be
+ * unloaded unless the unload is forced.
+ */
+ err = EPERM;
+ break;
+ case MOD_UNLOAD:
+#ifdef VIMAGE
+ err = EPERM;
+ break;
+#else
+ /*
+ * Forced unload.
+ *
+ * Module ipdivert can only be unloaded if no sockets are
+ * connected. Maybe this can be changed later to forcefully
+ * disconnect any open sockets.
+ *
+ * XXXRW: Note that there is a slight race here, as a new
+ * socket open request could be spinning on the lock and then
+ * we destroy the lock.
+ */
+ INP_INFO_WLOCK(&V_divcbinfo);
+ n = V_divcbinfo.ipi_count;
+ if (n != 0) {
+ err = EBUSY;
+ INP_INFO_WUNLOCK(&V_divcbinfo);
+ break;
+ }
+ ip_divert_ptr = NULL;
+ err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW);
+ INP_INFO_WUNLOCK(&V_divcbinfo);
+ div_destroy();
+ EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag);
+ break;
+#endif /* !VIMAGE */
+ default:
+ err = EOPNOTSUPP;
+ break;
+ }
+ return err;
+}
+
+static moduledata_t ipdivertmod = {
+ "ipdivert",
+ div_modevent,
+ 0
+};
+
+DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+MODULE_DEPEND(ipdivert, ipfw, 2, 2, 2);
+MODULE_VERSION(ipdivert, 1);
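The long comment near the top of ip_divert.c describes the divert socket contract from the kernel side; the hedged sketch below shows the matching userland loop. It assumes an ipfw rule such as "ipfw add divert 2000 ip from any to any" is installed, opening the socket requires privilege (div_attach() checks PRIV_NETINET_DIVERT), and the port number and buffer size are arbitrary. Packets read from the socket are written straight back so that, per div_output(), processing resumes at the rule after the one recorded in sin_port.

#include <sys/types.h>
#include <sys/socket.h>

#include <arpa/inet.h>
#include <netinet/in.h>

#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct sockaddr_in bindaddr, tag;
	socklen_t taglen;
	char packet[65535];
	ssize_t n;
	int fd;

	fd = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT);
	if (fd < 0)
		err(1, "socket");

	memset(&bindaddr, 0, sizeof(bindaddr));
	bindaddr.sin_len = sizeof(bindaddr);
	bindaddr.sin_family = AF_INET;
	bindaddr.sin_port = htons(2000);	/* the divert port used in the ipfw rule */
	bindaddr.sin_addr.s_addr = INADDR_ANY;	/* div_bind() forces this anyway */
	if (bind(fd, (struct sockaddr *)&bindaddr, sizeof(bindaddr)) < 0)
		err(1, "bind");

	for (;;) {
		taglen = sizeof(tag);
		n = recvfrom(fd, packet, sizeof(packet), 0,
		    (struct sockaddr *)&tag, &taglen);
		if (n < 0)
			err(1, "recvfrom");

		/*
		 * tag.sin_port holds the matching rule number (host order),
		 * tag.sin_addr an interface address for incoming packets or
		 * INADDR_ANY for outgoing ones, and tag.sin_zero the receive
		 * interface name if it fits -- see divert_packet() above.
		 */
		printf("%zd bytes diverted by rule %u\n", n,
		    (unsigned)tag.sin_port);

		/* Reinject unchanged; ipfw resumes after rule tag.sin_port. */
		if (sendto(fd, packet, (size_t)n, 0,
		    (struct sockaddr *)&tag, sizeof(tag)) < 0)
			warn("sendto");
	}
	/* NOTREACHED */
}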
diff --git a/freebsd/sys/netinet/ip_divert.h b/freebsd/sys/netinet/ip_divert.h
new file mode 100644
index 00000000..eb9b33d4
--- /dev/null
+++ b/freebsd/sys/netinet/ip_divert.h
@@ -0,0 +1,55 @@
+/*-
+ * Copyright (c) 2003 Sam Leffler, Errno Consulting
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ * redistribution must be conditioned upon including a substantially
+ * similar Disclaimer requirement for further binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ * of any contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGES.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IP_DIVERT_HH_
+#define _NETINET_IP_DIVERT_HH_
+
+/*
+ * divert has no custom kernel-userland API.
+ *
+ * All communication occurs through a raw divert socket, with metadata
+ * carried in the sockaddr_in, where
+ *
+ * kernel-->userland
+ * sin_port = matching rule, host format;
+ * sin_addr = IN: first address of the incoming interface;
+ * OUT: INADDR_ANY
+ * sin_zero = if fits, the interface name (max 7 bytes + NUL)
+ *
+ * userland->kernel
+ * sin_port = restart-rule - 1, host order
+ * (we restart at sin_port + 1)
+ * sin_addr = IN: address of the incoming interface;
+ * OUT: INADDR_ANY
+ */
+#endif /* _NETINET_IP_DIVERT_HH_ */
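For reference, a minimal userland sketch of the interface described in the
header above (not part of the imported sources): it binds a divert socket,
reads one diverted packet, and reinjects it unchanged. The divert port 8668
is an assumption and must match an ipfw "divert" rule; root privileges are
required.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	unsigned char buf[65535];
	struct sockaddr_in sin;
	socklen_t slen = sizeof(sin);
	ssize_t n;
	int fd;

	fd = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT);
	if (fd < 0)
		return (1);
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(8668);		/* divert port from the ipfw rule */
	sin.sin_addr.s_addr = INADDR_ANY;
	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
		return (1);

	/* sin now reports the matching rule (sin_port) and interface info. */
	n = recvfrom(fd, buf, sizeof(buf), 0, (struct sockaddr *)&sin, &slen);
	if (n < 0)
		return (1);

	/* Reinject unchanged; processing restarts after the matching rule. */
	if (sendto(fd, buf, (size_t)n, 0, (struct sockaddr *)&sin, slen) < 0)
		return (1);
	close(fd);
	return (0);
}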
diff --git a/freebsd/sys/netinet/ip_dummynet.h b/freebsd/sys/netinet/ip_dummynet.h
new file mode 100644
index 00000000..0bbc3263
--- /dev/null
+++ b/freebsd/sys/netinet/ip_dummynet.h
@@ -0,0 +1,263 @@
+/*-
+ * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DUMMYNET_H
+#define _IP_DUMMYNET_H
+
+/*
+ * Definition of the kernel-userland API for dummynet.
+ *
+ * Setsockopt() and getsockopt() pass a batch of objects, each
+ * of them starting with a "struct dn_id" which should fully identify
+ * the object and its relation with others in the sequence.
+ * The first object in each request should have
+ * type= DN_CMD_*, id = DN_API_VERSION.
+ * For other objects, type and subtype specify the object, len indicates
+ * the total length including the header, and 'id' identifies the specific
+ * object.
+ *
+ * Most objects are numbered with an identifier in the range 1..65535.
+ * DN_MAX_ID indicates the first value outside the range.
+ */
+
+#define DN_API_VERSION 12500000
+#define DN_MAX_ID 0x10000
+
+struct dn_id {
+ uint16_t len; /* total obj len including this header */
+ uint8_t type;
+ uint8_t subtype;
+ uint32_t id; /* generic id */
+};
+
+/*
+ * These values are in the type field of struct dn_id.
+ * To preserve the ABI, never rearrange the list or delete
+ * entries with the exception of DN_LAST
+ */
+enum {
+ DN_NONE = 0,
+ DN_LINK = 1,
+ DN_FS,
+ DN_SCH,
+ DN_SCH_I,
+ DN_QUEUE,
+ DN_DELAY_LINE,
+ DN_PROFILE,
+ DN_FLOW, /* struct dn_flow */
+ DN_TEXT, /* opaque text is the object */
+
+ DN_CMD_CONFIG = 0x80, /* objects follow */
+ DN_CMD_DELETE, /* subtype + list of entries */
+ DN_CMD_GET, /* subtype + list of entries */
+ DN_CMD_FLUSH,
+ /* for compatibility with FreeBSD 7.2/8 */
+ DN_COMPAT_PIPE,
+ DN_COMPAT_QUEUE,
+ DN_GET_COMPAT,
+
+ /* special commands for emulation of sysctl variables */
+ DN_SYSCTL_GET,
+ DN_SYSCTL_SET,
+
+ DN_LAST,
+} ;
+
+enum { /* subtype for schedulers, flowset and the like */
+ DN_SCHED_UNKNOWN = 0,
+ DN_SCHED_FIFO = 1,
+ DN_SCHED_WF2QP = 2,
+ /* others are in individual modules */
+} ;
+
+enum { /* user flags */
+ DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */
+ DN_NOERROR = 0x0002, /* do not report errors */
+ DN_QHT_HASH = 0x0004, /* qht is a hash table */
+ DN_QSIZE_BYTES = 0x0008, /* queue size is in bytes */
+ DN_HAS_PROFILE = 0x0010, /* a link has a profile */
+ DN_IS_RED = 0x0020,
+ DN_IS_GENTLE_RED= 0x0040,
+ DN_PIPE_CMD = 0x1000, /* pipe config... */
+};
+
+/*
+ * link template.
+ */
+struct dn_link {
+ struct dn_id oid;
+
+ /*
+ * Userland sets bw and delay in bits/s and milliseconds.
+ * The kernel converts this back and forth to bits/tick and ticks.
+ * XXX what about burst ?
+ */
+ int32_t link_nr;
+ int bandwidth; /* bit/s or bits/tick. */
+ int delay; /* ms and ticks */
+ uint64_t burst; /* scaled. bits*Hz XXX */
+} ;
+
+/*
+ * A flowset, which is a template for flows. Contains parameters
+ * from the command line: id, target scheduler, queue sizes, plr,
+ * flow masks, buckets for the flow hash, and possibly scheduler-
+ * specific parameters (weight, quantum and so on).
+ */
+struct dn_fs {
+ struct dn_id oid;
+ uint32_t fs_nr; /* the flowset number */
+ uint32_t flags; /* userland flags */
+ int qsize ; /* queue size in slots or bytes */
+ int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */
+ uint32_t buckets; /* buckets used for the queue hash table */
+
+ struct ipfw_flow_id flow_mask ;
+ uint32_t sched_nr; /* the scheduler we attach to */
+ /* generic scheduler parameters. Leave them at -1 if unset.
+ * Now we use 0: weight, 1: lmax, 2: priority
+ */
+ int par[4];
+
+ /* RED/GRED parameters.
+ * weight and probabilities are in the range 0..1 represented
+ * in fixed point arithmetic with SCALE_RED decimal bits.
+ */
+#define SCALE_RED 16
+#define SCALE(x) ( (x) << SCALE_RED )
+#define SCALE_VAL(x) ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED )
+ int w_q ; /* queue weight (scaled) */
+ int max_th ; /* maximum threshold for queue (scaled) */
+ int min_th ; /* minimum threshold for queue (scaled) */
+ int max_p ; /* maximum value for p_b (scaled) */
+
+};
+
+/*
+ * dn_flow collects flow_id and stats for queues and scheduler
+ * instances, and is used to pass this information to userland.
+ * oid.type/oid.subtype describe the object, oid.id is the number
+ * of the parent object.
+ */
+struct dn_flow {
+ struct dn_id oid;
+ struct ipfw_flow_id fid;
+ uint64_t tot_pkts; /* statistics counters */
+ uint64_t tot_bytes;
+	uint32_t length;	/* Queue length, in packets */
+	uint32_t len_bytes;	/* Queue length, in bytes */
+ uint32_t drops;
+};
+
+
+ /*
+ * Scheduler template, mostly indicating the name, number,
+ * sched_mask and buckets.
+ */
+struct dn_sch {
+ struct dn_id oid;
+ uint32_t sched_nr; /* N, scheduler number */
+ uint32_t buckets; /* number of buckets for the instances */
+ uint32_t flags; /* have_mask, ... */
+
+ char name[16]; /* null terminated */
+ /* mask to select the appropriate scheduler instance */
+ struct ipfw_flow_id sched_mask; /* M */
+};
+
+
+/* A delay profile is attached to a link.
+ * Note that a profile, like any other object, cannot be longer than 2^16 bytes.
+ */
+#define ED_MAX_SAMPLES_NO 1024
+struct dn_profile {
+ struct dn_id oid;
+ /* fields to simulate a delay profile */
+#define ED_MAX_NAME_LEN 32
+ char name[ED_MAX_NAME_LEN];
+ int link_nr;
+ int loss_level;
+ int bandwidth; // XXX use link bandwidth?
+ int samples_no; /* actual length of samples[] */
+ int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */
+};
+
+
+
+/*
+ * Overall structure of dummynet
+
+In dummynet, packets are selected with the firewall rules, and passed
+to two different objects: PIPE or QUEUE (bad name).
+
+A QUEUE defines a classifier, which groups packets into flows
+according to a 'mask', puts them into independent queues (one
+per flow) with configurable size and queue management policy,
+and passes flows to a scheduler:
+
+ (flow_mask|sched_mask) sched_mask
+ +---------+ weight Wx +-------------+
+ | |->-[flow]-->--| |-+
+ -->--| QUEUE x | ... | | |
+ | |->-[flow]-->--| SCHEDuler N | |
+ +---------+ | | |
+ ... | +--[LINK N]-->--
+ +---------+ weight Wy | | +--[LINK N]-->--
+ | |->-[flow]-->--| | |
+ -->--| QUEUE y | ... | | |
+ | |->-[flow]-->--| | |
+ +---------+ +-------------+ |
+ +-------------+
+
+Many QUEUE objects can connect to the same scheduler; each
+QUEUE object can have its own set of parameters.
+
+In turn, the SCHEDuler 'forks' multiple instances according
+to a 'sched_mask', each instance manages its own set of queues
+and transmits on a private instance of a configurable LINK.
+
+A PIPE is a simplified version of the above, where there
+is no flow_mask, and each scheduler instance handles a single queue.
+
+The following data structures (visible from userland) describe
+the objects used by dummynet:
+
+ + dn_link, contains the main configuration parameters related
+ to delay and bandwidth;
+ + dn_profile describes a delay profile;
+ + dn_flow describes the flow status (flow id, statistics)
+
+ + dn_sch describes a scheduler
+ + dn_fs describes a flowset (mask, weight, queue parameters)
+
+ *
+ */
+
+#endif /* _IP_DUMMYNET_H */
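A hedged userland sketch of the request format described at the top of this
header: it builds a dn_id header for a DN_CMD_GET request and retrieves the
configured scheduler objects via getsockopt(). The IP_DUMMYNET3 socket
option, the raw socket (root required), and the "list everything under
DN_SCH" interpretation are assumptions based on how ipfw(8) normally talks
to dummynet, not guarantees made by this header.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[8192];
	struct dn_id *oid = (struct dn_id *)buf;
	socklen_t len = sizeof(buf);
	int s;

	s = socket(PF_INET, SOCK_RAW, IPPROTO_RAW);
	if (s < 0)
		return (1);

	memset(buf, 0, sizeof(buf));
	oid->len = sizeof(*oid);	/* header-only request */
	oid->type = DN_CMD_GET;
	oid->subtype = DN_SCH;		/* ask for scheduler objects */
	oid->id = DN_API_VERSION;	/* first object carries the version */

	/* The control handler reads the request from the buffer, then fills it. */
	if (getsockopt(s, IPPROTO_IP, IP_DUMMYNET3, buf, &len) < 0)
		return (1);

	/* Walk the returned chain of dn_id-prefixed objects. */
	for (oid = (struct dn_id *)buf;
	    (char *)oid < buf + len && oid->len >= sizeof(*oid);
	    oid = (struct dn_id *)((char *)oid + oid->len))
		printf("type %u subtype %u len %u\n",
		    oid->type, oid->subtype, oid->len);
	return (0);
}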
diff --git a/freebsd/sys/netinet/ip_ecn.c b/freebsd/sys/netinet/ip_ecn.c
new file mode 100644
index 00000000..97b32b2c
--- /dev/null
+++ b/freebsd/sys/netinet/ip_ecn.c
@@ -0,0 +1,194 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/* $KAME: ip_ecn.c,v 1.12 2002/01/07 11:34:47 kjc Exp $ */
+
+/*-
+ * Copyright (C) 1999 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ * ECN consideration on tunnel ingress/egress operation.
+ * http://www.aciri.org/floyd/papers/draft-ipsec-ecn-00.txt
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/errno.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#ifdef INET6
+#include <freebsd/netinet/ip6.h>
+#endif
+
+#include <freebsd/netinet/ip_ecn.h>
+#ifdef INET6
+#include <freebsd/netinet6/ip6_ecn.h>
+#endif
+
+/*
+ * ECN and TOS (or TCLASS) processing rules at tunnel encapsulation and
+ * decapsulation from RFC3168:
+ *
+ * Outer Hdr at Inner Hdr at
+ * Encapsulator Decapsulator
+ * Header fields: -------------------- ------------
+ * DS Field copied from inner hdr no change
+ * ECN Field constructed by (I) constructed by (E)
+ *
+ * ECN_ALLOWED (full functionality):
+ * (I) if the ECN field in the inner header is set to CE, then set the
+ * ECN field in the outer header to ECT(0).
+ * otherwise, copy the ECN field to the outer header.
+ *
+ * (E) if the ECN field in the outer header is set to CE and the ECN
+ * field of the inner header is not-ECT, drop the packet.
+ * if the ECN field in the inner header is set to ECT(0) or ECT(1)
+ * and the ECN field in the outer header is set to CE, then copy CE to
+ * the inner header. otherwise, make no change to the inner header.
+ *
+ * ECN_FORBIDDEN (limited functionality):
+ * (I) set the ECN field to not-ECT in the outer header.
+ *
+ * (E) if the ECN field in the outer header is set to CE, drop the packet.
+ * otherwise, make no change to the ECN field in the inner header.
+ *
+ * the drop rule is for backward compatibility and protection against
+ * erasure of CE.
+ */
+
+/*
+ * modify outer ECN (TOS) field on ingress operation (tunnel encapsulation).
+ */
+void
+ip_ecn_ingress(int mode, u_int8_t *outer, const u_int8_t *inner)
+{
+
+ if (!outer || !inner)
+ panic("NULL pointer passed to ip_ecn_ingress");
+
+ *outer = *inner;
+ switch (mode) {
+ case ECN_ALLOWED: /* ECN allowed */
+ /*
+ * full-functionality: if the inner is CE, set ECT(0)
+ * to the outer. otherwise, copy the ECN field.
+ */
+ if ((*inner & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
+ *outer &= ~IPTOS_ECN_ECT1;
+ break;
+ case ECN_FORBIDDEN: /* ECN forbidden */
+ /*
+ * limited-functionality: set not-ECT to the outer
+ */
+ *outer &= ~IPTOS_ECN_MASK;
+ break;
+ case ECN_NOCARE: /* no consideration to ECN */
+ break;
+ }
+}
+
+/*
+ * modify inner ECN (TOS) field on egress operation (tunnel decapsulation).
+ * the caller should drop the packet if the return value is 0.
+ */
+int
+ip_ecn_egress(int mode, const u_int8_t *outer, u_int8_t *inner)
+{
+
+ if (!outer || !inner)
+ panic("NULL pointer passed to ip_ecn_egress");
+
+ switch (mode) {
+ case ECN_ALLOWED:
+ /*
+ * full-functionality: if the outer is CE and the inner is
+ * not-ECT, should drop it. otherwise, copy CE.
+ */
+ if ((*outer & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
+ if ((*inner & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
+ return (0);
+ *inner |= IPTOS_ECN_CE;
+ }
+ break;
+ case ECN_FORBIDDEN: /* ECN forbidden */
+ /*
+ * limited-functionality: if the outer is CE, should drop it.
+ * otherwise, leave the inner.
+ */
+ if ((*outer & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
+ return (0);
+ break;
+ case ECN_NOCARE: /* no consideration to ECN */
+ break;
+ }
+ return (1);
+}
+
+#ifdef INET6
+void
+ip6_ecn_ingress(int mode, u_int32_t *outer, const u_int32_t *inner)
+{
+ u_int8_t outer8, inner8;
+
+ if (!outer || !inner)
+ panic("NULL pointer passed to ip6_ecn_ingress");
+
+ inner8 = (ntohl(*inner) >> 20) & 0xff;
+ ip_ecn_ingress(mode, &outer8, &inner8);
+ *outer &= ~htonl(0xff << 20);
+ *outer |= htonl((u_int32_t)outer8 << 20);
+}
+
+int
+ip6_ecn_egress(int mode, const u_int32_t *outer, u_int32_t *inner)
+{
+ u_int8_t outer8, inner8, oinner8;
+
+ if (!outer || !inner)
+ panic("NULL pointer passed to ip6_ecn_egress");
+
+ outer8 = (ntohl(*outer) >> 20) & 0xff;
+ inner8 = oinner8 = (ntohl(*inner) >> 20) & 0xff;
+ if (ip_ecn_egress(mode, &outer8, &inner8) == 0)
+ return (0);
+ if (inner8 != oinner8) {
+ *inner &= ~htonl(0xff << 20);
+ *inner |= htonl((u_int32_t)inner8 << 20);
+ }
+ return (1);
+}
+#endif
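Schematically, a tunnel driver would use the two functions above as follows.
This is a sketch only; the tun_encapsulate/tun_decapsulate names and the
kernel context are assumptions, mirroring how in_gif-style encapsulation
drives this API.

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_ecn.h>

/* Build the outer TOS/ECN byte from the inner header (rule (I) above). */
static void
tun_encapsulate(struct ip *outer, const struct ip *inner, int ecn_mode)
{
	ip_ecn_ingress(ecn_mode, &outer->ip_tos, &inner->ip_tos);
}

/*
 * Propagate CE to the inner header (rule (E) above); a return value of 0
 * means the caller must drop the packet.
 */
static int
tun_decapsulate(const struct ip *outer, struct ip *inner, int ecn_mode)
{
	return (ip_ecn_egress(ecn_mode, &outer->ip_tos, &inner->ip_tos));
}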
diff --git a/freebsd/sys/netinet/ip_ecn.h b/freebsd/sys/netinet/ip_ecn.h
new file mode 100644
index 00000000..271c8a47
--- /dev/null
+++ b/freebsd/sys/netinet/ip_ecn.h
@@ -0,0 +1,53 @@
+/* $FreeBSD$ */
+/* $KAME: ip_ecn.h,v 1.8 2002/01/07 11:34:47 kjc Exp $ */
+
+/*-
+ * Copyright (C) 1999 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ * ECN consideration on tunnel ingress/egress operation.
+ * http://www.aciri.org/floyd/papers/draft-ipsec-ecn-00.txt
+ */
+
+#ifndef _NETINET_IP_ECN_HH_
+#define _NETINET_IP_ECN_HH_
+
+#if defined(_KERNEL) && !defined(_LKM)
+#include <freebsd/local/opt_inet.h>
+#endif
+
+#define ECN_ALLOWED 1 /* ECN allowed */
+#define ECN_FORBIDDEN 0 /* ECN forbidden */
+#define ECN_NOCARE (-1) /* no consideration to ECN */
+
+#ifdef _KERNEL
+extern void ip_ecn_ingress(int, u_int8_t *, const u_int8_t *);
+extern int ip_ecn_egress(int, const u_int8_t *, u_int8_t *);
+#endif
+#endif
diff --git a/freebsd/sys/netinet/ip_encap.c b/freebsd/sys/netinet/ip_encap.c
new file mode 100644
index 00000000..45b0593c
--- /dev/null
+++ b/freebsd/sys/netinet/ip_encap.c
@@ -0,0 +1,465 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/* $KAME: ip_encap.c,v 1.41 2001/03/15 08:35:08 itojun Exp $ */
+
+/*-
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * My grandfather said that there's a devil inside tunnelling technology...
+ *
+ * We have surprisingly many protocols that want packets with IP protocol
+ * #4 or #41. Here's a list of protocols that want protocol #41:
+ * RFC1933 configured tunnel
+ * RFC1933 automatic tunnel
+ * RFC2401 IPsec tunnel
+ * RFC2473 IPv6 generic packet tunnelling
+ * RFC2529 6over4 tunnel
+ * mobile-ip6 (uses RFC2473)
+ * RFC3056 6to4 tunnel
+ * isatap tunnel
+ * Here's a list of protocols that want protocol #4:
+ * RFC1853 IPv4-in-IPv4 tunnelling
+ * RFC2003 IPv4 encapsulation within IPv4
+ * RFC2344 reverse tunnelling for mobile-ip4
+ * RFC2401 IPsec tunnel
+ * Well, what can I say. They each impose a different en/decapsulation
+ * mechanism, so they need separate protocol handlers. The only one
+ * we can easily determine by protocol # is IPsec, which always has
+ * AH/ESP/IPComp header right after outer IP header.
+ *
+ * So, clearly good old protosw does not work for protocol #4 and #41.
+ * The code will let you match protocol via src/dst address pair.
+ */
+/* XXX is M_NETADDR correct? */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_mrouting.h>
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/sockio.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/errno.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/queue.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_encap.h>
+
+#ifdef INET6
+#include <freebsd/netinet/ip6.h>
+#include <freebsd/netinet6/ip6_var.h>
+#include <freebsd/netinet6/ip6protosw.h>
+#endif
+
+#include <freebsd/machine/stdarg.h>
+
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/malloc.h>
+static MALLOC_DEFINE(M_NETADDR, "encap_export_host", "Export host address structure");
+
+static void encap_add(struct encaptab *);
+static int mask_match(const struct encaptab *, const struct sockaddr *,
+ const struct sockaddr *);
+static void encap_fillarg(struct mbuf *, const struct encaptab *);
+
+/*
+ * All global variables in ip_encap.c are locked using encapmtx.
+ */
+static struct mtx encapmtx;
+MTX_SYSINIT(encapmtx, &encapmtx, "encapmtx", MTX_DEF);
+LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(encaptab);
+
+/*
+ * We currently keep encap_init() for source code compatibility reasons --
+ * it's referenced by KAME pieces in netinet6.
+ */
+void
+encap_init(void)
+{
+}
+
+#ifdef INET
+void
+encap4_input(struct mbuf *m, int off)
+{
+ struct ip *ip;
+ int proto;
+ struct sockaddr_in s, d;
+ const struct protosw *psw;
+ struct encaptab *ep, *match;
+ int prio, matchprio;
+
+ ip = mtod(m, struct ip *);
+ proto = ip->ip_p;
+
+ bzero(&s, sizeof(s));
+ s.sin_family = AF_INET;
+ s.sin_len = sizeof(struct sockaddr_in);
+ s.sin_addr = ip->ip_src;
+ bzero(&d, sizeof(d));
+ d.sin_family = AF_INET;
+ d.sin_len = sizeof(struct sockaddr_in);
+ d.sin_addr = ip->ip_dst;
+
+ match = NULL;
+ matchprio = 0;
+ mtx_lock(&encapmtx);
+ LIST_FOREACH(ep, &encaptab, chain) {
+ if (ep->af != AF_INET)
+ continue;
+ if (ep->proto >= 0 && ep->proto != proto)
+ continue;
+ if (ep->func)
+ prio = (*ep->func)(m, off, proto, ep->arg);
+ else {
+ /*
+ * it's inbound traffic, we need to match in reverse
+ * order
+ */
+ prio = mask_match(ep, (struct sockaddr *)&d,
+ (struct sockaddr *)&s);
+ }
+
+ /*
+ * We prioritize the matches by using bit length of the
+ * matches. mask_match() and user-supplied matching function
+ * should return the bit length of the matches (for example,
+ * if both src/dst are matched for IPv4, 64 should be returned).
+ * 0 or negative return value means "it did not match".
+ *
+ * The question is, since we have two "mask" portions, we
+ * cannot really define a total order between entries.
+ * For example, which of these should be preferred?
+ * mask_match() returns 48 (32 + 16) for both of them.
+ * src=3ffe::/16, dst=3ffe:501::/32
+ * src=3ffe:501::/32, dst=3ffe::/16
+ *
+ * We need to loop through all the possible candidates
+ * to get the best match - the search takes O(n) for
+ * n attachments (i.e. interfaces).
+ */
+ if (prio <= 0)
+ continue;
+ if (prio > matchprio) {
+ matchprio = prio;
+ match = ep;
+ }
+ }
+ mtx_unlock(&encapmtx);
+
+ if (match) {
+ /* found a match, "match" has the best one */
+ psw = match->psw;
+ if (psw && psw->pr_input) {
+ encap_fillarg(m, match);
+ (*psw->pr_input)(m, off);
+ } else
+ m_freem(m);
+ return;
+ }
+
+ /* last resort: inject to raw socket */
+ rip_input(m, off);
+}
+#endif
+
+#ifdef INET6
+int
+encap6_input(struct mbuf **mp, int *offp, int proto)
+{
+ struct mbuf *m = *mp;
+ struct ip6_hdr *ip6;
+ struct sockaddr_in6 s, d;
+ const struct ip6protosw *psw;
+ struct encaptab *ep, *match;
+ int prio, matchprio;
+
+ ip6 = mtod(m, struct ip6_hdr *);
+
+ bzero(&s, sizeof(s));
+ s.sin6_family = AF_INET6;
+ s.sin6_len = sizeof(struct sockaddr_in6);
+ s.sin6_addr = ip6->ip6_src;
+ bzero(&d, sizeof(d));
+ d.sin6_family = AF_INET6;
+ d.sin6_len = sizeof(struct sockaddr_in6);
+ d.sin6_addr = ip6->ip6_dst;
+
+ match = NULL;
+ matchprio = 0;
+ mtx_lock(&encapmtx);
+ LIST_FOREACH(ep, &encaptab, chain) {
+ if (ep->af != AF_INET6)
+ continue;
+ if (ep->proto >= 0 && ep->proto != proto)
+ continue;
+ if (ep->func)
+ prio = (*ep->func)(m, *offp, proto, ep->arg);
+ else {
+ /*
+ * it's inbound traffic, we need to match in reverse
+ * order
+ */
+ prio = mask_match(ep, (struct sockaddr *)&d,
+ (struct sockaddr *)&s);
+ }
+
+ /* see encap4_input() for issues here */
+ if (prio <= 0)
+ continue;
+ if (prio > matchprio) {
+ matchprio = prio;
+ match = ep;
+ }
+ }
+ mtx_unlock(&encapmtx);
+
+ if (match) {
+ /* found a match */
+ psw = (const struct ip6protosw *)match->psw;
+ if (psw && psw->pr_input) {
+ encap_fillarg(m, match);
+ return (*psw->pr_input)(mp, offp, proto);
+ } else {
+ m_freem(m);
+ return IPPROTO_DONE;
+ }
+ }
+
+ /* last resort: inject to raw socket */
+ return rip6_input(mp, offp, proto);
+}
+#endif
+
+/*lint -sem(encap_add, custodial(1)) */
+static void
+encap_add(struct encaptab *ep)
+{
+
+ mtx_assert(&encapmtx, MA_OWNED);
+ LIST_INSERT_HEAD(&encaptab, ep, chain);
+}
+
+/*
+ * sp (src ptr) is always my side, and dp (dst ptr) is always the remote side.
+ * The length of the masks (sm and dm) is assumed to be the same as sp/dp.
+ * Return value will be necessary as input (cookie) for encap_detach().
+ */
+const struct encaptab *
+encap_attach(int af, int proto, const struct sockaddr *sp,
+ const struct sockaddr *sm, const struct sockaddr *dp,
+ const struct sockaddr *dm, const struct protosw *psw, void *arg)
+{
+ struct encaptab *ep;
+
+ /* sanity check on args */
+ if (sp->sa_len > sizeof(ep->src) || dp->sa_len > sizeof(ep->dst))
+ return (NULL);
+ if (sp->sa_len != dp->sa_len)
+ return (NULL);
+ if (af != sp->sa_family || af != dp->sa_family)
+ return (NULL);
+
+	/* check if anyone has already attached with exactly the same config */
+ mtx_lock(&encapmtx);
+ LIST_FOREACH(ep, &encaptab, chain) {
+ if (ep->af != af)
+ continue;
+ if (ep->proto != proto)
+ continue;
+ if (ep->src.ss_len != sp->sa_len ||
+ bcmp(&ep->src, sp, sp->sa_len) != 0 ||
+ bcmp(&ep->srcmask, sm, sp->sa_len) != 0)
+ continue;
+ if (ep->dst.ss_len != dp->sa_len ||
+ bcmp(&ep->dst, dp, dp->sa_len) != 0 ||
+ bcmp(&ep->dstmask, dm, dp->sa_len) != 0)
+ continue;
+
+ mtx_unlock(&encapmtx);
+ return (NULL);
+ }
+
+ ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/
+ if (ep == NULL) {
+ mtx_unlock(&encapmtx);
+ return (NULL);
+ }
+ bzero(ep, sizeof(*ep));
+
+ ep->af = af;
+ ep->proto = proto;
+ bcopy(sp, &ep->src, sp->sa_len);
+ bcopy(sm, &ep->srcmask, sp->sa_len);
+ bcopy(dp, &ep->dst, dp->sa_len);
+ bcopy(dm, &ep->dstmask, dp->sa_len);
+ ep->psw = psw;
+ ep->arg = arg;
+
+ encap_add(ep);
+ mtx_unlock(&encapmtx);
+ return (ep);
+}
+
+const struct encaptab *
+encap_attach_func(int af, int proto,
+ int (*func)(const struct mbuf *, int, int, void *),
+ const struct protosw *psw, void *arg)
+{
+ struct encaptab *ep;
+
+ /* sanity check on args */
+ if (!func)
+ return (NULL);
+
+ ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/
+ if (ep == NULL)
+ return (NULL);
+ bzero(ep, sizeof(*ep));
+
+ ep->af = af;
+ ep->proto = proto;
+ ep->func = func;
+ ep->psw = psw;
+ ep->arg = arg;
+
+ mtx_lock(&encapmtx);
+ encap_add(ep);
+ mtx_unlock(&encapmtx);
+ return (ep);
+}
+
+int
+encap_detach(const struct encaptab *cookie)
+{
+ const struct encaptab *ep = cookie;
+ struct encaptab *p;
+
+ mtx_lock(&encapmtx);
+ LIST_FOREACH(p, &encaptab, chain) {
+ if (p == ep) {
+ LIST_REMOVE(p, chain);
+ mtx_unlock(&encapmtx);
+ free(p, M_NETADDR); /*XXX*/
+ return 0;
+ }
+ }
+ mtx_unlock(&encapmtx);
+
+ return EINVAL;
+}
+
+static int
+mask_match(const struct encaptab *ep, const struct sockaddr *sp,
+ const struct sockaddr *dp)
+{
+ struct sockaddr_storage s;
+ struct sockaddr_storage d;
+ int i;
+ const u_int8_t *p, *q;
+ u_int8_t *r;
+ int matchlen;
+
+ if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d))
+ return 0;
+ if (sp->sa_family != ep->af || dp->sa_family != ep->af)
+ return 0;
+ if (sp->sa_len != ep->src.ss_len || dp->sa_len != ep->dst.ss_len)
+ return 0;
+
+ matchlen = 0;
+
+ p = (const u_int8_t *)sp;
+ q = (const u_int8_t *)&ep->srcmask;
+ r = (u_int8_t *)&s;
+ for (i = 0 ; i < sp->sa_len; i++) {
+ r[i] = p[i] & q[i];
+ /* XXX estimate */
+ matchlen += (q[i] ? 8 : 0);
+ }
+
+ p = (const u_int8_t *)dp;
+ q = (const u_int8_t *)&ep->dstmask;
+ r = (u_int8_t *)&d;
+ for (i = 0 ; i < dp->sa_len; i++) {
+ r[i] = p[i] & q[i];
+ /* XXX rough estimate */
+ matchlen += (q[i] ? 8 : 0);
+ }
+
+ /* need to overwrite len/family portion as we don't compare them */
+ s.ss_len = sp->sa_len;
+ s.ss_family = sp->sa_family;
+ d.ss_len = dp->sa_len;
+ d.ss_family = dp->sa_family;
+
+ if (bcmp(&s, &ep->src, ep->src.ss_len) == 0 &&
+ bcmp(&d, &ep->dst, ep->dst.ss_len) == 0) {
+ return matchlen;
+ } else
+ return 0;
+}
+
+static void
+encap_fillarg(struct mbuf *m, const struct encaptab *ep)
+{
+ struct m_tag *tag;
+
+ tag = m_tag_get(PACKET_TAG_ENCAP, sizeof (void*), M_NOWAIT);
+ if (tag) {
+ *(void**)(tag+1) = ep->arg;
+ m_tag_prepend(m, tag);
+ }
+}
+
+void *
+encap_getarg(struct mbuf *m)
+{
+ void *p = NULL;
+ struct m_tag *tag;
+
+ tag = m_tag_find(m, PACKET_TAG_ENCAP, NULL);
+ if (tag) {
+ p = *(void**)(tag+1);
+ m_tag_delete(m, tag);
+ }
+ return p;
+}
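As a usage sketch, a hypothetical tunnel module could register a fixed
address pair for IPv4-in-IPv4 traffic as shown below. "mytun_protosw" (with
a valid pr_input handler), the helper names, and the error handling are
assumptions for illustration, not part of the imported code.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <netinet/in.h>
#include <netinet/ip_encap.h>

extern struct protosw mytun_protosw;	/* assumed: pr_input handles the tunnel */

static const struct encaptab *mytun_cookie;

static int
mytun_attach(struct sockaddr_in *local, struct sockaddr_in *remote)
{
	struct sockaddr_in mask;

	bzero(&mask, sizeof(mask));
	mask.sin_family = AF_INET;
	mask.sin_len = sizeof(mask);
	mask.sin_addr.s_addr = htonl(0xffffffff);	/* exact match, /32 */

	/* "local" is my side (sp), "remote" is the far end (dp). */
	mytun_cookie = encap_attach(AF_INET, IPPROTO_IPV4,
	    (struct sockaddr *)local, (struct sockaddr *)&mask,
	    (struct sockaddr *)remote, (struct sockaddr *)&mask,
	    &mytun_protosw, NULL);
	return (mytun_cookie == NULL ? EEXIST : 0);
}

static void
mytun_detach(void)
{
	if (mytun_cookie != NULL)
		encap_detach(mytun_cookie);
}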
diff --git a/freebsd/sys/netinet/ip_encap.h b/freebsd/sys/netinet/ip_encap.h
new file mode 100644
index 00000000..44dd1a0d
--- /dev/null
+++ b/freebsd/sys/netinet/ip_encap.h
@@ -0,0 +1,64 @@
+/* $FreeBSD$ */
+/* $KAME: ip_encap.h,v 1.7 2000/03/25 07:23:37 sumikawa Exp $ */
+
+/*-
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETINET_IP_ENCAP_HH_
+#define _NETINET_IP_ENCAP_HH_
+
+#ifdef _KERNEL
+
+struct encaptab {
+ LIST_ENTRY(encaptab) chain;
+ int af;
+ int proto; /* -1: don't care, I'll check myself */
+ struct sockaddr_storage src; /* my addr */
+ struct sockaddr_storage srcmask;
+ struct sockaddr_storage dst; /* remote addr */
+ struct sockaddr_storage dstmask;
+ int (*func)(const struct mbuf *, int, int, void *);
+ const struct protosw *psw; /* only pr_input will be used */
+ void *arg; /* passed via m->m_pkthdr.aux */
+};
+
+void encap_init(void);
+void encap4_input(struct mbuf *, int);
+int encap6_input(struct mbuf **, int *, int);
+const struct encaptab *encap_attach(int, int, const struct sockaddr *,
+ const struct sockaddr *, const struct sockaddr *,
+ const struct sockaddr *, const struct protosw *, void *);
+const struct encaptab *encap_attach_func(int, int,
+ int (*)(const struct mbuf *, int, int, void *),
+ const struct protosw *, void *);
+int encap_detach(const struct encaptab *);
+void *encap_getarg(struct mbuf *);
+#endif
+
+#endif /*_NETINET_IP_ENCAP_HH_*/
diff --git a/freebsd/sys/netinet/ip_fastfwd.c b/freebsd/sys/netinet/ip_fastfwd.c
new file mode 100644
index 00000000..6d406b2b
--- /dev/null
+++ b/freebsd/sys/netinet/ip_fastfwd.c
@@ -0,0 +1,619 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2003 Andre Oppermann, Internet Business Solutions AG
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * ip_fastforward gets its speed from processing the forwarded packet to
+ * completion (if_output on the other side) without any queues or netisr's.
+ * The receiving interface DMAs the packet into memory, the upper half of
+ * the driver calls ip_fastforward, we do our routing table lookup and directly
+ * send it off to the outgoing interface, which DMAs the packet to the
+ * network card. The only part of the packet we touch with the CPU is the
+ * IP header (unless there are complex firewall rules touching other parts
+ * of the packet, but that is up to you). We are essentially limited by bus
+ * bandwidth and how fast the network card/driver can set up receives and
+ * transmits.
+ *
+ * We handle basic errors, IP header errors, checksum errors,
+ * destination unreachable, fragmentation and fragmentation needed and
+ * report them via ICMP to the sender.
+ *
+ * Else if something is not pure IPv4 unicast forwarding we fall back to
+ * the normal ip_input processing path. We should only be called from
+ * interfaces connected to the outside world.
+ *
+ * Firewalling is fully supported, including divert, ipfw fwd, ipfilter
+ * ipnat, and address rewriting.
+ *
+ * IPSEC is not supported if this host is a tunnel broker. IPSEC is
+ * supported for connections to/from local host.
+ *
+ * We try to do the least expensive (in CPU ops) checks and operations
+ * first to catch junk with as little overhead as possible.
+ *
+ * We take full advantage of hardware support for IP checksum and
+ * fragmentation offloading.
+ *
+ * We don't do ICMP redirect in the fast forwarding path. I have had my own
+ * cases where two core routers with the Zebra routing suite would send millions
+ * of ICMP redirects to connected hosts if the destination router was not the
+ * default gateway. In one case it was filling the routing table of a host
+ * with approximately 300,000 cloned redirect entries until it ran out of
+ * kernel memory. However, the networking code proved very robust and it didn't
+ * crash or fail in other ways.
+ */
+
+/*
+ * Many thanks to Matt Thomas of NetBSD for basic structure of ip_flow.c which
+ * is being followed here.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_ipfw.h>
+#include <freebsd/local/opt_ipstealth.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/sysctl.h>
+
+#include <freebsd/net/pfil.h>
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_types.h>
+#include <freebsd/net/if_var.h>
+#include <freebsd/net/if_dl.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_icmp.h>
+#include <freebsd/netinet/ip_options.h>
+
+#include <freebsd/machine/in_cksum.h>
+
+static VNET_DEFINE(int, ipfastforward_active);
+#define V_ipfastforward_active VNET(ipfastforward_active)
+
+SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW,
+ &VNET_NAME(ipfastforward_active), 0, "Enable fast IP forwarding");
+
+static struct sockaddr_in *
+ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m)
+{
+ struct sockaddr_in *dst;
+ struct rtentry *rt;
+
+ /*
+ * Find route to destination.
+ */
+ bzero(ro, sizeof(*ro));
+ dst = (struct sockaddr_in *)&ro->ro_dst;
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr.s_addr = dest.s_addr;
+ in_rtalloc_ign(ro, 0, M_GETFIB(m));
+
+ /*
+ * Route there and interface still up?
+ */
+ rt = ro->ro_rt;
+ if (rt && (rt->rt_flags & RTF_UP) &&
+ (rt->rt_ifp->if_flags & IFF_UP) &&
+ (rt->rt_ifp->if_drv_flags & IFF_DRV_RUNNING)) {
+ if (rt->rt_flags & RTF_GATEWAY)
+ dst = (struct sockaddr_in *)rt->rt_gateway;
+ } else {
+ IPSTAT_INC(ips_noroute);
+ IPSTAT_INC(ips_cantforward);
+ if (rt)
+ RTFREE(rt);
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+ return NULL;
+ }
+ return dst;
+}
+
+/*
+ * Try to forward a packet based on the destination address.
+ * This is a fast path optimized for the plain forwarding case.
+ * If the packet is handled (and consumed) here then NULL is returned;
+ * otherwise the mbuf is returned and the packet should be delivered
+ * to ip_input for full processing.
+ */
+struct mbuf *
+ip_fastforward(struct mbuf *m)
+{
+ struct ip *ip;
+ struct mbuf *m0 = NULL;
+ struct route ro;
+ struct sockaddr_in *dst = NULL;
+ struct ifnet *ifp;
+ struct in_addr odest, dest;
+ u_short sum, ip_len;
+ int error = 0;
+ int hlen, mtu;
+#ifdef IPFIREWALL_FORWARD
+ struct m_tag *fwd_tag;
+#endif
+
+ /*
+ * Are we active and forwarding packets?
+ */
+ if (!V_ipfastforward_active || !V_ipforwarding)
+ return m;
+
+ M_ASSERTVALID(m);
+ M_ASSERTPKTHDR(m);
+
+ bzero(&ro, sizeof(ro));
+
+ /*
+ * Step 1: check for packet drop conditions (and sanity checks)
+ */
+
+ /*
+ * Is entire packet big enough?
+ */
+ if (m->m_pkthdr.len < sizeof(struct ip)) {
+ IPSTAT_INC(ips_tooshort);
+ goto drop;
+ }
+
+ /*
+ * Is first mbuf large enough for ip header and is header present?
+ */
+ if (m->m_len < sizeof (struct ip) &&
+ (m = m_pullup(m, sizeof (struct ip))) == NULL) {
+ IPSTAT_INC(ips_toosmall);
+ return NULL; /* mbuf already free'd */
+ }
+
+ ip = mtod(m, struct ip *);
+
+ /*
+ * Is it IPv4?
+ */
+ if (ip->ip_v != IPVERSION) {
+ IPSTAT_INC(ips_badvers);
+ goto drop;
+ }
+
+ /*
+ * Is IP header length correct and is it in first mbuf?
+ */
+ hlen = ip->ip_hl << 2;
+ if (hlen < sizeof(struct ip)) { /* minimum header length */
+ IPSTAT_INC(ips_badhlen);
+ goto drop;
+ }
+ if (hlen > m->m_len) {
+ if ((m = m_pullup(m, hlen)) == NULL) {
+ IPSTAT_INC(ips_badhlen);
+ return NULL; /* mbuf already free'd */
+ }
+ ip = mtod(m, struct ip *);
+ }
+
+ /*
+ * Checksum correct?
+ */
+ if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED)
+ sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
+ else {
+ if (hlen == sizeof(struct ip))
+ sum = in_cksum_hdr(ip);
+ else
+ sum = in_cksum(m, hlen);
+ }
+ if (sum) {
+ IPSTAT_INC(ips_badsum);
+ goto drop;
+ }
+
+ /*
+ * Remember that we have checked the IP header and found it valid.
+ */
+ m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID);
+
+ ip_len = ntohs(ip->ip_len);
+
+ /*
+ * Is IP length longer than packet we have got?
+ */
+ if (m->m_pkthdr.len < ip_len) {
+ IPSTAT_INC(ips_tooshort);
+ goto drop;
+ }
+
+ /*
+ * Is packet longer than IP header tells us? If yes, truncate packet.
+ */
+ if (m->m_pkthdr.len > ip_len) {
+ if (m->m_len == m->m_pkthdr.len) {
+ m->m_len = ip_len;
+ m->m_pkthdr.len = ip_len;
+ } else
+ m_adj(m, ip_len - m->m_pkthdr.len);
+ }
+
+ /*
+ * Is packet from or to 127/8?
+ */
+ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
+ (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
+ IPSTAT_INC(ips_badaddr);
+ goto drop;
+ }
+
+#ifdef ALTQ
+ /*
+ * Is packet dropped by traffic conditioner?
+ */
+ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
+ goto drop;
+#endif
+
+ /*
+ * Step 2: fallback conditions to normal ip_input path processing
+ */
+
+ /*
+ * Only IP packets without options
+ */
+ if (ip->ip_hl != (sizeof(struct ip) >> 2)) {
+ if (ip_doopts == 1)
+ return m;
+ else if (ip_doopts == 2) {
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB,
+ 0, 0);
+ return NULL; /* mbuf already free'd */
+ }
+ /* else ignore IP options and continue */
+ }
+
+ /*
+ * Only unicast IP, not from loopback, no L2 or IP broadcast,
+ * no multicast, no INADDR_ANY
+ *
+ * XXX: Probably some of these checks could be direct drop
+ * conditions. However it is not clear whether there are some
+ * hacks or obscure behaviours which make it necessary to
+ * let ip_input handle it. We play safe here and let ip_input
+ * deal with it until it is proven that we can directly drop it.
+ */
+ if ((m->m_flags & (M_BCAST|M_MCAST)) ||
+ (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
+ ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST ||
+ ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST ||
+ IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
+ IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
+ IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
+ IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
+ ip->ip_src.s_addr == INADDR_ANY ||
+ ip->ip_dst.s_addr == INADDR_ANY )
+ return m;
+
+ /*
+ * Is it for a local address on this host?
+ */
+ if (in_localip(ip->ip_dst))
+ return m;
+
+ IPSTAT_INC(ips_total);
+
+ /*
+ * Step 3: incoming packet firewall processing
+ */
+
+ /*
+ * Convert to host representation
+ */
+ ip->ip_len = ntohs(ip->ip_len);
+ ip->ip_off = ntohs(ip->ip_off);
+
+ odest.s_addr = dest.s_addr = ip->ip_dst.s_addr;
+
+ /*
+ * Run through list of ipfilter hooks for input packets
+ */
+ if (!PFIL_HOOKED(&V_inet_pfil_hook))
+ goto passin;
+
+ if (pfil_run_hooks(
+ &V_inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) ||
+ m == NULL)
+ goto drop;
+
+ M_ASSERTVALID(m);
+ M_ASSERTPKTHDR(m);
+
+ ip = mtod(m, struct ip *); /* m may have changed by pfil hook */
+ dest.s_addr = ip->ip_dst.s_addr;
+
+ /*
+ * Destination address changed?
+ */
+ if (odest.s_addr != dest.s_addr) {
+ /*
+ * Is it now for a local address on this host?
+ */
+ if (in_localip(dest))
+ goto forwardlocal;
+ /*
+ * Go on with new destination address
+ */
+ }
+#ifdef IPFIREWALL_FORWARD
+ if (m->m_flags & M_FASTFWD_OURS) {
+ /*
+ * ipfw changed it for a local address on this host.
+ */
+ goto forwardlocal;
+ }
+#endif /* IPFIREWALL_FORWARD */
+
+passin:
+ /*
+ * Step 4: decrement TTL and look up route
+ */
+
+ /*
+ * Check TTL
+ */
+#ifdef IPSTEALTH
+ if (!V_ipstealth) {
+#endif
+ if (ip->ip_ttl <= IPTTLDEC) {
+ icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0);
+ return NULL; /* mbuf already free'd */
+ }
+
+ /*
+ * Decrement the TTL and incrementally change the IP header checksum.
+ * Don't bother doing this with hw checksum offloading, it's faster
+ * doing it right here.
+ */
+ ip->ip_ttl -= IPTTLDEC;
+ if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8))
+ ip->ip_sum -= ~htons(IPTTLDEC << 8);
+ else
+ ip->ip_sum += htons(IPTTLDEC << 8);
+#ifdef IPSTEALTH
+ }
+#endif
+
+ /*
+ * Find route to destination.
+ */
+ if ((dst = ip_findroute(&ro, dest, m)) == NULL)
+ return NULL; /* icmp unreach already sent */
+ ifp = ro.ro_rt->rt_ifp;
+
+ /*
+ * Immediately drop blackholed traffic, and directed broadcasts
+ * for either the all-ones or all-zero subnet addresses on
+ * locally attached networks.
+ */
+ if ((ro.ro_rt->rt_flags & (RTF_BLACKHOLE|RTF_BROADCAST)) != 0)
+ goto drop;
+
+ /*
+ * Step 5: outgoing firewall packet processing
+ */
+
+ /*
+ * Run through list of hooks for output packets.
+ */
+ if (!PFIL_HOOKED(&V_inet_pfil_hook))
+ goto passout;
+
+ if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, NULL) || m == NULL) {
+ goto drop;
+ }
+
+ M_ASSERTVALID(m);
+ M_ASSERTPKTHDR(m);
+
+ ip = mtod(m, struct ip *);
+ dest.s_addr = ip->ip_dst.s_addr;
+
+ /*
+ * Destination address changed?
+ */
+#ifndef IPFIREWALL_FORWARD
+ if (odest.s_addr != dest.s_addr) {
+#else
+ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
+ if (odest.s_addr != dest.s_addr || fwd_tag != NULL) {
+#endif /* IPFIREWALL_FORWARD */
+ /*
+ * Is it now for a local address on this host?
+ */
+#ifndef IPFIREWALL_FORWARD
+ if (in_localip(dest)) {
+#else
+ if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) {
+#endif /* IPFIREWALL_FORWARD */
+forwardlocal:
+ /*
+ * Return packet for processing by ip_input().
+ * Keep host byte order as expected at ip_input's
+ * "ours"-label.
+ */
+ m->m_flags |= M_FASTFWD_OURS;
+ if (ro.ro_rt)
+ RTFREE(ro.ro_rt);
+ return m;
+ }
+ /*
+ * Redo route lookup with new destination address
+ */
+#ifdef IPFIREWALL_FORWARD
+ if (fwd_tag) {
+ dest.s_addr = ((struct sockaddr_in *)
+ (fwd_tag + 1))->sin_addr.s_addr;
+ m_tag_delete(m, fwd_tag);
+ }
+#endif /* IPFIREWALL_FORWARD */
+ RTFREE(ro.ro_rt);
+ if ((dst = ip_findroute(&ro, dest, m)) == NULL)
+ return NULL; /* icmp unreach already sent */
+ ifp = ro.ro_rt->rt_ifp;
+ }
+
+passout:
+ /*
+ * Step 6: send off the packet
+ */
+
+ /*
+ * Check if route is dampened (when ARP is unable to resolve)
+ */
+ if ((ro.ro_rt->rt_flags & RTF_REJECT) &&
+ (ro.ro_rt->rt_rmx.rmx_expire == 0 ||
+ time_uptime < ro.ro_rt->rt_rmx.rmx_expire)) {
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+ goto consumed;
+ }
+
+#ifndef ALTQ
+ /*
+ * Check if there is enough space in the interface queue
+ */
+ if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
+ ifp->if_snd.ifq_maxlen) {
+ IPSTAT_INC(ips_odropped);
+		/* would send source quench here but that is deprecated */
+ goto drop;
+ }
+#endif
+
+ /*
+ * Check if media link state of interface is not down
+ */
+ if (ifp->if_link_state == LINK_STATE_DOWN) {
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+ goto consumed;
+ }
+
+ /*
+ * Check if packet fits MTU or if hardware will fragment for us
+ */
+ if (ro.ro_rt->rt_rmx.rmx_mtu)
+ mtu = min(ro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
+ else
+ mtu = ifp->if_mtu;
+
+ if (ip->ip_len <= mtu ||
+ (ifp->if_hwassist & CSUM_FRAGMENT && (ip->ip_off & IP_DF) == 0)) {
+ /*
+ * Restore packet header fields to original values
+ */
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+ /*
+ * Send off the packet via outgoing interface
+ */
+ error = (*ifp->if_output)(ifp, m,
+ (struct sockaddr *)dst, &ro);
+ } else {
+ /*
+ * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery
+ */
+ if (ip->ip_off & IP_DF) {
+ IPSTAT_INC(ips_cantfrag);
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
+ 0, mtu);
+ goto consumed;
+ } else {
+ /*
+ * We have to fragment the packet
+ */
+ m->m_pkthdr.csum_flags |= CSUM_IP;
+ /*
+ * ip_fragment expects ip_len and ip_off in host byte
+ * order but returns all packets in network byte order
+ */
+ if (ip_fragment(ip, &m, mtu, ifp->if_hwassist,
+ (~ifp->if_hwassist & CSUM_DELAY_IP))) {
+ goto drop;
+ }
+ KASSERT(m != NULL, ("null mbuf and no error"));
+ /*
+ * Send off the fragments via outgoing interface
+ */
+ error = 0;
+ do {
+ m0 = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+
+ error = (*ifp->if_output)(ifp, m,
+ (struct sockaddr *)dst, &ro);
+ if (error)
+ break;
+ } while ((m = m0) != NULL);
+ if (error) {
+ /* Reclaim remaining fragments */
+ for (m = m0; m; m = m0) {
+ m0 = m->m_nextpkt;
+ m_freem(m);
+ }
+ } else
+ IPSTAT_INC(ips_fragmented);
+ }
+ }
+
+ if (error != 0)
+ IPSTAT_INC(ips_odropped);
+ else {
+ ro.ro_rt->rt_rmx.rmx_pksent++;
+ IPSTAT_INC(ips_forward);
+ IPSTAT_INC(ips_fastforward);
+ }
+consumed:
+ RTFREE(ro.ro_rt);
+ return NULL;
+drop:
+ if (m)
+ m_freem(m);
+ if (ro.ro_rt)
+ RTFREE(ro.ro_rt);
+ return NULL;
+}
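The TTL/checksum trick in step 4 above can be checked in isolation; the
standalone sketch below (userland, purely illustrative, with a sample header
chosen for the test) applies the same incremental update and verifies it
against a full header checksum recomputation.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define IPTTLDEC 1

static uint16_t
cksum(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;
	uint16_t w;

	for (; len > 1; len -= 2, p += 2) {
		memcpy(&w, p, sizeof(w));
		sum += w;
	}
	if (len)
		sum += *p;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}

int
main(void)
{
	/* 20-byte IPv4 header, checksum field (bytes 10-11) still zero. */
	uint8_t hdr[20] = {
		0x45, 0x00, 0x00, 0x73, 0x00, 0x00, 0x40, 0x00,
		0x40, 0x11, 0x00, 0x00, 0xc0, 0xa8, 0x00, 0x01,
		0xc0, 0xa8, 0x00, 0xc7
	};
	uint16_t sum, inc;

	sum = cksum(hdr, sizeof(hdr));		/* original checksum */
	memcpy(&hdr[10], &sum, sizeof(sum));

	hdr[8] -= IPTTLDEC;			/* ip_ttl -= IPTTLDEC */
	if (sum >= (uint16_t)~htons(IPTTLDEC << 8))
		inc = sum - (uint16_t)~htons(IPTTLDEC << 8);
	else
		inc = sum + htons(IPTTLDEC << 8);
	memcpy(&hdr[10], &inc, sizeof(inc));

	/* A full recomputation over the updated header must now yield 0. */
	printf("incremental update %s\n",
	    cksum(hdr, sizeof(hdr)) == 0 ? "matches" : "does not match");
	return (0);
}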
diff --git a/freebsd/sys/netinet/ip_fw.h b/freebsd/sys/netinet/ip_fw.h
new file mode 100644
index 00000000..cf5d8d03
--- /dev/null
+++ b/freebsd/sys/netinet/ip_fw.h
@@ -0,0 +1,579 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IPFW2_H
+#define _IPFW2_H
+
+/*
+ * The default rule number. By the design of ip_fw, the default rule
+ * is the last one, so its number can also serve as the highest number
+ * allowed for a rule. The ip_fw code relies on both meanings of this
+ * constant.
+ */
+#define IPFW_DEFAULT_RULE 65535
+
+/*
+ * The number of ipfw tables. The maximum allowed table number is the
+ * (IPFW_TABLES_MAX - 1).
+ */
+#define IPFW_TABLES_MAX 128
+
+/*
+ * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit
+ * argument between 1 and 65534. The value 0 is unused, the value
+ * 65535 (IP_FW_TABLEARG) is used to represent 'tablearg', i.e. the
+ * result of the most recent table() lookup.
+ * Note that 16bit is only a historical limit, resulting from
+ * the use of a 16-bit field for that value. In reality, we can have
+ * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg.
+ */
+#define IPFW_ARG_MIN 1
+#define IPFW_ARG_MAX 65534
+#define IP_FW_TABLEARG 65535 /* XXX should use 0 */
+
+/*
+ * The kernel representation of ipfw rules is made of a list of
+ * 'instructions' (for all practical purposes equivalent to BPF
+ * instructions), which specify which fields of the packet
+ * (or its metadata) should be analysed.
+ *
+ * Each instruction is stored in a structure which begins with
+ * "ipfw_insn", and can contain extra fields depending on the
+ * instruction type (listed below).
+ * Note that the code is written so that individual instructions
+ * have a size which is a multiple of 32 bits. This means that, if
+ * such structures contain pointers or other 64-bit entities
+ * (there is just one instance now), they may end up unaligned on
+ * 64-bit architectures, so they must be handled with care.
+ *
+ * "enum ipfw_opcodes" are the opcodes supported. We can have up
+ * to 256 different opcodes. When adding new opcodes, they should
+ * be appended to the end of the opcode list before O_LAST_OPCODE;
+ * this will prevent the ABI from being broken. Otherwise users
+ * will have to recompile ipfw(8) when they update the kernel.
+ */
+
+enum ipfw_opcodes { /* arguments (4 byte each) */
+ O_NOP,
+
+ O_IP_SRC, /* u32 = IP */
+ O_IP_SRC_MASK, /* ip = IP/mask */
+ O_IP_SRC_ME, /* none */
+ O_IP_SRC_SET, /* u32=base, arg1=len, bitmap */
+
+ O_IP_DST, /* u32 = IP */
+ O_IP_DST_MASK, /* ip = IP/mask */
+ O_IP_DST_ME, /* none */
+ O_IP_DST_SET, /* u32=base, arg1=len, bitmap */
+
+ O_IP_SRCPORT, /* (n)port list:mask 4 byte ea */
+ O_IP_DSTPORT, /* (n)port list:mask 4 byte ea */
+ O_PROTO, /* arg1=protocol */
+
+ O_MACADDR2, /* 2 mac addr:mask */
+ O_MAC_TYPE, /* same as srcport */
+
+ O_LAYER2, /* none */
+ O_IN, /* none */
+ O_FRAG, /* none */
+
+ O_RECV, /* none */
+ O_XMIT, /* none */
+ O_VIA, /* none */
+
+ O_IPOPT, /* arg1 = 2*u8 bitmap */
+ O_IPLEN, /* arg1 = len */
+ O_IPID, /* arg1 = id */
+
+	O_IPTOS,		/* arg1 = tos */
+ O_IPPRECEDENCE, /* arg1 = precedence << 5 */
+ O_IPTTL, /* arg1 = TTL */
+
+ O_IPVER, /* arg1 = version */
+ O_UID, /* u32 = id */
+ O_GID, /* u32 = id */
+ O_ESTAB, /* none (tcp established) */
+ O_TCPFLAGS, /* arg1 = 2*u8 bitmap */
+ O_TCPWIN, /* arg1 = desired win */
+ O_TCPSEQ, /* u32 = desired seq. */
+ O_TCPACK, /* u32 = desired seq. */
+ O_ICMPTYPE, /* u32 = icmp bitmap */
+ O_TCPOPTS, /* arg1 = 2*u8 bitmap */
+
+ O_VERREVPATH, /* none */
+ O_VERSRCREACH, /* none */
+
+ O_PROBE_STATE, /* none */
+ O_KEEP_STATE, /* none */
+ O_LIMIT, /* ipfw_insn_limit */
+ O_LIMIT_PARENT, /* dyn_type, not an opcode. */
+
+ /*
+ * These are really 'actions'.
+ */
+
+ O_LOG, /* ipfw_insn_log */
+ O_PROB, /* u32 = match probability */
+
+ O_CHECK_STATE, /* none */
+ O_ACCEPT, /* none */
+ O_DENY, /* none */
+ O_REJECT, /* arg1=icmp arg (same as deny) */
+ O_COUNT, /* none */
+ O_SKIPTO, /* arg1=next rule number */
+ O_PIPE, /* arg1=pipe number */
+ O_QUEUE, /* arg1=queue number */
+ O_DIVERT, /* arg1=port number */
+ O_TEE, /* arg1=port number */
+ O_FORWARD_IP, /* fwd sockaddr */
+ O_FORWARD_MAC, /* fwd mac */
+ O_NAT, /* nope */
+ O_REASS, /* none */
+
+ /*
+ * More opcodes.
+ */
+ O_IPSEC, /* has ipsec history */
+ O_IP_SRC_LOOKUP, /* arg1=table number, u32=value */
+ O_IP_DST_LOOKUP, /* arg1=table number, u32=value */
+ O_ANTISPOOF, /* none */
+ O_JAIL, /* u32 = id */
+ O_ALTQ, /* u32 = altq classif. qid */
+ O_DIVERTED, /* arg1=bitmap (1:loop, 2:out) */
+ O_TCPDATALEN, /* arg1 = tcp data len */
+ O_IP6_SRC, /* address without mask */
+ O_IP6_SRC_ME, /* my addresses */
+ O_IP6_SRC_MASK, /* address with the mask */
+ O_IP6_DST,
+ O_IP6_DST_ME,
+ O_IP6_DST_MASK,
+ O_FLOW6ID, /* for flow id tag in the ipv6 pkt */
+ O_ICMP6TYPE, /* icmp6 packet type filtering */
+ O_EXT_HDR, /* filtering for ipv6 extension header */
+ O_IP6,
+
+ /*
+ * actions for ng_ipfw
+ */
+ O_NETGRAPH, /* send to ng_ipfw */
+ O_NGTEE, /* copy to ng_ipfw */
+
+ O_IP4,
+
+ O_UNREACH6, /* arg1=icmpv6 code arg (deny) */
+
+ O_TAG, /* arg1=tag number */
+ O_TAGGED, /* arg1=tag number */
+
+ O_SETFIB, /* arg1=FIB number */
+ O_FIB, /* arg1=FIB desired fib number */
+
+ O_LAST_OPCODE /* not an opcode! */
+};
+
+/*
+ * The extension headers are filtered only for presence using a bit
+ * vector with a flag for each header.
+ */
+#define EXT_FRAGMENT 0x1
+#define EXT_HOPOPTS 0x2
+#define EXT_ROUTING 0x4
+#define EXT_AH 0x8
+#define EXT_ESP 0x10
+#define EXT_DSTOPTS 0x20
+#define EXT_RTHDR0 0x40
+#define EXT_RTHDR2 0x80
+
+/*
+ * Template for instructions.
+ *
+ * ipfw_insn is used for all instructions which require no operands,
+ * a single 16-bit value (arg1), or a couple of 8-bit values.
+ *
+ * For other instructions which require different/larger arguments
+ * we have derived structures, ipfw_insn_*.
+ *
+ * The size of the instruction (in 32-bit words) is in the low
+ * 6 bits of "len". The 2 remaining bits are used to implement
+ * NOT and OR on individual instructions. Given a type, you can
+ * compute the length to be put in "len" using F_INSN_SIZE(t)
+ *
+ * F_NOT negates the match result of the instruction.
+ *
+ * F_OR is used to build or blocks. By default, instructions
+ * are evaluated as part of a logical AND. An "or" block
+ * { X or Y or Z } contains F_OR set in all but the last
+ * instruction of the block. A match will cause the code
+ * to skip past the last instruction of the block.
+ *
+ * NOTA BENE: in a couple of places we assume that
+ * sizeof(ipfw_insn) == sizeof(u_int32_t)
+ * this needs to be fixed.
+ *
+ */
+typedef struct _ipfw_insn { /* template for instructions */
+ u_int8_t opcode;
+ u_int8_t len; /* number of 32-bit words */
+#define F_NOT 0x80
+#define F_OR 0x40
+#define F_LEN_MASK 0x3f
+#define F_LEN(cmd) ((cmd)->len & F_LEN_MASK)
+
+ u_int16_t arg1;
+} ipfw_insn;
+
+/*
+ * The F_INSN_SIZE(type) computes the size, in 4-byte words, of
+ * a given type.
+ */
+#define F_INSN_SIZE(t) ((sizeof (t))/sizeof(u_int32_t))
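+
+/*
+ * Illustrative sketch only (not part of the original header): how a
+ * match instruction could be assembled with the encoding described
+ * above.  The helper name build_proto_match() is hypothetical; the
+ * point is just how F_INSN_SIZE(), F_LEN_MASK and F_NOT combine.
+ */
+#if 0
+static void
+build_proto_match(ipfw_insn *cmd, u_int8_t proto, int negate)
+{
+	cmd->opcode = O_PROTO;
+	/* instruction length in 32-bit words, kept in the low 6 bits of "len" */
+	cmd->len = F_INSN_SIZE(ipfw_insn) & F_LEN_MASK;
+	if (negate)
+		cmd->len |= F_NOT;	/* invert the match result */
+	cmd->arg1 = proto;		/* e.g. IPPROTO_TCP */
+}
+#endif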
+
+/*
+ * This is used to store an array of 16-bit entries (ports etc.)
+ */
+typedef struct _ipfw_insn_u16 {
+ ipfw_insn o;
+ u_int16_t ports[2]; /* there may be more */
+} ipfw_insn_u16;
+
+/*
+ * This is used to store an array of 32-bit entries
+ * (uid, single IPv4 addresses etc.)
+ */
+typedef struct _ipfw_insn_u32 {
+ ipfw_insn o;
+ u_int32_t d[1]; /* one or more */
+} ipfw_insn_u32;
+
+/*
+ * This is used to store IP addr-mask pairs.
+ */
+typedef struct _ipfw_insn_ip {
+ ipfw_insn o;
+ struct in_addr addr;
+ struct in_addr mask;
+} ipfw_insn_ip;
+
+/*
+ * This is used to forward to a given address (ip).
+ */
+typedef struct _ipfw_insn_sa {
+ ipfw_insn o;
+ struct sockaddr_in sa;
+} ipfw_insn_sa;
+
+/*
+ * This is used for MAC addr-mask pairs.
+ */
+typedef struct _ipfw_insn_mac {
+ ipfw_insn o;
+ u_char addr[12]; /* dst[6] + src[6] */
+ u_char mask[12]; /* dst[6] + src[6] */
+} ipfw_insn_mac;
+
+/*
+ * This is used for interface match rules (recv xx, xmit xx).
+ */
+typedef struct _ipfw_insn_if {
+ ipfw_insn o;
+ union {
+ struct in_addr ip;
+ int glob;
+ } p;
+ char name[IFNAMSIZ];
+} ipfw_insn_if;
+
+/*
+ * This is used for storing an altq queue id number.
+ */
+typedef struct _ipfw_insn_altq {
+ ipfw_insn o;
+ u_int32_t qid;
+} ipfw_insn_altq;
+
+/*
+ * This is used for limit rules.
+ */
+typedef struct _ipfw_insn_limit {
+ ipfw_insn o;
+ u_int8_t _pad;
+ u_int8_t limit_mask; /* combination of DYN_* below */
+#define DYN_SRC_ADDR 0x1
+#define DYN_SRC_PORT 0x2
+#define DYN_DST_ADDR 0x4
+#define DYN_DST_PORT 0x8
+
+ u_int16_t conn_limit;
+} ipfw_insn_limit;
+
+/*
+ * This is used for log instructions.
+ */
+typedef struct _ipfw_insn_log {
+ ipfw_insn o;
+ u_int32_t max_log; /* how many do we log -- 0 = all */
+ u_int32_t log_left; /* how many left to log */
+} ipfw_insn_log;
+
+/*
+ * Data structures required by both ipfw(8) and ipfw(4) but not part of the
+ * management API are protected by IPFW_INTERNAL.
+ */
+#ifdef IPFW_INTERNAL
+/* Server pool support (LSNAT). */
+struct cfg_spool {
+ LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */
+ struct in_addr addr;
+ u_short port;
+};
+#endif
+
+/* Redirect modes id. */
+#define REDIR_ADDR 0x01
+#define REDIR_PORT 0x02
+#define REDIR_PROTO 0x04
+
+#ifdef IPFW_INTERNAL
+/* Nat redirect configuration. */
+struct cfg_redir {
+ LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */
+ u_int16_t mode; /* type of redirect mode */
+ struct in_addr laddr; /* local ip address */
+ struct in_addr paddr; /* public ip address */
+ struct in_addr raddr; /* remote ip address */
+ u_short lport; /* local port */
+ u_short pport; /* public port */
+ u_short rport; /* remote port */
+ u_short pport_cnt; /* number of public ports */
+ u_short rport_cnt; /* number of remote ports */
+ int proto; /* protocol: tcp/udp */
+ struct alias_link **alink;
+ /* number of entries in spool chain */
+ u_int16_t spool_cnt;
+ /* chain of spool instances */
+ LIST_HEAD(spool_chain, cfg_spool) spool_chain;
+};
+#endif
+
+#define NAT_BUF_LEN 1024
+
+#ifdef IPFW_INTERNAL
+/* Nat configuration data struct. */
+struct cfg_nat {
+ /* chain of nat instances */
+ LIST_ENTRY(cfg_nat) _next;
+ int id; /* nat id */
+ struct in_addr ip; /* nat ip address */
+ char if_name[IF_NAMESIZE]; /* interface name */
+ int mode; /* aliasing mode */
+ struct libalias *lib; /* libalias instance */
+ /* number of entries in redir chain */
+ int redir_cnt;
+ /* chain of redir instances */
+ LIST_HEAD(redir_chain, cfg_redir) redir_chain;
+};
+#endif
+
+#define SOF_NAT sizeof(struct cfg_nat)
+#define SOF_REDIR sizeof(struct cfg_redir)
+#define SOF_SPOOL sizeof(struct cfg_spool)
+
+/* Nat command. */
+typedef struct _ipfw_insn_nat {
+ ipfw_insn o;
+ struct cfg_nat *nat;
+} ipfw_insn_nat;
+
+/* Apply ipv6 mask on ipv6 addr */
+#define APPLY_MASK(addr,mask) \
+ (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \
+ (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \
+ (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \
+ (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3];
+
+/* Structure for ipv6 */
+typedef struct _ipfw_insn_ip6 {
+ ipfw_insn o;
+ struct in6_addr addr6;
+ struct in6_addr mask6;
+} ipfw_insn_ip6;
+
+/* Used to support icmp6 types */
+typedef struct _ipfw_insn_icmp6 {
+ ipfw_insn o;
+ uint32_t d[7]; /* XXX This number is related to the netinet/icmp6.h
+ * define ICMP6_MAXTYPE
+ * as follows: n = ICMP6_MAXTYPE/32 + 1
+ * It is actually 203.
+ */
+} ipfw_insn_icmp6;
+
+/*
+ * Here we have the structure representing an ipfw rule.
+ *
+ * It starts with a general area (with link fields and counters)
+ * followed by an array of one or more instructions, which the code
+ * accesses as an array of 32-bit values.
+ *
+ * Given a rule pointer r:
+ *
+ * r->cmd is the start of the first instruction.
+ * ACTION_PTR(r) is the start of the first action (things to do
+ * once a rule matched).
+ *
+ * When assembling instruction, remember the following:
+ *
+ * + if a rule has a "keep-state" (or "limit") option, then the
+ * first instruction (at r->cmd) MUST BE an O_PROBE_STATE
+ * + if a rule has a "log" option, then the first action
+ * (at ACTION_PTR(r)) MUST be O_LOG
+ * + if a rule has an "altq" option, it comes after "log"
+ * + if a rule has an O_TAG option, it comes after "log" and "altq"
+ *
+ * NOTE: we use a simple linked list of rules because we never need
+ * to delete a rule without scanning the list. We do not use
+ * queue(3) macros for portability and readability.
+ */
+
+struct ip_fw {
+ struct ip_fw *x_next; /* linked list of rules */
+ struct ip_fw *next_rule; /* ptr to next [skipto] rule */
+ /* 'next_rule' is used to pass up 'set_disable' status */
+
+ uint16_t act_ofs; /* offset of action in 32-bit units */
+ uint16_t cmd_len; /* # of 32-bit words in cmd */
+ uint16_t rulenum; /* rule number */
+ uint8_t set; /* rule set (0..31) */
+#define RESVD_SET 31 /* set for default and persistent rules */
+ uint8_t _pad; /* padding */
+ uint32_t id; /* rule id */
+
+ /* These fields are present in all rules. */
+ uint64_t pcnt; /* Packet counter */
+ uint64_t bcnt; /* Byte counter */
+ uint32_t timestamp; /* tv_sec of last match */
+
+ ipfw_insn cmd[1]; /* storage for commands */
+};
+
+#define ACTION_PTR(rule) \
+ (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) )
+
+#define RULESIZE(rule) (sizeof(struct ip_fw) + \
+ ((struct ip_fw *)(rule))->cmd_len * 4 - 4)
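+
+/*
+ * Illustrative sketch only (not part of the original header): walking
+ * a rule's microinstructions using the layout described above.  The
+ * function name and the visit() callback are hypothetical; the loops
+ * show how act_ofs, cmd_len, F_LEN() and ACTION_PTR() fit together.
+ */
+#if 0
+static void
+walk_rule(struct ip_fw *rule, void (*visit)(ipfw_insn *))
+{
+	ipfw_insn *cmd;
+	int l;
+
+	/* The match instructions start at rule->cmd... */
+	for (l = rule->act_ofs, cmd = rule->cmd; l > 0;
+	    l -= F_LEN(cmd), cmd += F_LEN(cmd))
+		visit(cmd);
+	/* ...and the action instructions start at ACTION_PTR(rule). */
+	for (l = rule->cmd_len - rule->act_ofs, cmd = ACTION_PTR(rule);
+	    l > 0; l -= F_LEN(cmd), cmd += F_LEN(cmd))
+		visit(cmd);
+}
+#endif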
+
+#if 1 // should be moved to in.h
+/*
+ * This structure is used as a flow mask and a flow id for various
+ * parts of the code.
+ * addr_type is used in userland and kernel to mark the address type.
+ * fib is used in the kernel to record the fib in use.
+ * _flags is used in the kernel to store tcp flags for dynamic rules.
+ */
+struct ipfw_flow_id {
+ uint32_t dst_ip;
+ uint32_t src_ip;
+ uint16_t dst_port;
+ uint16_t src_port;
+ uint8_t fib;
+ uint8_t proto;
+ uint8_t _flags; /* protocol-specific flags */
+ uint8_t addr_type; /* 4=ip4, 6=ip6, 1=ether ? */
+ struct in6_addr dst_ip6;
+ struct in6_addr src_ip6;
+ uint32_t flow_id6;
+ uint32_t extra; /* queue/pipe or frag_id */
+};
+#endif
+
+#define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6)
+
+/*
+ * Dynamic ipfw rule.
+ */
+typedef struct _ipfw_dyn_rule ipfw_dyn_rule;
+
+struct _ipfw_dyn_rule {
+ ipfw_dyn_rule *next; /* linked list of rules. */
+ struct ip_fw *rule; /* pointer to rule */
+ /* 'rule' is used to pass up the rule number (from the parent) */
+
+ ipfw_dyn_rule *parent; /* pointer to parent rule */
+ u_int64_t pcnt; /* packet match counter */
+ u_int64_t bcnt; /* byte match counter */
+ struct ipfw_flow_id id; /* (masked) flow id */
+ u_int32_t expire; /* expire time */
+ u_int32_t bucket; /* which bucket in hash table */
+ u_int32_t state; /* state of this rule (typically a
+ * combination of TCP flags)
+ */
+ u_int32_t ack_fwd; /* most recent ACKs in forward */
+ u_int32_t ack_rev; /* and reverse directions (used */
+ /* to generate keepalives) */
+ u_int16_t dyn_type; /* rule type */
+ u_int16_t count; /* refcount */
+};
+
+/*
+ * Definitions for IP option names.
+ */
+#define IP_FW_IPOPT_LSRR 0x01
+#define IP_FW_IPOPT_SSRR 0x02
+#define IP_FW_IPOPT_RR 0x04
+#define IP_FW_IPOPT_TS 0x08
+
+/*
+ * Definitions for TCP option names.
+ */
+#define IP_FW_TCPOPT_MSS 0x01
+#define IP_FW_TCPOPT_WINDOW 0x02
+#define IP_FW_TCPOPT_SACK 0x04
+#define IP_FW_TCPOPT_TS 0x08
+#define IP_FW_TCPOPT_CC 0x10
+
+#define ICMP_REJECT_RST 0x100 /* fake ICMP code (send a TCP RST) */
+#define ICMP6_UNREACH_RST 0x100 /* fake ICMPv6 code (send a TCP RST) */
+
+/*
+ * These are used for lookup tables.
+ */
+typedef struct _ipfw_table_entry {
+ in_addr_t addr; /* network address */
+ u_int32_t value; /* value */
+ u_int16_t tbl; /* table number */
+ u_int8_t masklen; /* mask length */
+} ipfw_table_entry;
+
+typedef struct _ipfw_table {
+ u_int32_t size; /* size of entries in bytes */
+ u_int32_t cnt; /* # of entries */
+ u_int16_t tbl; /* table number */
+ ipfw_table_entry ent[0]; /* entries */
+} ipfw_table;
+
+#endif /* _IPFW2_H */
diff --git a/freebsd/sys/netinet/ip_gre.c b/freebsd/sys/netinet/ip_gre.c
new file mode 100644
index 00000000..253376de
--- /dev/null
+++ b/freebsd/sys/netinet/ip_gre.c
@@ -0,0 +1,336 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/* $NetBSD: ip_gre.c,v 1.29 2003/09/05 23:02:43 itojun Exp $ */
+
+/*-
+ * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Heiko W.Rupp <hwr@pilhuhn.de>
+ *
+ * IPv6-over-GRE contributed by Gert Doering <gert@greenie.muc.de>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * De-encapsulate tunneled packets and send them on.
+ * The output half is in net/if_gre.[ch].
+ * This currently handles IPPROTO_GRE and IPPROTO_MOBILE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_atalk.h>
+#include <freebsd/local/opt_inet6.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/errno.h>
+#include <freebsd/sys/time.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/net/bpf.h>
+#include <freebsd/net/ethernet.h>
+#include <freebsd/net/if.h>
+#include <freebsd/net/netisr.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/raw_cb.h>
+
+#ifdef INET
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_gre.h>
+#include <freebsd/machine/in_cksum.h>
+#else
+#error ip_gre input without IP?
+#endif
+
+#ifdef NETATALK
+#include <freebsd/netatalk/at.h>
+#include <freebsd/netatalk/at_var.h>
+#include <freebsd/netatalk/at_extern.h>
+#endif
+
+/* Needs IP headers. */
+#include <freebsd/net/if_gre.h>
+
+#include <freebsd/machine/stdarg.h>
+
+#if 1
+void gre_inet_ntoa(struct in_addr in); /* XXX */
+#endif
+
+static struct gre_softc *gre_lookup(struct mbuf *, u_int8_t);
+
+static struct mbuf *gre_input2(struct mbuf *, int, u_char);
+
+/*
+ * De-encapsulate a packet and feed it back through ip input (this
+ * routine is called whenever IP gets a packet with proto type
+ * IPPROTO_GRE and a local destination address).
+ * This really is simple
+ */
+void
+gre_input(struct mbuf *m, int off)
+{
+ int proto;
+
+ proto = (mtod(m, struct ip *))->ip_p;
+
+ m = gre_input2(m, off, proto);
+
+ /*
+	 * If no matching tunnel that is up is found, we inject
+	 * the mbuf into the raw IP socket to see if anyone picks it up.
+ */
+ if (m != NULL)
+ rip_input(m, off);
+}
+
+/*
+ * Decapsulate. Does the real work and is called from gre_input()
+ * (above). Returns an mbuf back if packet is not yet processed,
+ * and NULL if it needs no further processing. proto is the protocol
+ * number of the "calling" foo_input() routine.
+ */
+static struct mbuf *
+gre_input2(struct mbuf *m, int hlen, u_char proto)
+{
+ struct greip *gip;
+ int isr;
+ struct gre_softc *sc;
+ u_int16_t flags;
+ u_int32_t af;
+
+ if ((sc = gre_lookup(m, proto)) == NULL) {
+ /* No matching tunnel or tunnel is down. */
+ return (m);
+ }
+
+ if (m->m_len < sizeof(*gip)) {
+ m = m_pullup(m, sizeof(*gip));
+ if (m == NULL)
+ return (NULL);
+ }
+ gip = mtod(m, struct greip *);
+
+ GRE2IFP(sc)->if_ipackets++;
+ GRE2IFP(sc)->if_ibytes += m->m_pkthdr.len;
+
+ switch (proto) {
+ case IPPROTO_GRE:
+ hlen += sizeof(struct gre_h);
+
+ /* process GRE flags as packet can be of variable len */
+ flags = ntohs(gip->gi_flags);
+
+ /* Checksum & Offset are present */
+ if ((flags & GRE_CP) | (flags & GRE_RP))
+ hlen += 4;
+ /* We don't support routing fields (variable length) */
+ if (flags & GRE_RP)
+ return (m);
+ if (flags & GRE_KP)
+ hlen += 4;
+ if (flags & GRE_SP)
+ hlen += 4;
+
+ switch (ntohs(gip->gi_ptype)) { /* ethertypes */
+ case WCCP_PROTOCOL_TYPE:
+ if (sc->wccp_ver == WCCP_V2)
+ hlen += 4;
+ /* FALLTHROUGH */
+ case ETHERTYPE_IP: /* shouldn't need a schednetisr(), */
+ isr = NETISR_IP;/* as we are in ip_input */
+ af = AF_INET;
+ break;
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ isr = NETISR_IPV6;
+ af = AF_INET6;
+ break;
+#endif
+#ifdef NETATALK
+ case ETHERTYPE_ATALK:
+ isr = NETISR_ATALK1;
+ af = AF_APPLETALK;
+ break;
+#endif
+ default:
+ /* Others not yet supported. */
+ return (m);
+ }
+ break;
+ default:
+ /* Others not yet supported. */
+ return (m);
+ }
+
+ if (hlen > m->m_pkthdr.len) {
+ m_freem(m);
+ return (NULL);
+ }
+ /* Unlike NetBSD, in FreeBSD m_adj() adjusts m->m_pkthdr.len as well */
+ m_adj(m, hlen);
+
+ if (bpf_peers_present(GRE2IFP(sc)->if_bpf)) {
+ bpf_mtap2(GRE2IFP(sc)->if_bpf, &af, sizeof(af), m);
+ }
+
+ m->m_pkthdr.rcvif = GRE2IFP(sc);
+
+ netisr_queue(isr, m);
+
+ /* Packet is done, no further processing needed. */
+ return (NULL);
+}
+
+/*
+ * Input routine for IPPROTO_MOBILE.
+ * This is a little bit different from the other modes, as the
+ * encapsulating header was not prepended, but instead inserted
+ * between the IP header and the payload.
+ */
+
+void
+gre_mobile_input(struct mbuf *m, int hlen)
+{
+ struct ip *ip;
+ struct mobip_h *mip;
+ struct gre_softc *sc;
+ int msiz;
+
+ if ((sc = gre_lookup(m, IPPROTO_MOBILE)) == NULL) {
+ /* No matching tunnel or tunnel is down. */
+ m_freem(m);
+ return;
+ }
+
+ if (m->m_len < sizeof(*mip)) {
+ m = m_pullup(m, sizeof(*mip));
+ if (m == NULL)
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ mip = mtod(m, struct mobip_h *);
+
+ GRE2IFP(sc)->if_ipackets++;
+ GRE2IFP(sc)->if_ibytes += m->m_pkthdr.len;
+
+ if (ntohs(mip->mh.proto) & MOB_HH_SBIT) {
+ msiz = MOB_HH_SIZ_L;
+ mip->mi.ip_src.s_addr = mip->mh.osrc;
+ } else
+ msiz = MOB_HH_SIZ_S;
+
+ if (m->m_len < (ip->ip_hl << 2) + msiz) {
+ m = m_pullup(m, (ip->ip_hl << 2) + msiz);
+ if (m == NULL)
+ return;
+ ip = mtod(m, struct ip *);
+ mip = mtod(m, struct mobip_h *);
+ }
+
+ mip->mi.ip_dst.s_addr = mip->mh.odst;
+ mip->mi.ip_p = (ntohs(mip->mh.proto) >> 8);
+
+ if (gre_in_cksum((u_int16_t *)&mip->mh, msiz) != 0) {
+ m_freem(m);
+ return;
+ }
+
+ bcopy((caddr_t)(ip) + (ip->ip_hl << 2) + msiz, (caddr_t)(ip) +
+ (ip->ip_hl << 2), m->m_len - msiz - (ip->ip_hl << 2));
+ m->m_len -= msiz;
+ m->m_pkthdr.len -= msiz;
+
+ /*
+	 * On FreeBSD, rip_input() supplies us with ip->ip_len
+	 * already converted into host byte order and also decreased
+	 * by the length of the IP header; however, ip_input() expects
+	 * this field in the original format (network byte order
+	 * and the full size of the IP packet), so adjust it accordingly.
+ */
+ ip->ip_len = htons(ip->ip_len + sizeof(struct ip) - msiz);
+
+ ip->ip_sum = 0;
+ ip->ip_sum = in_cksum(m, (ip->ip_hl << 2));
+
+ if (bpf_peers_present(GRE2IFP(sc)->if_bpf)) {
+ u_int32_t af = AF_INET;
+ bpf_mtap2(GRE2IFP(sc)->if_bpf, &af, sizeof(af), m);
+ }
+
+ m->m_pkthdr.rcvif = GRE2IFP(sc);
+
+ netisr_queue(NETISR_IP, m);
+}
+
+/*
+ * Find the gre interface associated with our src/dst/proto set.
+ *
+ * XXXRW: Need some sort of drain/refcount mechanism so that the softc
+ * reference remains valid after it's returned from gre_lookup(). Right
+ * now, I'm thinking it should be reference-counted with a gre_dropref()
+ * when the caller is done with the softc. This is complicated by how
+ * to handle destroying the gre softc; probably using a gre_drain() in
+ * in_gre.c during destroy.
+ */
+static struct gre_softc *
+gre_lookup(struct mbuf *m, u_int8_t proto)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ struct gre_softc *sc;
+
+ mtx_lock(&gre_mtx);
+ for (sc = LIST_FIRST(&gre_softc_list); sc != NULL;
+ sc = LIST_NEXT(sc, sc_list)) {
+ if ((sc->g_dst.s_addr == ip->ip_src.s_addr) &&
+ (sc->g_src.s_addr == ip->ip_dst.s_addr) &&
+ (sc->g_proto == proto) &&
+ ((GRE2IFP(sc)->if_flags & IFF_UP) != 0)) {
+ mtx_unlock(&gre_mtx);
+ return (sc);
+ }
+ }
+ mtx_unlock(&gre_mtx);
+
+ return (NULL);
+}
diff --git a/freebsd/sys/netinet/ip_gre.h b/freebsd/sys/netinet/ip_gre.h
new file mode 100644
index 00000000..1fb67d93
--- /dev/null
+++ b/freebsd/sys/netinet/ip_gre.h
@@ -0,0 +1,43 @@
+/* $NetBSD: ip_gre.h,v 1.5 2002/06/09 16:33:40 itojun Exp $ */
+/* $FreeBSD$ */
+
+/*-
+ * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Heiko W.Rupp <hwr@pilhuhn.de>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef _KERNEL
+void gre_input(struct mbuf *, int);
+void gre_mobile_input(struct mbuf *, int);
+#endif /* _KERNEL */
diff --git a/freebsd/sys/netinet/ip_icmp.c b/freebsd/sys/netinet/ip_icmp.c
new file mode 100644
index 00000000..b7a83128
--- /dev/null
+++ b/freebsd/sys/netinet/ip_icmp.c
@@ -0,0 +1,986 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_ipsec.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/time.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/syslog.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_types.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_icmp.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_options.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/tcpip.h>
+#include <freebsd/netinet/icmp_var.h>
+
+#ifdef IPSEC
+#include <freebsd/netipsec/ipsec.h>
+#include <freebsd/netipsec/key.h>
+#endif
+
+#include <freebsd/machine/in_cksum.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+/*
+ * ICMP routines: error generation, receive packet processing,
+ * routines to turn packets around back to the originator, and
+ * host table maintenance routines.
+ */
+VNET_DEFINE(struct icmpstat, icmpstat);
+SYSCTL_VNET_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW,
+ &VNET_NAME(icmpstat), icmpstat, "");
+
+static VNET_DEFINE(int, icmpmaskrepl) = 0;
+#define V_icmpmaskrepl VNET(icmpmaskrepl)
+SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW,
+ &VNET_NAME(icmpmaskrepl), 0,
+ "Reply to ICMP Address Mask Request packets.");
+
+static VNET_DEFINE(u_int, icmpmaskfake) = 0;
+#define V_icmpmaskfake VNET(icmpmaskfake)
+SYSCTL_VNET_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_RW,
+ &VNET_NAME(icmpmaskfake), 0,
+ "Fake reply to ICMP Address Mask Request packets.");
+
+static VNET_DEFINE(int, drop_redirect) = 0;
+#define V_drop_redirect VNET(drop_redirect)
+SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW,
+ &VNET_NAME(drop_redirect), 0,
+ "Ignore ICMP redirects");
+
+static VNET_DEFINE(int, log_redirect) = 0;
+#define V_log_redirect VNET(log_redirect)
+SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW,
+ &VNET_NAME(log_redirect), 0,
+ "Log ICMP redirects to the console");
+
+static VNET_DEFINE(int, icmplim) = 200;
+#define V_icmplim VNET(icmplim)
+SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW,
+ &VNET_NAME(icmplim), 0,
+ "Maximum number of ICMP responses per second");
+
+static VNET_DEFINE(int, icmplim_output) = 1;
+#define V_icmplim_output VNET(icmplim_output)
+SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW,
+ &VNET_NAME(icmplim_output), 0,
+ "Enable rate limiting of ICMP responses");
+
+static VNET_DEFINE(char, reply_src[IFNAMSIZ]);
+#define V_reply_src VNET(reply_src)
+SYSCTL_VNET_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_RW,
+ &VNET_NAME(reply_src), IFNAMSIZ,
+ "icmp reply source for non-local packets.");
+
+static VNET_DEFINE(int, icmp_rfi) = 0;
+#define V_icmp_rfi VNET(icmp_rfi)
+SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_RW,
+ &VNET_NAME(icmp_rfi), 0,
+ "ICMP reply from incoming interface for non-local packets");
+
+static VNET_DEFINE(int, icmp_quotelen) = 8;
+#define V_icmp_quotelen VNET(icmp_quotelen)
+SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_RW,
+ &VNET_NAME(icmp_quotelen), 0,
+ "Number of bytes from original packet to quote in ICMP reply");
+
+/*
+ * ICMP broadcast echo sysctl
+ */
+static VNET_DEFINE(int, icmpbmcastecho) = 0;
+#define V_icmpbmcastecho VNET(icmpbmcastecho)
+SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW,
+ &VNET_NAME(icmpbmcastecho), 0,
+ "");
+
+
+#ifdef ICMPPRINTFS
+int icmpprintfs = 0;
+#endif
+
+static void icmp_reflect(struct mbuf *);
+static void icmp_send(struct mbuf *, struct mbuf *);
+
+extern struct protosw inetsw[];
+
+/*
+ * Kernel module interface for updating icmpstat. The argument is an index
+ * into icmpstat treated as an array of u_long. While this encodes the
+ * general layout of icmpstat into the caller, it doesn't encode its
+ * location, so that future changes to add, for example, per-CPU stats
+ * support won't cause binary compatibility problems for kernel modules.
+ */
+void
+kmod_icmpstat_inc(int statnum)
+{
+
+ (*((u_long *)&V_icmpstat + statnum))++;
+}
+
+/*
+ * Generate an error packet of the given type
+ * in response to the bad packet ip.
+ */
+void
+icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
+{
+ register struct ip *oip = mtod(n, struct ip *), *nip;
+ register unsigned oiphlen = oip->ip_hl << 2;
+ register struct icmp *icp;
+ register struct mbuf *m;
+ unsigned icmplen, icmpelen, nlen;
+
+ KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type", __func__));
+#ifdef ICMPPRINTFS
+ if (icmpprintfs)
+ printf("icmp_error(%p, %x, %d)\n", oip, type, code);
+#endif
+ if (type != ICMP_REDIRECT)
+ ICMPSTAT_INC(icps_error);
+ /*
+ * Don't send error:
+ * if the original packet was encrypted.
+ * if not the first fragment of message.
+ * in response to a multicast or broadcast packet.
+ * if the old packet protocol was an ICMP error message.
+ */
+ if (n->m_flags & M_DECRYPTED)
+ goto freeit;
+ if (oip->ip_off & ~(IP_MF|IP_DF))
+ goto freeit;
+ if (n->m_flags & (M_BCAST|M_MCAST))
+ goto freeit;
+ if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
+ n->m_len >= oiphlen + ICMP_MINLEN &&
+ !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) {
+ ICMPSTAT_INC(icps_oldicmp);
+ goto freeit;
+ }
+	/* Drop if the IP header plus 8 bytes is not contiguous in the first mbuf. */
+ if (oiphlen + 8 > n->m_len)
+ goto freeit;
+ /*
+ * Calculate length to quote from original packet and
+ * prevent the ICMP mbuf from overflowing.
+ * Unfortunatly this is non-trivial since ip_forward()
+	 * Unfortunately this is non-trivial since ip_forward()
+ */
+ nlen = m_length(n, NULL);
+ if (oip->ip_p == IPPROTO_TCP) {
+ struct tcphdr *th;
+ int tcphlen;
+
+ if (oiphlen + sizeof(struct tcphdr) > n->m_len &&
+ n->m_next == NULL)
+ goto stdreply;
+ if (n->m_len < oiphlen + sizeof(struct tcphdr) &&
+ ((n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL))
+ goto freeit;
+ th = (struct tcphdr *)((caddr_t)oip + oiphlen);
+ tcphlen = th->th_off << 2;
+ if (tcphlen < sizeof(struct tcphdr))
+ goto freeit;
+ if (oip->ip_len < oiphlen + tcphlen)
+ goto freeit;
+ if (oiphlen + tcphlen > n->m_len && n->m_next == NULL)
+ goto stdreply;
+ if (n->m_len < oiphlen + tcphlen &&
+ ((n = m_pullup(n, oiphlen + tcphlen)) == NULL))
+ goto freeit;
+ icmpelen = max(tcphlen, min(V_icmp_quotelen, oip->ip_len - oiphlen));
+ } else
+stdreply: icmpelen = max(8, min(V_icmp_quotelen, oip->ip_len - oiphlen));
+
+ icmplen = min(oiphlen + icmpelen, nlen);
+ if (icmplen < sizeof(struct ip))
+ goto freeit;
+
+ if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen)
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
+ else
+ m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+ if (m == NULL)
+ goto freeit;
+#ifdef MAC
+ mac_netinet_icmp_reply(n, m);
+#endif
+ icmplen = min(icmplen, M_TRAILINGSPACE(m) - sizeof(struct ip) - ICMP_MINLEN);
+ m_align(m, ICMP_MINLEN + icmplen);
+ m->m_len = ICMP_MINLEN + icmplen;
+
+ /* XXX MRT make the outgoing packet use the same FIB
+ * that was associated with the incoming packet
+ */
+ M_SETFIB(m, M_GETFIB(n));
+ icp = mtod(m, struct icmp *);
+ ICMPSTAT_INC(icps_outhist[type]);
+ icp->icmp_type = type;
+ if (type == ICMP_REDIRECT)
+ icp->icmp_gwaddr.s_addr = dest;
+ else {
+ icp->icmp_void = 0;
+ /*
+ * The following assignments assume an overlay with the
+ * just zeroed icmp_void field.
+ */
+ if (type == ICMP_PARAMPROB) {
+ icp->icmp_pptr = code;
+ code = 0;
+ } else if (type == ICMP_UNREACH &&
+ code == ICMP_UNREACH_NEEDFRAG && mtu) {
+ icp->icmp_nextmtu = htons(mtu);
+ }
+ }
+ icp->icmp_code = code;
+
+ /*
+ * Copy the quotation into ICMP message and
+ * convert quoted IP header back to network representation.
+ */
+ m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip);
+ nip = &icp->icmp_ip;
+ nip->ip_len = htons(nip->ip_len);
+ nip->ip_off = htons(nip->ip_off);
+
+ /*
+	 * Set up ICMP message mbuf and copy old IP header (without options)
+	 * in front of ICMP message.
+ * If the original mbuf was meant to bypass the firewall, the error
+ * reply should bypass as well.
+ */
+ m->m_flags |= n->m_flags & M_SKIP_FIREWALL;
+ m->m_data -= sizeof(struct ip);
+ m->m_len += sizeof(struct ip);
+ m->m_pkthdr.len = m->m_len;
+ m->m_pkthdr.rcvif = n->m_pkthdr.rcvif;
+ nip = mtod(m, struct ip *);
+ bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip));
+ nip->ip_len = m->m_len;
+ nip->ip_v = IPVERSION;
+ nip->ip_hl = 5;
+ nip->ip_p = IPPROTO_ICMP;
+ nip->ip_tos = 0;
+ icmp_reflect(m);
+
+freeit:
+ m_freem(n);
+}
+
+/*
+ * Process a received ICMP message.
+ */
+void
+icmp_input(struct mbuf *m, int off)
+{
+ struct icmp *icp;
+ struct in_ifaddr *ia;
+ struct ip *ip = mtod(m, struct ip *);
+ struct sockaddr_in icmpsrc, icmpdst, icmpgw;
+ int hlen = off;
+ int icmplen = ip->ip_len;
+ int i, code;
+ void (*ctlfunc)(int, struct sockaddr *, void *);
+ int fibnum;
+
+ /*
+	 * Locate icmp structure in mbuf, and check
+	 * that it is not corrupted and is of at least minimum length.
+ */
+#ifdef ICMPPRINTFS
+ if (icmpprintfs) {
+ char buf[4 * sizeof "123"];
+ strcpy(buf, inet_ntoa(ip->ip_src));
+ printf("icmp_input from %s to %s, len %d\n",
+ buf, inet_ntoa(ip->ip_dst), icmplen);
+ }
+#endif
+ if (icmplen < ICMP_MINLEN) {
+ ICMPSTAT_INC(icps_tooshort);
+ goto freeit;
+ }
+ i = hlen + min(icmplen, ICMP_ADVLENMIN);
+ if (m->m_len < i && (m = m_pullup(m, i)) == NULL) {
+ ICMPSTAT_INC(icps_tooshort);
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ m->m_len -= hlen;
+ m->m_data += hlen;
+ icp = mtod(m, struct icmp *);
+ if (in_cksum(m, icmplen)) {
+ ICMPSTAT_INC(icps_checksum);
+ goto freeit;
+ }
+ m->m_len += hlen;
+ m->m_data -= hlen;
+
+ if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
+ /*
+ * Deliver very specific ICMP type only.
+ */
+ switch (icp->icmp_type) {
+ case ICMP_UNREACH:
+ case ICMP_TIMXCEED:
+ break;
+ default:
+ goto freeit;
+ }
+ }
+
+#ifdef ICMPPRINTFS
+ if (icmpprintfs)
+ printf("icmp_input, type %d code %d\n", icp->icmp_type,
+ icp->icmp_code);
+#endif
+
+ /*
+ * Message type specific processing.
+ */
+ if (icp->icmp_type > ICMP_MAXTYPE)
+ goto raw;
+
+ /* Initialize */
+ bzero(&icmpsrc, sizeof(icmpsrc));
+ icmpsrc.sin_len = sizeof(struct sockaddr_in);
+ icmpsrc.sin_family = AF_INET;
+ bzero(&icmpdst, sizeof(icmpdst));
+ icmpdst.sin_len = sizeof(struct sockaddr_in);
+ icmpdst.sin_family = AF_INET;
+ bzero(&icmpgw, sizeof(icmpgw));
+ icmpgw.sin_len = sizeof(struct sockaddr_in);
+ icmpgw.sin_family = AF_INET;
+
+ ICMPSTAT_INC(icps_inhist[icp->icmp_type]);
+ code = icp->icmp_code;
+ switch (icp->icmp_type) {
+
+ case ICMP_UNREACH:
+ switch (code) {
+ case ICMP_UNREACH_NET:
+ case ICMP_UNREACH_HOST:
+ case ICMP_UNREACH_SRCFAIL:
+ case ICMP_UNREACH_NET_UNKNOWN:
+ case ICMP_UNREACH_HOST_UNKNOWN:
+ case ICMP_UNREACH_ISOLATED:
+ case ICMP_UNREACH_TOSNET:
+ case ICMP_UNREACH_TOSHOST:
+ case ICMP_UNREACH_HOST_PRECEDENCE:
+ case ICMP_UNREACH_PRECEDENCE_CUTOFF:
+ code = PRC_UNREACH_NET;
+ break;
+
+ case ICMP_UNREACH_NEEDFRAG:
+ code = PRC_MSGSIZE;
+ break;
+
+ /*
+ * RFC 1122, Sections 3.2.2.1 and 4.2.3.9.
+ * Treat subcodes 2,3 as immediate RST
+ */
+ case ICMP_UNREACH_PROTOCOL:
+ case ICMP_UNREACH_PORT:
+ code = PRC_UNREACH_PORT;
+ break;
+
+ case ICMP_UNREACH_NET_PROHIB:
+ case ICMP_UNREACH_HOST_PROHIB:
+ case ICMP_UNREACH_FILTER_PROHIB:
+ code = PRC_UNREACH_ADMIN_PROHIB;
+ break;
+
+ default:
+ goto badcode;
+ }
+ goto deliver;
+
+ case ICMP_TIMXCEED:
+ if (code > 1)
+ goto badcode;
+ code += PRC_TIMXCEED_INTRANS;
+ goto deliver;
+
+ case ICMP_PARAMPROB:
+ if (code > 1)
+ goto badcode;
+ code = PRC_PARAMPROB;
+ goto deliver;
+
+ case ICMP_SOURCEQUENCH:
+ if (code)
+ goto badcode;
+ code = PRC_QUENCH;
+ deliver:
+ /*
+ * Problem with datagram; advise higher level routines.
+ */
+ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
+ icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
+ ICMPSTAT_INC(icps_badlen);
+ goto freeit;
+ }
+ icp->icmp_ip.ip_len = ntohs(icp->icmp_ip.ip_len);
+ /* Discard ICMP's in response to multicast packets */
+ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr)))
+ goto badcode;
+#ifdef ICMPPRINTFS
+ if (icmpprintfs)
+ printf("deliver to protocol %d\n", icp->icmp_ip.ip_p);
+#endif
+ icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
+ /*
+ * XXX if the packet contains [IPv4 AH TCP], we can't make a
+ * notification to TCP layer.
+ */
+ ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput;
+ if (ctlfunc)
+ (*ctlfunc)(code, (struct sockaddr *)&icmpsrc,
+ (void *)&icp->icmp_ip);
+ break;
+
+ badcode:
+ ICMPSTAT_INC(icps_badcode);
+ break;
+
+ case ICMP_ECHO:
+ if (!V_icmpbmcastecho
+ && (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
+ ICMPSTAT_INC(icps_bmcastecho);
+ break;
+ }
+ icp->icmp_type = ICMP_ECHOREPLY;
+ if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0)
+ goto freeit;
+ else
+ goto reflect;
+
+ case ICMP_TSTAMP:
+ if (!V_icmpbmcastecho
+ && (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
+ ICMPSTAT_INC(icps_bmcasttstamp);
+ break;
+ }
+ if (icmplen < ICMP_TSLEN) {
+ ICMPSTAT_INC(icps_badlen);
+ break;
+ }
+ icp->icmp_type = ICMP_TSTAMPREPLY;
+ icp->icmp_rtime = iptime();
+ icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */
+ if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0)
+ goto freeit;
+ else
+ goto reflect;
+
+ case ICMP_MASKREQ:
+ if (V_icmpmaskrepl == 0)
+ break;
+ /*
+ * We are not able to respond with all ones broadcast
+ * unless we receive it over a point-to-point interface.
+ */
+ if (icmplen < ICMP_MASKLEN)
+ break;
+ switch (ip->ip_dst.s_addr) {
+
+ case INADDR_BROADCAST:
+ case INADDR_ANY:
+ icmpdst.sin_addr = ip->ip_src;
+ break;
+
+ default:
+ icmpdst.sin_addr = ip->ip_dst;
+ }
+ ia = (struct in_ifaddr *)ifaof_ifpforaddr(
+ (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif);
+ if (ia == NULL)
+ break;
+ if (ia->ia_ifp == NULL) {
+ ifa_free(&ia->ia_ifa);
+ break;
+ }
+ icp->icmp_type = ICMP_MASKREPLY;
+ if (V_icmpmaskfake == 0)
+ icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr;
+ else
+ icp->icmp_mask = V_icmpmaskfake;
+ if (ip->ip_src.s_addr == 0) {
+ if (ia->ia_ifp->if_flags & IFF_BROADCAST)
+ ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr;
+ else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT)
+ ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr;
+ }
+ ifa_free(&ia->ia_ifa);
+reflect:
+ ip->ip_len += hlen; /* since ip_input deducts this */
+ ICMPSTAT_INC(icps_reflect);
+ ICMPSTAT_INC(icps_outhist[icp->icmp_type]);
+ icmp_reflect(m);
+ return;
+
+ case ICMP_REDIRECT:
+ if (V_log_redirect) {
+ u_long src, dst, gw;
+
+ src = ntohl(ip->ip_src.s_addr);
+ dst = ntohl(icp->icmp_ip.ip_dst.s_addr);
+ gw = ntohl(icp->icmp_gwaddr.s_addr);
+ printf("icmp redirect from %d.%d.%d.%d: "
+ "%d.%d.%d.%d => %d.%d.%d.%d\n",
+ (int)(src >> 24), (int)((src >> 16) & 0xff),
+ (int)((src >> 8) & 0xff), (int)(src & 0xff),
+ (int)(dst >> 24), (int)((dst >> 16) & 0xff),
+ (int)((dst >> 8) & 0xff), (int)(dst & 0xff),
+ (int)(gw >> 24), (int)((gw >> 16) & 0xff),
+ (int)((gw >> 8) & 0xff), (int)(gw & 0xff));
+ }
+ /*
+ * RFC1812 says we must ignore ICMP redirects if we
+ * are acting as router.
+ */
+ if (V_drop_redirect || V_ipforwarding)
+ break;
+ if (code > 3)
+ goto badcode;
+ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
+ icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
+ ICMPSTAT_INC(icps_badlen);
+ break;
+ }
+ /*
+ * Short circuit routing redirects to force
+ * immediate change in the kernel's routing
+ * tables. The message is also handed to anyone
+ * listening on a raw socket (e.g. the routing
+ * daemon for use in updating its tables).
+ */
+ icmpgw.sin_addr = ip->ip_src;
+ icmpdst.sin_addr = icp->icmp_gwaddr;
+#ifdef ICMPPRINTFS
+ if (icmpprintfs) {
+ char buf[4 * sizeof "123"];
+ strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst));
+
+ printf("redirect dst %s to %s\n",
+ buf, inet_ntoa(icp->icmp_gwaddr));
+ }
+#endif
+ icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
+ for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
+ in_rtredirect((struct sockaddr *)&icmpsrc,
+ (struct sockaddr *)&icmpdst,
+ (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST,
+ (struct sockaddr *)&icmpgw, fibnum);
+ }
+ pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc);
+#ifdef IPSEC
+ key_sa_routechange((struct sockaddr *)&icmpsrc);
+#endif
+ break;
+
+ /*
+ * No kernel processing for the following;
+ * just fall through to send to raw listener.
+ */
+ case ICMP_ECHOREPLY:
+ case ICMP_ROUTERADVERT:
+ case ICMP_ROUTERSOLICIT:
+ case ICMP_TSTAMPREPLY:
+ case ICMP_IREQREPLY:
+ case ICMP_MASKREPLY:
+ default:
+ break;
+ }
+
+raw:
+ rip_input(m, off);
+ return;
+
+freeit:
+ m_freem(m);
+}
+
+/*
+ * Reflect the ip packet back to the source
+ */
+static void
+icmp_reflect(struct mbuf *m)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ struct ifaddr *ifa;
+ struct ifnet *ifp;
+ struct in_ifaddr *ia;
+ struct in_addr t;
+ struct mbuf *opts = 0;
+ int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
+
+ if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
+ IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) ||
+ IN_ZERONET(ntohl(ip->ip_src.s_addr)) ) {
+ m_freem(m); /* Bad return address */
+ ICMPSTAT_INC(icps_badaddr);
+ goto done; /* Ip_output() will check for broadcast */
+ }
+
+ t = ip->ip_dst;
+ ip->ip_dst = ip->ip_src;
+
+ /*
+ * Source selection for ICMP replies:
+ *
+ * If the incoming packet was addressed directly to one of our
+ * own addresses, use dst as the src for the reply.
+ */
+ IN_IFADDR_RLOCK();
+ LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) {
+ if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) {
+ t = IA_SIN(ia)->sin_addr;
+ IN_IFADDR_RUNLOCK();
+ goto match;
+ }
+ }
+ IN_IFADDR_RUNLOCK();
+
+ /*
+ * If the incoming packet was addressed to one of our broadcast
+ * addresses, use the first non-broadcast address which corresponds
+ * to the incoming interface.
+ */
+ ifp = m->m_pkthdr.rcvif;
+ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ ia = ifatoia(ifa);
+ if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
+ t.s_addr) {
+ t = IA_SIN(ia)->sin_addr;
+ IF_ADDR_UNLOCK(ifp);
+ goto match;
+ }
+ }
+ IF_ADDR_UNLOCK(ifp);
+ }
+ /*
+ * If the packet was transiting through us, use the address of
+	 * the interface the packet came in on. If that interface
+ * doesn't have a suitable IP address, the normal selection
+ * criteria apply.
+ */
+ if (V_icmp_rfi && ifp != NULL) {
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ ia = ifatoia(ifa);
+ t = IA_SIN(ia)->sin_addr;
+ IF_ADDR_UNLOCK(ifp);
+ goto match;
+ }
+ IF_ADDR_UNLOCK(ifp);
+ }
+ /*
+ * If the incoming packet was not addressed directly to us, use
+ * designated interface for icmp replies specified by sysctl
+ * net.inet.icmp.reply_src (default not set). Otherwise continue
+ * with normal source selection.
+ */
+ if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) {
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ ia = ifatoia(ifa);
+ t = IA_SIN(ia)->sin_addr;
+ IF_ADDR_UNLOCK(ifp);
+ goto match;
+ }
+ IF_ADDR_UNLOCK(ifp);
+ }
+ /*
+ * If the packet was transiting through us, use the address of
+ * the interface that is the closest to the packet source.
+ * When we don't have a route back to the packet source, stop here
+ * and drop the packet.
+ */
+ ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
+ if (ia == NULL) {
+ m_freem(m);
+ ICMPSTAT_INC(icps_noroute);
+ goto done;
+ }
+ t = IA_SIN(ia)->sin_addr;
+ ifa_free(&ia->ia_ifa);
+match:
+#ifdef MAC
+ mac_netinet_icmp_replyinplace(m);
+#endif
+ ip->ip_src = t;
+ ip->ip_ttl = V_ip_defttl;
+
+ if (optlen > 0) {
+ register u_char *cp;
+ int opt, cnt;
+ u_int len;
+
+ /*
+ * Retrieve any source routing from the incoming packet;
+ * add on any record-route or timestamp options.
+ */
+ cp = (u_char *) (ip + 1);
+ if ((opts = ip_srcroute(m)) == 0 &&
+ (opts = m_gethdr(M_DONTWAIT, MT_DATA))) {
+ opts->m_len = sizeof(struct in_addr);
+ mtod(opts, struct in_addr *)->s_addr = 0;
+ }
+ if (opts) {
+#ifdef ICMPPRINTFS
+ if (icmpprintfs)
+ printf("icmp_reflect optlen %d rt %d => ",
+ optlen, opts->m_len);
+#endif
+ for (cnt = optlen; cnt > 0; cnt -= len, cp += len) {
+ opt = cp[IPOPT_OPTVAL];
+ if (opt == IPOPT_EOL)
+ break;
+ if (opt == IPOPT_NOP)
+ len = 1;
+ else {
+ if (cnt < IPOPT_OLEN + sizeof(*cp))
+ break;
+ len = cp[IPOPT_OLEN];
+ if (len < IPOPT_OLEN + sizeof(*cp) ||
+ len > cnt)
+ break;
+ }
+ /*
+ * Should check for overflow, but it "can't happen"
+ */
+ if (opt == IPOPT_RR || opt == IPOPT_TS ||
+ opt == IPOPT_SECURITY) {
+ bcopy((caddr_t)cp,
+ mtod(opts, caddr_t) + opts->m_len, len);
+ opts->m_len += len;
+ }
+ }
+ /* Terminate & pad, if necessary */
+ cnt = opts->m_len % 4;
+ if (cnt) {
+ for (; cnt < 4; cnt++) {
+ *(mtod(opts, caddr_t) + opts->m_len) =
+ IPOPT_EOL;
+ opts->m_len++;
+ }
+ }
+#ifdef ICMPPRINTFS
+ if (icmpprintfs)
+ printf("%d\n", opts->m_len);
+#endif
+ }
+ /*
+ * Now strip out original options by copying rest of first
+ * mbuf's data back, and adjust the IP length.
+ */
+ ip->ip_len -= optlen;
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = 5;
+ m->m_len -= optlen;
+ if (m->m_flags & M_PKTHDR)
+ m->m_pkthdr.len -= optlen;
+ optlen += sizeof(struct ip);
+ bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1),
+ (unsigned)(m->m_len - sizeof(struct ip)));
+ }
+ m_tag_delete_nonpersistent(m);
+ m->m_flags &= ~(M_BCAST|M_MCAST);
+ icmp_send(m, opts);
+done:
+ if (opts)
+ (void)m_free(opts);
+}
+
+/*
+ * Send an icmp packet back to the ip level,
+ * after supplying a checksum.
+ */
+static void
+icmp_send(struct mbuf *m, struct mbuf *opts)
+{
+ register struct ip *ip = mtod(m, struct ip *);
+ register int hlen;
+ register struct icmp *icp;
+
+ hlen = ip->ip_hl << 2;
+ m->m_data += hlen;
+ m->m_len -= hlen;
+ icp = mtod(m, struct icmp *);
+ icp->icmp_cksum = 0;
+ icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen);
+ m->m_data -= hlen;
+ m->m_len += hlen;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+#ifdef ICMPPRINTFS
+ if (icmpprintfs) {
+ char buf[4 * sizeof "123"];
+ strcpy(buf, inet_ntoa(ip->ip_dst));
+ printf("icmp_send dst %s src %s\n",
+ buf, inet_ntoa(ip->ip_src));
+ }
+#endif
+ (void) ip_output(m, opts, NULL, 0, NULL, NULL);
+}
+
+/*
+ * Return milliseconds since 00:00 GMT in network format.
+ */
+uint32_t
+iptime(void)
+{
+ struct timeval atv;
+ u_long t;
+
+ getmicrotime(&atv);
+ t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000;
+ return (htonl(t));
+}
+
+/*
+ * Return the next larger or smaller MTU plateau (table from RFC 1191)
+ * given current value MTU. If DIR is less than zero, a larger plateau
+ * is returned; otherwise, a smaller value is returned.
+ */
+int
+ip_next_mtu(int mtu, int dir)
+{
+ static int mtutab[] = {
+ 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508,
+ 296, 68, 0
+ };
+ int i, size;
+
+ size = (sizeof mtutab) / (sizeof mtutab[0]);
+ if (dir >= 0) {
+ for (i = 0; i < size; i++)
+ if (mtu > mtutab[i])
+ return mtutab[i];
+ } else {
+ for (i = size - 1; i >= 0; i--)
+ if (mtu < mtutab[i])
+ return mtutab[i];
+ if (mtu == mtutab[0])
+ return mtutab[0];
+ }
+ return 0;
+}
+
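+/*
+ * Illustrative sketch only (not part of the original file): expected
+ * plateau stepping for ip_next_mtu() per the RFC 1191 table above.
+ * The function name check_plateaus() is hypothetical.
+ */
+#if 0
+static void
+check_plateaus(void)
+{
+	/* The next smaller plateau below a 1500-byte MTU is 1492. */
+	KASSERT(ip_next_mtu(1500, 1) == 1492, ("smaller plateau"));
+	/* The next larger plateau above a 1500-byte MTU is 2002. */
+	KASSERT(ip_next_mtu(1500, -1) == 2002, ("larger plateau"));
+	/* 68 is the last nonzero plateau; below it 0 is returned. */
+	KASSERT(ip_next_mtu(68, 1) == 0, ("bottom of table"));
+}
+#endif
+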
+
+/*
+ * badport_bandlim() - check for ICMP bandwidth limit
+ *
+ * Return 0 if it is ok to send an ICMP error response, -1 if we have
+ * hit our bandwidth limit and it is not ok.
+ *
+ * If icmplim is <= 0, the feature is disabled and 0 is returned.
+ *
+ * For now we separate the TCP and UDP subsystems w/ different 'which'
+ * values. We may eventually remove this separation (and simplify the
+ * code further).
+ *
+ * Note that the printing of the error message is delayed so we can
+ * properly print the icmp error rate that the system was trying to do
+ * (i.e. 22000/100 pps, etc...). This can cause long delays in printing
+ * the 'final' error, but it doesn't make sense to solve the printing
+ * delay with more complex code.
+ */
+
+int
+badport_bandlim(int which)
+{
+
+#define N(a) (sizeof (a) / sizeof (a[0]))
+ static struct rate {
+ const char *type;
+ struct timeval lasttime;
+ int curpps;
+ } rates[BANDLIM_MAX+1] = {
+ { "icmp unreach response" },
+ { "icmp ping response" },
+ { "icmp tstamp response" },
+ { "closed port RST response" },
+ { "open port RST response" },
+ { "icmp6 unreach response" }
+ };
+
+ /*
+ * Return ok status if feature disabled or argument out of range.
+ */
+ if (V_icmplim > 0 && (u_int) which < N(rates)) {
+ struct rate *r = &rates[which];
+ int opps = r->curpps;
+
+ if (!ppsratecheck(&r->lasttime, &r->curpps, V_icmplim))
+ return -1; /* discard packet */
+ /*
+ * If we've dropped below the threshold after having
+ * rate-limited traffic print the message. This preserves
+ * the previous behaviour at the expense of added complexity.
+ */
+ if (V_icmplim_output && opps > V_icmplim)
+ log(LOG_NOTICE, "Limiting %s from %d to %d packets/sec\n",
+ r->type, opps, V_icmplim);
+ }
+ return 0; /* okay to send packet */
+#undef N
+}
diff --git a/freebsd/sys/netinet/ip_icmp.h b/freebsd/sys/netinet/ip_icmp.h
new file mode 100644
index 00000000..903f033d
--- /dev/null
+++ b/freebsd/sys/netinet/ip_icmp.h
@@ -0,0 +1,2 @@
+#include <freebsd/bsd.h>
+#include <freebsd/netinet/ip_icmp.h>
diff --git a/freebsd/sys/netinet/ip_id.c b/freebsd/sys/netinet/ip_id.c
new file mode 100644
index 00000000..ba99cdbb
--- /dev/null
+++ b/freebsd/sys/netinet/ip_id.c
@@ -0,0 +1,211 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+
+/*-
+ * Copyright (c) 2008 Michael J. Silbersack.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * IP ID generation is a fascinating topic.
+ *
+ * In order to avoid ID collisions during packet reassembly, common sense
+ * dictates that the period between reuse of IDs be as large as possible.
+ * This leads to the classic implementation of a system-wide counter, thereby
+ * ensuring that IDs repeat only once every 2^16 packets.
+ *
+ * Subsequent security researchers have pointed out that using a global
+ * counter makes ID values predictable. This predictability allows traffic
+ * analysis, idle scanning, and even packet injection in specific cases.
+ * These results suggest that IP IDs should be as random as possible.
+ *
+ * The "searchable queues" algorithm used in this IP ID implementation was
+ * proposed by Amit Klein. It is a compromise between the above two
+ * viewpoints that has provable behavior that can be tuned to the user's
+ * requirements.
+ *
+ * The basic concept is that we supplement a standard random number generator
+ * with a queue of the last L IDs that we have handed out to ensure that all
+ * IDs have a period of at least L.
+ *
+ * To efficiently implement this idea, we keep two data structures: a
+ * circular array of IDs of size L and a bitstring of 65536 bits.
+ *
+ * To start, we ask the RNG for a new ID. A quick index into the bitstring
+ * is used to determine if this is a recently used value. The process is
+ * repeated until a value is returned that is not in the bitstring.
+ *
+ * Having found a usable ID, we remove the ID stored at the current position
+ * in the queue from the bitstring and replace it with our new ID. Our new
+ * ID is then added to the bitstring and the queue pointer is incremented.
+ *
+ * The lower limit of 512 was chosen because there doesn't seem to be much
+ * point to having a smaller value. The upper limit of 32768 was chosen for
+ * two reasons. First, every step above 32768 decreases the entropy. Taken
+ * to an extreme, 65533 would offer 1 bit of entropy. Second, the number of
+ * attempts it takes the algorithm to find an unused ID drastically
+ * increases, killing performance. The default value of 8192 was chosen
+ * because it provides a good tradeoff between randomness and non-repetition.
+ *
+ * With L=8192, the queue will use 16K of memory. The bitstring always
+ * uses 8K of memory. No memory is allocated until the use of random ids is
+ * enabled.
+ */
+
+#include <freebsd/sys/types.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/time.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/libkern.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/mutex.h>
+#include <freebsd/sys/random.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/sys/bitstring.h>
+
+static MALLOC_DEFINE(M_IPID, "ipid", "randomized ip id state");
+
+static u_int16_t *id_array = NULL;
+static bitstr_t *id_bits = NULL;
+static int array_ptr = 0;
+static int array_size = 8192;
+static int random_id_collisions = 0;
+static int random_id_total = 0;
+static struct mtx ip_id_mtx;
+
+static void ip_initid(void);
+static int sysctl_ip_id_change(SYSCTL_HANDLER_ARGS);
+
+MTX_SYSINIT(ip_id_mtx, &ip_id_mtx, "ip_id_mtx", MTX_DEF);
+
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, random_id_period, CTLTYPE_INT|CTLFLAG_RW,
+ &array_size, 0, sysctl_ip_id_change, "IU", "IP ID Array size");
+SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_collisions, CTLFLAG_RD,
+ &random_id_collisions, 0, "Count of IP ID collisions");
+SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_total, CTLFLAG_RD,
+ &random_id_total, 0, "Count of IP IDs created");
+
+static int
+sysctl_ip_id_change(SYSCTL_HANDLER_ARGS)
+{
+ int error, new;
+
+ new = array_size;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error == 0 && req->newptr) {
+ if (new >= 512 && new <= 32768) {
+ mtx_lock(&ip_id_mtx);
+ array_size = new;
+ ip_initid();
+ mtx_unlock(&ip_id_mtx);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+
+/*
+ * ip_initid() runs with a mutex held and may execute in a network context.
+ * As a result, it uses M_NOWAIT. Ideally, we would always do this
+ * allocation from the sysctl context and have it be an invariant that if
+ * this random ID allocation mode is selected, the buffers are present. This
+ * would also avoid potential network context failures of IP ID generation.
+ */
+static void
+ip_initid(void)
+{
+
+ mtx_assert(&ip_id_mtx, MA_OWNED);
+
+ if (id_array != NULL) {
+ free(id_array, M_IPID);
+ free(id_bits, M_IPID);
+ }
+ random_id_collisions = 0;
+ random_id_total = 0;
+ array_ptr = 0;
+ id_array = (u_int16_t *) malloc(array_size * sizeof(u_int16_t),
+ M_IPID, M_NOWAIT | M_ZERO);
+ id_bits = (bitstr_t *) malloc(bitstr_size(65536), M_IPID,
+ M_NOWAIT | M_ZERO);
+ if (id_array == NULL || id_bits == NULL) {
+ /* Neither or both. */
+ if (id_array != NULL) {
+ free(id_array, M_IPID);
+ id_array = NULL;
+ }
+ if (id_bits != NULL) {
+ free(id_bits, M_IPID);
+ id_bits = NULL;
+ }
+ }
+}
+
+u_int16_t
+ip_randomid(void)
+{
+ u_int16_t new_id;
+
+ mtx_lock(&ip_id_mtx);
+ if (id_array == NULL)
+ ip_initid();
+
+ /*
+ * Fail gracefully; return a fixed id if memory allocation failed;
+ * ideally we wouldn't do allocation in this context in order to
+ * avoid the possibility of this failure mode.
+ */
+ if (id_array == NULL) {
+ mtx_unlock(&ip_id_mtx);
+ return (1);
+ }
+
+ /*
+ * To avoid a conflict with the zeros that the array is initially
+ * filled with, we never hand out an id of zero.
+ */
+ new_id = 0;
+ do {
+ if (new_id != 0)
+ random_id_collisions++;
+ arc4rand(&new_id, sizeof(new_id), 0);
+ } while (bit_test(id_bits, new_id) || new_id == 0);
+ bit_clear(id_bits, id_array[array_ptr]);
+ bit_set(id_bits, new_id);
+ id_array[array_ptr] = new_id;
+ array_ptr++;
+ if (array_ptr == array_size)
+ array_ptr = 0;
+ random_id_total++;
+ mtx_unlock(&ip_id_mtx);
+ return (new_id);
+}
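
Editorial aside (not part of the diffed FreeBSD sources): ip_randomid() above keeps the last net.inet.ip.random_id_period IDs in a FIFO and mirrors them in a 65536-bit bitmap, so a freshly issued ID cannot repeat until it ages out of the window. The standalone userland sketch below illustrates the same sliding-window technique; the names WINDOW, recent[] and pick_id() are invented for illustration, and random() stands in for the kernel's arc4rand().

/* Illustrative userland sketch of the sliding-window ID generator above. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define WINDOW 8192                      /* like net.inet.ip.random_id_period */

static uint16_t recent[WINDOW];          /* FIFO of recently issued IDs */
static uint8_t  in_use[65536 / 8];       /* bitmap over the full 16-bit space */
static int      head;

static int  bit_is_set(uint16_t id) { return (in_use[id >> 3] & (1 << (id & 7))); }
static void bit_mark(uint16_t id)   { in_use[id >> 3] |= (1 << (id & 7)); }
static void bit_unmark(uint16_t id) { in_use[id >> 3] &= ~(1 << (id & 7)); }

/* Pick an ID that is non-zero and not among the last WINDOW IDs issued. */
static uint16_t
pick_id(void)
{
	uint16_t id;

	do {
		id = (uint16_t)(random() & 0xffff);
	} while (id == 0 || bit_is_set(id));

	bit_unmark(recent[head]);        /* the oldest ID becomes reusable */
	bit_mark(id);
	recent[head] = id;
	head = (head + 1) % WINDOW;
	return (id);
}

int
main(void)
{
	for (int i = 0; i < 5; i++)
		printf("%u\n", pick_id());
	return (0);
}
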
diff --git a/freebsd/sys/netinet/ip_input.c b/freebsd/sys/netinet/ip_input.c
new file mode 100644
index 00000000..3964e886
--- /dev/null
+++ b/freebsd/sys/netinet/ip_input.c
@@ -0,0 +1,1794 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_input.c 8.2 (Berkeley) 1/4/94
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_bootp.h>
+#include <freebsd/local/opt_ipfw.h>
+#include <freebsd/local/opt_ipstealth.h>
+#include <freebsd/local/opt_ipsec.h>
+#include <freebsd/local/opt_route.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/callout.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/domain.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/time.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/sys/sysctl.h>
+
+#include <freebsd/net/pfil.h>
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_types.h>
+#include <freebsd/net/if_var.h>
+#include <freebsd/net/if_dl.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/netisr.h>
+#include <freebsd/net/vnet.h>
+#include <freebsd/net/flowtable.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_fw.h>
+#include <freebsd/netinet/ip_icmp.h>
+#include <freebsd/netinet/ip_options.h>
+#include <freebsd/machine/in_cksum.h>
+#include <freebsd/netinet/ip_carp.h>
+#ifdef IPSEC
+#include <freebsd/netinet/ip_ipsec.h>
+#endif /* IPSEC */
+
+#include <freebsd/sys/socketvar.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+#ifdef CTASSERT
+CTASSERT(sizeof(struct ip) == 20);
+#endif
+
+struct rwlock in_ifaddr_lock;
+RW_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock");
+
+VNET_DEFINE(int, rsvp_on);
+
+VNET_DEFINE(int, ipforwarding);
+SYSCTL_VNET_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW,
+ &VNET_NAME(ipforwarding), 0,
+ "Enable IP forwarding between interfaces");
+
+static VNET_DEFINE(int, ipsendredirects) = 1; /* XXX */
+#define V_ipsendredirects VNET(ipsendredirects)
+SYSCTL_VNET_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW,
+ &VNET_NAME(ipsendredirects), 0,
+ "Enable sending IP redirects");
+
+VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
+SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW,
+ &VNET_NAME(ip_defttl), 0,
+ "Maximum TTL on IP packets");
+
+static VNET_DEFINE(int, ip_keepfaith);
+#define V_ip_keepfaith VNET(ip_keepfaith)
+SYSCTL_VNET_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW,
+ &VNET_NAME(ip_keepfaith), 0,
+ "Enable packet capture for FAITH IPv4->IPv6 translator daemon");
+
+static VNET_DEFINE(int, ip_sendsourcequench);
+#define V_ip_sendsourcequench VNET(ip_sendsourcequench)
+SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW,
+ &VNET_NAME(ip_sendsourcequench), 0,
+ "Enable the transmission of source quench packets");
+
+VNET_DEFINE(int, ip_do_randomid);
+SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW,
+ &VNET_NAME(ip_do_randomid), 0,
+ "Assign random ip_id values");
+
+/*
+ * XXX - Setting ip_checkinterface mostly implements the receive side of
+ * the Strong ES model described in RFC 1122, but since the routing table
+ * and transmit implementation do not implement the Strong ES model,
+ * setting this to 1 results in an odd hybrid.
+ *
+ * XXX - ip_checkinterface currently must be disabled if you use ipnat
+ * to translate the destination address to another local interface.
+ *
+ * XXX - ip_checkinterface must be disabled if you add IP aliases
+ * to the loopback interface instead of the interface where the
+ * packets for those addresses are received.
+ */
+static VNET_DEFINE(int, ip_checkinterface);
+#define V_ip_checkinterface VNET(ip_checkinterface)
+SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW,
+ &VNET_NAME(ip_checkinterface), 0,
+ "Verify packet arrives on correct interface");
+
+VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */
+
+static struct netisr_handler ip_nh = {
+ .nh_name = "ip",
+ .nh_handler = ip_input,
+ .nh_proto = NETISR_IP,
+ .nh_policy = NETISR_POLICY_FLOW,
+};
+
+extern struct domain inetdomain;
+extern struct protosw inetsw[];
+u_char ip_protox[IPPROTO_MAX];
+VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */
+VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */
+VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */
+
+VNET_DEFINE(struct ipstat, ipstat);
+SYSCTL_VNET_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW,
+ &VNET_NAME(ipstat), ipstat,
+ "IP statistics (struct ipstat, netinet/ip_var.h)");
+
+static VNET_DEFINE(uma_zone_t, ipq_zone);
+static VNET_DEFINE(TAILQ_HEAD(ipqhead, ipq), ipq[IPREASS_NHASH]);
+static struct mtx ipqlock;
+
+#define V_ipq_zone VNET(ipq_zone)
+#define V_ipq VNET(ipq)
+
+#define IPQ_LOCK() mtx_lock(&ipqlock)
+#define IPQ_UNLOCK() mtx_unlock(&ipqlock)
+#define IPQ_LOCK_INIT() mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF)
+#define IPQ_LOCK_ASSERT() mtx_assert(&ipqlock, MA_OWNED)
+
+static void maxnipq_update(void);
+static void ipq_zone_change(void *);
+static void ip_drain_locked(void);
+
+static VNET_DEFINE(int, maxnipq); /* Administrative limit on # reass queues. */
+static VNET_DEFINE(int, nipq); /* Total # of reass queues */
+#define V_maxnipq VNET(maxnipq)
+#define V_nipq VNET(nipq)
+SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD,
+ &VNET_NAME(nipq), 0,
+ "Current number of IPv4 fragment reassembly queue entries");
+
+static VNET_DEFINE(int, maxfragsperpacket);
+#define V_maxfragsperpacket VNET(maxfragsperpacket)
+SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW,
+ &VNET_NAME(maxfragsperpacket), 0,
+ "Maximum number of IPv4 fragments allowed per packet");
+
+struct callout ipport_tick_callout;
+
+#ifdef IPCTL_DEFMTU
+SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
+ &ip_mtu, 0, "Default MTU");
+#endif
+
+#ifdef IPSTEALTH
+VNET_DEFINE(int, ipstealth);
+SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
+ &VNET_NAME(ipstealth), 0,
+ "IP stealth mode, no TTL decrementation on forwarding");
+#endif
+
+#ifdef FLOWTABLE
+static VNET_DEFINE(int, ip_output_flowtable_size) = 2048;
+VNET_DEFINE(struct flowtable *, ip_ft);
+#define V_ip_output_flowtable_size VNET(ip_output_flowtable_size)
+
+SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN,
+ &VNET_NAME(ip_output_flowtable_size), 2048,
+ "number of entries in the per-cpu output flow caches");
+#endif
+
+VNET_DEFINE(int, fw_one_pass) = 1;
+
+static void ip_freef(struct ipqhead *, struct ipq *);
+
+/*
+ * Kernel module interface for updating ipstat. The argument is an index
+ * into ipstat treated as an array of u_long. While this encodes the general
+ * layout of ipstat into the caller, it doesn't encode its location, so that
+ * future changes to add, for example, per-CPU stats support won't cause
+ * binary compatibility problems for kernel modules.
+ */
+void
+kmod_ipstat_inc(int statnum)
+{
+
+ (*((u_long *)&V_ipstat + statnum))++;
+}
+
+void
+kmod_ipstat_dec(int statnum)
+{
+
+ (*((u_long *)&V_ipstat + statnum))--;
+}
+
+static int
+sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS)
+{
+ int error, qlimit;
+
+ netisr_getqlimit(&ip_nh, &qlimit);
+ error = sysctl_handle_int(oidp, &qlimit, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (qlimit < 1)
+ return (EINVAL);
+ return (netisr_setqlimit(&ip_nh, qlimit));
+}
+SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen,
+ CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I",
+ "Maximum size of the IP input queue");
+
+static int
+sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS)
+{
+ u_int64_t qdrops_long;
+ int error, qdrops;
+
+ netisr_getqdrops(&ip_nh, &qdrops_long);
+ qdrops = qdrops_long;
+ error = sysctl_handle_int(oidp, &qdrops, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (qdrops != 0)
+ return (EINVAL);
+ netisr_clearqdrops(&ip_nh);
+ return (0);
+}
+
+SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops,
+ CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I",
+ "Number of packets dropped from the IP input queue");
+
+/*
+ * IP initialization: fill in IP protocol switch table.
+ * All protocols not implemented in kernel go to raw IP protocol handler.
+ */
+void
+ip_init(void)
+{
+ struct protosw *pr;
+ int i;
+
+ V_ip_id = time_second & 0xffff;
+
+ TAILQ_INIT(&V_in_ifaddrhead);
+ V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
+
+ /* Initialize IP reassembly queue. */
+ for (i = 0; i < IPREASS_NHASH; i++)
+ TAILQ_INIT(&V_ipq[i]);
+ V_maxnipq = nmbclusters / 32;
+ V_maxfragsperpacket = 16;
+ V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
+ NULL, UMA_ALIGN_PTR, 0);
+ maxnipq_update();
+
+ /* Initialize packet filter hooks. */
+ V_inet_pfil_hook.ph_type = PFIL_TYPE_AF;
+ V_inet_pfil_hook.ph_af = AF_INET;
+ if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0)
+ printf("%s: WARNING: unable to register pfil hook, "
+ "error %d\n", __func__, i);
+
+#ifdef FLOWTABLE
+ if (TUNABLE_INT_FETCH("net.inet.ip.output_flowtable_size",
+ &V_ip_output_flowtable_size)) {
+ if (V_ip_output_flowtable_size < 256)
+ V_ip_output_flowtable_size = 256;
+ if (!powerof2(V_ip_output_flowtable_size)) {
+ printf("flowtable must be power of 2 size\n");
+ V_ip_output_flowtable_size = 2048;
+ }
+ } else {
+ /*
+ * round up to the next power of 2
+ */
+ V_ip_output_flowtable_size = 1 << fls((1024 + maxusers * 64)-1);
+ }
+ V_ip_ft = flowtable_alloc("ipv4", V_ip_output_flowtable_size, FL_PCPU);
+#endif
+
+ /* Skip initialization of globals for non-default instances. */
+ if (!IS_DEFAULT_VNET(curvnet))
+ return;
+
+ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
+ if (pr == NULL)
+ panic("ip_init: PF_INET not found");
+
+ /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
+ for (i = 0; i < IPPROTO_MAX; i++)
+ ip_protox[i] = pr - inetsw;
+ /*
+ * Cycle through IP protocols and put them into the appropriate place
+ * in ip_protox[].
+ */
+ for (pr = inetdomain.dom_protosw;
+ pr < inetdomain.dom_protoswNPROTOSW; pr++)
+ if (pr->pr_domain->dom_family == PF_INET &&
+ pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
+ /* Be careful to only index valid IP protocols. */
+ if (pr->pr_protocol < IPPROTO_MAX)
+ ip_protox[pr->pr_protocol] = pr - inetsw;
+ }
+
+ /* Start ipport_tick. */
+ callout_init(&ipport_tick_callout, CALLOUT_MPSAFE);
+ callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
+ SHUTDOWN_PRI_DEFAULT);
+ EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change,
+ NULL, EVENTHANDLER_PRI_ANY);
+
+ /* Initialize various other remaining things. */
+ IPQ_LOCK_INIT();
+ netisr_register(&ip_nh);
+}
+
+#ifdef VIMAGE
+void
+ip_destroy(void)
+{
+
+ /* Cleanup in_ifaddr hash table; should be empty. */
+ hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask);
+
+ IPQ_LOCK();
+ ip_drain_locked();
+ IPQ_UNLOCK();
+
+ uma_zdestroy(V_ipq_zone);
+}
+#endif
+
+void
+ip_fini(void *xtp)
+{
+
+ callout_stop(&ipport_tick_callout);
+}
+
+/*
+ * IP input routine. Checksum and byte-swap the header. If fragmented,
+ * try to reassemble. Process options. Pass to the next level.
+ */
+void
+ip_input(struct mbuf *m)
+{
+ struct ip *ip = NULL;
+ struct in_ifaddr *ia = NULL;
+ struct ifaddr *ifa;
+ struct ifnet *ifp;
+ int checkif, hlen = 0;
+ u_short sum;
+ int dchg = 0; /* dest changed after fw */
+ struct in_addr odst; /* original dst address */
+
+ M_ASSERTPKTHDR(m);
+
+ if (m->m_flags & M_FASTFWD_OURS) {
+ /*
+ * Firewall or NAT changed destination to local.
+ * We expect ip_len and ip_off to be in host byte order.
+ */
+ m->m_flags &= ~M_FASTFWD_OURS;
+ /* Set up some basics that will be used later. */
+ ip = mtod(m, struct ip *);
+ hlen = ip->ip_hl << 2;
+ goto ours;
+ }
+
+ IPSTAT_INC(ips_total);
+
+ if (m->m_pkthdr.len < sizeof(struct ip))
+ goto tooshort;
+
+ if (m->m_len < sizeof (struct ip) &&
+ (m = m_pullup(m, sizeof (struct ip))) == NULL) {
+ IPSTAT_INC(ips_toosmall);
+ return;
+ }
+ ip = mtod(m, struct ip *);
+
+ if (ip->ip_v != IPVERSION) {
+ IPSTAT_INC(ips_badvers);
+ goto bad;
+ }
+
+ hlen = ip->ip_hl << 2;
+ if (hlen < sizeof(struct ip)) { /* minimum header length */
+ IPSTAT_INC(ips_badhlen);
+ goto bad;
+ }
+ if (hlen > m->m_len) {
+ if ((m = m_pullup(m, hlen)) == NULL) {
+ IPSTAT_INC(ips_badhlen);
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ }
+
+ /* 127/8 must not appear on wire - RFC1122 */
+ ifp = m->m_pkthdr.rcvif;
+ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
+ (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
+ if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
+ IPSTAT_INC(ips_badaddr);
+ goto bad;
+ }
+ }
+
+ if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
+ sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
+ } else {
+ if (hlen == sizeof(struct ip)) {
+ sum = in_cksum_hdr(ip);
+ } else {
+ sum = in_cksum(m, hlen);
+ }
+ }
+ if (sum) {
+ IPSTAT_INC(ips_badsum);
+ goto bad;
+ }
+
+#ifdef ALTQ
+ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
+ /* packet is dropped by traffic conditioner */
+ return;
+#endif
+
+ /*
+ * Convert fields to host representation.
+ */
+ ip->ip_len = ntohs(ip->ip_len);
+ if (ip->ip_len < hlen) {
+ IPSTAT_INC(ips_badlen);
+ goto bad;
+ }
+ ip->ip_off = ntohs(ip->ip_off);
+
+ /*
+ * Check that the amount of data in the buffers
+ * is at least as much as the IP header would have us expect.
+ * Trim mbufs if longer than we expect.
+ * Drop packet if shorter than we expect.
+ */
+ if (m->m_pkthdr.len < ip->ip_len) {
+tooshort:
+ IPSTAT_INC(ips_tooshort);
+ goto bad;
+ }
+ if (m->m_pkthdr.len > ip->ip_len) {
+ if (m->m_len == m->m_pkthdr.len) {
+ m->m_len = ip->ip_len;
+ m->m_pkthdr.len = ip->ip_len;
+ } else
+ m_adj(m, ip->ip_len - m->m_pkthdr.len);
+ }
+#ifdef IPSEC
+ /*
+ * Bypass packet filtering for packets from a tunnel (gif).
+ */
+ if (ip_ipsec_filtertunnel(m))
+ goto passin;
+#endif /* IPSEC */
+
+ /*
+ * Run through list of hooks for input packets.
+ *
+ * NB: Beware of the destination address changing (e.g.
+ * by NAT rewriting). When this happens, tell
+ * ip_forward to do the right thing.
+ */
+
+ /* Jump over all PFIL processing if hooks are not active. */
+ if (!PFIL_HOOKED(&V_inet_pfil_hook))
+ goto passin;
+
+ odst = ip->ip_dst;
+ if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0)
+ return;
+ if (m == NULL) /* consumed by filter */
+ return;
+
+ ip = mtod(m, struct ip *);
+ dchg = (odst.s_addr != ip->ip_dst.s_addr);
+ ifp = m->m_pkthdr.rcvif;
+
+#ifdef IPFIREWALL_FORWARD
+ if (m->m_flags & M_FASTFWD_OURS) {
+ m->m_flags &= ~M_FASTFWD_OURS;
+ goto ours;
+ }
+ if ((dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL)) != 0) {
+ /*
+ * Directly ship the packet on. This allows forwarding
+ * packets originally destined to us to some other directly
+ * connected host.
+ */
+ ip_forward(m, dchg);
+ return;
+ }
+#endif /* IPFIREWALL_FORWARD */
+
+passin:
+ /*
+ * Process options and, if not destined for us,
+ * ship it on. ip_dooptions returns 1 when an
+ * error was detected (causing an icmp message
+ * to be sent and the original packet to be freed).
+ */
+ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0))
+ return;
+
+ /* Greedy RSVP: snatch any PATH packet of the RSVP protocol, no
+ * matter whether it is destined for another node or is a multicast
+ * one; RSVP wants it and prevents it from being forwarded anywhere
+ * else. Also check that the RSVP daemon is running before grabbing
+ * the packet.
+ */
+ if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP)
+ goto ours;
+
+ /*
+ * Check our list of addresses, to see if the packet is for us.
+ * If we don't have any addresses, assume any unicast packet
+ * we receive might be for us (and let the upper layers deal
+ * with it).
+ */
+ if (TAILQ_EMPTY(&V_in_ifaddrhead) &&
+ (m->m_flags & (M_MCAST|M_BCAST)) == 0)
+ goto ours;
+
+ /*
+ * Enable a consistency check between the destination address
+ * and the arrival interface for a unicast packet (the RFC 1122
+ * strong ES model) if IP forwarding is disabled and the packet
+ * is not locally generated and the packet is not subject to
+ * 'ipfw fwd'.
+ *
+ * XXX - Checking also should be disabled if the destination
+ * address is ipnat'ed to a different interface.
+ *
+ * XXX - Checking is incompatible with IP aliases added
+ * to the loopback interface instead of the interface where
+ * the packets are received.
+ *
+ * XXX - This is the case for carp vhost IPs as well so we
+ * insert a workaround. If the packet got here, we already
+ * checked with carp_iamatch() and carp_forus().
+ */
+ checkif = V_ip_checkinterface && (V_ipforwarding == 0) &&
+ ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) &&
+ ifp->if_carp == NULL && (dchg == 0);
+
+ /*
+ * Check for exact addresses in the hash bucket.
+ */
+ /* IN_IFADDR_RLOCK(); */
+ LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
+ /*
+ * If the address matches, verify that the packet
+ * arrived via the correct interface if checking is
+ * enabled.
+ */
+ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr &&
+ (!checkif || ia->ia_ifp == ifp)) {
+ ifa_ref(&ia->ia_ifa);
+ /* IN_IFADDR_RUNLOCK(); */
+ goto ours;
+ }
+ }
+ /* IN_IFADDR_RUNLOCK(); */
+
+ /*
+ * Check for broadcast addresses.
+ *
+ * Only accept broadcast packets that arrive via the matching
+ * interface. Reception of forwarded directed broadcasts would
+ * be handled via ip_forward() and ether_output() with the loopback
+ * into the stack for SIMPLEX interfaces handled by ether_output().
+ */
+ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ ia = ifatoia(ifa);
+ if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
+ ip->ip_dst.s_addr) {
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(ifp);
+ goto ours;
+ }
+ if (ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) {
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(ifp);
+ goto ours;
+ }
+#ifdef BOOTP_COMPAT
+ if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(ifp);
+ goto ours;
+ }
+#endif
+ }
+ IF_ADDR_UNLOCK(ifp);
+ ia = NULL;
+ }
+ /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */
+ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
+ IPSTAT_INC(ips_cantforward);
+ m_freem(m);
+ return;
+ }
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
+ if (V_ip_mrouter) {
+ /*
+ * If we are acting as a multicast router, all
+ * incoming multicast packets are passed to the
+ * kernel-level multicast forwarding function.
+ * The packet is returned (relatively) intact; if
+ * ip_mforward() returns a non-zero value, the packet
+ * must be discarded, else it may be accepted below.
+ */
+ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) {
+ IPSTAT_INC(ips_cantforward);
+ m_freem(m);
+ return;
+ }
+
+ /*
+ * The process-level routing daemon needs to receive
+ * all multicast IGMP packets, whether or not this
+ * host belongs to their destination groups.
+ */
+ if (ip->ip_p == IPPROTO_IGMP)
+ goto ours;
+ IPSTAT_INC(ips_forward);
+ }
+ /*
+ * Assume the packet is for us, to avoid prematurely taking
+ * a lock on the in_multi hash. Protocols must perform
+ * their own filtering and update statistics accordingly.
+ */
+ goto ours;
+ }
+ if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
+ goto ours;
+ if (ip->ip_dst.s_addr == INADDR_ANY)
+ goto ours;
+
+ /*
+ * FAITH (Firewall Aided Internet Translator)
+ */
+ if (ifp && ifp->if_type == IFT_FAITH) {
+ if (V_ip_keepfaith) {
+ if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP)
+ goto ours;
+ }
+ m_freem(m);
+ return;
+ }
+
+ /*
+ * Not for us; forward if possible and desirable.
+ */
+ if (V_ipforwarding == 0) {
+ IPSTAT_INC(ips_cantforward);
+ m_freem(m);
+ } else {
+#ifdef IPSEC
+ if (ip_ipsec_fwd(m))
+ goto bad;
+#endif /* IPSEC */
+ ip_forward(m, dchg);
+ }
+ return;
+
+ours:
+#ifdef IPSTEALTH
+ /*
+ * IPSTEALTH: Process non-routing options only
+ * if the packet is destined for us.
+ */
+ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) {
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
+ return;
+ }
+#endif /* IPSTEALTH */
+
+ /* Count the packet in the ip address stats */
+ if (ia != NULL) {
+ ia->ia_ifa.if_ipackets++;
+ ia->ia_ifa.if_ibytes += m->m_pkthdr.len;
+ ifa_free(&ia->ia_ifa);
+ }
+
+ /*
+ * Attempt reassembly; if it succeeds, proceed.
+ * ip_reass() will return a different mbuf.
+ */
+ if (ip->ip_off & (IP_MF | IP_OFFMASK)) {
+ m = ip_reass(m);
+ if (m == NULL)
+ return;
+ ip = mtod(m, struct ip *);
+ /* Get the header length of the reassembled packet */
+ hlen = ip->ip_hl << 2;
+ }
+
+ /*
+ * Further protocols expect the packet length to be w/o the
+ * IP header.
+ */
+ ip->ip_len -= hlen;
+
+#ifdef IPSEC
+ /*
+ * Enforce IPsec policy checking if we are seeing the last header.
+ * Note that we do not visit this with protocols that have their own
+ * pcb-layer code, like udp/tcp/raw ip.
+ */
+ if (ip_ipsec_input(m))
+ goto bad;
+#endif /* IPSEC */
+
+ /*
+ * Switch out to protocol's input routine.
+ */
+ IPSTAT_INC(ips_delivered);
+
+ (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen);
+ return;
+bad:
+ m_freem(m);
+}
+
+/*
+ * After maxnipq has been updated, propagate the change to UMA. The UMA zone
+ * max has slightly different semantics than the sysctl, for historical
+ * reasons.
+ */
+static void
+maxnipq_update(void)
+{
+
+ /*
+ * -1 for unlimited allocation.
+ */
+ if (V_maxnipq < 0)
+ uma_zone_set_max(V_ipq_zone, 0);
+ /*
+ * Positive number for specific bound.
+ */
+ if (V_maxnipq > 0)
+ uma_zone_set_max(V_ipq_zone, V_maxnipq);
+ /*
+ * Zero specifies no further fragment queue allocation -- set the
+ * bound very low, but rely on implementation elsewhere to actually
+ * prevent allocation and reclaim current queues.
+ */
+ if (V_maxnipq == 0)
+ uma_zone_set_max(V_ipq_zone, 1);
+}
+
+static void
+ipq_zone_change(void *tag)
+{
+
+ if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) {
+ V_maxnipq = nmbclusters / 32;
+ maxnipq_update();
+ }
+}
+
+static int
+sysctl_maxnipq(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+
+ i = V_maxnipq;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ /*
+ * XXXRW: Might be a good idea to sanity check the argument and place
+ * an extreme upper bound.
+ */
+ if (i < -1)
+ return (EINVAL);
+ V_maxnipq = i;
+ maxnipq_update();
+ return (0);
+}
+
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW,
+ NULL, 0, sysctl_maxnipq, "I",
+ "Maximum number of IPv4 fragment reassembly queue entries");
+
+/*
+ * Take an incoming datagram fragment and try to reassemble it into a
+ * whole datagram. If the argument is the first fragment or one in
+ * between, the function will return NULL and store the mbuf in the
+ * fragment chain. If the argument is the last fragment,
+ * the packet will be reassembled and the pointer to the new
+ * mbuf returned for further processing. Only m_tags attached
+ * to the first packet/fragment are preserved.
+ * The IP header is *NOT* adjusted out of iplen.
+ */
+struct mbuf *
+ip_reass(struct mbuf *m)
+{
+ struct ip *ip;
+ struct mbuf *p, *q, *nq, *t;
+ struct ipq *fp = NULL;
+ struct ipqhead *head;
+ int i, hlen, next;
+ u_int8_t ecn, ecn0;
+ u_short hash;
+
+ /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */
+ if (V_maxnipq == 0 || V_maxfragsperpacket == 0) {
+ IPSTAT_INC(ips_fragments);
+ IPSTAT_INC(ips_fragdropped);
+ m_freem(m);
+ return (NULL);
+ }
+
+ ip = mtod(m, struct ip *);
+ hlen = ip->ip_hl << 2;
+
+ hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
+ head = &V_ipq[hash];
+ IPQ_LOCK();
+
+ /*
+ * Look for queue of fragments
+ * of this datagram.
+ */
+ TAILQ_FOREACH(fp, head, ipq_list)
+ if (ip->ip_id == fp->ipq_id &&
+ ip->ip_src.s_addr == fp->ipq_src.s_addr &&
+ ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
+#ifdef MAC
+ mac_ipq_match(m, fp) &&
+#endif
+ ip->ip_p == fp->ipq_p)
+ goto found;
+
+ fp = NULL;
+
+ /*
+ * Attempt to trim the number of allocated fragment queues if it
+ * exceeds the administrative limit.
+ */
+ if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) {
+ /*
+ * drop something from the tail of the current queue
+ * before proceeding further
+ */
+ struct ipq *q = TAILQ_LAST(head, ipqhead);
+ if (q == NULL) { /* gak */
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead);
+ if (r) {
+ IPSTAT_ADD(ips_fragtimeout,
+ r->ipq_nfrags);
+ ip_freef(&V_ipq[i], r);
+ break;
+ }
+ }
+ } else {
+ IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags);
+ ip_freef(head, q);
+ }
+ }
+
+found:
+ /*
+ * Adjust ip_len to not reflect header,
+ * convert offset of this to bytes.
+ */
+ ip->ip_len -= hlen;
+ if (ip->ip_off & IP_MF) {
+ /*
+ * Make sure that fragments have a data length
+ * that's a non-zero multiple of 8 bytes.
+ */
+ if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
+ IPSTAT_INC(ips_toosmall); /* XXX */
+ goto dropfrag;
+ }
+ m->m_flags |= M_FRAG;
+ } else
+ m->m_flags &= ~M_FRAG;
+ ip->ip_off <<= 3;
+
+
+ /*
+ * Attempt reassembly; if it succeeds, proceed.
+ * ip_reass() will return a different mbuf.
+ */
+ IPSTAT_INC(ips_fragments);
+ m->m_pkthdr.header = ip;
+
+ /* Previous ip_reass() started here. */
+ /*
+ * Presence of header sizes in mbufs
+ * would confuse code below.
+ */
+ m->m_data += hlen;
+ m->m_len -= hlen;
+
+ /*
+ * If first fragment to arrive, create a reassembly queue.
+ */
+ if (fp == NULL) {
+ fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
+ if (fp == NULL)
+ goto dropfrag;
+#ifdef MAC
+ if (mac_ipq_init(fp, M_NOWAIT) != 0) {
+ uma_zfree(V_ipq_zone, fp);
+ fp = NULL;
+ goto dropfrag;
+ }
+ mac_ipq_create(m, fp);
+#endif
+ TAILQ_INSERT_HEAD(head, fp, ipq_list);
+ V_nipq++;
+ fp->ipq_nfrags = 1;
+ fp->ipq_ttl = IPFRAGTTL;
+ fp->ipq_p = ip->ip_p;
+ fp->ipq_id = ip->ip_id;
+ fp->ipq_src = ip->ip_src;
+ fp->ipq_dst = ip->ip_dst;
+ fp->ipq_frags = m;
+ m->m_nextpkt = NULL;
+ goto done;
+ } else {
+ fp->ipq_nfrags++;
+#ifdef MAC
+ mac_ipq_update(m, fp);
+#endif
+ }
+
+#define GETIP(m) ((struct ip*)((m)->m_pkthdr.header))
+
+ /*
+ * Handle ECN by comparing this segment with the first one;
+ * if CE is set, do not lose CE.
+ * drop if CE and not-ECT are mixed for the same packet.
+ */
+ ecn = ip->ip_tos & IPTOS_ECN_MASK;
+ ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
+ if (ecn == IPTOS_ECN_CE) {
+ if (ecn0 == IPTOS_ECN_NOTECT)
+ goto dropfrag;
+ if (ecn0 != IPTOS_ECN_CE)
+ GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
+ }
+ if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
+ goto dropfrag;
+
+ /*
+ * Find a segment which begins after this one does.
+ */
+ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
+ if (GETIP(q)->ip_off > ip->ip_off)
+ break;
+
+ /*
+ * If there is a preceding segment, it may provide some of
+ * our data already. If so, drop the data from the incoming
+ * segment. If it provides all of our data, drop us, otherwise
+ * stick new segment in the proper place.
+ *
+ * If some of the data is dropped from the preceding
+ * segment, then its checksum is invalidated.
+ */
+ if (p) {
+ i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
+ if (i > 0) {
+ if (i >= ip->ip_len)
+ goto dropfrag;
+ m_adj(m, i);
+ m->m_pkthdr.csum_flags = 0;
+ ip->ip_off += i;
+ ip->ip_len -= i;
+ }
+ m->m_nextpkt = p->m_nextpkt;
+ p->m_nextpkt = m;
+ } else {
+ m->m_nextpkt = fp->ipq_frags;
+ fp->ipq_frags = m;
+ }
+
+ /*
+ * While we overlap succeeding segments trim them or,
+ * if they are completely covered, dequeue them.
+ */
+ for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
+ q = nq) {
+ i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off;
+ if (i < GETIP(q)->ip_len) {
+ GETIP(q)->ip_len -= i;
+ GETIP(q)->ip_off += i;
+ m_adj(q, i);
+ q->m_pkthdr.csum_flags = 0;
+ break;
+ }
+ nq = q->m_nextpkt;
+ m->m_nextpkt = nq;
+ IPSTAT_INC(ips_fragdropped);
+ fp->ipq_nfrags--;
+ m_freem(q);
+ }
+
+ /*
+ * Check for complete reassembly and perform frag per packet
+ * limiting.
+ *
+ * Frag limiting is performed here so that the nth frag has
+ * a chance to complete the packet before we drop the packet.
+ * As a result, n+1 frags are actually allowed per packet, but
+ * only n will ever be stored. (n = maxfragsperpacket.)
+ *
+ */
+ next = 0;
+ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
+ if (GETIP(q)->ip_off != next) {
+ if (fp->ipq_nfrags > V_maxfragsperpacket) {
+ IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
+ ip_freef(head, fp);
+ }
+ goto done;
+ }
+ next += GETIP(q)->ip_len;
+ }
+ /* Make sure the last packet didn't have the IP_MF flag */
+ if (p->m_flags & M_FRAG) {
+ if (fp->ipq_nfrags > V_maxfragsperpacket) {
+ IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
+ ip_freef(head, fp);
+ }
+ goto done;
+ }
+
+ /*
+ * Reassembly is complete. Make sure the packet is a sane size.
+ */
+ q = fp->ipq_frags;
+ ip = GETIP(q);
+ if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
+ IPSTAT_INC(ips_toolong);
+ IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
+ ip_freef(head, fp);
+ goto done;
+ }
+
+ /*
+ * Concatenate fragments.
+ */
+ m = q;
+ t = m->m_next;
+ m->m_next = NULL;
+ m_cat(m, t);
+ nq = q->m_nextpkt;
+ q->m_nextpkt = NULL;
+ for (q = nq; q != NULL; q = nq) {
+ nq = q->m_nextpkt;
+ q->m_nextpkt = NULL;
+ m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
+ m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
+ m_cat(m, q);
+ }
+ /*
+ * In order to do checksumming faster we do 'end-around carry' here
+ * (and not in for{} loop), though it implies we are not going to
+ * reassemble more than 64k fragments.
+ */
+ m->m_pkthdr.csum_data =
+ (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16);
+#ifdef MAC
+ mac_ipq_reassemble(fp, m);
+ mac_ipq_destroy(fp);
+#endif
+
+ /*
+ * Create header for new ip packet by modifying header of first
+ * packet; dequeue and discard fragment reassembly header.
+ * Make header visible.
+ */
+ ip->ip_len = (ip->ip_hl << 2) + next;
+ ip->ip_src = fp->ipq_src;
+ ip->ip_dst = fp->ipq_dst;
+ TAILQ_REMOVE(head, fp, ipq_list);
+ V_nipq--;
+ uma_zfree(V_ipq_zone, fp);
+ m->m_len += (ip->ip_hl << 2);
+ m->m_data -= (ip->ip_hl << 2);
+ /* some debugging cruft by sklower, below, will go away soon */
+ if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */
+ m_fixhdr(m);
+ IPSTAT_INC(ips_reassembled);
+ IPQ_UNLOCK();
+ return (m);
+
+dropfrag:
+ IPSTAT_INC(ips_fragdropped);
+ if (fp != NULL)
+ fp->ipq_nfrags--;
+ m_freem(m);
+done:
+ IPQ_UNLOCK();
+ return (NULL);
+
+#undef GETIP
+}
+
+/*
+ * Free a fragment reassembly header and all
+ * associated datagrams.
+ */
+static void
+ip_freef(struct ipqhead *fhp, struct ipq *fp)
+{
+ struct mbuf *q;
+
+ IPQ_LOCK_ASSERT();
+
+ while (fp->ipq_frags) {
+ q = fp->ipq_frags;
+ fp->ipq_frags = q->m_nextpkt;
+ m_freem(q);
+ }
+ TAILQ_REMOVE(fhp, fp, ipq_list);
+ uma_zfree(V_ipq_zone, fp);
+ V_nipq--;
+}
+
+/*
+ * IP timer processing;
+ * if a timer expires on a reassembly
+ * queue, discard it.
+ */
+void
+ip_slowtimo(void)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+ struct ipq *fp;
+ int i;
+
+ VNET_LIST_RLOCK_NOSLEEP();
+ IPQ_LOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) {
+ struct ipq *fpp;
+
+ fpp = fp;
+ fp = TAILQ_NEXT(fp, ipq_list);
+ if(--fpp->ipq_ttl == 0) {
+ IPSTAT_ADD(ips_fragtimeout,
+ fpp->ipq_nfrags);
+ ip_freef(&V_ipq[i], fpp);
+ }
+ }
+ }
+ /*
+ * If we are over the maximum number of fragments
+ * (due to the limit being lowered), drain off
+ * enough to get down to the new limit.
+ */
+ if (V_maxnipq >= 0 && V_nipq > V_maxnipq) {
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ while (V_nipq > V_maxnipq &&
+ !TAILQ_EMPTY(&V_ipq[i])) {
+ IPSTAT_ADD(ips_fragdropped,
+ TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
+ ip_freef(&V_ipq[i],
+ TAILQ_FIRST(&V_ipq[i]));
+ }
+ }
+ }
+ CURVNET_RESTORE();
+ }
+ IPQ_UNLOCK();
+ VNET_LIST_RUNLOCK_NOSLEEP();
+}
+
+/*
+ * Drain off all datagram fragments.
+ */
+static void
+ip_drain_locked(void)
+{
+ int i;
+
+ IPQ_LOCK_ASSERT();
+
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ while(!TAILQ_EMPTY(&V_ipq[i])) {
+ IPSTAT_ADD(ips_fragdropped,
+ TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
+ ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i]));
+ }
+ }
+}
+
+void
+ip_drain(void)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ VNET_LIST_RLOCK_NOSLEEP();
+ IPQ_LOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ ip_drain_locked();
+ CURVNET_RESTORE();
+ }
+ IPQ_UNLOCK();
+ VNET_LIST_RUNLOCK_NOSLEEP();
+ in_rtqdrain();
+}
+
+/*
+ * The protocol to be inserted into ip_protox[] must be already registered
+ * in inetsw[], either statically or through pf_proto_register().
+ */
+int
+ipproto_register(short ipproto)
+{
+ struct protosw *pr;
+
+ /* Sanity checks. */
+ if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
+ return (EPROTONOSUPPORT);
+
+ /*
+ * The protocol slot must not be occupied by another protocol
+ * already. An index pointing to IPPROTO_RAW is unused.
+ */
+ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
+ if (pr == NULL)
+ return (EPFNOSUPPORT);
+ if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */
+ return (EEXIST);
+
+ /* Find the protocol position in inetsw[] and set the index. */
+ for (pr = inetdomain.dom_protosw;
+ pr < inetdomain.dom_protoswNPROTOSW; pr++) {
+ if (pr->pr_domain->dom_family == PF_INET &&
+ pr->pr_protocol && pr->pr_protocol == ipproto) {
+ ip_protox[pr->pr_protocol] = pr - inetsw;
+ return (0);
+ }
+ }
+ return (EPROTONOSUPPORT);
+}
+
+int
+ipproto_unregister(short ipproto)
+{
+ struct protosw *pr;
+
+ /* Sanity checks. */
+ if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
+ return (EPROTONOSUPPORT);
+
+ /* Check if the protocol was indeed registered. */
+ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
+ if (pr == NULL)
+ return (EPFNOSUPPORT);
+ if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */
+ return (ENOENT);
+
+ /* Reset the protocol slot to IPPROTO_RAW. */
+ ip_protox[ipproto] = pr - inetsw;
+ return (0);
+}
+
+/*
+ * Given address of next destination (final or next hop), return (referenced)
+ * internet address info of interface to be used to get there.
+ */
+struct in_ifaddr *
+ip_rtaddr(struct in_addr dst, u_int fibnum)
+{
+ struct route sro;
+ struct sockaddr_in *sin;
+ struct in_ifaddr *ia;
+
+ bzero(&sro, sizeof(sro));
+ sin = (struct sockaddr_in *)&sro.ro_dst;
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = dst;
+ in_rtalloc_ign(&sro, 0, fibnum);
+
+ if (sro.ro_rt == NULL)
+ return (NULL);
+
+ ia = ifatoia(sro.ro_rt->rt_ifa);
+ ifa_ref(&ia->ia_ifa);
+ RTFREE(sro.ro_rt);
+ return (ia);
+}
+
+u_char inetctlerrmap[PRC_NCMDS] = {
+ 0, 0, 0, 0,
+ 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
+ EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
+ EMSGSIZE, EHOSTUNREACH, 0, 0,
+ 0, 0, EHOSTUNREACH, 0,
+ ENOPROTOOPT, ECONNREFUSED
+};
+
+/*
+ * Forward a packet. If some error occurs return the sender
+ * an icmp packet. Note we can't always generate a meaningful
+ * icmp message because icmp doesn't have a large enough repertoire
+ * of codes and types.
+ *
+ * If not forwarding, just drop the packet. This could be confusing
+ * if ipforwarding was zero but some routing protocol was advancing
+ * us as a gateway to somewhere. However, we must let the routing
+ * protocol deal with that.
+ *
+ * The srcrt parameter indicates whether the packet is being forwarded
+ * via a source route.
+ */
+void
+ip_forward(struct mbuf *m, int srcrt)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ struct in_ifaddr *ia;
+ struct mbuf *mcopy;
+ struct in_addr dest;
+ struct route ro;
+ int error, type = 0, code = 0, mtu = 0;
+
+ if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
+ IPSTAT_INC(ips_cantforward);
+ m_freem(m);
+ return;
+ }
+#ifdef IPSTEALTH
+ if (!V_ipstealth) {
+#endif
+ if (ip->ip_ttl <= IPTTLDEC) {
+ icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
+ 0, 0);
+ return;
+ }
+#ifdef IPSTEALTH
+ }
+#endif
+
+ ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
+#ifndef IPSEC
+ /*
+ * 'ia' may be NULL if there is no route for this destination.
+ * In case of IPsec, don't discard it just yet, but pass it to
+ * ip_output in case of outgoing IPsec policy.
+ */
+ if (!srcrt && ia == NULL) {
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+ return;
+ }
+#endif
+
+ /*
+ * Save the IP header and at most 8 bytes of the payload,
+ * in case we need to generate an ICMP message to the src.
+ *
+ * XXX this can be optimized a lot by saving the data in a local
+ * buffer on the stack (72 bytes at most), and only allocating the
+ * mbuf if really necessary. The vast majority of the packets
+ * are forwarded without having to send an ICMP back (either
+ * because unnecessary, or because rate limited), so we are
+ * really wasting a lot of work here.
+ *
+ * We don't use m_copy() because it might return a reference
+ * to a shared cluster. Both this function and ip_output()
+ * assume exclusive access to the IP header in `m', so any
+ * data in a cluster may change before we reach icmp_error().
+ */
+ MGETHDR(mcopy, M_DONTWAIT, m->m_type);
+ if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_DONTWAIT)) {
+ /*
+ * It's probably ok if the pkthdr dup fails (because
+ * the deep copy of the tag chain failed), but for now
+ * be conservative and just discard the copy since
+ * code below may some day want the tags.
+ */
+ m_free(mcopy);
+ mcopy = NULL;
+ }
+ if (mcopy != NULL) {
+ mcopy->m_len = min(ip->ip_len, M_TRAILINGSPACE(mcopy));
+ mcopy->m_pkthdr.len = mcopy->m_len;
+ m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
+ }
+
+#ifdef IPSTEALTH
+ if (!V_ipstealth) {
+#endif
+ ip->ip_ttl -= IPTTLDEC;
+#ifdef IPSTEALTH
+ }
+#endif
+
+ /*
+ * If forwarding packet using same interface that it came in on,
+ * perhaps should send a redirect to sender to shortcut a hop.
+ * Only send redirect if source is sending directly to us,
+ * and if packet was not source routed (or has any options).
+ * Also, don't send redirect if forwarding using a default route
+ * or a route modified by a redirect.
+ */
+ dest.s_addr = 0;
+ if (!srcrt && V_ipsendredirects &&
+ ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) {
+ struct sockaddr_in *sin;
+ struct rtentry *rt;
+
+ bzero(&ro, sizeof(ro));
+ sin = (struct sockaddr_in *)&ro.ro_dst;
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = ip->ip_dst;
+ in_rtalloc_ign(&ro, 0, M_GETFIB(m));
+
+ rt = ro.ro_rt;
+
+ if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
+ satosin(rt_key(rt))->sin_addr.s_addr != 0) {
+#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa))
+ u_long src = ntohl(ip->ip_src.s_addr);
+
+ if (RTA(rt) &&
+ (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
+ if (rt->rt_flags & RTF_GATEWAY)
+ dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr;
+ else
+ dest.s_addr = ip->ip_dst.s_addr;
+ /* Router requirements say to send only host redirects */
+ type = ICMP_REDIRECT;
+ code = ICMP_REDIRECT_HOST;
+ }
+ }
+ if (rt)
+ RTFREE(rt);
+ }
+
+ /*
+ * Try to cache the route MTU from ip_output so we can consider it for
+ * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191.
+ */
+ bzero(&ro, sizeof(ro));
+
+ error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
+
+ if (error == EMSGSIZE && ro.ro_rt)
+ mtu = ro.ro_rt->rt_rmx.rmx_mtu;
+ if (ro.ro_rt)
+ RTFREE(ro.ro_rt);
+
+ if (error)
+ IPSTAT_INC(ips_cantforward);
+ else {
+ IPSTAT_INC(ips_forward);
+ if (type)
+ IPSTAT_INC(ips_redirectsent);
+ else {
+ if (mcopy)
+ m_freem(mcopy);
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
+ return;
+ }
+ }
+ if (mcopy == NULL) {
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
+ return;
+ }
+
+ switch (error) {
+
+ case 0: /* forwarded, but need redirect */
+ /* type, code set above */
+ break;
+
+ case ENETUNREACH:
+ case EHOSTUNREACH:
+ case ENETDOWN:
+ case EHOSTDOWN:
+ default:
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_HOST;
+ break;
+
+ case EMSGSIZE:
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_NEEDFRAG;
+
+#ifdef IPSEC
+ /*
+ * If IPsec is configured for this path,
+ * override any MTU value possibly set by ip_output.
+ */
+ mtu = ip_ipsec_mtu(mcopy, mtu);
+#endif /* IPSEC */
+ /*
+ * If the MTU was set before make sure we are below the
+ * interface MTU.
+ * If the MTU wasn't set before use the interface mtu or
+ * fall back to the next smaller mtu step compared to the
+ * current packet size.
+ */
+ if (mtu != 0) {
+ if (ia != NULL)
+ mtu = min(mtu, ia->ia_ifp->if_mtu);
+ } else {
+ if (ia != NULL)
+ mtu = ia->ia_ifp->if_mtu;
+ else
+ mtu = ip_next_mtu(ip->ip_len, 0);
+ }
+ IPSTAT_INC(ips_cantfrag);
+ break;
+
+ case ENOBUFS:
+ /*
+ * A router should not generate ICMP_SOURCEQUENCH as
+ * required in RFC1812 Requirements for IP Version 4 Routers.
+ * Source quench could be a big problem under DoS attacks,
+ * or if the underlying interface is rate-limited.
+ * Those who need source quench packets may re-enable them
+ * via the net.inet.ip.sendsourcequench sysctl.
+ */
+ if (V_ip_sendsourcequench == 0) {
+ m_freem(mcopy);
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
+ return;
+ } else {
+ type = ICMP_SOURCEQUENCH;
+ code = 0;
+ }
+ break;
+
+ case EACCES: /* ipfw denied packet */
+ m_freem(mcopy);
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
+ return;
+ }
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
+ icmp_error(mcopy, type, code, dest.s_addr, mtu);
+}
+
+void
+ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
+ struct mbuf *m)
+{
+
+ if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) {
+ struct bintime bt;
+
+ bintime(&bt);
+ if (inp->inp_socket->so_options & SO_BINTIME) {
+ *mp = sbcreatecontrol((caddr_t) &bt, sizeof(bt),
+ SCM_BINTIME, SOL_SOCKET);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+ if (inp->inp_socket->so_options & SO_TIMESTAMP) {
+ struct timeval tv;
+
+ bintime2timeval(&bt, &tv);
+ *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
+ SCM_TIMESTAMP, SOL_SOCKET);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+ }
+ if (inp->inp_flags & INP_RECVDSTADDR) {
+ *mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
+ sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+ if (inp->inp_flags & INP_RECVTTL) {
+ *mp = sbcreatecontrol((caddr_t) &ip->ip_ttl,
+ sizeof(u_char), IP_RECVTTL, IPPROTO_IP);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+#ifdef notyet
+ /* XXX
+ * Moving these out of udp_input() made them even more broken
+ * than they already were.
+ */
+ /* options were tossed already */
+ if (inp->inp_flags & INP_RECVOPTS) {
+ *mp = sbcreatecontrol((caddr_t) opts_deleted_above,
+ sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+ /* ip_srcroute doesn't do what we want here, need to fix */
+ if (inp->inp_flags & INP_RECVRETOPTS) {
+ *mp = sbcreatecontrol((caddr_t) ip_srcroute(m),
+ sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+#endif
+ if (inp->inp_flags & INP_RECVIF) {
+ struct ifnet *ifp;
+ struct sdlbuf {
+ struct sockaddr_dl sdl;
+ u_char pad[32];
+ } sdlbuf;
+ struct sockaddr_dl *sdp;
+ struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
+
+ if (((ifp = m->m_pkthdr.rcvif))
+ && ( ifp->if_index && (ifp->if_index <= V_if_index))) {
+ sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
+ /*
+ * Change our mind and don't try to copy.
+ */
+ if ((sdp->sdl_family != AF_LINK)
+ || (sdp->sdl_len > sizeof(sdlbuf))) {
+ goto makedummy;
+ }
+ bcopy(sdp, sdl2, sdp->sdl_len);
+ } else {
+makedummy:
+ sdl2->sdl_len
+ = offsetof(struct sockaddr_dl, sdl_data[0]);
+ sdl2->sdl_family = AF_LINK;
+ sdl2->sdl_index = 0;
+ sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
+ }
+ *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len,
+ IP_RECVIF, IPPROTO_IP);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+}
+
+/*
+ * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the
+ * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on
+ * locking. This code remains in ip_input.c as ip_mroute.c is optionally
+ * compiled.
+ */
+static VNET_DEFINE(int, ip_rsvp_on);
+VNET_DEFINE(struct socket *, ip_rsvpd);
+
+#define V_ip_rsvp_on VNET(ip_rsvp_on)
+
+int
+ip_rsvp_init(struct socket *so)
+{
+
+ if (so->so_type != SOCK_RAW ||
+ so->so_proto->pr_protocol != IPPROTO_RSVP)
+ return EOPNOTSUPP;
+
+ if (V_ip_rsvpd != NULL)
+ return EADDRINUSE;
+
+ V_ip_rsvpd = so;
+ /*
+ * This may seem silly, but we need to be sure we don't over-increment
+ * the RSVP counter, in case something slips up.
+ */
+ if (!V_ip_rsvp_on) {
+ V_ip_rsvp_on = 1;
+ V_rsvp_on++;
+ }
+
+ return 0;
+}
+
+int
+ip_rsvp_done(void)
+{
+
+ V_ip_rsvpd = NULL;
+ /*
+ * This may seem silly, but we need to be sure we don't over-decrement
+ * the RSVP counter, in case something slips up.
+ */
+ if (V_ip_rsvp_on) {
+ V_ip_rsvp_on = 0;
+ V_rsvp_on--;
+ }
+ return 0;
+}
+
+void
+rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */
+{
+
+ if (rsvp_input_p) { /* call the real one if loaded */
+ rsvp_input_p(m, off);
+ return;
+ }
+
+ /* Can still get packets with rsvp_on = 0 if there is a local member
+ * of the group to which the RSVP packet is addressed. But in this
+ * case we want to throw the packet away.
+ */
+
+ if (!V_rsvp_on) {
+ m_freem(m);
+ return;
+ }
+
+ if (V_ip_rsvpd != NULL) {
+ rip_input(m, off);
+ return;
+ }
+ /* Drop the packet */
+ m_freem(m);
+}
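
Editorial aside (not part of the diffed FreeBSD sources): a detail of ip_reass() above that is easy to miss is the single "end-around carry" fold applied to m_pkthdr.csum_data after the per-fragment hardware checksums have been summed; the code comments note that doing the fold only once relies on never combining more than 64k worth of fragments. The hedged userland sketch below (csum_fold() and the sample inputs are invented for illustration) shows the general two-pass form of that fold for 16-bit one's-complement sums.

#include <stdint.h>
#include <stdio.h>

/*
 * Fold a 32-bit accumulation of 16-bit one's-complement words back into
 * 16 bits.  ip_reass() performs the first line once; a second pass
 * catches any carry produced by the first.
 */
static uint16_t
csum_fold(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* end-around carry */
	sum = (sum & 0xffff) + (sum >> 16);	/* fold the final carry, if any */
	return ((uint16_t)sum);
}

int
main(void)
{
	/* Pretend three fragments contributed these partial sums. */
	uint32_t acc = 0xfff0u + 0xffeeu + 0x1234u;

	printf("folded: 0x%04x\n", csum_fold(acc));
	return (0);
}
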
diff --git a/freebsd/sys/netinet/ip_ipsec.c b/freebsd/sys/netinet/ip_ipsec.c
new file mode 100644
index 00000000..f19d5e0e
--- /dev/null
+++ b/freebsd/sys/netinet/ip_ipsec.c
@@ -0,0 +1,424 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_ipsec.h>
+#include <freebsd/local/opt_sctp.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/errno.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sysctl.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_options.h>
+#include <freebsd/netinet/ip_ipsec.h>
+#ifdef SCTP
+#include <freebsd/netinet/sctp_crc32.h>
+#endif
+
+#include <freebsd/machine/in_cksum.h>
+
+#ifdef IPSEC
+#include <freebsd/netipsec/ipsec.h>
+#include <freebsd/netipsec/xform.h>
+#include <freebsd/netipsec/key.h>
+#endif /*IPSEC*/
+
+extern struct protosw inetsw[];
+
+#ifdef IPSEC
+#ifdef IPSEC_FILTERTUNNEL
+static VNET_DEFINE(int, ip4_ipsec_filtertunnel) = 1;
+#else
+static VNET_DEFINE(int, ip4_ipsec_filtertunnel) = 0;
+#endif
+#define V_ip4_ipsec_filtertunnel VNET(ip4_ipsec_filtertunnel)
+
+SYSCTL_DECL(_net_inet_ipsec);
+SYSCTL_VNET_INT(_net_inet_ipsec, OID_AUTO, filtertunnel,
+ CTLFLAG_RW, &VNET_NAME(ip4_ipsec_filtertunnel), 0,
+ "If set filter packets from an IPsec tunnel.");
+#endif /* IPSEC */
+
+/*
+ * Check if we have to jump over firewall processing for this packet.
+ * Called from ip_input().
+ * 1 = jump over firewall, 0 = packet goes through firewall.
+ */
+int
+ip_ipsec_filtertunnel(struct mbuf *m)
+{
+#if defined(IPSEC)
+
+ /*
+ * Bypass packet filtering for packets from a tunnel.
+ */
+ if (!V_ip4_ipsec_filtertunnel &&
+ m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL)
+ return 1;
+#endif
+ return 0;
+}
+
+/*
+ * Check if this packet has an active SA and needs to be dropped instead
+ * of forwarded.
+ * Called from ip_input().
+ * 1 = drop packet, 0 = forward packet.
+ */
+int
+ip_ipsec_fwd(struct mbuf *m)
+{
+#ifdef IPSEC
+ struct m_tag *mtag;
+ struct tdb_ident *tdbi;
+ struct secpolicy *sp;
+ int s, error;
+
+ mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
+ s = splnet();
+ if (mtag != NULL) {
+ tdbi = (struct tdb_ident *)(mtag + 1);
+ sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
+ } else {
+ sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
+ IP_FORWARDING, &error);
+ }
+ if (sp == NULL) { /* NB: can happen if error */
+ splx(s);
+ /*XXX error stat???*/
+ DPRINTF(("ip_input: no SP for forwarding\n")); /*XXX*/
+ return 1;
+ }
+
+ /*
+ * Check security policy against packet attributes.
+ */
+ error = ipsec_in_reject(sp, m);
+ KEY_FREESP(&sp);
+ splx(s);
+ if (error) {
+ IPSTAT_INC(ips_cantforward);
+ return 1;
+ }
+#endif /* IPSEC */
+ return 0;
+}
+
+/*
+ * Check if protocol type doesn't have a further header and do IPSEC
+ * decryption or reject right now. Protocols with further headers get
+ * their IPSEC treatment within the protocol-specific processing.
+ * Called from ip_input().
+ * 1 = drop packet, 0 = continue processing packet.
+ */
+int
+ip_ipsec_input(struct mbuf *m)
+{
+#ifdef IPSEC
+ struct ip *ip = mtod(m, struct ip *);
+ struct m_tag *mtag;
+ struct tdb_ident *tdbi;
+ struct secpolicy *sp;
+ int s, error;
+ /*
+ * Enforce IPsec policy checking if we are seeing the last header.
+ * Note that we do not visit this with protocols that have their own
+ * pcb-layer code, like udp/tcp/raw ip.
+ */
+ if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0) {
+ /*
+ * Check if the packet has already had IPsec processing
+ * done. If so, then just pass it along. This tag gets
+ * set during AH, ESP, etc. input handling, before the
+ * packet is returned to the ip input queue for delivery.
+ */
+ mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
+ s = splnet();
+ if (mtag != NULL) {
+ tdbi = (struct tdb_ident *)(mtag + 1);
+ sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
+ } else {
+ sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
+ IP_FORWARDING, &error);
+ }
+ if (sp != NULL) {
+ /*
+ * Check security policy against packet attributes.
+ */
+ error = ipsec_in_reject(sp, m);
+ KEY_FREESP(&sp);
+ } else {
+ /* XXX error stat??? */
+ error = EINVAL;
+ DPRINTF(("ip_input: no SP, packet discarded\n"));/*XXX*/
+ return 1;
+ }
+ splx(s);
+ if (error)
+ return 1;
+ }
+#endif /* IPSEC */
+ return 0;
+}
+
+/*
+ * Compute the MTU for a forwarded packet that gets IPSEC encapsulated.
+ * Called from ip_forward().
+ * Returns MTU suggestion for ICMP needfrag reply.
+ */
+int
+ip_ipsec_mtu(struct mbuf *m, int mtu)
+{
+ /*
+ * If the packet is routed over an IPsec tunnel, tell the
+ * originator the tunnel MTU.
+ * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
+ * XXX quickhack!!!
+ */
+ struct secpolicy *sp = NULL;
+ int ipsecerror;
+ int ipsechdr;
+ struct route *ro;
+ sp = ipsec_getpolicybyaddr(m,
+ IPSEC_DIR_OUTBOUND,
+ IP_FORWARDING,
+ &ipsecerror);
+ if (sp != NULL) {
+ /* count IPsec header size */
+ ipsechdr = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, NULL);
+
+ /*
+ * find the correct route for outer IPv4
+ * header, compute tunnel MTU.
+ */
+ if (sp->req != NULL &&
+ sp->req->sav != NULL &&
+ sp->req->sav->sah != NULL) {
+ ro = &sp->req->sav->sah->route_cache.sa_route;
+ if (ro->ro_rt && ro->ro_rt->rt_ifp) {
+ mtu =
+ ro->ro_rt->rt_rmx.rmx_mtu ?
+ ro->ro_rt->rt_rmx.rmx_mtu :
+ ro->ro_rt->rt_ifp->if_mtu;
+ mtu -= ipsechdr;
+ }
+ }
+ KEY_FREESP(&sp);
+ }
+ return mtu;
+}
+
+/*
+ * Check the packet's outbound security policy and, if required, perform
+ * IPsec processing on it.
+ * Called from ip_output().
+ * 1 = drop packet, 0 = continue processing packet,
+ * -1 = packet was reinjected; stop processing the packet.
+ */
+int
+ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error,
+ struct ifnet **ifp)
+{
+#ifdef IPSEC
+ struct secpolicy *sp = NULL;
+ struct ip *ip = mtod(*m, struct ip *);
+ struct tdb_ident *tdbi;
+ struct m_tag *mtag;
+ int s;
+ /*
+ * Check the security policy (SP) for the packet and, if
+ * required, do IPsec-related processing. There are two
+ * cases here; the first time a packet is sent through
+ * it will be untagged and handled by ipsec4_checkpolicy.
+ * If the packet is resubmitted to ip_output (e.g. after
+ * AH, ESP, etc. processing), there will be a tag to bypass
+ * the lookup and related policy checking.
+ */
+ mtag = m_tag_find(*m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
+ s = splnet();
+ if (mtag != NULL) {
+ tdbi = (struct tdb_ident *)(mtag + 1);
+ sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
+ if (sp == NULL)
+ *error = -EINVAL; /* force silent drop */
+ m_tag_delete(*m, mtag);
+ } else {
+ sp = ipsec4_checkpolicy(*m, IPSEC_DIR_OUTBOUND, *flags,
+ error, inp);
+ }
+ /*
+ * There are four return cases:
+ * sp != NULL apply IPsec policy
+ * sp == NULL, error == 0 no IPsec handling needed
+ * sp == NULL, error == -EINVAL discard packet w/o error
+ * sp == NULL, error != 0 discard packet, report error
+ */
+ if (sp != NULL) {
+ /* Loop detection, check if ipsec processing already done */
+ KASSERT(sp->req != NULL, ("ip_output: no ipsec request"));
+ for (mtag = m_tag_first(*m); mtag != NULL;
+ mtag = m_tag_next(*m, mtag)) {
+ if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
+ continue;
+ if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
+ mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
+ continue;
+ /*
+ * Check if policy has an SA associated with it.
+ * This can happen when an SP has yet to acquire
+ * an SA; e.g. on first reference. If it occurs,
+ * then we let ipsec4_process_packet do its thing.
+ */
+ if (sp->req->sav == NULL)
+ break;
+ tdbi = (struct tdb_ident *)(mtag + 1);
+ if (tdbi->spi == sp->req->sav->spi &&
+ tdbi->proto == sp->req->sav->sah->saidx.proto &&
+ bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst,
+ sizeof (union sockaddr_union)) == 0) {
+ /*
+ * No IPsec processing is needed, free
+ * reference to SP.
+ *
+ * NB: null pointer to avoid free at
+ * done: below.
+ */
+ KEY_FREESP(&sp), sp = NULL;
+ splx(s);
+ goto done;
+ }
+ }
+
+ /*
+ * Do delayed checksums now because we send before
+ * this is done in the normal processing path.
+ */
+ if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ in_delayed_cksum(*m);
+ (*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+#ifdef SCTP
+ if ((*m)->m_pkthdr.csum_flags & CSUM_SCTP) {
+ sctp_delayed_cksum(*m, (uint32_t)(ip->ip_hl << 2));
+ (*m)->m_pkthdr.csum_flags &= ~CSUM_SCTP;
+ }
+#endif
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+
+ /* NB: callee frees mbuf */
+ *error = ipsec4_process_packet(*m, sp->req, *flags, 0);
+ if (*error == EJUSTRETURN) {
+ /*
+ * We had a SP with a level of 'use' and no SA. We
+ * will just continue to process the packet without
+ * IPsec processing and return without error.
+ */
+ *error = 0;
+ ip->ip_len = ntohs(ip->ip_len);
+ ip->ip_off = ntohs(ip->ip_off);
+ goto done;
+ }
+ /*
+ * Preserve KAME behaviour: ENOENT can be returned
+ * when an SA acquire is in progress. Don't propagate
+ * this to user-level; it confuses applications.
+ *
+ * XXX this will go away when the SADB is redone.
+ */
+ if (*error == ENOENT)
+ *error = 0;
+ splx(s);
+ goto reinjected;
+ } else { /* sp == NULL */
+ splx(s);
+
+ if (*error != 0) {
+ /*
+ * Hack: -EINVAL is used to signal that a packet
+ * should be silently discarded. This is typically
+ * because we asked key management for an SA and
+ * it was delayed (e.g. kicked up to IKE).
+ */
+ if (*error == -EINVAL)
+ *error = 0;
+ goto bad;
+ } else {
+ /* No IPsec processing for this packet. */
+ }
+#ifdef notyet
+ /*
+ * If deferred crypto processing is needed, check that
+ * the interface supports it.
+ */
+ mtag = m_tag_find(*m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL);
+ if (mtag != NULL && ifp != NULL &&
+ ((*ifp)->if_capenable & IFCAP_IPSEC) == 0) {
+ /* notify IPsec to do its own crypto */
+ ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
+ *error = EHOSTUNREACH;
+ goto bad;
+ }
+#endif
+ }
+done:
+ if (sp != NULL)
+ KEY_FREESP(&sp);
+ return 0;
+reinjected:
+ if (sp != NULL)
+ KEY_FREESP(&sp);
+ return -1;
+bad:
+ if (sp != NULL)
+ KEY_FREESP(&sp);
+ return 1;
+#endif /* IPSEC */
+ return 0;
+}
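A minimal caller sketch of the 1 / 0 / -1 return convention documented above; this is not the actual ip_output() code, and the wrapper and variable names are assumptions:

    static int
    example_send(struct mbuf **m, struct inpcb *inp, int *flags, int *error,
        struct ifnet **ifp)
    {
            switch (ip_ipsec_output(m, inp, flags, error, ifp)) {
            case 1:         /* policy rejected the packet: drop it */
                    m_freem(*m);
                    return (*error);
            case -1:        /* mbuf was consumed and reinjected by IPsec: stop */
                    return (0);
            default:        /* 0: continue with normal output processing */
                    break;
            }
            /* ... the normal IP output path would continue here ... */
            return (0);
    }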
diff --git a/freebsd/sys/netinet/ip_ipsec.h b/freebsd/sys/netinet/ip_ipsec.h
new file mode 100644
index 00000000..c4de1652
--- /dev/null
+++ b/freebsd/sys/netinet/ip_ipsec.h
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IP_IPSEC_HH_
+#define _NETINET_IP_IPSEC_HH_
+
+int ip_ipsec_filtertunnel(struct mbuf *);
+int ip_ipsec_fwd(struct mbuf *);
+int ip_ipsec_input(struct mbuf *);
+int ip_ipsec_mtu(struct mbuf *, int);
+int ip_ipsec_output(struct mbuf **, struct inpcb *, int *, int *,
+ struct ifnet **);
+#endif
diff --git a/freebsd/sys/netinet/ip_mroute.c b/freebsd/sys/netinet/ip_mroute.c
new file mode 100644
index 00000000..2f7676ad
--- /dev/null
+++ b/freebsd/sys/netinet/ip_mroute.c
@@ -0,0 +1,2952 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1989 Stephen Deering
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Stephen Deering of Stanford University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
+ */
+
+/*
+ * IP multicast forwarding procedures
+ *
+ * Written by David Waitzman, BBN Labs, August 1988.
+ * Modified by Steve Deering, Stanford, February 1989.
+ * Modified by Mark J. Steiglitz, Stanford, May, 1991
+ * Modified by Van Jacobson, LBL, January 1993
+ * Modified by Ajit Thyagarajan, PARC, August 1993
+ * Modified by Bill Fenner, PARC, April 1995
+ * Modified by Ahmed Helmy, SGI, June 1996
+ * Modified by George Edmond Eddy (Rusty), ISI, February 1998
+ * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
+ * Modified by Hitoshi Asaeda, WIDE, August 2000
+ * Modified by Pavlin Radoslavov, ICSI, October 2002
+ *
+ * MROUTING Revision: 3.5
+ * and PIM-SMv2 and PIM-DM support, advanced API support,
+ * bandwidth metering and signaling
+ */
+
+/*
+ * TODO: Prefix functions with ipmf_.
+ * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol
+ * domain attachment (if_afdata) so we can track consumers of that service.
+ * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT,
+ * move it to socket options.
+ * TODO: Cleanup LSRR removal further.
+ * TODO: Push RSVP stubs into raw_ip.c.
+ * TODO: Use bitstring.h for vif set.
+ * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded.
+ * TODO: Sync ip6_mroute.c with this file.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_mrouting.h>
+
+#define _PIM_VT 1
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/stddef.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/ktr.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/signalvar.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sockio.h>
+#include <freebsd/sys/sx.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/time.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/netisr.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/igmp.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_encap.h>
+#include <freebsd/netinet/ip_mroute.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_options.h>
+#include <freebsd/netinet/pim.h>
+#include <freebsd/netinet/pim_var.h>
+#include <freebsd/netinet/udp.h>
+
+#include <freebsd/machine/in_cksum.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+#ifndef KTR_IPMF
+#define KTR_IPMF KTR_INET
+#endif
+
+#define VIFI_INVALID ((vifi_t) -1)
+#define M_HASCL(m) ((m)->m_flags & M_EXT)
+
+static VNET_DEFINE(uint32_t, last_tv_sec); /* last time we processed this */
+#define V_last_tv_sec VNET(last_tv_sec)
+
+static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache");
+
+/*
+ * Locking. We use two locks: one for the virtual interface table and
+ * one for the forwarding table. These locks may be nested in which case
+ * the VIF lock must always be taken first. Note that each lock is used
+ * to cover not only the specific data structure but also related data
+ * structures.
+ */
+
+static struct mtx mrouter_mtx;
+#define MROUTER_LOCK() mtx_lock(&mrouter_mtx)
+#define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx)
+#define MROUTER_LOCK_ASSERT() mtx_assert(&mrouter_mtx, MA_OWNED)
+#define MROUTER_LOCK_INIT() \
+ mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF)
+#define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx)
+
+static int ip_mrouter_cnt; /* # of vnets with active mrouters */
+static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */
+
+static VNET_DEFINE(struct mrtstat, mrtstat);
+#define V_mrtstat VNET(mrtstat)
+SYSCTL_VNET_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW,
+ &VNET_NAME(mrtstat), mrtstat,
+ "IPv4 Multicast Forwarding Statistics (struct mrtstat, "
+ "netinet/ip_mroute.h)");
+
+static VNET_DEFINE(u_long, mfchash);
+#define V_mfchash VNET(mfchash)
+#define MFCHASH(a, g) \
+ ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \
+ ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash)
+#define MFCHASHSIZE 256
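An illustrative fragment showing how MFCHASH() is meant to be used to pick a bucket; the concrete addresses are assumptions, and the lookup loop simply mirrors mfc_find() below. It assumes the caller holds MFC_LOCK():

    struct in_addr origin, group;
    struct mfc *rt;
    u_long bucket;

    origin.s_addr = htonl(0xc0a80101);      /* 192.168.1.1, example only */
    group.s_addr  = htonl(0xe0000116);      /* 224.0.1.22,  example only */
    bucket = MFCHASH(origin, group);        /* 0 .. V_mfchash (the hash mask) */

    LIST_FOREACH(rt, &V_mfchashtbl[bucket], mfc_hash) {
            if (in_hosteq(rt->mfc_origin, origin) &&
                in_hosteq(rt->mfc_mcastgrp, group))
                    break;                  /* found the (S,G) entry */
    }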
+
+static u_long mfchashsize; /* Hash size */
+static VNET_DEFINE(u_char *, nexpire); /* 0..mfchashsize-1 */
+#define V_nexpire VNET(nexpire)
+static VNET_DEFINE(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl);
+#define V_mfchashtbl VNET(mfchashtbl)
+
+static struct mtx mfc_mtx;
+#define MFC_LOCK() mtx_lock(&mfc_mtx)
+#define MFC_UNLOCK() mtx_unlock(&mfc_mtx)
+#define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED)
+#define MFC_LOCK_INIT() \
+ mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF)
+#define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx)
+
+static VNET_DEFINE(vifi_t, numvifs);
+#define V_numvifs VNET(numvifs)
+static VNET_DEFINE(struct vif, viftable[MAXVIFS]);
+#define V_viftable VNET(viftable)
+SYSCTL_VNET_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD,
+ &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]",
+ "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");
+
+static struct mtx vif_mtx;
+#define VIF_LOCK() mtx_lock(&vif_mtx)
+#define VIF_UNLOCK() mtx_unlock(&vif_mtx)
+#define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED)
+#define VIF_LOCK_INIT() \
+ mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF)
+#define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx)
+
+static eventhandler_tag if_detach_event_tag = NULL;
+
+static VNET_DEFINE(struct callout, expire_upcalls_ch);
+#define V_expire_upcalls_ch VNET(expire_upcalls_ch)
+
+#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */
+#define UPCALL_EXPIRE 6 /* number of timeouts */
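Taken together, the two constants above mean that the upcall expiry callout runs every hz/4 ticks (four times a second) and an unresolved upcall entry survives UPCALL_EXPIRE = 6 such passes, i.e. roughly 6 x 250 ms = 1.5 s before expire_upcalls() tears it down.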
+
+/*
+ * Bandwidth meter variables and constants
+ */
+static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");
+/*
+ * Pending timeouts are stored in a hash table, the key being the
+ * expiration time. Periodically, the entries are analysed and processed.
+ */
+#define BW_METER_BUCKETS 1024
+static VNET_DEFINE(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]);
+#define V_bw_meter_timers VNET(bw_meter_timers)
+static VNET_DEFINE(struct callout, bw_meter_ch);
+#define V_bw_meter_ch VNET(bw_meter_ch)
+#define BW_METER_PERIOD (hz) /* periodical handling of bw meters */
+
+/*
+ * Pending upcalls are stored in a vector which is flushed when it is
+ * full, or periodically.
+ */
+static VNET_DEFINE(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]);
+#define V_bw_upcalls VNET(bw_upcalls)
+static VNET_DEFINE(u_int, bw_upcalls_n); /* # of pending upcalls */
+#define V_bw_upcalls_n VNET(bw_upcalls_n)
+static VNET_DEFINE(struct callout, bw_upcalls_ch);
+#define V_bw_upcalls_ch VNET(bw_upcalls_ch)
+
+#define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */
+
+static VNET_DEFINE(struct pimstat, pimstat);
+#define V_pimstat VNET(pimstat)
+
+SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
+SYSCTL_VNET_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD,
+ &VNET_NAME(pimstat), pimstat,
+ "PIM Statistics (struct pimstat, netinet/pim_var.h)");
+
+static u_long pim_squelch_wholepkt = 0;
+SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW,
+ &pim_squelch_wholepkt, 0,
+ "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified");
+
+extern struct domain inetdomain;
+static const struct protosw in_pim_protosw = {
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_PIM,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ .pr_input = pim_input,
+ .pr_output = (pr_output_t*)rip_output,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_usrreqs = &rip_usrreqs
+};
+static const struct encaptab *pim_encap_cookie;
+
+static int pim_encapcheck(const struct mbuf *, int, int, void *);
+
+/*
+ * Note: the PIM Register encapsulation adds the following in front of a
+ * data packet:
+ *
+ * struct pim_encap_hdr {
+ * struct ip ip;
+ * struct pim_encap_pimhdr pim;
+ * }
+ *
+ */
+
+struct pim_encap_pimhdr {
+ struct pim pim;
+ uint32_t flags;
+};
+#define PIM_ENCAP_TTL 64
+
+static struct ip pim_encap_iphdr = {
+#if BYTE_ORDER == LITTLE_ENDIAN
+ sizeof(struct ip) >> 2,
+ IPVERSION,
+#else
+ IPVERSION,
+ sizeof(struct ip) >> 2,
+#endif
+ 0, /* tos */
+ sizeof(struct ip), /* total length */
+ 0, /* id */
+ 0, /* frag offset */
+ PIM_ENCAP_TTL,
+ IPPROTO_PIM,
+ 0, /* checksum */
+};
+
+static struct pim_encap_pimhdr pim_encap_pimhdr = {
+ {
+ PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
+ 0, /* reserved */
+ 0, /* checksum */
+ },
+ 0 /* flags */
+};
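Per the layout sketched in the comment above, a PIM Register therefore prepends sizeof(struct ip) + sizeof(struct pim_encap_pimhdr) = 20 + 8 = 28 bytes of encapsulation (assuming the usual 4-byte struct pim) in front of the original multicast datagram.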
+
+static VNET_DEFINE(vifi_t, reg_vif_num) = VIFI_INVALID;
+#define V_reg_vif_num VNET(reg_vif_num)
+static VNET_DEFINE(struct ifnet, multicast_register_if);
+#define V_multicast_register_if VNET(multicast_register_if)
+
+/*
+ * Private (static) function declarations.
+ */
+
+static u_long X_ip_mcast_src(int);
+static int X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *,
+ struct ip_moptions *);
+static int X_ip_mrouter_done(void);
+static int X_ip_mrouter_get(struct socket *, struct sockopt *);
+static int X_ip_mrouter_set(struct socket *, struct sockopt *);
+static int X_legal_vif_num(int);
+static int X_mrt_ioctl(u_long, caddr_t, int);
+
+static int add_bw_upcall(struct bw_upcall *);
+static int add_mfc(struct mfcctl2 *);
+static int add_vif(struct vifctl *);
+static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
+static void bw_meter_process(void);
+static void bw_meter_receive_packet(struct bw_meter *, int,
+ struct timeval *);
+static void bw_upcalls_send(void);
+static int del_bw_upcall(struct bw_upcall *);
+static int del_mfc(struct mfcctl2 *);
+static int del_vif(vifi_t);
+static int del_vif_locked(vifi_t);
+static void expire_bw_meter_process(void *);
+static void expire_bw_upcalls_send(void *);
+static void expire_mfc(struct mfc *);
+static void expire_upcalls(void *);
+static void free_bw_list(struct bw_meter *);
+static int get_sg_cnt(struct sioc_sg_req *);
+static int get_vif_cnt(struct sioc_vif_req *);
+static void if_detached_event(void *, struct ifnet *);
+static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
+static int ip_mrouter_init(struct socket *, int);
+static __inline struct mfc *
+ mfc_find(struct in_addr *, struct in_addr *);
+static void phyint_send(struct ip *, struct vif *, struct mbuf *);
+static struct mbuf *
+ pim_register_prepare(struct ip *, struct mbuf *);
+static int pim_register_send(struct ip *, struct vif *,
+ struct mbuf *, struct mfc *);
+static int pim_register_send_rp(struct ip *, struct vif *,
+ struct mbuf *, struct mfc *);
+static int pim_register_send_upcall(struct ip *, struct vif *,
+ struct mbuf *, struct mfc *);
+static void schedule_bw_meter(struct bw_meter *, struct timeval *);
+static void send_packet(struct vif *, struct mbuf *);
+static int set_api_config(uint32_t *);
+static int set_assert(int);
+static int socket_send(struct socket *, struct mbuf *,
+ struct sockaddr_in *);
+static void unschedule_bw_meter(struct bw_meter *);
+
+/*
+ * Kernel multicast forwarding API capabilities and setup.
+ * If more API capabilities are added to the kernel, they should be
+ * recorded in `mrt_api_support'.
+ */
+#define MRT_API_VERSION 0x0305
+
+static const int mrt_api_version = MRT_API_VERSION;
+static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
+ MRT_MFC_FLAGS_BORDER_VIF |
+ MRT_MFC_RP |
+ MRT_MFC_BW_UPCALL);
+static VNET_DEFINE(uint32_t, mrt_api_config);
+#define V_mrt_api_config VNET(mrt_api_config)
+static VNET_DEFINE(int, pim_assert_enabled);
+#define V_pim_assert_enabled VNET(pim_assert_enabled)
+static struct timeval pim_assert_interval = { 3, 0 }; /* Rate limit */
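A hedged userland sketch of how a routing daemon could negotiate these capabilities through the MRT_API_CONFIG handling shown later in X_ip_mrouter_set(); the socket is assumed to be the raw IGMP socket used for MRT_INIT, the function name is made up, and error reporting is trimmed:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <stdint.h>
    #include <netinet/in.h>
    #include <netinet/ip_mroute.h>

    /* mrouter_fd: a socket(AF_INET, SOCK_RAW, IPPROTO_IGMP) descriptor */
    static int
    negotiate_bw_upcalls(int mrouter_fd)
    {
            int v = 1;                              /* only version 1 is accepted */
            uint32_t want = MRT_MFC_BW_UPCALL;      /* subset of mrt_api_support */

            if (setsockopt(mrouter_fd, IPPROTO_IP, MRT_INIT, &v, sizeof(v)) == -1)
                    return (-1);
            /*
             * The kernel masks the request with mrt_api_support and copies the
             * accepted value back into the buffer (see the MRT_API_CONFIG case
             * in X_ip_mrouter_set() below).
             */
            if (setsockopt(mrouter_fd, IPPROTO_IP, MRT_API_CONFIG, &want,
                sizeof(want)) == -1)
                    return (-1);
            return ((want & MRT_MFC_BW_UPCALL) ? 0 : -1);
    }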
+
+/*
+ * Find a route for a given origin IP address and multicast group address.
+ * Statistics must be updated by the caller.
+ */
+static __inline struct mfc *
+mfc_find(struct in_addr *o, struct in_addr *g)
+{
+ struct mfc *rt;
+
+ MFC_LOCK_ASSERT();
+
+ LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
+ if (in_hosteq(rt->mfc_origin, *o) &&
+ in_hosteq(rt->mfc_mcastgrp, *g) &&
+ TAILQ_EMPTY(&rt->mfc_stall))
+ break;
+ }
+
+ return (rt);
+}
+
+/*
+ * Handle MRT setsockopt commands to modify the multicast forwarding tables.
+ */
+static int
+X_ip_mrouter_set(struct socket *so, struct sockopt *sopt)
+{
+ int error, optval;
+ vifi_t vifi;
+ struct vifctl vifc;
+ struct mfcctl2 mfc;
+ struct bw_upcall bw_upcall;
+ uint32_t i;
+
+ if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT)
+ return EPERM;
+
+ error = 0;
+ switch (sopt->sopt_name) {
+ case MRT_INIT:
+ error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
+ if (error)
+ break;
+ error = ip_mrouter_init(so, optval);
+ break;
+
+ case MRT_DONE:
+ error = ip_mrouter_done();
+ break;
+
+ case MRT_ADD_VIF:
+ error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc);
+ if (error)
+ break;
+ error = add_vif(&vifc);
+ break;
+
+ case MRT_DEL_VIF:
+ error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
+ if (error)
+ break;
+ error = del_vif(vifi);
+ break;
+
+ case MRT_ADD_MFC:
+ case MRT_DEL_MFC:
+ /*
+ * select data size depending on API version.
+ */
+ if (sopt->sopt_name == MRT_ADD_MFC &&
+ V_mrt_api_config & MRT_API_FLAGS_ALL) {
+ error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2),
+ sizeof(struct mfcctl2));
+ } else {
+ error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl),
+ sizeof(struct mfcctl));
+ bzero((caddr_t)&mfc + sizeof(struct mfcctl),
+ sizeof(mfc) - sizeof(struct mfcctl));
+ }
+ if (error)
+ break;
+ if (sopt->sopt_name == MRT_ADD_MFC)
+ error = add_mfc(&mfc);
+ else
+ error = del_mfc(&mfc);
+ break;
+
+ case MRT_ASSERT:
+ error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
+ if (error)
+ break;
+ set_assert(optval);
+ break;
+
+ case MRT_API_CONFIG:
+ error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
+ if (!error)
+ error = set_api_config(&i);
+ if (!error)
+ error = sooptcopyout(sopt, &i, sizeof i);
+ break;
+
+ case MRT_ADD_BW_UPCALL:
+ case MRT_DEL_BW_UPCALL:
+ error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall,
+ sizeof bw_upcall);
+ if (error)
+ break;
+ if (sopt->sopt_name == MRT_ADD_BW_UPCALL)
+ error = add_bw_upcall(&bw_upcall);
+ else
+ error = del_bw_upcall(&bw_upcall);
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return error;
+}
+
+/*
+ * Handle MRT getsockopt commands
+ */
+static int
+X_ip_mrouter_get(struct socket *so, struct sockopt *sopt)
+{
+ int error;
+
+ switch (sopt->sopt_name) {
+ case MRT_VERSION:
+ error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version);
+ break;
+
+ case MRT_ASSERT:
+ error = sooptcopyout(sopt, &V_pim_assert_enabled,
+ sizeof V_pim_assert_enabled);
+ break;
+
+ case MRT_API_SUPPORT:
+ error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support);
+ break;
+
+ case MRT_API_CONFIG:
+ error = sooptcopyout(sopt, &V_mrt_api_config, sizeof V_mrt_api_config);
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return error;
+}
+
+/*
+ * Handle ioctl commands to obtain information from the cache
+ */
+static int
+X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused)
+{
+ int error = 0;
+
+ /*
+ * Currently the only function calling this ioctl routine is rtioctl().
+ * Typically, only root can create the raw socket in order to execute
+ * this ioctl method; however, the request might be coming from a prison.
+ */
+ error = priv_check(curthread, PRIV_NETINET_MROUTE);
+ if (error)
+ return (error);
+ switch (cmd) {
+ case (SIOCGETVIFCNT):
+ error = get_vif_cnt((struct sioc_vif_req *)data);
+ break;
+
+ case (SIOCGETSGCNT):
+ error = get_sg_cnt((struct sioc_sg_req *)data);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ return error;
+}
+
+/*
+ * Returns the packet, byte, and rpf-failure counts for the source/group pair provided.
+ */
+static int
+get_sg_cnt(struct sioc_sg_req *req)
+{
+ struct mfc *rt;
+
+ MFC_LOCK();
+ rt = mfc_find(&req->src, &req->grp);
+ if (rt == NULL) {
+ MFC_UNLOCK();
+ req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
+ return EADDRNOTAVAIL;
+ }
+ req->pktcnt = rt->mfc_pkt_cnt;
+ req->bytecnt = rt->mfc_byte_cnt;
+ req->wrong_if = rt->mfc_wrong_if;
+ MFC_UNLOCK();
+ return 0;
+}
+
+/*
+ * returns the input and output packet and byte counts on the vif provided
+ */
+static int
+get_vif_cnt(struct sioc_vif_req *req)
+{
+ vifi_t vifi = req->vifi;
+
+ VIF_LOCK();
+ if (vifi >= V_numvifs) {
+ VIF_UNLOCK();
+ return EINVAL;
+ }
+
+ req->icount = V_viftable[vifi].v_pkt_in;
+ req->ocount = V_viftable[vifi].v_pkt_out;
+ req->ibytes = V_viftable[vifi].v_bytes_in;
+ req->obytes = V_viftable[vifi].v_bytes_out;
+ VIF_UNLOCK();
+
+ return 0;
+}
+
+static void
+if_detached_event(void *arg __unused, struct ifnet *ifp)
+{
+ vifi_t vifi;
+ int i;
+
+ MROUTER_LOCK();
+
+ if (V_ip_mrouter == NULL) {
+ MROUTER_UNLOCK();
+ return;
+ }
+
+ VIF_LOCK();
+ MFC_LOCK();
+
+ /*
+ * Tear down multicast forwarder state associated with this ifnet.
+ * 1. Walk the vif list, matching vifs against this ifnet.
+ * 2. Walk the multicast forwarding cache (mfc) looking for
+ * inner matches with this vif's index.
+ * 3. Expire any matching multicast forwarding cache entries.
+ * 4. Free vif state. This should disable ALLMULTI on the interface.
+ */
+ for (vifi = 0; vifi < V_numvifs; vifi++) {
+ if (V_viftable[vifi].v_ifp != ifp)
+ continue;
+ for (i = 0; i < mfchashsize; i++) {
+ struct mfc *rt, *nrt;
+ for (rt = LIST_FIRST(&V_mfchashtbl[i]); rt; rt = nrt) {
+ nrt = LIST_NEXT(rt, mfc_hash);
+ if (rt->mfc_parent == vifi) {
+ expire_mfc(rt);
+ }
+ }
+ }
+ del_vif_locked(vifi);
+ }
+
+ MFC_UNLOCK();
+ VIF_UNLOCK();
+
+ MROUTER_UNLOCK();
+}
+
+/*
+ * Enable multicast forwarding.
+ */
+static int
+ip_mrouter_init(struct socket *so, int version)
+{
+
+ CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__,
+ so->so_type, so->so_proto->pr_protocol);
+
+ if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP)
+ return EOPNOTSUPP;
+
+ if (version != 1)
+ return ENOPROTOOPT;
+
+ MROUTER_LOCK();
+
+ if (ip_mrouter_unloading) {
+ MROUTER_UNLOCK();
+ return ENOPROTOOPT;
+ }
+
+ if (V_ip_mrouter != NULL) {
+ MROUTER_UNLOCK();
+ return EADDRINUSE;
+ }
+
+ V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash,
+ HASH_NOWAIT);
+
+ callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
+ curvnet);
+ callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
+ curvnet);
+ callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process,
+ curvnet);
+
+ V_ip_mrouter = so;
+ ip_mrouter_cnt++;
+
+ MROUTER_UNLOCK();
+
+ CTR1(KTR_IPMF, "%s: done", __func__);
+
+ return 0;
+}
+
+/*
+ * Disable multicast forwarding.
+ */
+static int
+X_ip_mrouter_done(void)
+{
+ vifi_t vifi;
+ int i;
+ struct ifnet *ifp;
+ struct ifreq ifr;
+
+ MROUTER_LOCK();
+
+ if (V_ip_mrouter == NULL) {
+ MROUTER_UNLOCK();
+ return EINVAL;
+ }
+
+ /*
+ * Detach/disable hooks to the rest of the system.
+ */
+ V_ip_mrouter = NULL;
+ ip_mrouter_cnt--;
+ V_mrt_api_config = 0;
+
+ VIF_LOCK();
+
+ /*
+ * For each phyint in use, disable promiscuous reception of all IP
+ * multicasts.
+ */
+ for (vifi = 0; vifi < V_numvifs; vifi++) {
+ if (!in_nullhost(V_viftable[vifi].v_lcl_addr) &&
+ !(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
+ struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr);
+
+ so->sin_len = sizeof(struct sockaddr_in);
+ so->sin_family = AF_INET;
+ so->sin_addr.s_addr = INADDR_ANY;
+ ifp = V_viftable[vifi].v_ifp;
+ if_allmulti(ifp, 0);
+ }
+ }
+ bzero((caddr_t)V_viftable, sizeof(V_viftable));
+ V_numvifs = 0;
+ V_pim_assert_enabled = 0;
+
+ VIF_UNLOCK();
+
+ callout_stop(&V_expire_upcalls_ch);
+ callout_stop(&V_bw_upcalls_ch);
+ callout_stop(&V_bw_meter_ch);
+
+ MFC_LOCK();
+
+ /*
+ * Free all multicast forwarding cache entries.
+ * Do not use hashdestroy(), as we must perform other cleanup.
+ */
+ for (i = 0; i < mfchashsize; i++) {
+ struct mfc *rt, *nrt;
+ for (rt = LIST_FIRST(&V_mfchashtbl[i]); rt; rt = nrt) {
+ nrt = LIST_NEXT(rt, mfc_hash);
+ expire_mfc(rt);
+ }
+ }
+ free(V_mfchashtbl, M_MRTABLE);
+ V_mfchashtbl = NULL;
+
+ bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize);
+
+ V_bw_upcalls_n = 0;
+ bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers));
+
+ MFC_UNLOCK();
+
+ V_reg_vif_num = VIFI_INVALID;
+
+ MROUTER_UNLOCK();
+
+ CTR1(KTR_IPMF, "%s: done", __func__);
+
+ return 0;
+}
+
+/*
+ * Set PIM assert processing global
+ */
+static int
+set_assert(int i)
+{
+ if ((i != 1) && (i != 0))
+ return EINVAL;
+
+ V_pim_assert_enabled = i;
+
+ return 0;
+}
+
+/*
+ * Configure API capabilities
+ */
+int
+set_api_config(uint32_t *apival)
+{
+ int i;
+
+ /*
+ * We can set the API capabilities only if it is the first operation
+ * after MRT_INIT. I.e.:
+ * - there are no vifs installed
+ * - pim_assert is not enabled
+ * - the MFC table is empty
+ */
+ if (V_numvifs > 0) {
+ *apival = 0;
+ return EPERM;
+ }
+ if (V_pim_assert_enabled) {
+ *apival = 0;
+ return EPERM;
+ }
+
+ MFC_LOCK();
+
+ for (i = 0; i < mfchashsize; i++) {
+ if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) {
+ MFC_UNLOCK();
+ *apival = 0;
+ return EPERM;
+ }
+ }
+
+ MFC_UNLOCK();
+
+ V_mrt_api_config = *apival & mrt_api_support;
+ *apival = V_mrt_api_config;
+
+ return 0;
+}
+
+/*
+ * Add a vif to the vif table
+ */
+static int
+add_vif(struct vifctl *vifcp)
+{
+ struct vif *vifp = V_viftable + vifcp->vifc_vifi;
+ struct sockaddr_in sin = {sizeof sin, AF_INET};
+ struct ifaddr *ifa;
+ struct ifnet *ifp;
+ int error;
+
+ VIF_LOCK();
+ if (vifcp->vifc_vifi >= MAXVIFS) {
+ VIF_UNLOCK();
+ return EINVAL;
+ }
+ /* rate limiting is no longer supported by this code */
+ if (vifcp->vifc_rate_limit != 0) {
+ log(LOG_ERR, "rate limiting is no longer supported\n");
+ VIF_UNLOCK();
+ return EINVAL;
+ }
+ if (!in_nullhost(vifp->v_lcl_addr)) {
+ VIF_UNLOCK();
+ return EADDRINUSE;
+ }
+ if (in_nullhost(vifcp->vifc_lcl_addr)) {
+ VIF_UNLOCK();
+ return EADDRNOTAVAIL;
+ }
+
+ /* Find the interface with an address in AF_INET family */
+ if (vifcp->vifc_flags & VIFF_REGISTER) {
+ /*
+ * XXX: Because VIFF_REGISTER does not really need a valid
+ * local interface (e.g. it could be 127.0.0.2), we don't
+ * check its address.
+ */
+ ifp = NULL;
+ } else {
+ sin.sin_addr = vifcp->vifc_lcl_addr;
+ ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
+ if (ifa == NULL) {
+ VIF_UNLOCK();
+ return EADDRNOTAVAIL;
+ }
+ ifp = ifa->ifa_ifp;
+ ifa_free(ifa);
+ }
+
+ if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) {
+ CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__);
+ VIF_UNLOCK();
+ return EOPNOTSUPP;
+ } else if (vifcp->vifc_flags & VIFF_REGISTER) {
+ ifp = &V_multicast_register_if;
+ CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp);
+ if (V_reg_vif_num == VIFI_INVALID) {
+ if_initname(&V_multicast_register_if, "register_vif", 0);
+ V_multicast_register_if.if_flags = IFF_LOOPBACK;
+ V_reg_vif_num = vifcp->vifc_vifi;
+ }
+ } else { /* Make sure the interface supports multicast */
+ if ((ifp->if_flags & IFF_MULTICAST) == 0) {
+ VIF_UNLOCK();
+ return EOPNOTSUPP;
+ }
+
+ /* Enable promiscuous reception of all IP multicasts from the if */
+ error = if_allmulti(ifp, 1);
+ if (error) {
+ VIF_UNLOCK();
+ return error;
+ }
+ }
+
+ vifp->v_flags = vifcp->vifc_flags;
+ vifp->v_threshold = vifcp->vifc_threshold;
+ vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
+ vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
+ vifp->v_ifp = ifp;
+ /* initialize per vif pkt counters */
+ vifp->v_pkt_in = 0;
+ vifp->v_pkt_out = 0;
+ vifp->v_bytes_in = 0;
+ vifp->v_bytes_out = 0;
+ bzero(&vifp->v_route, sizeof(vifp->v_route));
+
+ /* Adjust numvifs up if the vifi is higher than numvifs */
+ if (V_numvifs <= vifcp->vifc_vifi)
+ V_numvifs = vifcp->vifc_vifi + 1;
+
+ VIF_UNLOCK();
+
+ CTR4(KTR_IPMF, "%s: add vif %d laddr %s thresh %x", __func__,
+ (int)vifcp->vifc_vifi, inet_ntoa(vifcp->vifc_lcl_addr),
+ (int)vifcp->vifc_threshold);
+
+ return 0;
+}
+
+/*
+ * Delete a vif from the vif table
+ */
+static int
+del_vif_locked(vifi_t vifi)
+{
+ struct vif *vifp;
+
+ VIF_LOCK_ASSERT();
+
+ if (vifi >= V_numvifs) {
+ return EINVAL;
+ }
+ vifp = &V_viftable[vifi];
+ if (in_nullhost(vifp->v_lcl_addr)) {
+ return EADDRNOTAVAIL;
+ }
+
+ if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER)))
+ if_allmulti(vifp->v_ifp, 0);
+
+ if (vifp->v_flags & VIFF_REGISTER)
+ V_reg_vif_num = VIFI_INVALID;
+
+ bzero((caddr_t)vifp, sizeof (*vifp));
+
+ CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi);
+
+ /* Adjust numvifs down */
+ for (vifi = V_numvifs; vifi > 0; vifi--)
+ if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr))
+ break;
+ V_numvifs = vifi;
+
+ return 0;
+}
+
+static int
+del_vif(vifi_t vifi)
+{
+ int cc;
+
+ VIF_LOCK();
+ cc = del_vif_locked(vifi);
+ VIF_UNLOCK();
+
+ return cc;
+}
+
+/*
+ * update an mfc entry without resetting counters and S,G addresses.
+ */
+static void
+update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
+{
+ int i;
+
+ rt->mfc_parent = mfccp->mfcc_parent;
+ for (i = 0; i < V_numvifs; i++) {
+ rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
+ rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config &
+ MRT_MFC_FLAGS_ALL;
+ }
+ /* set the RP address */
+ if (V_mrt_api_config & MRT_MFC_RP)
+ rt->mfc_rp = mfccp->mfcc_rp;
+ else
+ rt->mfc_rp.s_addr = INADDR_ANY;
+}
+
+/*
+ * fully initialize an mfc entry from the parameter.
+ */
+static void
+init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
+{
+ rt->mfc_origin = mfccp->mfcc_origin;
+ rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
+
+ update_mfc_params(rt, mfccp);
+
+ /* initialize pkt counters per src-grp */
+ rt->mfc_pkt_cnt = 0;
+ rt->mfc_byte_cnt = 0;
+ rt->mfc_wrong_if = 0;
+ timevalclear(&rt->mfc_last_assert);
+}
+
+static void
+expire_mfc(struct mfc *rt)
+{
+ struct rtdetq *rte, *nrte;
+
+ free_bw_list(rt->mfc_bw_meter);
+
+ TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) {
+ m_freem(rte->m);
+ TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link);
+ free(rte, M_MRTABLE);
+ }
+
+ LIST_REMOVE(rt, mfc_hash);
+ free(rt, M_MRTABLE);
+}
+
+/*
+ * Add an mfc entry
+ */
+static int
+add_mfc(struct mfcctl2 *mfccp)
+{
+ struct mfc *rt;
+ struct rtdetq *rte, *nrte;
+ u_long hash = 0;
+ u_short nstl;
+
+ VIF_LOCK();
+ MFC_LOCK();
+
+ rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
+
+ /* If an entry already exists, just update the fields */
+ if (rt) {
+ CTR4(KTR_IPMF, "%s: update mfc orig %s group %lx parent %x",
+ __func__, inet_ntoa(mfccp->mfcc_origin),
+ (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
+ mfccp->mfcc_parent);
+ update_mfc_params(rt, mfccp);
+ MFC_UNLOCK();
+ VIF_UNLOCK();
+ return (0);
+ }
+
+ /*
+ * Find the entry for which the upcall was made and update it.
+ */
+ nstl = 0;
+ hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
+ LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
+ if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
+ in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
+ !TAILQ_EMPTY(&rt->mfc_stall)) {
+ CTR5(KTR_IPMF,
+ "%s: add mfc orig %s group %lx parent %x qh %p",
+ __func__, inet_ntoa(mfccp->mfcc_origin),
+ (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
+ mfccp->mfcc_parent,
+ TAILQ_FIRST(&rt->mfc_stall));
+ if (nstl++)
+ CTR1(KTR_IPMF, "%s: multiple matches", __func__);
+
+ init_mfc_params(rt, mfccp);
+ rt->mfc_expire = 0; /* Don't clean this guy up */
+ V_nexpire[hash]--;
+
+ /* Free queued packets, but attempt to forward them first. */
+ TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) {
+ if (rte->ifp != NULL)
+ ip_mdq(rte->m, rte->ifp, rt, -1);
+ m_freem(rte->m);
+ TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link);
+ rt->mfc_nstall--;
+ free(rte, M_MRTABLE);
+ }
+ }
+ }
+
+ /*
+ * It is possible that an entry is being inserted without an upcall
+ */
+ if (nstl == 0) {
+ CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__);
+ LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
+ if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
+ in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
+ init_mfc_params(rt, mfccp);
+ if (rt->mfc_expire)
+ V_nexpire[hash]--;
+ rt->mfc_expire = 0;
+ break; /* XXX */
+ }
+ }
+
+ if (rt == NULL) { /* no upcall, so make a new entry */
+ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
+ if (rt == NULL) {
+ MFC_UNLOCK();
+ VIF_UNLOCK();
+ return (ENOBUFS);
+ }
+
+ init_mfc_params(rt, mfccp);
+ TAILQ_INIT(&rt->mfc_stall);
+ rt->mfc_nstall = 0;
+
+ rt->mfc_expire = 0;
+ rt->mfc_bw_meter = NULL;
+
+ /* insert new entry at head of hash chain */
+ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
+ }
+ }
+
+ MFC_UNLOCK();
+ VIF_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Delete an mfc entry
+ */
+static int
+del_mfc(struct mfcctl2 *mfccp)
+{
+ struct in_addr origin;
+ struct in_addr mcastgrp;
+ struct mfc *rt;
+
+ origin = mfccp->mfcc_origin;
+ mcastgrp = mfccp->mfcc_mcastgrp;
+
+ CTR3(KTR_IPMF, "%s: delete mfc orig %s group %lx", __func__,
+ inet_ntoa(origin), (u_long)ntohl(mcastgrp.s_addr));
+
+ MFC_LOCK();
+
+ rt = mfc_find(&origin, &mcastgrp);
+ if (rt == NULL) {
+ MFC_UNLOCK();
+ return EADDRNOTAVAIL;
+ }
+
+ /*
+ * free the bw_meter entries
+ */
+ free_bw_list(rt->mfc_bw_meter);
+ rt->mfc_bw_meter = NULL;
+
+ LIST_REMOVE(rt, mfc_hash);
+ free(rt, M_MRTABLE);
+
+ MFC_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Send a message to the routing daemon on the multicast routing socket.
+ */
+static int
+socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
+{
+ if (s) {
+ SOCKBUF_LOCK(&s->so_rcv);
+ if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm,
+ NULL) != 0) {
+ sorwakeup_locked(s);
+ return 0;
+ }
+ SOCKBUF_UNLOCK(&s->so_rcv);
+ }
+ m_freem(mm);
+ return -1;
+}
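A hedged sketch of the receiving side of socket_send(): upcalls arrive on the daemon's raw IGMP socket as a struct igmpmsg laid over an IP header, and im_mbz (which overlays the IP protocol field) is zero only for kernel upcalls, which is how mrouted-style daemons tell them apart from real IGMP traffic. The function name and buffer size are illustrative:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <string.h>
    #include <netinet/in.h>
    #include <netinet/ip_mroute.h>

    static void
    read_one_upcall(int mrouter_fd)
    {
            char buf[2048];
            struct igmpmsg im;
            ssize_t n;

            n = recv(mrouter_fd, buf, sizeof(buf), 0);
            if (n < (ssize_t)sizeof(im))
                    return;
            memcpy(&im, buf, sizeof(im));
            if (im.im_mbz != 0)
                    return;                 /* a real IGMP packet, not an upcall */
            if (im.im_msgtype == IGMPMSG_NOCACHE) {
                    /*
                     * Ask the routing protocol for a route for the flow
                     * (im.im_src, im.im_dst) and install the result with the
                     * MRT_ADD_MFC socket option.
                     */
            }
    }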
+
+/*
+ * IP multicast forwarding function. This function assumes that the packet
+ * pointed to by "ip" has arrived on (or is about to be sent to) the interface
+ * pointed to by "ifp", and the packet is to be relayed to other networks
+ * that have members of the packet's destination IP multicast group.
+ *
+ * The packet is returned unscathed to the caller, unless it is
+ * erroneous, in which case a non-zero return value tells the caller to
+ * discard it.
+ */
+
+#define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */
+
+static int
+X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
+ struct ip_moptions *imo)
+{
+ struct mfc *rt;
+ int error;
+ vifi_t vifi;
+
+ CTR3(KTR_IPMF, "ip_mforward: delete mfc orig %s group %lx ifp %p",
+ inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr), ifp);
+
+ if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
+ ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
+ /*
+ * Packet arrived via a physical interface or
+ * an encapsulated tunnel or a register_vif.
+ */
+ } else {
+ /*
+ * Packet arrived through a source-route tunnel.
+ * Source-route tunnels are no longer supported.
+ */
+ return (1);
+ }
+
+ VIF_LOCK();
+ MFC_LOCK();
+ if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) {
+ if (ip->ip_ttl < MAXTTL)
+ ip->ip_ttl++; /* compensate for -1 in *_send routines */
+ error = ip_mdq(m, ifp, NULL, vifi);
+ MFC_UNLOCK();
+ VIF_UNLOCK();
+ return error;
+ }
+
+ /*
+ * Don't forward a packet with time-to-live of zero or one,
+ * or a packet destined to a local-only group.
+ */
+ if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) {
+ MFC_UNLOCK();
+ VIF_UNLOCK();
+ return 0;
+ }
+
+ /*
+ * Determine forwarding vifs from the forwarding cache table
+ */
+ MRTSTAT_INC(mrts_mfc_lookups);
+ rt = mfc_find(&ip->ip_src, &ip->ip_dst);
+
+ /* Entry exists, so forward if necessary */
+ if (rt != NULL) {
+ error = ip_mdq(m, ifp, rt, -1);
+ MFC_UNLOCK();
+ VIF_UNLOCK();
+ return error;
+ } else {
+ /*
+ * If we don't have a route for the packet's origin, make a copy
+ * of the packet and send a message to the routing daemon.
+ */
+
+ struct mbuf *mb0;
+ struct rtdetq *rte;
+ u_long hash;
+ int hlen = ip->ip_hl << 2;
+
+ MRTSTAT_INC(mrts_mfc_misses);
+ MRTSTAT_INC(mrts_no_route);
+ CTR2(KTR_IPMF, "ip_mforward: no mfc for (%s,%lx)",
+ inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr));
+
+ /*
+ * Allocate mbufs early so that we don't do extra work if we are
+ * just going to fail anyway. Make sure to pullup the header so
+ * that other people can't step on it.
+ */
+ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE,
+ M_NOWAIT|M_ZERO);
+ if (rte == NULL) {
+ MFC_UNLOCK();
+ VIF_UNLOCK();
+ return ENOBUFS;
+ }
+
+ mb0 = m_copypacket(m, M_DONTWAIT);
+ if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen))
+ mb0 = m_pullup(mb0, hlen);
+ if (mb0 == NULL) {
+ free(rte, M_MRTABLE);
+ MFC_UNLOCK();
+ VIF_UNLOCK();
+ return ENOBUFS;
+ }
+
+ /* is there an upcall waiting for this flow ? */
+ hash = MFCHASH(ip->ip_src, ip->ip_dst);
+ LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
+ if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
+ in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
+ !TAILQ_EMPTY(&rt->mfc_stall))
+ break;
+ }
+
+ if (rt == NULL) {
+ int i;
+ struct igmpmsg *im;
+ struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
+ struct mbuf *mm;
+
+ /*
+ * Locate the vifi for the incoming interface for this packet.
+ * If none found, drop packet.
+ */
+ for (vifi = 0; vifi < V_numvifs &&
+ V_viftable[vifi].v_ifp != ifp; vifi++)
+ ;
+ if (vifi >= V_numvifs) /* vif not found, drop packet */
+ goto non_fatal;
+
+ /* no upcall, so make a new entry */
+ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
+ if (rt == NULL)
+ goto fail;
+
+ /* Make a copy of the header to send to the user level process */
+ mm = m_copy(mb0, 0, hlen);
+ if (mm == NULL)
+ goto fail1;
+
+ /*
+ * Send message to routing daemon to install
+ * a route into the kernel table
+ */
+
+ im = mtod(mm, struct igmpmsg *);
+ im->im_msgtype = IGMPMSG_NOCACHE;
+ im->im_mbz = 0;
+ im->im_vif = vifi;
+
+ MRTSTAT_INC(mrts_upcalls);
+
+ k_igmpsrc.sin_addr = ip->ip_src;
+ if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
+ CTR0(KTR_IPMF, "ip_mforward: socket queue full");
+ MRTSTAT_INC(mrts_upq_sockfull);
+fail1:
+ free(rt, M_MRTABLE);
+fail:
+ free(rte, M_MRTABLE);
+ m_freem(mb0);
+ MFC_UNLOCK();
+ VIF_UNLOCK();
+ return ENOBUFS;
+ }
+
+ /* insert new entry at head of hash chain */
+ rt->mfc_origin.s_addr = ip->ip_src.s_addr;
+ rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr;
+ rt->mfc_expire = UPCALL_EXPIRE;
+ V_nexpire[hash]++;
+ for (i = 0; i < V_numvifs; i++) {
+ rt->mfc_ttls[i] = 0;
+ rt->mfc_flags[i] = 0;
+ }
+ rt->mfc_parent = -1;
+
+ /* clear the RP address */
+ rt->mfc_rp.s_addr = INADDR_ANY;
+ rt->mfc_bw_meter = NULL;
+
+ /* initialize pkt counters per src-grp */
+ rt->mfc_pkt_cnt = 0;
+ rt->mfc_byte_cnt = 0;
+ rt->mfc_wrong_if = 0;
+ timevalclear(&rt->mfc_last_assert);
+
+ TAILQ_INIT(&rt->mfc_stall);
+ rt->mfc_nstall = 0;
+
+ /* link into table */
+ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
+ TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link);
+ rt->mfc_nstall++;
+
+ } else {
+ /* determine if queue has overflowed */
+ if (rt->mfc_nstall > MAX_UPQ) {
+ MRTSTAT_INC(mrts_upq_ovflw);
+non_fatal:
+ free(rte, M_MRTABLE);
+ m_freem(mb0);
+ MFC_UNLOCK();
+ VIF_UNLOCK();
+ return (0);
+ }
+ TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link);
+ rt->mfc_nstall++;
+ }
+
+ rte->m = mb0;
+ rte->ifp = ifp;
+
+ MFC_UNLOCK();
+ VIF_UNLOCK();
+
+ return 0;
+ }
+}
+
+/*
+ * Clean up the cache entry if upcall is not serviced
+ */
+static void
+expire_upcalls(void *arg)
+{
+ int i;
+
+ CURVNET_SET((struct vnet *) arg);
+
+ MFC_LOCK();
+
+ for (i = 0; i < mfchashsize; i++) {
+ struct mfc *rt, *nrt;
+
+ if (V_nexpire[i] == 0)
+ continue;
+
+ for (rt = LIST_FIRST(&V_mfchashtbl[i]); rt; rt = nrt) {
+ nrt = LIST_NEXT(rt, mfc_hash);
+
+ if (TAILQ_EMPTY(&rt->mfc_stall))
+ continue;
+
+ if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
+ continue;
+
+ /*
+ * free the bw_meter entries
+ */
+ while (rt->mfc_bw_meter != NULL) {
+ struct bw_meter *x = rt->mfc_bw_meter;
+
+ rt->mfc_bw_meter = x->bm_mfc_next;
+ free(x, M_BWMETER);
+ }
+
+ MRTSTAT_INC(mrts_cache_cleanups);
+ CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__,
+ (u_long)ntohl(rt->mfc_origin.s_addr),
+ (u_long)ntohl(rt->mfc_mcastgrp.s_addr));
+
+ expire_mfc(rt);
+ }
+ }
+
+ MFC_UNLOCK();
+
+ callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
+ curvnet);
+
+ CURVNET_RESTORE();
+}
+
+/*
+ * Packet forwarding routine once entry in the cache is made
+ */
+static int
+ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ vifi_t vifi;
+ int plen = ip->ip_len;
+
+ VIF_LOCK_ASSERT();
+
+ /*
+ * If xmt_vif is not -1, send on only the requested vif.
+ *
+ * (Since vifi_t is u_short, -1 becomes MAXUSHORT, which is > numvifs.)
+ */
+ if (xmt_vif < V_numvifs) {
+ if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER)
+ pim_register_send(ip, V_viftable + xmt_vif, m, rt);
+ else
+ phyint_send(ip, V_viftable + xmt_vif, m);
+ return 1;
+ }
+
+ /*
+ * Don't forward if it didn't arrive from the parent vif for its origin.
+ */
+ vifi = rt->mfc_parent;
+ if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) {
+ CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)",
+ __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp);
+ MRTSTAT_INC(mrts_wrong_if);
+ ++rt->mfc_wrong_if;
+ /*
+ * If we are doing PIM assert processing, send a message
+ * to the routing daemon.
+ *
+ * XXX: A PIM-SM router needs the WRONGVIF detection so it
+ * can complete the SPT switch, regardless of the type
+ * of the iif (broadcast media, GRE tunnel, etc).
+ */
+ if (V_pim_assert_enabled && (vifi < V_numvifs) &&
+ V_viftable[vifi].v_ifp) {
+
+ if (ifp == &V_multicast_register_if)
+ PIMSTAT_INC(pims_rcv_registers_wrongiif);
+
+ /* Get vifi for the incoming packet */
+ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp;
+ vifi++)
+ ;
+ if (vifi >= V_numvifs)
+ return 0; /* The iif is not found: ignore the packet. */
+
+ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF)
+ return 0; /* WRONGVIF disabled: ignore the packet */
+
+ if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) {
+ struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
+ struct igmpmsg *im;
+ int hlen = ip->ip_hl << 2;
+ struct mbuf *mm = m_copy(m, 0, hlen);
+
+ if (mm && (M_HASCL(mm) || mm->m_len < hlen))
+ mm = m_pullup(mm, hlen);
+ if (mm == NULL)
+ return ENOBUFS;
+
+ im = mtod(mm, struct igmpmsg *);
+ im->im_msgtype = IGMPMSG_WRONGVIF;
+ im->im_mbz = 0;
+ im->im_vif = vifi;
+
+ MRTSTAT_INC(mrts_upcalls);
+
+ k_igmpsrc.sin_addr = im->im_src;
+ if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
+ CTR1(KTR_IPMF, "%s: socket queue full", __func__);
+ MRTSTAT_INC(mrts_upq_sockfull);
+ return ENOBUFS;
+ }
+ }
+ }
+ return 0;
+ }
+
+
+ /* If I sourced this packet, it counts as output, else it was input. */
+ if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) {
+ V_viftable[vifi].v_pkt_out++;
+ V_viftable[vifi].v_bytes_out += plen;
+ } else {
+ V_viftable[vifi].v_pkt_in++;
+ V_viftable[vifi].v_bytes_in += plen;
+ }
+ rt->mfc_pkt_cnt++;
+ rt->mfc_byte_cnt += plen;
+
+ /*
+ * For each vif, decide if a copy of the packet should be forwarded.
+ * Forward if:
+ * - the ttl exceeds the vif's threshold
+ * - there are group members downstream on the interface
+ */
+ for (vifi = 0; vifi < V_numvifs; vifi++)
+ if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) {
+ V_viftable[vifi].v_pkt_out++;
+ V_viftable[vifi].v_bytes_out += plen;
+ if (V_viftable[vifi].v_flags & VIFF_REGISTER)
+ pim_register_send(ip, V_viftable + vifi, m, rt);
+ else
+ phyint_send(ip, V_viftable + vifi, m);
+ }
+
+ /*
+ * Perform upcall-related bw measuring.
+ */
+ if (rt->mfc_bw_meter != NULL) {
+ struct bw_meter *x;
+ struct timeval now;
+
+ microtime(&now);
+ MFC_LOCK_ASSERT();
+ for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
+ bw_meter_receive_packet(x, plen, &now);
+ }
+
+ return 0;
+}
+
+/*
+ * Check if a vif number is legal/ok. This is used by in_mcast.c.
+ */
+static int
+X_legal_vif_num(int vif)
+{
+ int ret;
+
+ ret = 0;
+ if (vif < 0)
+ return (ret);
+
+ VIF_LOCK();
+ if (vif < V_numvifs)
+ ret = 1;
+ VIF_UNLOCK();
+
+ return (ret);
+}
+
+/*
+ * Return the local address used by this vif
+ */
+static u_long
+X_ip_mcast_src(int vifi)
+{
+ in_addr_t addr;
+
+ addr = INADDR_ANY;
+ if (vifi < 0)
+ return (addr);
+
+ VIF_LOCK();
+ if (vifi < V_numvifs)
+ addr = V_viftable[vifi].v_lcl_addr.s_addr;
+ VIF_UNLOCK();
+
+ return (addr);
+}
+
+static void
+phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
+{
+ struct mbuf *mb_copy;
+ int hlen = ip->ip_hl << 2;
+
+ VIF_LOCK_ASSERT();
+
+ /*
+ * Make a new reference to the packet; make sure that
+ * the IP header is actually copied, not just referenced,
+ * so that ip_output() only scribbles on the copy.
+ */
+ mb_copy = m_copypacket(m, M_DONTWAIT);
+ if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen))
+ mb_copy = m_pullup(mb_copy, hlen);
+ if (mb_copy == NULL)
+ return;
+
+ send_packet(vifp, mb_copy);
+}
+
+static void
+send_packet(struct vif *vifp, struct mbuf *m)
+{
+ struct ip_moptions imo;
+ struct in_multi *imm[2];
+ int error;
+
+ VIF_LOCK_ASSERT();
+
+ imo.imo_multicast_ifp = vifp->v_ifp;
+ imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
+ imo.imo_multicast_loop = 1;
+ imo.imo_multicast_vif = -1;
+ imo.imo_num_memberships = 0;
+ imo.imo_max_memberships = 2;
+ imo.imo_membership = &imm[0];
+
+ /*
+ * Re-entrancy should not be a problem here, because
+ * the packets that we send out and are looped back at us
+ * should get rejected because they appear to come from
+ * the loopback interface, thus preventing looping.
+ */
+ error = ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, &imo, NULL);
+ CTR3(KTR_IPMF, "%s: vif %td err %d", __func__,
+ (ptrdiff_t)(vifp - V_viftable), error);
+}
+
+/*
+ * Stubs for old RSVP socket shim implementation.
+ */
+
+static int
+X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static void
+X_ip_rsvp_force_done(struct socket *so __unused)
+{
+
+}
+
+static void
+X_rsvp_input(struct mbuf *m, int off __unused)
+{
+
+ if (!V_rsvp_on)
+ m_freem(m);
+}
+
+/*
+ * Code for bandwidth monitors
+ */
+
+/*
+ * Define common interface for timeval-related methods
+ */
+#define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp)
+#define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp))
+#define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp))
+
+static uint32_t
+compute_bw_meter_flags(struct bw_upcall *req)
+{
+ uint32_t flags = 0;
+
+ if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
+ flags |= BW_METER_UNIT_PACKETS;
+ if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
+ flags |= BW_METER_UNIT_BYTES;
+ if (req->bu_flags & BW_UPCALL_GEQ)
+ flags |= BW_METER_GEQ;
+ if (req->bu_flags & BW_UPCALL_LEQ)
+ flags |= BW_METER_LEQ;
+
+ return flags;
+}
+
+/*
+ * Add a bw_meter entry
+ */
+static int
+add_bw_upcall(struct bw_upcall *req)
+{
+ struct mfc *mfc;
+ struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
+ BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
+ struct timeval now;
+ struct bw_meter *x;
+ uint32_t flags;
+
+ if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
+ return EOPNOTSUPP;
+
+ /* Test if the flags are valid */
+ if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
+ return EINVAL;
+ if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
+ return EINVAL;
+ if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
+ == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
+ return EINVAL;
+
+ /* Test if the threshold time interval is valid */
+ if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
+ return EINVAL;
+
+ flags = compute_bw_meter_flags(req);
+
+ /*
+ * Check whether the same bw_meter entry has already been installed.
+ */
+ MFC_LOCK();
+ mfc = mfc_find(&req->bu_src, &req->bu_dst);
+ if (mfc == NULL) {
+ MFC_UNLOCK();
+ return EADDRNOTAVAIL;
+ }
+ for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
+ if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
+ &req->bu_threshold.b_time, ==)) &&
+ (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
+ (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
+ (x->bm_flags & BW_METER_USER_FLAGS) == flags) {
+ MFC_UNLOCK();
+ return 0; /* XXX Already installed */
+ }
+ }
+
+ /* Allocate the new bw_meter entry */
+ x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
+ if (x == NULL) {
+ MFC_UNLOCK();
+ return ENOBUFS;
+ }
+
+ /* Set the new bw_meter entry */
+ x->bm_threshold.b_time = req->bu_threshold.b_time;
+ microtime(&now);
+ x->bm_start_time = now;
+ x->bm_threshold.b_packets = req->bu_threshold.b_packets;
+ x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
+ x->bm_measured.b_packets = 0;
+ x->bm_measured.b_bytes = 0;
+ x->bm_flags = flags;
+ x->bm_time_next = NULL;
+ x->bm_time_hash = BW_METER_BUCKETS;
+
+ /* Add the new bw_meter entry to the front of entries for this MFC */
+ x->bm_mfc = mfc;
+ x->bm_mfc_next = mfc->mfc_bw_meter;
+ mfc->mfc_bw_meter = x;
+ schedule_bw_meter(x, &now);
+ MFC_UNLOCK();
+
+ return 0;
+}
+
+static void
+free_bw_list(struct bw_meter *list)
+{
+ while (list != NULL) {
+ struct bw_meter *x = list;
+
+ list = list->bm_mfc_next;
+ unschedule_bw_meter(x);
+ free(x, M_BWMETER);
+ }
+}
+
+/*
+ * Delete one or multiple bw_meter entries
+ */
+static int
+del_bw_upcall(struct bw_upcall *req)
+{
+ struct mfc *mfc;
+ struct bw_meter *x;
+
+ if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
+ return EOPNOTSUPP;
+
+ MFC_LOCK();
+
+ /* Find the corresponding MFC entry */
+ mfc = mfc_find(&req->bu_src, &req->bu_dst);
+ if (mfc == NULL) {
+ MFC_UNLOCK();
+ return EADDRNOTAVAIL;
+ } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
+ /*
+ * Delete all bw_meter entries for this mfc
+ */
+ struct bw_meter *list;
+
+ list = mfc->mfc_bw_meter;
+ mfc->mfc_bw_meter = NULL;
+ free_bw_list(list);
+ MFC_UNLOCK();
+ return 0;
+ } else { /* Delete a single bw_meter entry */
+ struct bw_meter *prev;
+ uint32_t flags = 0;
+
+ flags = compute_bw_meter_flags(req);
+
+ /* Find the bw_meter entry to delete */
+ for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
+ prev = x, x = x->bm_mfc_next) {
+ if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
+ &req->bu_threshold.b_time, ==)) &&
+ (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
+ (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
+ (x->bm_flags & BW_METER_USER_FLAGS) == flags)
+ break;
+ }
+ if (x != NULL) { /* Delete entry from the list for this MFC */
+ if (prev != NULL)
+ prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/
+ else
+ x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
+
+ unschedule_bw_meter(x);
+ MFC_UNLOCK();
+ /* Free the bw_meter entry */
+ free(x, M_BWMETER);
+ return 0;
+ } else {
+ MFC_UNLOCK();
+ return EINVAL;
+ }
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * Perform bandwidth measurement processing that may result in an upcall
+ */
+static void
+bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
+{
+ struct timeval delta;
+
+ MFC_LOCK_ASSERT();
+
+ delta = *nowp;
+ BW_TIMEVALDECR(&delta, &x->bm_start_time);
+
+ if (x->bm_flags & BW_METER_GEQ) {
+ /*
+ * Processing for ">=" type of bw_meter entry
+ */
+ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
+ /* Reset the bw_meter entry */
+ x->bm_start_time = *nowp;
+ x->bm_measured.b_packets = 0;
+ x->bm_measured.b_bytes = 0;
+ x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
+ }
+
+ /* Record that a packet is received */
+ x->bm_measured.b_packets++;
+ x->bm_measured.b_bytes += plen;
+
+ /*
+ * Test if we should deliver an upcall
+ */
+ if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
+ if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
+ (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
+ ((x->bm_flags & BW_METER_UNIT_BYTES) &&
+ (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
+ /* Prepare an upcall for delivery */
+ bw_meter_prepare_upcall(x, nowp);
+ x->bm_flags |= BW_METER_UPCALL_DELIVERED;
+ }
+ }
+ } else if (x->bm_flags & BW_METER_LEQ) {
+ /*
+ * Processing for "<=" type of bw_meter entry
+ */
+ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
+ /*
+ * We are behind time with the multicast forwarding table
+ * scanning for "<=" type of bw_meter entries, so test now
+ * if we should deliver an upcall.
+ */
+ if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
+ (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
+ ((x->bm_flags & BW_METER_UNIT_BYTES) &&
+ (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
+ /* Prepare an upcall for delivery */
+ bw_meter_prepare_upcall(x, nowp);
+ }
+ /* Reschedule the bw_meter entry */
+ unschedule_bw_meter(x);
+ schedule_bw_meter(x, nowp);
+ }
+
+ /* Record that a packet is received */
+ x->bm_measured.b_packets++;
+ x->bm_measured.b_bytes += plen;
+
+ /*
+ * Test if we should restart the measuring interval
+ */
+ if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
+ x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
+ (x->bm_flags & BW_METER_UNIT_BYTES &&
+ x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
+ /* Don't restart the measuring interval */
+ } else {
+ /* Do restart the measuring interval */
+ /*
+ * XXX: note that we don't unschedule and schedule, because this
+ * might be too much overhead per packet. Instead, when we process
+ * all entries for a given timer hash bin, we check whether it is
+ * really a timeout. If not, we reschedule at that time.
+ */
+ x->bm_start_time = *nowp;
+ x->bm_measured.b_packets = 0;
+ x->bm_measured.b_bytes = 0;
+ x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
+ }
+ }
+}
+
+/*
+ * Prepare a bandwidth-related upcall
+ */
+static void
+bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
+{
+ struct timeval delta;
+ struct bw_upcall *u;
+
+ MFC_LOCK_ASSERT();
+
+ /*
+ * Compute the measured time interval
+ */
+ delta = *nowp;
+ BW_TIMEVALDECR(&delta, &x->bm_start_time);
+
+ /*
+ * If there are too many pending upcalls, deliver them now
+ */
+ if (V_bw_upcalls_n >= BW_UPCALLS_MAX)
+ bw_upcalls_send();
+
+ /*
+ * Set the bw_upcall entry
+ */
+ u = &V_bw_upcalls[V_bw_upcalls_n++];
+ u->bu_src = x->bm_mfc->mfc_origin;
+ u->bu_dst = x->bm_mfc->mfc_mcastgrp;
+ u->bu_threshold.b_time = x->bm_threshold.b_time;
+ u->bu_threshold.b_packets = x->bm_threshold.b_packets;
+ u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
+ u->bu_measured.b_time = delta;
+ u->bu_measured.b_packets = x->bm_measured.b_packets;
+ u->bu_measured.b_bytes = x->bm_measured.b_bytes;
+ u->bu_flags = 0;
+ if (x->bm_flags & BW_METER_UNIT_PACKETS)
+ u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
+ if (x->bm_flags & BW_METER_UNIT_BYTES)
+ u->bu_flags |= BW_UPCALL_UNIT_BYTES;
+ if (x->bm_flags & BW_METER_GEQ)
+ u->bu_flags |= BW_UPCALL_GEQ;
+ if (x->bm_flags & BW_METER_LEQ)
+ u->bu_flags |= BW_UPCALL_LEQ;
+}
+
+/*
+ * Send the pending bandwidth-related upcalls
+ */
+static void
+bw_upcalls_send(void)
+{
+ struct mbuf *m;
+ int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]);
+ struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
+ static struct igmpmsg igmpmsg = { 0, /* unused1 */
+ 0, /* unused2 */
+ IGMPMSG_BW_UPCALL,/* im_msgtype */
+ 0, /* im_mbz */
+ 0, /* im_vif */
+ 0, /* unused3 */
+ { 0 }, /* im_src */
+ { 0 } }; /* im_dst */
+
+ MFC_LOCK_ASSERT();
+
+ if (V_bw_upcalls_n == 0)
+ return; /* No pending upcalls */
+
+ V_bw_upcalls_n = 0;
+
+ /*
+ * Allocate a new mbuf, initialize it with the header and
+ * the payload for the pending calls.
+ */
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (m == NULL) {
+ log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
+ return;
+ }
+
+ m->m_len = m->m_pkthdr.len = 0;
+ m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
+ m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]);
+
+ /*
+ * Send the upcalls
+ * XXX do we need to set the address in k_igmpsrc ?
+ */
+ MRTSTAT_INC(mrts_upcalls);
+ if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) {
+ log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
+ MRTSTAT_INC(mrts_upq_sockfull);
+ }
+}
+
+/*
+ * Compute the timeout hash value for the bw_meter entries
+ */
+#define BW_METER_TIMEHASH(bw_meter, hash) \
+ do { \
+ struct timeval next_timeval = (bw_meter)->bm_start_time; \
+ \
+ BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
+ (hash) = next_timeval.tv_sec; \
+ if (next_timeval.tv_usec) \
+ (hash)++; /* XXX: make sure we don't timeout early */ \
+ (hash) %= BW_METER_BUCKETS; \
+ } while (0)
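+
+/*
+ * Illustrative worked example (not part of the original sources): with
+ * bm_start_time = 10.5s and a threshold b_time of 3s, the interval ends at
+ * 13.5s; the non-zero tv_usec is rounded up to 14 so the entry cannot time
+ * out early, and the bucket index becomes 14 % BW_METER_BUCKETS.
+ */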
+
+/*
+ * Schedule a timer to process periodically bw_meter entry of type "<="
+ * by linking the entry in the proper hash bucket.
+ */
+static void
+schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
+{
+ int time_hash;
+
+ MFC_LOCK_ASSERT();
+
+ if (!(x->bm_flags & BW_METER_LEQ))
+ return; /* XXX: we schedule timers only for "<=" entries */
+
+ /*
+ * Reset the bw_meter entry
+ */
+ x->bm_start_time = *nowp;
+ x->bm_measured.b_packets = 0;
+ x->bm_measured.b_bytes = 0;
+ x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
+
+ /*
+ * Compute the timeout hash value and insert the entry
+ */
+ BW_METER_TIMEHASH(x, time_hash);
+ x->bm_time_next = V_bw_meter_timers[time_hash];
+ V_bw_meter_timers[time_hash] = x;
+ x->bm_time_hash = time_hash;
+}
+
+/*
+ * Unschedule the periodic timer that processes a bw_meter entry of type "<="
+ * by removing the entry from the proper hash bucket.
+ */
+static void
+unschedule_bw_meter(struct bw_meter *x)
+{
+ int time_hash;
+ struct bw_meter *prev, *tmp;
+
+ MFC_LOCK_ASSERT();
+
+ if (!(x->bm_flags & BW_METER_LEQ))
+ return; /* XXX: we schedule timers only for "<=" entries */
+
+ /*
+ * Compute the timeout hash value and delete the entry
+ */
+ time_hash = x->bm_time_hash;
+ if (time_hash >= BW_METER_BUCKETS)
+ return; /* Entry was not scheduled */
+
+ for (prev = NULL, tmp = V_bw_meter_timers[time_hash];
+ tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
+ if (tmp == x)
+ break;
+
+ if (tmp == NULL)
+ panic("unschedule_bw_meter: bw_meter entry not found");
+
+ if (prev != NULL)
+ prev->bm_time_next = x->bm_time_next;
+ else
+ V_bw_meter_timers[time_hash] = x->bm_time_next;
+
+ x->bm_time_next = NULL;
+ x->bm_time_hash = BW_METER_BUCKETS;
+}
+
+
+/*
+ * Process all "<=" type bw_meter entries that should be processed now,
+ * and for each entry prepare an upcall if necessary. Each processed
+ * entry is rescheduled again for the (periodic) processing.
+ *
+ * This is run periodically (once per second normally). On each round,
+ * all the potentially matching entries are in the hash slot that we are
+ * looking at.
+ */
+static void
+bw_meter_process()
+{
+ uint32_t loops;
+ int i;
+ struct timeval now, process_endtime;
+
+ microtime(&now);
+ if (V_last_tv_sec == now.tv_sec)
+ return; /* nothing to do */
+
+ loops = now.tv_sec - V_last_tv_sec;
+ V_last_tv_sec = now.tv_sec;
+ if (loops > BW_METER_BUCKETS)
+ loops = BW_METER_BUCKETS;
+
+ MFC_LOCK();
+ /*
+ * Process all bins of bw_meter entries from the one after the last
+ * processed to the current one. On entry, i points to the last bucket
+ * visited, so we need to increment i at the beginning of the loop.
+ */
+ for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
+ struct bw_meter *x, *tmp_list;
+
+ if (++i >= BW_METER_BUCKETS)
+ i = 0;
+
+ /* Disconnect the list of bw_meter entries from the bin */
+ tmp_list = V_bw_meter_timers[i];
+ V_bw_meter_timers[i] = NULL;
+
+ /* Process the list of bw_meter entries */
+ while (tmp_list != NULL) {
+ x = tmp_list;
+ tmp_list = tmp_list->bm_time_next;
+
+ /* Test if the time interval is over */
+ process_endtime = x->bm_start_time;
+ BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
+ if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
+ /* Not yet: reschedule, but don't reset */
+ int time_hash;
+
+ BW_METER_TIMEHASH(x, time_hash);
+ if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
+ /*
+ * XXX: somehow the bin processing is a bit ahead of time.
+ * Put the entry in the next bin.
+ */
+ if (++time_hash >= BW_METER_BUCKETS)
+ time_hash = 0;
+ }
+ x->bm_time_next = V_bw_meter_timers[time_hash];
+ V_bw_meter_timers[time_hash] = x;
+ x->bm_time_hash = time_hash;
+
+ continue;
+ }
+
+ /*
+ * Test if we should deliver an upcall
+ */
+ if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
+ (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
+ ((x->bm_flags & BW_METER_UNIT_BYTES) &&
+ (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
+ /* Prepare an upcall for delivery */
+ bw_meter_prepare_upcall(x, &now);
+ }
+
+ /*
+ * Reschedule for next processing
+ */
+ schedule_bw_meter(x, &now);
+ }
+ }
+
+ /* Send all upcalls that are pending delivery */
+ bw_upcalls_send();
+
+ MFC_UNLOCK();
+}
+
+/*
+ * A periodic function for sending all upcalls that are pending delivery
+ */
+static void
+expire_bw_upcalls_send(void *arg)
+{
+ CURVNET_SET((struct vnet *) arg);
+
+ MFC_LOCK();
+ bw_upcalls_send();
+ MFC_UNLOCK();
+
+ callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
+ curvnet);
+ CURVNET_RESTORE();
+}
+
+/*
+ * A periodic function for periodic scanning of the multicast forwarding
+ * table for processing all "<=" bw_meter entries.
+ */
+static void
+expire_bw_meter_process(void *arg)
+{
+ CURVNET_SET((struct vnet *) arg);
+
+ if (V_mrt_api_config & MRT_MFC_BW_UPCALL)
+ bw_meter_process();
+
+ callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process,
+ curvnet);
+ CURVNET_RESTORE();
+}
+
+/*
+ * End of bandwidth monitoring code
+ */
+
+/*
+ * Send the packet up to the user-level daemon or, if a rendezvous point is
+ * configured, do the PIM Register encapsulation in the kernel.
+ */
+static int
+pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m,
+ struct mfc *rt)
+{
+ struct mbuf *mb_copy, *mm;
+
+ /*
+ * Do not send IGMP_WHOLEPKT notifications to userland if the
+ * rendezvous point was unspecified and we were told not to.
+ */
+ if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) &&
+ in_nullhost(rt->mfc_rp))
+ return 0;
+
+ mb_copy = pim_register_prepare(ip, m);
+ if (mb_copy == NULL)
+ return ENOBUFS;
+
+ /*
+ * Send all the fragments. Note that the mbuf for each fragment
+ * is freed by the sending machinery.
+ */
+ for (mm = mb_copy; mm; mm = mb_copy) {
+ mb_copy = mm->m_nextpkt;
+ mm->m_nextpkt = 0;
+ mm = m_pullup(mm, sizeof(struct ip));
+ if (mm != NULL) {
+ ip = mtod(mm, struct ip *);
+ if ((V_mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) {
+ pim_register_send_rp(ip, vifp, mm, rt);
+ } else {
+ pim_register_send_upcall(ip, vifp, mm, rt);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Return a copy of the data packet that is ready for PIM Register
+ * encapsulation.
+ * XXX: Note that in the returned copy the IP header is a valid one.
+ */
+static struct mbuf *
+pim_register_prepare(struct ip *ip, struct mbuf *m)
+{
+ struct mbuf *mb_copy = NULL;
+ int mtu;
+
+ /* Take care of delayed checksums */
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ in_delayed_cksum(m);
+ m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+
+ /*
+ * Copy the old packet & pullup its IP header into the
+ * new mbuf so we can modify it.
+ */
+ mb_copy = m_copypacket(m, M_DONTWAIT);
+ if (mb_copy == NULL)
+ return NULL;
+ mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
+ if (mb_copy == NULL)
+ return NULL;
+
+ /* take care of the TTL */
+ ip = mtod(mb_copy, struct ip *);
+ --ip->ip_ttl;
+
+ /* Compute the MTU after the PIM Register encapsulation */
+ mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
+
+ if (ip->ip_len <= mtu) {
+ /* Turn the IP header into a valid one */
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+ ip->ip_sum = 0;
+ ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
+ } else {
+ /* Fragment the packet */
+ if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) {
+ m_freem(mb_copy);
+ return NULL;
+ }
+ }
+ return mb_copy;
+}
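+
+/*
+ * Illustrative note (not part of the original sources): assuming the usual
+ * 20-byte struct ip for pim_encap_iphdr and an 8-byte pim_encap_pimhdr, the
+ * MTU computed above is 0xffff - 20 - 8 = 65507 bytes; larger packets are
+ * handed to ip_fragment() before encapsulation.
+ */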
+
+/*
+ * Send an upcall with the data packet to the user-level process.
+ */
+static int
+pim_register_send_upcall(struct ip *ip, struct vif *vifp,
+ struct mbuf *mb_copy, struct mfc *rt)
+{
+ struct mbuf *mb_first;
+ int len = ntohs(ip->ip_len);
+ struct igmpmsg *im;
+ struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
+
+ VIF_LOCK_ASSERT();
+
+ /*
+ * Add a new mbuf with an upcall header
+ */
+ MGETHDR(mb_first, M_DONTWAIT, MT_DATA);
+ if (mb_first == NULL) {
+ m_freem(mb_copy);
+ return ENOBUFS;
+ }
+ mb_first->m_data += max_linkhdr;
+ mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
+ mb_first->m_len = sizeof(struct igmpmsg);
+ mb_first->m_next = mb_copy;
+
+ /* Send message to routing daemon */
+ im = mtod(mb_first, struct igmpmsg *);
+ im->im_msgtype = IGMPMSG_WHOLEPKT;
+ im->im_mbz = 0;
+ im->im_vif = vifp - V_viftable;
+ im->im_src = ip->ip_src;
+ im->im_dst = ip->ip_dst;
+
+ k_igmpsrc.sin_addr = ip->ip_src;
+
+ MRTSTAT_INC(mrts_upcalls);
+
+ if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) {
+ CTR1(KTR_IPMF, "%s: socket queue full", __func__);
+ MRTSTAT_INC(mrts_upq_sockfull);
+ return ENOBUFS;
+ }
+
+ /* Keep statistics */
+ PIMSTAT_INC(pims_snd_registers_msgs);
+ PIMSTAT_ADD(pims_snd_registers_bytes, len);
+
+ return 0;
+}
+
+/*
+ * Encapsulate the data packet in PIM Register message and send it to the RP.
+ */
+static int
+pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy,
+ struct mfc *rt)
+{
+ struct mbuf *mb_first;
+ struct ip *ip_outer;
+ struct pim_encap_pimhdr *pimhdr;
+ int len = ntohs(ip->ip_len);
+ vifi_t vifi = rt->mfc_parent;
+
+ VIF_LOCK_ASSERT();
+
+ if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) {
+ m_freem(mb_copy);
+ return EADDRNOTAVAIL; /* The iif vif is invalid */
+ }
+
+ /*
+ * Add a new mbuf with the encapsulating header
+ */
+ MGETHDR(mb_first, M_DONTWAIT, MT_DATA);
+ if (mb_first == NULL) {
+ m_freem(mb_copy);
+ return ENOBUFS;
+ }
+ mb_first->m_data += max_linkhdr;
+ mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
+ mb_first->m_next = mb_copy;
+
+ mb_first->m_pkthdr.len = len + mb_first->m_len;
+
+ /*
+ * Fill in the encapsulating IP and PIM header
+ */
+ ip_outer = mtod(mb_first, struct ip *);
+ *ip_outer = pim_encap_iphdr;
+ ip_outer->ip_id = ip_newid();
+ ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
+ ip_outer->ip_src = V_viftable[vifi].v_lcl_addr;
+ ip_outer->ip_dst = rt->mfc_rp;
+ /*
+ * Copy the inner header TOS to the outer header, and take care of the
+ * IP_DF bit.
+ */
+ ip_outer->ip_tos = ip->ip_tos;
+ if (ntohs(ip->ip_off) & IP_DF)
+ ip_outer->ip_off |= IP_DF;
+ pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
+ + sizeof(pim_encap_iphdr));
+ *pimhdr = pim_encap_pimhdr;
+ /* If the iif crosses a border, set the Border-bit */
+ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config)
+ pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
+
+ mb_first->m_data += sizeof(pim_encap_iphdr);
+ pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
+ mb_first->m_data -= sizeof(pim_encap_iphdr);
+
+ send_packet(vifp, mb_first);
+
+ /* Keep statistics */
+ PIMSTAT_INC(pims_snd_registers_msgs);
+ PIMSTAT_ADD(pims_snd_registers_bytes, len);
+
+ return 0;
+}
+
+/*
+ * pim_encapcheck() is called by the encap4_input() path at runtime to
+ * determine if a packet is for PIM, allowing PIM to be dynamically loaded
+ * into the kernel.
+ */
+static int
+pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
+{
+
+#ifdef DIAGNOSTIC
+ KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM"));
+#endif
+ if (proto != IPPROTO_PIM)
+ return 0; /* not for us; reject the datagram. */
+
+ return 64; /* claim the datagram. */
+}
+
+/*
+ * PIM-SMv2 and PIM-DM messages processing.
+ * Receives and verifies the PIM control messages, and passes them
+ * up to the listening socket, using rip_input().
+ * The only message with special processing is the PIM_REGISTER message
+ * (used by PIM-SM): the PIM header is stripped off, and the inner packet
+ * is passed to if_simloop().
+ */
+void
+pim_input(struct mbuf *m, int off)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ struct pim *pim;
+ int minlen;
+ int datalen = ip->ip_len;
+ int ip_tos;
+ int iphlen = off;
+
+ /* Keep statistics */
+ PIMSTAT_INC(pims_rcv_total_msgs);
+ PIMSTAT_ADD(pims_rcv_total_bytes, datalen);
+
+ /*
+ * Validate lengths
+ */
+ if (datalen < PIM_MINLEN) {
+ PIMSTAT_INC(pims_rcv_tooshort);
+ CTR3(KTR_IPMF, "%s: short packet (%d) from %s",
+ __func__, datalen, inet_ntoa(ip->ip_src));
+ m_freem(m);
+ return;
+ }
+
+ /*
+ * If the packet is at least as big as a REGISTER, go ahead
+ * and grab the PIM REGISTER header size, to avoid another
+ * possible m_pullup() later.
+ *
+ * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8
+ * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
+ */
+ minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
+ /*
+ * Get the IP and PIM headers in contiguous memory, and
+ * possibly the PIM REGISTER header.
+ */
+ if ((m->m_flags & M_EXT || m->m_len < minlen) &&
+ (m = m_pullup(m, minlen)) == 0) {
+ CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__);
+ return;
+ }
+
+ /* m_pullup() may have given us a new mbuf so reset ip. */
+ ip = mtod(m, struct ip *);
+ ip_tos = ip->ip_tos;
+
+ /* adjust mbuf to point to the PIM header */
+ m->m_data += iphlen;
+ m->m_len -= iphlen;
+ pim = mtod(m, struct pim *);
+
+ /*
+ * Validate checksum. If PIM REGISTER, exclude the data packet.
+ *
+ * XXX: some older PIMv2 implementations don't make this distinction,
+ * so for compatibility reasons perform the checksum over part of the
+ * message, and if that fails, then over the whole message.
+ */
+ if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
+ /* do nothing, checksum okay */
+ } else if (in_cksum(m, datalen)) {
+ PIMSTAT_INC(pims_rcv_badsum);
+ CTR1(KTR_IPMF, "%s: invalid checksum", __func__);
+ m_freem(m);
+ return;
+ }
+
+ /* PIM version check */
+ if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
+ PIMSTAT_INC(pims_rcv_badversion);
+ CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__,
+ (int)PIM_VT_V(pim->pim_vt), PIM_VERSION);
+ m_freem(m);
+ return;
+ }
+
+ /* restore mbuf back to the outer IP */
+ m->m_data -= iphlen;
+ m->m_len += iphlen;
+
+ if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
+ /*
+ * Since this is a REGISTER, we'll make a copy of the register
+ * headers ip + pim + u_int32 + encap_ip, to be passed up to the
+ * routing daemon.
+ */
+ struct sockaddr_in dst = { sizeof(dst), AF_INET };
+ struct mbuf *mcp;
+ struct ip *encap_ip;
+ u_int32_t *reghdr;
+ struct ifnet *vifp;
+
+ VIF_LOCK();
+ if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) {
+ VIF_UNLOCK();
+ CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__,
+ (int)V_reg_vif_num);
+ m_freem(m);
+ return;
+ }
+ /* XXX need refcnt? */
+ vifp = V_viftable[V_reg_vif_num].v_ifp;
+ VIF_UNLOCK();
+
+ /*
+ * Validate length
+ */
+ if (datalen < PIM_REG_MINLEN) {
+ PIMSTAT_INC(pims_rcv_tooshort);
+ PIMSTAT_INC(pims_rcv_badregisters);
+ CTR1(KTR_IPMF, "%s: register packet size too small", __func__);
+ m_freem(m);
+ return;
+ }
+
+ reghdr = (u_int32_t *)(pim + 1);
+ encap_ip = (struct ip *)(reghdr + 1);
+
+ CTR3(KTR_IPMF, "%s: register: encap ip src %s len %d",
+ __func__, inet_ntoa(encap_ip->ip_src), ntohs(encap_ip->ip_len));
+
+ /* verify the version number of the inner packet */
+ if (encap_ip->ip_v != IPVERSION) {
+ PIMSTAT_INC(pims_rcv_badregisters);
+ CTR1(KTR_IPMF, "%s: bad encap ip version", __func__);
+ m_freem(m);
+ return;
+ }
+
+ /* verify the inner packet is destined to a mcast group */
+ if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) {
+ PIMSTAT_INC(pims_rcv_badregisters);
+ CTR2(KTR_IPMF, "%s: bad encap ip dest %s", __func__,
+ inet_ntoa(encap_ip->ip_dst));
+ m_freem(m);
+ return;
+ }
+
+ /* If a NULL_REGISTER, pass it to the daemon */
+ if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
+ goto pim_input_to_daemon;
+
+ /*
+ * Copy the TOS from the outer IP header to the inner IP header.
+ */
+ if (encap_ip->ip_tos != ip_tos) {
+ /* Outer TOS -> inner TOS */
+ encap_ip->ip_tos = ip_tos;
+ /* Recompute the inner header checksum. Sigh... */
+
+ /* adjust mbuf to point to the inner IP header */
+ m->m_data += (iphlen + PIM_MINLEN);
+ m->m_len -= (iphlen + PIM_MINLEN);
+
+ encap_ip->ip_sum = 0;
+ encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
+
+ /* restore mbuf to point back to the outer IP header */
+ m->m_data -= (iphlen + PIM_MINLEN);
+ m->m_len += (iphlen + PIM_MINLEN);
+ }
+
+ /*
+ * Decapsulate the inner IP packet and loopback to forward it
+ * as a normal multicast packet. Also, make a copy of the
+ * outer_iphdr + pimhdr + reghdr + encap_iphdr
+ * to pass to the daemon later, so it can take the appropriate
+ * actions (e.g., send back PIM_REGISTER_STOP).
+ * XXX: here m->m_data points to the outer IP header.
+ */
+ mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN);
+ if (mcp == NULL) {
+ CTR1(KTR_IPMF, "%s: m_copy() failed", __func__);
+ m_freem(m);
+ return;
+ }
+
+ /* Keep statistics */
+ /* XXX: registers_bytes include only the encap. mcast pkt */
+ PIMSTAT_INC(pims_rcv_registers_msgs);
+ PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len));
+
+ /*
+ * forward the inner ip packet; point m_data at the inner ip.
+ */
+ m_adj(m, iphlen + PIM_MINLEN);
+
+ CTR4(KTR_IPMF,
+ "%s: forward decap'd REGISTER: src %lx dst %lx vif %d",
+ __func__,
+ (u_long)ntohl(encap_ip->ip_src.s_addr),
+ (u_long)ntohl(encap_ip->ip_dst.s_addr),
+ (int)V_reg_vif_num);
+
+ /* NB: vifp was collected above; can it change on us? */
+ if_simloop(vifp, m, dst.sin_family, 0);
+
+ /* prepare the register head to send to the mrouting daemon */
+ m = mcp;
+ }
+
+pim_input_to_daemon:
+ /*
+ * Pass the PIM message up to the daemon; if it is a Register message,
+ * pass the 'head' only up to the daemon. This includes the
+ * outer IP header, PIM header, PIM-Register header and the
+ * inner IP header.
+ * XXX: the outer IP header pkt size of a Register is not adjusted to
+ * reflect the fact that the inner multicast data is truncated.
+ */
+ rip_input(m, iphlen);
+
+ return;
+}
+
+static int
+sysctl_mfctable(SYSCTL_HANDLER_ARGS)
+{
+ struct mfc *rt;
+ int error, i;
+
+ if (req->newptr)
+ return (EPERM);
+ if (V_mfchashtbl == NULL) /* XXX unlocked */
+ return (0);
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error)
+ return (error);
+
+ MFC_LOCK();
+ for (i = 0; i < mfchashsize; i++) {
+ LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) {
+ error = SYSCTL_OUT(req, rt, sizeof(struct mfc));
+ if (error)
+ goto out_locked;
+ }
+ }
+out_locked:
+ MFC_UNLOCK();
+ return (error);
+}
+
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, sysctl_mfctable,
+ "IPv4 Multicast Forwarding Table (struct *mfc[mfchashsize], "
+ "netinet/ip_mroute.h)");
+
+static void
+vnet_mroute_init(const void *unused __unused)
+{
+
+ MALLOC(V_nexpire, u_char *, mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO);
+ bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers));
+ callout_init(&V_expire_upcalls_ch, CALLOUT_MPSAFE);
+ callout_init(&V_bw_upcalls_ch, CALLOUT_MPSAFE);
+ callout_init(&V_bw_meter_ch, CALLOUT_MPSAFE);
+}
+
+VNET_SYSINIT(vnet_mroute_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, vnet_mroute_init,
+ NULL);
+
+static void
+vnet_mroute_uninit(const void *unused __unused)
+{
+
+ FREE(V_nexpire, M_MRTABLE);
+ V_nexpire = NULL;
+}
+
+VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE,
+ vnet_mroute_uninit, NULL);
+
+static int
+ip_mroute_modevent(module_t mod, int type, void *unused)
+{
+
+ switch (type) {
+ case MOD_LOAD:
+ MROUTER_LOCK_INIT();
+
+ if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
+ if_detached_event, NULL, EVENTHANDLER_PRI_ANY);
+ if (if_detach_event_tag == NULL) {
+ printf("ip_mroute: unable to ifnet_deperture_even handler\n");
+ MROUTER_LOCK_DESTROY();
+ return (EINVAL);
+ }
+
+ MFC_LOCK_INIT();
+ VIF_LOCK_INIT();
+
+ mfchashsize = MFCHASHSIZE;
+#ifndef __rtems__
+ if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) &&
+ !powerof2(mfchashsize)) {
+ printf("WARNING: %s not a power of 2; using default\n",
+ "net.inet.ip.mfchashsize");
+ mfchashsize = MFCHASHSIZE;
+ }
+#endif
+
+ pim_squelch_wholepkt = 0;
+ TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt",
+ &pim_squelch_wholepkt);
+
+ pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM,
+ pim_encapcheck, &in_pim_protosw, NULL);
+ if (pim_encap_cookie == NULL) {
+ printf("ip_mroute: unable to attach pim encap\n");
+ VIF_LOCK_DESTROY();
+ MFC_LOCK_DESTROY();
+ MROUTER_LOCK_DESTROY();
+ return (EINVAL);
+ }
+
+ ip_mcast_src = X_ip_mcast_src;
+ ip_mforward = X_ip_mforward;
+ ip_mrouter_done = X_ip_mrouter_done;
+ ip_mrouter_get = X_ip_mrouter_get;
+ ip_mrouter_set = X_ip_mrouter_set;
+
+ ip_rsvp_force_done = X_ip_rsvp_force_done;
+ ip_rsvp_vif = X_ip_rsvp_vif;
+
+ legal_vif_num = X_legal_vif_num;
+ mrt_ioctl = X_mrt_ioctl;
+ rsvp_input_p = X_rsvp_input;
+ break;
+
+ case MOD_UNLOAD:
+ /*
+ * Typically module unload happens after the user-level
+ * process has shut down the kernel services (the check
+ * below ensures someone can't just yank the module out
+ * from under a running process). But if the module is
+ * just loaded and then unloaded w/o starting up a user
+ * process we still need to cleanup.
+ */
+ MROUTER_LOCK();
+ if (ip_mrouter_cnt != 0) {
+ MROUTER_UNLOCK();
+ return (EINVAL);
+ }
+ ip_mrouter_unloading = 1;
+ MROUTER_UNLOCK();
+
+ EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);
+
+ if (pim_encap_cookie) {
+ encap_detach(pim_encap_cookie);
+ pim_encap_cookie = NULL;
+ }
+
+ ip_mcast_src = NULL;
+ ip_mforward = NULL;
+ ip_mrouter_done = NULL;
+ ip_mrouter_get = NULL;
+ ip_mrouter_set = NULL;
+
+ ip_rsvp_force_done = NULL;
+ ip_rsvp_vif = NULL;
+
+ legal_vif_num = NULL;
+ mrt_ioctl = NULL;
+ rsvp_input_p = NULL;
+
+ VIF_LOCK_DESTROY();
+ MFC_LOCK_DESTROY();
+ MROUTER_LOCK_DESTROY();
+ break;
+
+ default:
+ return EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static moduledata_t ip_mroutemod = {
+ "ip_mroute",
+ ip_mroute_modevent,
+ 0
+};
+
+DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY);
diff --git a/freebsd/sys/netinet/ip_mroute.h b/freebsd/sys/netinet/ip_mroute.h
new file mode 100644
index 00000000..3bc7f52f
--- /dev/null
+++ b/freebsd/sys/netinet/ip_mroute.h
@@ -0,0 +1,359 @@
+/*-
+ * Copyright (c) 1989 Stephen Deering.
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Stephen Deering of Stanford University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_mroute.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IP_MROUTE_HH_
+#define _NETINET_IP_MROUTE_HH_
+
+/*
+ * Definitions for IP multicast forwarding.
+ *
+ * Written by David Waitzman, BBN Labs, August 1988.
+ * Modified by Steve Deering, Stanford, February 1989.
+ * Modified by Ajit Thyagarajan, PARC, August 1993.
+ * Modified by Ajit Thyagarajan, PARC, August 1994.
+ * Modified by Ahmed Helmy, SGI, June 1996.
+ * Modified by Pavlin Radoslavov, ICSI, October 2002.
+ *
+ * MROUTING Revision: 3.3.1.3
+ * and PIM-SMv2 and PIM-DM support, advanced API support,
+ * bandwidth metering and signaling.
+ */
+
+/*
+ * Multicast Routing set/getsockopt commands.
+ */
+#define MRT_INIT 100 /* initialize forwarder */
+#define MRT_DONE 101 /* shut down forwarder */
+#define MRT_ADD_VIF 102 /* create virtual interface */
+#define MRT_DEL_VIF 103 /* delete virtual interface */
+#define MRT_ADD_MFC 104 /* insert forwarding cache entry */
+#define MRT_DEL_MFC 105 /* delete forwarding cache entry */
+#define MRT_VERSION 106 /* get kernel version number */
+#define MRT_ASSERT 107 /* enable assert processing */
+#define MRT_PIM MRT_ASSERT /* enable PIM processing */
+#define MRT_API_SUPPORT 109 /* supported MRT API */
+#define MRT_API_CONFIG 110 /* config MRT API */
+#define MRT_ADD_BW_UPCALL 111 /* create bandwidth monitor */
+#define MRT_DEL_BW_UPCALL 112 /* delete bandwidth monitor */
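+
+/*
+ * Illustrative sketch (not part of the original header): a routing daemon
+ * such as mrouted or pimd typically issues these commands with setsockopt()
+ * on a raw IGMP socket, e.g. (error handling elided):
+ *
+ *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
+ *	int v = 1;
+ *	setsockopt(s, IPPROTO_IP, MRT_INIT, &v, sizeof(v));
+ *	...	install vifs and MFC entries, run the protocol	...
+ *	setsockopt(s, IPPROTO_IP, MRT_DONE, NULL, 0);
+ */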
+
+/*
+ * Types and macros for handling bitmaps with one bit per virtual interface.
+ */
+#define MAXVIFS 32
+typedef u_long vifbitmap_t;
+typedef u_short vifi_t; /* type of a vif index */
+#define ALL_VIFS (vifi_t)-1
+
+#define VIFM_SET(n, m) ((m) |= (1 << (n)))
+#define VIFM_CLR(n, m) ((m) &= ~(1 << (n)))
+#define VIFM_ISSET(n, m) ((m) & (1 << (n)))
+#define VIFM_CLRALL(m) ((m) = 0x00000000)
+#define VIFM_COPY(mfrom, mto) ((mto) = (mfrom))
+#define VIFM_SAME(m1, m2) ((m1) == (m2))
+
+struct mfc;
+
+/*
+ * Argument structure for MRT_ADD_VIF.
+ * (MRT_DEL_VIF takes a single vifi_t argument.)
+ */
+struct vifctl {
+ vifi_t vifc_vifi; /* the index of the vif to be added */
+ u_char vifc_flags; /* VIFF_ flags defined below */
+ u_char vifc_threshold; /* min ttl required to forward on vif */
+ u_int vifc_rate_limit; /* max rate */
+ struct in_addr vifc_lcl_addr; /* local interface address */
+ struct in_addr vifc_rmt_addr; /* remote address (tunnels only) */
+};
+
+#define VIFF_TUNNEL 0x1 /* no-op; retained for old source */
+#define VIFF_SRCRT 0x2 /* no-op; retained for old source */
+#define VIFF_REGISTER 0x4 /* used for PIM Register encap/decap */
+
+/*
+ * Argument structure for MRT_ADD_MFC and MRT_DEL_MFC
+ * XXX if you change this, make sure to change struct mfcctl2 as well.
+ */
+struct mfcctl {
+ struct in_addr mfcc_origin; /* ip origin of mcasts */
+ struct in_addr mfcc_mcastgrp; /* multicast group associated*/
+ vifi_t mfcc_parent; /* incoming vif */
+ u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
+};
+
+/*
+ * The new argument structure for MRT_ADD_MFC and MRT_DEL_MFC overlays
+ * and extends the old struct mfcctl.
+ */
+struct mfcctl2 {
+ /* the mfcctl fields */
+ struct in_addr mfcc_origin; /* ip origin of mcasts */
+ struct in_addr mfcc_mcastgrp; /* multicast group associated*/
+ vifi_t mfcc_parent; /* incoming vif */
+ u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
+
+ /* extension fields */
+ uint8_t mfcc_flags[MAXVIFS]; /* the MRT_MFC_FLAGS_* flags */
+ struct in_addr mfcc_rp; /* the RP address */
+};
+/*
+ * The advanced-API flags.
+ *
+ * The MRT_MFC_FLAGS_XXX API flags are also used as flags
+ * for the mfcc_flags field.
+ */
+#define MRT_MFC_FLAGS_DISABLE_WRONGVIF (1 << 0) /* disable WRONGVIF signals */
+#define MRT_MFC_FLAGS_BORDER_VIF (1 << 1) /* border vif */
+#define MRT_MFC_RP (1 << 8) /* enable RP address */
+#define MRT_MFC_BW_UPCALL (1 << 9) /* enable bw upcalls */
+#define MRT_MFC_FLAGS_ALL (MRT_MFC_FLAGS_DISABLE_WRONGVIF | \
+ MRT_MFC_FLAGS_BORDER_VIF)
+#define MRT_API_FLAGS_ALL (MRT_MFC_FLAGS_ALL | \
+ MRT_MFC_RP | \
+ MRT_MFC_BW_UPCALL)
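+
+/*
+ * Illustrative sketch (not part of the original header): once the advanced
+ * API has been enabled with MRT_API_CONFIG, a daemon could install a cache
+ * entry using the extended structure; the addresses are placeholders:
+ *
+ *	struct mfcctl2 mc;
+ *	memset(&mc, 0, sizeof(mc));
+ *	mc.mfcc_origin.s_addr = inet_addr("192.0.2.1");
+ *	mc.mfcc_mcastgrp.s_addr = inet_addr("232.1.1.1");
+ *	mc.mfcc_parent = 0;			incoming vif
+ *	mc.mfcc_ttls[1] = 1;			forward on vif 1
+ *	setsockopt(s, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
+ */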
+
+/*
+ * Structure for installing or delivering an upcall if the
+ * measured bandwidth is above or below a threshold.
+ *
+ * User programs (e.g. daemons) may have a need to know when the
+ * bandwidth used by some data flow is above or below some threshold.
+ * This interface allows the userland to specify the threshold (in
+ * bytes and/or packets) and the measurement interval. Flows are
+ * all packet with the same source and destination IP address.
+ * At the moment the code is only used for multicast destinations
+ * but there is nothing that prevents its use for unicast.
+ *
+ * The measurement interval cannot be shorter than some Tmin (currently, 3s).
+ * The threshold is set in packets and/or bytes per_interval.
+ *
+ * Measurement works as follows:
+ *
+ * For >= measurements:
+ * The first packet marks the start of a measurement interval.
+ * During an interval we count packets and bytes, and when we
+ * pass the threshold we deliver an upcall and we are done.
+ * The first packet after the end of the interval resets the
+ * count and restarts the measurement.
+ *
+ * For <= measurements:
+ * We start a timer to fire at the end of the interval, and
+ * then for each incoming packet we count packets and bytes.
+ * When the timer fires, we compare the value with the threshold,
+ * schedule an upcall if we are below, and restart the measurement
+ * (reschedule timer and zero counters).
+ */
+
+struct bw_data {
+ struct timeval b_time;
+ uint64_t b_packets;
+ uint64_t b_bytes;
+};
+
+struct bw_upcall {
+ struct in_addr bu_src; /* source address */
+ struct in_addr bu_dst; /* destination address */
+ uint32_t bu_flags; /* misc flags (see below) */
+#define BW_UPCALL_UNIT_PACKETS (1 << 0) /* threshold (in packets) */
+#define BW_UPCALL_UNIT_BYTES (1 << 1) /* threshold (in bytes) */
+#define BW_UPCALL_GEQ (1 << 2) /* upcall if bw >= threshold */
+#define BW_UPCALL_LEQ (1 << 3) /* upcall if bw <= threshold */
+#define BW_UPCALL_DELETE_ALL (1 << 4) /* delete all upcalls for s,d*/
+ struct bw_data bu_threshold; /* the bw threshold */
+ struct bw_data bu_measured; /* the measured bw */
+};
+
+/* max. number of upcalls to deliver together */
+#define BW_UPCALLS_MAX 128
+/* min. threshold time interval for bandwidth measurement */
+#define BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC 3
+#define BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC 0
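+
+/*
+ * Illustrative sketch (not part of the original header): with the
+ * MRT_MFC_BW_UPCALL feature enabled via MRT_API_CONFIG, a ">=" monitor for
+ * a (source, group) pair could be installed as follows; the addresses and
+ * threshold values are arbitrary examples:
+ *
+ *	struct bw_upcall bu;
+ *	memset(&bu, 0, sizeof(bu));
+ *	bu.bu_src.s_addr = inet_addr("192.0.2.1");
+ *	bu.bu_dst.s_addr = inet_addr("232.1.1.1");
+ *	bu.bu_flags = BW_UPCALL_UNIT_PACKETS | BW_UPCALL_GEQ;
+ *	bu.bu_threshold.b_time.tv_sec = BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC;
+ *	bu.bu_threshold.b_packets = 1000;
+ *	setsockopt(s, IPPROTO_IP, MRT_ADD_BW_UPCALL, &bu, sizeof(bu));
+ */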
+
+/*
+ * The kernel's multicast routing statistics.
+ */
+struct mrtstat {
+ u_long mrts_mfc_lookups; /* # forw. cache hash table hits */
+ u_long mrts_mfc_misses; /* # forw. cache hash table misses */
+ u_long mrts_upcalls; /* # calls to multicast routing daemon */
+ u_long mrts_no_route; /* no route for packet's origin */
+ u_long mrts_bad_tunnel; /* malformed tunnel options */
+ u_long mrts_cant_tunnel; /* no room for tunnel options */
+ u_long mrts_wrong_if; /* arrived on wrong interface */
+ u_long mrts_upq_ovflw; /* upcall Q overflow */
+ u_long mrts_cache_cleanups; /* # entries with no upcalls */
+ u_long mrts_drop_sel; /* pkts dropped selectively */
+ u_long mrts_q_overflow; /* pkts dropped - Q overflow */
+ u_long mrts_pkt2large; /* pkts dropped - size > BKT SIZE */
+ u_long mrts_upq_sockfull; /* upcalls dropped - socket full */
+};
+
+#ifdef _KERNEL
+#define MRTSTAT_ADD(name, val) V_mrtstat.name += (val)
+#define MRTSTAT_INC(name) MRTSTAT_ADD(name, 1)
+#endif
+
+/*
+ * Argument structure used by mrouted to get src-grp pkt counts
+ */
+struct sioc_sg_req {
+ struct in_addr src;
+ struct in_addr grp;
+ u_long pktcnt;
+ u_long bytecnt;
+ u_long wrong_if;
+};
+
+/*
+ * Argument structure used by mrouted to get vif pkt counts
+ */
+struct sioc_vif_req {
+ vifi_t vifi; /* vif number */
+ u_long icount; /* Input packet count on vif */
+ u_long ocount; /* Output packet count on vif */
+ u_long ibytes; /* Input byte count on vif */
+ u_long obytes; /* Output byte count on vif */
+};
+
+
+/*
+ * The kernel's virtual-interface structure.
+ */
+struct vif {
+ u_char v_flags; /* VIFF_ flags defined above */
+ u_char v_threshold; /* min ttl required to forward on vif*/
+ struct in_addr v_lcl_addr; /* local interface address */
+ struct in_addr v_rmt_addr; /* remote address (tunnels only) */
+ struct ifnet *v_ifp; /* pointer to interface */
+ u_long v_pkt_in; /* # pkts in on interface */
+ u_long v_pkt_out; /* # pkts out on interface */
+ u_long v_bytes_in; /* # bytes in on interface */
+ u_long v_bytes_out; /* # bytes out on interface */
+ struct route v_route; /* cached route */
+};
+
+#ifdef _KERNEL
+/*
+ * The kernel's multicast forwarding cache entry structure
+ */
+struct mfc {
+ LIST_ENTRY(mfc) mfc_hash;
+ struct in_addr mfc_origin; /* IP origin of mcasts */
+ struct in_addr mfc_mcastgrp; /* multicast group associated*/
+ vifi_t mfc_parent; /* incoming vif */
+ u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
+ u_long mfc_pkt_cnt; /* pkt count for src-grp */
+ u_long mfc_byte_cnt; /* byte count for src-grp */
+ u_long mfc_wrong_if; /* wrong if for src-grp */
+ int mfc_expire; /* time to clean entry up */
+ struct timeval mfc_last_assert; /* last time I sent an assert*/
+ uint8_t mfc_flags[MAXVIFS]; /* the MRT_MFC_FLAGS_* flags */
+ struct in_addr mfc_rp; /* the RP address */
+ struct bw_meter *mfc_bw_meter; /* list of bandwidth meters */
+ u_long mfc_nstall; /* # of packets awaiting mfc */
+ TAILQ_HEAD(, rtdetq) mfc_stall; /* q of packets awaiting mfc */
+};
+#endif /* _KERNEL */
+
+/*
+ * Struct used to communicate from kernel to multicast router
+ * note the convenient similarity to an IP packet
+ */
+struct igmpmsg {
+ uint32_t unused1;
+ uint32_t unused2;
+ u_char im_msgtype; /* what type of message */
+#define IGMPMSG_NOCACHE 1 /* no MFC in the kernel */
+#define IGMPMSG_WRONGVIF 2 /* packet came from wrong interface */
+#define IGMPMSG_WHOLEPKT 3 /* PIM pkt for user level encap. */
+#define IGMPMSG_BW_UPCALL 4 /* BW monitoring upcall */
+ u_char im_mbz; /* must be zero */
+ u_char im_vif; /* vif rec'd on */
+ u_char unused3;
+ struct in_addr im_src, im_dst;
+};
+
+#ifdef _KERNEL
+/*
+ * Argument structure used for pkt info. while upcall is made
+ */
+struct rtdetq {
+ TAILQ_ENTRY(rtdetq) rte_link;
+ struct mbuf *m; /* A copy of the packet */
+ struct ifnet *ifp; /* Interface pkt came in on */
+ vifi_t xmt_vif; /* Saved copy of imo_multicast_vif */
+};
+#define MAX_UPQ 4 /* max. no of pkts in upcall Q */
+#endif /* _KERNEL */
+
+/*
+ * Structure for measuring the bandwidth and sending an upcall if the
+ * measured bandwidth is above or below a threshold.
+ */
+struct bw_meter {
+ struct bw_meter *bm_mfc_next; /* next bw meter (same mfc) */
+ struct bw_meter *bm_time_next; /* next bw meter (same time) */
+ uint32_t bm_time_hash; /* the time hash value */
+ struct mfc *bm_mfc; /* the corresponding mfc */
+ uint32_t bm_flags; /* misc flags (see below) */
+#define BW_METER_UNIT_PACKETS (1 << 0) /* threshold (in packets) */
+#define BW_METER_UNIT_BYTES (1 << 1) /* threshold (in bytes) */
+#define BW_METER_GEQ (1 << 2) /* upcall if bw >= threshold */
+#define BW_METER_LEQ (1 << 3) /* upcall if bw <= threshold */
+#define BW_METER_USER_FLAGS (BW_METER_UNIT_PACKETS | \
+ BW_METER_UNIT_BYTES | \
+ BW_METER_GEQ | \
+ BW_METER_LEQ)
+
+#define BW_METER_UPCALL_DELIVERED (1 << 24) /* upcall was delivered */
+
+ struct bw_data bm_threshold; /* the upcall threshold */
+ struct bw_data bm_measured; /* the measured bw */
+ struct timeval bm_start_time; /* abs. time */
+};
+
+#ifdef _KERNEL
+
+struct sockopt;
+
+extern int (*ip_mrouter_set)(struct socket *, struct sockopt *);
+extern int (*ip_mrouter_get)(struct socket *, struct sockopt *);
+extern int (*ip_mrouter_done)(void);
+extern int (*mrt_ioctl)(u_long, caddr_t, int);
+
+#endif /* _KERNEL */
+
+#endif /* _NETINET_IP_MROUTE_HH_ */
diff --git a/freebsd/sys/netinet/ip_options.c b/freebsd/sys/netinet/ip_options.c
new file mode 100644
index 00000000..f8b31607
--- /dev/null
+++ b/freebsd/sys/netinet/ip_options.c
@@ -0,0 +1,747 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_ipstealth.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/domain.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/time.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/sys/sysctl.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_types.h>
+#include <freebsd/net/if_var.h>
+#include <freebsd/net/if_dl.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/netisr.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_options.h>
+#include <freebsd/netinet/ip_icmp.h>
+#include <freebsd/machine/in_cksum.h>
+
+#include <freebsd/sys/socketvar.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+static int ip_dosourceroute = 0;
+SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW,
+ &ip_dosourceroute, 0, "Enable forwarding source routed IP packets");
+
+static int ip_acceptsourceroute = 0;
+SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
+ CTLFLAG_RW, &ip_acceptsourceroute, 0,
+ "Enable accepting source routed IP packets");
+
+int ip_doopts = 1; /* 0 = ignore, 1 = process, 2 = reject */
+SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_RW,
+ &ip_doopts, 0, "Enable IP options processing ([LS]SRR, RR, TS)");
+
+static void save_rte(struct mbuf *m, u_char *, struct in_addr);
+
+/*
+ * Do option processing on a datagram, possibly discarding it if bad options
+ * are encountered, or forwarding it if source-routed.
+ *
+ * The pass argument is used when operating in the IPSTEALTH mode to tell
+ * what options to process: [LS]SRR (pass 0) or the others (pass 1). The
+ * reason for as many as two passes is that when doing IPSTEALTH, non-routing
+ * options should be processed only if the packet is for us.
+ *
+ * Returns 1 if packet has been forwarded/freed, 0 if the packet should be
+ * processed further.
+ */
+int
+ip_dooptions(struct mbuf *m, int pass)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ u_char *cp;
+ struct in_ifaddr *ia;
+ int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
+ struct in_addr *sin, dst;
+ uint32_t ntime;
+ struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
+
+ /* Ignore or reject packets with IP options. */
+ if (ip_doopts == 0)
+ return 0;
+ else if (ip_doopts == 2) {
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_FILTER_PROHIB;
+ goto bad;
+ }
+
+ dst = ip->ip_dst;
+ cp = (u_char *)(ip + 1);
+ cnt = (ip->ip_hl << 2) - sizeof (struct ip);
+ for (; cnt > 0; cnt -= optlen, cp += optlen) {
+ opt = cp[IPOPT_OPTVAL];
+ if (opt == IPOPT_EOL)
+ break;
+ if (opt == IPOPT_NOP)
+ optlen = 1;
+ else {
+ if (cnt < IPOPT_OLEN + sizeof(*cp)) {
+ code = &cp[IPOPT_OLEN] - (u_char *)ip;
+ goto bad;
+ }
+ optlen = cp[IPOPT_OLEN];
+ if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
+ code = &cp[IPOPT_OLEN] - (u_char *)ip;
+ goto bad;
+ }
+ }
+ switch (opt) {
+
+ default:
+ break;
+
+ /*
+ * Source routing with record. Find interface with current
+ * destination address. If none on this machine then drop if
+ * strictly routed, or do nothing if loosely routed. Record
+ * interface address and bring up next address component. If
+ * strictly routed make sure next address is on directly
+ * accessible net.
+ */
+ case IPOPT_LSRR:
+ case IPOPT_SSRR:
+#ifdef IPSTEALTH
+ if (V_ipstealth && pass > 0)
+ break;
+#endif
+ if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
+ code = &cp[IPOPT_OLEN] - (u_char *)ip;
+ goto bad;
+ }
+ if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
+ code = &cp[IPOPT_OFFSET] - (u_char *)ip;
+ goto bad;
+ }
+ ipaddr.sin_addr = ip->ip_dst;
+ if (ifa_ifwithaddr_check((struct sockaddr *)&ipaddr)
+ == 0) {
+ if (opt == IPOPT_SSRR) {
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_SRCFAIL;
+ goto bad;
+ }
+ if (!ip_dosourceroute)
+ goto nosourcerouting;
+ /*
+ * Loose routing, and not at next destination
+ * yet; nothing to do except forward.
+ */
+ break;
+ }
+ off--; /* 0 origin */
+ if (off > optlen - (int)sizeof(struct in_addr)) {
+ /*
+ * End of source route. Should be for us.
+ */
+ if (!ip_acceptsourceroute)
+ goto nosourcerouting;
+ save_rte(m, cp, ip->ip_src);
+ break;
+ }
+#ifdef IPSTEALTH
+ if (V_ipstealth)
+ goto dropit;
+#endif
+ if (!ip_dosourceroute) {
+ if (V_ipforwarding) {
+ char buf[16]; /* aaa.bbb.ccc.ddd\0 */
+ /*
+ * Acting as a router, so generate
+ * ICMP
+ */
+nosourcerouting:
+ strcpy(buf, inet_ntoa(ip->ip_dst));
+ log(LOG_WARNING,
+ "attempted source route from %s to %s\n",
+ inet_ntoa(ip->ip_src), buf);
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_SRCFAIL;
+ goto bad;
+ } else {
+ /*
+ * Not acting as a router, so
+ * silently drop.
+ */
+#ifdef IPSTEALTH
+dropit:
+#endif
+ IPSTAT_INC(ips_cantforward);
+ m_freem(m);
+ return (1);
+ }
+ }
+
+ /*
+ * locate outgoing interface
+ */
+ (void)memcpy(&ipaddr.sin_addr, cp + off,
+ sizeof(ipaddr.sin_addr));
+
+ if (opt == IPOPT_SSRR) {
+#define INA struct in_ifaddr *
+#define SA struct sockaddr *
+ if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == NULL)
+ ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0);
+ } else
+/* XXX MRT 0 for routing */
+ ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m));
+ if (ia == NULL) {
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_SRCFAIL;
+ goto bad;
+ }
+ ip->ip_dst = ipaddr.sin_addr;
+ (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
+ sizeof(struct in_addr));
+ ifa_free(&ia->ia_ifa);
+ cp[IPOPT_OFFSET] += sizeof(struct in_addr);
+ /*
+ * Let ip_intr's mcast routing check handle mcast pkts
+ */
+ forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
+ break;
+
+ case IPOPT_RR:
+#ifdef IPSTEALTH
+ if (V_ipstealth && pass == 0)
+ break;
+#endif
+ if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
+ code = &cp[IPOPT_OFFSET] - (u_char *)ip;
+ goto bad;
+ }
+ if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
+ code = &cp[IPOPT_OFFSET] - (u_char *)ip;
+ goto bad;
+ }
+ /*
+ * If no space remains, ignore.
+ */
+ off--; /* 0 origin */
+ if (off > optlen - (int)sizeof(struct in_addr))
+ break;
+ (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst,
+ sizeof(ipaddr.sin_addr));
+ /*
+ * Locate outgoing interface; if we're the
+ * destination, use the incoming interface (should be
+ * same).
+ */
+ if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL &&
+ (ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m))) == NULL) {
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_HOST;
+ goto bad;
+ }
+ (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
+ sizeof(struct in_addr));
+ ifa_free(&ia->ia_ifa);
+ cp[IPOPT_OFFSET] += sizeof(struct in_addr);
+ break;
+
+ case IPOPT_TS:
+#ifdef IPSTEALTH
+ if (V_ipstealth && pass == 0)
+ break;
+#endif
+ code = cp - (u_char *)ip;
+ if (optlen < 4 || optlen > 40) {
+ code = &cp[IPOPT_OLEN] - (u_char *)ip;
+ goto bad;
+ }
+ if ((off = cp[IPOPT_OFFSET]) < 5) {
+ code = &cp[IPOPT_OLEN] - (u_char *)ip;
+ goto bad;
+ }
+ if (off > optlen - (int)sizeof(int32_t)) {
+ cp[IPOPT_OFFSET + 1] += (1 << 4);
+ if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) {
+ code = &cp[IPOPT_OFFSET] - (u_char *)ip;
+ goto bad;
+ }
+ break;
+ }
+ off--; /* 0 origin */
+ sin = (struct in_addr *)(cp + off);
+ switch (cp[IPOPT_OFFSET + 1] & 0x0f) {
+
+ case IPOPT_TS_TSONLY:
+ break;
+
+ case IPOPT_TS_TSANDADDR:
+ if (off + sizeof(uint32_t) +
+ sizeof(struct in_addr) > optlen) {
+ code = &cp[IPOPT_OFFSET] - (u_char *)ip;
+ goto bad;
+ }
+ ipaddr.sin_addr = dst;
+ ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
+ m->m_pkthdr.rcvif);
+ if (ia == NULL)
+ continue;
+ (void)memcpy(sin, &IA_SIN(ia)->sin_addr,
+ sizeof(struct in_addr));
+ ifa_free(&ia->ia_ifa);
+ cp[IPOPT_OFFSET] += sizeof(struct in_addr);
+ off += sizeof(struct in_addr);
+ break;
+
+ case IPOPT_TS_PRESPEC:
+ if (off + sizeof(uint32_t) +
+ sizeof(struct in_addr) > optlen) {
+ code = &cp[IPOPT_OFFSET] - (u_char *)ip;
+ goto bad;
+ }
+ (void)memcpy(&ipaddr.sin_addr, sin,
+ sizeof(struct in_addr));
+ if (ifa_ifwithaddr_check((SA)&ipaddr) == 0)
+ continue;
+ cp[IPOPT_OFFSET] += sizeof(struct in_addr);
+ off += sizeof(struct in_addr);
+ break;
+
+ default:
+ code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip;
+ goto bad;
+ }
+ ntime = iptime();
+ (void)memcpy(cp + off, &ntime, sizeof(uint32_t));
+ cp[IPOPT_OFFSET] += sizeof(uint32_t);
+ }
+ }
+ if (forward && V_ipforwarding) {
+ ip_forward(m, 1);
+ return (1);
+ }
+ return (0);
+bad:
+ icmp_error(m, type, code, 0, 0);
+ IPSTAT_INC(ips_badoptions);
+ return (1);
+}
+
+/*
+ * Save incoming source route for use in replies, to be picked up later by
+ * ip_srcroute if the receiver is interested.
+ */
+static void
+save_rte(struct mbuf *m, u_char *option, struct in_addr dst)
+{
+ unsigned olen;
+ struct ipopt_tag *opts;
+
+ opts = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS,
+ sizeof(struct ipopt_tag), M_NOWAIT);
+ if (opts == NULL)
+ return;
+
+ olen = option[IPOPT_OLEN];
+ if (olen > sizeof(opts->ip_srcrt) - (1 + sizeof(dst))) {
+ m_tag_free((struct m_tag *)opts);
+ return;
+ }
+ bcopy(option, opts->ip_srcrt.srcopt, olen);
+ opts->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
+ opts->ip_srcrt.dst = dst;
+ m_tag_prepend(m, (struct m_tag *)opts);
+}
+
+/*
+ * Retrieve incoming source route for use in replies, in the same form used
+ * by setsockopt. The first hop is placed before the options and will be
+ * removed later.
+ */
+struct mbuf *
+ip_srcroute(struct mbuf *m0)
+{
+ struct in_addr *p, *q;
+ struct mbuf *m;
+ struct ipopt_tag *opts;
+
+ opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL);
+ if (opts == NULL)
+ return (NULL);
+
+ if (opts->ip_nhops == 0)
+ return (NULL);
+ m = m_get(M_DONTWAIT, MT_DATA);
+ if (m == NULL)
+ return (NULL);
+
+#define OPTSIZ (sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt))
+
+ /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
+ m->m_len = opts->ip_nhops * sizeof(struct in_addr) +
+ sizeof(struct in_addr) + OPTSIZ;
+
+ /*
+ * First, save first hop for return route.
+ */
+ p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]);
+ *(mtod(m, struct in_addr *)) = *p--;
+
+ /*
+ * Copy option fields and padding (nop) to mbuf.
+ */
+ opts->ip_srcrt.nop = IPOPT_NOP;
+ opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
+ (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr),
+ &(opts->ip_srcrt.nop), OPTSIZ);
+ q = (struct in_addr *)(mtod(m, caddr_t) +
+ sizeof(struct in_addr) + OPTSIZ);
+#undef OPTSIZ
+ /*
+ * Record return path as an IP source route, reversing the path
+ * (pointers are now aligned).
+ */
+ while (p >= opts->ip_srcrt.route) {
+ *q++ = *p--;
+ }
+ /*
+ * Last hop goes to final destination.
+ */
+ *q = opts->ip_srcrt.dst;
+ m_tag_delete(m0, (struct m_tag *)opts);
+ return (m);
+}
+
+/*
+ * Strip out IP options, at higher level protocol in the kernel. Second
+ * argument is buffer to which options will be moved, and return value is
+ * their length.
+ *
+ * XXX should be deleted; last arg currently ignored.
+ */
+void
+ip_stripoptions(struct mbuf *m, struct mbuf *mopt)
+{
+ int i;
+ struct ip *ip = mtod(m, struct ip *);
+ caddr_t opts;
+ int olen;
+
+ olen = (ip->ip_hl << 2) - sizeof (struct ip);
+ opts = (caddr_t)(ip + 1);
+ i = m->m_len - (sizeof (struct ip) + olen);
+ bcopy(opts + olen, opts, (unsigned)i);
+ m->m_len -= olen;
+ if (m->m_flags & M_PKTHDR)
+ m->m_pkthdr.len -= olen;
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = sizeof(struct ip) >> 2;
+}
+
+/*
+ * Insert IP options into preformed packet. Adjust IP destination as
+ * required for IP source routing, as indicated by a non-zero in_addr at the
+ * start of the options.
+ *
+ * XXX This routine assumes that the packet has no options in place.
+ */
+struct mbuf *
+ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
+{
+ struct ipoption *p = mtod(opt, struct ipoption *);
+ struct mbuf *n;
+ struct ip *ip = mtod(m, struct ip *);
+ unsigned optlen;
+
+ optlen = opt->m_len - sizeof(p->ipopt_dst);
+ if (optlen + ip->ip_len > IP_MAXPACKET) {
+ *phlen = 0;
+ return (m); /* XXX should fail */
+ }
+ if (p->ipopt_dst.s_addr)
+ ip->ip_dst = p->ipopt_dst;
+ if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
+ MGETHDR(n, M_DONTWAIT, MT_DATA);
+ if (n == NULL) {
+ *phlen = 0;
+ return (m);
+ }
+ M_MOVE_PKTHDR(n, m);
+ n->m_pkthdr.rcvif = NULL;
+ n->m_pkthdr.len += optlen;
+ m->m_len -= sizeof(struct ip);
+ m->m_data += sizeof(struct ip);
+ n->m_next = m;
+ m = n;
+ m->m_len = optlen + sizeof(struct ip);
+ m->m_data += max_linkhdr;
+ bcopy(ip, mtod(m, void *), sizeof(struct ip));
+ } else {
+ m->m_data -= optlen;
+ m->m_len += optlen;
+ m->m_pkthdr.len += optlen;
+ bcopy(ip, mtod(m, void *), sizeof(struct ip));
+ }
+ ip = mtod(m, struct ip *);
+ bcopy(p->ipopt_list, ip + 1, optlen);
+ *phlen = sizeof(struct ip) + optlen;
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = *phlen >> 2;
+ ip->ip_len += optlen;
+ return (m);
+}
+
+/*
+ * Copy options from ip to jp, omitting those not copied during
+ * fragmentation.
+ */
+int
+ip_optcopy(struct ip *ip, struct ip *jp)
+{
+ u_char *cp, *dp;
+ int opt, optlen, cnt;
+
+ cp = (u_char *)(ip + 1);
+ dp = (u_char *)(jp + 1);
+ cnt = (ip->ip_hl << 2) - sizeof (struct ip);
+ for (; cnt > 0; cnt -= optlen, cp += optlen) {
+ opt = cp[0];
+ if (opt == IPOPT_EOL)
+ break;
+ if (opt == IPOPT_NOP) {
+ /* Preserve for IP mcast tunnel's LSRR alignment. */
+ *dp++ = IPOPT_NOP;
+ optlen = 1;
+ continue;
+ }
+
+ KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp),
+ ("ip_optcopy: malformed ipv4 option"));
+ optlen = cp[IPOPT_OLEN];
+ KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt,
+ ("ip_optcopy: malformed ipv4 option"));
+
+ /* Bogus lengths should have been caught by ip_dooptions. */
+ if (optlen > cnt)
+ optlen = cnt;
+ if (IPOPT_COPIED(opt)) {
+ bcopy(cp, dp, optlen);
+ dp += optlen;
+ }
+ }
+ for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
+ *dp++ = IPOPT_EOL;
+ return (optlen);
+}
+
+/*
+ * Set up IP options in pcb for insertion in output packets. Store in mbuf
+ * with pointer in pcbopt, adding pseudo-option with destination address if
+ * source routed.
+ */
+int
+ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m)
+{
+ int cnt, optlen;
+ u_char *cp;
+ struct mbuf **pcbopt;
+ u_char opt;
+
+ INP_WLOCK_ASSERT(inp);
+
+ pcbopt = &inp->inp_options;
+
+ /* turn off any old options */
+ if (*pcbopt)
+ (void)m_free(*pcbopt);
+ *pcbopt = 0;
+ if (m == NULL || m->m_len == 0) {
+ /*
+ * Only turning off any previous options.
+ */
+ if (m != NULL)
+ (void)m_free(m);
+ return (0);
+ }
+
+ if (m->m_len % sizeof(int32_t))
+ goto bad;
+ /*
+ * IP first-hop destination address will be stored before actual
+ * options; move other options back and clear it when none present.
+ */
+ if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
+ goto bad;
+ cnt = m->m_len;
+ m->m_len += sizeof(struct in_addr);
+ cp = mtod(m, u_char *) + sizeof(struct in_addr);
+ bcopy(mtod(m, void *), cp, (unsigned)cnt);
+ bzero(mtod(m, void *), sizeof(struct in_addr));
+
+ for (; cnt > 0; cnt -= optlen, cp += optlen) {
+ opt = cp[IPOPT_OPTVAL];
+ if (opt == IPOPT_EOL)
+ break;
+ if (opt == IPOPT_NOP)
+ optlen = 1;
+ else {
+ if (cnt < IPOPT_OLEN + sizeof(*cp))
+ goto bad;
+ optlen = cp[IPOPT_OLEN];
+ if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
+ goto bad;
+ }
+ switch (opt) {
+
+ default:
+ break;
+
+ case IPOPT_LSRR:
+ case IPOPT_SSRR:
+ /*
+ * User process specifies route as:
+ *
+ * ->A->B->C->D
+ *
+ * D must be our final destination (but we can't
+ * check that since we may not have connected yet).
+ * A is first hop destination, which doesn't appear
+ * in actual IP option, but is stored before the
+ * options.
+ */
+ /* XXX-BZ PRIV_NETINET_SETHDROPTS? */
+ if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
+ goto bad;
+ m->m_len -= sizeof(struct in_addr);
+ cnt -= sizeof(struct in_addr);
+ optlen -= sizeof(struct in_addr);
+ cp[IPOPT_OLEN] = optlen;
+ /*
+ * Move first hop before start of options.
+ */
+ bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
+ sizeof(struct in_addr));
+ /*
+ * Then copy rest of options back
+ * to close up the deleted entry.
+ */
+ bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)),
+ &cp[IPOPT_OFFSET+1],
+ (unsigned)cnt - (IPOPT_MINOFF - 1));
+ break;
+ }
+ }
+ if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
+ goto bad;
+ *pcbopt = m;
+ return (0);
+
+bad:
+ (void)m_free(m);
+ return (EINVAL);
+}
+
+/*
+ * Check for the presence of the IP Router Alert option [RFC2113]
+ * in the header of an IPv4 datagram.
+ *
+ * This call is not intended for use from the forwarding path; it is here
+ * so that protocol domains may check for the presence of the option.
+ * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert
+ * option does not have much relevance to the implementation, though this
+ * may change in future.
+ * Router alert options SHOULD be passed if running in IPSTEALTH mode and
+ * we are not the endpoint.
+ * Length checks on individual options should already have been performed
+ * by ip_dooptions(), therefore they are folded under INVARIANTS here.
+ *
+ * Return zero if not present or options are invalid, non-zero if present.
+ */
+int
+ip_checkrouteralert(struct mbuf *m)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ u_char *cp;
+ int opt, optlen, cnt, found_ra;
+
+ found_ra = 0;
+ cp = (u_char *)(ip + 1);
+ cnt = (ip->ip_hl << 2) - sizeof (struct ip);
+ for (; cnt > 0; cnt -= optlen, cp += optlen) {
+ opt = cp[IPOPT_OPTVAL];
+ if (opt == IPOPT_EOL)
+ break;
+ if (opt == IPOPT_NOP)
+ optlen = 1;
+ else {
+#ifdef INVARIANTS
+ if (cnt < IPOPT_OLEN + sizeof(*cp))
+ break;
+#endif
+ optlen = cp[IPOPT_OLEN];
+#ifdef INVARIANTS
+ if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
+ break;
+#endif
+ }
+ switch (opt) {
+ case IPOPT_RA:
+#ifdef INVARIANTS
+ if (optlen != IPOPT_OFFSET + sizeof(uint16_t) ||
+ (*((uint16_t *)&cp[IPOPT_OFFSET]) != 0))
+ break;
+ else
+#endif
+ found_ra = 1;
+ break;
+ default:
+ break;
+ }
+ }
+
+ return (found_ra);
+}
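+
+/*
+ * For illustration: on the wire the Router Alert option tested above is the
+ * four octets 0x94 0x04 0x00 0x00 (type IPOPT_RA, length 4, 16-bit value 0,
+ * "router shall examine packet", RFC 2113).
+ */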
diff --git a/freebsd/sys/netinet/ip_options.h b/freebsd/sys/netinet/ip_options.h
new file mode 100644
index 00000000..9c08004d
--- /dev/null
+++ b/freebsd/sys/netinet/ip_options.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IP_OPTIONS_HH_
+#define _NETINET_IP_OPTIONS_HH_
+
+struct ipoptrt {
+ struct in_addr dst; /* final destination */
+ char nop; /* one NOP to align */
+ char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */
+ struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)];
+};
+
+struct ipopt_tag {
+ struct m_tag tag; /* m_tag */
+ int ip_nhops;
+ struct ipoptrt ip_srcrt;
+};
+
+extern int ip_doopts; /* process or ignore IP options */
+
+int ip_checkrouteralert(struct mbuf *);
+int ip_dooptions(struct mbuf *, int);
+struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
+int ip_optcopy(struct ip *, struct ip *);
+int ip_pcbopts(struct inpcb *, int, struct mbuf *);
+void ip_stripoptions(struct mbuf *, struct mbuf *);
+struct mbuf *ip_srcroute(struct mbuf *);
+
+#endif /* !_NETINET_IP_OPTIONS_HH_ */
diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c
new file mode 100644
index 00000000..51132333
--- /dev/null
+++ b/freebsd/sys/netinet/ip_output.c
@@ -0,0 +1,1284 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_ipfw.h>
+#include <freebsd/local/opt_ipsec.h>
+#include <freebsd/local/opt_route.h>
+#include <freebsd/local/opt_mbuf_stress_test.h>
+#include <freebsd/local/opt_mpath.h>
+#include <freebsd/local/opt_sctp.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/ucred.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_llatbl.h>
+#include <freebsd/net/netisr.h>
+#include <freebsd/net/pfil.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/flowtable.h>
+#ifdef RADIX_MPATH
+#include <freebsd/net/radix_mpath.h>
+#endif
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_options.h>
+#ifdef SCTP
+#include <freebsd/netinet/sctp.h>
+#include <freebsd/netinet/sctp_crc32.h>
+#endif
+
+#ifdef IPSEC
+#include <freebsd/netinet/ip_ipsec.h>
+#include <freebsd/netipsec/ipsec.h>
+#endif /* IPSEC*/
+
+#include <freebsd/machine/in_cksum.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\
+ x, (ntohl(a.s_addr)>>24)&0xFF,\
+ (ntohl(a.s_addr)>>16)&0xFF,\
+ (ntohl(a.s_addr)>>8)&0xFF,\
+ (ntohl(a.s_addr))&0xFF, y);
+
+VNET_DEFINE(u_short, ip_id);
+
+#ifdef MBUF_STRESS_TEST
+int mbuf_frag_size = 0;
+SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
+ &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
+#endif
+
+static void ip_mloopback
+ (struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
+
+
+extern int in_mcast_loop;
+extern struct protosw inetsw[];
+
+/*
+ * IP output. The packet in mbuf chain m contains a skeletal IP
+ * header (with len, off, ttl, proto, tos, src, dst).
+ * The mbuf chain containing the packet will be freed.
+ * The mbuf opt, if present, will not be freed.
+ * In the IP forwarding case, the packet will arrive with options already
+ * inserted, so must have a NULL opt pointer.
+ */
+int
+ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
+ struct ip_moptions *imo, struct inpcb *inp)
+{
+ struct ip *ip;
+ struct ifnet *ifp = NULL; /* keep compiler happy */
+ struct mbuf *m0;
+ int hlen = sizeof (struct ip);
+ int mtu;
+ int len, error = 0;
+ int nortfree = 0;
+ struct sockaddr_in *dst = NULL; /* keep compiler happy */
+ struct in_ifaddr *ia = NULL;
+ int isbroadcast, sw_csum;
+ struct route iproute;
+ struct rtentry *rte; /* cache for ro->ro_rt */
+ struct in_addr odst;
+#ifdef IPFIREWALL_FORWARD
+ struct m_tag *fwd_tag = NULL;
+#endif
+#ifdef IPSEC
+ int no_route_but_check_spd = 0;
+#endif
+ M_ASSERTPKTHDR(m);
+
+ if (inp != NULL) {
+ INP_LOCK_ASSERT(inp);
+ M_SETFIB(m, inp->inp_inc.inc_fibnum);
+ if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
+ m->m_pkthdr.flowid = inp->inp_flowid;
+ m->m_flags |= M_FLOWID;
+ }
+ }
+
+ if (ro == NULL) {
+ ro = &iproute;
+ bzero(ro, sizeof (*ro));
+
+#ifdef FLOWTABLE
+ {
+ struct flentry *fle;
+
+ /*
+ * The flow table returns route entries valid for up to 30
+ * seconds; we rely on the remainder of ip_output() taking no
+ * longer than that long for the stability of ro_rt. The
+ * flow ID assignment must have happened before this point.
+ */
+ if ((fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET)) != NULL) {
+ flow_to_route(fle, ro);
+ nortfree = 1;
+ }
+ }
+#endif
+ }
+
+ if (opt) {
+ len = 0;
+ m = ip_insertoptions(m, opt, &len);
+ if (len != 0)
+ hlen = len;
+ }
+ ip = mtod(m, struct ip *);
+
+ /*
+ * Fill in IP header. If we are not allowing fragmentation,
+ * then the ip_id field is meaningless, but we don't set it
+ * to zero. Doing so causes various problems when devices along
+ * the path (routers, load balancers, firewalls, etc.) illegally
+ * disable DF on our packet. Note that a 16-bit counter
+ * will wrap around in less than 10 seconds at 100 Mbit/s on a
+ * medium with MTU 1500. See Steven M. Bellovin, "A Technique
+ * for Counting NATted Hosts", Proc. IMW'02, available at
+ * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
+ */
+ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = hlen >> 2;
+ ip->ip_id = ip_newid();
+ IPSTAT_INC(ips_localout);
+ } else {
+ hlen = ip->ip_hl << 2;
+ }
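+	/*
+	 * Worked figures for the wrap-around noted above (illustration only):
+	 * at 100 Mbit/s a stream of 1500 byte packets is about
+	 * 100e6 / (1500 * 8) ~= 8333 packets/s, so the 16-bit counter wraps
+	 * in roughly 65536 / 8333 ~= 7.9 seconds.
+	 */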
+
+ dst = (struct sockaddr_in *)&ro->ro_dst;
+again:
+ /*
+ * If there is a cached route,
+ * check that it is to the same destination
+ * and is still up. If not, free it and try again.
+ * The address family should also be checked in case of sharing the
+ * cache with IPv6.
+ */
+ rte = ro->ro_rt;
+ if (rte && ((rte->rt_flags & RTF_UP) == 0 ||
+ rte->rt_ifp == NULL ||
+ !RT_LINK_IS_UP(rte->rt_ifp) ||
+ dst->sin_family != AF_INET ||
+ dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
+ if (!nortfree)
+ RTFREE(rte);
+ rte = ro->ro_rt = (struct rtentry *)NULL;
+ ro->ro_lle = (struct llentry *)NULL;
+ }
+#ifdef IPFIREWALL_FORWARD
+ if (rte == NULL && fwd_tag == NULL) {
+#else
+ if (rte == NULL) {
+#endif
+ bzero(dst, sizeof(*dst));
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = ip->ip_dst;
+ }
+ /*
+ * If routing to interface only, short circuit routing lookup.
+ * The use of an all-ones broadcast address implies this; an
+ * interface is specified by the broadcast address of an interface,
+ * or the destination address of a ptp interface.
+ */
+ if (flags & IP_SENDONES) {
+ if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
+ (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
+ IPSTAT_INC(ips_noroute);
+ error = ENETUNREACH;
+ goto bad;
+ }
+ ip->ip_dst.s_addr = INADDR_BROADCAST;
+ dst->sin_addr = ip->ip_dst;
+ ifp = ia->ia_ifp;
+ ip->ip_ttl = 1;
+ isbroadcast = 1;
+ } else if (flags & IP_ROUTETOIF) {
+ if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
+ (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0))) == NULL) {
+ IPSTAT_INC(ips_noroute);
+ error = ENETUNREACH;
+ goto bad;
+ }
+ ifp = ia->ia_ifp;
+ ip->ip_ttl = 1;
+ isbroadcast = in_broadcast(dst->sin_addr, ifp);
+ } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
+ imo != NULL && imo->imo_multicast_ifp != NULL) {
+ /*
+ * Bypass the normal routing lookup for multicast
+ * packets if the interface is specified.
+ */
+ ifp = imo->imo_multicast_ifp;
+ IFP_TO_IA(ifp, ia);
+ isbroadcast = 0; /* fool gcc */
+ } else {
+ /*
+ * We want to do any cloning requested by the link layer,
+ * as this is probably required in all cases for correct
+ * operation (as it is for ARP).
+ */
+ if (rte == NULL) {
+#ifdef RADIX_MPATH
+ rtalloc_mpath_fib(ro,
+ ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
+ inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+#else
+ in_rtalloc_ign(ro, 0,
+ inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+#endif
+ rte = ro->ro_rt;
+ }
+ if (rte == NULL ||
+ rte->rt_ifp == NULL ||
+ !RT_LINK_IS_UP(rte->rt_ifp)) {
+#ifdef IPSEC
+ /*
+ * There is no route for this packet, but it is
+ * possible that a matching SPD entry exists.
+ */
+ no_route_but_check_spd = 1;
+ mtu = 0; /* Silence GCC warning. */
+ goto sendit;
+#endif
+ IPSTAT_INC(ips_noroute);
+ error = EHOSTUNREACH;
+ goto bad;
+ }
+ ia = ifatoia(rte->rt_ifa);
+ ifa_ref(&ia->ia_ifa);
+ ifp = rte->rt_ifp;
+ rte->rt_rmx.rmx_pksent++;
+ if (rte->rt_flags & RTF_GATEWAY)
+ dst = (struct sockaddr_in *)rte->rt_gateway;
+ if (rte->rt_flags & RTF_HOST)
+ isbroadcast = (rte->rt_flags & RTF_BROADCAST);
+ else
+ isbroadcast = in_broadcast(dst->sin_addr, ifp);
+ }
+ /*
+ * Calculate MTU. If we have a route that is up, use that,
+ * otherwise use the interface's MTU.
+ */
+ if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) {
+ /*
+ * This case can happen if the user changed the MTU
+ * of an interface after enabling IP on it. Because
+ * most netifs don't keep track of routes pointing to
+ * them, there is no way for one to update all its
+ * routes when the MTU is changed.
+ */
+ if (rte->rt_rmx.rmx_mtu > ifp->if_mtu)
+ rte->rt_rmx.rmx_mtu = ifp->if_mtu;
+ mtu = rte->rt_rmx.rmx_mtu;
+ } else {
+ mtu = ifp->if_mtu;
+ }
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
+ m->m_flags |= M_MCAST;
+ /*
+ * IP destination address is multicast. Make sure "dst"
+ * still points to the address in "ro". (It may have been
+ * changed to point to a gateway address, above.)
+ */
+ dst = (struct sockaddr_in *)&ro->ro_dst;
+ /*
+ * See if the caller provided any multicast options
+ */
+ if (imo != NULL) {
+ ip->ip_ttl = imo->imo_multicast_ttl;
+ if (imo->imo_multicast_vif != -1)
+ ip->ip_src.s_addr =
+ ip_mcast_src ?
+ ip_mcast_src(imo->imo_multicast_vif) :
+ INADDR_ANY;
+ } else
+ ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
+ /*
+ * Confirm that the outgoing interface supports multicast.
+ */
+ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
+ if ((ifp->if_flags & IFF_MULTICAST) == 0) {
+ IPSTAT_INC(ips_noroute);
+ error = ENETUNREACH;
+ goto bad;
+ }
+ }
+ /*
+ * If source address not specified yet, use address
+ * of outgoing interface.
+ */
+ if (ip->ip_src.s_addr == INADDR_ANY) {
+ /* Interface may have no addresses. */
+ if (ia != NULL)
+ ip->ip_src = IA_SIN(ia)->sin_addr;
+ }
+
+ if ((imo == NULL && in_mcast_loop) ||
+ (imo && imo->imo_multicast_loop)) {
+ /*
+ * Loop back multicast datagram if not expressly
+ * forbidden to do so, even if we are not a member
+ * of the group; ip_input() will filter it later,
+ * thus deferring a hash lookup and mutex acquisition
+ * at the expense of a cheap copy using m_copym().
+ */
+ ip_mloopback(ifp, m, dst, hlen);
+ } else {
+ /*
+ * If we are acting as a multicast router, perform
+ * multicast forwarding as if the packet had just
+ * arrived on the interface to which we are about
+ * to send. The multicast forwarding function
+ * recursively calls this function, using the
+ * IP_FORWARDING flag to prevent infinite recursion.
+ *
+ * Multicasts that are looped back by ip_mloopback(),
+ * above, will be forwarded by the ip_input() routine,
+ * if necessary.
+ */
+ if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
+ /*
+ * If rsvp daemon is not running, do not
+ * set ip_moptions. This ensures that the packet
+ * is multicast and not just sent down one link
+ * as prescribed by rsvpd.
+ */
+ if (!V_rsvp_on)
+ imo = NULL;
+ if (ip_mforward &&
+ ip_mforward(ip, ifp, m, imo) != 0) {
+ m_freem(m);
+ goto done;
+ }
+ }
+ }
+
+ /*
+ * Multicasts with a time-to-live of zero may be looped-
+ * back, above, but must not be transmitted on a network.
+ * Also, multicasts addressed to the loopback interface
+ * are not sent -- the above call to ip_mloopback() will
+ * loop back a copy. ip_input() will drop the copy if
+ * this host does not belong to the destination group on
+ * the loopback interface.
+ */
+ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
+ m_freem(m);
+ goto done;
+ }
+
+ goto sendit;
+ }
+
+ /*
+ * If the source address is not specified yet, use the address
+	 * of the outgoing interface.
+ */
+ if (ip->ip_src.s_addr == INADDR_ANY) {
+ /* Interface may have no addresses. */
+ if (ia != NULL) {
+ ip->ip_src = IA_SIN(ia)->sin_addr;
+ }
+ }
+
+ /*
+ * Verify that we have any chance at all of being able to queue the
+ * packet or packet fragments, unless ALTQ is enabled on the given
+ * interface in which case packetdrop should be done by queueing.
+ */
+#ifdef ALTQ
+ if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
+ ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
+ ifp->if_snd.ifq_maxlen))
+#else
+ if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
+ ifp->if_snd.ifq_maxlen)
+#endif /* ALTQ */
+ {
+ error = ENOBUFS;
+ IPSTAT_INC(ips_odropped);
+ ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
+ goto bad;
+ }
+
+ /*
+ * Look for broadcast address and
+ * verify user is allowed to send
+ * such a packet.
+ */
+ if (isbroadcast) {
+ if ((ifp->if_flags & IFF_BROADCAST) == 0) {
+ error = EADDRNOTAVAIL;
+ goto bad;
+ }
+ if ((flags & IP_ALLOWBROADCAST) == 0) {
+ error = EACCES;
+ goto bad;
+ }
+ /* don't allow broadcast messages to be fragmented */
+ if (ip->ip_len > mtu) {
+ error = EMSGSIZE;
+ goto bad;
+ }
+ m->m_flags |= M_BCAST;
+ } else {
+ m->m_flags &= ~M_BCAST;
+ }
+
+sendit:
+#ifdef IPSEC
+ switch(ip_ipsec_output(&m, inp, &flags, &error, &ifp)) {
+ case 1:
+ goto bad;
+ case -1:
+ goto done;
+ case 0:
+ default:
+ break; /* Continue with packet processing. */
+ }
+ /*
+ * Check if there was a route for this packet; return error if not.
+ */
+ if (no_route_but_check_spd) {
+ IPSTAT_INC(ips_noroute);
+ error = EHOSTUNREACH;
+ goto bad;
+ }
+ /* Update variables that are affected by ipsec4_output(). */
+ ip = mtod(m, struct ip *);
+ hlen = ip->ip_hl << 2;
+#endif /* IPSEC */
+
+ /* Jump over all PFIL processing if hooks are not active. */
+ if (!PFIL_HOOKED(&V_inet_pfil_hook))
+ goto passout;
+
+ /* Run through list of hooks for output packets. */
+ odst.s_addr = ip->ip_dst.s_addr;
+ error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
+ if (error != 0 || m == NULL)
+ goto done;
+
+ ip = mtod(m, struct ip *);
+
+ /* See if destination IP address was changed by packet filter. */
+ if (odst.s_addr != ip->ip_dst.s_addr) {
+ m->m_flags |= M_SKIP_FIREWALL;
+ /* If destination is now ourself drop to ip_input(). */
+ if (in_localip(ip->ip_dst)) {
+ m->m_flags |= M_FASTFWD_OURS;
+ if (m->m_pkthdr.rcvif == NULL)
+ m->m_pkthdr.rcvif = V_loif;
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ m->m_pkthdr.csum_flags |=
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ m->m_pkthdr.csum_data = 0xffff;
+ }
+ m->m_pkthdr.csum_flags |=
+ CSUM_IP_CHECKED | CSUM_IP_VALID;
+#ifdef SCTP
+ if (m->m_pkthdr.csum_flags & CSUM_SCTP)
+ m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+#endif
+ error = netisr_queue(NETISR_IP, m);
+ goto done;
+ } else
+ goto again; /* Redo the routing table lookup. */
+ }
+
+#ifdef IPFIREWALL_FORWARD
+ /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
+ if (m->m_flags & M_FASTFWD_OURS) {
+ if (m->m_pkthdr.rcvif == NULL)
+ m->m_pkthdr.rcvif = V_loif;
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ m->m_pkthdr.csum_flags |=
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ m->m_pkthdr.csum_data = 0xffff;
+ }
+#ifdef SCTP
+ if (m->m_pkthdr.csum_flags & CSUM_SCTP)
+ m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+#endif
+ m->m_pkthdr.csum_flags |=
+ CSUM_IP_CHECKED | CSUM_IP_VALID;
+
+ error = netisr_queue(NETISR_IP, m);
+ goto done;
+ }
+ /* Or forward to some other address? */
+ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
+ if (fwd_tag) {
+ dst = (struct sockaddr_in *)&ro->ro_dst;
+ bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
+ m->m_flags |= M_SKIP_FIREWALL;
+ m_tag_delete(m, fwd_tag);
+ goto again;
+ }
+#endif /* IPFIREWALL_FORWARD */
+
+passout:
+ /* 127/8 must not appear on wire - RFC1122. */
+ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
+ (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
+ if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
+ IPSTAT_INC(ips_badaddr);
+ error = EADDRNOTAVAIL;
+ goto bad;
+ }
+ }
+
+ m->m_pkthdr.csum_flags |= CSUM_IP;
+ sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
+ if (sw_csum & CSUM_DELAY_DATA) {
+ in_delayed_cksum(m);
+ sw_csum &= ~CSUM_DELAY_DATA;
+ }
+#ifdef SCTP
+ if (sw_csum & CSUM_SCTP) {
+ sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
+ sw_csum &= ~CSUM_SCTP;
+ }
+#endif
+ m->m_pkthdr.csum_flags &= ifp->if_hwassist;
+
+ /*
+ * If small enough for interface, or the interface will take
+ * care of the fragmentation for us, we can just send directly.
+ */
+ if (ip->ip_len <= mtu ||
+ (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
+ ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+ ip->ip_sum = 0;
+ if (sw_csum & CSUM_DELAY_IP)
+ ip->ip_sum = in_cksum(m, hlen);
+
+ /*
+ * Record statistics for this interface address.
+ * With CSUM_TSO the byte/packet count will be slightly
+ * incorrect because we count the IP+TCP headers only
+ * once instead of for every generated packet.
+ */
+ if (!(flags & IP_FORWARDING) && ia) {
+ if (m->m_pkthdr.csum_flags & CSUM_TSO)
+ ia->ia_ifa.if_opackets +=
+ m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
+ else
+ ia->ia_ifa.if_opackets++;
+ ia->ia_ifa.if_obytes += m->m_pkthdr.len;
+ }
+#ifdef MBUF_STRESS_TEST
+ if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
+ m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
+#endif
+ /*
+ * Reset layer specific mbuf flags
+ * to avoid confusing lower layers.
+ */
+ m->m_flags &= ~(M_PROTOFLAGS);
+ error = (*ifp->if_output)(ifp, m,
+ (struct sockaddr *)dst, ro);
+ goto done;
+ }
+
+ /* Balk when DF bit is set or the interface didn't support TSO. */
+ if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
+ error = EMSGSIZE;
+ IPSTAT_INC(ips_cantfrag);
+ goto bad;
+ }
+
+ /*
+ * Too large for interface; fragment if possible. If successful,
+ * on return, m will point to a list of packets to be sent.
+ */
+ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum);
+ if (error)
+ goto bad;
+ for (; m; m = m0) {
+ m0 = m->m_nextpkt;
+ m->m_nextpkt = 0;
+ if (error == 0) {
+ /* Record statistics for this interface address. */
+ if (ia != NULL) {
+ ia->ia_ifa.if_opackets++;
+ ia->ia_ifa.if_obytes += m->m_pkthdr.len;
+ }
+ /*
+ * Reset layer specific mbuf flags
+ * to avoid confusing upper layers.
+ */
+ m->m_flags &= ~(M_PROTOFLAGS);
+
+ error = (*ifp->if_output)(ifp, m,
+ (struct sockaddr *)dst, ro);
+ } else
+ m_freem(m);
+ }
+
+ if (error == 0)
+ IPSTAT_INC(ips_fragmented);
+
+done:
+ if (ro == &iproute && ro->ro_rt && !nortfree) {
+ RTFREE(ro->ro_rt);
+ }
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
+ return (error);
+bad:
+ m_freem(m);
+ goto done;
+}
+
+/*
+ * Create a chain of fragments which fit the given mtu. m_frag points to the
+ * mbuf to be fragmented; on return it points to the chain with the fragments.
+ * Return 0 if no error. If error, m_frag may contain a partially built
+ * chain of fragments that should be freed by the caller.
+ *
+ * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
+ * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
+ */
+int
+ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
+ u_long if_hwassist_flags, int sw_csum)
+{
+ int error = 0;
+ int hlen = ip->ip_hl << 2;
+ int len = (mtu - hlen) & ~7; /* size of payload in each fragment */
+ int off;
+ struct mbuf *m0 = *m_frag; /* the original packet */
+ int firstlen;
+ struct mbuf **mnext;
+ int nfrags;
+
+ if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */
+ IPSTAT_INC(ips_cantfrag);
+ return EMSGSIZE;
+ }
+
+ /*
+ * Must be able to put at least 8 bytes per fragment.
+ */
+ if (len < 8)
+ return EMSGSIZE;
+
+ /*
+ * If the interface will not calculate checksums on
+ * fragmented packets, then do it here.
+ */
+ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
+ (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
+ in_delayed_cksum(m0);
+ m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+#ifdef SCTP
+ if (m0->m_pkthdr.csum_flags & CSUM_SCTP &&
+ (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
+ sctp_delayed_cksum(m0, hlen);
+ m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
+ }
+#endif
+ if (len > PAGE_SIZE) {
+ /*
+ * Fragment large datagrams such that each segment
+ * contains a multiple of PAGE_SIZE amount of data,
+ * plus headers. This enables a receiver to perform
+ * page-flipping zero-copy optimizations.
+ *
+ * XXX When does this help given that sender and receiver
+ * could have different page sizes, and also mtu could
+ * be less than the receiver's page size ?
+ */
+ int newlen;
+ struct mbuf *m;
+
+ for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
+ off += m->m_len;
+
+ /*
+ * firstlen (off - hlen) must be aligned on an
+ * 8-byte boundary
+ */
+ if (off < hlen)
+ goto smart_frag_failure;
+ off = ((off - hlen) & ~7) + hlen;
+ newlen = (~PAGE_MASK) & mtu;
+ if ((newlen + sizeof (struct ip)) > mtu) {
+			/* we failed, go back to the default */
+smart_frag_failure:
+ newlen = len;
+ off = hlen + len;
+ }
+ len = newlen;
+
+ } else {
+ off = hlen + len;
+ }
+
+ firstlen = off - hlen;
+ mnext = &m0->m_nextpkt; /* pointer to next packet */
+
+ /*
+ * Loop through length of segment after first fragment,
+ * make new header and copy data of each part and link onto chain.
+ * Here, m0 is the original packet, m is the fragment being created.
+ * The fragments are linked off the m_nextpkt of the original
+ * packet, which after processing serves as the first fragment.
+ */
+ for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
+ struct ip *mhip; /* ip header on the fragment */
+ struct mbuf *m;
+ int mhlen = sizeof (struct ip);
+
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ IPSTAT_INC(ips_odropped);
+ goto done;
+ }
+ m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
+ /*
+ * In the first mbuf, leave room for the link header, then
+ * copy the original IP header including options. The payload
+ * goes into an additional mbuf chain returned by m_copym().
+ */
+ m->m_data += max_linkhdr;
+ mhip = mtod(m, struct ip *);
+ *mhip = *ip;
+ if (hlen > sizeof (struct ip)) {
+ mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
+ mhip->ip_v = IPVERSION;
+ mhip->ip_hl = mhlen >> 2;
+ }
+ m->m_len = mhlen;
+ /* XXX do we need to add ip->ip_off below ? */
+ mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
+ if (off + len >= ip->ip_len) { /* last fragment */
+ len = ip->ip_len - off;
+ m->m_flags |= M_LASTFRAG;
+ } else
+ mhip->ip_off |= IP_MF;
+ mhip->ip_len = htons((u_short)(len + mhlen));
+ m->m_next = m_copym(m0, off, len, M_DONTWAIT);
+ if (m->m_next == NULL) { /* copy failed */
+ m_free(m);
+ error = ENOBUFS; /* ??? */
+ IPSTAT_INC(ips_odropped);
+ goto done;
+ }
+ m->m_pkthdr.len = mhlen + len;
+ m->m_pkthdr.rcvif = NULL;
+#ifdef MAC
+ mac_netinet_fragment(m0, m);
+#endif
+ m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
+ mhip->ip_off = htons(mhip->ip_off);
+ mhip->ip_sum = 0;
+ if (sw_csum & CSUM_DELAY_IP)
+ mhip->ip_sum = in_cksum(m, mhlen);
+ *mnext = m;
+ mnext = &m->m_nextpkt;
+ }
+ IPSTAT_ADD(ips_ofragments, nfrags);
+
+ /* set first marker for fragment chain */
+ m0->m_flags |= M_FIRSTFRAG | M_FRAG;
+ m0->m_pkthdr.csum_data = nfrags;
+
+ /*
+ * Update first fragment by trimming what's been copied out
+ * and updating header.
+ */
+ m_adj(m0, hlen + firstlen - ip->ip_len);
+ m0->m_pkthdr.len = hlen + firstlen;
+ ip->ip_len = htons((u_short)m0->m_pkthdr.len);
+ ip->ip_off |= IP_MF;
+ ip->ip_off = htons(ip->ip_off);
+ ip->ip_sum = 0;
+ if (sw_csum & CSUM_DELAY_IP)
+ ip->ip_sum = in_cksum(m0, hlen);
+
+done:
+ *m_frag = m0;
+ return error;
+}
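+
+/*
+ * Worked example of the arithmetic above (illustration only): fragmenting a
+ * 4000 byte datagram with a 20 byte header over an MTU of 1500 gives
+ * len = (1500 - 20) & ~7 = 1480, so the 3980 payload bytes are split
+ * 1480 + 1480 + 1020 and the fragments carry ip_off values of 0, 185 and
+ * 370 (in units of 8 bytes), with IP_MF set on all but the last one.
+ */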
+
+void
+in_delayed_cksum(struct mbuf *m)
+{
+ struct ip *ip;
+ u_short csum, offset;
+
+ ip = mtod(m, struct ip *);
+ offset = ip->ip_hl << 2 ;
+ csum = in_cksum_skip(m, ip->ip_len, offset);
+ if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
+ csum = 0xffff;
+ offset += m->m_pkthdr.csum_data; /* checksum offset */
+
+ if (offset + sizeof(u_short) > m->m_len) {
+ printf("delayed m_pullup, m->len: %d off: %d p: %d\n",
+ m->m_len, offset, ip->ip_p);
+ /*
+ * XXX
+ * this shouldn't happen, but if it does, the
+ * correct behavior may be to insert the checksum
+ * in the appropriate next mbuf in the chain.
+ */
+ return;
+ }
+ *(u_short *)(m->m_data + offset) = csum;
+}
+
+/*
+ * IP socket option processing.
+ */
+int
+ip_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ struct inpcb *inp = sotoinpcb(so);
+ int error, optval;
+
+ error = optval = 0;
+ if (sopt->sopt_level != IPPROTO_IP) {
+ if ((sopt->sopt_level == SOL_SOCKET) &&
+ (sopt->sopt_name == SO_SETFIB)) {
+ inp->inp_inc.inc_fibnum = so->so_fibnum;
+ return (0);
+ }
+ return (EINVAL);
+ }
+
+ switch (sopt->sopt_dir) {
+ case SOPT_SET:
+ switch (sopt->sopt_name) {
+ case IP_OPTIONS:
+#ifdef notyet
+ case IP_RETOPTS:
+#endif
+ {
+ struct mbuf *m;
+ if (sopt->sopt_valsize > MLEN) {
+ error = EMSGSIZE;
+ break;
+ }
+ MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ break;
+ }
+ m->m_len = sopt->sopt_valsize;
+ error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
+ m->m_len);
+ if (error) {
+ m_free(m);
+ break;
+ }
+ INP_WLOCK(inp);
+ error = ip_pcbopts(inp, sopt->sopt_name, m);
+ INP_WUNLOCK(inp);
+ return (error);
+ }
+
+ case IP_BINDANY:
+ if (sopt->sopt_td != NULL) {
+ error = priv_check(sopt->sopt_td,
+ PRIV_NETINET_BINDANY);
+ if (error)
+ break;
+ }
+ /* FALLTHROUGH */
+ case IP_TOS:
+ case IP_TTL:
+ case IP_MINTTL:
+ case IP_RECVOPTS:
+ case IP_RECVRETOPTS:
+ case IP_RECVDSTADDR:
+ case IP_RECVTTL:
+ case IP_RECVIF:
+ case IP_FAITH:
+ case IP_ONESBCAST:
+ case IP_DONTFRAG:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ break;
+
+ switch (sopt->sopt_name) {
+ case IP_TOS:
+ inp->inp_ip_tos = optval;
+ break;
+
+ case IP_TTL:
+ inp->inp_ip_ttl = optval;
+ break;
+
+ case IP_MINTTL:
+ if (optval >= 0 && optval <= MAXTTL)
+ inp->inp_ip_minttl = optval;
+ else
+ error = EINVAL;
+ break;
+
+#define OPTSET(bit) do { \
+ INP_WLOCK(inp); \
+ if (optval) \
+ inp->inp_flags |= bit; \
+ else \
+ inp->inp_flags &= ~bit; \
+ INP_WUNLOCK(inp); \
+} while (0)
+
+ case IP_RECVOPTS:
+ OPTSET(INP_RECVOPTS);
+ break;
+
+ case IP_RECVRETOPTS:
+ OPTSET(INP_RECVRETOPTS);
+ break;
+
+ case IP_RECVDSTADDR:
+ OPTSET(INP_RECVDSTADDR);
+ break;
+
+ case IP_RECVTTL:
+ OPTSET(INP_RECVTTL);
+ break;
+
+ case IP_RECVIF:
+ OPTSET(INP_RECVIF);
+ break;
+
+ case IP_FAITH:
+ OPTSET(INP_FAITH);
+ break;
+
+ case IP_ONESBCAST:
+ OPTSET(INP_ONESBCAST);
+ break;
+ case IP_DONTFRAG:
+ OPTSET(INP_DONTFRAG);
+ break;
+ case IP_BINDANY:
+ OPTSET(INP_BINDANY);
+ break;
+ }
+ break;
+#undef OPTSET
+
+ /*
+ * Multicast socket options are processed by the in_mcast
+ * module.
+ */
+ case IP_MULTICAST_IF:
+ case IP_MULTICAST_VIF:
+ case IP_MULTICAST_TTL:
+ case IP_MULTICAST_LOOP:
+ case IP_ADD_MEMBERSHIP:
+ case IP_DROP_MEMBERSHIP:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ case IP_BLOCK_SOURCE:
+ case IP_UNBLOCK_SOURCE:
+ case IP_MSFILTER:
+ case MCAST_JOIN_GROUP:
+ case MCAST_LEAVE_GROUP:
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ error = inp_setmoptions(inp, sopt);
+ break;
+
+ case IP_PORTRANGE:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ break;
+
+ INP_WLOCK(inp);
+ switch (optval) {
+ case IP_PORTRANGE_DEFAULT:
+ inp->inp_flags &= ~(INP_LOWPORT);
+ inp->inp_flags &= ~(INP_HIGHPORT);
+ break;
+
+ case IP_PORTRANGE_HIGH:
+ inp->inp_flags &= ~(INP_LOWPORT);
+ inp->inp_flags |= INP_HIGHPORT;
+ break;
+
+ case IP_PORTRANGE_LOW:
+ inp->inp_flags &= ~(INP_HIGHPORT);
+ inp->inp_flags |= INP_LOWPORT;
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ INP_WUNLOCK(inp);
+ break;
+
+#ifdef IPSEC
+ case IP_IPSEC_POLICY:
+ {
+ caddr_t req;
+ struct mbuf *m;
+
+ if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
+ break;
+ if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
+ break;
+ req = mtod(m, caddr_t);
+ error = ipsec_set_policy(inp, sopt->sopt_name, req,
+ m->m_len, (sopt->sopt_td != NULL) ?
+ sopt->sopt_td->td_ucred : NULL);
+ m_freem(m);
+ break;
+ }
+#endif /* IPSEC */
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+
+ case SOPT_GET:
+ switch (sopt->sopt_name) {
+ case IP_OPTIONS:
+ case IP_RETOPTS:
+ if (inp->inp_options)
+ error = sooptcopyout(sopt,
+ mtod(inp->inp_options,
+ char *),
+ inp->inp_options->m_len);
+ else
+ sopt->sopt_valsize = 0;
+ break;
+
+ case IP_TOS:
+ case IP_TTL:
+ case IP_MINTTL:
+ case IP_RECVOPTS:
+ case IP_RECVRETOPTS:
+ case IP_RECVDSTADDR:
+ case IP_RECVTTL:
+ case IP_RECVIF:
+ case IP_PORTRANGE:
+ case IP_FAITH:
+ case IP_ONESBCAST:
+ case IP_DONTFRAG:
+ case IP_BINDANY:
+ switch (sopt->sopt_name) {
+
+ case IP_TOS:
+ optval = inp->inp_ip_tos;
+ break;
+
+ case IP_TTL:
+ optval = inp->inp_ip_ttl;
+ break;
+
+ case IP_MINTTL:
+ optval = inp->inp_ip_minttl;
+ break;
+
+#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
+
+ case IP_RECVOPTS:
+ optval = OPTBIT(INP_RECVOPTS);
+ break;
+
+ case IP_RECVRETOPTS:
+ optval = OPTBIT(INP_RECVRETOPTS);
+ break;
+
+ case IP_RECVDSTADDR:
+ optval = OPTBIT(INP_RECVDSTADDR);
+ break;
+
+ case IP_RECVTTL:
+ optval = OPTBIT(INP_RECVTTL);
+ break;
+
+ case IP_RECVIF:
+ optval = OPTBIT(INP_RECVIF);
+ break;
+
+ case IP_PORTRANGE:
+ if (inp->inp_flags & INP_HIGHPORT)
+ optval = IP_PORTRANGE_HIGH;
+ else if (inp->inp_flags & INP_LOWPORT)
+ optval = IP_PORTRANGE_LOW;
+ else
+ optval = 0;
+ break;
+
+ case IP_FAITH:
+ optval = OPTBIT(INP_FAITH);
+ break;
+
+ case IP_ONESBCAST:
+ optval = OPTBIT(INP_ONESBCAST);
+ break;
+ case IP_DONTFRAG:
+ optval = OPTBIT(INP_DONTFRAG);
+ break;
+ case IP_BINDANY:
+ optval = OPTBIT(INP_BINDANY);
+ break;
+ }
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+
+ /*
+ * Multicast socket options are processed by the in_mcast
+ * module.
+ */
+ case IP_MULTICAST_IF:
+ case IP_MULTICAST_VIF:
+ case IP_MULTICAST_TTL:
+ case IP_MULTICAST_LOOP:
+ case IP_MSFILTER:
+ error = inp_getmoptions(inp, sopt);
+ break;
+
+#ifdef IPSEC
+ case IP_IPSEC_POLICY:
+ {
+ struct mbuf *m = NULL;
+ caddr_t req = NULL;
+ size_t len = 0;
+
+ if (m != 0) {
+ req = mtod(m, caddr_t);
+ len = m->m_len;
+ }
+ error = ipsec_get_policy(sotoinpcb(so), req, len, &m);
+ if (error == 0)
+ error = soopt_mcopyout(sopt, m); /* XXX */
+ if (error == 0)
+ m_freem(m);
+ break;
+ }
+#endif /* IPSEC */
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+ }
+ return (error);
+}
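+
+/*
+ * Userland view of the handler above (illustrative sketch): setting the TOS
+ * byte on a socket ends up in the SOPT_SET/IP_TOS arm of ip_ctloutput():
+ *
+ *	int tos = IPTOS_LOWDELAY;
+ *	setsockopt(s, IPPROTO_IP, IP_TOS, &tos, sizeof(tos));
+ */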
+
+/*
+ * Routine called from ip_output() to loop back a copy of an IP multicast
+ * packet to the input queue of a specified interface. Note that this
+ * calls the output routine of the loopback "driver", but with an interface
+ * pointer that might NOT be a loopback interface -- evil, but easier than
+ * replicating that code here.
+ */
+static void
+ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
+ int hlen)
+{
+ register struct ip *ip;
+ struct mbuf *copym;
+
+ /*
+ * Make a deep copy of the packet because we're going to
+	 * modify the packet in order to generate checksums.
+ */
+ copym = m_dup(m, M_DONTWAIT);
+ if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
+ copym = m_pullup(copym, hlen);
+ if (copym != NULL) {
+ /* If needed, compute the checksum and mark it as valid. */
+ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ in_delayed_cksum(copym);
+ copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ copym->m_pkthdr.csum_flags |=
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ copym->m_pkthdr.csum_data = 0xffff;
+ }
+ /*
+ * We don't bother to fragment if the IP length is greater
+ * than the interface's MTU. Can this possibly matter?
+ */
+ ip = mtod(copym, struct ip *);
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+ ip->ip_sum = 0;
+ ip->ip_sum = in_cksum(copym, hlen);
+#if 1 /* XXX */
+ if (dst->sin_family != AF_INET) {
+ printf("ip_mloopback: bad address family %d\n",
+ dst->sin_family);
+ dst->sin_family = AF_INET;
+ }
+#endif
+ if_simloop(ifp, copym, dst->sin_family, 0);
+ }
+}
diff --git a/freebsd/sys/netinet/ip_var.h b/freebsd/sys/netinet/ip_var.h
new file mode 100644
index 00000000..2902174d
--- /dev/null
+++ b/freebsd/sys/netinet/ip_var.h
@@ -0,0 +1,315 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_var.h 8.2 (Berkeley) 1/9/95
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IP_VAR_HH_
+#define _NETINET_IP_VAR_HH_
+
+#include <freebsd/sys/queue.h>
+
+/*
+ * Overlay for ip header used by other protocols (tcp, udp).
+ */
+struct ipovly {
+ u_char ih_x1[9]; /* (unused) */
+ u_char ih_pr; /* protocol */
+ u_short ih_len; /* protocol length */
+ struct in_addr ih_src; /* source internet address */
+ struct in_addr ih_dst; /* destination internet address */
+};
+
+#ifdef _KERNEL
+/*
+ * Ip reassembly queue structure. Each fragment
+ * being reassembled is attached to one of these structures.
+ * They are timed out after ipq_ttl drops to 0, and may also
+ * be reclaimed if memory becomes tight.
+ */
+struct ipq {
+ TAILQ_ENTRY(ipq) ipq_list; /* to other reass headers */
+ u_char ipq_ttl; /* time for reass q to live */
+ u_char ipq_p; /* protocol of this fragment */
+ u_short ipq_id; /* sequence id for reassembly */
+ struct mbuf *ipq_frags; /* to ip headers of fragments */
+ struct in_addr ipq_src,ipq_dst;
+ u_char ipq_nfrags; /* # frags in this packet */
+ struct label *ipq_label; /* MAC label */
+};
+#endif /* _KERNEL */
+
+/*
+ * Structure stored in mbuf in inpcb.ip_options
+ * and passed to ip_output when ip options are in use.
+ * The actual length of the options (including ipopt_dst)
+ * is in m_len.
+ */
+#define MAX_IPOPTLEN 40
+
+struct ipoption {
+ struct in_addr ipopt_dst; /* first-hop dst if source routed */
+ char ipopt_list[MAX_IPOPTLEN]; /* options proper */
+};
+
+/*
+ * Structure attached to inpcb.ip_moptions and
+ * passed to ip_output when IP multicast options are in use.
+ * This structure is lazy-allocated.
+ */
+struct ip_moptions {
+ struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */
+ struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */
+ u_long imo_multicast_vif; /* vif num outgoing multicasts */
+ u_char imo_multicast_ttl; /* TTL for outgoing multicasts */
+ u_char imo_multicast_loop; /* 1 => hear sends if a member */
+ u_short imo_num_memberships; /* no. memberships this socket */
+ u_short imo_max_memberships; /* max memberships this socket */
+ struct in_multi **imo_membership; /* group memberships */
+ struct in_mfilter *imo_mfilters; /* source filters */
+};
+
+struct ipstat {
+ u_long ips_total; /* total packets received */
+ u_long ips_badsum; /* checksum bad */
+ u_long ips_tooshort; /* packet too short */
+ u_long ips_toosmall; /* not enough data */
+ u_long ips_badhlen; /* ip header length < data size */
+ u_long ips_badlen; /* ip length < ip header length */
+ u_long ips_fragments; /* fragments received */
+ u_long ips_fragdropped; /* frags dropped (dups, out of space) */
+ u_long ips_fragtimeout; /* fragments timed out */
+ u_long ips_forward; /* packets forwarded */
+ u_long ips_fastforward; /* packets fast forwarded */
+ u_long ips_cantforward; /* packets rcvd for unreachable dest */
+ u_long ips_redirectsent; /* packets forwarded on same net */
+ u_long ips_noproto; /* unknown or unsupported protocol */
+ u_long ips_delivered; /* datagrams delivered to upper level*/
+ u_long ips_localout; /* total ip packets generated here */
+ u_long ips_odropped; /* lost packets due to nobufs, etc. */
+ u_long ips_reassembled; /* total packets reassembled ok */
+ u_long ips_fragmented; /* datagrams successfully fragmented */
+ u_long ips_ofragments; /* output fragments created */
+ u_long ips_cantfrag; /* don't fragment flag was set, etc. */
+ u_long ips_badoptions; /* error in option processing */
+ u_long ips_noroute; /* packets discarded due to no route */
+ u_long ips_badvers; /* ip version != 4 */
+ u_long ips_rawout; /* total raw ip packets generated */
+ u_long ips_toolong; /* ip length > max ip packet size */
+ u_long ips_notmember; /* multicasts for unregistered grps */
+ u_long ips_nogif; /* no match gif found */
+ u_long ips_badaddr; /* invalid address on header */
+};
+
+#ifdef _KERNEL
+
+#include <freebsd/net/vnet.h>
+
+/*
+ * In-kernel consumers can use these accessor macros directly to update
+ * stats.
+ */
+#define IPSTAT_ADD(name, val) V_ipstat.name += (val)
+#define IPSTAT_SUB(name, val) V_ipstat.name -= (val)
+#define IPSTAT_INC(name) IPSTAT_ADD(name, 1)
+#define IPSTAT_DEC(name) IPSTAT_SUB(name, 1)
+
+/*
+ * Kernel module consumers must use this accessor macro.
+ */
+void kmod_ipstat_inc(int statnum);
+#define KMOD_IPSTAT_INC(name) \
+ kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(u_long))
+void kmod_ipstat_dec(int statnum);
+#define KMOD_IPSTAT_DEC(name) \
+ kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(u_long))
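+
+/*
+ * Usage sketch (illustration only): statically linked code can write
+ * IPSTAT_INC(ips_total), while a loadable module uses
+ * KMOD_IPSTAT_INC(ips_total) so that it reaches the per-vnet counters
+ * through the exported helpers instead of binding to V_ipstat directly.
+ */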
+
+/* flags passed to ip_output as last parameter */
+#define IP_FORWARDING 0x1 /* most of ip header exists */
+#define IP_RAWOUTPUT 0x2 /* raw ip header exists */
+#define IP_SENDONES 0x4 /* send all-ones broadcast */
+#define IP_SENDTOIF 0x8 /* send on specific ifnet */
+#define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */
+#define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */
+
+/*
+ * mbuf flag used by ip_fastfwd
+ */
+#define M_FASTFWD_OURS M_PROTO1 /* changed dst to local */
+
+#ifdef __NO_STRICT_ALIGNMENT
+#define IP_HDR_ALIGNED_P(ip) 1
+#else
+#define IP_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0)
+#endif
+
+struct ip;
+struct inpcb;
+struct route;
+struct sockopt;
+
+VNET_DECLARE(struct ipstat, ipstat);
+VNET_DECLARE(u_short, ip_id); /* ip packet ctr, for ids */
+VNET_DECLARE(int, ip_defttl); /* default IP ttl */
+VNET_DECLARE(int, ipforwarding); /* ip forwarding */
+#ifdef IPSTEALTH
+VNET_DECLARE(int, ipstealth); /* stealth forwarding */
+#endif
+extern u_char ip_protox[];
+VNET_DECLARE(struct socket *, ip_rsvpd); /* reservation protocol daemon*/
+VNET_DECLARE(struct socket *, ip_mrouter); /* multicast routing daemon */
+extern int (*legal_vif_num)(int);
+extern u_long (*ip_mcast_src)(int);
+VNET_DECLARE(int, rsvp_on);
+extern struct pr_usrreqs rip_usrreqs;
+
+#define V_ipstat VNET(ipstat)
+#define V_ip_id VNET(ip_id)
+#define V_ip_defttl VNET(ip_defttl)
+#define V_ipforwarding VNET(ipforwarding)
+#ifdef IPSTEALTH
+#define V_ipstealth VNET(ipstealth)
+#endif
+#define V_ip_rsvpd VNET(ip_rsvpd)
+#define V_ip_mrouter VNET(ip_mrouter)
+#define V_rsvp_on VNET(rsvp_on)
+
+void inp_freemoptions(struct ip_moptions *);
+int inp_getmoptions(struct inpcb *, struct sockopt *);
+int inp_setmoptions(struct inpcb *, struct sockopt *);
+
+int ip_ctloutput(struct socket *, struct sockopt *sopt);
+void ip_drain(void);
+void ip_fini(void *xtp);
+int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
+ u_long if_hwassist_flags, int sw_csum);
+void ip_forward(struct mbuf *m, int srcrt);
+void ip_init(void);
+#ifdef VIMAGE
+void ip_destroy(void);
+#endif
+extern int
+ (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
+ struct ip_moptions *);
+int ip_output(struct mbuf *,
+ struct mbuf *, struct route *, int, struct ip_moptions *,
+ struct inpcb *);
+int ipproto_register(short);
+int ipproto_unregister(short);
+struct mbuf *
+ ip_reass(struct mbuf *);
+struct in_ifaddr *
+ ip_rtaddr(struct in_addr, u_int fibnum);
+void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
+ struct mbuf *);
+void ip_slowtimo(void);
+u_int16_t ip_randomid(void);
+int rip_ctloutput(struct socket *, struct sockopt *);
+void rip_ctlinput(int, struct sockaddr *, void *);
+void rip_init(void);
+#ifdef VIMAGE
+void rip_destroy(void);
+#endif
+void rip_input(struct mbuf *, int);
+int rip_output(struct mbuf *, struct socket *, u_long);
+void ipip_input(struct mbuf *, int);
+void rsvp_input(struct mbuf *, int);
+int ip_rsvp_init(struct socket *);
+int ip_rsvp_done(void);
+extern int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
+extern void (*ip_rsvp_force_done)(struct socket *);
+extern void (*rsvp_input_p)(struct mbuf *m, int off);
+
+VNET_DECLARE(struct pfil_head, inet_pfil_hook); /* packet filter hooks */
+#define V_inet_pfil_hook VNET(inet_pfil_hook)
+
+void in_delayed_cksum(struct mbuf *m);
+
+/* Hooks for ipfw, dummynet, divert etc. Most are declared in raw_ip.c */
+/*
+ * Reference to an ipfw or packet filter rule that can be carried
+ * outside critical sections.
+ * A rule is identified by rulenum:rule_id which is ordered.
+ * In version chain_id the rule can be found in slot 'slot', so
+ * we don't need a lookup if chain_id == chain->id.
+ *
+ * On exit from the firewall this structure refers to the rule after
+ * the matching one (slot points to the new rule; rulenum:rule_id-1
+ * is the matching rule), and additional info (e.g. info often contains
+ * the insn argument or tablearg in the low 16 bits, in host format).
+ * On entry, the structure is valid if slot>0, and refers to the starting
+ * rules. 'info' contains the reason for reinject, e.g. divert port,
+ * divert direction, and so on.
+ */
+struct ipfw_rule_ref {
+ uint32_t slot; /* slot for matching rule */
+ uint32_t rulenum; /* matching rule number */
+ uint32_t rule_id; /* matching rule id */
+ uint32_t chain_id; /* ruleset id */
+ uint32_t info; /* see below */
+};
+
+enum {
+ IPFW_INFO_MASK = 0x0000ffff,
+ IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */
+ IPFW_INFO_IN = 0x80000000, /* incoming, overloads dir */
+ IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */
+ IPFW_IS_MASK = 0x30000000, /* which source ? */
+ IPFW_IS_DIVERT = 0x20000000,
+ IPFW_IS_DUMMYNET =0x10000000,
+	IPFW_IS_PIPE =	0x08000000,	/* pipe = 1, queue = 0 */
+};
+#define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */
+#define MTAG_IPFW_RULE 1262273568 /* rule reference */
+
+struct ip_fw_args;
+typedef int (*ip_fw_chk_ptr_t)(struct ip_fw_args *args);
+typedef int (*ip_fw_ctl_ptr_t)(struct sockopt *);
+VNET_DECLARE(ip_fw_chk_ptr_t, ip_fw_chk_ptr);
+VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr);
+#define V_ip_fw_chk_ptr VNET(ip_fw_chk_ptr)
+#define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr)
+
+/* Divert hooks. */
+extern void (*ip_divert_ptr)(struct mbuf *m, int incoming);
+/* ng_ipfw hooks -- XXX make it the same as divert and dummynet */
+extern int (*ng_ipfw_input_p)(struct mbuf **, int,
+ struct ip_fw_args *, int);
+
+extern int (*ip_dn_ctl_ptr)(struct sockopt *);
+extern int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *);
+
+VNET_DECLARE(int, ip_do_randomid);
+#define V_ip_do_randomid VNET(ip_do_randomid)
+#define ip_newid() ((V_ip_do_randomid != 0) ? ip_randomid() : \
+ htons(V_ip_id++))
+
+#endif /* _KERNEL */
+
+#endif /* !_NETINET_IP_VAR_HH_ */
diff --git a/freebsd/sys/netinet/ipfw/dn_heap.c b/freebsd/sys/netinet/ipfw/dn_heap.c
new file mode 100644
index 00000000..1e6133bc
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/dn_heap.c
@@ -0,0 +1,552 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Binary heap and hash tables, used in dummynet
+ *
+ * $FreeBSD$
+ */
+
+#include <freebsd/sys/cdefs.h>
+#include <freebsd/sys/param.h>
+#ifdef _KERNEL
+__FBSDID("$FreeBSD$");
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/netinet/ipfw/dn_heap.h>
+#ifndef log
+#define log(x, arg...)
+#endif
+
+#else /* !_KERNEL */
+
+#include <freebsd/stdio.h>
+#include <freebsd/dn_test.h>
+#include <freebsd/strings.h>
+#include <freebsd/stdlib.h>
+
+#include "dn_heap.h"
+#define log(x, arg...) fprintf(stderr, ## arg)
+#define panic(x...) fprintf(stderr, ## x), exit(1)
+#define MALLOC_DEFINE(a, b, c)
+static void *my_malloc(int s) { return malloc(s); }
+static void my_free(void *p) { free(p); }
+#define malloc(s, t, w) my_malloc(s)
+#define free(p, t) my_free(p)
+#endif /* !_KERNEL */
+
+MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap");
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_init() is called to expand the heap when needed.
+ * Increment size in blocks of 16 entries.
+ * Returns 1 on error, 0 on success
+ */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x) ( (x)+(x) + 1 )
+#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT 15
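+
+/*
+ * Small worked example of the index arithmetic above (illustration only):
+ * element 3 has children HEAP_LEFT(3) = 7 and 8, and the integer division
+ * in HEAP_FATHER() maps both 7 and 8 back to 3.
+ */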
+
+static int
+heap_resize(struct dn_heap *h, unsigned int new_size)
+{
+ struct dn_heap_entry *p;
+
+ if (h->size >= new_size ) /* have enough room */
+ return 0;
+#if 1 /* round to the next power of 2 */
+ new_size |= new_size >> 1;
+ new_size |= new_size >> 2;
+ new_size |= new_size >> 4;
+ new_size |= new_size >> 8;
+ new_size |= new_size >> 16;
+#else
+ new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT;
+#endif
+ p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT);
+ if (p == NULL) {
+ printf("--- %s, resize %d failed\n", __func__, new_size );
+ return 1; /* error */
+ }
+ if (h->size > 0) {
+ bcopy(h->p, p, h->size * sizeof(*p) );
+ free(h->p, M_DN_HEAP);
+ }
+ h->p = p;
+ h->size = new_size;
+ return 0;
+}
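+
+/*
+ * Worked example of the rounding above (illustration only): for a request
+ * of 20 slots the OR cascade sets every bit below the leading one, so the
+ * array is (re)allocated with new_size = 31 entries, i.e. 2^k - 1 rather
+ * than an exact power of two.
+ */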
+
+int
+heap_init(struct dn_heap *h, int size, int ofs)
+{
+ if (heap_resize(h, size))
+ return 1;
+ h->elements = 0;
+ h->ofs = ofs;
+ return 0;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If ofs > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ */
+#define SET_OFFSET(h, i) do { \
+ if (h->ofs > 0) \
+ *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \
+ } while (0)
+/*
+ * RESET_OFFSET is used for sanity checks. It sets ofs
+ * to an invalid value.
+ */
+#define RESET_OFFSET(h, i) do { \
+ if (h->ofs > 0) \
+ *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \
+ } while (0)
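+
+#if 0
+/*
+ * Hypothetical consumer of the offset mechanism described above (sketch
+ * only, the names are made up): the element records its own heap index,
+ * which is what lets heap_extract() remove it from the middle of the heap.
+ */
+struct my_sched_item {
+	uint64_t deadline;	/* key used when inserting */
+	int heap_pos;		/* slot kept up to date via SET_OFFSET() */
+};
+
+static void
+my_sched_item_example(struct dn_heap *h, struct my_sched_item *si)
+{
+	heap_init(h, 16, offsetof(struct my_sched_item, heap_pos));
+	heap_insert(h, si->deadline, si);	/* bubbles up, sets heap_pos */
+	heap_extract(h, si);			/* uses heap_pos to find the slot */
+}
+#endif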
+
+int
+heap_insert(struct dn_heap *h, uint64_t key1, void *p)
+{
+ int son = h->elements;
+
+ //log("%s key %llu p %p\n", __FUNCTION__, key1, p);
+ if (p == NULL) { /* data already there, set starting point */
+ son = key1;
+ } else { /* insert new element at the end, possibly resize */
+ son = h->elements;
+ if (son == h->size) /* need resize... */
+ // XXX expand by 16 or so
+ if (heap_resize(h, h->elements+16) )
+ return 1; /* failure... */
+ h->p[son].object = p;
+ h->p[son].key = key1;
+ h->elements++;
+ }
+ /* make sure that son >= father along the path */
+ while (son > 0) {
+ int father = HEAP_FATHER(son);
+ struct dn_heap_entry tmp;
+
+ if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+ break; /* found right position */
+ /* son smaller than father, swap and repeat */
+ HEAP_SWAP(h->p[son], h->p[father], tmp);
+ SET_OFFSET(h, son);
+ son = father;
+ }
+ SET_OFFSET(h, son);
+ return 0;
+}
+
+/*
+ * remove top element from heap, or obj if obj != NULL
+ */
+void
+heap_extract(struct dn_heap *h, void *obj)
+{
+ int child, father, max = h->elements - 1;
+
+ if (max < 0) {
+ printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h);
+ return;
+ }
+ if (obj == NULL)
+ father = 0; /* default: move up smallest child */
+ else { /* extract specific element, index is at offset */
+ if (h->ofs <= 0)
+ panic("%s: extract from middle not set on %p\n",
+ __FUNCTION__, h);
+ father = *((int *)((char *)obj + h->ofs));
+ if (father < 0 || father >= h->elements) {
+ panic("%s: father %d out of bound 0..%d\n",
+ __FUNCTION__, father, h->elements);
+ }
+ }
+ /*
+ * below, father is the index of the empty element, which
+ * we replace at each step with the smallest child until we
+ * reach the bottom level.
+ */
+ // XXX why removing RESET_OFFSET increases runtime by 10% ?
+ RESET_OFFSET(h, father);
+ while ( (child = HEAP_LEFT(father)) <= max ) {
+ if (child != max &&
+ DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+ child++; /* take right child, otherwise left */
+ h->p[father] = h->p[child];
+ SET_OFFSET(h, father);
+ father = child;
+ }
+ h->elements--;
+ if (father != max) {
+ /*
+ * Fill hole with last entry and bubble up,
+ * reusing the insert code
+ */
+ h->p[father] = h->p[max];
+ heap_insert(h, father, NULL);
+ }
+}
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ */
+static void
+heap_move(struct dn_heap *h, uint64_t new_key, void *object)
+{
+ int temp, i, max = h->elements-1;
+ struct dn_heap_entry *p, buf;
+
+ if (h->ofs <= 0)
+ panic("cannot move items on this heap");
+ p = h->p; /* shortcut */
+
+ i = *((int *)((char *)object + h->ofs));
+ if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */
+ p[i].key = new_key;
+ for (; i>0 &&
+ DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key);
+ i = temp ) { /* bubble up */
+ HEAP_SWAP(p[i], p[temp], buf);
+ SET_OFFSET(h, i);
+ }
+ } else { /* must move down */
+ p[i].key = new_key;
+ while ( (temp = HEAP_LEFT(i)) <= max ) {
+ /* found left child */
+ if (temp != max &&
+ DN_KEY_LT(p[temp+1].key, p[temp].key))
+ temp++; /* select child with min key */
+ if (DN_KEY_LT(p[temp].key, new_key)) {
+ /* go down */
+ HEAP_SWAP(p[i], p[temp], buf);
+ SET_OFFSET(h, i);
+ } else
+ break;
+ i = temp;
+ }
+ }
+ SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * heapify() will reorganize data inside an array to maintain the
+ * heap property. It is needed when we delete a bunch of entries.
+ */
+static void
+heapify(struct dn_heap *h)
+{
+ int i;
+
+ for (i = 0; i < h->elements; i++ )
+ heap_insert(h, i , NULL);
+}
+
+int
+heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t),
+ uintptr_t arg)
+{
+ int i, ret, found;
+
+ for (i = found = 0 ; i < h->elements ;) {
+ ret = fn(h->p[i].object, arg);
+ if (ret & HEAP_SCAN_DEL) {
+ h->elements-- ;
+ h->p[i] = h->p[h->elements] ;
+ found++ ;
+ } else
+ i++ ;
+ if (ret & HEAP_SCAN_END)
+ break;
+ }
+ if (found)
+ heapify(h);
+ return found;
+}
+
+/*
+ * cleanup the heap and free data structure
+ */
+void
+heap_free(struct dn_heap *h)
+{
+ if (h->size >0 )
+ free(h->p, M_DN_HEAP);
+ bzero(h, sizeof(*h) );
+}
+
+/*
+ * hash table support.
+ */
+
+struct dn_ht {
+ int buckets; /* how many buckets, really buckets - 1 */
+ int entries; /* how many entries */
+ int ofs; /* offset of link field */
+ uint32_t (*hash)(uintptr_t, int, void *arg);
+ int (*match)(void *_el, uintptr_t key, int, void *);
+ void *(*newh)(uintptr_t, int, void *);
+ void **ht; /* bucket heads */
+};
+/*
+ * Initialize, allocating bucket pointers inline.
+ * Recycle previous record if possible.
+ * If the 'newh' function is not supplied, we assume that the
+ * key passed to ht_find is itself the object to be stored.
+ */
+struct dn_ht *
+dn_ht_init(struct dn_ht *ht, int buckets, int ofs,
+ uint32_t (*h)(uintptr_t, int, void *),
+ int (*match)(void *, uintptr_t, int, void *),
+ void *(*newh)(uintptr_t, int, void *))
+{
+ int l;
+
+ /*
+ * Notes about rounding bucket size to a power of two.
+ * Given the original bucket size, we compute the nearest lower and
+ * higher powers of two, minus 1 (respectively b_min and b_max), because
+ * this value will be used to do an AND with the index returned
+ * by the hash function.
+ * To choose between these two values, the original bucket size is
+ * compared with b_min. If the original size is greater than 4/3 b_min,
+ * we round the bucket size to b_max, otherwise to b_min.
+ * This ratio tries to round to the nearest power of two, favoring
+ * the larger size when the difference between the two powers is
+ * relatively big.
+ * Rounding the bucket size to a power of two avoids the use of a
+ * modulo operation when calculating the correct bucket.
+ * The ht->buckets variable stores the bucket size - 1 so we can simply
+ * do an AND between the index returned by the hash function and
+ * ht->buckets instead of a modulo.
+ */
+ int b_min; /* min buckets */
+ int b_max; /* max buckets */
+ int b_ori; /* original buckets */
+
+ if (h == NULL || match == NULL) {
+ printf("--- missing hash or match function");
+ return NULL;
+ }
+ if (buckets < 1 || buckets > 65536)
+ return NULL;
+
+ b_ori = buckets;
+ /* calculate next power of 2, - 1*/
+ buckets |= buckets >> 1;
+ buckets |= buckets >> 2;
+ buckets |= buckets >> 4;
+ buckets |= buckets >> 8;
+ buckets |= buckets >> 16;
+
+ b_max = buckets; /* Next power */
+ b_min = buckets >> 1; /* Previous power */
+
+ /* Calculate the 'nearest' bucket size */
+ if (b_min * 4000 / 3000 < b_ori)
+ buckets = b_max;
+ else
+ buckets = b_min;
+
+ if (ht) { /* see if we can reuse */
+ if (buckets <= ht->buckets) {
+ ht->buckets = buckets;
+ } else {
+ /* free pointers if not allocated inline */
+ if (ht->ht != (void *)(ht + 1))
+ free(ht->ht, M_DN_HEAP);
+ free(ht, M_DN_HEAP);
+ ht = NULL;
+ }
+ }
+ if (ht == NULL) {
+ /* Allocate buckets + 1 entries because buckets is used to
+ * do the AND with the index returned by the hash function
+ */
+ l = sizeof(*ht) + (buckets + 1) * sizeof(void **);
+ ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO);
+ }
+ if (ht) {
+ ht->ht = (void **)(ht + 1);
+ ht->buckets = buckets;
+ ht->ofs = ofs;
+ ht->hash = h;
+ ht->match = match;
+ ht->newh = newh;
+ }
+ return ht;
+}
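Two worked cases of the bucket rounding above: a request for 50 buckets gives b_min = 31 and b_max = 63; since 31 * 4000 / 3000 = 41 is less than 50, the table is rounded up to 64 buckets (mask 63). A request for 40 buckets computes the same b_min and b_max, but 41 is not less than 40, so the table is rounded down to 32 buckets (mask 31).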
+
+/* dummy callback for dn_ht_free to unlink all */
+static int
+do_del(void *obj, void *arg)
+{
+ return DNHT_SCAN_DEL;
+}
+
+void
+dn_ht_free(struct dn_ht *ht, int flags)
+{
+ if (ht == NULL)
+ return;
+ if (flags & DNHT_REMOVE) {
+ (void)dn_ht_scan(ht, do_del, NULL);
+ } else {
+ if (ht->ht && ht->ht != (void *)(ht + 1))
+ free(ht->ht, M_DN_HEAP);
+ free(ht, M_DN_HEAP);
+ }
+}
+
+int
+dn_ht_entries(struct dn_ht *ht)
+{
+ return ht ? ht->entries : 0;
+}
+
+/* lookup and optionally create or delete element */
+void *
+dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg)
+{
+ int i;
+ void **pp, *p;
+
+ if (ht == NULL) /* easy on an empty hash */
+ return NULL;
+ i = (ht->buckets == 1) ? 0 :
+ (ht->hash(key, flags, arg) & ht->buckets);
+
+ for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) {
+ if (flags & DNHT_MATCH_PTR) {
+ if (key == (uintptr_t)p)
+ break;
+ } else if (ht->match(p, key, flags, arg)) /* found match */
+ break;
+ }
+ if (p) {
+ if (flags & DNHT_REMOVE) {
+ /* link in the next element */
+ *pp = *(void **)((char *)p + ht->ofs);
+ *(void **)((char *)p + ht->ofs) = NULL;
+ ht->entries--;
+ }
+ } else if (flags & DNHT_INSERT) {
+ // printf("%s before calling new, bucket %d ofs %d\n",
+ // __FUNCTION__, i, ht->ofs);
+ p = ht->newh ? ht->newh(key, flags, arg) : (void *)key;
+ // printf("%s newh returns %p\n", __FUNCTION__, p);
+ if (p) {
+ ht->entries++;
+ *(void **)((char *)p + ht->ofs) = ht->ht[i];
+ ht->ht[i] = p;
+ }
+ }
+ return p;
+}
+
+/*
+ * do a scan with the option to delete the object. Extract next before
+ * running the callback because the element may be destroyed there.
+ */
+int
+dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg)
+{
+ int i, ret, found = 0;
+ void **curp, *cur, *next;
+
+ if (ht == NULL || fn == NULL)
+ return 0;
+ for (i = 0; i <= ht->buckets; i++) {
+ curp = &ht->ht[i];
+ while ( (cur = *curp) != NULL) {
+ next = *(void **)((char *)cur + ht->ofs);
+ ret = fn(cur, arg);
+ if (ret & DNHT_SCAN_DEL) {
+ found++;
+ ht->entries--;
+ *curp = next;
+ } else {
+ curp = (void **)((char *)cur + ht->ofs);
+ }
+ if (ret & DNHT_SCAN_END)
+ return found;
+ }
+ }
+ return found;
+}
+
+/*
+ * Similar to dn_ht_scan(), except that the scan is performed only
+ * in the bucket 'bucket'. The function returns a correct bucket number if
+ * the original is invalid.
+ */
+int
+dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *),
+ void *arg)
+{
+ int i, ret, found = 0;
+ void **curp, *cur, *next;
+
+ if (ht == NULL || fn == NULL)
+ return 0;
+ if (*bucket > ht->buckets)
+ *bucket = 0;
+ i = *bucket;
+
+ curp = &ht->ht[i];
+ while ( (cur = *curp) != NULL) {
+ next = *(void **)((char *)cur + ht->ofs);
+ ret = fn(cur, arg);
+ if (ret & DNHT_SCAN_DEL) {
+ found++;
+ ht->entries--;
+ *curp = next;
+ } else {
+ curp = (void **)((char *)cur + ht->ofs);
+ }
+ if (ret & DNHT_SCAN_END)
+ return found;
+ }
+ return found;
+}
+
diff --git a/freebsd/sys/netinet/ipfw/dn_heap.h b/freebsd/sys/netinet/ipfw/dn_heap.h
new file mode 100644
index 00000000..c95473ad
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/dn_heap.h
@@ -0,0 +1,191 @@
+/*-
+ * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Binary heap and hash tables, header file
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_HEAP_H
+#define _IP_DN_HEAP_H
+
+#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0)
+#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0)
+
+/*
+ * This module implements a binary heap supporting random extraction.
+ *
+ * A heap entry contains an uint64_t key and a pointer to object.
+ * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b'
+ *
+ * The heap is a struct dn_heap plus a dynamically allocated
+ * array of dn_heap_entry entries. 'size' represents the size of
+ * the array, 'elements' counts the entries in use. The topmost
+ * element has the smallest key.
+ * The heap supports ordered insertion, and extraction from the top.
+ * To extract an object from the middle of the heap, the object
+ * must reserve an 'int32_t' to store the position of the object
+ * in the heap itself, and the location of this field must be
+ * passed as an argument to heap_init() -- use -1 if the feature
+ * is not used.
+ */
+struct dn_heap_entry {
+ uint64_t key; /* sorting key, smallest comes first */
+ void *object; /* object pointer */
+};
+
+struct dn_heap {
+ int size; /* the size of the array */
+ int elements; /* elements in use */
+ int ofs; /* offset in the object of heap index */
+ struct dn_heap_entry *p; /* array of "size" entries */
+};
+
+enum {
+ HEAP_SCAN_DEL = 1,
+ HEAP_SCAN_END = 2,
+};
+
+/*
+ * heap_init() reinitializes the heap setting the size and the offset
+ * of the index for random extraction (use -1 if not used).
+ * The 'elements' counter is set to 0.
+ *
+ * SET_HEAP_OFS() indicates where, in the object, is stored the index
+ * for random extractions from the heap.
+ *
+ * heap_free() frees the memory associated to a heap.
+ *
+ * heap_insert() adds a key-pointer pair to the heap
+ *
+ * HEAP_TOP() returns a pointer to the top element of the heap,
+ * but makes no checks on its existence (XXX should we change ?)
+ *
+ * heap_extract() removes the entry at the top, returning the pointer.
+ * (the key should have been read before).
+ *
+ * heap_scan() invokes a callback on each entry of the heap.
+ * The callback can return a combination of HEAP_SCAN_DEL and
+ * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must
+ * be removed, and HEAP_SCAN_END means to terminate the scan.
+ * heap_scan() returns the number of elements removed.
+ * Because the order is not guaranteed, we should use heap_scan()
+ * only as a last resort mechanism.
+ */
+#define HEAP_TOP(h) ((h)->p)
+#define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0)
+int heap_init(struct dn_heap *h, int size, int ofs);
+int heap_insert(struct dn_heap *h, uint64_t key1, void *p);
+void heap_extract(struct dn_heap *h, void *obj);
+void heap_free(struct dn_heap *h);
+int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t);
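
A minimal usage sketch of the heap API above (illustrative only; obj_a and obj_b stand for arbitrary caller-owned pointers, and the heap is initialized with ofs = -1, i.e. without support for extraction from the middle):

	static void
	heap_example(void *obj_a, void *obj_b)
	{
		struct dn_heap h;

		bzero(&h, sizeof(h));
		if (heap_init(&h, 16, -1))	/* -1: no per-object index field */
			return;			/* allocation failed */
		heap_insert(&h, 10, obj_a);	/* key 10 */
		heap_insert(&h, 5, obj_b);	/* key 5 becomes the new top */
		/* HEAP_TOP(&h)->key is now 5, HEAP_TOP(&h)->object is obj_b */
		heap_extract(&h, NULL);		/* remove the entry with the smallest key */
		heap_free(&h);
	}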
+
+/*------------------------------------------------------
+ * This module implements a generic hash table with support for
+ * running callbacks on the entire table. To avoid allocating
+ * memory during hash table operations, objects must reserve
+ * space for a link field. XXX if the heap is moderately full,
+ * an SLIST suffices, and we can tolerate the cost of a hash
+ * computation on each removal.
+ *
+ * dn_ht_init() initializes the table, setting the number of
+ * buckets, the offset of the link field, the main callbacks.
+ * Callbacks are:
+ *
+ * hash(key, flags, arg) called to return a bucket index.
+ * match(obj, key, flags, arg) called to determine if key
+ * matches the current 'obj' in the heap
+ * newh(key, flags, arg) optional, used to allocate a new
+ * object during insertions.
+ *
+ * dn_ht_free() frees the heap or unlinks the elements.
+ * DNHT_REMOVE unlinks the elements, 0 frees the heap.
+ * You need two calls to do both.
+ *
+ * dn_ht_find() is the main lookup function, which can also be
+ * used to insert or delete elements in the hash table.
+ * The final 'arg' is passed to all callbacks.
+ *
+ * dn_ht_scan() is used to invoke a callback on all entries of
+ * the heap, or possibly on just one bucket. The callback
+ * is invoked with a pointer to the object, and must return
+ * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the
+ * removal of the object from the heap and the end of the
+ * scan, respectively.
+ *
+ * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans
+ * only the specified bucket of the table. The bucket is an in-out
+ * parameter: on return it holds a valid bucket number if the
+ * original was invalid.
+ *
+ * A combination of flags can be used to modify the operation
+ * of the dn_ht_find(), and of the callbacks:
+ *
+ * DNHT_KEY_IS_OBJ means the key is the object pointer.
+ * It is usually of interest for the hash and match functions.
+ *
+ * DNHT_MATCH_PTR during a lookup, match pointers instead
+ * of calling match(). Normally used when removing specific
+ * entries. Does not imply KEY_IS_OBJ as the latter _is_ used
+ * by the match function.
+ *
+ * DNHT_INSERT inserts the element if not found.
+ * Calls newh() to allocate a new object unless
+ * DNHT_KEY_IS_OBJ is set.
+ *
+ * DNHT_UNIQUE only insert if object not found.
+ * XXX should it imply DNHT_INSERT ?
+ *
+ * DNHT_REMOVE removes objects if we find them.
+ */
+struct dn_ht; /* should be opaque */
+
+struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs,
+ uint32_t (*hash)(uintptr_t, int, void *),
+ int (*match)(void *, uintptr_t, int, void *),
+ void *(*newh)(uintptr_t, int, void *));
+void dn_ht_free(struct dn_ht *, int flags);
+
+void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *);
+int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *);
+int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *);
+int dn_ht_entries(struct dn_ht *);
+
+enum { /* flags values.
+ * first two are returned by the scan callback to indicate
+ * to delete the matching element or to end the scan
+ */
+ DNHT_SCAN_DEL = 0x0001,
+ DNHT_SCAN_END = 0x0002,
+ DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */
+ DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */
+ DNHT_INSERT = 0x0010, /* insert if not found */
+ DNHT_UNIQUE = 0x0020, /* report error if already there */
+ DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */
+};
+
+#endif /* _IP_DN_HEAP_H */
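
A usage sketch of the hash table API (illustrative only: struct my_flow, my_hash, my_match and ht_example are hypothetical names, and no newh callback is supplied, so on insertion the object pointer itself is passed as the key):

	struct my_flow {			/* hypothetical element */
		struct my_flow *ht_link;	/* link field required by dn_ht */
		uintptr_t id;
	};

	static uint32_t
	my_hash(uintptr_t key, int flags, void *arg)
	{
		/* the key is a numeric id, or the object itself on insertion */
		uintptr_t id = (flags & DNHT_KEY_IS_OBJ) ?
		    ((struct my_flow *)key)->id : key;

		return ((uint32_t)id);
	}

	static int
	my_match(void *obj, uintptr_t key, int flags, void *arg)
	{
		uintptr_t id = (flags & DNHT_KEY_IS_OBJ) ?
		    ((struct my_flow *)key)->id : key;

		return (((struct my_flow *)obj)->id == id);
	}

	static void
	ht_example(struct my_flow *f)
	{
		struct dn_ht *t;
		struct my_flow *hit;

		t = dn_ht_init(NULL, 50, offsetof(struct my_flow, ht_link),
		    my_hash, my_match, NULL);	/* 50 rounds to 64 buckets */
		if (t == NULL)
			return;
		/* insert f, passing the object itself as the key */
		dn_ht_find(t, (uintptr_t)f, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL);
		/* later, look it up by numeric id */
		hit = dn_ht_find(t, f->id, 0, NULL);
		(void)hit;
		dn_ht_free(t, DNHT_REMOVE);	/* unlink all entries... */
		dn_ht_free(t, 0);		/* ...then release the table */
	}

The two dn_ht_free() calls follow the note above: DNHT_REMOVE only unlinks the entries (the caller still owns them), while a second call with flags 0 releases the bucket array and the table itself.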
diff --git a/freebsd/sys/netinet/ipfw/dn_sched.h b/freebsd/sys/netinet/ipfw/dn_sched.h
new file mode 100644
index 00000000..fe54b020
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/dn_sched.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The API to write a packet scheduling algorithm for dummynet.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _DN_SCHED_H
+#define _DN_SCHED_H
+
+#define DN_MULTIQUEUE 0x01
+/*
+ * Descriptor for a scheduling algorithm.
+ * Contains all function pointers for a given scheduler
+ * This is typically created when a module is loaded, and stored
+ * in a global list of schedulers.
+ */
+struct dn_alg {
+ uint32_t type; /* the scheduler type */
+ const char *name; /* scheduler name */
+ uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */
+
+ /*
+ * The following define the size of 3 optional data structures
+ * that may need to be allocated at runtime, and are appended
+ * to each of the base data structures: scheduler, sched.inst,
+ * and queue. We don't have a per-flowset structure.
+ */
+ /* + parameters attached to the template, e.g.
+ * default queue sizes, weights, quantum size, and so on;
+ */
+ size_t schk_datalen;
+
+ /* + per-instance parameters, such as timestamps,
+ * containers for queues, etc;
+ */
+ size_t si_datalen;
+
+ size_t q_datalen; /* per-queue parameters (e.g. S,F) */
+
+ /*
+ * Methods implemented by the scheduler:
+ * enqueue enqueue packet 'm' on scheduler 's', queue 'q'.
+ * q is NULL for !MULTIQUEUE.
+ * Return 0 on success, 1 on drop (packet consumed anyway).
+ * Note that q should be interpreted only as a hint
+ * on the flow that the mbuf belongs to: while a
+ * scheduler will normally enqueue m into q, it is ok
+ * to leave q alone and put the mbuf elsewhere.
+ * This function is called in two cases:
+ * - when a new packet arrives to the scheduler;
+ * - when a scheduler is reconfigured. In this case the
+ * call is issued by the new_queue callback, with a
+ * non empty queue (q) and m pointing to the first
+ * mbuf in the queue. For this reason, the function
+ * should internally check for (m != q->mq.head)
+ * before calling dn_enqueue().
+ *
+ * dequeue Called when scheduler instance 's' can
+ * dequeue a packet. Return NULL if none are available.
+ * XXX what about non work-conserving ?
+ *
+ * config called on 'sched X config ...', normally writes
+ * in the area of size sch_arg
+ *
+ * destroy called on 'sched delete', frees everything
+ * in sch_arg (other parts are handled by more specific
+ * functions)
+ *
+ * new_sched called when a new instance is created, e.g.
+ * to create the local queue for !MULTIQUEUE, set V or
+ * copy parameters for WFQ, and so on.
+ *
+ * free_sched called when deleting an instance, cleans
+ * extra data in the per-instance area.
+ *
+ * new_fsk called when a flowset is linked to a scheduler,
+ * e.g. to validate parameters such as weights etc.
+ * free_fsk when a flowset is unlinked from a scheduler.
+ * (probably unnecessary)
+ *
+ * new_queue called to set the per-queue parameters,
+ * e.g. S and F, adjust sum of weights in the parent, etc.
+ *
+ * The new_queue callback is normally called when
+ * creating a new queue. In some cases (such as a
+ * scheduler change or reconfiguration) it can be called
+ * with a non-empty queue. In that case the callback may
+ * need to call the enqueue function; it should eventually
+ * call enqueue() passing as m the first element in the queue.
+ *
+ * free_queue actions related to a queue removal, e.g. undo
+ * all the above. If the queue has data in it, also remove
+ * from the scheduler. This can e.g. happen during a reconfigure.
+ */
+ int (*enqueue)(struct dn_sch_inst *, struct dn_queue *,
+ struct mbuf *);
+ struct mbuf * (*dequeue)(struct dn_sch_inst *);
+
+ int (*config)(struct dn_schk *);
+ int (*destroy)(struct dn_schk*);
+ int (*new_sched)(struct dn_sch_inst *);
+ int (*free_sched)(struct dn_sch_inst *);
+ int (*new_fsk)(struct dn_fsk *f);
+ int (*free_fsk)(struct dn_fsk *f);
+ int (*new_queue)(struct dn_queue *q);
+ int (*free_queue)(struct dn_queue *q);
+
+ /* run-time fields */
+ int ref_count; /* XXX number of instances in the system */
+ SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */
+};
+
+/* MSVC does not support initializers so we need this ugly macro */
+#ifdef _WIN32
+#define _SI(fld)
+#else
+#define _SI(fld) fld
+#endif
+
+/*
+ * Additionally, dummynet exports some functions and macros
+ * to be used by schedulers:
+ */
+
+void dn_free_pkts(struct mbuf *mnext);
+int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop);
+/* bound a variable between min and max */
+int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg);
+
+/*
+ * Extract the head of a queue, update stats. Must be the very last
+ * thing done on a dequeue as the queue itself may go away.
+ */
+static __inline struct mbuf*
+dn_dequeue(struct dn_queue *q)
+{
+ struct mbuf *m = q->mq.head;
+ if (m == NULL)
+ return NULL;
+ q->mq.head = m->m_nextpkt;
+ q->ni.length--;
+ q->ni.len_bytes -= m->m_pkthdr.len;
+ if (q->_si) {
+ q->_si->ni.length--;
+ q->_si->ni.len_bytes -= m->m_pkthdr.len;
+ }
+ if (q->ni.length == 0) /* queue is now idle */
+ q->q_time = dn_cfg.curr_time;
+ return m;
+}
+
+int dn_sched_modevent(module_t mod, int cmd, void *arg);
+
+#define DECLARE_DNSCHED_MODULE(name, dnsched) \
+ static moduledata_t name##_mod = { \
+ #name, dn_sched_modevent, dnsched \
+ }; \
+ DECLARE_MODULE(name, name##_mod, \
+ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \
+ MODULE_DEPEND(name, dummynet, 3, 3, 3);
+#endif /* _DN_SCHED_H */
diff --git a/freebsd/sys/netinet/ipfw/dn_sched_fifo.c b/freebsd/sys/netinet/ipfw/dn_sched_fifo.c
new file mode 100644
index 00000000..6d5a4a12
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/dn_sched_fifo.c
@@ -0,0 +1,122 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/net/if.h> /* IFNAMSIZ */
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip_var.h> /* ipfw_rule_ref */
+#include <freebsd/netinet/ip_fw.h> /* flow_id */
+#include <freebsd/netinet/ip_dummynet.h>
+#include <freebsd/netinet/ipfw/dn_heap.h>
+#include <freebsd/netinet/ipfw/ip_dn_private.h>
+#include <freebsd/netinet/ipfw/dn_sched.h>
+#else
+#include <freebsd/dn_test.h>
+#endif
+
+/*
+ * This file implements a FIFO scheduler for a single queue.
+ * The queue is allocated as part of the scheduler instance,
+ * and a single flowset in the template stores the
+ * queue size and policy.
+ * Enqueue and dequeue use the default library functions.
+ */
+static int
+fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m)
+{
+ /* XXX if called with q != NULL and m=NULL, this is a
+ * re-enqueue from an existing scheduler, which we should
+ * handle.
+ */
+ return dn_enqueue((struct dn_queue *)(si+1), m, 0);
+}
+
+static struct mbuf *
+fifo_dequeue(struct dn_sch_inst *si)
+{
+ return dn_dequeue((struct dn_queue *)(si + 1));
+}
+
+static int
+fifo_new_sched(struct dn_sch_inst *si)
+{
+ /* This scheduler instance contains the queue */
+ struct dn_queue *q = (struct dn_queue *)(si + 1);
+
+ set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
+ q->_si = si;
+ q->fs = si->sched->fs;
+ return 0;
+}
+
+static int
+fifo_free_sched(struct dn_sch_inst *si)
+{
+ struct dn_queue *q = (struct dn_queue *)(si + 1);
+ dn_free_pkts(q->mq.head);
+ bzero(q, sizeof(*q));
+ return 0;
+}
+
+/*
+ * FIFO scheduler descriptor
+ * contains the type of the scheduler, the name, the size of extra
+ * data structures, and function pointers.
+ */
+static struct dn_alg fifo_desc = {
+ _SI( .type = ) DN_SCHED_FIFO,
+ _SI( .name = ) "FIFO",
+ _SI( .flags = ) 0,
+
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct dn_queue),
+ _SI( .q_datalen = ) 0,
+
+ _SI( .enqueue = ) fifo_enqueue,
+ _SI( .dequeue = ) fifo_dequeue,
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) fifo_new_sched,
+ _SI( .free_sched = ) fifo_free_sched,
+ _SI( .new_fsk = ) NULL,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) NULL,
+ _SI( .free_queue = ) NULL,
+};
+
+DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc);
diff --git a/freebsd/sys/netinet/ipfw/dn_sched_prio.c b/freebsd/sys/netinet/ipfw/dn_sched_prio.c
new file mode 100644
index 00000000..c6b6027c
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/dn_sched_prio.c
@@ -0,0 +1,231 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+#ifdef _KERNEL
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/net/if.h> /* IFNAMSIZ */
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip_var.h> /* ipfw_rule_ref */
+#include <freebsd/netinet/ip_fw.h> /* flow_id */
+#include <freebsd/netinet/ip_dummynet.h>
+#include <freebsd/netinet/ipfw/dn_heap.h>
+#include <freebsd/netinet/ipfw/ip_dn_private.h>
+#include <freebsd/netinet/ipfw/dn_sched.h>
+#else
+#include <freebsd/dn_test.h>
+#endif
+
+#define DN_SCHED_PRIO 5 //XXX
+
+#if !defined(_KERNEL) || !defined(__linux__)
+#define test_bit(ix, pData) ((*pData) & (1<<(ix)))
+#define __set_bit(ix, pData) (*pData) |= (1<<(ix))
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif
+
+#ifdef __MIPSEL__
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif
+
+/* Size of the array of queue pointers. */
+#define BITMAP_T unsigned long
+#define MAXPRIO (sizeof(BITMAP_T) * 8)
+
+/*
+ * The scheduler instance contains an array of pointers to queues,
+ * one for each priority, and a bitmap listing backlogged queues.
+ */
+struct prio_si {
+ BITMAP_T bitmap; /* array bitmap */
+ struct dn_queue *q_array[MAXPRIO]; /* Array of queues pointers */
+};
+
+/*
+ * If a queue with the same priority is already backlogged, use
+ * that one instead of the queue passed as argument.
+ */
+static int
+prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+ struct prio_si *si = (struct prio_si *)(_si + 1);
+ int prio = q->fs->fs.par[0];
+
+ if (test_bit(prio, &si->bitmap) == 0) {
+ /* No queue with this priority, insert */
+ __set_bit(prio, &si->bitmap);
+ si->q_array[prio] = q;
+ } else { /* use the existing queue */
+ q = si->q_array[prio];
+ }
+ if (dn_enqueue(q, m, 0))
+ return 1;
+ return 0;
+}
+
+/*
+ * Packets are dequeued only from the highest priority queue.
+ * The function ffs() returns the position of the lowest set bit in the
+ * bitmap; subtracting one gives the array index of the pointer to the
+ * highest priority queue.
+ * After the dequeue, if this queue becomes empty, its index is removed
+ * from the bitmap.
+ * The scheduler is idle if the bitmap is empty.
+ *
+ * NOTE: highest priority is 0, lowest is sched->max_prio_q
+ */
+static struct mbuf *
+prio_dequeue(struct dn_sch_inst *_si)
+{
+ struct prio_si *si = (struct prio_si *)(_si + 1);
+ struct mbuf *m;
+ struct dn_queue *q;
+ int prio;
+
+ if (si->bitmap == 0) /* scheduler idle */
+ return NULL;
+
+ prio = ffs(si->bitmap) - 1;
+
+ /* Take the highest priority queue in the scheduler */
+ q = si->q_array[prio];
+ // assert(q)
+
+ m = dn_dequeue(q);
+ if (q->mq.head == NULL) {
+ /* Queue is now empty, remove from scheduler
+ * and mark it
+ */
+ si->q_array[prio] = NULL;
+ __clear_bit(prio, &si->bitmap);
+ }
+ return m;
+}
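For instance, with priorities 0, 3 and 5 backlogged, si->bitmap is 0x29 (binary 101001); ffs(0x29) - 1 = 0, so the dequeue serves q_array[0]. Once that queue drains, bit 0 is cleared and the next dequeue picks priority 3.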
+
+static int
+prio_new_sched(struct dn_sch_inst *_si)
+{
+ struct prio_si *si = (struct prio_si *)(_si + 1);
+
+ bzero(si->q_array, sizeof(si->q_array));
+ si->bitmap = 0;
+
+ return 0;
+}
+
+static int
+prio_new_fsk(struct dn_fsk *fs)
+{
+ /* Check if the priority is between 0 and MAXPRIO-1 */
+ ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority");
+ return 0;
+}
+
+static int
+prio_new_queue(struct dn_queue *q)
+{
+ struct prio_si *si = (struct prio_si *)(q->_si + 1);
+ int prio = q->fs->fs.par[0];
+ struct dn_queue *oldq;
+
+ q->ni.oid.subtype = DN_SCHED_PRIO;
+
+ if (q->mq.head == NULL)
+ return 0;
+
+ /* The queue already holds packets: insert it in the scheduler or
+ * append its mbufs to the existing queue. This partly duplicates
+ * prio_enqueue
+ */
+ if (test_bit(prio, &si->bitmap) == 0) {
+ /* No queue with this priority, insert */
+ __set_bit(prio, &si->bitmap);
+ si->q_array[prio] = q;
+ } else if ( (oldq = si->q_array[prio]) != q) {
+ /* must append to the existing queue.
+ * can simply append q->mq.head to q2->...
+ * and add the counters to those of q2
+ */
+ oldq->mq.tail->m_nextpkt = q->mq.head;
+ oldq->mq.tail = q->mq.tail;
+ oldq->ni.length += q->ni.length;
+ q->ni.length = 0;
+ oldq->ni.len_bytes += q->ni.len_bytes;
+ q->ni.len_bytes = 0;
+ q->mq.tail = q->mq.head = NULL;
+ }
+ return 0;
+}
+
+static int
+prio_free_queue(struct dn_queue *q)
+{
+ int prio = q->fs->fs.par[0];
+ struct prio_si *si = (struct prio_si *)(q->_si + 1);
+
+ if (si->q_array[prio] == q) {
+ si->q_array[prio] = NULL;
+ __clear_bit(prio, &si->bitmap);
+ }
+ return 0;
+}
+
+
+static struct dn_alg prio_desc = {
+ _SI( .type = ) DN_SCHED_PRIO,
+ _SI( .name = ) "PRIO",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ /* we need extra space in the si and the queue */
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct prio_si),
+ _SI( .q_datalen = ) 0,
+
+ _SI( .enqueue = ) prio_enqueue,
+ _SI( .dequeue = ) prio_dequeue,
+
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) prio_new_sched,
+ _SI( .free_sched = ) NULL,
+
+ _SI( .new_fsk = ) prio_new_fsk,
+ _SI( .free_fsk = ) NULL,
+
+ _SI( .new_queue = ) prio_new_queue,
+ _SI( .free_queue = ) prio_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc);
diff --git a/freebsd/sys/netinet/ipfw/dn_sched_qfq.c b/freebsd/sys/netinet/ipfw/dn_sched_qfq.c
new file mode 100644
index 00000000..23890199
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/dn_sched_qfq.c
@@ -0,0 +1,866 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*
+ * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/net/if.h> /* IFNAMSIZ */
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip_var.h> /* ipfw_rule_ref */
+#include <freebsd/netinet/ip_fw.h> /* flow_id */
+#include <freebsd/netinet/ip_dummynet.h>
+#include <freebsd/netinet/ipfw/dn_heap.h>
+#include <freebsd/netinet/ipfw/ip_dn_private.h>
+#include <freebsd/netinet/ipfw/dn_sched.h>
+#else
+#include <freebsd/dn_test.h>
+#endif
+
+#ifdef QFQ_DEBUG
+struct qfq_sched;
+static void dump_sched(struct qfq_sched *q, const char *msg);
+#define NO(x) x
+#else
+#define NO(x)
+#endif
+#define DN_SCHED_QFQ 4 // XXX Where?
+typedef unsigned long bitmap;
+
+/*
+ * bitmap ops are critical. Some Linux versions have __fls
+ * and the bitmap ops. Some machines have ffs.
+ */
+#if defined(_WIN32)
+int fls(unsigned int n)
+{
+ int i = 0;
+ for (i = 0; n > 0; n >>= 1, i++)
+ ;
+ return i;
+}
+#endif
+
+#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32)
+static inline unsigned long __fls(unsigned long word)
+{
+ return fls(word) - 1;
+}
+#endif
+
+#if !defined(_KERNEL) || !defined(__linux__)
+#ifdef QFQ_DEBUG
+int test_bit(int ix, bitmap *p)
+{
+ if (ix < 0 || ix > 31)
+ D("bad index %d", ix);
+ return *p & (1<<ix);
+}
+void __set_bit(int ix, bitmap *p)
+{
+ if (ix < 0 || ix > 31)
+ D("bad index %d", ix);
+ *p |= (1<<ix);
+}
+void __clear_bit(int ix, bitmap *p)
+{
+ if (ix < 0 || ix > 31)
+ D("bad index %d", ix);
+ *p &= ~(1<<ix);
+}
+#else /* !QFQ_DEBUG */
+/* XXX do we have fast version, or leave it to the compiler ? */
+#define test_bit(ix, pData) ((*pData) & (1<<(ix)))
+#define __set_bit(ix, pData) (*pData) |= (1<<(ix))
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif /* !QFQ_DEBUG */
+#endif /* !__linux__ */
+
+#ifdef __MIPSEL__
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif
+
+/*-------------------------------------------*/
+/*
+
+Virtual time computations.
+
+S, F and V are all computed in fixed point arithmetic with
+FRAC_BITS decimal bits.
+
+ QFQ_MAX_INDEX is the maximum index allowed for a group. We need
+ one bit per index.
+ QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
+ The layout of the bits is as below:
+
+ [ MTU_SHIFT ][ FRAC_BITS ]
+ [ MAX_INDEX ][ MIN_SLOT_SHIFT ]
+ ^.__grp->index = 0
+ *.__grp->slot_shift
+
+ where MIN_SLOT_SHIFT is derived by difference from the others.
+
+The max group index corresponds to Lmax/w_min, where
+Lmax=1<<MTU_SHIFT, w_min = 1 .
+From this, and knowing how many groups (MAX_INDEX) we want,
+we can derive the shift corresponding to each group.
+
+Because we often need to compute
+ F = S + len/w_i and V = V + len/wsum
+instead of storing w_i store the value
+ inv_w = (1<<FRAC_BITS)/w_i
+so we can do F = S + len * inv_w * wsum.
+We use W_TOT in the formulas so we can easily move between
+static and adaptive weight sum.
+
+The per-scheduler-instance data contain all the data structures
+for the scheduler: bitmaps and bucket lists.
+
+ */
+/*
+ * Maximum number of consecutive slots occupied by backlogged classes
+ * inside a group. This is approx lmax/lmin + 5.
+ * XXX check because it poses constraints on MAX_INDEX
+ */
+#define QFQ_MAX_SLOTS 32
+/*
+ * Shifts used for class<->group mapping. Class weights are
+ * in the range [1, QFQ_MAX_WEIGHT]; we map each class i to the
+ * group with the smallest index that can support the L_i / r_i
+ * configured for the class.
+ *
+ * grp->index is the index of the group; and grp->slot_shift
+ * is the shift for the corresponding (scaled) sigma_i.
+ *
+ * When computing the group index, we do (len<<FP_SHIFT)/weight,
+ * then compute an FLS (which is like a log2()), and if the result
+ * is below the MAX_INDEX region we use 0 (which is the same as
+ * using a larger len).
+ */
+#define QFQ_MAX_INDEX 19
+#define QFQ_MAX_WSHIFT 16 /* log2(max_weight) */
+
+#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT)
+#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT)
+//#define IWSUM (q->i_wsum)
+#define IWSUM ((1<<FRAC_BITS)/QFQ_MAX_WSUM)
+
+#define FRAC_BITS 30 /* fixed point arithmetic */
+#define ONE_FP (1UL << FRAC_BITS)
+
+#define QFQ_MTU_SHIFT 11 /* log2(max_len) */
+#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
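A worked example of the fixed-point arithmetic: with FRAC_BITS = 30, a class of weight 4 stores inv_w = ONE_FP / 4 = 2^28, so a 1500-byte packet advances its finish time F by 1500 * 2^28, i.e. len/w = 375 expressed on the 2^30 scale. The virtual time V advances by len * IWSUM = 1500 * (2^30 / QFQ_MAX_WSUM) = 1500 * 8192 for the same packet.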
+
+/*
+ * Possible group states, also indexes for the bitmaps array in
+ * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3
+ */
+enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
+
+struct qfq_group;
+/*
+ * additional queue info. Some of this info should come from
+ * the flowset, we copy them here for faster processing.
+ * This is an overlay of the struct dn_queue
+ */
+struct qfq_class {
+ struct dn_queue _q;
+ uint64_t S, F; /* flow timestamps (exact) */
+ struct qfq_class *next; /* Link for the slot list. */
+
+ /* group we belong to. In principle we would need the index,
+ * which is log_2(lmax/weight), but we never reference it
+ * directly, only the group.
+ */
+ struct qfq_group *grp;
+
+ /* these are copied from the flowset. */
+ uint32_t inv_w; /* ONE_FP/weight */
+ uint32_t lmax; /* Max packet size for this flow. */
+};
+
+/* Group descriptor, see the paper for details.
+ * Basically this contains the bucket lists
+ */
+struct qfq_group {
+ uint64_t S, F; /* group timestamps (approx). */
+ unsigned int slot_shift; /* Slot shift. */
+ unsigned int index; /* Group index. */
+ unsigned int front; /* Index of the front slot. */
+ bitmap full_slots; /* non-empty slots */
+
+ /* Array of lists of active classes. */
+ struct qfq_class *slots[QFQ_MAX_SLOTS];
+};
+
+/* scheduler instance descriptor. */
+struct qfq_sched {
+ uint64_t V; /* Precise virtual time. */
+ uint32_t wsum; /* weight sum */
+ NO(uint32_t i_wsum; /* ONE_FP/w_sum */
+ uint32_t _queued; /* debugging */
+ uint32_t loops; /* debugging */)
+ bitmap bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
+ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+};
+
+/*---- support functions ----------------------------*/
+
+/* Generic comparison function, handling wraparound. */
+static inline int qfq_gt(uint64_t a, uint64_t b)
+{
+ return (int64_t)(a - b) > 0;
+}
+
+/* Round a precise timestamp to its slotted value. */
+static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift)
+{
+ return ts & ~((1ULL << shift) - 1);
+}
+
+/* return the pointer to the group with lowest index in the bitmap */
+static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
+ unsigned long bitmap)
+{
+ int index = ffs(bitmap) - 1; // zero-based
+ return &q->groups[index];
+}
+
+/*
+ * Calculate a flow index, given its weight and maximum packet length.
+ * index = log_2(maxlen/weight) but we need to apply the scaling.
+ * This is used only once at flow creation.
+ */
+static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen)
+{
+ uint64_t slot_size = (uint64_t)maxlen *inv_w;
+ unsigned long size_map;
+ int index = 0;
+
+ size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT);
+ if (!size_map)
+ goto out;
+
+ index = __fls(size_map) + 1; // basically a log_2()
+ index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
+
+ if (index < 0)
+ index = 0;
+
+out:
+ ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index);
+ return index;
+}
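Worked example: a class with weight 1 and maxlen 1500 has inv_w = 2^30 and slot_size = 1500 * 2^30. With QFQ_MIN_SLOT_SHIFT = 30 + 11 - 19 = 22, size_map = slot_size >> 22 = 384000 and __fls(384000) = 18, so index = 19; slot_size is not an exact power of two, so nothing is subtracted and the class lands in the last group (index 19 = QFQ_MAX_INDEX). Larger weights give proportionally smaller slot_size values and map to lower-index groups with finer slot granularity.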
+/*---- end support functions ----*/
+
+/*-------- API calls --------------------------------*/
+/*
+ * Validate and copy parameters from flowset.
+ */
+static int
+qfq_new_queue(struct dn_queue *_q)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
+ struct qfq_class *cl = (struct qfq_class *)_q;
+ int i;
+ uint32_t w; /* approximated weight */
+
+ /* import parameters from the flowset. They should be correct
+ * already.
+ */
+ w = _q->fs->fs.par[0];
+ cl->lmax = _q->fs->fs.par[1];
+ if (!w || w > QFQ_MAX_WEIGHT) {
+ w = 1;
+ D("rounding weight to 1");
+ }
+ cl->inv_w = ONE_FP/w;
+ w = ONE_FP/cl->inv_w;
+ if (q->wsum + w > QFQ_MAX_WSUM)
+ return EINVAL;
+
+ i = qfq_calc_index(cl->inv_w, cl->lmax);
+ cl->grp = &q->groups[i];
+ q->wsum += w;
+ // XXX cl->S = q->V; ?
+ // XXX compute q->i_wsum
+ return 0;
+}
+
+/* remove an empty queue */
+static int
+qfq_free_queue(struct dn_queue *_q)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
+ struct qfq_class *cl = (struct qfq_class *)_q;
+ if (cl->inv_w) {
+ q->wsum -= ONE_FP/cl->inv_w;
+ cl->inv_w = 0; /* reset weight to avoid run twice */
+ }
+ return 0;
+}
+
+/* Calculate a mask to mimic what would be ffs_from(). */
+static inline unsigned long
+mask_from(unsigned long bitmap, int from)
+{
+ return bitmap & ~((1UL << from) - 1);
+}
+
+/*
+ * The state computation relies on ER=0, IR=1, EB=2, IB=3
+ * First compute eligibility comparing grp->S, q->V,
+ * then check if someone is blocking us and possibly add EB
+ */
+static inline unsigned int
+qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
+{
+ /* if S > V we are not eligible */
+ unsigned int state = qfq_gt(grp->S, q->V);
+ unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
+ struct qfq_group *next;
+
+ if (mask) {
+ next = qfq_ffs(q, mask);
+ if (qfq_gt(grp->F, next->F))
+ state |= EB;
+ }
+
+ return state;
+}
+
+/*
+ * In principle
+ * q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ * q->bitmaps[src] &= ~mask;
+ * but we should make sure that src != dst
+ */
+static inline void
+qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
+{
+ q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ q->bitmaps[src] &= ~mask;
+}
+
+static inline void
+qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
+{
+ unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
+ struct qfq_group *next;
+
+ if (mask) {
+ next = qfq_ffs(q, mask);
+ if (!qfq_gt(next->F, old_finish))
+ return;
+ }
+
+ mask = (1UL << index) - 1;
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+}
+
+/*
+ * perhaps
+ *
+ old_V ^= q->V;
+ old_V >>= QFQ_MIN_SLOT_SHIFT;
+ if (old_V) {
+ ...
+ }
+ *
+ */
+static inline void
+qfq_make_eligible(struct qfq_sched *q, uint64_t old_V)
+{
+ unsigned long mask, vslot, old_vslot;
+
+ vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
+ old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
+
+ if (vslot != old_vslot) {
+ mask = (2UL << (__fls(vslot ^ old_vslot))) - 1;
+ qfq_move_groups(q, mask, IR, ER);
+ qfq_move_groups(q, mask, IB, EB);
+ }
+}
+
+/*
+ * XXX we should make sure that slot becomes less than 32.
+ * This is guaranteed by the input values.
+ * roundedS is always cl->S rounded on grp->slot_shift bits.
+ */
+static inline void
+qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS)
+{
+ uint64_t slot = (roundedS - grp->S) >> grp->slot_shift;
+ unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;
+
+ cl->next = grp->slots[i];
+ grp->slots[i] = cl;
+ __set_bit(slot, &grp->full_slots);
+}
+
+/*
+ * remove the entry from the slot
+ */
+static inline void
+qfq_front_slot_remove(struct qfq_group *grp)
+{
+ struct qfq_class **h = &grp->slots[grp->front];
+
+ *h = (*h)->next;
+ if (!*h)
+ __clear_bit(0, &grp->full_slots);
+}
+
+/*
+ * Returns the first full queue in a group. As a side effect,
+ * adjust the bucket list so the first non-empty bucket is at
+ * position 0 in full_slots.
+ */
+static inline struct qfq_class *
+qfq_slot_scan(struct qfq_group *grp)
+{
+ int i;
+
+ ND("grp %d full %x", grp->index, grp->full_slots);
+ if (!grp->full_slots)
+ return NULL;
+
+ i = ffs(grp->full_slots) - 1; // zero-based
+ if (i > 0) {
+ grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
+ grp->full_slots >>= i;
+ }
+
+ return grp->slots[grp->front];
+}
+
+/*
+ * adjust the bucket list. When the start time of a group decreases,
+ * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
+ * move the objects. The mask of occupied slots must be shifted
+ * because we use ffs() to find the first non-empty slot.
+ * This covers decreases in the group's start time, but what about
+ * increases of the start time ?
+ * Here too we should make sure that i is less than 32
+ */
+static inline void
+qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
+{
+ unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
+
+ grp->full_slots <<= i;
+ grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
+}
+
+
+static inline void
+qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
+{
+ bitmap ineligible;
+
+ ineligible = q->bitmaps[IR] | q->bitmaps[IB];
+ if (ineligible) {
+ if (!q->bitmaps[ER]) {
+ struct qfq_group *grp;
+ grp = qfq_ffs(q, ineligible);
+ if (qfq_gt(grp->S, q->V))
+ q->V = grp->S;
+ }
+ qfq_make_eligible(q, old_V);
+ }
+}
+
+/*
+ * Updates the class, returns true if the group also needs to be updated.
+ */
+static inline int
+qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
+ struct qfq_class *cl)
+{
+
+ cl->S = cl->F;
+ if (cl->_q.mq.head == NULL) {
+ qfq_front_slot_remove(grp);
+ } else {
+ unsigned int len;
+ uint64_t roundedS;
+
+ len = cl->_q.mq.head->m_pkthdr.len;
+ cl->F = cl->S + (uint64_t)len * cl->inv_w;
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ if (roundedS == grp->S)
+ return 0;
+
+ qfq_front_slot_remove(grp);
+ qfq_slot_insert(grp, cl, roundedS);
+ }
+ return 1;
+}
+
+static struct mbuf *
+qfq_dequeue(struct dn_sch_inst *si)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+ struct qfq_group *grp;
+ struct qfq_class *cl;
+ struct mbuf *m;
+ uint64_t old_V;
+
+ NO(q->loops++;)
+ if (!q->bitmaps[ER]) {
+ NO(if (q->queued)
+ dump_sched(q, "start dequeue");)
+ return NULL;
+ }
+
+ grp = qfq_ffs(q, q->bitmaps[ER]);
+
+ cl = grp->slots[grp->front];
+ /* extract from the first bucket in the bucket list */
+ m = dn_dequeue(&cl->_q);
+
+ if (!m) {
+ D("BUG/* non-workconserving leaf */");
+ return NULL;
+ }
+ NO(q->queued--;)
+ old_V = q->V;
+ q->V += (uint64_t)m->m_pkthdr.len * IWSUM;
+ ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);
+
+ if (qfq_update_class(q, grp, cl)) {
+ uint64_t old_F = grp->F;
+ cl = qfq_slot_scan(grp);
+ if (!cl) { /* group gone, remove from ER */
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ // grp->S = grp->F + 1; // XXX debugging only
+ } else {
+ uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ unsigned int s;
+
+ if (grp->S == roundedS)
+ goto skip_unblock;
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ /* remove from ER and put in the new set */
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+ /* we need to unblock even if the group has gone away */
+ qfq_unblock_groups(q, grp->index, old_F);
+ }
+
+skip_unblock:
+ qfq_update_eligible(q, old_V);
+ NO(if (!q->bitmaps[ER] && q->queued)
+ dump_sched(q, "end dequeue");)
+
+ return m;
+}
+
+/*
+ * Assign a reasonable start time for a new flow k in group i.
+ * Admissible values for \hat(F) are multiples of \sigma_i
+ * no greater than V+\sigma_i . Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in ER. So, if we have groups in ER, set S to
+ * the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
+ */
+static inline void
+qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
+{
+ unsigned long mask;
+ uint32_t limit, roundedF;
+ int slot_shift = cl->grp->slot_shift;
+
+ roundedF = qfq_round_down(cl->F, slot_shift);
+ limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift);
+
+ if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
+ /* timestamp was stale */
+ mask = mask_from(q->bitmaps[ER], cl->grp->index);
+ if (mask) {
+ struct qfq_group *next = qfq_ffs(q, mask);
+ if (qfq_gt(roundedF, next->F)) {
+ cl->S = next->F;
+ return;
+ }
+ }
+ cl->S = q->V;
+ } else { /* timestamp is not stale */
+ cl->S = cl->F;
+ }
+}
+
+static int
+qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+ struct qfq_group *grp;
+ struct qfq_class *cl = (struct qfq_class *)_q;
+ uint64_t roundedS;
+ int s;
+
+ NO(q->loops++;)
+ DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len,
+ _q, cl->inv_w, cl->grp->index);
+ /* XXX verify that the packet obeys the parameters */
+ if (m != _q->mq.head) {
+ if (dn_enqueue(_q, m, 0)) /* packet was dropped */
+ return 1;
+ NO(q->queued++;)
+ if (m != _q->mq.head)
+ return 0;
+ }
+ /* If we reach this point, queue q was idle */
+ grp = cl->grp;
+ qfq_update_start(q, cl); /* adjust start time */
+ /* compute new finish time and rounded start. */
+ cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w;
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+
+ /*
+ * insert cl in the correct bucket.
+ * If cl->S >= grp->S we don't need to adjust the
+ * bucket list and simply go to the insertion phase.
+ * Otherwise grp->S is decreasing, we must make room
+ * in the bucket list, and also recompute the group state.
+ * Finally, if there were no flows in this group and nobody
+ * was in ER make sure to adjust V.
+ */
+ if (grp->full_slots) {
+ if (!qfq_gt(grp->S, cl->S))
+ goto skip_update;
+ /* create a slot for this cl->S */
+ qfq_slot_rotate(q, grp, roundedS);
+ /* group was surely ineligible, remove */
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
+ q->V = roundedS;
+
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ ND("new state %d 0x%x", s, q->bitmaps[s]);
+ ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
+skip_update:
+ qfq_slot_insert(grp, cl, roundedS);
+
+ return 0;
+}
+
+
+#if 0
+static inline void
+qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
+ struct qfq_class *cl, struct qfq_class **pprev)
+{
+ unsigned int i, offset;
+ uint64_t roundedS;
+
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ offset = (roundedS - grp->S) >> grp->slot_shift;
+ i = (grp->front + offset) % QFQ_MAX_SLOTS;
+
+#ifdef notyet
+ if (!pprev) {
+ pprev = &grp->slots[i];
+ while (*pprev && *pprev != cl)
+ pprev = &(*pprev)->next;
+ }
+#endif
+
+ *pprev = cl->next;
+ if (!grp->slots[i])
+ __clear_bit(offset, &grp->full_slots);
+}
+
+/*
+ * called to forcibly destroy a queue.
+ * If the queue is not in the front bucket, or if it has
+ * other queues in the front bucket, we can simply remove
+ * the queue with no other side effects.
+ * Otherwise we must propagate the event up.
+ * XXX description to be completed.
+ */
+static void
+qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
+ struct qfq_class **pprev)
+{
+ struct qfq_group *grp = &q->groups[cl->index];
+ unsigned long mask;
+ uint64_t roundedS;
+ int s;
+
+ cl->F = cl->S; // not needed if the class goes away.
+ qfq_slot_remove(q, grp, cl, pprev);
+
+ if (!grp->full_slots) {
+ /* nothing left in the group, remove from all sets.
+ * Do ER last because if we were blocking other groups
+ * we must unblock them.
+ */
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+
+ if (test_bit(grp->index, &q->bitmaps[ER]) &&
+ !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
+ mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
+ if (mask)
+ mask = ~((1UL << __fls(mask)) - 1);
+ else
+ mask = ~0UL;
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+ }
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ } else if (!grp->slots[grp->front]) {
+ cl = qfq_slot_scan(grp);
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ if (grp->S != roundedS) {
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+ }
+ qfq_update_eligible(q, q->V);
+}
+#endif
+
+static int
+qfq_new_fsk(struct dn_fsk *f)
+{
+ ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
+ ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen");
+ ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
+ return 0;
+}
+
+/*
+ * initialize a new scheduler instance
+ */
+static int
+qfq_new_sched(struct dn_sch_inst *si)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+ struct qfq_group *grp;
+ int i;
+
+ for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+ grp = &q->groups[i];
+ grp->index = i;
+ grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS -
+ (QFQ_MAX_INDEX - i);
+ }
+ return 0;
+}
+
+/*
+ * QFQ scheduler descriptor
+ */
+static struct dn_alg qfq_desc = {
+ _SI( .type = ) DN_SCHED_QFQ,
+ _SI( .name = ) "QFQ",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct qfq_sched),
+ _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue),
+
+ _SI( .enqueue = ) qfq_enqueue,
+ _SI( .dequeue = ) qfq_dequeue,
+
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) qfq_new_sched,
+ _SI( .free_sched = ) NULL,
+ _SI( .new_fsk = ) qfq_new_fsk,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) qfq_new_queue,
+ _SI( .free_queue = ) qfq_free_queue,
+};
+
+DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);
+
+#ifdef QFQ_DEBUG
+static void
+dump_groups(struct qfq_sched *q, uint32_t mask)
+{
+ int i, j;
+
+ for (i = 0; i < QFQ_MAX_INDEX + 1; i++) {
+ struct qfq_group *g = &q->groups[i];
+
+ if (0 == (mask & (1<<i)))
+ continue;
+ for (j = 0; j < QFQ_MAX_SLOTS; j++) {
+ if (g->slots[j])
+ D(" bucket %d %p", j, g->slots[j]);
+ }
+ D("full_slots 0x%x", g->full_slots);
+ D(" %2d S 0x%20llx F 0x%llx %c", i,
+ g->S, g->F,
+ mask & (1<<i) ? '1' : '0');
+ }
+}
+
+static void
+dump_sched(struct qfq_sched *q, const char *msg)
+{
+ D("--- in %s: ---", msg);
+ ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V);
+ D(" ER 0x%08x", q->bitmaps[ER]);
+ D(" EB 0x%08x", q->bitmaps[EB]);
+ D(" IR 0x%08x", q->bitmaps[IR]);
+ D(" IB 0x%08x", q->bitmaps[IB]);
+ dump_groups(q, 0xffffffff);
+};
+#endif /* QFQ_DEBUG */
diff --git a/freebsd/sys/netinet/ipfw/dn_sched_rr.c b/freebsd/sys/netinet/ipfw/dn_sched_rr.c
new file mode 100644
index 00000000..4aa833f6
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/dn_sched_rr.c
@@ -0,0 +1,309 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/net/if.h> /* IFNAMSIZ */
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip_var.h> /* ipfw_rule_ref */
+#include <freebsd/netinet/ip_fw.h> /* flow_id */
+#include <freebsd/netinet/ip_dummynet.h>
+#include <freebsd/netinet/ipfw/dn_heap.h>
+#include <freebsd/netinet/ipfw/ip_dn_private.h>
+#include <freebsd/netinet/ipfw/dn_sched.h>
+#else
+#include <freebsd/dn_test.h>
+#endif
+
+#define DN_SCHED_RR 3 // XXX Where?
+
+struct rr_queue {
+ struct dn_queue q; /* Standard queue */
+ int status; /* 1: queue is in the list */
+ int credit; /* Number of bytes to transmit */
+ int quantum; /* quantum * C */
+ struct rr_queue *qnext; /* */
+};
+
+/* struct rr_schk contains global config parameters
+ * and is right after dn_schk
+ */
+struct rr_schk {
+ int min_q; /* Min quantum */
+ int max_q; /* Max quantum */
+ int q_bytes; /* Bytes per quantum */
+};
+
+/* per-instance round robin list, right after dn_sch_inst */
+struct rr_si {
+ struct rr_queue *head, *tail; /* Pointer to current queue */
+};
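+
+/*
+ * A minimal sketch of the layout convention used throughout these
+ * schedulers: the framework allocates the private areas declared above
+ * directly after the generic objects, so the handlers below recover them
+ * with pointer arithmetic such as "(struct rr_si *)(_si + 1)".  The helper
+ * below is hypothetical (not part of the FreeBSD sources) and is kept
+ * disabled; it only illustrates the idiom.
+ */
+#if 0
+static void
+rr_private_area_sketch(struct dn_sch_inst *_si)
+{
+	/* the rr_si private area begins right past the generic instance */
+	struct rr_si *si = (struct rr_si *)(_si + 1);
+
+	si->head = si->tail = NULL;	/* same initialization as rr_new_sched() */
+}
+#endif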
+
+/* Append a queue to the rr list */
+static inline void
+rr_append(struct rr_queue *q, struct rr_si *si)
+{
+ q->status = 1; /* mark as in-rr_list */
+ q->credit = q->quantum; /* initialize credit */
+
+ /* append to the tail */
+ if (si->head == NULL)
+ si->head = q;
+ else
+ si->tail->qnext = q;
+ si->tail = q; /* advance the tail pointer */
+ q->qnext = si->head; /* make it circular */
+}
+
+/* Remove the head queue from the circular list. */
+static inline void
+rr_remove_head(struct rr_si *si)
+{
+ if (si->head == NULL)
+ return; /* empty queue */
+ si->head->status = 0;
+
+ if (si->head == si->tail) {
+ si->head = si->tail = NULL;
+ return;
+ }
+
+ si->head = si->head->qnext;
+ si->tail->qnext = si->head;
+}
+
+/* Remove a queue from the circular list.
+ * XXX see if it can be merged with remove_queue()
+ */
+static inline void
+remove_queue_q(struct rr_queue *q, struct rr_si *si)
+{
+ struct rr_queue *prev;
+
+ if (q->status != 1)
+ return;
+ if (q == si->head) {
+ rr_remove_head(si);
+ return;
+ }
+
+ for (prev = si->head; prev; prev = prev->qnext) {
+ if (prev->qnext != q)
+ continue;
+ prev->qnext = q->qnext;
+ if (q == si->tail)
+ si->tail = prev;
+ q->status = 0;
+ break;
+ }
+}
+
+
+static inline void
+next_pointer(struct rr_si *si)
+{
+ if (si->head == NULL)
+ return; /* empty queue */
+
+ si->head = si->head->qnext;
+ si->tail = si->tail->qnext;
+}
+
+static int
+rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+ struct rr_si *si;
+ struct rr_queue *rrq;
+
+ if (m != q->mq.head) {
+ if (dn_enqueue(q, m, 0)) /* packet was dropped */
+ return 1;
+ if (m != q->mq.head)
+ return 0;
+ }
+
+	/* If we reach this point, queue q was idle */
+ si = (struct rr_si *)(_si + 1);
+ rrq = (struct rr_queue *)q;
+
+ if (rrq->status == 1) /* Queue is already in the queue list */
+ return 0;
+
+ /* Insert the queue in the queue list */
+ rr_append(rrq, si);
+
+ return 0;
+}
+
+static struct mbuf *
+rr_dequeue(struct dn_sch_inst *_si)
+{
+ /* Access scheduler instance private data */
+ struct rr_si *si = (struct rr_si *)(_si + 1);
+ struct rr_queue *rrq;
+ uint64_t len;
+
+ while ( (rrq = si->head) ) {
+ struct mbuf *m = rrq->q.mq.head;
+ if ( m == NULL) {
+ /* empty queue, remove from list */
+ rr_remove_head(si);
+ continue;
+ }
+ len = m->m_pkthdr.len;
+
+ if (len > rrq->credit) {
+ /* Packet too big */
+ rrq->credit += rrq->quantum;
+ /* Try next queue */
+ next_pointer(si);
+ } else {
+ rrq->credit -= len;
+ return dn_dequeue(&rrq->q);
+ }
+ }
+
+	/* no packet to dequeue */
+ return NULL;
+}
+
+static int
+rr_config(struct dn_schk *_schk)
+{
+ struct rr_schk *schk = (struct rr_schk *)(_schk + 1);
+ ND("called");
+
+ /* use reasonable quantums (64..2k bytes, default 1500) */
+ schk->min_q = 64;
+ schk->max_q = 2048;
+ schk->q_bytes = 1500; /* quantum */
+
+ return 0;
+}
+
+static int
+rr_new_sched(struct dn_sch_inst *_si)
+{
+ struct rr_si *si = (struct rr_si *)(_si + 1);
+
+ ND("called");
+ si->head = si->tail = NULL;
+
+ return 0;
+}
+
+static int
+rr_free_sched(struct dn_sch_inst *_si)
+{
+ ND("called");
+ /* Nothing to do? */
+ return 0;
+}
+
+static int
+rr_new_fsk(struct dn_fsk *fs)
+{
+ struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1);
+ /* par[0] is the weight, par[1] is the quantum step */
+ ipdn_bound_var(&fs->fs.par[0], 1,
+ 1, 65536, "RR weight");
+ ipdn_bound_var(&fs->fs.par[1], schk->q_bytes,
+ schk->min_q, schk->max_q, "RR quantum");
+ return 0;
+}
+
+static int
+rr_new_queue(struct dn_queue *_q)
+{
+ struct rr_queue *q = (struct rr_queue *)_q;
+
+ _q->ni.oid.subtype = DN_SCHED_RR;
+
+ q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1];
+ ND("called, q->quantum %d", q->quantum);
+ q->credit = q->quantum;
+ q->status = 0;
+
+ if (_q->mq.head != NULL) {
+ /* Queue NOT empty, insert in the queue list */
+ rr_append(q, (struct rr_si *)(_q->_si + 1));
+ }
+ return 0;
+}
+
+static int
+rr_free_queue(struct dn_queue *_q)
+{
+ struct rr_queue *q = (struct rr_queue *)_q;
+
+ ND("called");
+ if (q->status == 1) {
+ struct rr_si *si = (struct rr_si *)(_q->_si + 1);
+ remove_queue_q(q, si);
+ }
+ return 0;
+}
+
+/*
+ * RR scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
+ */
+static struct dn_alg rr_desc = {
+ _SI( .type = ) DN_SCHED_RR,
+ _SI( .name = ) "RR",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct rr_si),
+ _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue),
+
+ _SI( .enqueue = ) rr_enqueue,
+ _SI( .dequeue = ) rr_dequeue,
+
+ _SI( .config = ) rr_config,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) rr_new_sched,
+ _SI( .free_sched = ) rr_free_sched,
+ _SI( .new_fsk = ) rr_new_fsk,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) rr_new_queue,
+ _SI( .free_queue = ) rr_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc);
diff --git a/freebsd/sys/netinet/ipfw/dn_sched_wf2q.c b/freebsd/sys/netinet/ipfw/dn_sched_wf2q.c
new file mode 100644
index 00000000..c1e4c21d
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/dn_sched_wf2q.c
@@ -0,0 +1,375 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/net/if.h> /* IFNAMSIZ */
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip_var.h> /* ipfw_rule_ref */
+#include <freebsd/netinet/ip_fw.h> /* flow_id */
+#include <freebsd/netinet/ip_dummynet.h>
+#include <freebsd/netinet/ipfw/dn_heap.h>
+#include <freebsd/netinet/ipfw/ip_dn_private.h>
+#include <freebsd/netinet/ipfw/dn_sched.h>
+#else
+#include <freebsd/dn_test.h>
+#endif
+
+#ifndef MAX64
+#define MAX64(x,y) ((( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x))
+#endif
+
+/*
+ * timestamps are computed on 64 bit using fixed point arithmetic.
+ * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len
+ * and sum of weights, respectively. FRAC_BITS is the number of
+ * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large
+ * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w
+ * using an unsigned 32-bit division, and to avoid wraparounds we need
+ * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64
+ * As an example
+ * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19
+ */
+#ifndef FRAC_BITS
+#define FRAC_BITS 28 /* shift for fixed point arithmetic */
+#define ONE_FP (1UL << FRAC_BITS)
+#endif
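+
+/*
+ * Worked example of the fixed-point timestamps described above, under the
+ * default FRAC_BITS = 28 (so ONE_FP is 2^28): a flow of weight w stores
+ * inv_w = ONE_FP / w, and a packet of len bytes advances the finish time
+ * by len * inv_w, i.e. roughly len / w scaled by 2^FRAC_BITS.  The helper
+ * below is a hypothetical, disabled sketch of that computation.
+ */
+#if 0
+static uint64_t
+wf2qp_finish_time_sketch(uint64_t S, uint32_t weight, uint32_t len)
+{
+	uint32_t inv_w = ONE_FP / weight;	/* as in wf2qp_new_queue() */
+
+	/* F = S + len / weight, in fixed point */
+	return (S + (uint64_t)len * inv_w);
+}
+#endif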
+
+/*
+ * Private information for the scheduler instance:
+ * sch_heap (key is Finish time) returns the next queue to serve
+ * ne_heap (key is Start time) stores not-eligible queues
+ * idle_heap (key=start/finish time) stores idle flows. It must
+ * support extract-from-middle.
+ * A flow is only in 1 of the three heaps.
+ * XXX todo: use a more efficient data structure, e.g. a tree sorted
+ * by F with min_subtree(S) in each node
+ */
+struct wf2qp_si {
+ struct dn_heap sch_heap; /* top extract - key Finish time */
+ struct dn_heap ne_heap; /* top extract - key Start time */
+ struct dn_heap idle_heap; /* random extract - key Start=Finish time */
+ uint64_t V; /* virtual time */
+ uint32_t inv_wsum; /* inverse of sum of weights */
+ uint32_t wsum; /* sum of weights */
+};
+
+struct wf2qp_queue {
+ struct dn_queue _q;
+ uint64_t S, F; /* start time, finish time */
+ uint32_t inv_w; /* ONE_FP / weight */
+ int32_t heap_pos; /* position (index) of struct in heap */
+};
+
+/*
+ * This file implements a WF2Q+ scheduler as it has been in dummynet
+ * since 2000.
+ * The scheduler supports per-flow queues and has O(log N) complexity.
+ *
+ * WF2Q+ needs to drain entries from the idle heap so that we
+ * can keep the sum of weights up to date. We can do it whenever
+ * we get a chance, or periodically, or following some other
+ * strategy. The function idle_check() drains at most N elements
+ * from the idle heap.
+ */
+static void
+idle_check(struct wf2qp_si *si, int n, int force)
+{
+ struct dn_heap *h = &si->idle_heap;
+ while (n-- > 0 && h->elements > 0 &&
+ (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
+ struct dn_queue *q = HEAP_TOP(h)->object;
+ struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
+
+ heap_extract(h, NULL);
+ /* XXX to let the flowset delete the queue we should
+ * mark it as 'unused' by the scheduler.
+ */
+ alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */
+ si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */
+ if (si->wsum > 0)
+ si->inv_wsum = ONE_FP/si->wsum;
+ }
+}
+
+static int
+wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+ struct dn_fsk *fs = q->fs;
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+ struct wf2qp_queue *alg_fq;
+ uint64_t len = m->m_pkthdr.len;
+
+ if (m != q->mq.head) {
+ if (dn_enqueue(q, m, 0)) /* packet was dropped */
+ return 1;
+ if (m != q->mq.head) /* queue was already busy */
+ return 0;
+ }
+
+	/* If we reach this point, queue q was idle */
+ alg_fq = (struct wf2qp_queue *)q;
+
+ if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
+		/* F<S means timestamps are invalid -> brand new queue. */
+ alg_fq->S = si->V; /* init start time */
+ si->wsum += fs->fs.par[0]; /* add weight of new queue. */
+ si->inv_wsum = ONE_FP/si->wsum;
+ } else { /* if it was idle then it was in the idle heap */
+ heap_extract(&si->idle_heap, q);
+ alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */
+ }
+ alg_fq->F = alg_fq->S + len * alg_fq->inv_w;
+
+ /* if nothing is backlogged, make sure this flow is eligible */
+ if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
+ si->V = MAX64(alg_fq->S, si->V);
+
+ /*
+	 * Look at eligibility. A flow is not eligible if S>V (when
+ * this happens, it means that there is some other flow already
+ * scheduled for the same pipe, so the sch_heap cannot be
+ * empty). If the flow is not eligible we just store it in the
+ * ne_heap. Otherwise, we store in the sch_heap.
+ * Note that for all flows in sch_heap (SCH), S_i <= V,
+ * and for all flows in ne_heap (NEH), S_i > V.
+ * So when we need to compute max(V, min(S_i)) forall i in
+ * SCH+NEH, we only need to look into NEH.
+ */
+ if (DN_KEY_LT(si->V, alg_fq->S)) {
+ /* S>V means flow Not eligible. */
+ if (si->sch_heap.elements == 0)
+ D("++ ouch! not eligible but empty scheduler!");
+ heap_insert(&si->ne_heap, alg_fq->S, q);
+ } else {
+ heap_insert(&si->sch_heap, alg_fq->F, q);
+ }
+ return 0;
+}
+
+/* XXX invariant: sch > 0 || V >= min(S in neh) */
+static struct mbuf *
+wf2qp_dequeue(struct dn_sch_inst *_si)
+{
+ /* Access scheduler instance private data */
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+ struct mbuf *m;
+ struct dn_queue *q;
+ struct dn_heap *sch = &si->sch_heap;
+ struct dn_heap *neh = &si->ne_heap;
+ struct wf2qp_queue *alg_fq;
+
+ if (sch->elements == 0 && neh->elements == 0) {
+ /* we have nothing to do. We could kill the idle heap
+ * altogether and reset V
+ */
+ idle_check(si, 0x7fffffff, 1);
+ si->V = 0;
+ si->wsum = 0; /* should be set already */
+ return NULL; /* quick return if nothing to do */
+ }
+ idle_check(si, 1, 0); /* drain something from the idle heap */
+
+ /* make sure at least one element is eligible, bumping V
+ * and moving entries that have become eligible.
+ * We need to repeat the first part twice, before and
+ * after extracting the candidate, or enqueue() will
+ * find the data structure in a wrong state.
+ */
+ m = NULL;
+ for(;;) {
+ /*
+ * Compute V = max(V, min(S_i)). Remember that all elements
+ * in sch have by definition S_i <= V so if sch is not empty,
+ * V is surely the max and we must not update it. Conversely,
+ * if sch is empty we only need to look at neh.
+ * We don't need to move the queues, as it will be done at the
+ * next enqueue
+ */
+ if (sch->elements == 0 && neh->elements > 0) {
+ si->V = MAX64(si->V, HEAP_TOP(neh)->key);
+ }
+ while (neh->elements > 0 &&
+ DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
+ q = HEAP_TOP(neh)->object;
+ alg_fq = (struct wf2qp_queue *)q;
+ heap_extract(neh, NULL);
+ heap_insert(sch, alg_fq->F, q);
+ }
+ if (m) /* pkt found in previous iteration */
+ break;
+ /* ok we have at least one eligible pkt */
+ q = HEAP_TOP(sch)->object;
+ alg_fq = (struct wf2qp_queue *)q;
+ m = dn_dequeue(q);
+ heap_extract(sch, NULL); /* Remove queue from heap. */
+ si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
+ alg_fq->S = alg_fq->F; /* Update start time. */
+ if (q->mq.head == 0) { /* not backlogged any more. */
+ heap_insert(&si->idle_heap, alg_fq->F, q);
+ } else { /* Still backlogged. */
+ /* Update F, store in neh or sch */
+ uint64_t len = q->mq.head->m_pkthdr.len;
+ alg_fq->F += len * alg_fq->inv_w;
+ if (DN_KEY_LEQ(alg_fq->S, si->V)) {
+ heap_insert(sch, alg_fq->F, q);
+ } else {
+ heap_insert(neh, alg_fq->S, q);
+ }
+ }
+ }
+ return m;
+}
+
+static int
+wf2qp_new_sched(struct dn_sch_inst *_si)
+{
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+ int ofs = offsetof(struct wf2qp_queue, heap_pos);
+
+ /* all heaps support extract from middle */
+ if (heap_init(&si->idle_heap, 16, ofs) ||
+ heap_init(&si->sch_heap, 16, ofs) ||
+ heap_init(&si->ne_heap, 16, ofs)) {
+ heap_free(&si->ne_heap);
+ heap_free(&si->sch_heap);
+ heap_free(&si->idle_heap);
+ return ENOMEM;
+ }
+ return 0;
+}
+
+static int
+wf2qp_free_sched(struct dn_sch_inst *_si)
+{
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+
+ heap_free(&si->sch_heap);
+ heap_free(&si->ne_heap);
+ heap_free(&si->idle_heap);
+
+ return 0;
+}
+
+static int
+wf2qp_new_fsk(struct dn_fsk *fs)
+{
+ ipdn_bound_var(&fs->fs.par[0], 1,
+ 1, 100, "WF2Q+ weight");
+ return 0;
+}
+
+static int
+wf2qp_new_queue(struct dn_queue *_q)
+{
+ struct wf2qp_queue *q = (struct wf2qp_queue *)_q;
+
+ _q->ni.oid.subtype = DN_SCHED_WF2QP;
+ q->F = 0; /* not strictly necessary */
+ q->S = q->F + 1; /* mark timestamp as invalid. */
+ q->inv_w = ONE_FP / _q->fs->fs.par[0];
+ if (_q->mq.head != NULL) {
+ wf2qp_enqueue(_q->_si, _q, _q->mq.head);
+ }
+ return 0;
+}
+
+/*
+ * Called when the infrastructure removes a queue (e.g. flowset
+ * is reconfigured). Nothing to do if we did not 'own' the queue,
+ * otherwise remove it from the right heap and adjust the sum
+ * of weights.
+ */
+static int
+wf2qp_free_queue(struct dn_queue *q)
+{
+ struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
+ struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);
+
+ if (alg_fq->S >= alg_fq->F + 1)
+ return 0; /* nothing to do, not in any heap */
+ si->wsum -= q->fs->fs.par[0];
+ if (si->wsum > 0)
+ si->inv_wsum = ONE_FP/si->wsum;
+
+ /* extract from the heap. XXX TODO we may need to adjust V
+ * to make sure the invariants hold.
+ */
+ if (q->mq.head == NULL) {
+ heap_extract(&si->idle_heap, q);
+ } else if (DN_KEY_LT(si->V, alg_fq->S)) {
+ heap_extract(&si->ne_heap, q);
+ } else {
+ heap_extract(&si->sch_heap, q);
+ }
+ return 0;
+}
+
+/*
+ * WF2Q+ scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
+ */
+static struct dn_alg wf2qp_desc = {
+ _SI( .type = ) DN_SCHED_WF2QP,
+ _SI( .name = ) "WF2Q+",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ /* we need extra space in the si and the queue */
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct wf2qp_si),
+ _SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
+ sizeof(struct dn_queue),
+
+ _SI( .enqueue = ) wf2qp_enqueue,
+ _SI( .dequeue = ) wf2qp_dequeue,
+
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) wf2qp_new_sched,
+ _SI( .free_sched = ) wf2qp_free_sched,
+
+ _SI( .new_fsk = ) wf2qp_new_fsk,
+ _SI( .free_fsk = ) NULL,
+
+ _SI( .new_queue = ) wf2qp_new_queue,
+ _SI( .free_queue = ) wf2qp_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);
diff --git a/freebsd/sys/netinet/ipfw/ip_dn_glue.c b/freebsd/sys/netinet/ipfw/ip_dn_glue.c
new file mode 100644
index 00000000..302c4d29
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_dn_glue.c
@@ -0,0 +1,847 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ *
+ * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8
+ */
+
+#include <freebsd/local/opt_inet6.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/time.h>
+#include <freebsd/sys/taskqueue.h>
+#include <freebsd/net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <freebsd/netinet/ip_fw.h>
+#include <freebsd/netinet/ipfw/ip_fw_private.h>
+#include <freebsd/netinet/ipfw/dn_heap.h>
+#include <freebsd/netinet/ip_dummynet.h>
+#include <freebsd/netinet/ipfw/ip_dn_private.h>
+#include <freebsd/netinet/ipfw/dn_sched.h>
+
+/* FREEBSD7.2 ip_dummynet.h r191715*/
+
+struct dn_heap_entry7 {
+ int64_t key; /* sorting key. Topmost element is smallest one */
+ void *object; /* object pointer */
+};
+
+struct dn_heap7 {
+ int size;
+ int elements;
+ int offset; /* XXX if > 0 this is the offset of direct ptr to obj */
+ struct dn_heap_entry7 *p; /* really an array of "size" entries */
+};
+
+/* Common to 7.2 and 8 */
+struct dn_flow_set {
+ SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */
+
+ u_short fs_nr ; /* flow_set number */
+ u_short flags_fs;
+#define DNOLD_HAVE_FLOW_MASK 0x0001
+#define DNOLD_IS_RED 0x0002
+#define DNOLD_IS_GENTLE_RED 0x0004
+#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */
+#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */
+#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */
+#define DNOLD_IS_PIPE 0x4000
+#define DNOLD_IS_QUEUE 0x8000
+
+ struct dn_pipe7 *pipe ; /* pointer to parent pipe */
+ u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */
+
+ int weight ; /* WFQ queue weight */
+ int qsize ; /* queue size in slots or bytes */
+ int plr ; /* pkt loss rate (2^31-1 means 100%) */
+
+ struct ipfw_flow_id flow_mask ;
+
+ /* hash table of queues onto this flow_set */
+ int rq_size ; /* number of slots */
+ int rq_elements ; /* active elements */
+ struct dn_flow_queue7 **rq; /* array of rq_size entries */
+
+ u_int32_t last_expired ; /* do not expire too frequently */
+ int backlogged ; /* #active queues for this flowset */
+
+ /* RED parameters */
+#define SCALE_RED 16
+#define SCALE(x) ( (x) << SCALE_RED )
+#define SCALE_VAL(x) ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED )
+ int w_q ; /* queue weight (scaled) */
+ int max_th ; /* maximum threshold for queue (scaled) */
+ int min_th ; /* minimum threshold for queue (scaled) */
+ int max_p ; /* maximum value for p_b (scaled) */
+ u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
+ u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
+ u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
+ u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
+ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
+ u_int lookup_depth ; /* depth of lookup table */
+ int lookup_step ; /* granularity inside the lookup table */
+ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+ int avg_pkt_size ; /* medium packet size */
+ int max_pkt_size ; /* max packet size */
+};
+SLIST_HEAD(dn_flow_set_head, dn_flow_set);
+
+#define DN_IS_PIPE 0x4000
+#define DN_IS_QUEUE 0x8000
+struct dn_flow_queue7 {
+ struct dn_flow_queue7 *next ;
+ struct ipfw_flow_id id ;
+
+ struct mbuf *head, *tail ; /* queue of packets */
+ u_int len ;
+ u_int len_bytes ;
+
+ u_long numbytes;
+
+ u_int64_t tot_pkts ; /* statistics counters */
+ u_int64_t tot_bytes ;
+ u_int32_t drops ;
+
+ int hash_slot ; /* debugging/diagnostic */
+
+ /* RED parameters */
+ int avg ; /* average queue length est. (scaled) */
+ int count ; /* arrivals since last RED drop */
+ int random ; /* random value (scaled) */
+ u_int32_t q_time; /* start of queue idle time */
+
+ /* WF2Q+ support */
+ struct dn_flow_set *fs ; /* parent flow set */
+ int heap_pos ; /* position (index) of struct in heap */
+ int64_t sched_time ; /* current time when queue enters ready_heap */
+
+ int64_t S,F ; /* start time, finish time */
+};
+
+struct dn_pipe7 { /* a pipe */
+ SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */
+
+ int pipe_nr ; /* number */
+ int bandwidth; /* really, bytes/tick. */
+ int delay ; /* really, ticks */
+
+ struct mbuf *head, *tail ; /* packets in delay line */
+
+ /* WF2Q+ */
+ struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+ struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+ struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
+
+ int64_t V ; /* virtual time */
+ int sum; /* sum of weights of all active sessions */
+
+ int numbytes;
+
+ int64_t sched_time ; /* time pipe was scheduled in ready_heap */
+
+ /*
+ * When the tx clock come from an interface (if_name[0] != '\0'), its name
+ * is stored below, whereas the ifp is filled when the rule is configured.
+ */
+ char if_name[IFNAMSIZ];
+ struct ifnet *ifp ;
+ int ready ; /* set if ifp != NULL and we got a signal from it */
+
+ struct dn_flow_set fs ; /* used with fixed-rate flows */
+};
+SLIST_HEAD(dn_pipe_head7, dn_pipe7);
+
+
+/* FREEBSD8 ip_dummynet.h r196045 */
+struct dn_flow_queue8 {
+ struct dn_flow_queue8 *next ;
+ struct ipfw_flow_id id ;
+
+ struct mbuf *head, *tail ; /* queue of packets */
+ u_int len ;
+ u_int len_bytes ;
+
+ uint64_t numbytes ; /* credit for transmission (dynamic queues) */
+ int64_t extra_bits; /* extra bits simulating unavailable channel */
+
+ u_int64_t tot_pkts ; /* statistics counters */
+ u_int64_t tot_bytes ;
+ u_int32_t drops ;
+
+ int hash_slot ; /* debugging/diagnostic */
+
+ /* RED parameters */
+ int avg ; /* average queue length est. (scaled) */
+ int count ; /* arrivals since last RED drop */
+ int random ; /* random value (scaled) */
+ int64_t idle_time; /* start of queue idle time */
+
+ /* WF2Q+ support */
+ struct dn_flow_set *fs ; /* parent flow set */
+ int heap_pos ; /* position (index) of struct in heap */
+ int64_t sched_time ; /* current time when queue enters ready_heap */
+
+ int64_t S,F ; /* start time, finish time */
+};
+
+struct dn_pipe8 { /* a pipe */
+ SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */
+
+ int pipe_nr ; /* number */
+ int bandwidth; /* really, bytes/tick. */
+ int delay ; /* really, ticks */
+
+ struct mbuf *head, *tail ; /* packets in delay line */
+
+ /* WF2Q+ */
+ struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+ struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+ struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
+
+ int64_t V ; /* virtual time */
+ int sum; /* sum of weights of all active sessions */
+
+ /* Same as in dn_flow_queue, numbytes can become large */
+ int64_t numbytes; /* bits I can transmit (more or less). */
+ uint64_t burst; /* burst size, scaled: bits * hz */
+
+ int64_t sched_time ; /* time pipe was scheduled in ready_heap */
+ int64_t idle_time; /* start of pipe idle time */
+
+ char if_name[IFNAMSIZ];
+ struct ifnet *ifp ;
+ int ready ; /* set if ifp != NULL and we got a signal from it */
+
+ struct dn_flow_set fs ; /* used with fixed-rate flows */
+
+ /* fields to simulate a delay profile */
+#define ED_MAX_NAME_LEN 32
+ char name[ED_MAX_NAME_LEN];
+ int loss_level;
+ int samples_no;
+ int *samples;
+};
+
+#define ED_MAX_SAMPLES_NO 1024
+struct dn_pipe_max8 {
+ struct dn_pipe8 pipe;
+ int samples[ED_MAX_SAMPLES_NO];
+};
+SLIST_HEAD(dn_pipe_head8, dn_pipe8);
+
+/*
+ * Changes from 7.2 to 8:
+ * dn_pipe:
+ * numbytes from int to int64_t
+ * add burst (int64_t)
+ * add idle_time (int64_t)
+ * add profile
+ * add struct dn_pipe_max
+ * add flag DN_HAS_PROFILE
+ *
+ * dn_flow_queue
+ * numbytes from u_long to int64_t
+ * add extra_bits (int64_t)
+ * q_time from u_int32_t to int64_t and name idle_time
+ *
+ * dn_flow_set unchanged
+ *
+ */
+
+/* NOTE:XXX copied from dummynet.c */
+#define O_NEXT(p, len) ((void *)((char *)p + len))
+static void
+oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
+{
+ oid->len = len;
+ oid->type = type;
+ oid->subtype = 0;
+ oid->id = id;
+}
+/* make room in the buffer and move the pointer forward */
+static void *
+o_next(struct dn_id **o, int len, int type)
+{
+ struct dn_id *ret = *o;
+ oid_fill(ret, len, type, 0);
+ *o = O_NEXT(*o, len);
+ return ret;
+}
+
+
+static size_t pipesize7 = sizeof(struct dn_pipe7);
+static size_t pipesize8 = sizeof(struct dn_pipe8);
+static size_t pipesizemax8 = sizeof(struct dn_pipe_max8);
+
+/* Indicate the 'ipfw' version
+ * 1: from FreeBSD 7.2
+ * 0: from FreeBSD 8
+ * -1: unknown (currently unused)
+ *
+ * It is updated when an IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives.
+ * NOTE: if an IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown,
+ * it is assumed to be the FreeBSD 8 version.
+ */
+static int is7 = 0;
+
+static int
+convertflags2new(int src)
+{
+ int dst = 0;
+
+ if (src & DNOLD_HAVE_FLOW_MASK)
+ dst |= DN_HAVE_MASK;
+ if (src & DNOLD_QSIZE_IS_BYTES)
+ dst |= DN_QSIZE_BYTES;
+ if (src & DNOLD_NOERROR)
+ dst |= DN_NOERROR;
+ if (src & DNOLD_IS_RED)
+ dst |= DN_IS_RED;
+ if (src & DNOLD_IS_GENTLE_RED)
+ dst |= DN_IS_GENTLE_RED;
+ if (src & DNOLD_HAS_PROFILE)
+ dst |= DN_HAS_PROFILE;
+
+ return dst;
+}
+
+static int
+convertflags2old(int src)
+{
+ int dst = 0;
+
+ if (src & DN_HAVE_MASK)
+ dst |= DNOLD_HAVE_FLOW_MASK;
+ if (src & DN_IS_RED)
+ dst |= DNOLD_IS_RED;
+ if (src & DN_IS_GENTLE_RED)
+ dst |= DNOLD_IS_GENTLE_RED;
+ if (src & DN_NOERROR)
+ dst |= DNOLD_NOERROR;
+ if (src & DN_HAS_PROFILE)
+ dst |= DNOLD_HAS_PROFILE;
+ if (src & DN_QSIZE_BYTES)
+ dst |= DNOLD_QSIZE_IS_BYTES;
+
+ return dst;
+}
+
+static int
+dn_compat_del(void *v)
+{
+ struct dn_pipe7 *p = (struct dn_pipe7 *) v;
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *) v;
+ struct {
+ struct dn_id oid;
+ uintptr_t a[1]; /* add more if we want a list */
+ } cmd;
+
+ /* XXX DN_API_VERSION ??? */
+ oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
+
+ if (is7) {
+ if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
+ return EINVAL;
+ if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
+ return EINVAL;
+ } else {
+ if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0)
+ return EINVAL;
+ if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0)
+ return EINVAL;
+ }
+
+ if (p->pipe_nr != 0) { /* pipe x delete */
+ cmd.a[0] = p->pipe_nr;
+ cmd.oid.subtype = DN_LINK;
+ } else { /* queue x delete */
+ cmd.oid.subtype = DN_FS;
+ cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr;
+ }
+
+ return do_config(&cmd, cmd.oid.len);
+}
+
+static int
+dn_compat_config_queue(struct dn_fs *fs, void* v)
+{
+ struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+ struct dn_flow_set *f;
+
+ if (is7)
+ f = &p7->fs;
+ else
+ f = &p8->fs;
+
+ fs->fs_nr = f->fs_nr;
+ fs->sched_nr = f->parent_nr;
+ fs->flow_mask = f->flow_mask;
+ fs->buckets = f->rq_size;
+ fs->qsize = f->qsize;
+ fs->plr = f->plr;
+ fs->par[0] = f->weight;
+ fs->flags = convertflags2new(f->flags_fs);
+ if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) {
+ fs->w_q = f->w_q;
+ fs->max_th = f->max_th;
+ fs->min_th = f->min_th;
+ fs->max_p = f->max_p;
+ }
+
+ return 0;
+}
+
+static int
+dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p,
+ struct dn_fs *fs, void* v)
+{
+ struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+ int i = p7->pipe_nr;
+
+ sch->sched_nr = i;
+ sch->oid.subtype = 0;
+ p->link_nr = i;
+ fs->fs_nr = i + 2*DN_MAX_ID;
+ fs->sched_nr = i + DN_MAX_ID;
+
+ /* Common to 7 and 8 */
+ p->bandwidth = p7->bandwidth;
+ p->delay = p7->delay;
+ if (!is7) {
+ /* FreeBSD 8 has burst */
+ p->burst = p8->burst;
+ }
+
+ /* fill the fifo flowset */
+ dn_compat_config_queue(fs, v);
+ fs->fs_nr = i + 2*DN_MAX_ID;
+ fs->sched_nr = i + DN_MAX_ID;
+
+ /* Move scheduler related parameter from fs to sch */
+ sch->buckets = fs->buckets; /*XXX*/
+ fs->buckets = 0;
+ if (fs->flags & DN_HAVE_MASK) {
+ sch->flags |= DN_HAVE_MASK;
+ fs->flags &= ~DN_HAVE_MASK;
+ sch->sched_mask = fs->flow_mask;
+ bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id));
+ }
+
+ return 0;
+}
+
+static int
+dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p,
+ void *v)
+{
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+ p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]);
+
+ pf->link_nr = p->link_nr;
+ pf->loss_level = p8->loss_level;
+// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant?
+ pf->samples_no = p8->samples_no;
+ strncpy(pf->name, p8->name,sizeof(pf->name));
+ bcopy(p8->samples, pf->samples, sizeof(pf->samples));
+
+ return 0;
+}
+
+/*
+ * If p->pipe_nr != 0 the command is 'pipe x config', so we need to create
+ * the three main structs; otherwise only a flowset is created.
+ */
+static int
+dn_compat_configure(void *v)
+{
+ struct dn_id *buf = NULL, *base;
+ struct dn_sch *sch = NULL;
+ struct dn_link *p = NULL;
+ struct dn_fs *fs = NULL;
+ struct dn_profile *pf = NULL;
+ int lmax;
+ int error;
+
+ struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+ int i; /* number of object to configure */
+
+ lmax = sizeof(struct dn_id); /* command header */
+ lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
+ sizeof(struct dn_fs) + sizeof(struct dn_profile);
+
+ base = buf = malloc(lmax, M_DUMMYNET, M_WAIT|M_ZERO);
+ o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
+ base->id = DN_API_VERSION;
+
+ /* pipe_nr is the same in p7 and p8 */
+ i = p7->pipe_nr;
+ if (i != 0) { /* pipe config */
+ sch = o_next(&buf, sizeof(*sch), DN_SCH);
+ p = o_next(&buf, sizeof(*p), DN_LINK);
+ fs = o_next(&buf, sizeof(*fs), DN_FS);
+
+ error = dn_compat_config_pipe(sch, p, fs, v);
+ if (error) {
+ free(buf, M_DUMMYNET);
+ return error;
+ }
+ if (!is7 && p8->samples_no > 0) {
+ /* Add profiles*/
+ pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
+ error = dn_compat_config_profile(pf, p, v);
+ if (error) {
+ free(buf, M_DUMMYNET);
+ return error;
+ }
+ }
+ } else { /* queue config */
+ fs = o_next(&buf, sizeof(*fs), DN_FS);
+ error = dn_compat_config_queue(fs, v);
+ if (error) {
+ free(buf, M_DUMMYNET);
+ return error;
+ }
+ }
+ error = do_config(base, (char *)buf - (char *)base);
+
+ if (buf)
+ free(buf, M_DUMMYNET);
+ return error;
+}
+
+int
+dn_compat_calc_size(struct dn_parms dn_cfg)
+{
+ int need = 0;
+ /* XXX use FreeBSD 8 struct size */
+ /* NOTE:
+ * - half scheduler: schk_count/2
+ * - all flowset: fsk_count
+ * - all flowset queues: queue_count
+ * - all pipe queue: si_count
+ */
+ need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2;
+ need += dn_cfg.fsk_count * sizeof(struct dn_flow_set);
+ need += dn_cfg.si_count * sizeof(struct dn_flow_queue8);
+ need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8);
+
+ return need;
+}
+
+int
+dn_c_copy_q (void *_ni, void *arg)
+{
+ struct copy_args *a = arg;
+ struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start;
+ struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start;
+ struct dn_flow *ni = (struct dn_flow *)_ni;
+ int size = 0;
+
+ /* XXX hash slot not set */
+ /* No difference between 7.2/8 */
+ fq7->len = ni->length;
+ fq7->len_bytes = ni->len_bytes;
+ fq7->id = ni->fid;
+
+ if (is7) {
+ size = sizeof(struct dn_flow_queue7);
+ fq7->tot_pkts = ni->tot_pkts;
+ fq7->tot_bytes = ni->tot_bytes;
+ fq7->drops = ni->drops;
+ } else {
+ size = sizeof(struct dn_flow_queue8);
+ fq8->tot_pkts = ni->tot_pkts;
+ fq8->tot_bytes = ni->tot_bytes;
+ fq8->drops = ni->drops;
+ }
+
+ *a->start += size;
+ return 0;
+}
+
+int
+dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq)
+{
+ struct dn_link *l = &s->link;
+ struct dn_fsk *f = s->fs;
+
+ struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start;
+ struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start;
+ struct dn_flow_set *fs;
+ int size = 0;
+
+ if (is7) {
+ fs = &pipe7->fs;
+ size = sizeof(struct dn_pipe7);
+ } else {
+ fs = &pipe8->fs;
+ size = sizeof(struct dn_pipe8);
+ }
+
+	/* These 4 fields are the same in pipe7 and pipe8 */
+ pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE;
+ pipe7->bandwidth = l->bandwidth;
+ pipe7->delay = l->delay;
+ pipe7->pipe_nr = l->link_nr - DN_MAX_ID;
+
+ if (!is7) {
+ if (s->profile) {
+ struct dn_profile *pf = s->profile;
+ strncpy(pipe8->name, pf->name, sizeof(pf->name));
+ pipe8->loss_level = pf->loss_level;
+ pipe8->samples_no = pf->samples_no;
+ }
+ pipe8->burst = div64(l->burst , 8 * hz);
+ }
+
+ fs->flow_mask = s->sch.sched_mask;
+ fs->rq_size = s->sch.buckets ? s->sch.buckets : 1;
+
+ fs->parent_nr = l->link_nr - DN_MAX_ID;
+ fs->qsize = f->fs.qsize;
+ fs->plr = f->fs.plr;
+ fs->w_q = f->fs.w_q;
+ fs->max_th = f->max_th;
+ fs->min_th = f->min_th;
+ fs->max_p = f->fs.max_p;
+ fs->rq_elements = nq;
+
+ fs->flags_fs = convertflags2old(f->fs.flags);
+
+ *a->start += size;
+ return 0;
+}
+
+
+int
+dn_compat_copy_pipe(struct copy_args *a, void *_o)
+{
+ int have = a->end - *a->start;
+ int need = 0;
+ int pipe_size = sizeof(struct dn_pipe8);
+ int queue_size = sizeof(struct dn_flow_queue8);
+ int n_queue = 0; /* number of queues */
+
+ struct dn_schk *s = (struct dn_schk *)_o;
+ /* calculate needed space:
+ * - struct dn_pipe
+ * - if there are instances, dn_queue * n_instances
+ */
+ n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) :
+ (s->siht ? 1 : 0));
+ need = pipe_size + queue_size * n_queue;
+ if (have < need) {
+ D("have %d < need %d", have, need);
+ return 1;
+ }
+ /* copy pipe */
+ dn_c_copy_pipe(s, a, n_queue);
+
+ /* copy queues */
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, dn_c_copy_q, a);
+ else if (s->siht)
+ dn_c_copy_q(s->siht, a);
+ return 0;
+}
+
+int
+dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq)
+{
+ struct dn_flow_set *fs = (struct dn_flow_set *)*a->start;
+
+ fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
+ fs->fs_nr = f->fs.fs_nr;
+ fs->qsize = f->fs.qsize;
+ fs->plr = f->fs.plr;
+ fs->w_q = f->fs.w_q;
+ fs->max_th = f->max_th;
+ fs->min_th = f->min_th;
+ fs->max_p = f->fs.max_p;
+ fs->flow_mask = f->fs.flow_mask;
+ fs->rq_elements = nq;
+ fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1);
+ fs->parent_nr = f->fs.sched_nr;
+ fs->weight = f->fs.par[0];
+
+ fs->flags_fs = convertflags2old(f->fs.flags);
+ *a->start += sizeof(struct dn_flow_set);
+ return 0;
+}
+
+int
+dn_compat_copy_queue(struct copy_args *a, void *_o)
+{
+ int have = a->end - *a->start;
+ int need = 0;
+ int fs_size = sizeof(struct dn_flow_set);
+ int queue_size = sizeof(struct dn_flow_queue8);
+
+ struct dn_fsk *fs = (struct dn_fsk *)_o;
+ int n_queue = 0; /* number of queues */
+
+ n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) :
+ (fs->qht ? 1 : 0));
+
+ need = fs_size + queue_size * n_queue;
+ if (have < need) {
+ D("have < need");
+ return 1;
+ }
+
+ /* copy flowset */
+ dn_c_copy_fs(fs, a, n_queue);
+
+ /* copy queues */
+ if (fs->fs.flags & DN_HAVE_MASK)
+ dn_ht_scan(fs->qht, dn_c_copy_q, a);
+ else if (fs->qht)
+ dn_c_copy_q(fs->qht, a);
+
+ return 0;
+}
+
+int
+copy_data_helper_compat(void *_o, void *_arg)
+{
+ struct copy_args *a = _arg;
+
+ if (a->type == DN_COMPAT_PIPE) {
+ struct dn_schk *s = _o;
+ if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) {
+ return 0; /* not old type */
+ }
+		/* copy pipe parameters and, if an instance exists, copy
+		 * the other parameters and possibly the queues.
+ */
+ if(dn_compat_copy_pipe(a, _o))
+ return DNHT_SCAN_END;
+ } else if (a->type == DN_COMPAT_QUEUE) {
+ struct dn_fsk *fs = _o;
+ if (fs->fs.fs_nr >= DN_MAX_ID)
+ return 0;
+ if (dn_compat_copy_queue(a, _o))
+ return DNHT_SCAN_END;
+ }
+ return 0;
+}
+
+/* Main function to manage old requests */
+int
+ip_dummynet_compat(struct sockopt *sopt)
+{
+ int error=0;
+ void *v = NULL;
+ struct dn_id oid;
+
+	/* Length of data, used to determine the ipfw version... */
+ int len = sopt->sopt_valsize;
+
+ /* len can be 0 if command was dummynet_flush */
+ if (len == pipesize7) {
+ D("setting compatibility with FreeBSD 7.2");
+ is7 = 1;
+ }
+ else if (len == pipesize8 || len == pipesizemax8) {
+ D("setting compatibility with FreeBSD 8");
+ is7 = 0;
+ }
+
+ switch (sopt->sopt_name) {
+ default:
+ printf("dummynet: -- unknown option %d", sopt->sopt_name);
+ error = EINVAL;
+ break;
+
+ case IP_DUMMYNET_FLUSH:
+ oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
+ do_config(&oid, oid.len);
+ break;
+
+ case IP_DUMMYNET_DEL:
+ v = malloc(len, M_TEMP, M_WAITOK);
+ error = sooptcopyin(sopt, v, len, len);
+ if (error)
+ break;
+ error = dn_compat_del(v);
+ free(v, M_DUMMYNET);
+ break;
+
+ case IP_DUMMYNET_CONFIGURE:
+ v = malloc(len, M_TEMP, M_WAITOK);
+ error = sooptcopyin(sopt, v, len, len);
+ if (error)
+ break;
+ error = dn_compat_configure(v);
+ free(v, M_DUMMYNET);
+ break;
+
+ case IP_DUMMYNET_GET: {
+ void *buf;
+ int ret;
+ int original_size = sopt->sopt_valsize;
+ int size;
+
+ ret = dummynet_get(sopt, &buf);
+ if (ret)
+ return 0;//XXX ?
+ size = sopt->sopt_valsize;
+ sopt->sopt_valsize = original_size;
+ D("size=%d, buf=%p", size, buf);
+ ret = sooptcopyout(sopt, buf, size);
+ if (ret)
+ printf(" %s ERROR sooptcopyout\n", __FUNCTION__);
+ if (buf)
+ free(buf, M_DUMMYNET);
+ }
+ }
+
+ return error;
+}
+
+
diff --git a/freebsd/sys/netinet/ipfw/ip_dn_io.c b/freebsd/sys/netinet/ipfw/ip_dn_io.c
new file mode 100644
index 00000000..7a2c46d4
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_dn_io.c
@@ -0,0 +1,796 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Dummynet portions related to packet handling.
+ */
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet6.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/time.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <freebsd/net/netisr.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h> /* ip_len, ip_off */
+#include <freebsd/netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <freebsd/netinet/ip_fw.h>
+#include <freebsd/netinet/ipfw/ip_fw_private.h>
+#include <freebsd/netinet/ipfw/dn_heap.h>
+#include <freebsd/netinet/ip_dummynet.h>
+#include <freebsd/netinet/ipfw/ip_dn_private.h>
+#include <freebsd/netinet/ipfw/dn_sched.h>
+
+#include <freebsd/netinet/if_ether.h> /* various ether_* routines */
+
+#include <freebsd/netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
+#include <freebsd/netinet6/ip6_var.h>
+
+/*
+ * We keep a private variable for the simulation time, but we could
+ * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
+ * instead of dn_cfg.curr_time
+ */
+
+struct dn_parms dn_cfg;
+
+static long tick_last; /* Last tick duration (usec). */
+static long tick_delta; /* Last vs standard tick diff (usec). */
+static long tick_delta_sum; /* Accumulated tick difference (usec).*/
+static long tick_adjustment; /* Tick adjustments done. */
+static long tick_lost; /* Lost(coalesced) ticks number. */
+/* Adjusted vs non-adjusted curr_time difference (ticks). */
+static long tick_diff;
+
+static unsigned long io_pkt;
+static unsigned long io_pkt_fast;
+static unsigned long io_pkt_drop;
+
+/*
+ * We use a heap to store entities for which we have pending timer events.
+ * The heap is checked at every tick and all entities with expired events
+ * are extracted.
+ */
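+
+/*
+ * In other words, the per-tick check described above pops every heap entry
+ * whose key (the due time) is not later than the current time and
+ * dispatches the extracted object.  A hypothetical, disabled sketch of
+ * that loop, using the heap primitives included above:
+ */
+#if 0
+static void
+drain_expired_events_sketch(uint64_t now)
+{
+	struct dn_heap *h = &dn_cfg.evheap;
+
+	while (h->elements > 0 && DN_KEY_LEQ(HEAP_TOP(h)->key, now)) {
+		void *obj = HEAP_TOP(h)->object; /* delay line or sched. instance */
+
+		heap_extract(h, NULL);
+		/* dispatch 'obj', e.g. via transmit_event() or serve_sched() */
+		(void)obj;
+	}
+}
+#endif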
+
+MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
+
+extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+#ifdef SYSCTL_NODE
+
+SYSBEGIN(f4)
+
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+
+/* parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+ CTLFLAG_RW, &dn_cfg.hash_size, 0, "Default hash table size");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
+ CTLFLAG_RW, &dn_cfg.slot_limit, 0,
+ "Upper limit in slots for pipe queue.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
+ CTLFLAG_RW, &dn_cfg.byte_limit, 0,
+ "Upper limit in bytes for pipe queue.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
+ CTLFLAG_RW, &dn_cfg.io_fast, 0, "Enable fast dummynet io.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
+ CTLFLAG_RW, &dn_cfg.debug, 0, "Dummynet debug level");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
+ CTLFLAG_RW, &dn_cfg.expire, 0, "Expire empty queues/pipes");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
+ CTLFLAG_RD, &dn_cfg.expire_cycle, 0, "Expire cycle for queues/pipes");
+
+/* RED parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
+ CTLFLAG_RD, &dn_cfg.red_lookup_depth, 0, "Depth of RED lookup table");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
+ CTLFLAG_RD, &dn_cfg.red_avg_pkt_size, 0, "RED Medium packet size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
+ CTLFLAG_RD, &dn_cfg.red_max_pkt_size, 0, "RED Max packet size");
+
+/* time adjustment */
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
+ CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
+ CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
+ CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
+ CTLFLAG_RD, &tick_diff, 0,
+ "Adjusted vs non-adjusted curr_time difference (ticks).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
+ CTLFLAG_RD, &tick_lost, 0,
+ "Number of ticks coalesced by dummynet taskqueue.");
+
+/* statistics */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
+ CTLFLAG_RD, &dn_cfg.schk_count, 0, "Number of schedulers");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
+ CTLFLAG_RD, &dn_cfg.si_count, 0, "Number of scheduler instances");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
+ CTLFLAG_RD, &dn_cfg.fsk_count, 0, "Number of flowsets");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
+ CTLFLAG_RD, &dn_cfg.queue_count, 0, "Number of queues");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
+ CTLFLAG_RD, &io_pkt, 0,
+ "Number of packets passed to dummynet.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
+ CTLFLAG_RD, &io_pkt_fast, 0,
+    "Number of packets that bypassed the dummynet scheduler.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
+ CTLFLAG_RD, &io_pkt_drop, 0,
+ "Number of packets dropped by dummynet.");
+
+SYSEND
+
+#endif
+
+static void dummynet_send(struct mbuf *);
+
+/*
+ * Packets processed by dummynet have an mbuf tag associated with
+ * them that carries their dummynet state.
+ * Outside dummynet, only the 'rule' field is relevant, and it must
+ * be at the beginning of the structure.
+ */
+struct dn_pkt_tag {
+ struct ipfw_rule_ref rule; /* matching rule */
+
+ /* second part, dummynet specific */
+ int dn_dir; /* action when packet comes out.*/
+ /* see ip_fw_private.h */
+ uint64_t output_time; /* when the pkt is due for delivery*/
+ struct ifnet *ifp; /* interface, for ip_output */
+ struct _ip6dn_args ip6opt; /* XXX ipv6 options */
+};
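+
+/*
+ * The layout requirement stated above -- 'rule' must be the first member so
+ * that code outside dummynet can treat the tag as a bare ipfw_rule_ref --
+ * could be captured by a compile-time check along these lines (disabled,
+ * illustrative only):
+ */
+#if 0
+CTASSERT(offsetof(struct dn_pkt_tag, rule) == 0);
+#endif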
+
+/*
+ * Return the mbuf tag holding the dummynet state (it should
+ * be the first one on the list).
+ */
+static struct dn_pkt_tag *
+dn_tag_get(struct mbuf *m)
+{
+ struct m_tag *mtag = m_tag_first(m);
+ KASSERT(mtag != NULL &&
+ mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
+ mtag->m_tag_id == PACKET_TAG_DUMMYNET,
+ ("packet on dummynet queue w/o dummynet tag!"));
+ return (struct dn_pkt_tag *)(mtag+1);
+}
+
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+ if (q->head == NULL)
+ q->head = m;
+ else
+ q->tail->m_nextpkt = m;
+ q->tail = m;
+ m->m_nextpkt = NULL;
+}
+
+/*
+ * Dispose of a list of packets. Use a function so that, if we need
+ * to do more work, this is a central point to do it.
+ */
+void dn_free_pkts(struct mbuf *mnext)
+{
+ struct mbuf *m;
+
+ while ((m = mnext) != NULL) {
+ mnext = m->m_nextpkt;
+ FREE_PKT(m);
+ }
+}
+
+static int
+red_drops (struct dn_queue *q, int len)
+{
+ /*
+ * RED algorithm
+ *
+ * RED calculates the average queue size (avg) using a low-pass filter
+ * with an exponential weighted (w_q) moving average:
+ * avg <- (1-w_q) * avg + w_q * q_size
+	 * where q_size is the queue length (measured in bytes or packets).
+ *
+ * If q_size == 0, we compute the idle time for the link, and set
+ * avg = (1 - w_q)^(idle/s)
+ * where s is the time needed for transmitting a medium-sized packet.
+ *
+ * Now, if avg < min_th the packet is enqueued.
+ * If avg > max_th the packet is dropped. Otherwise, the packet is
+ * dropped with probability P function of avg.
+ */
+
+ struct dn_fsk *fs = q->fs;
+ int64_t p_b = 0;
+
+ /* Queue in bytes or packets? */
+ uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
+ q->ni.len_bytes : q->ni.length;
+
+ /* Average queue size estimation. */
+ if (q_size != 0) {
+ /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
+ int diff = SCALE(q_size) - q->avg;
+ int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
+
+ q->avg += (int)v;
+ } else {
+ /*
+		 * Queue is empty, find out for how long the queue has been
+		 * empty and use a lookup table for computing
+		 * (1 - w_q)^(idle_time/s) where s is the time to send a
+ * (small) packet.
+ * XXX check wraps...
+ */
+ if (q->avg) {
+ u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step);
+
+ q->avg = (t < fs->lookup_depth) ?
+ SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
+ }
+ }
+
+	/* Should we drop? */
+ if (q->avg < fs->min_th) {
+ q->count = -1;
+ return (0); /* accept packet */
+ }
+ if (q->avg >= fs->max_th) { /* average queue >= max threshold */
+ if (fs->fs.flags & DN_IS_GENTLE_RED) {
+ /*
+ * According to Gentle-RED, if avg is greater than
+ * max_th the packet is dropped with a probability
+ * p_b = c_3 * avg - c_4
+ * where c_3 = (1 - max_p) / max_th
+ * c_4 = 1 - 2 * max_p
+ */
+ p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
+ fs->c_4;
+ } else {
+ q->count = -1;
+ return (1);
+ }
+ } else if (q->avg > fs->min_th) {
+ /*
+ * We compute p_b using the linear dropping function
+ * p_b = c_1 * avg - c_2
+ * where c_1 = max_p / (max_th - min_th)
+ * c_2 = max_p * min_th / (max_th - min_th)
+ */
+ p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+ }
+
+ if (fs->fs.flags & DN_QSIZE_BYTES)
+ p_b = div64((p_b * len) , fs->max_pkt_size);
+ if (++q->count == 0)
+ q->random = random() & 0xffff;
+ else {
+ /*
+ * q->count counts packets arrived since last drop, so a greater
+ * value of q->count means a greater packet drop probability.
+ */
+ if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
+ q->count = 0;
+ /* After a drop we calculate a new random value. */
+ q->random = random() & 0xffff;
+ return (1); /* drop */
+ }
+ }
+ /* End of RED algorithm. */
+
+ return (0); /* accept */
+
+}
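+
+/*
+ * For reference, the fixed-point logic above corresponds, in floating
+ * point, to the EWMA
+ *	avg = (1 - w_q) * avg + w_q * q_size
+ * with a drop probability growing linearly from 0 at min_th to max_p at
+ * max_th (and, for Gentle RED, on up to 1 at 2 * max_th).  The disabled
+ * sketch below is hypothetical, uses floating point only for illustration,
+ * and shows the plain-RED decision without the per-packet count
+ * correction used above.
+ */
+#if 0
+static int
+red_should_drop_sketch(double *avg, double q_size, double w_q,
+    double min_th, double max_th, double max_p)
+{
+	double p_b;
+
+	*avg = (1.0 - w_q) * *avg + w_q * q_size;	/* EWMA of the queue size */
+	if (*avg < min_th)
+		return (0);				/* accept */
+	if (*avg >= max_th)
+		return (1);				/* always drop */
+	p_b = max_p * (*avg - min_th) / (max_th - min_th);
+	return ((double)random() / 0x7fffffff < p_b);	/* drop with prob. p_b */
+}
+#endif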
+
+/*
+ * Enqueue a packet in q, subject to space and queue management policy
+ * (whose parameters are in q->fs).
+ * Update stats for the queue and the scheduler.
+ * Return 0 on success, 1 on drop. The packet is consumed anyway.
+ */
+int
+dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
+{
+ struct dn_fs *f;
+ struct dn_flow *ni; /* stats for scheduler instance */
+ uint64_t len;
+
+ if (q->fs == NULL || q->_si == NULL) {
+ printf("%s fs %p si %p, dropping\n",
+ __FUNCTION__, q->fs, q->_si);
+ FREE_PKT(m);
+ return 1;
+ }
+ f = &(q->fs->fs);
+ ni = &q->_si->ni;
+ len = m->m_pkthdr.len;
+ /* Update statistics, then check reasons to drop pkt. */
+ q->ni.tot_bytes += len;
+ q->ni.tot_pkts++;
+ ni->tot_bytes += len;
+ ni->tot_pkts++;
+ if (drop)
+ goto drop;
+ if (f->plr && random() < f->plr)
+ goto drop;
+ if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len))
+ goto drop;
+ if (f->flags & DN_QSIZE_BYTES) {
+ if (q->ni.len_bytes > f->qsize)
+ goto drop;
+ } else if (q->ni.length >= f->qsize) {
+ goto drop;
+ }
+ mq_append(&q->mq, m);
+ q->ni.length++;
+ q->ni.len_bytes += len;
+ ni->length++;
+ ni->len_bytes += len;
+ return 0;
+
+drop:
+ io_pkt_drop++;
+ q->ni.drops++;
+ ni->drops++;
+ FREE_PKT(m);
+ return 1;
+}
+
+/*
+ * Fetch packets from the delay line which are due now. If there are
+ * leftover packets, reinsert the delay line in the heap.
+ * Runs under scheduler lock.
+ */
+static void
+transmit_event(struct mq *q, struct delay_line *dline, uint64_t now)
+{
+ struct mbuf *m;
+ struct dn_pkt_tag *pkt = NULL;
+
+ dline->oid.subtype = 0; /* not in heap */
+ while ((m = dline->mq.head) != NULL) {
+ pkt = dn_tag_get(m);
+ if (!DN_KEY_LEQ(pkt->output_time, now))
+ break;
+ dline->mq.head = m->m_nextpkt;
+ mq_append(q, m);
+ }
+ if (m != NULL) {
+ dline->oid.subtype = 1; /* in heap */
+ heap_insert(&dn_cfg.evheap, pkt->output_time, dline);
+ }
+}
+
+/*
+ * Convert the additional MAC overheads/delays into an equivalent
+ * number of bits for the given data rate. The samples are
+ * in milliseconds so we need to divide by 1000.
+ */
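+/*
+ * Worked example (illustrative only): on a 1 Mbit/s link a sampled
+ * overhead of 2 ms costs 1000000 * 2 / 1000 = 2000 bits, i.e. the
+ * packet is charged as if it were 250 bytes longer.
+ */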
+static uint64_t
+extra_bits(struct mbuf *m, struct dn_schk *s)
+{
+ int index;
+ uint64_t bits;
+ struct dn_profile *pf = s->profile;
+
+ if (!pf || pf->samples_no == 0)
+ return 0;
+ index = random() % pf->samples_no;
+ bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000);
+ if (index >= pf->loss_level) {
+ struct dn_pkt_tag *dt = dn_tag_get(m);
+ if (dt)
+ dt->dn_dir = DIR_DROP;
+ }
+ return bits;
+}
+
+/*
+ * Send traffic from a scheduler instance due by 'now'.
+ * Return a pointer to the head of the queue.
+ */
+static struct mbuf *
+serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
+{
+ struct mq def_q;
+ struct dn_schk *s = si->sched;
+ struct mbuf *m = NULL;
+ int delay_line_idle = (si->dline.mq.head == NULL);
+ int done, bw;
+
+ if (q == NULL) {
+ q = &def_q;
+ q->head = NULL;
+ }
+
+ bw = s->link.bandwidth;
+ si->kflags &= ~DN_ACTIVE;
+
+ if (bw > 0)
+ si->credit += (now - si->sched_time) * bw;
+ else
+ si->credit = 0;
+ si->sched_time = now;
+ done = 0;
+ while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
+ uint64_t len_scaled;
+ done++;
+ len_scaled = (bw == 0) ? 0 : hz *
+ (m->m_pkthdr.len * 8 + extra_bits(m, s));
+ si->credit -= len_scaled;
+ /* Move packet in the delay line */
+ dn_tag_get(m)->output_time += s->link.delay ;
+ mq_append(&si->dline.mq, m);
+ }
+ /*
+ * If credit >= 0 the instance is idle, mark time.
+ * Otherwise put back in the heap, and adjust the output
+ * time of the last inserted packet, m, which was too early.
+ */
+ if (si->credit >= 0) {
+ si->idle_time = now;
+ } else {
+ uint64_t t;
+ KASSERT (bw > 0, ("bw=0 and credit<0 ?"));
+ t = div64(bw - 1 - si->credit, bw);
+ if (m)
+ dn_tag_get(m)->output_time += t;
+ si->kflags |= DN_ACTIVE;
+ heap_insert(&dn_cfg.evheap, now + t, si);
+ }
+ if (delay_line_idle && done)
+ transmit_event(q, &si->dline, now);
+ return q->head;
+}
+
+/*
+ * The timer handler for dummynet. Time is computed in ticks, but
+ * the code is tolerant to the actual rate at which this is called.
+ * Once complete, the function reschedules itself for the next tick.
+ */
+void
+dummynet_task(void *context, int pending)
+{
+ struct timeval t;
+ struct mq q = { NULL, NULL }; /* queue to accumulate results */
+
+ DN_BH_WLOCK();
+
+ /* Update the number of lost (coalesced) ticks. */
+ tick_lost += pending - 1;
+
+ getmicrouptime(&t);
+ /* Last tick duration (usec). */
+ tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 +
+ (t.tv_usec - dn_cfg.prev_t.tv_usec);
+ /* Last tick vs standard tick difference (usec). */
+ tick_delta = (tick_last * hz - 1000000) / hz;
+ /* Accumulated tick difference (usec). */
+ tick_delta_sum += tick_delta;
+
+ dn_cfg.prev_t = t;
+
+ /*
+ * Adjust curr_time if the accumulated tick difference is
+ * greater than the 'standard' tick. Since curr_time should
+ * be monotonically increasing, we do positive adjustments
+ * as required, and throttle curr_time in case of negative
+ * adjustment.
+ */
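+ /*
+ * Example (illustrative): with hz = 1000 the standard tick is 1000 us.
+ * If the last tick took 1200 us, tick_delta is 200 us; once the
+ * accumulated sum reaches a full tick, curr_time advances by one
+ * extra unit in the branch below.
+ */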
+ dn_cfg.curr_time++;
+ if (tick_delta_sum - tick >= 0) {
+ int diff = tick_delta_sum / tick;
+
+ dn_cfg.curr_time += diff;
+ tick_diff += diff;
+ tick_delta_sum %= tick;
+ tick_adjustment++;
+ } else if (tick_delta_sum + tick <= 0) {
+ dn_cfg.curr_time--;
+ tick_diff--;
+ tick_delta_sum += tick;
+ tick_adjustment++;
+ }
+
+ /* serve pending events, accumulate in q */
+ for (;;) {
+ struct dn_id *p; /* generic parameter to handler */
+
+ if (dn_cfg.evheap.elements == 0 ||
+ DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key))
+ break;
+ p = HEAP_TOP(&dn_cfg.evheap)->object;
+ heap_extract(&dn_cfg.evheap, NULL);
+
+ if (p->type == DN_SCH_I) {
+ serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time);
+ } else { /* extracted a delay line */
+ transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time);
+ }
+ }
+ if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) {
+ dn_cfg.expire_cycle = 0;
+ dn_drain_scheduler();
+ dn_drain_queue();
+ }
+
+ DN_BH_WUNLOCK();
+ dn_reschedule();
+ if (q.head != NULL)
+ dummynet_send(q.head);
+}
+
+/*
+ * forward a chain of packets to the proper destination.
+ * This runs outside the dummynet lock.
+ */
+static void
+dummynet_send(struct mbuf *m)
+{
+ struct mbuf *n;
+
+ for (; m != NULL; m = n) {
+ struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */
+ struct m_tag *tag;
+ int dst;
+
+ n = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ tag = m_tag_first(m);
+ if (tag == NULL) { /* should not happen */
+ dst = DIR_DROP;
+ } else {
+ struct dn_pkt_tag *pkt = dn_tag_get(m);
+ /* extract the dummynet info, rename the tag
+ * to carry reinject info.
+ */
+ dst = pkt->dn_dir;
+ ifp = pkt->ifp;
+ tag->m_tag_cookie = MTAG_IPFW_RULE;
+ tag->m_tag_id = 0;
+ }
+
+ switch (dst) {
+ case DIR_OUT:
+ SET_HOST_IPLEN(mtod(m, struct ip *));
+ ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
+ break ;
+
+ case DIR_IN :
+ /* put header in network format for ip_input() */
+ //SET_NET_IPLEN(mtod(m, struct ip *));
+ netisr_dispatch(NETISR_IP, m);
+ break;
+
+#ifdef INET6
+ case DIR_IN | PROTO_IPV6:
+ netisr_dispatch(NETISR_IPV6, m);
+ break;
+
+ case DIR_OUT | PROTO_IPV6:
+ SET_HOST_IPLEN(mtod(m, struct ip *));
+ ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
+ break;
+#endif
+
+ case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
+ if (bridge_dn_p != NULL)
+ ((*bridge_dn_p)(m, ifp));
+ else
+ printf("dummynet: if_bridge not loaded\n");
+
+ break;
+
+ case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
+ /*
+ * The Ethernet code assumes the Ethernet header is
+ * contiguous in the first mbuf header.
+ * Ensure this is true.
+ */
+ if (m->m_len < ETHER_HDR_LEN &&
+ (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
+ printf("dummynet/ether: pullup failed, "
+ "dropping packet\n");
+ break;
+ }
+ ether_demux(m->m_pkthdr.rcvif, m);
+ break;
+
+ case DIR_OUT | PROTO_LAYER2: /* DN_TO_ETH_OUT: */
+ ether_output_frame(ifp, m);
+ break;
+
+ case DIR_DROP:
+ /* drop the packet after some time */
+ FREE_PKT(m);
+ break;
+
+ default:
+ printf("dummynet: bad switch %d!\n", dst);
+ FREE_PKT(m);
+ break;
+ }
+ }
+}
+
+static inline int
+tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa)
+{
+ struct dn_pkt_tag *dt;
+ struct m_tag *mtag;
+
+ mtag = m_tag_get(PACKET_TAG_DUMMYNET,
+ sizeof(*dt), M_NOWAIT | M_ZERO);
+ if (mtag == NULL)
+ return 1; /* Cannot allocate packet header. */
+ m_tag_prepend(m, mtag); /* Attach to mbuf chain. */
+ dt = (struct dn_pkt_tag *)(mtag + 1);
+ dt->rule = fwa->rule;
+ dt->rule.info &= IPFW_ONEPASS; /* only keep this info */
+ dt->dn_dir = dir;
+ dt->ifp = fwa->oif;
+ /* dt->output_time is updated as we move through */
+ dt->output_time = dn_cfg.curr_time;
+ return 0;
+}
+
+
+/*
+ * dummynet hook for packets.
+ * We use the argument to locate the flowset fs and the sched_set sch
+ * associated to it. Then we apply flow_mask and sched_mask to
+ * determine the queue and scheduler instances.
+ *
+ * dir where to send the packet after dummynet.
+ * *m0 the mbuf with the packet.
+ * ifp the 'ifp' parameter from the caller:
+ * NULL in ip_input, the destination interface in ip_output.
+ */
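+/*
+ * Sketch of the lookup chain performed below (informative summary only):
+ *	fs = dn_ht_find(dn_cfg.fshash, fs_id, ...)	-- flowset
+ *	si = ipdn_si_find(fs->sched, &fwa->f_id)	-- scheduler instance
+ *	q  = ipdn_q_find(fs, si, &fwa->f_id)		-- queue (MULTIQUEUE only)
+ * after which the scheduler's enqueue() and serve_sched() move the packet
+ * through the delay line.
+ */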
+int
+dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
+{
+ struct mbuf *m = *m0;
+ struct dn_fsk *fs = NULL;
+ struct dn_sch_inst *si;
+ struct dn_queue *q = NULL; /* default */
+
+ int fs_id = (fwa->rule.info & IPFW_INFO_MASK) +
+ ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0);
+ DN_BH_WLOCK();
+ io_pkt++;
+ /* we could actually tag outside the lock, but who cares... */
+ if (tag_mbuf(m, dir, fwa))
+ goto dropit;
+ if (dn_cfg.busy) {
+ /* if the upper half is busy doing something expensive,
+ * let's queue the packet and move forward
+ */
+ mq_append(&dn_cfg.pending, m);
+ m = *m0 = NULL; /* consumed */
+ goto done; /* already active, nothing to do */
+ }
+ /* XXX locate_flowset could be optimised with a direct ref. */
+ fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL);
+ if (fs == NULL)
+ goto dropit; /* This queue/pipe does not exist! */
+ if (fs->sched == NULL) /* should not happen */
+ goto dropit;
+ /* find scheduler instance, possibly applying sched_mask */
+ si = ipdn_si_find(fs->sched, &(fwa->f_id));
+ if (si == NULL)
+ goto dropit;
+ /*
+ * If the scheduler supports multiple queues, find the right one
+ * (otherwise it will be ignored by enqueue).
+ */
+ if (fs->sched->fp->flags & DN_MULTIQUEUE) {
+ q = ipdn_q_find(fs, si, &(fwa->f_id));
+ if (q == NULL)
+ goto dropit;
+ }
+ if (fs->sched->fp->enqueue(si, q, m)) {
+ /* packet was dropped by enqueue() */
+ m = *m0 = NULL;
+ goto dropit;
+ }
+
+ if (si->kflags & DN_ACTIVE) {
+ m = *m0 = NULL; /* consumed */
+ goto done; /* already active, nothing to do */
+ }
+
+ /* compute the initial allowance */
+ if (si->idle_time < dn_cfg.curr_time) {
+ /* Do this only on the first packet on an idle pipe */
+ struct dn_link *p = &fs->sched->link;
+
+ si->sched_time = dn_cfg.curr_time;
+ si->credit = dn_cfg.io_fast ? p->bandwidth : 0;
+ if (p->burst) {
+ uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth;
+ if (burst > p->burst)
+ burst = p->burst;
+ si->credit += burst;
+ }
+ }
+ /* pass through scheduler and delay line */
+ m = serve_sched(NULL, si, dn_cfg.curr_time);
+
+ /* optimization -- pass it back to ipfw for immediate send */
+ /* XXX Don't call dummynet_send() if the scheduler returns the packet
+ * just enqueued. This avoids a lock order reversal.
+ */
+ if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) {
+ /* fast io: rename the tag to carry reinject info. */
+ struct m_tag *tag = m_tag_first(m);
+
+ tag->m_tag_cookie = MTAG_IPFW_RULE;
+ tag->m_tag_id = 0;
+ io_pkt_fast++;
+ if (m->m_nextpkt != NULL) {
+ printf("dummynet: fast io: pkt chain detected!\n");
+ m->m_nextpkt = NULL;
+ }
+ m = NULL;
+ } else {
+ *m0 = NULL;
+ }
+done:
+ DN_BH_WUNLOCK();
+ if (m)
+ dummynet_send(m);
+ return 0;
+
+dropit:
+ io_pkt_drop++;
+ DN_BH_WUNLOCK();
+ if (m)
+ FREE_PKT(m);
+ *m0 = NULL;
+ return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS;
+}
diff --git a/freebsd/sys/netinet/ipfw/ip_dn_private.h b/freebsd/sys/netinet/ipfw/ip_dn_private.h
new file mode 100644
index 00000000..270f1881
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_dn_private.h
@@ -0,0 +1,402 @@
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * internal dummynet APIs.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_PRIVATE_H
+#define _IP_DN_PRIVATE_H
+
+/* debugging support
+ * use ND() to remove debugging, D() to print a line,
+ * DX(level, ...) to print above a certain level
+ * If you redefine D() you are expected to redefine all.
+ */
+#ifndef D
+#define ND(fmt, ...) do {} while (0)
+#define D1(fmt, ...) do {} while (0)
+#define D(fmt, ...) printf("%-10s " fmt "\n", \
+ __FUNCTION__, ## __VA_ARGS__)
+#define DX(lev, fmt, ...) do { \
+ if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0)
+#endif
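+/*
+ * Usage sketch (illustrative): D("fs %d", i) prints one line prefixed
+ * with the function name, DX(2, "fs %d", i) prints only when
+ * dn_cfg.debug > 2, and ND() compiles to nothing.
+ */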
+
+MALLOC_DECLARE(M_DUMMYNET);
+
+#ifndef FREE_PKT
+#define FREE_PKT(m) m_freem(m)
+#endif
+
+#ifndef __linux__
+#define div64(a, b) ((int64_t)(a) / (int64_t)(b))
+#endif
+
+#define DN_LOCK_INIT() do { \
+ mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \
+ mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \
+ } while (0)
+#define DN_LOCK_DESTROY() do { \
+ mtx_destroy(&dn_cfg.uh_mtx); \
+ mtx_destroy(&dn_cfg.bh_mtx); \
+ } while (0)
+#if 0 /* not used yet */
+#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+#endif
+
+#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+
+SLIST_HEAD(dn_schk_head, dn_schk);
+SLIST_HEAD(dn_sch_inst_head, dn_sch_inst);
+SLIST_HEAD(dn_fsk_head, dn_fsk);
+SLIST_HEAD(dn_queue_head, dn_queue);
+SLIST_HEAD(dn_alg_head, dn_alg);
+
+struct mq { /* a basic queue of packets */
+ struct mbuf *head, *tail;
+};
+
+static inline void
+set_oid(struct dn_id *o, int type, int len)
+{
+ o->type = type;
+ o->len = len;
+ o->subtype = 0;
+};
+
+/*
+ * configuration and global data for a dummynet instance
+ *
+ * When a configuration is modified from userland, 'id' is incremented
+ * so we can use the value to check for stale pointers.
+ */
+struct dn_parms {
+ uint32_t id; /* configuration version */
+
+ /* defaults (sysctl-accessible) */
+ int red_lookup_depth;
+ int red_avg_pkt_size;
+ int red_max_pkt_size;
+ int hash_size;
+ int max_hash_size;
+ long byte_limit; /* max queue sizes */
+ long slot_limit;
+
+ int io_fast;
+ int debug;
+
+ /* timekeeping */
+ struct timeval prev_t; /* last time dummynet_tick ran */
+ struct dn_heap evheap; /* scheduled events */
+
+ /* counters of objects -- used for reporting space */
+ int schk_count;
+ int si_count;
+ int fsk_count;
+ int queue_count;
+
+ /* ticks and other stuff */
+ uint64_t curr_time;
+ /* flowsets and schedulers are in hash tables, with 'hash_size'
+ * buckets. fshash is looked up at every packet arrival
+ * so better be generous if we expect many entries.
+ */
+ struct dn_ht *fshash;
+ struct dn_ht *schedhash;
+ /* list of flowsets without a scheduler -- use sch_chain */
+ struct dn_fsk_head fsu; /* list of unlinked flowsets */
+ struct dn_alg_head schedlist; /* list of algorithms */
+
+ /* Store the fs/sch to scan when draining. The value is the
+ * bucket number of the hash table. Expire can be disabled
+ * with net.inet.ip.dummynet.expire=0, or it happens every
+ * expire ticks.
+ */
+ int drain_fs;
+ int drain_sch;
+ uint32_t expire;
+ uint32_t expire_cycle; /* tick count */
+
+ /* if the upper half is busy doing something long,
+ * it can set the busy flag and we will enqueue packets in
+ * a queue for later processing.
+ */
+ int busy;
+ struct mq pending;
+
+#ifdef _KERNEL
+ /*
+ * This file is normally used in the kernel, unless we do
+ * some userland tests, in which case we do not need a mtx.
+ * uh_mtx arbitrates between system calls and also
+ * protects fshash, schedhash and fsunlinked.
+ * These structures are readonly for the lower half.
+ * bh_mtx protects all other structures which may be
+ * modified upon packet arrivals
+ */
+#if defined( __linux__ ) || defined( _WIN32 )
+ spinlock_t uh_mtx;
+ spinlock_t bh_mtx;
+#else
+ struct mtx uh_mtx;
+ struct mtx bh_mtx;
+#endif
+
+#endif /* _KERNEL */
+};
+
+/*
+ * Delay line, contains all packets on output from a link.
+ * Every scheduler instance has one.
+ */
+struct delay_line {
+ struct dn_id oid;
+ struct dn_sch_inst *si;
+ struct mq mq;
+};
+
+/*
+ * The kernel side of a flowset. It is linked in a hash table
+ * of flowsets, and in a list of children of their parent scheduler.
+ * qht is either the queue or (if HAVE_MASK) a hash table of queues.
+ * Note that the mask to use is the (flow_mask|sched_mask), which
+ * changes as we attach/detach schedulers. So we store it here.
+ *
+ * XXX If we want to add scheduler-specific parameters, we need to
+ * put them in external storage because the scheduler may not be
+ * available when the fsk is created.
+ */
+struct dn_fsk { /* kernel side of a flowset */
+ struct dn_fs fs;
+ SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */
+
+ struct ipfw_flow_id fsk_mask;
+
+ /* qht is a hash table of queues, or just a single queue
+ * a bit in fs.flags tells us which one
+ */
+ struct dn_ht *qht;
+ struct dn_schk *sched; /* Sched we are linked to */
+ SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */
+
+ /* bucket index used by drain routine to drain queues for this
+ * flowset
+ */
+ int drain_bucket;
+ /* Parameters related to RED / GRED */
+ /* original values are in dn_fs*/
+ int w_q ; /* queue weight (scaled) */
+ int max_th ; /* maximum threshold for queue (scaled) */
+ int min_th ; /* minimum threshold for queue (scaled) */
+ int max_p ; /* maximum value for p_b (scaled) */
+
+ u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
+ u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
+ u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
+ u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
+ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
+ u_int lookup_depth ; /* depth of lookup table */
+ int lookup_step ; /* granularity inside the lookup table */
+ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+ int avg_pkt_size ; /* medium packet size */
+ int max_pkt_size ; /* max packet size */
+};
+
+/*
+ * A queue is created as a child of a flowset unless it belongs to
+ * a !MULTIQUEUE scheduler. It is normally in a hash table in the
+ * flowset. fs always points to the parent flowset.
+ * si normally points to the sch_inst, unless the flowset has been
+ * detached from the scheduler -- in this case si == NULL and we
+ * should not enqueue.
+ */
+struct dn_queue {
+ struct dn_flow ni; /* oid, flow_id, stats */
+ struct mq mq; /* packets queue */
+ struct dn_sch_inst *_si; /* owner scheduler instance */
+ SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */
+ struct dn_fsk *fs; /* parent flowset. */
+
+ /* RED parameters */
+ int avg; /* average queue length est. (scaled) */
+ int count; /* arrivals since last RED drop */
+ int random; /* random value (scaled) */
+ uint64_t q_time; /* start of queue idle time */
+
+};
+
+/*
+ * The kernel side of a scheduler. Contains the userland config,
+ * a link, pointer to extra config arguments from command line,
+ * kernel flags, and a pointer to the scheduler methods.
+ * It is stored in a hash table, and holds a list of all
+ * flowsets and scheduler instances.
+ * XXX sch must be at the beginning, see schk_hash().
+ */
+struct dn_schk {
+ struct dn_sch sch;
+ struct dn_alg *fp; /* Pointer to scheduler functions */
+ struct dn_link link; /* The link, embedded */
+ struct dn_profile *profile; /* delay profile, if any */
+ struct dn_id *cfg; /* extra config arguments */
+
+ SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */
+
+ struct dn_fsk_head fsk_list; /* all fsk linked to me */
+ struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */
+
+ /* bucket index used by the drain routine to drain the scheduler
+ * instance for this flowset.
+ */
+ int drain_bucket;
+
+ /* Hash table of all instances (through sch.sched_mask)
+ * or single instance if no mask. Always valid.
+ */
+ struct dn_ht *siht;
+};
+
+
+/*
+ * Scheduler instance.
+ * Contains variables and all queues relative to this instance.
+ * This struct is created at runtime.
+ */
+struct dn_sch_inst {
+ struct dn_flow ni; /* oid, flowid and stats */
+ SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */
+ struct delay_line dline;
+ struct dn_schk *sched; /* the template */
+ int kflags; /* DN_ACTIVE */
+
+ int64_t credit; /* bits I can transmit (more or less). */
+ uint64_t sched_time; /* time link was scheduled in ready_heap */
+ uint64_t idle_time; /* start of scheduler instance idle time */
+
+ /* q_count is the number of queues that this instance is using.
+ * The counter is incremented or decremented when
+ * a reference from the queue is created or deleted.
+ * It is used to make sure that a scheduler instance can be safely
+ * deleted by the drain routine. See notes below.
+ */
+ int q_count;
+
+};
+
+/*
+ * NOTE about object drain.
+ * The system will automatically (XXX check when) drain queues and
+ * scheduler instances when they are idle.
+ * A queue is idle when it has no packets; an instance is idle when
+ * it is not in the evheap heap, and the corresponding delay line is empty.
+ * A queue can be safely deleted when it is idle because the scheduler
+ * function xxx_free_queue() will remove any references to it.
+ * An instance can only be deleted when no queues reference it. To be sure
+ * of that, a counter (q_count) stores the number of queues that are pointing
+ * to the instance.
+ *
+ * XXX
+ * Order of scan:
+ * - take all flowsets in a bucket of the flowset hash table
+ * - take all queues in a bucket for the flowset
+ * - increment the queue bucket
+ * - scan next flowset bucket
+ * Nothing is done if a bucket contains no entries.
+ *
+ * The same scheme is used for scheduler instances.
+ */
+
+
+/* kernel-side flags. Linux has DN_DELETE in fcntl.h
+ */
+enum {
+ /* 1 and 2 are reserved for the SCAN flags */
+ DN_DESTROY = 0x0004, /* destroy */
+ DN_DELETE_FS = 0x0008, /* destroy flowset */
+ DN_DETACH = 0x0010,
+ DN_ACTIVE = 0x0020, /* object is in evheap */
+ DN_F_DLINE = 0x0040, /* object is a delay line */
+ DN_F_SCHI = 0x00C0, /* object is a sched.instance */
+ DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */
+};
+
+extern struct dn_parms dn_cfg;
+
+int dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+void dummynet_task(void *context, int pending);
+void dn_reschedule(void);
+
+struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *,
+ struct ipfw_flow_id *);
+struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *);
+
+/*
+ * copy_range is a template for requests for ranges of pipes/queues/scheds.
+ * The number of ranges is variable and can be derived by o.len.
+ * As a default, we use a small number of entries so that the struct
+ * fits easily on the stack and is sufficient for most common requests.
+ */
+#define DEFAULT_RANGES 5
+struct copy_range {
+ struct dn_id o;
+ uint32_t r[ 2 * DEFAULT_RANGES ];
+};
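+/*
+ * Each pair r[2i], r[2i+1] is an inclusive range of object numbers.
+ * Example (illustrative, not from the original code): a request for
+ * pipes 1-10 and 20 would carry r[] = { 1, 10, 20, 20 } with o.len
+ * covering the two ranges actually used.
+ */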
+
+struct copy_args {
+ char **start;
+ char *end;
+ int flags;
+ int type;
+ struct copy_range *extra; /* extra filtering */
+};
+
+struct sockopt;
+int ip_dummynet_compat(struct sockopt *sopt);
+int dummynet_get(struct sockopt *sopt, void **compat);
+int dn_c_copy_q (void *_ni, void *arg);
+int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq);
+int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq);
+int dn_compat_copy_queue(struct copy_args *a, void *_o);
+int dn_compat_copy_pipe(struct copy_args *a, void *_o);
+int copy_data_helper_compat(void *_o, void *_arg);
+int dn_compat_calc_size(struct dn_parms dn_cfg);
+int do_config(void *p, int l);
+
+/* function to drain idle object */
+void dn_drain_scheduler(void);
+void dn_drain_queue(void);
+
+#endif /* _IP_DN_PRIVATE_H */
diff --git a/freebsd/sys/netinet/ipfw/ip_dummynet.c b/freebsd/sys/netinet/ipfw/ip_dummynet.c
new file mode 100644
index 00000000..dca39d06
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_dummynet.c
@@ -0,0 +1,2297 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Configuration and internal object management for dummynet.
+ */
+
+#include <freebsd/local/opt_inet6.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/time.h>
+#include <freebsd/sys/taskqueue.h>
+#include <freebsd/net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <freebsd/netinet/ip_fw.h>
+#include <freebsd/netinet/ipfw/ip_fw_private.h>
+#include <freebsd/netinet/ipfw/dn_heap.h>
+#include <freebsd/netinet/ip_dummynet.h>
+#include <freebsd/netinet/ipfw/ip_dn_private.h>
+#include <freebsd/netinet/ipfw/dn_sched.h>
+
+/* which objects to copy */
+#define DN_C_LINK 0x01
+#define DN_C_SCH 0x02
+#define DN_C_FLOW 0x04
+#define DN_C_FS 0x08
+#define DN_C_QUEUE 0x10
+
+/* we use this argument in case of a schk_new */
+struct schk_new_arg {
+ struct dn_alg *fp;
+ struct dn_sch *sch;
+};
+
+/*---- callout hooks. ----*/
+static struct callout dn_timeout;
+static struct task dn_task;
+static struct taskqueue *dn_tq = NULL;
+
+static void
+dummynet(void * __unused unused)
+{
+
+ taskqueue_enqueue(dn_tq, &dn_task);
+}
+
+void
+dn_reschedule(void)
+{
+ callout_reset(&dn_timeout, 1, dummynet, NULL);
+}
+/*----- end of callout hooks -----*/
+
+/* Return a scheduler descriptor given the type or name. */
+static struct dn_alg *
+find_sched_type(int type, char *name)
+{
+ struct dn_alg *d;
+
+ SLIST_FOREACH(d, &dn_cfg.schedlist, next) {
+ if (d->type == type || (name && !strcmp(d->name, name)))
+ return d;
+ }
+ return NULL; /* not found */
+}
+
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{
+ int oldv = *v;
+ const char *op = NULL;
+ if (oldv < lo) {
+ *v = dflt;
+ op = "Bump";
+ } else if (oldv > hi) {
+ *v = hi;
+ op = "Clamp";
+ } else
+ return *v;
+ if (op && msg)
+ printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
+ return *v;
+}
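+/*
+ * Usage sketch (illustrative values): ipdn_bound_var(&buckets, 64, 1, 1024,
+ * "flowset buckets") leaves 100 untouched, clamps 5000 down to 1024, and
+ * resets a non-positive value to the default 64, logging one line whenever
+ * 'msg' is not NULL and an adjustment was made.
+ */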
+
+/*---- flow_id mask, hash and compare functions ---*/
+/*
+ * The flow_id includes the 5-tuple, the queue/pipe number
+ * which we store in the extra area in host order,
+ * and for ipv6 also the flow_id6.
+ * XXX see if we want the tos byte (can store in 'flags')
+ */
+static struct ipfw_flow_id *
+flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id)
+{
+ int is_v6 = IS_IP6_FLOW_ID(id);
+
+ id->dst_port &= mask->dst_port;
+ id->src_port &= mask->src_port;
+ id->proto &= mask->proto;
+ id->extra &= mask->extra;
+ if (is_v6) {
+ APPLY_MASK(&id->dst_ip6, &mask->dst_ip6);
+ APPLY_MASK(&id->src_ip6, &mask->src_ip6);
+ id->flow_id6 &= mask->flow_id6;
+ } else {
+ id->dst_ip &= mask->dst_ip;
+ id->src_ip &= mask->src_ip;
+ }
+ return id;
+}
+
+/* computes the OR of two masks; the result is stored in dst and also returned */
+static struct ipfw_flow_id *
+flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst)
+{
+ int is_v6 = IS_IP6_FLOW_ID(dst);
+
+ dst->dst_port |= src->dst_port;
+ dst->src_port |= src->src_port;
+ dst->proto |= src->proto;
+ dst->extra |= src->extra;
+ if (is_v6) {
+#define OR_MASK(_d, _s) \
+ (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \
+ (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \
+ (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \
+ (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3];
+ OR_MASK(&dst->dst_ip6, &src->dst_ip6);
+ OR_MASK(&dst->src_ip6, &src->src_ip6);
+#undef OR_MASK
+ dst->flow_id6 |= src->flow_id6;
+ } else {
+ dst->dst_ip |= src->dst_ip;
+ dst->src_ip |= src->src_ip;
+ }
+ return dst;
+}
+
+static int
+nonzero_mask(struct ipfw_flow_id *m)
+{
+ if (m->dst_port || m->src_port || m->proto || m->extra)
+ return 1;
+ if (IS_IP6_FLOW_ID(m)) {
+ return
+ m->dst_ip6.__u6_addr.__u6_addr32[0] ||
+ m->dst_ip6.__u6_addr.__u6_addr32[1] ||
+ m->dst_ip6.__u6_addr.__u6_addr32[2] ||
+ m->dst_ip6.__u6_addr.__u6_addr32[3] ||
+ m->src_ip6.__u6_addr.__u6_addr32[0] ||
+ m->src_ip6.__u6_addr.__u6_addr32[1] ||
+ m->src_ip6.__u6_addr.__u6_addr32[2] ||
+ m->src_ip6.__u6_addr.__u6_addr32[3] ||
+ m->flow_id6;
+ } else {
+ return m->dst_ip || m->src_ip;
+ }
+}
+
+/* XXX we may want a better hash function */
+static uint32_t
+flow_id_hash(struct ipfw_flow_id *id)
+{
+ uint32_t i;
+
+ if (IS_IP6_FLOW_ID(id)) {
+ uint32_t *d = (uint32_t *)&id->dst_ip6;
+ uint32_t *s = (uint32_t *)&id->src_ip6;
+ i = (d[0] ) ^ (d[1]) ^
+ (d[2] ) ^ (d[3]) ^
+ (d[0] >> 15) ^ (d[1] >> 15) ^
+ (d[2] >> 15) ^ (d[3] >> 15) ^
+ (s[0] << 1) ^ (s[1] << 1) ^
+ (s[2] << 1) ^ (s[3] << 1) ^
+ (s[0] << 16) ^ (s[1] << 16) ^
+ (s[2] << 16) ^ (s[3] << 16) ^
+ (id->dst_port << 1) ^ (id->src_port) ^
+ (id->extra) ^
+ (id->proto ) ^ (id->flow_id6);
+ } else {
+ i = (id->dst_ip) ^ (id->dst_ip >> 15) ^
+ (id->src_ip << 1) ^ (id->src_ip >> 16) ^
+ (id->extra) ^
+ (id->dst_port << 1) ^ (id->src_port) ^ (id->proto);
+ }
+ return i;
+}
+
+/* Like bcmp, returns 0 if ids match, 1 otherwise. */
+static int
+flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2)
+{
+ int is_v6 = IS_IP6_FLOW_ID(id1);
+
+ if (!is_v6) {
+ if (IS_IP6_FLOW_ID(id2))
+ return 1; /* different address families */
+
+ return (id1->dst_ip == id2->dst_ip &&
+ id1->src_ip == id2->src_ip &&
+ id1->dst_port == id2->dst_port &&
+ id1->src_port == id2->src_port &&
+ id1->proto == id2->proto &&
+ id1->extra == id2->extra) ? 0 : 1;
+ }
+ /* the ipv6 case */
+ return (
+ !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) &&
+ !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) &&
+ id1->dst_port == id2->dst_port &&
+ id1->src_port == id2->src_port &&
+ id1->proto == id2->proto &&
+ id1->extra == id2->extra &&
+ id1->flow_id6 == id2->flow_id6) ? 0 : 1;
+}
+/*--------- end of flow-id mask, hash and compare ---------*/
+
+/*--- support functions for the qht hashtable ----
+ * Entries are hashed by flow-id
+ */
+static uint32_t
+q_hash(uintptr_t key, int flags, void *arg)
+{
+ /* compute the hash slot from the flow id */
+ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+ &((struct dn_queue *)key)->ni.fid :
+ (struct ipfw_flow_id *)key;
+
+ return flow_id_hash(id);
+}
+
+static int
+q_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+ struct dn_queue *o = (struct dn_queue *)obj;
+ struct ipfw_flow_id *id2;
+
+ if (flags & DNHT_KEY_IS_OBJ) {
+ /* compare pointers */
+ id2 = &((struct dn_queue *)key)->ni.fid;
+ } else {
+ id2 = (struct ipfw_flow_id *)key;
+ }
+ return (0 == flow_id_cmp(&o->ni.fid, id2));
+}
+
+/*
+ * create a new queue instance for the given 'key'.
+ */
+static void *
+q_new(uintptr_t key, int flags, void *arg)
+{
+ struct dn_queue *q, *template = arg;
+ struct dn_fsk *fs = template->fs;
+ int size = sizeof(*q) + fs->sched->fp->q_datalen;
+
+ q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (q == NULL) {
+ D("no memory for new queue");
+ return NULL;
+ }
+
+ set_oid(&q->ni.oid, DN_QUEUE, size);
+ if (fs->fs.flags & DN_QHT_HASH)
+ q->ni.fid = *(struct ipfw_flow_id *)key;
+ q->fs = fs;
+ q->_si = template->_si;
+ q->_si->q_count++;
+
+ if (fs->sched->fp->new_queue)
+ fs->sched->fp->new_queue(q);
+ dn_cfg.queue_count++;
+ return q;
+}
+
+/*
+ * Notify schedulers that a queue is going away.
+ * If (flags & DN_DESTROY), also free the packets.
+ * The version for callbacks is called q_delete_cb().
+ */
+static void
+dn_delete_queue(struct dn_queue *q, int flags)
+{
+ struct dn_fsk *fs = q->fs;
+
+ // D("fs %p si %p\n", fs, q->_si);
+ /* notify the parent scheduler that the queue is going away */
+ if (fs && fs->sched->fp->free_queue)
+ fs->sched->fp->free_queue(q);
+ q->_si->q_count--;
+ q->_si = NULL;
+ if (flags & DN_DESTROY) {
+ if (q->mq.head)
+ dn_free_pkts(q->mq.head);
+ bzero(q, sizeof(*q)); // safety
+ free(q, M_DUMMYNET);
+ dn_cfg.queue_count--;
+ }
+}
+
+static int
+q_delete_cb(void *q, void *arg)
+{
+ int flags = (int)(uintptr_t)arg;
+ dn_delete_queue(q, flags);
+ return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0;
+}
+
+/*
+ * calls dn_delete_queue/q_delete_cb on all queues,
+ * which notifies the parent scheduler and possibly drains packets.
+ * flags & DN_DESTROY: drains queues and destroy qht;
+ */
+static void
+qht_delete(struct dn_fsk *fs, int flags)
+{
+ ND("fs %d start flags %d qht %p",
+ fs->fs.fs_nr, flags, fs->qht);
+ if (!fs->qht)
+ return;
+ if (fs->fs.flags & DN_QHT_HASH) {
+ dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags);
+ if (flags & DN_DESTROY) {
+ dn_ht_free(fs->qht, 0);
+ fs->qht = NULL;
+ }
+ } else {
+ dn_delete_queue((struct dn_queue *)(fs->qht), flags);
+ if (flags & DN_DESTROY)
+ fs->qht = NULL;
+ }
+}
+
+/*
+ * Find and possibly create the queue for a MULTIQUEUE scheduler.
+ * We never call it for !MULTIQUEUE (the queue is in the sch_inst).
+ */
+struct dn_queue *
+ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si,
+ struct ipfw_flow_id *id)
+{
+ struct dn_queue template;
+
+ template._si = si;
+ template.fs = fs;
+
+ if (fs->fs.flags & DN_QHT_HASH) {
+ struct ipfw_flow_id masked_id;
+ if (fs->qht == NULL) {
+ fs->qht = dn_ht_init(NULL, fs->fs.buckets,
+ offsetof(struct dn_queue, q_next),
+ q_hash, q_match, q_new);
+ if (fs->qht == NULL)
+ return NULL;
+ }
+ masked_id = *id;
+ flow_id_mask(&fs->fsk_mask, &masked_id);
+ return dn_ht_find(fs->qht, (uintptr_t)&masked_id,
+ DNHT_INSERT, &template);
+ } else {
+ if (fs->qht == NULL)
+ fs->qht = q_new(0, 0, &template);
+ return (struct dn_queue *)fs->qht;
+ }
+}
+/*--- end of queue hash table ---*/
+
+/*--- support functions for the sch_inst hashtable ----
+ *
+ * These are hashed by flow-id
+ */
+static uint32_t
+si_hash(uintptr_t key, int flags, void *arg)
+{
+ /* compute the hash slot from the flow id */
+ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+ &((struct dn_sch_inst *)key)->ni.fid :
+ (struct ipfw_flow_id *)key;
+
+ return flow_id_hash(id);
+}
+
+static int
+si_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+ struct dn_sch_inst *o = obj;
+ struct ipfw_flow_id *id2;
+
+ id2 = (flags & DNHT_KEY_IS_OBJ) ?
+ &((struct dn_sch_inst *)key)->ni.fid :
+ (struct ipfw_flow_id *)key;
+ return flow_id_cmp(&o->ni.fid, id2) == 0;
+}
+
+/*
+ * create a new instance for the given 'key'
+ * Allocate memory for instance, delay line and scheduler private data.
+ */
+static void *
+si_new(uintptr_t key, int flags, void *arg)
+{
+ struct dn_schk *s = arg;
+ struct dn_sch_inst *si;
+ int l = sizeof(*si) + s->fp->si_datalen;
+
+ si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (si == NULL)
+ goto error;
+ /* Set length only for the part passed up to userland. */
+ set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow));
+ set_oid(&(si->dline.oid), DN_DELAY_LINE,
+ sizeof(struct delay_line));
+ /* mark si and dline as outside the event queue */
+ si->ni.oid.id = si->dline.oid.id = -1;
+
+ si->sched = s;
+ si->dline.si = si;
+
+ if (s->fp->new_sched && s->fp->new_sched(si)) {
+ D("new_sched error");
+ goto error;
+ }
+ if (s->sch.flags & DN_HAVE_MASK)
+ si->ni.fid = *(struct ipfw_flow_id *)key;
+
+ dn_cfg.si_count++;
+ return si;
+
+error:
+ if (si) {
+ bzero(si, sizeof(*si)); // safety
+ free(si, M_DUMMYNET);
+ }
+ return NULL;
+}
+
+/*
+ * Callback from siht to delete all scheduler instances. Remove
+ * si and delay line from the system heap, destroy all queues.
+ * We assume that all flowsets have been notified and do not
+ * point to us anymore.
+ */
+static int
+si_destroy(void *_si, void *arg)
+{
+ struct dn_sch_inst *si = _si;
+ struct dn_schk *s = si->sched;
+ struct delay_line *dl = &si->dline;
+
+ if (dl->oid.subtype) /* remove delay line from event heap */
+ heap_extract(&dn_cfg.evheap, dl);
+ dn_free_pkts(dl->mq.head); /* drain delay line */
+ if (si->kflags & DN_ACTIVE) /* remove si from event heap */
+ heap_extract(&dn_cfg.evheap, si);
+ if (s->fp->free_sched)
+ s->fp->free_sched(si);
+ bzero(si, sizeof(*si)); /* safety */
+ free(si, M_DUMMYNET);
+ dn_cfg.si_count--;
+ return DNHT_SCAN_DEL;
+}
+
+/*
+ * Find the scheduler instance for this packet. If we need to apply
+ * a mask, do it on a local copy of the flow_id to preserve the original.
+ * Assume siht is always initialized if we have a mask.
+ */
+struct dn_sch_inst *
+ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id)
+{
+
+ if (s->sch.flags & DN_HAVE_MASK) {
+ struct ipfw_flow_id id_t = *id;
+ flow_id_mask(&s->sch.sched_mask, &id_t);
+ return dn_ht_find(s->siht, (uintptr_t)&id_t,
+ DNHT_INSERT, s);
+ }
+ if (!s->siht)
+ s->siht = si_new(0, 0, s);
+ return (struct dn_sch_inst *)s->siht;
+}
+
+/* callback to flush credit for the scheduler instance */
+static int
+si_reset_credit(void *_si, void *arg)
+{
+ struct dn_sch_inst *si = _si;
+ struct dn_link *p = &si->sched->link;
+
+ si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0);
+ return 0;
+}
+
+static void
+schk_reset_credit(struct dn_schk *s)
+{
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, si_reset_credit, NULL);
+ else if (s->siht)
+ si_reset_credit(s->siht, NULL);
+}
+/*---- end of sch_inst hashtable ---------------------*/
+
+/*-------------------------------------------------------
+ * flowset hash (fshash) support. Entries are hashed by fs_nr.
+ * New allocations are put in the fsunlinked list, from which
+ * they are removed when they point to a specific scheduler.
+ */
+static uint32_t
+fsk_hash(uintptr_t key, int flags, void *arg)
+{
+ uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_fsk *)key)->fs.fs_nr;
+
+ return ( (i>>8)^(i>>4)^i );
+}
+
+static int
+fsk_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+ struct dn_fsk *fs = obj;
+ int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_fsk *)key)->fs.fs_nr;
+
+ return (fs->fs.fs_nr == i);
+}
+
+static void *
+fsk_new(uintptr_t key, int flags, void *arg)
+{
+ struct dn_fsk *fs;
+
+ fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (fs) {
+ set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs));
+ dn_cfg.fsk_count++;
+ fs->drain_bucket = 0;
+ SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
+ }
+ return fs;
+}
+
+/*
+ * detach flowset from its current scheduler. Flags as follows:
+ * DN_DETACH removes from the fsk_list
+ * DN_DESTROY deletes individual queues
+ * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked).
+ */
+static void
+fsk_detach(struct dn_fsk *fs, int flags)
+{
+ if (flags & DN_DELETE_FS)
+ flags |= DN_DESTROY;
+ ND("fs %d from sched %d flags %s %s %s",
+ fs->fs.fs_nr, fs->fs.sched_nr,
+ (flags & DN_DELETE_FS) ? "DEL_FS":"",
+ (flags & DN_DESTROY) ? "DEL":"",
+ (flags & DN_DETACH) ? "DET":"");
+ if (flags & DN_DETACH) { /* detach from the list */
+ struct dn_fsk_head *h;
+ h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu;
+ SLIST_REMOVE(h, fs, dn_fsk, sch_chain);
+ }
+ /* Free the RED parameters, they will be recomputed on
+ * subsequent attach if needed.
+ */
+ if (fs->w_q_lookup)
+ free(fs->w_q_lookup, M_DUMMYNET);
+ fs->w_q_lookup = NULL;
+ qht_delete(fs, flags);
+ if (fs->sched && fs->sched->fp->free_fsk)
+ fs->sched->fp->free_fsk(fs);
+ fs->sched = NULL;
+ if (flags & DN_DELETE_FS) {
+ bzero(fs, sizeof(*fs)); /* safety */
+ free(fs, M_DUMMYNET);
+ dn_cfg.fsk_count--;
+ } else {
+ SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
+ }
+}
+
+/*
+ * Detach or destroy all flowsets in a list.
+ * flags specifies what to do:
+ * DN_DESTROY: flush all queues
+ * DN_DELETE_FS: DN_DESTROY + destroy flowset
+ * DN_DELETE_FS implies DN_DESTROY
+ */
+static void
+fsk_detach_list(struct dn_fsk_head *h, int flags)
+{
+ struct dn_fsk *fs;
+ int n = 0; /* only for stats */
+
+ ND("head %p flags %x", h, flags);
+ while ((fs = SLIST_FIRST(h))) {
+ SLIST_REMOVE_HEAD(h, sch_chain);
+ n++;
+ fsk_detach(fs, flags);
+ }
+ ND("done %d flowsets", n);
+}
+
+/*
+ * called on 'queue X delete' -- removes the flowset from fshash,
+ * deletes all queues for the flowset, and removes the flowset.
+ */
+static int
+delete_fs(int i, int locked)
+{
+ struct dn_fsk *fs;
+ int err = 0;
+
+ if (!locked)
+ DN_BH_WLOCK();
+ fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL);
+ ND("fs %d found %p", i, fs);
+ if (fs) {
+ fsk_detach(fs, DN_DETACH | DN_DELETE_FS);
+ err = 0;
+ } else
+ err = EINVAL;
+ if (!locked)
+ DN_BH_WUNLOCK();
+ return err;
+}
+
+/*----- end of flowset hashtable support -------------*/
+
+/*------------------------------------------------------------
+ * Scheduler hash. When searching by index we pass sched_nr,
+ * otherwise we pass struct dn_sch * which is the first field in
+ * struct dn_schk so we can cast between the two. We use this trick
+ * because in the create phase (but it should be fixed).
+ */
+static uint32_t
+schk_hash(uintptr_t key, int flags, void *_arg)
+{
+ uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_schk *)key)->sch.sched_nr;
+ return ( (i>>8)^(i>>4)^i );
+}
+
+static int
+schk_match(void *obj, uintptr_t key, int flags, void *_arg)
+{
+ struct dn_schk *s = (struct dn_schk *)obj;
+ int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_schk *)key)->sch.sched_nr;
+ return (s->sch.sched_nr == i);
+}
+
+/*
+ * Create the entry and initialize it with the sched hash if needed.
+ * Leave s->fp unset so we can tell whether a dn_ht_find() returns
+ * a new object or a previously existing one.
+ */
+static void *
+schk_new(uintptr_t key, int flags, void *arg)
+{
+ struct schk_new_arg *a = arg;
+ struct dn_schk *s;
+ int l = sizeof(*s) +a->fp->schk_datalen;
+
+ s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (s == NULL)
+ return NULL;
+ set_oid(&s->link.oid, DN_LINK, sizeof(s->link));
+ s->sch = *a->sch; // copy initial values
+ s->link.link_nr = s->sch.sched_nr;
+ SLIST_INIT(&s->fsk_list);
+ /* initialize the hash table or create the single instance */
+ s->fp = a->fp; /* si_new needs this */
+ s->drain_bucket = 0;
+ if (s->sch.flags & DN_HAVE_MASK) {
+ s->siht = dn_ht_init(NULL, s->sch.buckets,
+ offsetof(struct dn_sch_inst, si_next),
+ si_hash, si_match, si_new);
+ if (s->siht == NULL) {
+ free(s, M_DUMMYNET);
+ return NULL;
+ }
+ }
+ s->fp = NULL; /* mark as a new scheduler */
+ dn_cfg.schk_count++;
+ return s;
+}
+
+/*
+ * Callback for sched delete. Notify all attached flowsets to
+ * detach from the scheduler, destroy the internal flowset, and
+ * all instances. The scheduler goes away too.
+ * arg is 0 (only detach flowsets and destroy instances)
+ * DN_DESTROY (detach & delete queues, delete schk)
+ * or DN_DELETE_FS (delete queues and flowsets, delete schk)
+ */
+static int
+schk_delete_cb(void *obj, void *arg)
+{
+ struct dn_schk *s = obj;
+#if 0
+ int a = (int)arg;
+ ND("sched %d arg %s%s",
+ s->sch.sched_nr,
+ a&DN_DESTROY ? "DEL ":"",
+ a&DN_DELETE_FS ? "DEL_FS":"");
+#endif
+ fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0);
+ /* no more flowset pointing to us now */
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, si_destroy, NULL);
+ else if (s->siht)
+ si_destroy(s->siht, NULL);
+ if (s->profile) {
+ free(s->profile, M_DUMMYNET);
+ s->profile = NULL;
+ }
+ s->siht = NULL;
+ if (s->fp->destroy)
+ s->fp->destroy(s);
+ bzero(s, sizeof(*s)); // safety
+ free(obj, M_DUMMYNET);
+ dn_cfg.schk_count--;
+ return DNHT_SCAN_DEL;
+}
+
+/*
+ * called on a 'sched X delete' command. Deletes a single scheduler.
+ * This is done by removing from the schedhash, unlinking all
+ * flowsets and deleting their traffic.
+ */
+static int
+delete_schk(int i)
+{
+ struct dn_schk *s;
+
+ s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+ ND("%d %p", i, s);
+ if (!s)
+ return EINVAL;
+ delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */
+ /* then detach flowsets, delete traffic */
+ schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY);
+ return 0;
+}
+/*--- end of schk hashtable support ---*/
+
+static int
+copy_obj(char **start, char *end, void *_o, const char *msg, int i)
+{
+ struct dn_id *o = _o;
+ int have = end - *start;
+
+ if (have < o->len || o->len == 0 || o->type == 0) {
+ D("(WARN) type %d %s %d have %d need %d",
+ o->type, msg, i, have, o->len);
+ return 1;
+ }
+ ND("type %d %s %d len %d", o->type, msg, i, o->len);
+ bcopy(_o, *start, o->len);
+ if (o->type == DN_LINK) {
+ /* Adjust burst parameter for link */
+ struct dn_link *l = (struct dn_link *)*start;
+ l->burst = div64(l->burst, 8 * hz);
+ } else if (o->type == DN_SCH) {
+ /* Set id->id to the number of instances */
+ struct dn_schk *s = _o;
+ struct dn_id *id = (struct dn_id *)(*start);
+ id->id = (s->sch.flags & DN_HAVE_MASK) ?
+ dn_ht_entries(s->siht) : (s->siht ? 1 : 0);
+ }
+ *start += o->len;
+ return 0;
+}
+
+/* Specific function to copy a queue.
+ * Copies only the user-visible part of a queue (which is in
+ * a struct dn_flow), and sets len accordingly.
+ */
+static int
+copy_obj_q(char **start, char *end, void *_o, const char *msg, int i)
+{
+ struct dn_id *o = _o;
+ int have = end - *start;
+ int len = sizeof(struct dn_flow); /* see above comment */
+
+ if (have < len || o->len == 0 || o->type != DN_QUEUE) {
+ D("ERROR type %d %s %d have %d need %d",
+ o->type, msg, i, have, len);
+ return 1;
+ }
+ ND("type %d %s %d len %d", o->type, msg, i, len);
+ bcopy(_o, *start, len);
+ ((struct dn_id*)(*start))->len = len;
+ *start += len;
+ return 0;
+}
+
+static int
+copy_q_cb(void *obj, void *arg)
+{
+ struct dn_queue *q = obj;
+ struct copy_args *a = arg;
+ struct dn_flow *ni = (struct dn_flow *)(*a->start);
+ if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1))
+ return DNHT_SCAN_END;
+ ni->oid.type = DN_FLOW; /* override the DN_QUEUE */
+ ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL);
+ return 0;
+}
+
+static int
+copy_q(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+ if (!fs->qht)
+ return 0;
+ if (fs->fs.flags & DN_QHT_HASH)
+ dn_ht_scan(fs->qht, copy_q_cb, a);
+ else
+ copy_q_cb(fs->qht, a);
+ return 0;
+}
+
+/*
+ * This routine only copies the initial part of a profile. XXX
+ */
+static int
+copy_profile(struct copy_args *a, struct dn_profile *p)
+{
+ int have = a->end - *a->start;
+ /* XXX here we check for max length */
+ int profile_len = sizeof(struct dn_profile) -
+ ED_MAX_SAMPLES_NO*sizeof(int);
+
+ if (p == NULL)
+ return 0;
+ if (have < profile_len) {
+ D("error have %d need %d", have, profile_len);
+ return 1;
+ }
+ bcopy(p, *a->start, profile_len);
+ ((struct dn_id *)(*a->start))->len = profile_len;
+ *a->start += profile_len;
+ return 0;
+}
+
+static int
+copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+ struct dn_fs *ufs = (struct dn_fs *)(*a->start);
+ if (!fs)
+ return 0;
+ ND("flowset %d", fs->fs.fs_nr);
+ if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr))
+ return DNHT_SCAN_END;
+ ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ?
+ dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0);
+ if (flags) { /* copy queues */
+ copy_q(a, fs, 0);
+ }
+ return 0;
+}
+
+static int
+copy_si_cb(void *obj, void *arg)
+{
+ struct dn_sch_inst *si = obj;
+ struct copy_args *a = arg;
+ struct dn_flow *ni = (struct dn_flow *)(*a->start);
+ if (copy_obj(a->start, a->end, &si->ni, "inst",
+ si->sched->sch.sched_nr))
+ return DNHT_SCAN_END;
+ ni->oid.type = DN_FLOW; /* override the DN_SCH_I */
+ ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL);
+ return 0;
+}
+
+static int
+copy_si(struct copy_args *a, struct dn_schk *s, int flags)
+{
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, copy_si_cb, a);
+ else if (s->siht)
+ copy_si_cb(s->siht, a);
+ return 0;
+}
+
+/*
+ * compute a list of children of a scheduler and copy up
+ */
+static int
+copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags)
+{
+ struct dn_fsk *fs;
+ struct dn_id *o;
+ uint32_t *p;
+
+ int n = 0, space = sizeof(*o);
+ SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
+ if (fs->fs.fs_nr < DN_MAX_ID)
+ n++;
+ }
+ space += n * sizeof(uint32_t);
+ DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n);
+ if (a->end - *(a->start) < space)
+ return DNHT_SCAN_END;
+ o = (struct dn_id *)(*(a->start));
+ o->len = space;
+ *a->start += o->len;
+ o->type = DN_TEXT;
+ p = (uint32_t *)(o+1);
+ SLIST_FOREACH(fs, &s->fsk_list, sch_chain)
+ if (fs->fs.fs_nr < DN_MAX_ID)
+ *p++ = fs->fs.fs_nr;
+ return 0;
+}
+
+static int
+copy_data_helper(void *_o, void *_arg)
+{
+ struct copy_args *a = _arg;
+ uint32_t *r = a->extra->r; /* start of first range */
+ uint32_t *lim; /* first invalid pointer */
+ int n;
+
+ lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len);
+
+ if (a->type == DN_LINK || a->type == DN_SCH) {
+ /* pipe|sched show, we receive a dn_schk */
+ struct dn_schk *s = _o;
+
+ n = s->sch.sched_nr;
+ if (a->type == DN_SCH && n >= DN_MAX_ID)
+ return 0; /* not a scheduler */
+ if (a->type == DN_LINK && n <= DN_MAX_ID)
+ return 0; /* not a pipe */
+
+ /* see if the object is within one of our ranges */
+ for (;r < lim; r += 2) {
+ if (n < r[0] || n > r[1])
+ continue;
+ /* Found a valid entry, copy and we are done */
+ if (a->flags & DN_C_LINK) {
+ if (copy_obj(a->start, a->end,
+ &s->link, "link", n))
+ return DNHT_SCAN_END;
+ if (copy_profile(a, s->profile))
+ return DNHT_SCAN_END;
+ if (copy_flowset(a, s->fs, 0))
+ return DNHT_SCAN_END;
+ }
+ if (a->flags & DN_C_SCH) {
+ if (copy_obj(a->start, a->end,
+ &s->sch, "sched", n))
+ return DNHT_SCAN_END;
+ /* list all attached flowsets */
+ if (copy_fsk_list(a, s, 0))
+ return DNHT_SCAN_END;
+ }
+ if (a->flags & DN_C_FLOW)
+ copy_si(a, s, 0);
+ break;
+ }
+ } else if (a->type == DN_FS) {
+ /* queue show, skip internal flowsets */
+ struct dn_fsk *fs = _o;
+
+ n = fs->fs.fs_nr;
+ if (n >= DN_MAX_ID)
+ return 0;
+ /* see if the object is within one of our ranges */
+ for (;r < lim; r += 2) {
+ if (n < r[0] || n > r[1])
+ continue;
+ if (copy_flowset(a, fs, 0))
+ return DNHT_SCAN_END;
+ copy_q(a, fs, 0);
+ break; /* we are done */
+ }
+ }
+ return 0;
+}
+
+static inline struct dn_schk *
+locate_scheduler(int i)
+{
+ return dn_ht_find(dn_cfg.schedhash, i, 0, NULL);
+}
+
+/*
+ * RED parameters are in fixed-point arithmetic.
+ */
+static int
+config_red(struct dn_fsk *fs)
+{
+ int64_t s, idle, weight, w0;
+ int t, i;
+
+ fs->w_q = fs->fs.w_q;
+ fs->max_p = fs->fs.max_p;
+ D("called");
+ /* Doing stuff that was in userland */
+ i = fs->sched->link.bandwidth;
+ s = (i <= 0) ? 0 :
+ hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i;
+
+ idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */
+ fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth);
+ /* fs->lookup_step not scaled, */
+ if (!fs->lookup_step)
+ fs->lookup_step = 1;
+ w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled
+
+ for (t = fs->lookup_step; t > 1; --t)
+ weight = SCALE_MUL(weight, w0);
+ fs->lookup_weight = (int)(weight); // scaled
+
+ /* Now doing stuff that was in kerneland */
+ fs->min_th = SCALE(fs->fs.min_th);
+ fs->max_th = SCALE(fs->fs.max_th);
+
+ fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th);
+ fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th));
+
+ if (fs->fs.flags & DN_IS_GENTLE_RED) {
+ fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th;
+ fs->c_4 = SCALE(1) - 2 * fs->max_p;
+ }
+
+ /* If the lookup table already exists, free it and create it again. */
+ if (fs->w_q_lookup) {
+ free(fs->w_q_lookup, M_DUMMYNET);
+ fs->w_q_lookup = NULL;
+ }
+ if (dn_cfg.red_lookup_depth == 0) {
+ printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
+ "must be > 0\n");
+ fs->fs.flags &= ~DN_IS_RED;
+ fs->fs.flags &= ~DN_IS_GENTLE_RED;
+ return (EINVAL);
+ }
+ fs->lookup_depth = dn_cfg.red_lookup_depth;
+ fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int),
+ M_DUMMYNET, M_NOWAIT);
+ if (fs->w_q_lookup == NULL) {
+ printf("dummynet: sorry, cannot allocate red lookup table\n");
+ fs->fs.flags &= ~DN_IS_RED;
+ fs->fs.flags &= ~DN_IS_GENTLE_RED;
+ return(ENOSPC);
+ }
+
+ /* Fill the lookup table with (1 - w_q)^x */
+ fs->w_q_lookup[0] = SCALE(1) - fs->w_q;
+
+ for (i = 1; i < fs->lookup_depth; i++)
+ fs->w_q_lookup[i] =
+ SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight);
+
+ if (dn_cfg.red_avg_pkt_size < 1)
+ dn_cfg.red_avg_pkt_size = 512;
+ fs->avg_pkt_size = dn_cfg.red_avg_pkt_size;
+ if (dn_cfg.red_max_pkt_size < 1)
+ dn_cfg.red_max_pkt_size = 1500;
+ fs->max_pkt_size = dn_cfg.red_max_pkt_size;
+ D("exit");
+ return 0;
+}
+
+/* Scan all flowsets attached to this scheduler and update RED */
+static void
+update_red(struct dn_schk *s)
+{
+ struct dn_fsk *fs;
+ SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
+ if (fs && (fs->fs.flags & DN_IS_RED))
+ config_red(fs);
+ }
+}
+
+/* attach flowset to scheduler s, possibly requeue */
+static void
+fsk_attach(struct dn_fsk *fs, struct dn_schk *s)
+{
+ ND("remove fs %d from fsunlinked, link to sched %d",
+ fs->fs.fs_nr, s->sch.sched_nr);
+ SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain);
+ fs->sched = s;
+ SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain);
+ if (s->fp->new_fsk)
+ s->fp->new_fsk(fs);
+ /* XXX compute fsk_mask */
+ fs->fsk_mask = fs->fs.flow_mask;
+ if (fs->sched->sch.flags & DN_HAVE_MASK)
+ flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask);
+ if (fs->qht) {
+ /*
+ * we must drain qht according to the old
+ * type, and reinsert according to the new one.
+ * The requeue is complex -- in general we need to
+ * reclassify every single packet.
+ * For the time being, let's hope qht is never set
+ * when we reach this point.
+ */
+ D("XXX TODO requeue from fs %d to sch %d",
+ fs->fs.fs_nr, s->sch.sched_nr);
+ fs->qht = NULL;
+ }
+ /* set the new type for qht */
+ if (nonzero_mask(&fs->fsk_mask))
+ fs->fs.flags |= DN_QHT_HASH;
+ else
+ fs->fs.flags &= ~DN_QHT_HASH;
+
+ /* XXX config_red() can fail... */
+ if (fs->fs.flags & DN_IS_RED)
+ config_red(fs);
+}
+
+/* update all flowsets which may refer to this scheduler */
+static void
+update_fs(struct dn_schk *s)
+{
+ struct dn_fsk *fs, *tmp;
+
+ SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) {
+ if (s->sch.sched_nr != fs->fs.sched_nr) {
+ D("fs %d for sch %d not %d still unlinked",
+ fs->fs.fs_nr, fs->fs.sched_nr,
+ s->sch.sched_nr);
+ continue;
+ }
+ fsk_attach(fs, s);
+ }
+}
+
+/*
+ * Configuration -- to preserve backward compatibility we use
+ * the following scheme (N is 65536)
+ * NUMBER SCHED LINK FLOWSET
+ * 1 .. N-1 (1)WFQ (2)WFQ (3)queue
+ * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1
+ * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1
+ *
+ * "pipe i config" configures #1, #2 and #3
+ * "sched i config" configures #1 and possibly #6
+ * "queue i config" configures #3
+ * #1 is configured with 'pipe i config' or 'sched i config'
+ * #2 is configured with 'pipe i config', and created if not
+ * existing with 'sched i config'
+ * #3 is configured with 'queue i config'
+ * #4 is automatically configured after #1, can only be FIFO
+ * #5 is automatically configured after #2
+ * #6 is automatically created when #1 is !MULTIQUEUE,
+ * and can be updated.
+ * #7 is automatically configured after #2
+ */
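+
+/*
+ * Example of the mapping above (N = 65536): "pipe 10 config" touches
+ * scheduler 10 and link 10 (#1, #2) and flowset 10 (#3); the companion
+ * FIFO scheduler and link are number 65546 (#4, #5). If scheduler 10 is
+ * not MULTIQUEUE, its internal flowset is 65546 (#6), and the flowset
+ * serving FIFO scheduler 65546 is 131082 (#7). The numbers are only
+ * meant to illustrate the table, not additional configuration state.
+ */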
+
+/*
+ * configure a link (and its FIFO instance)
+ */
+static int
+config_link(struct dn_link *p, struct dn_id *arg)
+{
+ int i;
+
+ if (p->oid.len != sizeof(*p)) {
+ D("invalid pipe len %d", p->oid.len);
+ return EINVAL;
+ }
+ i = p->link_nr;
+ if (i <= 0 || i >= DN_MAX_ID)
+ return EINVAL;
+ /*
+ * The config program passes parameters as follows:
+ * bw = bits/second (0 means no limits),
+ * delay = ms, must be translated into ticks.
+ * qsize = slots/bytes
+ * burst ???
+ */
+ p->delay = (p->delay * hz) / 1000;
+ /* Scale burst size: bytes -> bits * hz */
+ p->burst *= 8 * hz;
+
+ DN_BH_WLOCK();
+ /* do it twice, base link and FIFO link */
+ for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) {
+ struct dn_schk *s = locate_scheduler(i);
+ if (s == NULL) {
+ DN_BH_WUNLOCK();
+ D("sched %d not found", i);
+ return EINVAL;
+ }
+ /* remove profile if exists */
+ if (s->profile) {
+ free(s->profile, M_DUMMYNET);
+ s->profile = NULL;
+ }
+ /* copy all parameters */
+ s->link.oid = p->oid;
+ s->link.link_nr = i;
+ s->link.delay = p->delay;
+ if (s->link.bandwidth != p->bandwidth) {
+ /* XXX bandwidth changes, need to update red params */
+ s->link.bandwidth = p->bandwidth;
+ update_red(s);
+ }
+ s->link.burst = p->burst;
+ schk_reset_credit(s);
+ }
+ dn_cfg.id++;
+ DN_BH_WUNLOCK();
+ return 0;
+}
+
+/*
+ * Configure a flowset. Can be called from inside with locked=1.
+ */
+static struct dn_fsk *
+config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
+{
+ int i;
+ struct dn_fsk *fs;
+
+ if (nfs->oid.len != sizeof(*nfs)) {
+ D("invalid flowset len %d", nfs->oid.len);
+ return NULL;
+ }
+ i = nfs->fs_nr;
+ if (i <= 0 || i >= 3*DN_MAX_ID)
+ return NULL;
+ ND("flowset %d", i);
+ /* XXX other sanity checks */
+ if (nfs->flags & DN_QSIZE_BYTES) {
+ ipdn_bound_var(&nfs->qsize, 16384,
+ 1500, dn_cfg.byte_limit, NULL); // "queue byte size");
+ } else {
+ ipdn_bound_var(&nfs->qsize, 50,
+ 1, dn_cfg.slot_limit, NULL); // "queue slot size");
+ }
+ if (nfs->flags & DN_HAVE_MASK) {
+ /* make sure we have some buckets */
+ ipdn_bound_var(&nfs->buckets, dn_cfg.hash_size,
+ 1, dn_cfg.max_hash_size, "flowset buckets");
+ } else {
+ nfs->buckets = 1; /* we only need 1 */
+ }
+ if (!locked)
+ DN_BH_WLOCK();
+ do { /* exit with break when done */
+ struct dn_schk *s;
+ int flags = nfs->sched_nr ? DNHT_INSERT : 0;
+ int j;
+ int oldc = dn_cfg.fsk_count;
+ fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL);
+ if (fs == NULL) {
+ D("missing sched for flowset %d", i);
+ break;
+ }
+ /* grab some defaults from the existing one */
+ if (nfs->sched_nr == 0) /* reuse */
+ nfs->sched_nr = fs->fs.sched_nr;
+ for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) {
+ if (nfs->par[j] == -1) /* reuse */
+ nfs->par[j] = fs->fs.par[j];
+ }
+ if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) {
+ ND("flowset %d unchanged", i);
+ break; /* no change, nothing to do */
+ }
+ if (oldc != dn_cfg.fsk_count) /* new item */
+ dn_cfg.id++;
+ s = locate_scheduler(nfs->sched_nr);
+ /* detach from old scheduler if needed, preserving
+ * queues if we need to reattach. Then update the
+ * configuration, and possibly attach to the new sched.
+ */
+ DX(2, "fs %d changed sched %d@%p to %d@%p",
+ fs->fs.fs_nr,
+ fs->fs.sched_nr, fs->sched, nfs->sched_nr, s);
+ if (fs->sched) {
+ int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY);
+ flags |= DN_DESTROY; /* XXX temporary */
+ fsk_detach(fs, flags);
+ }
+ fs->fs = *nfs; /* copy configuration */
+ if (s != NULL)
+ fsk_attach(fs, s);
+ } while (0);
+ if (!locked)
+ DN_BH_WUNLOCK();
+ return fs;
+}
+
+/*
+ * config/reconfig a scheduler and its FIFO variant.
+ * For !MULTIQUEUE schedulers, also set up the flowset.
+ *
+ * On reconfigurations (detected because s->fp is set),
+ * detach the existing flowsets (preserving traffic), preserve the link,
+ * and delete the old scheduler, creating a new one.
+ */
+static int
+config_sched(struct dn_sch *_nsch, struct dn_id *arg)
+{
+ struct dn_schk *s;
+ struct schk_new_arg a; /* argument for schk_new */
+ int i;
+ struct dn_link p; /* copy of oldlink */
+ struct dn_profile *pf = NULL; /* copy of old link profile */
+	/* Used to preserve the mask parameter */
+ struct ipfw_flow_id new_mask;
+ int new_buckets = 0;
+ int new_flags = 0;
+ int pipe_cmd;
+ int err = ENOMEM;
+
+ a.sch = _nsch;
+ if (a.sch->oid.len != sizeof(*a.sch)) {
+ D("bad sched len %d", a.sch->oid.len);
+ return EINVAL;
+ }
+ i = a.sch->sched_nr;
+ if (i <= 0 || i >= DN_MAX_ID)
+ return EINVAL;
+ /* make sure we have some buckets */
+ if (a.sch->flags & DN_HAVE_MASK)
+ ipdn_bound_var(&a.sch->buckets, dn_cfg.hash_size,
+ 1, dn_cfg.max_hash_size, "sched buckets");
+ /* XXX other sanity checks */
+ bzero(&p, sizeof(p));
+
+ pipe_cmd = a.sch->flags & DN_PIPE_CMD;
+	a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if it is not set?
+ if (pipe_cmd) {
+ /* Copy mask parameter */
+ new_mask = a.sch->sched_mask;
+ new_buckets = a.sch->buckets;
+ new_flags = a.sch->flags;
+ }
+ DN_BH_WLOCK();
+again: /* run twice, for wfq and fifo */
+ /*
+	 * Look up the type. If not supplied, use the previous one
+	 * or default to WF2Q+; if an unknown type is given, return an error.
+ */
+ dn_cfg.id++;
+ a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name);
+ if (a.fp != NULL) {
+ /* found. Lookup or create entry */
+ s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a);
+ } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) {
+		/* No type. Search for an existing scheduler or retry with WF2Q+ */
+ s = dn_ht_find(dn_cfg.schedhash, i, 0, &a);
+ if (s != NULL) {
+ a.fp = s->fp;
+ /* Scheduler exists, skip to FIFO scheduler
+ * if command was pipe config...
+ */
+ if (pipe_cmd)
+ goto next;
+ } else {
+ /* New scheduler, create a wf2q+ with no mask
+ * if command was pipe config...
+ */
+ if (pipe_cmd) {
+ /* clear mask parameter */
+ bzero(&a.sch->sched_mask, sizeof(new_mask));
+ a.sch->buckets = 0;
+ a.sch->flags &= ~DN_HAVE_MASK;
+ }
+ a.sch->oid.subtype = DN_SCHED_WF2QP;
+ goto again;
+ }
+ } else {
+ D("invalid scheduler type %d %s",
+ a.sch->oid.subtype, a.sch->name);
+ err = EINVAL;
+ goto error;
+ }
+ /* normalize name and subtype */
+ a.sch->oid.subtype = a.fp->type;
+ bzero(a.sch->name, sizeof(a.sch->name));
+ strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name));
+ if (s == NULL) {
+ D("cannot allocate scheduler %d", i);
+ goto error;
+ }
+ /* restore existing link if any */
+ if (p.link_nr) {
+ s->link = p;
+ if (!pf || pf->link_nr != p.link_nr) { /* no saved value */
+ s->profile = NULL; /* XXX maybe not needed */
+ } else {
+ s->profile = malloc(sizeof(struct dn_profile),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (s->profile == NULL) {
+ D("cannot allocate profile");
+ goto error; //XXX
+ }
+ bcopy(pf, s->profile, sizeof(*pf));
+ }
+ }
+ p.link_nr = 0;
+ if (s->fp == NULL) {
+ DX(2, "sched %d new type %s", i, a.fp->name);
+ } else if (s->fp != a.fp ||
+ bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) {
+ /* already existing. */
+ DX(2, "sched %d type changed from %s to %s",
+ i, s->fp->name, a.fp->name);
+ DX(4, " type/sub %d/%d -> %d/%d",
+ s->sch.oid.type, s->sch.oid.subtype,
+ a.sch->oid.type, a.sch->oid.subtype);
+ if (s->link.link_nr == 0)
+ D("XXX WARNING link 0 for sched %d", i);
+ p = s->link; /* preserve link */
+ if (s->profile) {/* preserve profile */
+ if (!pf)
+ pf = malloc(sizeof(*pf),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (pf) /* XXX should issue a warning otherwise */
+ bcopy(s->profile, pf, sizeof(*pf));
+ }
+ /* remove from the hash */
+ dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+ /* Detach flowsets, preserve queues. */
+ // schk_delete_cb(s, NULL);
+ // XXX temporarily, kill queues
+ schk_delete_cb(s, (void *)DN_DESTROY);
+ goto again;
+ } else {
+ DX(4, "sched %d unchanged type %s", i, a.fp->name);
+ }
+ /* complete initialization */
+ s->sch = *a.sch;
+ s->fp = a.fp;
+ s->cfg = arg;
+ // XXX schk_reset_credit(s);
+ /* create the internal flowset if needed,
+ * trying to reuse existing ones if available
+ */
+ if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) {
+ s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL);
+ if (!s->fs) {
+ struct dn_fs fs;
+ bzero(&fs, sizeof(fs));
+ set_oid(&fs.oid, DN_FS, sizeof(fs));
+ fs.fs_nr = i + DN_MAX_ID;
+ fs.sched_nr = i;
+ s->fs = config_fs(&fs, NULL, 1 /* locked */);
+ }
+ if (!s->fs) {
+ schk_delete_cb(s, (void *)DN_DESTROY);
+ D("error creating internal fs for %d", i);
+ goto error;
+ }
+ }
+ /* call init function after the flowset is created */
+ if (s->fp->config)
+ s->fp->config(s);
+ update_fs(s);
+next:
+ if (i < DN_MAX_ID) { /* now configure the FIFO instance */
+ i += DN_MAX_ID;
+ if (pipe_cmd) {
+ /* Restore mask parameter for FIFO */
+ a.sch->sched_mask = new_mask;
+ a.sch->buckets = new_buckets;
+ a.sch->flags = new_flags;
+ } else {
+ /* sched config shouldn't modify the FIFO scheduler */
+ if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) {
+				/* FIFO already exists, don't touch it */
+ err = 0; /* and this is not an error */
+ goto error;
+ }
+ }
+ a.sch->sched_nr = i;
+ a.sch->oid.subtype = DN_SCHED_FIFO;
+ bzero(a.sch->name, sizeof(a.sch->name));
+ goto again;
+ }
+ err = 0;
+error:
+ DN_BH_WUNLOCK();
+ if (pf)
+ free(pf, M_DUMMYNET);
+ return err;
+}
+
+/*
+ * attach a profile to a link
+ */
+static int
+config_profile(struct dn_profile *pf, struct dn_id *arg)
+{
+ struct dn_schk *s;
+ int i, olen, err = 0;
+
+ if (pf->oid.len < sizeof(*pf)) {
+ D("short profile len %d", pf->oid.len);
+ return EINVAL;
+ }
+ i = pf->link_nr;
+ if (i <= 0 || i >= DN_MAX_ID)
+ return EINVAL;
+ /* XXX other sanity checks */
+ DN_BH_WLOCK();
+ for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) {
+ s = locate_scheduler(i);
+
+ if (s == NULL) {
+ err = EINVAL;
+ break;
+ }
+ dn_cfg.id++;
+ /*
+ * If we had a profile and the new one does not fit,
+ * or it is deleted, then we need to free memory.
+ */
+ if (s->profile && (pf->samples_no == 0 ||
+ s->profile->oid.len < pf->oid.len)) {
+ free(s->profile, M_DUMMYNET);
+ s->profile = NULL;
+ }
+ if (pf->samples_no == 0)
+ continue;
+ /*
+ * new profile, possibly allocate memory
+ * and copy data.
+ */
+ if (s->profile == NULL)
+ s->profile = malloc(pf->oid.len,
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (s->profile == NULL) {
+ D("no memory for profile %d", i);
+ err = ENOMEM;
+ break;
+ }
+ /* preserve larger length XXX double check */
+ olen = s->profile->oid.len;
+ if (olen < pf->oid.len)
+ olen = pf->oid.len;
+ bcopy(pf, s->profile, pf->oid.len);
+ s->profile->oid.len = olen;
+ }
+ DN_BH_WUNLOCK();
+ return err;
+}
+
+/*
+ * Delete all objects.
+ */
+static void
+dummynet_flush(void)
+{
+
+ /* delete all schedulers and related links/queues/flowsets */
+ dn_ht_scan(dn_cfg.schedhash, schk_delete_cb,
+ (void *)(uintptr_t)DN_DELETE_FS);
+ /* delete all remaining (unlinked) flowsets */
+ DX(4, "still %d unlinked fs", dn_cfg.fsk_count);
+ dn_ht_free(dn_cfg.fshash, DNHT_REMOVE);
+ fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS);
+ /* Reinitialize system heap... */
+ heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
+}
+
+/*
+ * Main handler for configuration. We are guaranteed to be called
+ * with an oid which is at least a dn_id.
+ * - the first object is the command (config, delete, flush, ...)
+ * - config_link must be issued after the corresponding config_sched
+ * - parameters (DN_TEXT) for an object must precede the object,
+ *   and are processed on a config_sched.
+ */
+int
+do_config(void *p, int l)
+{
+ struct dn_id *next, *o;
+ int err = 0, err2 = 0;
+ struct dn_id *arg = NULL;
+ uintptr_t *a;
+
+ o = p;
+ if (o->id != DN_API_VERSION) {
+ D("invalid api version got %d need %d",
+ o->id, DN_API_VERSION);
+ return EINVAL;
+ }
+ for (; l >= sizeof(*o); o = next) {
+ struct dn_id *prev = arg;
+ if (o->len < sizeof(*o) || l < o->len) {
+ D("bad len o->len %d len %d", o->len, l);
+ err = EINVAL;
+ break;
+ }
+ l -= o->len;
+ next = (struct dn_id *)((char *)o + o->len);
+ err = 0;
+ switch (o->type) {
+ default:
+ D("cmd %d not implemented", o->type);
+ break;
+#ifdef EMULATE_SYSCTL
+ /* sysctl emulation.
+ * if we recognize the command, jump to the correct
+ * handler and return
+ */
+ case DN_SYSCTL_SET:
+ err = kesysctl_emu_set(p, l);
+ return err;
+#endif
+ case DN_CMD_CONFIG: /* simply a header */
+ break;
+
+ case DN_CMD_DELETE:
+ /* the argument is in the first uintptr_t after o */
+ a = (uintptr_t *)(o+1);
+ if (o->len < sizeof(*o) + sizeof(*a)) {
+ err = EINVAL;
+ break;
+ }
+ switch (o->subtype) {
+ case DN_LINK:
+ /* delete base and derived schedulers */
+ DN_BH_WLOCK();
+ err = delete_schk(*a);
+ err2 = delete_schk(*a + DN_MAX_ID);
+ DN_BH_WUNLOCK();
+ if (!err)
+ err = err2;
+ break;
+
+ default:
+ D("invalid delete type %d",
+ o->subtype);
+ err = EINVAL;
+ break;
+
+ case DN_FS:
+ err = (*a <1 || *a >= DN_MAX_ID) ?
+ EINVAL : delete_fs(*a, 0) ;
+ break;
+ }
+ break;
+
+ case DN_CMD_FLUSH:
+ DN_BH_WLOCK();
+ dummynet_flush();
+ DN_BH_WUNLOCK();
+ break;
+		case DN_TEXT:	/* store argument for the next block */
+ prev = NULL;
+ arg = o;
+ break;
+ case DN_LINK:
+ err = config_link((struct dn_link *)o, arg);
+ break;
+ case DN_PROFILE:
+ err = config_profile((struct dn_profile *)o, arg);
+ break;
+ case DN_SCH:
+ err = config_sched((struct dn_sch *)o, arg);
+ break;
+ case DN_FS:
+ err = (NULL==config_fs((struct dn_fs *)o, arg, 0));
+ break;
+ }
+ if (prev)
+ arg = NULL;
+ if (err != 0)
+ break;
+ }
+ return err;
+}
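+
+/*
+ * For illustration, a typical buffer handled by do_config() is laid out
+ * as a sequence of variable-size objects, each beginning with a struct
+ * dn_id whose 'len' field gives the offset of the next object:
+ *
+ *	[ dn_id   { .type = DN_CMD_CONFIG, .id = DN_API_VERSION } ]
+ *	[ dn_sch  ... ]		handled by config_sched()
+ *	[ dn_link ... ]		handled by config_link(), after its sched
+ *	[ dn_fs   ... ]		handled by config_fs()
+ *
+ * The exact sequence is built by userland ipfw(8); the layout shown is
+ * only an example of the rules listed before do_config().
+ */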
+
+static int
+compute_space(struct dn_id *cmd, struct copy_args *a)
+{
+ int x = 0, need = 0;
+ int profile_size = sizeof(struct dn_profile) -
+ ED_MAX_SAMPLES_NO*sizeof(int);
+
+ /* NOTE about compute space:
+ * NP = dn_cfg.schk_count
+ * NSI = dn_cfg.si_count
+ * NF = dn_cfg.fsk_count
+ * NQ = dn_cfg.queue_count
+ * - ipfw pipe show
+ * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler
+ * link, scheduler template, flowset
+ * integrated in scheduler and header
+ * for flowset list
+	 *   (NSI)*(dn_flow) all scheduler instances (includes
+ * the queue instance)
+ * - ipfw sched show
+ * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler
+ * link, scheduler template, flowset
+ * integrated in scheduler and header
+ * for flowset list
+ * (NSI * dn_flow) all scheduler instances
+ * (NF * sizeof(uint_32)) space for flowset list linked to scheduler
+	 *   (NQ * dn_queue) all queues [XXX for now not listed]
+	 * - ipfw queue show
+	 *   (NF * dn_fs) all flowsets
+ * (NQ * dn_queue) all queues
+ */
+ switch (cmd->subtype) {
+ default:
+ return -1;
+ /* XXX where do LINK and SCH differ ? */
+	/* 'ipfw sched show' could list all queues associated with
+	 * a scheduler. This feature is disabled for now.
+ */
+ case DN_LINK: /* pipe show */
+ x = DN_C_LINK | DN_C_SCH | DN_C_FLOW;
+ need += dn_cfg.schk_count *
+ (sizeof(struct dn_fs) + profile_size) / 2;
+ need += dn_cfg.fsk_count * sizeof(uint32_t);
+ break;
+ case DN_SCH: /* sched show */
+ need += dn_cfg.schk_count *
+ (sizeof(struct dn_fs) + profile_size) / 2;
+ need += dn_cfg.fsk_count * sizeof(uint32_t);
+ x = DN_C_SCH | DN_C_LINK | DN_C_FLOW;
+ break;
+ case DN_FS: /* queue show */
+ x = DN_C_FS | DN_C_QUEUE;
+ break;
+ case DN_GET_COMPAT: /* compatibility mode */
+ need = dn_compat_calc_size(dn_cfg);
+ break;
+ }
+ a->flags = x;
+ if (x & DN_C_SCH) {
+ need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2;
+		/* NOTE also, each fs might be attached to a sched */
+ need += dn_cfg.schk_count * sizeof(struct dn_id) / 2;
+ }
+ if (x & DN_C_FS)
+ need += dn_cfg.fsk_count * sizeof(struct dn_fs);
+ if (x & DN_C_LINK) {
+ need += dn_cfg.schk_count * sizeof(struct dn_link) / 2;
+ }
+ /*
+ * When exporting a queue to userland, only pass up the
+ * struct dn_flow, which is the only visible part.
+ */
+
+ if (x & DN_C_QUEUE)
+ need += dn_cfg.queue_count * sizeof(struct dn_flow);
+ if (x & DN_C_FLOW)
+ need += dn_cfg.si_count * (sizeof(struct dn_flow));
+ return need;
+}
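+
+/*
+ * Note on the /2 factors above: schedulers exist in pairs (the base
+ * instance plus its FIFO companion at number + DN_MAX_ID, see the
+ * numbering scheme earlier in this file), and dn_cfg.schk_count counts
+ * both halves while only the base half is exported. So, purely as an
+ * example, two configured pipes give schk_count == 4 and room is
+ * reserved for 2 links, 2 schedulers, 2 flowsets (plus profile space)
+ * and one dn_flow per scheduler instance.
+ */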
+
+/*
+ * If compat != NULL dummynet_get is called in compatibility mode.
+ * *compat will be the pointer to the buffer to pass to ipfw
+ */
+int
+dummynet_get(struct sockopt *sopt, void **compat)
+{
+ int have, i, need, error;
+ char *start = NULL, *buf;
+ size_t sopt_valsize;
+ struct dn_id *cmd;
+ struct copy_args a;
+ struct copy_range r;
+ int l = sizeof(struct dn_id);
+
+ bzero(&a, sizeof(a));
+ bzero(&r, sizeof(r));
+
+ /* save and restore original sopt_valsize around copyin */
+ sopt_valsize = sopt->sopt_valsize;
+
+ cmd = &r.o;
+
+ if (!compat) {
+ /* copy at least an oid, and possibly a full object */
+ error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd));
+ sopt->sopt_valsize = sopt_valsize;
+ if (error)
+ goto done;
+ l = cmd->len;
+#ifdef EMULATE_SYSCTL
+ /* sysctl emulation. */
+ if (cmd->type == DN_SYSCTL_GET)
+ return kesysctl_emu_get(sopt);
+#endif
+ if (l > sizeof(r)) {
+ /* request larger than default, allocate buffer */
+ cmd = malloc(l, M_DUMMYNET, M_WAIT);
+ if (cmd == NULL)
+ return ENOMEM; //XXX
+ error = sooptcopyin(sopt, cmd, l, l);
+ sopt->sopt_valsize = sopt_valsize;
+ if (error)
+ goto done;
+ }
+ } else { /* compatibility */
+ error = 0;
+ cmd->type = DN_CMD_GET;
+ cmd->len = sizeof(struct dn_id);
+ cmd->subtype = DN_GET_COMPAT;
+ // cmd->id = sopt_valsize;
+ D("compatibility mode");
+ }
+ a.extra = (struct copy_range *)cmd;
+ if (cmd->len == sizeof(*cmd)) { /* no range, create a default */
+ uint32_t *rp = (uint32_t *)(cmd + 1);
+ cmd->len += 2* sizeof(uint32_t);
+ rp[0] = 1;
+ rp[1] = DN_MAX_ID - 1;
+ if (cmd->subtype == DN_LINK) {
+ rp[0] += DN_MAX_ID;
+ rp[1] += DN_MAX_ID;
+ }
+ }
+ /* Count space (under lock) and allocate (outside lock).
+ * Exit with lock held if we manage to get enough buffer.
+ * Try a few times then give up.
+ */
+ for (have = 0, i = 0; i < 10; i++) {
+ DN_BH_WLOCK();
+ need = compute_space(cmd, &a);
+
+ /* if there is a range, ignore value from compute_space() */
+ if (l > sizeof(*cmd))
+ need = sopt_valsize - sizeof(*cmd);
+
+ if (need < 0) {
+ DN_BH_WUNLOCK();
+ error = EINVAL;
+ goto done;
+ }
+ need += sizeof(*cmd);
+ cmd->id = need;
+ if (have >= need)
+ break;
+
+ DN_BH_WUNLOCK();
+ if (start)
+ free(start, M_DUMMYNET);
+ start = NULL;
+ if (need > sopt_valsize)
+ break;
+
+ have = need;
+ start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO);
+ if (start == NULL) {
+ error = ENOMEM;
+ goto done;
+ }
+ }
+
+ if (start == NULL) {
+ if (compat) {
+ *compat = NULL;
+ error = 1; // XXX
+ } else {
+ error = sooptcopyout(sopt, cmd, sizeof(*cmd));
+ }
+ goto done;
+ }
+ ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, "
+ "%d:%d si %d, %d:%d queues %d",
+ dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH,
+ dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK,
+ dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS,
+ dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I,
+ dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE);
+ sopt->sopt_valsize = sopt_valsize;
+ a.type = cmd->subtype;
+
+ if (compat == NULL) {
+ bcopy(cmd, start, sizeof(*cmd));
+ ((struct dn_id*)(start))->len = sizeof(struct dn_id);
+ buf = start + sizeof(*cmd);
+ } else
+ buf = start;
+ a.start = &buf;
+ a.end = start + have;
+ /* start copying other objects */
+ if (compat) {
+ a.type = DN_COMPAT_PIPE;
+ dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a);
+ a.type = DN_COMPAT_QUEUE;
+ dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a);
+ } else if (a.type == DN_FS) {
+ dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a);
+ } else {
+ dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a);
+ }
+ DN_BH_WUNLOCK();
+
+ if (compat) {
+ *compat = start;
+ sopt->sopt_valsize = buf - start;
+ /* free() is done by ip_dummynet_compat() */
+ start = NULL; //XXX hack
+ } else {
+ error = sooptcopyout(sopt, start, buf - start);
+ }
+done:
+ if (cmd && cmd != &r.o)
+ free(cmd, M_DUMMYNET);
+ if (start)
+ free(start, M_DUMMYNET);
+ return error;
+}
+
+/* Callback called on scheduler instance to delete it if idle */
+static int
+drain_scheduler_cb(void *_si, void *arg)
+{
+ struct dn_sch_inst *si = _si;
+
+ if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL)
+ return 0;
+
+ if (si->sched->fp->flags & DN_MULTIQUEUE) {
+ if (si->q_count == 0)
+ return si_destroy(si, NULL);
+ else
+ return 0;
+ } else { /* !DN_MULTIQUEUE */
+ if ((si+1)->ni.length == 0)
+ return si_destroy(si, NULL);
+ else
+ return 0;
+ }
+ return 0; /* unreachable */
+}
+
+/* Callback called on scheduler to check if it has instances */
+static int
+drain_scheduler_sch_cb(void *_s, void *arg)
+{
+ struct dn_schk *s = _s;
+
+ if (s->sch.flags & DN_HAVE_MASK) {
+ dn_ht_scan_bucket(s->siht, &s->drain_bucket,
+ drain_scheduler_cb, NULL);
+ s->drain_bucket++;
+ } else {
+ if (s->siht) {
+ if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL)
+ s->siht = NULL;
+ }
+ }
+ return 0;
+}
+
+/* Called every tick, try to delete a 'bucket' of schedulers */
+void
+dn_drain_scheduler(void)
+{
+ dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch,
+ drain_scheduler_sch_cb, NULL);
+ dn_cfg.drain_sch++;
+}
+
+/* Callback called on queue to delete if it is idle */
+static int
+drain_queue_cb(void *_q, void *arg)
+{
+ struct dn_queue *q = _q;
+
+ if (q->ni.length == 0) {
+ dn_delete_queue(q, DN_DESTROY);
+ return DNHT_SCAN_DEL; /* queue is deleted */
+ }
+
+ return 0; /* queue isn't deleted */
+}
+
+/* Callback called on flowset used to check if it has queues */
+static int
+drain_queue_fs_cb(void *_fs, void *arg)
+{
+ struct dn_fsk *fs = _fs;
+
+ if (fs->fs.flags & DN_QHT_HASH) {
+ /* Flowset has a hash table for queues */
+ dn_ht_scan_bucket(fs->qht, &fs->drain_bucket,
+ drain_queue_cb, NULL);
+ fs->drain_bucket++;
+ } else {
+ /* No hash table for this flowset, null the pointer
+ * if the queue is deleted
+ */
+ if (fs->qht) {
+ if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL)
+ fs->qht = NULL;
+ }
+ }
+ return 0;
+}
+
+/* Called every tick, try to delete a 'bucket' of queues */
+void
+dn_drain_queue(void)
+{
+	/* scan a bucket of flowsets */
+ dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs,
+ drain_queue_fs_cb, NULL);
+ dn_cfg.drain_fs++;
+}
+
+/*
+ * Handler for the various dummynet socket options
+ */
+static int
+ip_dn_ctl(struct sockopt *sopt)
+{
+ void *p = NULL;
+ int error, l;
+
+ error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
+ if (error)
+ return (error);
+
+ /* Disallow sets in really-really secure mode. */
+ if (sopt->sopt_dir == SOPT_SET) {
+ error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+ if (error)
+ return (error);
+ }
+
+ switch (sopt->sopt_name) {
+ default :
+ D("dummynet: unknown option %d", sopt->sopt_name);
+ error = EINVAL;
+ break;
+
+ case IP_DUMMYNET_FLUSH:
+ case IP_DUMMYNET_CONFIGURE:
+ case IP_DUMMYNET_DEL: /* remove a pipe or queue */
+ case IP_DUMMYNET_GET:
+ D("dummynet: compat option %d", sopt->sopt_name);
+ error = ip_dummynet_compat(sopt);
+ break;
+
+ case IP_DUMMYNET3 :
+ if (sopt->sopt_dir == SOPT_GET) {
+ error = dummynet_get(sopt, NULL);
+ break;
+ }
+ l = sopt->sopt_valsize;
+ if (l < sizeof(struct dn_id) || l > 12000) {
+ D("argument len %d invalid", l);
+ break;
+ }
+ p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ?
+ error = sooptcopyin(sopt, p, l, l);
+ if (error)
+ break ;
+ error = do_config(p, l);
+ break;
+ }
+
+ if (p != NULL)
+ free(p, M_TEMP);
+
+ return error ;
+}
+
+
+static void
+ip_dn_init(void)
+{
+ static int init_done = 0;
+
+ if (init_done)
+ return;
+ init_done = 1;
+ if (bootverbose)
+ printf("DUMMYNET with IPv6 initialized (100131)\n");
+
+ /* Set defaults here. MSVC does not accept initializers,
+ * and this is also useful for vimages
+ */
+ /* queue limits */
+ dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */
+ dn_cfg.byte_limit = 1024 * 1024;
+ dn_cfg.expire = 1;
+
+ /* RED parameters */
+ dn_cfg.red_lookup_depth = 256; /* default lookup table depth */
+ dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */
+ dn_cfg.red_max_pkt_size = 1500; /* default max packet size */
+
+ /* hash tables */
+ dn_cfg.max_hash_size = 1024; /* max in the hash tables */
+ dn_cfg.hash_size = 64; /* default hash size */
+
+ /* create hash tables for schedulers and flowsets.
+ * In both we search by key and by pointer.
+ */
+ dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size,
+ offsetof(struct dn_schk, schk_next),
+ schk_hash, schk_match, schk_new);
+ dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size,
+ offsetof(struct dn_fsk, fsk_next),
+ fsk_hash, fsk_match, fsk_new);
+
+ /* bucket index to drain object */
+ dn_cfg.drain_fs = 0;
+ dn_cfg.drain_sch = 0;
+
+ heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
+ SLIST_INIT(&dn_cfg.fsu);
+ SLIST_INIT(&dn_cfg.schedlist);
+
+ DN_LOCK_INIT();
+ ip_dn_ctl_ptr = ip_dn_ctl;
+ ip_dn_io_ptr = dummynet_io;
+
+ TASK_INIT(&dn_task, 0, dummynet_task, NULL);
+ dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT,
+ taskqueue_thread_enqueue, &dn_tq);
+ taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");
+
+ callout_init(&dn_timeout, CALLOUT_MPSAFE);
+ callout_reset(&dn_timeout, 1, dummynet, NULL);
+
+ /* Initialize curr_time adjustment mechanics. */
+ getmicrouptime(&dn_cfg.prev_t);
+}
+
+#ifdef KLD_MODULE
+static void
+ip_dn_destroy(void)
+{
+ callout_drain(&dn_timeout);
+
+ DN_BH_WLOCK();
+ ip_dn_ctl_ptr = NULL;
+ ip_dn_io_ptr = NULL;
+
+ dummynet_flush();
+ DN_BH_WUNLOCK();
+ taskqueue_drain(dn_tq, &dn_task);
+ taskqueue_free(dn_tq);
+
+ dn_ht_free(dn_cfg.schedhash, 0);
+ dn_ht_free(dn_cfg.fshash, 0);
+ heap_free(&dn_cfg.evheap);
+
+ DN_LOCK_DESTROY();
+}
+#endif /* KLD_MODULE */
+
+static int
+dummynet_modevent(module_t mod, int type, void *data)
+{
+
+ if (type == MOD_LOAD) {
+ if (ip_dn_io_ptr) {
+ printf("DUMMYNET already loaded\n");
+ return EEXIST ;
+ }
+ ip_dn_init();
+ return 0;
+ } else if (type == MOD_UNLOAD) {
+#if !defined(KLD_MODULE)
+ printf("dummynet statically compiled, cannot unload\n");
+ return EINVAL ;
+#else
+ ip_dn_destroy();
+ return 0;
+#endif
+ } else
+ return EOPNOTSUPP;
+}
+
+/* modevent helpers for the modules */
+static int
+load_dn_sched(struct dn_alg *d)
+{
+ struct dn_alg *s;
+
+ if (d == NULL)
+ return 1; /* error */
+ ip_dn_init(); /* just in case, we need the lock */
+
+	/* Check that mandatory functions exist */
+ if (d->enqueue == NULL || d->dequeue == NULL) {
+ D("missing enqueue or dequeue for %s", d->name);
+ return 1;
+ }
+
+ /* Search if scheduler already exists */
+ DN_BH_WLOCK();
+ SLIST_FOREACH(s, &dn_cfg.schedlist, next) {
+ if (strcmp(s->name, d->name) == 0) {
+ D("%s already loaded", d->name);
+ break; /* scheduler already exists */
+ }
+ }
+ if (s == NULL)
+ SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next);
+ DN_BH_WUNLOCK();
+ D("dn_sched %s %sloaded", d->name, s ? "not ":"");
+ return s ? 1 : 0;
+}
+
+static int
+unload_dn_sched(struct dn_alg *s)
+{
+ struct dn_alg *tmp, *r;
+ int err = EINVAL;
+
+ D("called for %s", s->name);
+
+ DN_BH_WLOCK();
+ SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) {
+ if (strcmp(s->name, r->name) != 0)
+ continue;
+ D("ref_count = %d", r->ref_count);
+ err = (r->ref_count != 0) ? EBUSY : 0;
+ if (err == 0)
+ SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next);
+ break;
+ }
+ DN_BH_WUNLOCK();
+ D("dn_sched %s %sunloaded", s->name, err ? "not ":"");
+ return err;
+}
+
+int
+dn_sched_modevent(module_t mod, int cmd, void *arg)
+{
+ struct dn_alg *sch = arg;
+
+ if (cmd == MOD_LOAD)
+ return load_dn_sched(sch);
+ else if (cmd == MOD_UNLOAD)
+ return unload_dn_sched(sch);
+ else
+ return EINVAL;
+}
+
+static moduledata_t dummynet_mod = {
+ "dummynet", dummynet_modevent, NULL
+};
+
+DECLARE_MODULE(dummynet, dummynet_mod,
+ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY-1);
+MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
+MODULE_VERSION(dummynet, 1);
+/* end of file */
diff --git a/freebsd/sys/netinet/ipfw/ip_fw2.c b/freebsd/sys/netinet/ipfw/ip_fw2.c
new file mode 100644
index 00000000..682cced1
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_fw2.c
@@ -0,0 +1,2495 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * The FreeBSD IP packet firewall, main file
+ */
+
+#if !defined(KLD_MODULE)
+#include <freebsd/local/opt_ipfw.h>
+#include <freebsd/local/opt_ipdivert.h>
+#include <freebsd/local/opt_ipdn.h>
+#include <freebsd/local/opt_inet.h>
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_ipsec.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/condvar.h>
+#include <freebsd/sys/eventhandler.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/jail.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/sys/ucred.h>
+#include <freebsd/net/ethernet.h> /* for ETHERTYPE_IP */
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/pf_mtag.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_icmp.h>
+#include <freebsd/netinet/ip_fw.h>
+#include <freebsd/netinet/ipfw/ip_fw_private.h>
+#include <freebsd/netinet/ip_carp.h>
+#include <freebsd/netinet/pim.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/udp.h>
+#include <freebsd/netinet/udp_var.h>
+#include <freebsd/netinet/sctp.h>
+
+#include <freebsd/netinet/ip6.h>
+#include <freebsd/netinet/icmp6.h>
+#ifdef INET6
+#include <freebsd/netinet6/scope6_var.h>
+#include <freebsd/netinet6/ip6_var.h>
+#endif
+
+#include <freebsd/machine/in_cksum.h> /* XXX for in_cksum */
+
+#ifdef MAC
+#include <freebsd/security/mac/mac_framework.h>
+#endif
+
+/*
+ * static variables followed by global ones.
+ * All ipfw global variables are here.
+ */
+
+/* ipfw_vnet_ready controls when we are open for business */
+static VNET_DEFINE(int, ipfw_vnet_ready) = 0;
+#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready)
+
+static VNET_DEFINE(int, fw_deny_unknown_exthdrs);
+#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs)
+
+#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
+static int default_to_accept = 1;
+#else
+static int default_to_accept;
+#endif
+
+VNET_DEFINE(int, autoinc_step);
+
+/*
+ * Each rule belongs to one of 32 different sets (0..31).
+ * The variable set_disable contains one bit per set.
+ * If the bit is set, all rules in the corresponding set
+ * are disabled. Set RESVD_SET(31) is reserved for the default rule
+ * and rules that are not deleted by the flush command,
+ * and CANNOT be disabled.
+ * Rules in set RESVD_SET can only be deleted individually.
+ */
+VNET_DEFINE(u_int32_t, set_disable);
+#define V_set_disable VNET(set_disable)
+
+VNET_DEFINE(int, fw_verbose);
+/* counter for ipfw_log(NULL...) */
+VNET_DEFINE(u_int64_t, norule_counter);
+VNET_DEFINE(int, verbose_limit);
+
+/* layer3_chain contains the list of rules for layer 3 */
+VNET_DEFINE(struct ip_fw_chain, layer3_chain);
+
+ipfw_nat_t *ipfw_nat_ptr = NULL;
+struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+#ifdef SYSCTL_NODE
+uint32_t dummy_def = IPFW_DEFAULT_RULE;
+uint32_t dummy_tables_max = IPFW_TABLES_MAX;
+
+SYSBEGIN(f3)
+
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
+ CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
+ "Only do a single pass through ipfw when using dummynet(4)");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
+ CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
+ "Rule number auto-increment step");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
+ CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
+ "Log matches to ipfw rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
+ CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
+ "Set upper limit of matches of ipfw rules logged");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
+ &dummy_def, 0,
+ "The default/max possible rule number.");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
+ &dummy_tables_max, 0,
+ "The maximum number of tables.");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
+ &default_to_accept, 0,
+ "Make the default rule accept all packets.");
+TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept);
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count,
+ CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
+ "Number of static rules");
+
+#ifdef INET6
+SYSCTL_DECL(_net_inet6_ip6);
+SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
+ CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0,
+ "Deny packets with unknown IPv6 Extension Headers");
+#endif /* INET6 */
+
+SYSEND
+
+#endif /* SYSCTL_NODE */
+
+
+/*
+ * Some macros used in the various matching options.
+ * L3HDR maps an ipv4 pointer into a layer-3 header pointer of type T.
+ * The other macros just cast void * into the appropriate type.
+ */
+#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define TCP(p) ((struct tcphdr *)(p))
+#define SCTP(p) ((struct sctphdr *)(p))
+#define UDP(p) ((struct udphdr *)(p))
+#define ICMP(p) ((struct icmphdr *)(p))
+#define ICMP6(p) ((struct icmp6_hdr *)(p))
+
+static __inline int
+icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
+{
+ int type = icmp->icmp_type;
+
+ return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) );
+}
+
+#define TT ( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
+ (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
+
+static int
+is_icmp_query(struct icmphdr *icmp)
+{
+ int type = icmp->icmp_type;
+
+ return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
+}
+#undef TT
+
+/*
+ * The following checks use two arrays of 8 or 16 bits to store the
+ * bits that we want set or clear, respectively. They are in the
+ * low and high half of cmd->arg1 or cmd->d[0].
+ *
+ * We scan options and store the bits we find set. We succeed if
+ *
+ * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
+ *
+ * The code is sometimes optimized not to store additional variables.
+ */
+
+static int
+flags_match(ipfw_insn *cmd, u_int8_t bits)
+{
+ u_char want_clear;
+ bits = ~bits;
+
+ if ( ((cmd->arg1 & 0xff) & bits) != 0)
+ return 0; /* some bits we want set were clear */
+ want_clear = (cmd->arg1 >> 8) & 0xff;
+ if ( (want_clear & bits) != want_clear)
+ return 0; /* some bits we want clear were set */
+ return 1;
+}
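+
+/*
+ * Example: for a hypothetical "tcpflags syn,!ack" rule, userland would
+ * encode TH_SYN in the low byte of arg1 (bits that must be set) and
+ * TH_ACK in the high byte (bits that must be clear), so flags_match()
+ * returns 1 for a plain SYN and 0 for a SYN|ACK. The encoding is only
+ * sketched here to illustrate the scheme described above.
+ */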
+
+static int
+ipopts_match(struct ip *ip, ipfw_insn *cmd)
+{
+ int optlen, bits = 0;
+ u_char *cp = (u_char *)(ip + 1);
+ int x = (ip->ip_hl << 2) - sizeof (struct ip);
+
+ for (; x > 0; x -= optlen, cp += optlen) {
+ int opt = cp[IPOPT_OPTVAL];
+
+ if (opt == IPOPT_EOL)
+ break;
+ if (opt == IPOPT_NOP)
+ optlen = 1;
+ else {
+ optlen = cp[IPOPT_OLEN];
+ if (optlen <= 0 || optlen > x)
+ return 0; /* invalid or truncated */
+ }
+ switch (opt) {
+
+ default:
+ break;
+
+ case IPOPT_LSRR:
+ bits |= IP_FW_IPOPT_LSRR;
+ break;
+
+ case IPOPT_SSRR:
+ bits |= IP_FW_IPOPT_SSRR;
+ break;
+
+ case IPOPT_RR:
+ bits |= IP_FW_IPOPT_RR;
+ break;
+
+ case IPOPT_TS:
+ bits |= IP_FW_IPOPT_TS;
+ break;
+ }
+ }
+ return (flags_match(cmd, bits));
+}
+
+static int
+tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
+{
+ int optlen, bits = 0;
+ u_char *cp = (u_char *)(tcp + 1);
+ int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
+
+ for (; x > 0; x -= optlen, cp += optlen) {
+ int opt = cp[0];
+ if (opt == TCPOPT_EOL)
+ break;
+ if (opt == TCPOPT_NOP)
+ optlen = 1;
+ else {
+ optlen = cp[1];
+ if (optlen <= 0)
+ break;
+ }
+
+ switch (opt) {
+
+ default:
+ break;
+
+ case TCPOPT_MAXSEG:
+ bits |= IP_FW_TCPOPT_MSS;
+ break;
+
+ case TCPOPT_WINDOW:
+ bits |= IP_FW_TCPOPT_WINDOW;
+ break;
+
+ case TCPOPT_SACK_PERMITTED:
+ case TCPOPT_SACK:
+ bits |= IP_FW_TCPOPT_SACK;
+ break;
+
+ case TCPOPT_TIMESTAMP:
+ bits |= IP_FW_TCPOPT_TS;
+ break;
+
+ }
+ }
+ return (flags_match(cmd, bits));
+}
+
+static int
+iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
+{
+ if (ifp == NULL) /* no iface with this packet, match fails */
+ return 0;
+ /* Check by name or by IP address */
+ if (cmd->name[0] != '\0') { /* match by name */
+ /* Check name */
+ if (cmd->p.glob) {
+ if (fnmatch(cmd->name, ifp->if_xname, 0) == 0)
+ return(1);
+ } else {
+ if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
+ return(1);
+ }
+ } else {
+#ifdef __FreeBSD__ /* and OSX too ? */
+ struct ifaddr *ia;
+
+ if_addr_rlock(ifp);
+ TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
+ if (ia->ifa_addr->sa_family != AF_INET)
+ continue;
+ if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
+ (ia->ifa_addr))->sin_addr.s_addr) {
+ if_addr_runlock(ifp);
+ return(1); /* match */
+ }
+ }
+ if_addr_runlock(ifp);
+#endif /* __FreeBSD__ */
+ }
+ return(0); /* no match, fail ... */
+}
+
+/*
+ * The verify_path function checks if a route to the src exists and
+ * if it is reachable via ifp (when provided).
+ *
+ * The 'verrevpath' option checks that the interface that an IP packet
+ * arrives on is the same interface that traffic destined for the
+ * packet's source address would be routed out of.
+ * The 'versrcreach' option just checks that the source address is
+ * reachable via any route (except default) in the routing table.
+ * These two are a measure to block forged packets. This is also
+ * commonly known as "anti-spoofing" or Unicast Reverse Path
+ * Forwarding (Unicast RPF) in Cisco-ese. The names of the knobs
+ * are purposely reminiscent of the Cisco IOS commands,
+ *
+ * ip verify unicast reverse-path
+ * ip verify unicast source reachable-via any
+ *
+ * which implements the same functionality. But note that the syntax
+ * is misleading, and the check may be performed on all IP packets
+ * whether unicast, multicast, or broadcast.
+ */
+static int
+verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
+{
+#ifndef __FreeBSD__
+ return 0;
+#else
+ struct route ro;
+ struct sockaddr_in *dst;
+
+ bzero(&ro, sizeof(ro));
+
+ dst = (struct sockaddr_in *)&(ro.ro_dst);
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = src;
+ in_rtalloc_ign(&ro, 0, fib);
+
+ if (ro.ro_rt == NULL)
+ return 0;
+
+ /*
+ * If ifp is provided, check for equality with rtentry.
+ * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
+ * in order to pass packets injected back by if_simloop():
+ * if useloopback == 1 routing entry (via lo0) for our own address
+	 * may exist, so we need to handle routing asymmetry.
+ */
+ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+
+ /* if no ifp provided, check if rtentry is not default route */
+ if (ifp == NULL &&
+ satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+
+ /* or if this is a blackhole/reject route */
+ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+
+ /* found valid route */
+ RTFREE(ro.ro_rt);
+ return 1;
+#endif /* __FreeBSD__ */
+}
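+
+/*
+ * For example (rule syntax as in ipfw(8), shown only as an illustration),
+ * the checks above back anti-spoofing rules such as:
+ *
+ *	ipfw add deny ip from any to any not verrevpath in
+ *	ipfw add deny ip from any to any not versrcreach
+ *
+ * i.e. drop inbound packets whose source would not be routed back
+ * through the receiving interface, or is not reachable at all.
+ */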
+
+#ifdef INET6
+/*
+ * ipv6 specific rules here...
+ */
+static __inline int
+icmp6type_match (int type, ipfw_insn_u32 *cmd)
+{
+ return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) );
+}
+
+static int
+flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
+{
+ int i;
+ for (i=0; i <= cmd->o.arg1; ++i )
+ if (curr_flow == cmd->d[i] )
+ return 1;
+ return 0;
+}
+
+/* support for IP6_*_ME opcodes */
+static int
+search_ip6_addr_net (struct in6_addr * ip6_addr)
+{
+ struct ifnet *mdc;
+ struct ifaddr *mdc2;
+ struct in6_ifaddr *fdm;
+ struct in6_addr copia;
+
+ TAILQ_FOREACH(mdc, &V_ifnet, if_link) {
+ if_addr_rlock(mdc);
+ TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) {
+ if (mdc2->ifa_addr->sa_family == AF_INET6) {
+ fdm = (struct in6_ifaddr *)mdc2;
+ copia = fdm->ia_addr.sin6_addr;
+				/* use a copy, leaving the scope_id in the sockaddr */
+ in6_clearscope(&copia);
+ if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) {
+ if_addr_runlock(mdc);
+ return 1;
+ }
+ }
+ }
+ if_addr_runlock(mdc);
+ }
+ return 0;
+}
+
+static int
+verify_path6(struct in6_addr *src, struct ifnet *ifp)
+{
+ struct route_in6 ro;
+ struct sockaddr_in6 *dst;
+
+ bzero(&ro, sizeof(ro));
+
+ dst = (struct sockaddr_in6 * )&(ro.ro_dst);
+ dst->sin6_family = AF_INET6;
+ dst->sin6_len = sizeof(*dst);
+ dst->sin6_addr = *src;
+ /* XXX MRT 0 for ipv6 at this time */
+ rtalloc_ign((struct route *)&ro, 0);
+
+ if (ro.ro_rt == NULL)
+ return 0;
+
+ /*
+ * if ifp is provided, check for equality with rtentry
+ * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
+ * to support the case of sending packets to an address of our own.
+ * (where the former interface is the first argument of if_simloop()
+ * (=ifp), the latter is lo0)
+ */
+ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+
+ /* if no ifp provided, check if rtentry is not default route */
+ if (ifp == NULL &&
+ IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+
+ /* or if this is a blackhole/reject route */
+ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+
+ /* found valid route */
+ RTFREE(ro.ro_rt);
+ return 1;
+
+}
+
+static int
+is_icmp6_query(int icmp6_type)
+{
+ if ((icmp6_type <= ICMP6_MAXTYPE) &&
+ (icmp6_type == ICMP6_ECHO_REQUEST ||
+ icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
+ icmp6_type == ICMP6_WRUREQUEST ||
+ icmp6_type == ICMP6_FQDN_QUERY ||
+ icmp6_type == ICMP6_NI_QUERY))
+ return (1);
+
+ return (0);
+}
+
+static void
+send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
+{
+ struct mbuf *m;
+
+ m = args->m;
+ if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
+ struct tcphdr *tcp;
+ tcp = (struct tcphdr *)((char *)ip6 + hlen);
+
+ if ((tcp->th_flags & TH_RST) == 0) {
+ struct mbuf *m0;
+ m0 = ipfw_send_pkt(args->m, &(args->f_id),
+ ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+ tcp->th_flags | TH_RST);
+ if (m0 != NULL)
+ ip6_output(m0, NULL, NULL, 0, NULL, NULL,
+ NULL);
+ }
+ FREE_PKT(m);
+ } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */
+#if 0
+ /*
+ * Unlike above, the mbufs need to line up with the ip6 hdr,
+ * as the contents are read. We need to m_adj() the
+ * needed amount.
+ * The mbuf will however be thrown away so we can adjust it.
+ * Remember we did an m_pullup on it already so we
+ * can make some assumptions about contiguousness.
+ */
+ if (args->L3offset)
+ m_adj(m, args->L3offset);
+#endif
+ icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
+ } else
+ FREE_PKT(m);
+
+ args->m = NULL;
+}
+
+#endif /* INET6 */
+
+
+/*
+ * sends a reject message, consuming the mbuf passed as an argument.
+ */
+static void
+send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
+{
+
+#if 0
+ /* XXX When ip is not guaranteed to be at mtod() we will
+	 * need to account for this.
+ * The mbuf will however be thrown away so we can adjust it.
+ * Remember we did an m_pullup on it already so we
+ * can make some assumptions about contiguousness.
+ */
+ if (args->L3offset)
+ m_adj(m, args->L3offset);
+#endif
+ if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
+ /* We need the IP header in host order for icmp_error(). */
+ SET_HOST_IPLEN(ip);
+ icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
+ } else if (args->f_id.proto == IPPROTO_TCP) {
+ struct tcphdr *const tcp =
+ L3HDR(struct tcphdr, mtod(args->m, struct ip *));
+ if ( (tcp->th_flags & TH_RST) == 0) {
+ struct mbuf *m;
+ m = ipfw_send_pkt(args->m, &(args->f_id),
+ ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+ tcp->th_flags | TH_RST);
+ if (m != NULL)
+ ip_output(m, NULL, NULL, 0, NULL, NULL);
+ }
+ FREE_PKT(args->m);
+ } else
+ FREE_PKT(args->m);
+ args->m = NULL;
+}
+
+/*
+ * Support for uid/gid/jail lookup. These tests are expensive
+ * (because we may need to look into the list of active sockets)
+ * so we cache the results. ugid_lookupp is 0 if we have not
+ * yet done a lookup, 1 if we succeeded, and -1 if we tried
+ * and failed. The function always returns the match value.
+ * We could actually spare the variable and use *uc, setting
+ * it to (void *)check_uidgid if we have no info, NULL if
+ * we tried and failed, or any other value if successful.
+ */
+static int
+check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
+ struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+ u_int16_t src_port, int *ugid_lookupp,
+ struct ucred **uc, struct inpcb *inp)
+{
+#ifndef __FreeBSD__
+ return cred_check(insn, proto, oif,
+ dst_ip, dst_port, src_ip, src_port,
+ (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
+#else /* FreeBSD */
+ struct inpcbinfo *pi;
+ int wildcard;
+ struct inpcb *pcb;
+ int match;
+
+ /*
+ * Check to see if the UDP or TCP stack supplied us with
+	 * the PCB. If so, rather than holding a lock and looking
+ * up the PCB, we can use the one that was supplied.
+ */
+ if (inp && *ugid_lookupp == 0) {
+ INP_LOCK_ASSERT(inp);
+ if (inp->inp_socket != NULL) {
+ *uc = crhold(inp->inp_cred);
+ *ugid_lookupp = 1;
+ } else
+ *ugid_lookupp = -1;
+ }
+ /*
+ * If we have already been here and the packet has no
+ * PCB entry associated with it, then we can safely
+	 * assume that this is not a match.
+ */
+ if (*ugid_lookupp == -1)
+ return (0);
+ if (proto == IPPROTO_TCP) {
+ wildcard = 0;
+ pi = &V_tcbinfo;
+ } else if (proto == IPPROTO_UDP) {
+ wildcard = INPLOOKUP_WILDCARD;
+ pi = &V_udbinfo;
+ } else
+ return 0;
+ match = 0;
+ if (*ugid_lookupp == 0) {
+ INP_INFO_RLOCK(pi);
+ pcb = (oif) ?
+ in_pcblookup_hash(pi,
+ dst_ip, htons(dst_port),
+ src_ip, htons(src_port),
+ wildcard, oif) :
+ in_pcblookup_hash(pi,
+ src_ip, htons(src_port),
+ dst_ip, htons(dst_port),
+ wildcard, NULL);
+ if (pcb != NULL) {
+ *uc = crhold(pcb->inp_cred);
+ *ugid_lookupp = 1;
+ }
+ INP_INFO_RUNLOCK(pi);
+ if (*ugid_lookupp == 0) {
+ /*
+ * We tried and failed, set the variable to -1
+ * so we will not try again on this packet.
+ */
+ *ugid_lookupp = -1;
+ return (0);
+ }
+ }
+ if (insn->o.opcode == O_UID)
+ match = ((*uc)->cr_uid == (uid_t)insn->d[0]);
+ else if (insn->o.opcode == O_GID)
+ match = groupmember((gid_t)insn->d[0], *uc);
+ else if (insn->o.opcode == O_JAIL)
+ match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
+ return match;
+#endif /* __FreeBSD__ */
+}
+
+/*
+ * Helper function to set args with info on the rule after the matching
+ * one. slot is precise, whereas we guess rule_id as they are
+ * assigned sequentially.
+ */
+static inline void
+set_match(struct ip_fw_args *args, int slot,
+ struct ip_fw_chain *chain)
+{
+ args->rule.chain_id = chain->id;
+ args->rule.slot = slot + 1; /* we use 0 as a marker */
+ args->rule.rule_id = 1 + chain->map[slot]->id;
+ args->rule.rulenum = chain->map[slot]->rulenum;
+}
+
+/*
+ * The main check routine for the firewall.
+ *
+ * All arguments are in args so we can modify them and return them
+ * back to the caller.
+ *
+ * Parameters:
+ *
+ * args->m (in/out) The packet; we set to NULL when/if we nuke it.
+ * Starts with the IP header.
+ * args->eh (in) Mac header if present, NULL for layer3 packet.
+ * args->L3offset Number of bytes bypassed if we came from L2.
+ * e.g. often sizeof(eh) ** NOTYET **
+ * args->oif Outgoing interface, NULL if packet is incoming.
+ * The incoming interface is in the mbuf. (in)
+ * args->divert_rule (in/out)
+ * Skip up to the first rule past this rule number;
+ * upon return, non-zero port number for divert or tee.
+ *
+ * args->rule Pointer to the last matching rule (in/out)
+ * args->next_hop Socket we are forwarding to (out).
+ * args->f_id Addresses grabbed from the packet (out)
+ * args->rule.info a cookie depending on rule action
+ *
+ * Return value:
+ *
+ * IP_FW_PASS the packet must be accepted
+ * IP_FW_DENY the packet must be dropped
+ * IP_FW_DIVERT divert packet, port in m_tag
+ * IP_FW_TEE tee packet, port in m_tag
+ * IP_FW_DUMMYNET to dummynet, pipe in args->cookie
+ * IP_FW_NETGRAPH into netgraph, cookie args->cookie
+ * args->rule contains the matching rule,
+ * args->rule.info has additional information.
+ *
+ */
+int
+ipfw_chk(struct ip_fw_args *args)
+{
+
+ /*
+ * Local variables holding state while processing a packet:
+ *
+ * IMPORTANT NOTE: to speed up the processing of rules, there
+	 * are some assumptions on the values of the variables, which
+ * are documented here. Should you change them, please check
+ * the implementation of the various instructions to make sure
+ * that they still work.
+ *
+ * args->eh The MAC header. It is non-null for a layer2
+ * packet, it is NULL for a layer-3 packet.
+ * **notyet**
+ * args->L3offset Offset in the packet to the L3 (IP or equiv.) header.
+ *
+ * m | args->m Pointer to the mbuf, as received from the caller.
+ * It may change if ipfw_chk() does an m_pullup, or if it
+ * consumes the packet because it calls send_reject().
+ * XXX This has to change, so that ipfw_chk() never modifies
+ * or consumes the buffer.
+ * ip is the beginning of the ip(4 or 6) header.
+ * Calculated by adding the L3offset to the start of data.
+ * (Until we start using L3offset, the packet is
+ * supposed to start with the ip header).
+ */
+ struct mbuf *m = args->m;
+ struct ip *ip = mtod(m, struct ip *);
+
+ /*
+ * For rules which contain uid/gid or jail constraints, cache
+ * a copy of the users credentials after the pcb lookup has been
+ * executed. This will speed up the processing of rules with
+ * these types of constraints, as well as decrease contention
+ * on pcb related locks.
+ */
+#ifndef __FreeBSD__
+ struct bsd_ucred ucred_cache;
+#else
+ struct ucred *ucred_cache = NULL;
+#endif
+ int ucred_lookup = 0;
+
+ /*
+ * oif | args->oif If NULL, ipfw_chk has been called on the
+ * inbound path (ether_input, ip_input).
+ * If non-NULL, ipfw_chk has been called on the outbound path
+ * (ether_output, ip_output).
+ */
+ struct ifnet *oif = args->oif;
+
+ int f_pos = 0; /* index of current rule in the array */
+ int retval = 0;
+
+ /*
+ * hlen The length of the IP header.
+ */
+ u_int hlen = 0; /* hlen >0 means we have an IP pkt */
+
+ /*
+ * offset The offset of a fragment. offset != 0 means that
+ * we have a fragment at this offset of an IPv4 packet.
+ * offset == 0 means that (if this is an IPv4 packet)
+ * this is the first or only fragment.
+ * For IPv6 offset == 0 means there is no Fragment Header.
+ * If offset != 0 for IPv6 always use correct mask to
+ * get the correct offset because we add IP6F_MORE_FRAG
+	 * to be able to detect the first fragment which would
+ * otherwise have offset = 0.
+ */
+ u_short offset = 0;
+
+ /*
+ * Local copies of addresses. They are only valid if we have
+ * an IP packet.
+ *
+ * proto The protocol. Set to 0 for non-ip packets,
+ * or to the protocol read from the packet otherwise.
+ * proto != 0 means that we have an IPv4 packet.
+ *
+ * src_port, dst_port port numbers, in HOST format. Only
+ * valid for TCP and UDP packets.
+ *
+ * src_ip, dst_ip ip addresses, in NETWORK format.
+ * Only valid for IPv4 packets.
+ */
+ uint8_t proto;
+ uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */
+ struct in_addr src_ip, dst_ip; /* NOTE: network format */
+ uint16_t iplen=0;
+ int pktlen;
+ uint16_t etype = 0; /* Host order stored ether type */
+
+ /*
+ * dyn_dir = MATCH_UNKNOWN when rules unchecked,
+ * MATCH_NONE when checked and not matched (q = NULL),
+ * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
+ */
+ int dyn_dir = MATCH_UNKNOWN;
+ ipfw_dyn_rule *q = NULL;
+ struct ip_fw_chain *chain = &V_layer3_chain;
+
+ /*
+ * We store in ulp a pointer to the upper layer protocol header.
+ * In the ipv4 case this is easy to determine from the header,
+ * but for ipv6 we might have some additional headers in the middle.
+ * ulp is NULL if not found.
+ */
+ void *ulp = NULL; /* upper layer protocol pointer. */
+
+ /* XXX ipv6 variables */
+ int is_ipv6 = 0;
+ uint8_t icmp6_type = 0;
+ uint16_t ext_hd = 0; /* bits vector for extension header filtering */
+ /* end of ipv6 variables */
+
+ int is_ipv4 = 0;
+
+ int done = 0; /* flag to exit the outer loop */
+
+ if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
+ return (IP_FW_PASS); /* accept */
+
+ dst_ip.s_addr = 0; /* make sure it is initialized */
+ src_ip.s_addr = 0; /* make sure it is initialized */
+ pktlen = m->m_pkthdr.len;
+	args->f_id.fib = M_GETFIB(m);	/* note mbuf not altered */
+ proto = args->f_id.proto = 0; /* mark f_id invalid */
+ /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */
+
+/*
+ * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
+ * then it sets p to point at the offset "len" in the mbuf. WARNING: the
+ * pointer might become stale after other pullups (but we never use it
+ * this way).
+ */
+#define PULLUP_TO(_len, p, T) \
+do { \
+ int x = (_len) + sizeof(T); \
+ if ((m)->m_len < x) { \
+ args->m = m = m_pullup(m, x); \
+ if (m == NULL) \
+ goto pullup_failed; \
+ } \
+ p = (mtod(m, char *) + (_len)); \
+} while (0)
+
+ /*
+	 * If we have an ether header, collect the ethertype.
+ */
+ if (args->eh)
+ etype = ntohs(args->eh->ether_type);
+
+ /* Identify IP packets and fill up variables. */
+ if (pktlen >= sizeof(struct ip6_hdr) &&
+ (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) {
+ struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
+ is_ipv6 = 1;
+ args->f_id.addr_type = 6;
+ hlen = sizeof(struct ip6_hdr);
+ proto = ip6->ip6_nxt;
+
+ /* Search extension headers to find upper layer protocols */
+ while (ulp == NULL) {
+ switch (proto) {
+ case IPPROTO_ICMPV6:
+ PULLUP_TO(hlen, ulp, struct icmp6_hdr);
+ icmp6_type = ICMP6(ulp)->icmp6_type;
+ break;
+
+ case IPPROTO_TCP:
+ PULLUP_TO(hlen, ulp, struct tcphdr);
+ dst_port = TCP(ulp)->th_dport;
+ src_port = TCP(ulp)->th_sport;
+ /* save flags for dynamic rules */
+ args->f_id._flags = TCP(ulp)->th_flags;
+ break;
+
+ case IPPROTO_SCTP:
+ PULLUP_TO(hlen, ulp, struct sctphdr);
+ src_port = SCTP(ulp)->src_port;
+ dst_port = SCTP(ulp)->dest_port;
+ break;
+
+ case IPPROTO_UDP:
+ PULLUP_TO(hlen, ulp, struct udphdr);
+ dst_port = UDP(ulp)->uh_dport;
+ src_port = UDP(ulp)->uh_sport;
+ break;
+
+ case IPPROTO_HOPOPTS: /* RFC 2460 */
+ PULLUP_TO(hlen, ulp, struct ip6_hbh);
+ ext_hd |= EXT_HOPOPTS;
+ hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+ proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+ ulp = NULL;
+ break;
+
+ case IPPROTO_ROUTING: /* RFC 2460 */
+ PULLUP_TO(hlen, ulp, struct ip6_rthdr);
+ switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
+ case 0:
+ ext_hd |= EXT_RTHDR0;
+ break;
+ case 2:
+ ext_hd |= EXT_RTHDR2;
+ break;
+ default:
+ printf("IPFW2: IPV6 - Unknown Routing "
+ "Header type(%d)\n",
+ ((struct ip6_rthdr *)ulp)->ip6r_type);
+ if (V_fw_deny_unknown_exthdrs)
+ return (IP_FW_DENY);
+ break;
+ }
+ ext_hd |= EXT_ROUTING;
+ hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
+ proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
+ ulp = NULL;
+ break;
+
+ case IPPROTO_FRAGMENT: /* RFC 2460 */
+ PULLUP_TO(hlen, ulp, struct ip6_frag);
+ ext_hd |= EXT_FRAGMENT;
+ hlen += sizeof (struct ip6_frag);
+ proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
+ offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
+ IP6F_OFF_MASK;
+			/* Fold in IP6F_MORE_FRAG so that the first
+			 * fragment also yields a nonzero offset. */
+ offset |= ((struct ip6_frag *)ulp)->ip6f_offlg &
+ IP6F_MORE_FRAG;
+ if (offset == 0) {
+ printf("IPFW2: IPV6 - Invalid Fragment "
+ "Header\n");
+ if (V_fw_deny_unknown_exthdrs)
+ return (IP_FW_DENY);
+ break;
+ }
+ args->f_id.extra =
+ ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
+ ulp = NULL;
+ break;
+
+ case IPPROTO_DSTOPTS: /* RFC 2460 */
+ PULLUP_TO(hlen, ulp, struct ip6_hbh);
+ ext_hd |= EXT_DSTOPTS;
+ hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+ proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+ ulp = NULL;
+ break;
+
+ case IPPROTO_AH: /* RFC 2402 */
+ PULLUP_TO(hlen, ulp, struct ip6_ext);
+ ext_hd |= EXT_AH;
+ hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
+ proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
+ ulp = NULL;
+ break;
+
+ case IPPROTO_ESP: /* RFC 2406 */
+ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */
+ /* Anything past Seq# is variable length and
+ * data past this ext. header is encrypted. */
+ ext_hd |= EXT_ESP;
+ break;
+
+ case IPPROTO_NONE: /* RFC 2460 */
+ /*
+ * Packet ends here, and IPv6 header has
+ * already been pulled up. If ip6e_len!=0
+ * then octets must be ignored.
+ */
+ ulp = ip; /* non-NULL to get out of loop. */
+ break;
+
+ case IPPROTO_OSPFIGP:
+ /* XXX OSPF header check? */
+ PULLUP_TO(hlen, ulp, struct ip6_ext);
+ break;
+
+ case IPPROTO_PIM:
+ /* XXX PIM header check? */
+ PULLUP_TO(hlen, ulp, struct pim);
+ break;
+
+ case IPPROTO_CARP:
+ PULLUP_TO(hlen, ulp, struct carp_header);
+ if (((struct carp_header *)ulp)->carp_version !=
+ CARP_VERSION)
+ return (IP_FW_DENY);
+ if (((struct carp_header *)ulp)->carp_type !=
+ CARP_ADVERTISEMENT)
+ return (IP_FW_DENY);
+ break;
+
+ case IPPROTO_IPV6: /* RFC 2893 */
+ PULLUP_TO(hlen, ulp, struct ip6_hdr);
+ break;
+
+ case IPPROTO_IPV4: /* RFC 2893 */
+ PULLUP_TO(hlen, ulp, struct ip);
+ break;
+
+ default:
+ printf("IPFW2: IPV6 - Unknown Extension "
+ "Header(%d), ext_hd=%x\n", proto, ext_hd);
+ if (V_fw_deny_unknown_exthdrs)
+ return (IP_FW_DENY);
+ PULLUP_TO(hlen, ulp, struct ip6_ext);
+ break;
+ } /*switch */
+ }
+ ip = mtod(m, struct ip *);
+ ip6 = (struct ip6_hdr *)ip;
+ args->f_id.src_ip6 = ip6->ip6_src;
+ args->f_id.dst_ip6 = ip6->ip6_dst;
+ args->f_id.src_ip = 0;
+ args->f_id.dst_ip = 0;
+ args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
+ } else if (pktlen >= sizeof(struct ip) &&
+ (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) {
+ is_ipv4 = 1;
+ hlen = ip->ip_hl << 2;
+ args->f_id.addr_type = 4;
+
+ /*
+ * Collect parameters into local variables for faster matching.
+ */
+ proto = ip->ip_p;
+ src_ip = ip->ip_src;
+ dst_ip = ip->ip_dst;
+ offset = ntohs(ip->ip_off) & IP_OFFMASK;
+ iplen = ntohs(ip->ip_len);
+ pktlen = iplen < pktlen ? iplen : pktlen;
+
+ if (offset == 0) {
+ switch (proto) {
+ case IPPROTO_TCP:
+ PULLUP_TO(hlen, ulp, struct tcphdr);
+ dst_port = TCP(ulp)->th_dport;
+ src_port = TCP(ulp)->th_sport;
+ /* save flags for dynamic rules */
+ args->f_id._flags = TCP(ulp)->th_flags;
+ break;
+
+ case IPPROTO_UDP:
+ PULLUP_TO(hlen, ulp, struct udphdr);
+ dst_port = UDP(ulp)->uh_dport;
+ src_port = UDP(ulp)->uh_sport;
+ break;
+
+ case IPPROTO_ICMP:
+ PULLUP_TO(hlen, ulp, struct icmphdr);
+ //args->f_id.flags = ICMP(ulp)->icmp_type;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ ip = mtod(m, struct ip *);
+ args->f_id.src_ip = ntohl(src_ip.s_addr);
+ args->f_id.dst_ip = ntohl(dst_ip.s_addr);
+ }
+#undef PULLUP_TO
+ if (proto) { /* we may have port numbers, store them */
+ args->f_id.proto = proto;
+ args->f_id.src_port = src_port = ntohs(src_port);
+ args->f_id.dst_port = dst_port = ntohs(dst_port);
+ }
+
+ IPFW_RLOCK(chain);
+ if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
+ IPFW_RUNLOCK(chain);
+ return (IP_FW_PASS); /* accept */
+ }
+ if (args->rule.slot) {
+ /*
+ * Packet has already been tagged as a result of a previous
+ * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
+ * REASS, NETGRAPH, DIVERT/TEE...)
+ * Validate the slot and continue from the next one
+ * if still present, otherwise do a lookup.
+ */
+ f_pos = (args->rule.chain_id == chain->id) ?
+ args->rule.slot :
+ ipfw_find_rule(chain, args->rule.rulenum,
+ args->rule.rule_id);
+ } else {
+ f_pos = 0;
+ }
+
+ /*
+ * Now scan the rules, and parse microinstructions for each rule.
+ * We have two nested loops and an inner switch. Sometimes we
+ * need to break out of one or both loops, or re-enter one of
+ * the loops with updated variables. Loop variables are:
+ *
+ * f_pos (outer loop) points to the current rule.
+ * On output it points to the matching rule.
+ * done (outer loop) is used as a flag to break the loop.
+ * l (inner loop) residual length of current rule.
+ * cmd points to the current microinstruction.
+ *
+ * We break the inner loop by setting l=0 and possibly
+ * cmdlen=0 if we don't want to advance cmd.
+ * We break the outer loop by setting done=1
+ * We can restart the inner loop by setting l>0 and f_pos, f, cmd
+ * as needed.
+ */
+ for (; f_pos < chain->n_rules; f_pos++) {
+ ipfw_insn *cmd;
+ uint32_t tablearg = 0;
+ int l, cmdlen, skip_or; /* skip rest of OR block */
+ struct ip_fw *f;
+
+ f = chain->map[f_pos];
+ if (V_set_disable & (1 << f->set) )
+ continue;
+
+ skip_or = 0;
+ for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
+ l -= cmdlen, cmd += cmdlen) {
+ int match;
+
+ /*
+ * check_body is a jump target used when we find a
+ * CHECK_STATE, and need to jump to the body of
+ * the target rule.
+ */
+
+/* check_body: */
+ cmdlen = F_LEN(cmd);
+ /*
+ * An OR block (insn_1 || .. || insn_n) has the
+ * F_OR bit set in all but the last instruction.
+ * The first match will set "skip_or", and cause
+ * the following instructions to be skipped until
+ * past the one with the F_OR bit clear.
+ */
+ if (skip_or) { /* skip this instruction */
+ if ((cmd->len & F_OR) == 0)
+ skip_or = 0; /* next one is good */
+ continue;
+ }
+ match = 0; /* set to 1 if we succeed */
+
+ switch (cmd->opcode) {
+ /*
+ * The first set of opcodes compares the packet's
+ * fields with some pattern, setting 'match' if a
+ * match is found. At the end of the loop there is
+ * logic to deal with F_NOT and F_OR flags associated
+ * with the opcode.
+ */
+ case O_NOP:
+ match = 1;
+ break;
+
+ case O_FORWARD_MAC:
+ printf("ipfw: opcode %d unimplemented\n",
+ cmd->opcode);
+ break;
+
+ case O_GID:
+ case O_UID:
+ case O_JAIL:
+ /*
+ * We only check offset == 0 && proto != 0,
+ * as this ensures that we have a
+ * packet with the ports info.
+ */
+ if (offset!=0)
+ break;
+ if (is_ipv6) /* XXX to be fixed later */
+ break;
+ if (proto == IPPROTO_TCP ||
+ proto == IPPROTO_UDP)
+ match = check_uidgid(
+ (ipfw_insn_u32 *)cmd,
+ proto, oif,
+ dst_ip, dst_port,
+ src_ip, src_port, &ucred_lookup,
+#ifdef __FreeBSD__
+ &ucred_cache, args->inp);
+#else
+ (void *)&ucred_cache,
+ (struct inpcb *)args->m);
+#endif
+ break;
+
+ case O_RECV:
+ match = iface_match(m->m_pkthdr.rcvif,
+ (ipfw_insn_if *)cmd);
+ break;
+
+ case O_XMIT:
+ match = iface_match(oif, (ipfw_insn_if *)cmd);
+ break;
+
+ case O_VIA:
+ match = iface_match(oif ? oif :
+ m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
+ break;
+
+ case O_MACADDR2:
+ if (args->eh != NULL) { /* have MAC header */
+ u_int32_t *want = (u_int32_t *)
+ ((ipfw_insn_mac *)cmd)->addr;
+ u_int32_t *mask = (u_int32_t *)
+ ((ipfw_insn_mac *)cmd)->mask;
+ u_int32_t *hdr = (u_int32_t *)args->eh;
+
+ match =
+ ( want[0] == (hdr[0] & mask[0]) &&
+ want[1] == (hdr[1] & mask[1]) &&
+ want[2] == (hdr[2] & mask[2]) );
+ }
+ break;
+
+ case O_MAC_TYPE:
+ if (args->eh != NULL) {
+ u_int16_t *p =
+ ((ipfw_insn_u16 *)cmd)->ports;
+ int i;
+
+ for (i = cmdlen - 1; !match && i>0;
+ i--, p += 2)
+ match = (etype >= p[0] &&
+ etype <= p[1]);
+ }
+ break;
+
+ case O_FRAG:
+ match = (offset != 0);
+ break;
+
+ case O_IN: /* "out" is "not in" */
+ match = (oif == NULL);
+ break;
+
+ case O_LAYER2:
+ match = (args->eh != NULL);
+ break;
+
+ case O_DIVERTED:
+ {
+				/* For diverted packets, args->rule.info
+				 * contains the divert port (in host format),
+				 * the reason and the direction.
+				 */
+ uint32_t i = args->rule.info;
+ match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT &&
+ cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2);
+ }
+ break;
+
+ case O_PROTO:
+ /*
+ * We do not allow an arg of 0 so the
+ * check of "proto" only suffices.
+ */
+ match = (proto == cmd->arg1);
+ break;
+
+ case O_IP_SRC:
+ match = is_ipv4 &&
+ (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+ src_ip.s_addr);
+ break;
+
+ case O_IP_SRC_LOOKUP:
+ case O_IP_DST_LOOKUP:
+ if (is_ipv4) {
+ uint32_t key =
+ (cmd->opcode == O_IP_DST_LOOKUP) ?
+ dst_ip.s_addr : src_ip.s_addr;
+ uint32_t v = 0;
+
+ if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) {
+ /* generic lookup. The key must be
+ * in 32bit big-endian format.
+ */
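+				    /* d[1] selects the lookup key:
+				     * 0 dst-ip, 1 src-ip, 2 dst-port,
+				     * 3 src-port, 4 uid, 5 jail id,
+				     * 6 dscp (see the checks below).
+				     */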
+ v = ((ipfw_insn_u32 *)cmd)->d[1];
+ if (v == 0)
+ key = dst_ip.s_addr;
+ else if (v == 1)
+ key = src_ip.s_addr;
+ else if (v == 6) /* dscp */
+ key = (ip->ip_tos >> 2) & 0x3f;
+ else if (offset != 0)
+ break;
+ else if (proto != IPPROTO_TCP &&
+ proto != IPPROTO_UDP)
+ break;
+ else if (v == 2)
+ key = htonl(dst_port);
+ else if (v == 3)
+ key = htonl(src_port);
+ else if (v == 4 || v == 5) {
+ check_uidgid(
+ (ipfw_insn_u32 *)cmd,
+ proto, oif,
+ dst_ip, dst_port,
+ src_ip, src_port, &ucred_lookup,
+#ifdef __FreeBSD__
+ &ucred_cache, args->inp);
+ if (v == 4 /* O_UID */)
+ key = ucred_cache->cr_uid;
+ else if (v == 5 /* O_JAIL */)
+ key = ucred_cache->cr_prison->pr_id;
+#else /* !__FreeBSD__ */
+ (void *)&ucred_cache,
+ (struct inpcb *)args->m);
+ if (v ==4 /* O_UID */)
+ key = ucred_cache.uid;
+ else if (v == 5 /* O_JAIL */)
+ key = ucred_cache.xid;
+#endif /* !__FreeBSD__ */
+ key = htonl(key);
+ } else
+ break;
+ }
+ match = ipfw_lookup_table(chain,
+ cmd->arg1, key, &v);
+ if (!match)
+ break;
+ if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
+ match =
+ ((ipfw_insn_u32 *)cmd)->d[0] == v;
+ else
+ tablearg = v;
+ }
+ break;
+
+ case O_IP_SRC_MASK:
+ case O_IP_DST_MASK:
+ if (is_ipv4) {
+ uint32_t a =
+ (cmd->opcode == O_IP_DST_MASK) ?
+ dst_ip.s_addr : src_ip.s_addr;
+ uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
+ int i = cmdlen-1;
+
+ for (; !match && i>0; i-= 2, p+= 2)
+ match = (p[0] == (a & p[1]));
+ }
+ break;
+
+ case O_IP_SRC_ME:
+ if (is_ipv4) {
+ struct ifnet *tif;
+
+ INADDR_TO_IFP(src_ip, tif);
+ match = (tif != NULL);
+ break;
+ }
+#ifdef INET6
+ /* FALLTHROUGH */
+ case O_IP6_SRC_ME:
+ match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
+#endif
+ break;
+
+ case O_IP_DST_SET:
+ case O_IP_SRC_SET:
+ if (is_ipv4) {
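+				/* The insn payload is a base address d[0]
+				 * (host order) followed by a bitmap; arg1
+				 * is the number of addresses in the set. */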
+ u_int32_t *d = (u_int32_t *)(cmd+1);
+ u_int32_t addr =
+ cmd->opcode == O_IP_DST_SET ?
+ args->f_id.dst_ip :
+ args->f_id.src_ip;
+
+ if (addr < d[0])
+ break;
+ addr -= d[0]; /* subtract base */
+ match = (addr < cmd->arg1) &&
+ ( d[ 1 + (addr>>5)] &
+ (1<<(addr & 0x1f)) );
+ }
+ break;
+
+ case O_IP_DST:
+ match = is_ipv4 &&
+ (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+ dst_ip.s_addr);
+ break;
+
+ case O_IP_DST_ME:
+ if (is_ipv4) {
+ struct ifnet *tif;
+
+ INADDR_TO_IFP(dst_ip, tif);
+ match = (tif != NULL);
+ break;
+ }
+#ifdef INET6
+ /* FALLTHROUGH */
+ case O_IP6_DST_ME:
+ match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
+#endif
+ break;
+
+
+ case O_IP_SRCPORT:
+ case O_IP_DSTPORT:
+ /*
+ * offset == 0 && proto != 0 is enough
+ * to guarantee that we have a
+ * packet with port info.
+ */
+ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
+ && offset == 0) {
+ u_int16_t x =
+ (cmd->opcode == O_IP_SRCPORT) ?
+ src_port : dst_port ;
+ u_int16_t *p =
+ ((ipfw_insn_u16 *)cmd)->ports;
+ int i;
+
+ for (i = cmdlen - 1; !match && i>0;
+ i--, p += 2)
+ match = (x>=p[0] && x<=p[1]);
+ }
+ break;
+
+ case O_ICMPTYPE:
+ match = (offset == 0 && proto==IPPROTO_ICMP &&
+ icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
+ break;
+
+#ifdef INET6
+ case O_ICMP6TYPE:
+ match = is_ipv6 && offset == 0 &&
+ proto==IPPROTO_ICMPV6 &&
+ icmp6type_match(
+ ICMP6(ulp)->icmp6_type,
+ (ipfw_insn_u32 *)cmd);
+ break;
+#endif /* INET6 */
+
+ case O_IPOPT:
+ match = (is_ipv4 &&
+ ipopts_match(ip, cmd) );
+ break;
+
+ case O_IPVER:
+ match = (is_ipv4 &&
+ cmd->arg1 == ip->ip_v);
+ break;
+
+ case O_IPID:
+ case O_IPLEN:
+ case O_IPTTL:
+ if (is_ipv4) { /* only for IP packets */
+ uint16_t x;
+ uint16_t *p;
+ int i;
+
+ if (cmd->opcode == O_IPLEN)
+ x = iplen;
+ else if (cmd->opcode == O_IPTTL)
+ x = ip->ip_ttl;
+ else /* must be IPID */
+ x = ntohs(ip->ip_id);
+ if (cmdlen == 1) {
+ match = (cmd->arg1 == x);
+ break;
+ }
+ /* otherwise we have ranges */
+ p = ((ipfw_insn_u16 *)cmd)->ports;
+ i = cmdlen - 1;
+ for (; !match && i>0; i--, p += 2)
+ match = (x >= p[0] && x <= p[1]);
+ }
+ break;
+
+ case O_IPPRECEDENCE:
+ match = (is_ipv4 &&
+ (cmd->arg1 == (ip->ip_tos & 0xe0)) );
+ break;
+
+ case O_IPTOS:
+ match = (is_ipv4 &&
+ flags_match(cmd, ip->ip_tos));
+ break;
+
+ case O_TCPDATALEN:
+ if (proto == IPPROTO_TCP && offset == 0) {
+ struct tcphdr *tcp;
+ uint16_t x;
+ uint16_t *p;
+ int i;
+
+ tcp = TCP(ulp);
+ x = iplen -
+ ((ip->ip_hl + tcp->th_off) << 2);
+ if (cmdlen == 1) {
+ match = (cmd->arg1 == x);
+ break;
+ }
+ /* otherwise we have ranges */
+ p = ((ipfw_insn_u16 *)cmd)->ports;
+ i = cmdlen - 1;
+ for (; !match && i>0; i--, p += 2)
+ match = (x >= p[0] && x <= p[1]);
+ }
+ break;
+
+ case O_TCPFLAGS:
+ match = (proto == IPPROTO_TCP && offset == 0 &&
+ flags_match(cmd, TCP(ulp)->th_flags));
+ break;
+
+ case O_TCPOPTS:
+ match = (proto == IPPROTO_TCP && offset == 0 &&
+ tcpopts_match(TCP(ulp), cmd));
+ break;
+
+ case O_TCPSEQ:
+ match = (proto == IPPROTO_TCP && offset == 0 &&
+ ((ipfw_insn_u32 *)cmd)->d[0] ==
+ TCP(ulp)->th_seq);
+ break;
+
+ case O_TCPACK:
+ match = (proto == IPPROTO_TCP && offset == 0 &&
+ ((ipfw_insn_u32 *)cmd)->d[0] ==
+ TCP(ulp)->th_ack);
+ break;
+
+ case O_TCPWIN:
+ match = (proto == IPPROTO_TCP && offset == 0 &&
+ cmd->arg1 == TCP(ulp)->th_win);
+ break;
+
+ case O_ESTAB:
+ /* reject packets which have SYN only */
+ /* XXX should i also check for TH_ACK ? */
+ match = (proto == IPPROTO_TCP && offset == 0 &&
+ (TCP(ulp)->th_flags &
+ (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
+ break;
+
+ case O_ALTQ: {
+ struct pf_mtag *at;
+ ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+ match = 1;
+ at = pf_find_mtag(m);
+ if (at != NULL && at->qid != 0)
+ break;
+ at = pf_get_mtag(m);
+ if (at == NULL) {
+ /*
+ * Let the packet fall back to the
+ * default ALTQ.
+ */
+ break;
+ }
+ at->qid = altq->qid;
+ if (is_ipv4)
+ at->af = AF_INET;
+ else
+ at->af = AF_LINK;
+ at->hdr = ip;
+ break;
+ }
+
+ case O_LOG:
+ ipfw_log(f, hlen, args, m,
+ oif, offset, tablearg, ip);
+ match = 1;
+ break;
+
+ case O_PROB:
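+				/* d[0] is the match probability, already
+				 * scaled to the range of random(). */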
+ match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
+ break;
+
+ case O_VERREVPATH:
+ /* Outgoing packets automatically pass/match */
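+				/* For incoming packets, verify_path() checks
+				 * that the route back to src_ip points out of
+				 * the receiving interface (reverse path). */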
+ match = ((oif != NULL) ||
+ (m->m_pkthdr.rcvif == NULL) ||
+ (
+#ifdef INET6
+ is_ipv6 ?
+ verify_path6(&(args->f_id.src_ip6),
+ m->m_pkthdr.rcvif) :
+#endif
+ verify_path(src_ip, m->m_pkthdr.rcvif,
+ args->f_id.fib)));
+ break;
+
+ case O_VERSRCREACH:
+ /* Outgoing packets automatically pass/match */
+ match = (hlen > 0 && ((oif != NULL) ||
+#ifdef INET6
+ is_ipv6 ?
+ verify_path6(&(args->f_id.src_ip6),
+ NULL) :
+#endif
+ verify_path(src_ip, NULL, args->f_id.fib)));
+ break;
+
+ case O_ANTISPOOF:
+ /* Outgoing packets automatically pass/match */
+ if (oif == NULL && hlen > 0 &&
+ ( (is_ipv4 && in_localaddr(src_ip))
+#ifdef INET6
+ || (is_ipv6 &&
+ in6_localaddr(&(args->f_id.src_ip6)))
+#endif
+ ))
+ match =
+#ifdef INET6
+ is_ipv6 ? verify_path6(
+ &(args->f_id.src_ip6),
+ m->m_pkthdr.rcvif) :
+#endif
+ verify_path(src_ip,
+ m->m_pkthdr.rcvif,
+ args->f_id.fib);
+ else
+ match = 1;
+ break;
+
+ case O_IPSEC:
+#ifdef IPSEC
+ match = (m_tag_find(m,
+ PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
+#endif
+ /* otherwise no match */
+ break;
+
+#ifdef INET6
+ case O_IP6_SRC:
+ match = is_ipv6 &&
+ IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
+ &((ipfw_insn_ip6 *)cmd)->addr6);
+ break;
+
+ case O_IP6_DST:
+ match = is_ipv6 &&
+ IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
+ &((ipfw_insn_ip6 *)cmd)->addr6);
+ break;
+ case O_IP6_SRC_MASK:
+ case O_IP6_DST_MASK:
+ if (is_ipv6) {
+ int i = cmdlen - 1;
+ struct in6_addr p;
+ struct in6_addr *d =
+ &((ipfw_insn_ip6 *)cmd)->addr6;
+
+ for (; !match && i > 0; d += 2,
+ i -= F_INSN_SIZE(struct in6_addr)
+ * 2) {
+ p = (cmd->opcode ==
+ O_IP6_SRC_MASK) ?
+ args->f_id.src_ip6:
+ args->f_id.dst_ip6;
+ APPLY_MASK(&p, &d[1]);
+ match =
+ IN6_ARE_ADDR_EQUAL(&d[0],
+ &p);
+ }
+ }
+ break;
+
+ case O_FLOW6ID:
+ match = is_ipv6 &&
+ flow6id_match(args->f_id.flow_id6,
+ (ipfw_insn_u32 *) cmd);
+ break;
+
+ case O_EXT_HDR:
+ match = is_ipv6 &&
+ (ext_hd & ((ipfw_insn *) cmd)->arg1);
+ break;
+
+ case O_IP6:
+ match = is_ipv6;
+ break;
+#endif
+
+ case O_IP4:
+ match = is_ipv4;
+ break;
+
+ case O_TAG: {
+ struct m_tag *mtag;
+ uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+
+ /* Packet is already tagged with this tag? */
+ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
+
+				/* When the F_NOT flag is present this is an
+				 * `untag' action: remove the mtag from the
+				 * mbuf and reset `match' to zero (`match' is
+				 * inverted later).
+				 * Otherwise allocate a new mtag and prepend
+				 * it to the mbuf.
+				 */
+ if (cmd->len & F_NOT) { /* `untag' action */
+ if (mtag != NULL)
+ m_tag_delete(m, mtag);
+ match = 0;
+ } else if (mtag == NULL) {
+ if ((mtag = m_tag_alloc(MTAG_IPFW,
+ tag, 0, M_NOWAIT)) != NULL)
+ m_tag_prepend(m, mtag);
+ match = 1;
+ }
+ break;
+ }
+
+ case O_FIB: /* try match the specified fib */
+ if (args->f_id.fib == cmd->arg1)
+ match = 1;
+ break;
+
+ case O_TAGGED: {
+ struct m_tag *mtag;
+ uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+
+ if (cmdlen == 1) {
+ match = m_tag_locate(m, MTAG_IPFW,
+ tag, NULL) != NULL;
+ break;
+ }
+
+ /* we have ranges */
+ for (mtag = m_tag_first(m);
+ mtag != NULL && !match;
+ mtag = m_tag_next(m, mtag)) {
+ uint16_t *p;
+ int i;
+
+ if (mtag->m_tag_cookie != MTAG_IPFW)
+ continue;
+
+ p = ((ipfw_insn_u16 *)cmd)->ports;
+ i = cmdlen - 1;
+ for(; !match && i > 0; i--, p += 2)
+ match =
+ mtag->m_tag_id >= p[0] &&
+ mtag->m_tag_id <= p[1];
+ }
+ break;
+ }
+
+ /*
+ * The second set of opcodes represents 'actions',
+ * i.e. the terminal part of a rule once the packet
+ * matches all previous patterns.
+ * Typically there is only one action for each rule,
+ * and the opcode is stored at the end of the rule
+ * (but there are exceptions -- see below).
+ *
+ * In general, here we set retval and terminate the
+ * outer loop (would be a 'break 3' in some language,
+ * but we need to set l=0, done=1)
+ *
+ * Exceptions:
+ * O_COUNT and O_SKIPTO actions:
+ * instead of terminating, we jump to the next rule
+ * (setting l=0), or to the SKIPTO target (setting
+ * f/f_len, cmd and l as needed), respectively.
+ *
+ * O_TAG, O_LOG and O_ALTQ action parameters:
+ * perform some action and set match = 1;
+ *
+ * O_LIMIT and O_KEEP_STATE: these opcodes are
+ * not real 'actions', and are stored right
+ * before the 'action' part of the rule.
+ * These opcodes try to install an entry in the
+ * state tables; if successful, we continue with
+ * the next opcode (match=1; break;), otherwise
+ * the packet must be dropped (set retval,
+ * break loops with l=0, done=1)
+ *
+ * O_PROBE_STATE and O_CHECK_STATE: these opcodes
+ * cause a lookup of the state table, and a jump
+ * to the 'action' part of the parent rule
+ * if an entry is found, or
+ * (CHECK_STATE only) a jump to the next rule if
+ * the entry is not found.
+ * The result of the lookup is cached so that
+ * further instances of these opcodes become NOPs.
+ * The jump to the next rule is done by setting
+ * l=0, cmdlen=0.
+ */
+ case O_LIMIT:
+ case O_KEEP_STATE:
+ if (ipfw_install_state(f,
+ (ipfw_insn_limit *)cmd, args, tablearg)) {
+ /* error or limit violation */
+ retval = IP_FW_DENY;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ }
+ match = 1;
+ break;
+
+ case O_PROBE_STATE:
+ case O_CHECK_STATE:
+ /*
+ * dynamic rules are checked at the first
+ * keep-state or check-state occurrence,
+ * with the result being stored in dyn_dir.
+ * The compiler introduces a PROBE_STATE
+ * instruction for us when we have a
+ * KEEP_STATE (because PROBE_STATE needs
+ * to be run first).
+ */
+ if (dyn_dir == MATCH_UNKNOWN &&
+ (q = ipfw_lookup_dyn_rule(&args->f_id,
+ &dyn_dir, proto == IPPROTO_TCP ?
+ TCP(ulp) : NULL))
+ != NULL) {
+ /*
+ * Found dynamic entry, update stats
+ * and jump to the 'action' part of
+ * the parent rule by setting
+ * f, cmd, l and clearing cmdlen.
+ */
+ q->pcnt++;
+ q->bcnt += pktlen;
+ /* XXX we would like to have f_pos
+ * readily accessible in the dynamic
+ * rule, instead of having to
+ * lookup q->rule.
+ */
+ f = q->rule;
+ f_pos = ipfw_find_rule(chain,
+ f->rulenum, f->id);
+ cmd = ACTION_PTR(f);
+ l = f->cmd_len - f->act_ofs;
+ ipfw_dyn_unlock();
+ cmdlen = 0;
+ match = 1;
+ break;
+ }
+ /*
+ * Dynamic entry not found. If CHECK_STATE,
+ * skip to next rule, if PROBE_STATE just
+ * ignore and continue with next opcode.
+ */
+ if (cmd->opcode == O_CHECK_STATE)
+ l = 0; /* exit inner loop */
+ match = 1;
+ break;
+
+ case O_ACCEPT:
+ retval = 0; /* accept */
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+
+ case O_PIPE:
+ case O_QUEUE:
+ set_match(args, f_pos, chain);
+ args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ if (cmd->opcode == O_PIPE)
+ args->rule.info |= IPFW_IS_PIPE;
+ if (V_fw_one_pass)
+ args->rule.info |= IPFW_ONEPASS;
+ retval = IP_FW_DUMMYNET;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+
+ case O_DIVERT:
+ case O_TEE:
+ if (args->eh) /* not on layer 2 */
+ break;
+ /* otherwise this is terminal */
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ retval = (cmd->opcode == O_DIVERT) ?
+ IP_FW_DIVERT : IP_FW_TEE;
+ set_match(args, f_pos, chain);
+ args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ break;
+
+ case O_COUNT:
+ f->pcnt++; /* update stats */
+ f->bcnt += pktlen;
+ f->timestamp = time_uptime;
+ l = 0; /* exit inner loop */
+ break;
+
+ case O_SKIPTO:
+ f->pcnt++; /* update stats */
+ f->bcnt += pktlen;
+ f->timestamp = time_uptime;
+ /* If possible use cached f_pos (in f->next_rule),
+ * whose version is written in f->next_rule
+ * (horrible hacks to avoid changing the ABI).
+ */
+ if (cmd->arg1 != IP_FW_TABLEARG &&
+ (uintptr_t)f->x_next == chain->id) {
+ f_pos = (uintptr_t)f->next_rule;
+ } else {
+ int i = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ /* make sure we do not jump backward */
+ if (i <= f->rulenum)
+ i = f->rulenum + 1;
+ f_pos = ipfw_find_rule(chain, i, 0);
+ /* update the cache */
+ if (cmd->arg1 != IP_FW_TABLEARG) {
+ f->next_rule =
+ (void *)(uintptr_t)f_pos;
+ f->x_next =
+ (void *)(uintptr_t)chain->id;
+ }
+ }
+ /*
+ * Skip disabled rules, and re-enter
+ * the inner loop with the correct
+ * f_pos, f, l and cmd.
+ * Also clear cmdlen and skip_or
+ */
+ for (; f_pos < chain->n_rules - 1 &&
+ (V_set_disable &
+ (1 << chain->map[f_pos]->set));
+ f_pos++)
+ ;
+ /* Re-enter the inner loop at the skipto rule. */
+ f = chain->map[f_pos];
+ l = f->cmd_len;
+ cmd = f->cmd;
+ match = 1;
+ cmdlen = 0;
+ skip_or = 0;
+ continue;
+ break; /* not reached */
+
+ case O_REJECT:
+ /*
+ * Drop the packet and send a reject notice
+ * if the packet is not ICMP (or is an ICMP
+ * query), and it is not multicast/broadcast.
+ */
+ if (hlen > 0 && is_ipv4 && offset == 0 &&
+ (proto != IPPROTO_ICMP ||
+ is_icmp_query(ICMP(ulp))) &&
+ !(m->m_flags & (M_BCAST|M_MCAST)) &&
+ !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
+ send_reject(args, cmd->arg1, iplen, ip);
+ m = args->m;
+ }
+ /* FALLTHROUGH */
+#ifdef INET6
+ case O_UNREACH6:
+ if (hlen > 0 && is_ipv6 &&
+ ((offset & IP6F_OFF_MASK) == 0) &&
+ (proto != IPPROTO_ICMPV6 ||
+ (is_icmp6_query(icmp6_type) == 1)) &&
+ !(m->m_flags & (M_BCAST|M_MCAST)) &&
+ !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) {
+ send_reject6(
+ args, cmd->arg1, hlen,
+ (struct ip6_hdr *)ip);
+ m = args->m;
+ }
+ /* FALLTHROUGH */
+#endif
+ case O_DENY:
+ retval = IP_FW_DENY;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+
+ case O_FORWARD_IP:
+ if (args->eh) /* not valid on layer2 pkts */
+ break;
+ if (!q || dyn_dir == MATCH_FORWARD) {
+ struct sockaddr_in *sa;
+ sa = &(((ipfw_insn_sa *)cmd)->sa);
+ if (sa->sin_addr.s_addr == INADDR_ANY) {
+ bcopy(sa, &args->hopstore,
+ sizeof(*sa));
+ args->hopstore.sin_addr.s_addr =
+ htonl(tablearg);
+ args->next_hop = &args->hopstore;
+ } else {
+ args->next_hop = sa;
+ }
+ }
+ retval = IP_FW_PASS;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+
+ case O_NETGRAPH:
+ case O_NGTEE:
+ set_match(args, f_pos, chain);
+ args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ if (V_fw_one_pass)
+ args->rule.info |= IPFW_ONEPASS;
+ retval = (cmd->opcode == O_NETGRAPH) ?
+ IP_FW_NETGRAPH : IP_FW_NGTEE;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+
+ case O_SETFIB:
+ f->pcnt++; /* update stats */
+ f->bcnt += pktlen;
+ f->timestamp = time_uptime;
+ M_SETFIB(m, cmd->arg1);
+ args->f_id.fib = cmd->arg1;
+ l = 0; /* exit inner loop */
+ break;
+
+ case O_NAT:
+ if (!IPFW_NAT_LOADED) {
+ retval = IP_FW_DENY;
+ } else {
+ struct cfg_nat *t;
+ int nat_id;
+
+ set_match(args, f_pos, chain);
+ t = ((ipfw_insn_nat *)cmd)->nat;
+ if (t == NULL) {
+ nat_id = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ t = (*lookup_nat_ptr)(&chain->nat, nat_id);
+
+ if (t == NULL) {
+ retval = IP_FW_DENY;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+ }
+ if (cmd->arg1 != IP_FW_TABLEARG)
+ ((ipfw_insn_nat *)cmd)->nat = t;
+ }
+ retval = ipfw_nat_ptr(args, t, m);
+ }
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+
+ case O_REASS: {
+ int ip_off;
+
+ f->pcnt++;
+ f->bcnt += pktlen;
+ l = 0; /* in any case exit inner loop */
+ ip_off = ntohs(ip->ip_off);
+
+ /* if not fragmented, go to next rule */
+ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0)
+ break;
+ /*
+ * ip_reass() expects len & off in host
+ * byte order.
+ */
+ SET_HOST_IPLEN(ip);
+
+ args->m = m = ip_reass(m);
+
+ /*
+ * do IP header checksum fixup.
+ */
+ if (m == NULL) { /* fragment got swallowed */
+ retval = IP_FW_DENY;
+ } else { /* good, packet complete */
+ int hlen;
+
+ ip = mtod(m, struct ip *);
+ hlen = ip->ip_hl << 2;
+ SET_NET_IPLEN(ip);
+ ip->ip_sum = 0;
+ if (hlen == sizeof(struct ip))
+ ip->ip_sum = in_cksum_hdr(ip);
+ else
+ ip->ip_sum = in_cksum(m, hlen);
+ retval = IP_FW_REASS;
+ set_match(args, f_pos, chain);
+ }
+ done = 1; /* exit outer loop */
+ break;
+ }
+
+ default:
+ panic("-- unknown opcode %d\n", cmd->opcode);
+ } /* end of switch() on opcodes */
+ /*
+ * if we get here with l=0, then match is irrelevant.
+ */
+
+ if (cmd->len & F_NOT)
+ match = !match;
+
+ if (match) {
+ if (cmd->len & F_OR)
+ skip_or = 1;
+ } else {
+ if (!(cmd->len & F_OR)) /* not an OR block, */
+ break; /* try next rule */
+ }
+
+ } /* end of inner loop, scan opcodes */
+
+ if (done)
+ break;
+
+/* next_rule:; */ /* try next rule */
+
+ } /* end of outer for, scan rules */
+
+ if (done) {
+ struct ip_fw *rule = chain->map[f_pos];
+ /* Update statistics */
+ rule->pcnt++;
+ rule->bcnt += pktlen;
+ rule->timestamp = time_uptime;
+ } else {
+ retval = IP_FW_DENY;
+ printf("ipfw: ouch!, skip past end of rules, denying packet\n");
+ }
+ IPFW_RUNLOCK(chain);
+#ifdef __FreeBSD__
+ if (ucred_cache != NULL)
+ crfree(ucred_cache);
+#endif
+ return (retval);
+
+pullup_failed:
+ if (V_fw_verbose)
+ printf("ipfw: pullup failed\n");
+ return (IP_FW_DENY);
+}
+
+/*
+ * Module and VNET glue
+ */
+
+/*
+ * Stuff that must be initialised only on boot or module load
+ */
+static int
+ipfw_init(void)
+{
+ int error = 0;
+
+ ipfw_dyn_attach();
+ /*
+ * Only print out this stuff the first time around,
+ * when called from the sysinit code.
+ */
+ printf("ipfw2 "
+#ifdef INET6
+ "(+ipv6) "
+#endif
+ "initialized, divert %s, nat %s, "
+ "rule-based forwarding "
+#ifdef IPFIREWALL_FORWARD
+ "enabled, "
+#else
+ "disabled, "
+#endif
+ "default to %s, logging ",
+#ifdef IPDIVERT
+ "enabled",
+#else
+ "loadable",
+#endif
+#ifdef IPFIREWALL_NAT
+ "enabled",
+#else
+ "loadable",
+#endif
+ default_to_accept ? "accept" : "deny");
+
+ /*
+ * Note: V_xxx variables can be accessed here but the vnet specific
+ * initializer may not have been called yet for the VIMAGE case.
+ * Tuneables will have been processed. We will print out values for
+ * the default vnet.
+ * XXX This should all be rationalized AFTER 8.0
+ */
+ if (V_fw_verbose == 0)
+ printf("disabled\n");
+ else if (V_verbose_limit == 0)
+ printf("unlimited\n");
+ else
+ printf("limited to %d packets/entry by default\n",
+ V_verbose_limit);
+
+ ipfw_log_bpf(1); /* init */
+ return (error);
+}
+
+/*
+ * Called for the removal of the last instance only on module unload.
+ */
+static void
+ipfw_destroy(void)
+{
+
+ ipfw_log_bpf(0); /* uninit */
+ ipfw_dyn_detach();
+ printf("IP firewall unloaded\n");
+}
+
+/*
+ * Stuff that must be initialized for every instance
+ * (including the first of course).
+ */
+static int
+vnet_ipfw_init(const void *unused)
+{
+ int error;
+ struct ip_fw *rule = NULL;
+ struct ip_fw_chain *chain;
+
+ chain = &V_layer3_chain;
+
+ /* First set up some values that are compile time options */
+ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
+ V_fw_deny_unknown_exthdrs = 1;
+#ifdef IPFIREWALL_VERBOSE
+ V_fw_verbose = 1;
+#endif
+#ifdef IPFIREWALL_VERBOSE_LIMIT
+ V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
+#endif
+#ifdef IPFIREWALL_NAT
+ LIST_INIT(&chain->nat);
+#endif
+
+ /* insert the default rule and create the initial map */
+ chain->n_rules = 1;
+ chain->static_len = sizeof(struct ip_fw);
+ chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_NOWAIT | M_ZERO);
+ if (chain->map)
+ rule = malloc(chain->static_len, M_IPFW, M_NOWAIT | M_ZERO);
+ if (rule == NULL) {
+ if (chain->map)
+ free(chain->map, M_IPFW);
+ printf("ipfw2: ENOSPC initializing default rule "
+ "(support disabled)\n");
+ return (ENOSPC);
+ }
+ error = ipfw_init_tables(chain);
+ if (error) {
+ panic("init_tables"); /* XXX Marko fix this ! */
+ }
+
+ /* fill and insert the default rule */
+ rule->act_ofs = 0;
+ rule->rulenum = IPFW_DEFAULT_RULE;
+ rule->cmd_len = 1;
+ rule->set = RESVD_SET;
+ rule->cmd[0].len = 1;
+ rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
+ chain->rules = chain->default_rule = chain->map[0] = rule;
+ chain->id = rule->id = 1;
+
+ IPFW_LOCK_INIT(chain);
+ ipfw_dyn_init();
+
+	/* Mark this vnet instance as ready to process packets. */
+ V_ipfw_vnet_ready = 1; /* Open for business */
+
+ /*
+ * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr)
+ * and pfil hooks for ipv4 and ipv6. Even if the latter two fail
+ * we still keep the module alive because the sockopt and
+ * layer2 paths are still useful.
+ * ipfw[6]_hook return 0 on success, ENOENT on failure,
+ * so we can ignore the exact return value and just set a flag.
+ *
+ * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
+ * changes in the underlying (per-vnet) variables trigger
+ * immediate hook()/unhook() calls.
+ * In layer2 we have the same behaviour, except that V_ether_ipfw
+ * is checked on each packet because there are no pfil hooks.
+ */
+ V_ip_fw_ctl_ptr = ipfw_ctl;
+ V_ip_fw_chk_ptr = ipfw_chk;
+ error = ipfw_attach_hooks(1);
+ return (error);
+}
+
+/*
+ * Called for the removal of each instance.
+ */
+static int
+vnet_ipfw_uninit(const void *unused)
+{
+ struct ip_fw *reap, *rule;
+ struct ip_fw_chain *chain = &V_layer3_chain;
+ int i;
+
+ V_ipfw_vnet_ready = 0; /* tell new callers to go away */
+ /*
+ * disconnect from ipv4, ipv6, layer2 and sockopt.
+ * Then grab, release and grab again the WLOCK so we make
+ * sure the update is propagated and nobody will be in.
+ */
+ (void)ipfw_attach_hooks(0 /* detach */);
+ V_ip_fw_chk_ptr = NULL;
+ V_ip_fw_ctl_ptr = NULL;
+ IPFW_UH_WLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
+ IPFW_UH_WLOCK(chain);
+
+ IPFW_WLOCK(chain);
+ IPFW_WUNLOCK(chain);
+ IPFW_WLOCK(chain);
+
+ ipfw_dyn_uninit(0); /* run the callout_drain */
+ ipfw_destroy_tables(chain);
+ reap = NULL;
+ for (i = 0; i < chain->n_rules; i++) {
+ rule = chain->map[i];
+ rule->x_next = reap;
+ reap = rule;
+ }
+ if (chain->map)
+ free(chain->map, M_IPFW);
+ IPFW_WUNLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
+ if (reap != NULL)
+ ipfw_reap_rules(reap);
+ IPFW_LOCK_DESTROY(chain);
+ ipfw_dyn_uninit(1); /* free the remaining parts */
+ return 0;
+}
+
+/*
+ * Module event handler.
+ * In general we have the choice of handling most of these events by the
+ * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to
+ * use the SYSINIT handlers as they are more capable of expressing the
+ * flow of control during module and vnet operations, so this is just
+ * a skeleton. Note there is no SYSINIT equivalent of the module
+ * SHUTDOWN handler, but we don't have anything to do in that case anyhow.
+ */
+static int
+ipfw_modevent(module_t mod, int type, void *unused)
+{
+ int err = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ /* Called once at module load or
+ * system boot if compiled in. */
+ break;
+ case MOD_QUIESCE:
+ /* Called before unload. May veto unloading. */
+ break;
+ case MOD_UNLOAD:
+ /* Called during unload. */
+ break;
+ case MOD_SHUTDOWN:
+ /* Called during system shutdown. */
+ break;
+ default:
+ err = EOPNOTSUPP;
+ break;
+ }
+ return err;
+}
+
+static moduledata_t ipfwmod = {
+ "ipfw",
+ ipfw_modevent,
+ 0
+};
+
+/* Define startup order. */
+#define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN
+#define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */
+#define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */
+#define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */
+
+DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER);
+MODULE_VERSION(ipfw, 2);
+/* should declare some dependencies here */
+
+/*
+ * Starting up. Done in order after ipfwmod() has been called.
+ * VNET_SYSINIT is also called for each existing vnet and each new vnet.
+ */
+SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
+ ipfw_init, NULL);
+VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
+ vnet_ipfw_init, NULL);
+
+/*
+ * Closing up shop. These are done in REVERSE ORDER, but still
+ * after ipfwmod() has been called. Not called on reboot.
+ * VNET_SYSUNINIT is also called for each exiting vnet as it exits
+ * or when the module is unloaded.
+ */
+SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
+ ipfw_destroy, NULL);
+VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
+ vnet_ipfw_uninit, NULL);
+/* end of file */
diff --git a/freebsd/sys/netinet/ipfw/ip_fw_log.c b/freebsd/sys/netinet/ipfw/ip_fw_log.c
new file mode 100644
index 00000000..0a5cd94c
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_fw_log.c
@@ -0,0 +1,451 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Logging support for ipfw
+ */
+
+#if !defined(KLD_MODULE)
+#include <freebsd/local/opt_ipfw.h>
+#include <freebsd/local/opt_ipdivert.h>
+#include <freebsd/local/opt_ipdn.h>
+#include <freebsd/local/opt_inet.h>
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_ipsec.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/net/ethernet.h> /* for ETHERTYPE_IP */
+#include <freebsd/net/if.h>
+#include <freebsd/net/vnet.h>
+#include <freebsd/net/if_types.h> /* for IFT_ETHER */
+#include <freebsd/net/bpf.h> /* for BPF */
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_icmp.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_fw.h>
+#include <freebsd/netinet/ipfw/ip_fw_private.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/udp.h>
+
+#include <freebsd/netinet/ip6.h>
+#include <freebsd/netinet/icmp6.h>
+#ifdef INET6
+#include <freebsd/netinet6/in6_var.h> /* ip6_sprintf() */
+#endif
+
+#ifdef MAC
+#include <freebsd/security/mac/mac_framework.h>
+#endif
+
+/*
+ * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T.
+ * The other macros just cast void * into the appropriate type.
+ */
+#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define TCP(p) ((struct tcphdr *)(p))
+#define SCTP(p) ((struct sctphdr *)(p))
+#define UDP(p) ((struct udphdr *)(p))
+#define ICMP(p) ((struct icmphdr *)(p))
+#define ICMP6(p) ((struct icmp6_hdr *)(p))
+
+#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
+#define SNP(buf) buf, sizeof(buf)
+
+#ifdef WITHOUT_BPF
+void
+ipfw_log_bpf(int onoff)
+{
+}
+#else /* !WITHOUT_BPF */
+static struct ifnet *log_if; /* hook to attach to bpf */
+
+/* we use this dummy function for all ifnet callbacks */
+static int
+log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
+{
+ return EINVAL;
+}
+
+static int
+ipfw_log_output(struct ifnet *ifp, struct mbuf *m,
+ struct sockaddr *dst, struct route *ro)
+{
+ if (m != NULL)
+ m_freem(m);
+ return EINVAL;
+}
+
+static void
+ipfw_log_start(struct ifnet* ifp)
+{
+ panic("ipfw_log_start() must not be called");
+}
+
+static const u_char ipfwbroadcastaddr[6] =
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+
+void
+ipfw_log_bpf(int onoff)
+{
+ struct ifnet *ifp;
+
+ if (onoff) {
+ if (log_if)
+ return;
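+		/* Create a fake ethernet interface ("ipfw0") and attach
+		 * it to bpf so that logged packets can be captured. */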
+ ifp = if_alloc(IFT_ETHER);
+ if (ifp == NULL)
+ return;
+ if_initname(ifp, "ipfw", 0);
+ ifp->if_mtu = 65536;
+ ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+ ifp->if_init = (void *)log_dummy;
+ ifp->if_ioctl = log_dummy;
+ ifp->if_start = ipfw_log_start;
+ ifp->if_output = ipfw_log_output;
+ ifp->if_addrlen = 6;
+ ifp->if_hdrlen = 14;
+ if_attach(ifp);
+ ifp->if_broadcastaddr = ipfwbroadcastaddr;
+ ifp->if_baudrate = IF_Mbps(10);
+ bpfattach(ifp, DLT_EN10MB, 14);
+ log_if = ifp;
+ } else {
+ if (log_if) {
+ ether_ifdetach(log_if);
+ if_free(log_if);
+ }
+ log_if = NULL;
+ }
+}
+#endif /* !WITHOUT_BPF */
+
+/*
+ * We enter here when we have a rule with O_LOG.
+ * XXX this function alone takes about 2Kbytes of code!
+ */
+void
+ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+ struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+ struct ip *ip)
+{
+ char *action;
+ int limit_reached = 0;
+ char action2[40], proto[128], fragment[32];
+
+ if (V_fw_verbose == 0) {
+#ifndef WITHOUT_BPF
+
+ if (log_if == NULL || log_if->if_bpf == NULL)
+ return;
+
+ if (args->eh) /* layer2, use orig hdr */
+ BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m);
+ else
+ /* Add fake header. Later we will store
+ * more info in the header.
+ */
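+			/* "DDDDDDSSSSSS" stands in for the dst/src MAC
+			 * addresses and 0x0800 is ETHERTYPE_IP. */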
+ BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m);
+#endif /* !WITHOUT_BPF */
+ return;
+ }
+ /* the old 'log' function */
+ fragment[0] = '\0';
+ proto[0] = '\0';
+
+ if (f == NULL) { /* bogus pkt */
+ if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit)
+ return;
+ V_norule_counter++;
+ if (V_norule_counter == V_verbose_limit)
+ limit_reached = V_verbose_limit;
+ action = "Refuse";
+ } else { /* O_LOG is the first action, find the real one */
+ ipfw_insn *cmd = ACTION_PTR(f);
+ ipfw_insn_log *l = (ipfw_insn_log *)cmd;
+
+ if (l->max_log != 0 && l->log_left == 0)
+ return;
+ l->log_left--;
+ if (l->log_left == 0)
+ limit_reached = l->max_log;
+ cmd += F_LEN(cmd); /* point to first action */
+ if (cmd->opcode == O_ALTQ) {
+ ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+ snprintf(SNPARGS(action2, 0), "Altq %d",
+ altq->qid);
+ cmd += F_LEN(cmd);
+ }
+ if (cmd->opcode == O_PROB)
+ cmd += F_LEN(cmd);
+
+ if (cmd->opcode == O_TAG)
+ cmd += F_LEN(cmd);
+
+ action = action2;
+ switch (cmd->opcode) {
+ case O_DENY:
+ action = "Deny";
+ break;
+
+ case O_REJECT:
+ if (cmd->arg1==ICMP_REJECT_RST)
+ action = "Reset";
+ else if (cmd->arg1==ICMP_UNREACH_HOST)
+ action = "Reject";
+ else
+ snprintf(SNPARGS(action2, 0), "Unreach %d",
+ cmd->arg1);
+ break;
+
+ case O_UNREACH6:
+ if (cmd->arg1==ICMP6_UNREACH_RST)
+ action = "Reset";
+ else
+ snprintf(SNPARGS(action2, 0), "Unreach %d",
+ cmd->arg1);
+ break;
+
+ case O_ACCEPT:
+ action = "Accept";
+ break;
+ case O_COUNT:
+ action = "Count";
+ break;
+ case O_DIVERT:
+ snprintf(SNPARGS(action2, 0), "Divert %d",
+ cmd->arg1);
+ break;
+ case O_TEE:
+ snprintf(SNPARGS(action2, 0), "Tee %d",
+ cmd->arg1);
+ break;
+ case O_SETFIB:
+ snprintf(SNPARGS(action2, 0), "SetFib %d",
+ cmd->arg1);
+ break;
+ case O_SKIPTO:
+ snprintf(SNPARGS(action2, 0), "SkipTo %d",
+ cmd->arg1);
+ break;
+ case O_PIPE:
+ snprintf(SNPARGS(action2, 0), "Pipe %d",
+ cmd->arg1);
+ break;
+ case O_QUEUE:
+ snprintf(SNPARGS(action2, 0), "Queue %d",
+ cmd->arg1);
+ break;
+ case O_FORWARD_IP: {
+ ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
+ int len;
+ struct in_addr dummyaddr;
+ if (sa->sa.sin_addr.s_addr == INADDR_ANY)
+ dummyaddr.s_addr = htonl(tablearg);
+ else
+ dummyaddr.s_addr = sa->sa.sin_addr.s_addr;
+
+ len = snprintf(SNPARGS(action2, 0), "Forward to %s",
+ inet_ntoa(dummyaddr));
+
+ if (sa->sa.sin_port)
+ snprintf(SNPARGS(action2, len), ":%d",
+ sa->sa.sin_port);
+ }
+ break;
+ case O_NETGRAPH:
+ snprintf(SNPARGS(action2, 0), "Netgraph %d",
+ cmd->arg1);
+ break;
+ case O_NGTEE:
+ snprintf(SNPARGS(action2, 0), "Ngtee %d",
+ cmd->arg1);
+ break;
+ case O_NAT:
+ action = "Nat";
+ break;
+ case O_REASS:
+ action = "Reass";
+ break;
+ default:
+ action = "UNKNOWN";
+ break;
+ }
+ }
+
+ if (hlen == 0) { /* non-ip */
+ snprintf(SNPARGS(proto, 0), "MAC");
+
+ } else {
+ int len;
+#ifdef INET6
+ char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+ char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+ struct icmphdr *icmp;
+ struct tcphdr *tcp;
+ struct udphdr *udp;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+ struct icmp6_hdr *icmp6;
+#endif
+ src[0] = '\0';
+ dst[0] = '\0';
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(&(args->f_id))) {
+ char ip6buf[INET6_ADDRSTRLEN];
+ snprintf(src, sizeof(src), "[%s]",
+ ip6_sprintf(ip6buf, &args->f_id.src_ip6));
+ snprintf(dst, sizeof(dst), "[%s]",
+ ip6_sprintf(ip6buf, &args->f_id.dst_ip6));
+
+ ip6 = (struct ip6_hdr *)ip;
+ tcp = (struct tcphdr *)(((char *)ip) + hlen);
+ udp = (struct udphdr *)(((char *)ip) + hlen);
+ } else
+#endif
+ {
+ tcp = L3HDR(struct tcphdr, ip);
+ udp = L3HDR(struct udphdr, ip);
+
+ inet_ntoa_r(ip->ip_src, src);
+ inet_ntoa_r(ip->ip_dst, dst);
+ }
+
+ switch (args->f_id.proto) {
+ case IPPROTO_TCP:
+ len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
+ if (offset == 0)
+ snprintf(SNPARGS(proto, len), ":%d %s:%d",
+ ntohs(tcp->th_sport),
+ dst,
+ ntohs(tcp->th_dport));
+ else
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+
+ case IPPROTO_UDP:
+ len = snprintf(SNPARGS(proto, 0), "UDP %s", src);
+ if (offset == 0)
+ snprintf(SNPARGS(proto, len), ":%d %s:%d",
+ ntohs(udp->uh_sport),
+ dst,
+ ntohs(udp->uh_dport));
+ else
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+
+ case IPPROTO_ICMP:
+ icmp = L3HDR(struct icmphdr, ip);
+ if (offset == 0)
+ len = snprintf(SNPARGS(proto, 0),
+ "ICMP:%u.%u ",
+ icmp->icmp_type, icmp->icmp_code);
+ else
+ len = snprintf(SNPARGS(proto, 0), "ICMP ");
+ len += snprintf(SNPARGS(proto, len), "%s", src);
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+ icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
+ if (offset == 0)
+ len = snprintf(SNPARGS(proto, 0),
+ "ICMPv6:%u.%u ",
+ icmp6->icmp6_type, icmp6->icmp6_code);
+ else
+ len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
+ len += snprintf(SNPARGS(proto, len), "%s", src);
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+#endif
+ default:
+ len = snprintf(SNPARGS(proto, 0), "P:%d %s",
+ args->f_id.proto, src);
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+ }
+
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(&(args->f_id))) {
+ if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
+ snprintf(SNPARGS(fragment, 0),
+ " (frag %08x:%d@%d%s)",
+ args->f_id.extra,
+ ntohs(ip6->ip6_plen) - hlen,
+ ntohs(offset & IP6F_OFF_MASK) << 3,
+ (offset & IP6F_MORE_FRAG) ? "+" : "");
+ } else
+#endif
+ {
+ int ipoff, iplen;
+ ipoff = ntohs(ip->ip_off);
+ iplen = ntohs(ip->ip_len);
+ if (ipoff & (IP_MF | IP_OFFMASK))
+ snprintf(SNPARGS(fragment, 0),
+ " (frag %d:%d@%d%s)",
+ ntohs(ip->ip_id), iplen - (ip->ip_hl << 2),
+ offset << 3,
+ (ipoff & IP_MF) ? "+" : "");
+ }
+ }
+#ifdef __FreeBSD__
+ if (oif || m->m_pkthdr.rcvif)
+ log(LOG_SECURITY | LOG_INFO,
+ "ipfw: %d %s %s %s via %s%s\n",
+ f ? f->rulenum : -1,
+ action, proto, oif ? "out" : "in",
+ oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
+ fragment);
+ else
+#endif
+ log(LOG_SECURITY | LOG_INFO,
+ "ipfw: %d %s %s [no if info]%s\n",
+ f ? f->rulenum : -1,
+ action, proto, fragment);
+ if (limit_reached)
+ log(LOG_SECURITY | LOG_NOTICE,
+ "ipfw: limit %d reached on entry %d\n",
+ limit_reached, f ? f->rulenum : -1);
+}
+/* end of file */
diff --git a/freebsd/sys/netinet/ipfw/ip_fw_nat.c b/freebsd/sys/netinet/ipfw/ip_fw_nat.c
new file mode 100644
index 00000000..e6c8bcec
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_fw_nat.c
@@ -0,0 +1,606 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2008 Paolo Pisati
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/eventhandler.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/rwlock.h>
+
+#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */
+
+#include <freebsd/netinet/libalias/alias.h>
+#include <freebsd/netinet/libalias/alias_local.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_fw.h>
+#include <freebsd/netinet/ipfw/ip_fw_private.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/udp.h>
+
+#include <freebsd/machine/in_cksum.h> /* XXX for in_cksum */
+
+static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag);
+#define V_ifaddr_event_tag VNET(ifaddr_event_tag)
+
+static void
+ifaddr_change(void *arg __unused, struct ifnet *ifp)
+{
+ struct cfg_nat *ptr;
+ struct ifaddr *ifa;
+ struct ip_fw_chain *chain;
+
+ chain = &V_layer3_chain;
+ IPFW_WLOCK(chain);
+ /* Check every nat entry... */
+ LIST_FOREACH(ptr, &chain->nat, _next) {
+ /* ...using nic 'ifp->if_xname' as dynamic alias address. */
+ if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0)
+ continue;
+ if_addr_rlock(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr == NULL)
+ continue;
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ ptr->ip = ((struct sockaddr_in *)
+ (ifa->ifa_addr))->sin_addr;
+ LibAliasSetAddress(ptr->lib, ptr->ip);
+ }
+ if_addr_runlock(ifp);
+ }
+ IPFW_WUNLOCK(chain);
+}
+
+/*
+ * delete the pointers for nat entry ix, or all of them if ix < 0
+ */
+static void
+flush_nat_ptrs(struct ip_fw_chain *chain, const int ix)
+{
+ int i;
+ ipfw_insn_nat *cmd;
+
+ IPFW_WLOCK_ASSERT(chain);
+ for (i = 0; i < chain->n_rules; i++) {
+ cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]);
+ /* XXX skip log and the like ? */
+ if (cmd->o.opcode == O_NAT && cmd->nat != NULL &&
+ (ix < 0 || cmd->nat->id == ix))
+ cmd->nat = NULL;
+ }
+}
+
+static void
+del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
+{
+ struct cfg_redir *r, *tmp_r;
+ struct cfg_spool *s, *tmp_s;
+ int i, num;
+
+ LIST_FOREACH_SAFE(r, head, _next, tmp_r) {
+ num = 1; /* Number of alias_link to delete. */
+ switch (r->mode) {
+ case REDIR_PORT:
+ num = r->pport_cnt;
+ /* FALLTHROUGH */
+ case REDIR_ADDR:
+ case REDIR_PROTO:
+			/* Delete all libalias redirect entries. */
+ for (i = 0; i < num; i++)
+ LibAliasRedirectDelete(n->lib, r->alink[i]);
+ /* Del spool cfg if any. */
+ LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) {
+ LIST_REMOVE(s, _next);
+ free(s, M_IPFW);
+ }
+ free(r->alink, M_IPFW);
+ LIST_REMOVE(r, _next);
+ free(r, M_IPFW);
+ break;
+ default:
+ printf("unknown redirect mode: %u\n", r->mode);
+ /* XXX - panic?!?!? */
+ break;
+ }
+ }
+}
+
+static int
+add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
+{
+ struct cfg_redir *r, *ser_r;
+ struct cfg_spool *s, *ser_s;
+ int cnt, off, i;
+
+ for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
+ ser_r = (struct cfg_redir *)&buf[off];
+ r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
+ memcpy(r, ser_r, SOF_REDIR);
+ LIST_INIT(&r->spool_chain);
+ off += SOF_REDIR;
+ r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt,
+ M_IPFW, M_WAITOK | M_ZERO);
+ switch (r->mode) {
+ case REDIR_ADDR:
+ r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr,
+ r->paddr);
+ break;
+ case REDIR_PORT:
+ for (i = 0 ; i < r->pport_cnt; i++) {
+ /* If remotePort is all ports, set it to 0. */
+ u_short remotePortCopy = r->rport + i;
+ if (r->rport_cnt == 1 && r->rport == 0)
+ remotePortCopy = 0;
+ r->alink[i] = LibAliasRedirectPort(ptr->lib,
+ r->laddr, htons(r->lport + i), r->raddr,
+ htons(remotePortCopy), r->paddr,
+ htons(r->pport + i), r->proto);
+ if (r->alink[i] == NULL) {
+ r->alink[0] = NULL;
+ break;
+ }
+ }
+ break;
+ case REDIR_PROTO:
+ r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr,
+ r->raddr, r->paddr, r->proto);
+ break;
+ default:
+ printf("unknown redirect mode: %u\n", r->mode);
+ break;
+ }
+ /* XXX perhaps return an error instead of panic ? */
+ if (r->alink[0] == NULL)
+ panic("LibAliasRedirect* returned NULL");
+ /* LSNAT handling. */
+ for (i = 0; i < r->spool_cnt; i++) {
+ ser_s = (struct cfg_spool *)&buf[off];
+ s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
+ memcpy(s, ser_s, SOF_SPOOL);
+ LibAliasAddServer(ptr->lib, r->alink[0],
+ s->addr, htons(s->port));
+ off += SOF_SPOOL;
+ /* Hook spool entry. */
+ LIST_INSERT_HEAD(&r->spool_chain, s, _next);
+ }
+ /* And finally hook this redir entry. */
+ LIST_INSERT_HEAD(&ptr->redir_chain, r, _next);
+ }
+ return (1);
+}
+
+static int
+ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
+{
+ struct mbuf *mcl;
+ struct ip *ip;
+ /* XXX - libalias duct tape */
+ int ldt, retval;
+ char *c;
+
+ ldt = 0;
+ retval = 0;
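+	/* libalias operates on a contiguous buffer, so linearize the
+	 * whole packet into a single mbuf first. */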
+ mcl = m_megapullup(m, m->m_pkthdr.len);
+ if (mcl == NULL) {
+ args->m = NULL;
+ return (IP_FW_DENY);
+ }
+ ip = mtod(mcl, struct ip *);
+
+ /*
+ * XXX - Libalias checksum offload 'duct tape':
+ *
+ * locally generated packets have only pseudo-header checksum
+ * calculated and libalias will break it[1], so mark them for
+ * later fix. Moreover there are cases when libalias modifies
+ * tcp packet data[2], mark them for later fix too.
+ *
+ * [1] libalias was never meant to run in kernel, so it does
+ * not have any knowledge about checksum offloading, and
+ * expects a packet with a full internet checksum.
+ * Unfortunately, packets generated locally will have just the
+ * pseudo header calculated, and when libalias tries to adjust
+ * the checksum it will actually compute a wrong value.
+ *
+ * [2] when libalias modifies tcp's data content, full TCP
+ * checksum has to be recomputed: the problem is that
+ * libalias does not have any idea about checksum offloading.
+	 * but only mark the packets in the th_x2 field. If we receive
+	 * a marked packet, we calculate the correct checksum for it,
+	 * taking offloading into account. Why such a terrible hack
+	 * instead of recalculating the checksum for each packet?
+ * recalculating checksum for each packet?
+ * Because the previous checksum was not checked!
+ * Recalculating checksums for EVERY packet will hide ALL
+ * transmission errors. Yes, marked packets still suffer from
+ * this problem. But, sigh, natd(8) has this problem, too.
+ *
+	 * TODO: make libalias mbuf aware (so that it can handle
+	 * delayed checksums and TSO).
+ */
+
+ if (mcl->m_pkthdr.rcvif == NULL &&
+ mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
+ ldt = 1;
+
+ c = mtod(mcl, char *);
+ if (args->oif == NULL)
+ retval = LibAliasIn(t->lib, c,
+ mcl->m_len + M_TRAILINGSPACE(mcl));
+ else
+ retval = LibAliasOut(t->lib, c,
+ mcl->m_len + M_TRAILINGSPACE(mcl));
+ if (retval == PKT_ALIAS_RESPOND) {
+ m->m_flags |= M_SKIP_FIREWALL;
+ retval = PKT_ALIAS_OK;
+ }
+ if (retval != PKT_ALIAS_OK &&
+ retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) {
+ /* XXX - should i add some logging? */
+ m_free(mcl);
+ args->m = NULL;
+ return (IP_FW_DENY);
+ }
+ mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len);
+
+ /*
+ * XXX - libalias checksum offload
+ * 'duct tape' (see above)
+ */
+
+ if ((ip->ip_off & htons(IP_OFFMASK)) == 0 &&
+ ip->ip_p == IPPROTO_TCP) {
+ struct tcphdr *th;
+
+ th = (struct tcphdr *)(ip + 1);
+ if (th->th_x2)
+ ldt = 1;
+ }
+
+ if (ldt) {
+ struct tcphdr *th;
+ struct udphdr *uh;
+ u_short cksum;
+
+ ip->ip_len = ntohs(ip->ip_len);
+ cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2)));
+
+ switch (ip->ip_p) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)(ip + 1);
+ /*
+ * Maybe it was set in
+ * libalias...
+ */
+ th->th_x2 = 0;
+ th->th_sum = cksum;
+ mcl->m_pkthdr.csum_data =
+ offsetof(struct tcphdr, th_sum);
+ break;
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)(ip + 1);
+ uh->uh_sum = cksum;
+ mcl->m_pkthdr.csum_data =
+ offsetof(struct udphdr, uh_sum);
+ break;
+ }
+ /* No hw checksum offloading: do it ourselves */
+ if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) {
+ in_delayed_cksum(mcl);
+ mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+ ip->ip_len = htons(ip->ip_len);
+ }
+ args->m = mcl;
+ return (IP_FW_NAT);
+}
+
+static struct cfg_nat *
+lookup_nat(struct nat_list *l, int nat_id)
+{
+ struct cfg_nat *res;
+
+ LIST_FOREACH(res, l, _next) {
+ if (res->id == nat_id)
+ break;
+ }
+ return res;
+}
+
+static int
+ipfw_nat_cfg(struct sockopt *sopt)
+{
+ struct cfg_nat *ptr, *ser_n;
+ char *buf;
+ struct ip_fw_chain *chain = &V_layer3_chain;
+
+ buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
+ sooptcopyin(sopt, buf, NAT_BUF_LEN, sizeof(struct cfg_nat));
+ ser_n = (struct cfg_nat *)buf;
+
+ /* check valid parameter ser_n->id > 0 ? */
+ /*
+ * Find/create nat rule.
+ */
+ IPFW_WLOCK(chain);
+ ptr = lookup_nat(&chain->nat, ser_n->id);
+ if (ptr == NULL) {
+ /* New rule: allocate and init new instance. */
+ ptr = malloc(sizeof(struct cfg_nat),
+ M_IPFW, M_NOWAIT | M_ZERO);
+ if (ptr == NULL) {
+ IPFW_WUNLOCK(chain);
+ free(buf, M_IPFW);
+ return (ENOSPC);
+ }
+ ptr->lib = LibAliasInit(NULL);
+ if (ptr->lib == NULL) {
+ IPFW_WUNLOCK(chain);
+ free(ptr, M_IPFW);
+ free(buf, M_IPFW);
+ return (EINVAL);
+ }
+ LIST_INIT(&ptr->redir_chain);
+ } else {
+		/* Entry already present: temporarily unhook it. */
+ LIST_REMOVE(ptr, _next);
+ flush_nat_ptrs(chain, ser_n->id);
+ }
+ IPFW_WUNLOCK(chain);
+
+ /*
+ * Basic nat configuration.
+ */
+ ptr->id = ser_n->id;
+ /*
+	 * XXX - what if this rule doesn't NAT any IP and just
+	 * redirects?  Do we set aliasaddress to 0.0.0.0?
+ */
+ ptr->ip = ser_n->ip;
+ ptr->redir_cnt = ser_n->redir_cnt;
+ ptr->mode = ser_n->mode;
+ LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode);
+ LibAliasSetAddress(ptr->lib, ptr->ip);
+ memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE);
+
+ /*
+ * Redir and LSNAT configuration.
+ */
+ /* Delete old cfgs. */
+ del_redir_spool_cfg(ptr, &ptr->redir_chain);
+ /* Add new entries. */
+ add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr);
+ free(buf, M_IPFW);
+ IPFW_WLOCK(chain);
+ LIST_INSERT_HEAD(&chain->nat, ptr, _next);
+ IPFW_WUNLOCK(chain);
+ return (0);
+}
+
+static int
+ipfw_nat_del(struct sockopt *sopt)
+{
+ struct cfg_nat *ptr;
+ struct ip_fw_chain *chain = &V_layer3_chain;
+ int i;
+
+ sooptcopyin(sopt, &i, sizeof i, sizeof i);
+ /* XXX validate i */
+ IPFW_WLOCK(chain);
+ ptr = lookup_nat(&chain->nat, i);
+ if (ptr == NULL) {
+ IPFW_WUNLOCK(chain);
+ return (EINVAL);
+ }
+ LIST_REMOVE(ptr, _next);
+ flush_nat_ptrs(chain, i);
+ IPFW_WUNLOCK(chain);
+ del_redir_spool_cfg(ptr, &ptr->redir_chain);
+ LibAliasUninit(ptr->lib);
+ free(ptr, M_IPFW);
+ return (0);
+}
+
+static int
+ipfw_nat_get_cfg(struct sockopt *sopt)
+{
+ uint8_t *data;
+ struct cfg_nat *n;
+ struct cfg_redir *r;
+ struct cfg_spool *s;
+ int nat_cnt, off;
+ struct ip_fw_chain *chain;
+ int err = ENOSPC;
+
+ chain = &V_layer3_chain;
+ nat_cnt = 0;
+ off = sizeof(nat_cnt);
+
+ data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
+ IPFW_RLOCK(chain);
+ /* Serialize all the data. */
+ LIST_FOREACH(n, &chain->nat, _next) {
+ nat_cnt++;
+ if (off + SOF_NAT >= NAT_BUF_LEN)
+ goto nospace;
+ bcopy(n, &data[off], SOF_NAT);
+ off += SOF_NAT;
+ LIST_FOREACH(r, &n->redir_chain, _next) {
+ if (off + SOF_REDIR >= NAT_BUF_LEN)
+ goto nospace;
+ bcopy(r, &data[off], SOF_REDIR);
+ off += SOF_REDIR;
+ LIST_FOREACH(s, &r->spool_chain, _next) {
+ if (off + SOF_SPOOL >= NAT_BUF_LEN)
+ goto nospace;
+ bcopy(s, &data[off], SOF_SPOOL);
+ off += SOF_SPOOL;
+ }
+ }
+ }
+ err = 0; /* all good */
+nospace:
+ IPFW_RUNLOCK(chain);
+ if (err == 0) {
+ bcopy(&nat_cnt, data, sizeof(nat_cnt));
+ sooptcopyout(sopt, data, NAT_BUF_LEN);
+ } else {
+		printf("serialized data buffer not big enough: "
+		    "please increase NAT_BUF_LEN\n");
+ }
+ free(data, M_IPFW);
+ return (err);
+}
+
+static int
+ipfw_nat_get_log(struct sockopt *sopt)
+{
+ uint8_t *data;
+ struct cfg_nat *ptr;
+ int i, size;
+ struct ip_fw_chain *chain;
+
+ chain = &V_layer3_chain;
+
+ IPFW_RLOCK(chain);
+ /* one pass to count, one to copy the data */
+ i = 0;
+ LIST_FOREACH(ptr, &chain->nat, _next) {
+ if (ptr->lib->logDesc == NULL)
+ continue;
+ i++;
+ }
+ size = i * (LIBALIAS_BUF_SIZE + sizeof(int));
+ data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO);
+ if (data == NULL) {
+ IPFW_RUNLOCK(chain);
+ return (ENOSPC);
+ }
+ i = 0;
+ LIST_FOREACH(ptr, &chain->nat, _next) {
+ if (ptr->lib->logDesc == NULL)
+ continue;
+ bcopy(&ptr->id, &data[i], sizeof(int));
+ i += sizeof(int);
+ bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE);
+ i += LIBALIAS_BUF_SIZE;
+ }
+ IPFW_RUNLOCK(chain);
+ sooptcopyout(sopt, data, size);
+ free(data, M_IPFW);
+ return(0);
+}
+
+static void
+ipfw_nat_init(void)
+{
+
+ IPFW_WLOCK(&V_layer3_chain);
+ /* init ipfw hooks */
+ ipfw_nat_ptr = ipfw_nat;
+ lookup_nat_ptr = lookup_nat;
+ ipfw_nat_cfg_ptr = ipfw_nat_cfg;
+ ipfw_nat_del_ptr = ipfw_nat_del;
+ ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg;
+ ipfw_nat_get_log_ptr = ipfw_nat_get_log;
+ IPFW_WUNLOCK(&V_layer3_chain);
+ V_ifaddr_event_tag = EVENTHANDLER_REGISTER(
+ ifaddr_event, ifaddr_change,
+ NULL, EVENTHANDLER_PRI_ANY);
+}
+
+static void
+ipfw_nat_destroy(void)
+{
+ struct cfg_nat *ptr, *ptr_temp;
+ struct ip_fw_chain *chain;
+
+ chain = &V_layer3_chain;
+ IPFW_WLOCK(chain);
+ LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) {
+ LIST_REMOVE(ptr, _next);
+ del_redir_spool_cfg(ptr, &ptr->redir_chain);
+ LibAliasUninit(ptr->lib);
+ free(ptr, M_IPFW);
+ }
+ EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag);
+ flush_nat_ptrs(chain, -1 /* flush all */);
+ /* deregister ipfw_nat */
+ ipfw_nat_ptr = NULL;
+ lookup_nat_ptr = NULL;
+ ipfw_nat_cfg_ptr = NULL;
+ ipfw_nat_del_ptr = NULL;
+ ipfw_nat_get_cfg_ptr = NULL;
+ ipfw_nat_get_log_ptr = NULL;
+ IPFW_WUNLOCK(chain);
+}
+
+static int
+ipfw_nat_modevent(module_t mod, int type, void *unused)
+{
+ int err = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ ipfw_nat_init();
+ break;
+
+ case MOD_UNLOAD:
+ ipfw_nat_destroy();
+ break;
+
+	default:
+		return EOPNOTSUPP;
+ }
+ return err;
+}
+
+static moduledata_t ipfw_nat_mod = {
+ "ipfw_nat",
+ ipfw_nat_modevent,
+ 0
+};
+
+DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1);
+MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2);
+MODULE_VERSION(ipfw_nat, 1);
+/* end of file */
diff --git a/freebsd/sys/netinet/ipfw/ip_fw_pfil.c b/freebsd/sys/netinet/ipfw/ip_fw_pfil.c
new file mode 100644
index 00000000..8759f409
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_fw_pfil.c
@@ -0,0 +1,417 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#if !defined(KLD_MODULE)
+#include <freebsd/local/opt_ipfw.h>
+#include <freebsd/local/opt_ipdn.h>
+#include <freebsd/local/opt_inet.h>
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif /* KLD_MODULE */
+#include <freebsd/local/opt_inet6.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/sysctl.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/pfil.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_fw.h>
+#include <freebsd/netinet/ipfw/ip_fw_private.h>
+#include <freebsd/netgraph/ng_ipfw.h>
+
+#include <freebsd/machine/in_cksum.h>
+
+static VNET_DEFINE(int, fw_enable) = 1;
+#define V_fw_enable VNET(fw_enable)
+
+#ifdef INET6
+static VNET_DEFINE(int, fw6_enable) = 1;
+#define V_fw6_enable VNET(fw6_enable)
+#endif
+
+int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
+
+/* Forward declarations. */
+static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int);
+
+#ifdef SYSCTL_NODE
+
+SYSBEGIN(f1)
+
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
+ ipfw_chg_hook, "I", "Enable ipfw");
+#ifdef INET6
+SYSCTL_DECL(_net_inet6_ip6_fw);
+SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
+ ipfw_chg_hook, "I", "Enable ipfw+6");
+#endif /* INET6 */
+
+SYSEND
+
+#endif /* SYSCTL_NODE */
+
+/*
+ * The pfilter hook to pass packets to ipfw_chk and then to
+ * dummynet, divert, netgraph or other modules.
+ * The packet may be consumed.
+ */
+int
+ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+ struct ip_fw_args args;
+ struct m_tag *tag;
+ int ipfw;
+ int ret;
+
+ /* all the processing now uses ip_len in net format */
+ if (mtod(*m0, struct ip *)->ip_v == 4)
+ SET_NET_IPLEN(mtod(*m0, struct ip *));
+
+ /* convert dir to IPFW values */
+ dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT;
+ bzero(&args, sizeof(args));
+
+again:
+ /*
+ * extract and remove the tag if present. If we are left
+ * with onepass, optimize the outgoing path.
+ */
+ tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL);
+ if (tag != NULL) {
+ args.rule = *((struct ipfw_rule_ref *)(tag+1));
+ m_tag_delete(*m0, tag);
+ if (args.rule.info & IPFW_ONEPASS) {
+ SET_HOST_IPLEN(mtod(*m0, struct ip *));
+ return 0;
+ }
+ }
+
+ args.m = *m0;
+ args.oif = dir == DIR_OUT ? ifp : NULL;
+ args.inp = inp;
+
+ ipfw = ipfw_chk(&args);
+ *m0 = args.m;
+
+ KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL",
+ __func__));
+
+ /* breaking out of the switch means drop */
+ ret = 0; /* default return value for pass */
+ switch (ipfw) {
+ case IP_FW_PASS:
+ /* next_hop may be set by ipfw_chk */
+ if (args.next_hop == NULL)
+ break; /* pass */
+#ifndef IPFIREWALL_FORWARD
+ ret = EACCES;
+#else
+ {
+ struct m_tag *fwd_tag;
+
+ /* Incoming packets should not be tagged so we do not
+ * m_tag_find. Outgoing packets may be tagged, so we
+ * reuse the tag if present.
+ */
+ fwd_tag = (dir == DIR_IN) ? NULL :
+ m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL);
+ if (fwd_tag != NULL) {
+ m_tag_unlink(*m0, fwd_tag);
+ } else {
+ fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD,
+ sizeof(struct sockaddr_in), M_NOWAIT);
+ if (fwd_tag == NULL) {
+ ret = EACCES;
+ break; /* i.e. drop */
+ }
+ }
+ bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in));
+ m_tag_prepend(*m0, fwd_tag);
+
+ if (in_localip(args.next_hop->sin_addr))
+ (*m0)->m_flags |= M_FASTFWD_OURS;
+ }
+#endif
+ break;
+
+ case IP_FW_DENY:
+ ret = EACCES;
+ break; /* i.e. drop */
+
+ case IP_FW_DUMMYNET:
+ ret = EACCES;
+ if (ip_dn_io_ptr == NULL)
+ break; /* i.e. drop */
+ if (mtod(*m0, struct ip *)->ip_v == 4)
+ ret = ip_dn_io_ptr(m0, dir, &args);
+ else if (mtod(*m0, struct ip *)->ip_v == 6)
+ ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args);
+ else
+ break; /* drop it */
+ /*
+ * XXX should read the return value.
+ * dummynet normally eats the packet and sets *m0=NULL
+ * unless the packet can be sent immediately. In this
+ * case args is updated and we should re-run the
+ * check without clearing args.
+ */
+ if (*m0 != NULL)
+ goto again;
+ break;
+
+ case IP_FW_TEE:
+ case IP_FW_DIVERT:
+ if (ip_divert_ptr == NULL) {
+ ret = EACCES;
+ break; /* i.e. drop */
+ }
+ ret = ipfw_divert(m0, dir, &args.rule,
+ (ipfw == IP_FW_TEE) ? 1 : 0);
+ /* continue processing for the original packet (tee). */
+ if (*m0)
+ goto again;
+ break;
+
+ case IP_FW_NGTEE:
+ case IP_FW_NETGRAPH:
+ if (ng_ipfw_input_p == NULL) {
+ ret = EACCES;
+ break; /* i.e. drop */
+ }
+ ret = ng_ipfw_input_p(m0, dir, &args,
+ (ipfw == IP_FW_NGTEE) ? 1 : 0);
+ if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */
+ goto again; /* continue with packet */
+ break;
+
+ case IP_FW_NAT:
+ /* honor one-pass in case of successful nat */
+ if (V_fw_one_pass)
+ break; /* ret is already 0 */
+ goto again;
+
+ case IP_FW_REASS:
+ goto again; /* continue with packet */
+
+ default:
+ KASSERT(0, ("%s: unknown retval", __func__));
+ }
+
+ if (ret != 0) {
+ if (*m0)
+ FREE_PKT(*m0);
+ *m0 = NULL;
+ }
+ if (*m0 && mtod(*m0, struct ip *)->ip_v == 4)
+ SET_HOST_IPLEN(mtod(*m0, struct ip *));
+ return ret;
+}
+
+/* Do the divert; return 1 on error, 0 on success. */
+static int
+ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule,
+ int tee)
+{
+ /*
+ * ipfw_chk() has already tagged the packet with the divert tag.
+ * If tee is set, copy packet and return original.
+ * If not tee, consume packet and send it to divert socket.
+ */
+ struct mbuf *clone;
+ struct ip *ip;
+ struct m_tag *tag;
+
+ /* Cloning needed for tee? */
+ if (tee == 0) {
+ clone = *m0; /* use the original mbuf */
+ *m0 = NULL;
+ } else {
+ clone = m_dup(*m0, M_DONTWAIT);
+ /* If we cannot duplicate the mbuf, we sacrifice the divert
+ * chain and continue with the tee-ed packet.
+ */
+ if (clone == NULL)
+ return 1;
+ }
+
+ /*
+ * Divert listeners can normally handle non-fragmented packets,
+	 * but we can only reassemble in the non-tee case.
+ * This means that listeners on a tee rule may get fragments,
+ * and have to live with that.
+ * Note that we now have the 'reass' ipfw option so if we care
+ * we can do it before a 'tee'.
+ */
+ ip = mtod(clone, struct ip *);
+ if (!tee && ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) {
+ int hlen;
+ struct mbuf *reass;
+
+ SET_HOST_IPLEN(ip); /* ip_reass wants host order */
+ reass = ip_reass(clone); /* Reassemble packet. */
+		/* If reass == NULL, the mbuf was consumed by ip_reass(). */
+		if (reass == NULL)
+			return 0; /* not an error */
+ /*
+ * IP header checksum fixup after reassembly and leave header
+ * in network byte order.
+ */
+ ip = mtod(reass, struct ip *);
+ hlen = ip->ip_hl << 2;
+ SET_NET_IPLEN(ip);
+ ip->ip_sum = 0;
+ if (hlen == sizeof(struct ip))
+ ip->ip_sum = in_cksum_hdr(ip);
+ else
+ ip->ip_sum = in_cksum(reass, hlen);
+ clone = reass;
+ }
+ /* attach a tag to the packet with the reinject info */
+ tag = m_tag_alloc(MTAG_IPFW_RULE, 0,
+ sizeof(struct ipfw_rule_ref), M_NOWAIT);
+ if (tag == NULL) {
+ FREE_PKT(clone);
+ return 1;
+ }
+ *((struct ipfw_rule_ref *)(tag+1)) = *rule;
+ m_tag_prepend(clone, tag);
+
+ /* Do the dirty job... */
+ ip_divert_ptr(clone, incoming);
+ return 0;
+}
+
+/*
+ * attach or detach hooks for a given protocol family
+ */
+static int
+ipfw_hook(int onoff, int pf)
+{
+ struct pfil_head *pfh;
+
+ pfh = pfil_head_get(PFIL_TYPE_AF, pf);
+ if (pfh == NULL)
+ return ENOENT;
+
+ (void) (onoff ? pfil_add_hook : pfil_remove_hook)
+ (ipfw_check_hook, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh);
+
+ return 0;
+}
+
+int
+ipfw_attach_hooks(int arg)
+{
+ int error = 0;
+
+ if (arg == 0) /* detach */
+ ipfw_hook(0, AF_INET);
+ else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) {
+ error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */
+ printf("ipfw_hook() error\n");
+ }
+#ifdef INET6
+ if (arg == 0) /* detach */
+ ipfw_hook(0, AF_INET6);
+ else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) {
+ error = ENOENT;
+ printf("ipfw6_hook() error\n");
+ }
+#endif
+ return error;
+}
+
+int
+ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
+{
+ int enable;
+ int oldenable;
+ int error;
+ int af;
+
+ if (arg1 == &VNET_NAME(fw_enable)) {
+ enable = V_fw_enable;
+ af = AF_INET;
+ }
+#ifdef INET6
+ else if (arg1 == &VNET_NAME(fw6_enable)) {
+ enable = V_fw6_enable;
+ af = AF_INET6;
+ }
+#endif
+ else
+ return (EINVAL);
+
+ oldenable = enable;
+
+ error = sysctl_handle_int(oidp, &enable, 0, req);
+
+ if (error)
+ return (error);
+
+ enable = (enable) ? 1 : 0;
+
+ if (enable == oldenable)
+ return (0);
+
+ error = ipfw_hook(enable, af);
+ if (error)
+ return (error);
+ if (af == AF_INET)
+ V_fw_enable = enable;
+#ifdef INET6
+ else if (af == AF_INET6)
+ V_fw6_enable = enable;
+#endif
+
+ return (0);
+}
+/* end of file */
diff --git a/freebsd/sys/netinet/ipfw/ip_fw_private.h b/freebsd/sys/netinet/ipfw/ip_fw_private.h
new file mode 100644
index 00000000..c29ae0ad
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_fw_private.h
@@ -0,0 +1,301 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IPFW2_PRIVATE_H
+#define _IPFW2_PRIVATE_H
+
+/*
+ * Internal constants and data structures used by ipfw components
+ * and not meant to be exported outside the kernel.
+ */
+
+#ifdef _KERNEL
+
+/*
+ * For platforms that do not have SYSCTL support, we wrap the
+ * SYSCTL_* into a function (one per file) to collect the values
+ * into an array at module initialization. The wrapping macros,
+ * SYSBEGIN() and SYSEND, are empty in the default case.
+ */
+#ifndef SYSBEGIN
+#define SYSBEGIN(x)
+#endif
+#ifndef SYSEND
+#define SYSEND
+#endif
+
+/* Return values from ipfw_chk() */
+enum {
+ IP_FW_PASS = 0,
+ IP_FW_DENY,
+ IP_FW_DIVERT,
+ IP_FW_TEE,
+ IP_FW_DUMMYNET,
+ IP_FW_NETGRAPH,
+ IP_FW_NGTEE,
+ IP_FW_NAT,
+ IP_FW_REASS,
+};
+
+/*
+ * Structure for collecting parameters to dummynet for ip6_output forwarding
+ */
+struct _ip6dn_args {
+ struct ip6_pktopts *opt_or;
+ struct route_in6 ro_or;
+ int flags_or;
+ struct ip6_moptions *im6o_or;
+ struct ifnet *origifp_or;
+ struct ifnet *ifp_or;
+ struct sockaddr_in6 dst_or;
+ u_long mtu_or;
+ struct route_in6 ro_pmtu_or;
+};
+
+
+/*
+ * Arguments for calling ipfw_chk() and dummynet_io(). We put them
+ * all into a structure because this way it is easier and more
+ * efficient to pass variables around and extend the interface.
+ */
+struct ip_fw_args {
+ struct mbuf *m; /* the mbuf chain */
+ struct ifnet *oif; /* output interface */
+ struct sockaddr_in *next_hop; /* forward address */
+
+ /*
+ * On return, it points to the matching rule.
+ * On entry, rule.slot > 0 means the info is valid and
+	 * contains the starting rule for an ipfw search.
+ * If chain_id == chain->id && slot >0 then jump to that slot.
+ * Otherwise, we locate the first rule >= rulenum:rule_id
+ */
+ struct ipfw_rule_ref rule; /* match/restart info */
+
+ struct ether_header *eh; /* for bridged packets */
+
+ struct ipfw_flow_id f_id; /* grabbed from IP header */
+ //uint32_t cookie; /* a cookie depending on rule action */
+ struct inpcb *inp;
+
+ struct _ip6dn_args dummypar; /* dummynet->ip6_output */
+ struct sockaddr_in hopstore; /* store here if cannot use a pointer */
+};
+
+MALLOC_DECLARE(M_IPFW);
+
+/*
+ * Hooks sometimes need to know the direction of the packet
+ * (divert, dummynet, netgraph, ...).
+ * We use a generic definition here, with bits 0-1 indicating the
+ * direction, bit 2 indicating layer 2 or 3, and bits 3-4 indicating
+ * the specific protocol (if necessary).
+ */
+enum {
+ DIR_MASK = 0x3,
+ DIR_OUT = 0,
+ DIR_IN = 1,
+ DIR_FWD = 2,
+ DIR_DROP = 3,
+ PROTO_LAYER2 = 0x4, /* set for layer 2 */
+ /* PROTO_DEFAULT = 0, */
+ PROTO_IPV4 = 0x08,
+ PROTO_IPV6 = 0x10,
+ PROTO_IFB = 0x0c, /* layer2 + ifbridge */
+ /* PROTO_OLDBDG = 0x14, unused, old bridge */
+};
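+
+/*
+ * For example, the pfil hook in ip_fw_pfil.c hands an IPv6 packet to
+ * dummynet as
+ *
+ *	ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args);
+ *
+ * where dir has already been set to DIR_IN or DIR_OUT.
+ */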
+
+/* wrapper for freeing a packet, in case we need to do more work */
+#ifndef FREE_PKT
+#if defined(__linux__) || defined(_WIN32)
+#define FREE_PKT(m) netisr_dispatch(-1, m)
+#else
+#define FREE_PKT(m) m_freem(m)
+#endif
+#endif /* !FREE_PKT */
+
+/*
+ * Function definitions.
+ */
+
+/* attach (arg = 1) or detach (arg = 0) hooks */
+int ipfw_attach_hooks(int);
+#ifdef NOTYET
+void ipfw_nat_destroy(void);
+#endif
+
+/* In ip_fw_log.c */
+struct ip;
+void ipfw_log_bpf(int);
+void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+ struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+ struct ip *ip);
+VNET_DECLARE(u_int64_t, norule_counter);
+#define V_norule_counter VNET(norule_counter)
+VNET_DECLARE(int, verbose_limit);
+#define V_verbose_limit VNET(verbose_limit)
+
+/* In ip_fw_dynamic.c */
+
+enum { /* result for matching dynamic rules */
+ MATCH_REVERSE = 0,
+ MATCH_FORWARD,
+ MATCH_NONE,
+ MATCH_UNKNOWN,
+};
+
+/*
+ * The lock for dynamic rules is only used once outside the file,
+ * and only to release the result of lookup_dyn_rule().
+ * Eventually we may implement it with a callback on the function.
+ */
+void ipfw_dyn_unlock(void);
+
+struct tcphdr;
+struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *,
+ u_int32_t, u_int32_t, int);
+int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+ struct ip_fw_args *args, uint32_t tablearg);
+ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt,
+ int *match_direction, struct tcphdr *tcp);
+void ipfw_remove_dyn_children(struct ip_fw *rule);
+void ipfw_get_dynamic(char **bp, const char *ep);
+
+void ipfw_dyn_attach(void); /* uma_zcreate .... */
+void ipfw_dyn_detach(void); /* uma_zdestroy ... */
+void ipfw_dyn_init(void); /* per-vnet initialization */
+void ipfw_dyn_uninit(int); /* per-vnet deinitialization */
+int ipfw_dyn_len(void);
+
+/* common variables */
+VNET_DECLARE(int, fw_one_pass);
+#define V_fw_one_pass VNET(fw_one_pass)
+
+VNET_DECLARE(int, fw_verbose);
+#define V_fw_verbose VNET(fw_verbose)
+
+VNET_DECLARE(struct ip_fw_chain, layer3_chain);
+#define V_layer3_chain VNET(layer3_chain)
+
+VNET_DECLARE(u_int32_t, set_disable);
+#define V_set_disable VNET(set_disable)
+
+VNET_DECLARE(int, autoinc_step);
+#define V_autoinc_step VNET(autoinc_step)
+
+struct ip_fw_chain {
+ struct ip_fw *rules; /* list of rules */
+ struct ip_fw *reap; /* list of rules to reap */
+ struct ip_fw *default_rule;
+ int n_rules; /* number of static rules */
+ int static_len; /* total len of static rules */
+ struct ip_fw **map; /* array of rule ptrs to ease lookup */
+ LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */
+ struct radix_node_head *tables[IPFW_TABLES_MAX];
+#if defined( __linux__ ) || defined( _WIN32 )
+ spinlock_t rwmtx;
+ spinlock_t uh_lock;
+#else
+ struct rwlock rwmtx;
+ struct rwlock uh_lock; /* lock for upper half */
+#endif
+ uint32_t id; /* ruleset id */
+};
+
+struct sockopt; /* used by tcp_var.h */
+
+/*
+ * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c
+ * so the variable and the macros must be here.
+ */
+
+#define IPFW_LOCK_INIT(_chain) do { \
+ rw_init(&(_chain)->rwmtx, "IPFW static rules"); \
+ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \
+ } while (0)
+
+#define IPFW_LOCK_DESTROY(_chain) do { \
+ rw_destroy(&(_chain)->rwmtx); \
+ rw_destroy(&(_chain)->uh_lock); \
+ } while (0)
+
+#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
+
+#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
+#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
+#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
+#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
+
+#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock)
+#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock)
+#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock)
+#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock)
+
+/* In ip_fw_sockopt.c */
+int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
+int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
+int ipfw_ctl(struct sockopt *sopt);
+int ipfw_chk(struct ip_fw_args *args);
+void ipfw_reap_rules(struct ip_fw *head);
+
+/* In ip_fw_pfil */
+int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+ struct inpcb *inp);
+
+/* In ip_fw_table.c */
+struct radix_node;
+int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint32_t *val);
+int ipfw_init_tables(struct ip_fw_chain *ch);
+void ipfw_destroy_tables(struct ip_fw_chain *ch);
+int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl);
+int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint8_t mlen, uint32_t value);
+int ipfw_dump_table_entry(struct radix_node *rn, void *arg);
+int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint8_t mlen);
+int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt);
+int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl);
+
+/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */
+
+extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+
+typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *);
+typedef int ipfw_nat_cfg_t(struct sockopt *);
+
+extern ipfw_nat_t *ipfw_nat_ptr;
+#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL)
+
+extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+#endif /* _KERNEL */
+#endif /* _IPFW2_PRIVATE_H */
diff --git a/freebsd/sys/netinet/ipfw/ip_fw_sockopt.c b/freebsd/sys/netinet/ipfw/ip_fw_sockopt.c
new file mode 100644
index 00000000..6af09905
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_fw_sockopt.c
@@ -0,0 +1,1345 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Supported by: Valeria Paoli
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Sockopt support for ipfw. The routines here implement
+ * the upper half of the ipfw code.
+ */
+
+#if !defined(KLD_MODULE)
+#include <freebsd/local/opt_ipfw.h>
+#include <freebsd/local/opt_ipdivert.h>
+#include <freebsd/local/opt_ipdn.h>
+#include <freebsd/local/opt_inet.h>
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_ipsec.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h> /* struct m_tag used by nested headers */
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip_var.h> /* hooks */
+#include <freebsd/netinet/ip_fw.h>
+#include <freebsd/netinet/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <freebsd/security/mac/mac_framework.h>
+#endif
+
+MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
+
+/*
+ * static variables followed by global ones (none in this file)
+ */
+
+/*
+ * Find the smallest rule >= key, id.
+ * We could use bsearch but it is so simple that we code it directly
+ */
+int
+ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
+{
+ int i, lo, hi;
+ struct ip_fw *r;
+
+ for (lo = 0, hi = chain->n_rules - 1; lo < hi;) {
+ i = (lo + hi) / 2;
+ r = chain->map[i];
+ if (r->rulenum < key)
+ lo = i + 1; /* continue from the next one */
+ else if (r->rulenum > key)
+ hi = i; /* this might be good */
+ else if (r->id < id)
+ lo = i + 1; /* continue from the next one */
+ else /* r->id >= id */
+ hi = i; /* this might be good */
+	}
+ return hi;
+}
+
+/*
+ * Allocate a new map; returns with the chain locked (IPFW_UH_WLOCK).
+ * extra is the number of entries to add or delete.
+ */
+static struct ip_fw **
+get_map(struct ip_fw_chain *chain, int extra, int locked)
+{
+
+ for (;;) {
+ struct ip_fw **map;
+ int i;
+
+ i = chain->n_rules + extra;
+ map = malloc(i * sizeof(struct ip_fw *), M_IPFW,
+ locked ? M_NOWAIT : M_WAITOK);
+ if (map == NULL) {
+ printf("%s: cannot allocate map\n", __FUNCTION__);
+ return NULL;
+ }
+ if (!locked)
+ IPFW_UH_WLOCK(chain);
+ if (i >= chain->n_rules + extra) /* good */
+ return map;
+ /* otherwise we lost the race, free and retry */
+ if (!locked)
+ IPFW_UH_WUNLOCK(chain);
+ free(map, M_IPFW);
+ }
+}
+
+/*
+ * Swap the maps. It is supposed to be called with IPFW_UH_WLOCK held.
+ */
+static struct ip_fw **
+swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
+{
+ struct ip_fw **old_map;
+
+ IPFW_WLOCK(chain);
+ chain->id++;
+ chain->n_rules = new_len;
+ old_map = chain->map;
+ chain->map = new_map;
+ IPFW_WUNLOCK(chain);
+ return old_map;
+}
+
+/*
+ * Add a new rule to the list. Copy the rule into a malloc'ed area, then
+ * possibly create a rule number and add the rule to the list.
+ * Update the rule_number in the input struct so the caller knows it as well.
+ * XXX DO NOT USE FOR THE DEFAULT RULE.
+ * Must be called without IPFW_UH held
+ */
+int
+ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
+{
+ struct ip_fw *rule;
+ int i, l, insert_before;
+ struct ip_fw **map; /* the new array of pointers */
+
+ if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
+ return (EINVAL);
+
+ l = RULESIZE(input_rule);
+ rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
+ if (rule == NULL)
+ return (ENOSPC);
+ /* get_map returns with IPFW_UH_WLOCK if successful */
+ map = get_map(chain, 1, 0 /* not locked */);
+ if (map == NULL) {
+ free(rule, M_IPFW);
+ return ENOSPC;
+ }
+
+ bcopy(input_rule, rule, l);
+ /* clear fields not settable from userland */
+ rule->x_next = NULL;
+ rule->next_rule = NULL;
+ rule->pcnt = 0;
+ rule->bcnt = 0;
+ rule->timestamp = 0;
+
+ if (V_autoinc_step < 1)
+ V_autoinc_step = 1;
+ else if (V_autoinc_step > 1000)
+ V_autoinc_step = 1000;
+ /* find the insertion point, we will insert before */
+ insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE;
+ i = ipfw_find_rule(chain, insert_before, 0);
+ /* duplicate first part */
+ if (i > 0)
+ bcopy(chain->map, map, i * sizeof(struct ip_fw *));
+ map[i] = rule;
+ /* duplicate remaining part, we always have the default rule */
+ bcopy(chain->map + i, map + i + 1,
+ sizeof(struct ip_fw *) *(chain->n_rules - i));
+ if (rule->rulenum == 0) {
+ /* write back the number */
+ rule->rulenum = i > 0 ? map[i-1]->rulenum : 0;
+ if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
+ rule->rulenum += V_autoinc_step;
+ input_rule->rulenum = rule->rulenum;
+ }
+
+ rule->id = chain->id + 1;
+ map = swap_map(chain, map, chain->n_rules + 1);
+ chain->static_len += l;
+ IPFW_UH_WUNLOCK(chain);
+ if (map)
+ free(map, M_IPFW);
+ return (0);
+}
+
+/*
+ * Reclaim storage associated with a list of rules. This is
+ * typically the list created using remove_rule.
+ * A NULL pointer on input is handled correctly.
+ */
+void
+ipfw_reap_rules(struct ip_fw *head)
+{
+ struct ip_fw *rule;
+
+ while ((rule = head) != NULL) {
+ head = head->x_next;
+ free(rule, M_IPFW);
+ }
+}
+
+/*
+ * Used by del_entry() to check if a rule should be kept.
+ * Returns 1 if the rule must be kept, 0 otherwise.
+ *
+ * Called with cmd = {0,1,5}.
+ * cmd == 0 matches on rule numbers, excludes rules in RESVD_SET if n == 0 ;
+ * cmd == 1 matches on set numbers only, rule numbers are ignored;
+ * cmd == 5 matches on rule and set numbers.
+ *
+ * n == 0 is a wildcard for rule numbers, there is no wildcard for sets.
+ *
+ * Rules to keep are
+ * (default || reserved || !match_set || !match_number)
+ * where
+ * default ::= (rule->rulenum == IPFW_DEFAULT_RULE)
+ * // the default rule is always protected
+ *
+ * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET)
+ * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush")
+ *
+ * match_set ::= (cmd == 0 || rule->set == set)
+ * // set number is ignored for cmd == 0
+ *
+ * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum)
+ * // number is ignored for cmd == 1 or n == 0
+ *
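+ * For example, an "ipfw flush" request reaches del_entry() with arg == 0,
+ * i.e. cmd == 0 and n == 0, so every rule is removed except the default
+ * rule and the rules in RESVD_SET.
+ *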
+ */
+static int
+keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n)
+{
+ return
+ (rule->rulenum == IPFW_DEFAULT_RULE) ||
+ (cmd == 0 && n == 0 && rule->set == RESVD_SET) ||
+ !(cmd == 0 || rule->set == set) ||
+ !(cmd == 1 || n == 0 || n == rule->rulenum);
+}
+
+/**
+ * Remove all rules with given number, or do set manipulation.
+ * Assumes chain != NULL && *chain != NULL.
+ *
+ * The argument is a uint32_t. The low 16 bits are the rule or set number;
+ * the next 8 bits are the new set; the top 8 bits indicate the command:
+ *
+ * 0 delete rules numbered "rulenum"
+ * 1 delete rules in set "rulenum"
+ * 2 move rules "rulenum" to set "new_set"
+ * 3 move rules from set "rulenum" to set "new_set"
+ * 4 swap sets "rulenum" and "new_set"
+ * 5 delete rules "rulenum" and set "new_set"
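+ *
+ * For example, with this encoding arg = (2 << 24) | (5 << 16) | 100
+ * moves the rules numbered 100 into set 5, and arg = (1 << 24) | 7
+ * deletes every rule in set 7.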
+ */
+static int
+del_entry(struct ip_fw_chain *chain, uint32_t arg)
+{
+ struct ip_fw *rule;
+ uint32_t num; /* rule number or old_set */
+ uint8_t cmd, new_set;
+ int start, end, i, ofs, n;
+ struct ip_fw **map = NULL;
+ int error = 0;
+
+ num = arg & 0xffff;
+ cmd = (arg >> 24) & 0xff;
+ new_set = (arg >> 16) & 0xff;
+
+ if (cmd > 5 || new_set > RESVD_SET)
+ return EINVAL;
+ if (cmd == 0 || cmd == 2 || cmd == 5) {
+ if (num >= IPFW_DEFAULT_RULE)
+ return EINVAL;
+ } else {
+ if (num > RESVD_SET) /* old_set */
+ return EINVAL;
+ }
+
+ IPFW_UH_WLOCK(chain); /* arbitrate writers */
+ chain->reap = NULL; /* prepare for deletions */
+
+ switch (cmd) {
+ case 0: /* delete rules "num" (num == 0 matches all) */
+ case 1: /* delete all rules in set N */
+ case 5: /* delete rules with number N and set "new_set". */
+
+ /*
+ * Locate first rule to delete (start), the rule after
+ * the last one to delete (end), and count how many
+ * rules to delete (n). Always use keep_rule() to
+ * determine which rules to keep.
+ */
+ n = 0;
+ if (cmd == 1) {
+ /* look for a specific set including RESVD_SET.
+ * Must scan the entire range, ignore num.
+ */
+ new_set = num;
+ for (start = -1, end = i = 0; i < chain->n_rules; i++) {
+ if (keep_rule(chain->map[i], cmd, new_set, 0))
+ continue;
+ if (start < 0)
+ start = i;
+ end = i;
+ n++;
+ }
+ end++; /* first non-matching */
+ } else {
+ /* Optimized search on rule numbers */
+ start = ipfw_find_rule(chain, num, 0);
+ for (end = start; end < chain->n_rules; end++) {
+ rule = chain->map[end];
+ if (num > 0 && rule->rulenum != num)
+ break;
+ if (!keep_rule(rule, cmd, new_set, num))
+ n++;
+ }
+ }
+
+ if (n == 0) {
+			/* A flush request (arg == 0) on an empty ruleset
+			 * returns with no error. By contrast, if there
+			 * is no match on a specific request, we return
+			 * EINVAL.
+ */
+ error = (arg == 0) ? 0 : EINVAL;
+ break;
+ }
+
+ /* We have something to delete. Allocate the new map */
+ map = get_map(chain, -n, 1 /* locked */);
+ if (map == NULL) {
+ error = EINVAL;
+ break;
+ }
+
+ /* 1. bcopy the initial part of the map */
+ if (start > 0)
+ bcopy(chain->map, map, start * sizeof(struct ip_fw *));
+ /* 2. copy active rules between start and end */
+ for (i = ofs = start; i < end; i++) {
+ rule = chain->map[i];
+ if (keep_rule(rule, cmd, new_set, num))
+ map[ofs++] = rule;
+ }
+ /* 3. copy the final part of the map */
+ bcopy(chain->map + end, map + ofs,
+ (chain->n_rules - end) * sizeof(struct ip_fw *));
+ /* 4. swap the maps (under BH_LOCK) */
+ map = swap_map(chain, map, chain->n_rules - n);
+ /* 5. now remove the rules deleted from the old map */
+ for (i = start; i < end; i++) {
+ int l;
+ rule = map[i];
+ if (keep_rule(rule, cmd, new_set, num))
+ continue;
+ l = RULESIZE(rule);
+ chain->static_len -= l;
+ ipfw_remove_dyn_children(rule);
+ rule->x_next = chain->reap;
+ chain->reap = rule;
+ }
+ break;
+
+ /*
+ * In the next 3 cases the loop stops at (n_rules - 1)
+	 * because the default rule is never eligible.
+ */
+
+ case 2: /* move rules with given RULE number to new set */
+ for (i = 0; i < chain->n_rules - 1; i++) {
+ rule = chain->map[i];
+ if (rule->rulenum == num)
+ rule->set = new_set;
+ }
+ break;
+
+ case 3: /* move rules with given SET number to new set */
+ for (i = 0; i < chain->n_rules - 1; i++) {
+ rule = chain->map[i];
+ if (rule->set == num)
+ rule->set = new_set;
+ }
+ break;
+
+ case 4: /* swap two sets */
+ for (i = 0; i < chain->n_rules - 1; i++) {
+ rule = chain->map[i];
+ if (rule->set == num)
+ rule->set = new_set;
+ else if (rule->set == new_set)
+ rule->set = num;
+ }
+ break;
+ }
+
+ rule = chain->reap;
+ chain->reap = NULL;
+ IPFW_UH_WUNLOCK(chain);
+ ipfw_reap_rules(rule);
+ if (map)
+ free(map, M_IPFW);
+ return error;
+}
+
+/*
+ * Clear counters for a specific rule.
+ * Normally run under IPFW_UH_RLOCK, but these are idempotent ops
+ * so we only care that rules do not disappear.
+ */
+static void
+clear_counters(struct ip_fw *rule, int log_only)
+{
+ ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
+
+ if (log_only == 0) {
+ rule->bcnt = rule->pcnt = 0;
+ rule->timestamp = 0;
+ }
+ if (l->o.opcode == O_LOG)
+ l->log_left = l->max_log;
+}
+
+/**
+ * Reset some or all counters on firewall rules.
+ * The argument `arg' is a u_int32_t. The low 16 bits are the rule number,
+ * the next 8 bits are the set number, the top 8 bits are the command:
+ * 0 work with rules from all sets;
+ * 1 work with rules only from the specified set.
+ * The rule number is zero if we want to clear all entries.
+ * log_only is 1 if we only want to reset logs, zero otherwise.
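+ *
+ * For example, with this encoding arg = (1 << 24) | (3 << 16) | 200
+ * clears the counters of rule 200 only if the rule is in set 3, and
+ * arg = 0 clears the counters of every rule.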
+ */
+static int
+zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
+{
+ struct ip_fw *rule;
+ char *msg;
+ int i;
+
+ uint16_t rulenum = arg & 0xffff;
+ uint8_t set = (arg >> 16) & 0xff;
+ uint8_t cmd = (arg >> 24) & 0xff;
+
+ if (cmd > 1)
+ return (EINVAL);
+ if (cmd == 1 && set > RESVD_SET)
+ return (EINVAL);
+
+ IPFW_UH_RLOCK(chain);
+ if (rulenum == 0) {
+ V_norule_counter = 0;
+ for (i = 0; i < chain->n_rules; i++) {
+ rule = chain->map[i];
+ /* Skip rules not in our set. */
+ if (cmd == 1 && rule->set != set)
+ continue;
+ clear_counters(rule, log_only);
+ }
+ msg = log_only ? "All logging counts reset" :
+ "Accounting cleared";
+ } else {
+ int cleared = 0;
+ for (i = 0; i < chain->n_rules; i++) {
+ rule = chain->map[i];
+ if (rule->rulenum == rulenum) {
+ if (cmd == 0 || rule->set == set)
+ clear_counters(rule, log_only);
+ cleared = 1;
+ }
+ if (rule->rulenum > rulenum)
+ break;
+ }
+ if (!cleared) { /* we did not find any matching rules */
+ IPFW_UH_RUNLOCK(chain);
+ return (EINVAL);
+ }
+ msg = log_only ? "logging count reset" : "cleared";
+ }
+ IPFW_UH_RUNLOCK(chain);
+
+ if (V_fw_verbose) {
+ int lev = LOG_SECURITY | LOG_NOTICE;
+
+ if (rulenum)
+ log(lev, "ipfw: Entry %d %s.\n", rulenum, msg);
+ else
+ log(lev, "ipfw: %s.\n", msg);
+ }
+ return (0);
+}
+
+/*
+ * Check validity of the structure before insert.
+ * Rules are simple, so this mostly needs to check rule sizes.
+ */
+static int
+check_ipfw_struct(struct ip_fw *rule, int size)
+{
+ int l, cmdlen = 0;
+ int have_action=0;
+ ipfw_insn *cmd;
+
+ if (size < sizeof(*rule)) {
+ printf("ipfw: rule too short\n");
+ return (EINVAL);
+ }
+ /* first, check for valid size */
+ l = RULESIZE(rule);
+ if (l != size) {
+ printf("ipfw: size mismatch (have %d want %d)\n", size, l);
+ return (EINVAL);
+ }
+ if (rule->act_ofs >= rule->cmd_len) {
+ printf("ipfw: bogus action offset (%u > %u)\n",
+ rule->act_ofs, rule->cmd_len - 1);
+ return (EINVAL);
+ }
+ /*
+ * Now go for the individual checks. Very simple ones, basically only
+ * instruction sizes.
+ */
+ for (l = rule->cmd_len, cmd = rule->cmd ;
+ l > 0 ; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+ if (cmdlen > l) {
+ printf("ipfw: opcode %d size truncated\n",
+ cmd->opcode);
+ return EINVAL;
+ }
+ switch (cmd->opcode) {
+ case O_PROBE_STATE:
+ case O_KEEP_STATE:
+ case O_PROTO:
+ case O_IP_SRC_ME:
+ case O_IP_DST_ME:
+ case O_LAYER2:
+ case O_IN:
+ case O_FRAG:
+ case O_DIVERTED:
+ case O_IPOPT:
+ case O_IPTOS:
+ case O_IPPRECEDENCE:
+ case O_IPVER:
+ case O_TCPWIN:
+ case O_TCPFLAGS:
+ case O_TCPOPTS:
+ case O_ESTAB:
+ case O_VERREVPATH:
+ case O_VERSRCREACH:
+ case O_ANTISPOOF:
+ case O_IPSEC:
+#ifdef INET6
+ case O_IP6_SRC_ME:
+ case O_IP6_DST_ME:
+ case O_EXT_HDR:
+ case O_IP6:
+#endif
+ case O_IP4:
+ case O_TAG:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ break;
+
+ case O_FIB:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ if (cmd->arg1 >= rt_numfibs) {
+ printf("ipfw: invalid fib number %d\n",
+ cmd->arg1);
+ return EINVAL;
+ }
+ break;
+
+ case O_SETFIB:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ if (cmd->arg1 >= rt_numfibs) {
+ printf("ipfw: invalid fib number %d\n",
+ cmd->arg1);
+ return EINVAL;
+ }
+ goto check_action;
+
+ case O_UID:
+ case O_GID:
+ case O_JAIL:
+ case O_IP_SRC:
+ case O_IP_DST:
+ case O_TCPSEQ:
+ case O_TCPACK:
+ case O_PROB:
+ case O_ICMPTYPE:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+ goto bad_size;
+ break;
+
+ case O_LIMIT:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
+ goto bad_size;
+ break;
+
+ case O_LOG:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
+ goto bad_size;
+
+ ((ipfw_insn_log *)cmd)->log_left =
+ ((ipfw_insn_log *)cmd)->max_log;
+
+ break;
+
+ case O_IP_SRC_MASK:
+ case O_IP_DST_MASK:
+ /* only odd command lengths */
+ if ( !(cmdlen & 1) || cmdlen > 31)
+ goto bad_size;
+ break;
+
+ case O_IP_SRC_SET:
+ case O_IP_DST_SET:
+ if (cmd->arg1 == 0 || cmd->arg1 > 256) {
+ printf("ipfw: invalid set size %d\n",
+ cmd->arg1);
+ return EINVAL;
+ }
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+ (cmd->arg1+31)/32 )
+ goto bad_size;
+ break;
+
+ case O_IP_SRC_LOOKUP:
+ case O_IP_DST_LOOKUP:
+ if (cmd->arg1 >= IPFW_TABLES_MAX) {
+ printf("ipfw: invalid table number %d\n",
+ cmd->arg1);
+ return (EINVAL);
+ }
+ if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
+ cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 &&
+ cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+ goto bad_size;
+ break;
+
+ case O_MACADDR2:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
+ goto bad_size;
+ break;
+
+ case O_NOP:
+ case O_IPID:
+ case O_IPTTL:
+ case O_IPLEN:
+ case O_TCPDATALEN:
+ case O_TAGGED:
+ if (cmdlen < 1 || cmdlen > 31)
+ goto bad_size;
+ break;
+
+ case O_MAC_TYPE:
+ case O_IP_SRCPORT:
+ case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
+ if (cmdlen < 2 || cmdlen > 31)
+ goto bad_size;
+ break;
+
+ case O_RECV:
+ case O_XMIT:
+ case O_VIA:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
+ goto bad_size;
+ break;
+
+ case O_ALTQ:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
+ goto bad_size;
+ break;
+
+ case O_PIPE:
+ case O_QUEUE:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ goto check_action;
+
+ case O_FORWARD_IP:
+#ifdef IPFIREWALL_FORWARD
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
+ goto bad_size;
+ goto check_action;
+#else
+ return EINVAL;
+#endif
+
+ case O_DIVERT:
+ case O_TEE:
+ if (ip_divert_ptr == NULL)
+ return EINVAL;
+ else
+ goto check_size;
+ case O_NETGRAPH:
+ case O_NGTEE:
+ if (ng_ipfw_input_p == NULL)
+ return EINVAL;
+ else
+ goto check_size;
+ case O_NAT:
+ if (!IPFW_NAT_LOADED)
+ return EINVAL;
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
+ goto bad_size;
+ goto check_action;
+ case O_FORWARD_MAC: /* XXX not implemented yet */
+ case O_CHECK_STATE:
+ case O_COUNT:
+ case O_ACCEPT:
+ case O_DENY:
+ case O_REJECT:
+#ifdef INET6
+ case O_UNREACH6:
+#endif
+ case O_SKIPTO:
+ case O_REASS:
+check_size:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+check_action:
+ if (have_action) {
+ printf("ipfw: opcode %d, multiple actions"
+ " not allowed\n",
+ cmd->opcode);
+ return EINVAL;
+ }
+ have_action = 1;
+ if (l != cmdlen) {
+ printf("ipfw: opcode %d, action must be"
+ " last opcode\n",
+ cmd->opcode);
+ return EINVAL;
+ }
+ break;
+#ifdef INET6
+ case O_IP6_SRC:
+ case O_IP6_DST:
+ if (cmdlen != F_INSN_SIZE(struct in6_addr) +
+ F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ break;
+
+ case O_FLOW6ID:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+ ((ipfw_insn_u32 *)cmd)->o.arg1)
+ goto bad_size;
+ break;
+
+ case O_IP6_SRC_MASK:
+ case O_IP6_DST_MASK:
+ if ( !(cmdlen & 1) || cmdlen > 127)
+ goto bad_size;
+ break;
+ case O_ICMP6TYPE:
+ if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) )
+ goto bad_size;
+ break;
+#endif
+
+ default:
+ switch (cmd->opcode) {
+#ifndef INET6
+ case O_IP6_SRC_ME:
+ case O_IP6_DST_ME:
+ case O_EXT_HDR:
+ case O_IP6:
+ case O_UNREACH6:
+ case O_IP6_SRC:
+ case O_IP6_DST:
+ case O_FLOW6ID:
+ case O_IP6_SRC_MASK:
+ case O_IP6_DST_MASK:
+ case O_ICMP6TYPE:
+ printf("ipfw: no IPv6 support in kernel\n");
+ return EPROTONOSUPPORT;
+#endif
+ default:
+ printf("ipfw: opcode %d, unknown opcode\n",
+ cmd->opcode);
+ return EINVAL;
+ }
+ }
+ }
+ if (have_action == 0) {
+ printf("ipfw: missing action\n");
+ return EINVAL;
+ }
+ return 0;
+
+bad_size:
+ printf("ipfw: opcode %d size %d wrong\n",
+ cmd->opcode, cmdlen);
+ return EINVAL;
+}
+
+
+/*
+ * Translation of requests for compatibility with FreeBSD 7.2/8.
+ * a static variable tells us if we have an old client from userland,
+ * and if necessary we translate requests and responses between the
+ * two formats.
+ */
+static int is7 = 0;
+
+struct ip_fw7 {
+ struct ip_fw7 *next; /* linked list of rules */
+ struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */
+ /* 'next_rule' is used to pass up 'set_disable' status */
+
+ uint16_t act_ofs; /* offset of action in 32-bit units */
+ uint16_t cmd_len; /* # of 32-bit words in cmd */
+ uint16_t rulenum; /* rule number */
+ uint8_t set; /* rule set (0..31) */
+ // #define RESVD_SET 31 /* set for default and persistent rules */
+ uint8_t _pad; /* padding */
+ // uint32_t id; /* rule id, only in v.8 */
+ /* These fields are present in all rules. */
+ uint64_t pcnt; /* Packet counter */
+ uint64_t bcnt; /* Byte counter */
+ uint32_t timestamp; /* tv_sec of last match */
+
+ ipfw_insn cmd[1]; /* storage for commands */
+};
+
+int convert_rule_to_7(struct ip_fw *rule);
+int convert_rule_to_8(struct ip_fw *rule);
+
+#ifndef RULESIZE7
+#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \
+ ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4)
+#endif
+
+
+/*
+ * Copy the static and dynamic rules to the supplied buffer
+ * and return the amount of space actually used.
+ * Must be run under IPFW_UH_RLOCK
+ */
+static size_t
+ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
+{
+ char *bp = buf;
+ char *ep = bp + space;
+ struct ip_fw *rule, *dst;
+ int l, i;
+ time_t boot_seconds;
+
+ boot_seconds = boottime.tv_sec;
+ for (i = 0; i < chain->n_rules; i++) {
+ rule = chain->map[i];
+
+ if (is7) {
+			/* Convert rule to FreeBSD 7.2 format */
+ l = RULESIZE7(rule);
+ if (bp + l + sizeof(uint32_t) <= ep) {
+ int error;
+ bcopy(rule, bp, l + sizeof(uint32_t));
+ error = convert_rule_to_7((struct ip_fw *) bp);
+ if (error)
+ return 0; /*XXX correct? */
+ /*
+ * XXX HACK. Store the disable mask in the "next"
+ * pointer in a wild attempt to keep the ABI the same.
+ * Why do we do this on EVERY rule?
+ */
+ bcopy(&V_set_disable,
+ &(((struct ip_fw7 *)bp)->next_rule),
+ sizeof(V_set_disable));
+ if (((struct ip_fw7 *)bp)->timestamp)
+ ((struct ip_fw7 *)bp)->timestamp += boot_seconds;
+ bp += l;
+ }
+ continue; /* go to next rule */
+ }
+
+ /* normal mode, don't touch rules */
+ l = RULESIZE(rule);
+ if (bp + l > ep) { /* should not happen */
+ printf("overflow dumping static rules\n");
+ break;
+ }
+ dst = (struct ip_fw *)bp;
+ bcopy(rule, dst, l);
+ /*
+ * XXX HACK. Store the disable mask in the "next"
+ * pointer in a wild attempt to keep the ABI the same.
+ * Why do we do this on EVERY rule?
+ */
+ bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
+ if (dst->timestamp)
+ dst->timestamp += boot_seconds;
+ bp += l;
+ }
+ ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */
+ return (bp - (char *)buf);
+}
+
+
+/**
+ * {set|get}sockopt parser.
+ */
+int
+ipfw_ctl(struct sockopt *sopt)
+{
+#define RULE_MAXSIZE (256*sizeof(u_int32_t))
+ int error;
+ size_t size;
+ struct ip_fw *buf, *rule;
+ struct ip_fw_chain *chain;
+ u_int32_t rulenum[2];
+
+ error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
+ if (error)
+ return (error);
+
+ /*
+ * Disallow modifications in really-really secure mode, but still allow
+ * the logging counters to be reset.
+ */
+ if (sopt->sopt_name == IP_FW_ADD ||
+ (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
+ error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+ if (error)
+ return (error);
+ }
+
+ chain = &V_layer3_chain;
+ error = 0;
+
+ switch (sopt->sopt_name) {
+ case IP_FW_GET:
+ /*
+ * pass up a copy of the current rules. Static rules
+ * come first (the last of which has number IPFW_DEFAULT_RULE),
+		 * followed by a possibly empty list of dynamic rules.
+ * The last dynamic rule has NULL in the "next" field.
+ *
+ * Note that the calculated size is used to bound the
+ * amount of data returned to the user. The rule set may
+ * change between calculating the size and returning the
+ * data in which case we'll just return what fits.
+ */
+ for (;;) {
+ int len = 0, want;
+
+ size = chain->static_len;
+ size += ipfw_dyn_len();
+ if (size >= sopt->sopt_valsize)
+ break;
+ buf = malloc(size, M_TEMP, M_WAITOK);
+ if (buf == NULL)
+ break;
+ IPFW_UH_RLOCK(chain);
+ /* check again how much space we need */
+ want = chain->static_len + ipfw_dyn_len();
+ if (size >= want)
+ len = ipfw_getrules(chain, buf, size);
+ IPFW_UH_RUNLOCK(chain);
+ if (size >= want)
+ error = sooptcopyout(sopt, buf, len);
+ free(buf, M_TEMP);
+ if (size >= want)
+ break;
+ }
+ break;
+
+ case IP_FW_FLUSH:
+ /* locking is done within del_entry() */
+ error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */
+ break;
+
+ case IP_FW_ADD:
+ rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
+ error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
+ sizeof(struct ip_fw7) );
+
+ /*
+		 * If the size of the commands equals RULESIZE7 then we
+		 * assume a FreeBSD 7.2 binary is talking to us (set is7=1).
+		 * is7 is persistent so the next 'ipfw list' command
+		 * will use this format.
+		 * NOTE: if the wrong version is guessed (this can happen
+		 * if the first ipfw command is 'ipfw [pipe] list')
+		 * the ipfw binary may crash or loop infinitely...
+ */
+ if (sopt->sopt_valsize == RULESIZE7(rule)) {
+ is7 = 1;
+ error = convert_rule_to_8(rule);
+ if (error)
+ return error;
+ if (error == 0)
+ error = check_ipfw_struct(rule, RULESIZE(rule));
+ } else {
+ is7 = 0;
+ if (error == 0)
+ error = check_ipfw_struct(rule, sopt->sopt_valsize);
+ }
+ if (error == 0) {
+ /* locking is done within ipfw_add_rule() */
+ error = ipfw_add_rule(chain, rule);
+ size = RULESIZE(rule);
+ if (!error && sopt->sopt_dir == SOPT_GET) {
+ if (is7) {
+ error = convert_rule_to_7(rule);
+ size = RULESIZE7(rule);
+ if (error)
+ return error;
+ }
+ error = sooptcopyout(sopt, rule, size);
+ }
+ }
+ free(rule, M_TEMP);
+ break;
+
+ case IP_FW_DEL:
+ /*
+ * IP_FW_DEL is used for deleting single rules or sets,
+ * and (ab)used to atomically manipulate sets. Argument size
+ * is used to distinguish between the two:
+ * sizeof(u_int32_t)
+ * delete single rule or set of rules,
+ * or reassign rules (or sets) to a different set.
+ * 2*sizeof(u_int32_t)
+ * atomic disable/enable sets.
+ * first u_int32_t contains sets to be disabled,
+ * second u_int32_t contains sets to be enabled.
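+		 *
+		 * For example, a single u_int32_t equal to
+		 * (4 << 24) | (2 << 16) | 1 swaps sets 1 and 2, while the
+		 * pair of values { 1 << 3, 1 << 4 } disables set 3 and
+		 * enables set 4.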
+ */
+ error = sooptcopyin(sopt, rulenum,
+ 2*sizeof(u_int32_t), sizeof(u_int32_t));
+ if (error)
+ break;
+ size = sopt->sopt_valsize;
+ if (size == sizeof(u_int32_t) && rulenum[0] != 0) {
+ /* delete or reassign, locking done in del_entry() */
+ error = del_entry(chain, rulenum[0]);
+ } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */
+ IPFW_UH_WLOCK(chain);
+ V_set_disable =
+ (V_set_disable | rulenum[0]) & ~rulenum[1] &
+ ~(1<<RESVD_SET); /* set RESVD_SET always enabled */
+ IPFW_UH_WUNLOCK(chain);
+ } else
+ error = EINVAL;
+ break;
+
+ case IP_FW_ZERO:
+	case IP_FW_RESETLOG: /* argument is a u_int32_t, the rule number */
+ rulenum[0] = 0;
+ if (sopt->sopt_val != 0) {
+ error = sooptcopyin(sopt, rulenum,
+ sizeof(u_int32_t), sizeof(u_int32_t));
+ if (error)
+ break;
+ }
+ error = zero_entry(chain, rulenum[0],
+ sopt->sopt_name == IP_FW_RESETLOG);
+ break;
+
+ /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/
+ case IP_FW_TABLE_ADD:
+ {
+ ipfw_table_entry ent;
+
+ error = sooptcopyin(sopt, &ent,
+ sizeof(ent), sizeof(ent));
+ if (error)
+ break;
+ error = ipfw_add_table_entry(chain, ent.tbl,
+ ent.addr, ent.masklen, ent.value);
+ }
+ break;
+
+ case IP_FW_TABLE_DEL:
+ {
+ ipfw_table_entry ent;
+
+ error = sooptcopyin(sopt, &ent,
+ sizeof(ent), sizeof(ent));
+ if (error)
+ break;
+ error = ipfw_del_table_entry(chain, ent.tbl,
+ ent.addr, ent.masklen);
+ }
+ break;
+
+ case IP_FW_TABLE_FLUSH:
+ {
+ u_int16_t tbl;
+
+ error = sooptcopyin(sopt, &tbl,
+ sizeof(tbl), sizeof(tbl));
+ if (error)
+ break;
+ IPFW_WLOCK(chain);
+ error = ipfw_flush_table(chain, tbl);
+ IPFW_WUNLOCK(chain);
+ }
+ break;
+
+ case IP_FW_TABLE_GETSIZE:
+ {
+ u_int32_t tbl, cnt;
+
+ if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
+ sizeof(tbl))))
+ break;
+ IPFW_RLOCK(chain);
+ error = ipfw_count_table(chain, tbl, &cnt);
+ IPFW_RUNLOCK(chain);
+ if (error)
+ break;
+ error = sooptcopyout(sopt, &cnt, sizeof(cnt));
+ }
+ break;
+
+ case IP_FW_TABLE_LIST:
+ {
+ ipfw_table *tbl;
+
+ if (sopt->sopt_valsize < sizeof(*tbl)) {
+ error = EINVAL;
+ break;
+ }
+ size = sopt->sopt_valsize;
+ tbl = malloc(size, M_TEMP, M_WAITOK);
+ error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
+ if (error) {
+ free(tbl, M_TEMP);
+ break;
+ }
+ tbl->size = (size - sizeof(*tbl)) /
+ sizeof(ipfw_table_entry);
+ IPFW_RLOCK(chain);
+ error = ipfw_dump_table(chain, tbl);
+ IPFW_RUNLOCK(chain);
+ if (error) {
+ free(tbl, M_TEMP);
+ break;
+ }
+ error = sooptcopyout(sopt, tbl, size);
+ free(tbl, M_TEMP);
+ }
+ break;
+
+ /*--- NAT operations are protected by the IPFW_LOCK ---*/
+ case IP_FW_NAT_CFG:
+ if (IPFW_NAT_LOADED)
+ error = ipfw_nat_cfg_ptr(sopt);
+ else {
+ printf("IP_FW_NAT_CFG: %s\n",
+ "ipfw_nat not present, please load it");
+ error = EINVAL;
+ }
+ break;
+
+ case IP_FW_NAT_DEL:
+ if (IPFW_NAT_LOADED)
+ error = ipfw_nat_del_ptr(sopt);
+ else {
+ printf("IP_FW_NAT_DEL: %s\n",
+ "ipfw_nat not present, please load it");
+ error = EINVAL;
+ }
+ break;
+
+ case IP_FW_NAT_GET_CONFIG:
+ if (IPFW_NAT_LOADED)
+ error = ipfw_nat_get_cfg_ptr(sopt);
+ else {
+ printf("IP_FW_NAT_GET_CFG: %s\n",
+ "ipfw_nat not present, please load it");
+ error = EINVAL;
+ }
+ break;
+
+ case IP_FW_NAT_GET_LOG:
+ if (IPFW_NAT_LOADED)
+ error = ipfw_nat_get_log_ptr(sopt);
+ else {
+ printf("IP_FW_NAT_GET_LOG: %s\n",
+ "ipfw_nat not present, please load it");
+ error = EINVAL;
+ }
+ break;
+
+ default:
+ printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
+ error = EINVAL;
+ }
+
+ return (error);
+#undef RULE_MAXSIZE
+}
+
+
+#define RULE_MAXSIZE (256*sizeof(u_int32_t))
+
+/* Functions to convert rules 7.2 <==> 8.0 */
+int
+convert_rule_to_7(struct ip_fw *rule)
+{
+ /* Used to modify original rule */
+ struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
+ /* copy of original rule, version 8 */
+ struct ip_fw *tmp;
+
+ /* Used to copy commands */
+ ipfw_insn *ccmd, *dst;
+ int ll = 0, ccmdlen = 0;
+
+ tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+ if (tmp == NULL) {
+		return (ENOMEM);	/* XXX proper error code */
+ }
+ bcopy(rule, tmp, RULE_MAXSIZE);
+
+ /* Copy fields */
+ rule7->_pad = tmp->_pad;
+ rule7->set = tmp->set;
+ rule7->rulenum = tmp->rulenum;
+ rule7->cmd_len = tmp->cmd_len;
+ rule7->act_ofs = tmp->act_ofs;
+ rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
+ rule7->next = (struct ip_fw7 *)tmp->x_next;
+ rule7->cmd_len = tmp->cmd_len;
+ rule7->pcnt = tmp->pcnt;
+ rule7->bcnt = tmp->bcnt;
+ rule7->timestamp = tmp->timestamp;
+
+ /* Copy commands */
+ for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
+ ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+ ccmdlen = F_LEN(ccmd);
+
+ bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));
+
+ if (dst->opcode > O_NAT)
+			/* O_REASS does not exist in the 7.2 version, so
+			 * decrement opcodes that come after it.
+			 */
+ dst->opcode--;
+
+ if (ccmdlen > ll) {
+ printf("ipfw: opcode %d size truncated\n",
+ ccmd->opcode);
+			free(tmp, M_TEMP);
+			return (EINVAL);
+ }
+ }
+ free(tmp, M_TEMP);
+
+ return 0;
+}
+
+int
+convert_rule_to_8(struct ip_fw *rule)
+{
+ /* Used to modify original rule */
+ struct ip_fw7 *rule7 = (struct ip_fw7 *) rule;
+
+ /* Used to copy commands */
+ ipfw_insn *ccmd, *dst;
+ int ll = 0, ccmdlen = 0;
+
+ /* Copy of original rule */
+ struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+ if (tmp == NULL) {
+		return (ENOMEM);	/* XXX proper error code */
+ }
+
+ bcopy(rule7, tmp, RULE_MAXSIZE);
+
+ for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ;
+ ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+ ccmdlen = F_LEN(ccmd);
+
+ bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));
+
+ if (dst->opcode > O_NAT)
+			/* O_REASS does not exist in the 7.2 version, so
+			 * increment opcodes that come after it.
+			 */
+ dst->opcode++;
+
+ if (ccmdlen > ll) {
+ printf("ipfw: opcode %d size truncated\n",
+ ccmd->opcode);
+			free(tmp, M_TEMP);
+			return (EINVAL);
+ }
+ }
+
+ rule->_pad = tmp->_pad;
+ rule->set = tmp->set;
+ rule->rulenum = tmp->rulenum;
+ rule->cmd_len = tmp->cmd_len;
+ rule->act_ofs = tmp->act_ofs;
+ rule->next_rule = (struct ip_fw *)tmp->next_rule;
+ rule->x_next = (struct ip_fw *)tmp->next;
+ rule->cmd_len = tmp->cmd_len;
+	rule->id = 0;		/* XXX check whether 0 is a valid id here */
+ rule->pcnt = tmp->pcnt;
+ rule->bcnt = tmp->bcnt;
+ rule->timestamp = tmp->timestamp;
+
+	free(tmp, M_TEMP);
+ return 0;
+}
+
+/* end of file */
diff --git a/freebsd/sys/netinet/ipfw/ip_fw_table.c b/freebsd/sys/netinet/ipfw/ip_fw_table.c
new file mode 100644
index 00000000..39a1dfcc
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_fw_table.c
@@ -0,0 +1,288 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Lookup table support for ipfw
+ *
+ * Lookup tables are implemented (at the moment) using the radix
+ * tree used for routing tables. Tables store key-value entries, where
+ * keys are network prefixes (addr/masklen), and values are integers.
+ * As a degenerate case we can interpret keys as 32-bit integers
+ * (with a /32 mask).
+ *
+ * The table is protected by the IPFW lock even for manipulation coming
+ * from userland, because operations are typically fast.
+ */
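+
+/*
+ * Illustrative sketch (hypothetical chain pointer and values, not part of
+ * the original sources): insert 192.168.0.0/16 with value 42 into table 1,
+ * look up a covered host address, then remove the prefix again:
+ *
+ *	uint32_t v;
+ *
+ *	ipfw_add_table_entry(ch, 1, htonl(0xc0a80000), 16, 42);
+ *	if (ipfw_lookup_table(ch, 1, htonl(0xc0a80107), &v))
+ *		printf("matched, value %u\n", v);	returns 1, v == 42
+ *	ipfw_del_table_entry(ch, 1, htonl(0xc0a80000), 16);
+ */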
+
+#if !defined(KLD_MODULE)
+#include <freebsd/local/opt_ipfw.h>
+#include <freebsd/local/opt_ipdivert.h>
+#include <freebsd/local/opt_ipdn.h>
+#include <freebsd/local/opt_inet.h>
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_ipsec.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/net/if.h> /* ip_fw.h requires IFNAMSIZ */
+#include <freebsd/net/radix.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip_var.h> /* struct ipfw_rule_ref */
+#include <freebsd/netinet/ip_fw.h>
+#include <freebsd/sys/queue.h> /* LIST_HEAD */
+#include <freebsd/netinet/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <freebsd/security/mac/mac_framework.h>
+#endif
+
+MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
+
+struct table_entry {
+ struct radix_node rn[2];
+ struct sockaddr_in addr, mask;
+ u_int32_t value;
+};
+
+/*
+ * The radix code expects addr and mask to be array of bytes,
+ * with the first byte being the length of the array. rn_inithead
+ * is called with the offset in bits of the lookup key within the
+ * array. If we use a sockaddr_in as the underlying type,
+ * sin_len is conveniently located at offset 0, sin_addr is at
+ * offset 4 and normally aligned.
+ * But for portability, let's avoid such assumptions and make the code explicit.
+ */
+#define KEY_LEN(v) *((uint8_t *)&(v))
+#define KEY_OFS (8*offsetof(struct sockaddr_in, sin_addr))
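+
+/*
+ * Worked example (illustrative only): for the prefix 10.0.0.0/8 the entry
+ * built by ipfw_add_table_entry() below ends up with
+ *	KEY_LEN(addr) = KEY_LEN(mask) = 8		key covers up to sin_addr
+ *	mask.sin_addr.s_addr = htonl(0xff000000)	the /8 netmask
+ *	addr.sin_addr.s_addr = htonl(0x0a000000)	10.0.0.0, pre-masked
+ * and the radix code compares 32 bits starting KEY_OFS bits into the key.
+ */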
+
+int
+ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint8_t mlen, uint32_t value)
+{
+ struct radix_node_head *rnh;
+ struct table_entry *ent;
+ struct radix_node *rn;
+
+ if (tbl >= IPFW_TABLES_MAX)
+ return (EINVAL);
+ rnh = ch->tables[tbl];
+ ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
+ if (ent == NULL)
+ return (ENOMEM);
+ ent->value = value;
+ KEY_LEN(ent->addr) = KEY_LEN(ent->mask) = 8;
+ ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
+ ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
+ IPFW_WLOCK(ch);
+ rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent);
+ if (rn == NULL) {
+ IPFW_WUNLOCK(ch);
+ free(ent, M_IPFW_TBL);
+ return (EEXIST);
+ }
+ IPFW_WUNLOCK(ch);
+ return (0);
+}
+
+int
+ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint8_t mlen)
+{
+ struct radix_node_head *rnh;
+ struct table_entry *ent;
+ struct sockaddr_in sa, mask;
+
+ if (tbl >= IPFW_TABLES_MAX)
+ return (EINVAL);
+ rnh = ch->tables[tbl];
+ KEY_LEN(sa) = KEY_LEN(mask) = 8;
+ mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
+ sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
+ IPFW_WLOCK(ch);
+ ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
+ if (ent == NULL) {
+ IPFW_WUNLOCK(ch);
+ return (ESRCH);
+ }
+ IPFW_WUNLOCK(ch);
+ free(ent, M_IPFW_TBL);
+ return (0);
+}
+
+static int
+flush_table_entry(struct radix_node *rn, void *arg)
+{
+ struct radix_node_head * const rnh = arg;
+ struct table_entry *ent;
+
+ ent = (struct table_entry *)
+ rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
+ if (ent != NULL)
+ free(ent, M_IPFW_TBL);
+ return (0);
+}
+
+int
+ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl)
+{
+ struct radix_node_head *rnh;
+
+ IPFW_WLOCK_ASSERT(ch);
+
+ if (tbl >= IPFW_TABLES_MAX)
+ return (EINVAL);
+ rnh = ch->tables[tbl];
+ KASSERT(rnh != NULL, ("NULL IPFW table"));
+ rnh->rnh_walktree(rnh, flush_table_entry, rnh);
+ return (0);
+}
+
+void
+ipfw_destroy_tables(struct ip_fw_chain *ch)
+{
+ uint16_t tbl;
+ struct radix_node_head *rnh;
+
+ IPFW_WLOCK_ASSERT(ch);
+
+ for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) {
+ ipfw_flush_table(ch, tbl);
+ rnh = ch->tables[tbl];
+ rn_detachhead((void **)&rnh);
+ }
+}
+
+int
+ipfw_init_tables(struct ip_fw_chain *ch)
+{
+ int i;
+ uint16_t j;
+
+ for (i = 0; i < IPFW_TABLES_MAX; i++) {
+ if (!rn_inithead((void **)&ch->tables[i], KEY_OFS)) {
+ for (j = 0; j < i; j++) {
+ (void) ipfw_flush_table(ch, j);
+ }
+ return (ENOMEM);
+ }
+ }
+ return (0);
+}
+
+int
+ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint32_t *val)
+{
+ struct radix_node_head *rnh;
+ struct table_entry *ent;
+ struct sockaddr_in sa;
+
+ if (tbl >= IPFW_TABLES_MAX)
+ return (0);
+ rnh = ch->tables[tbl];
+ KEY_LEN(sa) = 8;
+ sa.sin_addr.s_addr = addr;
+ ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh));
+ if (ent != NULL) {
+ *val = ent->value;
+ return (1);
+ }
+ return (0);
+}
+
+static int
+count_table_entry(struct radix_node *rn, void *arg)
+{
+ u_int32_t * const cnt = arg;
+
+ (*cnt)++;
+ return (0);
+}
+
+int
+ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
+{
+ struct radix_node_head *rnh;
+
+ if (tbl >= IPFW_TABLES_MAX)
+ return (EINVAL);
+ rnh = ch->tables[tbl];
+ *cnt = 0;
+ rnh->rnh_walktree(rnh, count_table_entry, cnt);
+ return (0);
+}
+
+static int
+dump_table_entry(struct radix_node *rn, void *arg)
+{
+ struct table_entry * const n = (struct table_entry *)rn;
+ ipfw_table * const tbl = arg;
+ ipfw_table_entry *ent;
+
+ if (tbl->cnt == tbl->size)
+ return (1);
+ ent = &tbl->ent[tbl->cnt];
+ ent->tbl = tbl->tbl;
+ if (in_nullhost(n->mask.sin_addr))
+ ent->masklen = 0;
+ else
+ ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr));
+ ent->addr = n->addr.sin_addr.s_addr;
+ ent->value = n->value;
+ tbl->cnt++;
+ return (0);
+}
+
+int
+ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
+{
+ struct radix_node_head *rnh;
+
+ if (tbl->tbl >= IPFW_TABLES_MAX)
+ return (EINVAL);
+ rnh = ch->tables[tbl->tbl];
+ tbl->cnt = 0;
+ rnh->rnh_walktree(rnh, dump_table_entry, tbl);
+ return (0);
+}
+/* end of file */
diff --git a/freebsd/sys/netinet/libalias/alias.c b/freebsd/sys/netinet/libalias/alias.c
new file mode 100644
index 00000000..e5c5138d
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias.c
@@ -0,0 +1,1793 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ Alias.c provides supervisory control for the functions of the
+ packet aliasing software. It consists of routines to monitor
+ TCP connection state, protocol-specific aliasing routines,
+ fragment handling and the following outside world functional
+ interfaces: SaveFragmentPtr, GetFragmentPtr, FragmentAliasIn,
+ PacketAliasIn and PacketAliasOut.
+
+ The other C program files are briefly described. The data
+ structure framework which holds information needed to translate
+ packets is encapsulated in alias_db.c. Data is accessed by
+ function calls, so other segments of the program need not know
+ about the underlying data structures. Alias_ftp.c contains
+ special code for modifying the ftp PORT command used to establish
+ data connections, while alias_irc.c does the same for IRC
+ DCC. Alias_util.c contains a few utility routines.
+
+ Version 1.0 August, 1996 (cjm)
+
+ Version 1.1 August 20, 1996 (cjm)
+ PPP host accepts incoming connections for ports 0 to 1023.
+ (Gary Roberts pointed out the need to handle incoming
+ connections.)
+
+ Version 1.2 September 7, 1996 (cjm)
+ Fragment handling error in alias_db.c corrected.
+ (Tom Torrance helped fix this problem.)
+
+ Version 1.4 September 16, 1996 (cjm)
+ - A more generalized method for handling incoming
+ connections, without the 0-1023 restriction, is
+ implemented in alias_db.c
+ - Improved ICMP support in alias.c. Traceroute
+ packet streams can now be correctly aliased.
+ - TCP connection closing logic simplified in
+ alias.c and now allows for additional 1 minute
+ "grace period" after FIN or RST is observed.
+
+ Version 1.5 September 17, 1996 (cjm)
+ Corrected error in handling incoming UDP packets with 0 checksum.
+ (Tom Torrance helped fix this problem.)
+
+ Version 1.6 September 18, 1996 (cjm)
+ Simplified ICMP aliasing scheme. Should now support
+ traceroute from Win95 as well as FreeBSD.
+
+ Version 1.7 January 9, 1997 (cjm)
+ - Out-of-order fragment handling.
+ - IP checksum error fixed for ftp transfers
+ from aliasing host.
+ - Integer return codes added to all
+ aliasing/de-aliasing functions.
+ - Some obsolete comments cleaned up.
+ - Differential checksum computations for
+ IP header (TCP, UDP and ICMP were already
+ differential).
+
+ Version 2.1 May 1997 (cjm)
+ - Added support for outgoing ICMP error
+ messages.
+ - Added two functions PacketAliasIn2()
+ and PacketAliasOut2() for dynamic address
+ control (e.g. round-robin allocation of
+ incoming packets).
+
+ Version 2.2 July 1997 (cjm)
+ - Rationalized API function names to begin
+ with "PacketAlias..."
+ - Eliminated PacketAliasIn2() and
+ PacketAliasOut2() as poorly conceived.
+
+ Version 2.3 Dec 1998 (dillon)
+ - Major bounds checking additions, see FreeBSD/CVS
+
+ Version 3.1 May, 2000 (salander)
+ - Added hooks to handle PPTP.
+
+ Version 3.2 July, 2000 (salander and satoh)
+ - Added PacketUnaliasOut routine.
+ - Added hooks to handle RTSP/RTP.
+
+ See HISTORY file for additional revisions.
+*/
+
+#ifdef _KERNEL
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/sysctl.h>
+#else
+#include <freebsd/sys/types.h>
+#include <freebsd/stdlib.h>
+#include <freebsd/stdio.h>
+#include <freebsd/ctype.h>
+#include <freebsd/dlfcn.h>
+#include <freebsd/errno.h>
+#include <freebsd/string.h>
+#endif
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_icmp.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/udp.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias.h>
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/libalias/alias_mod.h>
+#else
+#include <freebsd/err.h>
+#include <freebsd/local/alias.h>
+#include <freebsd/local/alias_local.h>
+#include <freebsd/local/alias_mod.h>
+#endif
+
+/*
+ * Define libalias SYSCTL Node
+ */
+#ifdef SYSCTL_NODE
+
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, alias, CTLFLAG_RW, NULL, "Libalias sysctl API");
+
+#endif
+
+static __inline int
+twowords(void *p)
+{
+ uint8_t *c = p;
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+ uint16_t s1 = ((uint16_t)c[1] << 8) + (uint16_t)c[0];
+ uint16_t s2 = ((uint16_t)c[3] << 8) + (uint16_t)c[2];
+#else
+ uint16_t s1 = ((uint16_t)c[0] << 8) + (uint16_t)c[1];
+ uint16_t s2 = ((uint16_t)c[2] << 8) + (uint16_t)c[3];
+#endif
+ return (s1 + s2);
+}
+
+/* TCP Handling Routines
+
+ TcpMonitorIn() -- These routines monitor TCP connections, and
+ TcpMonitorOut() delete a link when a connection is closed.
+
+These routines look for SYN, FIN and RST flags to determine when TCP
+connections open and close. When a TCP connection closes, the data
+structure containing packet aliasing information is deleted after
+a timeout period.
+*/
+
+/* Local prototypes */
+static void TcpMonitorIn(u_char, struct alias_link *);
+
+static void TcpMonitorOut(u_char, struct alias_link *);
+
+
+static void
+TcpMonitorIn(u_char th_flags, struct alias_link *lnk)
+{
+
+ switch (GetStateIn(lnk)) {
+ case ALIAS_TCP_STATE_NOT_CONNECTED:
+ if (th_flags & TH_RST)
+ SetStateIn(lnk, ALIAS_TCP_STATE_DISCONNECTED);
+ else if (th_flags & TH_SYN)
+ SetStateIn(lnk, ALIAS_TCP_STATE_CONNECTED);
+ break;
+ case ALIAS_TCP_STATE_CONNECTED:
+ if (th_flags & (TH_FIN | TH_RST))
+ SetStateIn(lnk, ALIAS_TCP_STATE_DISCONNECTED);
+ break;
+ }
+}
+
+static void
+TcpMonitorOut(u_char th_flags, struct alias_link *lnk)
+{
+
+ switch (GetStateOut(lnk)) {
+ case ALIAS_TCP_STATE_NOT_CONNECTED:
+ if (th_flags & TH_RST)
+ SetStateOut(lnk, ALIAS_TCP_STATE_DISCONNECTED);
+ else if (th_flags & TH_SYN)
+ SetStateOut(lnk, ALIAS_TCP_STATE_CONNECTED);
+ break;
+ case ALIAS_TCP_STATE_CONNECTED:
+ if (th_flags & (TH_FIN | TH_RST))
+ SetStateOut(lnk, ALIAS_TCP_STATE_DISCONNECTED);
+ break;
+ }
+}
+
+
+
+
+
+/* Protocol Specific Packet Aliasing Routines
+
+ IcmpAliasIn(), IcmpAliasIn1(), IcmpAliasIn2()
+ IcmpAliasOut(), IcmpAliasOut1(), IcmpAliasOut2()
+ ProtoAliasIn(), ProtoAliasOut()
+ UdpAliasIn(), UdpAliasOut()
+ TcpAliasIn(), TcpAliasOut()
+
+These routines handle protocol specific details of packet aliasing.
+One may observe a certain amount of repetitive arithmetic in these
+functions, the purpose of which is to compute a revised checksum
+without actually summing over the entire data packet, which could be
+unnecessarily time consuming.
+
+The purpose of the packet aliasing routines is to replace the source
+address of the outgoing packet and then correctly put it back for
+any incoming packets. For TCP and UDP, ports are also re-mapped.
+
+For ICMP echo/timestamp requests and replies, the following scheme
+is used: the ID number is replaced by an alias for the outgoing
+packet.
+
+ICMP error messages are handled by looking at the IP fragment
+in the data section of the message.
+
+For TCP and UDP protocols, a port number is chosen for an outgoing
+packet, and then incoming packets are identified by IP address and
+port numbers. For TCP packets, there is additional logic in the event
+that sequence and ACK numbers have been altered (as in the case for
+FTP data port commands).
+
+The port numbers used by the packet aliasing module are not true
+ports in the Unix sense. No sockets are actually bound to ports.
+They are more correctly thought of as placeholders.
+
+All packets go through the aliasing mechanism, whether they come from
+the gateway machine or other machines on a local area network.
+*/
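+
+/*
+ * Sketch of the differential checksum idea used throughout the routines
+ * below (simplified; the real work is done by the ADJUST_CHECKSUM() macro
+ * and DifferentialChecksum() from alias_local.h): when a 16-bit field of a
+ * checksummed header changes from old_value to new_value, the existing
+ * ones-complement checksum can be patched incrementally:
+ *
+ *	int accumulate;
+ *
+ *	accumulate  = old_value;		add back what is removed
+ *	accumulate -= new_value;		subtract what is inserted
+ *	ADJUST_CHECKSUM(accumulate, cksum);	fold into the stored checksum
+ *
+ * so no re-summation over the whole packet is needed (cf. RFC 1624).
+ */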
+
+
+/* Local prototypes */
+static int IcmpAliasIn1(struct libalias *, struct ip *);
+static int IcmpAliasIn2(struct libalias *, struct ip *);
+static int IcmpAliasIn(struct libalias *, struct ip *);
+
+static int IcmpAliasOut1(struct libalias *, struct ip *, int create);
+static int IcmpAliasOut2(struct libalias *, struct ip *);
+static int IcmpAliasOut(struct libalias *, struct ip *, int create);
+
+static int ProtoAliasIn(struct libalias *la, struct in_addr ip_src,
+ struct in_addr *ip_dst, u_char ip_p, u_short *ip_sum);
+static int ProtoAliasOut(struct libalias *la, struct in_addr *ip_src,
+ struct in_addr ip_dst, u_char ip_p, u_short *ip_sum,
+ int create);
+
+static int UdpAliasIn(struct libalias *, struct ip *);
+static int UdpAliasOut(struct libalias *, struct ip *, int, int create);
+
+static int TcpAliasIn(struct libalias *, struct ip *);
+static int TcpAliasOut(struct libalias *, struct ip *, int, int create);
+
+
+static int
+IcmpAliasIn1(struct libalias *la, struct ip *pip)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+/*
+ De-alias incoming echo and timestamp replies.
+ Alias incoming echo and timestamp requests.
+*/
+ struct alias_link *lnk;
+ struct icmp *ic;
+
+ ic = (struct icmp *)ip_next(pip);
+
+/* Get source address from ICMP data field and restore original data */
+ lnk = FindIcmpIn(la, pip->ip_src, pip->ip_dst, ic->icmp_id, 1);
+ if (lnk != NULL) {
+ u_short original_id;
+ int accumulate;
+
+ original_id = GetOriginalPort(lnk);
+
+/* Adjust ICMP checksum */
+ accumulate = ic->icmp_id;
+ accumulate -= original_id;
+ ADJUST_CHECKSUM(accumulate, ic->icmp_cksum);
+
+/* Put original sequence number back in */
+ ic->icmp_id = original_id;
+
+/* Put original address back into IP header */
+ {
+ struct in_addr original_address;
+
+ original_address = GetOriginalAddress(lnk);
+ DifferentialChecksum(&pip->ip_sum,
+ &original_address, &pip->ip_dst, 2);
+ pip->ip_dst = original_address;
+ }
+
+ return (PKT_ALIAS_OK);
+ }
+ return (PKT_ALIAS_IGNORED);
+}
+
+static int
+IcmpAliasIn2(struct libalias *la, struct ip *pip)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+/*
+ Alias incoming ICMP error messages containing
+ IP header and first 64 bits of datagram.
+*/
+ struct ip *ip;
+ struct icmp *ic, *ic2;
+ struct udphdr *ud;
+ struct tcphdr *tc;
+ struct alias_link *lnk;
+
+ ic = (struct icmp *)ip_next(pip);
+ ip = &ic->icmp_ip;
+
+ ud = (struct udphdr *)ip_next(ip);
+ tc = (struct tcphdr *)ip_next(ip);
+ ic2 = (struct icmp *)ip_next(ip);
+
+ if (ip->ip_p == IPPROTO_UDP)
+ lnk = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src,
+ ud->uh_dport, ud->uh_sport,
+ IPPROTO_UDP, 0);
+ else if (ip->ip_p == IPPROTO_TCP)
+ lnk = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src,
+ tc->th_dport, tc->th_sport,
+ IPPROTO_TCP, 0);
+ else if (ip->ip_p == IPPROTO_ICMP) {
+ if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP)
+ lnk = FindIcmpIn(la, ip->ip_dst, ip->ip_src, ic2->icmp_id, 0);
+ else
+ lnk = NULL;
+ } else
+ lnk = NULL;
+
+ if (lnk != NULL) {
+ if (ip->ip_p == IPPROTO_UDP || ip->ip_p == IPPROTO_TCP) {
+ int accumulate, accumulate2;
+ struct in_addr original_address;
+ u_short original_port;
+
+ original_address = GetOriginalAddress(lnk);
+ original_port = GetOriginalPort(lnk);
+
+/* Adjust ICMP checksum */
+ accumulate = twowords(&ip->ip_src);
+ accumulate -= twowords(&original_address);
+ accumulate += ud->uh_sport;
+ accumulate -= original_port;
+ accumulate2 = accumulate;
+ accumulate2 += ip->ip_sum;
+ ADJUST_CHECKSUM(accumulate, ip->ip_sum);
+ accumulate2 -= ip->ip_sum;
+ ADJUST_CHECKSUM(accumulate2, ic->icmp_cksum);
+
+/* Un-alias address in IP header */
+ DifferentialChecksum(&pip->ip_sum,
+ &original_address, &pip->ip_dst, 2);
+ pip->ip_dst = original_address;
+
+/* Un-alias address and port number of original IP packet
+fragment contained in ICMP data section */
+ ip->ip_src = original_address;
+ ud->uh_sport = original_port;
+ } else if (ip->ip_p == IPPROTO_ICMP) {
+ int accumulate, accumulate2;
+ struct in_addr original_address;
+ u_short original_id;
+
+ original_address = GetOriginalAddress(lnk);
+ original_id = GetOriginalPort(lnk);
+
+/* Adjust ICMP checksum */
+ accumulate = twowords(&ip->ip_src);
+ accumulate -= twowords(&original_address);
+ accumulate += ic2->icmp_id;
+ accumulate -= original_id;
+ accumulate2 = accumulate;
+ accumulate2 += ip->ip_sum;
+ ADJUST_CHECKSUM(accumulate, ip->ip_sum);
+ accumulate2 -= ip->ip_sum;
+ ADJUST_CHECKSUM(accumulate2, ic->icmp_cksum);
+
+/* Un-alias address in IP header */
+ DifferentialChecksum(&pip->ip_sum,
+ &original_address, &pip->ip_dst, 2);
+ pip->ip_dst = original_address;
+
+/* Un-alias address of original IP packet and sequence number of
+ embedded ICMP datagram */
+ ip->ip_src = original_address;
+ ic2->icmp_id = original_id;
+ }
+ return (PKT_ALIAS_OK);
+ }
+ return (PKT_ALIAS_IGNORED);
+}
+
+
+static int
+IcmpAliasIn(struct libalias *la, struct ip *pip)
+{
+ int iresult;
+ struct icmp *ic;
+
+ LIBALIAS_LOCK_ASSERT(la);
+/* Return if proxy-only mode is enabled */
+ if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)
+ return (PKT_ALIAS_OK);
+
+ ic = (struct icmp *)ip_next(pip);
+
+ iresult = PKT_ALIAS_IGNORED;
+ switch (ic->icmp_type) {
+ case ICMP_ECHOREPLY:
+ case ICMP_TSTAMPREPLY:
+ if (ic->icmp_code == 0) {
+ iresult = IcmpAliasIn1(la, pip);
+ }
+ break;
+ case ICMP_UNREACH:
+ case ICMP_SOURCEQUENCH:
+ case ICMP_TIMXCEED:
+ case ICMP_PARAMPROB:
+ iresult = IcmpAliasIn2(la, pip);
+ break;
+ case ICMP_ECHO:
+ case ICMP_TSTAMP:
+ iresult = IcmpAliasIn1(la, pip);
+ break;
+ }
+ return (iresult);
+}
+
+
+static int
+IcmpAliasOut1(struct libalias *la, struct ip *pip, int create)
+{
+/*
+ Alias outgoing echo and timestamp requests.
+ De-alias outgoing echo and timestamp replies.
+*/
+ struct alias_link *lnk;
+ struct icmp *ic;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ ic = (struct icmp *)ip_next(pip);
+
+/* Save overwritten data for when echo packet returns */
+ lnk = FindIcmpOut(la, pip->ip_src, pip->ip_dst, ic->icmp_id, create);
+ if (lnk != NULL) {
+ u_short alias_id;
+ int accumulate;
+
+ alias_id = GetAliasPort(lnk);
+
+/* Since data field is being modified, adjust ICMP checksum */
+ accumulate = ic->icmp_id;
+ accumulate -= alias_id;
+ ADJUST_CHECKSUM(accumulate, ic->icmp_cksum);
+
+/* Alias sequence number */
+ ic->icmp_id = alias_id;
+
+/* Change source address */
+ {
+ struct in_addr alias_address;
+
+ alias_address = GetAliasAddress(lnk);
+ DifferentialChecksum(&pip->ip_sum,
+ &alias_address, &pip->ip_src, 2);
+ pip->ip_src = alias_address;
+ }
+
+ return (PKT_ALIAS_OK);
+ }
+ return (PKT_ALIAS_IGNORED);
+}
+
+
+static int
+IcmpAliasOut2(struct libalias *la, struct ip *pip)
+{
+/*
+ Alias outgoing ICMP error messages containing
+ IP header and first 64 bits of datagram.
+*/
+ struct ip *ip;
+ struct icmp *ic, *ic2;
+ struct udphdr *ud;
+ struct tcphdr *tc;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ ic = (struct icmp *)ip_next(pip);
+ ip = &ic->icmp_ip;
+
+ ud = (struct udphdr *)ip_next(ip);
+ tc = (struct tcphdr *)ip_next(ip);
+ ic2 = (struct icmp *)ip_next(ip);
+
+ if (ip->ip_p == IPPROTO_UDP)
+ lnk = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src,
+ ud->uh_dport, ud->uh_sport,
+ IPPROTO_UDP, 0);
+ else if (ip->ip_p == IPPROTO_TCP)
+ lnk = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src,
+ tc->th_dport, tc->th_sport,
+ IPPROTO_TCP, 0);
+ else if (ip->ip_p == IPPROTO_ICMP) {
+ if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP)
+ lnk = FindIcmpOut(la, ip->ip_dst, ip->ip_src, ic2->icmp_id, 0);
+ else
+ lnk = NULL;
+ } else
+ lnk = NULL;
+
+ if (lnk != NULL) {
+ if (ip->ip_p == IPPROTO_UDP || ip->ip_p == IPPROTO_TCP) {
+ int accumulate;
+ struct in_addr alias_address;
+ u_short alias_port;
+
+ alias_address = GetAliasAddress(lnk);
+ alias_port = GetAliasPort(lnk);
+
+/* Adjust ICMP checksum */
+ accumulate = twowords(&ip->ip_dst);
+ accumulate -= twowords(&alias_address);
+ accumulate += ud->uh_dport;
+ accumulate -= alias_port;
+ ADJUST_CHECKSUM(accumulate, ic->icmp_cksum);
+
+/*
+ * Alias address in IP header if it comes from the host
+ * the original TCP/UDP packet was destined for.
+ */
+ if (pip->ip_src.s_addr == ip->ip_dst.s_addr) {
+ DifferentialChecksum(&pip->ip_sum,
+ &alias_address, &pip->ip_src, 2);
+ pip->ip_src = alias_address;
+ }
+/* Alias address and port number of original IP packet
+fragment contained in ICMP data section */
+ ip->ip_dst = alias_address;
+ ud->uh_dport = alias_port;
+ } else if (ip->ip_p == IPPROTO_ICMP) {
+ int accumulate;
+ struct in_addr alias_address;
+ u_short alias_id;
+
+ alias_address = GetAliasAddress(lnk);
+ alias_id = GetAliasPort(lnk);
+
+/* Adjust ICMP checksum */
+ accumulate = twowords(&ip->ip_dst);
+ accumulate -= twowords(&alias_address);
+ accumulate += ic2->icmp_id;
+ accumulate -= alias_id;
+ ADJUST_CHECKSUM(accumulate, ic->icmp_cksum);
+
+/*
+ * Alias address in IP header if it comes from the host
+ * the original ICMP message was destined for.
+ */
+ if (pip->ip_src.s_addr == ip->ip_dst.s_addr) {
+ DifferentialChecksum(&pip->ip_sum,
+ &alias_address, &pip->ip_src, 2);
+ pip->ip_src = alias_address;
+ }
+/* Alias address of original IP packet and sequence number of
+ embedded ICMP datagram */
+ ip->ip_dst = alias_address;
+ ic2->icmp_id = alias_id;
+ }
+ return (PKT_ALIAS_OK);
+ }
+ return (PKT_ALIAS_IGNORED);
+}
+
+
+static int
+IcmpAliasOut(struct libalias *la, struct ip *pip, int create)
+{
+ int iresult;
+ struct icmp *ic;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ (void)create;
+
+/* Return if proxy-only mode is enabled */
+ if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)
+ return (PKT_ALIAS_OK);
+
+ ic = (struct icmp *)ip_next(pip);
+
+ iresult = PKT_ALIAS_IGNORED;
+ switch (ic->icmp_type) {
+ case ICMP_ECHO:
+ case ICMP_TSTAMP:
+ if (ic->icmp_code == 0) {
+ iresult = IcmpAliasOut1(la, pip, create);
+ }
+ break;
+ case ICMP_UNREACH:
+ case ICMP_SOURCEQUENCH:
+ case ICMP_TIMXCEED:
+ case ICMP_PARAMPROB:
+ iresult = IcmpAliasOut2(la, pip);
+ break;
+ case ICMP_ECHOREPLY:
+ case ICMP_TSTAMPREPLY:
+ iresult = IcmpAliasOut1(la, pip, create);
+ }
+ return (iresult);
+}
+
+static int
+ProtoAliasIn(struct libalias *la, struct in_addr ip_src,
+ struct in_addr *ip_dst, u_char ip_p, u_short *ip_sum)
+{
+/*
+ Handle incoming IP packets. The
+ only thing which is done in this case is to alias
+ the dest IP address of the packet to our inside
+ machine.
+*/
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+/* Return if proxy-only mode is enabled */
+ if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)
+ return (PKT_ALIAS_OK);
+
+ lnk = FindProtoIn(la, ip_src, *ip_dst, ip_p);
+ if (lnk != NULL) {
+ struct in_addr original_address;
+
+ original_address = GetOriginalAddress(lnk);
+
+/* Restore original IP address */
+ DifferentialChecksum(ip_sum,
+ &original_address, ip_dst, 2);
+ *ip_dst = original_address;
+
+ return (PKT_ALIAS_OK);
+ }
+ return (PKT_ALIAS_IGNORED);
+}
+
+static int
+ProtoAliasOut(struct libalias *la, struct in_addr *ip_src,
+ struct in_addr ip_dst, u_char ip_p, u_short *ip_sum, int create)
+{
+/*
+ Handle outgoing IP packets. The
+ only thing which is done in this case is to alias
+ the source IP address of the packet.
+*/
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ (void)create;
+
+/* Return if proxy-only mode is enabled */
+ if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)
+ return (PKT_ALIAS_OK);
+
+ lnk = FindProtoOut(la, *ip_src, ip_dst, ip_p);
+ if (lnk != NULL) {
+ struct in_addr alias_address;
+
+ alias_address = GetAliasAddress(lnk);
+
+/* Change source address */
+ DifferentialChecksum(ip_sum,
+ &alias_address, ip_src, 2);
+ *ip_src = alias_address;
+
+ return (PKT_ALIAS_OK);
+ }
+ return (PKT_ALIAS_IGNORED);
+}
+
+
+static int
+UdpAliasIn(struct libalias *la, struct ip *pip)
+{
+ struct udphdr *ud;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+
+ ud = (struct udphdr *)ip_next(pip);
+
+ lnk = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst,
+ ud->uh_sport, ud->uh_dport,
+ IPPROTO_UDP, !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY));
+ if (lnk != NULL) {
+ struct in_addr alias_address;
+ struct in_addr original_address;
+ struct in_addr proxy_address;
+ u_short alias_port;
+ u_short proxy_port;
+ int accumulate;
+ int error;
+ struct alias_data ad = {
+ .lnk = lnk,
+ .oaddr = &original_address,
+ .aaddr = &alias_address,
+ .aport = &alias_port,
+ .sport = &ud->uh_sport,
+ .dport = &ud->uh_dport,
+ .maxpktsize = 0
+ };
+
+ alias_address = GetAliasAddress(lnk);
+ original_address = GetOriginalAddress(lnk);
+ proxy_address = GetProxyAddress(lnk);
+ alias_port = ud->uh_dport;
+ ud->uh_dport = GetOriginalPort(lnk);
+ proxy_port = GetProxyPort(lnk);
+
+ /* Walk out chain. */
+ error = find_handler(IN, UDP, la, pip, &ad);
+ /* If we cannot figure out the packet, ignore it. */
+ if (error < 0)
+ return (PKT_ALIAS_IGNORED);
+
+/* If UDP checksum is not zero, then adjust since destination port */
+/* is being unaliased and destination address is being altered. */
+ if (ud->uh_sum != 0) {
+ accumulate = alias_port;
+ accumulate -= ud->uh_dport;
+ accumulate += twowords(&alias_address);
+ accumulate -= twowords(&original_address);
+
+/* If this is a proxy packet, modify checksum because of source change.*/
+ if (proxy_port != 0) {
+ accumulate += ud->uh_sport;
+ accumulate -= proxy_port;
+ }
+
+ if (proxy_address.s_addr != 0) {
+ accumulate += twowords(&pip->ip_src);
+ accumulate -= twowords(&proxy_address);
+ }
+
+ ADJUST_CHECKSUM(accumulate, ud->uh_sum);
+ }
+/* XXX: Could the two if's below be concatenated to one ? */
+/* Restore source port and/or address in case of proxying*/
+
+ if (proxy_port != 0)
+ ud->uh_sport = proxy_port;
+
+ if (proxy_address.s_addr != 0) {
+ DifferentialChecksum(&pip->ip_sum,
+ &proxy_address, &pip->ip_src, 2);
+ pip->ip_src = proxy_address;
+ }
+
+/* Restore original IP address */
+ DifferentialChecksum(&pip->ip_sum,
+ &original_address, &pip->ip_dst, 2);
+ pip->ip_dst = original_address;
+
+ return (PKT_ALIAS_OK);
+ }
+ return (PKT_ALIAS_IGNORED);
+}
+
+static int
+UdpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create)
+{
+ struct udphdr *ud;
+ struct alias_link *lnk;
+ struct in_addr dest_address;
+ struct in_addr proxy_server_address;
+ u_short dest_port;
+ u_short proxy_server_port;
+ int proxy_type;
+ int error;
+
+ LIBALIAS_LOCK_ASSERT(la);
+
+/* Return if proxy-only mode is enabled and no proxy rule was found. */
+ ud = (struct udphdr *)ip_next(pip);
+ proxy_type = ProxyCheck(la, &proxy_server_address,
+ &proxy_server_port, pip->ip_src, pip->ip_dst,
+ ud->uh_dport, pip->ip_p);
+ if (proxy_type == 0 && (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY))
+ return (PKT_ALIAS_OK);
+
+/* If this is a transparent proxy, save original destination,
+ * then alter the destination and adjust checksums */
+ dest_port = ud->uh_dport;
+ dest_address = pip->ip_dst;
+
+ if (proxy_type != 0) {
+ int accumulate;
+
+ accumulate = twowords(&pip->ip_dst);
+ accumulate -= twowords(&proxy_server_address);
+
+ ADJUST_CHECKSUM(accumulate, pip->ip_sum);
+
+ if (ud->uh_sum != 0) {
+ accumulate = twowords(&pip->ip_dst);
+ accumulate -= twowords(&proxy_server_address);
+ accumulate += ud->uh_dport;
+ accumulate -= proxy_server_port;
+ ADJUST_CHECKSUM(accumulate, ud->uh_sum);
+ }
+ pip->ip_dst = proxy_server_address;
+ ud->uh_dport = proxy_server_port;
+ }
+ lnk = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst,
+ ud->uh_sport, ud->uh_dport,
+ IPPROTO_UDP, create);
+ if (lnk != NULL) {
+ u_short alias_port;
+ struct in_addr alias_address;
+ struct alias_data ad = {
+ .lnk = lnk,
+ .oaddr = NULL,
+ .aaddr = &alias_address,
+ .aport = &alias_port,
+ .sport = &ud->uh_sport,
+ .dport = &ud->uh_dport,
+ .maxpktsize = 0
+ };
+
+/* Save original destination address, if this is a proxy packet.
+ * Also modify packet to include destination encoding. This may
+ * change the size of IP header. */
+ if (proxy_type != 0) {
+ SetProxyPort(lnk, dest_port);
+ SetProxyAddress(lnk, dest_address);
+ ProxyModify(la, lnk, pip, maxpacketsize, proxy_type);
+ ud = (struct udphdr *)ip_next(pip);
+ }
+
+ alias_address = GetAliasAddress(lnk);
+ alias_port = GetAliasPort(lnk);
+
+ /* Walk out chain. */
+ error = find_handler(OUT, UDP, la, pip, &ad);
+
+/* If UDP checksum is not zero, adjust since source port is */
+/* being aliased and source address is being altered */
+ if (ud->uh_sum != 0) {
+ int accumulate;
+
+ accumulate = ud->uh_sport;
+ accumulate -= alias_port;
+ accumulate += twowords(&pip->ip_src);
+ accumulate -= twowords(&alias_address);
+ ADJUST_CHECKSUM(accumulate, ud->uh_sum);
+ }
+/* Put alias port in UDP header */
+ ud->uh_sport = alias_port;
+
+/* Change source address */
+ DifferentialChecksum(&pip->ip_sum,
+ &alias_address, &pip->ip_src, 2);
+ pip->ip_src = alias_address;
+
+ return (PKT_ALIAS_OK);
+ }
+ return (PKT_ALIAS_IGNORED);
+}
+
+
+
+static int
+TcpAliasIn(struct libalias *la, struct ip *pip)
+{
+ struct tcphdr *tc;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ tc = (struct tcphdr *)ip_next(pip);
+
+ lnk = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst,
+ tc->th_sport, tc->th_dport,
+ IPPROTO_TCP,
+ !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY));
+ if (lnk != NULL) {
+ struct in_addr alias_address;
+ struct in_addr original_address;
+ struct in_addr proxy_address;
+ u_short alias_port;
+ u_short proxy_port;
+ int accumulate, error;
+
+ /*
+		 * Most of the variables are initialized a bit further below,
+		 * but the PPTP handler (aliashandlepptpin) needs the
+		 * destination port as it arrived in the packet, not the
+		 * original one; see [*] below.
+ */
+
+ struct alias_data ad = {
+ .lnk = lnk,
+ .oaddr = NULL,
+ .aaddr = NULL,
+ .aport = NULL,
+ .sport = &tc->th_sport,
+ .dport = &tc->th_dport,
+ .maxpktsize = 0
+ };
+
+ /* Walk out chain. */
+ error = find_handler(IN, TCP, la, pip, &ad);
+
+ alias_address = GetAliasAddress(lnk);
+ original_address = GetOriginalAddress(lnk);
+ proxy_address = GetProxyAddress(lnk);
+ alias_port = tc->th_dport;
+ tc->th_dport = GetOriginalPort(lnk);
+ proxy_port = GetProxyPort(lnk);
+
+ /*
+		 * See the note above: if a find_handler() call is ever added
+		 * AFTER this point, alias_data must be rebuilt as well;
+		 * uncommenting the block below should be enough.
+ */
+#if 0
+ struct alias_data ad = {
+ .lnk = lnk,
+ .oaddr = &original_address,
+ .aaddr = &alias_address,
+ .aport = &alias_port,
+ .sport = &ud->uh_sport,
+ .dport = &ud->uh_dport,
+ .maxpktsize = 0
+ };
+
+ /* Walk out chain. */
+ error = find_handler(la, pip, &ad);
+ if (error == EHDNOF)
+ printf("Protocol handler not found\n");
+#endif
+
+/* Adjust TCP checksum since destination port is being unaliased */
+/* and destination port is being altered. */
+ accumulate = alias_port;
+ accumulate -= tc->th_dport;
+ accumulate += twowords(&alias_address);
+ accumulate -= twowords(&original_address);
+
+/* If this is a proxy, then modify the TCP source port and
+ checksum accumulation */
+ if (proxy_port != 0) {
+ accumulate += tc->th_sport;
+ tc->th_sport = proxy_port;
+ accumulate -= tc->th_sport;
+ accumulate += twowords(&pip->ip_src);
+ accumulate -= twowords(&proxy_address);
+ }
+/* See if ACK number needs to be modified */
+ if (GetAckModified(lnk) == 1) {
+ int delta;
+
+ tc = (struct tcphdr *)ip_next(pip);
+ delta = GetDeltaAckIn(tc->th_ack, lnk);
+ if (delta != 0) {
+ accumulate += twowords(&tc->th_ack);
+ tc->th_ack = htonl(ntohl(tc->th_ack) - delta);
+ accumulate -= twowords(&tc->th_ack);
+ }
+ }
+ ADJUST_CHECKSUM(accumulate, tc->th_sum);
+
+/* Restore original IP address */
+ accumulate = twowords(&pip->ip_dst);
+ pip->ip_dst = original_address;
+ accumulate -= twowords(&pip->ip_dst);
+
+/* If this is a transparent proxy packet, then modify the source
+ address */
+ if (proxy_address.s_addr != 0) {
+ accumulate += twowords(&pip->ip_src);
+ pip->ip_src = proxy_address;
+ accumulate -= twowords(&pip->ip_src);
+ }
+ ADJUST_CHECKSUM(accumulate, pip->ip_sum);
+
+/* Monitor TCP connection state */
+ tc = (struct tcphdr *)ip_next(pip);
+ TcpMonitorIn(tc->th_flags, lnk);
+
+ return (PKT_ALIAS_OK);
+ }
+ return (PKT_ALIAS_IGNORED);
+}
+
+static int
+TcpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create)
+{
+ int proxy_type, error;
+ u_short dest_port;
+ u_short proxy_server_port;
+ struct in_addr dest_address;
+ struct in_addr proxy_server_address;
+ struct tcphdr *tc;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ tc = (struct tcphdr *)ip_next(pip);
+
+ if (create)
+ proxy_type = ProxyCheck(la, &proxy_server_address,
+ &proxy_server_port, pip->ip_src, pip->ip_dst,
+ tc->th_dport, pip->ip_p);
+ else
+ proxy_type = 0;
+
+ if (proxy_type == 0 && (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY))
+ return (PKT_ALIAS_OK);
+
+/* If this is a transparent proxy, save original destination,
+ then alter the destination and adjust checksums */
+ dest_port = tc->th_dport;
+ dest_address = pip->ip_dst;
+ if (proxy_type != 0) {
+ int accumulate;
+
+ accumulate = tc->th_dport;
+ tc->th_dport = proxy_server_port;
+ accumulate -= tc->th_dport;
+ accumulate += twowords(&pip->ip_dst);
+ accumulate -= twowords(&proxy_server_address);
+ ADJUST_CHECKSUM(accumulate, tc->th_sum);
+
+ accumulate = twowords(&pip->ip_dst);
+ pip->ip_dst = proxy_server_address;
+ accumulate -= twowords(&pip->ip_dst);
+ ADJUST_CHECKSUM(accumulate, pip->ip_sum);
+ }
+ lnk = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst,
+ tc->th_sport, tc->th_dport,
+ IPPROTO_TCP, create);
+	if (lnk != NULL) {
+ u_short alias_port;
+ struct in_addr alias_address;
+ int accumulate;
+ struct alias_data ad = {
+ .lnk = lnk,
+ .oaddr = NULL,
+ .aaddr = &alias_address,
+ .aport = &alias_port,
+ .sport = &tc->th_sport,
+ .dport = &tc->th_dport,
+ .maxpktsize = maxpacketsize
+ };
+
+/* Save original destination address, if this is a proxy packet.
+ Also modify packet to include destination encoding. This may
+ change the size of IP header. */
+ if (proxy_type != 0) {
+ SetProxyPort(lnk, dest_port);
+ SetProxyAddress(lnk, dest_address);
+ ProxyModify(la, lnk, pip, maxpacketsize, proxy_type);
+ tc = (struct tcphdr *)ip_next(pip);
+ }
+/* Get alias address and port */
+ alias_port = GetAliasPort(lnk);
+ alias_address = GetAliasAddress(lnk);
+
+/* Monitor TCP connection state */
+ tc = (struct tcphdr *)ip_next(pip);
+ TcpMonitorOut(tc->th_flags, lnk);
+
+ /* Walk out chain. */
+ error = find_handler(OUT, TCP, la, pip, &ad);
+
+/* Adjust TCP checksum since source port is being aliased */
+/* and source address is being altered */
+ accumulate = tc->th_sport;
+ tc->th_sport = alias_port;
+ accumulate -= tc->th_sport;
+ accumulate += twowords(&pip->ip_src);
+ accumulate -= twowords(&alias_address);
+
+/* Modify sequence number if necessary */
+ if (GetAckModified(lnk) == 1) {
+ int delta;
+
+ tc = (struct tcphdr *)ip_next(pip);
+ delta = GetDeltaSeqOut(tc->th_seq, lnk);
+ if (delta != 0) {
+ accumulate += twowords(&tc->th_seq);
+ tc->th_seq = htonl(ntohl(tc->th_seq) + delta);
+ accumulate -= twowords(&tc->th_seq);
+ }
+ }
+ ADJUST_CHECKSUM(accumulate, tc->th_sum);
+
+/* Change source address */
+ accumulate = twowords(&pip->ip_src);
+ pip->ip_src = alias_address;
+ accumulate -= twowords(&pip->ip_src);
+ ADJUST_CHECKSUM(accumulate, pip->ip_sum);
+
+ return (PKT_ALIAS_OK);
+ }
+ return (PKT_ALIAS_IGNORED);
+}
+
+
+
+
+/* Fragment Handling
+
+ FragmentIn()
+ FragmentOut()
+
+The packet aliasing module has a limited ability for handling IP
+fragments. If the ICMP, TCP or UDP header is in the first fragment
+received, then the ID number of the IP packet is saved, and other
+fragments are identified according to their ID number and IP address
+they were sent from. Pointers to unresolved fragments can also be
+saved and recalled when a header fragment is seen.
+*/
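+
+/*
+ * Typical caller workflow (sketch; see the LibAlias*Fragment() entry points
+ * further below):
+ *
+ *	switch (LibAliasIn(la, pkt, maxlen)) {
+ *	case PKT_ALIAS_UNRESOLVED_FRAGMENT:
+ *		LibAliasSaveFragment(la, pkt);		hold until header seen
+ *		break;
+ *	case PKT_ALIAS_FOUND_HEADER_FRAGMENT:
+ *		while ((frag = LibAliasGetFragment(la, pkt)) != NULL)
+ *			LibAliasFragmentIn(la, pkt, frag);	patch stored ones
+ *		break;
+ *	}
+ *
+ * (la, pkt, maxlen and frag are hypothetical caller variables.)
+ */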
+
+/* Local prototypes */
+static int FragmentIn(struct libalias *la, struct in_addr ip_src,
+ struct in_addr *ip_dst, u_short ip_id, u_short *ip_sum);
+static int FragmentOut(struct libalias *, struct in_addr *ip_src,
+ u_short *ip_sum);
+
+static int
+FragmentIn(struct libalias *la, struct in_addr ip_src, struct in_addr *ip_dst,
+ u_short ip_id, u_short *ip_sum)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = FindFragmentIn2(la, ip_src, *ip_dst, ip_id);
+ if (lnk != NULL) {
+ struct in_addr original_address;
+
+ GetFragmentAddr(lnk, &original_address);
+ DifferentialChecksum(ip_sum,
+ &original_address, ip_dst, 2);
+ *ip_dst = original_address;
+
+ return (PKT_ALIAS_OK);
+ }
+ return (PKT_ALIAS_UNRESOLVED_FRAGMENT);
+}
+
+static int
+FragmentOut(struct libalias *la, struct in_addr *ip_src, u_short *ip_sum)
+{
+ struct in_addr alias_address;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ alias_address = FindAliasAddress(la, *ip_src);
+ DifferentialChecksum(ip_sum,
+ &alias_address, ip_src, 2);
+ *ip_src = alias_address;
+
+ return (PKT_ALIAS_OK);
+}
+
+
+
+
+
+
+/* Outside World Access
+
+ PacketAliasSaveFragment()
+ PacketAliasGetFragment()
+ PacketAliasFragmentIn()
+ PacketAliasIn()
+ PacketAliasOut()
+ PacketUnaliasOut()
+
+(prototypes in alias.h)
+*/
+
+int
+LibAliasSaveFragment(struct libalias *la, char *ptr)
+{
+ int iresult;
+ struct alias_link *lnk;
+ struct ip *pip;
+
+ LIBALIAS_LOCK(la);
+ pip = (struct ip *)ptr;
+ lnk = AddFragmentPtrLink(la, pip->ip_src, pip->ip_id);
+ iresult = PKT_ALIAS_ERROR;
+ if (lnk != NULL) {
+ SetFragmentPtr(lnk, ptr);
+ iresult = PKT_ALIAS_OK;
+ }
+ LIBALIAS_UNLOCK(la);
+ return (iresult);
+}
+
+char *
+LibAliasGetFragment(struct libalias *la, char *ptr)
+{
+ struct alias_link *lnk;
+ char *fptr;
+ struct ip *pip;
+
+ LIBALIAS_LOCK(la);
+ pip = (struct ip *)ptr;
+ lnk = FindFragmentPtr(la, pip->ip_src, pip->ip_id);
+ if (lnk != NULL) {
+ GetFragmentPtr(lnk, &fptr);
+ SetFragmentPtr(lnk, NULL);
+ SetExpire(lnk, 0); /* Deletes link */
+ } else
+ fptr = NULL;
+
+ LIBALIAS_UNLOCK(la);
+ return (fptr);
+}
+
+void
+LibAliasFragmentIn(struct libalias *la, char *ptr, /* Points to correctly
+ * de-aliased header
+ * fragment */
+ char *ptr_fragment /* Points to fragment which must be
+ * de-aliased */
+)
+{
+ struct ip *pip;
+ struct ip *fpip;
+
+ LIBALIAS_LOCK(la);
+ (void)la;
+ pip = (struct ip *)ptr;
+ fpip = (struct ip *)ptr_fragment;
+
+ DifferentialChecksum(&fpip->ip_sum,
+ &pip->ip_dst, &fpip->ip_dst, 2);
+ fpip->ip_dst = pip->ip_dst;
+ LIBALIAS_UNLOCK(la);
+}
+
+/* Local prototypes */
+static int
+LibAliasOutLocked(struct libalias *la, char *ptr,
+ int maxpacketsize, int create);
+static int
+LibAliasInLocked(struct libalias *la, char *ptr,
+ int maxpacketsize);
+
+int
+LibAliasIn(struct libalias *la, char *ptr, int maxpacketsize)
+{
+ int res;
+
+ LIBALIAS_LOCK(la);
+ res = LibAliasInLocked(la, ptr, maxpacketsize);
+ LIBALIAS_UNLOCK(la);
+ return (res);
+}
+
+static int
+LibAliasInLocked(struct libalias *la, char *ptr, int maxpacketsize)
+{
+ struct in_addr alias_addr;
+ struct ip *pip;
+ int iresult;
+
+ if (la->packetAliasMode & PKT_ALIAS_REVERSE) {
+ la->packetAliasMode &= ~PKT_ALIAS_REVERSE;
+ iresult = LibAliasOutLocked(la, ptr, maxpacketsize, 1);
+ la->packetAliasMode |= PKT_ALIAS_REVERSE;
+ goto getout;
+ }
+ HouseKeeping(la);
+ ClearCheckNewLink(la);
+ pip = (struct ip *)ptr;
+ alias_addr = pip->ip_dst;
+
+ /* Defense against mangled packets */
+ if (ntohs(pip->ip_len) > maxpacketsize
+ || (pip->ip_hl << 2) > maxpacketsize) {
+ iresult = PKT_ALIAS_IGNORED;
+ goto getout;
+ }
+
+ iresult = PKT_ALIAS_IGNORED;
+ if ((ntohs(pip->ip_off) & IP_OFFMASK) == 0) {
+ switch (pip->ip_p) {
+ case IPPROTO_ICMP:
+ iresult = IcmpAliasIn(la, pip);
+ break;
+ case IPPROTO_UDP:
+ iresult = UdpAliasIn(la, pip);
+ break;
+ case IPPROTO_TCP:
+ iresult = TcpAliasIn(la, pip);
+ break;
+#ifdef _KERNEL
+ case IPPROTO_SCTP:
+ iresult = SctpAlias(la, pip, SN_TO_LOCAL);
+ break;
+#endif
+ case IPPROTO_GRE: {
+ int error;
+ struct alias_data ad = {
+ .lnk = NULL,
+ .oaddr = NULL,
+ .aaddr = NULL,
+ .aport = NULL,
+ .sport = NULL,
+ .dport = NULL,
+ .maxpktsize = 0
+ };
+
+ /* Walk out chain. */
+ error = find_handler(IN, IP, la, pip, &ad);
+ if (error == 0)
+ iresult = PKT_ALIAS_OK;
+ else
+ iresult = ProtoAliasIn(la, pip->ip_src,
+ &pip->ip_dst, pip->ip_p, &pip->ip_sum);
+ }
+ break;
+ default:
+ iresult = ProtoAliasIn(la, pip->ip_src, &pip->ip_dst,
+ pip->ip_p, &pip->ip_sum);
+ break;
+ }
+
+ if (ntohs(pip->ip_off) & IP_MF) {
+ struct alias_link *lnk;
+
+ lnk = FindFragmentIn1(la, pip->ip_src, alias_addr, pip->ip_id);
+ if (lnk != NULL) {
+ iresult = PKT_ALIAS_FOUND_HEADER_FRAGMENT;
+ SetFragmentAddr(lnk, pip->ip_dst);
+ } else {
+ iresult = PKT_ALIAS_ERROR;
+ }
+ }
+ } else {
+ iresult = FragmentIn(la, pip->ip_src, &pip->ip_dst, pip->ip_id,
+ &pip->ip_sum);
+ }
+
+getout:
+ return (iresult);
+}
+
+
+
+/* Unregistered address ranges */
+
+/* 10.0.0.0 -> 10.255.255.255 */
+#define UNREG_ADDR_A_LOWER 0x0a000000
+#define UNREG_ADDR_A_UPPER 0x0affffff
+
+/* 172.16.0.0 -> 172.31.255.255 */
+#define UNREG_ADDR_B_LOWER 0xac100000
+#define UNREG_ADDR_B_UPPER 0xac1fffff
+
+/* 192.168.0.0 -> 192.168.255.255 */
+#define UNREG_ADDR_C_LOWER 0xc0a80000
+#define UNREG_ADDR_C_UPPER 0xc0a8ffff
+
+int
+LibAliasOut(struct libalias *la, char *ptr, int maxpacketsize)
+{
+ int res;
+
+ LIBALIAS_LOCK(la);
+ res = LibAliasOutLocked(la, ptr, maxpacketsize, 1);
+ LIBALIAS_UNLOCK(la);
+ return (res);
+}
+
+int
+LibAliasOutTry(struct libalias *la, char *ptr, int maxpacketsize, int create)
+{
+ int res;
+
+ LIBALIAS_LOCK(la);
+ res = LibAliasOutLocked(la, ptr, maxpacketsize, create);
+ LIBALIAS_UNLOCK(la);
+ return (res);
+}
+
+static int
+LibAliasOutLocked(struct libalias *la, char *ptr, /* valid IP packet */
+ int maxpacketsize, /* How much the packet data may grow (FTP
+ * and IRC inline changes) */
+ int create /* Create new entries ? */
+)
+{
+ int iresult;
+ struct in_addr addr_save;
+ struct ip *pip;
+
+ if (la->packetAliasMode & PKT_ALIAS_REVERSE) {
+ la->packetAliasMode &= ~PKT_ALIAS_REVERSE;
+ iresult = LibAliasInLocked(la, ptr, maxpacketsize);
+ la->packetAliasMode |= PKT_ALIAS_REVERSE;
+ goto getout;
+ }
+ HouseKeeping(la);
+ ClearCheckNewLink(la);
+ pip = (struct ip *)ptr;
+
+ /* Defense against mangled packets */
+ if (ntohs(pip->ip_len) > maxpacketsize
+ || (pip->ip_hl << 2) > maxpacketsize) {
+ iresult = PKT_ALIAS_IGNORED;
+ goto getout;
+ }
+
+ addr_save = GetDefaultAliasAddress(la);
+ if (la->packetAliasMode & PKT_ALIAS_UNREGISTERED_ONLY) {
+ u_long addr;
+ int iclass;
+
+ iclass = 0;
+ addr = ntohl(pip->ip_src.s_addr);
+ if (addr >= UNREG_ADDR_C_LOWER && addr <= UNREG_ADDR_C_UPPER)
+ iclass = 3;
+ else if (addr >= UNREG_ADDR_B_LOWER && addr <= UNREG_ADDR_B_UPPER)
+ iclass = 2;
+ else if (addr >= UNREG_ADDR_A_LOWER && addr <= UNREG_ADDR_A_UPPER)
+ iclass = 1;
+
+ if (iclass == 0) {
+ SetDefaultAliasAddress(la, pip->ip_src);
+ }
+ } else if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY) {
+ SetDefaultAliasAddress(la, pip->ip_src);
+ }
+ iresult = PKT_ALIAS_IGNORED;
+ if ((ntohs(pip->ip_off) & IP_OFFMASK) == 0) {
+ switch (pip->ip_p) {
+ case IPPROTO_ICMP:
+ iresult = IcmpAliasOut(la, pip, create);
+ break;
+ case IPPROTO_UDP:
+ iresult = UdpAliasOut(la, pip, maxpacketsize, create);
+ break;
+ case IPPROTO_TCP:
+ iresult = TcpAliasOut(la, pip, maxpacketsize, create);
+ break;
+#ifdef _KERNEL
+ case IPPROTO_SCTP:
+ iresult = SctpAlias(la, pip, SN_TO_GLOBAL);
+ break;
+#endif
+ case IPPROTO_GRE: {
+ int error;
+ struct alias_data ad = {
+ .lnk = NULL,
+ .oaddr = NULL,
+ .aaddr = NULL,
+ .aport = NULL,
+ .sport = NULL,
+ .dport = NULL,
+ .maxpktsize = 0
+ };
+ /* Walk out chain. */
+ error = find_handler(OUT, IP, la, pip, &ad);
+ if (error == 0)
+ iresult = PKT_ALIAS_OK;
+ else
+ iresult = ProtoAliasOut(la, &pip->ip_src,
+ pip->ip_dst, pip->ip_p, &pip->ip_sum, create);
+ }
+ break;
+ default:
+ iresult = ProtoAliasOut(la, &pip->ip_src,
+ pip->ip_dst, pip->ip_p, &pip->ip_sum, create);
+ break;
+ }
+ } else {
+ iresult = FragmentOut(la, &pip->ip_src, &pip->ip_sum);
+ }
+
+ SetDefaultAliasAddress(la, addr_save);
+getout:
+ return (iresult);
+}
+
+int
+LibAliasUnaliasOut(struct libalias *la, char *ptr, /* valid IP packet */
+ int maxpacketsize /* for error checking */
+)
+{
+ struct ip *pip;
+ struct icmp *ic;
+ struct udphdr *ud;
+ struct tcphdr *tc;
+ struct alias_link *lnk;
+ int iresult = PKT_ALIAS_IGNORED;
+
+ LIBALIAS_LOCK(la);
+ pip = (struct ip *)ptr;
+
+ /* Defense against mangled packets */
+ if (ntohs(pip->ip_len) > maxpacketsize
+ || (pip->ip_hl << 2) > maxpacketsize)
+ goto getout;
+
+ ud = (struct udphdr *)ip_next(pip);
+ tc = (struct tcphdr *)ip_next(pip);
+ ic = (struct icmp *)ip_next(pip);
+
+ /* Find a link */
+ if (pip->ip_p == IPPROTO_UDP)
+ lnk = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src,
+ ud->uh_dport, ud->uh_sport,
+ IPPROTO_UDP, 0);
+ else if (pip->ip_p == IPPROTO_TCP)
+ lnk = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src,
+ tc->th_dport, tc->th_sport,
+ IPPROTO_TCP, 0);
+ else if (pip->ip_p == IPPROTO_ICMP)
+ lnk = FindIcmpIn(la, pip->ip_dst, pip->ip_src, ic->icmp_id, 0);
+ else
+ lnk = NULL;
+
+ /* Change it from an aliased packet to an unaliased packet */
+ if (lnk != NULL) {
+ if (pip->ip_p == IPPROTO_UDP || pip->ip_p == IPPROTO_TCP) {
+ int accumulate;
+ struct in_addr original_address;
+ u_short original_port;
+
+ original_address = GetOriginalAddress(lnk);
+ original_port = GetOriginalPort(lnk);
+
+ /* Adjust TCP/UDP checksum */
+ accumulate = twowords(&pip->ip_src);
+ accumulate -= twowords(&original_address);
+
+ if (pip->ip_p == IPPROTO_UDP) {
+ accumulate += ud->uh_sport;
+ accumulate -= original_port;
+ ADJUST_CHECKSUM(accumulate, ud->uh_sum);
+ } else {
+ accumulate += tc->th_sport;
+ accumulate -= original_port;
+ ADJUST_CHECKSUM(accumulate, tc->th_sum);
+ }
+
+ /* Adjust IP checksum */
+ DifferentialChecksum(&pip->ip_sum,
+ &original_address, &pip->ip_src, 2);
+
+ /* Un-alias source address and port number */
+ pip->ip_src = original_address;
+ if (pip->ip_p == IPPROTO_UDP)
+ ud->uh_sport = original_port;
+ else
+ tc->th_sport = original_port;
+
+ iresult = PKT_ALIAS_OK;
+
+ } else if (pip->ip_p == IPPROTO_ICMP) {
+
+ int accumulate;
+ struct in_addr original_address;
+ u_short original_id;
+
+ original_address = GetOriginalAddress(lnk);
+ original_id = GetOriginalPort(lnk);
+
+ /* Adjust ICMP checksum */
+ accumulate = twowords(&pip->ip_src);
+ accumulate -= twowords(&original_address);
+ accumulate += ic->icmp_id;
+ accumulate -= original_id;
+ ADJUST_CHECKSUM(accumulate, ic->icmp_cksum);
+
+ /* Adjust IP checksum */
+ DifferentialChecksum(&pip->ip_sum,
+ &original_address, &pip->ip_src, 2);
+
+ /* Un-alias source address and port number */
+ pip->ip_src = original_address;
+ ic->icmp_id = original_id;
+
+ iresult = PKT_ALIAS_OK;
+ }
+ }
+getout:
+ LIBALIAS_UNLOCK(la);
+ return (iresult);
+
+}
+
+#ifndef _KERNEL
+
+int
+LibAliasRefreshModules(void)
+{
+ char buf[256], conf[] = "/etc/libalias.conf";
+ FILE *fd;
+ int i, len;
+
+ fd = fopen(conf, "r");
+ if (fd == NULL)
+ err(1, "fopen(%s)", conf);
+
+ LibAliasUnLoadAllModule();
+
+ for (;;) {
+ fgets(buf, 256, fd);
+ if (feof(fd))
+ break;
+ len = strlen(buf);
+ if (len > 1) {
+ for (i = 0; i < len; i++)
+ if (!isspace(buf[i]))
+ break;
+ if (buf[i] == '#')
+ continue;
+ buf[len - 1] = '\0';
+ LibAliasLoadModule(buf);
+ }
+ }
+ fclose(fd);
+ return (0);
+}
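+
+/*
+ * The loop above accepts one shared-object path per line; a line whose
+ * first non-blank character is '#' is skipped, and the trailing newline
+ * is stripped before the path is handed to LibAliasLoadModule().  A
+ * hypothetical /etc/libalias.conf might therefore look like:
+ *
+ * # protocol helpers to load on refresh
+ * /usr/lib/libalias_ftp.so
+ * /usr/lib/libalias_irc.so
+ */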
+
+int
+LibAliasLoadModule(char *path)
+{
+ struct dll *t;
+ void *handle;
+ struct proto_handler *m;
+ const char *error;
+ moduledata_t *p;
+
+ handle = dlopen (path, RTLD_LAZY);
+ if (!handle) {
+ fprintf(stderr, "%s\n", dlerror());
+ return (EINVAL);
+ }
+
+ p = dlsym(handle, "alias_mod");
+ if ((error = dlerror()) != NULL) {
+ fprintf(stderr, "%s\n", error);
+ return (EINVAL);
+ }
+
+ t = malloc(sizeof(struct dll));
+ if (t == NULL)
+ return (ENOMEM);
+ strncpy(t->name, p->name, DLL_LEN);
+ t->handle = handle;
+ if (attach_dll(t) == EEXIST) {
+ free(t);
+ fprintf(stderr, "dll conflict\n");
+ return (EEXIST);
+ }
+
+ m = dlsym(t->handle, "handlers");
+ if ((error = dlerror()) != NULL) {
+ fprintf(stderr, "%s\n", error);
+ return (EINVAL);
+ }
+
+ LibAliasAttachHandlers(m);
+ return (0);
+}
+
+int
+LibAliasUnLoadAllModule(void)
+{
+ struct dll *t;
+ struct proto_handler *p;
+
+ /* Unload all modules then reload everything. */
+ while ((p = first_handler()) != NULL) {
+ detach_handler(p);
+ }
+ while ((t = walk_dll_chain()) != NULL) {
+ dlclose(t->handle);
+ free(t);
+ }
+ return (1);
+}
+
+#endif
+
+#ifdef _KERNEL
+/*
+ * m_megapullup() - this function is a big hack.
+ * Thankfully, it's only used in ng_nat and ipfw+nat.
+ *
+ * It allocates an mbuf with a cluster and copies the specified part of the
+ * chain into the cluster, so that it is all contiguous and can be accessed
+ * via a plain (char *) pointer. This is required because libalias doesn't
+ * know how to handle mbuf chains.
+ *
+ * On success, m_megapullup returns an mbuf (possibly with cluster) containing
+ * the input packet, on failure NULL. The input packet is always consumed.
+ */
+struct mbuf *
+m_megapullup(struct mbuf *m, int len) {
+ struct mbuf *mcl;
+
+ if (len > m->m_pkthdr.len)
+ goto bad;
+
+ /* Do not reallocate the packet if it is contiguous,
+ * writable and has some extra space for expansion.
+ * XXX: The constant of 100 bytes is completely empirical. */
+#define RESERVE 100
+ if (m->m_next == NULL && M_WRITABLE(m) && M_TRAILINGSPACE(m) >= RESERVE)
+ return (m);
+
+ if (len <= MCLBYTES - RESERVE) {
+ mcl = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+ } else if (len < MJUM16BYTES) {
+ int size;
+ if (len <= MJUMPAGESIZE - RESERVE) {
+ size = MJUMPAGESIZE;
+ } else if (len <= MJUM9BYTES - RESERVE) {
+ size = MJUM9BYTES;
+ } else {
+ size = MJUM16BYTES;
+ }
+ mcl = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, size);
+ } else {
+ goto bad;
+ }
+ if (mcl == NULL)
+ goto bad;
+
+ m_move_pkthdr(mcl, m);
+ m_copydata(m, 0, len, mtod(mcl, caddr_t));
+ mcl->m_len = mcl->m_pkthdr.len = len;
+ m_freem(m);
+
+ return (mcl);
+bad:
+ m_freem(m);
+ return (NULL);
+}
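+
+/*
+ * A sketch of the typical caller pattern (cf. ipfw nat / ng_nat): make
+ * the whole IP packet contiguous, then hand libalias a flat buffer.
+ * The libalias instance "la" is assumed to be configured elsewhere;
+ * the maxpacketsize argument lets libalias grow the packet into the
+ * spare space left in the cluster.  Illustration only, not part of the
+ * libalias API.
+ */
+#if 0
+static int
+example_alias_out(struct libalias *la, struct mbuf **mp)
+{
+ struct mbuf *m;
+
+ m = m_megapullup(*mp, (*mp)->m_pkthdr.len);
+ if (m == NULL) {
+ *mp = NULL; /* the input packet was consumed */
+ return (PKT_ALIAS_ERROR);
+ }
+ *mp = m;
+ return (LibAliasOut(la, mtod(m, char *),
+ m->m_len + M_TRAILINGSPACE(m)));
+}
+#endif /* 0 */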
+#endif
diff --git a/freebsd/sys/netinet/libalias/alias.h b/freebsd/sys/netinet/libalias/alias.h
new file mode 100644
index 00000000..f835e1b7
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias.h
@@ -0,0 +1,232 @@
+/* lint -save -library Flexelint comment for external headers */
+
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Alias.h defines the outside world interfaces for the packet aliasing
+ * software.
+ *
+ * This software is placed into the public domain with no restrictions on its
+ * distribution.
+ */
+
+#ifndef _ALIAS_HH_
+#define _ALIAS_HH_
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+
+#define LIBALIAS_BUF_SIZE 128
+#ifdef _KERNEL
+/*
+ * The kernel version of libalias does not support these features.
+ */
+#define NO_FW_PUNCH
+#define NO_USE_SOCKETS
+#endif
+
+/*
+ * The external interface to libalias, the packet aliasing engine.
+ *
+ * There are two sets of functions:
+ *
+ * PacketAlias*() the old API which doesn't take an instance pointer
+ * and therefore can only have one packet engine at a time.
+ *
+ * LibAlias*() the new API which takes as first argument a pointer to
+ * the instance of the packet aliasing engine.
+ *
+ * The functions otherwise correspond to each other one for one, except
+ * for LibAliasUnaliasOut()/PacketUnaliasOut(), which was misnamed in
+ * the old API.
+ */
+
+/*
+ * The instance structure
+ */
+struct libalias;
+
+/*
+ * An anonymous structure, a pointer to which is returned from
+ * PacketAliasRedirectAddr(), PacketAliasRedirectPort() or
+ * PacketAliasRedirectProto(), passed to PacketAliasAddServer(),
+ * and freed by PacketAliasRedirectDelete().
+ */
+struct alias_link;
+
+/* Initialization and control functions. */
+struct libalias *LibAliasInit(struct libalias *);
+void LibAliasSetAddress(struct libalias *, struct in_addr _addr);
+void LibAliasSetFWBase(struct libalias *, unsigned int _base, unsigned int _num);
+void LibAliasSetSkinnyPort(struct libalias *, unsigned int _port);
+unsigned int
+ LibAliasSetMode(struct libalias *, unsigned int _flags, unsigned int _mask);
+void LibAliasUninit(struct libalias *);
+
+/* Packet Handling functions. */
+int LibAliasIn (struct libalias *, char *_ptr, int _maxpacketsize);
+int LibAliasOut(struct libalias *, char *_ptr, int _maxpacketsize);
+int LibAliasOutTry(struct libalias *, char *_ptr, int _maxpacketsize, int _create);
+int LibAliasUnaliasOut(struct libalias *, char *_ptr, int _maxpacketsize);
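+
+/*
+ * A minimal calling sequence for the instance-based API described
+ * above, for a single outgoing packet; LibAliasIn() is the mirror
+ * image for packets arriving from the outside.  The packet buffer is
+ * assumed to come from elsewhere (a divert socket, for example).
+ * Illustration only:
+ */
+#if 0
+static int
+libalias_out_example(struct in_addr alias_addr, char *pkt, int maxlen)
+{
+ struct libalias *la;
+ int ret;
+
+ la = LibAliasInit(NULL); /* create a fresh instance */
+ LibAliasSetAddress(la, alias_addr); /* public (aliasing) address */
+ ret = LibAliasOut(la, pkt, maxlen); /* rewrite the packet in place */
+ LibAliasUninit(la); /* normally kept for the whole session */
+ return (ret); /* PKT_ALIAS_OK on success */
+}
+#endif /* 0 */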
+
+/* Port and address redirection functions. */
+
+int
+LibAliasAddServer(struct libalias *, struct alias_link *_lnk,
+ struct in_addr _addr, unsigned short _port);
+struct alias_link *
+LibAliasRedirectAddr(struct libalias *, struct in_addr _src_addr,
+ struct in_addr _alias_addr);
+int LibAliasRedirectDynamic(struct libalias *, struct alias_link *_lnk);
+void LibAliasRedirectDelete(struct libalias *, struct alias_link *_lnk);
+struct alias_link *
+LibAliasRedirectPort(struct libalias *, struct in_addr _src_addr,
+ unsigned short _src_port, struct in_addr _dst_addr,
+ unsigned short _dst_port, struct in_addr _alias_addr,
+ unsigned short _alias_port, unsigned char _proto);
+struct alias_link *
+LibAliasRedirectProto(struct libalias *, struct in_addr _src_addr,
+ struct in_addr _dst_addr, struct in_addr _alias_addr,
+ unsigned char _proto);
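+
+/*
+ * A sketch of a static inbound redirection ("redirect_port" in natd
+ * terms): TCP connections arriving for the aliasing address on port
+ * 8080 are forwarded to an internal server on port 80.  Ports are in
+ * network byte order; an unspecified (zero) remote address and port
+ * match any peer.  Illustration only:
+ */
+#if 0
+static struct alias_link *
+redirect_example(struct libalias *la, struct in_addr alias_addr,
+ struct in_addr server_addr)
+{
+ struct in_addr any;
+
+ any.s_addr = INADDR_ANY;
+ return (LibAliasRedirectPort(la,
+ server_addr, htons(80), /* local (unaliased) endpoint */
+ any, 0, /* remote peer: unspecified */
+ alias_addr, htons(8080), /* public endpoint */
+ IPPROTO_TCP));
+}
+#endif /* 0 */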
+
+/* Fragment Handling functions. */
+void LibAliasFragmentIn(struct libalias *, char *_ptr, char *_ptr_fragment);
+char *LibAliasGetFragment(struct libalias *, char *_ptr);
+int LibAliasSaveFragment(struct libalias *, char *_ptr);
+
+/* Miscellaneous functions. */
+int LibAliasCheckNewLink(struct libalias *);
+unsigned short
+ LibAliasInternetChecksum(struct libalias *, unsigned short *_ptr, int _nbytes);
+void LibAliasSetTarget(struct libalias *, struct in_addr _target_addr);
+
+/* Transparent proxying routines. */
+int LibAliasProxyRule(struct libalias *, const char *_cmd);
+
+/* Module handling API */
+int LibAliasLoadModule(char *);
+int LibAliasUnLoadAllModule(void);
+int LibAliasRefreshModules(void);
+
+/* Mbuf helper function. */
+struct mbuf *m_megapullup(struct mbuf *, int);
+
+/*
+ * Mode flags and other constants.
+ */
+
+
+/* Mode flags, set using PacketAliasSetMode() */
+
+/*
+ * If PKT_ALIAS_LOG is set, a message will be printed to /var/log/alias.log
+ * every time a link is created or deleted. This is useful for debugging.
+ */
+#define PKT_ALIAS_LOG 0x01
+
+/*
+ * If PKT_ALIAS_DENY_INCOMING is set, then incoming connections (e.g. to ftp,
+ * telnet or web servers) will be prevented by the aliasing mechanism.
+ */
+#define PKT_ALIAS_DENY_INCOMING 0x02
+
+/*
+ * If PKT_ALIAS_SAME_PORTS is set, an attempt is made to send packets from
+ * the same port they originated on. This allows e.g. rsh to work *99% of
+ * the time*, but _not_ 100% (it will be slightly flaky instead of not
+ * working at all). This mode bit is set by PacketAliasInit(), so it is a
+ * default mode of operation.
+ */
+#define PKT_ALIAS_SAME_PORTS 0x04
+
+/*
+ * If PKT_ALIAS_USE_SOCKETS is set, then for partially specified links (e.g.
+ * when the destination port and/or address is zero), the packet aliasing
+ * engine will attempt to allocate a socket for the aliasing port it chooses.
+ * This will avoid interference with the host machine. Fully specified links
+ * do not
+ * require this. This bit is set after a call to PacketAliasInit(), so it is
+ * a default mode of operation.
+ */
+#ifndef NO_USE_SOCKETS
+#define PKT_ALIAS_USE_SOCKETS 0x08
+#endif
+/*-
+ * If PKT_ALIAS_UNREGISTERED_ONLY is set, then only packets with
+ * unregistered source addresses will be aliased. Unregistered ("private")
+ * addresses are those in the following ranges:
+ *
+ * 10.0.0.0 -> 10.255.255.255
+ * 172.16.0.0 -> 172.31.255.255
+ * 192.168.0.0 -> 192.168.255.255
+ */
+#define PKT_ALIAS_UNREGISTERED_ONLY 0x10
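+
+/*
+ * For illustration, a test equivalent to the three ranges listed above
+ * (this helper is not part of the library API):
+ */
+#if 0
+static int
+addr_is_private(struct in_addr a)
+{
+ u_int32_t ip = ntohl(a.s_addr);
+
+ return ((ip & 0xff000000) == 0x0a000000 || /* 10.0.0.0/8 */
+ (ip & 0xfff00000) == 0xac100000 || /* 172.16.0.0/12 */
+ (ip & 0xffff0000) == 0xc0a80000); /* 192.168.0.0/16 */
+}
+#endif /* 0 */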
+
+/*
+ * If PKT_ALIAS_RESET_ON_ADDR_CHANGE is set, then the table of dynamic
+ * aliasing links will be reset whenever PacketAliasSetAddress() changes the
+ * default aliasing address. If the default aliasing address is left
+ * unchanged by this function call, then the table of dynamic aliasing links
+ * will be left intact. This bit is set after a call to PacketAliasInit().
+ */
+#define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20
+
+#ifndef NO_FW_PUNCH
+/*
+ * If PKT_ALIAS_PUNCH_FW is set, active FTP and IRC DCC connections will
+ * create a 'hole' in the firewall to allow the transfers to work. The
+ * ipfw rule number that the hole is created with is controlled by
+ * PacketAliasSetFWBase(). The hole will be attached to that
+ * particular alias_link, so when the link goes away the hole is deleted.
+ */
+#define PKT_ALIAS_PUNCH_FW 0x100
+#endif
+
+/*
+ * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only
+ * transparent proxying is performed.
+ */
+#define PKT_ALIAS_PROXY_ONLY 0x40
+
+/*
+ * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and
+ * PacketAliasOut() are reversed.
+ */
+#define PKT_ALIAS_REVERSE 0x80
+
+/* Function return codes. */
+#define PKT_ALIAS_ERROR -1
+#define PKT_ALIAS_OK 1
+#define PKT_ALIAS_IGNORED 2
+#define PKT_ALIAS_UNRESOLVED_FRAGMENT 3
+#define PKT_ALIAS_FOUND_HEADER_FRAGMENT 4
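+
+/*
+ * The mode bits above are manipulated with LibAliasSetMode(): the
+ * second argument carries the new bit values and the third selects
+ * which bits are affected.  A sketch ("la" being an already
+ * initialized instance), switching PKT_ALIAS_SAME_PORTS on and
+ * PKT_ALIAS_DENY_INCOMING off while leaving all other bits alone:
+ */
+#if 0
+ (void)LibAliasSetMode(la,
+ PKT_ALIAS_SAME_PORTS, /* values for the selected bits */
+ PKT_ALIAS_SAME_PORTS | PKT_ALIAS_DENY_INCOMING); /* bits to change */
+#endif /* 0 */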
+
+#endif /* !_ALIAS_HH_ */
+
+/* lint -restore */
diff --git a/freebsd/sys/netinet/libalias/alias_cuseeme.c b/freebsd/sys/netinet/libalias/alias_cuseeme.c
new file mode 100644
index 00000000..90f2aaae
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_cuseeme.c
@@ -0,0 +1,230 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1998 Brian Somers <brian@Awfulhak.org>
+ * with the aid of code written by
+ * Junichi SATOH <junichi@astec.co.jp> 1996, 1997.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/module.h>
+#else
+#include <freebsd/errno.h>
+#include <freebsd/sys/types.h>
+#include <freebsd/stdio.h>
+#endif
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/udp.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias.h>
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/libalias/alias_mod.h>
+#else
+#include <freebsd/local/alias_local.h>
+#include <freebsd/local/alias_mod.h>
+#endif
+
+#define CUSEEME_PORT_NUMBER 7648
+
+static void
+AliasHandleCUSeeMeOut(struct libalias *la, struct ip *pip,
+ struct alias_link *lnk);
+
+static void
+AliasHandleCUSeeMeIn(struct libalias *la, struct ip *pip,
+ struct in_addr original_addr);
+
+static int
+fingerprint(struct libalias *la, struct alias_data *ah)
+{
+
+ if (ah->dport == NULL || ah->oaddr == NULL)
+ return (-1);
+ if (ntohs(*ah->dport) == CUSEEME_PORT_NUMBER)
+ return (0);
+ return (-1);
+}
+
+static int
+protohandlerin(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ AliasHandleCUSeeMeIn(la, pip, *ah->oaddr);
+ return (0);
+}
+
+static int
+protohandlerout(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ AliasHandleCUSeeMeOut(la, pip, ah->lnk);
+ return (0);
+}
+
+/* Kernel module definition. */
+struct proto_handler handlers[] = {
+ {
+ .pri = 120,
+ .dir = OUT,
+ .proto = UDP,
+ .fingerprint = &fingerprint,
+ .protohandler = &protohandlerout
+ },
+ {
+ .pri = 120,
+ .dir = IN,
+ .proto = UDP,
+ .fingerprint = &fingerprint,
+ .protohandler = &protohandlerin
+ },
+ { EOH }
+};
+
+static int
+mod_handler(module_t mod, int type, void *data)
+{
+ int error;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = 0;
+ LibAliasAttachHandlers(handlers);
+ break;
+ case MOD_UNLOAD:
+ error = 0;
+ LibAliasDetachHandlers(handlers);
+ break;
+ default:
+ error = EINVAL;
+ }
+ return (error);
+}
+
+#ifdef _KERNEL
+static
+#endif
+moduledata_t
+alias_mod = {
+ "alias_cuseeme", mod_handler, NULL
+};
+
+#ifdef _KERNEL
+DECLARE_MODULE(alias_cuseeme, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
+MODULE_VERSION(alias_cuseeme, 1);
+MODULE_DEPEND(alias_cuseeme, libalias, 1, 1, 1);
+#endif
+
+/* CU-SeeMe Data Header */
+struct cu_header {
+ u_int16_t dest_family;
+ u_int16_t dest_port;
+ u_int32_t dest_addr;
+ int16_t family;
+ u_int16_t port;
+ u_int32_t addr;
+ u_int32_t seq;
+ u_int16_t msg;
+ u_int16_t data_type;
+ u_int16_t packet_len;
+};
+
+/* Open Continue Header */
+struct oc_header {
+ u_int16_t client_count; /* Number of client info structs */
+ u_int32_t seq_no;
+ char user_name [20];
+ char reserved [4]; /* flags, version stuff, etc */
+};
+
+/* client info structures */
+struct client_info {
+ u_int32_t address;/* Client address */
+ char reserved [8]; /* Flags, pruning bitfield, packet
+ * counts etc */
+};
+
+static void
+AliasHandleCUSeeMeOut(struct libalias *la, struct ip *pip, struct alias_link *lnk)
+{
+ struct udphdr *ud = ip_next(pip);
+
+ if (ntohs(ud->uh_ulen) - sizeof(struct udphdr) >= sizeof(struct cu_header)) {
+ struct cu_header *cu;
+ struct alias_link *cu_lnk;
+
+ cu = udp_next(ud);
+ if (cu->addr)
+ cu->addr = (u_int32_t) GetAliasAddress(lnk).s_addr;
+
+ cu_lnk = FindUdpTcpOut(la, pip->ip_src, GetDestAddress(lnk),
+ ud->uh_dport, 0, IPPROTO_UDP, 1);
+
+#ifndef NO_FW_PUNCH
+ if (cu_lnk)
+ PunchFWHole(cu_lnk);
+#endif
+ }
+}
+
+static void
+AliasHandleCUSeeMeIn(struct libalias *la, struct ip *pip, struct in_addr original_addr)
+{
+ struct in_addr alias_addr;
+ struct udphdr *ud;
+ struct cu_header *cu;
+ struct oc_header *oc;
+ struct client_info *ci;
+ char *end;
+ int i;
+
+ (void)la;
+ alias_addr.s_addr = pip->ip_dst.s_addr;
+ ud = ip_next(pip);
+ cu = udp_next(ud);
+ oc = (struct oc_header *)(cu + 1);
+ ci = (struct client_info *)(oc + 1);
+ end = (char *)ud + ntohs(ud->uh_ulen);
+
+ if ((char *)oc <= end) {
+ if (cu->dest_addr)
+ cu->dest_addr = (u_int32_t) original_addr.s_addr;
+ if (ntohs(cu->data_type) == 101)
+ /* Find and change our address */
+ for (i = 0; (char *)(ci + 1) <= end && i < oc->client_count; i++, ci++)
+ if (ci->address == (u_int32_t) alias_addr.s_addr) {
+ ci->address = (u_int32_t) original_addr.s_addr;
+ break;
+ }
+ }
+}
diff --git a/freebsd/sys/netinet/libalias/alias_db.c b/freebsd/sys/netinet/libalias/alias_db.c
new file mode 100644
index 00000000..4b003366
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_db.c
@@ -0,0 +1,2940 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ Alias_db.c encapsulates all data structures used for storing
+ packet aliasing data. Other parts of the aliasing software
+ access data through functions provided in this file.
+
+ Data storage is based on the notion of a "link", which is
+ established for ICMP echo/reply packets, UDP datagrams and
+ TCP stream connections. A link stores the original source
+ and destination addresses. For UDP and TCP, it also stores
+ source and destination port numbers, as well as an alias
+ port number. Links are also used to store information about
+ fragments.
+
+ There is a facility for sweeping through and deleting old
+ links as new packets are sent through. A simple timeout is
+ used for ICMP and UDP links. TCP links are left alone unless
+ there is an incomplete connection, in which case the link
+ can be deleted after a certain amount of time.
+
+
+ Initial version: August, 1996 (cjm)
+
+ Version 1.4: September 16, 1996 (cjm)
+ Facility for handling incoming links added.
+
+ Version 1.6: September 18, 1996 (cjm)
+ ICMP data handling simplified.
+
+ Version 1.7: January 9, 1997 (cjm)
+ Fragment handling simplified.
+ Saves pointers for unresolved fragments.
+ Permits links for unspecified remote ports
+ or unspecified remote addresses.
+ Fixed bug which did not properly zero port
+ table entries after a link was deleted.
+ Cleaned up some obsolete comments.
+
+ Version 1.8: January 14, 1997 (cjm)
+ Fixed data type error in StartPoint().
+ (This error did not exist prior to v1.7
+ and was discovered and fixed by Ari Suutari)
+
+ Version 1.9: February 1, 1997
+ Optionally, connections initiated from the packet aliasing host
+ machine will not have their port number aliased unless it
+ conflicts with an aliasing port already being used. (cjm)
+
+ All options that were previously #ifdef'ed are now available through
+ a new interface, SetPacketAliasMode(). This allows run-time
+ control (which is now available in PPP+pktAlias through the
+ 'alias' keyword). (ee)
+
+ Added ability to create an alias port without
+ either destination address or port specified.
+ port type = ALIAS_PORT_UNKNOWN_DEST_ALL (ee)
+
+ Removed K&R style function headers
+ and general cleanup. (ee)
+
+ Added packetAliasMode to replace compiler #defines (ee)
+
+ Allocates sockets for partially specified
+ ports if ALIAS_USE_SOCKETS defined. (cjm)
+
+ Version 2.0: March, 1997
+ SetAliasAddress() will now clean up alias links
+ if the aliasing address is changed. (cjm)
+
+ PacketAliasPermanentLink() function added to support permanent
+ links. (J. Fortes suggested the need for this.)
+ Examples:
+
+ (192.168.0.1, port 23) <-> alias port 6002, unknown dest addr/port
+
+ (192.168.0.2, port 21) <-> alias port 3604, known dest addr
+ unknown dest port
+
+ These permanent links allow for incoming connections to
+ machines on the local network. They can be given with a
+ user-chosen amount of specificity, with increasing specificity
+ meaning more security. (cjm)
+
+ Quite a bit of rework to the basic engine. The portTable[]
+ array, which kept track of which ports were in use, was replaced
+ by a table/linked list structure. (cjm)
+
+ SetExpire() function added. (cjm)
+
+ DeleteLink() no longer frees memory associated with a pointer
+ to a fragment (this bug was first recognized by E. Eklund in
+ v1.9).
+
+ Version 2.1: May, 1997 (cjm)
+ Packet aliasing engine reworked so that it can handle
+ multiple external addresses rather than just a single
+ host address.
+
+ PacketAliasRedirectPort() and PacketAliasRedirectAddr()
+ added to the API. The first function is a more generalized
+ version of PacketAliasPermanentLink(). The second function
+ implements static network address translation.
+
+ Version 3.2: July, 2000 (salander and satoh)
+ Added FindNewPortGroup to get contiguous range of port values.
+
+ Added QueryUdpTcpIn and QueryUdpTcpOut to look for an aliasing
+ link but not actually add one.
+
+ Added FindRtspOut, which is closely derived from FindUdpTcpOut,
+ except that the alias port (from FindNewPortGroup) is provided
+ as input.
+
+ See HISTORY file for additional revisions.
+*/
+
+#ifdef _KERNEL
+#include <freebsd/machine/stdarg.h>
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/sys/syslog.h>
+#else
+#include <freebsd/stdarg.h>
+#include <freebsd/stdlib.h>
+#include <freebsd/stdio.h>
+#include <freebsd/sys/errno.h>
+#include <freebsd/sys/time.h>
+#include <freebsd/unistd.h>
+#endif
+
+#include <freebsd/sys/socket.h>
+#include <freebsd/netinet/tcp.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias.h>
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/libalias/alias_mod.h>
+#include <freebsd/net/if.h>
+#else
+#include <freebsd/local/alias.h>
+#include <freebsd/local/alias_local.h>
+#include <freebsd/local/alias_mod.h>
+#endif
+
+static LIST_HEAD(, libalias) instancehead = LIST_HEAD_INITIALIZER(instancehead);
+
+
+/*
+ Constants (note: constants are also defined
+ near relevant functions or structs)
+*/
+
+/* Parameters used for cleanup of expired links */
+/* NOTE: ALIAS_CLEANUP_INTERVAL_SECS must be less than LINK_TABLE_OUT_SIZE */
+#define ALIAS_CLEANUP_INTERVAL_SECS 64
+#define ALIAS_CLEANUP_MAX_SPOKES (LINK_TABLE_OUT_SIZE/5)
+
+/* Timeouts (in seconds) for different link types */
+#define ICMP_EXPIRE_TIME 60
+#define UDP_EXPIRE_TIME 60
+#define PROTO_EXPIRE_TIME 60
+#define FRAGMENT_ID_EXPIRE_TIME 10
+#define FRAGMENT_PTR_EXPIRE_TIME 30
+
+/* TCP link expire time for different cases */
+/* When the link has been used and closed - minimal grace time to
+ allow ACKs and potential re-connect in FTP (XXX - is this allowed?) */
+#ifndef TCP_EXPIRE_DEAD
+#define TCP_EXPIRE_DEAD 10
+#endif
+
+/* When the link has been used and closed on one side - the other side
+ is allowed to still send data */
+#ifndef TCP_EXPIRE_SINGLEDEAD
+#define TCP_EXPIRE_SINGLEDEAD 90
+#endif
+
+/* When the link isn't yet up */
+#ifndef TCP_EXPIRE_INITIAL
+#define TCP_EXPIRE_INITIAL 300
+#endif
+
+/* When the link is up */
+#ifndef TCP_EXPIRE_CONNECTED
+#define TCP_EXPIRE_CONNECTED 86400
+#endif
+
+
+/* Dummy port number codes used for FindLinkIn/Out() and AddLink().
+ These constants can be anything except zero, which indicates an
+ unknown port number. */
+
+#define NO_DEST_PORT 1
+#define NO_SRC_PORT 1
+
+
+
+/* Data Structures
+
+ The fundamental data structure used in this program is
+ "struct alias_link". Whenever a TCP connection is made,
+ a UDP datagram is sent out, or an ICMP echo request is made,
+ a link record is made (if it has not already been created).
+ The link record is identified by the source address/port
+ and the destination address/port. In the case of an ICMP
+ echo request, the source port is treated as being equivalent
+ to the 16-bit ID number of the ICMP packet.
+
+ The link record also can store some auxiliary data. For
+ TCP connections that have had sequence and acknowledgment
+ modifications, data space is available to track these changes.
+ A state field is used to keep track of changes to the TCP
+ connection state. ID numbers of fragments can also be
+ stored in the auxiliary space. Pointers to unresolved
+ fragments can also be stored.
+
+ The link records support two independent chainings. Lookup
+ tables for input and output hold the initial pointers to
+ the link chains. On input, the lookup table indexes on alias
+ port and link type. On output, the lookup table indexes on
+ source address, destination address, source port, destination
+ port and link type.
+*/
+
+struct ack_data_record { /* used to save changes to ACK/sequence
+ * numbers */
+ u_long ack_old;
+ u_long ack_new;
+ int delta;
+ int active;
+};
+
+struct tcp_state { /* Information about TCP connection */
+ int in; /* State for outside -> inside */
+ int out; /* State for inside -> outside */
+ int index; /* Index to ACK data array */
+ int ack_modified; /* Indicates whether ACK and
+ * sequence numbers have
+ * been modified */
+};
+
+#define N_LINK_TCP_DATA 3 /* Number of distinct ACK number changes
+ * saved for a modified TCP stream */
+struct tcp_dat {
+ struct tcp_state state;
+ struct ack_data_record ack[N_LINK_TCP_DATA];
+ int fwhole; /* Which firewall record is used for this
+ * hole? */
+};
+
+struct server { /* LSNAT server pool (circular list) */
+ struct in_addr addr;
+ u_short port;
+ struct server *next;
+};
+
+struct alias_link { /* Main data structure */
+ struct libalias *la;
+ struct in_addr src_addr; /* Address and port information */
+ struct in_addr dst_addr;
+ struct in_addr alias_addr;
+ struct in_addr proxy_addr;
+ u_short src_port;
+ u_short dst_port;
+ u_short alias_port;
+ u_short proxy_port;
+ struct server *server;
+
+ int link_type; /* Type of link: TCP, UDP, ICMP,
+ * proto, frag */
+
+/* values for link_type */
+#define LINK_ICMP IPPROTO_ICMP
+#define LINK_UDP IPPROTO_UDP
+#define LINK_TCP IPPROTO_TCP
+#define LINK_FRAGMENT_ID (IPPROTO_MAX + 1)
+#define LINK_FRAGMENT_PTR (IPPROTO_MAX + 2)
+#define LINK_ADDR (IPPROTO_MAX + 3)
+#define LINK_PPTP (IPPROTO_MAX + 4)
+
+ int flags; /* indicates special characteristics */
+ int pflags; /* protocol-specific flags */
+
+/* flag bits */
+#define LINK_UNKNOWN_DEST_PORT 0x01
+#define LINK_UNKNOWN_DEST_ADDR 0x02
+#define LINK_PERMANENT 0x04
+#define LINK_PARTIALLY_SPECIFIED 0x03 /* logical-or of first two bits */
+#define LINK_UNFIREWALLED 0x08
+
+ int timestamp; /* Time link was last accessed */
+ int expire_time; /* Expire time for link */
+#ifndef NO_USE_SOCKETS
+ int sockfd; /* socket descriptor */
+#endif
+ LIST_ENTRY (alias_link) list_out; /* Linked list of
+ * pointers for */
+ LIST_ENTRY (alias_link) list_in; /* input and output
+ * lookup tables */
+
+ union { /* Auxiliary data */
+ char *frag_ptr;
+ struct in_addr frag_addr;
+ struct tcp_dat *tcp;
+ } data;
+};
+
+/* Clean up procedure. */
+static void finishoff(void);
+
+/* Kernel module definition. */
+#ifdef _KERNEL
+MALLOC_DEFINE(M_ALIAS, "libalias", "packet aliasing");
+
+MODULE_VERSION(libalias, 1);
+
+static int
+alias_mod_handler(module_t mod, int type, void *data)
+{
+ int error;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = 0;
+ handler_chain_init();
+ break;
+ case MOD_QUIESCE:
+ case MOD_UNLOAD:
+ handler_chain_destroy();
+ finishoff();
+ error = 0;
+ break;
+ default:
+ error = EINVAL;
+ }
+
+ return (error);
+}
+
+static moduledata_t alias_mod = {
+ "alias", alias_mod_handler, NULL
+};
+
+DECLARE_MODULE(alias, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
+#endif
+
+/* Internal utility routines (used only in alias_db.c)
+
+Lookup table starting points:
+ StartPointIn() -- link table initial search point for
+ incoming packets
+ StartPointOut() -- link table initial search point for
+ outgoing packets
+
+Miscellaneous:
+ SeqDiff() -- difference between two TCP sequences
+ ShowAliasStats() -- send alias statistics to a monitor file
+*/
+
+
+/* Local prototypes */
+static u_int StartPointIn(struct in_addr, u_short, int);
+
+static u_int
+StartPointOut(struct in_addr, struct in_addr,
+ u_short, u_short, int);
+
+static int SeqDiff(u_long, u_long);
+
+#ifndef NO_FW_PUNCH
+/* Firewall control */
+static void InitPunchFW(struct libalias *);
+static void UninitPunchFW(struct libalias *);
+static void ClearFWHole(struct alias_link *);
+
+#endif
+
+/* Log file control */
+static void ShowAliasStats(struct libalias *);
+static int InitPacketAliasLog(struct libalias *);
+static void UninitPacketAliasLog(struct libalias *);
+
+void SctpShowAliasStats(struct libalias *la);
+
+static u_int
+StartPointIn(struct in_addr alias_addr,
+ u_short alias_port,
+ int link_type)
+{
+ u_int n;
+
+ n = alias_addr.s_addr;
+ if (link_type != LINK_PPTP)
+ n += alias_port;
+ n += link_type;
+ return (n % LINK_TABLE_IN_SIZE);
+}
+
+
+static u_int
+StartPointOut(struct in_addr src_addr, struct in_addr dst_addr,
+ u_short src_port, u_short dst_port, int link_type)
+{
+ u_int n;
+
+ n = src_addr.s_addr;
+ n += dst_addr.s_addr;
+ if (link_type != LINK_PPTP) {
+ n += src_port;
+ n += dst_port;
+ }
+ n += link_type;
+
+ return (n % LINK_TABLE_OUT_SIZE);
+}
+
+
+static int
+SeqDiff(u_long x, u_long y)
+{
+/* Return the difference between two TCP sequence numbers */
+
+/*
+ This function is encapsulated in case there are any unusual
+ arithmetic conditions that need to be considered.
+*/
+
+ return (ntohl(y) - ntohl(x));
+}
+
+#ifdef _KERNEL
+
+static void
+AliasLog(char *str, const char *format, ...)
+{
+ va_list ap;
+
+ va_start(ap, format);
+ vsnprintf(str, LIBALIAS_BUF_SIZE, format, ap);
+ va_end(ap);
+}
+#else
+static void
+AliasLog(FILE *stream, const char *format, ...)
+{
+ va_list ap;
+
+ va_start(ap, format);
+ vfprintf(stream, format, ap);
+ va_end(ap);
+ fflush(stream);
+}
+#endif
+
+static void
+ShowAliasStats(struct libalias *la)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+/* Used for debugging */
+ if (la->logDesc) {
+ int tot = la->icmpLinkCount + la->udpLinkCount +
+ (la->sctpLinkCount>>1) + /* sctp counts half associations */
+ la->tcpLinkCount + la->pptpLinkCount +
+ la->protoLinkCount + la->fragmentIdLinkCount +
+ la->fragmentPtrLinkCount;
+
+ AliasLog(la->logDesc,
+ "icmp=%u, udp=%u, tcp=%u, sctp=%u, pptp=%u, proto=%u, frag_id=%u frag_ptr=%u / tot=%u",
+ la->icmpLinkCount,
+ la->udpLinkCount,
+ la->tcpLinkCount,
+ la->sctpLinkCount>>1, /* sctp counts half associations */
+ la->pptpLinkCount,
+ la->protoLinkCount,
+ la->fragmentIdLinkCount,
+ la->fragmentPtrLinkCount, tot);
+#ifndef _KERNEL
+ AliasLog(la->logDesc, " (sock=%u)\n", la->sockCount);
+#endif
+ }
+}
+
+void SctpShowAliasStats(struct libalias *la)
+{
+
+ ShowAliasStats(la);
+}
+
+
+/* Internal routines for finding, deleting and adding links
+
+Port Allocation:
+ GetNewPort() -- find and reserve new alias port number
+ GetSocket() -- try to allocate a socket for a given port
+
+Link creation and deletion:
+ CleanupAliasData() - remove all link chains from lookup table
+ IncrementalCleanup() - look for stale links in a single chain
+ DeleteLink() - remove link
+ AddLink() - add link
+ ReLink() - change link
+
+Link search:
+ FindLinkOut() - find link for outgoing packets
+ FindLinkIn() - find link for incoming packets
+
+Port search:
+ FindNewPortGroup() - find an available group of ports
+*/
+
+/* Local prototypes */
+static int GetNewPort(struct libalias *, struct alias_link *, int);
+#ifndef NO_USE_SOCKETS
+static u_short GetSocket(struct libalias *, u_short, int *, int);
+#endif
+static void CleanupAliasData(struct libalias *);
+
+static void IncrementalCleanup(struct libalias *);
+
+static void DeleteLink(struct alias_link *);
+
+static struct alias_link *
+AddLink(struct libalias *, struct in_addr, struct in_addr, struct in_addr,
+ u_short, u_short, int, int);
+
+static struct alias_link *
+ReLink(struct alias_link *,
+ struct in_addr, struct in_addr, struct in_addr,
+ u_short, u_short, int, int);
+
+static struct alias_link *
+ FindLinkOut (struct libalias *, struct in_addr, struct in_addr, u_short, u_short, int, int);
+
+static struct alias_link *
+ FindLinkIn (struct libalias *, struct in_addr, struct in_addr, u_short, u_short, int, int);
+
+
+#define ALIAS_PORT_BASE 0x08000
+#define ALIAS_PORT_MASK 0x07fff
+#define ALIAS_PORT_MASK_EVEN 0x07ffe
+#define GET_NEW_PORT_MAX_ATTEMPTS 20
+
+#define GET_ALIAS_PORT -1
+#define GET_ALIAS_ID GET_ALIAS_PORT
+
+#define FIND_EVEN_ALIAS_BASE 1
+
+/* GetNewPort() allocates port numbers. Note that if a port number
+ is already in use, that does not mean that it cannot be used by
+ another link concurrently. This is because GetNewPort() looks for
+ unused triplets: (dest addr, dest port, alias port). */
+
+static int
+GetNewPort(struct libalias *la, struct alias_link *lnk, int alias_port_param)
+{
+ int i;
+ int max_trials;
+ u_short port_sys;
+ u_short port_net;
+
+ LIBALIAS_LOCK_ASSERT(la);
+/*
+ Description of alias_port_param for GetNewPort(). When
+ this parameter is zero or positive, it precisely specifies
+ the port number. GetNewPort() will return this number
+ without checking whether it is in use.
+
+ When this parameter is GET_ALIAS_PORT, a randomly selected
+ port number is chosen.
+*/
+
+ if (alias_port_param == GET_ALIAS_PORT) {
+ /*
+ * The aliasing port is automatically selected by one of
+ * two methods below:
+ */
+ max_trials = GET_NEW_PORT_MAX_ATTEMPTS;
+
+ if (la->packetAliasMode & PKT_ALIAS_SAME_PORTS) {
+ /*
+ * When the PKT_ALIAS_SAME_PORTS option is chosen,
+ * the first try will be the actual source port. If
+ * this is already in use, the remainder of the
+ * trials will be random.
+ */
+ port_net = lnk->src_port;
+ port_sys = ntohs(port_net);
+ } else {
+ /* First trial and all subsequent are random. */
+ port_sys = arc4random() & ALIAS_PORT_MASK;
+ port_sys += ALIAS_PORT_BASE;
+ port_net = htons(port_sys);
+ }
+ } else if (alias_port_param >= 0 && alias_port_param < 0x10000) {
+ lnk->alias_port = (u_short) alias_port_param;
+ return (0);
+ } else {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr, "PacketAlias/GetNewPort(): ");
+ fprintf(stderr, "input parameter error\n");
+#endif
+ return (-1);
+ }
+
+
+/* Port number search */
+ for (i = 0; i < max_trials; i++) {
+ int go_ahead;
+ struct alias_link *search_result;
+
+ search_result = FindLinkIn(la, lnk->dst_addr, lnk->alias_addr,
+ lnk->dst_port, port_net,
+ lnk->link_type, 0);
+
+ if (search_result == NULL)
+ go_ahead = 1;
+ else if (!(lnk->flags & LINK_PARTIALLY_SPECIFIED)
+ && (search_result->flags & LINK_PARTIALLY_SPECIFIED))
+ go_ahead = 1;
+ else
+ go_ahead = 0;
+
+ if (go_ahead) {
+#ifndef NO_USE_SOCKETS
+ if ((la->packetAliasMode & PKT_ALIAS_USE_SOCKETS)
+ && (lnk->flags & LINK_PARTIALLY_SPECIFIED)
+ && ((lnk->link_type == LINK_TCP) ||
+ (lnk->link_type == LINK_UDP))) {
+ if (GetSocket(la, port_net, &lnk->sockfd, lnk->link_type)) {
+ lnk->alias_port = port_net;
+ return (0);
+ }
+ } else {
+#endif
+ lnk->alias_port = port_net;
+ return (0);
+#ifndef NO_USE_SOCKETS
+ }
+#endif
+ }
+ port_sys = arc4random() & ALIAS_PORT_MASK;
+ port_sys += ALIAS_PORT_BASE;
+ port_net = htons(port_sys);
+ }
+
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr, "PacketAlias/GetnewPort(): ");
+ fprintf(stderr, "could not find free port\n");
+#endif
+
+ return (-1);
+}
+
+#ifndef NO_USE_SOCKETS
+static u_short
+GetSocket(struct libalias *la, u_short port_net, int *sockfd, int link_type)
+{
+ int err;
+ int sock;
+ struct sockaddr_in sock_addr;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ if (link_type == LINK_TCP)
+ sock = socket(AF_INET, SOCK_STREAM, 0);
+ else if (link_type == LINK_UDP)
+ sock = socket(AF_INET, SOCK_DGRAM, 0);
+ else {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr, "PacketAlias/GetSocket(): ");
+ fprintf(stderr, "incorrect link type\n");
+#endif
+ return (0);
+ }
+
+ if (sock < 0) {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr, "PacketAlias/GetSocket(): ");
+ fprintf(stderr, "socket() error %d\n", *sockfd);
+#endif
+ return (0);
+ }
+ sock_addr.sin_family = AF_INET;
+ sock_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+ sock_addr.sin_port = port_net;
+
+ err = bind(sock,
+ (struct sockaddr *)&sock_addr,
+ sizeof(sock_addr));
+ if (err == 0) {
+ la->sockCount++;
+ *sockfd = sock;
+ return (1);
+ } else {
+ close(sock);
+ return (0);
+ }
+}
+#endif
+
+/* FindNewPortGroup() returns a base port number for an available
+ range of contiguous port numbers. Note that if a port number
+ is already in use, that does not mean that it cannot be used by
+ another link concurrently. This is because FindNewPortGroup()
+ looks for unused triplets: (dest addr, dest port, alias port). */
+
+int
+FindNewPortGroup(struct libalias *la,
+ struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short src_port,
+ u_short dst_port,
+ u_short port_count,
+ u_char proto,
+ u_char align)
+{
+ int i, j;
+ int max_trials;
+ u_short port_sys;
+ int link_type;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ /*
+ * Get link_type from protocol
+ */
+
+ switch (proto) {
+ case IPPROTO_UDP:
+ link_type = LINK_UDP;
+ break;
+ case IPPROTO_TCP:
+ link_type = LINK_TCP;
+ break;
+ default:
+ return (0);
+ break;
+ }
+
+ /*
+ * The aliasing port is automatically selected by one of two
+ * methods below:
+ */
+ max_trials = GET_NEW_PORT_MAX_ATTEMPTS;
+
+ if (la->packetAliasMode & PKT_ALIAS_SAME_PORTS) {
+ /*
+ * When the PKT_ALIAS_SAME_PORTS option is chosen, the first
+ * try will be the actual source port. If this is already
+ * in use, the remainder of the trials will be random.
+ */
+ port_sys = ntohs(src_port);
+
+ } else {
+
+ /* First trial and all subsequent are random. */
+ if (align == FIND_EVEN_ALIAS_BASE)
+ port_sys = arc4random() & ALIAS_PORT_MASK_EVEN;
+ else
+ port_sys = arc4random() & ALIAS_PORT_MASK;
+
+ port_sys += ALIAS_PORT_BASE;
+ }
+
+/* Port number search */
+ for (i = 0; i < max_trials; i++) {
+
+ struct alias_link *search_result;
+
+ for (j = 0; j < port_count; j++)
+ if (0 != (search_result = FindLinkIn(la, dst_addr, alias_addr,
+ dst_port, htons(port_sys + j),
+ link_type, 0)))
+ break;
+
+ /* Found a good range, return base */
+ if (j == port_count)
+ return (htons(port_sys));
+
+ /* Find a new base to try */
+ if (align == FIND_EVEN_ALIAS_BASE)
+ port_sys = arc4random() & ALIAS_PORT_MASK_EVEN;
+ else
+ port_sys = arc4random() & ALIAS_PORT_MASK;
+
+ port_sys += ALIAS_PORT_BASE;
+ }
+
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr, "PacketAlias/FindNewPortGroup(): ");
+ fprintf(stderr, "could not find free port(s)\n");
+#endif
+
+ return (0);
+}
+
+static void
+CleanupAliasData(struct libalias *la)
+{
+ struct alias_link *lnk;
+ int i;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ for (i = 0; i < LINK_TABLE_OUT_SIZE; i++) {
+ lnk = LIST_FIRST(&la->linkTableOut[i]);
+ while (lnk != NULL) {
+ struct alias_link *link_next = LIST_NEXT(lnk, list_out);
+ DeleteLink(lnk);
+ lnk = link_next;
+ }
+ }
+
+ la->cleanupIndex = 0;
+}
+
+
+static void
+IncrementalCleanup(struct libalias *la)
+{
+ struct alias_link *lnk, *lnk_tmp;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ LIST_FOREACH_SAFE(lnk, &la->linkTableOut[la->cleanupIndex++],
+ list_out, lnk_tmp) {
+ if (la->timeStamp - lnk->timestamp > lnk->expire_time)
+ DeleteLink(lnk);
+ }
+
+ if (la->cleanupIndex == LINK_TABLE_OUT_SIZE)
+ la->cleanupIndex = 0;
+}
+
+static void
+DeleteLink(struct alias_link *lnk)
+{
+ struct libalias *la = lnk->la;
+
+ LIBALIAS_LOCK_ASSERT(la);
+/* Don't do anything if the link is marked permanent */
+ if (la->deleteAllLinks == 0 && lnk->flags & LINK_PERMANENT)
+ return;
+
+#ifndef NO_FW_PUNCH
+/* Delete associated firewall hole, if any */
+ ClearFWHole(lnk);
+#endif
+
+/* Free memory allocated for LSNAT server pool */
+ if (lnk->server != NULL) {
+ struct server *head, *curr, *next;
+
+ head = curr = lnk->server;
+ do {
+ next = curr->next;
+ free(curr);
+ } while ((curr = next) != head);
+ }
+/* Adjust output table pointers */
+ LIST_REMOVE(lnk, list_out);
+
+/* Adjust input table pointers */
+ LIST_REMOVE(lnk, list_in);
+#ifndef NO_USE_SOCKETS
+/* Close socket, if one has been allocated */
+ if (lnk->sockfd != -1) {
+ la->sockCount--;
+ close(lnk->sockfd);
+ }
+#endif
+/* Link-type dependent cleanup */
+ switch (lnk->link_type) {
+ case LINK_ICMP:
+ la->icmpLinkCount--;
+ break;
+ case LINK_UDP:
+ la->udpLinkCount--;
+ break;
+ case LINK_TCP:
+ la->tcpLinkCount--;
+ free(lnk->data.tcp);
+ break;
+ case LINK_PPTP:
+ la->pptpLinkCount--;
+ break;
+ case LINK_FRAGMENT_ID:
+ la->fragmentIdLinkCount--;
+ break;
+ case LINK_FRAGMENT_PTR:
+ la->fragmentPtrLinkCount--;
+ if (lnk->data.frag_ptr != NULL)
+ free(lnk->data.frag_ptr);
+ break;
+ case LINK_ADDR:
+ break;
+ default:
+ la->protoLinkCount--;
+ break;
+ }
+
+/* Free memory */
+ free(lnk);
+
+/* Write statistics, if logging enabled */
+ if (la->packetAliasMode & PKT_ALIAS_LOG) {
+ ShowAliasStats(la);
+ }
+}
+
+
+static struct alias_link *
+AddLink(struct libalias *la, struct in_addr src_addr,
+ struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short src_port,
+ u_short dst_port,
+ int alias_port_param, /* if less than zero, the alias port
+ * is chosen automatically; if zero or
+ * greater, it is used as the alias port */
+ int link_type)
+{
+ u_int start_point;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = malloc(sizeof(struct alias_link));
+ if (lnk != NULL) {
+ /* Basic initialization */
+ lnk->la = la;
+ lnk->src_addr = src_addr;
+ lnk->dst_addr = dst_addr;
+ lnk->alias_addr = alias_addr;
+ lnk->proxy_addr.s_addr = INADDR_ANY;
+ lnk->src_port = src_port;
+ lnk->dst_port = dst_port;
+ lnk->proxy_port = 0;
+ lnk->server = NULL;
+ lnk->link_type = link_type;
+#ifndef NO_USE_SOCKETS
+ lnk->sockfd = -1;
+#endif
+ lnk->flags = 0;
+ lnk->pflags = 0;
+ lnk->timestamp = la->timeStamp;
+
+ /* Expiration time */
+ switch (link_type) {
+ case LINK_ICMP:
+ lnk->expire_time = ICMP_EXPIRE_TIME;
+ break;
+ case LINK_UDP:
+ lnk->expire_time = UDP_EXPIRE_TIME;
+ break;
+ case LINK_TCP:
+ lnk->expire_time = TCP_EXPIRE_INITIAL;
+ break;
+ case LINK_PPTP:
+ lnk->flags |= LINK_PERMANENT; /* no timeout. */
+ break;
+ case LINK_FRAGMENT_ID:
+ lnk->expire_time = FRAGMENT_ID_EXPIRE_TIME;
+ break;
+ case LINK_FRAGMENT_PTR:
+ lnk->expire_time = FRAGMENT_PTR_EXPIRE_TIME;
+ break;
+ case LINK_ADDR:
+ break;
+ default:
+ lnk->expire_time = PROTO_EXPIRE_TIME;
+ break;
+ }
+
+ /* Determine alias flags */
+ if (dst_addr.s_addr == INADDR_ANY)
+ lnk->flags |= LINK_UNKNOWN_DEST_ADDR;
+ if (dst_port == 0)
+ lnk->flags |= LINK_UNKNOWN_DEST_PORT;
+
+ /* Determine alias port */
+ if (GetNewPort(la, lnk, alias_port_param) != 0) {
+ free(lnk);
+ return (NULL);
+ }
+ /* Link-type dependent initialization */
+ switch (link_type) {
+ struct tcp_dat *aux_tcp;
+
+ case LINK_ICMP:
+ la->icmpLinkCount++;
+ break;
+ case LINK_UDP:
+ la->udpLinkCount++;
+ break;
+ case LINK_TCP:
+ aux_tcp = malloc(sizeof(struct tcp_dat));
+ if (aux_tcp != NULL) {
+ int i;
+
+ la->tcpLinkCount++;
+ aux_tcp->state.in = ALIAS_TCP_STATE_NOT_CONNECTED;
+ aux_tcp->state.out = ALIAS_TCP_STATE_NOT_CONNECTED;
+ aux_tcp->state.index = 0;
+ aux_tcp->state.ack_modified = 0;
+ for (i = 0; i < N_LINK_TCP_DATA; i++)
+ aux_tcp->ack[i].active = 0;
+ aux_tcp->fwhole = -1;
+ lnk->data.tcp = aux_tcp;
+ } else {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr, "PacketAlias/AddLink: ");
+ fprintf(stderr, " cannot allocate auxiliary TCP data\n");
+#endif
+ free(lnk);
+ return (NULL);
+ }
+ break;
+ case LINK_PPTP:
+ la->pptpLinkCount++;
+ break;
+ case LINK_FRAGMENT_ID:
+ la->fragmentIdLinkCount++;
+ break;
+ case LINK_FRAGMENT_PTR:
+ la->fragmentPtrLinkCount++;
+ break;
+ case LINK_ADDR:
+ break;
+ default:
+ la->protoLinkCount++;
+ break;
+ }
+
+ /* Set up pointers for output lookup table */
+ start_point = StartPointOut(src_addr, dst_addr,
+ src_port, dst_port, link_type);
+ LIST_INSERT_HEAD(&la->linkTableOut[start_point], lnk, list_out);
+
+ /* Set up pointers for input lookup table */
+ start_point = StartPointIn(alias_addr, lnk->alias_port, link_type);
+ LIST_INSERT_HEAD(&la->linkTableIn[start_point], lnk, list_in);
+ } else {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr, "PacketAlias/AddLink(): ");
+ fprintf(stderr, "malloc() call failed.\n");
+#endif
+ }
+ if (la->packetAliasMode & PKT_ALIAS_LOG) {
+ ShowAliasStats(la);
+ }
+ return (lnk);
+}
+
+static struct alias_link *
+ReLink(struct alias_link *old_lnk,
+ struct in_addr src_addr,
+ struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short src_port,
+ u_short dst_port,
+ int alias_port_param, /* if less than zero, the alias port
+ * is chosen automatically; if zero or
+ * greater, it is used as the alias port */
+ int link_type)
+{
+ struct alias_link *new_lnk;
+ struct libalias *la = old_lnk->la;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ new_lnk = AddLink(la, src_addr, dst_addr, alias_addr,
+ src_port, dst_port, alias_port_param,
+ link_type);
+#ifndef NO_FW_PUNCH
+ if (new_lnk != NULL &&
+ old_lnk->link_type == LINK_TCP &&
+ old_lnk->data.tcp->fwhole > 0) {
+ PunchFWHole(new_lnk);
+ }
+#endif
+ DeleteLink(old_lnk);
+ return (new_lnk);
+}
+
+static struct alias_link *
+_FindLinkOut(struct libalias *la, struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_short src_port,
+ u_short dst_port,
+ int link_type,
+ int replace_partial_links)
+{
+ u_int i;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ i = StartPointOut(src_addr, dst_addr, src_port, dst_port, link_type);
+ LIST_FOREACH(lnk, &la->linkTableOut[i], list_out) {
+ if (lnk->dst_addr.s_addr == dst_addr.s_addr &&
+ lnk->src_addr.s_addr == src_addr.s_addr &&
+ lnk->src_port == src_port &&
+ lnk->dst_port == dst_port &&
+ lnk->link_type == link_type &&
+ lnk->server == NULL) {
+ lnk->timestamp = la->timeStamp;
+ break;
+ }
+ }
+
+/* Search for partially specified links. */
+ if (lnk == NULL && replace_partial_links) {
+ if (dst_port != 0 && dst_addr.s_addr != INADDR_ANY) {
+ lnk = _FindLinkOut(la, src_addr, dst_addr, src_port, 0,
+ link_type, 0);
+ if (lnk == NULL)
+ lnk = _FindLinkOut(la, src_addr, la->nullAddress, src_port,
+ dst_port, link_type, 0);
+ }
+ if (lnk == NULL &&
+ (dst_port != 0 || dst_addr.s_addr != INADDR_ANY)) {
+ lnk = _FindLinkOut(la, src_addr, la->nullAddress, src_port, 0,
+ link_type, 0);
+ }
+ if (lnk != NULL) {
+ lnk = ReLink(lnk,
+ src_addr, dst_addr, lnk->alias_addr,
+ src_port, dst_port, lnk->alias_port,
+ link_type);
+ }
+ }
+ return (lnk);
+}
+
+static struct alias_link *
+FindLinkOut(struct libalias *la, struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_short src_port,
+ u_short dst_port,
+ int link_type,
+ int replace_partial_links)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = _FindLinkOut(la, src_addr, dst_addr, src_port, dst_port,
+ link_type, replace_partial_links);
+
+ if (lnk == NULL) {
+ /*
+ * The following allows permanent links to be specified as
+ * using the default source address (i.e. device interface
+ * address) without knowing in advance what that address
+ * is.
+ */
+ if (la->aliasAddress.s_addr != INADDR_ANY &&
+ src_addr.s_addr == la->aliasAddress.s_addr) {
+ lnk = _FindLinkOut(la, la->nullAddress, dst_addr, src_port, dst_port,
+ link_type, replace_partial_links);
+ }
+ }
+ return (lnk);
+}
+
+
+static struct alias_link *
+_FindLinkIn(struct libalias *la, struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short dst_port,
+ u_short alias_port,
+ int link_type,
+ int replace_partial_links)
+{
+ int flags_in;
+ u_int start_point;
+ struct alias_link *lnk;
+ struct alias_link *lnk_fully_specified;
+ struct alias_link *lnk_unknown_all;
+ struct alias_link *lnk_unknown_dst_addr;
+ struct alias_link *lnk_unknown_dst_port;
+
+ LIBALIAS_LOCK_ASSERT(la);
+/* Initialize pointers */
+ lnk_fully_specified = NULL;
+ lnk_unknown_all = NULL;
+ lnk_unknown_dst_addr = NULL;
+ lnk_unknown_dst_port = NULL;
+
+/* If either the dest addr or port is unknown, the search
+ loop will have to know about this. */
+
+ flags_in = 0;
+ if (dst_addr.s_addr == INADDR_ANY)
+ flags_in |= LINK_UNKNOWN_DEST_ADDR;
+ if (dst_port == 0)
+ flags_in |= LINK_UNKNOWN_DEST_PORT;
+
+/* Search loop */
+ start_point = StartPointIn(alias_addr, alias_port, link_type);
+ LIST_FOREACH(lnk, &la->linkTableIn[start_point], list_in) {
+ int flags;
+
+ flags = flags_in | lnk->flags;
+ if (!(flags & LINK_PARTIALLY_SPECIFIED)) {
+ if (lnk->alias_addr.s_addr == alias_addr.s_addr
+ && lnk->alias_port == alias_port
+ && lnk->dst_addr.s_addr == dst_addr.s_addr
+ && lnk->dst_port == dst_port
+ && lnk->link_type == link_type) {
+ lnk_fully_specified = lnk;
+ break;
+ }
+ } else if ((flags & LINK_UNKNOWN_DEST_ADDR)
+ && (flags & LINK_UNKNOWN_DEST_PORT)) {
+ if (lnk->alias_addr.s_addr == alias_addr.s_addr
+ && lnk->alias_port == alias_port
+ && lnk->link_type == link_type) {
+ if (lnk_unknown_all == NULL)
+ lnk_unknown_all = lnk;
+ }
+ } else if (flags & LINK_UNKNOWN_DEST_ADDR) {
+ if (lnk->alias_addr.s_addr == alias_addr.s_addr
+ && lnk->alias_port == alias_port
+ && lnk->link_type == link_type
+ && lnk->dst_port == dst_port) {
+ if (lnk_unknown_dst_addr == NULL)
+ lnk_unknown_dst_addr = lnk;
+ }
+ } else if (flags & LINK_UNKNOWN_DEST_PORT) {
+ if (lnk->alias_addr.s_addr == alias_addr.s_addr
+ && lnk->alias_port == alias_port
+ && lnk->link_type == link_type
+ && lnk->dst_addr.s_addr == dst_addr.s_addr) {
+ if (lnk_unknown_dst_port == NULL)
+ lnk_unknown_dst_port = lnk;
+ }
+ }
+ }
+
+
+
+ if (lnk_fully_specified != NULL) {
+ lnk_fully_specified->timestamp = la->timeStamp;
+ lnk = lnk_fully_specified;
+ } else if (lnk_unknown_dst_port != NULL)
+ lnk = lnk_unknown_dst_port;
+ else if (lnk_unknown_dst_addr != NULL)
+ lnk = lnk_unknown_dst_addr;
+ else if (lnk_unknown_all != NULL)
+ lnk = lnk_unknown_all;
+ else
+ return (NULL);
+
+ if (replace_partial_links &&
+ (lnk->flags & LINK_PARTIALLY_SPECIFIED || lnk->server != NULL)) {
+ struct in_addr src_addr;
+ u_short src_port;
+
+ if (lnk->server != NULL) { /* LSNAT link */
+ src_addr = lnk->server->addr;
+ src_port = lnk->server->port;
+ lnk->server = lnk->server->next;
+ } else {
+ src_addr = lnk->src_addr;
+ src_port = lnk->src_port;
+ }
+
+ if (link_type == LINK_SCTP) {
+ lnk->src_addr = src_addr;
+ lnk->src_port = src_port;
+ return(lnk);
+ }
+ lnk = ReLink(lnk,
+ src_addr, dst_addr, alias_addr,
+ src_port, dst_port, alias_port,
+ link_type);
+ }
+ return (lnk);
+}
+
+static struct alias_link *
+FindLinkIn(struct libalias *la, struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short dst_port,
+ u_short alias_port,
+ int link_type,
+ int replace_partial_links)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = _FindLinkIn(la, dst_addr, alias_addr, dst_port, alias_port,
+ link_type, replace_partial_links);
+
+ if (lnk == NULL) {
+ /*
+ * The following allows permanent links to be specified as
+ * using the default aliasing address (i.e. device
+ * interface address) without knowing in advance what that
+ * address is.
+ */
+ if (la->aliasAddress.s_addr != INADDR_ANY &&
+ alias_addr.s_addr == la->aliasAddress.s_addr) {
+ lnk = _FindLinkIn(la, dst_addr, la->nullAddress, dst_port, alias_port,
+ link_type, replace_partial_links);
+ }
+ }
+ return (lnk);
+}
+
+
+
+
+/* External routines for finding/adding links
+
+-- "external" means outside alias_db.c, but within alias*.c --
+
+ FindIcmpIn(), FindIcmpOut()
+ FindFragmentIn1(), FindFragmentIn2()
+ AddFragmentPtrLink(), FindFragmentPtr()
+ FindProtoIn(), FindProtoOut()
+ FindUdpTcpIn(), FindUdpTcpOut()
+ AddPptp(), FindPptpOutByCallId(), FindPptpInByCallId(),
+ FindPptpOutByPeerCallId(), FindPptpInByPeerCallId()
+ FindOriginalAddress(), FindAliasAddress()
+
+(prototypes in alias_local.h)
+*/
+
+
+struct alias_link *
+FindIcmpIn(struct libalias *la, struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short id_alias,
+ int create)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = FindLinkIn(la, dst_addr, alias_addr,
+ NO_DEST_PORT, id_alias,
+ LINK_ICMP, 0);
+ if (lnk == NULL && create && !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) {
+ struct in_addr target_addr;
+
+ target_addr = FindOriginalAddress(la, alias_addr);
+ lnk = AddLink(la, target_addr, dst_addr, alias_addr,
+ id_alias, NO_DEST_PORT, id_alias,
+ LINK_ICMP);
+ }
+ return (lnk);
+}
+
+
+struct alias_link *
+FindIcmpOut(struct libalias *la, struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_short id,
+ int create)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = FindLinkOut(la, src_addr, dst_addr,
+ id, NO_DEST_PORT,
+ LINK_ICMP, 0);
+ if (lnk == NULL && create) {
+ struct in_addr alias_addr;
+
+ alias_addr = FindAliasAddress(la, src_addr);
+ lnk = AddLink(la, src_addr, dst_addr, alias_addr,
+ id, NO_DEST_PORT, GET_ALIAS_ID,
+ LINK_ICMP);
+ }
+ return (lnk);
+}
+
+
+struct alias_link *
+FindFragmentIn1(struct libalias *la, struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short ip_id)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = FindLinkIn(la, dst_addr, alias_addr,
+ NO_DEST_PORT, ip_id,
+ LINK_FRAGMENT_ID, 0);
+
+ if (lnk == NULL) {
+ lnk = AddLink(la, la->nullAddress, dst_addr, alias_addr,
+ NO_SRC_PORT, NO_DEST_PORT, ip_id,
+ LINK_FRAGMENT_ID);
+ }
+ return (lnk);
+}
+
+
+struct alias_link *
+FindFragmentIn2(struct libalias *la, struct in_addr dst_addr, /* Doesn't add a link
+ * if one is not found. */
+ struct in_addr alias_addr,
+ u_short ip_id)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+ return FindLinkIn(la, dst_addr, alias_addr,
+ NO_DEST_PORT, ip_id,
+ LINK_FRAGMENT_ID, 0);
+}
+
+
+struct alias_link *
+AddFragmentPtrLink(struct libalias *la, struct in_addr dst_addr,
+ u_short ip_id)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+ return AddLink(la, la->nullAddress, dst_addr, la->nullAddress,
+ NO_SRC_PORT, NO_DEST_PORT, ip_id,
+ LINK_FRAGMENT_PTR);
+}
+
+
+struct alias_link *
+FindFragmentPtr(struct libalias *la, struct in_addr dst_addr,
+ u_short ip_id)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+ return FindLinkIn(la, dst_addr, la->nullAddress,
+ NO_DEST_PORT, ip_id,
+ LINK_FRAGMENT_PTR, 0);
+}
+
+
+struct alias_link *
+FindProtoIn(struct libalias *la, struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_char proto)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = FindLinkIn(la, dst_addr, alias_addr,
+ NO_DEST_PORT, 0,
+ proto, 1);
+
+ if (lnk == NULL && !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) {
+ struct in_addr target_addr;
+
+ target_addr = FindOriginalAddress(la, alias_addr);
+ lnk = AddLink(la, target_addr, dst_addr, alias_addr,
+ NO_SRC_PORT, NO_DEST_PORT, 0,
+ proto);
+ }
+ return (lnk);
+}
+
+
+struct alias_link *
+FindProtoOut(struct libalias *la, struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_char proto)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = FindLinkOut(la, src_addr, dst_addr,
+ NO_SRC_PORT, NO_DEST_PORT,
+ proto, 1);
+
+ if (lnk == NULL) {
+ struct in_addr alias_addr;
+
+ alias_addr = FindAliasAddress(la, src_addr);
+ lnk = AddLink(la, src_addr, dst_addr, alias_addr,
+ NO_SRC_PORT, NO_DEST_PORT, 0,
+ proto);
+ }
+ return (lnk);
+}
+
+
+struct alias_link *
+FindUdpTcpIn(struct libalias *la, struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short dst_port,
+ u_short alias_port,
+ u_char proto,
+ int create)
+{
+ int link_type;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ switch (proto) {
+ case IPPROTO_UDP:
+ link_type = LINK_UDP;
+ break;
+ case IPPROTO_TCP:
+ link_type = LINK_TCP;
+ break;
+ default:
+ return (NULL);
+ break;
+ }
+
+ lnk = FindLinkIn(la, dst_addr, alias_addr,
+ dst_port, alias_port,
+ link_type, create);
+
+ if (lnk == NULL && create && !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) {
+ struct in_addr target_addr;
+
+ target_addr = FindOriginalAddress(la, alias_addr);
+ lnk = AddLink(la, target_addr, dst_addr, alias_addr,
+ alias_port, dst_port, alias_port,
+ link_type);
+ }
+ return (lnk);
+}
+
+
+struct alias_link *
+FindUdpTcpOut(struct libalias *la, struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_short src_port,
+ u_short dst_port,
+ u_char proto,
+ int create)
+{
+ int link_type;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ switch (proto) {
+ case IPPROTO_UDP:
+ link_type = LINK_UDP;
+ break;
+ case IPPROTO_TCP:
+ link_type = LINK_TCP;
+ break;
+ default:
+ return (NULL);
+ break;
+ }
+
+ lnk = FindLinkOut(la, src_addr, dst_addr, src_port, dst_port, link_type, create);
+
+ if (lnk == NULL && create) {
+ struct in_addr alias_addr;
+
+ alias_addr = FindAliasAddress(la, src_addr);
+ lnk = AddLink(la, src_addr, dst_addr, alias_addr,
+ src_port, dst_port, GET_ALIAS_PORT,
+ link_type);
+ }
+ return (lnk);
+}
+
+
+struct alias_link *
+AddPptp(struct libalias *la, struct in_addr src_addr,
+ struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_int16_t src_call_id)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = AddLink(la, src_addr, dst_addr, alias_addr,
+ src_call_id, 0, GET_ALIAS_PORT,
+ LINK_PPTP);
+
+ return (lnk);
+}
+
+
+struct alias_link *
+FindPptpOutByCallId(struct libalias *la, struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_int16_t src_call_id)
+{
+ u_int i;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ i = StartPointOut(src_addr, dst_addr, 0, 0, LINK_PPTP);
+ LIST_FOREACH(lnk, &la->linkTableOut[i], list_out)
+ if (lnk->link_type == LINK_PPTP &&
+ lnk->src_addr.s_addr == src_addr.s_addr &&
+ lnk->dst_addr.s_addr == dst_addr.s_addr &&
+ lnk->src_port == src_call_id)
+ break;
+
+ return (lnk);
+}
+
+
+struct alias_link *
+FindPptpOutByPeerCallId(struct libalias *la, struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_int16_t dst_call_id)
+{
+ u_int i;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ i = StartPointOut(src_addr, dst_addr, 0, 0, LINK_PPTP);
+ LIST_FOREACH(lnk, &la->linkTableOut[i], list_out)
+ if (lnk->link_type == LINK_PPTP &&
+ lnk->src_addr.s_addr == src_addr.s_addr &&
+ lnk->dst_addr.s_addr == dst_addr.s_addr &&
+ lnk->dst_port == dst_call_id)
+ break;
+
+ return (lnk);
+}
+
+
+struct alias_link *
+FindPptpInByCallId(struct libalias *la, struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_int16_t dst_call_id)
+{
+ u_int i;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ i = StartPointIn(alias_addr, 0, LINK_PPTP);
+ LIST_FOREACH(lnk, &la->linkTableIn[i], list_in)
+ if (lnk->link_type == LINK_PPTP &&
+ lnk->dst_addr.s_addr == dst_addr.s_addr &&
+ lnk->alias_addr.s_addr == alias_addr.s_addr &&
+ lnk->dst_port == dst_call_id)
+ break;
+
+ return (lnk);
+}
+
+
+struct alias_link *
+FindPptpInByPeerCallId(struct libalias *la, struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_int16_t alias_call_id)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = FindLinkIn(la, dst_addr, alias_addr,
+ 0 /* any */ , alias_call_id,
+ LINK_PPTP, 0);
+
+
+ return (lnk);
+}
+
+
+struct alias_link *
+FindRtspOut(struct libalias *la, struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_short src_port,
+ u_short alias_port,
+ u_char proto)
+{
+ int link_type;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ switch (proto) {
+ case IPPROTO_UDP:
+ link_type = LINK_UDP;
+ break;
+ case IPPROTO_TCP:
+ link_type = LINK_TCP;
+ break;
+ default:
+ return (NULL);
+ break;
+ }
+
+ lnk = FindLinkOut(la, src_addr, dst_addr, src_port, 0, link_type, 1);
+
+ if (lnk == NULL) {
+ struct in_addr alias_addr;
+
+ alias_addr = FindAliasAddress(la, src_addr);
+ lnk = AddLink(la, src_addr, dst_addr, alias_addr,
+ src_port, 0, alias_port,
+ link_type);
+ }
+ return (lnk);
+}
+
+
+struct in_addr
+FindOriginalAddress(struct libalias *la, struct in_addr alias_addr)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = FindLinkIn(la, la->nullAddress, alias_addr,
+ 0, 0, LINK_ADDR, 0);
+ if (lnk == NULL) {
+ la->newDefaultLink = 1;
+ if (la->targetAddress.s_addr == INADDR_ANY)
+ return (alias_addr);
+ else if (la->targetAddress.s_addr == INADDR_NONE)
+ return (la->aliasAddress.s_addr != INADDR_ANY) ?
+ la->aliasAddress : alias_addr;
+ else
+ return (la->targetAddress);
+ } else {
+ if (lnk->server != NULL) { /* LSNAT link */
+ struct in_addr src_addr;
+
+ src_addr = lnk->server->addr;
+ lnk->server = lnk->server->next;
+ return (src_addr);
+ } else if (lnk->src_addr.s_addr == INADDR_ANY)
+ return (la->aliasAddress.s_addr != INADDR_ANY) ?
+ la->aliasAddress : alias_addr;
+ else
+ return (lnk->src_addr);
+ }
+}
+
+
+struct in_addr
+FindAliasAddress(struct libalias *la, struct in_addr original_addr)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = FindLinkOut(la, original_addr, la->nullAddress,
+ 0, 0, LINK_ADDR, 0);
+ if (lnk == NULL) {
+ return (la->aliasAddress.s_addr != INADDR_ANY) ?
+ la->aliasAddress : original_addr;
+ } else {
+ if (lnk->alias_addr.s_addr == INADDR_ANY)
+ return (la->aliasAddress.s_addr != INADDR_ANY) ?
+ la->aliasAddress : original_addr;
+ else
+ return (lnk->alias_addr);
+ }
+}
+
+
+/* External routines for getting or changing link data
+ (external to alias_db.c, but internal to alias*.c)
+
+ SetFragmentData(), GetFragmentData()
+ SetFragmentPtr(), GetFragmentPtr()
+ SetStateIn(), SetStateOut(), GetStateIn(), GetStateOut()
+ GetOriginalAddress(), GetDestAddress(), GetAliasAddress()
+ GetOriginalPort(), GetAliasPort()
+ SetAckModified(), GetAckModified()
+ GetDeltaAckIn(), GetDeltaSeqOut(), AddSeq()
+ SetProtocolFlags(), GetProtocolFlags()
+ SetDestCallId()
+*/
+
+
+void
+SetFragmentAddr(struct alias_link *lnk, struct in_addr src_addr)
+{
+ lnk->data.frag_addr = src_addr;
+}
+
+
+void
+GetFragmentAddr(struct alias_link *lnk, struct in_addr *src_addr)
+{
+ *src_addr = lnk->data.frag_addr;
+}
+
+
+void
+SetFragmentPtr(struct alias_link *lnk, char *fptr)
+{
+ lnk->data.frag_ptr = fptr;
+}
+
+
+void
+GetFragmentPtr(struct alias_link *lnk, char **fptr)
+{
+ *fptr = lnk->data.frag_ptr;
+}
+
+
+void
+SetStateIn(struct alias_link *lnk, int state)
+{
+ /* TCP input state */
+ switch (state) {
+ case ALIAS_TCP_STATE_DISCONNECTED:
+ if (lnk->data.tcp->state.out != ALIAS_TCP_STATE_CONNECTED)
+ lnk->expire_time = TCP_EXPIRE_DEAD;
+ else
+ lnk->expire_time = TCP_EXPIRE_SINGLEDEAD;
+ break;
+ case ALIAS_TCP_STATE_CONNECTED:
+ if (lnk->data.tcp->state.out == ALIAS_TCP_STATE_CONNECTED)
+ lnk->expire_time = TCP_EXPIRE_CONNECTED;
+ break;
+ default:
+#ifdef _KERNEL
+ panic("libalias:SetStateIn() unknown state");
+#else
+ abort();
+#endif
+ }
+ lnk->data.tcp->state.in = state;
+}
+
+
+void
+SetStateOut(struct alias_link *lnk, int state)
+{
+ /* TCP output state */
+ switch (state) {
+ case ALIAS_TCP_STATE_DISCONNECTED:
+ if (lnk->data.tcp->state.in != ALIAS_TCP_STATE_CONNECTED)
+ lnk->expire_time = TCP_EXPIRE_DEAD;
+ else
+ lnk->expire_time = TCP_EXPIRE_SINGLEDEAD;
+ break;
+ case ALIAS_TCP_STATE_CONNECTED:
+ if (lnk->data.tcp->state.in == ALIAS_TCP_STATE_CONNECTED)
+ lnk->expire_time = TCP_EXPIRE_CONNECTED;
+ break;
+ default:
+#ifdef _KERNEL
+ panic("libalias:SetStateOut() unknown state");
+#else
+ abort();
+#endif
+ }
+ lnk->data.tcp->state.out = state;
+}
+
+
+int
+GetStateIn(struct alias_link *lnk)
+{
+ /* TCP input state */
+ return (lnk->data.tcp->state.in);
+}
+
+
+int
+GetStateOut(struct alias_link *lnk)
+{
+ /* TCP output state */
+ return (lnk->data.tcp->state.out);
+}
+
+
+struct in_addr
+GetOriginalAddress(struct alias_link *lnk)
+{
+ if (lnk->src_addr.s_addr == INADDR_ANY)
+ return (lnk->la->aliasAddress);
+ else
+ return (lnk->src_addr);
+}
+
+
+struct in_addr
+GetDestAddress(struct alias_link *lnk)
+{
+ return (lnk->dst_addr);
+}
+
+
+struct in_addr
+GetAliasAddress(struct alias_link *lnk)
+{
+ if (lnk->alias_addr.s_addr == INADDR_ANY)
+ return (lnk->la->aliasAddress);
+ else
+ return (lnk->alias_addr);
+}
+
+
+struct in_addr
+GetDefaultAliasAddress(struct libalias *la)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+ return (la->aliasAddress);
+}
+
+
+void
+SetDefaultAliasAddress(struct libalias *la, struct in_addr alias_addr)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+ la->aliasAddress = alias_addr;
+}
+
+
+u_short
+GetOriginalPort(struct alias_link *lnk)
+{
+ return (lnk->src_port);
+}
+
+
+u_short
+GetAliasPort(struct alias_link *lnk)
+{
+ return (lnk->alias_port);
+}
+
+#ifndef NO_FW_PUNCH
+static u_short
+GetDestPort(struct alias_link *lnk)
+{
+ return (lnk->dst_port);
+}
+
+#endif
+
+void
+SetAckModified(struct alias_link *lnk)
+{
+/* Indicate that ACK numbers have been modified in a TCP connection */
+ lnk->data.tcp->state.ack_modified = 1;
+}
+
+
+struct in_addr
+GetProxyAddress(struct alias_link *lnk)
+{
+ return (lnk->proxy_addr);
+}
+
+
+void
+SetProxyAddress(struct alias_link *lnk, struct in_addr addr)
+{
+ lnk->proxy_addr = addr;
+}
+
+
+u_short
+GetProxyPort(struct alias_link *lnk)
+{
+ return (lnk->proxy_port);
+}
+
+
+void
+SetProxyPort(struct alias_link *lnk, u_short port)
+{
+ lnk->proxy_port = port;
+}
+
+
+int
+GetAckModified(struct alias_link *lnk)
+{
+/* See if ACK numbers have been modified */
+ return (lnk->data.tcp->state.ack_modified);
+}
+
+// XXX ip free
+int
+GetDeltaAckIn(u_long ack, struct alias_link *lnk)
+{
+/*
+Find out how much the ACK number has been altered for an incoming
+TCP packet. To do this, a circular list of ACK numbers where the TCP
+packet size was altered is searched.
+*/
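+/*
+ * Illustration (hypothetical numbers): suppose two records are active,
+ * one with ack_new = 1000 and delta = +5, another with ack_new = 2000
+ * and delta = +8, and the incoming packet acknowledges 2100. Both
+ * records lie at or below the ACK, but the one at 2000 is the closest
+ * preceding size change, so its delta of +8 is the value reported to
+ * the caller.
+ */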
+
+ int i;
+ int delta, ack_diff_min;
+
+ delta = 0;
+ ack_diff_min = -1;
+ for (i = 0; i < N_LINK_TCP_DATA; i++) {
+ struct ack_data_record x;
+
+ x = lnk->data.tcp->ack[i];
+ if (x.active == 1) {
+ int ack_diff;
+
+ ack_diff = SeqDiff(x.ack_new, ack);
+ if (ack_diff >= 0) {
+ if (ack_diff_min >= 0) {
+ if (ack_diff < ack_diff_min) {
+ delta = x.delta;
+ ack_diff_min = ack_diff;
+ }
+ } else {
+ delta = x.delta;
+ ack_diff_min = ack_diff;
+ }
+ }
+ }
+ }
+ return (delta);
+}
+
+// XXX ip free
+int
+GetDeltaSeqOut(u_long seq, struct alias_link *lnk)
+{
+/*
+Find out how much the sequence number has been altered for an outgoing
+TCP packet. To do this, a circular list of ACK numbers where the TCP
+packet size was altered is searched.
+*/
+
+ int i;
+ int delta, seq_diff_min;
+
+ delta = 0;
+ seq_diff_min = -1;
+ for (i = 0; i < N_LINK_TCP_DATA; i++) {
+ struct ack_data_record x;
+
+ x = lnk->data.tcp->ack[i];
+ if (x.active == 1) {
+ int seq_diff;
+
+ seq_diff = SeqDiff(x.ack_old, seq);
+ if (seq_diff >= 0) {
+ if (seq_diff_min >= 0) {
+ if (seq_diff < seq_diff_min) {
+ delta = x.delta;
+ seq_diff_min = seq_diff;
+ }
+ } else {
+ delta = x.delta;
+ seq_diff_min = seq_diff;
+ }
+ }
+ }
+ }
+ return (delta);
+}
+
+// XXX ip free
+void
+AddSeq(struct alias_link *lnk, int delta, u_int ip_hl, u_short ip_len,
+ u_long th_seq, u_int th_off)
+{
+/*
+When a TCP packet has been altered in length, save this
+information in a circular list. If enough packets have
+been altered, then this list will begin to overwrite itself.
+*/
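+/*
+ * Worked example (hypothetical numbers): with ip_hl = 5 and th_off = 5
+ * the headers take (5 + 5) << 2 = 40 bytes; for ip_len = 100 the payload
+ * is 60 bytes. If the payload grew by delta = +3, the record stores
+ * ack_old = th_seq + 60 (end of the original data) and
+ * ack_new = th_seq + 63 (end of the rewritten data).
+ */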
+
+ struct ack_data_record x;
+ int hlen, tlen, dlen;
+ int i;
+
+ hlen = (ip_hl + th_off) << 2;
+ tlen = ntohs(ip_len);
+ dlen = tlen - hlen;
+
+ x.ack_old = htonl(ntohl(th_seq) + dlen);
+ x.ack_new = htonl(ntohl(th_seq) + dlen + delta);
+ x.delta = delta;
+ x.active = 1;
+
+ i = lnk->data.tcp->state.index;
+ lnk->data.tcp->ack[i] = x;
+
+ i++;
+ if (i == N_LINK_TCP_DATA)
+ lnk->data.tcp->state.index = 0;
+ else
+ lnk->data.tcp->state.index = i;
+}
+
+void
+SetExpire(struct alias_link *lnk, int expire)
+{
+ if (expire == 0) {
+ lnk->flags &= ~LINK_PERMANENT;
+ DeleteLink(lnk);
+ } else if (expire == -1) {
+ lnk->flags |= LINK_PERMANENT;
+ } else if (expire > 0) {
+ lnk->expire_time = expire;
+ } else {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr, "PacketAlias/SetExpire(): ");
+ fprintf(stderr, "error in expire parameter\n");
+#endif
+ }
+}
+
+void
+ClearCheckNewLink(struct libalias *la)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+ la->newDefaultLink = 0;
+}
+
+void
+SetProtocolFlags(struct alias_link *lnk, int pflags)
+{
+
+ lnk->pflags = pflags;
+}
+
+int
+GetProtocolFlags(struct alias_link *lnk)
+{
+
+ return (lnk->pflags);
+}
+
+void
+SetDestCallId(struct alias_link *lnk, u_int16_t cid)
+{
+ struct libalias *la = lnk->la;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ la->deleteAllLinks = 1;
+ ReLink(lnk, lnk->src_addr, lnk->dst_addr, lnk->alias_addr,
+ lnk->src_port, cid, lnk->alias_port, lnk->link_type);
+ la->deleteAllLinks = 0;
+}
+
+
+/* Miscellaneous Functions
+
+ HouseKeeping()
+ InitPacketAliasLog()
+ UninitPacketAliasLog()
+*/
+
+/*
+ Whenever an outgoing or incoming packet is handled, HouseKeeping()
+ is called to find and remove timed-out aliasing links. Logic exists
+ to sweep through the entire table and linked list structure
+ every 60 seconds.
+
+ (prototype in alias_local.h)
+*/
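+/*
+ * Example of the incremental sweep (hypothetical sizes): if the output
+ * table had 4000 spokes and the cleanup interval were 60 seconds, a call
+ * made 3 seconds after the previous cleanup would visit
+ * 4000 * 3 / 60 = 200 spokes (capped at ALIAS_CLEANUP_MAX_SPOKES), so the
+ * whole table is covered roughly once per interval without long pauses.
+ */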
+
+void
+HouseKeeping(struct libalias *la)
+{
+ int i, n;
+#ifndef _KERNEL
+ struct timeval tv;
+ struct timezone tz;
+#endif
+
+ LIBALIAS_LOCK_ASSERT(la);
+ /*
+ * Save system time (seconds) in global variable timeStamp for use
+ * by other functions. This is done so as not to unnecessarily
+ * waste time by making system calls.
+ */
+#ifdef _KERNEL
+ la->timeStamp = time_uptime;
+#else
+ gettimeofday(&tv, &tz);
+ la->timeStamp = tv.tv_sec;
+#endif
+
+ /* Compute number of spokes (output table link chains) to cover */
+ n = LINK_TABLE_OUT_SIZE * (la->timeStamp - la->lastCleanupTime);
+ n /= ALIAS_CLEANUP_INTERVAL_SECS;
+
+ /* Handle different cases */
+ if (n > 0) {
+ if (n > ALIAS_CLEANUP_MAX_SPOKES)
+ n = ALIAS_CLEANUP_MAX_SPOKES;
+ la->lastCleanupTime = la->timeStamp;
+ for (i = 0; i < n; i++)
+ IncrementalCleanup(la);
+ } else if (n < 0) {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr, "PacketAlias/HouseKeeping(): ");
+ fprintf(stderr, "something unexpected in time values\n");
+#endif
+ la->lastCleanupTime = la->timeStamp;
+ }
+}
+
+/* Init the log file and enable logging */
+static int
+InitPacketAliasLog(struct libalias *la)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+ if (~la->packetAliasMode & PKT_ALIAS_LOG) {
+#ifdef _KERNEL
+ if ((la->logDesc = malloc(LIBALIAS_BUF_SIZE)))
+ ;
+#else
+ if ((la->logDesc = fopen("/var/log/alias.log", "w")))
+ fprintf(la->logDesc, "PacketAlias/InitPacketAliasLog: Packet alias logging enabled.\n");
+#endif
+ else
+ return (ENOMEM); /* log initialization failed */
+ la->packetAliasMode |= PKT_ALIAS_LOG;
+ }
+
+ return (1);
+}
+
+/* Close the log-file and disable logging. */
+static void
+UninitPacketAliasLog(struct libalias *la)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+ if (la->logDesc) {
+#ifdef _KERNEL
+ free(la->logDesc);
+#else
+ fclose(la->logDesc);
+#endif
+ la->logDesc = NULL;
+ }
+ la->packetAliasMode &= ~PKT_ALIAS_LOG;
+}
+
+/* Outside world interfaces
+
+-- "outside world" means other than alias*.c routines --
+
+ PacketAliasRedirectPort()
+ PacketAliasAddServer()
+ PacketAliasRedirectProto()
+ PacketAliasRedirectAddr()
+ PacketAliasRedirectDynamic()
+ PacketAliasRedirectDelete()
+ PacketAliasSetAddress()
+ PacketAliasInit()
+ PacketAliasUninit()
+ PacketAliasSetMode()
+
+(prototypes in alias.h)
+*/
+
+/* Redirection from a specific public addr:port to a
+ private addr:port */
+struct alias_link *
+LibAliasRedirectPort(struct libalias *la, struct in_addr src_addr, u_short src_port,
+ struct in_addr dst_addr, u_short dst_port,
+ struct in_addr alias_addr, u_short alias_port,
+ u_char proto)
+{
+ int link_type;
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK(la);
+ switch (proto) {
+ case IPPROTO_UDP:
+ link_type = LINK_UDP;
+ break;
+ case IPPROTO_TCP:
+ link_type = LINK_TCP;
+ break;
+ case IPPROTO_SCTP:
+ link_type = LINK_SCTP;
+ break;
+ default:
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr, "PacketAliasRedirectPort(): ");
+ fprintf(stderr, "only SCTP, TCP and UDP protocols allowed\n");
+#endif
+ lnk = NULL;
+ goto getout;
+ }
+
+ lnk = AddLink(la, src_addr, dst_addr, alias_addr,
+ src_port, dst_port, alias_port,
+ link_type);
+
+ if (lnk != NULL) {
+ lnk->flags |= LINK_PERMANENT;
+ }
+#ifdef LIBALIAS_DEBUG
+ else {
+ fprintf(stderr, "PacketAliasRedirectPort(): "
+ "call to AddLink() failed\n");
+ }
+#endif
+
+getout:
+ LIBALIAS_UNLOCK(la);
+ return (lnk);
+}
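+
+/*
+ * Usage sketch (addresses and ports are hypothetical): to forward TCP
+ * connections arriving on the alias address at port 8080 to a private
+ * host's port 80, regardless of the remote peer:
+ *
+ *	struct in_addr priv, any, pub;
+ *	priv.s_addr = inet_addr("10.0.0.5");
+ *	any.s_addr = INADDR_ANY;
+ *	pub.s_addr = inet_addr("192.0.2.1");
+ *	(void)LibAliasRedirectPort(la, priv, htons(80), any, 0,
+ *	    pub, htons(8080), IPPROTO_TCP);
+ */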
+
+/* Add server to the pool of servers */
+int
+LibAliasAddServer(struct libalias *la, struct alias_link *lnk, struct in_addr addr, u_short port)
+{
+ struct server *server;
+ int res;
+
+ LIBALIAS_LOCK(la);
+ (void)la;
+
+ server = malloc(sizeof(struct server));
+
+ if (server != NULL) {
+ struct server *head;
+
+ server->addr = addr;
+ server->port = port;
+
+ head = lnk->server;
+ if (head == NULL)
+ server->next = server;
+ else {
+ struct server *s;
+
+ for (s = head; s->next != head; s = s->next);
+ s->next = server;
+ server->next = head;
+ }
+ lnk->server = server;
+ res = 0;
+ } else
+ res = -1;
+
+ LIBALIAS_UNLOCK(la);
+ return (res);
+}
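+
+/*
+ * Sketch of LSNAT use (hypothetical addresses): given a redirection link
+ * lnk obtained from LibAliasRedirectPort() or LibAliasRedirectAddr(),
+ * several real servers can be attached to it; FindOriginalAddress() then
+ * hands incoming connections to the pool in round-robin order:
+ *
+ *	LibAliasAddServer(la, lnk, srv1_addr, htons(80));
+ *	LibAliasAddServer(la, lnk, srv2_addr, htons(80));
+ */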
+
+/* Redirect packets of a given IP protocol from a specific
+ public address to a private address */
+struct alias_link *
+LibAliasRedirectProto(struct libalias *la, struct in_addr src_addr,
+ struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_char proto)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK(la);
+ lnk = AddLink(la, src_addr, dst_addr, alias_addr,
+ NO_SRC_PORT, NO_DEST_PORT, 0,
+ proto);
+
+ if (lnk != NULL) {
+ lnk->flags |= LINK_PERMANENT;
+ }
+#ifdef LIBALIAS_DEBUG
+ else {
+ fprintf(stderr, "PacketAliasRedirectProto(): "
+ "call to AddLink() failed\n");
+ }
+#endif
+
+ LIBALIAS_UNLOCK(la);
+ return (lnk);
+}
+
+/* Static address translation */
+struct alias_link *
+LibAliasRedirectAddr(struct libalias *la, struct in_addr src_addr,
+ struct in_addr alias_addr)
+{
+ struct alias_link *lnk;
+
+ LIBALIAS_LOCK(la);
+ lnk = AddLink(la, src_addr, la->nullAddress, alias_addr,
+ 0, 0, 0,
+ LINK_ADDR);
+
+ if (lnk != NULL) {
+ lnk->flags |= LINK_PERMANENT;
+ }
+#ifdef LIBALIAS_DEBUG
+ else {
+ fprintf(stderr, "PacketAliasRedirectAddr(): "
+ "call to AddLink() failed\n");
+ }
+#endif
+
+ LIBALIAS_UNLOCK(la);
+ return (lnk);
+}
+
+
+/* Mark the aliasing link dynamic */
+int
+LibAliasRedirectDynamic(struct libalias *la, struct alias_link *lnk)
+{
+ int res;
+
+ LIBALIAS_LOCK(la);
+ (void)la;
+
+ if (lnk->flags & LINK_PARTIALLY_SPECIFIED)
+ res = -1;
+ else {
+ lnk->flags &= ~LINK_PERMANENT;
+ res = 0;
+ }
+ LIBALIAS_UNLOCK(la);
+ return (res);
+}
+
+
+void
+LibAliasRedirectDelete(struct libalias *la, struct alias_link *lnk)
+{
+/* This is a dangerous function to put in the API,
+ because an invalid pointer can crash the program. */
+
+ LIBALIAS_LOCK(la);
+ la->deleteAllLinks = 1;
+ DeleteLink(lnk);
+ la->deleteAllLinks = 0;
+ LIBALIAS_UNLOCK(la);
+}
+
+
+void
+LibAliasSetAddress(struct libalias *la, struct in_addr addr)
+{
+
+ LIBALIAS_LOCK(la);
+ if (la->packetAliasMode & PKT_ALIAS_RESET_ON_ADDR_CHANGE
+ && la->aliasAddress.s_addr != addr.s_addr)
+ CleanupAliasData(la);
+
+ la->aliasAddress = addr;
+ LIBALIAS_UNLOCK(la);
+}
+
+
+void
+LibAliasSetTarget(struct libalias *la, struct in_addr target_addr)
+{
+
+ LIBALIAS_LOCK(la);
+ la->targetAddress = target_addr;
+ LIBALIAS_UNLOCK(la);
+}
+
+static void
+finishoff(void)
+{
+
+ while (!LIST_EMPTY(&instancehead))
+ LibAliasUninit(LIST_FIRST(&instancehead));
+}
+
+struct libalias *
+LibAliasInit(struct libalias *la)
+{
+ int i;
+#ifndef _KERNEL
+ struct timeval tv;
+ struct timezone tz;
+#endif
+
+ if (la == NULL) {
+ la = calloc(sizeof *la, 1);
+ if (la == NULL)
+ return (la);
+
+#ifndef _KERNEL /* kernel cleans up on module unload */
+ if (LIST_EMPTY(&instancehead))
+ atexit(finishoff);
+#endif
+ LIST_INSERT_HEAD(&instancehead, la, instancelist);
+
+#ifdef _KERNEL
+ la->timeStamp = time_uptime;
+ la->lastCleanupTime = time_uptime;
+#else
+ gettimeofday(&tv, &tz);
+ la->timeStamp = tv.tv_sec;
+ la->lastCleanupTime = tv.tv_sec;
+#endif
+
+ for (i = 0; i < LINK_TABLE_OUT_SIZE; i++)
+ LIST_INIT(&la->linkTableOut[i]);
+ for (i = 0; i < LINK_TABLE_IN_SIZE; i++)
+ LIST_INIT(&la->linkTableIn[i]);
+#ifdef _KERNEL
+ AliasSctpInit(la);
+#endif
+ LIBALIAS_LOCK_INIT(la);
+ LIBALIAS_LOCK(la);
+ } else {
+ LIBALIAS_LOCK(la);
+ la->deleteAllLinks = 1;
+ CleanupAliasData(la);
+ la->deleteAllLinks = 0;
+#ifdef _KERNEL
+ AliasSctpTerm(la);
+ AliasSctpInit(la);
+#endif
+ }
+
+ la->aliasAddress.s_addr = INADDR_ANY;
+ la->targetAddress.s_addr = INADDR_ANY;
+
+ la->icmpLinkCount = 0;
+ la->udpLinkCount = 0;
+ la->tcpLinkCount = 0;
+ la->sctpLinkCount = 0;
+ la->pptpLinkCount = 0;
+ la->protoLinkCount = 0;
+ la->fragmentIdLinkCount = 0;
+ la->fragmentPtrLinkCount = 0;
+ la->sockCount = 0;
+
+ la->cleanupIndex = 0;
+
+ la->packetAliasMode = PKT_ALIAS_SAME_PORTS
+#ifndef NO_USE_SOCKETS
+ | PKT_ALIAS_USE_SOCKETS
+#endif
+ | PKT_ALIAS_RESET_ON_ADDR_CHANGE;
+#ifndef NO_FW_PUNCH
+ la->fireWallFD = -1;
+#endif
+#ifndef _KERNEL
+ LibAliasRefreshModules();
+#endif
+ LIBALIAS_UNLOCK(la);
+ return (la);
+}
+
+void
+LibAliasUninit(struct libalias *la)
+{
+
+ LIBALIAS_LOCK(la);
+#ifdef _KERNEL
+ AliasSctpTerm(la);
+#endif
+ la->deleteAllLinks = 1;
+ CleanupAliasData(la);
+ la->deleteAllLinks = 0;
+ UninitPacketAliasLog(la);
+#ifndef NO_FW_PUNCH
+ UninitPunchFW(la);
+#endif
+ LIST_REMOVE(la, instancelist);
+ LIBALIAS_UNLOCK(la);
+ LIBALIAS_LOCK_DESTROY(la);
+ free(la);
+}
+
+/* Change mode for some operations */
+unsigned int
+LibAliasSetMode(
+ struct libalias *la,
+ unsigned int flags, /* Which state to bring flags to */
+ unsigned int mask /* Mask of which flags to affect (use 0 to
+ * do a probe for flag values) */
+)
+{
+ int res = -1;
+
+ LIBALIAS_LOCK(la);
+/* Enable logging? */
+ if (flags & mask & PKT_ALIAS_LOG) {
+ /* Do the enable */
+ if (InitPacketAliasLog(la) == ENOMEM)
+ goto getout;
+ } else
+/* _Disable_ logging? */
+ if (~flags & mask & PKT_ALIAS_LOG) {
+ UninitPacketAliasLog(la);
+ }
+#ifndef NO_FW_PUNCH
+/* Start punching holes in the firewall? */
+ if (flags & mask & PKT_ALIAS_PUNCH_FW) {
+ InitPunchFW(la);
+ } else
+/* Stop punching holes in the firewall? */
+ if (~flags & mask & PKT_ALIAS_PUNCH_FW) {
+ UninitPunchFW(la);
+ }
+#endif
+
+/* Other flags can be set/cleared without special action */
+ la->packetAliasMode = (flags & mask) | (la->packetAliasMode & ~mask);
+ res = la->packetAliasMode;
+getout:
+ LIBALIAS_UNLOCK(la);
+ return (res);
+}
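+
+/*
+ * Usage sketch: a zero mask only probes the current mode word, while
+ * passing the same bit in both flags and mask sets it, e.g. to turn on
+ * packet logging:
+ *
+ *	mode = LibAliasSetMode(la, 0, 0);
+ *	LibAliasSetMode(la, PKT_ALIAS_LOG, PKT_ALIAS_LOG);
+ */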
+
+
+int
+LibAliasCheckNewLink(struct libalias *la)
+{
+ int res;
+
+ LIBALIAS_LOCK(la);
+ res = la->newDefaultLink;
+ LIBALIAS_UNLOCK(la);
+ return (res);
+}
+
+
+#ifndef NO_FW_PUNCH
+
+/*****************
+ Code to support firewall punching. This shouldn't really be in this
+ file, but making variables global is evil too.
+ ****************/
+
+/* Firewall include files */
+#include <freebsd/net/if.h>
+#include <freebsd/netinet/ip_fw.h>
+#include <freebsd/string.h>
+#include <freebsd/err.h>
+
+/*
+ * Helper function: advances the cmd pointer past the current command
+ * (by its encoded length) and clears the first word of the next
+ * command in case it has been clobbered before.
+ */
+static ipfw_insn *
+next_cmd(ipfw_insn * cmd)
+{
+ cmd += F_LEN(cmd);
+ bzero(cmd, sizeof(*cmd));
+ return (cmd);
+}
+
+/*
+ * A function to fill simple commands of size 1.
+ * Existing flags are preserved.
+ */
+static ipfw_insn *
+fill_cmd(ipfw_insn * cmd, enum ipfw_opcodes opcode, int size,
+ int flags, u_int16_t arg)
+{
+ cmd->opcode = opcode;
+ cmd->len = ((cmd->len | flags) & (F_NOT | F_OR)) | (size & F_LEN_MASK);
+ cmd->arg1 = arg;
+ return next_cmd(cmd);
+}
+
+static ipfw_insn *
+fill_ip(ipfw_insn * cmd1, enum ipfw_opcodes opcode, u_int32_t addr)
+{
+ ipfw_insn_ip *cmd = (ipfw_insn_ip *) cmd1;
+
+ cmd->addr.s_addr = addr;
+ return fill_cmd(cmd1, opcode, F_INSN_SIZE(ipfw_insn_u32), 0, 0);
+}
+
+static ipfw_insn *
+fill_one_port(ipfw_insn * cmd1, enum ipfw_opcodes opcode, u_int16_t port)
+{
+ ipfw_insn_u16 *cmd = (ipfw_insn_u16 *) cmd1;
+
+ cmd->ports[0] = cmd->ports[1] = port;
+ return fill_cmd(cmd1, opcode, F_INSN_SIZE(ipfw_insn_u16), 0, 0);
+}
+
+static int
+fill_rule(void *buf, int bufsize, int rulenum,
+ enum ipfw_opcodes action, int proto,
+ struct in_addr sa, u_int16_t sp, struct in_addr da, u_int16_t dp)
+{
+ struct ip_fw *rule = (struct ip_fw *)buf;
+ ipfw_insn *cmd = (ipfw_insn *) rule->cmd;
+
+ bzero(buf, bufsize);
+ rule->rulenum = rulenum;
+
+ cmd = fill_cmd(cmd, O_PROTO, F_INSN_SIZE(ipfw_insn), 0, proto);
+ cmd = fill_ip(cmd, O_IP_SRC, sa.s_addr);
+ cmd = fill_one_port(cmd, O_IP_SRCPORT, sp);
+ cmd = fill_ip(cmd, O_IP_DST, da.s_addr);
+ cmd = fill_one_port(cmd, O_IP_DSTPORT, dp);
+
+ rule->act_ofs = (u_int32_t *) cmd - (u_int32_t *) rule->cmd;
+ cmd = fill_cmd(cmd, action, F_INSN_SIZE(ipfw_insn), 0, 0);
+
+ rule->cmd_len = (u_int32_t *) cmd - (u_int32_t *) rule->cmd;
+
+ return ((char *)cmd - (char *)buf);
+}
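+
+/*
+ * The buffer produced above holds a single ipfw rule whose match part is
+ * the instruction sequence O_PROTO, O_IP_SRC, O_IP_SRCPORT, O_IP_DST,
+ * O_IP_DSTPORT, followed by the action opcode; act_ofs and cmd_len are
+ * expressed in 32-bit words, as the ipfw interface expects.
+ */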
+
+static void ClearAllFWHoles(struct libalias *la);
+
+
+#define fw_setfield(la, field, num) \
+do { \
+ (field)[(num) - la->fireWallBaseNum] = 1; \
+} /*lint -save -e717 */ while(0)/* lint -restore */
+
+#define fw_clrfield(la, field, num) \
+do { \
+ (field)[(num) - la->fireWallBaseNum] = 0; \
+} /*lint -save -e717 */ while(0)/* lint -restore */
+
+#define fw_tstfield(la, field, num) ((field)[(num) - la->fireWallBaseNum])
+
+static void
+InitPunchFW(struct libalias *la)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+ la->fireWallField = malloc(la->fireWallNumNums);
+ if (la->fireWallField) {
+ memset(la->fireWallField, 0, la->fireWallNumNums);
+ if (la->fireWallFD < 0) {
+ la->fireWallFD = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+ }
+ ClearAllFWHoles(la);
+ la->fireWallActiveNum = la->fireWallBaseNum;
+ }
+}
+
+static void
+UninitPunchFW(struct libalias *la)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+ ClearAllFWHoles(la);
+ if (la->fireWallFD >= 0)
+ close(la->fireWallFD);
+ la->fireWallFD = -1;
+ if (la->fireWallField)
+ free(la->fireWallField);
+ la->fireWallField = NULL;
+ la->packetAliasMode &= ~PKT_ALIAS_PUNCH_FW;
+}
+
+/* Make a certain link go through the firewall */
+void
+PunchFWHole(struct alias_link *lnk)
+{
+ struct libalias *la;
+ int r; /* Result code */
+ struct ip_fw rule; /* On-the-fly built rule */
+ int fwhole; /* Where to punch hole */
+
+ la = lnk->la;
+ LIBALIAS_LOCK_ASSERT(la);
+
+/* Don't do anything unless we are asked to */
+ if (!(la->packetAliasMode & PKT_ALIAS_PUNCH_FW) ||
+ la->fireWallFD < 0 ||
+ lnk->link_type != LINK_TCP)
+ return;
+
+ memset(&rule, 0, sizeof rule);
+
+/** Build rule **/
+
+ /* Find empty slot */
+ for (fwhole = la->fireWallActiveNum;
+ fwhole < la->fireWallBaseNum + la->fireWallNumNums &&
+ fw_tstfield(la, la->fireWallField, fwhole);
+ fwhole++);
+ if (fwhole == la->fireWallBaseNum + la->fireWallNumNums) {
+ for (fwhole = la->fireWallBaseNum;
+ fwhole < la->fireWallActiveNum &&
+ fw_tstfield(la, la->fireWallField, fwhole);
+ fwhole++);
+ if (fwhole == la->fireWallActiveNum) {
+ /* No rule point empty - we can't punch more holes. */
+ la->fireWallActiveNum = la->fireWallBaseNum;
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr, "libalias: Unable to create firewall hole!\n");
+#endif
+ return;
+ }
+ }
+ /* Start next search at next position */
+ la->fireWallActiveNum = fwhole + 1;
+
+ /*
+ * generate two rules of the form
+ *
+ * add fwhole accept tcp from OAddr OPort to DAddr DPort
+ * add fwhole accept tcp from DAddr DPort to OAddr OPort
+ */
+ if (GetOriginalPort(lnk) != 0 && GetDestPort(lnk) != 0) {
+ u_int32_t rulebuf[255];
+ int i;
+
+ i = fill_rule(rulebuf, sizeof(rulebuf), fwhole,
+ O_ACCEPT, IPPROTO_TCP,
+ GetOriginalAddress(lnk), ntohs(GetOriginalPort(lnk)),
+ GetDestAddress(lnk), ntohs(GetDestPort(lnk)));
+ r = setsockopt(la->fireWallFD, IPPROTO_IP, IP_FW_ADD, rulebuf, i);
+ if (r)
+ err(1, "alias punch inbound(1) setsockopt(IP_FW_ADD)");
+
+ i = fill_rule(rulebuf, sizeof(rulebuf), fwhole,
+ O_ACCEPT, IPPROTO_TCP,
+ GetDestAddress(lnk), ntohs(GetDestPort(lnk)),
+ GetOriginalAddress(lnk), ntohs(GetOriginalPort(lnk)));
+ r = setsockopt(la->fireWallFD, IPPROTO_IP, IP_FW_ADD, rulebuf, i);
+ if (r)
+ err(1, "alias punch inbound(2) setsockopt(IP_FW_ADD)");
+ }
+
+/* Indicate hole applied */
+ lnk->data.tcp->fwhole = fwhole;
+ fw_setfield(la, la->fireWallField, fwhole);
+}
+
+/* Remove a hole in a firewall associated with a particular alias
+ lnk. Calling this too often is harmless. */
+static void
+ClearFWHole(struct alias_link *lnk)
+{
+ struct libalias *la;
+
+ la = lnk->la;
+ LIBALIAS_LOCK_ASSERT(la);
+ if (lnk->link_type == LINK_TCP) {
+ int fwhole = lnk->data.tcp->fwhole; /* Where is the firewall
+ * hole? */
+ struct ip_fw rule;
+
+ if (fwhole < 0)
+ return;
+
+ memset(&rule, 0, sizeof rule); /* useless for ipfw2 */
+ while (!setsockopt(la->fireWallFD, IPPROTO_IP, IP_FW_DEL,
+ &fwhole, sizeof fwhole));
+ fw_clrfield(la, la->fireWallField, fwhole);
+ lnk->data.tcp->fwhole = -1;
+ }
+}
+
+/* Clear out the entire range dedicated to firewall holes. */
+static void
+ClearAllFWHoles(struct libalias *la)
+{
+ struct ip_fw rule; /* On-the-fly built rule */
+ int i;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ if (la->fireWallFD < 0)
+ return;
+
+ memset(&rule, 0, sizeof rule);
+ for (i = la->fireWallBaseNum; i < la->fireWallBaseNum + la->fireWallNumNums; i++) {
+ int r = i;
+
+ while (!setsockopt(la->fireWallFD, IPPROTO_IP, IP_FW_DEL, &r, sizeof r));
+ }
+ /* XXX: third arg correct here ? /phk */
+ memset(la->fireWallField, 0, la->fireWallNumNums);
+}
+
+#endif
+
+void
+LibAliasSetFWBase(struct libalias *la, unsigned int base, unsigned int num)
+{
+
+ LIBALIAS_LOCK(la);
+#ifndef NO_FW_PUNCH
+ la->fireWallBaseNum = base;
+ la->fireWallNumNums = num;
+#endif
+ LIBALIAS_UNLOCK(la);
+}
+
+void
+LibAliasSetSkinnyPort(struct libalias *la, unsigned int port)
+{
+
+ LIBALIAS_LOCK(la);
+ la->skinnyPort = port;
+ LIBALIAS_UNLOCK(la);
+}
+
+/*
+ * Find the address to redirect incoming packets
+ */
+struct in_addr
+FindSctpRedirectAddress(struct libalias *la, struct sctp_nat_msg *sm)
+{
+ struct alias_link *lnk;
+ struct in_addr redir;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ lnk = FindLinkIn(la, sm->ip_hdr->ip_src, sm->ip_hdr->ip_dst,
+ sm->sctp_hdr->dest_port,sm->sctp_hdr->dest_port, LINK_SCTP, 1);
+ if (lnk != NULL) {
+ return(lnk->src_addr); /* port redirect */
+ } else {
+ redir = FindOriginalAddress(la,sm->ip_hdr->ip_dst);
+ if (redir.s_addr == la->aliasAddress.s_addr ||
+ redir.s_addr == la->targetAddress.s_addr) { /* No address found */
+ lnk = FindLinkIn(la, sm->ip_hdr->ip_src, sm->ip_hdr->ip_dst,
+ NO_DEST_PORT, 0, LINK_SCTP, 1);
+ if (lnk != NULL)
+ return(lnk->src_addr); /* redirect proto */
+ }
+ return(redir); /* address redirect */
+ }
+}
diff --git a/freebsd/sys/netinet/libalias/alias_dummy.c b/freebsd/sys/netinet/libalias/alias_dummy.c
new file mode 100644
index 00000000..c5a316d4
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_dummy.c
@@ -0,0 +1,155 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2005 Paolo Pisati <piso@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Alias_dummy is just an empty skeleton used to demonstrate how to write
+ * a module for libalias that will run unaltered in userland or in
+ * kernel land.
+ */
+
+#ifdef _KERNEL
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/module.h>
+#else
+#include <freebsd/errno.h>
+#include <freebsd/sys/types.h>
+#include <freebsd/stdio.h>
+#endif
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/udp.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/libalias/alias_mod.h>
+#else
+#include <freebsd/local/alias_local.h>
+#include <freebsd/local/alias_mod.h>
+#endif
+
+static void
+AliasHandleDummy(struct libalias *la, struct ip *ip, struct alias_data *ah);
+
+static int
+fingerprint(struct libalias *la, struct alias_data *ah)
+{
+
+ /*
+ * Check here all the data that will be used later; if any field
+ * is empty/NULL, return -1.
+ */
+ if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
+ ah->maxpktsize == 0)
+ return (-1);
+ /*
+ * Fingerprint the incoming packet, if it matches any conditions
+ * return an OK value.
+ */
+ if (ntohs(*ah->dport) == 123
+ || ntohs(*ah->sport) == 456)
+ return (0); /* I know how to handle it. */
+ return (-1); /* I don't recognize this packet. */
+}
+
+/*
+ * Wrap in this general purpose function, the real function used to alias the
+ * packets.
+ */
+
+static int
+protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ AliasHandleDummy(la, pip, ah);
+ return (0);
+}
+
+/*
+ * NOTA BENE: the next variable MUST NOT be renamed in any case if you want
+ * your module to work in userland, because it is used to find and use all
+ * the protocol handlers present in every module.
+ * So WATCH OUT: your module needs this variable, and it needs it with
+ * ITS EXACT NAME: handlers.
+ */
+
+struct proto_handler handlers [] = {
+ {
+ .pri = 666,
+ .dir = IN|OUT,
+ .proto = UDP|TCP,
+ .fingerprint = &fingerprint,
+ .protohandler = &protohandler
+ },
+ { EOH }
+};
+
+static int
+mod_handler(module_t mod, int type, void *data)
+{
+ int error;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = 0;
+ LibAliasAttachHandlers(handlers);
+ break;
+ case MOD_UNLOAD:
+ error = 0;
+ LibAliasDetachHandlers(handlers);
+ break;
+ default:
+ error = EINVAL;
+ }
+ return (error);
+}
+
+#ifdef _KERNEL
+static
+#endif
+moduledata_t alias_mod = {
+ "alias_dummy", mod_handler, NULL
+};
+
+#ifdef _KERNEL
+DECLARE_MODULE(alias_dummy, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
+MODULE_VERSION(alias_dummy, 1);
+MODULE_DEPEND(alias_dummy, libalias, 1, 1, 1);
+#endif
+
+static void
+AliasHandleDummy(struct libalias *la, struct ip *ip, struct alias_data *ah)
+{
+ ; /* Dummy. */
+}
+
diff --git a/freebsd/sys/netinet/libalias/alias_ftp.c b/freebsd/sys/netinet/libalias/alias_ftp.c
new file mode 100644
index 00000000..4e8b7177
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_ftp.c
@@ -0,0 +1,696 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ Alias_ftp.c performs special processing for FTP sessions under
+ TCP. Specifically, when a PORT/EPRT command from the client
+ side or 227/229 reply from the server is sent, it is intercepted
+ and modified. The address is changed to the gateway machine
+ and an aliasing port is used.
+
+ For this routine to work, the message must fit entirely into a
+ single TCP packet. This is typically the case, but exceptions
+ can easily be envisioned under the actual specifications.
+
+ Probably the most troubling aspect of the approach taken here is
+ that the new message will typically be a different length, and
+ this causes a certain amount of bookkeeping to keep track of the
+ changes of sequence and acknowledgment numbers, since the client
+ machine is totally unaware of the modification to the TCP stream.
+
+
+ References: RFC 959, RFC 2428.
+
+ Initial version: August, 1996 (cjm)
+
+ Version 1.6
+ Brian Somers and Martin Renters identified an IP checksum
+ error for modified IP packets.
+
+ Version 1.7: January 9, 1996 (cjm)
+ Differential checksum computation for change
+ in IP packet length.
+
+ Version 2.1: May, 1997 (cjm)
+ Very minor changes to conform with
+ local/global/function naming conventions
+ within the packet aliasing module.
+
+ Version 3.1: May, 2000 (eds)
+ Add support for passive mode, alias the 227 replies.
+
+ See HISTORY file for record of revisions.
+*/
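+
+/*
+ * Illustration (hypothetical addresses and ports): a client behind the
+ * aliasing host that sends "PORT 10,0,0,5,4,1" (host 10.0.0.5, port
+ * 4 * 256 + 1 = 1025) has the line rewritten to carry the aliasing
+ * address and an aliased data port, e.g. "PORT 192,0,2,1,234,17"
+ * (192.0.2.1, port 234 * 256 + 17 = 59921). Any change in line length
+ * is recorded with AddSeq() so later sequence/ACK numbers can be fixed up.
+ */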
+
+/* Includes */
+#ifdef _KERNEL
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/ctype.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/module.h>
+#else
+#include <freebsd/ctype.h>
+#include <freebsd/errno.h>
+#include <freebsd/sys/types.h>
+#include <freebsd/stdio.h>
+#include <freebsd/string.h>
+#endif
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/tcp.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias.h>
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/libalias/alias_mod.h>
+#else
+#include <freebsd/local/alias_local.h>
+#include <freebsd/local/alias_mod.h>
+#endif
+
+#define FTP_CONTROL_PORT_NUMBER 21
+
+static void
+AliasHandleFtpOut(struct libalias *, struct ip *, struct alias_link *,
+ int maxpacketsize);
+
+static int
+fingerprint(struct libalias *la, struct alias_data *ah)
+{
+
+ if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
+ ah->maxpktsize == 0)
+ return (-1);
+ if (ntohs(*ah->dport) == FTP_CONTROL_PORT_NUMBER
+ || ntohs(*ah->sport) == FTP_CONTROL_PORT_NUMBER)
+ return (0);
+ return (-1);
+}
+
+static int
+protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ AliasHandleFtpOut(la, pip, ah->lnk, ah->maxpktsize);
+ return (0);
+}
+
+struct proto_handler handlers[] = {
+ {
+ .pri = 80,
+ .dir = OUT,
+ .proto = TCP,
+ .fingerprint = &fingerprint,
+ .protohandler = &protohandler
+ },
+ { EOH }
+};
+
+static int
+mod_handler(module_t mod, int type, void *data)
+{
+ int error;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = 0;
+ LibAliasAttachHandlers(handlers);
+ break;
+ case MOD_UNLOAD:
+ error = 0;
+ LibAliasDetachHandlers(handlers);
+ break;
+ default:
+ error = EINVAL;
+ }
+ return (error);
+}
+
+#ifdef _KERNEL
+static
+#endif
+moduledata_t alias_mod = {
+ "alias_ftp", mod_handler, NULL
+};
+
+#ifdef _KERNEL
+DECLARE_MODULE(alias_ftp, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
+MODULE_VERSION(alias_ftp, 1);
+MODULE_DEPEND(alias_ftp, libalias, 1, 1, 1);
+#endif
+
+#define FTP_CONTROL_PORT_NUMBER 21
+#define MAX_MESSAGE_SIZE 128
+
+/* FTP protocol flags. */
+#define WAIT_CRLF 0x01
+
+enum ftp_message_type {
+ FTP_PORT_COMMAND,
+ FTP_EPRT_COMMAND,
+ FTP_227_REPLY,
+ FTP_229_REPLY,
+ FTP_UNKNOWN_MESSAGE
+};
+
+static int ParseFtpPortCommand(struct libalias *la, char *, int);
+static int ParseFtpEprtCommand(struct libalias *la, char *, int);
+static int ParseFtp227Reply(struct libalias *la, char *, int);
+static int ParseFtp229Reply(struct libalias *la, char *, int);
+static void NewFtpMessage(struct libalias *la, struct ip *, struct alias_link *, int, int);
+
+static void
+AliasHandleFtpOut(
+ struct libalias *la,
+ struct ip *pip, /* IP packet to examine/patch */
+ struct alias_link *lnk, /* The link to go through (aliased port) */
+ int maxpacketsize /* The maximum size this packet can grow to
+ (including headers) */ )
+{
+ int hlen, tlen, dlen, pflags;
+ char *sptr;
+ struct tcphdr *tc;
+ int ftp_message_type;
+
+/* Calculate data length of TCP packet */
+ tc = (struct tcphdr *)ip_next(pip);
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+/* Place string pointer and beginning of data */
+ sptr = (char *)pip;
+ sptr += hlen;
+
+/*
+ * Check that data length is not too long and previous message was
+ * properly terminated with CRLF.
+ */
+ pflags = GetProtocolFlags(lnk);
+ if (dlen <= MAX_MESSAGE_SIZE && !(pflags & WAIT_CRLF)) {
+ ftp_message_type = FTP_UNKNOWN_MESSAGE;
+
+ if (ntohs(tc->th_dport) == FTP_CONTROL_PORT_NUMBER) {
+/*
+ * When aliasing a client, check for the PORT/EPRT command.
+ */
+ if (ParseFtpPortCommand(la, sptr, dlen))
+ ftp_message_type = FTP_PORT_COMMAND;
+ else if (ParseFtpEprtCommand(la, sptr, dlen))
+ ftp_message_type = FTP_EPRT_COMMAND;
+ } else {
+/*
+ * When aliasing a server, check for the 227/229 reply.
+ */
+ if (ParseFtp227Reply(la, sptr, dlen))
+ ftp_message_type = FTP_227_REPLY;
+ else if (ParseFtp229Reply(la, sptr, dlen)) {
+ ftp_message_type = FTP_229_REPLY;
+ la->true_addr.s_addr = pip->ip_src.s_addr;
+ }
+ }
+
+ if (ftp_message_type != FTP_UNKNOWN_MESSAGE)
+ NewFtpMessage(la, pip, lnk, maxpacketsize, ftp_message_type);
+ }
+/* Track the msgs which are CRLF term'd for PORT/PASV FW breach */
+
+ if (dlen) { /* only if there's data */
+ sptr = (char *)pip; /* start over at beginning */
+ tlen = ntohs(pip->ip_len); /* recalc tlen, pkt may
+ * have grown */
+ if (sptr[tlen - 2] == '\r' && sptr[tlen - 1] == '\n')
+ pflags &= ~WAIT_CRLF;
+ else
+ pflags |= WAIT_CRLF;
+ SetProtocolFlags(lnk, pflags);
+ }
+}
+
+static int
+ParseFtpPortCommand(struct libalias *la, char *sptr, int dlen)
+{
+ char ch;
+ int i, state;
+ u_int32_t addr;
+ u_short port;
+ u_int8_t octet;
+
+ /* Format: "PORT A,D,D,R,PO,RT". */
+
+ /* Return if data length is too short. */
+ if (dlen < 18)
+ return (0);
+
+ if (strncasecmp("PORT ", sptr, 5))
+ return (0);
+
+ addr = port = octet = 0;
+ state = 0;
+ for (i = 5; i < dlen; i++) {
+ ch = sptr[i];
+ switch (state) {
+ case 0:
+ if (isspace(ch))
+ break;
+ else
+ state++;
+ case 1:
+ case 3:
+ case 5:
+ case 7:
+ case 9:
+ case 11:
+ if (isdigit(ch)) {
+ octet = ch - '0';
+ state++;
+ } else
+ return (0);
+ break;
+ case 2:
+ case 4:
+ case 6:
+ case 8:
+ if (isdigit(ch))
+ octet = 10 * octet + ch - '0';
+ else if (ch == ',') {
+ addr = (addr << 8) + octet;
+ state++;
+ } else
+ return (0);
+ break;
+ case 10:
+ case 12:
+ if (isdigit(ch))
+ octet = 10 * octet + ch - '0';
+ else if (ch == ',' || state == 12) {
+ port = (port << 8) + octet;
+ state++;
+ } else
+ return (0);
+ break;
+ }
+ }
+
+ if (state == 13) {
+ la->true_addr.s_addr = htonl(addr);
+ la->true_port = port;
+ return (1);
+ } else
+ return (0);
+}
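+
+/*
+ * Worked example (hypothetical command): for "PORT 192,168,1,9,7,138" the
+ * state machine above accumulates addr = 0xc0a80109 (192.168.1.9) from the
+ * first four comma-separated octets and port = 7 * 256 + 138 = 1930 from
+ * the last two; once the trailing CR/LF is seen it finishes in state 13
+ * with true_addr and true_port filled in.
+ */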
+
+static int
+ParseFtpEprtCommand(struct libalias *la, char *sptr, int dlen)
+{
+ char ch, delim;
+ int i, state;
+ u_int32_t addr;
+ u_short port;
+ u_int8_t octet;
+
+ /* Format: "EPRT |1|A.D.D.R|PORT|". */
+
+ /* Return if data length is too short. */
+ if (dlen < 18)
+ return (0);
+
+ if (strncasecmp("EPRT ", sptr, 5))
+ return (0);
+
+ addr = port = octet = 0;
+ delim = '|'; /* XXX gcc -Wuninitialized */
+ state = 0;
+ for (i = 5; i < dlen; i++) {
+ ch = sptr[i];
+ switch (state) {
+ case 0:
+ if (!isspace(ch)) {
+ delim = ch;
+ state++;
+ }
+ break;
+ case 1:
+ if (ch == '1') /* IPv4 address */
+ state++;
+ else
+ return (0);
+ break;
+ case 2:
+ if (ch == delim)
+ state++;
+ else
+ return (0);
+ break;
+ case 3:
+ case 5:
+ case 7:
+ case 9:
+ if (isdigit(ch)) {
+ octet = ch - '0';
+ state++;
+ } else
+ return (0);
+ break;
+ case 4:
+ case 6:
+ case 8:
+ case 10:
+ if (isdigit(ch))
+ octet = 10 * octet + ch - '0';
+ else if (ch == '.' || state == 10) {
+ addr = (addr << 8) + octet;
+ state++;
+ } else
+ return (0);
+ break;
+ case 11:
+ if (isdigit(ch)) {
+ port = ch - '0';
+ state++;
+ } else
+ return (0);
+ break;
+ case 12:
+ if (isdigit(ch))
+ port = 10 * port + ch - '0';
+ else if (ch == delim)
+ state++;
+ else
+ return (0);
+ break;
+ }
+ }
+
+ if (state == 13) {
+ la->true_addr.s_addr = htonl(addr);
+ la->true_port = port;
+ return (1);
+ } else
+ return (0);
+}
+
+static int
+ParseFtp227Reply(struct libalias *la, char *sptr, int dlen)
+{
+ char ch;
+ int i, state;
+ u_int32_t addr;
+ u_short port;
+ u_int8_t octet;
+
+ /* Format: "227 Entering Passive Mode (A,D,D,R,PO,RT)" */
+
+ /* Return if data length is too short. */
+ if (dlen < 17)
+ return (0);
+
+ if (strncmp("227 ", sptr, 4))
+ return (0);
+
+ addr = port = octet = 0;
+
+ state = 0;
+ for (i = 4; i < dlen; i++) {
+ ch = sptr[i];
+ switch (state) {
+ case 0:
+ if (ch == '(')
+ state++;
+ break;
+ case 1:
+ case 3:
+ case 5:
+ case 7:
+ case 9:
+ case 11:
+ if (isdigit(ch)) {
+ octet = ch - '0';
+ state++;
+ } else
+ return (0);
+ break;
+ case 2:
+ case 4:
+ case 6:
+ case 8:
+ if (isdigit(ch))
+ octet = 10 * octet + ch - '0';
+ else if (ch == ',') {
+ addr = (addr << 8) + octet;
+ state++;
+ } else
+ return (0);
+ break;
+ case 10:
+ case 12:
+ if (isdigit(ch))
+ octet = 10 * octet + ch - '0';
+ else if (ch == ',' || (state == 12 && ch == ')')) {
+ port = (port << 8) + octet;
+ state++;
+ } else
+ return (0);
+ break;
+ }
+ }
+
+ if (state == 13) {
+ la->true_port = port;
+ la->true_addr.s_addr = htonl(addr);
+ return (1);
+ } else
+ return (0);
+}
+
+static int
+ParseFtp229Reply(struct libalias *la, char *sptr, int dlen)
+{
+ char ch, delim;
+ int i, state;
+ u_short port;
+
+ /* Format: "229 Entering Extended Passive Mode (|||PORT|)" */
+
+ /* Return if data length is too short. */
+ if (dlen < 11)
+ return (0);
+
+ if (strncmp("229 ", sptr, 4))
+ return (0);
+
+ port = 0;
+ delim = '|'; /* XXX gcc -Wuninitialized */
+
+ state = 0;
+ for (i = 4; i < dlen; i++) {
+ ch = sptr[i];
+ switch (state) {
+ case 0:
+ if (ch == '(')
+ state++;
+ break;
+ case 1:
+ delim = ch;
+ state++;
+ break;
+ case 2:
+ case 3:
+ if (ch == delim)
+ state++;
+ else
+ return (0);
+ break;
+ case 4:
+ if (isdigit(ch)) {
+ port = ch - '0';
+ state++;
+ } else
+ return (0);
+ break;
+ case 5:
+ if (isdigit(ch))
+ port = 10 * port + ch - '0';
+ else if (ch == delim)
+ state++;
+ else
+ return (0);
+ break;
+ case 6:
+ if (ch == ')')
+ state++;
+ else
+ return (0);
+ break;
+ }
+ }
+
+ if (state == 7) {
+ la->true_port = port;
+ return (1);
+ } else
+ return (0);
+}
+
+static void
+NewFtpMessage(struct libalias *la, struct ip *pip,
+ struct alias_link *lnk,
+ int maxpacketsize,
+ int ftp_message_type)
+{
+ struct alias_link *ftp_lnk;
+
+/* Security checks. */
+ if (pip->ip_src.s_addr != la->true_addr.s_addr)
+ return;
+
+ if (la->true_port < IPPORT_RESERVED)
+ return;
+
+/* Establish link to address and port found in FTP control message. */
+ ftp_lnk = FindUdpTcpOut(la, la->true_addr, GetDestAddress(lnk),
+ htons(la->true_port), 0, IPPROTO_TCP, 1);
+
+ if (ftp_lnk != NULL) {
+ int slen, hlen, tlen, dlen;
+ struct tcphdr *tc;
+
+#ifndef NO_FW_PUNCH
+ /* Punch hole in firewall */
+ PunchFWHole(ftp_lnk);
+#endif
+
+/* Calculate data length of TCP packet */
+ tc = (struct tcphdr *)ip_next(pip);
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+/* Create new FTP message. */
+ {
+ char stemp[MAX_MESSAGE_SIZE + 1];
+ char *sptr;
+ u_short alias_port;
+ u_char *ptr;
+ int a1, a2, a3, a4, p1, p2;
+ struct in_addr alias_address;
+
+/* Decompose alias address into quad format */
+ alias_address = GetAliasAddress(lnk);
+ ptr = (u_char *) & alias_address.s_addr;
+ a1 = *ptr++;
+ a2 = *ptr++;
+ a3 = *ptr++;
+ a4 = *ptr;
+
+ alias_port = GetAliasPort(ftp_lnk);
+
+/* Prepare new command */
+ switch (ftp_message_type) {
+ case FTP_PORT_COMMAND:
+ case FTP_227_REPLY:
+ /* Decompose alias port into pair format. */
+ ptr = (char *)&alias_port;
+ p1 = *ptr++;
+ p2 = *ptr;
+
+ if (ftp_message_type == FTP_PORT_COMMAND) {
+ /* Generate PORT command string. */
+ sprintf(stemp, "PORT %d,%d,%d,%d,%d,%d\r\n",
+ a1, a2, a3, a4, p1, p2);
+ } else {
+ /* Generate 227 reply string. */
+ sprintf(stemp,
+ "227 Entering Passive Mode (%d,%d,%d,%d,%d,%d)\r\n",
+ a1, a2, a3, a4, p1, p2);
+ }
+ break;
+ case FTP_EPRT_COMMAND:
+ /* Generate EPRT command string. */
+ sprintf(stemp, "EPRT |1|%d.%d.%d.%d|%d|\r\n",
+ a1, a2, a3, a4, ntohs(alias_port));
+ break;
+ case FTP_229_REPLY:
+ /* Generate 229 reply string. */
+ sprintf(stemp, "229 Entering Extended Passive Mode (|||%d|)\r\n",
+ ntohs(alias_port));
+ break;
+ }
+
+/* Save string length for IP header modification */
+ slen = strlen(stemp);
+
+/* Copy modified buffer into IP packet. */
+ sptr = (char *)pip;
+ sptr += hlen;
+ strncpy(sptr, stemp, maxpacketsize - hlen);
+ }
+
+/* Save information regarding modified seq and ack numbers */
+ {
+ int delta;
+
+ SetAckModified(lnk);
+ tc = (struct tcphdr *)ip_next(pip);
+ delta = GetDeltaSeqOut(tc->th_seq, lnk);
+ AddSeq(lnk, delta + slen - dlen, pip->ip_hl,
+ pip->ip_len, tc->th_seq, tc->th_off);
+ }
+
+/* Revise IP header */
+ {
+ u_short new_len;
+
+ new_len = htons(hlen + slen);
+ DifferentialChecksum(&pip->ip_sum,
+ &new_len,
+ &pip->ip_len,
+ 1);
+ pip->ip_len = new_len;
+ }
+
+/* Compute TCP checksum for revised packet */
+ tc->th_sum = 0;
+#ifdef _KERNEL
+ tc->th_x2 = 1;
+#else
+ tc->th_sum = TcpChecksum(pip);
+#endif
+ } else {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/HandleFtpOut: Cannot allocate FTP data port\n");
+#endif
+ }
+}
diff --git a/freebsd/sys/netinet/libalias/alias_irc.c b/freebsd/sys/netinet/libalias/alias_irc.c
new file mode 100644
index 00000000..05db0f4f
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_irc.c
@@ -0,0 +1,490 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/* Alias_irc.c intercepts packets containing IRC CTCP commands, and
+ changes DCC commands to export a port on the aliasing host instead
+ of an aliased host.
+
+ For this routine to work, the DCC command must fit entirely into a
+ single TCP packet. This will usually happen, but is not
+ guaranteed.
+
+ The interception is likely to change the length of the packet.
+ The handling of this is copied more-or-less verbatim from
+ ftp_alias.c
+
+ Initial version: Eivind Eklund <perhaps@yes.no> (ee) 97-01-29
+
+ Version 2.1: May, 1997 (cjm)
+ Very minor changes to conform with
+ local/global/function naming conventions
+ within the packet aliasing module.
+*/
+
+/* Includes */
+#ifdef _KERNEL
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/ctype.h>
+#include <freebsd/sys/limits.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/module.h>
+#else
+#include <freebsd/ctype.h>
+#include <freebsd/errno.h>
+#include <freebsd/sys/types.h>
+#include <freebsd/stdio.h>
+#include <freebsd/stdlib.h>
+#include <freebsd/string.h>
+#include <freebsd/limits.h>
+#endif
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/tcp.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias.h>
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/libalias/alias_mod.h>
+#else
+#include <freebsd/local/alias_local.h>
+#include <freebsd/local/alias_mod.h>
+#endif
+
+#define IRC_CONTROL_PORT_NUMBER_1 6667
+#define IRC_CONTROL_PORT_NUMBER_2 6668
+
+#define PKTSIZE (IP_MAXPACKET + 1)
+char *newpacket;
+
+/* Local defines */
+#define DBprintf(a)
+
+static void
+AliasHandleIrcOut(struct libalias *, struct ip *, struct alias_link *,
+ int maxpacketsize);
+
+static int
+fingerprint(struct libalias *la, struct alias_data *ah)
+{
+
+ if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
+ ah->maxpktsize == 0)
+ return (-1);
+ if (ntohs(*ah->dport) == IRC_CONTROL_PORT_NUMBER_1
+ || ntohs(*ah->dport) == IRC_CONTROL_PORT_NUMBER_2)
+ return (0);
+ return (-1);
+}
+
+static int
+protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ newpacket = malloc(PKTSIZE);
+ if (newpacket) {
+ AliasHandleIrcOut(la, pip, ah->lnk, ah->maxpktsize);
+ free(newpacket);
+ }
+ return (0);
+}
+
+struct proto_handler handlers[] = {
+ {
+ .pri = 90,
+ .dir = OUT,
+ .proto = TCP,
+ .fingerprint = &fingerprint,
+ .protohandler = &protohandler
+ },
+ { EOH }
+};
+
+static int
+mod_handler(module_t mod, int type, void *data)
+{
+ int error;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = 0;
+ LibAliasAttachHandlers(handlers);
+ break;
+ case MOD_UNLOAD:
+ error = 0;
+ LibAliasDetachHandlers(handlers);
+ break;
+ default:
+ error = EINVAL;
+ }
+ return (error);
+}
+
+#ifdef _KERNEL
+static
+#endif
+moduledata_t alias_mod = {
+ "alias_irc", mod_handler, NULL
+};
+
+/* Kernel module definition. */
+#ifdef _KERNEL
+DECLARE_MODULE(alias_irc, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
+MODULE_VERSION(alias_irc, 1);
+MODULE_DEPEND(alias_irc, libalias, 1, 1, 1);
+#endif
+
+static void
+AliasHandleIrcOut(struct libalias *la,
+ struct ip *pip, /* IP packet to examine */
+ struct alias_link *lnk, /* Which link are we on? */
+ int maxsize /* Maximum size of IP packet including
+ * headers */
+)
+{
+ int hlen, tlen, dlen;
+ struct in_addr true_addr;
+ u_short true_port;
+ char *sptr;
+ struct tcphdr *tc;
+ int i; /* Iterator through the source */
+
+/* Calculate data length of TCP packet */
+ tc = (struct tcphdr *)ip_next(pip);
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+ /*
+ * Return if data length is too short - assume an entire PRIVMSG in
+ * each packet.
+ */
+ if (dlen < (int)sizeof(":A!a@n.n PRIVMSG A :aDCC 1 1a") - 1)
+ return;
+
+/* Place string pointer at beginning of data */
+ sptr = (char *)pip;
+ sptr += hlen;
+ maxsize -= hlen; /* We're interested in maximum size of
+ * data, not packet */
+
+ /* Search for a CTCP command [Note 1] */
+ for (i = 0; i < dlen; i++) {
+ if (sptr[i] == '\001')
+ goto lFOUND_CTCP;
+ }
+	return;			/* No CTCP commands in packet */
+ /* Handle CTCP commands - the buffer may have to be copied */
+lFOUND_CTCP:
+ {
+ unsigned int copyat = i;
+ unsigned int iCopy = 0; /* How much data have we written to
+ * copy-back string? */
+ unsigned long org_addr; /* Original IP address */
+ unsigned short org_port; /* Original source port
+ * address */
+
+lCTCP_START:
+ if (i >= dlen || iCopy >= PKTSIZE)
+ goto lPACKET_DONE;
+ newpacket[iCopy++] = sptr[i++]; /* Copy the CTCP start
+ * character */
+ /* Start of a CTCP */
+ if (i + 4 >= dlen) /* Too short for DCC */
+ goto lBAD_CTCP;
+ if (sptr[i + 0] != 'D')
+ goto lBAD_CTCP;
+ if (sptr[i + 1] != 'C')
+ goto lBAD_CTCP;
+ if (sptr[i + 2] != 'C')
+ goto lBAD_CTCP;
+ if (sptr[i + 3] != ' ')
+ goto lBAD_CTCP;
+ /* We have a DCC command - handle it! */
+ i += 4; /* Skip "DCC " */
+ if (iCopy + 4 > PKTSIZE)
+ goto lPACKET_DONE;
+ newpacket[iCopy++] = 'D';
+ newpacket[iCopy++] = 'C';
+ newpacket[iCopy++] = 'C';
+ newpacket[iCopy++] = ' ';
+
+ DBprintf(("Found DCC\n"));
+ /*
+ * Skip any extra spaces (should not occur according to
+		 * protocol, but DCC breaks the CTCP protocol anyway)
+ */
+ while (sptr[i] == ' ') {
+ if (++i >= dlen) {
+ DBprintf(("DCC packet terminated in just spaces\n"));
+ goto lPACKET_DONE;
+ }
+ }
+
+ DBprintf(("Transferring command...\n"));
+ while (sptr[i] != ' ') {
+ newpacket[iCopy++] = sptr[i];
+ if (++i >= dlen || iCopy >= PKTSIZE) {
+ DBprintf(("DCC packet terminated during command\n"));
+ goto lPACKET_DONE;
+ }
+ }
+ /* Copy _one_ space */
+ if (i + 1 < dlen && iCopy < PKTSIZE)
+ newpacket[iCopy++] = sptr[i++];
+
+ DBprintf(("Done command - removing spaces\n"));
+ /*
+ * Skip any extra spaces (should not occur according to
+		 * protocol, but DCC breaks the CTCP protocol anyway)
+ */
+ while (sptr[i] == ' ') {
+ if (++i >= dlen) {
+ DBprintf(("DCC packet terminated in just spaces (post-command)\n"));
+ goto lPACKET_DONE;
+ }
+ }
+
+ DBprintf(("Transferring filename...\n"));
+ while (sptr[i] != ' ') {
+ newpacket[iCopy++] = sptr[i];
+ if (++i >= dlen || iCopy >= PKTSIZE) {
+ DBprintf(("DCC packet terminated during filename\n"));
+ goto lPACKET_DONE;
+ }
+ }
+ /* Copy _one_ space */
+ if (i + 1 < dlen && iCopy < PKTSIZE)
+ newpacket[iCopy++] = sptr[i++];
+
+ DBprintf(("Done filename - removing spaces\n"));
+ /*
+ * Skip any extra spaces (should not occur according to
+		 * protocol, but DCC breaks the CTCP protocol anyway)
+ */
+ while (sptr[i] == ' ') {
+ if (++i >= dlen) {
+ DBprintf(("DCC packet terminated in just spaces (post-filename)\n"));
+ goto lPACKET_DONE;
+ }
+ }
+
+ DBprintf(("Fetching IP address\n"));
+ /* Fetch IP address */
+ org_addr = 0;
+ while (i < dlen && isdigit(sptr[i])) {
+ if (org_addr > ULONG_MAX / 10UL) { /* Terminate on overflow */
+ DBprintf(("DCC Address overflow (org_addr == 0x%08lx, next char %c\n", org_addr, sptr[i]));
+ goto lBAD_CTCP;
+ }
+ org_addr *= 10;
+ org_addr += sptr[i++] - '0';
+ }
+ DBprintf(("Skipping space\n"));
+ if (i + 1 >= dlen || sptr[i] != ' ') {
+ DBprintf(("Overflow (%d >= %d) or bad character (%02x) terminating IP address\n", i + 1, dlen, sptr[i]));
+ goto lBAD_CTCP;
+ }
+ /*
+ * Skip any extra spaces (should not occur according to
+ * protocol, but DCC breaks CTCP protocol anyway, so we
+		 * might as well play it safe)
+ */
+ while (sptr[i] == ' ') {
+ if (++i >= dlen) {
+ DBprintf(("Packet failure - space overflow.\n"));
+ goto lPACKET_DONE;
+ }
+ }
+ DBprintf(("Fetching port number\n"));
+ /* Fetch source port */
+ org_port = 0;
+ while (i < dlen && isdigit(sptr[i])) {
+ if (org_port > 6554) { /* Terminate on overflow
+							 * (65536/10 rounded up) */
+ DBprintf(("DCC: port number overflow\n"));
+ goto lBAD_CTCP;
+ }
+ org_port *= 10;
+ org_port += sptr[i++] - '0';
+ }
+ /* Skip illegal addresses (or early termination) */
+ if (i >= dlen || (sptr[i] != '\001' && sptr[i] != ' ')) {
+ DBprintf(("Bad port termination\n"));
+ goto lBAD_CTCP;
+ }
+ DBprintf(("Got IP %lu and port %u\n", org_addr, (unsigned)org_port));
+
+ /* We've got the address and port - now alias it */
+ {
+ struct alias_link *dcc_lnk;
+ struct in_addr destaddr;
+
+
+ true_port = htons(org_port);
+ true_addr.s_addr = htonl(org_addr);
+ destaddr.s_addr = 0;
+
+ /* Sanity/Security checking */
+ if (!org_addr || !org_port ||
+ pip->ip_src.s_addr != true_addr.s_addr ||
+ org_port < IPPORT_RESERVED)
+ goto lBAD_CTCP;
+
+ /*
+ * Steal the FTP_DATA_PORT - it doesn't really
+ * matter, and this would probably allow it through
+ * at least _some_ firewalls.
+ */
+ dcc_lnk = FindUdpTcpOut(la, true_addr, destaddr,
+ true_port, 0,
+ IPPROTO_TCP, 1);
+ DBprintf(("Got a DCC link\n"));
+ if (dcc_lnk) {
+ struct in_addr alias_address; /* Address from aliasing */
+ u_short alias_port; /* Port given by
+ * aliasing */
+ int n;
+
+#ifndef NO_FW_PUNCH
+ /* Generate firewall hole as appropriate */
+ PunchFWHole(dcc_lnk);
+#endif
+
+ alias_address = GetAliasAddress(lnk);
+ n = snprintf(&newpacket[iCopy],
+ PKTSIZE - iCopy,
+ "%lu ", (u_long) htonl(alias_address.s_addr));
+ if (n < 0) {
+ DBprintf(("DCC packet construct failure.\n"));
+ goto lBAD_CTCP;
+ }
+ if ((iCopy += n) >= PKTSIZE) { /* Truncated/fit exactly
+ * - bad news */
+ DBprintf(("DCC constructed packet overflow.\n"));
+ goto lBAD_CTCP;
+ }
+ alias_port = GetAliasPort(dcc_lnk);
+ n = snprintf(&newpacket[iCopy],
+ PKTSIZE - iCopy,
+ "%u", htons(alias_port));
+ if (n < 0) {
+ DBprintf(("DCC packet construct failure.\n"));
+ goto lBAD_CTCP;
+ }
+ iCopy += n;
+ /*
+ * Done - truncated cases will be taken
+ * care of by lBAD_CTCP
+ */
+ DBprintf(("Aliased IP %lu and port %u\n", alias_address.s_addr, (unsigned)alias_port));
+ }
+ }
+ /*
+ * An uninteresting CTCP - state entered right after '\001'
+ * has been pushed. Also used to copy the rest of a DCC,
+ * after IP address and port has been handled
+ */
+lBAD_CTCP:
+ for (; i < dlen && iCopy < PKTSIZE; i++, iCopy++) {
+ newpacket[iCopy] = sptr[i]; /* Copy CTCP unchanged */
+ if (sptr[i] == '\001') {
+ goto lNORMAL_TEXT;
+ }
+ }
+ goto lPACKET_DONE;
+ /* Normal text */
+lNORMAL_TEXT:
+ for (; i < dlen && iCopy < PKTSIZE; i++, iCopy++) {
+ newpacket[iCopy] = sptr[i]; /* Copy CTCP unchanged */
+ if (sptr[i] == '\001') {
+ goto lCTCP_START;
+ }
+ }
+ /* Handle the end of a packet */
+lPACKET_DONE:
+ iCopy = iCopy > maxsize - copyat ? maxsize - copyat : iCopy;
+ memcpy(sptr + copyat, newpacket, iCopy);
+
+/* Save information regarding modified seq and ack numbers */
+ {
+ int delta;
+
+ SetAckModified(lnk);
+ tc = (struct tcphdr *)ip_next(pip);
+ delta = GetDeltaSeqOut(tc->th_seq, lnk);
+ AddSeq(lnk, delta + copyat + iCopy - dlen, pip->ip_hl,
+ pip->ip_len, tc->th_seq, tc->th_off);
+ }
+
+ /* Revise IP header */
+ {
+ u_short new_len;
+
+ new_len = htons(hlen + iCopy + copyat);
+ DifferentialChecksum(&pip->ip_sum,
+ &new_len,
+ &pip->ip_len,
+ 1);
+ pip->ip_len = new_len;
+ }
+
+ /* Compute TCP checksum for revised packet */
+ tc->th_sum = 0;
+#ifdef _KERNEL
+ tc->th_x2 = 1;
+#else
+ tc->th_sum = TcpChecksum(pip);
+#endif
+ return;
+ }
+}
+
+/* Notes:
+ [Note 1]
+ The initial search will most often fail; it could be replaced with a 32-bit specific search.
+ Such a search would be done for 32-bit unsigned value V:
+ V ^= 0x01010101; (Search is for null bytes)
+ if( ((V-0x01010101)^V) & 0x80808080 ) {
+	(found a null byte, which was a 0x01 byte before the XOR)
+ }
+ To assert that the processor is 32-bits, do
+ extern int ircdccar[32]; (32 bits)
+ extern int ircdccar[CHAR_BIT*sizeof(unsigned int)];
+ which will generate a type-error on all but 32-bit machines.
+
+ [Note 2] This routine really ought to be replaced with one that
+ creates a transparent proxy on the aliasing host, to allow arbitrary
+ changes in the TCP stream. This should not be too difficult given
+ this base; I (ee) will try to do this some time later.
+ */
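As a rough illustration (not part of the patch), the word-at-a-time scan sketched in [Note 1] is easier to follow as compilable code. The sketch below uses the exact zero-byte test (v - 0x01010101) & ~v & 0x80808080 rather than the approximate XOR form in the note; the function name and the byte-wise tail loop are illustrative.

#include <limits.h>
#include <stddef.h>
#include <string.h>

/* Compile-time check, as in [Note 1], that unsigned int is 32 bits wide. */
typedef char ctcp_scan_requires_32bit_uint[
    (CHAR_BIT * sizeof(unsigned int) == 32) ? 1 : -1];

static int
has_ctcp_marker(const unsigned char *buf, size_t len)
{
	size_t i;
	unsigned int v;

	for (i = 0; i + sizeof(v) <= len; i += sizeof(v)) {
		memcpy(&v, buf + i, sizeof(v));	/* avoid alignment assumptions */
		v ^= 0x01010101u;		/* 0x01 bytes become 0x00 */
		if ((v - 0x01010101u) & ~v & 0x80808080u)
			return (1);		/* some byte in this word was 0x01 */
	}
	for (; i < len; i++)			/* leftover tail bytes */
		if (buf[i] == '\001')
			return (1);
	return (0);
}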
diff --git a/freebsd/sys/netinet/libalias/alias_local.h b/freebsd/sys/netinet/libalias/alias_local.h
new file mode 100644
index 00000000..e24ece49
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_local.h
@@ -0,0 +1,397 @@
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Alias_local.h contains the function prototypes for alias.c,
+ * alias_db.c, alias_util.c, alias_ftp.c and alias_irc.c (as well
+ * as any future add-ons). It also includes macros, globals and
+ * struct definitions shared by more than one alias*.c file.
+ *
+ * This include file is intended to be used only within the aliasing
+ * software. Outside world interfaces are defined in alias.h
+ *
+ * This software is placed into the public domain with no restrictions
+ * on its distribution.
+ *
+ * Initial version: August, 1996 (cjm)
+ *
+ * <updated several times by original author and Eivind Eklund>
+ */
+
+#ifndef _ALIAS_LOCAL_HH_
+#define _ALIAS_LOCAL_HH_
+
+#include <freebsd/sys/types.h>
+#include <freebsd/sys/sysctl.h>
+
+#ifdef _KERNEL
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/mutex.h>
+
+/* XXX: LibAliasSetTarget() uses this constant. */
+#define INADDR_NONE 0xffffffff
+
+#include <freebsd/netinet/libalias/alias_sctp.h>
+#else
+#include <freebsd/local/alias_sctp.h>
+#endif
+
+/* Sizes of input and output link tables */
+#define LINK_TABLE_OUT_SIZE 4001
+#define LINK_TABLE_IN_SIZE 4001
+
+struct proxy_entry;
+
+struct libalias {
+ LIST_ENTRY(libalias) instancelist;
+
+ int packetAliasMode; /* Mode flags */
+ /* - documented in alias.h */
+
+ struct in_addr aliasAddress; /* Address written onto source */
+ /* field of IP packet. */
+
+ struct in_addr targetAddress; /* IP address incoming packets */
+ /* are sent to if no aliasing */
+ /* link already exists */
+
+ struct in_addr nullAddress; /* Used as a dummy parameter for */
+ /* some function calls */
+
+ LIST_HEAD (, alias_link) linkTableOut[LINK_TABLE_OUT_SIZE];
+ /* Lookup table of pointers to */
+ /* chains of link records. Each */
+
+ LIST_HEAD (, alias_link) linkTableIn[LINK_TABLE_IN_SIZE];
+ /* link record is doubly indexed */
+ /* into input and output lookup */
+ /* tables. */
+
+ /* Link statistics */
+ int icmpLinkCount;
+ int udpLinkCount;
+ int tcpLinkCount;
+ int pptpLinkCount;
+ int protoLinkCount;
+ int fragmentIdLinkCount;
+ int fragmentPtrLinkCount;
+ int sockCount;
+
+ int cleanupIndex; /* Index to chain of link table */
+ /* being inspected for old links */
+
+ int timeStamp; /* System time in seconds for */
+ /* current packet */
+
+ int lastCleanupTime; /* Last time
+ * IncrementalCleanup() */
+ /* was called */
+
+ int deleteAllLinks; /* If equal to zero, DeleteLink() */
+ /* will not remove permanent links */
+
+ /* log descriptor */
+#ifdef _KERNEL
+ char *logDesc;
+#else
+ FILE *logDesc;
+#endif
+ /* statistics monitoring */
+
+ int newDefaultLink; /* Indicates if a new aliasing */
+ /* link has been created after a */
+ /* call to PacketAliasIn/Out(). */
+
+#ifndef NO_FW_PUNCH
+ int fireWallFD; /* File descriptor to be able to */
+ /* control firewall. Opened by */
+ /* PacketAliasSetMode on first */
+ /* setting the PKT_ALIAS_PUNCH_FW */
+ /* flag. */
+ int fireWallBaseNum; /* The first firewall entry
+ * free for our use */
+ int fireWallNumNums; /* How many entries can we
+ * use? */
+ int fireWallActiveNum; /* Which entry did we last
+ * use? */
+ char *fireWallField; /* bool array for entries */
+#endif
+
+ unsigned int skinnyPort; /* TCP port used by the Skinny */
+ /* protocol. */
+
+ struct proxy_entry *proxyList;
+
+ struct in_addr true_addr; /* in network byte order. */
+ u_short true_port; /* in host byte order. */
+
+ /*
+ * sctp code support
+ */
+
+ /* counts associations that have progressed to UP and not yet removed */
+ int sctpLinkCount;
+#ifdef _KERNEL
+ /* timing queue for keeping track of association timeouts */
+ struct sctp_nat_timer sctpNatTimer;
+
+ /* size of hash table used in this instance */
+ u_int sctpNatTableSize;
+
+/*
+ * local look up table sorted by l_vtag/l_port
+ */
+ LIST_HEAD(sctpNatTableL, sctp_nat_assoc) *sctpTableLocal;
+/*
+ * global look up table sorted by g_vtag/g_port
+ */
+ LIST_HEAD(sctpNatTableG, sctp_nat_assoc) *sctpTableGlobal;
+
+ /*
+ * avoid races in libalias: every public function has to use it.
+ */
+ struct mtx mutex;
+#endif
+};
+
+/* Macros */
+
+#ifdef _KERNEL
+#define LIBALIAS_LOCK_INIT(l) \
+ mtx_init(&l->mutex, "per-instance libalias mutex", NULL, MTX_DEF)
+#define LIBALIAS_LOCK_ASSERT(l) mtx_assert(&l->mutex, MA_OWNED)
+#define LIBALIAS_LOCK(l) mtx_lock(&l->mutex)
+#define LIBALIAS_UNLOCK(l) mtx_unlock(&l->mutex)
+#define LIBALIAS_LOCK_DESTROY(l) mtx_destroy(&l->mutex)
+#else
+#define LIBALIAS_LOCK_INIT(l)
+#define LIBALIAS_LOCK_ASSERT(l)
+#define LIBALIAS_LOCK(l)
+#define LIBALIAS_UNLOCK(l)
+#define LIBALIAS_LOCK_DESTROY(l)
+#endif
+
+/*
+ * The following macro is used to update an
+ * internet checksum. "delta" is a 32-bit
+ * accumulation of all the changes to the
+ * checksum (adding in new 16-bit words and
+ * subtracting out old words), and "cksum"
+ * is the checksum value to be updated.
+ */
+#define ADJUST_CHECKSUM(acc, cksum) \
+ do { \
+ acc += cksum; \
+ if (acc < 0) { \
+ acc = -acc; \
+ acc = (acc >> 16) + (acc & 0xffff); \
+ acc += acc >> 16; \
+ cksum = (u_short) ~acc; \
+ } else { \
+ acc = (acc >> 16) + (acc & 0xffff); \
+ acc += acc >> 16; \
+ cksum = (u_short) acc; \
+ } \
+ } while (0)
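As an illustration (not part of the patch) of how this macro is driven by the handlers later in this patch, the sketch below folds a 16-bit field rewrite into a UDP checksum. The accumulator carries the old value minus the new value, matching the NetBIOS handler's usage; struct udphdr is assumed to come from <netinet/udp.h> and the function name is hypothetical.

#include <netinet/udp.h>

static void
rewrite_udp_sport(struct udphdr *uh, u_short new_sport)
{
	int acc;

	acc = uh->uh_sport;		/* add the 16-bit word being removed */
	acc -= new_sport;		/* subtract the word replacing it */
	uh->uh_sport = new_sport;
	if (uh->uh_sum != 0)		/* 0 means no UDP checksum in use */
		ADJUST_CHECKSUM(acc, uh->uh_sum);
}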
+
+
+/* Prototypes */
+
+/*
+ * SctpFunction prototypes
+ *
+ */
+void AliasSctpInit(struct libalias *la);
+void AliasSctpTerm(struct libalias *la);
+int SctpAlias(struct libalias *la, struct ip *ip, int direction);
+
+/*
+ * We do not calculate TCP checksums when libalias is a kernel
+ * module, since it has no idea about checksum offloading.
+ * If TCP data has changed, then we just set checksum to zero,
+ * and the caller must recalculate it.
+ * If libalias ever edits UDP data, the same approach
+ * should be used.
+ */
+#ifndef _KERNEL
+u_short IpChecksum(struct ip *_pip);
+u_short TcpChecksum(struct ip *_pip);
+#endif
+void
+DifferentialChecksum(u_short * _cksum, void * _new, void * _old, int _n);
+
+/* Internal data access */
+struct alias_link *
+FindIcmpIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr,
+ u_short _id_alias, int _create);
+struct alias_link *
+FindIcmpOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr,
+ u_short _id, int _create);
+struct alias_link *
+FindFragmentIn1(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr,
+ u_short _ip_id);
+struct alias_link *
+FindFragmentIn2(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr,
+ u_short _ip_id);
+struct alias_link *
+ AddFragmentPtrLink(struct libalias *la, struct in_addr _dst_addr, u_short _ip_id);
+struct alias_link *
+ FindFragmentPtr(struct libalias *la, struct in_addr _dst_addr, u_short _ip_id);
+struct alias_link *
+FindProtoIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr,
+ u_char _proto);
+struct alias_link *
+FindProtoOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr,
+ u_char _proto);
+struct alias_link *
+FindUdpTcpIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr,
+ u_short _dst_port, u_short _alias_port, u_char _proto, int _create);
+struct alias_link *
+FindUdpTcpOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr,
+ u_short _src_port, u_short _dst_port, u_char _proto, int _create);
+struct alias_link *
+AddPptp(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr,
+ struct in_addr _alias_addr, u_int16_t _src_call_id);
+struct alias_link *
+FindPptpOutByCallId(struct libalias *la, struct in_addr _src_addr,
+ struct in_addr _dst_addr, u_int16_t _src_call_id);
+struct alias_link *
+FindPptpInByCallId(struct libalias *la, struct in_addr _dst_addr,
+ struct in_addr _alias_addr, u_int16_t _dst_call_id);
+struct alias_link *
+FindPptpOutByPeerCallId(struct libalias *la, struct in_addr _src_addr,
+ struct in_addr _dst_addr, u_int16_t _dst_call_id);
+struct alias_link *
+FindPptpInByPeerCallId(struct libalias *la, struct in_addr _dst_addr,
+ struct in_addr _alias_addr, u_int16_t _alias_call_id);
+struct alias_link *
+FindRtspOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr,
+ u_short _src_port, u_short _alias_port, u_char _proto);
+struct in_addr
+ FindOriginalAddress(struct libalias *la, struct in_addr _alias_addr);
+struct in_addr
+ FindAliasAddress(struct libalias *la, struct in_addr _original_addr);
+struct in_addr
+FindSctpRedirectAddress(struct libalias *la, struct sctp_nat_msg *sm);
+
+/* External data access/modification */
+int
+FindNewPortGroup(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr,
+ u_short _src_port, u_short _dst_port, u_short _port_count,
+ u_char _proto, u_char _align);
+void GetFragmentAddr(struct alias_link *_lnk, struct in_addr *_src_addr);
+void SetFragmentAddr(struct alias_link *_lnk, struct in_addr _src_addr);
+void GetFragmentPtr(struct alias_link *_lnk, char **_fptr);
+void SetFragmentPtr(struct alias_link *_lnk, char *fptr);
+void SetStateIn(struct alias_link *_lnk, int _state);
+void SetStateOut(struct alias_link *_lnk, int _state);
+int GetStateIn (struct alias_link *_lnk);
+int GetStateOut(struct alias_link *_lnk);
+struct in_addr
+ GetOriginalAddress(struct alias_link *_lnk);
+struct in_addr
+ GetDestAddress(struct alias_link *_lnk);
+struct in_addr
+ GetAliasAddress(struct alias_link *_lnk);
+struct in_addr
+ GetDefaultAliasAddress(struct libalias *la);
+void SetDefaultAliasAddress(struct libalias *la, struct in_addr _alias_addr);
+u_short GetOriginalPort(struct alias_link *_lnk);
+u_short GetAliasPort(struct alias_link *_lnk);
+struct in_addr
+ GetProxyAddress(struct alias_link *_lnk);
+void SetProxyAddress(struct alias_link *_lnk, struct in_addr _addr);
+u_short GetProxyPort(struct alias_link *_lnk);
+void SetProxyPort(struct alias_link *_lnk, u_short _port);
+void SetAckModified(struct alias_link *_lnk);
+int GetAckModified(struct alias_link *_lnk);
+int GetDeltaAckIn(u_long, struct alias_link *_lnk);
+int GetDeltaSeqOut(u_long, struct alias_link *lnk);
+void AddSeq(struct alias_link *lnk, int delta, u_int ip_hl,
+ u_short ip_len, u_long th_seq, u_int th_off);
+void SetExpire (struct alias_link *_lnk, int _expire);
+void ClearCheckNewLink(struct libalias *la);
+void SetProtocolFlags(struct alias_link *_lnk, int _pflags);
+int GetProtocolFlags(struct alias_link *_lnk);
+void SetDestCallId(struct alias_link *_lnk, u_int16_t _cid);
+
+#ifndef NO_FW_PUNCH
+void PunchFWHole(struct alias_link *_lnk);
+
+#endif
+
+/* Housekeeping function */
+void HouseKeeping(struct libalias *);
+
+/* TCP specific routines */
+/* lint -save -library Suppress flexelint warnings */
+
+/* Transparent proxy routines */
+int
+ProxyCheck(struct libalias *la, struct in_addr *proxy_server_addr,
+ u_short * proxy_server_port, struct in_addr src_addr,
+ struct in_addr dst_addr, u_short dst_port, u_char ip_p);
+void
+ProxyModify(struct libalias *la, struct alias_link *_lnk, struct ip *_pip,
+ int _maxpacketsize, int _proxy_type);
+
+enum alias_tcp_state {
+ ALIAS_TCP_STATE_NOT_CONNECTED,
+ ALIAS_TCP_STATE_CONNECTED,
+ ALIAS_TCP_STATE_DISCONNECTED
+};
+
+#if defined(_NETINET_IP_HH_)
+static __inline void *
+ip_next(struct ip *iphdr)
+{
+ char *p = (char *)iphdr;
+ return (&p[iphdr->ip_hl * 4]);
+}
+#endif
+
+#if defined(_NETINET_TCP_HH_)
+static __inline void *
+tcp_next(struct tcphdr *tcphdr)
+{
+ char *p = (char *)tcphdr;
+ return (&p[tcphdr->th_off * 4]);
+}
+#endif
+
+#if defined(_NETINET_UDP_HH_)
+static __inline void *
+udp_next(struct udphdr *udphdr)
+{
+ return ((void *)(udphdr + 1));
+}
+#endif
+
+#endif /* !_ALIAS_LOCAL_HH_ */
diff --git a/freebsd/sys/netinet/libalias/alias_mod.c b/freebsd/sys/netinet/libalias/alias_mod.c
new file mode 100644
index 00000000..fa15b2e4
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_mod.c
@@ -0,0 +1,292 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2005 Paolo Pisati <piso@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+#include <freebsd/sys/libkern.h>
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/rwlock.h>
+#else
+#include <freebsd/stdio.h>
+#include <freebsd/string.h>
+#include <freebsd/sys/types.h>
+#include <freebsd/errno.h>
+#endif
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/libalias/alias_mod.h>
+#else
+#include <freebsd/local/alias_local.h>
+#include <freebsd/local/alias_mod.h>
+#endif
+
+/* Protocol and userland module handler chains. */
+LIST_HEAD(handler_chain, proto_handler) handler_chain = LIST_HEAD_INITIALIZER(handler_chain);
+#ifdef _KERNEL
+struct rwlock handler_rw;
+#endif
+SLIST_HEAD(dll_chain, dll) dll_chain = SLIST_HEAD_INITIALIZER(dll_chain);
+
+#ifdef _KERNEL
+
+#define LIBALIAS_RWLOCK_INIT() \
+ rw_init(&handler_rw, "Libalias_modules_rwlock")
+#define LIBALIAS_RWLOCK_DESTROY() rw_destroy(&handler_rw)
+#define LIBALIAS_WLOCK_ASSERT() \
+ rw_assert(&handler_rw, RA_WLOCKED)
+
+static __inline void
+LIBALIAS_RLOCK(void)
+{
+ rw_rlock(&handler_rw);
+}
+
+static __inline void
+LIBALIAS_RUNLOCK(void)
+{
+ rw_runlock(&handler_rw);
+}
+
+static __inline void
+LIBALIAS_WLOCK(void)
+{
+ rw_wlock(&handler_rw);
+}
+
+static __inline void
+LIBALIAS_WUNLOCK(void)
+{
+ rw_wunlock(&handler_rw);
+}
+
+static void
+_handler_chain_init(void)
+{
+
+ if (!rw_initialized(&handler_rw))
+ LIBALIAS_RWLOCK_INIT();
+}
+
+static void
+_handler_chain_destroy(void)
+{
+
+ if (rw_initialized(&handler_rw))
+ LIBALIAS_RWLOCK_DESTROY();
+}
+
+#else
+#define LIBALIAS_RWLOCK_INIT() ;
+#define LIBALIAS_RWLOCK_DESTROY() ;
+#define LIBALIAS_WLOCK_ASSERT() ;
+#define LIBALIAS_RLOCK() ;
+#define LIBALIAS_RUNLOCK() ;
+#define LIBALIAS_WLOCK() ;
+#define LIBALIAS_WUNLOCK() ;
+#define _handler_chain_init() ;
+#define _handler_chain_destroy() ;
+#endif
+
+void
+handler_chain_init(void)
+{
+ _handler_chain_init();
+}
+
+void
+handler_chain_destroy(void)
+{
+ _handler_chain_destroy();
+}
+
+static int
+_attach_handler(struct proto_handler *p)
+{
+ struct proto_handler *b;
+
+ LIBALIAS_WLOCK_ASSERT();
+ b = NULL;
+ LIST_FOREACH(b, &handler_chain, entries) {
+ if ((b->pri == p->pri) &&
+ (b->dir == p->dir) &&
+ (b->proto == p->proto))
+ return (EEXIST); /* Priority conflict. */
+ if (b->pri > p->pri) {
+ LIST_INSERT_BEFORE(b, p, entries);
+ return (0);
+ }
+ }
+	/* End of list or found the right position, insert here. */
+ if (b)
+ LIST_INSERT_AFTER(b, p, entries);
+ else
+ LIST_INSERT_HEAD(&handler_chain, p, entries);
+ return (0);
+}
+
+static int
+_detach_handler(struct proto_handler *p)
+{
+ struct proto_handler *b, *b_tmp;
+
+ LIBALIAS_WLOCK_ASSERT();
+ LIST_FOREACH_SAFE(b, &handler_chain, entries, b_tmp) {
+ if (b == p) {
+ LIST_REMOVE(b, entries);
+ return (0);
+ }
+ }
+ return (ENOENT); /* Handler not found. */
+}
+
+int
+LibAliasAttachHandlers(struct proto_handler *_p)
+{
+ int i, error;
+
+ LIBALIAS_WLOCK();
+ error = -1;
+ for (i = 0; 1; i++) {
+ if (*((int *)&_p[i]) == EOH)
+ break;
+ error = _attach_handler(&_p[i]);
+ if (error != 0)
+ break;
+ }
+ LIBALIAS_WUNLOCK();
+ return (error);
+}
+
+int
+LibAliasDetachHandlers(struct proto_handler *_p)
+{
+ int i, error;
+
+ LIBALIAS_WLOCK();
+ error = -1;
+ for (i = 0; 1; i++) {
+ if (*((int *)&_p[i]) == EOH)
+ break;
+ error = _detach_handler(&_p[i]);
+ if (error != 0)
+ break;
+ }
+ LIBALIAS_WUNLOCK();
+ return (error);
+}
+
+int
+detach_handler(struct proto_handler *_p)
+{
+ int error;
+
+ LIBALIAS_WLOCK();
+ error = -1;
+ error = _detach_handler(_p);
+ LIBALIAS_WUNLOCK();
+ return (error);
+}
+
+int
+find_handler(int8_t dir, int8_t proto, struct libalias *la, __unused struct ip *pip,
+ struct alias_data *ad)
+{
+ struct proto_handler *p;
+ int error;
+
+ LIBALIAS_RLOCK();
+ error = ENOENT;
+ LIST_FOREACH(p, &handler_chain, entries) {
+ if ((p->dir & dir) && (p->proto & proto))
+ if (p->fingerprint(la, ad) == 0) {
+ error = p->protohandler(la, pip, ad);
+ break;
+ }
+ }
+ LIBALIAS_RUNLOCK();
+ return (error);
+}
+
+struct proto_handler *
+first_handler(void)
+{
+
+ return (LIST_FIRST(&handler_chain));
+}
+
+/* Dll manipulation code - this code is not thread safe... */
+
+int
+attach_dll(struct dll *p)
+{
+ struct dll *b;
+
+ SLIST_FOREACH(b, &dll_chain, next) {
+ if (!strncmp(b->name, p->name, DLL_LEN))
+ return (EEXIST); /* Dll name conflict. */
+ }
+ SLIST_INSERT_HEAD(&dll_chain, p, next);
+ return (0);
+}
+
+void *
+detach_dll(char *p)
+{
+ struct dll *b, *b_tmp;
+ void *error;
+
+ b = NULL;
+ error = NULL;
+ SLIST_FOREACH_SAFE(b, &dll_chain, next, b_tmp)
+ if (!strncmp(b->name, p, DLL_LEN)) {
+ SLIST_REMOVE(&dll_chain, b, dll, next);
+ error = b;
+ break;
+ }
+ return (error);
+}
+
+struct dll *
+walk_dll_chain(void)
+{
+ struct dll *t;
+
+ t = SLIST_FIRST(&dll_chain);
+ if (t == NULL)
+ return (NULL);
+ SLIST_REMOVE_HEAD(&dll_chain, next);
+ return (t);
+}
diff --git a/freebsd/sys/netinet/libalias/alias_mod.h b/freebsd/sys/netinet/libalias/alias_mod.h
new file mode 100644
index 00000000..f5f98cc3
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_mod.h
@@ -0,0 +1,163 @@
+/*-
+ * Copyright (c) 2005 Paolo Pisati <piso@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Alias_mod.h defines the outside world interfaces for the packet aliasing
+ * modular framework
+ */
+
+#ifndef _ALIAS_MOD_HH_
+#define _ALIAS_MOD_HH_
+
+#ifdef _KERNEL
+MALLOC_DECLARE(M_ALIAS);
+
+/* Use kernel allocator. */
+#if defined(_SYS_MALLOC_HH_)
+#ifndef __rtems__
+#define malloc(x) malloc(x, M_ALIAS, M_NOWAIT|M_ZERO)
+#define calloc(x, n) malloc(x*n)
+#define free(x) free(x, M_ALIAS)
+#else /* __rtems__ */
+#define malloc(x) _bsd_malloc(x, M_ALIAS, M_NOWAIT|M_ZERO)
+#define calloc(x, n) malloc(x*n)
+#define free(x) _bsd_free(x, M_ALIAS)
+#endif /* __rtems__ */
+#endif
+#endif
+
+/* Protocol handlers struct & function. */
+
+/* Packet flow direction. */
+#define IN 1
+#define OUT 2
+
+/* Working protocol. */
+#define IP 1
+#define TCP 2
+#define UDP 4
+
+/*
+ * Data passed to the protocol handler module; it must be filled in
+ * right before calling find_handler() to determine which
+ * module is eligible to be called.
+ */
+
+struct alias_data {
+ struct alias_link *lnk;
+ struct in_addr *oaddr; /* Original address. */
+ struct in_addr *aaddr; /* Alias address. */
+ uint16_t *aport; /* Alias port. */
+ uint16_t *sport, *dport; /* Source & destination port */
+ uint16_t maxpktsize; /* Max packet size. */
+};
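A minimal sketch (not part of the patch) of the caller side described by the comment above: fill a struct alias_data from an already-parsed outgoing TCP packet and hand it to find_handler(), which is declared further down in this header. The function name is hypothetical, the NULL fields are simply ones this direction does not need, and ip_next()/struct tcphdr are assumed from alias_local.h and <netinet/tcp.h>; the real callers live in alias.c.

static int
dispatch_tcp_out(struct libalias *la, struct ip *pip,
    struct alias_link *lnk, int maxpacketsize)
{
	struct tcphdr *tc = (struct tcphdr *)ip_next(pip);
	struct alias_data ad;

	ad.lnk = lnk;
	ad.oaddr = NULL;			/* unused for this direction */
	ad.aaddr = NULL;
	ad.aport = NULL;
	ad.sport = &tc->th_sport;
	ad.dport = &tc->th_dport;
	ad.maxpktsize = maxpacketsize;

	/* The first handler whose fingerprint() accepts the packet runs. */
	return (find_handler(OUT, TCP, la, pip, &ad));
}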
+
+/*
+ * This structure contains all the information necessary to make
+ * a protocol handler work correctly.
+ */
+
+struct proto_handler {
+ u_int pri; /* Handler priority. */
+ int16_t dir; /* Flow direction. */
+ uint8_t proto; /* Working protocol. */
+	int (*fingerprint)(struct libalias *,	/* Fingerprint function. */
+ struct alias_data *);
+	int (*protohandler)(struct libalias *,	/* Aliasing function. */
+ struct ip *, struct alias_data *);
+ LIST_ENTRY(proto_handler) entries;
+};
+
+
+/*
+ * Used only in userland when libalias needs to keep track of all
+ * loaded modules. In kernel land (kld mode) we don't need to care
+ * about libalias modules because the kld framework does it for us.
+ */
+
+#define DLL_LEN 32
+struct dll {
+ char name[DLL_LEN]; /* Name of module. */
+ void *handle; /*
+ * Ptr to shared obj obtained through
+ * dlopen() - use this ptr to get access
+ * to any symbols from a loaded module
+ * via dlsym().
+ */
+ SLIST_ENTRY(dll) next;
+};
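A sketch (not part of the patch) of how a userland consumer could populate this bookkeeping record, per the dlopen()/dlsym() comment above, and register it with attach_dll(), declared below. The module path handling, function name and error strategy are illustrative; the real loader lives in the userland side of libalias.

#include <dlfcn.h>
#include <stdlib.h>
#include <string.h>

static int
load_alias_module(const char *path, const char *name)
{
	struct dll *d;

	d = calloc(1, sizeof(*d));
	if (d == NULL)
		return (-1);
	strncpy(d->name, name, sizeof(d->name) - 1);
	d->handle = dlopen(path, RTLD_LAZY);	/* shared object handle */
	if (d->handle == NULL) {
		free(d);
		return (-1);
	}
	if (attach_dll(d) != 0) {		/* EEXIST on duplicate name */
		dlclose(d->handle);
		free(d);
		return (-1);
	}
	/* Symbols (e.g. the handlers array) can now be fetched via dlsym(). */
	return (0);
}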
+
+/* Functions used with protocol handlers. */
+
+void handler_chain_init(void);
+void handler_chain_destroy(void);
+int LibAliasAttachHandlers(struct proto_handler *);
+int LibAliasDetachHandlers(struct proto_handler *);
+int detach_handler(struct proto_handler *);
+int find_handler(int8_t, int8_t, struct libalias *,
+ struct ip *, struct alias_data *);
+struct proto_handler *first_handler(void);
+
+/* Functions used with dll module. */
+
+void dll_chain_init(void);
+void dll_chain_destroy(void);
+int attach_dll(struct dll *);
+void *detach_dll(char *);
+struct dll *walk_dll_chain(void);
+
+/* End of handlers. */
+#define EOH -1
+
+/*
+ * Some defines borrowed from sys/module.h used to compile a kld
+ * in userland as a shared lib.
+ */
+
+#ifndef _KERNEL
+typedef enum modeventtype {
+ MOD_LOAD,
+ MOD_UNLOAD,
+ MOD_SHUTDOWN,
+ MOD_QUIESCE
+} modeventtype_t;
+
+typedef struct module *module_t;
+typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *);
+
+/*
+ * Struct for registering modules statically via SYSINIT.
+ */
+typedef struct moduledata {
+ const char *name; /* module name */
+ modeventhand_t evhand; /* event handler */
+ void *priv; /* extra data */
+} moduledata_t;
+#endif
+
+#endif /* !_ALIAS_MOD_HH_ */
diff --git a/freebsd/sys/netinet/libalias/alias_nbt.c b/freebsd/sys/netinet/libalias/alias_nbt.c
new file mode 100644
index 00000000..31ee0006
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_nbt.c
@@ -0,0 +1,855 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Written by Atsushi Murai <amurai@spec.co.jp>
+ * Copyright (c) 1998, System Planning and Engineering Co.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * TODO:
+ * oClean up.
+ * oConsider word alignment for other platforms.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ alias_nbt.c performs special processing for NetBIOS over TCP/IP
+ sessions carried over UDP.
+
+ Initial version: May, 1998 (Atsushi Murai <amurai@spec.co.jp>)
+
+ See HISTORY file for record of revisions.
+*/
+
+/* Includes */
+#ifdef _KERNEL
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/module.h>
+#else
+#include <freebsd/errno.h>
+#include <freebsd/sys/types.h>
+#include <freebsd/stdio.h>
+#include <freebsd/strings.h>
+#endif
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/udp.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/libalias/alias_mod.h>
+#else
+#include <freebsd/local/alias_local.h>
+#include <freebsd/local/alias_mod.h>
+#endif
+
+#define NETBIOS_NS_PORT_NUMBER 137
+#define NETBIOS_DGM_PORT_NUMBER 138
+
+static int
+AliasHandleUdpNbt(struct libalias *, struct ip *, struct alias_link *,
+ struct in_addr *, u_short);
+
+static int
+AliasHandleUdpNbtNS(struct libalias *, struct ip *, struct alias_link *,
+ struct in_addr *, u_short *, struct in_addr *, u_short *);
+static int
+fingerprint1(struct libalias *la, struct alias_data *ah)
+{
+
+ if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
+ ah->aaddr == NULL || ah->aport == NULL)
+ return (-1);
+ if (ntohs(*ah->dport) == NETBIOS_DGM_PORT_NUMBER
+ || ntohs(*ah->sport) == NETBIOS_DGM_PORT_NUMBER)
+ return (0);
+ return (-1);
+}
+
+static int
+protohandler1(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ return (AliasHandleUdpNbt(la, pip, ah->lnk, ah->aaddr, *ah->aport));
+}
+
+static int
+fingerprint2(struct libalias *la, struct alias_data *ah)
+{
+
+ if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
+ ah->aaddr == NULL || ah->aport == NULL)
+ return (-1);
+ if (ntohs(*ah->dport) == NETBIOS_NS_PORT_NUMBER
+ || ntohs(*ah->sport) == NETBIOS_NS_PORT_NUMBER)
+ return (0);
+ return (-1);
+}
+
+static int
+protohandler2in(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ AliasHandleUdpNbtNS(la, pip, ah->lnk, ah->aaddr, ah->aport,
+ ah->oaddr, ah->dport);
+ return (0);
+}
+
+static int
+protohandler2out(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ return (AliasHandleUdpNbtNS(la, pip, ah->lnk, &pip->ip_src, ah->sport,
+ ah->aaddr, ah->aport));
+}
+
+/* Kernel module definition. */
+struct proto_handler handlers[] = {
+ {
+ .pri = 130,
+ .dir = IN|OUT,
+ .proto = UDP,
+ .fingerprint = &fingerprint1,
+ .protohandler = &protohandler1
+ },
+ {
+ .pri = 140,
+ .dir = IN,
+ .proto = UDP,
+ .fingerprint = &fingerprint2,
+ .protohandler = &protohandler2in
+ },
+ {
+ .pri = 140,
+ .dir = OUT,
+ .proto = UDP,
+ .fingerprint = &fingerprint2,
+ .protohandler = &protohandler2out
+ },
+ { EOH }
+};
+
+static int
+mod_handler(module_t mod, int type, void *data)
+{
+ int error;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = 0;
+ LibAliasAttachHandlers(handlers);
+ break;
+ case MOD_UNLOAD:
+ error = 0;
+ LibAliasDetachHandlers(handlers);
+ break;
+ default:
+ error = EINVAL;
+ }
+ return (error);
+}
+
+#ifdef _KERNEL
+static
+#endif
+moduledata_t alias_mod = {
+ "alias_nbt", mod_handler, NULL
+};
+
+#ifdef _KERNEL
+DECLARE_MODULE(alias_nbt, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
+MODULE_VERSION(alias_nbt, 1);
+MODULE_DEPEND(alias_nbt, libalias, 1, 1, 1);
+#endif
+
+typedef struct {
+ struct in_addr oldaddr;
+ u_short oldport;
+ struct in_addr newaddr;
+ u_short newport;
+ u_short *uh_sum;
+} NBTArguments;
+
+typedef struct {
+ unsigned char type;
+ unsigned char flags;
+ u_short id;
+ struct in_addr source_ip;
+ u_short source_port;
+ u_short len;
+ u_short offset;
+} NbtDataHeader;
+
+#define OpQuery 0
+#define OpUnknown 4
+#define OpRegist 5
+#define OpRelease 6
+#define OpWACK 7
+#define OpRefresh 8
+typedef struct {
+ u_short nametrid;
+ u_short dir: 1, opcode:4, nmflags:7, rcode:4;
+ u_short qdcount;
+ u_short ancount;
+ u_short nscount;
+ u_short arcount;
+} NbtNSHeader;
+
+#define FMT_ERR 0x1
+#define SRV_ERR 0x2
+#define IMP_ERR 0x4
+#define RFS_ERR 0x5
+#define ACT_ERR 0x6
+#define CFT_ERR 0x7
+
+
+#ifdef LIBALIAS_DEBUG
+static void
+PrintRcode(u_char rcode)
+{
+
+	switch (rcode) {
+	case FMT_ERR:
+		printf("\nFormat Error.\n");
+		break;
+	case SRV_ERR:
+		printf("\nServer failure.\n");
+		break;
+	case IMP_ERR:
+		printf("\nUnsupported request error.\n");
+		break;
+	case RFS_ERR:
+		printf("\nRefused error.\n");
+		break;
+	case ACT_ERR:
+		printf("\nActive error.\n");
+		break;
+	case CFT_ERR:
+		printf("\nName in conflict error.\n");
+		break;
+	default:
+		printf("\n?%c?=%0x\n", '?', rcode);
+		break;
+	}
+}
+
+#endif
+
+
+/* Handling Name field */
+static u_char *
+AliasHandleName(u_char * p, char *pmax)
+{
+
+ u_char *s;
+ u_char c;
+ int compress;
+
+ /* Following length field */
+
+ if (p == NULL || (char *)p >= pmax)
+ return (NULL);
+
+ if (*p & 0xc0) {
+ p = p + 2;
+ if ((char *)p > pmax)
+ return (NULL);
+ return ((u_char *) p);
+ }
+ while ((*p & 0x3f) != 0x00) {
+ s = p + 1;
+ if (*p == 0x20)
+ compress = 1;
+ else
+ compress = 0;
+
+ /* Get next length field */
+ p = (u_char *) (p + (*p & 0x3f) + 1);
+ if ((char *)p > pmax) {
+ p = NULL;
+ break;
+ }
+#ifdef LIBALIAS_DEBUG
+ printf(":");
+#endif
+ while (s < p) {
+ if (compress == 1) {
+ c = (u_char) (((((*s & 0x0f) << 4) | (*(s + 1) & 0x0f)) - 0x11));
+#ifdef LIBALIAS_DEBUG
+ if (isprint(c))
+ printf("%c", c);
+ else
+ printf("<0x%02x>", c);
+#endif
+ s += 2;
+ } else {
+#ifdef LIBALIAS_DEBUG
+ printf("%c", *s);
+#endif
+ s++;
+ }
+ }
+#ifdef LIBALIAS_DEBUG
+ printf(":");
+ fflush(stdout);
+#endif
+ }
+
+	/* Advance past the Name field */
+ if (p == NULL || (char *)p >= pmax)
+ p = NULL;
+ else
+ p++;
+ return ((u_char *) p);
+}
+
+/*
+ * NetBios Datagram Handler (IP/UDP)
+ */
+#define DGM_DIRECT_UNIQ 0x10
+#define DGM_DIRECT_GROUP 0x11
+#define DGM_BROADCAST 0x12
+#define DGM_ERROR 0x13
+#define DGM_QUERY 0x14
+#define DGM_POSITIVE_RES 0x15
+#define DGM_NEGATIVE_RES 0x16
+
+static int
+AliasHandleUdpNbt(
+ struct libalias *la,
+ struct ip *pip, /* IP packet to examine/patch */
+ struct alias_link *lnk,
+ struct in_addr *alias_address,
+ u_short alias_port
+)
+{
+ struct udphdr *uh;
+ NbtDataHeader *ndh;
+ u_char *p = NULL;
+ char *pmax;
+
+ (void)la;
+ (void)lnk;
+
+ /* Calculate data length of UDP packet */
+ uh = (struct udphdr *)ip_next(pip);
+ pmax = (char *)uh + ntohs(uh->uh_ulen);
+
+ ndh = (NbtDataHeader *)udp_next(uh);
+ if ((char *)(ndh + 1) > pmax)
+ return (-1);
+#ifdef LIBALIAS_DEBUG
+ printf("\nType=%02x,", ndh->type);
+#endif
+ switch (ndh->type) {
+ case DGM_DIRECT_UNIQ:
+ case DGM_DIRECT_GROUP:
+ case DGM_BROADCAST:
+ p = (u_char *) ndh + 14;
+ p = AliasHandleName(p, pmax); /* Source Name */
+ p = AliasHandleName(p, pmax); /* Destination Name */
+ break;
+ case DGM_ERROR:
+ p = (u_char *) ndh + 11;
+ break;
+ case DGM_QUERY:
+ case DGM_POSITIVE_RES:
+ case DGM_NEGATIVE_RES:
+ p = (u_char *) ndh + 10;
+ p = AliasHandleName(p, pmax); /* Destination Name */
+ break;
+ }
+ if (p == NULL || (char *)p > pmax)
+ p = NULL;
+#ifdef LIBALIAS_DEBUG
+ printf("%s:%d-->", inet_ntoa(ndh->source_ip), ntohs(ndh->source_port));
+#endif
+ /* Doing an IP address and Port number Translation */
+ if (uh->uh_sum != 0) {
+ int acc;
+ u_short *sptr;
+
+ acc = ndh->source_port;
+ acc -= alias_port;
+ sptr = (u_short *) & (ndh->source_ip);
+ acc += *sptr++;
+ acc += *sptr;
+ sptr = (u_short *) alias_address;
+ acc -= *sptr++;
+ acc -= *sptr;
+ ADJUST_CHECKSUM(acc, uh->uh_sum);
+ }
+ ndh->source_ip = *alias_address;
+ ndh->source_port = alias_port;
+#ifdef LIBALIAS_DEBUG
+ printf("%s:%d\n", inet_ntoa(ndh->source_ip), ntohs(ndh->source_port));
+ fflush(stdout);
+#endif
+ return ((p == NULL) ? -1 : 0);
+}
+
+/* Question Section */
+#define QS_TYPE_NB 0x0020
+#define QS_TYPE_NBSTAT 0x0021
+#define QS_CLAS_IN 0x0001
+typedef struct {
+ u_short type; /* The type of Request */
+ u_short class; /* The class of Request */
+} NBTNsQuestion;
+
+static u_char *
+AliasHandleQuestion(
+ u_short count,
+ NBTNsQuestion * q,
+ char *pmax,
+ NBTArguments * nbtarg)
+{
+
+ (void)nbtarg;
+
+ while (count != 0) {
+		/* Name Field */
+ q = (NBTNsQuestion *) AliasHandleName((u_char *) q, pmax);
+
+ if (q == NULL || (char *)(q + 1) > pmax) {
+ q = NULL;
+ break;
+ }
+		/* Type and Class field */
+ switch (ntohs(q->type)) {
+ case QS_TYPE_NB:
+ case QS_TYPE_NBSTAT:
+ q = q + 1;
+ break;
+ default:
+#ifdef LIBALIAS_DEBUG
+ printf("\nUnknown Type on Question %0x\n", ntohs(q->type));
+#endif
+ break;
+ }
+ count--;
+ }
+
+	/* Advance past the Question Section */
+ return ((u_char *) q);
+}
+
+/* Resource Record */
+#define RR_TYPE_A 0x0001
+#define RR_TYPE_NS 0x0002
+#define RR_TYPE_NULL 0x000a
+#define RR_TYPE_NB 0x0020
+#define RR_TYPE_NBSTAT 0x0021
+#define RR_CLAS_IN 0x0001
+#define SizeOfNsResource 8
+typedef struct {
+ u_short type;
+ u_short class;
+ unsigned int ttl;
+ u_short rdlen;
+} NBTNsResource;
+
+#define SizeOfNsRNB 6
+typedef struct {
+ u_short g: 1 , ont:2, resv:13;
+ struct in_addr addr;
+} NBTNsRNB;
+
+static u_char *
+AliasHandleResourceNB(
+ NBTNsResource * q,
+ char *pmax,
+ NBTArguments * nbtarg)
+{
+ NBTNsRNB *nb;
+ u_short bcount;
+
+ if (q == NULL || (char *)(q + 1) > pmax)
+ return (NULL);
+ /* Check out a length */
+ bcount = ntohs(q->rdlen);
+
+ /* Forward to Resource NB position */
+ nb = (NBTNsRNB *) ((u_char *) q + SizeOfNsResource);
+
+ /* Processing all in_addr array */
+#ifdef LIBALIAS_DEBUG
+ printf("NB rec[%s", inet_ntoa(nbtarg->oldaddr));
+ printf("->%s, %dbytes] ", inet_ntoa(nbtarg->newaddr), bcount);
+#endif
+ while (nb != NULL && bcount != 0) {
+ if ((char *)(nb + 1) > pmax) {
+ nb = NULL;
+ break;
+ }
+#ifdef LIBALIAS_DEBUG
+ printf("<%s>", inet_ntoa(nb->addr));
+#endif
+ if (!bcmp(&nbtarg->oldaddr, &nb->addr, sizeof(struct in_addr))) {
+ if (*nbtarg->uh_sum != 0) {
+ int acc;
+ u_short *sptr;
+
+ sptr = (u_short *) & (nb->addr);
+ acc = *sptr++;
+ acc += *sptr;
+ sptr = (u_short *) & (nbtarg->newaddr);
+ acc -= *sptr++;
+ acc -= *sptr;
+ ADJUST_CHECKSUM(acc, *nbtarg->uh_sum);
+ }
+ nb->addr = nbtarg->newaddr;
+#ifdef LIBALIAS_DEBUG
+ printf("O");
+#endif
+ }
+#ifdef LIBALIAS_DEBUG
+ else {
+ printf(".");
+ }
+#endif
+ nb = (NBTNsRNB *) ((u_char *) nb + SizeOfNsRNB);
+ bcount -= SizeOfNsRNB;
+ }
+ if (nb == NULL || (char *)(nb + 1) > pmax) {
+ nb = NULL;
+ }
+ return ((u_char *) nb);
+}
+
+#define SizeOfResourceA 6
+typedef struct {
+ struct in_addr addr;
+} NBTNsResourceA;
+
+static u_char *
+AliasHandleResourceA(
+ NBTNsResource * q,
+ char *pmax,
+ NBTArguments * nbtarg)
+{
+ NBTNsResourceA *a;
+ u_short bcount;
+
+ if (q == NULL || (char *)(q + 1) > pmax)
+ return (NULL);
+
+ /* Forward to Resource A position */
+ a = (NBTNsResourceA *) ((u_char *) q + sizeof(NBTNsResource));
+
+ /* Check out of length */
+ bcount = ntohs(q->rdlen);
+
+ /* Processing all in_addr array */
+#ifdef LIBALIAS_DEBUG
+ printf("Arec [%s", inet_ntoa(nbtarg->oldaddr));
+ printf("->%s]", inet_ntoa(nbtarg->newaddr));
+#endif
+ while (bcount != 0) {
+ if (a == NULL || (char *)(a + 1) > pmax)
+ return (NULL);
+#ifdef LIBALIAS_DEBUG
+ printf("..%s", inet_ntoa(a->addr));
+#endif
+ if (!bcmp(&nbtarg->oldaddr, &a->addr, sizeof(struct in_addr))) {
+ if (*nbtarg->uh_sum != 0) {
+ int acc;
+ u_short *sptr;
+
+ sptr = (u_short *) & (a->addr); /* Old */
+ acc = *sptr++;
+ acc += *sptr;
+ sptr = (u_short *) & nbtarg->newaddr; /* New */
+ acc -= *sptr++;
+ acc -= *sptr;
+ ADJUST_CHECKSUM(acc, *nbtarg->uh_sum);
+ }
+ a->addr = nbtarg->newaddr;
+ }
+ a++; /* XXXX */
+ bcount -= SizeOfResourceA;
+ }
+ if (a == NULL || (char *)(a + 1) > pmax)
+ a = NULL;
+ return ((u_char *) a);
+}
+
+typedef struct {
+ u_short opcode:4, flags:8, resv:4;
+} NBTNsResourceNULL;
+
+static u_char *
+AliasHandleResourceNULL(
+ NBTNsResource * q,
+ char *pmax,
+ NBTArguments * nbtarg)
+{
+ NBTNsResourceNULL *n;
+ u_short bcount;
+
+ (void)nbtarg;
+
+ if (q == NULL || (char *)(q + 1) > pmax)
+ return (NULL);
+
+ /* Forward to Resource NULL position */
+ n = (NBTNsResourceNULL *) ((u_char *) q + sizeof(NBTNsResource));
+
+ /* Check out of length */
+ bcount = ntohs(q->rdlen);
+
+ /* Processing all in_addr array */
+ while (bcount != 0) {
+ if ((char *)(n + 1) > pmax) {
+ n = NULL;
+ break;
+ }
+ n++;
+ bcount -= sizeof(NBTNsResourceNULL);
+ }
+ if ((char *)(n + 1) > pmax)
+ n = NULL;
+
+ return ((u_char *) n);
+}
+
+static u_char *
+AliasHandleResourceNS(
+ NBTNsResource * q,
+ char *pmax,
+ NBTArguments * nbtarg)
+{
+ NBTNsResourceNULL *n;
+ u_short bcount;
+
+ (void)nbtarg;
+
+ if (q == NULL || (char *)(q + 1) > pmax)
+ return (NULL);
+
+ /* Forward to Resource NULL position */
+ n = (NBTNsResourceNULL *) ((u_char *) q + sizeof(NBTNsResource));
+
+ /* Check out of length */
+ bcount = ntohs(q->rdlen);
+
+	/* Resource Record Name Field */
+ q = (NBTNsResource *) AliasHandleName((u_char *) n, pmax); /* XXX */
+
+ if (q == NULL || (char *)((u_char *) n + bcount) > pmax)
+ return (NULL);
+ else
+ return ((u_char *) n + bcount);
+}
+
+typedef struct {
+ u_short numnames;
+} NBTNsResourceNBSTAT;
+
+static u_char *
+AliasHandleResourceNBSTAT(
+ NBTNsResource * q,
+ char *pmax,
+ NBTArguments * nbtarg)
+{
+ NBTNsResourceNBSTAT *n;
+ u_short bcount;
+
+ (void)nbtarg;
+
+ if (q == NULL || (char *)(q + 1) > pmax)
+ return (NULL);
+
+ /* Forward to Resource NBSTAT position */
+ n = (NBTNsResourceNBSTAT *) ((u_char *) q + sizeof(NBTNsResource));
+
+ /* Check out of length */
+ bcount = ntohs(q->rdlen);
+
+ if (q == NULL || (char *)((u_char *) n + bcount) > pmax)
+ return (NULL);
+ else
+ return ((u_char *) n + bcount);
+}
+
+static u_char *
+AliasHandleResource(
+ u_short count,
+ NBTNsResource * q,
+ char *pmax,
+ NBTArguments
+ * nbtarg)
+{
+ while (count != 0) {
+		/* Resource Record Name Field */
+ q = (NBTNsResource *) AliasHandleName((u_char *) q, pmax);
+
+ if (q == NULL || (char *)(q + 1) > pmax)
+ break;
+#ifdef LIBALIAS_DEBUG
+ printf("type=%02x, count=%d\n", ntohs(q->type), count);
+#endif
+
+		/* Type and Class field */
+ switch (ntohs(q->type)) {
+ case RR_TYPE_NB:
+ q = (NBTNsResource *) AliasHandleResourceNB(
+ q,
+ pmax,
+ nbtarg
+ );
+ break;
+ case RR_TYPE_A:
+ q = (NBTNsResource *) AliasHandleResourceA(
+ q,
+ pmax,
+ nbtarg
+ );
+ break;
+ case RR_TYPE_NS:
+ q = (NBTNsResource *) AliasHandleResourceNS(
+ q,
+ pmax,
+ nbtarg
+ );
+ break;
+ case RR_TYPE_NULL:
+ q = (NBTNsResource *) AliasHandleResourceNULL(
+ q,
+ pmax,
+ nbtarg
+ );
+ break;
+ case RR_TYPE_NBSTAT:
+ q = (NBTNsResource *) AliasHandleResourceNBSTAT(
+ q,
+ pmax,
+ nbtarg
+ );
+ break;
+ default:
+#ifdef LIBALIAS_DEBUG
+ printf(
+ "\nUnknown Type of Resource %0x\n",
+ ntohs(q->type)
+ );
+ fflush(stdout);
+#endif
+ break;
+ }
+ count--;
+ }
+ return ((u_char *) q);
+}
+
+static int
+AliasHandleUdpNbtNS(
+ struct libalias *la,
+ struct ip *pip, /* IP packet to examine/patch */
+ struct alias_link *lnk,
+ struct in_addr *alias_address,
+ u_short * alias_port,
+ struct in_addr *original_address,
+ u_short * original_port)
+{
+ struct udphdr *uh;
+ NbtNSHeader *nsh;
+ u_char *p;
+ char *pmax;
+ NBTArguments nbtarg;
+
+ (void)la;
+ (void)lnk;
+
+ /* Set up Common Parameter */
+ nbtarg.oldaddr = *alias_address;
+ nbtarg.oldport = *alias_port;
+ nbtarg.newaddr = *original_address;
+ nbtarg.newport = *original_port;
+
+ /* Calculate data length of UDP packet */
+ uh = (struct udphdr *)ip_next(pip);
+ nbtarg.uh_sum = &(uh->uh_sum);
+ nsh = (NbtNSHeader *)udp_next(uh);
+ p = (u_char *) (nsh + 1);
+ pmax = (char *)uh + ntohs(uh->uh_ulen);
+
+ if ((char *)(nsh + 1) > pmax)
+ return (-1);
+
+#ifdef LIBALIAS_DEBUG
+ printf(" [%s] ID=%02x, op=%01x, flag=%02x, rcode=%01x, qd=%04x"
+ ", an=%04x, ns=%04x, ar=%04x, [%d]-->",
+ nsh->dir ? "Response" : "Request",
+ nsh->nametrid,
+ nsh->opcode,
+ nsh->nmflags,
+ nsh->rcode,
+ ntohs(nsh->qdcount),
+ ntohs(nsh->ancount),
+ ntohs(nsh->nscount),
+ ntohs(nsh->arcount),
+ (u_char *) p - (u_char *) nsh
+ );
+#endif
+
+ /* Question Entries */
+ if (ntohs(nsh->qdcount) != 0) {
+ p = AliasHandleQuestion(
+ ntohs(nsh->qdcount),
+ (NBTNsQuestion *) p,
+ pmax,
+ &nbtarg
+ );
+ }
+ /* Answer Resource Records */
+ if (ntohs(nsh->ancount) != 0) {
+ p = AliasHandleResource(
+ ntohs(nsh->ancount),
+ (NBTNsResource *) p,
+ pmax,
+ &nbtarg
+ );
+ }
+	/* Authority Resource Records */
+ if (ntohs(nsh->nscount) != 0) {
+ p = AliasHandleResource(
+ ntohs(nsh->nscount),
+ (NBTNsResource *) p,
+ pmax,
+ &nbtarg
+ );
+ }
+	/* Additional Resource Records */
+ if (ntohs(nsh->arcount) != 0) {
+ p = AliasHandleResource(
+ ntohs(nsh->arcount),
+ (NBTNsResource *) p,
+ pmax,
+ &nbtarg
+ );
+ }
+#ifdef LIBALIAS_DEBUG
+ PrintRcode(nsh->rcode);
+#endif
+ return ((p == NULL) ? -1 : 0);
+}
diff --git a/freebsd/sys/netinet/libalias/alias_pptp.c b/freebsd/sys/netinet/libalias/alias_pptp.c
new file mode 100644
index 00000000..f6c7f199
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_pptp.c
@@ -0,0 +1,525 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*
+ * alias_pptp.c
+ *
+ * Copyright (c) 2000 Whistle Communications, Inc.
+ * All rights reserved.
+ *
+ * Subject to the following obligations and disclaimer of warranty, use and
+ * redistribution of this software, in source or object code forms, with or
+ * without modifications are expressly permitted by Whistle Communications;
+ * provided, however, that:
+ * 1. Any and all reproductions of the source or object code must include the
+ * copyright notice above and the following disclaimer of warranties; and
+ * 2. No rights are granted, in any manner or form, to use Whistle
+ * Communications, Inc. trademarks, including the mark "WHISTLE
+ * COMMUNICATIONS" on advertising, endorsements, or otherwise except as
+ * such appears in the above copyright notice or in the software.
+ *
+ * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
+ * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
+ * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
+ * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
+ * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
+ * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
+ * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
+ * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
+ * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
+ * OF SUCH DAMAGE.
+ *
+ * Author: Erik Salander <erik@whistle.com>
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/* Includes */
+#ifdef _KERNEL
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/limits.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/module.h>
+#else
+#include <freebsd/errno.h>
+#include <freebsd/limits.h>
+#include <freebsd/sys/types.h>
+#include <freebsd/stdio.h>
+#endif
+
+#include <freebsd/netinet/tcp.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias.h>
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/libalias/alias_mod.h>
+#else
+#include <freebsd/local/alias.h>
+#include <freebsd/local/alias_local.h>
+#include <freebsd/local/alias_mod.h>
+#endif
+
+#define PPTP_CONTROL_PORT_NUMBER 1723
+
+static void
+AliasHandlePptpOut(struct libalias *, struct ip *, struct alias_link *);
+
+static void
+AliasHandlePptpIn(struct libalias *, struct ip *, struct alias_link *);
+
+static int
+AliasHandlePptpGreOut(struct libalias *, struct ip *);
+
+static int
+AliasHandlePptpGreIn(struct libalias *, struct ip *);
+
+static int
+fingerprint(struct libalias *la, struct alias_data *ah)
+{
+
+ if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL)
+ return (-1);
+ if (ntohs(*ah->dport) == PPTP_CONTROL_PORT_NUMBER
+ || ntohs(*ah->sport) == PPTP_CONTROL_PORT_NUMBER)
+ return (0);
+ return (-1);
+}
+
+static int
+fingerprintgre(struct libalias *la, struct alias_data *ah)
+{
+
+ return (0);
+}
+
+static int
+protohandlerin(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ AliasHandlePptpIn(la, pip, ah->lnk);
+ return (0);
+}
+
+static int
+protohandlerout(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ AliasHandlePptpOut(la, pip, ah->lnk);
+ return (0);
+}
+
+static int
+protohandlergrein(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY ||
+ AliasHandlePptpGreIn(la, pip) == 0)
+ return (0);
+ return (-1);
+}
+
+static int
+protohandlergreout(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ if (AliasHandlePptpGreOut(la, pip) == 0)
+ return (0);
+ return (-1);
+}
+
+/* Kernel module definition. */
+struct proto_handler handlers[] = {
+ {
+ .pri = 200,
+ .dir = IN,
+ .proto = TCP,
+ .fingerprint = &fingerprint,
+ .protohandler = &protohandlerin
+ },
+ {
+ .pri = 210,
+ .dir = OUT,
+ .proto = TCP,
+ .fingerprint = &fingerprint,
+ .protohandler = &protohandlerout
+ },
+/*
+ * WATCH OUT!!! These 2 handlers NEED a priority of INT_MAX (the highest
+ * possible) because they will ALWAYS process packets, so they must be the
+ * last ones in the chain; see fingerprintgre() above.
+ */
+ {
+ .pri = INT_MAX,
+ .dir = IN,
+ .proto = IP,
+ .fingerprint = &fingerprintgre,
+ .protohandler = &protohandlergrein
+ },
+ {
+ .pri = INT_MAX,
+ .dir = OUT,
+ .proto = IP,
+ .fingerprint = &fingerprintgre,
+ .protohandler = &protohandlergreout
+ },
+ { EOH }
+};
+static int
+mod_handler(module_t mod, int type, void *data)
+{
+ int error;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = 0;
+ LibAliasAttachHandlers(handlers);
+ break;
+ case MOD_UNLOAD:
+ error = 0;
+ LibAliasDetachHandlers(handlers);
+ break;
+ default:
+ error = EINVAL;
+ }
+ return (error);
+}
+
+#ifdef _KERNEL
+static
+#endif
+moduledata_t alias_mod = {
+ "alias_pptp", mod_handler, NULL
+};
+
+#ifdef _KERNEL
+DECLARE_MODULE(alias_pptp, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
+MODULE_VERSION(alias_pptp, 1);
+MODULE_DEPEND(alias_pptp, libalias, 1, 1, 1);
+#endif
+
+/*
+ Alias_pptp.c performs special processing for PPTP sessions under TCP.
+ Specifically, watch PPTP control messages and alias the Call ID or the
+ Peer's Call ID in the appropriate messages. Note that PPTP requires
+ "de-aliasing" of incoming packets; this differs from the other TCP
+ applications that are currently aliased (i.e. FTP, IRC and RTSP).
+
+ For Call IDs encountered for the first time, a PPTP alias link is created.
+ The PPTP alias link uses the Call ID in place of the original port number.
+ An alias Call ID is created.
+
+ For this routine to work, the PPTP control messages must fit entirely
+ into a single TCP packet. This is typically the case, but is not
+ required by the spec.
+
+ Unlike some of the other TCP applications that are aliased (i.e. FTP,
+ IRC and RTSP), the PPTP control messages that need to be aliased are
+ guaranteed to remain the same length. The aliased Call ID is a fixed
+ length field.
+
+ Reference: RFC 2637
+
+ Initial version: May, 2000 (eds)
+
+*/
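+
+/*
+   Illustrative example (all addresses and Call IDs below are hypothetical):
+   suppose a local client 192.168.0.10 opens a PPTP call to a server at
+   198.51.100.9 through a NAT whose alias address is 203.0.113.2, using
+   Call ID 7 in its OutCallRequest.  AliasHandlePptpOut() then creates a
+   PPTP link with AddPptp(la, original address, destination address,
+   alias address, original Call ID), rewrites the Call ID in the outgoing
+   control message to GetAliasPort() of that link, and folds the 16-bit
+   difference into the TCP checksum with ADJUST_CHECKSUM().  Subsequent GRE
+   data packets are matched to the same link by Call ID rather than by a
+   port number.
+*/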
+
+/*
+ * PPTP definitions
+ */
+
+struct grehdr { /* Enhanced GRE header. */
+ u_int16_t gh_flags; /* Flags. */
+ u_int16_t gh_protocol; /* Protocol type. */
+ u_int16_t gh_length; /* Payload length. */
+ u_int16_t gh_call_id; /* Call ID. */
+ u_int32_t gh_seq_no; /* Sequence number (optional). */
+ u_int32_t gh_ack_no; /* Acknowledgment number
+ * (optional). */
+};
+typedef struct grehdr GreHdr;
+
+/* The PPTP protocol ID used in the GRE 'proto' field. */
+#define PPTP_GRE_PROTO 0x880b
+
+/* Bits that must be set a certain way in all PPTP/GRE packets. */
+#define PPTP_INIT_VALUE ((0x2001 << 16) | PPTP_GRE_PROTO)
+#define PPTP_INIT_MASK 0xef7fffff
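+
+/*
+ * That is, PPTP_INIT_VALUE requires the K (Key present) flag, GRE version 1
+ * and the PPTP protocol type, while PPTP_INIT_MASK ignores only the S
+ * (sequence number present) and A (acknowledgment number present) flag
+ * bits, which legitimately vary from packet to packet.
+ */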
+
+#define PPTP_MAGIC 0x1a2b3c4d
+#define PPTP_CTRL_MSG_TYPE 1
+
+enum {
+ PPTP_StartCtrlConnRequest = 1,
+ PPTP_StartCtrlConnReply = 2,
+ PPTP_StopCtrlConnRequest = 3,
+ PPTP_StopCtrlConnReply = 4,
+ PPTP_EchoRequest = 5,
+ PPTP_EchoReply = 6,
+ PPTP_OutCallRequest = 7,
+ PPTP_OutCallReply = 8,
+ PPTP_InCallRequest = 9,
+ PPTP_InCallReply = 10,
+ PPTP_InCallConn = 11,
+ PPTP_CallClearRequest = 12,
+ PPTP_CallDiscNotify = 13,
+ PPTP_WanErrorNotify = 14,
+ PPTP_SetLinkInfo = 15
+};
+
+ /* Message structures */
+struct pptpMsgHead {
+ u_int16_t length; /* total length */
+ u_int16_t msgType;/* PPTP message type */
+ u_int32_t magic; /* magic cookie */
+ u_int16_t type; /* control message type */
+ u_int16_t resv0; /* reserved */
+};
+typedef struct pptpMsgHead *PptpMsgHead;
+
+struct pptpCodes {
+ u_int8_t resCode;/* Result Code */
+ u_int8_t errCode;/* Error Code */
+};
+typedef struct pptpCodes *PptpCode;
+
+struct pptpCallIds {
+ u_int16_t cid1; /* Call ID field #1 */
+ u_int16_t cid2; /* Call ID field #2 */
+};
+typedef struct pptpCallIds *PptpCallId;
+
+static PptpCallId AliasVerifyPptp(struct ip *, u_int16_t *);
+
+
+static void
+AliasHandlePptpOut(struct libalias *la,
+ struct ip *pip, /* IP packet to examine/patch */
+ struct alias_link *lnk)
+{ /* The PPTP control link */
+ struct alias_link *pptp_lnk;
+ PptpCallId cptr;
+ PptpCode codes;
+ u_int16_t ctl_type; /* control message type */
+ struct tcphdr *tc;
+
+ /* Verify valid PPTP control message */
+ if ((cptr = AliasVerifyPptp(pip, &ctl_type)) == NULL)
+ return;
+
+ /* Modify certain PPTP messages */
+ switch (ctl_type) {
+ case PPTP_OutCallRequest:
+ case PPTP_OutCallReply:
+ case PPTP_InCallRequest:
+ case PPTP_InCallReply:
+ /*
+ * Establish PPTP link for address and Call ID found in
+ * control message.
+ */
+ pptp_lnk = AddPptp(la, GetOriginalAddress(lnk), GetDestAddress(lnk),
+ GetAliasAddress(lnk), cptr->cid1);
+ break;
+ case PPTP_CallClearRequest:
+ case PPTP_CallDiscNotify:
+ /*
+ * Find PPTP link for address and Call ID found in control
+ * message.
+ */
+ pptp_lnk = FindPptpOutByCallId(la, GetOriginalAddress(lnk),
+ GetDestAddress(lnk),
+ cptr->cid1);
+ break;
+ default:
+ return;
+ }
+
+ if (pptp_lnk != NULL) {
+ int accumulate = cptr->cid1;
+
+ /* alias the Call Id */
+ cptr->cid1 = GetAliasPort(pptp_lnk);
+
+ /* Compute TCP checksum for revised packet */
+ tc = (struct tcphdr *)ip_next(pip);
+ accumulate -= cptr->cid1;
+ ADJUST_CHECKSUM(accumulate, tc->th_sum);
+
+ switch (ctl_type) {
+ case PPTP_OutCallReply:
+ case PPTP_InCallReply:
+ codes = (PptpCode) (cptr + 1);
+ if (codes->resCode == 1) /* Connection
+ * established, */
+ SetDestCallId(pptp_lnk, /* note the Peer's Call
+ * ID. */
+ cptr->cid2);
+ else
+ SetExpire(pptp_lnk, 0); /* Connection refused. */
+ break;
+ case PPTP_CallDiscNotify: /* Connection closed. */
+ SetExpire(pptp_lnk, 0);
+ break;
+ }
+ }
+}
+
+static void
+AliasHandlePptpIn(struct libalias *la,
+ struct ip *pip, /* IP packet to examine/patch */
+ struct alias_link *lnk)
+{ /* The PPTP control link */
+ struct alias_link *pptp_lnk;
+ PptpCallId cptr;
+ u_int16_t *pcall_id;
+ u_int16_t ctl_type; /* control message type */
+ struct tcphdr *tc;
+
+ /* Verify valid PPTP control message */
+ if ((cptr = AliasVerifyPptp(pip, &ctl_type)) == NULL)
+ return;
+
+ /* Modify certain PPTP messages */
+ switch (ctl_type) {
+ case PPTP_InCallConn:
+ case PPTP_WanErrorNotify:
+ case PPTP_SetLinkInfo:
+ pcall_id = &cptr->cid1;
+ break;
+ case PPTP_OutCallReply:
+ case PPTP_InCallReply:
+ pcall_id = &cptr->cid2;
+ break;
+ case PPTP_CallDiscNotify: /* Connection closed. */
+ pptp_lnk = FindPptpInByCallId(la, GetDestAddress(lnk),
+ GetAliasAddress(lnk),
+ cptr->cid1);
+ if (pptp_lnk != NULL)
+ SetExpire(pptp_lnk, 0);
+ return;
+ default:
+ return;
+ }
+
+ /* Find PPTP link for address and Call ID found in PPTP Control Msg */
+ pptp_lnk = FindPptpInByPeerCallId(la, GetDestAddress(lnk),
+ GetAliasAddress(lnk),
+ *pcall_id);
+
+ if (pptp_lnk != NULL) {
+ int accumulate = *pcall_id;
+
+ /* De-alias the Peer's Call Id. */
+ *pcall_id = GetOriginalPort(pptp_lnk);
+
+ /* Compute TCP checksum for modified packet */
+ tc = (struct tcphdr *)ip_next(pip);
+ accumulate -= *pcall_id;
+ ADJUST_CHECKSUM(accumulate, tc->th_sum);
+
+ if (ctl_type == PPTP_OutCallReply || ctl_type == PPTP_InCallReply) {
+ PptpCode codes = (PptpCode) (cptr + 1);
+
+ if (codes->resCode == 1) /* Connection
+ * established, */
+ SetDestCallId(pptp_lnk, /* note the Call ID. */
+ cptr->cid1);
+ else
+ SetExpire(pptp_lnk, 0); /* Connection refused. */
+ }
+ }
+}
+
+static PptpCallId
+AliasVerifyPptp(struct ip *pip, u_int16_t * ptype)
+{ /* IP packet to examine/patch */
+ int hlen, tlen, dlen;
+ PptpMsgHead hptr;
+ struct tcphdr *tc;
+
+ /* Calculate some lengths */
+ tc = (struct tcphdr *)ip_next(pip);
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+ /* Verify data length */
+ if (dlen < (int)(sizeof(struct pptpMsgHead) + sizeof(struct pptpCallIds)))
+ return (NULL);
+
+ /* Move up to PPTP message header */
+ hptr = (PptpMsgHead) tcp_next(tc);
+
+ /* Return the control message type */
+ *ptype = ntohs(hptr->type);
+
+ /* Verify PPTP Control Message */
+ if ((ntohs(hptr->msgType) != PPTP_CTRL_MSG_TYPE) ||
+ (ntohl(hptr->magic) != PPTP_MAGIC))
+ return (NULL);
+
+ /* Verify data length. */
+ if ((*ptype == PPTP_OutCallReply || *ptype == PPTP_InCallReply) &&
+ (dlen < (int)(sizeof(struct pptpMsgHead) + sizeof(struct pptpCallIds) +
+ sizeof(struct pptpCodes))))
+ return (NULL);
+ else
+ return (PptpCallId) (hptr + 1);
+}
+
+static int
+AliasHandlePptpGreOut(struct libalias *la, struct ip *pip)
+{
+ GreHdr *gr;
+ struct alias_link *lnk;
+
+ gr = (GreHdr *) ip_next(pip);
+
+ /* Check GRE header bits. */
+ if ((ntohl(*((u_int32_t *) gr)) & PPTP_INIT_MASK) != PPTP_INIT_VALUE)
+ return (-1);
+
+ lnk = FindPptpOutByPeerCallId(la, pip->ip_src, pip->ip_dst, gr->gh_call_id);
+ if (lnk != NULL) {
+ struct in_addr alias_addr = GetAliasAddress(lnk);
+
+ /* Change source IP address. */
+ DifferentialChecksum(&pip->ip_sum,
+ &alias_addr, &pip->ip_src, 2);
+ pip->ip_src = alias_addr;
+ }
+ return (0);
+}
+
+static int
+AliasHandlePptpGreIn(struct libalias *la, struct ip *pip)
+{
+ GreHdr *gr;
+ struct alias_link *lnk;
+
+ gr = (GreHdr *) ip_next(pip);
+
+ /* Check GRE header bits. */
+ if ((ntohl(*((u_int32_t *) gr)) & PPTP_INIT_MASK) != PPTP_INIT_VALUE)
+ return (-1);
+
+ lnk = FindPptpInByPeerCallId(la, pip->ip_src, pip->ip_dst, gr->gh_call_id);
+ if (lnk != NULL) {
+ struct in_addr src_addr = GetOriginalAddress(lnk);
+
+ /* De-alias the Peer's Call Id. */
+ gr->gh_call_id = GetOriginalPort(lnk);
+
+ /* Restore original IP address. */
+ DifferentialChecksum(&pip->ip_sum,
+ &src_addr, &pip->ip_dst, 2);
+ pip->ip_dst = src_addr;
+ }
+ return (0);
+}
diff --git a/freebsd/sys/netinet/libalias/alias_proxy.c b/freebsd/sys/netinet/libalias/alias_proxy.c
new file mode 100644
index 00000000..f4f2b643
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_proxy.c
@@ -0,0 +1,870 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/* file: alias_proxy.c
+
+ This file encapsulates special operations related to transparent
+ proxy redirection. This is where packets with a particular destination,
+ usually tcp port 80, are redirected to a proxy server.
+
+ When packets are proxied, the destination address and port are
+ modified. In certain cases, it is necessary to somehow encode
+ the original address/port info into the packet. Two methods are
+ presently supported: addition of a [DEST addr port] string at the
+ beginning of a tcp stream, or inclusion of an optional field
+ in the IP header.
+
+ There is one public API function:
+
+ PacketAliasProxyRule() -- Adds and deletes proxy
+ rules.
+
+ Rules are stored in a linear linked list, so lookup efficiency
+ won't be too good for large lists.
+
+
+ Initial development: April, 1998 (cjm)
+*/
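+
+/*
+   For illustration only (the addresses, port and rule number are made up,
+   and la is a libalias instance obtained from LibAliasInit()): a rule that
+   redirects outbound HTTP to a transparent proxy at 203.0.113.10:3128,
+   encoding the original destination in an IP option, could be installed
+   through the LibAliasProxyRule() entry point defined below:
+
+	LibAliasProxyRule(la,
+	    "server 203.0.113.10:3128 port 80 proto tcp type encode_ip_hdr");
+
+   and removed again with
+
+	LibAliasProxyRule(la, "delete 0");
+
+   (0 being the default rule number when no "rule n" clause is given).
+*/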
+
+
+/* System includes */
+#ifdef _KERNEL
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/ctype.h>
+#include <freebsd/sys/libkern.h>
+#include <freebsd/sys/limits.h>
+#else
+#include <freebsd/sys/types.h>
+#include <freebsd/ctype.h>
+#include <freebsd/stdio.h>
+#include <freebsd/stdlib.h>
+#include <freebsd/netdb.h>
+#include <freebsd/string.h>
+#endif
+
+#include <freebsd/netinet/tcp.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias.h>
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/libalias/alias_mod.h>
+#else
+#include <freebsd/arpa/inet.h>
+#include <freebsd/local/alias.h> /* Public API functions for libalias */
+#include <freebsd/local/alias_local.h> /* Functions used by alias*.c */
+#endif
+
+/*
+ Data structures
+ */
+
+/*
+ * A linked list of arbitrary length, based on struct proxy_entry is
+ * used to store proxy rules.
+ */
+struct proxy_entry {
+ struct libalias *la;
+#define PROXY_TYPE_ENCODE_NONE 1
+#define PROXY_TYPE_ENCODE_TCPSTREAM 2
+#define PROXY_TYPE_ENCODE_IPHDR 3
+ int rule_index;
+ int proxy_type;
+ u_char proto;
+ u_short proxy_port;
+ u_short server_port;
+
+ struct in_addr server_addr;
+
+ struct in_addr src_addr;
+ struct in_addr src_mask;
+
+ struct in_addr dst_addr;
+ struct in_addr dst_mask;
+
+ struct proxy_entry *next;
+ struct proxy_entry *last;
+};
+
+
+
+/*
+ File scope variables
+*/
+
+
+
+/* Local (static) functions:
+
+ IpMask() -- Utility function for creating IP
+ masks from integer (1-32) specification.
+ IpAddr() -- Utility function for converting string
+ to IP address
+ IpPort() -- Utility function for converting string
+ to port number
+ RuleAdd() -- Adds an element to the rule list.
+ RuleDelete() -- Removes an element from the rule list.
+ RuleNumberDelete() -- Removes all elements from the rule list
+ having a certain rule number.
+ ProxyEncodeTcpStream() -- Adds [DEST x.x.x.x xxxx] to the beginning
+ of a TCP stream.
+ ProxyEncodeIpHeader() -- Adds an IP option indicating the true
+ destination of a proxied IP packet
+*/
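+
+/*
+   For example, IpMask(24, &mask) produces the network-order mask
+   255.255.255.0 (htonl(0xffffff00)), while IpMask(0, &mask) produces
+   0.0.0.0, which matches any address.
+*/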
+
+static int IpMask(int, struct in_addr *);
+static int IpAddr(char *, struct in_addr *);
+static int IpPort(char *, int, int *);
+static void RuleAdd(struct libalias *la, struct proxy_entry *);
+static void RuleDelete(struct proxy_entry *);
+static int RuleNumberDelete(struct libalias *la, int);
+static void ProxyEncodeTcpStream(struct alias_link *, struct ip *, int);
+static void ProxyEncodeIpHeader(struct ip *, int);
+
+static int
+IpMask(int nbits, struct in_addr *mask)
+{
+ int i;
+ u_int imask;
+
+ if (nbits < 0 || nbits > 32)
+ return (-1);
+
+ imask = 0;
+ for (i = 0; i < nbits; i++)
+ imask = (imask >> 1) + 0x80000000;
+ mask->s_addr = htonl(imask);
+
+ return (0);
+}
+
+static int
+IpAddr(char *s, struct in_addr *addr)
+{
+ if (inet_aton(s, addr) == 0)
+ return (-1);
+ else
+ return (0);
+}
+
+static int
+IpPort(char *s, int proto, int *port)
+{
+ int n;
+
+ n = sscanf(s, "%d", port);
+ if (n != 1)
+#ifndef _KERNEL /* XXX: we accept only numeric ports in kernel */
+ {
+ struct servent *se;
+
+ if (proto == IPPROTO_TCP)
+ se = getservbyname(s, "tcp");
+ else if (proto == IPPROTO_UDP)
+ se = getservbyname(s, "udp");
+ else
+ return (-1);
+
+ if (se == NULL)
+ return (-1);
+
+ *port = (u_int) ntohs(se->s_port);
+ }
+#else
+ return (-1);
+#endif
+ return (0);
+}
+
+static void
+RuleAdd(struct libalias *la, struct proxy_entry *entry)
+{
+ int rule_index;
+ struct proxy_entry *ptr;
+ struct proxy_entry *ptr_last;
+
+ LIBALIAS_LOCK_ASSERT(la);
+
+	entry->la = la;
+	if (la->proxyList == NULL) {
+		la->proxyList = entry;
+		entry->last = NULL;
+		entry->next = NULL;
+		return;
+	}
+
+ rule_index = entry->rule_index;
+ ptr = la->proxyList;
+ ptr_last = NULL;
+ while (ptr != NULL) {
+ if (ptr->rule_index >= rule_index) {
+ if (ptr_last == NULL) {
+ entry->next = la->proxyList;
+ entry->last = NULL;
+ la->proxyList->last = entry;
+ la->proxyList = entry;
+ return;
+ }
+ ptr_last->next = entry;
+ ptr->last = entry;
+			entry->last = ptr_last;
+ entry->next = ptr;
+ return;
+ }
+ ptr_last = ptr;
+ ptr = ptr->next;
+ }
+
+ ptr_last->next = entry;
+ entry->last = ptr_last;
+ entry->next = NULL;
+}
+
+static void
+RuleDelete(struct proxy_entry *entry)
+{
+ struct libalias *la;
+
+ la = entry->la;
+ LIBALIAS_LOCK_ASSERT(la);
+ if (entry->last != NULL)
+ entry->last->next = entry->next;
+ else
+ la->proxyList = entry->next;
+
+ if (entry->next != NULL)
+ entry->next->last = entry->last;
+
+ free(entry);
+}
+
+static int
+RuleNumberDelete(struct libalias *la, int rule_index)
+{
+ int err;
+ struct proxy_entry *ptr;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ err = -1;
+ ptr = la->proxyList;
+ while (ptr != NULL) {
+ struct proxy_entry *ptr_next;
+
+ ptr_next = ptr->next;
+ if (ptr->rule_index == rule_index) {
+ err = 0;
+ RuleDelete(ptr);
+ }
+ ptr = ptr_next;
+ }
+
+ return (err);
+}
+
+static void
+ProxyEncodeTcpStream(struct alias_link *lnk,
+ struct ip *pip,
+ int maxpacketsize)
+{
+ int slen;
+ char buffer[40];
+ struct tcphdr *tc;
+
+/* Compute pointer to tcp header */
+ tc = (struct tcphdr *)ip_next(pip);
+
+/* Don't modify if once already modified */
+
+ if (GetAckModified(lnk))
+ return;
+
+/* Translate destination address and port to string form */
+ snprintf(buffer, sizeof(buffer) - 2, "[DEST %s %d]",
+ inet_ntoa(GetProxyAddress(lnk)), (u_int) ntohs(GetProxyPort(lnk)));
+
+/* Pad string out to a multiple of two in length */
+ slen = strlen(buffer);
+ switch (slen % 2) {
+ case 0:
+ strcat(buffer, " \n");
+ slen += 2;
+ break;
+ case 1:
+ strcat(buffer, "\n");
+ slen += 1;
+ }
+
+/* Check for packet overflow */
+ if ((int)(ntohs(pip->ip_len) + strlen(buffer)) > maxpacketsize)
+ return;
+
+/* Shift existing TCP data and insert destination string */
+ {
+ int dlen;
+ int hlen;
+ char *p;
+
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ dlen = ntohs(pip->ip_len) - hlen;
+
+/* Modify first packet that has data in it */
+
+ if (dlen == 0)
+ return;
+
+ p = (char *)pip;
+ p += hlen;
+
+ bcopy(p, p + slen, dlen);
+ memcpy(p, buffer, slen);
+ }
+
+/* Save information about modified sequence number */
+ {
+ int delta;
+
+ SetAckModified(lnk);
+ tc = (struct tcphdr *)ip_next(pip);
+ delta = GetDeltaSeqOut(tc->th_seq, lnk);
+ AddSeq(lnk, delta + slen, pip->ip_hl, pip->ip_len, tc->th_seq,
+ tc->th_off);
+ }
+
+/* Update IP header packet length and checksum */
+ {
+ int accumulate;
+
+ accumulate = pip->ip_len;
+ pip->ip_len = htons(ntohs(pip->ip_len) + slen);
+ accumulate -= pip->ip_len;
+
+ ADJUST_CHECKSUM(accumulate, pip->ip_sum);
+ }
+
+/* Update TCP checksum; use TcpChecksum() since so many things have
+   already changed. */
+
+ tc->th_sum = 0;
+#ifdef _KERNEL
+ tc->th_x2 = 1;
+#else
+ tc->th_sum = TcpChecksum(pip);
+#endif
+}
+
+static void
+ProxyEncodeIpHeader(struct ip *pip,
+ int maxpacketsize)
+{
+#define OPTION_LEN_BYTES 8
+#define OPTION_LEN_INT16 4
+#define OPTION_LEN_INT32 2
+ u_char option[OPTION_LEN_BYTES];
+
+#ifdef LIBALIAS_DEBUG
+ fprintf(stdout, " ip cksum 1 = %x\n", (u_int) IpChecksum(pip));
+ fprintf(stdout, "tcp cksum 1 = %x\n", (u_int) TcpChecksum(pip));
+#endif
+
+ (void)maxpacketsize;
+
+/* Check to see that there is room to add an IP option */
+ if (pip->ip_hl > (0x0f - OPTION_LEN_INT32))
+ return;
+
+/* Build option and copy into packet */
+ {
+ u_char *ptr;
+ struct tcphdr *tc;
+
+ ptr = (u_char *) pip;
+ ptr += 20;
+ memcpy(ptr + OPTION_LEN_BYTES, ptr, ntohs(pip->ip_len) - 20);
+
+ option[0] = 0x64; /* class: 3 (reserved), option 4 */
+ option[1] = OPTION_LEN_BYTES;
+
+ memcpy(&option[2], (u_char *) & pip->ip_dst, 4);
+
+ tc = (struct tcphdr *)ip_next(pip);
+ memcpy(&option[6], (u_char *) & tc->th_sport, 2);
+
+ memcpy(ptr, option, 8);
+ }
+
+/* Update checksum, header length and packet length */
+ {
+ int i;
+ int accumulate;
+ u_short *sptr;
+
+ sptr = (u_short *) option;
+ accumulate = 0;
+ for (i = 0; i < OPTION_LEN_INT16; i++)
+ accumulate -= *(sptr++);
+
+ sptr = (u_short *) pip;
+ accumulate += *sptr;
+ pip->ip_hl += OPTION_LEN_INT32;
+ accumulate -= *sptr;
+
+ accumulate += pip->ip_len;
+ pip->ip_len = htons(ntohs(pip->ip_len) + OPTION_LEN_BYTES);
+ accumulate -= pip->ip_len;
+
+ ADJUST_CHECKSUM(accumulate, pip->ip_sum);
+ }
+#undef OPTION_LEN_BYTES
+#undef OPTION_LEN_INT16
+#undef OPTION_LEN_INT32
+#ifdef LIBALIAS_DEBUG
+ fprintf(stdout, " ip cksum 2 = %x\n", (u_int) IpChecksum(pip));
+ fprintf(stdout, "tcp cksum 2 = %x\n", (u_int) TcpChecksum(pip));
+#endif
+}
+
+
+/* Functions used by other packet alias source files
+
+ ProxyCheck() -- Checks whether an outgoing packet should
+ be proxied.
+ ProxyModify() -- Encodes the original destination address/port
+ for a packet which is to be redirected to
+ a proxy server.
+*/
+
+int
+ProxyCheck(struct libalias *la, struct in_addr *proxy_server_addr,
+ u_short * proxy_server_port, struct in_addr src_addr,
+ struct in_addr dst_addr, u_short dst_port, u_char ip_p)
+{
+ struct proxy_entry *ptr;
+
+ LIBALIAS_LOCK_ASSERT(la);
+
+ ptr = la->proxyList;
+ while (ptr != NULL) {
+ u_short proxy_port;
+
+ proxy_port = ptr->proxy_port;
+ if ((dst_port == proxy_port || proxy_port == 0)
+ && ip_p == ptr->proto
+ && src_addr.s_addr != ptr->server_addr.s_addr) {
+ struct in_addr src_addr_masked;
+ struct in_addr dst_addr_masked;
+
+ src_addr_masked.s_addr = src_addr.s_addr & ptr->src_mask.s_addr;
+ dst_addr_masked.s_addr = dst_addr.s_addr & ptr->dst_mask.s_addr;
+
+ if ((src_addr_masked.s_addr == ptr->src_addr.s_addr)
+ && (dst_addr_masked.s_addr == ptr->dst_addr.s_addr)) {
+ if ((*proxy_server_port = ptr->server_port) == 0)
+ *proxy_server_port = dst_port;
+ *proxy_server_addr = ptr->server_addr;
+ return (ptr->proxy_type);
+ }
+ }
+ ptr = ptr->next;
+ }
+
+ return (0);
+}
+
+void
+ProxyModify(struct libalias *la, struct alias_link *lnk,
+ struct ip *pip,
+ int maxpacketsize,
+ int proxy_type)
+{
+
+ LIBALIAS_LOCK_ASSERT(la);
+ (void)la;
+
+ switch (proxy_type) {
+ case PROXY_TYPE_ENCODE_IPHDR:
+ ProxyEncodeIpHeader(pip, maxpacketsize);
+ break;
+
+ case PROXY_TYPE_ENCODE_TCPSTREAM:
+ ProxyEncodeTcpStream(lnk, pip, maxpacketsize);
+ break;
+ }
+}
+
+
+/*
+ Public API functions
+*/
+
+int
+LibAliasProxyRule(struct libalias *la, const char *cmd)
+{
+/*
+ * This function takes command strings of the form:
+ *
+ * server <addr>[:<port>]
+ * [port <port>]
+ * [rule n]
+ * [proto tcp|udp]
+ * [src <addr>[/n]]
+ * [dst <addr>[/n]]
+ * [type encode_tcp_stream|encode_ip_hdr|no_encode]
+ *
+ * delete <rule number>
+ *
+ * Subfields can be in arbitrary order. Port numbers and addresses
+ * must be in either numeric or symbolic form. An optional rule number
+ * is used to control the order in which rules are searched. If two
+ * rules have the same number, then search order cannot be guaranteed,
+ * and the rules should be disjoint. If no rule number is specified,
+ * then 0 is used, and group 0 rules are always checked before any
+ * others.
+ */
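+/*
+ * Illustrative command strings (addresses and rule numbers are examples
+ * only):
+ *
+ *	"rule 10 server 203.0.113.10:3128 port 80 proto tcp src 192.168.0.0/24"
+ *	"delete 10"
+ */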
+ int i, n, len, ret;
+ int cmd_len;
+ int token_count;
+ int state;
+ char *token;
+ char buffer[256];
+ char str_port[sizeof(buffer)];
+ char str_server_port[sizeof(buffer)];
+ char *res = buffer;
+
+ int rule_index;
+ int proto;
+ int proxy_type;
+ int proxy_port;
+ int server_port;
+ struct in_addr server_addr;
+ struct in_addr src_addr, src_mask;
+ struct in_addr dst_addr, dst_mask;
+ struct proxy_entry *proxy_entry;
+
+ LIBALIAS_LOCK(la);
+ ret = 0;
+/* Copy command line into a buffer */
+ cmd += strspn(cmd, " \t");
+ cmd_len = strlen(cmd);
+ if (cmd_len > (int)(sizeof(buffer) - 1)) {
+ ret = -1;
+ goto getout;
+ }
+ strcpy(buffer, cmd);
+
+/* Convert to lower case */
+ len = strlen(buffer);
+ for (i = 0; i < len; i++)
+ buffer[i] = tolower((unsigned char)buffer[i]);
+
+/* Set default proxy type */
+
+/* Set up default values */
+ rule_index = 0;
+ proxy_type = PROXY_TYPE_ENCODE_NONE;
+ proto = IPPROTO_TCP;
+ proxy_port = 0;
+ server_addr.s_addr = 0;
+ server_port = 0;
+ src_addr.s_addr = 0;
+ IpMask(0, &src_mask);
+ dst_addr.s_addr = 0;
+ IpMask(0, &dst_mask);
+
+ str_port[0] = 0;
+ str_server_port[0] = 0;
+
+/* Parse command string with state machine */
+#define STATE_READ_KEYWORD 0
+#define STATE_READ_TYPE 1
+#define STATE_READ_PORT 2
+#define STATE_READ_SERVER 3
+#define STATE_READ_RULE 4
+#define STATE_READ_DELETE 5
+#define STATE_READ_PROTO 6
+#define STATE_READ_SRC 7
+#define STATE_READ_DST 8
+ state = STATE_READ_KEYWORD;
+ token = strsep(&res, " \t");
+ token_count = 0;
+ while (token != NULL) {
+ token_count++;
+ switch (state) {
+ case STATE_READ_KEYWORD:
+ if (strcmp(token, "type") == 0)
+ state = STATE_READ_TYPE;
+ else if (strcmp(token, "port") == 0)
+ state = STATE_READ_PORT;
+ else if (strcmp(token, "server") == 0)
+ state = STATE_READ_SERVER;
+ else if (strcmp(token, "rule") == 0)
+ state = STATE_READ_RULE;
+ else if (strcmp(token, "delete") == 0)
+ state = STATE_READ_DELETE;
+ else if (strcmp(token, "proto") == 0)
+ state = STATE_READ_PROTO;
+ else if (strcmp(token, "src") == 0)
+ state = STATE_READ_SRC;
+ else if (strcmp(token, "dst") == 0)
+ state = STATE_READ_DST;
+ else {
+ ret = -1;
+ goto getout;
+ }
+ break;
+
+ case STATE_READ_TYPE:
+ if (strcmp(token, "encode_ip_hdr") == 0)
+ proxy_type = PROXY_TYPE_ENCODE_IPHDR;
+ else if (strcmp(token, "encode_tcp_stream") == 0)
+ proxy_type = PROXY_TYPE_ENCODE_TCPSTREAM;
+ else if (strcmp(token, "no_encode") == 0)
+ proxy_type = PROXY_TYPE_ENCODE_NONE;
+ else {
+ ret = -1;
+ goto getout;
+ }
+ state = STATE_READ_KEYWORD;
+ break;
+
+ case STATE_READ_PORT:
+ strcpy(str_port, token);
+ state = STATE_READ_KEYWORD;
+ break;
+
+ case STATE_READ_SERVER:
+ {
+ int err;
+ char *p;
+ char s[sizeof(buffer)];
+
+ p = token;
+ while (*p != ':' && *p != 0)
+ p++;
+
+ if (*p != ':') {
+ err = IpAddr(token, &server_addr);
+ if (err) {
+ ret = -1;
+ goto getout;
+ }
+ } else {
+ *p = ' ';
+
+ n = sscanf(token, "%s %s", s, str_server_port);
+ if (n != 2) {
+ ret = -1;
+ goto getout;
+ }
+
+ err = IpAddr(s, &server_addr);
+ if (err) {
+ ret = -1;
+ goto getout;
+ }
+ }
+ }
+ state = STATE_READ_KEYWORD;
+ break;
+
+ case STATE_READ_RULE:
+ n = sscanf(token, "%d", &rule_index);
+ if (n != 1 || rule_index < 0) {
+ ret = -1;
+ goto getout;
+ }
+ state = STATE_READ_KEYWORD;
+ break;
+
+ case STATE_READ_DELETE:
+ {
+ int err;
+ int rule_to_delete;
+
+ if (token_count != 2) {
+ ret = -1;
+ goto getout;
+ }
+
+ n = sscanf(token, "%d", &rule_to_delete);
+ if (n != 1) {
+ ret = -1;
+ goto getout;
+ }
+			err = RuleNumberDelete(la, rule_to_delete);
+			if (err)
+				ret = -1;
+			else
+				ret = 0;
+			goto getout;
+ }
+
+ case STATE_READ_PROTO:
+ if (strcmp(token, "tcp") == 0)
+ proto = IPPROTO_TCP;
+ else if (strcmp(token, "udp") == 0)
+ proto = IPPROTO_UDP;
+ else {
+ ret = -1;
+ goto getout;
+ }
+ state = STATE_READ_KEYWORD;
+ break;
+
+ case STATE_READ_SRC:
+ case STATE_READ_DST:
+ {
+ int err;
+ char *p;
+ struct in_addr mask;
+ struct in_addr addr;
+
+ p = token;
+ while (*p != '/' && *p != 0)
+ p++;
+
+ if (*p != '/') {
+ IpMask(32, &mask);
+ err = IpAddr(token, &addr);
+ if (err) {
+ ret = -1;
+ goto getout;
+ }
+ } else {
+ int nbits;
+ char s[sizeof(buffer)];
+
+ *p = ' ';
+ n = sscanf(token, "%s %d", s, &nbits);
+ if (n != 2) {
+ ret = -1;
+ goto getout;
+ }
+
+ err = IpAddr(s, &addr);
+ if (err) {
+ ret = -1;
+ goto getout;
+ }
+
+ err = IpMask(nbits, &mask);
+ if (err) {
+ ret = -1;
+ goto getout;
+ }
+ }
+
+ if (state == STATE_READ_SRC) {
+ src_addr = addr;
+ src_mask = mask;
+ } else {
+ dst_addr = addr;
+ dst_mask = mask;
+ }
+ }
+ state = STATE_READ_KEYWORD;
+ break;
+
+ default:
+ ret = -1;
+ goto getout;
+ break;
+ }
+
+ do {
+ token = strsep(&res, " \t");
+ } while (token != NULL && !*token);
+ }
+#undef STATE_READ_KEYWORD
+#undef STATE_READ_TYPE
+#undef STATE_READ_PORT
+#undef STATE_READ_SERVER
+#undef STATE_READ_RULE
+#undef STATE_READ_DELETE
+#undef STATE_READ_PROTO
+#undef STATE_READ_SRC
+#undef STATE_READ_DST
+
+/* Convert port strings to numbers. This needs to be done after
+   the string is parsed, because the protocol might not be specified
+   before the ports (which might be symbolic entries in /etc/services). */
+
+ if (strlen(str_port) != 0) {
+ int err;
+
+ err = IpPort(str_port, proto, &proxy_port);
+ if (err) {
+ ret = -1;
+ goto getout;
+ }
+ } else {
+ proxy_port = 0;
+ }
+
+ if (strlen(str_server_port) != 0) {
+ int err;
+
+ err = IpPort(str_server_port, proto, &server_port);
+ if (err) {
+ ret = -1;
+ goto getout;
+ }
+ } else {
+ server_port = 0;
+ }
+
+/* Check that at least the server address has been defined */
+ if (server_addr.s_addr == 0) {
+ ret = -1;
+ goto getout;
+ }
+
+/* Add to linked list */
+ proxy_entry = malloc(sizeof(struct proxy_entry));
+ if (proxy_entry == NULL) {
+ ret = -1;
+ goto getout;
+ }
+
+ proxy_entry->proxy_type = proxy_type;
+ proxy_entry->rule_index = rule_index;
+ proxy_entry->proto = proto;
+ proxy_entry->proxy_port = htons(proxy_port);
+ proxy_entry->server_port = htons(server_port);
+ proxy_entry->server_addr = server_addr;
+ proxy_entry->src_addr.s_addr = src_addr.s_addr & src_mask.s_addr;
+ proxy_entry->dst_addr.s_addr = dst_addr.s_addr & dst_mask.s_addr;
+ proxy_entry->src_mask = src_mask;
+ proxy_entry->dst_mask = dst_mask;
+
+ RuleAdd(la, proxy_entry);
+
+getout:
+ LIBALIAS_UNLOCK(la);
+ return (ret);
+}
diff --git a/freebsd/sys/netinet/libalias/alias_sctp.c b/freebsd/sys/netinet/libalias/alias_sctp.c
new file mode 100644
index 00000000..cdec258c
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_sctp.c
@@ -0,0 +1,2700 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2008
+ * Swinburne University of Technology, Melbourne, Australia.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Alias_sctp forms part of the libalias kernel module to handle
+ * Network Address Translation (NAT) for the SCTP protocol.
+ *
+ * This software was developed by David A. Hayes and Jason But
+ *
+ * The design is outlined in CAIA technical report number 080618A
+ * (D. Hayes and J. But, "Alias_sctp Version 0.1: SCTP NAT implementation in IPFW")
+ *
+ * Development is part of the CAIA SONATA project,
+ * proposed by Jason But and Grenville Armitage:
+ * http://caia.swin.edu.au/urp/sonata/
+ *
+ *
+ * This project has been made possible in part by a grant from
+ * the Cisco University Research Program Fund at Community
+ * Foundation Silicon Valley.
+ *
+ */
+/** @mainpage
+ * Alias_sctp is part of the SONATA (http://caia.swin.edu.au/urp/sonata) project
+ * to develop and release a BSD licensed implementation of a Network Address
+ * Translation (NAT) module that supports the Stream Control Transmission
+ * Protocol (SCTP).
+ *
+ * Traditional address and port number look ups are inadequate for SCTP's
+ * operation due to both processing requirements and issues with multi-homing.
+ * Alias_sctp integrates with FreeBSD's ipfw/libalias NAT system.
+ *
+ * Version 0.2 features include:
+ * - Support for global multi-homing
+ * - Support for ASCONF modification from Internet Draft
+ * (draft-stewart-behave-sctpnat-04, R. Stewart and M. Tuexen, "Stream control
+ * transmission protocol (SCTP) network address translation," Jul. 2008) to
+ * provide support for multi-homed privately addressed hosts
+ * - Support for forwarding of T-flagged packets
+ * - Generation and delivery of AbortM/ErrorM packets upon detection of NAT
+ * collisions
+ * - Per-port forwarding rules
+ * - Dynamically controllable logging and statistics
+ * - Dynamic management of timers
+ * - Dynamic control of hash-table size
+ */
+
+/* $FreeBSD$ */
+
+#ifdef _KERNEL
+#include <freebsd/machine/stdarg.h>
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/netinet/libalias/alias_sctp.h>
+#include <freebsd/netinet/libalias/alias.h>
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/sctp_crc32.h>
+#include <freebsd/machine/in_cksum.h>
+#else
+#include <freebsd/local/alias_sctp.h>
+#include <freebsd/arpa/inet.h>
+#include <freebsd/local/alias.h>
+#include <freebsd/local/alias_local.h>
+#include <freebsd/machine/in_cksum.h>
+#include <freebsd/sys/libkern.h>
+#endif //#ifdef _KERNEL
+
+/* ----------------------------------------------------------------------
+ * FUNCTION PROTOTYPES
+ * ----------------------------------------------------------------------
+ */
+/* Packet Parsing Functions */
+static int sctp_PktParser(struct libalias *la, int direction, struct ip *pip,
+ struct sctp_nat_msg *sm, struct sctp_nat_assoc **passoc);
+static int GetAsconfVtags(struct libalias *la, struct sctp_nat_msg *sm,
+ uint32_t *l_vtag, uint32_t *g_vtag, int direction);
+static int IsASCONFack(struct libalias *la, struct sctp_nat_msg *sm, int direction);
+
+static void AddGlobalIPAddresses(struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int direction);
+static int Add_Global_Address_to_List(struct sctp_nat_assoc *assoc, struct sctp_GlobalAddress *G_addr);
+static void RmGlobalIPAddresses(struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int direction);
+static int IsADDorDEL(struct libalias *la, struct sctp_nat_msg *sm, int direction);
+
+/* State Machine Functions */
+static int ProcessSctpMsg(struct libalias *la, int direction, \
+ struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc);
+
+static int ID_process(struct libalias *la, int direction,\
+ struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm);
+static int INi_process(struct libalias *la, int direction,\
+ struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm);
+static int INa_process(struct libalias *la, int direction,\
+ struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm);
+static int UP_process(struct libalias *la, int direction,\
+ struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm);
+static int CL_process(struct libalias *la, int direction,\
+ struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm);
+static void TxAbortErrorM(struct libalias *la, struct sctp_nat_msg *sm,\
+ struct sctp_nat_assoc *assoc, int sndrply, int direction);
+
+/* Hash Table Functions */
+static struct sctp_nat_assoc*
+FindSctpLocal(struct libalias *la, struct in_addr l_addr, struct in_addr g_addr, uint32_t l_vtag, uint16_t l_port, uint16_t g_port);
+static struct sctp_nat_assoc*
+FindSctpGlobal(struct libalias *la, struct in_addr g_addr, uint32_t g_vtag, uint16_t g_port, uint16_t l_port, int *partial_match);
+static struct sctp_nat_assoc*
+FindSctpGlobalClash(struct libalias *la, struct sctp_nat_assoc *Cassoc);
+static struct sctp_nat_assoc*
+FindSctpLocalT(struct libalias *la, struct in_addr g_addr, uint32_t l_vtag, uint16_t g_port, uint16_t l_port);
+static struct sctp_nat_assoc*
+FindSctpGlobalT(struct libalias *la, struct in_addr g_addr, uint32_t g_vtag, uint16_t l_port, uint16_t g_port);
+
+static int AddSctpAssocLocal(struct libalias *la, struct sctp_nat_assoc *assoc, struct in_addr g_addr);
+static int AddSctpAssocGlobal(struct libalias *la, struct sctp_nat_assoc *assoc);
+static void RmSctpAssoc(struct libalias *la, struct sctp_nat_assoc *assoc);
+static void freeGlobalAddressList(struct sctp_nat_assoc *assoc);
+
+/* Timer Queue Functions */
+static void sctp_AddTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc);
+static void sctp_RmTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc);
+static void sctp_ResetTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc, int newexp);
+void sctp_CheckTimers(struct libalias *la);
+
+
+/* Logging Functions */
+static void logsctperror(char* errormsg, uint32_t vtag, int error, int direction);
+static void logsctpparse(int direction, struct sctp_nat_msg *sm);
+static void logsctpassoc(struct sctp_nat_assoc *assoc, char *s);
+static void logTimerQ(struct libalias *la);
+static void logSctpGlobal(struct libalias *la);
+static void logSctpLocal(struct libalias *la);
+#ifdef _KERNEL
+static void SctpAliasLog(const char *format, ...);
+#endif
+
+/** @defgroup external External code changes and modifications
+ *
+ * Some changes have been made to files external to alias_sctp.(c|h). These
+ * changes are primarily due to code needing to call static functions within
+ * those files or to perform extra functionality that can only be performed
+ * within these files.
+ */
+/** @ingroup external
+ * @brief Log current statistics for the libalias instance
+ *
+ * This function is defined in alias_db.c, since it calls static functions in
+ * this file
+ *
+ * Calls the higher level ShowAliasStats() in alias_db.c which logs all current
+ * statistics about the libalias instance - including SCTP statistics
+ *
+ * @param la Pointer to the libalias instance
+ */
+void SctpShowAliasStats(struct libalias *la);
+
+#ifdef _KERNEL
+
+MALLOC_DEFINE(M_SCTPNAT, "sctpnat", "sctp nat dbs");
+/* Use kernel allocator. */
+#ifdef _SYS_MALLOC_HH_
+#define sn_malloc(x) malloc(x, M_SCTPNAT, M_NOWAIT|M_ZERO)
+#define sn_calloc(n,x) sn_malloc(x * n)
+#define sn_free(x) free(x, M_SCTPNAT)
+#endif// #ifdef _SYS_MALLOC_HH_
+
+#else //#ifdef _KERNEL
+#define sn_malloc(x) malloc(x)
+#define sn_calloc(n, x) calloc(n, x)
+#define sn_free(x) free(x)
+
+#endif //#ifdef _KERNEL
+
+/** @defgroup packet_parser SCTP Packet Parsing
+ *
+ * Macros to:
+ * - Return pointers to the first and next SCTP chunks within an SCTP Packet
+ * - Define possible return values of the packet parsing process
+ * - SCTP message types for storing in the sctp_nat_msg structure @{
+ */
+
+#define SN_SCTP_FIRSTCHUNK(sctphead) (struct sctp_chunkhdr *)(((char *)sctphead) + sizeof(struct sctphdr))
+/**< Returns a pointer to the first chunk in an SCTP packet given a pointer to the SCTP header */
+
+#define SN_SCTP_NEXTCHUNK(chunkhead) (struct sctp_chunkhdr *)(((char *)chunkhead) + SCTP_SIZE32(ntohs(chunkhead->chunk_length)))
+/**< Returns a pointer to the next chunk in an SCTP packet given a pointer to the current chunk */
+
+#define SN_SCTP_NEXTPARAM(param) (struct sctp_paramhdr *)(((char *)param) + SCTP_SIZE32(ntohs(param->param_length)))
+/**< Returns a pointer to the next parameter in an SCTP packet given a pointer to the current parameter */
+
+#define SN_MIN_CHUNK_SIZE 4 /**< Smallest possible SCTP chunk size in bytes */
+#define SN_MIN_PARAM_SIZE 4 /**< Smallest possible SCTP param size in bytes */
+#define SN_VTAG_PARAM_SIZE 12 /**< Size of SCTP ASCONF vtag param in bytes */
+#define SN_ASCONFACK_PARAM_SIZE 8 /**< Size of SCTP ASCONF ACK param in bytes */
+
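+/*
+   A minimal illustrative sketch of how these macros are typically combined
+   to walk the chunks of a packet, with sysctl_chunk_proc_limit bounding the
+   search:
+
+	struct sctphdr *sctp_hdr = (struct sctphdr *)ip_next(pip);
+	struct sctp_chunkhdr *chunk_hdr = SN_SCTP_FIRSTCHUNK(sctp_hdr);
+	int chunk_count = 1;
+
+	while (IS_WITHIN_PACKET(chunk_hdr) &&
+	    chunk_count <= sysctl_chunk_proc_limit) {
+		... examine chunk_hdr->chunk_type ...
+		chunk_hdr = SN_SCTP_NEXTCHUNK(chunk_hdr);
+		chunk_count++;
+	}
+
+   where IS_WITHIN_PACKET is only a placeholder for the bounds check against
+   the end of the IP payload (the real parser below also enforces
+   SN_MIN_CHUNK_SIZE).
+*/
+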
+/* Packet parsing return codes */
+#define SN_PARSE_OK 0 /**< Packet parsed for SCTP messages */
+#define SN_PARSE_ERROR_IPSHL 1 /**< Packet parsing error - IP and SCTP common header len */
+#define SN_PARSE_ERROR_AS_MALLOC 2 /**< Packet parsing error - assoc malloc */
+#define SN_PARSE_ERROR_CHHL 3 /**< Packet parsing error - Chunk header len */
+#define SN_PARSE_ERROR_DIR 4 /**< Packet parsing error - Direction */
+#define SN_PARSE_ERROR_VTAG 5 /**< Packet parsing error - Vtag */
+#define SN_PARSE_ERROR_CHUNK 6 /**< Packet parsing error - Chunk */
+#define SN_PARSE_ERROR_PORT 7 /**< Packet parsing error - Port=0 */
+#define SN_PARSE_ERROR_LOOKUP 8 /**< Packet parsing error - Lookup */
+#define SN_PARSE_ERROR_PARTIALLOOKUP 9 /**< Packet parsing error - partial lookup only found */
+#define SN_PARSE_ERROR_LOOKUP_ABORT 10 /**< Packet parsing error - Lookup - but abort packet */
+
+/* Alias_sctp performs its processing based on a number of key messages */
+#define SN_SCTP_ABORT 0x0000 /**< a packet containing an ABORT chunk */
+#define SN_SCTP_INIT 0x0001 /**< a packet containing an INIT chunk */
+#define SN_SCTP_INITACK 0x0002 /**< a packet containing an INIT-ACK chunk */
+#define SN_SCTP_SHUTCOMP 0x0010 /**< a packet containing a SHUTDOWN-COMPLETE chunk */
+#define SN_SCTP_SHUTACK 0x0020 /**< a packet containing a SHUTDOWN-ACK chunk */
+#define SN_SCTP_ASCONF 0x0100 /**< a packet containing an ASCONF chunk */
+#define SN_SCTP_ASCONFACK 0x0200 /**< a packet containing an ASCONF-ACK chunk */
+#define SN_SCTP_OTHER 0xFFFF /**< a packet containing a chunk that is not of interest */
+
+/** @}
+ * @defgroup state_machine SCTP NAT State Machine
+ *
+ * Defines the various states an association can be within the NAT @{
+ */
+#define SN_ID 0x0000 /**< Idle state */
+#define SN_INi 0x0010 /**< Initialising, waiting for InitAck state */
+#define SN_INa 0x0020 /**< Initialising, waiting for AddIpAck state */
+#define SN_UP 0x0100 /**< Association in UP state */
+#define SN_CL 0x1000 /**< Closing state */
+#define SN_RM 0x2000 /**< Removing state */
+
+/** @}
+ * @defgroup Logging Logging Functionality
+ *
+ * Define various log levels and a macro to call specified log functions only if
+ * the current log level (sysctl_log_level) matches the specified level @{
+ */
+#define SN_LOG_LOW 0
+#define SN_LOG_EVENT 1
+#define SN_LOG_INFO 2
+#define SN_LOG_DETAIL 3
+#define SN_LOG_DEBUG 4
+#define SN_LOG_DEBUG_MAX 5
+
+#define SN_LOG(level, action) if (sysctl_log_level >= level) { action; } /**< Perform log action ONLY if the current log level meets the specified log level */
+
+/** @}
+ * @defgroup Hash Hash Table Macros and Functions
+ *
+ * Defines minimum/maximum/default values for the hash table size @{
+ */
+#define SN_MIN_HASH_SIZE 101 /**< Minimum hash table size (set to stop users choosing stupid values) */
+#define SN_MAX_HASH_SIZE 1000001 /**< Maximum hash table size (NB must be less than max int) */
+#define SN_DEFAULT_HASH_SIZE 2003 /**< A reasonable default size for the hash tables */
+
+#define SN_LOCAL_TBL 0x01 /**< assoc in local table */
+#define SN_GLOBAL_TBL 0x02 /**< assoc in global table */
+#define SN_BOTH_TBL 0x03 /**< assoc in both tables */
+#define SN_WAIT_TOLOCAL 0x10 /**< assoc waiting for TOLOCAL asconf ACK*/
+#define SN_WAIT_TOGLOBAL 0x20 /**< assoc waiting for TOGLOBAL asconf ACK*/
+#define SN_NULL_TBL 0x00 /**< assoc in No table */
+#define SN_MAX_GLOBAL_ADDRESSES 100 /**< absolute maximum global address count*/
+
+#define SN_ADD_OK 0 /**< Association added to the table */
+#define SN_ADD_CLASH 1 /**< Clash when trying to add the assoc. info to the table */
+
+#define SN_TABLE_HASH(vtag, port, size) (((u_int) vtag + (u_int) port) % (u_int) size) /**< Calculate the hash table lookup position */
+
+/** @}
+ * @defgroup Timer Timer Queue Macros and Functions
+ *
+ * Timer macros set minimum/maximum timeout values and calculate timer expiry
+ * times for the provided libalias instance @{
+ */
+#define SN_MIN_TIMER 1
+#define SN_MAX_TIMER 600
+#define SN_TIMER_QUEUE_SIZE SN_MAX_TIMER+2
+
+#define SN_I_T(la) (la->timeStamp + sysctl_init_timer) /**< INIT State expiration time in seconds */
+#define SN_U_T(la) (la->timeStamp + sysctl_up_timer) /**< UP State expiration time in seconds */
+#define SN_C_T(la) (la->timeStamp + sysctl_shutdown_timer) /**< CL State expiration time in seconds */
+#define SN_X_T(la) (la->timeStamp + sysctl_holddown_timer) /**< Wait after a shutdown complete in seconds */
+
+/** @}
+ * @defgroup sysctl SysCtl Variable and callback function declarations
+ *
+ * Sysctl variables to modify NAT functionality in real-time along with associated functions
+ * to manage modifications to the sysctl variables @{
+ */
+
+/* Callbacks */
+int sysctl_chg_loglevel(SYSCTL_HANDLER_ARGS);
+int sysctl_chg_timer(SYSCTL_HANDLER_ARGS);
+int sysctl_chg_hashtable_size(SYSCTL_HANDLER_ARGS);
+int sysctl_chg_error_on_ootb(SYSCTL_HANDLER_ARGS);
+int sysctl_chg_accept_global_ootb_addip(SYSCTL_HANDLER_ARGS);
+int sysctl_chg_initialising_chunk_proc_limit(SYSCTL_HANDLER_ARGS);
+int sysctl_chg_chunk_proc_limit(SYSCTL_HANDLER_ARGS);
+int sysctl_chg_param_proc_limit(SYSCTL_HANDLER_ARGS);
+int sysctl_chg_track_global_addresses(SYSCTL_HANDLER_ARGS);
+
+/* Sysctl variables */
+/** @brief net.inet.ip.alias.sctp.log_level */
+static u_int sysctl_log_level = 0; /**< Stores the current level of logging */
+/** @brief net.inet.ip.alias.sctp.init_timer */
+static u_int sysctl_init_timer = 15; /**< Seconds to hold an association in the table waiting for an INIT-ACK or AddIP-ACK */
+/** @brief net.inet.ip.alias.sctp.up_timer */
+static u_int sysctl_up_timer = 300; /**< Seconds to hold an association in the table while no packets are transmitted */
+/** @brief net.inet.ip.alias.sctp.shutdown_timer */
+static u_int sysctl_shutdown_timer = 15; /**< Seconds to hold an association in the table waiting for a SHUTDOWN-COMPLETE */
+/** @brief net.inet.ip.alias.sctp.holddown_timer */
+static u_int sysctl_holddown_timer = 0; /**< Seconds to hold an association in the table after it has been shutdown (to allow for lost SHUTDOWN-COMPLETEs) */
+/** @brief net.inet.ip.alias.sctp.hashtable_size */
+static u_int sysctl_hashtable_size = SN_DEFAULT_HASH_SIZE; /**< Sets the hash table size for any NEW NAT instances (existing instances retain their existing hash table) */
+/** @brief net.inet.ip.alias.sctp.error_on_ootb */
+static u_int sysctl_error_on_ootb = 1; /**< NAT response to receipt of OOTB packet
+ (0 - No response, 1 - NAT will send ErrorM only to local side,
+ 2 - NAT will send local ErrorM and global ErrorM if there was a partial association match
+ 3 - NAT will send ErrorM to both local and global) */
+/** @brief net.inet.ip.alias.sctp.accept_global_ootb_addip */
+static u_int sysctl_accept_global_ootb_addip = 0; /**< NAT response to receipt of global OOTB AddIP (0 - No response, 1 - NAT will accept OOTB global AddIP messages for processing (Security risk)) */
+/** @brief net.inet.ip.alias.sctp.initialising_chunk_proc_limit */
+static u_int sysctl_initialising_chunk_proc_limit = 2; /**< A limit on the number of chunks that should be searched if there is no matching association (DoS prevention) */
+/** @brief net.inet.ip.alias.sctp.chunk_proc_limit */
+static u_int sysctl_chunk_proc_limit = 5; /**< A limit on the number of chunks that should be searched (DoS prevention) */
+/** @brief net.inet.ip.alias.sctp.param_proc_limit */
+static u_int sysctl_param_proc_limit = 25; /**< A limit on the number of parameters (in chunks) that should be searched (DoS prevention) */
+/** @brief net.inet.ip.alias.sctp.track_global_addresses */
+static u_int sysctl_track_global_addresses = 0; /**< Configures the global address tracking option within the NAT (0 - Global tracking is disabled, > 0 - enables tracking but limits the number of global IP addresses to this value)
+ If set to >=1 the NAT will track that many global IP addresses. This may reduce look up table conflicts, but increases processing */
+
+#define SN_NO_ERROR_ON_OOTB 0 /**< Send no errorM on out of the blue packets */
+#define SN_LOCAL_ERROR_ON_OOTB 1 /**< Send only local errorM on out of the blue packets */
+#define SN_LOCALandPARTIAL_ERROR_ON_OOTB 2 /**< Send local errorM and global errorM for out of the blue packets only if partial match found */
+#define SN_ERROR_ON_OOTB 3 /**< Send errorM on out of the blue packets */
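+
+/*
+ * When built with sysctl support (see the SYSCTL_NODE block below), these
+ * knobs appear under net.inet.ip.alias.sctp and can be tuned at run time,
+ * for example (illustrative):
+ *
+ *	sysctl net.inet.ip.alias.sctp.log_level=2
+ *	sysctl net.inet.ip.alias.sctp.track_global_addresses=5
+ */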
+
+#ifdef SYSCTL_NODE
+
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_DECL(_net_inet_ip_alias);
+
+SYSCTL_NODE(_net_inet_ip_alias, OID_AUTO, sctp, CTLFLAG_RW, NULL, "SCTP NAT");
+
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, log_level, CTLTYPE_UINT | CTLFLAG_RW,
+ &sysctl_log_level, 0, sysctl_chg_loglevel, "IU",
+ "Level of detail (0 - default, 1 - event, 2 - info, 3 - detail, 4 - debug, 5 - max debug)");
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, init_timer, CTLTYPE_UINT | CTLFLAG_RW,
+ &sysctl_init_timer, 0, sysctl_chg_timer, "IU",
+ "Timeout value (s) while waiting for (INIT-ACK|AddIP-ACK)");
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, up_timer, CTLTYPE_UINT | CTLFLAG_RW,
+ &sysctl_up_timer, 0, sysctl_chg_timer, "IU",
+ "Timeout value (s) to keep an association up with no traffic");
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, shutdown_timer, CTLTYPE_UINT | CTLFLAG_RW,
+ &sysctl_shutdown_timer, 0, sysctl_chg_timer, "IU",
+ "Timeout value (s) while waiting for SHUTDOWN-COMPLETE");
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, holddown_timer, CTLTYPE_UINT | CTLFLAG_RW,
+ &sysctl_holddown_timer, 0, sysctl_chg_timer, "IU",
+ "Hold association in table for this many seconds after receiving a SHUTDOWN-COMPLETE");
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, hashtable_size, CTLTYPE_UINT | CTLFLAG_RW,
+ &sysctl_hashtable_size, 0, sysctl_chg_hashtable_size, "IU",
+    "Size of hash tables used for NAT lookups (100 < prime_number < 1000001)");
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, error_on_ootb, CTLTYPE_UINT | CTLFLAG_RW,
+ &sysctl_error_on_ootb, 0, sysctl_chg_error_on_ootb, "IU",
+ "ErrorM sent on receipt of ootb packet:\n\t0 - none,\n\t1 - to local only,\n\t2 - to local and global if a partial association match,\n\t3 - to local and global (DoS risk)");
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, accept_global_ootb_addip, CTLTYPE_UINT | CTLFLAG_RW,
+ &sysctl_accept_global_ootb_addip, 0, sysctl_chg_accept_global_ootb_addip, "IU",
+ "NAT response to receipt of global OOTB AddIP:\n\t0 - No response,\n\t1 - NAT will accept OOTB global AddIP messages for processing (Security risk)");
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, initialising_chunk_proc_limit, CTLTYPE_UINT | CTLFLAG_RW,
+ &sysctl_initialising_chunk_proc_limit, 0, sysctl_chg_initialising_chunk_proc_limit, "IU",
+ "Number of chunks that should be processed if there is no current association found:\n\t > 0 (A high value is a DoS risk)");
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, chunk_proc_limit, CTLTYPE_UINT | CTLFLAG_RW,
+ &sysctl_chunk_proc_limit, 0, sysctl_chg_chunk_proc_limit, "IU",
+ "Number of chunks that should be processed to find key chunk:\n\t>= initialising_chunk_proc_limit (A high value is a DoS risk)");
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, param_proc_limit, CTLTYPE_UINT | CTLFLAG_RW,
+ &sysctl_param_proc_limit, 0, sysctl_chg_param_proc_limit, "IU",
+ "Number of parameters (in a chunk) that should be processed to find key parameters:\n\t> 1 (A high value is a DoS risk)");
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, track_global_addresses, CTLTYPE_UINT | CTLFLAG_RW,
+ &sysctl_track_global_addresses, 0, sysctl_chg_track_global_addresses, "IU",
+ "Configures the global address tracking option within the NAT:\n\t0 - Global tracking is disabled,\n\t> 0 - enables tracking but limits the number of global IP addresses to this value");
+
+#endif /* SYSCTL_NODE */
+
+/** @}
+ * @ingroup sysctl
+ * @brief sysctl callback for changing net.inet.ip.alias.sctp.log_level
+ *
+ * Updates the variable sysctl_log_level to the provided value and ensures
+ * it is in the valid range (SN_LOG_LOW -> SN_LOG_DEBUG_MAX)
+ */
+int sysctl_chg_loglevel(SYSCTL_HANDLER_ARGS)
+{
+ u_int level = *(u_int *)arg1;
+ int error;
+
+ error = sysctl_handle_int(oidp, &level, 0, req);
+ if (error) return (error);
+
+  level = (level > SN_LOG_DEBUG_MAX)?(SN_LOG_DEBUG_MAX):(level);
+  sysctl_log_level = (level < SN_LOG_LOW)?(SN_LOG_LOW):(level);
+
+ return (0);
+}
+
+/** @ingroup sysctl
+ * @brief sysctl callback for changing net.inet.ip.alias.sctp.(init_timer|up_timer|shutdown_timer)
+ *
+ * Updates the timer-based sysctl variables. The new values are sanity-checked
+ * to make sure that they are within the range SN_MIN_TIMER-SN_MAX_TIMER. The
+ * holddown timer is allowed to be 0
+ */
+int sysctl_chg_timer(SYSCTL_HANDLER_ARGS)
+{
+ u_int timer = *(u_int *)arg1;
+ int error;
+
+ error = sysctl_handle_int(oidp, &timer, 0, req);
+ if (error) return (error);
+
+ timer = (timer > SN_MAX_TIMER)?(SN_MAX_TIMER):(timer);
+
+ if (((u_int *)arg1) != &sysctl_holddown_timer)
+ {
+ timer = (timer < SN_MIN_TIMER)?(SN_MIN_TIMER):(timer);
+ }
+
+ *(u_int *)arg1 = timer;
+
+ return (0);
+}
+
+/** @ingroup sysctl
+ * @brief sysctl callback for changing net.inet.ip.alias.sctp.hashtable_size
+ *
+ * Updates the hashtable_size sysctl variable. The new value should be a prime
+ * number. We sanity check to ensure that the size is within the range
+ * SN_MIN_HASH_SIZE-SN_MAX_HASH_SIZE. We then check the provided number to see
+ * if it is prime. We approximate by checking that (2,3,5,7,11) are not factors,
+ * incrementing the user provided value until we find a suitable number.
+ */
+int sysctl_chg_hashtable_size(SYSCTL_HANDLER_ARGS)
+{
+ u_int size = *(u_int *)arg1;
+ int error;
+
+ error = sysctl_handle_int(oidp, &size, 0, req);
+ if (error) return (error);
+
+ size = (size < SN_MIN_HASH_SIZE)?(SN_MIN_HASH_SIZE):((size > SN_MAX_HASH_SIZE)?(SN_MAX_HASH_SIZE):(size));
+
+ size |= 0x00000001; /* make odd */
+
+ for(;(((size % 3) == 0) || ((size % 5) == 0) || ((size % 7) == 0) || ((size % 11) == 0)); size+=2);
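+	/*
+	 * For example (illustrative): a requested size of 2000 is first made odd
+	 * (2001), which is divisible by 3, so the loop advances to 2003, which has
+	 * none of 3, 5, 7 or 11 as factors and is accepted; 2003 is also the
+	 * default SN_DEFAULT_HASH_SIZE.
+	 */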
+ sysctl_hashtable_size = size;
+
+ return (0);
+}
+
+/** @ingroup sysctl
+ * @brief sysctl callback for changing net.inet.ip.alias.sctp.error_on_ootb
+ *
+ * Updates the error_on_ootb sysctl variable.
+ * If set to 0, no ErrorM will be sent if there is a look up table clash
+ * If set to 1, an ErrorM is sent only to the local side
+ * If set to 2, an ErrorM is sent to the local side and global side if there is
+ * a partial association match
+ * If set to 3, an ErrorM is sent to both local and global sides (DoS risk).
+ */
+int sysctl_chg_error_on_ootb(SYSCTL_HANDLER_ARGS)
+{
+ u_int flag = *(u_int *)arg1;
+ int error;
+
+ error = sysctl_handle_int(oidp, &flag, 0, req);
+ if (error) return (error);
+
+ sysctl_error_on_ootb = (flag > SN_ERROR_ON_OOTB) ? SN_ERROR_ON_OOTB: flag;
+
+ return (0);
+}
+
+/** @ingroup sysctl
+ * @brief sysctl callback for changing net.inet.ip.alias.sctp.accept_global_ootb_addip
+ *
+ * If set to 1 the NAT will accept ootb global addip messages for processing (Security risk)
+ * Default is 0, only responding to local ootb AddIP messages
+ */
+int sysctl_chg_accept_global_ootb_addip(SYSCTL_HANDLER_ARGS)
+{
+ u_int flag = *(u_int *)arg1;
+ int error;
+
+ error = sysctl_handle_int(oidp, &flag, 0, req);
+ if (error) return (error);
+
+ sysctl_accept_global_ootb_addip = (flag == 1) ? 1: 0;
+
+ return (0);
+}
+
+/** @ingroup sysctl
+ * @brief sysctl callback for changing net.inet.ip.alias.sctp.initialising_chunk_proc_limit
+ *
+ * Updates the initialising_chunk_proc_limit sysctl variable. Number of chunks
+ * that should be processed if there is no current association found: > 0 (A
+ * high value is a DoS risk)
+ */
+int sysctl_chg_initialising_chunk_proc_limit(SYSCTL_HANDLER_ARGS)
+{
+ u_int proclimit = *(u_int *)arg1;
+ int error;
+
+ error = sysctl_handle_int(oidp, &proclimit, 0, req);
+ if (error) return (error);
+
+ sysctl_initialising_chunk_proc_limit = (proclimit < 1) ? 1: proclimit;
+ sysctl_chunk_proc_limit =
+ (sysctl_chunk_proc_limit < sysctl_initialising_chunk_proc_limit) ? sysctl_initialising_chunk_proc_limit : sysctl_chunk_proc_limit;
+
+ return (0);
+}
+
+/** @ingroup sysctl
+ * @brief sysctl callback for changing net.inet.ip.alias.sctp.chunk_proc_limit
+ *
+ * Updates the chunk_proc_limit sysctl variable.
+ * Number of chunks that should be processed to find key chunk:
+ * >= initialising_chunk_proc_limit (A high value is a DoS risk)
+ */
+int sysctl_chg_chunk_proc_limit(SYSCTL_HANDLER_ARGS)
+{
+ u_int proclimit = *(u_int *)arg1;
+ int error;
+
+ error = sysctl_handle_int(oidp, &proclimit, 0, req);
+ if (error) return (error);
+
+ sysctl_chunk_proc_limit =
+ (proclimit < sysctl_initialising_chunk_proc_limit) ? sysctl_initialising_chunk_proc_limit : proclimit;
+
+ return (0);
+}
+
+
+/** @ingroup sysctl
+ * @brief sysctl callback for changing net.inet.ip.alias.sctp.param_proc_limit
+ *
+ * Updates the param_proc_limit sysctl variable.
+ * Number of parameters that should be processed to find key parameters:
+ * > 1 (A high value is a DoS risk)
+ */
+int sysctl_chg_param_proc_limit(SYSCTL_HANDLER_ARGS)
+{
+ u_int proclimit = *(u_int *)arg1;
+ int error;
+
+ error = sysctl_handle_int(oidp, &proclimit, 0, req);
+ if (error) return (error);
+
+ sysctl_param_proc_limit =
+ (proclimit < 2) ? 2 : proclimit;
+
+ return (0);
+}
+
+/** @ingroup sysctl
+ * @brief sysctl callback for changing net.inet.ip.alias.sctp.track_global_addresses
+ *
+ * Configures the global address tracking option within the NAT (0 - Global
+ * tracking is disabled, > 0 - enables tracking but limits the number of global
+ * IP addresses to this value)
+ */
+int sysctl_chg_track_global_addresses(SYSCTL_HANDLER_ARGS)
+{
+ u_int num_to_track = *(u_int *)arg1;
+ int error;
+
+ error = sysctl_handle_int(oidp, &num_to_track, 0, req);
+ if (error) return (error);
+
+ sysctl_track_global_addresses = (num_to_track > SN_MAX_GLOBAL_ADDRESSES) ? SN_MAX_GLOBAL_ADDRESSES : num_to_track;
+
+ return (0);
+}
+
+
+/* ----------------------------------------------------------------------
+ * CODE BEGINS HERE
+ * ----------------------------------------------------------------------
+ */
+/**
+ * @brief Initialises the SCTP NAT Implementation
+ *
+ * Creates the look-up tables and the timer queue and initialises all state
+ * variables
+ *
+ * @param la Pointer to the relevant libalias instance
+ */
+void AliasSctpInit(struct libalias *la)
+{
+ /* Initialise association tables*/
+ int i;
+ la->sctpNatTableSize = sysctl_hashtable_size;
+ SN_LOG(SN_LOG_EVENT,
+ SctpAliasLog("Initialising SCTP NAT Instance (hash_table_size:%d)\n", la->sctpNatTableSize));
+ la->sctpTableLocal = sn_calloc(la->sctpNatTableSize, sizeof(struct sctpNatTableL));
+ la->sctpTableGlobal = sn_calloc(la->sctpNatTableSize, sizeof(struct sctpNatTableG));
+ la->sctpNatTimer.TimerQ = sn_calloc(SN_TIMER_QUEUE_SIZE, sizeof(struct sctpTimerQ));
+ /* Initialise hash table */
+ for (i = 0; i < la->sctpNatTableSize; i++) {
+ LIST_INIT(&la->sctpTableLocal[i]);
+ LIST_INIT(&la->sctpTableGlobal[i]);
+ }
+
+ /* Initialise circular timer Q*/
+ for (i = 0; i < SN_TIMER_QUEUE_SIZE; i++)
+ LIST_INIT(&la->sctpNatTimer.TimerQ[i]);
+#ifdef _KERNEL
+ la->sctpNatTimer.loc_time=time_uptime; /* la->timeStamp is not set yet */
+#else
+ la->sctpNatTimer.loc_time=la->timeStamp;
+#endif
+ la->sctpNatTimer.cur_loc = 0;
+ la->sctpLinkCount = 0;
+}
+
+/**
+ * @brief Cleans-up the SCTP NAT Implementation prior to unloading
+ *
+ * Removes all entries from the timer queue, freeing associations as it goes.
+ * We then free memory allocated to the look-up tables and the time queue
+ *
+ * NOTE: We do not need to traverse the look-up tables as each association
+ * will always have an entry in the timer queue; freeing this memory
+ * once will free all memory allocated to entries in the look-up tables.
+ *
+ * @param la Pointer to the relevant libalias instance
+ */
+void AliasSctpTerm(struct libalias *la)
+{
+ struct sctp_nat_assoc *assoc1, *assoc2;
+ int i;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ SN_LOG(SN_LOG_EVENT,
+ SctpAliasLog("Removing SCTP NAT Instance\n"));
+ for (i = 0; i < SN_TIMER_QUEUE_SIZE; i++) {
+ assoc1 = LIST_FIRST(&la->sctpNatTimer.TimerQ[i]);
+ while (assoc1 != NULL) {
+ freeGlobalAddressList(assoc1);
+ assoc2 = LIST_NEXT(assoc1, timer_Q);
+ sn_free(assoc1);
+ assoc1 = assoc2;
+ }
+ }
+
+ sn_free(la->sctpTableLocal);
+ sn_free(la->sctpTableGlobal);
+ sn_free(la->sctpNatTimer.TimerQ);
+}
+
+/**
+ * @brief Handles SCTP packets passed from libalias
+ *
+ * This function needs to actually NAT/drop packets and possibly create and
+ * send AbortM or ErrorM packets in response. The process involves:
+ * - Validating the direction parameter passed by the caller
+ * - Checking and handling any expired timers for the NAT
+ * - Calling sctp_PktParser() to parse the packet
+ * - Call ProcessSctpMsg() to decide the appropriate outcome and to update
+ * the NAT tables
+ * - Based on the return code either:
+ * - NAT the packet
+ * - Construct and send an ErrorM|AbortM packet
+ * - Mark the association for removal from the tables
+ * - Potentially remove the association from all lookup tables
+ * - Return the appropriate result to libalias
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param pip Pointer to IP packet to process
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ *
+ * @return PKT_ALIAS_OK | PKT_ALIAS_IGNORE | PKT_ALIAS_ERROR
+ */
+int
+SctpAlias(struct libalias *la, struct ip *pip, int direction)
+{
+ int rtnval;
+ struct sctp_nat_msg msg;
+ struct sctp_nat_assoc *assoc = NULL;
+
+ if ((direction != SN_TO_LOCAL) && (direction != SN_TO_GLOBAL)) {
+ SctpAliasLog("ERROR: Invalid direction\n");
+ return(PKT_ALIAS_ERROR);
+ }
+
+ sctp_CheckTimers(la); /* Check timers */
+
+ /* Parse the packet */
+ rtnval = sctp_PktParser(la, direction, pip, &msg, &assoc); //using *char (change to mbuf when get code from paolo)
+ switch (rtnval) {
+ case SN_PARSE_OK:
+ break;
+ case SN_PARSE_ERROR_CHHL:
+ /* Not an error if there is a chunk length parsing error and this is a fragmented packet */
+ if (ntohs(pip->ip_off) & IP_MF) {
+ rtnval = SN_PARSE_OK;
+ break;
+ }
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("SN_PARSE_ERROR", msg.sctp_hdr->v_tag, rtnval, direction));
+ return(PKT_ALIAS_ERROR);
+ case SN_PARSE_ERROR_PARTIALLOOKUP:
+ if (sysctl_error_on_ootb > SN_LOCALandPARTIAL_ERROR_ON_OOTB) {
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("SN_PARSE_ERROR", msg.sctp_hdr->v_tag, rtnval, direction));
+ return(PKT_ALIAS_ERROR);
+ }
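+ /* FALLTHROUGH: otherwise treat the partial match as a failed lookup */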
+ case SN_PARSE_ERROR_LOOKUP:
+ if (sysctl_error_on_ootb == SN_ERROR_ON_OOTB ||
+ (sysctl_error_on_ootb == SN_LOCALandPARTIAL_ERROR_ON_OOTB && direction == SN_TO_LOCAL) ||
+ (sysctl_error_on_ootb == SN_LOCAL_ERROR_ON_OOTB && direction == SN_TO_GLOBAL)) {
+ TxAbortErrorM(la, &msg, assoc, SN_REFLECT_ERROR, direction); /*NB assoc=NULL */
+ return(PKT_ALIAS_RESPOND);
+ }
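+ /* FALLTHROUGH: otherwise log the parse error and drop the packet (default case) */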
+ default:
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("SN_PARSE_ERROR", msg.sctp_hdr->v_tag, rtnval, direction));
+ return(PKT_ALIAS_ERROR);
+ }
+
+ SN_LOG(SN_LOG_DETAIL,
+ logsctpassoc(assoc, "*");
+ logsctpparse(direction, &msg);
+ );
+
+ /* Process the SCTP message */
+ rtnval = ProcessSctpMsg(la, direction, &msg, assoc);
+
+ SN_LOG(SN_LOG_DEBUG_MAX,
+ logsctpassoc(assoc, "-");
+ logSctpLocal(la);
+ logSctpGlobal(la);
+ );
+ SN_LOG(SN_LOG_DEBUG, logTimerQ(la));
+
+ switch(rtnval){
+ case SN_NAT_PKT:
+ switch(direction) {
+ case SN_TO_LOCAL:
+ DifferentialChecksum(&(msg.ip_hdr->ip_sum),
+ &(assoc->l_addr), &(msg.ip_hdr->ip_dst), 2);
+ msg.ip_hdr->ip_dst = assoc->l_addr; /* change dst address to local address*/
+ break;
+ case SN_TO_GLOBAL:
+ DifferentialChecksum(&(msg.ip_hdr->ip_sum),
+ &(assoc->a_addr), &(msg.ip_hdr->ip_src), 2);
+ msg.ip_hdr->ip_src = assoc->a_addr; /* change src to alias addr*/
+ break;
+ default:
+ rtnval = SN_DROP_PKT; /* shouldn't get here, but if it does drop packet */
+ SN_LOG(SN_LOG_LOW, logsctperror("ERROR: Invalid direction", msg.sctp_hdr->v_tag, rtnval, direction));
+ break;
+ }
+ break;
+ case SN_DROP_PKT:
+ SN_LOG(SN_LOG_DETAIL, logsctperror("SN_DROP_PKT", msg.sctp_hdr->v_tag, rtnval, direction));
+ break;
+ case SN_REPLY_ABORT:
+ case SN_REPLY_ERROR:
+ case SN_SEND_ABORT:
+ TxAbortErrorM(la, &msg, assoc, rtnval, direction);
+ break;
+ default:
+ // big error, remove association and go to idle and write log messages
+ SN_LOG(SN_LOG_LOW, logsctperror("SN_PROCESSING_ERROR", msg.sctp_hdr->v_tag, rtnval, direction));
+ assoc->state=SN_RM;/* Mark for removal*/
+ break;
+ }
+
+ /* Remove association if tagged for removal */
+ if (assoc->state == SN_RM) {
+ if (assoc->TableRegister) {
+ sctp_RmTimeOut(la, assoc);
+ RmSctpAssoc(la, assoc);
+ }
+ LIBALIAS_LOCK_ASSERT(la);
+ freeGlobalAddressList(assoc);
+ sn_free(assoc);
+ }
+ switch(rtnval) {
+ case SN_NAT_PKT:
+ return(PKT_ALIAS_OK);
+ case SN_SEND_ABORT:
+ return(PKT_ALIAS_OK);
+ case SN_REPLY_ABORT:
+ case SN_REPLY_ERROR:
+ case SN_REFLECT_ERROR:
+ return(PKT_ALIAS_RESPOND);
+ case SN_DROP_PKT:
+ default:
+ return(PKT_ALIAS_ERROR);
+ }
+}
+
+/**
+ * @brief Send an AbortM or ErrorM
+ *
+ * We construct the new SCTP packet to send in place of the existing packet we
+ * have been asked to NAT. This function can only be called if the original
+ * packet was successfully parsed as a valid SCTP packet.
+ *
+ * An AbortM (without cause) packet is the smallest SCTP packet available and as
+ * such there is always space in the existing packet buffer to fit the AbortM
+ * packet. An ErrorM packet is 4 bytes longer than an AbortM (the error cause is
+ * not optional). An ErrorM is sent in response to an AddIP when the Vtag/address
+ * combination, if added, will produce a conflict in the association look up
+ * tables. It may also be used for an unexpected packet - a packet with no
+ * matching association in the NAT table and we are requesting an AddIP so we
+ * can add it. The smallest valid SCTP packet while the association is in an
+ * up-state is a Heartbeat packet, which is big enough to be transformed to an
+ * ErrorM.
+ *
+ * We create a temporary character array to store the packet as we are constructing
+ * it. We then populate the array with appropriate values based on:
+ * - Packet type (AbortM | ErrorM)
+ * - Initial packet direction (SN_TO_LOCAL | SN_TO_GLOBAL)
+ * - NAT response (Send packet | Reply packet)
+ *
+ * Once complete, we copy the contents of the temporary packet over the original
+ * SCTP packet we were asked to NAT
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param sm Pointer to sctp message information
+ * @param assoc Pointer to current association details
+ * @param sndrply SN_SEND_ABORT | SN_REPLY_ABORT | SN_REPLY_ERROR
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ */
+static uint32_t
+local_sctp_finalize_crc32(uint32_t crc32c)
+{
+ /* This routine is duplicated from SCTP
+ * we need to do that since it MAY be that SCTP
+ * is NOT compiled into the kernel. The CRC32C routines
+ * however are always available in libkern.
+ */
+ uint32_t result;
+#if BYTE_ORDER == BIG_ENDIAN
+ uint8_t byte0, byte1, byte2, byte3;
+
+#endif
+ /* Complement the result */
+ result = ~crc32c;
+#if BYTE_ORDER == BIG_ENDIAN
+ /*
+ * For BIG-ENDIAN.. aka Motorola byte order the result is in
+ * little-endian form. So we must manually swap the bytes. Then we
+ * can call htonl() which does nothing...
+ */
+ byte0 = result & 0x000000ff;
+ byte1 = (result >> 8) & 0x000000ff;
+ byte2 = (result >> 16) & 0x000000ff;
+ byte3 = (result >> 24) & 0x000000ff;
+ crc32c = ((byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3);
+#else
+ /*
+ * For INTEL platforms the result comes out in network order. No
+ * htonl is required or the swap above. So we optimize out both the
+ * htonl and the manual swap above.
+ */
+ crc32c = result;
+#endif
+ return (crc32c);
+}
+
+static void
+TxAbortErrorM(struct libalias *la, struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int sndrply, int direction)
+{
+ int sctp_size = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_error_cause);
+ int ip_size = sizeof(struct ip) + sctp_size;
+ int include_error_cause = 1;
+ char tmp_ip[ip_size];
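+ /* variable-length buffer sized for the largest reply (an ErrorM carrying an error cause) */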
+
+ if (ntohs(sm->ip_hdr->ip_len) < ip_size) { /* short packet, cannot send error cause */
+ include_error_cause = 0;
+ ip_size = ip_size - sizeof(struct sctp_error_cause);
+ sctp_size = sctp_size - sizeof(struct sctp_error_cause);
+ }
+ /* Assign header pointers into the temporary packet */
+ struct ip* ip = (struct ip *) tmp_ip;
+ struct sctphdr* sctp_hdr = (struct sctphdr *) ((char *) ip + sizeof(*ip));
+ struct sctp_chunkhdr* chunk_hdr = (struct sctp_chunkhdr *) ((char *) sctp_hdr + sizeof(*sctp_hdr));
+ struct sctp_error_cause* error_cause = (struct sctp_error_cause *) ((char *) chunk_hdr + sizeof(*chunk_hdr));
+
+ /* construct ip header */
+ ip->ip_v = sm->ip_hdr->ip_v;
+ ip->ip_hl = 5; /* 5*32 bit words */
+ ip->ip_tos = 0;
+ ip->ip_len = htons(ip_size);
+ ip->ip_id = sm->ip_hdr->ip_id;
+ ip->ip_off = 0;
+ ip->ip_ttl = 255;
+ ip->ip_p = IPPROTO_SCTP;
+ /*
+ The definitions below should be removed when they make it into the SCTP stack
+ */
+#define SCTP_MIDDLEBOX_FLAG 0x02
+#define SCTP_NAT_TABLE_COLLISION 0x00b0
+#define SCTP_MISSING_NAT 0x00b1
+ chunk_hdr->chunk_type = (sndrply & SN_TX_ABORT) ? SCTP_ABORT_ASSOCIATION : SCTP_OPERATION_ERROR;
+ chunk_hdr->chunk_flags = SCTP_MIDDLEBOX_FLAG;
+ if (include_error_cause) {
+ error_cause->code = htons((sndrply & SN_REFLECT_ERROR) ? SCTP_MISSING_NAT : SCTP_NAT_TABLE_COLLISION);
+ error_cause->length = htons(sizeof(struct sctp_error_cause));
+ chunk_hdr->chunk_length = htons(sizeof(*chunk_hdr) + sizeof(struct sctp_error_cause));
+ } else {
+ chunk_hdr->chunk_length = htons(sizeof(*chunk_hdr));
+ }
+
+ /* set specific values */
+ switch(sndrply) {
+ case SN_REFLECT_ERROR:
+ chunk_hdr->chunk_flags |= SCTP_HAD_NO_TCB; /* set Tbit */
+ sctp_hdr->v_tag = sm->sctp_hdr->v_tag;
+ break;
+ case SN_REPLY_ERROR:
+ sctp_hdr->v_tag = (direction == SN_TO_LOCAL) ? assoc->g_vtag : assoc->l_vtag ;
+ break;
+ case SN_SEND_ABORT:
+ sctp_hdr->v_tag = sm->sctp_hdr->v_tag;
+ break;
+ case SN_REPLY_ABORT:
+ sctp_hdr->v_tag = sm->sctpchnk.Init->initiate_tag;
+ break;
+ }
+
+ /* Set send/reply values */
+ if (sndrply == SN_SEND_ABORT) { /*pass through NAT */
+ ip->ip_src = (direction == SN_TO_LOCAL) ? sm->ip_hdr->ip_src : assoc->a_addr;
+ ip->ip_dst = (direction == SN_TO_LOCAL) ? assoc->l_addr : sm->ip_hdr->ip_dst;
+ sctp_hdr->src_port = sm->sctp_hdr->src_port;
+ sctp_hdr->dest_port = sm->sctp_hdr->dest_port;
+ } else { /* reply and reflect */
+ ip->ip_src = sm->ip_hdr->ip_dst;
+ ip->ip_dst = sm->ip_hdr->ip_src;
+ sctp_hdr->src_port = sm->sctp_hdr->dest_port;
+ sctp_hdr->dest_port = sm->sctp_hdr->src_port;
+ }
+
+ /* Calculate IP header checksum */
+ ip->ip_sum = in_cksum_hdr(ip);
+
+ /* calculate SCTP header CRC32 */
+ sctp_hdr->checksum = 0;
+ sctp_hdr->checksum = local_sctp_finalize_crc32(calculate_crc32c(0xffffffff, (unsigned char *) sctp_hdr, sctp_size));
+
+ memcpy(sm->ip_hdr, ip, ip_size);
+
+ SN_LOG(SN_LOG_EVENT,SctpAliasLog("%s %s 0x%x (->%s:%u vtag=0x%x crc=0x%x)\n",
+ ((sndrply == SN_SEND_ABORT) ? "Sending" : "Replying"),
+ ((sndrply & SN_TX_ERROR) ? "ErrorM" : "AbortM"),
+ (include_error_cause ? ntohs(error_cause->code) : 0),
+ inet_ntoa(ip->ip_dst),ntohs(sctp_hdr->dest_port),
+ ntohl(sctp_hdr->v_tag), ntohl(sctp_hdr->checksum)));
+}
+
+/* ----------------------------------------------------------------------
+ * PACKET PARSER CODE
+ * ----------------------------------------------------------------------
+ */
+/** @addtogroup packet_parser
+ *
+ * These functions parse the SCTP packet and fill a sctp_nat_msg structure
+ * with the parsed contents.
+ */
+/** @ingroup packet_parser
+ * @brief Parses SCTP packets for the key SCTP chunk that will be processed
+ *
+ * This module parses SCTP packets for the key SCTP chunk that will be processed
+ * The module completes the sctp_nat_msg structure and either retrieves the
+ * relevant (existing) stored association from the Hash Tables or creates a new
+ * association entity with state SN_ID
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ * @param pip Pointer to the IP packet to process
+ * @param sm Pointer to sctp message information
+ * @param passoc Pointer to the association this SCTP Message belongs to
+ *
+ * @return SN_PARSE_OK | SN_PARSE_ERROR_*
+ */
+static int
+sctp_PktParser(struct libalias *la, int direction, struct ip *pip,
+ struct sctp_nat_msg *sm, struct sctp_nat_assoc **passoc)
+//sctp_PktParser(int direction, struct mbuf *ipak, int ip_hdr_len,struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc)
+{
+ struct sctphdr *sctp_hdr;
+ struct sctp_chunkhdr *chunk_hdr;
+ struct sctp_paramhdr *param_hdr;
+ struct in_addr ipv4addr;
+ int bytes_left; /* bytes left in ip packet */
+ int chunk_length;
+ int chunk_count;
+ int partial_match = 0;
+ // mbuf *mp;
+ // int mlen;
+
+ // mlen = SCTP_HEADER_LEN(i_pak);
+ // mp = SCTP_HEADER_TO_CHAIN(i_pak); /* does nothing in bsd since header and chain not separate */
+
+ /*
+ * Note, that if the VTag is zero, it must be an INIT
+ * Also, I am only interested in the content of INIT and ADDIP chunks
+ */
+
+ // no mbuf stuff from Paolo yet so ...
+ sm->ip_hdr = pip;
+ /* remove ip header length from the bytes_left */
+ bytes_left = ntohs(pip->ip_len) - (pip->ip_hl << 2);
+
+ /* Check SCTP header length and move to first chunk */
+ if (bytes_left < sizeof(struct sctphdr)) {
+ sm->sctp_hdr = NULL;
+ return(SN_PARSE_ERROR_IPSHL); /* packet not long enough*/
+ }
+
+ sm->sctp_hdr = sctp_hdr = (struct sctphdr *) ip_next(pip);
+ bytes_left -= sizeof(struct sctphdr);
+
+ /* Check for valid ports (zero valued ports would find partially initialised associations) */
+ if (sctp_hdr->src_port == 0 || sctp_hdr->dest_port == 0)
+ return(SN_PARSE_ERROR_PORT);
+
+ /* Check length of first chunk */
+ if (bytes_left < SN_MIN_CHUNK_SIZE) /* malformed chunk - could cause endless loop*/
+ return(SN_PARSE_ERROR_CHHL); /* packet not long enough for this chunk */
+
+ /* First chunk */
+ chunk_hdr = SN_SCTP_FIRSTCHUNK(sctp_hdr);
+
+ chunk_length = SCTP_SIZE32(ntohs(chunk_hdr->chunk_length));
+ if ((chunk_length < SN_MIN_CHUNK_SIZE) || (chunk_length > bytes_left)) /* malformed chunk - could cause endless loop*/
+ return(SN_PARSE_ERROR_CHHL);
+
+ if ((chunk_hdr->chunk_flags & SCTP_HAD_NO_TCB) &&
+ ((chunk_hdr->chunk_type == SCTP_ABORT_ASSOCIATION) ||
+ (chunk_hdr->chunk_type == SCTP_SHUTDOWN_COMPLETE))) {
+ /* T-Bit set */
+ if (direction == SN_TO_LOCAL)
+ *passoc = FindSctpGlobalT(la, pip->ip_src, sctp_hdr->v_tag, sctp_hdr->dest_port, sctp_hdr->src_port);
+ else
+ *passoc = FindSctpLocalT(la, pip->ip_dst, sctp_hdr->v_tag, sctp_hdr->dest_port, sctp_hdr->src_port);
+ } else {
+ /* Proper v_tag settings */
+ if (direction == SN_TO_LOCAL)
+ *passoc = FindSctpGlobal(la, pip->ip_src, sctp_hdr->v_tag, sctp_hdr->src_port, sctp_hdr->dest_port, &partial_match);
+ else
+ *passoc = FindSctpLocal(la, pip->ip_src, pip->ip_dst, sctp_hdr->v_tag, sctp_hdr->src_port, sctp_hdr->dest_port);
+ }
+
+ chunk_count = 1;
+ /* Real packet parsing occurs below */
+ sm->msg = SN_SCTP_OTHER;/* Initialise to largest value*/
+ sm->chunk_length = 0; /* only care about length for key chunks */
+ while (IS_SCTP_CONTROL(chunk_hdr)) {
+ switch(chunk_hdr->chunk_type) {
+ case SCTP_INITIATION:
+ if (chunk_length < sizeof(struct sctp_init_chunk)) /* malformed chunk*/
+ return(SN_PARSE_ERROR_CHHL);
+ sm->msg = SN_SCTP_INIT;
+ sm->sctpchnk.Init = (struct sctp_init *) ((char *) chunk_hdr + sizeof(struct sctp_chunkhdr));
+ sm->chunk_length = chunk_length;
+ /* if no existing association, create a new one */
+ if (*passoc == NULL) {
+ if (sctp_hdr->v_tag == 0){ //Init requires vtag=0
+ *passoc = (struct sctp_nat_assoc *) sn_malloc(sizeof(struct sctp_nat_assoc));
+ if (*passoc == NULL) {/* out of resources */
+ return(SN_PARSE_ERROR_AS_MALLOC);
+ }
+ /* Initialise association - malloc initialises memory to zeros */
+ (*passoc)->state = SN_ID;
+ LIST_INIT(&((*passoc)->Gaddr)); /* always initialise to avoid memory problems */
+ (*passoc)->TableRegister = SN_NULL_TBL;
+ return(SN_PARSE_OK);
+ }
+ return(SN_PARSE_ERROR_VTAG);
+ }
+ return(SN_PARSE_ERROR_LOOKUP);
+ case SCTP_INITIATION_ACK:
+ if (chunk_length < sizeof(struct sctp_init_ack_chunk)) /* malformed chunk*/
+ return(SN_PARSE_ERROR_CHHL);
+ sm->msg = SN_SCTP_INITACK;
+ sm->sctpchnk.InitAck = (struct sctp_init_ack *) ((char *) chunk_hdr + sizeof(struct sctp_chunkhdr));
+ sm->chunk_length = chunk_length;
+ return ((*passoc == NULL)?(SN_PARSE_ERROR_LOOKUP):(SN_PARSE_OK));
+ case SCTP_ABORT_ASSOCIATION: /* access only minimum sized chunk */
+ sm->msg = SN_SCTP_ABORT;
+ sm->chunk_length = chunk_length;
+ return ((*passoc == NULL)?(SN_PARSE_ERROR_LOOKUP_ABORT):(SN_PARSE_OK));
+ case SCTP_SHUTDOWN_ACK:
+ if (chunk_length < sizeof(struct sctp_shutdown_ack_chunk)) /* malformed chunk*/
+ return(SN_PARSE_ERROR_CHHL);
+ if (sm->msg > SN_SCTP_SHUTACK) {
+ sm->msg = SN_SCTP_SHUTACK;
+ sm->chunk_length = chunk_length;
+ }
+ break;
+ case SCTP_SHUTDOWN_COMPLETE: /* minimum sized chunk */
+ if (sm->msg > SN_SCTP_SHUTCOMP) {
+ sm->msg = SN_SCTP_SHUTCOMP;
+ sm->chunk_length = chunk_length;
+ }
+ return ((*passoc == NULL)?(SN_PARSE_ERROR_LOOKUP):(SN_PARSE_OK));
+ case SCTP_ASCONF:
+ if (sm->msg > SN_SCTP_ASCONF) {
+ if (chunk_length < (sizeof(struct sctp_asconf_chunk) + sizeof(struct sctp_ipv4addr_param))) /* malformed chunk*/
+ return(SN_PARSE_ERROR_CHHL);
+ //leave parameter searching to later, if required
+ param_hdr = (struct sctp_paramhdr *) ((char *) chunk_hdr + sizeof(struct sctp_asconf_chunk)); /*compulsory IP parameter*/
+ if (ntohs(param_hdr->param_type) == SCTP_IPV4_ADDRESS) {
+ if ((*passoc == NULL) && (direction == SN_TO_LOCAL)) { /* AddIP with no association */
+ /* try look up with the ASCONF packet's alternative address */
+ ipv4addr.s_addr = ((struct sctp_ipv4addr_param *) param_hdr)->addr;
+ *passoc = FindSctpGlobal(la, ipv4addr, sctp_hdr->v_tag, sctp_hdr->src_port, sctp_hdr->dest_port, &partial_match);
+ }
+ param_hdr = (struct sctp_paramhdr *)
+ ((char *) param_hdr + sizeof(struct sctp_ipv4addr_param)); /*asconf's compulsory address parameter */
+ sm->chunk_length = chunk_length - sizeof(struct sctp_asconf_chunk) - sizeof(struct sctp_ipv4addr_param); /* rest of chunk */
+ } else {
+ if (chunk_length < (sizeof(struct sctp_asconf_chunk) + sizeof(struct sctp_ipv6addr_param))) /* malformed chunk*/
+ return(SN_PARSE_ERROR_CHHL);
+ param_hdr = (struct sctp_paramhdr *)
+ ((char *) param_hdr + sizeof(struct sctp_ipv6addr_param)); /*asconf's compulsory address parameter */
+ sm->chunk_length = chunk_length - sizeof(struct sctp_asconf_chunk) - sizeof(struct sctp_ipv6addr_param); /* rest of chunk */
+ }
+ sm->msg = SN_SCTP_ASCONF;
+ sm->sctpchnk.Asconf = param_hdr;
+
+ if (*passoc == NULL) { /* AddIP with no association */
+ *passoc = (struct sctp_nat_assoc *) sn_malloc(sizeof(struct sctp_nat_assoc));
+ if (*passoc == NULL) {/* out of resources */
+ return(SN_PARSE_ERROR_AS_MALLOC);
+ }
+ /* Initialise association - malloc initialises memory to zeros */
+ (*passoc)->state = SN_ID;
+ LIST_INIT(&((*passoc)->Gaddr)); /* always initialise to avoid memory problems */
+ (*passoc)->TableRegister = SN_NULL_TBL;
+ return(SN_PARSE_OK);
+ }
+ }
+ break;
+ case SCTP_ASCONF_ACK:
+ if (sm->msg > SN_SCTP_ASCONFACK) {
+ if (chunk_length < sizeof(struct sctp_asconf_ack_chunk)) /* malformed chunk*/
+ return(SN_PARSE_ERROR_CHHL);
+ //leave parameter searching to later, if required
+ param_hdr = (struct sctp_paramhdr *) ((char *) chunk_hdr
+ + sizeof(struct sctp_asconf_ack_chunk));
+ sm->msg = SN_SCTP_ASCONFACK;
+ sm->sctpchnk.Asconf = param_hdr;
+ sm->chunk_length = chunk_length - sizeof(struct sctp_asconf_ack_chunk);
+ }
+ break;
+ default:
+ break; /* do nothing*/
+ }
+
+ /* if no association is found exit - we need to find an Init or AddIP within sysctl_initialising_chunk_proc_limit */
+ if ((*passoc == NULL) && (chunk_count >= sysctl_initialising_chunk_proc_limit))
+ return(SN_PARSE_ERROR_LOOKUP);
+
+ /* finished with this chunk, on to the next chunk*/
+ bytes_left-= chunk_length;
+
+ /* Is this the end of the packet ? */
+ if (bytes_left == 0)
+ return (*passoc == NULL)?(SN_PARSE_ERROR_LOOKUP):(SN_PARSE_OK);
+
+ /* Are there enough bytes in packet to at least retrieve length of next chunk ? */
+ if (bytes_left < SN_MIN_CHUNK_SIZE)
+ return(SN_PARSE_ERROR_CHHL);
+
+ chunk_hdr = SN_SCTP_NEXTCHUNK(chunk_hdr);
+
+ /* Is the chunk long enough to not cause an endless loop and are there enough bytes in the packet to read the chunk ? */
+ chunk_length = SCTP_SIZE32(ntohs(chunk_hdr->chunk_length));
+ if ((chunk_length < SN_MIN_CHUNK_SIZE) || (chunk_length > bytes_left))
+ return(SN_PARSE_ERROR_CHHL);
+ if(++chunk_count > sysctl_chunk_proc_limit)
+ return(SN_PARSE_OK); /* limit for processing chunks, take what we get */
+ }
+
+ if (*passoc == NULL)
+ return (partial_match)?(SN_PARSE_ERROR_PARTIALLOOKUP):(SN_PARSE_ERROR_LOOKUP);
+ else
+ return(SN_PARSE_OK);
+}
+
+/** @ingroup packet_parser
+ * @brief Extract Vtags from Asconf Chunk
+ *
+ * GetAsconfVtags scans an Asconf Chunk for the vtags parameter, and then
+ * extracts the vtags.
+ *
+ * GetAsconfVtags is not called from within sctp_PktParser. It is called only
+ * from within ID_process when an AddIP has been received.
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param sm Pointer to sctp message information
+ * @param l_vtag Pointer to the local vtag in the association this SCTP Message belongs to
+ * @param g_vtag Pointer to the global vtag in the association this SCTP Message belongs to
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ *
+ * @return 1 - success | 0 - fail
+ */
+static int
+GetAsconfVtags(struct libalias *la, struct sctp_nat_msg *sm, uint32_t *l_vtag, uint32_t *g_vtag, int direction)
+{
+ /* To be removed when information is in the sctp headers */
+#define SCTP_VTAG_PARAM 0xC007
+ struct sctp_vtag_param {
+ struct sctp_paramhdr ph;/* type=SCTP_VTAG_PARAM */
+ uint32_t local_vtag;
+ uint32_t remote_vtag;
+ } __attribute__((packed));
+
+ struct sctp_vtag_param *vtag_param;
+ struct sctp_paramhdr *param;
+ int bytes_left;
+ int param_size;
+ int param_count;
+
+ param_count = 1;
+ param = sm->sctpchnk.Asconf;
+ param_size = SCTP_SIZE32(ntohs(param->param_length));
+ bytes_left = sm->chunk_length;
+ /* step through Asconf parameters */
+ while((bytes_left >= param_size) && (bytes_left >= SN_VTAG_PARAM_SIZE)) {
+ if (ntohs(param->param_type) == SCTP_VTAG_PARAM) {
+ vtag_param = (struct sctp_vtag_param *) param;
+ switch(direction) {
+ /* The Internet draft is a little ambiguous as to the order of these vtags.
+ We think it is this way around. If we are wrong, the order will need
+ to be changed. */
+ case SN_TO_GLOBAL:
+ *g_vtag = vtag_param->local_vtag;
+ *l_vtag = vtag_param->remote_vtag;
+ break;
+ case SN_TO_LOCAL:
+ *g_vtag = vtag_param->remote_vtag;
+ *l_vtag = vtag_param->local_vtag;
+ break;
+ }
+ return(1); /* found */
+ }
+
+ bytes_left -= param_size;
+ if (bytes_left < SN_MIN_PARAM_SIZE) return(0);
+
+ param = SN_SCTP_NEXTPARAM(param);
+ param_size = SCTP_SIZE32(ntohs(param->param_length));
+ if (++param_count > sysctl_param_proc_limit) {
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("Parameter parse limit exceeded (GetAsconfVtags)",
+ sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction));
+ return(0); /* not found limit exceeded*/
+ }
+ }
+ return(0); /* not found */
+}
+
+/** @ingroup packet_parser
+ * @brief AddGlobalIPAddresses from Init, InitAck, or AddIP packets
+ *
+ * AddGlobalIPAddresses scans an SCTP chunk (in sm) for Global IP addresses, and
+ * adds them.
+ *
+ * @param sm Pointer to sctp message information
+ * @param assoc Pointer to the association this SCTP Message belongs to
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ *
+ */
+static void
+AddGlobalIPAddresses(struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int direction)
+{
+ struct sctp_ipv4addr_param *ipv4_param;
+ struct sctp_paramhdr *param = NULL;
+ struct sctp_GlobalAddress *G_Addr;
+ struct in_addr g_addr = {0};
+ int bytes_left = 0;
+ int param_size;
+ int param_count, addr_param_count = 0;
+
+ switch(direction) {
+ case SN_TO_GLOBAL: /* does not contain global addresses */
+ g_addr = sm->ip_hdr->ip_dst;
+ bytes_left = 0; /* force exit */
+ break;
+ case SN_TO_LOCAL:
+ g_addr = sm->ip_hdr->ip_src;
+ param_count = 1;
+ switch(sm->msg) {
+ case SN_SCTP_INIT:
+ bytes_left = sm->chunk_length - sizeof(struct sctp_init_chunk);
+ param = (struct sctp_paramhdr *)((char *)sm->sctpchnk.Init + sizeof(struct sctp_init));
+ break;
+ case SN_SCTP_INITACK:
+ bytes_left = sm->chunk_length - sizeof(struct sctp_init_ack_chunk);
+ param = (struct sctp_paramhdr *)((char *)sm->sctpchnk.InitAck + sizeof(struct sctp_init_ack));
+ break;
+ case SN_SCTP_ASCONF:
+ bytes_left = sm->chunk_length;
+ param = sm->sctpchnk.Asconf;
+ break;
+ }
+ }
+ if (bytes_left >= SN_MIN_PARAM_SIZE)
+ param_size = SCTP_SIZE32(ntohs(param->param_length));
+ else
+ param_size = bytes_left+1; /* force skip loop */
+
+ if ((assoc->state == SN_ID) && ((sm->msg == SN_SCTP_INIT) || (bytes_left < SN_MIN_PARAM_SIZE))) {/* add pkt address */
+ G_Addr = (struct sctp_GlobalAddress *) sn_malloc(sizeof(struct sctp_GlobalAddress));
+ if (G_Addr == NULL) {/* out of resources */
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("AddGlobalIPAddress: No resources for adding global address - revert to no tracking",
+ sm->sctp_hdr->v_tag, 0, direction));
+ assoc->num_Gaddr = 0; /* don't track any more for this assoc*/
+ sysctl_track_global_addresses=0;
+ return;
+ }
+ G_Addr->g_addr = g_addr;
+ if (!Add_Global_Address_to_List(assoc, G_Addr))
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("AddGlobalIPAddress: Address already in list",
+ sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction));
+ }
+
+ /* step through parameters */
+ while((bytes_left >= param_size) && (bytes_left >= sizeof(struct sctp_ipv4addr_param))) {
+ if (assoc->num_Gaddr >= sysctl_track_global_addresses) {
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("AddGlobalIPAddress: Maximum Number of addresses reached",
+ sm->sctp_hdr->v_tag, sysctl_track_global_addresses, direction));
+ return;
+ }
+ switch(ntohs(param->param_type)) {
+ case SCTP_ADD_IP_ADDRESS:
+ /* skip to address parameter - leave param_size so bytes left will be calculated properly*/
+ param = (struct sctp_paramhdr *) &((struct sctp_asconf_addrv4_param *) param)->addrp;
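+ /* FALLTHROUGH: process the embedded address as SCTP_IPV4_ADDRESS */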
+ case SCTP_IPV4_ADDRESS:
+ ipv4_param = (struct sctp_ipv4addr_param *) param;
+ /* add addresses to association */
+ G_Addr = (struct sctp_GlobalAddress *) sn_malloc(sizeof(struct sctp_GlobalAddress));
+ if (G_Addr == NULL) {/* out of resources */
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("AddGlobalIPAddress: No resources for adding global address - revert to no tracking",
+ sm->sctp_hdr->v_tag, 0, direction));
+ assoc->num_Gaddr = 0; /* don't track any more for this assoc*/
+ sysctl_track_global_addresses=0;
+ return;
+ }
+ /* add address */
+ addr_param_count++;
+ if ((sm->msg == SN_SCTP_ASCONF) && (ipv4_param->addr == INADDR_ANY)) { /* use packet address */
+ G_Addr->g_addr = g_addr;
+ if (!Add_Global_Address_to_List(assoc, G_Addr))
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("AddGlobalIPAddress: Address already in list",
+ sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction));
+ return; /*shouldn't be any other addresses if the zero address is given*/
+ } else {
+ G_Addr->g_addr.s_addr = ipv4_param->addr;
+ if (!Add_Global_Address_to_List(assoc, G_Addr))
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("AddGlobalIPAddress: Address already in list",
+ sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction));
+ }
+ }
+
+ bytes_left -= param_size;
+ if (bytes_left < SN_MIN_PARAM_SIZE)
+ break;
+
+ param = SN_SCTP_NEXTPARAM(param);
+ param_size = SCTP_SIZE32(ntohs(param->param_length));
+ if (++param_count > sysctl_param_proc_limit) {
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("Parameter parse limit exceeded (AddGlobalIPAddress)",
+ sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction));
+ break; /* limit exceeded*/
+ }
+ }
+ if (addr_param_count == 0) {
+ SN_LOG(SN_LOG_DETAIL,
+ logsctperror("AddGlobalIPAddress: no address parameters to add",
+ sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction));
+ }
+}
+
+/**
+ * @brief Add_Global_Address_to_List
+ *
+ * Adds a global IP address to an associations address list, if it is not
+ * already there. The first address added is usually the packet's address, and
+ * is most likely to be used, so it is added at the beginning. Subsequent
+ * addresses are added after this one.
+ *
+ * @param assoc Pointer to the association this SCTP Message belongs to
+ * @param G_addr Pointer to the global address to add
+ *
+ * @return 1 - success | 0 - fail
+ */
+static int Add_Global_Address_to_List(struct sctp_nat_assoc *assoc, struct sctp_GlobalAddress *G_addr)
+{
+ struct sctp_GlobalAddress *iter_G_Addr = NULL, *first_G_Addr = NULL;
+ first_G_Addr = LIST_FIRST(&(assoc->Gaddr));
+ if (first_G_Addr == NULL) {
+ LIST_INSERT_HEAD(&(assoc->Gaddr), G_addr, list_Gaddr); /* add new address to beginning of list*/
+ } else {
+ LIST_FOREACH(iter_G_Addr, &(assoc->Gaddr), list_Gaddr) {
+ if (G_addr->g_addr.s_addr == iter_G_Addr->g_addr.s_addr)
+ return(0); /* already exists, so don't add */
+ }
+ LIST_INSERT_AFTER(first_G_Addr, G_addr, list_Gaddr); /* add new address after the first (packet) address*/
+ }
+ assoc->num_Gaddr++;
+ return(1); /* success */
+}
+
+/** @ingroup packet_parser
+ * @brief RmGlobalIPAddresses from DelIP packets
+ *
+ * RmGlobalIPAddresses scans an ASCONF chunk for DelIP parameters to remove the
+ * given Global IP addresses from the association. It will not delete an
+ * address if it is the only address in the list.
+ *
+ *
+ * @param sm Pointer to sctp message information
+ * @param assoc Pointer to the association this SCTP Message belongs to
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ *
+ */
+static void
+RmGlobalIPAddresses(struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int direction)
+{
+ struct sctp_asconf_addrv4_param *asconf_ipv4_param;
+ struct sctp_paramhdr *param;
+ struct sctp_GlobalAddress *G_Addr, *G_Addr_tmp;
+ struct in_addr g_addr;
+ int bytes_left;
+ int param_size;
+ int param_count;
+
+ if(direction == SN_TO_GLOBAL)
+ g_addr = sm->ip_hdr->ip_dst;
+ else
+ g_addr = sm->ip_hdr->ip_src;
+
+ bytes_left = sm->chunk_length;
+ param_count = 1;
+ param = sm->sctpchnk.Asconf;
+ if (bytes_left >= SN_MIN_PARAM_SIZE) {
+ param_size = SCTP_SIZE32(ntohs(param->param_length));
+ } else {
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("RmGlobalIPAddress: truncated packet - cannot remove IP addresses",
+ sm->sctp_hdr->v_tag, sysctl_track_global_addresses, direction));
+ return;
+ }
+
+ /* step through Asconf parameters */
+ while((bytes_left >= param_size) && (bytes_left >= sizeof(struct sctp_ipv4addr_param))) {
+ if (ntohs(param->param_type) == SCTP_DEL_IP_ADDRESS) {
+ asconf_ipv4_param = (struct sctp_asconf_addrv4_param *) param;
+ if (asconf_ipv4_param->addrp.addr == INADDR_ANY) { /* remove all bar pkt address */
+ LIST_FOREACH_SAFE(G_Addr, &(assoc->Gaddr), list_Gaddr, G_Addr_tmp) {
+ if(G_Addr->g_addr.s_addr != sm->ip_hdr->ip_src.s_addr) {
+ if (assoc->num_Gaddr > 1) { /* only delete if more than one */
+ LIST_REMOVE(G_Addr, list_Gaddr);
+ sn_free(G_Addr);
+ assoc->num_Gaddr--;
+ } else {
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("RmGlobalIPAddress: Request to remove last IP address (didn't)",
+ sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction));
+ }
+ }
+ }
+ return; /*shouldn't be any other addresses if the zero address is given*/
+ } else {
+ LIST_FOREACH_SAFE(G_Addr, &(assoc->Gaddr), list_Gaddr, G_Addr_tmp) {
+ if(G_Addr->g_addr.s_addr == asconf_ipv4_param->addrp.addr) {
+ if (assoc->num_Gaddr > 1) { /* only delete if more than one */
+ LIST_REMOVE(G_Addr, list_Gaddr);
+ sn_free(G_Addr);
+ assoc->num_Gaddr--;
+ break; /* Since add only adds new addresses, there should be no double entries */
+ } else {
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("RmGlobalIPAddress: Request to remove last IP address (didn't)",
+ sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction));
+ }
+ }
+ }
+ }
+ }
+ bytes_left -= param_size;
+ if (bytes_left == 0) return;
+ else if (bytes_left < SN_MIN_PARAM_SIZE) {
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("RmGlobalIPAddress: truncated packet - may not have removed all IP addresses",
+ sm->sctp_hdr->v_tag, sysctl_track_global_addresses, direction));
+ return;
+ }
+
+ param = SN_SCTP_NEXTPARAM(param);
+ param_size = SCTP_SIZE32(ntohs(param->param_length));
+ if (++param_count > sysctl_param_proc_limit) {
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("Parameter parse limit exceeded (RmGlobalIPAddress)",
+ sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction));
+ return; /* limit exceeded*/
+ }
+ }
+}
+
+/** @ingroup packet_parser
+ * @brief Check that ASCONF was successful
+ *
+ * Each ASCONF configuration parameter carries a correlation ID which should be
+ * matched with an ASCONFack. This is difficult for a NAT, since every
+ * association could potentially have a number of outstanding ASCONF
+ * configuration parameters, which should only be activated on receipt of the
+ * ACK.
+ *
+ * Currently we only look for an ACK when the NAT is setting up a new
+ * association (ie AddIP for a connection that the NAT does not know about
+ * because the original Init went through a public interface or another NAT)
+ * Since there is currently no connection on this path, there should be no other
+ * ASCONF configuration parameters outstanding, so we presume that if there is
+ * an ACK that it is responding to the AddIP and activate the new association.
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param sm Pointer to sctp message information
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ *
+ * @return 1 - success | 0 - fail
+ */
+static int
+IsASCONFack(struct libalias *la, struct sctp_nat_msg *sm, int direction)
+{
+ struct sctp_paramhdr *param;
+ int bytes_left;
+ int param_size;
+ int param_count;
+
+ param_count = 1;
+ param = sm->sctpchnk.Asconf;
+ param_size = SCTP_SIZE32(ntohs(param->param_length));
+ if (param_size == 8)
+ return(1); /*success - default acknowledgement of everything */
+
+ bytes_left = sm->chunk_length;
+ if (bytes_left < param_size)
+ return(0); /* not found */
+ /* step through Asconf parameters */
+ while(bytes_left >= SN_ASCONFACK_PARAM_SIZE) {
+ if (ntohs(param->param_type) == SCTP_SUCCESS_REPORT)
+ return(1); /* success - but can't match correlation IDs - should only be one */
+ /* check others just in case */
+ bytes_left -= param_size;
+ if (bytes_left >= SN_MIN_PARAM_SIZE) {
+ param = SN_SCTP_NEXTPARAM(param);
+ } else {
+ return(0);
+ }
+ param_size = SCTP_SIZE32(ntohs(param->param_length));
+ if (bytes_left < param_size) return(0);
+
+ if (++param_count > sysctl_param_proc_limit) {
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("Parameter parse limit exceeded (IsASCONFack)",
+ sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction));
+ return(0); /* not found limit exceeded*/
+ }
+ }
+ return(0); /* not success */
+}
+
+/** @ingroup packet_parser
+ * @brief Check to see if ASCONF contains an Add IP or Del IP parameter
+ *
+ * IsADDorDEL scans an ASCONF packet to see if it contains an AddIP or DelIP
+ * parameter
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param sm Pointer to sctp message information
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ *
+ * @return SCTP_ADD_IP_ADDRESS | SCTP_DEL_IP_ADDRESS | 0 - fail
+ */
+static int
+IsADDorDEL(struct libalias *la, struct sctp_nat_msg *sm, int direction)
+{
+ struct sctp_paramhdr *param;
+ int bytes_left;
+ int param_size;
+ int param_count;
+
+ param_count = 1;
+ param = sm->sctpchnk.Asconf;
+ param_size = SCTP_SIZE32(ntohs(param->param_length));
+
+ bytes_left = sm->chunk_length;
+ if (bytes_left < param_size)
+ return(0); /* not found */
+ /* step through Asconf parameters */
+ while(bytes_left >= SN_ASCONFACK_PARAM_SIZE) {
+ if (ntohs(param->param_type) == SCTP_ADD_IP_ADDRESS)
+ return(SCTP_ADD_IP_ADDRESS);
+ else if (ntohs(param->param_type) == SCTP_DEL_IP_ADDRESS)
+ return(SCTP_DEL_IP_ADDRESS);
+ /* check others just in case */
+ bytes_left -= param_size;
+ if (bytes_left >= SN_MIN_PARAM_SIZE) {
+ param = SN_SCTP_NEXTPARAM(param);
+ } else {
+ return(0); /*Neither found */
+ }
+ param_size = SCTP_SIZE32(ntohs(param->param_length));
+ if (bytes_left < param_size) return(0);
+
+ if (++param_count > sysctl_param_proc_limit) {
+ SN_LOG(SN_LOG_EVENT,
+ logsctperror("Parameter parse limit exceeded IsADDorDEL)",
+ sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction));
+ return(0); /* not found limit exceeded*/
+ }
+ }
+ return(0); /*Neither found */
+}
+
+/* ----------------------------------------------------------------------
+ * STATE MACHINE CODE
+ * ----------------------------------------------------------------------
+ */
+/** @addtogroup state_machine
+ *
+ * The SCTP NAT State Machine functions will:
+ * - Process an already parsed packet
+ * - Use the existing NAT Hash Tables
+ * - Determine the next state for the association
+ * - Update the NAT Hash Tables and Timer Queues
+ * - Return the appropriate action to take with the packet
+ */
+/** @ingroup state_machine
+ * @brief Process SCTP message
+ *
+ * This function is the base state machine. It calls the processing engine for
+ * each state.
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ * @param sm Pointer to sctp message information
+ * @param assoc Pointer to the association this SCTP Message belongs to
+ *
+ * @return SN_DROP_PKT | SN_NAT_PKT | SN_REPLY_ABORT | SN_REPLY_ERROR | SN_PROCESSING_ERROR
+ */
+static int
+ProcessSctpMsg(struct libalias *la, int direction, struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc)
+{
+ int rtnval;
+
+ switch (assoc->state) {
+ case SN_ID: /* Idle */
+ rtnval = ID_process(la, direction, assoc, sm);
+ if (rtnval != SN_NAT_PKT) {
+ assoc->state = SN_RM;/* Mark for removal*/
+ }
+ return(rtnval);
+ case SN_INi: /* Initialising - Init */
+ return(INi_process(la, direction, assoc, sm));
+ case SN_INa: /* Initialising - AddIP */
+ return(INa_process(la, direction, assoc, sm));
+ case SN_UP: /* Association UP */
+ return(UP_process(la, direction, assoc, sm));
+ case SN_CL: /* Association Closing */
+ return(CL_process(la, direction, assoc, sm));
+ }
+ return(SN_PROCESSING_ERROR);
+}
+
+/** @ingroup state_machine
+ * @brief Process SCTP message while in the Idle state
+ *
+ * This function looks for an Incoming INIT or AddIP message.
+ *
+ * All other SCTP messages are invalid when in SN_ID, and are dropped.
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ * @param sm Pointer to sctp message information
+ * @param assoc Pointer to the association this SCTP Message belongs to
+ *
+ * @return SN_NAT_PKT | SN_DROP_PKT | SN_REPLY_ABORT | SN_REPLY_ERROR
+ */
+static int
+ID_process(struct libalias *la, int direction, struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm)
+{
+ switch(sm->msg) {
+ case SN_SCTP_ASCONF: /* a packet containing an ASCONF chunk with ADDIP */
+ if (!sysctl_accept_global_ootb_addip && (direction == SN_TO_LOCAL))
+ return(SN_DROP_PKT);
+ /* if this Asconf packet does not contain the Vtag parameters it is of no use in Idle state */
+ if (!GetAsconfVtags(la, sm, &(assoc->l_vtag), &(assoc->g_vtag), direction))
+ return(SN_DROP_PKT);
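+ /* FALLTHROUGH: a valid AddIP in the Idle state is handled like an Init */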
+ case SN_SCTP_INIT: /* a packet containing an INIT chunk or an ASCONF AddIP */
+ if (sysctl_track_global_addresses)
+ AddGlobalIPAddresses(sm, assoc, direction);
+ switch(direction){
+ case SN_TO_GLOBAL:
+ assoc->l_addr = sm->ip_hdr->ip_src;
+ assoc->a_addr = FindAliasAddress(la, assoc->l_addr);
+ assoc->l_port = sm->sctp_hdr->src_port;
+ assoc->g_port = sm->sctp_hdr->dest_port;
+ if(sm->msg == SN_SCTP_INIT)
+ assoc->g_vtag = sm->sctpchnk.Init->initiate_tag;
+ if (AddSctpAssocGlobal(la, assoc)) /* DB clash *///**** need to add dst address
+ return((sm->msg == SN_SCTP_INIT) ? SN_REPLY_ABORT : SN_REPLY_ERROR);
+ if(sm->msg == SN_SCTP_ASCONF) {
+ if (AddSctpAssocLocal(la, assoc, sm->ip_hdr->ip_dst)) /* DB clash */
+ return(SN_REPLY_ERROR);
+ assoc->TableRegister |= SN_WAIT_TOLOCAL; /* wait for tolocal ack */
+ }
+ break;
+ case SN_TO_LOCAL:
+ assoc->l_addr = FindSctpRedirectAddress(la, sm);
+ assoc->a_addr = sm->ip_hdr->ip_dst;
+ assoc->l_port = sm->sctp_hdr->dest_port;
+ assoc->g_port = sm->sctp_hdr->src_port;
+ if(sm->msg == SN_SCTP_INIT)
+ assoc->l_vtag = sm->sctpchnk.Init->initiate_tag;
+ if (AddSctpAssocLocal(la, assoc, sm->ip_hdr->ip_src)) /* DB clash */
+ return((sm->msg == SN_SCTP_INIT) ? SN_REPLY_ABORT : SN_REPLY_ERROR);
+ if(sm->msg == SN_SCTP_ASCONF) {
+ if (AddSctpAssocGlobal(la, assoc)) /* DB clash */ //**** need to add src address
+ return(SN_REPLY_ERROR);
+ assoc->TableRegister |= SN_WAIT_TOGLOBAL; /* wait for toglobal ack */
+ }
+ break;
+ }
+ assoc->state = (sm->msg == SN_SCTP_INIT) ? SN_INi : SN_INa;
+ assoc->exp = SN_I_T(la);
+ sctp_AddTimeOut(la,assoc);
+ return(SN_NAT_PKT);
+ default: /* Any other type of SCTP message is not valid in Idle */
+ return(SN_DROP_PKT);
+ }
+return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */
+}
+
+/** @ingroup state_machine
+ * @brief Process SCTP message while waiting for an INIT-ACK message
+ *
+ * Only an INIT-ACK, resent INIT, or an ABORT SCTP packet are valid in this
+ * state, all other packets are dropped.
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ * @param sm Pointer to sctp message information
+ * @param assoc Pointer to the association this SCTP Message belongs to
+ *
+ * @return SN_NAT_PKT | SN_DROP_PKT | SN_REPLY_ABORT
+ */
+static int
+INi_process(struct libalias *la, int direction, struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm)
+{
+ switch(sm->msg) {
+ case SN_SCTP_INIT: /* a packet containing a retransmitted INIT chunk */
+ sctp_ResetTimeOut(la, assoc, SN_I_T(la));
+ return(SN_NAT_PKT);
+ case SN_SCTP_INITACK: /* a packet containing an INIT-ACK chunk */
+ switch(direction){
+ case SN_TO_LOCAL:
+ if (assoc->num_Gaddr) /*If tracking global addresses for this association */
+ AddGlobalIPAddresses(sm, assoc, direction);
+ assoc->l_vtag = sm->sctpchnk.Init->initiate_tag;
+ if (AddSctpAssocLocal(la, assoc, sm->ip_hdr->ip_src)) { /* DB clash */
+ assoc->state = SN_RM;/* Mark for removal*/
+ return(SN_SEND_ABORT);
+ }
+ break;
+ case SN_TO_GLOBAL:
+ assoc->l_addr = sm->ip_hdr->ip_src; // Only if not set in Init! *
+ assoc->g_vtag = sm->sctpchnk.Init->initiate_tag;
+ if (AddSctpAssocGlobal(la, assoc)) { /* DB clash */
+ assoc->state = SN_RM;/* Mark for removal*/
+ return(SN_SEND_ABORT);
+ }
+ break;
+ }
+ assoc->state = SN_UP;/* association established for NAT */
+ sctp_ResetTimeOut(la,assoc, SN_U_T(la));
+ return(SN_NAT_PKT);
+ case SN_SCTP_ABORT: /* a packet containing an ABORT chunk */
+ assoc->state = SN_RM;/* Mark for removal*/
+ return(SN_NAT_PKT);
+ default:
+ return(SN_DROP_PKT);
+ }
+ return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */
+}
+
+/** @ingroup state_machine
+ * @brief Process SCTP message while waiting for an AddIp-ACK message
+ *
+ * Only an AddIP-ACK, resent AddIP, or an ABORT message are valid, all other
+ * SCTP packets are dropped
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ * @param sm Pointer to sctp message information
+ * @param assoc Pointer to the association this SCTP Message belongs to
+ *
+ * @return SN_NAT_PKT | SN_DROP_PKT
+ */
+static int
+INa_process(struct libalias *la, int direction,struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm)
+{
+ switch(sm->msg) {
+ case SN_SCTP_ASCONF: /* a packet containing an ASCONF chunk*/
+ sctp_ResetTimeOut(la,assoc, SN_I_T(la));
+ return(SN_NAT_PKT);
+ case SN_SCTP_ASCONFACK: /* a packet containing an ASCONF chunk with a ADDIP-ACK */
+ switch(direction){
+ case SN_TO_LOCAL:
+ if (!(assoc->TableRegister & SN_WAIT_TOLOCAL)) /* wrong direction */
+ return(SN_DROP_PKT);
+ break;
+ case SN_TO_GLOBAL:
+ if (!(assoc->TableRegister & SN_WAIT_TOGLOBAL)) /* wrong direction */
+ return(SN_DROP_PKT);
+ }
+ if (IsASCONFack(la,sm,direction)) {
+ assoc->TableRegister &= SN_BOTH_TBL; /* remove wait flags */
+ assoc->state = SN_UP; /* association established for NAT */
+ sctp_ResetTimeOut(la,assoc, SN_U_T(la));
+ return(SN_NAT_PKT);
+ } else {
+ assoc->state = SN_RM;/* Mark for removal*/
+ return(SN_NAT_PKT);
+ }
+ case SN_SCTP_ABORT: /* a packet containing an ABORT chunk */
+ assoc->state = SN_RM;/* Mark for removal*/
+ return(SN_NAT_PKT);
+ default:
+ return(SN_DROP_PKT);
+ }
+ return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */
+}
+
+/** @ingroup state_machine
+ * @brief Process SCTP messages while association is UP redirecting packets
+ *
+ * While in the SN_UP state, all packets for the particular association
+ * are passed. Only a SHUT-ACK or an ABORT will cause a change of state.
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ * @param sm Pointer to sctp message information
+ * @param assoc Pointer to the association this SCTP Message belongs to
+ *
+ * @return SN_NAT_PKT | SN_DROP_PKT
+ */
+static int
+UP_process(struct libalias *la, int direction, struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm)
+{
+ switch(sm->msg) {
+ case SN_SCTP_SHUTACK: /* a packet containing a SHUTDOWN-ACK chunk */
+ assoc->state = SN_CL;
+ sctp_ResetTimeOut(la,assoc, SN_C_T(la));
+ return(SN_NAT_PKT);
+ case SN_SCTP_ABORT: /* a packet containing an ABORT chunk */
+ assoc->state = SN_RM;/* Mark for removal*/
+ return(SN_NAT_PKT);
+ case SN_SCTP_ASCONF: /* a packet containing an ASCONF chunk*/
+ if ((direction == SN_TO_LOCAL) && assoc->num_Gaddr) /*If tracking global addresses for this association & from global side */
+ switch(IsADDorDEL(la,sm,direction)) {
+ case SCTP_ADD_IP_ADDRESS:
+ AddGlobalIPAddresses(sm, assoc, direction);
+ break;
+ case SCTP_DEL_IP_ADDRESS:
+ RmGlobalIPAddresses(sm, assoc, direction);
+ break;
+ } /* fall through to default */
+ default:
+ sctp_ResetTimeOut(la,assoc, SN_U_T(la));
+ return(SN_NAT_PKT); /* forward packet */
+ }
+ return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */
+}
+
+/** @ingroup state_machine
+ * @brief Process SCTP message while association is in the process of closing
+ *
+ * This function waits for a SHUT-COMP to close the association. Depending on
+ * the the setting of sysctl_holddown_timer it may not remove the association
+ * immediately, but leave it up until SN_X_T(la). Only SHUT-COMP, SHUT-ACK, and
+ * ABORT packets are permitted in this state. All other packets are dropped.
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param direction SN_TO_LOCAL | SN_TO_GLOBAL
+ * @param sm Pointer to sctp message information
+ * @param assoc Pointer to the association this SCTP Message belongs to
+ *
+ * @return SN_NAT_PKT | SN_DROP_PKT
+ */
+static int
+CL_process(struct libalias *la, int direction,struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm)
+{
+ switch(sm->msg) {
+ case SN_SCTP_SHUTCOMP: /* a packet containing a SHUTDOWN-COMPLETE chunk */
+ assoc->state = SN_CL; /* Stay in Close state until timeout */
+ if (sysctl_holddown_timer > 0)
+ sctp_ResetTimeOut(la, assoc, SN_X_T(la));/* allow to stay open for Tbit packets*/
+ else
+ assoc->state = SN_RM;/* Mark for removal*/
+ return(SN_NAT_PKT);
+ case SN_SCTP_SHUTACK: /* a packet containing a SHUTDOWN-ACK chunk */
+ assoc->state = SN_CL; /* Stay in Close state until timeout */
+ sctp_ResetTimeOut(la, assoc, SN_C_T(la));
+ return(SN_NAT_PKT);
+ case SN_SCTP_ABORT: /* a packet containing an ABORT chunk */
+ assoc->state = SN_RM;/* Mark for removal*/
+ return(SN_NAT_PKT);
+ default:
+ return(SN_DROP_PKT);
+ }
+ return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */
+}
+
+/* ----------------------------------------------------------------------
+ * HASH TABLE CODE
+ * ----------------------------------------------------------------------
+ */
+/** @addtogroup Hash
+ *
+ * The Hash functions facilitate searching the NAT Hash Tables for associations
+ * as well as adding/removing associations from the table(s).
+ */
+/** @ingroup Hash
+ * @brief Find the SCTP association given the local address, port and vtag
+ *
+ * Searches the local look-up table for the association entry matching the
+ * provided local <address:ports:vtag> tuple
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param l_addr local address
+ * @param g_addr global address
+ * @param l_vtag local Vtag
+ * @param l_port local Port
+ * @param g_port global Port
+ *
+ * @return pointer to association or NULL
+ */
+static struct sctp_nat_assoc*
+FindSctpLocal(struct libalias *la, struct in_addr l_addr, struct in_addr g_addr, uint32_t l_vtag, uint16_t l_port, uint16_t g_port)
+{
+ u_int i;
+ struct sctp_nat_assoc *assoc = NULL;
+ struct sctp_GlobalAddress *G_Addr = NULL;
+
+ if (l_vtag != 0) { /* an init packet, vtag==0 */
+ i = SN_TABLE_HASH(l_vtag, l_port, la->sctpNatTableSize);
+ LIST_FOREACH(assoc, &la->sctpTableLocal[i], list_L) {
+ if ((assoc->l_vtag == l_vtag) && (assoc->l_port == l_port) && (assoc->g_port == g_port)\
+ && (assoc->l_addr.s_addr == l_addr.s_addr)) {
+ if (assoc->num_Gaddr) {
+ LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) {
+ if(G_Addr->g_addr.s_addr == g_addr.s_addr)
+ return(assoc);
+ }
+ } else {
+ return(assoc);
+ }
+ }
+ }
+ }
+ return(NULL);
+}
+
+/** @ingroup Hash
+ * @brief Check for Global Clash
+ *
+ * Searches the global look-up table for the association entry matching the
+ * provided global <(addresses):ports:vtag> tuple
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param Cassoc association being checked for a clash
+ *
+ * @return pointer to association or NULL
+ */
+static struct sctp_nat_assoc*
+FindSctpGlobalClash(struct libalias *la, struct sctp_nat_assoc *Cassoc)
+{
+ u_int i;
+ struct sctp_nat_assoc *assoc = NULL;
+ struct sctp_GlobalAddress *G_Addr = NULL;
+ struct sctp_GlobalAddress *G_AddrC = NULL;
+
+ if (Cassoc->g_vtag != 0) { /* an init packet, vtag==0 */
+ i = SN_TABLE_HASH(Cassoc->g_vtag, Cassoc->g_port, la->sctpNatTableSize);
+ LIST_FOREACH(assoc, &la->sctpTableGlobal[i], list_G) {
+ if ((assoc->g_vtag == Cassoc->g_vtag) && (assoc->g_port == Cassoc->g_port) && (assoc->l_port == Cassoc->l_port)) {
+ if (assoc->num_Gaddr) {
+ LIST_FOREACH(G_AddrC, &(Cassoc->Gaddr), list_Gaddr) {
+ LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) {
+ if(G_Addr->g_addr.s_addr == G_AddrC->g_addr.s_addr)
+ return(assoc);
+ }
+ }
+ } else {
+ return(assoc);
+ }
+ }
+ }
+ }
+ return(NULL);
+}
+
+/** @ingroup Hash
+ * @brief Find the SCTP association given the global port and vtag
+ *
+ * Searches the global look-up table for the association entry matching the
+ * provided global <address:ports:vtag> tuple
+ *
+ * If all but the global address match it sets partial_match to 1 to indicate a
+ * partial match. If the NAT is tracking global IP addresses for this
+ * association, the NAT may respond with an ERRORM to request the missing
+ * address to be added.
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param g_addr global address
+ * @param g_vtag global vtag
+ * @param g_port global port
+ * @param l_port local port
+ *
+ * @return pointer to association or NULL
+ */
+static struct sctp_nat_assoc*
+FindSctpGlobal(struct libalias *la, struct in_addr g_addr, uint32_t g_vtag, uint16_t g_port, uint16_t l_port, int *partial_match)
+{
+ u_int i;
+ struct sctp_nat_assoc *assoc = NULL;
+ struct sctp_GlobalAddress *G_Addr = NULL;
+
+ *partial_match = 0;
+ if (g_vtag != 0) { /* an init packet, vtag==0 */
+ i = SN_TABLE_HASH(g_vtag, g_port, la->sctpNatTableSize);
+ LIST_FOREACH(assoc, &la->sctpTableGlobal[i], list_G) {
+ if ((assoc->g_vtag == g_vtag) && (assoc->g_port == g_port) && (assoc->l_port == l_port)) {
+ *partial_match = 1;
+ if (assoc->num_Gaddr) {
+ LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) {
+ if(G_Addr->g_addr.s_addr == g_addr.s_addr)
+ return(assoc);
+ }
+ } else {
+ return(assoc);
+ }
+ }
+ }
+ }
+ return(NULL);
+}
+
+/** @ingroup Hash
+ * @brief Find the SCTP association for a T-Flag message (given the global port and local vtag)
+ *
+ * Searches the local look-up table for a unique association entry matching the
+ * provided global port and local vtag information
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param g_addr global address
+ * @param l_vtag local Vtag
+ * @param g_port global Port
+ * @param l_port local Port
+ *
+ * @return pointer to association or NULL
+ */
+static struct sctp_nat_assoc*
+FindSctpLocalT(struct libalias *la, struct in_addr g_addr, uint32_t l_vtag, uint16_t g_port, uint16_t l_port)
+{
+ u_int i;
+ struct sctp_nat_assoc *assoc = NULL, *lastmatch = NULL;
+ struct sctp_GlobalAddress *G_Addr = NULL;
+ int cnt = 0;
+
+ if (l_vtag != 0) { /* an init packet, vtag==0 */
+ i = SN_TABLE_HASH(l_vtag, g_port, la->sctpNatTableSize);
+ LIST_FOREACH(assoc, &la->sctpTableGlobal[i], list_G) {
+ if ((assoc->g_vtag == l_vtag) && (assoc->g_port == g_port) && (assoc->l_port == l_port)) {
+ if (assoc->num_Gaddr) {
+ LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) {
+ if(G_Addr->g_addr.s_addr == g_addr.s_addr)
+ return(assoc); /* full match */
+ }
+ } else {
+ if (++cnt > 1) return(NULL);
+ lastmatch = assoc;
+ }
+ }
+ }
+ }
+ /* If there is more than one match we do not know which local address to send to */
+ return( cnt ? lastmatch : NULL );
+}
+
+/** @ingroup Hash
+ * @brief Find the SCTP association for a T-Flag message (given the local port and global vtag)
+ *
+ * Searches the global look-up table for a unique association entry matching the
+ * provided local port and global vtag information
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param g_addr global address
+ * @param g_vtag global vtag
+ * @param l_port local port
+ * @param g_port global port
+ *
+ * @return pointer to association or NULL
+ */
+static struct sctp_nat_assoc*
+FindSctpGlobalT(struct libalias *la, struct in_addr g_addr, uint32_t g_vtag, uint16_t l_port, uint16_t g_port)
+{
+ u_int i;
+ struct sctp_nat_assoc *assoc = NULL;
+ struct sctp_GlobalAddress *G_Addr = NULL;
+
+ if (g_vtag != 0) { /* an init packet, vtag==0 */
+ i = SN_TABLE_HASH(g_vtag, l_port, la->sctpNatTableSize);
+ LIST_FOREACH(assoc, &la->sctpTableLocal[i], list_L) {
+ if ((assoc->l_vtag == g_vtag) && (assoc->l_port == l_port) && (assoc->g_port == g_port)) {
+ if (assoc->num_Gaddr) {
+ LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) {
+ if(G_Addr->g_addr.s_addr == g_addr.s_addr)
+ return(assoc);
+ }
+ } else {
+ return(assoc);
+ }
+ }
+ }
+ }
+ return(NULL);
+}
+
+/** @ingroup Hash
+ * @brief Add the sctp association information to the local look up table
+ *
+ * Searches the local look-up table for an existing association with the same
+ * details. If a match exists and is ONLY in the local look-up table then this
+ * is a repeated INIT packet, we need to remove this association from the
+ * look-up table and add the new association
+ *
+ * The new association is added to the head of the list and state is updated
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param assoc pointer to sctp association
+ * @param g_addr global address
+ *
+ * @return SN_ADD_OK | SN_ADD_CLASH
+ */
+static int
+AddSctpAssocLocal(struct libalias *la, struct sctp_nat_assoc *assoc, struct in_addr g_addr)
+{
+ struct sctp_nat_assoc *found;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ found = FindSctpLocal(la, assoc->l_addr, g_addr, assoc->l_vtag, assoc->l_port, assoc->g_port);
+ /*
+ * Note that if a different global address initiated this Init,
+ * i.e. it wasn't resent as presumed:
+ * - the local receiver if receiving it for the first time will establish
+ * an association with the new global host
+ * - if receiving an init from a different global address after sending a
+ * lost initack it will send an initack to the new global host, the first
+ * association attempt will then be blocked if retried.
+ */
+ if (found != NULL) {
+ if ((found->TableRegister == SN_LOCAL_TBL) && (found->g_port == assoc->g_port)) { /* resent message */
+ RmSctpAssoc(la, found);
+ sctp_RmTimeOut(la, found);
+ freeGlobalAddressList(found);
+ sn_free(found);
+ } else
+ return(SN_ADD_CLASH);
+ }
+
+ LIST_INSERT_HEAD(&la->sctpTableLocal[SN_TABLE_HASH(assoc->l_vtag, assoc->l_port, la->sctpNatTableSize)],
+ assoc, list_L);
+ assoc->TableRegister |= SN_LOCAL_TBL;
+ la->sctpLinkCount++; //increment link count
+
+ if (assoc->TableRegister == SN_BOTH_TBL) {
+ /* libalias log -- controlled by libalias */
+ if (la->packetAliasMode & PKT_ALIAS_LOG)
+ SctpShowAliasStats(la);
+
+ SN_LOG(SN_LOG_INFO, logsctpassoc(assoc, "^"));
+ }
+
+ return(SN_ADD_OK);
+}
+
+/** @ingroup Hash
+ * @brief Add the sctp association information to the global look up table
+ *
+ * Searches the global look-up table for an existing association with the same
+ * details. If a match exists and is ONLY in the global look-up table then this
+ * is a repeated INIT packet, we need to remove this association from the
+ * look-up table and add the new association
+ *
+ * The new association is added to the head of the list and state is updated
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param assoc pointer to sctp association
+ *
+ * @return SN_ADD_OK | SN_ADD_CLASH
+ */
+static int
+AddSctpAssocGlobal(struct libalias *la, struct sctp_nat_assoc *assoc)
+{
+ struct sctp_nat_assoc *found;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ found = FindSctpGlobalClash(la, assoc);
+ if (found != NULL) {
+ if ((found->TableRegister == SN_GLOBAL_TBL) && \
+ (found->l_addr.s_addr == assoc->l_addr.s_addr) && (found->l_port == assoc->l_port)) { /* resent message */
+ RmSctpAssoc(la, found);
+ sctp_RmTimeOut(la, found);
+ freeGlobalAddressList(found);
+ sn_free(found);
+ } else
+ return(SN_ADD_CLASH);
+ }
+
+ LIST_INSERT_HEAD(&la->sctpTableGlobal[SN_TABLE_HASH(assoc->g_vtag, assoc->g_port, la->sctpNatTableSize)],
+ assoc, list_G);
+ assoc->TableRegister |= SN_GLOBAL_TBL;
+ la->sctpLinkCount++; //increment link count
+
+ if (assoc->TableRegister == SN_BOTH_TBL) {
+ /* libalias log -- controlled by libalias */
+ if (la->packetAliasMode & PKT_ALIAS_LOG)
+ SctpShowAliasStats(la);
+
+ SN_LOG(SN_LOG_INFO, logsctpassoc(assoc, "^"));
+ }
+
+ return(SN_ADD_OK);
+}
+
+/** @ingroup Hash
+ * @brief Remove the sctp association information from the look up table
+ *
+ * For each of the two (local/global) look-up tables, remove the association
+ * from that table IF it has been registered in that table.
+ *
+ * NOTE: The calling code is responsible for freeing memory allocated to the
+ * association structure itself
+ *
+ * NOTE: The association is NOT removed from the timer queue
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param assoc pointer to sctp association
+ */
+static void
+RmSctpAssoc(struct libalias *la, struct sctp_nat_assoc *assoc)
+{
+ // struct sctp_nat_assoc *found;
+ if (assoc == NULL) {
+ /* very bad, log and die*/
+ SN_LOG(SN_LOG_LOW,
+ logsctperror("ERROR: alias_sctp:RmSctpAssoc(NULL)\n", 0, 0, SN_TO_NODIR));
+ return;
+ }
+ /* log if association is fully up and now closing */
+ if (assoc->TableRegister == SN_BOTH_TBL) {
+ SN_LOG(SN_LOG_INFO, logsctpassoc(assoc, "$"));
+ }
+ LIBALIAS_LOCK_ASSERT(la);
+ if (assoc->TableRegister & SN_LOCAL_TBL) {
+ assoc->TableRegister ^= SN_LOCAL_TBL;
+ la->sctpLinkCount--; //decrement link count
+ LIST_REMOVE(assoc, list_L);
+ }
+
+ if (assoc->TableRegister & SN_GLOBAL_TBL) {
+ assoc->TableRegister ^= SN_GLOBAL_TBL;
+ la->sctpLinkCount--; //decrement link count
+ LIST_REMOVE(assoc, list_G);
+ }
+ // sn_free(assoc); //Don't remove now, remove if needed later
+ /* libalias logging -- controlled by libalias log definition */
+ if (la->packetAliasMode & PKT_ALIAS_LOG)
+ SctpShowAliasStats(la);
+}
+
+/**
+ * @ingroup Hash
+ * @brief free the Global Address List memory
+ *
+ * freeGlobalAddressList deletes all global IP addresses in an associations
+ * global IP address list.
+ *
+ * @param assoc
+ */
+static void freeGlobalAddressList(struct sctp_nat_assoc *assoc)
+{
+ struct sctp_GlobalAddress *gaddr1=NULL,*gaddr2=NULL;
+ /*free global address list*/
+ gaddr1 = LIST_FIRST(&(assoc->Gaddr));
+ while (gaddr1 != NULL) {
+ gaddr2 = LIST_NEXT(gaddr1, list_Gaddr);
+ sn_free(gaddr1);
+ gaddr1 = gaddr2;
+ }
+}
+/* ----------------------------------------------------------------------
+ * TIMER QUEUE CODE
+ * ----------------------------------------------------------------------
+ */
+/** @addtogroup Timer
+ *
+ * The timer queue management functions are designed to operate efficiently with
+ * a minimum of interaction with the queues.
+ *
+ * Once a timeout is set in the queue it will not be altered in the queue unless
+ * it has to be changed to a shorter time (usually only for aborts and closing).
+ * On a queue timeout, the real expiry time is checked, and if it has not yet
+ * been reached the entry is requeued (O(1)) at its later time. This is especially important
+ * for normal packets sent during an association. When a timer expires, it is
+ * updated to its new expiration time if necessary, or processed as a
+ * timeout. This means that while in UP state, the timing queue is only altered
+ * every U_T (every few minutes) for a particular association.
+ */
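+/*
+ * A minimal sketch of the lazy-requeue pattern described above, in terms of
+ * the helpers defined below (newexp here stands for whatever recomputed
+ * expiry time the caller has in hand, e.g. la->timeStamp plus the UP-state
+ * timeout U_T):
+ *
+ *	sctp_ResetTimeOut(la, assoc, newexp);	// touches the queue only when
+ *						// newexp is earlier than assoc->exp
+ *
+ *	// later, when the association's queue slot fires (see sctp_CheckTimers()):
+ *	if (la->timeStamp >= assoc->exp)
+ *		RmSctpAssoc(la, assoc);		// really expired
+ *	else
+ *		sctp_AddTimeOut(la, assoc);	// requeued at its correct, later slot
+ */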
+/** @ingroup Timer
+ * @brief Add an association timeout to the timer queue
+ *
+ * Determine the location in the queue to add the timeout and insert the
+ * association into the list at that queue position
+ *
+ * @param la
+ * @param assoc
+ */
+static void
+sctp_AddTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc)
+{
+ int add_loc;
+ LIBALIAS_LOCK_ASSERT(la);
+ add_loc = assoc->exp - la->sctpNatTimer.loc_time + la->sctpNatTimer.cur_loc;
+ if (add_loc >= SN_TIMER_QUEUE_SIZE)
+ add_loc -= SN_TIMER_QUEUE_SIZE;
+ LIST_INSERT_HEAD(&la->sctpNatTimer.TimerQ[add_loc], assoc, timer_Q);
+ assoc->exp_loc = add_loc;
+}
+
+/** @ingroup Timer
+ * @brief Remove an association from timer queue
+ *
+ * This is an O(1) operation to remove the association pointer from its
+ * current position in the timer queue
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param assoc pointer to sctp association
+ */
+static void
+sctp_RmTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc)
+{
+ LIBALIAS_LOCK_ASSERT(la);
+ LIST_REMOVE(assoc, timer_Q);/* Note this is O(1) */
+}
+
+
+/** @ingroup Timer
+ * @brief Reset timer in timer queue
+ *
+ * Reset the actual timeout for the specified association. If it is earlier than
+ * the existing timeout, then remove and re-install the association into the
+ * queue
+ *
+ * @param la Pointer to the relevant libalias instance
+ * @param assoc pointer to sctp association
+ * @param newexp New expiration time
+ */
+static void
+sctp_ResetTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc, int newexp)
+{
+ if (newexp < assoc->exp) {
+ sctp_RmTimeOut(la, assoc);
+ assoc->exp = newexp;
+ sctp_AddTimeOut(la, assoc);
+ } else {
+ assoc->exp = newexp;
+ }
+}
+
+/** @ingroup Timer
+ * @brief Check timer Q against current time
+ *
+ * Loop through each entry in the timer queue since the last time we processed
+ * the timer queue until now (the current time). For each association in the
+ * event list, we remove it from that position in the timer queue and check if
+ * it has really expired. If so we:
+ * - Log the timer expiry
+ * - Remove the association from the NAT tables
+ * - Release the memory used by the association
+ *
+ * If the timer hasn't really expired we place the association into its new
+ * correct position in the timer queue.
+ *
+ * @param la Pointer to the relevant libalias instance
+ */
+void
+sctp_CheckTimers(struct libalias *la)
+{
+ struct sctp_nat_assoc *assoc;
+
+ LIBALIAS_LOCK_ASSERT(la);
+ while(la->timeStamp >= la->sctpNatTimer.loc_time) {
+ while (!LIST_EMPTY(&la->sctpNatTimer.TimerQ[la->sctpNatTimer.cur_loc])) {
+ assoc = LIST_FIRST(&la->sctpNatTimer.TimerQ[la->sctpNatTimer.cur_loc]);
+ //SLIST_REMOVE_HEAD(&la->sctpNatTimer.TimerQ[la->sctpNatTimer.cur_loc], timer_Q);
+ LIST_REMOVE(assoc, timer_Q);
+ if (la->timeStamp >= assoc->exp) { /* state expired */
+ SN_LOG(((assoc->state == SN_CL)?(SN_LOG_DEBUG):(SN_LOG_INFO)),
+ logsctperror("Timer Expired", assoc->g_vtag, assoc->state, SN_TO_NODIR));
+ RmSctpAssoc(la, assoc);
+ freeGlobalAddressList(assoc);
+ sn_free(assoc);
+ } else {/* state not expired, reschedule timer*/
+ sctp_AddTimeOut(la, assoc);
+ }
+ }
+ /* Goto next location in the timer queue*/
+ ++la->sctpNatTimer.loc_time;
+ if (++la->sctpNatTimer.cur_loc >= SN_TIMER_QUEUE_SIZE)
+ la->sctpNatTimer.cur_loc = 0;
+ }
+}
+
+/* ----------------------------------------------------------------------
+ * LOGGING CODE
+ * ----------------------------------------------------------------------
+ */
+/** @addtogroup Logging
+ *
+ * The logging functions provide logging of different items ranging from logging
+ * a simple message, through logging an association details to logging the
+ * current state of the NAT tables
+ */
+/** @ingroup Logging
+ * @brief Log sctp nat errors
+ *
+ * @param errormsg Error message to be logged
+ * @param vtag Current Vtag
+ * @param error Error number
+ * @param direction Direction of packet
+ */
+static void
+logsctperror(char* errormsg, uint32_t vtag, int error, int direction)
+{
+ char dir;
+ switch(direction) {
+ case SN_TO_LOCAL:
+ dir = 'L';
+ break;
+ case SN_TO_GLOBAL:
+ dir = 'G';
+ break;
+ default:
+ dir = '*';
+ break;
+ }
+ SctpAliasLog("->%c %s (vt=%u) %d\n", dir, errormsg, ntohl(vtag), error);
+}
+
+/** @ingroup Logging
+ * @brief Log what the parser parsed
+ *
+ * @param direction Direction of packet
+ * @param sm Pointer to sctp message information
+ */
+static void
+logsctpparse(int direction, struct sctp_nat_msg *sm)
+{
+ char *ploc, *pstate;
+ switch(direction) {
+ case SN_TO_LOCAL:
+ ploc = "TO_LOCAL -";
+ break;
+ case SN_TO_GLOBAL:
+ ploc = "TO_GLOBAL -";
+ break;
+ default:
+ ploc = "";
+ }
+ switch(sm->msg) {
+ case SN_SCTP_INIT:
+ pstate = "Init";
+ break;
+ case SN_SCTP_INITACK:
+ pstate = "InitAck";
+ break;
+ case SN_SCTP_ABORT:
+ pstate = "Abort";
+ break;
+ case SN_SCTP_SHUTACK:
+ pstate = "ShutAck";
+ break;
+ case SN_SCTP_SHUTCOMP:
+ pstate = "ShutComp";
+ break;
+ case SN_SCTP_ASCONF:
+ pstate = "Asconf";
+ break;
+ case SN_SCTP_ASCONFACK:
+ pstate = "AsconfAck";
+ break;
+ case SN_SCTP_OTHER:
+ pstate = "Other";
+ break;
+ default:
+ pstate = "***ERROR***";
+ break;
+ }
+ SctpAliasLog("Parsed: %s %s\n", ploc, pstate);
+}
+
+/** @ingroup Logging
+ * @brief Log an SCTP association's details
+ *
+ * @param assoc pointer to sctp association
+ * @param s Character that indicates the state of processing for this packet
+ */
+static void logsctpassoc(struct sctp_nat_assoc *assoc, char* s)
+{
+ struct sctp_GlobalAddress *G_Addr = NULL;
+ char *sp;
+ switch(assoc->state) {
+ case SN_ID:
+ sp = "ID ";
+ break;
+ case SN_INi:
+ sp = "INi ";
+ break;
+ case SN_INa:
+ sp = "INa ";
+ break;
+ case SN_UP:
+ sp = "UP ";
+ break;
+ case SN_CL:
+ sp = "CL ";
+ break;
+ case SN_RM:
+ sp = "RM ";
+ break;
+ default:
+ sp = "***ERROR***";
+ break;
+ }
+ SctpAliasLog("%sAssoc: %s exp=%u la=%s lv=%u lp=%u gv=%u gp=%u tbl=%d\n",
+ s, sp, assoc->exp, inet_ntoa(assoc->l_addr), ntohl(assoc->l_vtag),
+ ntohs(assoc->l_port), ntohl(assoc->g_vtag), ntohs(assoc->g_port),
+ assoc->TableRegister);
+ /* list global addresses */
+ LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) {
+ SctpAliasLog("\t\tga=%s\n",inet_ntoa(G_Addr->g_addr));
+ }
+}
+
+/** @ingroup Logging
+ * @brief Output Global table to log
+ *
+ * @param la Pointer to the relevant libalias instance
+ */
+static void logSctpGlobal(struct libalias *la)
+{
+ u_int i;
+ struct sctp_nat_assoc *assoc = NULL;
+
+ SctpAliasLog("G->\n");
+ for (i=0; i < la->sctpNatTableSize; i++) {
+ LIST_FOREACH(assoc, &la->sctpTableGlobal[i], list_G) {
+ logsctpassoc(assoc, " ");
+ }
+ }
+}
+
+/** @ingroup Logging
+ * @brief Output Local table to log
+ *
+ * @param la Pointer to the relevant libalias instance
+ */
+static void logSctpLocal(struct libalias *la)
+{
+ u_int i;
+ struct sctp_nat_assoc *assoc = NULL;
+
+ SctpAliasLog("L->\n");
+ for (i=0; i < la->sctpNatTableSize; i++) {
+ LIST_FOREACH(assoc, &la->sctpTableLocal[i], list_L) {
+ logsctpassoc(assoc, " ");
+ }
+ }
+}
+
+/** @ingroup Logging
+ * @brief Output timer queue to log
+ *
+ * @param la Pointer to the relevant libalias instance
+ */
+static void logTimerQ(struct libalias *la)
+{
+ static char buf[50];
+ u_int i;
+ struct sctp_nat_assoc *assoc = NULL;
+
+ SctpAliasLog("t->\n");
+ for (i=0; i < SN_TIMER_QUEUE_SIZE; i++) {
+ LIST_FOREACH(assoc, &la->sctpNatTimer.TimerQ[i], timer_Q) {
+ snprintf(buf, 50, " l=%u ",i);
+ //SctpAliasLog(la->logDesc," l=%d ",i);
+ logsctpassoc(assoc, buf);
+ }
+ }
+}
+
+/** @ingroup Logging
+ * @brief Sctp NAT logging function
+ *
+ * This function is based on a similar function in alias_db.c
+ *
+ * @param str/stream logging descriptor
+ * @param format printf type string
+ */
+#ifdef _KERNEL
+static void
+SctpAliasLog(const char *format, ...)
+{
+ char buffer[LIBALIAS_BUF_SIZE];
+ va_list ap;
+ va_start(ap, format);
+ vsnprintf(buffer, LIBALIAS_BUF_SIZE, format, ap);
+ va_end(ap);
+ log(LOG_SECURITY | LOG_INFO,
+ "alias_sctp: %s", buffer);
+}
+#else
+static void
+SctpAliasLog(FILE *stream, const char *format, ...)
+{
+ va_list ap;
+
+ va_start(ap, format);
+ vfprintf(stream, format, ap);
+ va_end(ap);
+ fflush(stream);
+}
+#endif
diff --git a/freebsd/sys/netinet/libalias/alias_sctp.h b/freebsd/sys/netinet/libalias/alias_sctp.h
new file mode 100644
index 00000000..9ea21959
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_sctp.h
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 2008
+ * Swinburne University of Technology, Melbourne, Australia.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Alias_sctp forms part of the libalias kernel module to handle
+ * Network Address Translation (NAT) for the SCTP protocol.
+ *
+ * This software was developed by David A. Hayes
+ * with leadership and advice from Jason But
+ *
+ * The design is outlined in CAIA technical report number 080618A
+ * (D. Hayes and J. But, "Alias_sctp Version 0.1: SCTP NAT implementation in IPFW")
+ *
+ * Development is part of the CAIA SONATA project,
+ * proposed by Jason But and Grenville Armitage:
+ * http://caia.swin.edu.au/urp/sonata/
+ *
+ *
+ * This project has been made possible in part by a grant from
+ * the Cisco University Research Program Fund at Community
+ * Foundation Silicon Valley.
+ *
+ */
+
+/* $FreeBSD$ */
+
+#ifndef _ALIAS_SCTP_HH_
+#define _ALIAS_SCTP_HH_
+
+#include <freebsd/sys/param.h>
+#ifdef _KERNEL
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/module.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/uio.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/syslog.h>
+#endif // #ifdef _KERNEL
+#include <freebsd/sys/types.h>
+
+#include <freebsd/sys/queue.h>
+#include <freebsd/sys/types.h>
+#include <freebsd/sys/time.h>
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+
+/**
+ * These are defined in sctp_os_bsd.h, but it can't be included due to its local file
+ * inclusion, so I'm defining them here.
+ *
+ */
+#include <freebsd/machine/cpufunc.h>
+#include <freebsd/machine/cpu.h>
+/* The packed define for 64 bit platforms */
+#ifndef SCTP_PACKED
+#define SCTP_PACKED __attribute__((packed))
+#endif //#ifndef SCTP_PACKED
+#ifndef SCTP_UNUSED
+#define SCTP_UNUSED __attribute__((unused))
+#endif //#ifndef SCTP_UNUSED
+
+
+#include <freebsd/netinet/sctp.h>
+//#include <freebsd/netinet/sctp_os_bsd.h> --might be needed later for mbuf stuff
+#include <freebsd/netinet/sctp_header.h>
+
+#ifndef _KERNEL
+#include <freebsd/stdlib.h>
+#include <freebsd/stdio.h>
+#include <freebsd/curses.h>
+#endif //#ifdef _KERNEL
+
+
+#define LINK_SCTP IPPROTO_SCTP
+
+
+#define SN_TO_LOCAL 0 /**< packet traveling from global to local */
+#define SN_TO_GLOBAL 1 /**< packet traveling from local to global */
+#define SN_TO_NODIR 99 /**< used where direction is not important */
+
+#define SN_NAT_PKT 0x0000 /**< Network Address Translate packet */
+#define SN_DROP_PKT 0x0001 /**< drop packet (don't forward it) */
+#define SN_PROCESSING_ERROR 0x0003 /**< Packet processing error */
+#define SN_REPLY_ABORT 0x0010 /**< Reply with ABORT to sender (don't forward it) */
+#define SN_SEND_ABORT 0x0020 /**< Send ABORT to destination */
+#define SN_TX_ABORT 0x0030 /**< mask for transmitting abort */
+#define SN_REFLECT_ERROR 0x0100 /**< Reply with ERROR to sender on OOTB packet Tbit set */
+#define SN_REPLY_ERROR 0x0200 /**< Reply with ERROR to sender on ASCONF clash */
+#define SN_TX_ERROR 0x0300 /**< mask for transmitting error */
+
+
+#define PKT_ALIAS_RESPOND 0x1000 /**< Signal to libalias that there is a response packet to send */
+/*
+ * Data structures
+ */
+
+/**
+ * @brief sctp association information
+ *
+ * Structure that contains information about a particular sctp association
+ * currently under Network Address Translation.
+ * Information is stored in network byte order (as is libalias)***
+ */
+struct sctp_nat_assoc {
+ uint32_t l_vtag; /**< local side verification tag */
+ uint16_t l_port; /**< local side port number */
+ uint32_t g_vtag; /**< global side verification tag */
+ uint16_t g_port; /**< global side port number */
+ struct in_addr l_addr; /**< local ip address */
+ struct in_addr a_addr; /**< alias ip address */
+ int state; /**< current state of NAT association */
+ int TableRegister; /**< stores which look up tables association is registered in */
+ int exp; /**< timer expiration in seconds from uptime */
+ int exp_loc; /**< current location in timer_Q */
+ int num_Gaddr; /**< number of global IP addresses in the list */
+ LIST_HEAD(sctpGlobalAddresshead,sctp_GlobalAddress) Gaddr; /**< List of global addresses */
+ LIST_ENTRY (sctp_nat_assoc) list_L; /**< Linked list of pointers for Local table*/
+ LIST_ENTRY (sctp_nat_assoc) list_G; /**< Linked list of pointers for Global table */
+ LIST_ENTRY (sctp_nat_assoc) timer_Q; /**< Linked list of pointers for timer Q */
+//Using libalias locking
+};
+
+struct sctp_GlobalAddress {
+ struct in_addr g_addr;
+ LIST_ENTRY (sctp_GlobalAddress) list_Gaddr; /**< Linked list of pointers for Global table */
+};
+
+/**
+ * @brief SCTP chunk of interest
+ *
+ * The only chunks whose contents are of any interest are the INIT and ASCONF_AddIP
+ */
+union sctpChunkOfInt {
+ struct sctp_init *Init; /**< Pointer to Init Chunk */
+ struct sctp_init_ack *InitAck; /**< Pointer to Init Chunk */
+ struct sctp_paramhdr *Asconf; /**< Pointer to ASCONF chunk */
+};
+
+
+/**
+ * @brief SCTP message
+ *
+ * Structure containing the relevant information from the SCTP message
+ */
+struct sctp_nat_msg {
+ uint16_t msg; /**< one of the key messages defined above */
+#ifndef __rtems__
+#ifdef INET6
+ // struct ip6_hdr *ip_hdr; /**< pointer to ip packet header */ /*no inet6 support yet*/
+#else
+ struct ip *ip_hdr; /**< pointer to ip packet header */
+#endif //#ifdef INET6
+#else //__rtems__
+ struct ip *ip_hdr; /**< pointer to ip packet header */
+#endif //__rtems__
+ struct sctphdr *sctp_hdr; /**< pointer to sctp common header */
+ union sctpChunkOfInt sctpchnk; /**< union of pointers to the chunk of interest */
+ int chunk_length; /**< length of chunk of interest */
+};
+
+
+/**
+ * @brief sctp nat timer queue structure
+ *
+ */
+
+struct sctp_nat_timer {
+ int loc_time; /**< time in seconds for the current location in the queue */
+ int cur_loc; /**< index of the current location in the circular queue */
+ LIST_HEAD(sctpTimerQ,sctp_nat_assoc) *TimerQ; /**< List of associations at this position in the timer Q */
+};
+
+
+
+#endif //#ifndef _ALIAS_SCTP_HH_
diff --git a/freebsd/sys/netinet/libalias/alias_skinny.c b/freebsd/sys/netinet/libalias/alias_skinny.c
new file mode 100644
index 00000000..4d311efe
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_skinny.c
@@ -0,0 +1,449 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * alias_skinny.c
+ *
+ * Copyright (c) 2002, 2003 MarcusCom, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author: Joe Marcus Clarke <marcus@FreeBSD.org>
+ *
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/module.h>
+#else
+#include <freebsd/errno.h>
+#include <freebsd/stdio.h>
+#include <freebsd/unistd.h>
+#endif
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/tcp.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/libalias/alias_mod.h>
+#else
+#include <freebsd/local/alias_local.h>
+#include <freebsd/local/alias_mod.h>
+#endif
+
+static void
+AliasHandleSkinny(struct libalias *, struct ip *, struct alias_link *);
+
+static int
+fingerprint(struct libalias *la, struct alias_data *ah)
+{
+
+ if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL)
+ return (-1);
+ if (la->skinnyPort != 0 && (ntohs(*ah->sport) == la->skinnyPort ||
+ ntohs(*ah->dport) == la->skinnyPort))
+ return (0);
+ return (-1);
+}
+
+static int
+protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ AliasHandleSkinny(la, pip, ah->lnk);
+ return (0);
+}
+
+struct proto_handler handlers[] = {
+ {
+ .pri = 110,
+ .dir = IN|OUT,
+ .proto = TCP,
+ .fingerprint = &fingerprint,
+ .protohandler = &protohandler
+ },
+ { EOH }
+};
+
+static int
+mod_handler(module_t mod, int type, void *data)
+{
+ int error;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = 0;
+ LibAliasAttachHandlers(handlers);
+ break;
+ case MOD_UNLOAD:
+ error = 0;
+ LibAliasDetachHandlers(handlers);
+ break;
+ default:
+ error = EINVAL;
+ }
+ return (error);
+}
+
+#ifdef _KERNEL
+static
+#endif
+moduledata_t alias_mod = {
+ "alias_skinny", mod_handler, NULL
+};
+
+#ifdef _KERNEL
+DECLARE_MODULE(alias_skinny, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
+MODULE_VERSION(alias_skinny, 1);
+MODULE_DEPEND(alias_skinny, libalias, 1, 1, 1);
+#endif
+
+/*
+ * alias_skinny.c handles the translation for the Cisco Skinny Station
+ * protocol. Skinny typically uses TCP port 2000 to set up calls between
+ * a Cisco Call Manager and a Cisco IP phone. When a phone comes on line,
+ * it first needs to register with the Call Manager. To do this it sends
+ * a registration message. This message contains the IP address of the
+ * IP phone. This message must then be translated to reflect our global
+ * IP address. Along with the registration message (and usually in the
+ * same packet), the phone sends an IP port message. This message indicates
+ * the TCP port over which it will communicate.
+ *
+ * When a call is placed from the phone, the Call Manager will send an
+ * Open Receive Channel message to the phone to let the caller know someone
+ * has answered. The phone then sends back an Open Receive Channel
+ * Acknowledgement. In this packet, the phone sends its IP address again,
+ * and the UDP port over which the voice traffic should flow. These values
+ * need translation. Right after the Open Receive Channel Acknowledgement,
+ * the Call Manager sends a Start Media Transmission message indicating the
+ * call is connected. This message contains the IP address and UDP port
+ * number of the remote (called) party. Once this message is translated, the
+ * call can commence. The called party sends the first UDP packet to the
+ * calling phone at the pre-arranged UDP port in the Open Receive Channel
+ * Acknowledgement.
+ *
+ * Skinny is a Cisco-proprietary protocol and is a trademark of Cisco Systems,
+ * Inc. All rights reserved.
+*/
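+
+/*
+ * A rough sketch of the framing AliasHandleSkinny() below walks, assuming (as
+ * the pointer arithmetic at the bottom of that function suggests) that the
+ * "len" field counts the msgId word plus the payload:
+ *
+ *	| len (4) | reserved (4) | msgId (4) | payload (len - 4) | len (4) | ...
+ *
+ * A single TCP segment may therefore carry, for example, a RegisterMessage
+ * immediately followed by an IpPortMessage, and the parser steps
+ * len + sizeof(len) + sizeof(reserved) bytes from one message to the next.
+ */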
+
+/* #define LIBALIAS_DEBUG 1 */
+
+/* Message types that need translating */
+#define REG_MSG 0x00000001
+#define IP_PORT_MSG 0x00000002
+#define OPNRCVCH_ACK 0x00000022
+#define START_MEDIATX 0x0000008a
+
+struct skinny_header {
+ u_int32_t len;
+ u_int32_t reserved;
+ u_int32_t msgId;
+};
+
+struct RegisterMessage {
+ u_int32_t msgId;
+ char devName [16];
+ u_int32_t uid;
+ u_int32_t instance;
+ u_int32_t ipAddr;
+ u_char devType;
+ u_int32_t maxStreams;
+};
+
+struct IpPortMessage {
+ u_int32_t msgId;
+ u_int32_t stationIpPort; /* Note: Skinny uses 32-bit port
+ * numbers */
+};
+
+struct OpenReceiveChannelAck {
+ u_int32_t msgId;
+ u_int32_t status;
+ u_int32_t ipAddr;
+ u_int32_t port;
+ u_int32_t passThruPartyID;
+};
+
+struct StartMediaTransmission {
+ u_int32_t msgId;
+ u_int32_t conferenceID;
+ u_int32_t passThruPartyID;
+ u_int32_t remoteIpAddr;
+ u_int32_t remotePort;
+ u_int32_t MSPacket;
+ u_int32_t payloadCap;
+ u_int32_t precedence;
+ u_int32_t silenceSuppression;
+ u_short maxFramesPerPacket;
+ u_int32_t G723BitRate;
+};
+
+typedef enum {
+ ClientToServer = 0,
+ ServerToClient = 1
+} ConvDirection;
+
+
+static int
+alias_skinny_reg_msg(struct RegisterMessage *reg_msg, struct ip *pip,
+ struct tcphdr *tc, struct alias_link *lnk,
+ ConvDirection direction)
+{
+ (void)direction;
+
+ reg_msg->ipAddr = (u_int32_t) GetAliasAddress(lnk).s_addr;
+
+ tc->th_sum = 0;
+#ifdef _KERNEL
+ tc->th_x2 = 1;
+#else
+ tc->th_sum = TcpChecksum(pip);
+#endif
+
+ return (0);
+}
+
+static int
+alias_skinny_startmedia(struct StartMediaTransmission *start_media,
+ struct ip *pip, struct tcphdr *tc,
+ struct alias_link *lnk, u_int32_t localIpAddr,
+ ConvDirection direction)
+{
+ struct in_addr dst, src;
+
+ (void)pip;
+ (void)tc;
+ (void)lnk;
+ (void)direction;
+
+ dst.s_addr = start_media->remoteIpAddr;
+ src.s_addr = localIpAddr;
+
+ /*
+ * XXX I should probably handle in bound global translations as
+ * well.
+ */
+
+ return (0);
+}
+
+static int
+alias_skinny_port_msg(struct IpPortMessage *port_msg, struct ip *pip,
+ struct tcphdr *tc, struct alias_link *lnk,
+ ConvDirection direction)
+{
+ (void)direction;
+
+ port_msg->stationIpPort = (u_int32_t) ntohs(GetAliasPort(lnk));
+
+ tc->th_sum = 0;
+#ifdef _KERNEL
+ tc->th_x2 = 1;
+#else
+ tc->th_sum = TcpChecksum(pip);
+#endif
+ return (0);
+}
+
+static int
+alias_skinny_opnrcvch_ack(struct libalias *la, struct OpenReceiveChannelAck *opnrcvch_ack,
+ struct ip *pip, struct tcphdr *tc,
+ struct alias_link *lnk, u_int32_t * localIpAddr,
+ ConvDirection direction)
+{
+ struct in_addr null_addr;
+ struct alias_link *opnrcv_lnk;
+ u_int32_t localPort;
+
+ (void)lnk;
+ (void)direction;
+
+ *localIpAddr = (u_int32_t) opnrcvch_ack->ipAddr;
+ localPort = opnrcvch_ack->port;
+
+ null_addr.s_addr = INADDR_ANY;
+ opnrcv_lnk = FindUdpTcpOut(la, pip->ip_src, null_addr,
+ htons((u_short) opnrcvch_ack->port), 0,
+ IPPROTO_UDP, 1);
+ opnrcvch_ack->ipAddr = (u_int32_t) GetAliasAddress(opnrcv_lnk).s_addr;
+ opnrcvch_ack->port = (u_int32_t) ntohs(GetAliasPort(opnrcv_lnk));
+
+ tc->th_sum = 0;
+#ifdef _KERNEL
+ tc->th_x2 = 1;
+#else
+ tc->th_sum = TcpChecksum(pip);
+#endif
+ return (0);
+}
+
+static void
+AliasHandleSkinny(struct libalias *la, struct ip *pip, struct alias_link *lnk)
+{
+ size_t hlen, tlen, dlen;
+ struct tcphdr *tc;
+ u_int32_t msgId, t, len, lip;
+ struct skinny_header *sd;
+ size_t orig_len, skinny_hdr_len = sizeof(struct skinny_header);
+ ConvDirection direction;
+
+ lip = -1;
+ tc = (struct tcphdr *)ip_next(pip);
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+ sd = (struct skinny_header *)tcp_next(tc);
+
+ /*
+ * XXX This direction is reserved for future use. I still need to
+ * handle the scenario where the call manager is on the inside, and
+ * the calling phone is on the global outside.
+ */
+ if (ntohs(tc->th_dport) == la->skinnyPort) {
+ direction = ClientToServer;
+ } else if (ntohs(tc->th_sport) == la->skinnyPort) {
+ direction = ServerToClient;
+ } else {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/Skinny: Invalid port number, not a Skinny packet\n");
+#endif
+ return;
+ }
+
+ orig_len = dlen;
+ /*
+ * Skinny packets can contain many messages. We need to loop
+ * through the packet using len to determine message boundaries.
+ * This comes into play big time with port messages being in the
+ * same packet as register messages. Also, open receive channel
+ * acks are usually buried in a packet some 400 bytes long.
+ */
+ while (dlen >= skinny_hdr_len) {
+ len = (sd->len);
+ msgId = (sd->msgId);
+ t = len;
+
+ if (t > orig_len || t > dlen) {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/Skinny: Not a skinny packet, invalid length \n");
+#endif
+ return;
+ }
+ switch (msgId) {
+ case REG_MSG: {
+ struct RegisterMessage *reg_mesg;
+
+ if (len < (int)sizeof(struct RegisterMessage)) {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/Skinny: Not a skinny packet, bad registration message\n");
+#endif
+ return;
+ }
+ reg_mesg = (struct RegisterMessage *)&sd->msgId;
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/Skinny: Received a register message");
+#endif
+ alias_skinny_reg_msg(reg_mesg, pip, tc, lnk, direction);
+ break;
+ }
+ case IP_PORT_MSG: {
+ struct IpPortMessage *port_mesg;
+
+ if (len < (int)sizeof(struct IpPortMessage)) {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/Skinny: Not a skinny packet, port message\n");
+#endif
+ return;
+ }
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/Skinny: Received ipport message\n");
+#endif
+ port_mesg = (struct IpPortMessage *)&sd->msgId;
+ alias_skinny_port_msg(port_mesg, pip, tc, lnk, direction);
+ break;
+ }
+ case OPNRCVCH_ACK: {
+ struct OpenReceiveChannelAck *opnrcvchn_ack;
+
+ if (len < (int)sizeof(struct OpenReceiveChannelAck)) {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/Skinny: Not a skinny packet, packet,OpnRcvChnAckMsg\n");
+#endif
+ return;
+ }
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/Skinny: Received open rcv channel msg\n");
+#endif
+ opnrcvchn_ack = (struct OpenReceiveChannelAck *)&sd->msgId;
+ alias_skinny_opnrcvch_ack(la, opnrcvchn_ack, pip, tc, lnk, &lip, direction);
+ break;
+ }
+ case START_MEDIATX: {
+ struct StartMediaTransmission *startmedia_tx;
+
+ if (len < (int)sizeof(struct StartMediaTransmission)) {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/Skinny: Not a skinny packet,StartMediaTx Message\n");
+#endif
+ return;
+ }
+ if (lip == -1) {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/Skinny: received a"
+ " packet,StartMediaTx Message before"
+ " packet,OpnRcvChnAckMsg\n"
+#endif
+ return;
+ }
+
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/Skinny: Received start media trans msg\n");
+#endif
+ startmedia_tx = (struct StartMediaTransmission *)&sd->msgId;
+ alias_skinny_startmedia(startmedia_tx, pip, tc, lnk, lip, direction);
+ break;
+ }
+ default:
+ break;
+ }
+ /* Place the pointer at the next message in the packet. */
+ dlen -= len + (skinny_hdr_len - sizeof(msgId));
+ sd = (struct skinny_header *)(((char *)&sd->msgId) + len);
+ }
+}
diff --git a/freebsd/sys/netinet/libalias/alias_smedia.c b/freebsd/sys/netinet/libalias/alias_smedia.c
new file mode 100644
index 00000000..3d558a94
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_smedia.c
@@ -0,0 +1,551 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*
+ * alias_smedia.c
+ *
+ * Copyright (c) 2000 Whistle Communications, Inc.
+ * All rights reserved.
+ *
+ * Subject to the following obligations and disclaimer of warranty, use and
+ * redistribution of this software, in source or object code forms, with or
+ * without modifications are expressly permitted by Whistle Communications;
+ * provided, however, that:
+ * 1. Any and all reproductions of the source or object code must include the
+ * copyright notice above and the following disclaimer of warranties; and
+ * 2. No rights are granted, in any manner or form, to use Whistle
+ * Communications, Inc. trademarks, including the mark "WHISTLE
+ * COMMUNICATIONS" on advertising, endorsements, or otherwise except as
+ * such appears in the above copyright notice or in the software.
+ *
+ * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
+ * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
+ * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
+ * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
+ * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
+ * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
+ * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
+ * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
+ * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
+ * OF SUCH DAMAGE.
+ *
+ * Copyright (c) 2000 Junichi SATOH <junichi@astec.co.jp>
+ * <junichi@junichi.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors: Erik Salander <erik@whistle.com>
+ * Junichi SATOH <junichi@astec.co.jp>
+ * <junichi@junichi.org>
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ Alias_smedia.c is meant to contain the aliasing code for streaming media
+ protocols. It performs special processing for RTSP sessions under TCP.
+ Specifically, when a SETUP request is sent by a client, or a 200 reply
+ is sent by a server, it is intercepted and modified. The address is
+ changed to the gateway machine and an aliasing port is used.
+
+ More specifically, the "client_port" configuration parameter is
+ parsed for SETUP requests. The "server_port" configuration parameter is
+ parsed for 200 replies emanating from a server. This is intended to handle
+ the unicast case.
+
+ RTSP also allows a redirection of a stream to another client by using the
+ "destination" configuration parameter. The destination config parm would
+ indicate a different IP address. This function is NOT supported by the
+ RTSP translation code below.
+
+ The RTSP multicast functions without any address translation intervention.
+
+ For this routine to work, the SETUP/200 must fit entirely
+ into a single TCP packet. This is typically the case, but exceptions
+ can easily be envisioned under the actual specifications.
+
+ Probably the most troubling aspect of the approach taken here is
+ that the new SETUP/200 will typically be a different length, and
+ this causes a certain amount of bookkeeping to keep track of the
+ changes of sequence and acknowledgment numbers, since the client
+ machine is totally unaware of the modification to the TCP stream.
+
+ Initial version: May, 2000 (eds)
+*/
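+
+/*
+ * A worked example of the rewrite performed below (the port numbers are
+ * purely illustrative): an outgoing SETUP carrying the header line
+ *
+ *	Transport: RTP/AVP;unicast;client_port=5000-5001
+ *
+ * has its client_port range replaced with an aliased pair obtained via
+ * FindNewPortGroup()/FindRtspOut(), e.g.
+ *
+ *	Transport: RTP/AVP;unicast;client_port=8000-8001
+ *
+ * and, since the rewritten line may change the segment length, the sequence
+ * number bookkeeping mentioned above is done with SetAckModified()/AddSeq().
+ */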
+
+#ifdef _KERNEL
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/module.h>
+#else
+#include <freebsd/errno.h>
+#include <freebsd/sys/types.h>
+#include <freebsd/stdio.h>
+#include <freebsd/string.h>
+#endif
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/tcp.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias.h>
+#include <freebsd/netinet/libalias/alias_local.h>
+#include <freebsd/netinet/libalias/alias_mod.h>
+#else
+#include <freebsd/local/alias_local.h>
+#include <freebsd/local/alias_mod.h>
+#endif
+
+#define RTSP_CONTROL_PORT_NUMBER_1 554
+#define RTSP_CONTROL_PORT_NUMBER_2 7070
+#define TFTP_PORT_NUMBER 69
+
+static void
+AliasHandleRtspOut(struct libalias *, struct ip *, struct alias_link *,
+ int maxpacketsize);
+static int
+fingerprint(struct libalias *la, struct alias_data *ah)
+{
+
+ if (ah->dport != NULL && ah->aport != NULL && ah->sport != NULL &&
+ ntohs(*ah->dport) == TFTP_PORT_NUMBER)
+ return (0);
+ if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
+ ah->maxpktsize == 0)
+ return (-1);
+ if (ntohs(*ah->dport) == RTSP_CONTROL_PORT_NUMBER_1
+ || ntohs(*ah->sport) == RTSP_CONTROL_PORT_NUMBER_1
+ || ntohs(*ah->dport) == RTSP_CONTROL_PORT_NUMBER_2
+ || ntohs(*ah->sport) == RTSP_CONTROL_PORT_NUMBER_2)
+ return (0);
+ return (-1);
+}
+
+static int
+protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+ if (ntohs(*ah->dport) == TFTP_PORT_NUMBER)
+ FindRtspOut(la, pip->ip_src, pip->ip_dst,
+ *ah->sport, *ah->aport, IPPROTO_UDP);
+ else AliasHandleRtspOut(la, pip, ah->lnk, ah->maxpktsize);
+ return (0);
+}
+
+struct proto_handler handlers[] = {
+ {
+ .pri = 100,
+ .dir = OUT,
+ .proto = TCP|UDP,
+ .fingerprint = &fingerprint,
+ .protohandler = &protohandler
+ },
+ { EOH }
+};
+
+static int
+mod_handler(module_t mod, int type, void *data)
+{
+ int error;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = 0;
+ LibAliasAttachHandlers(handlers);
+ break;
+ case MOD_UNLOAD:
+ error = 0;
+ LibAliasDetachHandlers(handlers);
+ break;
+ default:
+ error = EINVAL;
+ }
+ return (error);
+}
+
+#ifdef _KERNEL
+static
+#endif
+moduledata_t alias_mod = {
+ "alias_smedia", mod_handler, NULL
+};
+
+#ifdef _KERNEL
+DECLARE_MODULE(alias_smedia, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
+MODULE_VERSION(alias_smedia, 1);
+MODULE_DEPEND(alias_smedia, libalias, 1, 1, 1);
+#endif
+
+#define RTSP_CONTROL_PORT_NUMBER_1 554
+#define RTSP_CONTROL_PORT_NUMBER_2 7070
+#define RTSP_PORT_GROUP 2
+
+#define ISDIGIT(a) (((a) >= '0') && ((a) <= '9'))
+
+static int
+search_string(char *data, int dlen, const char *search_str)
+{
+ int i, j, k;
+ int search_str_len;
+
+ search_str_len = strlen(search_str);
+ for (i = 0; i < dlen - search_str_len; i++) {
+ for (j = i, k = 0; j < dlen - search_str_len; j++, k++) {
+ if (data[j] != search_str[k] &&
+ data[j] != search_str[k] - ('a' - 'A')) {
+ break;
+ }
+ if (k == search_str_len - 1) {
+ return (j + 1);
+ }
+ }
+ }
+ return (-1);
+}
+
+static int
+alias_rtsp_out(struct libalias *la, struct ip *pip,
+ struct alias_link *lnk,
+ char *data,
+ const char *port_str)
+{
+ int hlen, tlen, dlen;
+ struct tcphdr *tc;
+ int i, j, pos, state, port_dlen, new_dlen, delta;
+ u_short p[2], new_len;
+ u_short sport, eport, base_port;
+ u_short salias = 0, ealias = 0, base_alias = 0;
+ const char *transport_str = "transport:";
+ char newdata[2048], *port_data, *port_newdata, stemp[80];
+ int links_created = 0, pkt_updated = 0;
+ struct alias_link *rtsp_lnk = NULL;
+ struct in_addr null_addr;
+
+ /* Calculate data length of TCP packet */
+ tc = (struct tcphdr *)ip_next(pip);
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+ /* Find keyword, "Transport: " */
+ pos = search_string(data, dlen, transport_str);
+ if (pos < 0) {
+ return (-1);
+ }
+ port_data = data + pos;
+ port_dlen = dlen - pos;
+
+ memcpy(newdata, data, pos);
+ port_newdata = newdata + pos;
+
+ while (port_dlen > (int)strlen(port_str)) {
+ /* Find keyword, appropriate port string */
+ pos = search_string(port_data, port_dlen, port_str);
+ if (pos < 0) {
+ break;
+ }
+ memcpy(port_newdata, port_data, pos + 1);
+ port_newdata += (pos + 1);
+
+ p[0] = p[1] = 0;
+ sport = eport = 0;
+ state = 0;
+ for (i = pos; i < port_dlen; i++) {
+ switch (state) {
+ case 0:
+ if (port_data[i] == '=') {
+ state++;
+ }
+ break;
+ case 1:
+ if (ISDIGIT(port_data[i])) {
+ p[0] = p[0] * 10 + port_data[i] - '0';
+ } else {
+ if (port_data[i] == ';') {
+ state = 3;
+ }
+ if (port_data[i] == '-') {
+ state++;
+ }
+ }
+ break;
+ case 2:
+ if (ISDIGIT(port_data[i])) {
+ p[1] = p[1] * 10 + port_data[i] - '0';
+ } else {
+ state++;
+ }
+ break;
+ case 3:
+ base_port = p[0];
+ sport = htons(p[0]);
+ eport = htons(p[1]);
+
+ if (!links_created) {
+
+ links_created = 1;
+ /*
+ * Find an even numbered port
+ * number base that satisfies the
+ * contiguous number of ports we
+ * need
+ */
+ null_addr.s_addr = 0;
+ if (0 == (salias = FindNewPortGroup(la, null_addr,
+ FindAliasAddress(la, pip->ip_src),
+ sport, 0,
+ RTSP_PORT_GROUP,
+ IPPROTO_UDP, 1))) {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/RTSP: Cannot find contiguous RTSP data ports\n");
+#endif
+ } else {
+
+ base_alias = ntohs(salias);
+ for (j = 0; j < RTSP_PORT_GROUP; j++) {
+ /*
+ * Establish link
+ * to port found in
+ * RTSP packet
+ */
+ rtsp_lnk = FindRtspOut(la, GetOriginalAddress(lnk), null_addr,
+ htons(base_port + j), htons(base_alias + j),
+ IPPROTO_UDP);
+ if (rtsp_lnk != NULL) {
+#ifndef NO_FW_PUNCH
+ /*
+ * Punch
+ * hole in
+ * firewall
+ */
+ PunchFWHole(rtsp_lnk);
+#endif
+ } else {
+#ifdef LIBALIAS_DEBUG
+ fprintf(stderr,
+ "PacketAlias/RTSP: Cannot allocate RTSP data ports\n");
+#endif
+ break;
+ }
+ }
+ }
+ ealias = htons(base_alias + (RTSP_PORT_GROUP - 1));
+ }
+ if (salias && rtsp_lnk) {
+
+ pkt_updated = 1;
+
+ /* Copy into IP packet */
+ sprintf(stemp, "%d", ntohs(salias));
+ memcpy(port_newdata, stemp, strlen(stemp));
+ port_newdata += strlen(stemp);
+
+ if (eport != 0) {
+ *port_newdata = '-';
+ port_newdata++;
+
+ /* Copy into IP packet */
+ sprintf(stemp, "%d", ntohs(ealias));
+ memcpy(port_newdata, stemp, strlen(stemp));
+ port_newdata += strlen(stemp);
+ }
+ *port_newdata = ';';
+ port_newdata++;
+ }
+ state++;
+ break;
+ }
+ if (state > 3) {
+ break;
+ }
+ }
+ port_data += i;
+ port_dlen -= i;
+ }
+
+ if (!pkt_updated)
+ return (-1);
+
+ memcpy(port_newdata, port_data, port_dlen);
+ port_newdata += port_dlen;
+ *port_newdata = '\0';
+
+ /* Create new packet */
+ new_dlen = port_newdata - newdata;
+ memcpy(data, newdata, new_dlen);
+
+ SetAckModified(lnk);
+ tc = (struct tcphdr *)ip_next(pip);
+ delta = GetDeltaSeqOut(tc->th_seq, lnk);
+ AddSeq(lnk, delta + new_dlen - dlen, pip->ip_hl, pip->ip_len,
+ tc->th_seq, tc->th_off);
+
+ new_len = htons(hlen + new_dlen);
+ DifferentialChecksum(&pip->ip_sum,
+ &new_len,
+ &pip->ip_len,
+ 1);
+ pip->ip_len = new_len;
+
+ tc->th_sum = 0;
+#ifdef _KERNEL
+ tc->th_x2 = 1;
+#else
+ tc->th_sum = TcpChecksum(pip);
+#endif
+ return (0);
+}
+
+/* Support the protocol used by early versions of RealPlayer */
+
+static int
+alias_pna_out(struct libalias *la, struct ip *pip,
+ struct alias_link *lnk,
+ char *data,
+ int dlen)
+{
+ struct alias_link *pna_links;
+ u_short msg_id, msg_len;
+ char *work;
+ u_short alias_port, port;
+ struct tcphdr *tc;
+
+ work = data;
+ work += 5;
+ while (work + 4 < data + dlen) {
+ memcpy(&msg_id, work, 2);
+ work += 2;
+ memcpy(&msg_len, work, 2);
+ work += 2;
+ if (ntohs(msg_id) == 0) {
+ /* end of options */
+ return (0);
+ }
+ if ((ntohs(msg_id) == 1) || (ntohs(msg_id) == 7)) {
+ memcpy(&port, work, 2);
+ pna_links = FindUdpTcpOut(la, pip->ip_src, GetDestAddress(lnk),
+ port, 0, IPPROTO_UDP, 1);
+ if (pna_links != NULL) {
+#ifndef NO_FW_PUNCH
+ /* Punch hole in firewall */
+ PunchFWHole(pna_links);
+#endif
+ tc = (struct tcphdr *)ip_next(pip);
+ alias_port = GetAliasPort(pna_links);
+ memcpy(work, &alias_port, 2);
+
+ /* Compute TCP checksum for revised packet */
+ tc->th_sum = 0;
+#ifdef _KERNEL
+ tc->th_x2 = 1;
+#else
+ tc->th_sum = TcpChecksum(pip);
+#endif
+ }
+ }
+ work += ntohs(msg_len);
+ }
+
+ return (0);
+}
+
+static void
+AliasHandleRtspOut(struct libalias *la, struct ip *pip, struct alias_link *lnk, int maxpacketsize)
+{
+ int hlen, tlen, dlen;
+ struct tcphdr *tc;
+ char *data;
+ const char *setup = "SETUP", *pna = "PNA", *str200 = "200";
+ const char *okstr = "OK", *client_port_str = "client_port";
+ const char *server_port_str = "server_port";
+ int i, parseOk;
+
+ (void)maxpacketsize;
+
+ tc = (struct tcphdr *)ip_next(pip);
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+ data = (char *)pip;
+ data += hlen;
+
+ /* When aliasing a client, check for the SETUP request */
+ if ((ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_1) ||
+ (ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_2)) {
+
+ if (dlen >= (int)strlen(setup)) {
+ if (memcmp(data, setup, strlen(setup)) == 0) {
+ alias_rtsp_out(la, pip, lnk, data, client_port_str);
+ return;
+ }
+ }
+ if (dlen >= (int)strlen(pna)) {
+ if (memcmp(data, pna, strlen(pna)) == 0) {
+ alias_pna_out(la, pip, lnk, data, dlen);
+ }
+ }
+ } else {
+
+ /*
+ * When aliasing a server, check for the 200 reply
+ * Accommodate a varying number of blanks between 200 & OK
+ */
+
+ if (dlen >= (int)strlen(str200)) {
+
+ for (parseOk = 0, i = 0;
+ i <= dlen - (int)strlen(str200);
+ i++) {
+ if (memcmp(&data[i], str200, strlen(str200)) == 0) {
+ parseOk = 1;
+ break;
+ }
+ }
+ if (parseOk) {
+
+ i += strlen(str200); /* skip string found */
+ while (data[i] == ' ') /* skip blank(s) */
+ i++;
+
+ if ((dlen - i) >= (int)strlen(okstr)) {
+
+ if (memcmp(&data[i], okstr, strlen(okstr)) == 0)
+ alias_rtsp_out(la, pip, lnk, data, server_port_str);
+
+ }
+ }
+ }
+ }
+}
diff --git a/freebsd/sys/netinet/libalias/alias_util.c b/freebsd/sys/netinet/libalias/alias_util.c
new file mode 100644
index 00000000..1e0c95ae
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_util.c
@@ -0,0 +1,178 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+
+/*
+ Alias_util.c contains general utilities used by other functions
+ in the packet aliasing module. At the moment, there are functions
+ for computing IP header and TCP packet checksums.
+
+ The checksum routines are based upon example code in a Unix networking
+ text written by Stevens (sorry, I can't remember the title -- but
+ at least this is a good author).
+
+ Initial Version: August, 1996 (cjm)
+
+ Version 1.7: January 9, 1997
+ Added differential checksum update function.
+*/
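+
+/*
+ * A minimal usage sketch for the differential update below, mirroring how the
+ * protocol modules in this directory call it when a single 16-bit header
+ * field changes (the last argument counts 16-bit words; new_len is whatever
+ * value the caller computed for the rewritten packet):
+ *
+ *	u_short new_len = htons(hlen + new_dlen);
+ *
+ *	DifferentialChecksum(&pip->ip_sum, &new_len, &pip->ip_len, 1);
+ *	pip->ip_len = new_len;
+ */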
+
+#ifdef _KERNEL
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/proc.h>
+#else
+#include <freebsd/sys/types.h>
+#include <freebsd/stdio.h>
+#endif
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/tcp.h>
+
+#ifdef _KERNEL
+#include <freebsd/netinet/libalias/alias.h>
+#include <freebsd/netinet/libalias/alias_local.h>
+#else
+#include <freebsd/local/alias.h>
+#include <freebsd/local/alias_local.h>
+#endif
+
+/*
+ * Note: the checksum routines assume that the actual checksum word has
+ * been zeroed out. If the checksum word is filled with the proper value,
+ * then these routines will give a result of zero (useful for testing
+ * purposes);
+ */
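+
+/*
+ * For example, the userland protocol handlers in this directory recompute a
+ * TCP checksum by zeroing the field first and then calling TcpChecksum()
+ * (defined further below):
+ *
+ *	tc->th_sum = 0;
+ *	tc->th_sum = TcpChecksum(pip);
+ */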
+u_short
+LibAliasInternetChecksum(struct libalias *la __unused, u_short * ptr,
+ int nbytes)
+{
+ int sum, oddbyte;
+
+ LIBALIAS_LOCK(la);
+ sum = 0;
+ while (nbytes > 1) {
+ sum += *ptr++;
+ nbytes -= 2;
+ }
+ if (nbytes == 1) {
+ oddbyte = 0;
+ ((u_char *) & oddbyte)[0] = *(u_char *) ptr;
+ ((u_char *) & oddbyte)[1] = 0;
+ sum += oddbyte;
+ }
+ sum = (sum >> 16) + (sum & 0xffff);
+ sum += (sum >> 16);
+ LIBALIAS_UNLOCK(la);
+ return (~sum);
+}
+
+#ifndef _KERNEL
+u_short
+IpChecksum(struct ip *pip)
+{
+ return (LibAliasInternetChecksum(NULL, (u_short *) pip,
+ (pip->ip_hl << 2)));
+
+}
+
+u_short
+TcpChecksum(struct ip *pip)
+{
+ u_short *ptr;
+ struct tcphdr *tc;
+ int nhdr, ntcp, nbytes;
+ int sum, oddbyte;
+
+ nhdr = pip->ip_hl << 2;
+ ntcp = ntohs(pip->ip_len) - nhdr;
+
+ tc = (struct tcphdr *)ip_next(pip);
+ ptr = (u_short *) tc;
+
+/* Add up TCP header and data */
+ nbytes = ntcp;
+ sum = 0;
+ while (nbytes > 1) {
+ sum += *ptr++;
+ nbytes -= 2;
+ }
+ if (nbytes == 1) {
+ oddbyte = 0;
+ ((u_char *) & oddbyte)[0] = *(u_char *) ptr;
+ ((u_char *) & oddbyte)[1] = 0;
+ sum += oddbyte;
+ }
+/* "Pseudo-header" data */
+ ptr = (void *)&pip->ip_dst;
+ sum += *ptr++;
+ sum += *ptr;
+ ptr = (void *)&pip->ip_src;
+ sum += *ptr++;
+ sum += *ptr;
+ sum += htons((u_short) ntcp);
+ sum += htons((u_short) pip->ip_p);
+
+/* Roll over carry bits */
+ sum = (sum >> 16) + (sum & 0xffff);
+ sum += (sum >> 16);
+
+/* Return checksum */
+ return ((u_short) ~ sum);
+}
+#endif /* not _KERNEL */
+
+void
+DifferentialChecksum(u_short * cksum, void *newp, void *oldp, int n)
+{
+ int i;
+ int accumulate;
+ u_short *new = newp;
+ u_short *old = oldp;
+
+ accumulate = *cksum;
+ for (i = 0; i < n; i++) {
+ accumulate -= *new++;
+ accumulate += *old++;
+ }
+
+ if (accumulate < 0) {
+ accumulate = -accumulate;
+ accumulate = (accumulate >> 16) + (accumulate & 0xffff);
+ accumulate += accumulate >> 16;
+ *cksum = (u_short) ~ accumulate;
+ } else {
+ accumulate = (accumulate >> 16) + (accumulate & 0xffff);
+ accumulate += accumulate >> 16;
+ *cksum = (u_short) accumulate;
+ }
+}
diff --git a/freebsd/sys/netinet/pim.h b/freebsd/sys/netinet/pim.h
new file mode 100644
index 00000000..2f887cc2
--- /dev/null
+++ b/freebsd/sys/netinet/pim.h
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 1996-2000
+ * University of Southern California/Information Sciences Institute.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_PIM_HH_
+#define _NETINET_PIM_HH_
+
+/*
+ * Protocol Independent Multicast (PIM) definitions.
+ * RFC 2362, June 1998.
+ *
+ * Written by Ahmed Helmy, USC/SGI, July 1996.
+ * Modified by George Edmond Eddy (Rusty), ISI, February 1998.
+ * Modified by Pavlin Radoslavov, USC/ISI, May 1998, October 2000.
+ */
+
+#include <freebsd/sys/types.h>
+
+#ifndef _PIM_VT
+#ifndef BYTE_ORDER
+# error BYTE_ORDER is not defined!
+#endif
+#if (BYTE_ORDER != BIG_ENDIAN) && (BYTE_ORDER != LITTLE_ENDIAN)
+# error BYTE_ORDER must be defined to either BIG_ENDIAN or LITTLE_ENDIAN
+#endif
+#endif /* ! _PIM_VT */
+
+/*
+ * PIM packet header
+ */
+struct pim {
+#ifdef _PIM_VT
+ uint8_t pim_vt; /* PIM version and message type */
+#else /* ! _PIM_VT */
+#if BYTE_ORDER == BIG_ENDIAN
+ u_int pim_vers:4, /* PIM protocol version */
+ pim_type:4; /* PIM message type */
+#endif
+#if BYTE_ORDER == LITTLE_ENDIAN
+ u_int pim_type:4, /* PIM message type */
+ pim_vers:4; /* PIM protocol version */
+#endif
+#endif /* ! _PIM_VT */
+ uint8_t pim_reserved; /* Reserved */
+ uint16_t pim_cksum; /* IP-style checksum */
+};
+/* KAME-related name backward compatibility */
+#define pim_ver pim_vers
+#define pim_rsv pim_reserved
+
+#ifdef _PIM_VT
+#define PIM_MAKE_VT(v, t) (0xff & (((v) << 4) | (0x0f & (t))))
+#define PIM_VT_V(x) (((x) >> 4) & 0x0f)
+#define PIM_VT_T(x) ((x) & 0x0f)
+#endif /* _PIM_VT */
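+
+/*
+ * Example (illustrative): with _PIM_VT defined, a version-2 Register
+ * message would carry pim_vt = PIM_MAKE_VT(2, 1) == 0x21, and the fields
+ * are recovered with PIM_VT_V(0x21) == 2 and PIM_VT_T(0x21) == 1.
+ */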
+
+#define PIM_VERSION 2
+#define PIM_MINLEN 8 /* PIM message min. length */
+#define PIM_REG_MINLEN (PIM_MINLEN+20) /* PIM Register hdr + inner IPv4 hdr */
+#define PIM6_REG_MINLEN (PIM_MINLEN+40) /* PIM Register hdr + inner IPv6 hdr */
+
+/*
+ * PIM message types
+ */
+#define PIM_HELLO 0x0 /* PIM-SM and PIM-DM */
+#define PIM_REGISTER 0x1 /* PIM-SM only */
+#define PIM_REGISTER_STOP 0x2 /* PIM-SM only */
+#define PIM_JOIN_PRUNE 0x3 /* PIM-SM and PIM-DM */
+#define PIM_BOOTSTRAP 0x4 /* PIM-SM only */
+#define PIM_ASSERT 0x5 /* PIM-SM and PIM-DM */
+#define PIM_GRAFT 0x6 /* PIM-DM only */
+#define PIM_GRAFT_ACK 0x7 /* PIM-DM only */
+#define PIM_CAND_RP_ADV 0x8 /* PIM-SM only */
+#define PIM_ALL_DF_ELECTION 0xa /* Bidir-PIM-SM only */
+
+/*
+ * PIM-Register message flags
+ */
+#define PIM_BORDER_REGISTER 0x80000000U /* The Border bit (host-order) */
+#define PIM_NULL_REGISTER 0x40000000U /* The Null-Register bit (host-order)*/
+
+/*
+ * All-PIM-Routers IPv4 and IPv6 multicast addresses
+ */
+#define INADDR_ALLPIM_ROUTERS_GROUP (uint32_t)0xe000000dU /* 224.0.0.13 */
+#define IN6ADDR_LINKLOCAL_ALLPIM_ROUTERS "ff02::d"
+#define IN6ADDR_LINKLOCAL_ALLPIM_ROUTERS_INIT \
+ {{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0d }}}
+
+#endif /* _NETINET_PIM_HH_ */
diff --git a/freebsd/sys/netinet/pim_var.h b/freebsd/sys/netinet/pim_var.h
new file mode 100644
index 00000000..9d80bbb2
--- /dev/null
+++ b/freebsd/sys/netinet/pim_var.h
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 1998-2000
+ * University of Southern California/Information Sciences Institute.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_PIM_VAR_HH_
+#define _NETINET_PIM_VAR_HH_
+
+/*
+ * Protocol Independent Multicast (PIM),
+ * kernel variables and implementation-specific definitions.
+ *
+ * Written by George Edmond Eddy (Rusty), ISI, February 1998.
+ * Modified by Pavlin Radoslavov, USC/ISI, May 1998, Aug 1999, October 2000.
+ * Modified by Hitoshi Asaeda, WIDE, August 1998.
+ */
+
+/*
+ * PIM statistics kept in the kernel
+ */
+struct pimstat {
+ u_quad_t pims_rcv_total_msgs; /* total PIM messages received */
+ u_quad_t pims_rcv_total_bytes; /* total PIM bytes received */
+ u_quad_t pims_rcv_tooshort; /* rcvd with too few bytes */
+ u_quad_t pims_rcv_badsum; /* rcvd with bad checksum */
+ u_quad_t pims_rcv_badversion; /* rcvd bad PIM version */
+ u_quad_t pims_rcv_registers_msgs; /* rcvd regs. msgs (data only) */
+ u_quad_t pims_rcv_registers_bytes; /* rcvd regs. bytes (data only) */
+ u_quad_t pims_rcv_registers_wrongiif; /* rcvd regs. on wrong iif */
+ u_quad_t pims_rcv_badregisters; /* rcvd invalid registers */
+ u_quad_t pims_snd_registers_msgs; /* sent regs. msgs (data only) */
+ u_quad_t pims_snd_registers_bytes; /* sent regs. bytes (data only) */
+};
+
+#ifdef _KERNEL
+#define PIMSTAT_ADD(name, val) V_pimstat.name += (val)
+#define PIMSTAT_INC(name) PIMSTAT_ADD(name, 1)
+#endif
+
+/*
+ * Names for PIM sysctl objects
+ */
+#define PIMCTL_STATS 1 /* statistics (read-only) */
+#define PIMCTL_MAXID 2
+
+#define PIMCTL_NAMES { \
+ { 0, 0 }, \
+ { "stats", CTLTYPE_STRUCT }, \
+}
+
+#ifdef _KERNEL
+
+void pim_input(struct mbuf *, int);
+SYSCTL_DECL(_net_inet_pim);
+#endif
+
+#endif /* _NETINET_PIM_VAR_HH_ */
diff --git a/freebsd/sys/netinet/raw_ip.c b/freebsd/sys/netinet/raw_ip.c
new file mode 100644
index 00000000..fb90880f
--- /dev/null
+++ b/freebsd/sys/netinet/raw_ip.c
@@ -0,0 +1,1116 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_ipsec.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/jail.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/sys/signalvar.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sx.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/systm.h>
+
+#include <freebsd/vm/uma.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_mroute.h>
+
+#ifdef IPSEC
+#include <freebsd/netipsec/ipsec.h>
+#endif /*IPSEC*/
+
+#include <freebsd/security/mac/mac_framework.h>
+
+VNET_DEFINE(struct inpcbhead, ripcb);
+VNET_DEFINE(struct inpcbinfo, ripcbinfo);
+
+#define V_ripcb VNET(ripcb)
+#define V_ripcbinfo VNET(ripcbinfo)
+
+/*
+ * Control and data hooks for ipfw, dummynet, divert and so on.
+ * The data hooks are not used here but it is convenient
+ * to keep them all in one place.
+ */
+VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL;
+VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL;
+
+int (*ip_dn_ctl_ptr)(struct sockopt *);
+int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *);
+void (*ip_divert_ptr)(struct mbuf *, int);
+int (*ng_ipfw_input_p)(struct mbuf **, int,
+ struct ip_fw_args *, int);
+
+/*
+ * Hooks for multicast routing. They all default to NULL, so leave them not
+ * initialized and rely on BSS being set to 0.
+ */
+
+/*
+ * The socket used to communicate with the multicast routing daemon.
+ */
+VNET_DEFINE(struct socket *, ip_mrouter);
+
+/*
+ * The various mrouter and rsvp functions.
+ */
+int (*ip_mrouter_set)(struct socket *, struct sockopt *);
+int (*ip_mrouter_get)(struct socket *, struct sockopt *);
+int (*ip_mrouter_done)(void);
+int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
+ struct ip_moptions *);
+int (*mrt_ioctl)(u_long, caddr_t, int);
+int (*legal_vif_num)(int);
+u_long (*ip_mcast_src)(int);
+
+void (*rsvp_input_p)(struct mbuf *m, int off);
+int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
+void (*ip_rsvp_force_done)(struct socket *);
+
+/*
+ * Hash functions
+ */
+
+#define INP_PCBHASH_RAW_SIZE 256
+#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
+ (((proto) + (laddr) + (faddr)) % (mask) + 1)
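+/*
+ * The raw hash spreads fully specified PCBs (protocol, local and foreign
+ * address all set) over buckets 1..mask; bucket 0 is reserved for PCBs
+ * with a wildcard in any of those fields (see rip_inshash() below), so
+ * rip_input() only has to scan the matching bucket and bucket 0.
+ */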
+
+static void
+rip_inshash(struct inpcb *inp)
+{
+ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+ struct inpcbhead *pcbhash;
+ int hash;
+
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ if (inp->inp_ip_p != 0 &&
+ inp->inp_laddr.s_addr != INADDR_ANY &&
+ inp->inp_faddr.s_addr != INADDR_ANY) {
+ hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
+ inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
+ } else
+ hash = 0;
+ pcbhash = &pcbinfo->ipi_hashbase[hash];
+ LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
+}
+
+static void
+rip_delhash(struct inpcb *inp)
+{
+
+ INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ LIST_REMOVE(inp, inp_hash);
+}
+
+/*
+ * Raw interface to IP protocol.
+ */
+
+/*
+ * Initialize raw connection block q.
+ */
+static void
+rip_zone_change(void *tag)
+{
+
+ uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
+}
+
+static int
+rip_inpcb_init(void *mem, int size, int flags)
+{
+ struct inpcb *inp = mem;
+
+ INP_LOCK_INIT(inp, "inp", "rawinp");
+ return (0);
+}
+
+void
+rip_init(void)
+{
+
+ INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip");
+ LIST_INIT(&V_ripcb);
+#ifdef VIMAGE
+ V_ripcbinfo.ipi_vnet = curvnet;
+#endif
+ V_ripcbinfo.ipi_listhead = &V_ripcb;
+ V_ripcbinfo.ipi_hashbase =
+ hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask);
+ V_ripcbinfo.ipi_porthashbase =
+ hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask);
+ V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
+ NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
+ EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
+ EVENTHANDLER_PRI_ANY);
+}
+
+#ifdef VIMAGE
+void
+rip_destroy(void)
+{
+
+ hashdestroy(V_ripcbinfo.ipi_hashbase, M_PCB,
+ V_ripcbinfo.ipi_hashmask);
+ hashdestroy(V_ripcbinfo.ipi_porthashbase, M_PCB,
+ V_ripcbinfo.ipi_porthashmask);
+}
+#endif
+
+static int
+rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
+ struct sockaddr_in *ripsrc)
+{
+ int policyfail = 0;
+
+ INP_RLOCK_ASSERT(last);
+
+#ifdef IPSEC
+ /* check AH/ESP integrity. */
+ if (ipsec4_in_reject(n, last)) {
+ policyfail = 1;
+ }
+#endif /* IPSEC */
+#ifdef MAC
+ if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
+ policyfail = 1;
+#endif
+ /* Check the minimum TTL for socket. */
+ if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
+ policyfail = 1;
+ if (!policyfail) {
+ struct mbuf *opts = NULL;
+ struct socket *so;
+
+ so = last->inp_socket;
+ if ((last->inp_flags & INP_CONTROLOPTS) ||
+ (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
+ ip_savecontrol(last, &opts, ip, n);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (sbappendaddr_locked(&so->so_rcv,
+ (struct sockaddr *)ripsrc, n, opts) == 0) {
+ /* should notify about lost packet */
+ m_freem(n);
+ if (opts)
+ m_freem(opts);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ } else
+ sorwakeup_locked(so);
+ } else
+ m_freem(n);
+ return (policyfail);
+}
+
+/*
+ * Set up generic address and protocol structures for the raw_input routine,
+ * then pass them along with the mbuf chain.
+ */
+void
+rip_input(struct mbuf *m, int off)
+{
+ struct ifnet *ifp;
+ struct ip *ip = mtod(m, struct ip *);
+ int proto = ip->ip_p;
+ struct inpcb *inp, *last;
+ struct sockaddr_in ripsrc;
+ int hash;
+
+ bzero(&ripsrc, sizeof(ripsrc));
+ ripsrc.sin_len = sizeof(ripsrc);
+ ripsrc.sin_family = AF_INET;
+ ripsrc.sin_addr = ip->ip_src;
+ last = NULL;
+
+ ifp = m->m_pkthdr.rcvif;
+
+ hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
+ INP_INFO_RLOCK(&V_ripcbinfo);
+ LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
+ if (inp->inp_ip_p != proto)
+ continue;
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
+ continue;
+ if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
+ continue;
+ if (jailed_without_vnet(inp->inp_cred)) {
+ /*
+ * XXX: If faddr was bound to multicast group,
+ * jailed raw socket will drop datagram.
+ */
+ if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
+ continue;
+ }
+ if (last != NULL) {
+ struct mbuf *n;
+
+ n = m_copy(m, 0, (int)M_COPYALL);
+ if (n != NULL)
+ (void) rip_append(last, ip, n, &ripsrc);
+ /* XXX count dropped packet */
+ INP_RUNLOCK(last);
+ }
+ INP_RLOCK(inp);
+ last = inp;
+ }
+ LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
+ if (inp->inp_ip_p && inp->inp_ip_p != proto)
+ continue;
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (!in_nullhost(inp->inp_laddr) &&
+ !in_hosteq(inp->inp_laddr, ip->ip_dst))
+ continue;
+ if (!in_nullhost(inp->inp_faddr) &&
+ !in_hosteq(inp->inp_faddr, ip->ip_src))
+ continue;
+ if (jailed_without_vnet(inp->inp_cred)) {
+ /*
+ * Allow raw socket in jail to receive multicast;
+ * assume process had PRIV_NETINET_RAW at attach,
+ * and fall through into normal filter path if so.
+ */
+ if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
+ prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
+ continue;
+ }
+ /*
+ * If this raw socket has multicast state, and we
+ * have received a multicast, check if this socket
+ * should receive it, as multicast filtering is now
+ * the responsibility of the transport layer.
+ */
+ if (inp->inp_moptions != NULL &&
+ IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
+ /*
+ * If the incoming datagram is for IGMP, allow it
+ * through unconditionally to the raw socket.
+ *
+ * In the case of IGMPv2, we may not have explicitly
+ * joined the group, and may have set IFF_ALLMULTI
+ * on the interface. imo_multi_filter() may discard
+ * control traffic we actually need to see.
+ *
+			 * Userland multicast routing daemons should continue
+			 * to filter the control traffic appropriately.
+ */
+ int blocked;
+
+ blocked = MCAST_PASS;
+ if (proto != IPPROTO_IGMP) {
+ struct sockaddr_in group;
+
+ bzero(&group, sizeof(struct sockaddr_in));
+ group.sin_len = sizeof(struct sockaddr_in);
+ group.sin_family = AF_INET;
+ group.sin_addr = ip->ip_dst;
+
+ blocked = imo_multi_filter(inp->inp_moptions,
+ ifp,
+ (struct sockaddr *)&group,
+ (struct sockaddr *)&ripsrc);
+ }
+
+ if (blocked != MCAST_PASS) {
+ IPSTAT_INC(ips_notmember);
+ continue;
+ }
+ }
+ if (last != NULL) {
+ struct mbuf *n;
+
+ n = m_copy(m, 0, (int)M_COPYALL);
+ if (n != NULL)
+ (void) rip_append(last, ip, n, &ripsrc);
+ /* XXX count dropped packet */
+ INP_RUNLOCK(last);
+ }
+ INP_RLOCK(inp);
+ last = inp;
+ }
+ INP_INFO_RUNLOCK(&V_ripcbinfo);
+ if (last != NULL) {
+ if (rip_append(last, ip, m, &ripsrc) != 0)
+ IPSTAT_INC(ips_delivered);
+ INP_RUNLOCK(last);
+ } else {
+ m_freem(m);
+ IPSTAT_INC(ips_noproto);
+ IPSTAT_DEC(ips_delivered);
+ }
+}
+
+/*
+ * Generate IP header and pass packet to ip_output. Tack on options user may
+ * have set up with a control call.
+ */
+int
+rip_output(struct mbuf *m, struct socket *so, u_long dst)
+{
+ struct ip *ip;
+ int error;
+ struct inpcb *inp = sotoinpcb(so);
+ int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
+ IP_ALLOWBROADCAST;
+
+ /*
+ * If the user handed us a complete IP packet, use it. Otherwise,
+ * allocate an mbuf for a header and fill it in.
+ */
+ if ((inp->inp_flags & INP_HDRINCL) == 0) {
+ if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
+ m_freem(m);
+ return(EMSGSIZE);
+ }
+ M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
+ if (m == NULL)
+ return(ENOBUFS);
+
+ INP_RLOCK(inp);
+ ip = mtod(m, struct ip *);
+ ip->ip_tos = inp->inp_ip_tos;
+ if (inp->inp_flags & INP_DONTFRAG)
+ ip->ip_off = IP_DF;
+ else
+ ip->ip_off = 0;
+ ip->ip_p = inp->inp_ip_p;
+ ip->ip_len = m->m_pkthdr.len;
+ ip->ip_src = inp->inp_laddr;
+ if (jailed(inp->inp_cred)) {
+ /*
+ * prison_local_ip4() would be good enough but would
+ * let a source of INADDR_ANY pass, which we do not
+ * want to see from jails. We do not go through the
+ * pain of in_pcbladdr() for raw sockets.
+ */
+ if (ip->ip_src.s_addr == INADDR_ANY)
+ error = prison_get_ip4(inp->inp_cred,
+ &ip->ip_src);
+ else
+ error = prison_local_ip4(inp->inp_cred,
+ &ip->ip_src);
+ if (error != 0) {
+ INP_RUNLOCK(inp);
+ m_freem(m);
+ return (error);
+ }
+ }
+ ip->ip_dst.s_addr = dst;
+ ip->ip_ttl = inp->inp_ip_ttl;
+ } else {
+ if (m->m_pkthdr.len > IP_MAXPACKET) {
+ m_freem(m);
+ return(EMSGSIZE);
+ }
+ INP_RLOCK(inp);
+ ip = mtod(m, struct ip *);
+ error = prison_check_ip4(inp->inp_cred, &ip->ip_src);
+ if (error != 0) {
+ INP_RUNLOCK(inp);
+ m_freem(m);
+ return (error);
+ }
+
+ /*
+ * Don't allow both user specified and setsockopt options,
+ * and don't allow packet length sizes that will crash.
+ */
+ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options)
+ || (ip->ip_len > m->m_pkthdr.len)
+ || (ip->ip_len < (ip->ip_hl << 2))) {
+ INP_RUNLOCK(inp);
+ m_freem(m);
+ return (EINVAL);
+ }
+ if (ip->ip_id == 0)
+ ip->ip_id = ip_newid();
+
+ /*
+ * XXX prevent ip_output from overwriting header fields.
+ */
+ flags |= IP_RAWOUTPUT;
+ IPSTAT_INC(ips_rawout);
+ }
+
+ if (inp->inp_flags & INP_ONESBCAST)
+ flags |= IP_SENDONES;
+
+#ifdef MAC
+ mac_inpcb_create_mbuf(inp, m);
+#endif
+
+ error = ip_output(m, inp->inp_options, NULL, flags,
+ inp->inp_moptions, inp);
+ INP_RUNLOCK(inp);
+ return (error);
+}
+
+/*
+ * Raw IP socket option processing.
+ *
+ * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
+ * only be created by a privileged process, and as such, socket option
+ * operations to manage system properties on any raw socket were allowed to
+ * take place without explicit additional access control checks. However,
+ * raw sockets can now also be created in jail(), and therefore explicit
+ * checks are now required. Likewise, raw sockets can be used by a process
+ * after it gives up privilege, so some caution is required. For options
+ * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
+ * performed in ip_ctloutput() and therefore no check occurs here.
+ * Unilaterally checking priv_check() here breaks normal IP socket option
+ * operations on raw sockets.
+ *
+ * When adding new socket options here, make sure to add access control
+ * checks here as necessary.
+ */
+int
+rip_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ struct inpcb *inp = sotoinpcb(so);
+ int error, optval;
+
+ if (sopt->sopt_level != IPPROTO_IP) {
+ if ((sopt->sopt_level == SOL_SOCKET) &&
+ (sopt->sopt_name == SO_SETFIB)) {
+ inp->inp_inc.inc_fibnum = so->so_fibnum;
+ return (0);
+ }
+ return (EINVAL);
+ }
+
+ error = 0;
+ switch (sopt->sopt_dir) {
+ case SOPT_GET:
+ switch (sopt->sopt_name) {
+ case IP_HDRINCL:
+ optval = inp->inp_flags & INP_HDRINCL;
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+
+ case IP_FW3: /* generic ipfw v.3 functions */
+ case IP_FW_ADD: /* ADD actually returns the body... */
+ case IP_FW_GET:
+ case IP_FW_TABLE_GETSIZE:
+ case IP_FW_TABLE_LIST:
+ case IP_FW_NAT_GET_CONFIG:
+ case IP_FW_NAT_GET_LOG:
+ if (V_ip_fw_ctl_ptr != NULL)
+ error = V_ip_fw_ctl_ptr(sopt);
+ else
+ error = ENOPROTOOPT;
+ break;
+
+ case IP_DUMMYNET3: /* generic dummynet v.3 functions */
+ case IP_DUMMYNET_GET:
+ if (ip_dn_ctl_ptr != NULL)
+ error = ip_dn_ctl_ptr(sopt);
+ else
+ error = ENOPROTOOPT;
+ break ;
+
+ case MRT_INIT:
+ case MRT_DONE:
+ case MRT_ADD_VIF:
+ case MRT_DEL_VIF:
+ case MRT_ADD_MFC:
+ case MRT_DEL_MFC:
+ case MRT_VERSION:
+ case MRT_ASSERT:
+ case MRT_API_SUPPORT:
+ case MRT_API_CONFIG:
+ case MRT_ADD_BW_UPCALL:
+ case MRT_DEL_BW_UPCALL:
+ error = priv_check(curthread, PRIV_NETINET_MROUTE);
+ if (error != 0)
+ return (error);
+ error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
+ EOPNOTSUPP;
+ break;
+
+ default:
+ error = ip_ctloutput(so, sopt);
+ break;
+ }
+ break;
+
+ case SOPT_SET:
+ switch (sopt->sopt_name) {
+ case IP_HDRINCL:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ break;
+ if (optval)
+ inp->inp_flags |= INP_HDRINCL;
+ else
+ inp->inp_flags &= ~INP_HDRINCL;
+ break;
+
+ case IP_FW3: /* generic ipfw v.3 functions */
+ case IP_FW_ADD:
+ case IP_FW_DEL:
+ case IP_FW_FLUSH:
+ case IP_FW_ZERO:
+ case IP_FW_RESETLOG:
+ case IP_FW_TABLE_ADD:
+ case IP_FW_TABLE_DEL:
+ case IP_FW_TABLE_FLUSH:
+ case IP_FW_NAT_CFG:
+ case IP_FW_NAT_DEL:
+ if (V_ip_fw_ctl_ptr != NULL)
+ error = V_ip_fw_ctl_ptr(sopt);
+ else
+ error = ENOPROTOOPT;
+ break;
+
+ case IP_DUMMYNET3: /* generic dummynet v.3 functions */
+ case IP_DUMMYNET_CONFIGURE:
+ case IP_DUMMYNET_DEL:
+ case IP_DUMMYNET_FLUSH:
+ if (ip_dn_ctl_ptr != NULL)
+ error = ip_dn_ctl_ptr(sopt);
+ else
+ error = ENOPROTOOPT ;
+ break ;
+
+ case IP_RSVP_ON:
+ error = priv_check(curthread, PRIV_NETINET_MROUTE);
+ if (error != 0)
+ return (error);
+ error = ip_rsvp_init(so);
+ break;
+
+ case IP_RSVP_OFF:
+ error = priv_check(curthread, PRIV_NETINET_MROUTE);
+ if (error != 0)
+ return (error);
+ error = ip_rsvp_done();
+ break;
+
+ case IP_RSVP_VIF_ON:
+ case IP_RSVP_VIF_OFF:
+ error = priv_check(curthread, PRIV_NETINET_MROUTE);
+ if (error != 0)
+ return (error);
+ error = ip_rsvp_vif ?
+ ip_rsvp_vif(so, sopt) : EINVAL;
+ break;
+
+ case MRT_INIT:
+ case MRT_DONE:
+ case MRT_ADD_VIF:
+ case MRT_DEL_VIF:
+ case MRT_ADD_MFC:
+ case MRT_DEL_MFC:
+ case MRT_VERSION:
+ case MRT_ASSERT:
+ case MRT_API_SUPPORT:
+ case MRT_API_CONFIG:
+ case MRT_ADD_BW_UPCALL:
+ case MRT_DEL_BW_UPCALL:
+ error = priv_check(curthread, PRIV_NETINET_MROUTE);
+ if (error != 0)
+ return (error);
+ error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
+ EOPNOTSUPP;
+ break;
+
+ default:
+ error = ip_ctloutput(so, sopt);
+ break;
+ }
+ break;
+ }
+
+ return (error);
+}
+
+/*
+ * This function exists solely to receive the PRC_IFDOWN messages which are
+ * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls
+ * in_ifadown() to remove all routes corresponding to that address. It also
+ * receives the PRC_IFUP messages from if_up() and reinstalls the interface
+ * routes.
+ */
+void
+rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
+{
+ struct in_ifaddr *ia;
+ struct ifnet *ifp;
+ int err;
+ int flags;
+
+ switch (cmd) {
+ case PRC_IFDOWN:
+ IN_IFADDR_RLOCK();
+ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ if (ia->ia_ifa.ifa_addr == sa
+ && (ia->ia_flags & IFA_ROUTE)) {
+ ifa_ref(&ia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+ /*
+ * in_ifscrub kills the interface route.
+ */
+ in_ifscrub(ia->ia_ifp, ia);
+ /*
+ * in_ifadown gets rid of all the rest of the
+ * routes. This is not quite the right thing
+ * to do, but at least if we are running a
+ * routing process they will come back.
+ */
+ in_ifadown(&ia->ia_ifa, 0);
+ ifa_free(&ia->ia_ifa);
+ break;
+ }
+ }
+ if (ia == NULL) /* If ia matched, already unlocked. */
+ IN_IFADDR_RUNLOCK();
+ break;
+
+ case PRC_IFUP:
+ IN_IFADDR_RLOCK();
+ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ if (ia->ia_ifa.ifa_addr == sa)
+ break;
+ }
+ if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) {
+ IN_IFADDR_RUNLOCK();
+ return;
+ }
+ ifa_ref(&ia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+ flags = RTF_UP;
+ ifp = ia->ia_ifa.ifa_ifp;
+
+ if ((ifp->if_flags & IFF_LOOPBACK)
+ || (ifp->if_flags & IFF_POINTOPOINT))
+ flags |= RTF_HOST;
+
+ err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
+ if (err == 0)
+ ia->ia_flags |= IFA_ROUTE;
+ err = ifa_add_loopback_route((struct ifaddr *)ia, sa);
+ ifa_free(&ia->ia_ifa);
+ break;
+ }
+}
+
+u_long rip_sendspace = 9216;
+u_long rip_recvspace = 9216;
+
+SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
+ &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
+SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
+ &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
+
+static int
+rip_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct inpcb *inp;
+ int error;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
+
+ error = priv_check(td, PRIV_NETINET_RAW);
+ if (error)
+ return (error);
+ if (proto >= IPPROTO_MAX || proto < 0)
+ return EPROTONOSUPPORT;
+ error = soreserve(so, rip_sendspace, rip_recvspace);
+ if (error)
+ return (error);
+ INP_INFO_WLOCK(&V_ripcbinfo);
+ error = in_pcballoc(so, &V_ripcbinfo);
+ if (error) {
+ INP_INFO_WUNLOCK(&V_ripcbinfo);
+ return (error);
+ }
+ inp = (struct inpcb *)so->so_pcb;
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_ip_p = proto;
+ inp->inp_ip_ttl = V_ip_defttl;
+ rip_inshash(inp);
+ INP_INFO_WUNLOCK(&V_ripcbinfo);
+ INP_WUNLOCK(inp);
+ return (0);
+}
+
+static void
+rip_detach(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
+ KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
+ ("rip_detach: not closed"));
+
+ INP_INFO_WLOCK(&V_ripcbinfo);
+ INP_WLOCK(inp);
+ rip_delhash(inp);
+ if (so == V_ip_mrouter && ip_mrouter_done)
+ ip_mrouter_done();
+ if (ip_rsvp_force_done)
+ ip_rsvp_force_done(so);
+ if (so == V_ip_rsvpd)
+ ip_rsvp_done();
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+ INP_INFO_WUNLOCK(&V_ripcbinfo);
+}
+
+static void
+rip_dodisconnect(struct socket *so, struct inpcb *inp)
+{
+
+ INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ rip_delhash(inp);
+ inp->inp_faddr.s_addr = INADDR_ANY;
+ rip_inshash(inp);
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_ISCONNECTED;
+ SOCK_UNLOCK(so);
+}
+
+static void
+rip_abort(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
+
+ INP_INFO_WLOCK(&V_ripcbinfo);
+ INP_WLOCK(inp);
+ rip_dodisconnect(so, inp);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_ripcbinfo);
+}
+
+static void
+rip_close(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("rip_close: inp == NULL"));
+
+ INP_INFO_WLOCK(&V_ripcbinfo);
+ INP_WLOCK(inp);
+ rip_dodisconnect(so, inp);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_ripcbinfo);
+}
+
+static int
+rip_disconnect(struct socket *so)
+{
+ struct inpcb *inp;
+
+ if ((so->so_state & SS_ISCONNECTED) == 0)
+ return (ENOTCONN);
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
+
+ INP_INFO_WLOCK(&V_ripcbinfo);
+ INP_WLOCK(inp);
+ rip_dodisconnect(so, inp);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_ripcbinfo);
+ return (0);
+}
+
+static int
+rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct sockaddr_in *addr = (struct sockaddr_in *)nam;
+ struct inpcb *inp;
+ int error;
+
+ if (nam->sa_len != sizeof(*addr))
+ return (EINVAL);
+
+ error = prison_check_ip4(td->td_ucred, &addr->sin_addr);
+ if (error != 0)
+ return (error);
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
+
+ if (TAILQ_EMPTY(&V_ifnet) ||
+ (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
+ (addr->sin_addr.s_addr &&
+ (inp->inp_flags & INP_BINDANY) == 0 &&
+ ifa_ifwithaddr_check((struct sockaddr *)addr) == 0))
+ return (EADDRNOTAVAIL);
+
+ INP_INFO_WLOCK(&V_ripcbinfo);
+ INP_WLOCK(inp);
+ rip_delhash(inp);
+ inp->inp_laddr = addr->sin_addr;
+ rip_inshash(inp);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_ripcbinfo);
+ return (0);
+}
+
+static int
+rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct sockaddr_in *addr = (struct sockaddr_in *)nam;
+ struct inpcb *inp;
+
+ if (nam->sa_len != sizeof(*addr))
+ return (EINVAL);
+ if (TAILQ_EMPTY(&V_ifnet))
+ return (EADDRNOTAVAIL);
+ if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
+ return (EAFNOSUPPORT);
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
+
+ INP_INFO_WLOCK(&V_ripcbinfo);
+ INP_WLOCK(inp);
+ rip_delhash(inp);
+ inp->inp_faddr = addr->sin_addr;
+ rip_inshash(inp);
+ soisconnected(so);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_ripcbinfo);
+ return (0);
+}
+
+static int
+rip_shutdown(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
+
+ INP_WLOCK(inp);
+ socantsendmore(so);
+ INP_WUNLOCK(inp);
+ return (0);
+}
+
+static int
+rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
+ struct mbuf *control, struct thread *td)
+{
+ struct inpcb *inp;
+ u_long dst;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("rip_send: inp == NULL"));
+
+ /*
+ * Note: 'dst' reads below are unlocked.
+ */
+ if (so->so_state & SS_ISCONNECTED) {
+ if (nam) {
+ m_freem(m);
+ return (EISCONN);
+ }
+ dst = inp->inp_faddr.s_addr; /* Unlocked read. */
+ } else {
+ if (nam == NULL) {
+ m_freem(m);
+ return (ENOTCONN);
+ }
+ dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
+ }
+ return (rip_output(m, so, dst));
+}
+
+static int
+rip_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, i, n;
+ struct inpcb *inp, **inp_list;
+ inp_gen_t gencnt;
+ struct xinpgen xig;
+
+ /*
+ * The process of preparing the TCB list is too time-consuming and
+ * resource-intensive to repeat twice on every request.
+ */
+ if (req->oldptr == 0) {
+ n = V_ripcbinfo.ipi_count;
+ n += imax(n / 8, 10);
+ req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
+ return (0);
+ }
+
+ if (req->newptr != 0)
+ return (EPERM);
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ INP_INFO_RLOCK(&V_ripcbinfo);
+ gencnt = V_ripcbinfo.ipi_gencnt;
+ n = V_ripcbinfo.ipi_count;
+ INP_INFO_RUNLOCK(&V_ripcbinfo);
+
+ xig.xig_len = sizeof xig;
+ xig.xig_count = n;
+ xig.xig_gen = gencnt;
+ xig.xig_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ if (error)
+ return (error);
+
+ inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
+ if (inp_list == 0)
+ return (ENOMEM);
+
+ INP_INFO_RLOCK(&V_ripcbinfo);
+ for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n;
+ inp = LIST_NEXT(inp, inp_list)) {
+ INP_WLOCK(inp);
+ if (inp->inp_gencnt <= gencnt &&
+ cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
+ in_pcbref(inp);
+ inp_list[i++] = inp;
+ }
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(&V_ripcbinfo);
+ n = i;
+
+ error = 0;
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ INP_RLOCK(inp);
+ if (inp->inp_gencnt <= gencnt) {
+ struct xinpcb xi;
+
+ bzero(&xi, sizeof(xi));
+ xi.xi_len = sizeof xi;
+ /* XXX should avoid extra copy */
+ bcopy(inp, &xi.xi_inp, sizeof *inp);
+ if (inp->inp_socket)
+ sotoxsocket(inp->inp_socket, &xi.xi_socket);
+ INP_RUNLOCK(inp);
+ error = SYSCTL_OUT(req, &xi, sizeof xi);
+ } else
+ INP_RUNLOCK(inp);
+ }
+ INP_INFO_WLOCK(&V_ripcbinfo);
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ INP_WLOCK(inp);
+ if (!in_pcbrele(inp))
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_WUNLOCK(&V_ripcbinfo);
+
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state. If the
+ * generation differs from what we told her before, she knows
+ * that something happened while we were processing this
+ * request, and it might be necessary to retry.
+ */
+ INP_INFO_RLOCK(&V_ripcbinfo);
+ xig.xig_gen = V_ripcbinfo.ipi_gencnt;
+ xig.xig_sogen = so_gencnt;
+ xig.xig_count = V_ripcbinfo.ipi_count;
+ INP_INFO_RUNLOCK(&V_ripcbinfo);
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ }
+ free(inp_list, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
+ rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
+
+struct pr_usrreqs rip_usrreqs = {
+ .pru_abort = rip_abort,
+ .pru_attach = rip_attach,
+ .pru_bind = rip_bind,
+ .pru_connect = rip_connect,
+ .pru_control = in_control,
+ .pru_detach = rip_detach,
+ .pru_disconnect = rip_disconnect,
+ .pru_peeraddr = in_getpeeraddr,
+ .pru_send = rip_send,
+ .pru_shutdown = rip_shutdown,
+ .pru_sockaddr = in_getsockaddr,
+ .pru_sosetlabel = in_pcbsosetlabel,
+ .pru_close = rip_close,
+};
diff --git a/freebsd/sys/netinet/sctp.h b/freebsd/sys/netinet/sctp.h
new file mode 100644
index 00000000..bf188a23
--- /dev/null
+++ b/freebsd/sys/netinet/sctp.h
@@ -0,0 +1,549 @@
+/*-
+ * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/* $KAME: sctp.h,v 1.18 2005/03/06 16:04:16 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef _NETINET_SCTP_HH_
+#define _NETINET_SCTP_HH_
+
+#include <freebsd/sys/types.h>
+
+
+#define SCTP_PACKED __attribute__((packed))
+
+/*
+ * SCTP protocol - RFC2960.
+ */
+struct sctphdr {
+ uint16_t src_port; /* source port */
+ uint16_t dest_port; /* destination port */
+ uint32_t v_tag; /* verification tag of packet */
+ uint32_t checksum; /* Adler32 C-Sum */
+ /* chunks follow... */
+} SCTP_PACKED;
+
+/*
+ * SCTP Chunks
+ */
+struct sctp_chunkhdr {
+ uint8_t chunk_type; /* chunk type */
+ uint8_t chunk_flags; /* chunk flags */
+ uint16_t chunk_length; /* chunk length */
+ /* optional params follow */
+} SCTP_PACKED;
+
+/*
+ * SCTP chunk parameters
+ */
+struct sctp_paramhdr {
+ uint16_t param_type; /* parameter type */
+ uint16_t param_length; /* parameter length */
+} SCTP_PACKED;
+
+/*
+ * user socket options: socket API defined
+ */
+/*
+ * read-write options
+ */
+#define SCTP_RTOINFO 0x00000001
+#define SCTP_ASSOCINFO 0x00000002
+#define SCTP_INITMSG 0x00000003
+#define SCTP_NODELAY 0x00000004
+#define SCTP_AUTOCLOSE 0x00000005
+#define SCTP_SET_PEER_PRIMARY_ADDR 0x00000006
+#define SCTP_PRIMARY_ADDR 0x00000007
+#define SCTP_ADAPTATION_LAYER 0x00000008
+/* same as above */
+#define SCTP_ADAPTION_LAYER 0x00000008
+#define SCTP_DISABLE_FRAGMENTS 0x00000009
+#define SCTP_PEER_ADDR_PARAMS 0x0000000a
+#define SCTP_DEFAULT_SEND_PARAM 0x0000000b
+/* ancillary data/notification interest options */
+#define SCTP_EVENTS 0x0000000c
+/* Without this applied we will give V4 and V6 addresses on a V6 socket */
+#define SCTP_I_WANT_MAPPED_V4_ADDR 0x0000000d
+#define SCTP_MAXSEG 0x0000000e
+#define SCTP_DELAYED_SACK 0x0000000f
+#define SCTP_FRAGMENT_INTERLEAVE 0x00000010
+#define SCTP_PARTIAL_DELIVERY_POINT 0x00000011
+/* authentication support */
+#define SCTP_AUTH_CHUNK 0x00000012
+#define SCTP_AUTH_KEY 0x00000013
+#define SCTP_HMAC_IDENT 0x00000014
+#define SCTP_AUTH_ACTIVE_KEY 0x00000015
+#define SCTP_AUTH_DELETE_KEY 0x00000016
+#define SCTP_USE_EXT_RCVINFO 0x00000017
+#define SCTP_AUTO_ASCONF 0x00000018 /* rw */
+#define SCTP_MAXBURST 0x00000019 /* rw */
+#define SCTP_MAX_BURST 0x00000019 /* rw */
+/* assoc level context */
+#define SCTP_CONTEXT 0x0000001a /* rw */
+/* explicit EOR signalling */
+#define SCTP_EXPLICIT_EOR 0x0000001b
+#define SCTP_REUSE_PORT 0x0000001c /* rw */
+#define SCTP_AUTH_DEACTIVATE_KEY 0x0000001d
+
+/*
+ * read-only options
+ */
+#define SCTP_STATUS 0x00000100
+#define SCTP_GET_PEER_ADDR_INFO 0x00000101
+/* authentication support */
+#define SCTP_PEER_AUTH_CHUNKS 0x00000102
+#define SCTP_LOCAL_AUTH_CHUNKS 0x00000103
+#define SCTP_GET_ASSOC_NUMBER 0x00000104 /* ro */
+#define SCTP_GET_ASSOC_ID_LIST 0x00000105 /* ro */
+#define SCTP_TIMEOUTS 0x00000106
+
+/*
+ * user socket options: BSD implementation specific
+ */
+/*
+ * Blocking I/O is enabled on any TCP-type socket by default. For the UDP
+ * model, if this option is turned on, the socket send buffer is shared
+ * amongst all associations. The default for the UDP model is as if
+ * SS_NBIO were set: every association has a separate send limit, but it
+ * will never block; instead you get EAGAIN back if you try to send too
+ * much. If you want blocking semantics, you set this option at the cost
+ * of sharing one socket send buffer size amongst all associations.
+ * Peeled-off sockets turn this option off and block, but since both TCP
+ * and peeled-off sockets have only one association per socket this is
+ * fine. It probably does NOT make sense to set this option on a TCP-model
+ * or peeled-off UDP-model socket, but we do allow you to do so. You just
+ * use the normal syscall to toggle SS_NBIO the way you want.
+ *
+ * Blocking I/O is controlled by the SS_NBIO flag on the socket state so_state
+ * field.
+ */
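+
+/*
+ * Sketch (illustrative, not part of the options above): SS_NBIO is toggled
+ * with the usual non-blocking I/O interfaces, e.g. from userland:
+ *
+ *      int fl = fcntl(sd, F_GETFL, 0);
+ *      fcntl(sd, F_SETFL, fl | O_NONBLOCK);  (set; clear with fl & ~O_NONBLOCK)
+ */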
+
+/* these should probably go into sockets API */
+#define SCTP_RESET_STREAMS 0x00001004 /* wo */
+
+
+/* here on down are more implementation specific */
+#define SCTP_SET_DEBUG_LEVEL 0x00001005
+#define SCTP_CLR_STAT_LOG 0x00001007
+/* CMT ON/OFF socket option */
+#define SCTP_CMT_ON_OFF 0x00001200
+#define SCTP_CMT_USE_DAC 0x00001201
+/* JRS - Pluggable Congestion Control Socket option */
+#define SCTP_PLUGGABLE_CC 0x00001202
+
+/* read only */
+#define SCTP_GET_SNDBUF_USE 0x00001101
+#define SCTP_GET_STAT_LOG 0x00001103
+#define SCTP_PCB_STATUS 0x00001104
+#define SCTP_GET_NONCE_VALUES 0x00001105
+
+
+/* Special hook for dynamically setting the primary for all assocs;
+ * this is a write-only option that requires root privilege.
+ */
+#define SCTP_SET_DYNAMIC_PRIMARY 0x00002001
+
+/* VRF (virtual router feature) and multi-VRF support
+ * options. VRF's provide splits within a router
+ * that give the views of multiple routers. A
+ * standard host, without VRF support, is just
+ * a single VRF. If VRF's are supported then
+ * the transport must be VRF aware. This means
+ * that every socket call coming in must be directed
+ * within the endpoint to one of the VRF's it belongs
+ * to. The endpoint, before binding, may select
+ * the "default" VRF it is in by using a set socket
+ * option with SCTP_VRF_ID. This will also
+ * get propagated to the default VRF. Once the
+ * endpoint binds an address then it CANNOT add
+ * additional VRF's to become a Multi-VRF endpoint.
+ *
+ * Before BINDING, additional VRF's can be added with
+ * the SCTP_ADD_VRF_ID call or deleted with
+ * SCTP_DEL_VRF_ID.
+ *
+ * Associations are ALWAYS contained inside a single
+ * VRF. They cannot reside in two (or more) VRF's. Incoming
+ * packets, assuming the router is VRF aware, can always
+ * tell us what VRF they arrived on. A host not supporting
+ * any VRF's will find that the packets always arrived on the
+ * single VRF that the host has.
+ *
+ */
+
+#define SCTP_VRF_ID 0x00003001
+#define SCTP_ADD_VRF_ID 0x00003002
+#define SCTP_GET_VRF_IDS 0x00003003
+#define SCTP_GET_ASOC_VRF 0x00003004
+#define SCTP_DEL_VRF_ID 0x00003005
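+
+/*
+ * Sketch (illustrative; assumes the option value is a 32-bit VRF id):
+ * selecting the endpoint's default VRF before binding, as described above,
+ * would look roughly like:
+ *
+ *      uint32_t vrf_id = 1;    (hypothetical VRF id)
+ *      setsockopt(sd, IPPROTO_SCTP, SCTP_VRF_ID, &vrf_id, sizeof(vrf_id));
+ */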
+
+/*
+ * If you enable packet logging you can get
+ * a poor man's ethereal output in binary
+ * form. Note this is a compile option to
+ * the kernel, SCTP_PACKET_LOGGING, and
+ * without it in your kernel you
+ * will get an EOPNOTSUPP error.
+ */
+#define SCTP_GET_PACKET_LOG 0x00004001
+
+/*
+ * hidden implementation-specific options; these are NOT user visible (they
+ * should move out of sctp.h)
+ */
+/* sctp_bindx() flags as hidden socket options */
+#define SCTP_BINDX_ADD_ADDR 0x00008001
+#define SCTP_BINDX_REM_ADDR 0x00008002
+/* Hidden socket option that gets the addresses */
+#define SCTP_GET_PEER_ADDRESSES 0x00008003
+#define SCTP_GET_LOCAL_ADDRESSES 0x00008004
+/* return the total count in bytes needed to hold all local addresses bound */
+#define SCTP_GET_LOCAL_ADDR_SIZE 0x00008005
+/* Return the total count in bytes needed to hold the remote address */
+#define SCTP_GET_REMOTE_ADDR_SIZE 0x00008006
+/* hidden option for connectx */
+#define SCTP_CONNECT_X 0x00008007
+/* hidden option for connectx_delayed, part of sendx */
+#define SCTP_CONNECT_X_DELAYED 0x00008008
+#define SCTP_CONNECT_X_COMPLETE 0x00008009
+/* hidden socket option based sctp_peeloff */
+#define SCTP_PEELOFF 0x0000800a
+/* the real worker for sctp_getaddrlen() */
+#define SCTP_GET_ADDR_LEN 0x0000800b
+/* temporary workaround for Apple listen() issue, no args used */
+#define SCTP_LISTEN_FIX 0x0000800c
+/* Debug things that need to be purged */
+#define SCTP_SET_INITIAL_DBG_SEQ 0x00009f00
+
+/* JRS - Supported congestion control modules for pluggable
+ * congestion control
+ */
+/* Standard TCP Congestion Control */
+#define SCTP_CC_RFC2581 0x00000000
+/* High Speed TCP Congestion Control (Floyd) */
+#define SCTP_CC_HSTCP 0x00000001
+/* HTCP Congestion Control */
+#define SCTP_CC_HTCP 0x00000002
+
+
+/* fragment interleave constants
+ * the setting must be one of these or
+ * EINVAL is returned.
+ */
+#define SCTP_FRAG_LEVEL_0 0x00000000
+#define SCTP_FRAG_LEVEL_1 0x00000001
+#define SCTP_FRAG_LEVEL_2 0x00000002
+
+/*
+ * user state values
+ */
+#define SCTP_CLOSED 0x0000
+#define SCTP_BOUND 0x1000
+#define SCTP_LISTEN 0x2000
+#define SCTP_COOKIE_WAIT 0x0002
+#define SCTP_COOKIE_ECHOED 0x0004
+#define SCTP_ESTABLISHED 0x0008
+#define SCTP_SHUTDOWN_SENT 0x0010
+#define SCTP_SHUTDOWN_RECEIVED 0x0020
+#define SCTP_SHUTDOWN_ACK_SENT 0x0040
+#define SCTP_SHUTDOWN_PENDING 0x0080
+
+/*
+ * SCTP operational error codes (user visible)
+ */
+#define SCTP_CAUSE_NO_ERROR 0x0000
+#define SCTP_CAUSE_INVALID_STREAM 0x0001
+#define SCTP_CAUSE_MISSING_PARAM 0x0002
+#define SCTP_CAUSE_STALE_COOKIE 0x0003
+#define SCTP_CAUSE_OUT_OF_RESC 0x0004
+#define SCTP_CAUSE_UNRESOLVABLE_ADDR 0x0005
+#define SCTP_CAUSE_UNRECOG_CHUNK 0x0006
+#define SCTP_CAUSE_INVALID_PARAM 0x0007
+#define SCTP_CAUSE_UNRECOG_PARAM 0x0008
+#define SCTP_CAUSE_NO_USER_DATA 0x0009
+#define SCTP_CAUSE_COOKIE_IN_SHUTDOWN 0x000a
+#define SCTP_CAUSE_RESTART_W_NEWADDR 0x000b
+#define SCTP_CAUSE_USER_INITIATED_ABT 0x000c
+#define SCTP_CAUSE_PROTOCOL_VIOLATION 0x000d
+
+/* Error causes from RFC5061 */
+#define SCTP_CAUSE_DELETING_LAST_ADDR 0x00a0
+#define SCTP_CAUSE_RESOURCE_SHORTAGE 0x00a1
+#define SCTP_CAUSE_DELETING_SRC_ADDR 0x00a2
+#define SCTP_CAUSE_ILLEGAL_ASCONF_ACK 0x00a3
+#define SCTP_CAUSE_REQUEST_REFUSED 0x00a4
+
+/* Error causes from nat-draft */
+#define SCTP_CAUSE_NAT_COLLIDING_STATE 0x00b0
+#define SCTP_CAUSE_NAT_MISSING_STATE 0x00b1
+
+/* Error causes from RFC4895 */
+#define SCTP_CAUSE_UNSUPPORTED_HMACID 0x0105
+
+/*
+ * error cause parameters (user visible)
+ */
+struct sctp_error_cause {
+ uint16_t code;
+ uint16_t length;
+ /* optional cause-specific info may follow */
+} SCTP_PACKED;
+
+struct sctp_error_invalid_stream {
+ struct sctp_error_cause cause; /* code=SCTP_ERROR_INVALID_STREAM */
+ uint16_t stream_id; /* stream id of the DATA in error */
+ uint16_t reserved;
+} SCTP_PACKED;
+
+struct sctp_error_missing_param {
+ struct sctp_error_cause cause; /* code=SCTP_ERROR_MISSING_PARAM */
+ uint32_t num_missing_params; /* number of missing parameters */
+ /* uint16_t param_type's follow */
+} SCTP_PACKED;
+
+struct sctp_error_stale_cookie {
+ struct sctp_error_cause cause; /* code=SCTP_ERROR_STALE_COOKIE */
+ uint32_t stale_time; /* time in usec of staleness */
+} SCTP_PACKED;
+
+struct sctp_error_out_of_resource {
+ struct sctp_error_cause cause; /* code=SCTP_ERROR_OUT_OF_RESOURCES */
+} SCTP_PACKED;
+
+struct sctp_error_unresolv_addr {
+ struct sctp_error_cause cause; /* code=SCTP_ERROR_UNRESOLVABLE_ADDR */
+
+} SCTP_PACKED;
+
+struct sctp_error_unrecognized_chunk {
+ struct sctp_error_cause cause; /* code=SCTP_ERROR_UNRECOG_CHUNK */
+ struct sctp_chunkhdr ch;/* header from chunk in error */
+} SCTP_PACKED;
+
+/*
+ * Main SCTP chunk types we place these here so natd and f/w's in user land
+ * can find them.
+ */
+/************0x00 series ***********/
+#define SCTP_DATA 0x00
+#define SCTP_INITIATION 0x01
+#define SCTP_INITIATION_ACK 0x02
+#define SCTP_SELECTIVE_ACK 0x03
+#define SCTP_HEARTBEAT_REQUEST 0x04
+#define SCTP_HEARTBEAT_ACK 0x05
+#define SCTP_ABORT_ASSOCIATION 0x06
+#define SCTP_SHUTDOWN 0x07
+#define SCTP_SHUTDOWN_ACK 0x08
+#define SCTP_OPERATION_ERROR 0x09
+#define SCTP_COOKIE_ECHO 0x0a
+#define SCTP_COOKIE_ACK 0x0b
+#define SCTP_ECN_ECHO 0x0c
+#define SCTP_ECN_CWR 0x0d
+#define SCTP_SHUTDOWN_COMPLETE 0x0e
+/* RFC4895 */
+#define SCTP_AUTHENTICATION 0x0f
+/* EY nr_sack chunk id*/
+#define SCTP_NR_SELECTIVE_ACK 0x10
+/************0x40 series ***********/
+/************0x80 series ***********/
+/* RFC5061 */
+#define SCTP_ASCONF_ACK 0x80
+/* draft-ietf-stewart-pktdrpsctp */
+#define SCTP_PACKET_DROPPED 0x81
+/* draft-ietf-stewart-strreset-xxx */
+#define SCTP_STREAM_RESET 0x82
+
+/* RFC4820 */
+#define SCTP_PAD_CHUNK 0x84
+/************0xc0 series ***********/
+/* RFC3758 */
+#define SCTP_FORWARD_CUM_TSN 0xc0
+/* RFC5061 */
+#define SCTP_ASCONF 0xc1
+
+
+/* ABORT and SHUTDOWN COMPLETE FLAG */
+#define SCTP_HAD_NO_TCB 0x01
+
+/* Packet dropped flags */
+#define SCTP_FROM_MIDDLE_BOX SCTP_HAD_NO_TCB
+#define SCTP_BADCRC 0x02
+#define SCTP_PACKET_TRUNCATED 0x04
+
+#define SCTP_SAT_NETWORK_MIN 400 /* min ms for RTT to set satellite
+ * time */
+#define SCTP_SAT_NETWORK_BURST_INCR 2 /* how many times to multiply maxburst
+ * in sat */
+
+/* Data Chunk Specific Flags */
+#define SCTP_DATA_FRAG_MASK 0x03
+#define SCTP_DATA_MIDDLE_FRAG 0x00
+#define SCTP_DATA_LAST_FRAG 0x01
+#define SCTP_DATA_FIRST_FRAG 0x02
+#define SCTP_DATA_NOT_FRAG 0x03
+#define SCTP_DATA_UNORDERED 0x04
+#define SCTP_DATA_SACK_IMMEDIATELY 0x08
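+/*
+ * For example, an unfragmented DATA chunk carries both the B and E bits
+ * (SCTP_DATA_NOT_FRAG, 0x03); the first piece of a fragmented user message
+ * carries SCTP_DATA_FIRST_FRAG (0x02), the last piece SCTP_DATA_LAST_FRAG
+ * (0x01), and any piece in between SCTP_DATA_MIDDLE_FRAG (0x00).
+ */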
+/* ECN Nonce: SACK Chunk Specific Flags */
+#define SCTP_SACK_NONCE_SUM 0x01
+
+/* CMT DAC algorithm SACK flag */
+#define SCTP_SACK_CMT_DAC 0x80
+
+/*
+ * PCB flags (in sctp_flags bitmask).
+ * Note the features and flags are meant
+ * for use by netstat.
+ */
+#define SCTP_PCB_FLAGS_UDPTYPE 0x00000001
+#define SCTP_PCB_FLAGS_TCPTYPE 0x00000002
+#define SCTP_PCB_FLAGS_BOUNDALL 0x00000004
+#define SCTP_PCB_FLAGS_ACCEPTING 0x00000008
+#define SCTP_PCB_FLAGS_UNBOUND 0x00000010
+#define SCTP_PCB_FLAGS_CLOSE_IP 0x00040000
+#define SCTP_PCB_FLAGS_WAS_CONNECTED 0x00080000
+#define SCTP_PCB_FLAGS_WAS_ABORTED 0x00100000
+/* TCP model support */
+
+#define SCTP_PCB_FLAGS_CONNECTED 0x00200000
+#define SCTP_PCB_FLAGS_IN_TCPPOOL 0x00400000
+#define SCTP_PCB_FLAGS_DONT_WAKE 0x00800000
+#define SCTP_PCB_FLAGS_WAKEOUTPUT 0x01000000
+#define SCTP_PCB_FLAGS_WAKEINPUT 0x02000000
+#define SCTP_PCB_FLAGS_BOUND_V6 0x04000000
+#define SCTP_PCB_FLAGS_BLOCKING_IO 0x08000000
+#define SCTP_PCB_FLAGS_SOCKET_GONE 0x10000000
+#define SCTP_PCB_FLAGS_SOCKET_ALLGONE 0x20000000
+#define SCTP_PCB_FLAGS_SOCKET_CANT_READ 0x40000000
+/* flags to copy to new PCB */
+#define SCTP_PCB_COPY_FLAGS (SCTP_PCB_FLAGS_BOUNDALL|\
+ SCTP_PCB_FLAGS_WAKEINPUT|\
+ SCTP_PCB_FLAGS_BOUND_V6)
+
+
+/*
+ * PCB Features (in sctp_features bitmask)
+ */
+#define SCTP_PCB_FLAGS_EXT_RCVINFO 0x00000002
+#define SCTP_PCB_FLAGS_DONOT_HEARTBEAT 0x00000004
+#define SCTP_PCB_FLAGS_FRAG_INTERLEAVE 0x00000008
+#define SCTP_PCB_FLAGS_INTERLEAVE_STRMS 0x00000010
+#define SCTP_PCB_FLAGS_DO_ASCONF 0x00000020
+#define SCTP_PCB_FLAGS_AUTO_ASCONF 0x00000040
+#define SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE 0x00000080
+/* socket options */
+#define SCTP_PCB_FLAGS_NODELAY 0x00000100
+#define SCTP_PCB_FLAGS_AUTOCLOSE 0x00000200
+#define SCTP_PCB_FLAGS_RECVDATAIOEVNT 0x00000400
+#define SCTP_PCB_FLAGS_RECVASSOCEVNT 0x00000800
+#define SCTP_PCB_FLAGS_RECVPADDREVNT 0x00001000
+#define SCTP_PCB_FLAGS_RECVPEERERR 0x00002000
+#define SCTP_PCB_FLAGS_RECVSENDFAILEVNT 0x00004000
+#define SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT 0x00008000
+#define SCTP_PCB_FLAGS_ADAPTATIONEVNT 0x00010000
+#define SCTP_PCB_FLAGS_PDAPIEVNT 0x00020000
+#define SCTP_PCB_FLAGS_AUTHEVNT 0x00040000
+#define SCTP_PCB_FLAGS_STREAM_RESETEVNT 0x00080000
+#define SCTP_PCB_FLAGS_NO_FRAGMENT 0x00100000
+#define SCTP_PCB_FLAGS_EXPLICIT_EOR 0x00400000
+#define SCTP_PCB_FLAGS_NEEDS_MAPPED_V4 0x00800000
+#define SCTP_PCB_FLAGS_MULTIPLE_ASCONFS 0x01000000
+#define SCTP_PCB_FLAGS_PORTREUSE 0x02000000
+#define SCTP_PCB_FLAGS_DRYEVNT 0x04000000
+/*-
+ * mobility_features parameters (by micchie). Note
+ * these features are applied against the
+ * sctp_mobility_features flags, not the sctp_features
+ * flags.
+ */
+#define SCTP_MOBILITY_BASE 0x00000001
+#define SCTP_MOBILITY_FASTHANDOFF 0x00000002
+#define SCTP_MOBILITY_PRIM_DELETED 0x00000004
+
+
+#define SCTP_SMALLEST_PMTU 512 /* smallest pmtu allowed when disabling PMTU
+ * discovery */
+
+#include <freebsd/netinet/sctp_uio.h>
+
+/* This dictates the size of the packet
+ * collection buffer. This only applies
+ * if SCTP_PACKET_LOGGING is enabled in
+ * your config.
+ */
+#define SCTP_PACKET_LOG_SIZE 65536
+
+/* Maximum delays and such a user can set for options that
+ * take ms.
+ */
+#define SCTP_MAX_SACK_DELAY 500 /* per RFC4960 */
+#define SCTP_MAX_HB_INTERVAL 14400000 /* 4 hours in ms */
+#define SCTP_MAX_COOKIE_LIFE 3600000 /* 1 hour in ms */
+
+
+/* Types of logging/KTR tracing that can be enabled via the
+ * sysctl net.inet.sctp.sctp_logging. You must also enable
+ * SUBSYS tracing.
+ * Note that you must have the SCTP option in the kernel
+ * to enable these as well.
+ */
+#define SCTP_BLK_LOGGING_ENABLE 0x00000001
+#define SCTP_CWND_MONITOR_ENABLE 0x00000002
+#define SCTP_CWND_LOGGING_ENABLE 0x00000004
+#define SCTP_EARLYFR_LOGGING_ENABLE 0x00000010
+#define SCTP_FLIGHT_LOGGING_ENABLE 0x00000020
+#define SCTP_FR_LOGGING_ENABLE 0x00000040
+#define SCTP_LOCK_LOGGING_ENABLE 0x00000080
+#define SCTP_MAP_LOGGING_ENABLE 0x00000100
+#define SCTP_MBCNT_LOGGING_ENABLE 0x00000200
+#define SCTP_MBUF_LOGGING_ENABLE 0x00000400
+#define SCTP_NAGLE_LOGGING_ENABLE 0x00000800
+#define SCTP_RECV_RWND_LOGGING_ENABLE 0x00001000
+#define SCTP_RTTVAR_LOGGING_ENABLE 0x00002000
+#define SCTP_SACK_LOGGING_ENABLE 0x00004000
+#define SCTP_SACK_RWND_LOGGING_ENABLE 0x00008000
+#define SCTP_SB_LOGGING_ENABLE 0x00010000
+#define SCTP_STR_LOGGING_ENABLE 0x00020000
+#define SCTP_WAKE_LOGGING_ENABLE 0x00040000
+#define SCTP_LOG_MAXBURST_ENABLE 0x00080000
+#define SCTP_LOG_RWND_ENABLE 0x00100000
+#define SCTP_LOG_SACK_ARRIVALS_ENABLE 0x00200000
+#define SCTP_LTRACE_CHUNK_ENABLE 0x00400000
+#define SCTP_LTRACE_ERROR_ENABLE 0x00800000
+#define SCTP_LAST_PACKET_TRACING 0x01000000
+#define SCTP_THRESHOLD_LOGGING 0x02000000
+#define SCTP_LOG_AT_SEND_2_SCTP 0x04000000
+#define SCTP_LOG_AT_SEND_2_OUTQ 0x08000000
+#define SCTP_LOG_TRY_ADVANCE 0x10000000
+
+
+#undef SCTP_PACKED
+
+#endif /* !_NETINET_SCTP_HH_ */
diff --git a/freebsd/sys/netinet/sctp_asconf.c b/freebsd/sys/netinet/sctp_asconf.c
new file mode 100644
index 00000000..206cf600
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_asconf.c
@@ -0,0 +1,3397 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_asconf.c,v 1.24 2005/03/06 16:04:16 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp_var.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctp_header.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_output.h>
+#include <freebsd/netinet/sctp_asconf.h>
+#include <freebsd/netinet/sctp_timer.h>
+
+/*
+ * debug flags:
+ * SCTP_DEBUG_ASCONF1: protocol info, general info and errors
+ * SCTP_DEBUG_ASCONF2: detailed info
+ */
+#ifdef SCTP_DEBUG
+#endif /* SCTP_DEBUG */
+
+
+static void
+sctp_asconf_get_source_ip(struct mbuf *m, struct sockaddr *sa)
+{
+ struct ip *iph;
+ struct sockaddr_in *sin;
+
+#ifdef INET6
+ struct sockaddr_in6 *sin6;
+
+#endif
+
+ iph = mtod(m, struct ip *);
+ if (iph->ip_v == IPVERSION) {
+ /* IPv4 source */
+ sin = (struct sockaddr_in *)sa;
+ bzero(sin, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(struct sockaddr_in);
+ sin->sin_port = 0;
+ sin->sin_addr.s_addr = iph->ip_src.s_addr;
+ return;
+ }
+#ifdef INET6
+ else if (iph->ip_v == (IPV6_VERSION >> 4)) {
+ /* IPv6 source */
+ struct ip6_hdr *ip6;
+
+ sin6 = (struct sockaddr_in6 *)sa;
+ bzero(sin6, sizeof(*sin6));
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(struct sockaddr_in6);
+ sin6->sin6_port = 0;
+ ip6 = mtod(m, struct ip6_hdr *);
+ sin6->sin6_addr = ip6->ip6_src;
+ return;
+ }
+#endif /* INET6 */
+ else
+ return;
+}
+
+/*
+ * draft-ietf-tsvwg-addip-sctp
+ *
+ * An ASCONF parameter queue exists per asoc which holds the pending address
+ * operations. Lists are updated upon receipt of ASCONF-ACK.
+ *
+ * A restricted_addrs list exists per assoc to hold local addresses that are
+ * not (yet) usable by the assoc as a source address. These addresses are
+ * either pending an ASCONF operation (and exist on the ASCONF parameter
+ * queue), or they are permanently restricted (the peer has returned an
+ * ERROR indication to an ASCONF(ADD), or the peer does not support ASCONF).
+ *
+ * Deleted addresses are always immediately removed from the lists as they will
+ * (shortly) no longer exist in the kernel. We send ASCONFs as a courtesy,
+ * only if allowed.
+ */
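+
+/*
+ * Illustrative lifecycle (a rough sketch of the flow implemented below):
+ * when a local address is added, an ASCONF(ADD) parameter is queued and
+ * the address is placed on the restricted list; once the peer's
+ * ASCONF-ACK reports success, the parameter is dequeued and the address
+ * is removed from the restricted list, making it usable as a source
+ * address. A failure indication leaves the address on the restricted
+ * list.
+ */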
+
+/*
+ * ASCONF parameter processing.
+ * response_required: set if a reply is required (eg. SUCCESS_REPORT).
+ * returns a mbuf to an "error" response parameter or NULL/"success" if ok.
+ * FIX: allocating this many mbufs on the fly is pretty inefficient...
+ */
+static struct mbuf *
+sctp_asconf_success_response(uint32_t id)
+{
+ struct mbuf *m_reply = NULL;
+ struct sctp_asconf_paramhdr *aph;
+
+ m_reply = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_paramhdr),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_reply == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "asconf_success_response: couldn't get mbuf!\n");
+ return NULL;
+ }
+ aph = mtod(m_reply, struct sctp_asconf_paramhdr *);
+ aph->correlation_id = id;
+ aph->ph.param_type = htons(SCTP_SUCCESS_REPORT);
+ aph->ph.param_length = sizeof(struct sctp_asconf_paramhdr);
+ SCTP_BUF_LEN(m_reply) = aph->ph.param_length;
+ aph->ph.param_length = htons(aph->ph.param_length);
+
+ return m_reply;
+}
+
+static struct mbuf *
+sctp_asconf_error_response(uint32_t id, uint16_t cause, uint8_t * error_tlv,
+ uint16_t tlv_length)
+{
+ struct mbuf *m_reply = NULL;
+ struct sctp_asconf_paramhdr *aph;
+ struct sctp_error_cause *error;
+ uint8_t *tlv;
+
+ m_reply = sctp_get_mbuf_for_msg((sizeof(struct sctp_asconf_paramhdr) +
+ tlv_length +
+ sizeof(struct sctp_error_cause)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_reply == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "asconf_error_response: couldn't get mbuf!\n");
+ return NULL;
+ }
+ aph = mtod(m_reply, struct sctp_asconf_paramhdr *);
+ error = (struct sctp_error_cause *)(aph + 1);
+
+ aph->correlation_id = id;
+ aph->ph.param_type = htons(SCTP_ERROR_CAUSE_IND);
+ error->code = htons(cause);
+ error->length = tlv_length + sizeof(struct sctp_error_cause);
+ aph->ph.param_length = error->length +
+ sizeof(struct sctp_asconf_paramhdr);
+
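+	/*
+	 * Length nesting, for illustration (hypothetical tlv_length of 20
+	 * bytes): with the 8-byte sctp_asconf_paramhdr and the 4-byte
+	 * sctp_error_cause, error->length becomes 24 and the enclosing
+	 * param_length becomes 32; both are converted to network byte
+	 * order just before returning.
+	 */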
+ if (aph->ph.param_length > MLEN) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "asconf_error_response: tlv_length (%xh) too big\n",
+ tlv_length);
+ sctp_m_freem(m_reply); /* discard */
+ return NULL;
+ }
+ if (error_tlv != NULL) {
+ tlv = (uint8_t *) (error + 1);
+ memcpy(tlv, error_tlv, tlv_length);
+ }
+ SCTP_BUF_LEN(m_reply) = aph->ph.param_length;
+ error->length = htons(error->length);
+ aph->ph.param_length = htons(aph->ph.param_length);
+
+ return m_reply;
+}
+
+static struct mbuf *
+sctp_process_asconf_add_ip(struct mbuf *m, struct sctp_asconf_paramhdr *aph,
+ struct sctp_tcb *stcb, int response_required)
+{
+ struct mbuf *m_reply = NULL;
+ struct sockaddr_storage sa_source, sa_store;
+ struct sctp_ipv4addr_param *v4addr;
+ uint16_t param_type, param_length, aparam_length;
+ struct sockaddr *sa;
+ struct sockaddr_in *sin;
+ int zero_address = 0;
+
+#ifdef INET6
+ struct sockaddr_in6 *sin6;
+ struct sctp_ipv6addr_param *v6addr;
+
+#endif /* INET6 */
+
+ aparam_length = ntohs(aph->ph.param_length);
+ v4addr = (struct sctp_ipv4addr_param *)(aph + 1);
+#ifdef INET6
+ v6addr = (struct sctp_ipv6addr_param *)(aph + 1);
+#endif /* INET6 */
+ param_type = ntohs(v4addr->ph.param_type);
+ param_length = ntohs(v4addr->ph.param_length);
+
+ sa = (struct sockaddr *)&sa_store;
+ switch (param_type) {
+ case SCTP_IPV4_ADDRESS:
+ if (param_length != sizeof(struct sctp_ipv4addr_param)) {
+ /* invalid param size */
+ return NULL;
+ }
+ sin = (struct sockaddr_in *)&sa_store;
+ bzero(sin, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(struct sockaddr_in);
+ sin->sin_port = stcb->rport;
+ sin->sin_addr.s_addr = v4addr->addr;
+ if (sin->sin_addr.s_addr == INADDR_ANY)
+ zero_address = 1;
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: adding ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+ break;
+ case SCTP_IPV6_ADDRESS:
+#ifdef INET6
+ if (param_length != sizeof(struct sctp_ipv6addr_param)) {
+ /* invalid param size */
+ return NULL;
+ }
+ sin6 = (struct sockaddr_in6 *)&sa_store;
+ bzero(sin6, sizeof(*sin6));
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(struct sockaddr_in6);
+ sin6->sin6_port = stcb->rport;
+ memcpy((caddr_t)&sin6->sin6_addr, v6addr->addr,
+ sizeof(struct in6_addr));
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+ zero_address = 1;
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: adding ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+#else
+ /* IPv6 not enabled! */
+ /* FIX ME: currently sends back an invalid param error */
+ m_reply = sctp_asconf_error_response(aph->correlation_id,
+ SCTP_CAUSE_INVALID_PARAM, (uint8_t *) aph, aparam_length);
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_asconf_add_ip: v6 disabled- skipping ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+ return m_reply;
+#endif
+ break;
+ default:
+ m_reply = sctp_asconf_error_response(aph->correlation_id,
+ SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *) aph,
+ aparam_length);
+ return m_reply;
+ } /* end switch */
+
+ /* if 0.0.0.0/::0, add the source address instead */
+ if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) {
+ sa = (struct sockaddr *)&sa_source;
+ sctp_asconf_get_source_ip(m, sa);
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_asconf_add_ip: using source addr ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+ }
+ /* add the address */
+ if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE,
+ SCTP_ADDR_DYNAMIC_ADDED) != 0) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_asconf_add_ip: error adding address\n");
+ m_reply = sctp_asconf_error_response(aph->correlation_id,
+ SCTP_CAUSE_RESOURCE_SHORTAGE, (uint8_t *) aph,
+ aparam_length);
+ } else {
+ /* notify upper layer */
+ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_ADD_IP, stcb, 0, sa, SCTP_SO_NOT_LOCKED);
+ if (response_required) {
+ m_reply =
+ sctp_asconf_success_response(aph->correlation_id);
+ }
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb,
+ NULL, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_1);
+ sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep,
+ stcb, NULL);
+ }
+
+ return m_reply;
+}
+
+static int
+sctp_asconf_del_remote_addrs_except(struct sctp_tcb *stcb, struct sockaddr *src)
+{
+ struct sctp_nets *src_net, *net;
+
+ /* make sure the source address exists as a destination net */
+ src_net = sctp_findnet(stcb, src);
+ if (src_net == NULL) {
+ /* not found */
+ return -1;
+ }
+ /* delete all destination addresses except the source */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if (net != src_net) {
+ /* delete this address */
+ sctp_remove_net(stcb, net);
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "asconf_del_remote_addrs_except: deleting ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1,
+ (struct sockaddr *)&net->ro._l_addr);
+ /* notify upper layer */
+ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_DELETE_IP, stcb, 0,
+ (struct sockaddr *)&net->ro._l_addr, SCTP_SO_NOT_LOCKED);
+ }
+ }
+ return 0;
+}
+
+static struct mbuf *
+sctp_process_asconf_delete_ip(struct mbuf *m, struct sctp_asconf_paramhdr *aph,
+ struct sctp_tcb *stcb, int response_required)
+{
+ struct mbuf *m_reply = NULL;
+ struct sockaddr_storage sa_source, sa_store;
+ struct sctp_ipv4addr_param *v4addr;
+ uint16_t param_type, param_length, aparam_length;
+ struct sockaddr *sa;
+ struct sockaddr_in *sin;
+ int zero_address = 0;
+ int result;
+
+#ifdef INET6
+ struct sockaddr_in6 *sin6;
+ struct sctp_ipv6addr_param *v6addr;
+
+#endif /* INET6 */
+
+ /* get the source IP address for src and 0.0.0.0/::0 delete checks */
+ sctp_asconf_get_source_ip(m, (struct sockaddr *)&sa_source);
+
+ aparam_length = ntohs(aph->ph.param_length);
+ v4addr = (struct sctp_ipv4addr_param *)(aph + 1);
+#ifdef INET6
+ v6addr = (struct sctp_ipv6addr_param *)(aph + 1);
+#endif /* INET6 */
+ param_type = ntohs(v4addr->ph.param_type);
+ param_length = ntohs(v4addr->ph.param_length);
+
+ sa = (struct sockaddr *)&sa_store;
+ switch (param_type) {
+ case SCTP_IPV4_ADDRESS:
+ if (param_length != sizeof(struct sctp_ipv4addr_param)) {
+ /* invalid param size */
+ return NULL;
+ }
+ sin = (struct sockaddr_in *)&sa_store;
+ bzero(sin, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(struct sockaddr_in);
+ sin->sin_port = stcb->rport;
+ sin->sin_addr.s_addr = v4addr->addr;
+ if (sin->sin_addr.s_addr == INADDR_ANY)
+ zero_address = 1;
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_asconf_delete_ip: deleting ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+ break;
+ case SCTP_IPV6_ADDRESS:
+ if (param_length != sizeof(struct sctp_ipv6addr_param)) {
+ /* invalid param size */
+ return NULL;
+ }
+#ifdef INET6
+ sin6 = (struct sockaddr_in6 *)&sa_store;
+ bzero(sin6, sizeof(*sin6));
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(struct sockaddr_in6);
+ sin6->sin6_port = stcb->rport;
+ memcpy(&sin6->sin6_addr, v6addr->addr,
+ sizeof(struct in6_addr));
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+ zero_address = 1;
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_asconf_delete_ip: deleting ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+#else
+ /* IPv6 not enabled! No "action" needed; just ack it */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_asconf_delete_ip: v6 disabled- ignoring: ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+ /* just respond with a "success" ASCONF-ACK */
+ return NULL;
+#endif
+ break;
+ default:
+ m_reply = sctp_asconf_error_response(aph->correlation_id,
+ SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *) aph,
+ aparam_length);
+ return m_reply;
+ }
+
+ /* make sure the source address is not being deleted */
+ if (sctp_cmpaddr(sa, (struct sockaddr *)&sa_source)) {
+ /* trying to delete the source address! */
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: tried to delete source addr\n");
+ m_reply = sctp_asconf_error_response(aph->correlation_id,
+ SCTP_CAUSE_DELETING_SRC_ADDR, (uint8_t *) aph,
+ aparam_length);
+ return m_reply;
+ }
+ /* if deleting 0.0.0.0/::0, delete all addresses except src addr */
+ if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) {
+ result = sctp_asconf_del_remote_addrs_except(stcb,
+ (struct sockaddr *)&sa_source);
+
+ if (result) {
+ /* src address did not exist? */
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: src addr does not exist?\n");
+ /* what error to reply with?? */
+ m_reply =
+ sctp_asconf_error_response(aph->correlation_id,
+ SCTP_CAUSE_REQUEST_REFUSED, (uint8_t *) aph,
+ aparam_length);
+ } else if (response_required) {
+ m_reply =
+ sctp_asconf_success_response(aph->correlation_id);
+ }
+ return m_reply;
+ }
+ /* delete the address */
+ result = sctp_del_remote_addr(stcb, sa);
+ /*
+ * note if result == -2, the address doesn't exist in the asoc but
+	 * since it's being deleted anyway, we just ack the delete -- but
+ * this probably means something has already gone awry
+ */
+ if (result == -1) {
+ /* only one address in the asoc */
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: tried to delete last IP addr!\n");
+ m_reply = sctp_asconf_error_response(aph->correlation_id,
+ SCTP_CAUSE_DELETING_LAST_ADDR, (uint8_t *) aph,
+ aparam_length);
+ } else {
+ if (response_required) {
+ m_reply = sctp_asconf_success_response(aph->correlation_id);
+ }
+ /* notify upper layer */
+ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_DELETE_IP, stcb, 0, sa, SCTP_SO_NOT_LOCKED);
+ }
+ return m_reply;
+}
+
+static struct mbuf *
+sctp_process_asconf_set_primary(struct mbuf *m,
+ struct sctp_asconf_paramhdr *aph,
+ struct sctp_tcb *stcb, int response_required)
+{
+ struct mbuf *m_reply = NULL;
+ struct sockaddr_storage sa_source, sa_store;
+ struct sctp_ipv4addr_param *v4addr;
+ uint16_t param_type, param_length, aparam_length;
+ struct sockaddr *sa;
+ struct sockaddr_in *sin;
+ int zero_address = 0;
+
+#ifdef INET6
+ struct sockaddr_in6 *sin6;
+ struct sctp_ipv6addr_param *v6addr;
+
+#endif /* INET6 */
+
+ aparam_length = ntohs(aph->ph.param_length);
+ v4addr = (struct sctp_ipv4addr_param *)(aph + 1);
+#ifdef INET6
+ v6addr = (struct sctp_ipv6addr_param *)(aph + 1);
+#endif /* INET6 */
+ param_type = ntohs(v4addr->ph.param_type);
+ param_length = ntohs(v4addr->ph.param_length);
+
+ sa = (struct sockaddr *)&sa_store;
+ switch (param_type) {
+ case SCTP_IPV4_ADDRESS:
+ if (param_length != sizeof(struct sctp_ipv4addr_param)) {
+ /* invalid param size */
+ return NULL;
+ }
+ sin = (struct sockaddr_in *)&sa_store;
+ bzero(sin, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(struct sockaddr_in);
+ sin->sin_addr.s_addr = v4addr->addr;
+ if (sin->sin_addr.s_addr == INADDR_ANY)
+ zero_address = 1;
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+ break;
+ case SCTP_IPV6_ADDRESS:
+ if (param_length != sizeof(struct sctp_ipv6addr_param)) {
+ /* invalid param size */
+ return NULL;
+ }
+#ifdef INET6
+ sin6 = (struct sockaddr_in6 *)&sa_store;
+ bzero(sin6, sizeof(*sin6));
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(struct sockaddr_in6);
+ memcpy((caddr_t)&sin6->sin6_addr, v6addr->addr,
+ sizeof(struct in6_addr));
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+ zero_address = 1;
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+#else
+ /* IPv6 not enabled! No "action" needed; just ack it */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_asconf_set_primary: v6 disabled- ignoring: ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+ /* just respond with a "success" ASCONF-ACK */
+ return NULL;
+#endif
+ break;
+ default:
+ m_reply = sctp_asconf_error_response(aph->correlation_id,
+ SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *) aph,
+ aparam_length);
+ return m_reply;
+ }
+
+ /* if 0.0.0.0/::0, use the source address instead */
+ if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) {
+ sa = (struct sockaddr *)&sa_source;
+ sctp_asconf_get_source_ip(m, sa);
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_asconf_set_primary: using source addr ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+ }
+ /* set the primary address */
+ if (sctp_set_primary_addr(stcb, sa, NULL) == 0) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_asconf_set_primary: primary address set\n");
+ /* notify upper layer */
+ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_SET_PRIMARY, stcb, 0, sa, SCTP_SO_NOT_LOCKED);
+
+ if (response_required) {
+ m_reply = sctp_asconf_success_response(aph->correlation_id);
+ }
+ /*
+		 * Mobility adaptation. Ideally, on reception of a SET
+		 * PRIMARY together with a DELETE IP ADDRESS of the previous
+		 * primary destination, unacknowledged DATA is retransmitted
+		 * immediately to the new primary destination for seamless
+		 * handover. If the destination is UNCONFIRMED and marked
+		 * REQ_PRIM, the retransmission happens on reception of the
+		 * HEARTBEAT-ACK. (See sctp_handle_heartbeat_ack in
+		 * sctp_input.c.) Also, once the primary destination has
+		 * changed, it is better that all subsequent new DATA, as
+		 * well as already queued DATA, is transmitted to the new
+		 * primary destination. (by micchie)
+ */
+ if ((sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_BASE) ||
+ sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_FASTHANDOFF)) &&
+ sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_PRIM_DELETED) &&
+ (stcb->asoc.primary_destination->dest_state &
+ SCTP_ADDR_UNCONFIRMED) == 0) {
+
+ sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_TIMER + SCTP_LOC_7);
+ if (sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_FASTHANDOFF)) {
+ sctp_assoc_immediate_retrans(stcb,
+ stcb->asoc.primary_destination);
+ }
+ if (sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_BASE)) {
+ sctp_move_chunks_from_net(stcb,
+ stcb->asoc.deleted_primary);
+ }
+ sctp_delete_prim_timer(stcb->sctp_ep, stcb,
+ stcb->asoc.deleted_primary);
+ }
+ } else {
+ /* couldn't set the requested primary address! */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_asconf_set_primary: set primary failed!\n");
+ /* must have been an invalid address, so report */
+ m_reply = sctp_asconf_error_response(aph->correlation_id,
+ SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *) aph,
+ aparam_length);
+ }
+
+ return m_reply;
+}
+
+/*
+ * handles an ASCONF chunk.
+ * if all parameters are processed ok, send a plain (empty) ASCONF-ACK
+ */
+void
+sctp_handle_asconf(struct mbuf *m, unsigned int offset,
+ struct sctp_asconf_chunk *cp, struct sctp_tcb *stcb,
+ int first)
+{
+ struct sctp_association *asoc;
+ uint32_t serial_num;
+ struct mbuf *n, *m_ack, *m_result, *m_tail;
+ struct sctp_asconf_ack_chunk *ack_cp;
+ struct sctp_asconf_paramhdr *aph, *ack_aph;
+ struct sctp_ipv6addr_param *p_addr;
+ unsigned int asconf_limit;
+ int error = 0; /* did an error occur? */
+
+ /* asconf param buffer */
+ uint8_t aparam_buf[SCTP_PARAM_BUFFER_SIZE];
+ struct sctp_asconf_ack *ack, *ack_next;
+
+ /* verify minimum length */
+ if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_asconf_chunk)) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "handle_asconf: chunk too small = %xh\n",
+ ntohs(cp->ch.chunk_length));
+ return;
+ }
+ asoc = &stcb->asoc;
+ serial_num = ntohl(cp->serial_number);
+
+ if (compare_with_wrap(asoc->asconf_seq_in, serial_num, MAX_SEQ) ||
+ serial_num == asoc->asconf_seq_in) {
+ /* got a duplicate ASCONF */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "handle_asconf: got duplicate serial number = %xh\n",
+ serial_num);
+ return;
+ } else if (serial_num != (asoc->asconf_seq_in + 1)) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: incorrect serial number = %xh (expected next = %xh)\n",
+ serial_num, asoc->asconf_seq_in + 1);
+ return;
+ }
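+	/*
+	 * E.g. (illustrative): with asconf_seq_in == 5, a serial number of
+	 * 5 or below is treated as a duplicate above, 6 is the expected
+	 * next number and is processed below, and 7 or above is out of
+	 * order and dropped.
+	 */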
+ /* it's the expected "next" sequence number, so process it */
+ asoc->asconf_seq_in = serial_num; /* update sequence */
+ /* get length of all the param's in the ASCONF */
+ asconf_limit = offset + ntohs(cp->ch.chunk_length);
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "handle_asconf: asconf_limit=%u, sequence=%xh\n",
+ asconf_limit, serial_num);
+
+ if (first) {
+ /* delete old cache */
+		SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: Now processing first ASCONF. Try to delete old cache\n");
+
+ ack = TAILQ_FIRST(&stcb->asoc.asconf_ack_sent);
+ while (ack != NULL) {
+ ack_next = TAILQ_NEXT(ack, next);
+ if (ack->serial_number == serial_num)
+ break;
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: delete old(%u) < first(%u)\n",
+ ack->serial_number, serial_num);
+ TAILQ_REMOVE(&stcb->asoc.asconf_ack_sent, ack, next);
+ if (ack->data != NULL) {
+ sctp_m_freem(ack->data);
+ }
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asconf_ack), ack);
+ ack = ack_next;
+ }
+ }
+ m_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_ack_chunk), 0,
+ M_DONTWAIT, 1, MT_DATA);
+ if (m_ack == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "handle_asconf: couldn't get mbuf!\n");
+ return;
+ }
+ m_tail = m_ack; /* current reply chain's tail */
+
+ /* fill in ASCONF-ACK header */
+ ack_cp = mtod(m_ack, struct sctp_asconf_ack_chunk *);
+ ack_cp->ch.chunk_type = SCTP_ASCONF_ACK;
+ ack_cp->ch.chunk_flags = 0;
+ ack_cp->serial_number = htonl(serial_num);
+ /* set initial lengths (eg. just an ASCONF-ACK), ntohx at the end! */
+ SCTP_BUF_LEN(m_ack) = sizeof(struct sctp_asconf_ack_chunk);
+ ack_cp->ch.chunk_length = sizeof(struct sctp_asconf_ack_chunk);
+
+ /* skip the lookup address parameter */
+ offset += sizeof(struct sctp_asconf_chunk);
+ p_addr = (struct sctp_ipv6addr_param *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *) & aparam_buf);
+ if (p_addr == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "handle_asconf: couldn't get lookup addr!\n");
+ /* respond with a missing/invalid mandatory parameter error */
+ return;
+ }
+ /* param_length is already validated in process_control... */
+ offset += ntohs(p_addr->ph.param_length); /* skip lookup addr */
+
+ /* get pointer to first asconf param in ASCONF-ACK */
+ ack_aph = (struct sctp_asconf_paramhdr *)(mtod(m_ack, caddr_t)+sizeof(struct sctp_asconf_ack_chunk));
+ if (ack_aph == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "Gak in asconf2\n");
+ return;
+ }
+ /* get pointer to first asconf param in ASCONF */
+ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_asconf_paramhdr), (uint8_t *) & aparam_buf);
+ if (aph == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "Empty ASCONF received?\n");
+ goto send_reply;
+ }
+ /* process through all parameters */
+ while (aph != NULL) {
+ unsigned int param_length, param_type;
+
+ param_type = ntohs(aph->ph.param_type);
+ param_length = ntohs(aph->ph.param_length);
+ if (offset + param_length > asconf_limit) {
+ /* parameter goes beyond end of chunk! */
+ sctp_m_freem(m_ack);
+ return;
+ }
+ m_result = NULL;
+
+ if (param_length > sizeof(aparam_buf)) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: param length (%u) larger than buffer size!\n", param_length);
+ sctp_m_freem(m_ack);
+ return;
+ }
+		if (param_length <= sizeof(struct sctp_paramhdr)) {
+			SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: param length (%u) too short\n", param_length);
+			/* discard the reply and stop; m_ack was just freed */
+			sctp_m_freem(m_ack);
+			return;
+		}
+ /* get the entire parameter */
+ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, param_length, aparam_buf);
+ if (aph == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: couldn't get entire param\n");
+ sctp_m_freem(m_ack);
+ return;
+ }
+ switch (param_type) {
+ case SCTP_ADD_IP_ADDRESS:
+ asoc->peer_supports_asconf = 1;
+ m_result = sctp_process_asconf_add_ip(m, aph, stcb,
+ error);
+ break;
+ case SCTP_DEL_IP_ADDRESS:
+ asoc->peer_supports_asconf = 1;
+ m_result = sctp_process_asconf_delete_ip(m, aph, stcb,
+ error);
+ break;
+ case SCTP_ERROR_CAUSE_IND:
+ /* not valid in an ASCONF chunk */
+ break;
+ case SCTP_SET_PRIM_ADDR:
+ asoc->peer_supports_asconf = 1;
+ m_result = sctp_process_asconf_set_primary(m, aph,
+ stcb, error);
+ break;
+ case SCTP_NAT_VTAGS:
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: sees a NAT VTAG state parameter\n");
+ break;
+ case SCTP_SUCCESS_REPORT:
+ /* not valid in an ASCONF chunk */
+ break;
+ case SCTP_ULP_ADAPTATION:
+ /* FIX */
+ break;
+ default:
+ if ((param_type & 0x8000) == 0) {
+ /* Been told to STOP at this param */
+ asconf_limit = offset;
+ /*
+ * FIX FIX - We need to call
+ * sctp_arethere_unrecognized_parameters()
+				 * to get an operr and send it for any
+				 * params with the 0x4000 bit set OR do it
+ * here ourselves... note we still must STOP
+ * if the 0x8000 bit is clear.
+ */
+ }
+ /* unknown/invalid param type */
+ break;
+ } /* switch */
+
+ /* add any (error) result to the reply mbuf chain */
+ if (m_result != NULL) {
+ SCTP_BUF_NEXT(m_tail) = m_result;
+ m_tail = m_result;
+ /* update lengths, make sure it's aligned too */
+ SCTP_BUF_LEN(m_result) = SCTP_SIZE32(SCTP_BUF_LEN(m_result));
+ ack_cp->ch.chunk_length += SCTP_BUF_LEN(m_result);
+ /* set flag to force success reports */
+ error = 1;
+ }
+ offset += SCTP_SIZE32(param_length);
+ /* update remaining ASCONF message length to process */
+ if (offset >= asconf_limit) {
+ /* no more data in the mbuf chain */
+ break;
+ }
+ /* get pointer to next asconf param */
+ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset,
+ sizeof(struct sctp_asconf_paramhdr),
+ (uint8_t *) & aparam_buf);
+ if (aph == NULL) {
+ /* can't get an asconf paramhdr */
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: can't get asconf param hdr!\n");
+ /* FIX ME - add error here... */
+ }
+ }
+
+send_reply:
+ ack_cp->ch.chunk_length = htons(ack_cp->ch.chunk_length);
+ /* save the ASCONF-ACK reply */
+ ack = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_asconf_ack),
+ struct sctp_asconf_ack);
+ if (ack == NULL) {
+ sctp_m_freem(m_ack);
+ return;
+ }
+ ack->serial_number = serial_num;
+ ack->last_sent_to = NULL;
+ ack->data = m_ack;
+ ack->len = 0;
+ n = m_ack;
+ while (n) {
+ ack->len += SCTP_BUF_LEN(n);
+ n = SCTP_BUF_NEXT(n);
+ }
+ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_ack_sent, ack, next);
+
+ /* see if last_control_chunk_from is set properly (use IP src addr) */
+ if (stcb->asoc.last_control_chunk_from == NULL) {
+ /*
+ * this could happen if the source address was just newly
+ * added
+ */
+ struct ip *iph;
+ struct sctphdr *sh;
+ struct sockaddr_storage from_store;
+ struct sockaddr *from = (struct sockaddr *)&from_store;
+
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: looking up net for IP source address\n");
+ /* pullup already done, IP options already stripped */
+ iph = mtod(m, struct ip *);
+ sh = (struct sctphdr *)((caddr_t)iph + sizeof(*iph));
+ switch (iph->ip_v) {
+ case IPVERSION:
+ {
+ struct sockaddr_in *from4;
+
+ from4 = (struct sockaddr_in *)&from_store;
+ bzero(from4, sizeof(*from4));
+ from4->sin_family = AF_INET;
+ from4->sin_len = sizeof(struct sockaddr_in);
+ from4->sin_addr.s_addr = iph->ip_src.s_addr;
+ from4->sin_port = sh->src_port;
+ break;
+ }
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ {
+ struct ip6_hdr *ip6;
+ struct sockaddr_in6 *from6;
+
+ ip6 = mtod(m, struct ip6_hdr *);
+ from6 = (struct sockaddr_in6 *)&from_store;
+ bzero(from6, sizeof(*from6));
+ from6->sin6_family = AF_INET6;
+ from6->sin6_len = sizeof(struct sockaddr_in6);
+ from6->sin6_addr = ip6->ip6_src;
+ from6->sin6_port = sh->src_port;
+ /*
+ * Get the scopes in properly to the sin6
+ * addr's
+ */
+ /* we probably don't need these operations */
+ (void)sa6_recoverscope(from6);
+ sa6_embedscope(from6,
+ MODULE_GLOBAL(ip6_use_defzone));
+
+ break;
+ }
+#endif
+ default:
+ /* unknown address type */
+ from = NULL;
+ }
+ if (from != NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "Looking for IP source: ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, from);
+ /* look up the from address */
+ stcb->asoc.last_control_chunk_from = sctp_findnet(stcb, from);
+#ifdef SCTP_DEBUG
+ if (stcb->asoc.last_control_chunk_from == NULL)
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: IP source address not found?!\n");
+#endif
+ }
+ }
+}
+
+/*
+ * does the address match? returns 0 if not, 1 if so
+ */
+static uint32_t
+sctp_asconf_addr_match(struct sctp_asconf_addr *aa, struct sockaddr *sa)
+{
+#ifdef INET6
+ if (sa->sa_family == AF_INET6) {
+ /* IPv6 sa address */
+ /* XXX scopeid */
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
+
+ if ((aa->ap.addrp.ph.param_type == SCTP_IPV6_ADDRESS) &&
+ (memcmp(&aa->ap.addrp.addr, &sin6->sin6_addr,
+ sizeof(struct in6_addr)) == 0)) {
+ return (1);
+ }
+ } else
+#endif /* INET6 */
+ if (sa->sa_family == AF_INET) {
+ /* IPv4 sa address */
+ struct sockaddr_in *sin = (struct sockaddr_in *)sa;
+
+ if ((aa->ap.addrp.ph.param_type == SCTP_IPV4_ADDRESS) &&
+ (memcmp(&aa->ap.addrp.addr, &sin->sin_addr,
+ sizeof(struct in_addr)) == 0)) {
+ return (1);
+ }
+ }
+ return (0);
+}
+
+/*
+ * does the address match? returns 0 if not, 1 if so
+ */
+static uint32_t
+sctp_addr_match(
+ struct sctp_ipv6addr_param *v6addr,
+ struct sockaddr *sa)
+{
+ uint16_t param_type, param_length;
+ struct sctp_ipv4addr_param *v4addr = (struct sctp_ipv4addr_param *)v6addr;
+
+#ifdef INET6
+ if (sa->sa_family == AF_INET6) {
+ /* IPv6 sa address */
+ /* XXX scopeid */
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
+
+ param_type = ntohs(v6addr->ph.param_type);
+ param_length = ntohs(v6addr->ph.param_length);
+
+ if ((param_type == SCTP_IPV6_ADDRESS) &&
+ param_length == sizeof(struct sctp_ipv6addr_param) &&
+ (memcmp(&v6addr->addr, &sin6->sin6_addr,
+ sizeof(struct in6_addr)) == 0)) {
+ return (1);
+ }
+ }
+#endif
+ if (sa->sa_family == AF_INET) {
+ /* IPv4 sa address */
+ struct sockaddr_in *sin = (struct sockaddr_in *)sa;
+
+ param_type = ntohs(v4addr->ph.param_type);
+ param_length = ntohs(v4addr->ph.param_length);
+
+ if ((param_type == SCTP_IPV4_ADDRESS) &&
+ param_length == sizeof(struct sctp_ipv4addr_param) &&
+ (memcmp(&v4addr->addr, &sin->sin_addr,
+ sizeof(struct in_addr)) == 0)) {
+ return (1);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Cleanup for non-responded/OP ERR'd ASCONF
+ */
+void
+sctp_asconf_cleanup(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ /* mark peer as ASCONF incapable */
+ stcb->asoc.peer_supports_asconf = 0;
+ /*
+ * clear out any existing asconfs going out
+ */
+ sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_ASCONF + SCTP_LOC_2);
+ stcb->asoc.asconf_seq_out_acked = stcb->asoc.asconf_seq_out;
+ /* remove the old ASCONF on our outbound queue */
+ sctp_toss_old_asconf(stcb);
+}
+
+/*
+ * cleanup any cached source addresses that may be topologically
+ * incorrect after a new address has been added to this interface.
+ */
+static void
+sctp_asconf_nets_cleanup(struct sctp_tcb *stcb, struct sctp_ifn *ifn)
+{
+ struct sctp_nets *net;
+
+ /*
+ * Ideally, we want to only clear cached routes and source addresses
+ * that are topologically incorrect. But since there is no easy way
+ * to know whether the newly added address on the ifn would cause a
+ * routing change (i.e. a new egress interface would be chosen)
+ * without doing a new routing lookup and source address selection,
+ * we will (for now) just flush any cached route using a different
+ * ifn (and cached source addrs) and let output re-choose them
+ * during the next send on that net.
+ */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ /*
+ * clear any cached route (and cached source address) if the
+ * route's interface is NOT the same as the address change.
+ * If it's the same interface, just clear the cached source
+ * address.
+ */
+ if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro) &&
+ ((ifn == NULL) ||
+ (SCTP_GET_IF_INDEX_FROM_ROUTE(&net->ro) != ifn->ifn_index))) {
+ /* clear any cached route */
+ RTFREE(net->ro.ro_rt);
+ net->ro.ro_rt = NULL;
+ }
+ /* clear any cached source address */
+ if (net->src_addr_selected) {
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ net->src_addr_selected = 0;
+ }
+ }
+}
+
+
+void
+sctp_assoc_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *dstnet)
+{
+ int error;
+
+ if (dstnet->dest_state & SCTP_ADDR_UNCONFIRMED) {
+ return;
+ }
+ if (stcb->asoc.deleted_primary == NULL) {
+ return;
+ }
+ if (!TAILQ_EMPTY(&stcb->asoc.sent_queue)) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "assoc_immediate_retrans: Deleted primary is ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.deleted_primary->ro._l_addr.sa);
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "Current Primary is ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.primary_destination->ro._l_addr.sa);
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb,
+ stcb->asoc.deleted_primary,
+ SCTP_FROM_SCTP_TIMER + SCTP_LOC_8);
+ stcb->asoc.num_send_timers_up--;
+ if (stcb->asoc.num_send_timers_up < 0) {
+ stcb->asoc.num_send_timers_up = 0;
+ }
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ error = sctp_t3rxt_timer(stcb->sctp_ep, stcb,
+ stcb->asoc.deleted_primary);
+ if (error) {
+ SCTP_INP_DECR_REF(stcb->sctp_ep);
+ return;
+ }
+ SCTP_TCB_LOCK_ASSERT(stcb);
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(4, stcb->sctp_ep, stcb, stcb->asoc.deleted_primary);
+#endif
+ sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED);
+ if ((stcb->asoc.num_send_timers_up == 0) &&
+ (stcb->asoc.sent_queue_cnt > 0)) {
+ struct sctp_tmit_chunk *chk;
+
+ chk = TAILQ_FIRST(&stcb->asoc.sent_queue);
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+ stcb, chk->whoTo);
+ }
+ }
+ return;
+}
+
+static int
+ sctp_asconf_queue_mgmt(struct sctp_tcb *, struct sctp_ifa *, uint16_t);
+
+void
+sctp_net_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ struct sctp_tmit_chunk *chk;
+
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "net_immediate_retrans: RTO is %d\n", net->RTO);
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_TIMER + SCTP_LOC_5);
+ stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net);
+ net->error_count = 0;
+ TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
+ if (chk->whoTo == net) {
+ if (chk->sent < SCTP_DATAGRAM_RESEND) {
+ chk->sent = SCTP_DATAGRAM_RESEND;
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ sctp_flight_size_decrease(chk);
+ sctp_total_flight_decrease(stcb, chk);
+ net->marked_retrans++;
+ stcb->asoc.marked_retrans++;
+ }
+ }
+ }
+ if (net->marked_retrans) {
+ sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED);
+ }
+}
+
+static void
+sctp_path_check_and_react(struct sctp_tcb *stcb, struct sctp_ifa *newifa)
+{
+ struct sctp_nets *net;
+ int addrnum, changed;
+
+ /*
+	 * If the number of valid local addresses is 1, that address is
+	 * probably the newly added one. With several valid addresses in
+	 * this association, the source address need not change.
+	 * Additionally, they can be configured on the same interface as
+	 * "alias" addresses. (by micchie)
+ */
+ addrnum = sctp_local_addr_count(stcb);
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "p_check_react(): %d local addresses\n",
+ addrnum);
+ if (addrnum == 1) {
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ /* clear any cached route and source address */
+ if (net->ro.ro_rt) {
+ RTFREE(net->ro.ro_rt);
+ net->ro.ro_rt = NULL;
+ }
+ if (net->src_addr_selected) {
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ net->src_addr_selected = 0;
+ }
+ /* Retransmit unacknowledged DATA chunks immediately */
+ if (sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_FASTHANDOFF)) {
+ sctp_net_immediate_retrans(stcb, net);
+ }
+ /* also, SET PRIMARY is maybe already sent */
+ }
+ return;
+ }
+	/* Multiple local addresses exist in the association. */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ /* clear any cached route and source address */
+ if (net->ro.ro_rt) {
+ RTFREE(net->ro.ro_rt);
+ net->ro.ro_rt = NULL;
+ }
+ if (net->src_addr_selected) {
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ net->src_addr_selected = 0;
+ }
+ /*
+		 * Check whether the nexthop corresponds to the new address.
+		 * If the new address corresponds to the current nexthop,
+		 * the path will be changed; if it does NOT correspond to
+		 * the current nexthop, the path will not be changed.
+ */
+ SCTP_RTALLOC((sctp_route_t *) & net->ro,
+ stcb->sctp_ep->def_vrf_id);
+ if (net->ro.ro_rt == NULL)
+ continue;
+
+ changed = 0;
+ if (net->ro._l_addr.sa.sa_family == AF_INET) {
+ if (sctp_v4src_match_nexthop(newifa, (sctp_route_t *) & net->ro))
+ changed = 1;
+ }
+#ifdef INET6
+ if (net->ro._l_addr.sa.sa_family == AF_INET6) {
+ if (sctp_v6src_match_nexthop(
+ &newifa->address.sin6, (sctp_route_t *) & net->ro))
+ changed = 1;
+ }
+#endif
+ /*
+		 * if the newly added address does not affect the routing
+		 * (nexthop) for this net, skip it.
+ */
+ if (changed == 0)
+ continue;
+ /* Retransmit unacknowledged DATA chunks immediately */
+ if (sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_FASTHANDOFF)) {
+ sctp_net_immediate_retrans(stcb, net);
+ }
+ /* Send SET PRIMARY for this new address */
+ if (net == stcb->asoc.primary_destination) {
+ (void)sctp_asconf_queue_mgmt(stcb, newifa,
+ SCTP_SET_PRIM_ADDR);
+ }
+ }
+}
+
+/*
+ * process an ADD/DELETE IP ack from peer.
+ * addr: corresponding sctp_ifa to the address being added/deleted.
+ * type: SCTP_ADD_IP_ADDRESS or SCTP_DEL_IP_ADDRESS.
+ * flag: 1=success, 0=failure.
+ */
+static void
+sctp_asconf_addr_mgmt_ack(struct sctp_tcb *stcb, struct sctp_ifa *addr,
+ uint16_t type, uint32_t flag)
+{
+ /*
+ * do the necessary asoc list work- if we get a failure indication,
+ * leave the address on the assoc's restricted list. If we get a
+ * success indication, remove the address from the restricted list.
+ */
+ /*
+ * Note: this will only occur for ADD_IP_ADDRESS, since
+ * DEL_IP_ADDRESS is never actually added to the list...
+ */
+ if (flag) {
+ /* success case, so remove from the restricted list */
+ sctp_del_local_addr_restricted(stcb, addr);
+
+ if (sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_BASE) ||
+ sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_FASTHANDOFF)) {
+ sctp_path_check_and_react(stcb, addr);
+ return;
+ }
+ /* clear any cached/topologically incorrect source addresses */
+ sctp_asconf_nets_cleanup(stcb, addr->ifn_p);
+ }
+ /* else, leave it on the list */
+}
+
+/*
+ * add an asconf add/delete/set primary IP address parameter to the queue.
+ * type = SCTP_ADD_IP_ADDRESS, SCTP_DEL_IP_ADDRESS, SCTP_SET_PRIM_ADDR.
+ * returns 0 if queued, -1 if not queued/removed.
+ * NOTE: if adding, but a delete for the same address is already scheduled
+ * (and not yet sent out), simply remove it from queue. Same for deleting
+ * an address already scheduled for add. If a duplicate operation is found,
+ * ignore the new one.
+ */
+static int
+sctp_asconf_queue_mgmt(struct sctp_tcb *stcb, struct sctp_ifa *ifa,
+ uint16_t type)
+{
+ struct sctp_asconf_addr *aa, *aa_next;
+ struct sockaddr *sa;
+
+ /* make sure the request isn't already in the queue */
+ for (aa = TAILQ_FIRST(&stcb->asoc.asconf_queue); aa != NULL;
+ aa = aa_next) {
+ aa_next = TAILQ_NEXT(aa, next);
+ /* address match? */
+ if (sctp_asconf_addr_match(aa, &ifa->address.sa) == 0)
+ continue;
+ /*
+		 * Is the same request already queued, but not yet sent?
+		 * Requests that were already sent are skipped here, to
+		 * handle the following sequence: 1. an ADD arrives and is
+		 * sent; 2. a DEL arrives (we cannot remove the ADD that is
+		 * already in flight); 3. another ADD arrives.
+ */
+ if (aa->ap.aph.ph.param_type == type && aa->sent == 0) {
+ return (-1);
+ }
+ /* is the negative request already in queue, and not sent */
+ if ((aa->sent == 0) && (type == SCTP_ADD_IP_ADDRESS) &&
+ (aa->ap.aph.ph.param_type == SCTP_DEL_IP_ADDRESS)) {
+ /* add requested, delete already queued */
+ TAILQ_REMOVE(&stcb->asoc.asconf_queue, aa, next);
+ /* remove the ifa from the restricted list */
+ sctp_del_local_addr_restricted(stcb, ifa);
+ /* free the asconf param */
+ SCTP_FREE(aa, SCTP_M_ASC_ADDR);
+ SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_mgmt: add removes queued entry\n");
+ return (-1);
+ }
+ if ((aa->sent == 0) && (type == SCTP_DEL_IP_ADDRESS) &&
+ (aa->ap.aph.ph.param_type == SCTP_ADD_IP_ADDRESS)) {
+ /* delete requested, add already queued */
+ TAILQ_REMOVE(&stcb->asoc.asconf_queue, aa, next);
+ /* remove the aa->ifa from the restricted list */
+ sctp_del_local_addr_restricted(stcb, aa->ifa);
+ /* free the asconf param */
+ SCTP_FREE(aa, SCTP_M_ASC_ADDR);
+ SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_mgmt: delete removes queued entry\n");
+ return (-1);
+ }
+ } /* for each aa */
+
+ /* adding new request to the queue */
+ SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa),
+ SCTP_M_ASC_ADDR);
+ if (aa == NULL) {
+ /* didn't get memory */
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_queue_mgmt: failed to get memory!\n");
+ return (-1);
+ }
+ aa->special_del = 0;
+ /* fill in asconf address parameter fields */
+ /* top level elements are "networked" during send */
+ aa->ap.aph.ph.param_type = type;
+ aa->ifa = ifa;
+ atomic_add_int(&ifa->refcount, 1);
+ /* correlation_id filled in during send routine later... */
+ if (ifa->address.sa.sa_family == AF_INET6) {
+ /* IPv6 address */
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&ifa->address.sa;
+ sa = (struct sockaddr *)sin6;
+ aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS;
+ aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv6addr_param));
+ aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) +
+ sizeof(struct sctp_ipv6addr_param);
+ memcpy(&aa->ap.addrp.addr, &sin6->sin6_addr,
+ sizeof(struct in6_addr));
+ } else if (ifa->address.sa.sa_family == AF_INET) {
+ /* IPv4 address */
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&ifa->address.sa;
+ sa = (struct sockaddr *)sin;
+ aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS;
+ aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv4addr_param));
+ aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) +
+ sizeof(struct sctp_ipv4addr_param);
+ memcpy(&aa->ap.addrp.addr, &sin->sin_addr,
+ sizeof(struct in_addr));
+ } else {
+ /* invalid family! */
+ SCTP_FREE(aa, SCTP_M_ASC_ADDR);
+ sctp_free_ifa(ifa);
+ return (-1);
+ }
+ aa->sent = 0; /* clear sent flag */
+
+ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next);
+#ifdef SCTP_DEBUG
+ if (SCTP_BASE_SYSCTL(sctp_debug_on) && SCTP_DEBUG_ASCONF2) {
+ if (type == SCTP_ADD_IP_ADDRESS) {
+ SCTP_PRINTF("asconf_queue_mgmt: inserted asconf ADD_IP_ADDRESS: ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, sa);
+ } else if (type == SCTP_DEL_IP_ADDRESS) {
+ SCTP_PRINTF("asconf_queue_mgmt: appended asconf DEL_IP_ADDRESS: ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, sa);
+ } else {
+ SCTP_PRINTF("asconf_queue_mgmt: appended asconf SET_PRIM_ADDR: ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, sa);
+ }
+ }
+#endif
+
+ return (0);
+}
+
+
+/*
+ * add an asconf operation for the given ifa and type.
+ * type = SCTP_ADD_IP_ADDRESS, SCTP_DEL_IP_ADDRESS, SCTP_SET_PRIM_ADDR.
+ * returns 0 if completed, -1 if not completed, 1 if immediate send is
+ * advisable.
+ */
+static int
+sctp_asconf_queue_add(struct sctp_tcb *stcb, struct sctp_ifa *ifa,
+ uint16_t type)
+{
+ uint32_t status;
+ int pending_delete_queued = 0;
+
+ /* see if peer supports ASCONF */
+ if (stcb->asoc.peer_supports_asconf == 0) {
+ return (-1);
+ }
+ /*
+ * if this is deleting the last address from the assoc, mark it as
+ * pending.
+ */
+ if ((type == SCTP_DEL_IP_ADDRESS) && !stcb->asoc.asconf_del_pending &&
+ (sctp_local_addr_count(stcb) < 2)) {
+ /* set the pending delete info only */
+ stcb->asoc.asconf_del_pending = 1;
+ stcb->asoc.asconf_addr_del_pending = ifa;
+ atomic_add_int(&ifa->refcount, 1);
+ SCTPDBG(SCTP_DEBUG_ASCONF2,
+ "asconf_queue_add: mark delete last address pending\n");
+ return (-1);
+ }
+ /* queue an asconf parameter */
+ status = sctp_asconf_queue_mgmt(stcb, ifa, type);
+
+ /*
+ * if this is an add, and there is a delete also pending (i.e. the
+ * last local address is being changed), queue the pending delete
+ * too.
+ */
+ if ((type == SCTP_ADD_IP_ADDRESS) && stcb->asoc.asconf_del_pending && (status == 0)) {
+ /* queue in the pending delete */
+ if (sctp_asconf_queue_mgmt(stcb,
+ stcb->asoc.asconf_addr_del_pending,
+ SCTP_DEL_IP_ADDRESS) == 0) {
+			SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_add: queuing pending delete\n");
+ pending_delete_queued = 1;
+ /* clear out the pending delete info */
+ stcb->asoc.asconf_del_pending = 0;
+ sctp_free_ifa(stcb->asoc.asconf_addr_del_pending);
+ stcb->asoc.asconf_addr_del_pending = NULL;
+ }
+ }
+ if (pending_delete_queued) {
+ struct sctp_nets *net;
+
+ /*
+ * since we know that the only/last address is now being
+ * changed in this case, reset the cwnd/rto on all nets to
+ * start as a new address and path. Also clear the error
+ * counts to give the assoc the best chance to complete the
+ * address change.
+ */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb,
+ net);
+ net->RTO = 0;
+ net->error_count = 0;
+ }
+ stcb->asoc.overall_error_count = 0;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_ASCONF,
+ __LINE__);
+ }
+ /* queue in an advisory set primary too */
+ (void)sctp_asconf_queue_mgmt(stcb, ifa, SCTP_SET_PRIM_ADDR);
+ /* let caller know we should send this out immediately */
+ status = 1;
+ }
+ return (status);
+}
+
+/*-
+ * add an asconf delete IP address parameter to the queue by sockaddr and
+ * possibly with no sctp_ifa available. This is only called by the routine
+ * that checks the addresses in an INIT-ACK against the current address list.
+ * returns 0 if completed, non-zero if not completed.
+ * NOTE: if an add is already scheduled (and not yet sent out), simply
+ * remove it from queue. If a duplicate operation is found, ignore the
+ * new one.
+ */
+static int
+sctp_asconf_queue_sa_delete(struct sctp_tcb *stcb, struct sockaddr *sa)
+{
+ struct sctp_ifa *ifa;
+ struct sctp_asconf_addr *aa, *aa_next;
+ uint32_t vrf_id;
+
+ if (stcb == NULL) {
+ return (-1);
+ }
+ /* see if peer supports ASCONF */
+ if (stcb->asoc.peer_supports_asconf == 0) {
+ return (-1);
+ }
+ /* make sure the request isn't already in the queue */
+ for (aa = TAILQ_FIRST(&stcb->asoc.asconf_queue); aa != NULL;
+ aa = aa_next) {
+ aa_next = TAILQ_NEXT(aa, next);
+ /* address match? */
+ if (sctp_asconf_addr_match(aa, sa) == 0)
+ continue;
+ /* is the request already in queue (sent or not) */
+ if (aa->ap.aph.ph.param_type == SCTP_DEL_IP_ADDRESS) {
+ return (-1);
+ }
+ /* is the negative request already in queue, and not sent */
+ if (aa->sent == 1)
+ continue;
+ if (aa->ap.aph.ph.param_type == SCTP_ADD_IP_ADDRESS) {
+ /* add already queued, so remove existing entry */
+ TAILQ_REMOVE(&stcb->asoc.asconf_queue, aa, next);
+ sctp_del_local_addr_restricted(stcb, aa->ifa);
+ /* free the entry */
+ SCTP_FREE(aa, SCTP_M_ASC_ADDR);
+ return (-1);
+ }
+ } /* for each aa */
+
+ /* find any existing ifa-- NOTE ifa CAN be allowed to be NULL */
+ if (stcb) {
+ vrf_id = stcb->asoc.vrf_id;
+ } else {
+ vrf_id = SCTP_DEFAULT_VRFID;
+ }
+ ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED);
+
+ /* adding new request to the queue */
+ SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa),
+ SCTP_M_ASC_ADDR);
+ if (aa == NULL) {
+ /* didn't get memory */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "sctp_asconf_queue_sa_delete: failed to get memory!\n");
+ return (-1);
+ }
+ aa->special_del = 0;
+ /* fill in asconf address parameter fields */
+ /* top level elements are "networked" during send */
+ aa->ap.aph.ph.param_type = SCTP_DEL_IP_ADDRESS;
+ aa->ifa = ifa;
+ if (ifa)
+ atomic_add_int(&ifa->refcount, 1);
+ /* correlation_id filled in during send routine later... */
+ if (sa->sa_family == AF_INET6) {
+ /* IPv6 address */
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)sa;
+ aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS;
+ aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv6addr_param));
+ aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_ipv6addr_param);
+ memcpy(&aa->ap.addrp.addr, &sin6->sin6_addr,
+ sizeof(struct in6_addr));
+ } else if (sa->sa_family == AF_INET) {
+ /* IPv4 address */
+ struct sockaddr_in *sin = (struct sockaddr_in *)sa;
+
+ aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS;
+ aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv4addr_param));
+ aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_ipv4addr_param);
+ memcpy(&aa->ap.addrp.addr, &sin->sin_addr,
+ sizeof(struct in_addr));
+ } else {
+ /* invalid family! */
+ SCTP_FREE(aa, SCTP_M_ASC_ADDR);
+ if (ifa)
+ sctp_free_ifa(ifa);
+ return (-1);
+ }
+ aa->sent = 0; /* clear sent flag */
+
+ /* delete goes to the back of the queue */
+ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next);
+
+ /* sa_ignore MEMLEAK {memory is put on the tailq} */
+ return (0);
+}
+
+/*
+ * find a specific asconf param on our "sent" queue
+ */
+static struct sctp_asconf_addr *
+sctp_asconf_find_param(struct sctp_tcb *stcb, uint32_t correlation_id)
+{
+ struct sctp_asconf_addr *aa;
+
+ TAILQ_FOREACH(aa, &stcb->asoc.asconf_queue, next) {
+ if (aa->ap.aph.correlation_id == correlation_id &&
+ aa->sent == 1) {
+ /* found it */
+ return (aa);
+ }
+ }
+ /* didn't find it */
+ return (NULL);
+}
+
+/*
+ * process an SCTP_ERROR_CAUSE_IND for a ASCONF-ACK parameter and do
+ * notifications based on the error response
+ */
+static void
+sctp_asconf_process_error(struct sctp_tcb *stcb,
+ struct sctp_asconf_paramhdr *aph)
+{
+ struct sctp_error_cause *eh;
+ struct sctp_paramhdr *ph;
+ uint16_t param_type;
+ uint16_t error_code;
+
+ eh = (struct sctp_error_cause *)(aph + 1);
+ ph = (struct sctp_paramhdr *)(eh + 1);
+ /* validate lengths */
+ if (htons(eh->length) + sizeof(struct sctp_error_cause) >
+ htons(aph->ph.param_length)) {
+ /* invalid error cause length */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "asconf_process_error: cause element too long\n");
+ return;
+ }
+ if (htons(ph->param_length) + sizeof(struct sctp_paramhdr) >
+ htons(eh->length)) {
+ /* invalid included TLV length */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "asconf_process_error: included TLV too long\n");
+ return;
+ }
+ /* which error code ? */
+ error_code = ntohs(eh->code);
+ param_type = ntohs(aph->ph.param_type);
+ /* FIX: this should go back up the REMOTE_ERROR ULP notify */
+ switch (error_code) {
+ case SCTP_CAUSE_RESOURCE_SHORTAGE:
+ /* we allow ourselves to "try again" for this error */
+ break;
+ default:
+ /* peer can't handle it... */
+ switch (param_type) {
+ case SCTP_ADD_IP_ADDRESS:
+ case SCTP_DEL_IP_ADDRESS:
+ stcb->asoc.peer_supports_asconf = 0;
+ break;
+ case SCTP_SET_PRIM_ADDR:
+ stcb->asoc.peer_supports_asconf = 0;
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+/*
+ * process an asconf queue param.
+ * aparam: parameter to process, will be removed from the queue.
+ * flag: 1=success case, 0=failure case
+ */
+static void
+sctp_asconf_process_param_ack(struct sctp_tcb *stcb,
+ struct sctp_asconf_addr *aparam, uint32_t flag)
+{
+ uint16_t param_type;
+
+ /* process this param */
+ param_type = aparam->ap.aph.ph.param_type;
+ switch (param_type) {
+ case SCTP_ADD_IP_ADDRESS:
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_param_ack: added IP address\n");
+ sctp_asconf_addr_mgmt_ack(stcb, aparam->ifa, param_type, flag);
+ break;
+ case SCTP_DEL_IP_ADDRESS:
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_param_ack: deleted IP address\n");
+ /* nothing really to do... lists already updated */
+ break;
+ case SCTP_SET_PRIM_ADDR:
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "process_param_ack: set primary IP address\n");
+ /* nothing to do... peer may start using this addr */
+ if (flag == 0)
+ stcb->asoc.peer_supports_asconf = 0;
+ break;
+ default:
+ /* should NEVER happen */
+ break;
+ }
+
+ /* remove the param and free it */
+ TAILQ_REMOVE(&stcb->asoc.asconf_queue, aparam, next);
+ if (aparam->ifa)
+ sctp_free_ifa(aparam->ifa);
+ SCTP_FREE(aparam, SCTP_M_ASC_ADDR);
+}
+
+/*
+ * cleanup from a bad asconf ack parameter
+ */
+static void
+sctp_asconf_ack_clear(struct sctp_tcb *stcb)
+{
+ /* assume peer doesn't really know how to do asconfs */
+ stcb->asoc.peer_supports_asconf = 0;
+ /* XXX we could free the pending queue here */
+}
+
+void
+sctp_handle_asconf_ack(struct mbuf *m, int offset,
+ struct sctp_asconf_ack_chunk *cp, struct sctp_tcb *stcb,
+ struct sctp_nets *net, int *abort_no_unlock)
+{
+ struct sctp_association *asoc;
+ uint32_t serial_num;
+ uint16_t ack_length;
+ struct sctp_asconf_paramhdr *aph;
+ struct sctp_asconf_addr *aa, *aa_next;
+ uint32_t last_error_id = 0; /* last error correlation id */
+ uint32_t id;
+ struct sctp_asconf_addr *ap;
+
+ /* asconf param buffer */
+ uint8_t aparam_buf[SCTP_PARAM_BUFFER_SIZE];
+
+ /* verify minimum length */
+ if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_asconf_ack_chunk)) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "handle_asconf_ack: chunk too small = %xh\n",
+ ntohs(cp->ch.chunk_length));
+ return;
+ }
+ asoc = &stcb->asoc;
+ serial_num = ntohl(cp->serial_number);
+
+ /*
+ * NOTE: we may want to handle this differently- currently, we will
+ * abort when we get an ack for the expected serial number + 1 (eg.
+ * we didn't send it), process an ack normally if it is the expected
+ * serial number, and re-send the previous ack for *ALL* other
+ * serial numbers
+ */
+
+ /*
+ * if the serial number is the next expected, but I didn't send it,
+ * abort the asoc, since someone probably just hijacked us...
+ */
+ if (serial_num == (asoc->asconf_seq_out + 1)) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: got unexpected next serial number! Aborting asoc!\n");
+ sctp_abort_an_association(stcb->sctp_ep, stcb,
+ SCTP_CAUSE_ILLEGAL_ASCONF_ACK, NULL, SCTP_SO_NOT_LOCKED);
+ *abort_no_unlock = 1;
+ return;
+ }
+ if (serial_num != asoc->asconf_seq_out_acked + 1) {
+ /* got a duplicate/unexpected ASCONF-ACK */
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: got duplicate/unexpected serial number = %xh (expected = %xh)\n",
+ serial_num, asoc->asconf_seq_out_acked + 1);
+ return;
+ }
+ if (serial_num == asoc->asconf_seq_out - 1) {
+ /* stop our timer */
+ sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_ASCONF + SCTP_LOC_3);
+ }
+ /* process the ASCONF-ACK contents */
+ ack_length = ntohs(cp->ch.chunk_length) -
+ sizeof(struct sctp_asconf_ack_chunk);
+ offset += sizeof(struct sctp_asconf_ack_chunk);
+ /* process through all parameters */
+ while (ack_length >= sizeof(struct sctp_asconf_paramhdr)) {
+ unsigned int param_length, param_type;
+
+ /* get pointer to next asconf parameter */
+ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset,
+ sizeof(struct sctp_asconf_paramhdr), aparam_buf);
+ if (aph == NULL) {
+ /* can't get an asconf paramhdr */
+ sctp_asconf_ack_clear(stcb);
+ return;
+ }
+ param_type = ntohs(aph->ph.param_type);
+ param_length = ntohs(aph->ph.param_length);
+ if (param_length > ack_length) {
+ sctp_asconf_ack_clear(stcb);
+ return;
+ }
+ if (param_length < sizeof(struct sctp_paramhdr)) {
+ sctp_asconf_ack_clear(stcb);
+ return;
+ }
+ /* get the complete parameter... */
+ if (param_length > sizeof(aparam_buf)) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "param length (%u) larger than buffer size!\n", param_length);
+ sctp_asconf_ack_clear(stcb);
+ return;
+ }
+ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, param_length, aparam_buf);
+ if (aph == NULL) {
+ sctp_asconf_ack_clear(stcb);
+ return;
+ }
+ /* correlation_id is transparent to peer, no ntohl needed */
+ id = aph->correlation_id;
+
+ switch (param_type) {
+ case SCTP_ERROR_CAUSE_IND:
+ last_error_id = id;
+ /* find the corresponding asconf param in our queue */
+ ap = sctp_asconf_find_param(stcb, id);
+ if (ap == NULL) {
+ /* hmm... can't find this in our queue! */
+ break;
+ }
+ /* process the parameter, failed flag */
+ sctp_asconf_process_param_ack(stcb, ap, 0);
+ /* process the error response */
+ sctp_asconf_process_error(stcb, aph);
+ break;
+ case SCTP_SUCCESS_REPORT:
+ /* find the corresponding asconf param in our queue */
+ ap = sctp_asconf_find_param(stcb, id);
+ if (ap == NULL) {
+ /* hmm... can't find this in our queue! */
+ break;
+ }
+ /* process the parameter, success flag */
+ sctp_asconf_process_param_ack(stcb, ap, 1);
+ break;
+ default:
+ break;
+ } /* switch */
+
+ /* update remaining ASCONF-ACK message length to process */
+ ack_length -= SCTP_SIZE32(param_length);
+ if (ack_length <= 0) {
+ /* no more data in the mbuf chain */
+ break;
+ }
+ offset += SCTP_SIZE32(param_length);
+ } /* while */
+
+ /*
+ * if there are any "sent" params still on the queue, these are
+ * implicitly "success", or "failed" (if we got an error back) ...
+ * so process these appropriately
+ *
+ * we assume that the correlation_id's are monotonically increasing
+ * beginning from 1 and that we don't have *that* many outstanding
+ * at any given time
+ */
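+	/*
+	 * Worked example (hypothetical ids): if parameters with
+	 * correlation_ids 1..4 were sent and the only error reported was
+	 * for id 3, then ids 1 and 2 (below last_error_id) are implicit
+	 * successes and id 4 is treated as a failure; with no error at
+	 * all, last_error_id wraps to the maximum value and every
+	 * remaining sent parameter is an implicit success.
+	 */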
+ if (last_error_id == 0)
+ last_error_id--;/* set to "max" value */
+ for (aa = TAILQ_FIRST(&stcb->asoc.asconf_queue); aa != NULL;
+ aa = aa_next) {
+ aa_next = TAILQ_NEXT(aa, next);
+ if (aa->sent == 1) {
+ /*
+ * implicitly successful or failed if correlation_id
+ * < last_error_id, then success else, failure
+ */
+ if (aa->ap.aph.correlation_id < last_error_id)
+ sctp_asconf_process_param_ack(stcb, aa, 1);
+ else
+ sctp_asconf_process_param_ack(stcb, aa, 0);
+ } else {
+ /*
+ * since we always process in order (FIFO queue) if
+ * we reach one that hasn't been sent, the rest
+ * should not have been sent either. so, we're
+ * done...
+ */
+ break;
+ }
+ }
+
+ /* update the next sequence number to use */
+ asoc->asconf_seq_out_acked++;
+ /* remove the old ASCONF on our outbound queue */
+ sctp_toss_old_asconf(stcb);
+ if (!TAILQ_EMPTY(&stcb->asoc.asconf_queue)) {
+#ifdef SCTP_TIMER_BASED_ASCONF
+ /* we have more params, so restart our timer */
+ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep,
+ stcb, net);
+#else
+ /* we have more params, so send out more */
+ sctp_send_asconf(stcb, net, SCTP_ADDR_NOT_LOCKED);
+#endif
+ }
+}
+
+#ifdef INET6
+static uint32_t
+sctp_is_scopeid_in_nets(struct sctp_tcb *stcb, struct sockaddr *sa)
+{
+ struct sockaddr_in6 *sin6, *net6;
+ struct sctp_nets *net;
+
+ if (sa->sa_family != AF_INET6) {
+ /* wrong family */
+ return (0);
+ }
+ sin6 = (struct sockaddr_in6 *)sa;
+ if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) == 0) {
+ /* not link local address */
+ return (0);
+ }
+ /* hunt through our destination nets list for this scope_id */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if (((struct sockaddr *)(&net->ro._l_addr))->sa_family !=
+ AF_INET6)
+ continue;
+ net6 = (struct sockaddr_in6 *)&net->ro._l_addr;
+ if (IN6_IS_ADDR_LINKLOCAL(&net6->sin6_addr) == 0)
+ continue;
+ if (sctp_is_same_scope(sin6, net6)) {
+ /* found one */
+ return (1);
+ }
+ }
+ /* didn't find one */
+ return (0);
+}
+
+#endif
+
+/*
+ * address management functions
+ */
+static void
+sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_ifa *ifa, uint16_t type, int addr_locked)
+{
+ int status;
+
+
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0 &&
+ sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF)) {
+ /* subset bound, no ASCONF allowed case, so ignore */
+ return;
+ }
+	/*
+	 * note: we know this is not the "subset bound, no ASCONF allowed"
+	 * case; i.e. this is either boundall or subset bound with ASCONF
+	 * allowed
+	 */
+
+ /* first, make sure it's a good address family */
+ if (ifa->address.sa.sa_family != AF_INET6 &&
+ ifa->address.sa.sa_family != AF_INET) {
+ return;
+ }
+ /* make sure we're "allowed" to add this type of addr */
+ if (ifa->address.sa.sa_family == AF_INET6) {
+ /* invalid if we're not a v6 endpoint */
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0)
+ return;
+ /* is the v6 addr really valid ? */
+ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
+ return;
+ }
+ }
+ /* put this address on the "pending/do not use yet" list */
+ sctp_add_local_addr_restricted(stcb, ifa);
+ /*
+ * check address scope if address is out of scope, don't queue
+ * anything... note: this would leave the address on both inp and
+ * asoc lists
+ */
+ switch (ifa->address.sa.sa_family) {
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&ifa->address.sin6;
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+				/* we skip unspecified addresses */
+ return;
+ }
+ if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+ if (stcb->asoc.local_scope == 0) {
+ return;
+ }
+ /* is it the right link local scope? */
+ if (sctp_is_scopeid_in_nets(stcb, &ifa->address.sa) == 0) {
+ return;
+ }
+ }
+ if (stcb->asoc.site_scope == 0 &&
+ IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) {
+ return;
+ }
+ break;
+ }
+#endif
+ case AF_INET:
+ {
+ struct sockaddr_in *sin;
+ struct in6pcb *inp6;
+
+ inp6 = (struct in6pcb *)&inp->ip_inp.inp;
+ /* invalid if we are a v6 only endpoint */
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+ SCTP_IPV6_V6ONLY(inp6))
+ return;
+
+ sin = (struct sockaddr_in *)&ifa->address.sa;
+ if (sin->sin_addr.s_addr == 0) {
+				/* we skip unspecified addresses */
+ return;
+ }
+ if (stcb->asoc.ipv4_local_scope == 0 &&
+ IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) {
+ return;
+ }
+ break;
+ }
+ default:
+ /* else, not AF_INET or AF_INET6, so skip */
+ return;
+ }
+
+ /* queue an asconf for this address add/delete */
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF)) {
+ /* does the peer do asconf? */
+ if (stcb->asoc.peer_supports_asconf) {
+ /* queue an asconf for this addr */
+ status = sctp_asconf_queue_add(stcb, ifa, type);
+
+ /*
+ * if queued ok, and in the open state, send out the
+ * ASCONF. If in the non-open state, these will be
+ * sent when the state goes open.
+ */
+ if (status == 0 &&
+ SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) {
+#ifdef SCTP_TIMER_BASED_ASCONF
+ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp,
+ stcb, stcb->asoc.primary_destination);
+#else
+ sctp_send_asconf(stcb, stcb->asoc.primary_destination,
+ addr_locked);
+#endif
+ }
+ }
+ }
+}
+
+
+int
+sctp_asconf_iterator_ep(struct sctp_inpcb *inp, void *ptr, uint32_t val)
+{
+ struct sctp_asconf_iterator *asc;
+ struct sctp_ifa *ifa;
+ struct sctp_laddr *l;
+ int cnt_invalid = 0;
+
+ asc = (struct sctp_asconf_iterator *)ptr;
+ LIST_FOREACH(l, &asc->list_of_work, sctp_nxt_addr) {
+ ifa = l->ifa;
+ if (ifa->address.sa.sa_family == AF_INET6) {
+ /* invalid if we're not a v6 endpoint */
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) {
+ cnt_invalid++;
+ if (asc->cnt == cnt_invalid)
+ return (1);
+ else
+ continue;
+ }
+ } else if (ifa->address.sa.sa_family == AF_INET) {
+ /* invalid if we are a v6 only endpoint */
+ struct in6pcb *inp6;
+
+ inp6 = (struct in6pcb *)&inp->ip_inp.inp;
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+ SCTP_IPV6_V6ONLY(inp6)) {
+ cnt_invalid++;
+ if (asc->cnt == cnt_invalid)
+ return (1);
+ else
+ continue;
+ }
+ } else {
+ /* invalid address family */
+ cnt_invalid++;
+ if (asc->cnt == cnt_invalid)
+ return (1);
+ else
+ continue;
+ }
+ }
+ return (0);
+}
+
+static int
+sctp_asconf_iterator_ep_end(struct sctp_inpcb *inp, void *ptr, uint32_t val)
+{
+ struct sctp_ifa *ifa;
+ struct sctp_asconf_iterator *asc;
+ struct sctp_laddr *laddr, *nladdr, *l;
+
+ /* Only for specific case not bound all */
+ asc = (struct sctp_asconf_iterator *)ptr;
+ LIST_FOREACH(l, &asc->list_of_work, sctp_nxt_addr) {
+ ifa = l->ifa;
+ if (l->action == SCTP_ADD_IP_ADDRESS) {
+ LIST_FOREACH(laddr, &inp->sctp_addr_list,
+ sctp_nxt_addr) {
+ if (laddr->ifa == ifa) {
+ laddr->action = 0;
+ break;
+ }
+ }
+ } else if (l->action == SCTP_DEL_IP_ADDRESS) {
+ laddr = LIST_FIRST(&inp->sctp_addr_list);
+ while (laddr) {
+ nladdr = LIST_NEXT(laddr, sctp_nxt_addr);
+ /* remove only after all guys are done */
+ if (laddr->ifa == ifa) {
+ sctp_del_local_addr_ep(inp, ifa);
+ }
+ laddr = nladdr;
+ }
+ }
+ }
+ return (0);
+}
+
+void
+sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ void *ptr, uint32_t val)
+{
+ struct sctp_asconf_iterator *asc;
+ struct sctp_ifa *ifa;
+ struct sctp_laddr *l;
+ int cnt_invalid = 0;
+ int type, status;
+ int num_queued = 0;
+
+ asc = (struct sctp_asconf_iterator *)ptr;
+ LIST_FOREACH(l, &asc->list_of_work, sctp_nxt_addr) {
+ ifa = l->ifa;
+ type = l->action;
+
+ /* address's vrf_id must be the vrf_id of the assoc */
+ if (ifa->vrf_id != stcb->asoc.vrf_id) {
+ continue;
+ }
+ /* Same checks again for assoc */
+ switch (ifa->address.sa.sa_family) {
+#ifdef INET6
+ case AF_INET6:
+ {
+ /* invalid if we're not a v6 endpoint */
+ struct sockaddr_in6 *sin6;
+
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) {
+ cnt_invalid++;
+ if (asc->cnt == cnt_invalid)
+ return;
+ else
+ continue;
+ }
+ sin6 = (struct sockaddr_in6 *)&ifa->address.sin6;
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+					/* we skip unspecified addresses */
+ continue;
+ }
+ if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+ if (stcb->asoc.local_scope == 0) {
+ continue;
+ }
+ /* is it the right link local scope? */
+ if (sctp_is_scopeid_in_nets(stcb, &ifa->address.sa) == 0) {
+ continue;
+ }
+ }
+ break;
+ }
+#endif
+ case AF_INET:
+ {
+				/* invalid if we are a v6 only endpoint */
+				struct in6pcb *inp6;
+				struct sockaddr_in *sin;
+
+				inp6 = (struct in6pcb *)&inp->ip_inp.inp;
+				if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+				    SCTP_IPV6_V6ONLY(inp6)) {
+					cnt_invalid++;
+					if (asc->cnt == cnt_invalid)
+						return;
+					else
+						continue;
+				}
+				sin = (struct sockaddr_in *)&ifa->address.sa;
+				if (sin->sin_addr.s_addr == 0) {
+					/* we skip unspecified addresses */
+					continue;
+				}
+				if (stcb->asoc.ipv4_local_scope == 0 &&
+				    IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) {
+					continue;
+				}
+ break;
+ }
+ default:
+ /* invalid address family */
+ cnt_invalid++;
+ if (asc->cnt == cnt_invalid)
+ return;
+ else
+ continue;
+ break;
+ }
+
+ if (type == SCTP_ADD_IP_ADDRESS) {
+ /* prevent this address from being used as a source */
+ sctp_add_local_addr_restricted(stcb, ifa);
+ } else if (type == SCTP_DEL_IP_ADDRESS) {
+ struct sctp_nets *net;
+
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ sctp_rtentry_t *rt;
+
+ /* delete this address if cached */
+ if (net->ro._s_addr == ifa) {
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ net->src_addr_selected = 0;
+ rt = net->ro.ro_rt;
+ if (rt) {
+ RTFREE(rt);
+ net->ro.ro_rt = NULL;
+ }
+ /*
+ * Now we deleted our src address,
+ * should we not also now reset the
+ * cwnd/rto to start as if its a new
+ * address?
+ */
+ stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net);
+ net->RTO = 0;
+
+ }
+ }
+ } else if (type == SCTP_SET_PRIM_ADDR) {
+ if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) {
+ /* must validate the ifa is in the ep */
+ if (sctp_is_addr_in_ep(stcb->sctp_ep, ifa) == 0) {
+ continue;
+ }
+ } else {
+ /* Need to check scopes for this guy */
+ if (sctp_is_address_in_scope(ifa,
+ stcb->asoc.ipv4_addr_legal,
+ stcb->asoc.ipv6_addr_legal,
+ stcb->asoc.loopback_scope,
+ stcb->asoc.ipv4_local_scope,
+ stcb->asoc.local_scope,
+ stcb->asoc.site_scope, 0) == 0) {
+ continue;
+ }
+ }
+ }
+ /* queue an asconf for this address add/delete */
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF) &&
+ stcb->asoc.peer_supports_asconf) {
+ /* queue an asconf for this addr */
+ status = sctp_asconf_queue_add(stcb, ifa, type);
+ /*
+ * if queued ok, and in the open state, update the
+ * count of queued params. If in the non-open
+ * state, these get sent when the assoc goes open.
+ */
+ if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) {
+ if (status >= 0) {
+ num_queued++;
+ }
+ }
+ }
+ }
+ /*
+ * If we have queued params in the open state, send out an ASCONF.
+ */
+ if (num_queued > 0) {
+ sctp_send_asconf(stcb, stcb->asoc.primary_destination,
+ SCTP_ADDR_NOT_LOCKED);
+ }
+}
+
+void
+sctp_asconf_iterator_end(void *ptr, uint32_t val)
+{
+ struct sctp_asconf_iterator *asc;
+ struct sctp_ifa *ifa;
+ struct sctp_laddr *l, *l_next;
+
+ asc = (struct sctp_asconf_iterator *)ptr;
+ l = LIST_FIRST(&asc->list_of_work);
+ while (l != NULL) {
+ l_next = LIST_NEXT(l, sctp_nxt_addr);
+ ifa = l->ifa;
+ if (l->action == SCTP_ADD_IP_ADDRESS) {
+ /* Clear the defer use flag */
+ ifa->localifa_flags &= ~SCTP_ADDR_DEFER_USE;
+ }
+ sctp_free_ifa(ifa);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), l);
+ SCTP_DECR_LADDR_COUNT();
+ l = l_next;
+ }
+ SCTP_FREE(asc, SCTP_M_ASC_IT);
+}
+
+/*
+ * sa is the sockaddr to ask the peer to set primary to.
+ * returns: 0 = completed, -1 = error
+ */
+int32_t
+sctp_set_primary_ip_address_sa(struct sctp_tcb *stcb, struct sockaddr *sa)
+{
+ uint32_t vrf_id;
+ struct sctp_ifa *ifa;
+
+ /* find the ifa for the desired set primary */
+ vrf_id = stcb->asoc.vrf_id;
+ ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED);
+ if (ifa == NULL) {
+ /* Invalid address */
+ return (-1);
+ }
+ /* queue an ASCONF:SET_PRIM_ADDR to be sent */
+ if (!sctp_asconf_queue_add(stcb, ifa, SCTP_SET_PRIM_ADDR)) {
+ /* set primary queuing succeeded */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "set_primary_ip_address_sa: queued on tcb=%p, ",
+ stcb);
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+ if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) {
+#ifdef SCTP_TIMER_BASED_ASCONF
+ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF,
+ stcb->sctp_ep, stcb,
+ stcb->asoc.primary_destination);
+#else
+ sctp_send_asconf(stcb, stcb->asoc.primary_destination,
+ SCTP_ADDR_NOT_LOCKED);
+#endif
+ }
+ } else {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "set_primary_ip_address_sa: failed to add to queue on tcb=%p, ",
+ stcb);
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+ return (-1);
+ }
+ return (0);
+}
+
+void
+sctp_set_primary_ip_address(struct sctp_ifa *ifa)
+{
+ struct sctp_inpcb *inp;
+
+ /* go through all our PCB's */
+ LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) {
+ struct sctp_tcb *stcb;
+
+ /* process for all associations for this endpoint */
+ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
+ /* queue an ASCONF:SET_PRIM_ADDR to be sent */
+ if (!sctp_asconf_queue_add(stcb, ifa,
+ SCTP_SET_PRIM_ADDR)) {
+ /* set primary queuing succeeded */
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "set_primary_ip_address: queued on stcb=%p, ",
+ stcb);
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &ifa->address.sa);
+ if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) {
+#ifdef SCTP_TIMER_BASED_ASCONF
+ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF,
+ stcb->sctp_ep, stcb,
+ stcb->asoc.primary_destination);
+#else
+ sctp_send_asconf(stcb, stcb->asoc.primary_destination,
+ SCTP_ADDR_NOT_LOCKED);
+#endif
+ }
+ }
+ } /* for each stcb */
+ } /* for each inp */
+}
+
+int
+sctp_is_addr_pending(struct sctp_tcb *stcb, struct sctp_ifa *sctp_ifa)
+{
+ struct sctp_tmit_chunk *chk, *nchk;
+ unsigned int offset, asconf_limit;
+ struct sctp_asconf_chunk *acp;
+ struct sctp_asconf_paramhdr *aph;
+ uint8_t aparam_buf[SCTP_PARAM_BUFFER_SIZE];
+ struct sctp_ipv6addr_param *p_addr;
+ int add_cnt, del_cnt;
+ uint16_t last_param_type;
+
+ add_cnt = del_cnt = 0;
+ last_param_type = 0;
+ for (chk = TAILQ_FIRST(&stcb->asoc.asconf_send_queue); chk != NULL;
+ chk = nchk) {
+ /* get next chk */
+ nchk = TAILQ_NEXT(chk, sctp_next);
+
+ if (chk->data == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: No mbuf data?\n");
+ continue;
+ }
+ offset = 0;
+ acp = mtod(chk->data, struct sctp_asconf_chunk *);
+ offset += sizeof(struct sctp_asconf_chunk);
+ asconf_limit = ntohs(acp->ch.chunk_length);
+ p_addr = (struct sctp_ipv6addr_param *)sctp_m_getptr(chk->data, offset, sizeof(struct sctp_paramhdr), aparam_buf);
+ if (p_addr == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: couldn't get lookup addr!\n");
+ continue;
+ }
+ offset += ntohs(p_addr->ph.param_length);
+
+ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, sizeof(struct sctp_asconf_paramhdr), aparam_buf);
+ if (aph == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: Empty ASCONF will be sent?\n");
+ continue;
+ }
+ while (aph != NULL) {
+ unsigned int param_length, param_type;
+
+ param_type = ntohs(aph->ph.param_type);
+ param_length = ntohs(aph->ph.param_length);
+ if (offset + param_length > asconf_limit) {
+ /* parameter goes beyond end of chunk! */
+ break;
+ }
+ if (param_length > sizeof(aparam_buf)) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: param length (%u) larger than buffer size!\n", param_length);
+ break;
+ }
+ if (param_length <= sizeof(struct sctp_paramhdr)) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: param length(%u) too short\n", param_length);
+ break;
+ }
+ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, param_length, aparam_buf);
+ if (aph == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: couldn't get entire param\n");
+ break;
+ }
+ p_addr = (struct sctp_ipv6addr_param *)(aph + 1);
+ if (sctp_addr_match(p_addr, &sctp_ifa->address.sa) != 0) {
+ switch (param_type) {
+ case SCTP_ADD_IP_ADDRESS:
+ add_cnt++;
+ break;
+ case SCTP_DEL_IP_ADDRESS:
+ del_cnt++;
+ break;
+ default:
+ break;
+ }
+ last_param_type = param_type;
+ }
+ offset += SCTP_SIZE32(param_length);
+ if (offset >= asconf_limit) {
+ /* no more data in the mbuf chain */
+ break;
+ }
+ /* get pointer to next asconf param */
+ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, sizeof(struct sctp_asconf_paramhdr), aparam_buf);
+ }
+ }
+
+ /*
+ * we want to find the sequences which consist of ADD -> DEL -> ADD
+ * or DEL -> ADD
+ */
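+	/*
+	 * for example: a pending ADD followed by a DEL and another ADD for
+	 * this address gives add_cnt = 2, del_cnt = 1, so the address is
+	 * reported as pending (return 1); a lone pending DEL gives
+	 * add_cnt = 0, del_cnt = 1 and returns 0.
+	 */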
+ if (add_cnt > del_cnt ||
+ (add_cnt == del_cnt && last_param_type == SCTP_ADD_IP_ADDRESS)) {
+ return 1;
+ }
+ return 0;
+}
+
+static struct sockaddr *
+sctp_find_valid_localaddr(struct sctp_tcb *stcb, int addr_locked)
+{
+ struct sctp_vrf *vrf = NULL;
+ struct sctp_ifn *sctp_ifn;
+ struct sctp_ifa *sctp_ifa;
+
+ if (addr_locked == SCTP_ADDR_NOT_LOCKED)
+ SCTP_IPI_ADDR_RLOCK();
+ vrf = sctp_find_vrf(stcb->asoc.vrf_id);
+ if (vrf == NULL) {
+ if (addr_locked == SCTP_ADDR_NOT_LOCKED)
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (NULL);
+ }
+ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
+ if (stcb->asoc.loopback_scope == 0 &&
+ SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
+ /* Skip if loopback_scope not set */
+ continue;
+ }
+ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+ if (sctp_ifa->address.sa.sa_family == AF_INET &&
+ stcb->asoc.ipv4_addr_legal) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&sctp_ifa->address.sa;
+ if (sin->sin_addr.s_addr == 0) {
+					/* skip unspecified addresses */
+ continue;
+ }
+ if (stcb->asoc.ipv4_local_scope == 0 &&
+ IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))
+ continue;
+
+ if (sctp_is_addr_restricted(stcb, sctp_ifa) &&
+ (!sctp_is_addr_pending(stcb, sctp_ifa)))
+ continue;
+ /* found a valid local v4 address to use */
+ if (addr_locked == SCTP_ADDR_NOT_LOCKED)
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (&sctp_ifa->address.sa);
+ } else if (sctp_ifa->address.sa.sa_family == AF_INET6 &&
+ stcb->asoc.ipv6_addr_legal) {
+ struct sockaddr_in6 *sin6;
+
+ if (sctp_ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
+ continue;
+ }
+ sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa;
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+					/* we skip unspecified addresses */
+ continue;
+ }
+ if (stcb->asoc.local_scope == 0 &&
+ IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
+ continue;
+ if (stcb->asoc.site_scope == 0 &&
+ IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))
+ continue;
+
+ if (sctp_is_addr_restricted(stcb, sctp_ifa) &&
+ (!sctp_is_addr_pending(stcb, sctp_ifa)))
+ continue;
+ /* found a valid local v6 address to use */
+ if (addr_locked == SCTP_ADDR_NOT_LOCKED)
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (&sctp_ifa->address.sa);
+ }
+ }
+ }
+ /* no valid addresses found */
+ if (addr_locked == SCTP_ADDR_NOT_LOCKED)
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (NULL);
+}
+
+static struct sockaddr *
+sctp_find_valid_localaddr_ep(struct sctp_tcb *stcb)
+{
+ struct sctp_laddr *laddr;
+
+ LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) {
+ if (laddr->ifa == NULL) {
+ continue;
+ }
+ /* is the address restricted ? */
+ if (sctp_is_addr_restricted(stcb, laddr->ifa) &&
+ (!sctp_is_addr_pending(stcb, laddr->ifa)))
+ continue;
+
+ /* found a valid local address to use */
+ return (&laddr->ifa->address.sa);
+ }
+ /* no valid addresses found */
+ return (NULL);
+}
+
+/*
+ * builds an ASCONF chunk from queued ASCONF params.
+ * returns NULL on error (no mbuf, no ASCONF params queued, etc).
+ */
+struct mbuf *
+sctp_compose_asconf(struct sctp_tcb *stcb, int *retlen, int addr_locked)
+{
+ struct mbuf *m_asconf, *m_asconf_chk;
+ struct sctp_asconf_addr *aa;
+ struct sctp_asconf_chunk *acp;
+ struct sctp_asconf_paramhdr *aph;
+ struct sctp_asconf_addr_param *aap;
+ uint32_t p_length;
+ uint32_t correlation_id = 1; /* 0 is reserved... */
+ caddr_t ptr, lookup_ptr;
+ uint8_t lookup_used = 0;
+
+ /* are there any asconf params to send? */
+ TAILQ_FOREACH(aa, &stcb->asoc.asconf_queue, next) {
+ if (aa->sent == 0)
+ break;
+ }
+ if (aa == NULL)
+ return (NULL);
+
+ /*
+ * get a chunk header mbuf and a cluster for the asconf params since
+ * it's simpler to fill in the asconf chunk header lookup address on
+ * the fly
+ */
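+	/*
+	 * the chunk ends up laid out as: the ASCONF chunk header (with the
+	 * serial number), a single address parameter used as the lookup
+	 * address, then the queued ASCONF parameters; the header and lookup
+	 * address go in m_asconf_chk, the parameters in m_asconf, and the
+	 * two mbufs are chained together at the end of this function
+	 */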
+ m_asconf_chk = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_chunk), 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_asconf_chk == NULL) {
+ /* no mbuf's */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "compose_asconf: couldn't get chunk mbuf!\n");
+ return (NULL);
+ }
+ m_asconf = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_asconf == NULL) {
+ /* no mbuf's */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "compose_asconf: couldn't get mbuf!\n");
+ sctp_m_freem(m_asconf_chk);
+ return (NULL);
+ }
+ SCTP_BUF_LEN(m_asconf_chk) = sizeof(struct sctp_asconf_chunk);
+ SCTP_BUF_LEN(m_asconf) = 0;
+ acp = mtod(m_asconf_chk, struct sctp_asconf_chunk *);
+ bzero(acp, sizeof(struct sctp_asconf_chunk));
+ /* save pointers to lookup address and asconf params */
+ lookup_ptr = (caddr_t)(acp + 1); /* after the header */
+ ptr = mtod(m_asconf, caddr_t); /* beginning of cluster */
+
+ /* fill in chunk header info */
+ acp->ch.chunk_type = SCTP_ASCONF;
+ acp->ch.chunk_flags = 0;
+ acp->serial_number = htonl(stcb->asoc.asconf_seq_out);
+ stcb->asoc.asconf_seq_out++;
+
+ /* add parameters... up to smallest MTU allowed */
+ TAILQ_FOREACH(aa, &stcb->asoc.asconf_queue, next) {
+ if (aa->sent)
+ continue;
+ /* get the parameter length */
+ p_length = SCTP_SIZE32(aa->ap.aph.ph.param_length);
+ /* will it fit in current chunk? */
+ if (SCTP_BUF_LEN(m_asconf) + p_length > stcb->asoc.smallest_mtu) {
+ /* won't fit, so we're done with this chunk */
+ break;
+ }
+ /* assign (and store) a correlation id */
+ aa->ap.aph.correlation_id = correlation_id++;
+
+ /*
+		 * fill in the address if we're doing a delete; this is a
+		 * simple way for us to fill in the correlation address,
+		 * which should only be used by the peer if we're deleting
+		 * our source address and adding a new address (e.g. the
+		 * renumbering case)
+ */
+ if (lookup_used == 0 &&
+ (aa->special_del == 0) &&
+ aa->ap.aph.ph.param_type == SCTP_DEL_IP_ADDRESS) {
+ struct sctp_ipv6addr_param *lookup;
+ uint16_t p_size, addr_size;
+
+ lookup = (struct sctp_ipv6addr_param *)lookup_ptr;
+ lookup->ph.param_type =
+ htons(aa->ap.addrp.ph.param_type);
+ if (aa->ap.addrp.ph.param_type == SCTP_IPV6_ADDRESS) {
+ /* copy IPv6 address */
+ p_size = sizeof(struct sctp_ipv6addr_param);
+ addr_size = sizeof(struct in6_addr);
+ } else {
+ /* copy IPv4 address */
+ p_size = sizeof(struct sctp_ipv4addr_param);
+ addr_size = sizeof(struct in_addr);
+ }
+ lookup->ph.param_length = htons(SCTP_SIZE32(p_size));
+ memcpy(lookup->addr, &aa->ap.addrp.addr, addr_size);
+ SCTP_BUF_LEN(m_asconf_chk) += SCTP_SIZE32(p_size);
+ lookup_used = 1;
+ }
+ /* copy into current space */
+ memcpy(ptr, &aa->ap, p_length);
+
+ /* network elements and update lengths */
+ aph = (struct sctp_asconf_paramhdr *)ptr;
+ aap = (struct sctp_asconf_addr_param *)ptr;
+ /* correlation_id is transparent to peer, no htonl needed */
+ aph->ph.param_type = htons(aph->ph.param_type);
+ aph->ph.param_length = htons(aph->ph.param_length);
+ aap->addrp.ph.param_type = htons(aap->addrp.ph.param_type);
+ aap->addrp.ph.param_length = htons(aap->addrp.ph.param_length);
+
+ SCTP_BUF_LEN(m_asconf) += SCTP_SIZE32(p_length);
+ ptr += SCTP_SIZE32(p_length);
+
+ /*
+ * these params are removed off the pending list upon
+ * getting an ASCONF-ACK back from the peer, just set flag
+ */
+ aa->sent = 1;
+ }
+ /* check to see if the lookup addr has been populated yet */
+ if (lookup_used == 0) {
+ /* NOTE: if the address param is optional, can skip this... */
+ /* add any valid (existing) address... */
+ struct sctp_ipv6addr_param *lookup;
+ uint16_t p_size, addr_size;
+ struct sockaddr *found_addr;
+ caddr_t addr_ptr;
+
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL)
+ found_addr = sctp_find_valid_localaddr(stcb,
+ addr_locked);
+ else
+ found_addr = sctp_find_valid_localaddr_ep(stcb);
+
+ lookup = (struct sctp_ipv6addr_param *)lookup_ptr;
+ if (found_addr != NULL) {
+ if (found_addr->sa_family == AF_INET6) {
+ /* copy IPv6 address */
+ lookup->ph.param_type =
+ htons(SCTP_IPV6_ADDRESS);
+ p_size = sizeof(struct sctp_ipv6addr_param);
+ addr_size = sizeof(struct in6_addr);
+ addr_ptr = (caddr_t)&((struct sockaddr_in6 *)
+ found_addr)->sin6_addr;
+ } else {
+ /* copy IPv4 address */
+ lookup->ph.param_type =
+ htons(SCTP_IPV4_ADDRESS);
+ p_size = sizeof(struct sctp_ipv4addr_param);
+ addr_size = sizeof(struct in_addr);
+ addr_ptr = (caddr_t)&((struct sockaddr_in *)
+ found_addr)->sin_addr;
+ }
+ lookup->ph.param_length = htons(SCTP_SIZE32(p_size));
+ memcpy(lookup->addr, addr_ptr, addr_size);
+ SCTP_BUF_LEN(m_asconf_chk) += SCTP_SIZE32(p_size);
+ lookup_used = 1;
+ } else {
+ /* uh oh... don't have any address?? */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "compose_asconf: no lookup addr!\n");
+			/* for now, we send an IPv4 address of 0.0.0.0 */
+ lookup->ph.param_type = htons(SCTP_IPV4_ADDRESS);
+ lookup->ph.param_length = htons(SCTP_SIZE32(sizeof(struct sctp_ipv4addr_param)));
+ bzero(lookup->addr, sizeof(struct in_addr));
+ SCTP_BUF_LEN(m_asconf_chk) += SCTP_SIZE32(sizeof(struct sctp_ipv4addr_param));
+ lookup_used = 1;
+ }
+ }
+ /* chain it all together */
+ SCTP_BUF_NEXT(m_asconf_chk) = m_asconf;
+ *retlen = SCTP_BUF_LEN(m_asconf_chk) + SCTP_BUF_LEN(m_asconf);
+	acp->ch.chunk_length = htons(*retlen);
+
+ return (m_asconf_chk);
+}
+
+/*
+ * section to handle address changes before an association is up, e.g.
+ * changes during the INIT/INIT-ACK/COOKIE-ECHO handshake
+ */
+
+/*
+ * processes the (local) addresses in the INIT-ACK chunk
+ */
+static void
+sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m,
+ unsigned int offset, unsigned int length)
+{
+ struct sctp_paramhdr tmp_param, *ph;
+ uint16_t plen, ptype;
+ struct sctp_ifa *sctp_ifa;
+ struct sctp_ipv6addr_param addr_store;
+ struct sockaddr_in6 sin6;
+ struct sockaddr_in sin;
+ struct sockaddr *sa;
+ uint32_t vrf_id;
+
+ SCTPDBG(SCTP_DEBUG_ASCONF2, "processing init-ack addresses\n");
+	if (stcb == NULL) /* Unneeded check for SA */
+ return;
+
+ /* convert to upper bound */
+ length += offset;
+
+ if ((offset + sizeof(struct sctp_paramhdr)) > length) {
+ return;
+ }
+ /* init the addresses */
+ bzero(&sin6, sizeof(sin6));
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_len = sizeof(sin6);
+ sin6.sin6_port = stcb->rport;
+
+ bzero(&sin, sizeof(sin));
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_port = stcb->rport;
+
+ /* go through the addresses in the init-ack */
+ ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset,
+ sizeof(struct sctp_paramhdr), (uint8_t *) & tmp_param);
+ while (ph != NULL) {
+ ptype = ntohs(ph->param_type);
+ plen = ntohs(ph->param_length);
+ if (ptype == SCTP_IPV6_ADDRESS) {
+ struct sctp_ipv6addr_param *a6p;
+
+ /* get the entire IPv6 address param */
+ a6p = (struct sctp_ipv6addr_param *)
+ sctp_m_getptr(m, offset,
+ sizeof(struct sctp_ipv6addr_param),
+ (uint8_t *) & addr_store);
+ if (plen != sizeof(struct sctp_ipv6addr_param) ||
+ a6p == NULL) {
+ return;
+ }
+ memcpy(&sin6.sin6_addr, a6p->addr,
+ sizeof(struct in6_addr));
+ sa = (struct sockaddr *)&sin6;
+ } else if (ptype == SCTP_IPV4_ADDRESS) {
+ struct sctp_ipv4addr_param *a4p;
+
+ /* get the entire IPv4 address param */
+ a4p = (struct sctp_ipv4addr_param *)sctp_m_getptr(m, offset,
+ sizeof(struct sctp_ipv4addr_param),
+ (uint8_t *) & addr_store);
+ if (plen != sizeof(struct sctp_ipv4addr_param) ||
+ a4p == NULL) {
+ return;
+ }
+ sin.sin_addr.s_addr = a4p->addr;
+ sa = (struct sockaddr *)&sin;
+ } else {
+ goto next_addr;
+ }
+
+ /* see if this address really (still) exists */
+ if (stcb) {
+ vrf_id = stcb->asoc.vrf_id;
+ } else {
+ vrf_id = SCTP_DEFAULT_VRFID;
+ }
+ sctp_ifa = sctp_find_ifa_by_addr(sa, vrf_id,
+ SCTP_ADDR_NOT_LOCKED);
+ if (sctp_ifa == NULL) {
+ /* address doesn't exist anymore */
+ int status;
+
+ /* are ASCONFs allowed ? */
+ if ((sctp_is_feature_on(stcb->sctp_ep,
+ SCTP_PCB_FLAGS_DO_ASCONF)) &&
+ stcb->asoc.peer_supports_asconf) {
+ /* queue an ASCONF DEL_IP_ADDRESS */
+ status = sctp_asconf_queue_sa_delete(stcb, sa);
+ /*
+ * if queued ok, and in correct state, send
+ * out the ASCONF.
+ */
+ if (status == 0 &&
+ SCTP_GET_STATE(&stcb->asoc) ==
+ SCTP_STATE_OPEN) {
+#ifdef SCTP_TIMER_BASED_ASCONF
+ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF,
+ stcb->sctp_ep, stcb,
+ stcb->asoc.primary_destination);
+#else
+ sctp_send_asconf(stcb, stcb->asoc.primary_destination,
+ SCTP_ADDR_NOT_LOCKED);
+#endif
+ }
+ }
+ }
+next_addr:
+ /*
+ * Sanity check: Make sure the length isn't 0, otherwise
+ * we'll be stuck in this loop for a long time...
+ */
+ if (SCTP_SIZE32(plen) == 0) {
+ SCTP_PRINTF("process_initack_addrs: bad len (%d) type=%xh\n",
+ plen, ptype);
+ return;
+ }
+ /* get next parameter */
+ offset += SCTP_SIZE32(plen);
+ if ((offset + sizeof(struct sctp_paramhdr)) > length)
+ return;
+ ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset,
+ sizeof(struct sctp_paramhdr), (uint8_t *) & tmp_param);
+ } /* while */
+}
+
+/* FIX ME: need to verify return result for v6 address type if v6 disabled */
+/*
+ * checks to see if a specific address is in the initack address list returns
+ * 1 if found, 0 if not
+ */
+static uint32_t
+sctp_addr_in_initack(struct sctp_tcb *stcb, struct mbuf *m, uint32_t offset,
+ uint32_t length, struct sockaddr *sa)
+{
+ struct sctp_paramhdr tmp_param, *ph;
+ uint16_t plen, ptype;
+ struct sctp_ipv6addr_param addr_store;
+ struct sockaddr_in *sin;
+ struct sctp_ipv4addr_param *a4p;
+
+#ifdef INET6
+ struct sockaddr_in6 *sin6;
+ struct sctp_ipv6addr_param *a6p;
+ struct sockaddr_in6 sin6_tmp;
+
+#endif /* INET6 */
+
+ if (
+#ifdef INET6
+ (sa->sa_family != AF_INET6) &&
+#endif /* INET6 */
+ (sa->sa_family != AF_INET))
+ return (0);
+
+ SCTPDBG(SCTP_DEBUG_ASCONF2, "find_initack_addr: starting search for ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, sa);
+ /* convert to upper bound */
+ length += offset;
+
+ if ((offset + sizeof(struct sctp_paramhdr)) > length) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "find_initack_addr: invalid offset?\n");
+ return (0);
+ }
+ /* go through the addresses in the init-ack */
+ ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset,
+ sizeof(struct sctp_paramhdr), (uint8_t *) & tmp_param);
+ while (ph != NULL) {
+ ptype = ntohs(ph->param_type);
+ plen = ntohs(ph->param_length);
+#ifdef INET6
+ if (ptype == SCTP_IPV6_ADDRESS && sa->sa_family == AF_INET6) {
+ /* get the entire IPv6 address param */
+ a6p = (struct sctp_ipv6addr_param *)
+ sctp_m_getptr(m, offset,
+ sizeof(struct sctp_ipv6addr_param),
+ (uint8_t *) & addr_store);
+ if (plen != sizeof(struct sctp_ipv6addr_param) ||
+ (ph == NULL) ||
+ (a6p == NULL)) {
+ return (0);
+ }
+ sin6 = (struct sockaddr_in6 *)sa;
+ if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) {
+ /* create a copy and clear scope */
+ memcpy(&sin6_tmp, sin6,
+ sizeof(struct sockaddr_in6));
+ sin6 = &sin6_tmp;
+ in6_clearscope(&sin6->sin6_addr);
+ }
+ if (memcmp(&sin6->sin6_addr, a6p->addr,
+ sizeof(struct in6_addr)) == 0) {
+ /* found it */
+ return (1);
+ }
+ } else
+#endif /* INET6 */
+
+ if (ptype == SCTP_IPV4_ADDRESS &&
+ sa->sa_family == AF_INET) {
+ /* get the entire IPv4 address param */
+ a4p = (struct sctp_ipv4addr_param *)sctp_m_getptr(m,
+ offset, sizeof(struct sctp_ipv4addr_param),
+ (uint8_t *) & addr_store);
+ if (plen != sizeof(struct sctp_ipv4addr_param) ||
+ (ph == NULL) ||
+ (a4p == NULL)) {
+ return (0);
+ }
+ sin = (struct sockaddr_in *)sa;
+ if (sin->sin_addr.s_addr == a4p->addr) {
+ /* found it */
+ return (1);
+ }
+ }
+ /* get next parameter */
+ offset += SCTP_SIZE32(plen);
+ if (offset + sizeof(struct sctp_paramhdr) > length)
+ return (0);
+ ph = (struct sctp_paramhdr *)
+ sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr),
+ (uint8_t *) & tmp_param);
+ } /* while */
+ /* not found! */
+ return (0);
+}
+
+/*
+ * makes sure that the current endpoint local addr list is consistent with
+ * the new association (e.g. subset bound, ASCONF allowed); adds addresses
+ * as necessary
+ */
+static void
+sctp_check_address_list_ep(struct sctp_tcb *stcb, struct mbuf *m, int offset,
+ int length, struct sockaddr *init_addr)
+{
+ struct sctp_laddr *laddr;
+
+ /* go through the endpoint list */
+ LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) {
+		/* be paranoid and validate the laddr */
+		if (laddr->ifa == NULL) {
+			SCTPDBG(SCTP_DEBUG_ASCONF1,
+			    "check_addr_list_ep: laddr->ifa is NULL\n");
+			continue;
+		}
+ /* do i have it implicitly? */
+ if (sctp_cmpaddr(&laddr->ifa->address.sa, init_addr)) {
+ continue;
+ }
+ /* check to see if in the init-ack */
+ if (!sctp_addr_in_initack(stcb, m, offset, length,
+ &laddr->ifa->address.sa)) {
+ /* try to add it */
+ sctp_addr_mgmt_assoc(stcb->sctp_ep, stcb, laddr->ifa,
+ SCTP_ADD_IP_ADDRESS, SCTP_ADDR_NOT_LOCKED);
+ }
+ }
+}
+
+/*
+ * makes sure that the current kernel address list is consistent with the
+ * new association (with all addrs bound); adds addresses as necessary
+ */
+static void
+sctp_check_address_list_all(struct sctp_tcb *stcb, struct mbuf *m, int offset,
+ int length, struct sockaddr *init_addr,
+ uint16_t local_scope, uint16_t site_scope,
+ uint16_t ipv4_scope, uint16_t loopback_scope)
+{
+ struct sctp_vrf *vrf = NULL;
+ struct sctp_ifn *sctp_ifn;
+ struct sctp_ifa *sctp_ifa;
+ uint32_t vrf_id;
+
+ if (stcb) {
+ vrf_id = stcb->asoc.vrf_id;
+ } else {
+ return;
+ }
+ SCTP_IPI_ADDR_RLOCK();
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL) {
+ SCTP_IPI_ADDR_RUNLOCK();
+ return;
+ }
+ /* go through all our known interfaces */
+ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
+ if (loopback_scope == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
+ /* skip loopback interface */
+ continue;
+ }
+ /* go through each interface address */
+ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+ /* do i have it implicitly? */
+ if (sctp_cmpaddr(&sctp_ifa->address.sa, init_addr)) {
+ continue;
+ }
+ /* check to see if in the init-ack */
+ if (!sctp_addr_in_initack(stcb, m, offset, length,
+ &sctp_ifa->address.sa)) {
+ /* try to add it */
+ sctp_addr_mgmt_assoc(stcb->sctp_ep, stcb,
+ sctp_ifa, SCTP_ADD_IP_ADDRESS,
+ SCTP_ADDR_LOCKED);
+ }
+ } /* end foreach ifa */
+ } /* end foreach ifn */
+ SCTP_IPI_ADDR_RUNLOCK();
+}
+
+/*
+ * validates an init-ack chunk (from a cookie-echo) against the current
+ * addresses: adds addresses from the init-ack into our local address list
+ * if needed, queues asconf adds/deletes as needed, and makes the
+ * appropriate list changes for source address selection.
+ * m, offset: points to the start of the address list in an init-ack chunk
+ * length: total length of the address params only
+ * init_addr: address where my INIT-ACK was sent from
+ */
+void
+sctp_check_address_list(struct sctp_tcb *stcb, struct mbuf *m, int offset,
+ int length, struct sockaddr *init_addr,
+ uint16_t local_scope, uint16_t site_scope,
+ uint16_t ipv4_scope, uint16_t loopback_scope)
+{
+ /* process the local addresses in the initack */
+ sctp_process_initack_addresses(stcb, m, offset, length);
+
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ /* bound all case */
+ sctp_check_address_list_all(stcb, m, offset, length, init_addr,
+ local_scope, site_scope, ipv4_scope, loopback_scope);
+ } else {
+ /* subset bound case */
+ if (sctp_is_feature_on(stcb->sctp_ep,
+ SCTP_PCB_FLAGS_DO_ASCONF)) {
+ /* asconf's allowed */
+ sctp_check_address_list_ep(stcb, m, offset, length,
+ init_addr);
+ }
+ /* else, no asconfs allowed, so what we sent is what we get */
+ }
+}
+
+/*
+ * sctp_bindx() support
+ */
+uint32_t
+sctp_addr_mgmt_ep_sa(struct sctp_inpcb *inp, struct sockaddr *sa,
+ uint32_t type, uint32_t vrf_id, struct sctp_ifa *sctp_ifap)
+{
+ struct sctp_ifa *ifa;
+
+ if (sa->sa_len == 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EINVAL);
+ return (EINVAL);
+ }
+ if (sctp_ifap) {
+ ifa = sctp_ifap;
+ } else if (type == SCTP_ADD_IP_ADDRESS) {
+ /* For an add the address MUST be on the system */
+ ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED);
+ } else if (type == SCTP_DEL_IP_ADDRESS) {
+ /* For a delete we need to find it in the inp */
+ ifa = sctp_find_ifa_in_ep(inp, sa, SCTP_ADDR_NOT_LOCKED);
+ } else {
+ ifa = NULL;
+ }
+ if (ifa != NULL) {
+ if (type == SCTP_ADD_IP_ADDRESS) {
+ sctp_add_local_addr_ep(inp, ifa, type);
+ } else if (type == SCTP_DEL_IP_ADDRESS) {
+ struct sctp_laddr *laddr;
+
+ if (inp->laddr_count < 2) {
+ /* can't delete the last local address */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EINVAL);
+ return (EINVAL);
+ }
+ LIST_FOREACH(laddr, &inp->sctp_addr_list,
+ sctp_nxt_addr) {
+ if (ifa == laddr->ifa) {
+ /* Mark in the delete */
+ laddr->action = type;
+ }
+ }
+ }
+ if (!LIST_EMPTY(&inp->sctp_asoc_list)) {
+ /*
+ * There is no need to start the iterator if the inp
+ * has no associations.
+ */
+ struct sctp_asconf_iterator *asc;
+ struct sctp_laddr *wi;
+
+ SCTP_MALLOC(asc, struct sctp_asconf_iterator *,
+ sizeof(struct sctp_asconf_iterator),
+ SCTP_M_ASC_IT);
+ if (asc == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, ENOMEM);
+ return (ENOMEM);
+ }
+ wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
+ if (wi == NULL) {
+ SCTP_FREE(asc, SCTP_M_ASC_IT);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, ENOMEM);
+ return (ENOMEM);
+ }
+ LIST_INIT(&asc->list_of_work);
+ asc->cnt = 1;
+ SCTP_INCR_LADDR_COUNT();
+ wi->ifa = ifa;
+ wi->action = type;
+ atomic_add_int(&ifa->refcount, 1);
+ LIST_INSERT_HEAD(&asc->list_of_work, wi, sctp_nxt_addr);
+ (void)sctp_initiate_iterator(sctp_asconf_iterator_ep,
+ sctp_asconf_iterator_stcb,
+ sctp_asconf_iterator_ep_end,
+ SCTP_PCB_ANY_FLAGS,
+ SCTP_PCB_ANY_FEATURES,
+ SCTP_ASOC_ANY_STATE,
+ (void *)asc, 0,
+ sctp_asconf_iterator_end, inp, 0);
+ }
+ return (0);
+ } else {
+ /* invalid address! */
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EADDRNOTAVAIL);
+ return (EADDRNOTAVAIL);
+ }
+}
+
+void
+sctp_asconf_send_nat_state_update(struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ struct sctp_asconf_addr *aa;
+ struct sctp_ifa *sctp_ifap;
+ struct sctp_asconf_tag_param *vtag;
+ struct sockaddr_in *to;
+
+#ifdef INET6
+ struct sockaddr_in6 *to6;
+
+#endif
+ if (net == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: Missing net\n");
+ return;
+ }
+ if (stcb == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: Missing stcb\n");
+ return;
+ }
+ /*
+ * Need to have in the asconf: - vtagparam(my_vtag/peer_vtag) -
+ * add(0.0.0.0) - del(0.0.0.0) - Any global addresses add(addr)
+ */
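+	/*
+	 * per RFC 5061 the wildcard address stands for the source address
+	 * of the packet carrying the ASCONF: the ADD asks the peer to add
+	 * our (possibly NAT-rewritten) source address and the DELETE asks it
+	 * to drop every address of ours except that source address, after
+	 * which any global addresses we still own are re-added explicitly
+	 * below
+	 */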
+ SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa),
+ SCTP_M_ASC_ADDR);
+ if (aa == NULL) {
+ /* didn't get memory */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "sctp_asconf_send_nat_state_update: failed to get memory!\n");
+ return;
+ }
+ aa->special_del = 0;
+ /* fill in asconf address parameter fields */
+ /* top level elements are "networked" during send */
+ aa->ifa = NULL;
+ aa->sent = 0; /* clear sent flag */
+ vtag = (struct sctp_asconf_tag_param *)&aa->ap.aph;
+ vtag->aph.ph.param_type = SCTP_NAT_VTAGS;
+ vtag->aph.ph.param_length = sizeof(struct sctp_asconf_tag_param);
+ vtag->local_vtag = htonl(stcb->asoc.my_vtag);
+ vtag->remote_vtag = htonl(stcb->asoc.peer_vtag);
+ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next);
+
+ SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa),
+ SCTP_M_ASC_ADDR);
+ if (aa == NULL) {
+ /* didn't get memory */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "sctp_asconf_send_nat_state_update: failed to get memory!\n");
+ return;
+ }
+ memset(aa, 0, sizeof(struct sctp_asconf_addr));
+ /* fill in asconf address parameter fields */
+ /* ADD(0.0.0.0) */
+ if (net->ro._l_addr.sa.sa_family == AF_INET) {
+ aa->ap.aph.ph.param_type = SCTP_ADD_IP_ADDRESS;
+ aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addrv4_param);
+ aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS;
+ aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv4addr_param);
+ /* No need to add an address, we are using 0.0.0.0 */
+ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next);
+ }
+#ifdef INET6
+ else if (net->ro._l_addr.sa.sa_family == AF_INET6) {
+ aa->ap.aph.ph.param_type = SCTP_ADD_IP_ADDRESS;
+ aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addr_param);
+ aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS;
+ aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv6addr_param);
+ /* No need to add an address, we are using 0.0.0.0 */
+ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next);
+ }
+#endif /* INET6 */
+ SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa),
+ SCTP_M_ASC_ADDR);
+ if (aa == NULL) {
+ /* didn't get memory */
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "sctp_asconf_send_nat_state_update: failed to get memory!\n");
+ return;
+ }
+ memset(aa, 0, sizeof(struct sctp_asconf_addr));
+ /* fill in asconf address parameter fields */
+	/* DEL(0.0.0.0) */
+	if (net->ro._l_addr.sa.sa_family == AF_INET) {
+		aa->ap.aph.ph.param_type = SCTP_DEL_IP_ADDRESS;
+		aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addrv4_param);
+		aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS;
+		aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv4addr_param);
+		/* No need to fill in an address, we are using 0.0.0.0 */
+		TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next);
+	}
+#ifdef INET6
+ else if (net->ro._l_addr.sa.sa_family == AF_INET6) {
+ aa->ap.aph.ph.param_type = SCTP_DEL_IP_ADDRESS;
+ aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addr_param);
+ aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS;
+ aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv6addr_param);
+ /* No need to add an address, we are using 0.0.0.0 */
+ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next);
+ }
+#endif /* INET6 */
+ /* Now we must hunt the addresses and add all global addresses */
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ struct sctp_vrf *vrf = NULL;
+ struct sctp_ifn *sctp_ifnp;
+ uint32_t vrf_id;
+
+ vrf_id = stcb->sctp_ep->def_vrf_id;
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL) {
+ goto skip_rest;
+ }
+ SCTP_IPI_ADDR_RLOCK();
+ LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) {
+ LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) {
+ if (sctp_ifap->address.sa.sa_family == AF_INET) {
+ to = &sctp_ifap->address.sin;
+
+ if (IN4_ISPRIVATE_ADDRESS(&to->sin_addr)) {
+ continue;
+ }
+ if (IN4_ISLOOPBACK_ADDRESS(&to->sin_addr)) {
+ continue;
+ }
+ }
+#ifdef INET6
+ else if (sctp_ifap->address.sa.sa_family == AF_INET6) {
+ to6 = &sctp_ifap->address.sin6;
+ if (IN6_IS_ADDR_LOOPBACK(&to6->sin6_addr)) {
+ continue;
+ }
+ if (IN6_IS_ADDR_LINKLOCAL(&to6->sin6_addr)) {
+ continue;
+ }
+ }
+#endif
+ sctp_asconf_queue_mgmt(stcb, sctp_ifap, SCTP_ADD_IP_ADDRESS);
+ }
+ }
+ SCTP_IPI_ADDR_RUNLOCK();
+ } else {
+ struct sctp_laddr *laddr;
+
+ LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) {
+ if (laddr->ifa == NULL) {
+ continue;
+ }
+			if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) {
+				/*
+				 * Address being deleted by the system, don't
+				 * list.
+				 */
+				continue;
+			}
+ if (laddr->action == SCTP_DEL_IP_ADDRESS) {
+ /*
+ * Address being deleted on this ep don't
+ * list.
+ */
+ continue;
+ }
+ sctp_ifap = laddr->ifa;
+ if (sctp_ifap->address.sa.sa_family == AF_INET) {
+ to = &sctp_ifap->address.sin;
+
+ if (IN4_ISPRIVATE_ADDRESS(&to->sin_addr)) {
+ continue;
+ }
+ if (IN4_ISLOOPBACK_ADDRESS(&to->sin_addr)) {
+ continue;
+ }
+ }
+#ifdef INET6
+ else if (sctp_ifap->address.sa.sa_family == AF_INET6) {
+ to6 = &sctp_ifap->address.sin6;
+ if (IN6_IS_ADDR_LOOPBACK(&to6->sin6_addr)) {
+ continue;
+ }
+ if (IN6_IS_ADDR_LINKLOCAL(&to6->sin6_addr)) {
+ continue;
+ }
+ }
+#endif
+ sctp_asconf_queue_mgmt(stcb, sctp_ifap, SCTP_ADD_IP_ADDRESS);
+ }
+ }
+skip_rest:
+ /* Now we must send the asconf into the queue */
+ sctp_send_asconf(stcb, net, 0);
+}
diff --git a/freebsd/sys/netinet/sctp_asconf.h b/freebsd/sys/netinet/sctp_asconf.h
new file mode 100644
index 00000000..ff8cf378
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_asconf.h
@@ -0,0 +1,96 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_asconf.h,v 1.8 2005/03/06 16:04:16 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef _NETINET_SCTP_ASCONF_HH_
+#define _NETINET_SCTP_ASCONF_HH_
+
+#if defined(_KERNEL) || defined(__Userspace__)
+
+/*
+ * function prototypes
+ */
+extern void sctp_asconf_cleanup(struct sctp_tcb *, struct sctp_nets *);
+
+extern struct mbuf *sctp_compose_asconf(struct sctp_tcb *, int *, int);
+
+extern void
+sctp_handle_asconf(struct mbuf *, unsigned int, struct sctp_asconf_chunk *,
+ struct sctp_tcb *, int i);
+
+extern void
+sctp_handle_asconf_ack(struct mbuf *, int, struct sctp_asconf_ack_chunk *,
+ struct sctp_tcb *, struct sctp_nets *, int *);
+
+extern uint32_t
+sctp_addr_mgmt_ep_sa(struct sctp_inpcb *, struct sockaddr *,
+ uint32_t, uint32_t, struct sctp_ifa *);
+
+
+extern int
+sctp_asconf_iterator_ep(struct sctp_inpcb *inp, void *ptr,
+ uint32_t val);
+extern void
+sctp_asconf_iterator_stcb(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ void *ptr, uint32_t type);
+extern void sctp_asconf_iterator_end(void *ptr, uint32_t val);
+
+
+extern int32_t
+sctp_set_primary_ip_address_sa(struct sctp_tcb *,
+ struct sockaddr *);
+
+extern void
+ sctp_set_primary_ip_address(struct sctp_ifa *ifa);
+
+extern void
+sctp_check_address_list(struct sctp_tcb *, struct mbuf *, int, int,
+ struct sockaddr *, uint16_t, uint16_t, uint16_t, uint16_t);
+
+extern void
+ sctp_assoc_immediate_retrans(struct sctp_tcb *, struct sctp_nets *);
+extern void
+ sctp_net_immediate_retrans(struct sctp_tcb *, struct sctp_nets *);
+
+extern void
+sctp_asconf_send_nat_state_update(struct sctp_tcb *stcb,
+ struct sctp_nets *net);
+
+extern int
+ sctp_is_addr_pending(struct sctp_tcb *, struct sctp_ifa *);
+
+#endif /* _KERNEL || __Userspace__ */
+
+#endif /* !_NETINET_SCTP_ASCONF_HH_ */
diff --git a/freebsd/sys/netinet/sctp_auth.c b/freebsd/sys/netinet/sctp_auth.c
new file mode 100644
index 00000000..6c2bf908
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_auth.c
@@ -0,0 +1,2128 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp.h>
+#include <freebsd/netinet/sctp_header.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctp_var.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_indata.h>
+#include <freebsd/netinet/sctp_output.h>
+#include <freebsd/netinet/sctp_auth.h>
+
+#ifdef SCTP_DEBUG
+#define SCTP_AUTH_DEBUG (SCTP_BASE_SYSCTL(sctp_debug_on) & SCTP_DEBUG_AUTH1)
+#define SCTP_AUTH_DEBUG2 (SCTP_BASE_SYSCTL(sctp_debug_on) & SCTP_DEBUG_AUTH2)
+#endif /* SCTP_DEBUG */
+
+
+void
+sctp_clear_chunklist(sctp_auth_chklist_t * chklist)
+{
+ bzero(chklist, sizeof(*chklist));
+ /* chklist->num_chunks = 0; */
+}
+
+sctp_auth_chklist_t *
+sctp_alloc_chunklist(void)
+{
+ sctp_auth_chklist_t *chklist;
+
+ SCTP_MALLOC(chklist, sctp_auth_chklist_t *, sizeof(*chklist),
+ SCTP_M_AUTH_CL);
+ if (chklist == NULL) {
+ SCTPDBG(SCTP_DEBUG_AUTH1, "sctp_alloc_chunklist: failed to get memory!\n");
+ } else {
+ sctp_clear_chunklist(chklist);
+ }
+ return (chklist);
+}
+
+void
+sctp_free_chunklist(sctp_auth_chklist_t * list)
+{
+ if (list != NULL)
+ SCTP_FREE(list, SCTP_M_AUTH_CL);
+}
+
+sctp_auth_chklist_t *
+sctp_copy_chunklist(sctp_auth_chklist_t * list)
+{
+ sctp_auth_chklist_t *new_list;
+
+ if (list == NULL)
+ return (NULL);
+
+ /* get a new list */
+ new_list = sctp_alloc_chunklist();
+ if (new_list == NULL)
+ return (NULL);
+ /* copy it */
+ bcopy(list, new_list, sizeof(*new_list));
+
+ return (new_list);
+}
+
+
+/*
+ * add a chunk to the required chunks list
+ */
+int
+sctp_auth_add_chunk(uint8_t chunk, sctp_auth_chklist_t * list)
+{
+ if (list == NULL)
+ return (-1);
+
+ /* is chunk restricted? */
+ if ((chunk == SCTP_INITIATION) ||
+ (chunk == SCTP_INITIATION_ACK) ||
+ (chunk == SCTP_SHUTDOWN_COMPLETE) ||
+ (chunk == SCTP_AUTHENTICATION)) {
+ return (-1);
+ }
+ if (list->chunks[chunk] == 0) {
+ list->chunks[chunk] = 1;
+ list->num_chunks++;
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP: added chunk %u (0x%02x) to Auth list\n",
+ chunk, chunk);
+ }
+ return (0);
+}
+
+/*
+ * delete a chunk from the required chunks list
+ */
+int
+sctp_auth_delete_chunk(uint8_t chunk, sctp_auth_chklist_t * list)
+{
+ if (list == NULL)
+ return (-1);
+
+ /* is chunk restricted? */
+ if ((chunk == SCTP_ASCONF) ||
+ (chunk == SCTP_ASCONF_ACK)) {
+ return (-1);
+ }
+ if (list->chunks[chunk] == 1) {
+ list->chunks[chunk] = 0;
+ list->num_chunks--;
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP: deleted chunk %u (0x%02x) from Auth list\n",
+ chunk, chunk);
+ }
+ return (0);
+}
+
+size_t
+sctp_auth_get_chklist_size(const sctp_auth_chklist_t * list)
+{
+ if (list == NULL)
+ return (0);
+ else
+ return (list->num_chunks);
+}
+
+/*
+ * set the default list of chunks requiring AUTH
+ */
+void
+sctp_auth_set_default_chunks(sctp_auth_chklist_t * list)
+{
+ (void)sctp_auth_add_chunk(SCTP_ASCONF, list);
+ (void)sctp_auth_add_chunk(SCTP_ASCONF_ACK, list);
+}
+
+/*
+ * return the current number and list of required chunks; the caller must
+ * guarantee that ptr has space for up to 256 bytes
+ */
+int
+sctp_serialize_auth_chunks(const sctp_auth_chklist_t * list, uint8_t * ptr)
+{
+ int i, count = 0;
+
+ if (list == NULL)
+ return (0);
+
+ for (i = 0; i < 256; i++) {
+ if (list->chunks[i] != 0) {
+ *ptr++ = i;
+ count++;
+ }
+ }
+ return (count);
+}
+
+int
+sctp_pack_auth_chunks(const sctp_auth_chklist_t * list, uint8_t * ptr)
+{
+ int i, size = 0;
+
+ if (list == NULL)
+ return (0);
+
+ if (list->num_chunks <= 32) {
+ /* just list them, one byte each */
+ for (i = 0; i < 256; i++) {
+ if (list->chunks[i] != 0) {
+ *ptr++ = i;
+ size++;
+ }
+ }
+ } else {
+ int index, offset;
+
+ /* pack into a 32 byte bitfield */
+ for (i = 0; i < 256; i++) {
+ if (list->chunks[i] != 0) {
+ index = i / 8;
+ offset = i % 8;
+ ptr[index] |= (1 << offset);
+ }
+ }
+ size = 32;
+ }
+ return (size);
+}
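+
+/*
+ * example of the two encodings used by sctp_pack_auth_chunks() and
+ * sctp_unpack_auth_chunks(): with only ASCONF (0xc1) and ASCONF-ACK (0x80)
+ * required, num_chunks is 2 and the types are emitted as the two bytes
+ * 0x80 0xc1; with more than 32 chunk types the ASCONF bit instead lands in
+ * the 32-byte bitfield as ptr[24] |= 0x02 (0xc1 / 8 = 24, 0xc1 % 8 = 1).
+ */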
+
+int
+sctp_unpack_auth_chunks(const uint8_t * ptr, uint8_t num_chunks,
+ sctp_auth_chklist_t * list)
+{
+ int i;
+ int size;
+
+ if (list == NULL)
+ return (0);
+
+ if (num_chunks <= 32) {
+ /* just pull them, one byte each */
+ for (i = 0; i < num_chunks; i++) {
+ (void)sctp_auth_add_chunk(*ptr++, list);
+ }
+ size = num_chunks;
+ } else {
+ int index, offset;
+
+ /* unpack from a 32 byte bitfield */
+ for (index = 0; index < 32; index++) {
+ for (offset = 0; offset < 8; offset++) {
+ if (ptr[index] & (1 << offset)) {
+ (void)sctp_auth_add_chunk((index * 8) + offset, list);
+ }
+ }
+ }
+ size = 32;
+ }
+ return (size);
+}
+
+
+/*
+ * allocate structure space for a key of length keylen
+ */
+sctp_key_t *
+sctp_alloc_key(uint32_t keylen)
+{
+ sctp_key_t *new_key;
+
+ SCTP_MALLOC(new_key, sctp_key_t *, sizeof(*new_key) + keylen,
+ SCTP_M_AUTH_KY);
+ if (new_key == NULL) {
+ /* out of memory */
+ return (NULL);
+ }
+ new_key->keylen = keylen;
+ return (new_key);
+}
+
+void
+sctp_free_key(sctp_key_t * key)
+{
+ if (key != NULL)
+ SCTP_FREE(key, SCTP_M_AUTH_KY);
+}
+
+void
+sctp_print_key(sctp_key_t * key, const char *str)
+{
+ uint32_t i;
+
+ if (key == NULL) {
+ printf("%s: [Null key]\n", str);
+ return;
+ }
+ printf("%s: len %u, ", str, key->keylen);
+ if (key->keylen) {
+ for (i = 0; i < key->keylen; i++)
+ printf("%02x", key->key[i]);
+ printf("\n");
+ } else {
+ printf("[Null key]\n");
+ }
+}
+
+void
+sctp_show_key(sctp_key_t * key, const char *str)
+{
+ uint32_t i;
+
+ if (key == NULL) {
+ printf("%s: [Null key]\n", str);
+ return;
+ }
+ printf("%s: len %u, ", str, key->keylen);
+ if (key->keylen) {
+ for (i = 0; i < key->keylen; i++)
+ printf("%02x", key->key[i]);
+ printf("\n");
+ } else {
+ printf("[Null key]\n");
+ }
+}
+
+static uint32_t
+sctp_get_keylen(sctp_key_t * key)
+{
+ if (key != NULL)
+ return (key->keylen);
+ else
+ return (0);
+}
+
+/*
+ * generate a new random key of length 'keylen'
+ */
+sctp_key_t *
+sctp_generate_random_key(uint32_t keylen)
+{
+ sctp_key_t *new_key;
+
+ /* validate keylen */
+ if (keylen > SCTP_AUTH_RANDOM_SIZE_MAX)
+ keylen = SCTP_AUTH_RANDOM_SIZE_MAX;
+
+ new_key = sctp_alloc_key(keylen);
+ if (new_key == NULL) {
+ /* out of memory */
+ return (NULL);
+ }
+ SCTP_READ_RANDOM(new_key->key, keylen);
+ new_key->keylen = keylen;
+ return (new_key);
+}
+
+sctp_key_t *
+sctp_set_key(uint8_t * key, uint32_t keylen)
+{
+ sctp_key_t *new_key;
+
+ new_key = sctp_alloc_key(keylen);
+ if (new_key == NULL) {
+ /* out of memory */
+ return (NULL);
+ }
+ bcopy(key, new_key->key, keylen);
+ return (new_key);
+}
+
+/*-
+ * given two keys of variable size, compute which key is "larger/smaller"
+ * returns: 1 if key1 > key2
+ * -1 if key1 < key2
+ * 0 if key1 = key2
+ */
+static int
+sctp_compare_key(sctp_key_t * key1, sctp_key_t * key2)
+{
+ uint32_t maxlen;
+ uint32_t i;
+ uint32_t key1len, key2len;
+ uint8_t *key_1, *key_2;
+ uint8_t temp[SCTP_AUTH_RANDOM_SIZE_MAX];
+
+ /* sanity/length check */
+ key1len = sctp_get_keylen(key1);
+ key2len = sctp_get_keylen(key2);
+ if ((key1len == 0) && (key2len == 0))
+ return (0);
+ else if (key1len == 0)
+ return (-1);
+ else if (key2len == 0)
+ return (1);
+
+ if (key1len != key2len) {
+ if (key1len >= key2len)
+ maxlen = key1len;
+ else
+ maxlen = key2len;
+ bzero(temp, maxlen);
+ if (key1len < maxlen) {
+ /* prepend zeroes to key1 */
+ bcopy(key1->key, temp + (maxlen - key1len), key1len);
+ key_1 = temp;
+ key_2 = key2->key;
+ } else {
+ /* prepend zeroes to key2 */
+ bcopy(key2->key, temp + (maxlen - key2len), key2len);
+ key_1 = key1->key;
+ key_2 = temp;
+ }
+ } else {
+ maxlen = key1len;
+ key_1 = key1->key;
+ key_2 = key2->key;
+ }
+
+ for (i = 0; i < maxlen; i++) {
+ if (*key_1 > *key_2)
+ return (1);
+ else if (*key_1 < *key_2)
+ return (-1);
+ key_1++;
+ key_2++;
+ }
+
+ /* keys are equal value, so check lengths */
+ if (key1len == key2len)
+ return (0);
+ else if (key1len < key2len)
+ return (-1);
+ else
+ return (1);
+}
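+
+/*-
+ * Example of the comparison above: key1 = { 0x01 } and key2 = { 0x00, 0x01 }
+ * compare as equal byte strings once key1 is left-padded with zeroes to
+ * { 0x00, 0x01 }, so the tie is broken by length and -1 (key1 < key2) is
+ * returned.
+ */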
+
+/*
+ * generate the concatenated keying material based on the two keys and the
+ * shared key (if available). draft-ietf-tsvwg-auth specifies the order
+ * of concatenation
+ */
+sctp_key_t *
+sctp_compute_hashkey(sctp_key_t * key1, sctp_key_t * key2, sctp_key_t * shared)
+{
+ uint32_t keylen;
+ sctp_key_t *new_key;
+ uint8_t *key_ptr;
+
+ keylen = sctp_get_keylen(key1) + sctp_get_keylen(key2) +
+ sctp_get_keylen(shared);
+
+ if (keylen > 0) {
+ /* get space for the new key */
+ new_key = sctp_alloc_key(keylen);
+ if (new_key == NULL) {
+ /* out of memory */
+ return (NULL);
+ }
+ new_key->keylen = keylen;
+ key_ptr = new_key->key;
+ } else {
+ /* all keys empty/null?! */
+ return (NULL);
+ }
+
+ /* concatenate the keys */
+ if (sctp_compare_key(key1, key2) <= 0) {
+ /* key is shared + key1 + key2 */
+ if (sctp_get_keylen(shared)) {
+ bcopy(shared->key, key_ptr, shared->keylen);
+ key_ptr += shared->keylen;
+ }
+ if (sctp_get_keylen(key1)) {
+ bcopy(key1->key, key_ptr, key1->keylen);
+ key_ptr += key1->keylen;
+ }
+ if (sctp_get_keylen(key2)) {
+ bcopy(key2->key, key_ptr, key2->keylen);
+ key_ptr += key2->keylen;
+ }
+ } else {
+ /* key is shared + key2 + key1 */
+ if (sctp_get_keylen(shared)) {
+ bcopy(shared->key, key_ptr, shared->keylen);
+ key_ptr += shared->keylen;
+ }
+ if (sctp_get_keylen(key2)) {
+ bcopy(key2->key, key_ptr, key2->keylen);
+ key_ptr += key2->keylen;
+ }
+ if (sctp_get_keylen(key1)) {
+ bcopy(key1->key, key_ptr, key1->keylen);
+ key_ptr += key1->keylen;
+ }
+ }
+ return (new_key);
+}
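+
+/*-
+ * Example of the ordering above: with key1 = { 0x42 } and key2 = { 0x41 },
+ * sctp_compare_key(key1, key2) returns 1, so the else branch produces
+ * shared + key2 + key1.  The numerically smaller of the two random keys is
+ * therefore always concatenated first, after any shared key, which lets
+ * both endpoints derive the same association key regardless of which
+ * random value they consider local.
+ */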
+
+
+sctp_sharedkey_t *
+sctp_alloc_sharedkey(void)
+{
+ sctp_sharedkey_t *new_key;
+
+ SCTP_MALLOC(new_key, sctp_sharedkey_t *, sizeof(*new_key),
+ SCTP_M_AUTH_KY);
+ if (new_key == NULL) {
+ /* out of memory */
+ return (NULL);
+ }
+ new_key->keyid = 0;
+ new_key->key = NULL;
+ new_key->refcount = 1;
+ new_key->deactivated = 0;
+ return (new_key);
+}
+
+void
+sctp_free_sharedkey(sctp_sharedkey_t * skey)
+{
+ if (skey == NULL)
+ return;
+
+ if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&skey->refcount)) {
+ if (skey->key != NULL)
+ sctp_free_key(skey->key);
+ SCTP_FREE(skey, SCTP_M_AUTH_KY);
+ }
+}
+
+sctp_sharedkey_t *
+sctp_find_sharedkey(struct sctp_keyhead *shared_keys, uint16_t key_id)
+{
+ sctp_sharedkey_t *skey;
+
+ LIST_FOREACH(skey, shared_keys, next) {
+ if (skey->keyid == key_id)
+ return (skey);
+ }
+ return (NULL);
+}
+
+int
+sctp_insert_sharedkey(struct sctp_keyhead *shared_keys,
+ sctp_sharedkey_t * new_skey)
+{
+ sctp_sharedkey_t *skey;
+
+ if ((shared_keys == NULL) || (new_skey == NULL))
+ return (EINVAL);
+
+ /* insert into an empty list? */
+ if (LIST_EMPTY(shared_keys)) {
+ LIST_INSERT_HEAD(shared_keys, new_skey, next);
+ return (0);
+ }
+ /* insert into the existing list, ordered by key id */
+ LIST_FOREACH(skey, shared_keys, next) {
+ if (new_skey->keyid < skey->keyid) {
+ /* insert it before here */
+ LIST_INSERT_BEFORE(skey, new_skey, next);
+ return (0);
+ } else if (new_skey->keyid == skey->keyid) {
+ /* replace the existing key */
+ /* verify this key *can* be replaced */
+ if ((skey->deactivated) && (skey->refcount > 1)) {
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "can't replace shared key id %u\n",
+ new_skey->keyid);
+ return (EBUSY);
+ }
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "replacing shared key id %u\n",
+ new_skey->keyid);
+ LIST_INSERT_BEFORE(skey, new_skey, next);
+ LIST_REMOVE(skey, next);
+ sctp_free_sharedkey(skey);
+ return (0);
+ }
+ if (LIST_NEXT(skey, next) == NULL) {
+ /* belongs at the end of the list */
+ LIST_INSERT_AFTER(skey, new_skey, next);
+ return (0);
+ }
+ }
+ /* shouldn't reach here */
+ return (0);
+}
+
+void
+sctp_auth_key_acquire(struct sctp_tcb *stcb, uint16_t key_id)
+{
+ sctp_sharedkey_t *skey;
+
+ /* find the shared key */
+ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, key_id);
+
+ /* bump the ref count */
+ if (skey) {
+ atomic_add_int(&skey->refcount, 1);
+ SCTPDBG(SCTP_DEBUG_AUTH2,
+ "%s: stcb %p key %u refcount acquire to %d\n",
+ __FUNCTION__, stcb, key_id, skey->refcount);
+ }
+}
+
+void
+sctp_auth_key_release(struct sctp_tcb *stcb, uint16_t key_id)
+{
+ sctp_sharedkey_t *skey;
+
+ /* find the shared key */
+ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, key_id);
+
+ /* decrement the ref count */
+ if (skey) {
+ sctp_free_sharedkey(skey);
+ SCTPDBG(SCTP_DEBUG_AUTH2,
+ "%s: stcb %p key %u refcount release to %d\n",
+ __FUNCTION__, stcb, key_id, skey->refcount);
+
+ /* see if a notification should be generated */
+ if ((skey->refcount <= 1) && (skey->deactivated)) {
+ /* notify ULP that key is no longer used */
+ sctp_ulp_notify(SCTP_NOTIFY_AUTH_FREE_KEY, stcb,
+ key_id, 0, SCTP_SO_LOCKED);
+ SCTPDBG(SCTP_DEBUG_AUTH2,
+ "%s: stcb %p key %u no longer used, %d\n",
+ __FUNCTION__, stcb, key_id, skey->refcount);
+ }
+ }
+}
+
+static sctp_sharedkey_t *
+sctp_copy_sharedkey(const sctp_sharedkey_t * skey)
+{
+ sctp_sharedkey_t *new_skey;
+
+ if (skey == NULL)
+ return (NULL);
+ new_skey = sctp_alloc_sharedkey();
+ if (new_skey == NULL)
+ return (NULL);
+ if (skey->key != NULL)
+ new_skey->key = sctp_set_key(skey->key->key, skey->key->keylen);
+ else
+ new_skey->key = NULL;
+ new_skey->keyid = skey->keyid;
+ return (new_skey);
+}
+
+int
+sctp_copy_skeylist(const struct sctp_keyhead *src, struct sctp_keyhead *dest)
+{
+ sctp_sharedkey_t *skey, *new_skey;
+ int count = 0;
+
+ if ((src == NULL) || (dest == NULL))
+ return (0);
+ LIST_FOREACH(skey, src, next) {
+ new_skey = sctp_copy_sharedkey(skey);
+ if (new_skey != NULL) {
+ (void)sctp_insert_sharedkey(dest, new_skey);
+ count++;
+ }
+ }
+ return (count);
+}
+
+
+sctp_hmaclist_t *
+sctp_alloc_hmaclist(uint8_t num_hmacs)
+{
+ sctp_hmaclist_t *new_list;
+ int alloc_size;
+
+ alloc_size = sizeof(*new_list) + num_hmacs * sizeof(new_list->hmac[0]);
+ SCTP_MALLOC(new_list, sctp_hmaclist_t *, alloc_size,
+ SCTP_M_AUTH_HL);
+ if (new_list == NULL) {
+ /* out of memory */
+ return (NULL);
+ }
+ new_list->max_algo = num_hmacs;
+ new_list->num_algo = 0;
+ return (new_list);
+}
+
+void
+sctp_free_hmaclist(sctp_hmaclist_t * list)
+{
+ if (list != NULL) {
+ SCTP_FREE(list, SCTP_M_AUTH_HL);
+ list = NULL;
+ }
+}
+
+int
+sctp_auth_add_hmacid(sctp_hmaclist_t * list, uint16_t hmac_id)
+{
+ int i;
+
+ if (list == NULL)
+ return (-1);
+ if (list->num_algo == list->max_algo) {
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP: HMAC id list full, ignoring add %u\n", hmac_id);
+ return (-1);
+ }
+ if ((hmac_id != SCTP_AUTH_HMAC_ID_SHA1) &&
+#ifdef HAVE_SHA224
+ (hmac_id != SCTP_AUTH_HMAC_ID_SHA224) &&
+#endif
+#ifdef HAVE_SHA2
+ (hmac_id != SCTP_AUTH_HMAC_ID_SHA256) &&
+ (hmac_id != SCTP_AUTH_HMAC_ID_SHA384) &&
+ (hmac_id != SCTP_AUTH_HMAC_ID_SHA512) &&
+#endif
+ 1) {
+ return (-1);
+ }
+ /* Now is it already in the list */
+ for (i = 0; i < list->num_algo; i++) {
+ if (list->hmac[i] == hmac_id) {
+ /* already in list */
+ return (-1);
+ }
+ }
+ SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: add HMAC id %u to list\n", hmac_id);
+ list->hmac[list->num_algo++] = hmac_id;
+ return (0);
+}
+
+sctp_hmaclist_t *
+sctp_copy_hmaclist(sctp_hmaclist_t * list)
+{
+ sctp_hmaclist_t *new_list;
+ int i;
+
+ if (list == NULL)
+ return (NULL);
+ /* get a new list */
+ new_list = sctp_alloc_hmaclist(list->max_algo);
+ if (new_list == NULL)
+ return (NULL);
+ /* copy it */
+ new_list->max_algo = list->max_algo;
+ new_list->num_algo = list->num_algo;
+ for (i = 0; i < list->num_algo; i++)
+ new_list->hmac[i] = list->hmac[i];
+ return (new_list);
+}
+
+sctp_hmaclist_t *
+sctp_default_supported_hmaclist(void)
+{
+ sctp_hmaclist_t *new_list;
+
+ new_list = sctp_alloc_hmaclist(2);
+ if (new_list == NULL)
+ return (NULL);
+ (void)sctp_auth_add_hmacid(new_list, SCTP_AUTH_HMAC_ID_SHA1);
+ (void)sctp_auth_add_hmacid(new_list, SCTP_AUTH_HMAC_ID_SHA256);
+ return (new_list);
+}
+
+/*-
+ * HMAC algos are listed in priority/preference order
+ * find the best HMAC id to use for the peer based on local support
+ */
+uint16_t
+sctp_negotiate_hmacid(sctp_hmaclist_t * peer, sctp_hmaclist_t * local)
+{
+ int i, j;
+
+ if ((local == NULL) || (peer == NULL))
+ return (SCTP_AUTH_HMAC_ID_RSVD);
+
+ for (i = 0; i < peer->num_algo; i++) {
+ for (j = 0; j < local->num_algo; j++) {
+ if (peer->hmac[i] == local->hmac[j]) {
+ /* found the "best" one */
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP: negotiated peer HMAC id %u\n",
+ peer->hmac[i]);
+ return (peer->hmac[i]);
+ }
+ }
+ }
+ /* didn't find one! */
+ return (SCTP_AUTH_HMAC_ID_RSVD);
+}
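+
+/*-
+ * Example of the negotiation above: the peer's list is walked in the
+ * peer's preference order, so with peer = { SHA-256, SHA-1 } and
+ * local = { SHA-1, SHA-256 } the result is SCTP_AUTH_HMAC_ID_SHA256,
+ * the first peer entry that the local list also supports.
+ */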
+
+/*-
+ * serialize the HMAC algo list and return space used
+ * caller must guarantee ptr has appropriate space
+ */
+int
+sctp_serialize_hmaclist(sctp_hmaclist_t * list, uint8_t * ptr)
+{
+ int i;
+ uint16_t hmac_id;
+
+ if (list == NULL)
+ return (0);
+
+ for (i = 0; i < list->num_algo; i++) {
+ hmac_id = htons(list->hmac[i]);
+ bcopy(&hmac_id, ptr, sizeof(hmac_id));
+ ptr += sizeof(hmac_id);
+ }
+ return (list->num_algo * sizeof(hmac_id));
+}
+
+int
+sctp_verify_hmac_param(struct sctp_auth_hmac_algo *hmacs, uint32_t num_hmacs)
+{
+ uint32_t i;
+ uint16_t hmac_id;
+ uint32_t sha1_supported = 0;
+
+ for (i = 0; i < num_hmacs; i++) {
+ hmac_id = ntohs(hmacs->hmac_ids[i]);
+ if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1)
+ sha1_supported = 1;
+ }
+ /* SHA-1 support is required; reject the list if it is missing */
+ if (sha1_supported == 0)
+ return (-1);
+ else
+ return (0);
+}
+
+sctp_authinfo_t *
+sctp_alloc_authinfo(void)
+{
+ sctp_authinfo_t *new_authinfo;
+
+ SCTP_MALLOC(new_authinfo, sctp_authinfo_t *, sizeof(*new_authinfo),
+ SCTP_M_AUTH_IF);
+
+ if (new_authinfo == NULL) {
+ /* out of memory */
+ return (NULL);
+ }
+ bzero(new_authinfo, sizeof(*new_authinfo));
+ return (new_authinfo);
+}
+
+void
+sctp_free_authinfo(sctp_authinfo_t * authinfo)
+{
+ if (authinfo == NULL)
+ return;
+
+ if (authinfo->random != NULL)
+ sctp_free_key(authinfo->random);
+ if (authinfo->peer_random != NULL)
+ sctp_free_key(authinfo->peer_random);
+ if (authinfo->assoc_key != NULL)
+ sctp_free_key(authinfo->assoc_key);
+ if (authinfo->recv_key != NULL)
+ sctp_free_key(authinfo->recv_key);
+
+ /* We are NOT dynamically allocating authinfo's right now... */
+ /* SCTP_FREE(authinfo, SCTP_M_AUTH_??); */
+}
+
+
+uint32_t
+sctp_get_auth_chunk_len(uint16_t hmac_algo)
+{
+ int size;
+
+ size = sizeof(struct sctp_auth_chunk) + sctp_get_hmac_digest_len(hmac_algo);
+ return (SCTP_SIZE32(size));
+}
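+
+/*-
+ * Example, assuming the 8-byte AUTH chunk header defined by RFC 4895
+ * (type, flags, length, shared key id, HMAC id): SHA-1 yields
+ * 8 + 20 = 28 bytes and SHA-256 yields 8 + 32 = 40 bytes; both are
+ * already multiples of 4, so SCTP_SIZE32() leaves them unchanged.
+ */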
+
+uint32_t
+sctp_get_hmac_digest_len(uint16_t hmac_algo)
+{
+ switch (hmac_algo) {
+ case SCTP_AUTH_HMAC_ID_SHA1:
+ return (SCTP_AUTH_DIGEST_LEN_SHA1);
+#ifdef HAVE_SHA224
+ case SCTP_AUTH_HMAC_ID_SHA224:
+ return (SCTP_AUTH_DIGEST_LEN_SHA224);
+#endif
+#ifdef HAVE_SHA2
+ case SCTP_AUTH_HMAC_ID_SHA256:
+ return (SCTP_AUTH_DIGEST_LEN_SHA256);
+ case SCTP_AUTH_HMAC_ID_SHA384:
+ return (SCTP_AUTH_DIGEST_LEN_SHA384);
+ case SCTP_AUTH_HMAC_ID_SHA512:
+ return (SCTP_AUTH_DIGEST_LEN_SHA512);
+#endif
+ default:
+ /* unknown HMAC algorithm: can't do anything */
+ return (0);
+ } /* end switch */
+}
+
+static inline int
+sctp_get_hmac_block_len(uint16_t hmac_algo)
+{
+ switch (hmac_algo) {
+ case SCTP_AUTH_HMAC_ID_SHA1:
+#ifdef HAVE_SHA224
+ case SCTP_AUTH_HMAC_ID_SHA224:
+#endif
+ return (64);
+#ifdef HAVE_SHA2
+ case SCTP_AUTH_HMAC_ID_SHA256:
+ return (64);
+ case SCTP_AUTH_HMAC_ID_SHA384:
+ case SCTP_AUTH_HMAC_ID_SHA512:
+ return (128);
+#endif
+ case SCTP_AUTH_HMAC_ID_RSVD:
+ default:
+ /* unknown HMAC algorithm: can't do anything */
+ return (0);
+ } /* end switch */
+}
+
+static void
+sctp_hmac_init(uint16_t hmac_algo, sctp_hash_context_t * ctx)
+{
+ switch (hmac_algo) {
+ case SCTP_AUTH_HMAC_ID_SHA1:
+ SHA1_Init(&ctx->sha1);
+ break;
+#ifdef HAVE_SHA224
+ case SCTP_AUTH_HMAC_ID_SHA224:
+ break;
+#endif
+#ifdef HAVE_SHA2
+ case SCTP_AUTH_HMAC_ID_SHA256:
+ SHA256_Init(&ctx->sha256);
+ break;
+ case SCTP_AUTH_HMAC_ID_SHA384:
+ SHA384_Init(&ctx->sha384);
+ break;
+ case SCTP_AUTH_HMAC_ID_SHA512:
+ SHA512_Init(&ctx->sha512);
+ break;
+#endif
+ case SCTP_AUTH_HMAC_ID_RSVD:
+ default:
+ /* unknown HMAC algorithm: can't do anything */
+ return;
+ } /* end switch */
+}
+
+static void
+sctp_hmac_update(uint16_t hmac_algo, sctp_hash_context_t * ctx,
+ uint8_t * text, uint32_t textlen)
+{
+ switch (hmac_algo) {
+ case SCTP_AUTH_HMAC_ID_SHA1:
+ SHA1_Update(&ctx->sha1, text, textlen);
+ break;
+#ifdef HAVE_SHA224
+ case SCTP_AUTH_HMAC_ID_SHA224:
+ break;
+#endif
+#ifdef HAVE_SHA2
+ case SCTP_AUTH_HMAC_ID_SHA256:
+ SHA256_Update(&ctx->sha256, text, textlen);
+ break;
+ case SCTP_AUTH_HMAC_ID_SHA384:
+ SHA384_Update(&ctx->sha384, text, textlen);
+ break;
+ case SCTP_AUTH_HMAC_ID_SHA512:
+ SHA512_Update(&ctx->sha512, text, textlen);
+ break;
+#endif
+ case SCTP_AUTH_HMAC_ID_RSVD:
+ default:
+ /* unknown HMAC algorithm: can't do anything */
+ return;
+ } /* end switch */
+}
+
+static void
+sctp_hmac_final(uint16_t hmac_algo, sctp_hash_context_t * ctx,
+ uint8_t * digest)
+{
+ switch (hmac_algo) {
+ case SCTP_AUTH_HMAC_ID_SHA1:
+ SHA1_Final(digest, &ctx->sha1);
+ break;
+#ifdef HAVE_SHA224
+ case SCTP_AUTH_HMAC_ID_SHA224:
+ break;
+#endif
+#ifdef HAVE_SHA2
+ case SCTP_AUTH_HMAC_ID_SHA256:
+ SHA256_Final(digest, &ctx->sha256);
+ break;
+ case SCTP_AUTH_HMAC_ID_SHA384:
+ /* SHA384 is truncated SHA512 */
+ SHA384_Final(digest, &ctx->sha384);
+ break;
+ case SCTP_AUTH_HMAC_ID_SHA512:
+ SHA512_Final(digest, &ctx->sha512);
+ break;
+#endif
+ case SCTP_AUTH_HMAC_ID_RSVD:
+ default:
+ /* unknown HMAC algorithm: can't do anything */
+ return;
+ } /* end switch */
+}
+
+/*-
+ * Keyed-Hashing for Message Authentication: FIPS 198 (RFC 2104)
+ *
+ * Compute the HMAC digest using the desired hash key, text, and HMAC
+ * algorithm. Resulting digest is placed in 'digest' and digest length
+ * is returned, if the HMAC was performed.
+ *
+ * WARNING: it is up to the caller to supply sufficient space to hold the
+ * resultant digest.
+ */
+uint32_t
+sctp_hmac(uint16_t hmac_algo, uint8_t * key, uint32_t keylen,
+ uint8_t * text, uint32_t textlen, uint8_t * digest)
+{
+ uint32_t digestlen;
+ uint32_t blocklen;
+ sctp_hash_context_t ctx;
+ uint8_t ipad[128], opad[128]; /* keyed hash inner/outer pads */
+ uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX];
+ uint32_t i;
+
+ /* sanity check the material and length */
+ if ((key == NULL) || (keylen == 0) || (text == NULL) ||
+ (textlen == 0) || (digest == NULL)) {
+ /* can't do HMAC with empty key or text or digest store */
+ return (0);
+ }
+ /* validate the hmac algo and get the digest length */
+ digestlen = sctp_get_hmac_digest_len(hmac_algo);
+ if (digestlen == 0)
+ return (0);
+
+ /* hash the key if it is longer than the hash block size */
+ blocklen = sctp_get_hmac_block_len(hmac_algo);
+ if (keylen > blocklen) {
+ sctp_hmac_init(hmac_algo, &ctx);
+ sctp_hmac_update(hmac_algo, &ctx, key, keylen);
+ sctp_hmac_final(hmac_algo, &ctx, temp);
+ /* set the hashed key as the key */
+ keylen = digestlen;
+ key = temp;
+ }
+ /* initialize the inner/outer pads with the key and "append" zeroes */
+ bzero(ipad, blocklen);
+ bzero(opad, blocklen);
+ bcopy(key, ipad, keylen);
+ bcopy(key, opad, keylen);
+
+ /* XOR the key with ipad and opad values */
+ for (i = 0; i < blocklen; i++) {
+ ipad[i] ^= 0x36;
+ opad[i] ^= 0x5c;
+ }
+
+ /* perform inner hash */
+ sctp_hmac_init(hmac_algo, &ctx);
+ sctp_hmac_update(hmac_algo, &ctx, ipad, blocklen);
+ sctp_hmac_update(hmac_algo, &ctx, text, textlen);
+ sctp_hmac_final(hmac_algo, &ctx, temp);
+
+ /* perform outer hash */
+ sctp_hmac_init(hmac_algo, &ctx);
+ sctp_hmac_update(hmac_algo, &ctx, opad, blocklen);
+ sctp_hmac_update(hmac_algo, &ctx, temp, digestlen);
+ sctp_hmac_final(hmac_algo, &ctx, digest);
+
+ return (digestlen);
+}
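+
+/*-
+ * The computation above is the standard RFC 2104 construction:
+ *   HMAC(K, text) = H((K' XOR opad) || H((K' XOR ipad) || text))
+ * where K' is the key zero-padded to the hash block length (or first
+ * hashed down to the digest length if it is longer than a block),
+ * ipad is the byte 0x36 repeated and opad is the byte 0x5c repeated.
+ */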
+
+/* mbuf version */
+uint32_t
+sctp_hmac_m(uint16_t hmac_algo, uint8_t * key, uint32_t keylen,
+ struct mbuf *m, uint32_t m_offset, uint8_t * digest, uint32_t trailer)
+{
+ uint32_t digestlen;
+ uint32_t blocklen;
+ sctp_hash_context_t ctx;
+ uint8_t ipad[128], opad[128]; /* keyed hash inner/outer pads */
+ uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX];
+ uint32_t i;
+ struct mbuf *m_tmp;
+
+ /* sanity check the material and length */
+ if ((key == NULL) || (keylen == 0) || (m == NULL) || (digest == NULL)) {
+ /* can't do HMAC with empty key or text or digest store */
+ return (0);
+ }
+ /* validate the hmac algo and get the digest length */
+ digestlen = sctp_get_hmac_digest_len(hmac_algo);
+ if (digestlen == 0)
+ return (0);
+
+ /* hash the key if it is longer than the hash block size */
+ blocklen = sctp_get_hmac_block_len(hmac_algo);
+ if (keylen > blocklen) {
+ sctp_hmac_init(hmac_algo, &ctx);
+ sctp_hmac_update(hmac_algo, &ctx, key, keylen);
+ sctp_hmac_final(hmac_algo, &ctx, temp);
+ /* set the hashed key as the key */
+ keylen = digestlen;
+ key = temp;
+ }
+ /* initialize the inner/outer pads with the key and "append" zeroes */
+ bzero(ipad, blocklen);
+ bzero(opad, blocklen);
+ bcopy(key, ipad, keylen);
+ bcopy(key, opad, keylen);
+
+ /* XOR the key with ipad and opad values */
+ for (i = 0; i < blocklen; i++) {
+ ipad[i] ^= 0x36;
+ opad[i] ^= 0x5c;
+ }
+
+ /* perform inner hash */
+ sctp_hmac_init(hmac_algo, &ctx);
+ sctp_hmac_update(hmac_algo, &ctx, ipad, blocklen);
+ /* find the correct starting mbuf and offset (get start of text) */
+ m_tmp = m;
+ while ((m_tmp != NULL) && (m_offset >= (uint32_t) SCTP_BUF_LEN(m_tmp))) {
+ m_offset -= SCTP_BUF_LEN(m_tmp);
+ m_tmp = SCTP_BUF_NEXT(m_tmp);
+ }
+ /* now use the rest of the mbuf chain for the text */
+ while (m_tmp != NULL) {
+ if ((SCTP_BUF_NEXT(m_tmp) == NULL) && trailer) {
+ sctp_hmac_update(hmac_algo, &ctx, mtod(m_tmp, uint8_t *) + m_offset,
+ SCTP_BUF_LEN(m_tmp) - (trailer + m_offset));
+ } else {
+ sctp_hmac_update(hmac_algo, &ctx, mtod(m_tmp, uint8_t *) + m_offset,
+ SCTP_BUF_LEN(m_tmp) - m_offset);
+ }
+
+ /* clear the offset since it's only for the first mbuf */
+ m_offset = 0;
+ m_tmp = SCTP_BUF_NEXT(m_tmp);
+ }
+ sctp_hmac_final(hmac_algo, &ctx, temp);
+
+ /* perform outer hash */
+ sctp_hmac_init(hmac_algo, &ctx);
+ sctp_hmac_update(hmac_algo, &ctx, opad, blocklen);
+ sctp_hmac_update(hmac_algo, &ctx, temp, digestlen);
+ sctp_hmac_final(hmac_algo, &ctx, digest);
+
+ return (digestlen);
+}
+
+/*-
+ * verify the HMAC digest using the desired hash key, text, and HMAC
+ * algorithm.
+ * Returns -1 on error, 0 on success.
+ */
+int
+sctp_verify_hmac(uint16_t hmac_algo, uint8_t * key, uint32_t keylen,
+ uint8_t * text, uint32_t textlen,
+ uint8_t * digest, uint32_t digestlen)
+{
+ uint32_t len;
+ uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX];
+
+ /* sanity check the material and length */
+ if ((key == NULL) || (keylen == 0) ||
+ (text == NULL) || (textlen == 0) || (digest == NULL)) {
+ /* can't do HMAC with empty key or text or digest */
+ return (-1);
+ }
+ len = sctp_get_hmac_digest_len(hmac_algo);
+ if ((len == 0) || (digestlen != len))
+ return (-1);
+
+ /* compute the expected hash */
+ if (sctp_hmac(hmac_algo, key, keylen, text, textlen, temp) != len)
+ return (-1);
+
+ if (memcmp(digest, temp, digestlen) != 0)
+ return (-1);
+ else
+ return (0);
+}
+
+
+/*
+ * computes the requested HMAC using a key struct (which may be modified if
+ * the keylen exceeds the HMAC block len).
+ */
+uint32_t
+sctp_compute_hmac(uint16_t hmac_algo, sctp_key_t * key, uint8_t * text,
+ uint32_t textlen, uint8_t * digest)
+{
+ uint32_t digestlen;
+ uint32_t blocklen;
+ sctp_hash_context_t ctx;
+ uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX];
+
+ /* sanity check */
+ if ((key == NULL) || (text == NULL) || (textlen == 0) ||
+ (digest == NULL)) {
+ /* can't do HMAC with empty key or text or digest store */
+ return (0);
+ }
+ /* validate the hmac algo and get the digest length */
+ digestlen = sctp_get_hmac_digest_len(hmac_algo);
+ if (digestlen == 0)
+ return (0);
+
+ /* hash the key if it is longer than the hash block size */
+ blocklen = sctp_get_hmac_block_len(hmac_algo);
+ if (key->keylen > blocklen) {
+ sctp_hmac_init(hmac_algo, &ctx);
+ sctp_hmac_update(hmac_algo, &ctx, key->key, key->keylen);
+ sctp_hmac_final(hmac_algo, &ctx, temp);
+ /* save the hashed key as the new key */
+ key->keylen = digestlen;
+ bcopy(temp, key->key, key->keylen);
+ }
+ return (sctp_hmac(hmac_algo, key->key, key->keylen, text, textlen,
+ digest));
+}
+
+/* mbuf version */
+uint32_t
+sctp_compute_hmac_m(uint16_t hmac_algo, sctp_key_t * key, struct mbuf *m,
+ uint32_t m_offset, uint8_t * digest)
+{
+ uint32_t digestlen;
+ uint32_t blocklen;
+ sctp_hash_context_t ctx;
+ uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX];
+
+ /* sanity check */
+ if ((key == NULL) || (m == NULL) || (digest == NULL)) {
+ /* can't do HMAC with empty key or text or digest store */
+ return (0);
+ }
+ /* validate the hmac algo and get the digest length */
+ digestlen = sctp_get_hmac_digest_len(hmac_algo);
+ if (digestlen == 0)
+ return (0);
+
+ /* hash the key if it is longer than the hash block size */
+ blocklen = sctp_get_hmac_block_len(hmac_algo);
+ if (key->keylen > blocklen) {
+ sctp_hmac_init(hmac_algo, &ctx);
+ sctp_hmac_update(hmac_algo, &ctx, key->key, key->keylen);
+ sctp_hmac_final(hmac_algo, &ctx, temp);
+ /* save the hashed key as the new key */
+ key->keylen = digestlen;
+ bcopy(temp, key->key, key->keylen);
+ }
+ return (sctp_hmac_m(hmac_algo, key->key, key->keylen, m, m_offset, digest, 0));
+}
+
+int
+sctp_auth_is_supported_hmac(sctp_hmaclist_t * list, uint16_t id)
+{
+ int i;
+
+ if ((list == NULL) || (id == SCTP_AUTH_HMAC_ID_RSVD))
+ return (0);
+
+ for (i = 0; i < list->num_algo; i++)
+ if (list->hmac[i] == id)
+ return (1);
+
+ /* not in the list */
+ return (0);
+}
+
+
+/*-
+ * clear any cached key(s) if they match the given key id on an association.
+ * the cached key(s) will be recomputed and re-cached at next use.
+ * ASSUMES TCB_LOCK is already held
+ */
+void
+sctp_clear_cachedkeys(struct sctp_tcb *stcb, uint16_t keyid)
+{
+ if (stcb == NULL)
+ return;
+
+ if (keyid == stcb->asoc.authinfo.assoc_keyid) {
+ sctp_free_key(stcb->asoc.authinfo.assoc_key);
+ stcb->asoc.authinfo.assoc_key = NULL;
+ }
+ if (keyid == stcb->asoc.authinfo.recv_keyid) {
+ sctp_free_key(stcb->asoc.authinfo.recv_key);
+ stcb->asoc.authinfo.recv_key = NULL;
+ }
+}
+
+/*-
+ * clear any cached key(s) if they match the given key id for all assocs on
+ * an endpoint.
+ * ASSUMES INP_WLOCK is already held
+ */
+void
+sctp_clear_cachedkeys_ep(struct sctp_inpcb *inp, uint16_t keyid)
+{
+ struct sctp_tcb *stcb;
+
+ if (inp == NULL)
+ return;
+
+ /* clear the cached keys on all assocs on this instance */
+ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
+ SCTP_TCB_LOCK(stcb);
+ sctp_clear_cachedkeys(stcb, keyid);
+ SCTP_TCB_UNLOCK(stcb);
+ }
+}
+
+/*-
+ * delete a shared key from an association
+ * ASSUMES TCB_LOCK is already held
+ */
+int
+sctp_delete_sharedkey(struct sctp_tcb *stcb, uint16_t keyid)
+{
+ sctp_sharedkey_t *skey;
+
+ if (stcb == NULL)
+ return (-1);
+
+ /* is the keyid the assoc active sending key */
+ if (keyid == stcb->asoc.authinfo.active_keyid)
+ return (-1);
+
+ /* does the key exist? */
+ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid);
+ if (skey == NULL)
+ return (-1);
+
+ /* are there other refcount holders on the key? */
+ if (skey->refcount > 1)
+ return (-1);
+
+ /* remove it */
+ LIST_REMOVE(skey, next);
+ sctp_free_sharedkey(skey); /* frees skey->key as well */
+
+ /* clear any cached keys */
+ sctp_clear_cachedkeys(stcb, keyid);
+ return (0);
+}
+
+/*-
+ * deletes a shared key from the endpoint
+ * ASSUMES INP_WLOCK is already held
+ */
+int
+sctp_delete_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid)
+{
+ sctp_sharedkey_t *skey;
+
+ if (inp == NULL)
+ return (-1);
+
+ /* is the keyid the active sending key on the endpoint */
+ if (keyid == inp->sctp_ep.default_keyid)
+ return (-1);
+
+ /* does the key exist? */
+ skey = sctp_find_sharedkey(&inp->sctp_ep.shared_keys, keyid);
+ if (skey == NULL)
+ return (-1);
+
+ /* endpoint keys are not refcounted */
+
+ /* remove it */
+ LIST_REMOVE(skey, next);
+ sctp_free_sharedkey(skey); /* frees skey->key as well */
+
+ /* clear any cached keys */
+ sctp_clear_cachedkeys_ep(inp, keyid);
+ return (0);
+}
+
+/*-
+ * set the active key on an association
+ * ASSUMES TCB_LOCK is already held
+ */
+int
+sctp_auth_setactivekey(struct sctp_tcb *stcb, uint16_t keyid)
+{
+ sctp_sharedkey_t *skey = NULL;
+
+ /* find the key on the assoc */
+ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid);
+ if (skey == NULL) {
+ /* that key doesn't exist */
+ return (-1);
+ }
+ if ((skey->deactivated) && (skey->refcount > 1)) {
+ /* can't reactivate a deactivated key with other refcounts */
+ return (-1);
+ }
+ /* set the (new) active key */
+ stcb->asoc.authinfo.active_keyid = keyid;
+ /* reset the deactivated flag */
+ skey->deactivated = 0;
+
+ return (0);
+}
+
+/*-
+ * set the active key on an endpoint
+ * ASSUMES INP_WLOCK is already held
+ */
+int
+sctp_auth_setactivekey_ep(struct sctp_inpcb *inp, uint16_t keyid)
+{
+ sctp_sharedkey_t *skey;
+
+ /* find the key */
+ skey = sctp_find_sharedkey(&inp->sctp_ep.shared_keys, keyid);
+ if (skey == NULL) {
+ /* that key doesn't exist */
+ return (-1);
+ }
+ inp->sctp_ep.default_keyid = keyid;
+ return (0);
+}
+
+/*-
+ * deactivates a shared key from the association
+ * ASSUMES TCB_LOCK is already held
+ */
+int
+sctp_deact_sharedkey(struct sctp_tcb *stcb, uint16_t keyid)
+{
+ sctp_sharedkey_t *skey;
+
+ if (stcb == NULL)
+ return (-1);
+
+ /* is the keyid the assoc active sending key */
+ if (keyid == stcb->asoc.authinfo.active_keyid)
+ return (-1);
+
+ /* does the key exist? */
+ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid);
+ if (skey == NULL)
+ return (-1);
+
+ /* are there other refcount holders on the key? */
+ if (skey->refcount == 1) {
+ /* no other users, send a notification for this key */
+ sctp_ulp_notify(SCTP_NOTIFY_AUTH_FREE_KEY, stcb, keyid, 0,
+ SCTP_SO_LOCKED);
+ }
+ /* mark the key as deactivated */
+ skey->deactivated = 1;
+
+ return (0);
+}
+
+/*-
+ * deactivates a shared key from the endpoint
+ * ASSUMES INP_WLOCK is already held
+ */
+int
+sctp_deact_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid)
+{
+ sctp_sharedkey_t *skey;
+
+ if (inp == NULL)
+ return (-1);
+
+ /* is the keyid the active sending key on the endpoint */
+ if (keyid == inp->sctp_ep.default_keyid)
+ return (-1);
+
+ /* does the key exist? */
+ skey = sctp_find_sharedkey(&inp->sctp_ep.shared_keys, keyid);
+ if (skey == NULL)
+ return (-1);
+
+ /* endpoint keys are not refcounted */
+
+ /* remove it */
+ LIST_REMOVE(skey, next);
+ sctp_free_sharedkey(skey); /* frees skey->key as well */
+
+ return (0);
+}
+
+/*
+ * get local authentication parameters from cookie (from INIT-ACK)
+ */
+void
+sctp_auth_get_cookie_params(struct sctp_tcb *stcb, struct mbuf *m,
+ uint32_t offset, uint32_t length)
+{
+ struct sctp_paramhdr *phdr, tmp_param;
+ uint16_t plen, ptype;
+ uint8_t random_store[SCTP_PARAM_BUFFER_SIZE];
+ struct sctp_auth_random *p_random = NULL;
+ uint16_t random_len = 0;
+ uint8_t hmacs_store[SCTP_PARAM_BUFFER_SIZE];
+ struct sctp_auth_hmac_algo *hmacs = NULL;
+ uint16_t hmacs_len = 0;
+ uint8_t chunks_store[SCTP_PARAM_BUFFER_SIZE];
+ struct sctp_auth_chunk_list *chunks = NULL;
+ uint16_t num_chunks = 0;
+ sctp_key_t *new_key;
+ uint32_t keylen;
+
+ /* convert to upper bound */
+ length += offset;
+
+ phdr = (struct sctp_paramhdr *)sctp_m_getptr(m, offset,
+ sizeof(struct sctp_paramhdr), (uint8_t *) & tmp_param);
+ while (phdr != NULL) {
+ ptype = ntohs(phdr->param_type);
+ plen = ntohs(phdr->param_length);
+
+ if ((plen == 0) || (offset + plen > length))
+ break;
+
+ if (ptype == SCTP_RANDOM) {
+ if (plen > sizeof(random_store))
+ break;
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)random_store, min(plen, sizeof(random_store)));
+ if (phdr == NULL)
+ return;
+ /* save the random and length for the key */
+ p_random = (struct sctp_auth_random *)phdr;
+ random_len = plen - sizeof(*p_random);
+ } else if (ptype == SCTP_HMAC_LIST) {
+ int num_hmacs;
+ int i;
+
+ if (plen > sizeof(hmacs_store))
+ break;
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)hmacs_store, min(plen, sizeof(hmacs_store)));
+ if (phdr == NULL)
+ return;
+ /* save the hmacs list and num for the key */
+ hmacs = (struct sctp_auth_hmac_algo *)phdr;
+ hmacs_len = plen - sizeof(*hmacs);
+ num_hmacs = hmacs_len / sizeof(hmacs->hmac_ids[0]);
+ if (stcb->asoc.local_hmacs != NULL)
+ sctp_free_hmaclist(stcb->asoc.local_hmacs);
+ stcb->asoc.local_hmacs = sctp_alloc_hmaclist(num_hmacs);
+ if (stcb->asoc.local_hmacs != NULL) {
+ for (i = 0; i < num_hmacs; i++) {
+ (void)sctp_auth_add_hmacid(stcb->asoc.local_hmacs,
+ ntohs(hmacs->hmac_ids[i]));
+ }
+ }
+ } else if (ptype == SCTP_CHUNK_LIST) {
+ int i;
+
+ if (plen > sizeof(chunks_store))
+ break;
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)chunks_store, min(plen, sizeof(chunks_store)));
+ if (phdr == NULL)
+ return;
+ chunks = (struct sctp_auth_chunk_list *)phdr;
+ num_chunks = plen - sizeof(*chunks);
+ /* save chunks list and num for the key */
+ if (stcb->asoc.local_auth_chunks != NULL)
+ sctp_clear_chunklist(stcb->asoc.local_auth_chunks);
+ else
+ stcb->asoc.local_auth_chunks = sctp_alloc_chunklist();
+ for (i = 0; i < num_chunks; i++) {
+ (void)sctp_auth_add_chunk(chunks->chunk_types[i],
+ stcb->asoc.local_auth_chunks);
+ }
+ }
+ /* get next parameter */
+ offset += SCTP_SIZE32(plen);
+ if (offset + sizeof(struct sctp_paramhdr) > length)
+ break;
+ phdr = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr),
+ (uint8_t *) & tmp_param);
+ }
+ /* concatenate the full random key */
+ keylen = sizeof(*p_random) + random_len + sizeof(*hmacs) + hmacs_len;
+ if (chunks != NULL) {
+ keylen += sizeof(*chunks) + num_chunks;
+ }
+ new_key = sctp_alloc_key(keylen);
+ if (new_key != NULL) {
+ /* copy in the RANDOM */
+ if (p_random != NULL) {
+ keylen = sizeof(*p_random) + random_len;
+ bcopy(p_random, new_key->key, keylen);
+ }
+ /* append in the AUTH chunks */
+ if (chunks != NULL) {
+ bcopy(chunks, new_key->key + keylen,
+ sizeof(*chunks) + num_chunks);
+ keylen += sizeof(*chunks) + num_chunks;
+ }
+ /* append in the HMACs */
+ if (hmacs != NULL) {
+ bcopy(hmacs, new_key->key + keylen,
+ sizeof(*hmacs) + hmacs_len);
+ }
+ }
+ if (stcb->asoc.authinfo.random != NULL)
+ sctp_free_key(stcb->asoc.authinfo.random);
+ stcb->asoc.authinfo.random = new_key;
+ stcb->asoc.authinfo.random_len = random_len;
+ sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.assoc_keyid);
+ sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.recv_keyid);
+
+ /* negotiate what HMAC to use for the peer */
+ stcb->asoc.peer_hmac_id = sctp_negotiate_hmacid(stcb->asoc.peer_hmacs,
+ stcb->asoc.local_hmacs);
+
+ /* copy defaults from the endpoint */
+ /* FIX ME: put in cookie? */
+ stcb->asoc.authinfo.active_keyid = stcb->sctp_ep->sctp_ep.default_keyid;
+ /* copy out the shared key list (by reference) from the endpoint */
+ (void)sctp_copy_skeylist(&stcb->sctp_ep->sctp_ep.shared_keys,
+ &stcb->asoc.shared_keys);
+}
+
+/*
+ * compute and fill in the HMAC digest for a packet
+ */
+void
+sctp_fill_hmac_digest_m(struct mbuf *m, uint32_t auth_offset,
+ struct sctp_auth_chunk *auth, struct sctp_tcb *stcb, uint16_t keyid)
+{
+ uint32_t digestlen;
+ sctp_sharedkey_t *skey;
+ sctp_key_t *key;
+
+ if ((stcb == NULL) || (auth == NULL))
+ return;
+
+ /* zero the digest + chunk padding */
+ digestlen = sctp_get_hmac_digest_len(stcb->asoc.peer_hmac_id);
+ bzero(auth->hmac, SCTP_SIZE32(digestlen));
+
+ /* is the desired key cached? */
+ if ((keyid != stcb->asoc.authinfo.assoc_keyid) ||
+ (stcb->asoc.authinfo.assoc_key == NULL)) {
+ if (stcb->asoc.authinfo.assoc_key != NULL) {
+ /* free the old cached key */
+ sctp_free_key(stcb->asoc.authinfo.assoc_key);
+ }
+ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid);
+ /* the only way skey is NULL is if null key id 0 is used */
+ if (skey != NULL)
+ key = skey->key;
+ else
+ key = NULL;
+ /* compute a new assoc key and cache it */
+ stcb->asoc.authinfo.assoc_key =
+ sctp_compute_hashkey(stcb->asoc.authinfo.random,
+ stcb->asoc.authinfo.peer_random, key);
+ stcb->asoc.authinfo.assoc_keyid = keyid;
+ SCTPDBG(SCTP_DEBUG_AUTH1, "caching key id %u\n",
+ stcb->asoc.authinfo.assoc_keyid);
+#ifdef SCTP_DEBUG
+ if (SCTP_AUTH_DEBUG)
+ sctp_print_key(stcb->asoc.authinfo.assoc_key,
+ "Assoc Key");
+#endif
+ }
+ /* set in the active key id */
+ auth->shared_key_id = htons(keyid);
+
+ /* compute and fill in the digest */
+ (void)sctp_compute_hmac_m(stcb->asoc.peer_hmac_id, stcb->asoc.authinfo.assoc_key,
+ m, auth_offset, auth->hmac);
+}
+
+
+static void
+sctp_bzero_m(struct mbuf *m, uint32_t m_offset, uint32_t size)
+{
+ struct mbuf *m_tmp;
+ uint8_t *data;
+
+ /* sanity check */
+ if (m == NULL)
+ return;
+
+ /* find the correct starting mbuf and offset (get start position) */
+ m_tmp = m;
+ while ((m_tmp != NULL) && (m_offset >= (uint32_t) SCTP_BUF_LEN(m_tmp))) {
+ m_offset -= SCTP_BUF_LEN(m_tmp);
+ m_tmp = SCTP_BUF_NEXT(m_tmp);
+ }
+ /* now use the rest of the mbuf chain */
+ while ((m_tmp != NULL) && (size > 0)) {
+ data = mtod(m_tmp, uint8_t *) + m_offset;
+ if (size > (uint32_t) SCTP_BUF_LEN(m_tmp)) {
+ bzero(data, SCTP_BUF_LEN(m_tmp));
+ size -= SCTP_BUF_LEN(m_tmp);
+ } else {
+ bzero(data, size);
+ size = 0;
+ }
+ /* clear the offset since it's only for the first mbuf */
+ m_offset = 0;
+ m_tmp = SCTP_BUF_NEXT(m_tmp);
+ }
+}
+
+/*-
+ * process the incoming Authentication chunk
+ * return codes:
+ * -1 on any authentication error
+ * 0 on authentication verification
+ */
+int
+sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *auth,
+ struct mbuf *m, uint32_t offset)
+{
+ uint16_t chunklen;
+ uint16_t shared_key_id;
+ uint16_t hmac_id;
+ sctp_sharedkey_t *skey;
+ uint32_t digestlen;
+ uint8_t digest[SCTP_AUTH_DIGEST_LEN_MAX];
+ uint8_t computed_digest[SCTP_AUTH_DIGEST_LEN_MAX];
+
+ /* auth is checked for NULL by caller */
+ chunklen = ntohs(auth->ch.chunk_length);
+ if (chunklen < sizeof(*auth)) {
+ SCTP_STAT_INCR(sctps_recvauthfailed);
+ return (-1);
+ }
+ SCTP_STAT_INCR(sctps_recvauth);
+
+ /* get the auth params */
+ shared_key_id = ntohs(auth->shared_key_id);
+ hmac_id = ntohs(auth->hmac_id);
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP AUTH Chunk: shared key %u, HMAC id %u\n",
+ shared_key_id, hmac_id);
+
+ /* is the indicated HMAC supported? */
+ if (!sctp_auth_is_supported_hmac(stcb->asoc.local_hmacs, hmac_id)) {
+ struct mbuf *m_err;
+ struct sctp_auth_invalid_hmac *err;
+
+ SCTP_STAT_INCR(sctps_recvivalhmacid);
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP Auth: unsupported HMAC id %u\n",
+ hmac_id);
+ /*
+ * report this in an Error Chunk: Unsupported HMAC
+ * Identifier
+ */
+ m_err = sctp_get_mbuf_for_msg(sizeof(*err), 0, M_DONTWAIT,
+ 1, MT_HEADER);
+ if (m_err != NULL) {
+ /* pre-reserve some space */
+ SCTP_BUF_RESV_UF(m_err, sizeof(struct sctp_chunkhdr));
+ /* fill in the error */
+ err = mtod(m_err, struct sctp_auth_invalid_hmac *);
+ bzero(err, sizeof(*err));
+ err->ph.param_type = htons(SCTP_CAUSE_UNSUPPORTED_HMACID);
+ err->ph.param_length = htons(sizeof(*err));
+ err->hmac_id = ntohs(hmac_id);
+ SCTP_BUF_LEN(m_err) = sizeof(*err);
+ /* queue it */
+ sctp_queue_op_err(stcb, m_err);
+ }
+ return (-1);
+ }
+ /* get the indicated shared key, if available */
+ if ((stcb->asoc.authinfo.recv_key == NULL) ||
+ (stcb->asoc.authinfo.recv_keyid != shared_key_id)) {
+ /* find the shared key on the assoc first */
+ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys,
+ shared_key_id);
+ /* if the shared key isn't found, discard the chunk */
+ if (skey == NULL) {
+ SCTP_STAT_INCR(sctps_recvivalkeyid);
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP Auth: unknown key id %u\n",
+ shared_key_id);
+ return (-1);
+ }
+ /* generate a notification if this is a new key id */
+ if (stcb->asoc.authinfo.recv_keyid != shared_key_id)
+ /*
+ * sctp_ulp_notify(SCTP_NOTIFY_AUTH_NEW_KEY, stcb,
+ * shared_key_id, (void
+ * *)stcb->asoc.authinfo.recv_keyid);
+ */
+ sctp_notify_authentication(stcb, SCTP_AUTH_NEWKEY,
+ shared_key_id, stcb->asoc.authinfo.recv_keyid,
+ SCTP_SO_NOT_LOCKED);
+ /* compute a new recv assoc key and cache it */
+ if (stcb->asoc.authinfo.recv_key != NULL)
+ sctp_free_key(stcb->asoc.authinfo.recv_key);
+ stcb->asoc.authinfo.recv_key =
+ sctp_compute_hashkey(stcb->asoc.authinfo.random,
+ stcb->asoc.authinfo.peer_random, skey->key);
+ stcb->asoc.authinfo.recv_keyid = shared_key_id;
+#ifdef SCTP_DEBUG
+ if (SCTP_AUTH_DEBUG)
+ sctp_print_key(stcb->asoc.authinfo.recv_key, "Recv Key");
+#endif
+ }
+ /* validate the digest length */
+ digestlen = sctp_get_hmac_digest_len(hmac_id);
+ if (chunklen < (sizeof(*auth) + digestlen)) {
+ /* invalid digest length */
+ SCTP_STAT_INCR(sctps_recvauthfailed);
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP Auth: chunk too short for HMAC\n");
+ return (-1);
+ }
+ /* save a copy of the digest, zero the pseudo header, and validate */
+ bcopy(auth->hmac, digest, digestlen);
+ sctp_bzero_m(m, offset + sizeof(*auth), SCTP_SIZE32(digestlen));
+ (void)sctp_compute_hmac_m(hmac_id, stcb->asoc.authinfo.recv_key,
+ m, offset, computed_digest);
+
+ /* compare the computed digest with the one in the AUTH chunk */
+ if (memcmp(digest, computed_digest, digestlen) != 0) {
+ SCTP_STAT_INCR(sctps_recvauthfailed);
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP Auth: HMAC digest check failed\n");
+ return (-1);
+ }
+ return (0);
+}
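+
+/*-
+ * Note on the verification above: the HMAC is always computed with the
+ * digest field zeroed, which is why sctp_fill_hmac_digest_m() clears
+ * auth->hmac before computing on the send side, and why the received
+ * digest is copied out and zeroed (including its 32-bit padding) before
+ * the local recomputation and memcmp() here.
+ */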
+
+/*
+ * Generate NOTIFICATION
+ */
+void
+sctp_notify_authentication(struct sctp_tcb *stcb, uint32_t indication,
+ uint16_t keyid, uint16_t alt_keyid, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ struct mbuf *m_notify;
+ struct sctp_authkey_event *auth;
+ struct sctp_queued_to_read *control;
+
+ if ((stcb == NULL) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
+ (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)
+ ) {
+ /* If the socket is gone we are out of here */
+ return;
+ }
+ if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_AUTHEVNT))
+ /* event not enabled */
+ return;
+
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_authkey_event),
+ 0, M_DONTWAIT, 1, MT_HEADER);
+ if (m_notify == NULL)
+ /* no space left */
+ return;
+
+ SCTP_BUF_LEN(m_notify) = 0;
+ auth = mtod(m_notify, struct sctp_authkey_event *);
+ auth->auth_type = SCTP_AUTHENTICATION_EVENT;
+ auth->auth_flags = 0;
+ auth->auth_length = sizeof(*auth);
+ auth->auth_keynumber = keyid;
+ auth->auth_altkeynumber = alt_keyid;
+ auth->auth_indication = indication;
+ auth->auth_assoc_id = sctp_get_associd(stcb);
+
+ SCTP_BUF_LEN(m_notify) = sizeof(*auth);
+ SCTP_BUF_NEXT(m_notify) = NULL;
+
+ /* append to socket */
+ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
+ 0, 0, 0, 0, 0, 0, m_notify);
+ if (control == NULL) {
+ /* no memory */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ control->spec_flags = M_NOTIFICATION;
+ control->length = SCTP_BUF_LEN(m_notify);
+ /* not strictly needed, but record the tail mbuf anyway */
+ control->tail_mbuf = m_notify;
+ sctp_add_to_readq(stcb->sctp_ep, stcb, control,
+ &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked);
+}
+
+
+/*-
+ * validates the AUTHentication related parameters in an INIT/INIT-ACK
+ * Note: currently only used for INIT as INIT-ACK is handled inline
+ * with sctp_load_addresses_from_init()
+ */
+int
+sctp_validate_init_auth_params(struct mbuf *m, int offset, int limit)
+{
+ struct sctp_paramhdr *phdr, parm_buf;
+ uint16_t ptype, plen;
+ int peer_supports_asconf = 0;
+ int peer_supports_auth = 0;
+ int got_random = 0, got_hmacs = 0, got_chklist = 0;
+ uint8_t saw_asconf = 0;
+ uint8_t saw_asconf_ack = 0;
+
+ /* go through each of the params. */
+ phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf));
+ while (phdr) {
+ ptype = ntohs(phdr->param_type);
+ plen = ntohs(phdr->param_length);
+
+ if (offset + plen > limit) {
+ break;
+ }
+ if (plen < sizeof(struct sctp_paramhdr)) {
+ break;
+ }
+ if (ptype == SCTP_SUPPORTED_CHUNK_EXT) {
+ /* A supported extension chunk */
+ struct sctp_supported_chunk_types_param *pr_supported;
+ uint8_t local_store[SCTP_PARAM_BUFFER_SIZE];
+ int num_ent, i;
+
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)&local_store, min(plen, sizeof(local_store)));
+ if (phdr == NULL) {
+ return (-1);
+ }
+ pr_supported = (struct sctp_supported_chunk_types_param *)phdr;
+ num_ent = plen - sizeof(struct sctp_paramhdr);
+ for (i = 0; i < num_ent; i++) {
+ switch (pr_supported->chunk_types[i]) {
+ case SCTP_ASCONF:
+ case SCTP_ASCONF_ACK:
+ peer_supports_asconf = 1;
+ break;
+ case SCTP_AUTHENTICATION:
+ peer_supports_auth = 1;
+ break;
+ default:
+ /* one we don't care about */
+ break;
+ }
+ }
+ } else if (ptype == SCTP_RANDOM) {
+ got_random = 1;
+ /* enforce the random length */
+ if (plen != (sizeof(struct sctp_auth_random) +
+ SCTP_AUTH_RANDOM_SIZE_REQUIRED)) {
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP: invalid RANDOM len\n");
+ return (-1);
+ }
+ } else if (ptype == SCTP_HMAC_LIST) {
+ uint8_t store[SCTP_PARAM_BUFFER_SIZE];
+ struct sctp_auth_hmac_algo *hmacs;
+ int num_hmacs;
+
+ if (plen > sizeof(store))
+ break;
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)store, min(plen, sizeof(store)));
+ if (phdr == NULL)
+ return (-1);
+ hmacs = (struct sctp_auth_hmac_algo *)phdr;
+ num_hmacs = (plen - sizeof(*hmacs)) /
+ sizeof(hmacs->hmac_ids[0]);
+ /* validate the hmac list */
+ if (sctp_verify_hmac_param(hmacs, num_hmacs)) {
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP: invalid HMAC param\n");
+ return (-1);
+ }
+ got_hmacs = 1;
+ } else if (ptype == SCTP_CHUNK_LIST) {
+ int i, num_chunks;
+ uint8_t chunks_store[SCTP_SMALL_CHUNK_STORE];
+
+ /* did the peer send a non-empty chunk list? */
+ struct sctp_auth_chunk_list *chunks = NULL;
+
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)chunks_store,
+ min(plen, sizeof(chunks_store)));
+ if (phdr == NULL)
+ return (-1);
+
+ /*-
+ * Flip through the list and mark that the
+ * peer supports asconf/asconf_ack.
+ */
+ chunks = (struct sctp_auth_chunk_list *)phdr;
+ num_chunks = plen - sizeof(*chunks);
+ for (i = 0; i < num_chunks; i++) {
+ /* record asconf/asconf-ack if listed */
+ if (chunks->chunk_types[i] == SCTP_ASCONF)
+ saw_asconf = 1;
+ if (chunks->chunk_types[i] == SCTP_ASCONF_ACK)
+ saw_asconf_ack = 1;
+
+ }
+ if (num_chunks)
+ got_chklist = 1;
+ }
+ offset += SCTP_SIZE32(plen);
+ if (offset >= limit) {
+ break;
+ }
+ phdr = sctp_get_next_param(m, offset, &parm_buf,
+ sizeof(parm_buf));
+ }
+ /* validate authentication required parameters */
+ if (got_random && got_hmacs) {
+ peer_supports_auth = 1;
+ } else {
+ peer_supports_auth = 0;
+ }
+ if (!peer_supports_auth && got_chklist) {
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP: peer sent chunk list w/o AUTH\n");
+ return (-1);
+ }
+ if (!SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk) && peer_supports_asconf &&
+ !peer_supports_auth) {
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "SCTP: peer supports ASCONF but not AUTH\n");
+ return (-1);
+ } else if ((peer_supports_asconf) && (peer_supports_auth) &&
+ ((saw_asconf == 0) || (saw_asconf_ack == 0))) {
+ return (-2);
+ }
+ return (0);
+}
+
+void
+sctp_initialize_auth_params(struct sctp_inpcb *inp, struct sctp_tcb *stcb)
+{
+ uint16_t chunks_len = 0;
+ uint16_t hmacs_len = 0;
+ uint16_t random_len = SCTP_AUTH_RANDOM_SIZE_DEFAULT;
+ sctp_key_t *new_key;
+ uint16_t keylen;
+
+ /* initialize hmac list from endpoint */
+ stcb->asoc.local_hmacs = sctp_copy_hmaclist(inp->sctp_ep.local_hmacs);
+ if (stcb->asoc.local_hmacs != NULL) {
+ hmacs_len = stcb->asoc.local_hmacs->num_algo *
+ sizeof(stcb->asoc.local_hmacs->hmac[0]);
+ }
+ /* initialize auth chunks list from endpoint */
+ stcb->asoc.local_auth_chunks =
+ sctp_copy_chunklist(inp->sctp_ep.local_auth_chunks);
+ if (stcb->asoc.local_auth_chunks != NULL) {
+ int i;
+
+ for (i = 0; i < 256; i++) {
+ if (stcb->asoc.local_auth_chunks->chunks[i])
+ chunks_len++;
+ }
+ }
+ /* copy defaults from the endpoint */
+ stcb->asoc.authinfo.active_keyid = inp->sctp_ep.default_keyid;
+
+ /* copy out the shared key list (by reference) from the endpoint */
+ (void)sctp_copy_skeylist(&inp->sctp_ep.shared_keys,
+ &stcb->asoc.shared_keys);
+
+ /* now set the concatenated key (random + chunks + hmacs) */
+ /* key includes parameter headers */
+ keylen = (3 * sizeof(struct sctp_paramhdr)) + random_len + chunks_len +
+ hmacs_len;
+ new_key = sctp_alloc_key(keylen);
+ if (new_key != NULL) {
+ struct sctp_paramhdr *ph;
+ int plen;
+
+ /* generate and copy in the RANDOM */
+ ph = (struct sctp_paramhdr *)new_key->key;
+ ph->param_type = htons(SCTP_RANDOM);
+ plen = sizeof(*ph) + random_len;
+ ph->param_length = htons(plen);
+ SCTP_READ_RANDOM(new_key->key + sizeof(*ph), random_len);
+ keylen = plen;
+
+ /* append in the AUTH chunks */
+ /* NOTE: currently we always have chunks to list */
+ ph = (struct sctp_paramhdr *)(new_key->key + keylen);
+ ph->param_type = htons(SCTP_CHUNK_LIST);
+ plen = sizeof(*ph) + chunks_len;
+ ph->param_length = htons(plen);
+ keylen += sizeof(*ph);
+ if (stcb->asoc.local_auth_chunks) {
+ int i;
+
+ for (i = 0; i < 256; i++) {
+ if (stcb->asoc.local_auth_chunks->chunks[i])
+ new_key->key[keylen++] = i;
+ }
+ }
+ /* append in the HMACs */
+ ph = (struct sctp_paramhdr *)(new_key->key + keylen);
+ ph->param_type = htons(SCTP_HMAC_LIST);
+ plen = sizeof(*ph) + hmacs_len;
+ ph->param_length = htons(plen);
+ keylen += sizeof(*ph);
+ (void)sctp_serialize_hmaclist(stcb->asoc.local_hmacs,
+ new_key->key + keylen);
+ }
+ if (stcb->asoc.authinfo.random != NULL)
+ sctp_free_key(stcb->asoc.authinfo.random);
+ stcb->asoc.authinfo.random = new_key;
+ stcb->asoc.authinfo.random_len = random_len;
+}
diff --git a/freebsd/sys/netinet/sctp_auth.h b/freebsd/sys/netinet/sctp_auth.h
new file mode 100644
index 00000000..da4dc09b
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_auth.h
@@ -0,0 +1,235 @@
+/*-
+ * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __SCTP_AUTH_HH__
+#define __SCTP_AUTH_HH__
+
+
+/* digest lengths */
+#define SCTP_AUTH_DIGEST_LEN_SHA1 20
+#define SCTP_AUTH_DIGEST_LEN_SHA224 28
+#define SCTP_AUTH_DIGEST_LEN_SHA256 32
+#define SCTP_AUTH_DIGEST_LEN_SHA384 48
+#define SCTP_AUTH_DIGEST_LEN_SHA512 64
+#define SCTP_AUTH_DIGEST_LEN_MAX 64
+
+/* random sizes */
+#define SCTP_AUTH_RANDOM_SIZE_DEFAULT 32
+#define SCTP_AUTH_RANDOM_SIZE_REQUIRED 32
+#define SCTP_AUTH_RANDOM_SIZE_MAX 256
+
+/* union of all supported HMAC algorithm contexts */
+typedef union sctp_hash_context {
+ SHA1_CTX sha1;
+#ifdef HAVE_SHA2
+ SHA256_CTX sha256;
+ SHA384_CTX sha384;
+ SHA512_CTX sha512;
+#endif
+} sctp_hash_context_t;
+
+typedef struct sctp_key {
+ uint32_t keylen;
+ uint8_t key[];
+} sctp_key_t;
+
+typedef struct sctp_shared_key {
+ LIST_ENTRY(sctp_shared_key) next;
+ sctp_key_t *key; /* key text */
+ uint32_t refcount; /* reference count */
+ uint16_t keyid; /* shared key ID */
+ uint8_t deactivated; /* key is deactivated */
+} sctp_sharedkey_t;
+
+LIST_HEAD(sctp_keyhead, sctp_shared_key);
+
+/* authentication chunks list */
+typedef struct sctp_auth_chklist {
+ uint8_t chunks[256];
+ uint8_t num_chunks;
+} sctp_auth_chklist_t;
+
+/* hmac algos supported list */
+typedef struct sctp_hmaclist {
+ uint16_t max_algo; /* max algorithms allocated */
+ uint16_t num_algo; /* num algorithms used */
+ uint16_t hmac[];
+} sctp_hmaclist_t;
+
+/* authentication info */
+typedef struct sctp_authinfo {
+ sctp_key_t *random; /* local random key (concatenated) */
+ uint32_t random_len; /* local random number length for param */
+ sctp_key_t *peer_random;/* peer's random key (concatenated) */
+ sctp_key_t *assoc_key; /* cached concatenated send key */
+ sctp_key_t *recv_key; /* cached concatenated recv key */
+ uint16_t active_keyid; /* active send keyid */
+ uint16_t assoc_keyid; /* current send keyid (cached) */
+ uint16_t recv_keyid; /* last recv keyid (cached) */
+} sctp_authinfo_t;
+
+
+
+/*
+ * Macros
+ */
+#define sctp_auth_is_required_chunk(chunk, list) ((list == NULL) ? (0) : (list->chunks[chunk] != 0))
+
+/*
+ * function prototypes
+ */
+
+/* socket option api functions */
+extern sctp_auth_chklist_t *sctp_alloc_chunklist(void);
+extern void sctp_free_chunklist(sctp_auth_chklist_t * chklist);
+extern void sctp_clear_chunklist(sctp_auth_chklist_t * chklist);
+extern sctp_auth_chklist_t *sctp_copy_chunklist(sctp_auth_chklist_t * chklist);
+extern int sctp_auth_add_chunk(uint8_t chunk, sctp_auth_chklist_t * list);
+extern int sctp_auth_delete_chunk(uint8_t chunk, sctp_auth_chklist_t * list);
+extern size_t sctp_auth_get_chklist_size(const sctp_auth_chklist_t * list);
+extern void sctp_auth_set_default_chunks(sctp_auth_chklist_t * list);
+extern int
+sctp_serialize_auth_chunks(const sctp_auth_chklist_t * list,
+ uint8_t * ptr);
+extern int
+sctp_pack_auth_chunks(const sctp_auth_chklist_t * list,
+ uint8_t * ptr);
+extern int
+sctp_unpack_auth_chunks(const uint8_t * ptr, uint8_t num_chunks,
+ sctp_auth_chklist_t * list);
+
+/* key handling */
+extern sctp_key_t *sctp_alloc_key(uint32_t keylen);
+extern void sctp_free_key(sctp_key_t * key);
+extern void sctp_print_key(sctp_key_t * key, const char *str);
+extern void sctp_show_key(sctp_key_t * key, const char *str);
+extern sctp_key_t *sctp_generate_random_key(uint32_t keylen);
+extern sctp_key_t *sctp_set_key(uint8_t * key, uint32_t keylen);
+extern sctp_key_t *
+sctp_compute_hashkey(sctp_key_t * key1, sctp_key_t * key2,
+ sctp_key_t * shared);
+
+/* shared key handling */
+extern sctp_sharedkey_t *sctp_alloc_sharedkey(void);
+extern void sctp_free_sharedkey(sctp_sharedkey_t * skey);
+extern sctp_sharedkey_t *
+sctp_find_sharedkey(struct sctp_keyhead *shared_keys,
+ uint16_t key_id);
+extern int
+sctp_insert_sharedkey(struct sctp_keyhead *shared_keys,
+ sctp_sharedkey_t * new_skey);
+extern int
+sctp_copy_skeylist(const struct sctp_keyhead *src,
+ struct sctp_keyhead *dest);
+
+/* ref counts on shared keys, by key id */
+extern void sctp_auth_key_acquire(struct sctp_tcb *stcb, uint16_t keyid);
+extern void sctp_auth_key_release(struct sctp_tcb *stcb, uint16_t keyid);
+
+
+/* hmac list handling */
+extern sctp_hmaclist_t *sctp_alloc_hmaclist(uint8_t num_hmacs);
+extern void sctp_free_hmaclist(sctp_hmaclist_t * list);
+extern int sctp_auth_add_hmacid(sctp_hmaclist_t * list, uint16_t hmac_id);
+extern sctp_hmaclist_t *sctp_copy_hmaclist(sctp_hmaclist_t * list);
+extern sctp_hmaclist_t *sctp_default_supported_hmaclist(void);
+extern uint16_t
+sctp_negotiate_hmacid(sctp_hmaclist_t * peer,
+ sctp_hmaclist_t * local);
+extern int sctp_serialize_hmaclist(sctp_hmaclist_t * list, uint8_t * ptr);
+extern int
+sctp_verify_hmac_param(struct sctp_auth_hmac_algo *hmacs,
+ uint32_t num_hmacs);
+
+extern sctp_authinfo_t *sctp_alloc_authinfo(void);
+extern void sctp_free_authinfo(sctp_authinfo_t * authinfo);
+
+/* keyed-HMAC functions */
+extern uint32_t sctp_get_auth_chunk_len(uint16_t hmac_algo);
+extern uint32_t sctp_get_hmac_digest_len(uint16_t hmac_algo);
+extern uint32_t
+sctp_hmac(uint16_t hmac_algo, uint8_t * key, uint32_t keylen,
+ uint8_t * text, uint32_t textlen, uint8_t * digest);
+extern int
+sctp_verify_hmac(uint16_t hmac_algo, uint8_t * key, uint32_t keylen,
+ uint8_t * text, uint32_t textlen, uint8_t * digest, uint32_t digestlen);
+extern uint32_t
+sctp_compute_hmac(uint16_t hmac_algo, sctp_key_t * key,
+ uint8_t * text, uint32_t textlen, uint8_t * digest);
+extern int sctp_auth_is_supported_hmac(sctp_hmaclist_t * list, uint16_t id);
+
+/* mbuf versions */
+extern uint32_t
+sctp_hmac_m(uint16_t hmac_algo, uint8_t * key, uint32_t keylen,
+ struct mbuf *m, uint32_t m_offset, uint8_t * digest, uint32_t trailer);
+extern uint32_t
+sctp_compute_hmac_m(uint16_t hmac_algo, sctp_key_t * key,
+ struct mbuf *m, uint32_t m_offset, uint8_t * digest);
+
+/*
+ * authentication routines
+ */
+extern void sctp_clear_cachedkeys(struct sctp_tcb *stcb, uint16_t keyid);
+extern void sctp_clear_cachedkeys_ep(struct sctp_inpcb *inp, uint16_t keyid);
+extern int sctp_delete_sharedkey(struct sctp_tcb *stcb, uint16_t keyid);
+extern int sctp_delete_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid);
+extern int sctp_auth_setactivekey(struct sctp_tcb *stcb, uint16_t keyid);
+extern int sctp_auth_setactivekey_ep(struct sctp_inpcb *inp, uint16_t keyid);
+extern int sctp_deact_sharedkey(struct sctp_tcb *stcb, uint16_t keyid);
+extern int sctp_deact_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid);
+
+extern void
+sctp_auth_get_cookie_params(struct sctp_tcb *stcb, struct mbuf *m,
+ uint32_t offset, uint32_t length);
+extern void
+sctp_fill_hmac_digest_m(struct mbuf *m, uint32_t auth_offset,
+ struct sctp_auth_chunk *auth, struct sctp_tcb *stcb, uint16_t key_id);
+extern struct mbuf *
+sctp_add_auth_chunk(struct mbuf *m, struct mbuf **m_end,
+ struct sctp_auth_chunk **auth_ret, uint32_t * offset,
+ struct sctp_tcb *stcb, uint8_t chunk);
+extern int
+sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *ch,
+ struct mbuf *m, uint32_t offset);
+extern void
+sctp_notify_authentication(struct sctp_tcb *stcb,
+ uint32_t indication, uint16_t keyid, uint16_t alt_keyid, int so_locked);
+extern int
+sctp_validate_init_auth_params(struct mbuf *m, int offset,
+ int limit);
+extern void
+sctp_initialize_auth_params(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb);
+
+/* test functions */
+#endif /* __SCTP_AUTH_HH__ */
diff --git a/freebsd/sys/netinet/sctp_bsd_addr.c b/freebsd/sys/netinet/sctp_bsd_addr.c
new file mode 100644
index 00000000..8782e681
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_bsd_addr.c
@@ -0,0 +1,562 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_output.c,v 1.46 2005/03/06 16:04:17 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp_var.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctp_header.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_output.h>
+#include <freebsd/netinet/sctp_bsd_addr.h>
+#include <freebsd/netinet/sctp_uio.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_timer.h>
+#include <freebsd/netinet/sctp_asconf.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+#include <freebsd/netinet/sctp_indata.h>
+#include <freebsd/sys/unistd.h>
+
+/* Declare all of our malloc named types */
+MALLOC_DEFINE(SCTP_M_MAP, "sctp_map", "sctp asoc map descriptor");
+MALLOC_DEFINE(SCTP_M_STRMI, "sctp_stri", "sctp stream in array");
+MALLOC_DEFINE(SCTP_M_STRMO, "sctp_stro", "sctp stream out array");
+MALLOC_DEFINE(SCTP_M_ASC_ADDR, "sctp_aadr", "sctp asconf address");
+MALLOC_DEFINE(SCTP_M_ASC_IT, "sctp_a_it", "sctp asconf iterator");
+MALLOC_DEFINE(SCTP_M_AUTH_CL, "sctp_atcl", "sctp auth chunklist");
+MALLOC_DEFINE(SCTP_M_AUTH_KY, "sctp_atky", "sctp auth key");
+MALLOC_DEFINE(SCTP_M_AUTH_HL, "sctp_athm", "sctp auth hmac list");
+MALLOC_DEFINE(SCTP_M_AUTH_IF, "sctp_athi", "sctp auth info");
+MALLOC_DEFINE(SCTP_M_STRESET, "sctp_stre", "sctp stream reset");
+MALLOC_DEFINE(SCTP_M_CMSG, "sctp_cmsg", "sctp CMSG buffer");
+MALLOC_DEFINE(SCTP_M_COPYAL, "sctp_cpal", "sctp copy all");
+MALLOC_DEFINE(SCTP_M_VRF, "sctp_vrf", "sctp vrf struct");
+MALLOC_DEFINE(SCTP_M_IFA, "sctp_ifa", "sctp ifa struct");
+MALLOC_DEFINE(SCTP_M_IFN, "sctp_ifn", "sctp ifn struct");
+MALLOC_DEFINE(SCTP_M_TIMW, "sctp_timw", "sctp time block");
+MALLOC_DEFINE(SCTP_M_MVRF, "sctp_mvrf", "sctp mvrf pcb list");
+MALLOC_DEFINE(SCTP_M_ITER, "sctp_iter", "sctp iterator control");
+MALLOC_DEFINE(SCTP_M_SOCKOPT, "sctp_socko", "sctp socket option");
+
+/* Global NON-VNET structure that controls the iterator */
+struct iterator_control sctp_it_ctl;
+static int __sctp_thread_based_iterator_started = 0;
+
+
+static void
+sctp_cleanup_itqueue(void)
+{
+ struct sctp_iterator *it;
+
+ while ((it = TAILQ_FIRST(&sctp_it_ctl.iteratorhead)) != NULL) {
+ if (it->function_atend != NULL) {
+ (*it->function_atend) (it->pointer, it->val);
+ }
+ TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr);
+ SCTP_FREE(it, SCTP_M_ITER);
+ }
+}
+
+
+void
+sctp_wakeup_iterator(void)
+{
+ wakeup(&sctp_it_ctl.iterator_running);
+}
+
+static void
+sctp_iterator_thread(void *v)
+{
+ SCTP_IPI_ITERATOR_WQ_LOCK();
+ while (1) {
+ msleep(&sctp_it_ctl.iterator_running,
+ &sctp_it_ctl.ipi_iterator_wq_mtx,
+ 0, "waiting_for_work", 0);
+ if (sctp_it_ctl.iterator_flags & SCTP_ITERATOR_MUST_EXIT) {
+ SCTP_IPI_ITERATOR_WQ_DESTROY();
+ SCTP_ITERATOR_LOCK_DESTROY();
+ sctp_cleanup_itqueue();
+ __sctp_thread_based_iterator_started = 0;
+ kthread_exit();
+ }
+ sctp_iterator_worker();
+ }
+}
+
+void
+sctp_startup_iterator(void)
+{
+ if (__sctp_thread_based_iterator_started) {
+ /* You only get one */
+ return;
+ }
+ /* init the iterator head */
+ __sctp_thread_based_iterator_started = 1;
+ sctp_it_ctl.iterator_running = 0;
+ sctp_it_ctl.iterator_flags = 0;
+ sctp_it_ctl.cur_it = NULL;
+ SCTP_ITERATOR_LOCK_INIT();
+ SCTP_IPI_ITERATOR_WQ_INIT();
+ TAILQ_INIT(&sctp_it_ctl.iteratorhead);
+
+ int ret;
+
+ ret = kproc_create(sctp_iterator_thread,
+ (void *)NULL,
+ &sctp_it_ctl.thread_proc,
+ RFPROC,
+ SCTP_KTHREAD_PAGES,
+ SCTP_KTRHEAD_NAME);
+}
+
+#ifdef INET6
+
+void
+sctp_gather_internal_ifa_flags(struct sctp_ifa *ifa)
+{
+ struct in6_ifaddr *ifa6;
+
+ ifa6 = (struct in6_ifaddr *)ifa->ifa;
+ ifa->flags = ifa6->ia6_flags;
+ if (!MODULE_GLOBAL(ip6_use_deprecated)) {
+ if (ifa->flags &
+ IN6_IFF_DEPRECATED) {
+ ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE;
+ } else {
+ ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE;
+ }
+ } else {
+ ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE;
+ }
+ if (ifa->flags &
+ (IN6_IFF_DETACHED |
+ IN6_IFF_ANYCAST |
+ IN6_IFF_NOTREADY)) {
+ ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE;
+ } else {
+ ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE;
+ }
+}
+
+#endif /* INET6 */
+
+
+static uint32_t
+sctp_is_desired_interface_type(struct ifaddr *ifa)
+{
+ int result;
+
+ /* check the interface type to see if it's one we care about */
+ switch (ifa->ifa_ifp->if_type) {
+ case IFT_ETHER:
+ case IFT_ISO88023:
+ case IFT_ISO88024:
+ case IFT_ISO88025:
+ case IFT_ISO88026:
+ case IFT_STARLAN:
+ case IFT_P10:
+ case IFT_P80:
+ case IFT_HY:
+ case IFT_FDDI:
+ case IFT_XETHER:
+ case IFT_ISDNBASIC:
+ case IFT_ISDNPRIMARY:
+ case IFT_PTPSERIAL:
+ case IFT_OTHER:
+ case IFT_PPP:
+ case IFT_LOOP:
+ case IFT_SLIP:
+ case IFT_GIF:
+ case IFT_L2VLAN:
+ case IFT_IP:
+ case IFT_IPOVERCDLC:
+ case IFT_IPOVERCLAW:
+ case IFT_VIRTUALIPADDRESS:
+ result = 1;
+ break;
+ default:
+ result = 0;
+ }
+
+ return (result);
+}
+
+
+
+
+static void
+sctp_init_ifns_for_vrf(int vrfid)
+{
+ /*
+ * Here we must apply ANY locks needed by the IFN we access and also
+ * make sure we lock any IFA that exists as we float through the
+ * list of IFA's
+ */
+ struct ifnet *ifn;
+ struct ifaddr *ifa;
+ struct in6_ifaddr *ifa6;
+ struct sctp_ifa *sctp_ifa;
+ uint32_t ifa_flags;
+
+ IFNET_RLOCK();
+ TAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_list) {
+ IF_ADDR_LOCK(ifn);
+ TAILQ_FOREACH(ifa, &ifn->if_addrlist, ifa_list) {
+ if (ifa->ifa_addr == NULL) {
+ continue;
+ }
+ if ((ifa->ifa_addr->sa_family != AF_INET) && (ifa->ifa_addr->sa_family != AF_INET6)) {
+ /* non inet/inet6 skip */
+ continue;
+ }
+ if (ifa->ifa_addr->sa_family == AF_INET6) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr)) {
+ /* skip unspecified addresses */
+ continue;
+ }
+ } else {
+ if (((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr == 0) {
+ continue;
+ }
+ }
+ if (sctp_is_desired_interface_type(ifa) == 0) {
+ /* non desired type */
+ continue;
+ }
+ if (ifa->ifa_addr->sa_family == AF_INET6) {
+ ifa6 = (struct in6_ifaddr *)ifa;
+ ifa_flags = ifa6->ia6_flags;
+ } else {
+ ifa_flags = 0;
+ }
+ sctp_ifa = sctp_add_addr_to_vrf(vrfid,
+ (void *)ifn,
+ ifn->if_index,
+ ifn->if_type,
+ ifn->if_xname,
+ (void *)ifa,
+ ifa->ifa_addr,
+ ifa_flags,
+ 0);
+ if (sctp_ifa) {
+ sctp_ifa->localifa_flags &= ~SCTP_ADDR_DEFER_USE;
+ }
+ }
+ IF_ADDR_UNLOCK(ifn);
+ }
+ IFNET_RUNLOCK();
+}
+
+void
+sctp_init_vrf_list(int vrfid)
+{
+ if (vrfid > SCTP_MAX_VRF_ID)
+ /* can't do that */
+ return;
+
+ /* Don't care about return here */
+ (void)sctp_allocate_vrf(vrfid);
+
+ /*
+ * Now we need to build all the ifn's for this vrf and their
+ * addresses
+ */
+ sctp_init_ifns_for_vrf(vrfid);
+}
+
+void
+sctp_addr_change(struct ifaddr *ifa, int cmd)
+{
+ uint32_t ifa_flags = 0;
+
+ /*
+ * BSD only has one VRF. If this changes, we will need to hook in the
+ * right things here to get the id to pass to the address management
+ * routine.
+ */
+ if (SCTP_BASE_VAR(first_time) == 0) {
+ /* Special test to see if my ::1 will show up with this */
+ SCTP_BASE_VAR(first_time) = 1;
+ sctp_init_ifns_for_vrf(SCTP_DEFAULT_VRFID);
+ }
+ if ((cmd != RTM_ADD) && (cmd != RTM_DELETE)) {
+ /* don't know what to do with this */
+ return;
+ }
+ if (ifa->ifa_addr == NULL) {
+ return;
+ }
+ if ((ifa->ifa_addr->sa_family != AF_INET) && (ifa->ifa_addr->sa_family != AF_INET6)) {
+ /* non inet/inet6 skip */
+ return;
+ }
+ if (ifa->ifa_addr->sa_family == AF_INET6) {
+ ifa_flags = ((struct in6_ifaddr *)ifa)->ia6_flags;
+ if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr)) {
+ /* skip unspecified addresses */
+ return;
+ }
+ } else {
+ if (((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr == 0) {
+ return;
+ }
+ }
+
+ if (sctp_is_desired_interface_type(ifa) == 0) {
+ /* non desired type */
+ return;
+ }
+ if (cmd == RTM_ADD) {
+ (void)sctp_add_addr_to_vrf(SCTP_DEFAULT_VRFID, (void *)ifa->ifa_ifp,
+ ifa->ifa_ifp->if_index, ifa->ifa_ifp->if_type,
+ ifa->ifa_ifp->if_xname,
+ (void *)ifa, ifa->ifa_addr, ifa_flags, 1);
+ } else {
+
+ sctp_del_addr_from_vrf(SCTP_DEFAULT_VRFID, ifa->ifa_addr,
+ ifa->ifa_ifp->if_index,
+ ifa->ifa_ifp->if_xname
+ );
+ /*
+ * We don't bump refcount here so when it completes the
+ * final delete will happen.
+ */
+ }
+}
+
+void
+sctp_add_or_del_interfaces(int (*pred) (struct ifnet *), int add)
+{
+ struct ifnet *ifn;
+ struct ifaddr *ifa;
+
+ IFNET_RLOCK();
+ TAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_list) {
+ if (!(*pred) (ifn)) {
+ continue;
+ }
+ TAILQ_FOREACH(ifa, &ifn->if_addrlist, ifa_list) {
+ sctp_addr_change(ifa, add ? RTM_ADD : RTM_DELETE);
+ }
+ }
+ IFNET_RUNLOCK();
+}
+
+struct mbuf *
+sctp_get_mbuf_for_msg(unsigned int space_needed, int want_header,
+ int how, int allonebuf, int type)
+{
+ struct mbuf *m = NULL;
+
+ m = m_getm2(NULL, space_needed, how, type, want_header ? M_PKTHDR : 0);
+ if (m == NULL) {
+ /* bad, no memory */
+ return (m);
+ }
+ if (allonebuf) {
+ int siz;
+
+ if (SCTP_BUF_IS_EXTENDED(m)) {
+ siz = SCTP_BUF_EXTEND_SIZE(m);
+ } else {
+ if (want_header)
+ siz = MHLEN;
+ else
+ siz = MLEN;
+ }
+ if (siz < space_needed) {
+ m_freem(m);
+ return (NULL);
+ }
+ }
+ if (SCTP_BUF_NEXT(m)) {
+ sctp_m_freem(SCTP_BUF_NEXT(m));
+ SCTP_BUF_NEXT(m) = NULL;
+ }
+#ifdef SCTP_MBUF_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ if (SCTP_BUF_IS_EXTENDED(m)) {
+ sctp_log_mb(m, SCTP_MBUF_IALLOC);
+ }
+ }
+#endif
+ return (m);
+}
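
As a rough usage sketch only (the call below is illustrative and its parameter values are assumptions, not taken from this patch): callers typically ask this helper for a single contiguous mbuf and check for NULL before touching it.

	struct mbuf *chk_m;

	/* one contiguous buffer (allonebuf = 1) with a packet header attached */
	chk_m = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 1,
	    M_DONTWAIT, 1, MT_DATA);
	if (chk_m == NULL) {
		/* no memory, nothing was allocated */
	}
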
+
+
+#ifdef SCTP_PACKET_LOGGING
+void
+sctp_packet_log(struct mbuf *m, int length)
+{
+ int *lenat, thisone;
+ void *copyto;
+ uint32_t *tick_tock;
+ int total_len;
+ int grabbed_lock = 0;
+ int value, newval, thisend, thisbegin;
+
+ /*
+ * Buffer layout:
+ *  - sizeof this entry (total_len)
+ *  - previous end (value)
+ *  - ticks of log (ticks)
+ *  - the IP packet, as logged
+ *  - where this started (thisbegin)
+ *  x <-- end points here
+ */
+ total_len = SCTP_SIZE32((length + (4 * sizeof(int))));
+ /* Log a packet to the buffer. */
+ if (total_len > SCTP_PACKET_LOG_SIZE) {
+ /* Can't log this packet, the buffer is not big enough */
+ return;
+ }
+ if (length < (int)(SCTP_MIN_V4_OVERHEAD + sizeof(struct sctp_cookie_ack_chunk))) {
+ return;
+ }
+ atomic_add_int(&SCTP_BASE_VAR(packet_log_writers), 1);
+try_again:
+ if (SCTP_BASE_VAR(packet_log_writers) > SCTP_PKTLOG_WRITERS_NEED_LOCK) {
+ SCTP_IP_PKTLOG_LOCK();
+ grabbed_lock = 1;
+again_locked:
+ value = SCTP_BASE_VAR(packet_log_end);
+ newval = SCTP_BASE_VAR(packet_log_end) + total_len;
+ if (newval >= SCTP_PACKET_LOG_SIZE) {
+ /* we wrapped */
+ thisbegin = 0;
+ thisend = total_len;
+ } else {
+ thisbegin = SCTP_BASE_VAR(packet_log_end);
+ thisend = newval;
+ }
+ if (!(atomic_cmpset_int(&SCTP_BASE_VAR(packet_log_end), value, thisend))) {
+ goto again_locked;
+ }
+ } else {
+ value = SCTP_BASE_VAR(packet_log_end);
+ newval = SCTP_BASE_VAR(packet_log_end) + total_len;
+ if (newval >= SCTP_PACKET_LOG_SIZE) {
+ /* we wrapped */
+ thisbegin = 0;
+ thisend = total_len;
+ } else {
+ thisbegin = SCTP_BASE_VAR(packet_log_end);
+ thisend = newval;
+ }
+ if (!(atomic_cmpset_int(&SCTP_BASE_VAR(packet_log_end), value, thisend))) {
+ goto try_again;
+ }
+ }
+ /* Sanity check */
+ if (thisend >= SCTP_PACKET_LOG_SIZE) {
+ printf("Insanity stops a log thisbegin:%d thisend:%d writers:%d lock:%d end:%d\n",
+ thisbegin,
+ thisend,
+ SCTP_BASE_VAR(packet_log_writers),
+ grabbed_lock,
+ SCTP_BASE_VAR(packet_log_end));
+ SCTP_BASE_VAR(packet_log_end) = 0;
+ goto no_log;
+
+ }
+ lenat = (int *)&SCTP_BASE_VAR(packet_log_buffer)[thisbegin];
+ *lenat = total_len;
+ lenat++;
+ *lenat = value;
+ lenat++;
+ tick_tock = (uint32_t *) lenat;
+ lenat++;
+ *tick_tock = sctp_get_tick_count();
+ copyto = (void *)lenat;
+ thisone = thisend - sizeof(int);
+ lenat = (int *)&SCTP_BASE_VAR(packet_log_buffer)[thisone];
+ *lenat = thisbegin;
+ if (grabbed_lock) {
+ SCTP_IP_PKTLOG_UNLOCK();
+ grabbed_lock = 0;
+ }
+ m_copydata(m, 0, length, (caddr_t)copyto);
+no_log:
+ if (grabbed_lock) {
+ SCTP_IP_PKTLOG_UNLOCK();
+ }
+ atomic_subtract_int(&SCTP_BASE_VAR(packet_log_writers), 1);
+}
+
+
+int
+sctp_copy_out_packet_log(uint8_t * target, int length)
+{
+ /*
+ * We wind through the packet log starting at the beginning, copying up to
+ * length bytes out. We return the number of bytes copied.
+ */
+ int tocopy, this_copy;
+ int *lenat;
+ int did_delay = 0;
+
+ tocopy = length;
+ if (length < (int)(2 * sizeof(int))) {
+ /* not enough room */
+ return (0);
+ }
+ if (SCTP_PKTLOG_WRITERS_NEED_LOCK) {
+ atomic_add_int(&SCTP_BASE_VAR(packet_log_writers), SCTP_PKTLOG_WRITERS_NEED_LOCK);
+again:
+ if ((did_delay == 0) && (SCTP_BASE_VAR(packet_log_writers) != SCTP_PKTLOG_WRITERS_NEED_LOCK)) {
+ /*
+ * we delay here for just a moment hoping the
+ * writer(s) that were present when we entered will
+ * have left and we only have locking ones that will
+ * contend with us for the lock. This does not
+ * assure 100% access, but it's good enough for a
+ * logging facility like this.
+ */
+ did_delay = 1;
+ DELAY(10);
+ goto again;
+ }
+ }
+ SCTP_IP_PKTLOG_LOCK();
+ lenat = (int *)target;
+ *lenat = SCTP_BASE_VAR(packet_log_end);
+ lenat++;
+ this_copy = min((length - sizeof(int)), SCTP_PACKET_LOG_SIZE);
+ memcpy((void *)lenat, (void *)SCTP_BASE_VAR(packet_log_buffer), this_copy);
+ if (SCTP_PKTLOG_WRITERS_NEED_LOCK) {
+ atomic_subtract_int(&SCTP_BASE_VAR(packet_log_writers),
+ SCTP_PKTLOG_WRITERS_NEED_LOCK);
+ }
+ SCTP_IP_PKTLOG_UNLOCK();
+ return (this_copy + sizeof(int));
+}
+
+#endif
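
For orientation, here is a minimal decoding sketch derived only from the entry layout described in sctp_packet_log() above; the helper name and the assumption that the buffer was just filled by sctp_copy_out_packet_log() are hypothetical, not part of the file.

	/*
	 * Hypothetical example: decode the newest entry from a buffer that
	 * sctp_copy_out_packet_log() filled in (first int = packet_log_end,
	 * followed by the raw log bytes).
	 */
	static void
	example_decode_newest_entry(uint8_t *buf)
	{
		uint8_t *log = buf + sizeof(int);
		int end, begin, total_len, prev_end;
		uint32_t ticks;
		uint8_t *pkt;

		end = *(int *)buf;                           /* packet_log_end at copy time */
		begin = *(int *)(log + end - sizeof(int));   /* trailing "thisbegin" field */
		total_len = *(int *)(log + begin);
		prev_end = *(int *)(log + begin + sizeof(int));
		ticks = *(uint32_t *)(log + begin + 2 * sizeof(int));
		pkt = log + begin + 3 * sizeof(int);
		/* the packet data occupies total_len - 4 * sizeof(int) bytes,
		 * padded up to a 32-bit boundary by SCTP_SIZE32() */
		(void)prev_end; (void)ticks; (void)pkt;
	}
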
diff --git a/freebsd/sys/netinet/sctp_bsd_addr.h b/freebsd/sys/netinet/sctp_bsd_addr.h
new file mode 100644
index 00000000..67d65dc6
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_bsd_addr.h
@@ -0,0 +1,63 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_bsd_addr_h__
+#define __sctp_bsd_addr_h__
+#include <freebsd/netinet/sctp_pcb.h>
+
+#if defined(_KERNEL) || defined(__Userspace__)
+
+extern struct iterator_control sctp_it_ctl;
+void sctp_wakeup_iterator(void);
+
+void sctp_startup_iterator(void);
+
+
+#ifdef INET6
+void sctp_gather_internal_ifa_flags(struct sctp_ifa *ifa);
+
+#endif
+
+#ifdef SCTP_PACKET_LOGGING
+
+void sctp_packet_log(struct mbuf *m, int length);
+int sctp_copy_out_packet_log(uint8_t * target, int length);
+
+#endif
+
+void sctp_addr_change(struct ifaddr *ifa, int cmd);
+
+void sctp_add_or_del_interfaces(int (*pred) (struct ifnet *), int add);
+
+#endif
+#endif
diff --git a/freebsd/sys/netinet/sctp_cc_functions.c b/freebsd/sys/netinet/sctp_cc_functions.c
new file mode 100644
index 00000000..668fd673
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_cc_functions.c
@@ -0,0 +1,1565 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp_var.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctp_header.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_output.h>
+#include <freebsd/netinet/sctp_input.h>
+#include <freebsd/netinet/sctp_indata.h>
+#include <freebsd/netinet/sctp_uio.h>
+#include <freebsd/netinet/sctp_timer.h>
+#include <freebsd/netinet/sctp_auth.h>
+#include <freebsd/netinet/sctp_asconf.h>
+#include <freebsd/netinet/sctp_cc_functions.h>
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+void
+sctp_set_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ struct sctp_association *assoc;
+ uint32_t cwnd_in_mtu;
+
+ assoc = &stcb->asoc;
+ /*
+ * We take the minimum of the burst limit and the initial congestion
+ * window. The initial congestion window is at least two times the
+ * MTU.
+ */
+ cwnd_in_mtu = SCTP_BASE_SYSCTL(sctp_initial_cwnd);
+ if ((assoc->max_burst > 0) && (cwnd_in_mtu > assoc->max_burst))
+ cwnd_in_mtu = assoc->max_burst;
+ net->cwnd = (net->mtu - sizeof(struct sctphdr)) * cwnd_in_mtu;
+ net->ssthresh = assoc->peers_rwnd;
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) &
+ (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) {
+ sctp_log_cwnd(stcb, net, 0, SCTP_CWND_INITIALIZATION);
+ }
+}
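
To make the arithmetic above concrete (all numbers chosen purely for illustration): with a path MTU of 1500 bytes the 12-byte common SCTP header leaves 1488 bytes per MTU; if sctp_initial_cwnd is 3 and the association's max_burst is 4, cwnd_in_mtu stays at 3, giving an initial cwnd of 3 * 1488 = 4464 bytes, while ssthresh starts out at the peer's advertised receive window.
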
+
+void
+sctp_cwnd_update_after_fr(struct sctp_tcb *stcb,
+ struct sctp_association *asoc)
+{
+ struct sctp_nets *net;
+
+ /*-
+ * CMT fast recovery code. Need to debug. ((sctp_cmt_on_off == 1) &&
+ * (net->fast_retran_loss_recovery == 0)))
+ */
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ if ((asoc->fast_retran_loss_recovery == 0) ||
+ (asoc->sctp_cmt_on_off == 1)) {
+ /* out of an RFC2582 Fast recovery window? */
+ if (net->net_ack > 0) {
+ /*
+ * per section 7.2.3, are there any
+ * destinations that had a fast retransmit
+ * to them? If so, what we need to do is
+ * adjust ssthresh and cwnd.
+ */
+ struct sctp_tmit_chunk *lchk;
+ int old_cwnd = net->cwnd;
+
+ net->ssthresh = net->cwnd / 2;
+ if (net->ssthresh < (net->mtu * 2)) {
+ net->ssthresh = 2 * net->mtu;
+ }
+ net->cwnd = net->ssthresh;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd),
+ SCTP_CWND_LOG_FROM_FR);
+ }
+ lchk = TAILQ_FIRST(&asoc->send_queue);
+
+ net->partial_bytes_acked = 0;
+ /* Turn on fast recovery window */
+ asoc->fast_retran_loss_recovery = 1;
+ if (lchk == NULL) {
+ /* Mark end of the window */
+ asoc->fast_recovery_tsn = asoc->sending_seq - 1;
+ } else {
+ asoc->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1;
+ }
+
+ /*
+ * CMT fast recovery -- per destination
+ * recovery variable.
+ */
+ net->fast_retran_loss_recovery = 1;
+
+ if (lchk == NULL) {
+ /* Mark end of the window */
+ net->fast_recovery_tsn = asoc->sending_seq - 1;
+ } else {
+ net->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1;
+ }
+
+ /*
+ * Disable Nonce Sum Checking and store the
+ * resync tsn
+ */
+ asoc->nonce_sum_check = 0;
+ asoc->nonce_resync_tsn = asoc->fast_recovery_tsn + 1;
+
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND,
+ stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32);
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND,
+ stcb->sctp_ep, stcb, net);
+ }
+ } else if (net->net_ack > 0) {
+ /*
+ * Mark a peg that we WOULD have done a cwnd
+ * reduction but RFC2582 prevented this action.
+ */
+ SCTP_STAT_INCR(sctps_fastretransinrtt);
+ }
+ }
+}
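
A small worked example of the adjustment above (numbers invented): a destination with cwnd = 20000 bytes and MTU = 1500 that enters fast recovery gets ssthresh = cwnd / 2 = 10000 bytes and cwnd pulled down to that value; with a cwnd of only 2500 bytes the 2 * MTU floor applies instead, leaving ssthresh = cwnd = 3000 bytes.
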
+
+void
+sctp_cwnd_update_after_sack(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ int accum_moved, int reneged_all, int will_exit)
+{
+ struct sctp_nets *net;
+
+ /******************************/
+ /* update cwnd and Early FR */
+ /******************************/
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+
+#ifdef JANA_CMT_FAST_RECOVERY
+ /*
+ * CMT fast recovery code. Need to debug.
+ */
+ if (net->fast_retran_loss_recovery && net->new_pseudo_cumack) {
+ if (compare_with_wrap(asoc->last_acked_seq,
+ net->fast_recovery_tsn, MAX_TSN) ||
+ (asoc->last_acked_seq == net->fast_recovery_tsn) ||
+ compare_with_wrap(net->pseudo_cumack, net->fast_recovery_tsn, MAX_TSN) ||
+ (net->pseudo_cumack == net->fast_recovery_tsn)) {
+ net->will_exit_fast_recovery = 1;
+ }
+ }
+#endif
+ if (SCTP_BASE_SYSCTL(sctp_early_fr)) {
+ /*
+ * So, first of all, do we need to have an Early FR
+ * timer running?
+ */
+ if ((!TAILQ_EMPTY(&asoc->sent_queue) &&
+ (net->ref_count > 1) &&
+ (net->flight_size < net->cwnd)) ||
+ (reneged_all)) {
+ /*
+ * yes, so in this case stop it if it's
+ * running, and then restart it. Reneging
+ * all is a special case where we want to
+ * run the Early FR timer and then force the
+ * last few unacked to be sent, causing us
+ * to elicit a sack with gaps to force out
+ * the others.
+ */
+ if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+ SCTP_STAT_INCR(sctps_earlyfrstpidsck2);
+ sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_20);
+ }
+ SCTP_STAT_INCR(sctps_earlyfrstrid);
+ sctp_timer_start(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net);
+ } else {
+ /* No, stop it if it's running */
+ if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+ SCTP_STAT_INCR(sctps_earlyfrstpidsck3);
+ sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_21);
+ }
+ }
+ }
+ /* if nothing was acked on this destination skip it */
+ if (net->net_ack == 0) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, 0, SCTP_CWND_LOG_FROM_SACK);
+ }
+ continue;
+ }
+ if (net->net_ack2 > 0) {
+ /*
+ * Karn's rule applies to clearing error count, this
+ * is optional.
+ */
+ net->error_count = 0;
+ if ((net->dest_state & SCTP_ADDR_NOT_REACHABLE) ==
+ SCTP_ADDR_NOT_REACHABLE) {
+ /* addr came good */
+ net->dest_state &= ~SCTP_ADDR_NOT_REACHABLE;
+ net->dest_state |= SCTP_ADDR_REACHABLE;
+ sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb,
+ SCTP_RECEIVED_SACK, (void *)net, SCTP_SO_NOT_LOCKED);
+ /* now was it the primary? if so restore */
+ if (net->dest_state & SCTP_ADDR_WAS_PRIMARY) {
+ (void)sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net);
+ }
+ }
+ /*
+ * JRS 5/14/07 - If CMT PF is on and the destination
+ * is in PF state, set the destination to active
+ * state and set the cwnd to one or two MTU's based
+ * on whether PF1 or PF2 is being used.
+ *
+ * Should we stop any running T3 timer here?
+ */
+ if ((asoc->sctp_cmt_on_off == 1) &&
+ (asoc->sctp_cmt_pf > 0) &&
+ ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF)) {
+ net->dest_state &= ~SCTP_ADDR_PF;
+ net->cwnd = net->mtu * asoc->sctp_cmt_pf;
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Destination %p moved from PF to reachable with cwnd %d.\n",
+ net, net->cwnd);
+ /*
+ * Since the cwnd value is explicitly set,
+ * skip the code that updates the cwnd
+ * value.
+ */
+ goto skip_cwnd_update;
+ }
+ }
+#ifdef JANA_CMT_FAST_RECOVERY
+ /*
+ * CMT fast recovery code
+ */
+ /*
+ * if (sctp_cmt_on_off == 1 &&
+ * net->fast_retran_loss_recovery &&
+ * net->will_exit_fast_recovery == 0) { @@@ Do something }
+ * else if (sctp_cmt_on_off == 0 &&
+ * asoc->fast_retran_loss_recovery && will_exit == 0) {
+ */
+#endif
+
+ if (asoc->fast_retran_loss_recovery &&
+ (will_exit == 0) &&
+ (asoc->sctp_cmt_on_off == 0)) {
+ /*
+ * If we are in loss recovery we skip any cwnd
+ * update
+ */
+ goto skip_cwnd_update;
+ }
+ /*
+ * CMT: CUC algorithm. Update cwnd if pseudo-cumack has
+ * moved.
+ */
+ if (accum_moved ||
+ ((asoc->sctp_cmt_on_off == 1) && net->new_pseudo_cumack)) {
+ /* If the cumulative ack moved we can proceed */
+ if (net->cwnd <= net->ssthresh) {
+ /* We are in slow start */
+ if (net->flight_size + net->net_ack >= net->cwnd) {
+ if (net->net_ack > (net->mtu * SCTP_BASE_SYSCTL(sctp_L2_abc_variable))) {
+ net->cwnd += (net->mtu * SCTP_BASE_SYSCTL(sctp_L2_abc_variable));
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->mtu,
+ SCTP_CWND_LOG_FROM_SS);
+ }
+ } else {
+ net->cwnd += net->net_ack;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->net_ack,
+ SCTP_CWND_LOG_FROM_SS);
+ }
+ }
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->net_ack,
+ SCTP_CWND_LOG_NOADV_SS);
+ }
+ }
+ } else {
+ /* We are in congestion avoidance */
+ /*
+ * Add to pba
+ */
+ net->partial_bytes_acked += net->net_ack;
+
+ if ((net->flight_size + net->net_ack >= net->cwnd) &&
+ (net->partial_bytes_acked >= net->cwnd)) {
+ net->partial_bytes_acked -= net->cwnd;
+ net->cwnd += net->mtu;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->mtu,
+ SCTP_CWND_LOG_FROM_CA);
+ }
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->net_ack,
+ SCTP_CWND_LOG_NOADV_CA);
+ }
+ }
+ }
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->mtu,
+ SCTP_CWND_LOG_NO_CUMACK);
+ }
+ }
+skip_cwnd_update:
+ /*
+ * NOW, according to Karn's rule do we need to restore the
+ * RTO timer back? Check our net_ack2. If not set then we
+ * have an ambiguity, i.e. all data ack'd was sent to more
+ * than one place.
+ */
+ if (net->net_ack2) {
+ /* restore any doubled timers */
+ net->RTO = ((net->lastsa >> 2) + net->lastsv) >> 1;
+ if (net->RTO < stcb->asoc.minrto) {
+ net->RTO = stcb->asoc.minrto;
+ }
+ if (net->RTO > stcb->asoc.maxrto) {
+ net->RTO = stcb->asoc.maxrto;
+ }
+ }
+ }
+}
+
+void
+sctp_cwnd_update_after_timeout(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ int old_cwnd = net->cwnd;
+
+ net->ssthresh = max(net->cwnd / 2, 4 * net->mtu);
+ net->cwnd = net->mtu;
+ net->partial_bytes_acked = 0;
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->cwnd - old_cwnd, SCTP_CWND_LOG_FROM_RTX);
+ }
+}
+
+void
+sctp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ int old_cwnd = net->cwnd;
+
+ SCTP_STAT_INCR(sctps_ecnereducedcwnd);
+ net->ssthresh = net->cwnd / 2;
+ if (net->ssthresh < net->mtu) {
+ net->ssthresh = net->mtu;
+ /* here back off the timer as well, to slow us down */
+ net->RTO <<= 1;
+ }
+ net->cwnd = net->ssthresh;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SAT);
+ }
+}
+
+void
+sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb,
+ struct sctp_nets *net, struct sctp_pktdrop_chunk *cp,
+ uint32_t * bottle_bw, uint32_t * on_queue)
+{
+ uint32_t bw_avail;
+ int rtt, incr;
+ int old_cwnd = net->cwnd;
+
+ /* need real RTT for this calc */
+ rtt = ((net->lastsa >> 2) + net->lastsv) >> 1;
+ /* get bottle neck bw */
+ *bottle_bw = ntohl(cp->bottle_bw);
+ /* and what's on the queue */
+ *on_queue = ntohl(cp->current_onq);
+ /*
+ * adjust the on-queue value if our flight is more; it could be that the
+ * router has not yet gotten data "in-flight" to it
+ */
+ if (*on_queue < net->flight_size)
+ *on_queue = net->flight_size;
+ /* calculate the available space */
+ bw_avail = (*bottle_bw * rtt) / 1000;
+ if (bw_avail > *bottle_bw) {
+ /*
+ * Cap the growth to no more than the bottle neck. This can
+ * happen as RTT slides up due to queues. It also means if
+ * you have more than a 1 second RTT with an empty queue you
+ * will be limited to the bottle_bw per second no matter if
+ * other points have 1/2 the RTT and you could get more
+ * out...
+ */
+ bw_avail = *bottle_bw;
+ }
+ if (*on_queue > bw_avail) {
+ /*
+ * No room for anything else, so don't allow anything else to be
+ * "added to the fire".
+ */
+ int seg_inflight, seg_onqueue, my_portion;
+
+ net->partial_bytes_acked = 0;
+
+ /* how much are we over queue size? */
+ incr = *on_queue - bw_avail;
+ if (stcb->asoc.seen_a_sack_this_pkt) {
+ /*
+ * undo any cwnd adjustment that the sack might have
+ * made
+ */
+ net->cwnd = net->prev_cwnd;
+ }
+ /* Now how much of that is mine? */
+ seg_inflight = net->flight_size / net->mtu;
+ seg_onqueue = *on_queue / net->mtu;
+ my_portion = (incr * seg_inflight) / seg_onqueue;
+
+ /* Have I made an adjustment already */
+ if (net->cwnd > net->flight_size) {
+ /*
+ * for this flight I made an adjustment, so we need to
+ * decrease the portion by a share of our previous
+ * adjustment.
+ */
+ int diff_adj;
+
+ diff_adj = net->cwnd - net->flight_size;
+ if (diff_adj > my_portion)
+ my_portion = 0;
+ else
+ my_portion -= diff_adj;
+ }
+ /*
+ * back down to the previous cwnd (assume we have had a sack
+ * before this packet), minus whatever portion of the
+ * overage is my fault.
+ */
+ net->cwnd -= my_portion;
+
+ /* we will NOT back down more than 1 MTU */
+ if (net->cwnd <= net->mtu) {
+ net->cwnd = net->mtu;
+ }
+ /* force into CA */
+ net->ssthresh = net->cwnd - 1;
+ } else {
+ /*
+ * Take 1/4 of the space left or the max burst, whichever
+ * is less.
+ */
+ incr = min((bw_avail - *on_queue) >> 2,
+ stcb->asoc.max_burst * net->mtu);
+ net->cwnd += incr;
+ }
+ if (net->cwnd > bw_avail) {
+ /* We can't exceed the pipe size */
+ net->cwnd = bw_avail;
+ }
+ if (net->cwnd < net->mtu) {
+ /* We always have 1 MTU */
+ net->cwnd = net->mtu;
+ }
+ if (net->cwnd - old_cwnd != 0) {
+ /* log only changes */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd),
+ SCTP_CWND_LOG_FROM_SAT);
+ }
+ }
+}
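
A rough worked example of the packet-drop calculation above (numbers invented, assuming bottle_bw is reported in bytes per second and the RTT term is in milliseconds): with bottle_bw = 1000000 and rtt = 100, bw_avail is 100000 bytes. If 120000 bytes are reported on queue while our flight size is 60000 bytes, we are 20000 bytes over; our 40 MTU-sized segments in flight are half of the 80 segments queued (MTU 1500), so my_portion = 20000 * 40 / 80 = 10000 bytes, cwnd is backed off by that amount (never below one MTU), and ssthresh is set just under cwnd to force congestion avoidance.
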
+
+void
+sctp_cwnd_update_after_output(struct sctp_tcb *stcb,
+ struct sctp_nets *net, int burst_limit)
+{
+ int old_cwnd = net->cwnd;
+
+ if (net->ssthresh < net->cwnd)
+ net->ssthresh = net->cwnd;
+ net->cwnd = (net->flight_size + (burst_limit * net->mtu));
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_BRST);
+ }
+}
+
+void
+sctp_cwnd_update_after_fr_timer(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ int old_cwnd = net->cwnd;
+
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_EARLY_FR_TMR, SCTP_SO_NOT_LOCKED);
+ /*
+ * make a small adjustment to cwnd and force to CA.
+ */
+ if (net->cwnd > net->mtu)
+ /* drop down one MTU after sending */
+ net->cwnd -= net->mtu;
+ if (net->cwnd < net->ssthresh)
+ /* still in SS move to CA */
+ net->ssthresh = net->cwnd - 1;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, (old_cwnd - net->cwnd), SCTP_CWND_LOG_FROM_FR);
+ }
+}
+
+struct sctp_hs_raise_drop {
+ int32_t cwnd;
+ int32_t increase;
+ int32_t drop_percent;
+};
+
+#define SCTP_HS_TABLE_SIZE 73
+
+struct sctp_hs_raise_drop sctp_cwnd_adjust[SCTP_HS_TABLE_SIZE] = {
+ {38, 1, 50}, /* 0 */
+ {118, 2, 44}, /* 1 */
+ {221, 3, 41}, /* 2 */
+ {347, 4, 38}, /* 3 */
+ {495, 5, 37}, /* 4 */
+ {663, 6, 35}, /* 5 */
+ {851, 7, 34}, /* 6 */
+ {1058, 8, 33}, /* 7 */
+ {1284, 9, 32}, /* 8 */
+ {1529, 10, 31}, /* 9 */
+ {1793, 11, 30}, /* 10 */
+ {2076, 12, 29}, /* 11 */
+ {2378, 13, 28}, /* 12 */
+ {2699, 14, 28}, /* 13 */
+ {3039, 15, 27}, /* 14 */
+ {3399, 16, 27}, /* 15 */
+ {3778, 17, 26}, /* 16 */
+ {4177, 18, 26}, /* 17 */
+ {4596, 19, 25}, /* 18 */
+ {5036, 20, 25}, /* 19 */
+ {5497, 21, 24}, /* 20 */
+ {5979, 22, 24}, /* 21 */
+ {6483, 23, 23}, /* 22 */
+ {7009, 24, 23}, /* 23 */
+ {7558, 25, 22}, /* 24 */
+ {8130, 26, 22}, /* 25 */
+ {8726, 27, 22}, /* 26 */
+ {9346, 28, 21}, /* 27 */
+ {9991, 29, 21}, /* 28 */
+ {10661, 30, 21}, /* 29 */
+ {11358, 31, 20}, /* 30 */
+ {12082, 32, 20}, /* 31 */
+ {12834, 33, 20}, /* 32 */
+ {13614, 34, 19}, /* 33 */
+ {14424, 35, 19}, /* 34 */
+ {15265, 36, 19}, /* 35 */
+ {16137, 37, 19}, /* 36 */
+ {17042, 38, 18}, /* 37 */
+ {17981, 39, 18}, /* 38 */
+ {18955, 40, 18}, /* 39 */
+ {19965, 41, 17}, /* 40 */
+ {21013, 42, 17}, /* 41 */
+ {22101, 43, 17}, /* 42 */
+ {23230, 44, 17}, /* 43 */
+ {24402, 45, 16}, /* 44 */
+ {25618, 46, 16}, /* 45 */
+ {26881, 47, 16}, /* 46 */
+ {28193, 48, 16}, /* 47 */
+ {29557, 49, 15}, /* 48 */
+ {30975, 50, 15}, /* 49 */
+ {32450, 51, 15}, /* 50 */
+ {33986, 52, 15}, /* 51 */
+ {35586, 53, 14}, /* 52 */
+ {37253, 54, 14}, /* 53 */
+ {38992, 55, 14}, /* 54 */
+ {40808, 56, 14}, /* 55 */
+ {42707, 57, 13}, /* 56 */
+ {44694, 58, 13}, /* 57 */
+ {46776, 59, 13}, /* 58 */
+ {48961, 60, 13}, /* 59 */
+ {51258, 61, 13}, /* 60 */
+ {53677, 62, 12}, /* 61 */
+ {56230, 63, 12}, /* 62 */
+ {58932, 64, 12}, /* 63 */
+ {61799, 65, 12}, /* 64 */
+ {64851, 66, 11}, /* 65 */
+ {68113, 67, 11}, /* 66 */
+ {71617, 68, 11}, /* 67 */
+ {75401, 69, 10}, /* 68 */
+ {79517, 70, 10}, /* 69 */
+ {84035, 71, 10}, /* 70 */
+ {89053, 72, 10}, /* 71 */
+ {94717, 73, 9} /* 72 */
+};
+
+static void
+sctp_hs_cwnd_increase(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ int cur_val, i, indx, incr;
+
+ cur_val = net->cwnd >> 10;
+ indx = SCTP_HS_TABLE_SIZE - 1;
+#ifdef SCTP_DEBUG
+ printf("HS CC called.\n");
+#endif
+ if (cur_val < sctp_cwnd_adjust[0].cwnd) {
+ /* normal mode */
+ if (net->net_ack > net->mtu) {
+ net->cwnd += net->mtu;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_FROM_SS);
+ }
+ } else {
+ net->cwnd += net->net_ack;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->net_ack, SCTP_CWND_LOG_FROM_SS);
+ }
+ }
+ } else {
+ for (i = net->last_hs_used; i < SCTP_HS_TABLE_SIZE; i++) {
+ if (cur_val < sctp_cwnd_adjust[i].cwnd) {
+ indx = i;
+ break;
+ }
+ }
+ net->last_hs_used = indx;
+ incr = ((sctp_cwnd_adjust[indx].increase) << 10);
+ net->cwnd += incr;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, incr, SCTP_CWND_LOG_FROM_SS);
+ }
+ }
+}
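
To illustrate the table lookup above (values picked arbitrarily): cur_val is the cwnd expressed in 1024-byte units, so a cwnd of 5242880 bytes gives cur_val = 5120; scanning from last_hs_used stops at row 20 ({5497, 21, 24}), so a qualifying SACK grows the window by 21 << 10 = 21504 bytes, and a later fast retransmit handled by sctp_hs_cwnd_decrease() would cut it by that row's drop_percent, i.e. 24 percent.
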
+
+static void
+sctp_hs_cwnd_decrease(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ int cur_val, i, indx;
+ int old_cwnd = net->cwnd;
+
+ cur_val = net->cwnd >> 10;
+ if (cur_val < sctp_cwnd_adjust[0].cwnd) {
+ /* normal mode */
+ net->ssthresh = net->cwnd / 2;
+ if (net->ssthresh < (net->mtu * 2)) {
+ net->ssthresh = 2 * net->mtu;
+ }
+ net->cwnd = net->ssthresh;
+ } else {
+ /* drop by the proper amount */
+ net->ssthresh = net->cwnd - (int)((net->cwnd / 100) *
+ sctp_cwnd_adjust[net->last_hs_used].drop_percent);
+ net->cwnd = net->ssthresh;
+ /* now where are we */
+ indx = net->last_hs_used;
+ cur_val = net->cwnd >> 10;
+ /* reset where we are in the table */
+ if (cur_val < sctp_cwnd_adjust[0].cwnd) {
+ /* fell out of hs */
+ net->last_hs_used = 0;
+ } else {
+ for (i = indx; i >= 1; i--) {
+ if (cur_val > sctp_cwnd_adjust[i - 1].cwnd) {
+ break;
+ }
+ }
+ net->last_hs_used = indx;
+ }
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_FR);
+ }
+}
+
+void
+sctp_hs_cwnd_update_after_fr(struct sctp_tcb *stcb,
+ struct sctp_association *asoc)
+{
+ struct sctp_nets *net;
+
+ /*
+ * CMT fast recovery code. Need to debug. ((sctp_cmt_on_off == 1) &&
+ * (net->fast_retran_loss_recovery == 0)))
+ */
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ if ((asoc->fast_retran_loss_recovery == 0) ||
+ (asoc->sctp_cmt_on_off == 1)) {
+ /* out of an RFC2582 Fast recovery window? */
+ if (net->net_ack > 0) {
+ /*
+ * per section 7.2.3, are there any
+ * destinations that had a fast retransmit
+ * to them? If so, what we need to do is
+ * adjust ssthresh and cwnd.
+ */
+ struct sctp_tmit_chunk *lchk;
+
+ sctp_hs_cwnd_decrease(stcb, net);
+
+ lchk = TAILQ_FIRST(&asoc->send_queue);
+
+ net->partial_bytes_acked = 0;
+ /* Turn on fast recovery window */
+ asoc->fast_retran_loss_recovery = 1;
+ if (lchk == NULL) {
+ /* Mark end of the window */
+ asoc->fast_recovery_tsn = asoc->sending_seq - 1;
+ } else {
+ asoc->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1;
+ }
+
+ /*
+ * CMT fast recovery -- per destination
+ * recovery variable.
+ */
+ net->fast_retran_loss_recovery = 1;
+
+ if (lchk == NULL) {
+ /* Mark end of the window */
+ net->fast_recovery_tsn = asoc->sending_seq - 1;
+ } else {
+ net->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1;
+ }
+
+ /*
+ * Disable Nonce Sum Checking and store the
+ * resync tsn
+ */
+ asoc->nonce_sum_check = 0;
+ asoc->nonce_resync_tsn = asoc->fast_recovery_tsn + 1;
+
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND,
+ stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32);
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND,
+ stcb->sctp_ep, stcb, net);
+ }
+ } else if (net->net_ack > 0) {
+ /*
+ * Mark a peg that we WOULD have done a cwnd
+ * reduction but RFC2582 prevented this action.
+ */
+ SCTP_STAT_INCR(sctps_fastretransinrtt);
+ }
+ }
+}
+
+void
+sctp_hs_cwnd_update_after_sack(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ int accum_moved, int reneged_all, int will_exit)
+{
+ struct sctp_nets *net;
+
+ /******************************/
+ /* update cwnd and Early FR */
+ /******************************/
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+
+#ifdef JANA_CMT_FAST_RECOVERY
+ /*
+ * CMT fast recovery code. Need to debug.
+ */
+ if (net->fast_retran_loss_recovery && net->new_pseudo_cumack) {
+ if (compare_with_wrap(asoc->last_acked_seq,
+ net->fast_recovery_tsn, MAX_TSN) ||
+ (asoc->last_acked_seq == net->fast_recovery_tsn) ||
+ compare_with_wrap(net->pseudo_cumack, net->fast_recovery_tsn, MAX_TSN) ||
+ (net->pseudo_cumack == net->fast_recovery_tsn)) {
+ net->will_exit_fast_recovery = 1;
+ }
+ }
+#endif
+ if (SCTP_BASE_SYSCTL(sctp_early_fr)) {
+ /*
+ * So, first of all, do we need to have an Early FR
+ * timer running?
+ */
+ if ((!TAILQ_EMPTY(&asoc->sent_queue) &&
+ (net->ref_count > 1) &&
+ (net->flight_size < net->cwnd)) ||
+ (reneged_all)) {
+ /*
+ * yes, so in this case stop it if it's
+ * running, and then restart it. Reneging
+ * all is a special case where we want to
+ * run the Early FR timer and then force the
+ * last few unacked to be sent, causing us
+ * to elicit a sack with gaps to force out
+ * the others.
+ */
+ if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+ SCTP_STAT_INCR(sctps_earlyfrstpidsck2);
+ sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_20);
+ }
+ SCTP_STAT_INCR(sctps_earlyfrstrid);
+ sctp_timer_start(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net);
+ } else {
+ /* No, stop it if it's running */
+ if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+ SCTP_STAT_INCR(sctps_earlyfrstpidsck3);
+ sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_21);
+ }
+ }
+ }
+ /* if nothing was acked on this destination skip it */
+ if (net->net_ack == 0) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, 0, SCTP_CWND_LOG_FROM_SACK);
+ }
+ continue;
+ }
+ if (net->net_ack2 > 0) {
+ /*
+ * Karn's rule applies to clearing error count, this
+ * is optional.
+ */
+ net->error_count = 0;
+ if ((net->dest_state & SCTP_ADDR_NOT_REACHABLE) ==
+ SCTP_ADDR_NOT_REACHABLE) {
+ /* addr came good */
+ net->dest_state &= ~SCTP_ADDR_NOT_REACHABLE;
+ net->dest_state |= SCTP_ADDR_REACHABLE;
+ sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb,
+ SCTP_RECEIVED_SACK, (void *)net, SCTP_SO_NOT_LOCKED);
+ /* now was it the primary? if so restore */
+ if (net->dest_state & SCTP_ADDR_WAS_PRIMARY) {
+ (void)sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net);
+ }
+ }
+ /*
+ * JRS 5/14/07 - If CMT PF is on and the destination
+ * is in PF state, set the destination to active
+ * state and set the cwnd to one or two MTU's based
+ * on whether PF1 or PF2 is being used.
+ *
+ * Should we stop any running T3 timer here?
+ */
+ if ((asoc->sctp_cmt_on_off == 1) &&
+ (asoc->sctp_cmt_pf > 0) &&
+ ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF)) {
+ net->dest_state &= ~SCTP_ADDR_PF;
+ net->cwnd = net->mtu * asoc->sctp_cmt_pf;
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Destination %p moved from PF to reachable with cwnd %d.\n",
+ net, net->cwnd);
+ /*
+ * Since the cwnd value is explicitly set,
+ * skip the code that updates the cwnd
+ * value.
+ */
+ goto skip_cwnd_update;
+ }
+ }
+#ifdef JANA_CMT_FAST_RECOVERY
+ /*
+ * CMT fast recovery code
+ */
+ /*
+ * if (sctp_cmt_on_off == 1 &&
+ * net->fast_retran_loss_recovery &&
+ * net->will_exit_fast_recovery == 0) { @@@ Do something }
+ * else if (sctp_cmt_on_off == 0 &&
+ * asoc->fast_retran_loss_recovery && will_exit == 0) {
+ */
+#endif
+
+ if (asoc->fast_retran_loss_recovery &&
+ (will_exit == 0) &&
+ (asoc->sctp_cmt_on_off == 0)) {
+ /*
+ * If we are in loss recovery we skip any cwnd
+ * update
+ */
+ goto skip_cwnd_update;
+ }
+ /*
+ * CMT: CUC algorithm. Update cwnd if pseudo-cumack has
+ * moved.
+ */
+ if (accum_moved ||
+ ((asoc->sctp_cmt_on_off == 1) && net->new_pseudo_cumack)) {
+ /* If the cumulative ack moved we can proceed */
+ if (net->cwnd <= net->ssthresh) {
+ /* We are in slow start */
+ if (net->flight_size + net->net_ack >= net->cwnd) {
+
+ sctp_hs_cwnd_increase(stcb, net);
+
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->net_ack,
+ SCTP_CWND_LOG_NOADV_SS);
+ }
+ }
+ } else {
+ /* We are in congestion avoidance */
+ net->partial_bytes_acked += net->net_ack;
+ if ((net->flight_size + net->net_ack >= net->cwnd) &&
+ (net->partial_bytes_acked >= net->cwnd)) {
+ net->partial_bytes_acked -= net->cwnd;
+ net->cwnd += net->mtu;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->mtu,
+ SCTP_CWND_LOG_FROM_CA);
+ }
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->net_ack,
+ SCTP_CWND_LOG_NOADV_CA);
+ }
+ }
+ }
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->mtu,
+ SCTP_CWND_LOG_NO_CUMACK);
+ }
+ }
+skip_cwnd_update:
+ /*
+ * NOW, according to Karn's rule do we need to restore the
+ * RTO timer back? Check our net_ack2. If not set then we
+ * have an ambiguity, i.e. all data ack'd was sent to more
+ * than one place.
+ */
+ if (net->net_ack2) {
+ /* restore any doubled timers */
+ net->RTO = ((net->lastsa >> 2) + net->lastsv) >> 1;
+ if (net->RTO < stcb->asoc.minrto) {
+ net->RTO = stcb->asoc.minrto;
+ }
+ if (net->RTO > stcb->asoc.maxrto) {
+ net->RTO = stcb->asoc.maxrto;
+ }
+ }
+ }
+}
+
+
+/*
+ * H-TCP congestion control. The algorithm is detailed in:
+ * R.N.Shorten, D.J.Leith:
+ * "H-TCP: TCP for high-speed and long-distance networks"
+ * Proc. PFLDnet, Argonne, 2004.
+ * http://www.hamilton.ie/net/htcp3.pdf
+ */
+
+
+static int use_rtt_scaling = 1;
+static int use_bandwidth_switch = 1;
+
+static inline int
+between(uint32_t seq1, uint32_t seq2, uint32_t seq3)
+{
+ return seq3 - seq2 >= seq1 - seq2;
+}
+
+static inline uint32_t
+htcp_cong_time(struct htcp *ca)
+{
+ return sctp_get_tick_count() - ca->last_cong;
+}
+
+static inline uint32_t
+htcp_ccount(struct htcp *ca)
+{
+ return htcp_cong_time(ca) / ca->minRTT;
+}
+
+static inline void
+htcp_reset(struct htcp *ca)
+{
+ ca->undo_last_cong = ca->last_cong;
+ ca->undo_maxRTT = ca->maxRTT;
+ ca->undo_old_maxB = ca->old_maxB;
+ ca->last_cong = sctp_get_tick_count();
+}
+
+#ifdef SCTP_NOT_USED
+
+static uint32_t
+htcp_cwnd_undo(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ net->htcp_ca.last_cong = net->htcp_ca.undo_last_cong;
+ net->htcp_ca.maxRTT = net->htcp_ca.undo_maxRTT;
+ net->htcp_ca.old_maxB = net->htcp_ca.undo_old_maxB;
+ return max(net->cwnd, ((net->ssthresh / net->mtu << 7) / net->htcp_ca.beta) * net->mtu);
+}
+
+#endif
+
+static inline void
+measure_rtt(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ uint32_t srtt = net->lastsa >> 3;
+
+ /* keep track of minimum RTT seen so far, minRTT is zero at first */
+ if (net->htcp_ca.minRTT > srtt || !net->htcp_ca.minRTT)
+ net->htcp_ca.minRTT = srtt;
+
+ /* max RTT */
+ if (net->fast_retran_ip == 0 && net->ssthresh < 0xFFFF && htcp_ccount(&net->htcp_ca) > 3) {
+ if (net->htcp_ca.maxRTT < net->htcp_ca.minRTT)
+ net->htcp_ca.maxRTT = net->htcp_ca.minRTT;
+ if (net->htcp_ca.maxRTT < srtt && srtt <= net->htcp_ca.maxRTT + MSEC_TO_TICKS(20))
+ net->htcp_ca.maxRTT = srtt;
+ }
+}
+
+static void
+measure_achieved_throughput(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ uint32_t now = sctp_get_tick_count();
+
+ if (net->fast_retran_ip == 0)
+ net->htcp_ca.bytes_acked = net->net_ack;
+
+ if (!use_bandwidth_switch)
+ return;
+
+ /* achieved throughput calculations */
+ /* JRS - not 100% sure of this statement */
+ if (net->fast_retran_ip == 1) {
+ net->htcp_ca.bytecount = 0;
+ net->htcp_ca.lasttime = now;
+ return;
+ }
+ net->htcp_ca.bytecount += net->net_ack;
+
+ if (net->htcp_ca.bytecount >= net->cwnd - ((net->htcp_ca.alpha >> 7 ? : 1) * net->mtu)
+ && now - net->htcp_ca.lasttime >= net->htcp_ca.minRTT
+ && net->htcp_ca.minRTT > 0) {
+ uint32_t cur_Bi = net->htcp_ca.bytecount / net->mtu * hz / (now - net->htcp_ca.lasttime);
+
+ if (htcp_ccount(&net->htcp_ca) <= 3) {
+ /* just after backoff */
+ net->htcp_ca.minB = net->htcp_ca.maxB = net->htcp_ca.Bi = cur_Bi;
+ } else {
+ net->htcp_ca.Bi = (3 * net->htcp_ca.Bi + cur_Bi) / 4;
+ if (net->htcp_ca.Bi > net->htcp_ca.maxB)
+ net->htcp_ca.maxB = net->htcp_ca.Bi;
+ if (net->htcp_ca.minB > net->htcp_ca.maxB)
+ net->htcp_ca.minB = net->htcp_ca.maxB;
+ }
+ net->htcp_ca.bytecount = 0;
+ net->htcp_ca.lasttime = now;
+ }
+}
+
+static inline void
+htcp_beta_update(struct htcp *ca, uint32_t minRTT, uint32_t maxRTT)
+{
+ if (use_bandwidth_switch) {
+ uint32_t maxB = ca->maxB;
+ uint32_t old_maxB = ca->old_maxB;
+
+ ca->old_maxB = ca->maxB;
+
+ if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) {
+ ca->beta = BETA_MIN;
+ ca->modeswitch = 0;
+ return;
+ }
+ }
+ if (ca->modeswitch && minRTT > (uint32_t) MSEC_TO_TICKS(10) && maxRTT) {
+ ca->beta = (minRTT << 7) / maxRTT;
+ if (ca->beta < BETA_MIN)
+ ca->beta = BETA_MIN;
+ else if (ca->beta > BETA_MAX)
+ ca->beta = BETA_MAX;
+ } else {
+ ca->beta = BETA_MIN;
+ ca->modeswitch = 1;
+ }
+}
+
+static inline void
+htcp_alpha_update(struct htcp *ca)
+{
+ uint32_t minRTT = ca->minRTT;
+ uint32_t factor = 1;
+ uint32_t diff = htcp_cong_time(ca);
+
+ if (diff > (uint32_t) hz) {
+ diff -= hz;
+ factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / hz)) / hz;
+ }
+ if (use_rtt_scaling && minRTT) {
+ uint32_t scale = (hz << 3) / (10 * minRTT);
+
+ scale = min(max(scale, 1U << 2), 10U << 3); /* clamping ratio to
+ * interval [0.5,10]<<3 */
+ factor = (factor << 3) / scale;
+ if (!factor)
+ factor = 1;
+ }
+ ca->alpha = 2 * factor * ((1 << 7) - ca->beta);
+ if (!ca->alpha)
+ ca->alpha = ALPHA_BASE;
+}
+
+/* After we have the rtt data to calculate beta, we'd still prefer to wait one
+ * rtt before we adjust our beta to ensure we are working from consistent
+ * data.
+ *
+ * This function should be called when we hit a congestion event since only at
+ * that point do we really have a real sense of maxRTT (the queues en route
+ * were getting just too full now).
+ */
+static void
+htcp_param_update(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ uint32_t minRTT = net->htcp_ca.minRTT;
+ uint32_t maxRTT = net->htcp_ca.maxRTT;
+
+ htcp_beta_update(&net->htcp_ca, minRTT, maxRTT);
+ htcp_alpha_update(&net->htcp_ca);
+
+ /*
+ * add slowly fading memory for maxRTT to accommodate routing
+ * changes etc
+ */
+ if (minRTT > 0 && maxRTT > minRTT)
+ net->htcp_ca.maxRTT = minRTT + ((maxRTT - minRTT) * 95) / 100;
+}
+
+static uint32_t
+htcp_recalc_ssthresh(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ htcp_param_update(stcb, net);
+ return max(((net->cwnd / net->mtu * net->htcp_ca.beta) >> 7) * net->mtu, 2U * net->mtu);
+}
+
+static void
+htcp_cong_avoid(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ /*-
+ * How to handle these functions?
+ * if (!tcp_is_cwnd_limited(sk, in_flight)) RRS - good question.
+ * return;
+ */
+ if (net->cwnd <= net->ssthresh) {
+ /* We are in slow start */
+ if (net->flight_size + net->net_ack >= net->cwnd) {
+ if (net->net_ack > (net->mtu * SCTP_BASE_SYSCTL(sctp_L2_abc_variable))) {
+ net->cwnd += (net->mtu * SCTP_BASE_SYSCTL(sctp_L2_abc_variable));
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->mtu,
+ SCTP_CWND_LOG_FROM_SS);
+ }
+ } else {
+ net->cwnd += net->net_ack;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->net_ack,
+ SCTP_CWND_LOG_FROM_SS);
+ }
+ }
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->net_ack,
+ SCTP_CWND_LOG_NOADV_SS);
+ }
+ }
+ } else {
+ measure_rtt(stcb, net);
+
+ /*
+ * In dangerous area, increase slowly. In theory this is
+ * net->cwnd += alpha / net->cwnd
+ */
+ /* What is snd_cwnd_cnt?? */
+ if (((net->partial_bytes_acked / net->mtu * net->htcp_ca.alpha) >> 7) * net->mtu >= net->cwnd) {
+ /*-
+ * Does SCTP have a cwnd clamp?
+ * if (net->snd_cwnd < net->snd_cwnd_clamp) - Nope (RRS).
+ */
+ net->cwnd += net->mtu;
+ net->partial_bytes_acked = 0;
+ htcp_alpha_update(&net->htcp_ca);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->mtu,
+ SCTP_CWND_LOG_FROM_CA);
+ }
+ } else {
+ net->partial_bytes_acked += net->net_ack;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->net_ack,
+ SCTP_CWND_LOG_NOADV_CA);
+ }
+ }
+
+ net->htcp_ca.bytes_acked = net->mtu;
+ }
+}
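
Reading the fixed-point arithmetic above (alpha and beta are kept scaled by 128, i.e. shifted left by 7): at each congestion event, when the RTT data qualifies, beta becomes roughly 128 * minRTT / maxRTT clamped to [BETA_MIN, BETA_MAX] (otherwise BETA_MIN), and htcp_recalc_ssthresh() shrinks the window to about cwnd * beta / 128 but never below two MTUs; in congestion avoidance the test in htcp_cong_avoid() adds one MTU once partial_bytes_acked * alpha / 128 reaches cwnd, which amounts to a growth of roughly alpha / 128 MTUs per RTT, matching the "net->cwnd += alpha / net->cwnd" comment.
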
+
+#ifdef SCTP_NOT_USED
+/* Lower bound on congestion window. */
+static uint32_t
+htcp_min_cwnd(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ return net->ssthresh;
+}
+
+#endif
+
+static void
+htcp_init(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ memset(&net->htcp_ca, 0, sizeof(struct htcp));
+ net->htcp_ca.alpha = ALPHA_BASE;
+ net->htcp_ca.beta = BETA_MIN;
+ net->htcp_ca.bytes_acked = net->mtu;
+ net->htcp_ca.last_cong = sctp_get_tick_count();
+}
+
+void
+sctp_htcp_set_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ /*
+ * We take the max of the burst limit times a MTU or the
+ * INITIAL_CWND. We then limit this to 4 MTU's of sending.
+ */
+ net->cwnd = min((net->mtu * 4), max((2 * net->mtu), SCTP_INITIAL_CWND));
+ net->ssthresh = stcb->asoc.peers_rwnd;
+ htcp_init(stcb, net);
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) {
+ sctp_log_cwnd(stcb, net, 0, SCTP_CWND_INITIALIZATION);
+ }
+}
+
+void
+sctp_htcp_cwnd_update_after_sack(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ int accum_moved, int reneged_all, int will_exit)
+{
+ struct sctp_nets *net;
+
+ /******************************/
+ /* update cwnd and Early FR */
+ /******************************/
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+
+#ifdef JANA_CMT_FAST_RECOVERY
+ /*
+ * CMT fast recovery code. Need to debug.
+ */
+ if (net->fast_retran_loss_recovery && net->new_pseudo_cumack) {
+ if (compare_with_wrap(asoc->last_acked_seq,
+ net->fast_recovery_tsn, MAX_TSN) ||
+ (asoc->last_acked_seq == net->fast_recovery_tsn) ||
+ compare_with_wrap(net->pseudo_cumack, net->fast_recovery_tsn, MAX_TSN) ||
+ (net->pseudo_cumack == net->fast_recovery_tsn)) {
+ net->will_exit_fast_recovery = 1;
+ }
+ }
+#endif
+ if (SCTP_BASE_SYSCTL(sctp_early_fr)) {
+ /*
+ * So, first of all, do we need to have an Early FR
+ * timer running?
+ */
+ if ((!TAILQ_EMPTY(&asoc->sent_queue) &&
+ (net->ref_count > 1) &&
+ (net->flight_size < net->cwnd)) ||
+ (reneged_all)) {
+ /*
+ * yes, so in this case stop it if it's
+ * running, and then restart it. Reneging
+ * all is a special case where we want to
+ * run the Early FR timer and then force the
+ * last few unacked to be sent, causing us
+ * to elicit a sack with gaps to force out
+ * the others.
+ */
+ if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+ SCTP_STAT_INCR(sctps_earlyfrstpidsck2);
+ sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_20);
+ }
+ SCTP_STAT_INCR(sctps_earlyfrstrid);
+ sctp_timer_start(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net);
+ } else {
+ /* No, stop it if it's running */
+ if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+ SCTP_STAT_INCR(sctps_earlyfrstpidsck3);
+ sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_21);
+ }
+ }
+ }
+ /* if nothing was acked on this destination skip it */
+ if (net->net_ack == 0) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, 0, SCTP_CWND_LOG_FROM_SACK);
+ }
+ continue;
+ }
+ if (net->net_ack2 > 0) {
+ /*
+ * Karn's rule applies to clearing error count, this
+ * is optional.
+ */
+ net->error_count = 0;
+ if ((net->dest_state & SCTP_ADDR_NOT_REACHABLE) ==
+ SCTP_ADDR_NOT_REACHABLE) {
+ /* addr came good */
+ net->dest_state &= ~SCTP_ADDR_NOT_REACHABLE;
+ net->dest_state |= SCTP_ADDR_REACHABLE;
+ sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb,
+ SCTP_RECEIVED_SACK, (void *)net, SCTP_SO_NOT_LOCKED);
+ /* now was it the primary? if so restore */
+ if (net->dest_state & SCTP_ADDR_WAS_PRIMARY) {
+ (void)sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net);
+ }
+ }
+ /*
+ * JRS 5/14/07 - If CMT PF is on and the destination
+ * is in PF state, set the destination to active
+ * state and set the cwnd to one or two MTU's based
+ * on whether PF1 or PF2 is being used.
+ *
+ * Should we stop any running T3 timer here?
+ */
+ if ((asoc->sctp_cmt_on_off == 1) &&
+ (asoc->sctp_cmt_pf > 0) &&
+ ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF)) {
+ net->dest_state &= ~SCTP_ADDR_PF;
+ net->cwnd = net->mtu * asoc->sctp_cmt_pf;
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Destination %p moved from PF to reachable with cwnd %d.\n",
+ net, net->cwnd);
+ /*
+ * Since the cwnd value is explicitly set,
+ * skip the code that updates the cwnd
+ * value.
+ */
+ goto skip_cwnd_update;
+ }
+ }
+#ifdef JANA_CMT_FAST_RECOVERY
+ /*
+ * CMT fast recovery code
+ */
+ /*
+ * if (sctp_cmt_on_off == 1 &&
+ * net->fast_retran_loss_recovery &&
+ * net->will_exit_fast_recovery == 0) { @@@ Do something }
+ * else if (sctp_cmt_on_off == 0 &&
+ * asoc->fast_retran_loss_recovery && will_exit == 0) {
+ */
+#endif
+
+ if (asoc->fast_retran_loss_recovery &&
+ will_exit == 0 &&
+ (asoc->sctp_cmt_on_off == 0)) {
+ /*
+ * If we are in loss recovery we skip any cwnd
+ * update
+ */
+ goto skip_cwnd_update;
+ }
+ /*
+ * CMT: CUC algorithm. Update cwnd if pseudo-cumack has
+ * moved.
+ */
+ if (accum_moved ||
+ ((asoc->sctp_cmt_on_off == 1) && net->new_pseudo_cumack)) {
+ htcp_cong_avoid(stcb, net);
+ measure_achieved_throughput(stcb, net);
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->mtu,
+ SCTP_CWND_LOG_NO_CUMACK);
+ }
+ }
+skip_cwnd_update:
+ /*
+ * NOW, according to Karn's rule, do we need to restore the
+ * RTO timer? Check our net_ack2. If it is not set then we
+ * have an ambiguity, i.e. all data ack'd was sent to more
+ * than one place.
+ */
+ if (net->net_ack2) {
+ /* restore any doubled timers */
+ net->RTO = ((net->lastsa >> 2) + net->lastsv) >> 1;
+ if (net->RTO < stcb->asoc.minrto) {
+ net->RTO = stcb->asoc.minrto;
+ }
+ if (net->RTO > stcb->asoc.maxrto) {
+ net->RTO = stcb->asoc.maxrto;
+ }
+ }
+ }
+}
+
+void
+sctp_htcp_cwnd_update_after_fr(struct sctp_tcb *stcb,
+ struct sctp_association *asoc)
+{
+ struct sctp_nets *net;
+
+ /*
+ * CMT fast recovery code. Need to debug. ((sctp_cmt_on_off == 1) &&
+ * (net->fast_retran_loss_recovery == 0)))
+ */
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ if ((asoc->fast_retran_loss_recovery == 0) ||
+ (asoc->sctp_cmt_on_off == 1)) {
+ /* out of an RFC2582 Fast recovery window? */
+ if (net->net_ack > 0) {
+ /*
+ * Per section 7.2.3, are there any
+ * destinations that had a fast retransmit
+ * sent to them? If so, we need to
+ * adjust ssthresh and cwnd.
+ */
+ struct sctp_tmit_chunk *lchk;
+ int old_cwnd = net->cwnd;
+
+ /* JRS - reset as if state were changed */
+ htcp_reset(&net->htcp_ca);
+ net->ssthresh = htcp_recalc_ssthresh(stcb, net);
+ net->cwnd = net->ssthresh;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd),
+ SCTP_CWND_LOG_FROM_FR);
+ }
+ lchk = TAILQ_FIRST(&asoc->send_queue);
+
+ net->partial_bytes_acked = 0;
+ /* Turn on fast recovery window */
+ asoc->fast_retran_loss_recovery = 1;
+ if (lchk == NULL) {
+ /* Mark end of the window */
+ asoc->fast_recovery_tsn = asoc->sending_seq - 1;
+ } else {
+ asoc->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1;
+ }
+
+ /*
+ * CMT fast recovery -- per destination
+ * recovery variable.
+ */
+ net->fast_retran_loss_recovery = 1;
+
+ if (lchk == NULL) {
+ /* Mark end of the window */
+ net->fast_recovery_tsn = asoc->sending_seq - 1;
+ } else {
+ net->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1;
+ }
+
+ /*
+ * Disable Nonce Sum Checking and store the
+ * resync tsn
+ */
+ asoc->nonce_sum_check = 0;
+ asoc->nonce_resync_tsn = asoc->fast_recovery_tsn + 1;
+
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND,
+ stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32);
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND,
+ stcb->sctp_ep, stcb, net);
+ }
+ } else if (net->net_ack > 0) {
+ /*
+ * Mark a peg that we WOULD have done a cwnd
+ * reduction but RFC2582 prevented this action.
+ */
+ SCTP_STAT_INCR(sctps_fastretransinrtt);
+ }
+ }
+}
+
+void
+sctp_htcp_cwnd_update_after_timeout(struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ int old_cwnd = net->cwnd;
+
+ /* JRS - reset as if the state were being changed to timeout */
+ htcp_reset(&net->htcp_ca);
+ net->ssthresh = htcp_recalc_ssthresh(stcb, net);
+ net->cwnd = net->mtu;
+ net->partial_bytes_acked = 0;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->cwnd - old_cwnd, SCTP_CWND_LOG_FROM_RTX);
+ }
+}
+
+void
+sctp_htcp_cwnd_update_after_fr_timer(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ int old_cwnd;
+
+ old_cwnd = net->cwnd;
+
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_EARLY_FR_TMR, SCTP_SO_NOT_LOCKED);
+ net->htcp_ca.last_cong = sctp_get_tick_count();
+ /*
+ * make a small adjustment to cwnd and force to CA.
+ */
+ if (net->cwnd > net->mtu)
+ /* drop down one MTU after sending */
+ net->cwnd -= net->mtu;
+ if (net->cwnd < net->ssthresh)
+ /* still in SS move to CA */
+ net->ssthresh = net->cwnd - 1;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, (old_cwnd - net->cwnd), SCTP_CWND_LOG_FROM_FR);
+ }
+}
+
+void
+sctp_htcp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ int old_cwnd;
+
+ old_cwnd = net->cwnd;
+
+ /* JRS - reset hctp as if state changed */
+ htcp_reset(&net->htcp_ca);
+ SCTP_STAT_INCR(sctps_ecnereducedcwnd);
+ net->ssthresh = htcp_recalc_ssthresh(stcb, net);
+ if (net->ssthresh < net->mtu) {
+ net->ssthresh = net->mtu;
+ /* here back off the timer as well, to slow us down */
+ net->RTO <<= 1;
+ }
+ net->cwnd = net->ssthresh;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SAT);
+ }
+}
diff --git a/freebsd/sys/netinet/sctp_cc_functions.h b/freebsd/sys/netinet/sctp_cc_functions.h
new file mode 100644
index 00000000..3b95d7de
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_cc_functions.h
@@ -0,0 +1,116 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_cc_functions_h__
+#define __sctp_cc_functions_h__
+
+#if defined(_KERNEL) || defined(__Userspace__)
+
+void
+sctp_set_initial_cc_param(struct sctp_tcb *stcb,
+ struct sctp_nets *net);
+
+void
+sctp_cwnd_update_after_fr(struct sctp_tcb *stcb,
+ struct sctp_association *asoc);
+
+void
+sctp_cwnd_update_after_sack(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ int accum_moved, int reneged_all, int will_exit);
+
+void
+sctp_cwnd_update_after_timeout(struct sctp_tcb *stcb,
+ struct sctp_nets *net);
+
+void
+sctp_hs_cwnd_update_after_fr(struct sctp_tcb *stcb,
+ struct sctp_association *asoc);
+
+void
+sctp_hs_cwnd_update_after_sack(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ int accum_moved, int reneged_all, int will_exit);
+
+void
+sctp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb,
+ struct sctp_nets *net);
+
+void
+sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb,
+ struct sctp_nets *net, struct sctp_pktdrop_chunk *cp,
+ uint32_t * bottle_bw, uint32_t * on_queue);
+
+void
+sctp_cwnd_update_after_output(struct sctp_tcb *stcb,
+ struct sctp_nets *net, int burst_limit);
+
+void
+sctp_cwnd_update_after_fr_timer(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb, struct sctp_nets *net);
+
+/*
+ * HTCP algorithms are directly taken from
+ * R.N.Shorten and D.J.Leith and are the outcome of
+ * a Cisco-URP grant to enhance HTCP for satellite
+ * communications. We use the BSD License
+ * granted for their source and have modified their
+ * algorithms to fit within the SCTP BSD framework.
+ */
+
+void
+sctp_htcp_set_initial_cc_param(struct sctp_tcb *stcb,
+ struct sctp_nets *net);
+
+void
+sctp_htcp_cwnd_update_after_fr(struct sctp_tcb *stcb,
+ struct sctp_association *asoc);
+
+void
+sctp_htcp_cwnd_update_after_sack(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ int accum_moved, int reneged_all, int will_exit);
+
+void
+sctp_htcp_cwnd_update_after_timeout(struct sctp_tcb *stcb,
+ struct sctp_nets *net);
+
+void
+sctp_htcp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb,
+ struct sctp_nets *net);
+
+void
+sctp_htcp_cwnd_update_after_fr_timer(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb, struct sctp_nets *net);
+
+#endif
+#endif
diff --git a/freebsd/sys/netinet/sctp_constants.h b/freebsd/sys/netinet/sctp_constants.h
new file mode 100644
index 00000000..c4f4be23
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_constants.h
@@ -0,0 +1,1051 @@
+/*-
+ * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_constants.h,v 1.17 2005/03/06 16:04:17 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_constants_h__
+#define __sctp_constants_h__
+
+/* IANA assigned port number for SCTP over UDP encapsulation */
+/* For FreeBSD we cannot bind the port at
+ * startup; if we did, we would not really
+ * be bound. The user must set it via the
+ * sysctl... or we would need to build a
+ * special timer for this that allows us to
+ * wait 1 second or so after the system
+ * comes up.
+ */
+#define SCTP_OVER_UDP_TUNNELING_PORT 0
+/* Number of packets to get before sack sent by default */
+#define SCTP_DEFAULT_SACK_FREQ 2
+
+/* Address limit - This variable is calculated
+ * based on a 65535 byte max IP packet. We take out 100 bytes
+ * for the cookie, 40 bytes for a v6 header and 32
+ * bytes for the init structure. A second init structure
+ * for the init-ack and then finally a third one for the
+ * embedded init. This yields 100+40+(3 * 32) = 236 bytes.
+ * This leaves 65299 bytes for addresses; we throw out the 299 bytes.
+ * Now whatever we send in the INIT we need to allow to come back in the
+ * INIT-ACK, plus all the values from INIT and INIT-ACK
+ * listed in the cookie. Plus we need some overhead for
+ * possibly copied parameters in the COOKIE. If we
+ * allow 1080 addresses, and each side has 1080 V6 addresses,
+ * that will be 21600 bytes. In the INIT-ACK we will
+ * see the 21600 of the INIT-ACK + 43200 in the cookie. This leaves
+ * about 500 bytes slack for misc things in the cookie.
+ */
+#define SCTP_ADDRESS_LIMIT 1080
+
+/* We need at least 2k of space for ourselves; INITs
+ * larger than that we abort.
+ */
+#define SCTP_LARGEST_INIT_ACCEPTED (65535 - 2048)
+
+/* Number of addresses where we just skip the counting */
+#define SCTP_COUNT_LIMIT 40
+
+#define SCTP_ZERO_COPY_TICK_DELAY (((100 * hz) + 999) / 1000)
+#define SCTP_ZERO_COPY_SENDQ_TICK_DELAY (((100 * hz) + 999) / 1000)
+
+/* Number of ticks to delay before running
+ * iterator on an address change.
+ */
+#define SCTP_ADDRESS_TICK_DELAY 2
+
+#define SCTP_VERSION_STRING "KAME-BSD 1.1"
+/* #define SCTP_AUDITING_ENABLED 1 used for debug/auditing */
+#define SCTP_AUDIT_SIZE 256
+
+
+#define SCTP_KTRHEAD_NAME "sctp_iterator"
+#define SCTP_KTHREAD_PAGES 0
+
+
+/* If you support Multi-VRF, how big to
+ * make the initial array of VRFs.
+ */
+#define SCTP_DEFAULT_VRF_SIZE 4
+
+/* constants for rto calc */
+#define sctp_align_safe_nocopy 0
+#define sctp_align_unsafe_makecopy 1
+
+/* JRS - Values defined for the HTCP algorithm */
+#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
+#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
+#define BETA_MAX 102 /* 0.8 with shift << 7 */
+
+/* Places that CWND log can happen from */
+#define SCTP_CWND_LOG_FROM_FR 1
+#define SCTP_CWND_LOG_FROM_RTX 2
+#define SCTP_CWND_LOG_FROM_BRST 3
+#define SCTP_CWND_LOG_FROM_SS 4
+#define SCTP_CWND_LOG_FROM_CA 5
+#define SCTP_CWND_LOG_FROM_SAT 6
+#define SCTP_BLOCK_LOG_INTO_BLK 7
+#define SCTP_BLOCK_LOG_OUTOF_BLK 8
+#define SCTP_BLOCK_LOG_CHECK 9
+#define SCTP_STR_LOG_FROM_INTO_STRD 10
+#define SCTP_STR_LOG_FROM_IMMED_DEL 11
+#define SCTP_STR_LOG_FROM_INSERT_HD 12
+#define SCTP_STR_LOG_FROM_INSERT_MD 13
+#define SCTP_STR_LOG_FROM_INSERT_TL 14
+#define SCTP_STR_LOG_FROM_MARK_TSN 15
+#define SCTP_STR_LOG_FROM_EXPRS_DEL 16
+#define SCTP_FR_LOG_BIGGEST_TSNS 17
+#define SCTP_FR_LOG_STRIKE_TEST 18
+#define SCTP_FR_LOG_STRIKE_CHUNK 19
+#define SCTP_FR_T3_TIMEOUT 20
+#define SCTP_MAP_PREPARE_SLIDE 21
+#define SCTP_MAP_SLIDE_FROM 22
+#define SCTP_MAP_SLIDE_RESULT 23
+#define SCTP_MAP_SLIDE_CLEARED 24
+#define SCTP_MAP_SLIDE_NONE 25
+#define SCTP_FR_T3_MARK_TIME 26
+#define SCTP_FR_T3_MARKED 27
+#define SCTP_FR_T3_STOPPED 28
+#define SCTP_FR_MARKED 30
+#define SCTP_CWND_LOG_NOADV_SS 31
+#define SCTP_CWND_LOG_NOADV_CA 32
+#define SCTP_MAX_BURST_APPLIED 33
+#define SCTP_MAX_IFP_APPLIED 34
+#define SCTP_MAX_BURST_ERROR_STOP 35
+#define SCTP_INCREASE_PEER_RWND 36
+#define SCTP_DECREASE_PEER_RWND 37
+#define SCTP_SET_PEER_RWND_VIA_SACK 38
+#define SCTP_LOG_MBCNT_INCREASE 39
+#define SCTP_LOG_MBCNT_DECREASE 40
+#define SCTP_LOG_MBCNT_CHKSET 41
+#define SCTP_LOG_NEW_SACK 42
+#define SCTP_LOG_TSN_ACKED 43
+#define SCTP_LOG_TSN_REVOKED 44
+#define SCTP_LOG_LOCK_TCB 45
+#define SCTP_LOG_LOCK_INP 46
+#define SCTP_LOG_LOCK_SOCK 47
+#define SCTP_LOG_LOCK_SOCKBUF_R 48
+#define SCTP_LOG_LOCK_SOCKBUF_S 49
+#define SCTP_LOG_LOCK_CREATE 50
+#define SCTP_LOG_INITIAL_RTT 51
+#define SCTP_LOG_RTTVAR 52
+#define SCTP_LOG_SBALLOC 53
+#define SCTP_LOG_SBFREE 54
+#define SCTP_LOG_SBRESULT 55
+#define SCTP_FR_DUPED 56
+#define SCTP_FR_MARKED_EARLY 57
+#define SCTP_FR_CWND_REPORT 58
+#define SCTP_FR_CWND_REPORT_START 59
+#define SCTP_FR_CWND_REPORT_STOP 60
+#define SCTP_CWND_LOG_FROM_SEND 61
+#define SCTP_CWND_INITIALIZATION 62
+#define SCTP_CWND_LOG_FROM_T3 63
+#define SCTP_CWND_LOG_FROM_SACK 64
+#define SCTP_CWND_LOG_NO_CUMACK 65
+#define SCTP_CWND_LOG_FROM_RESEND 66
+#define SCTP_FR_LOG_CHECK_STRIKE 67
+#define SCTP_SEND_NOW_COMPLETES 68
+#define SCTP_CWND_LOG_FILL_OUTQ_CALLED 69
+#define SCTP_CWND_LOG_FILL_OUTQ_FILLS 70
+#define SCTP_LOG_FREE_SENT 71
+#define SCTP_NAGLE_APPLIED 72
+#define SCTP_NAGLE_SKIPPED 73
+#define SCTP_WAKESND_FROM_SACK 74
+#define SCTP_WAKESND_FROM_FWDTSN 75
+#define SCTP_NOWAKE_FROM_SACK 76
+#define SCTP_CWNDLOG_PRESEND 77
+#define SCTP_CWNDLOG_ENDSEND 78
+#define SCTP_AT_END_OF_SACK 79
+#define SCTP_REASON_FOR_SC 80
+#define SCTP_BLOCK_LOG_INTO_BLKA 81
+#define SCTP_ENTER_USER_RECV 82
+#define SCTP_USER_RECV_SACKS 83
+#define SCTP_SORECV_BLOCKSA 84
+#define SCTP_SORECV_BLOCKSB 85
+#define SCTP_SORECV_DONE 86
+#define SCTP_SACK_RWND_UPDATE 87
+#define SCTP_SORECV_ENTER 88
+#define SCTP_SORECV_ENTERPL 89
+#define SCTP_MBUF_INPUT 90
+#define SCTP_MBUF_IALLOC 91
+#define SCTP_MBUF_IFREE 92
+#define SCTP_MBUF_ICOPY 93
+#define SCTP_MBUF_SPLIT 94
+#define SCTP_SORCV_FREECTL 95
+#define SCTP_SORCV_DOESCPY 96
+#define SCTP_SORCV_DOESLCK 97
+#define SCTP_SORCV_DOESADJ 98
+#define SCTP_SORCV_BOTWHILE 99
+#define SCTP_SORCV_PASSBF 100
+#define SCTP_SORCV_ADJD 101
+#define SCTP_UNKNOWN_MAX 102
+#define SCTP_RANDY_STUFF 103
+#define SCTP_RANDY_STUFF1 104
+#define SCTP_STRMOUT_LOG_ASSIGN 105
+#define SCTP_STRMOUT_LOG_SEND 106
+#define SCTP_FLIGHT_LOG_DOWN_CA 107
+#define SCTP_FLIGHT_LOG_UP 108
+#define SCTP_FLIGHT_LOG_DOWN_GAP 109
+#define SCTP_FLIGHT_LOG_DOWN_RSND 110
+#define SCTP_FLIGHT_LOG_UP_RSND 111
+#define SCTP_FLIGHT_LOG_DOWN_RSND_TO 112
+#define SCTP_FLIGHT_LOG_DOWN_WP 113
+#define SCTP_FLIGHT_LOG_UP_REVOKE 114
+#define SCTP_FLIGHT_LOG_DOWN_PDRP 115
+#define SCTP_FLIGHT_LOG_DOWN_PMTU 116
+#define SCTP_SACK_LOG_NORMAL 117
+#define SCTP_SACK_LOG_EXPRESS 118
+#define SCTP_MAP_TSN_ENTERS 119
+#define SCTP_THRESHOLD_CLEAR 120
+#define SCTP_THRESHOLD_INCR 121
+#define SCTP_FLIGHT_LOG_DWN_WP_FWD 122
+#define SCTP_FWD_TSN_CHECK 123
+#define SCTP_LOG_MAX_TYPES 124
+/*
+ * To turn on various logging, you must first enable 'options KTR' and
+ * you might want to bump the entries with 'options KTR_ENTRIES=80000'.
+ * To get something to log you define one of the logging defines
+ * (see LINT).
+ *
+ * This gets the compile in place, but you still need to turn the
+ * logging flag on in the sysctl as well (see sctp.h).
+ */
+
+#define SCTP_LOG_EVENT_UNKNOWN 0
+#define SCTP_LOG_EVENT_CWND 1
+#define SCTP_LOG_EVENT_BLOCK 2
+#define SCTP_LOG_EVENT_STRM 3
+#define SCTP_LOG_EVENT_FR 4
+#define SCTP_LOG_EVENT_MAP 5
+#define SCTP_LOG_EVENT_MAXBURST 6
+#define SCTP_LOG_EVENT_RWND 7
+#define SCTP_LOG_EVENT_MBCNT 8
+#define SCTP_LOG_EVENT_SACK 9
+#define SCTP_LOG_LOCK_EVENT 10
+#define SCTP_LOG_EVENT_RTT 11
+#define SCTP_LOG_EVENT_SB 12
+#define SCTP_LOG_EVENT_NAGLE 13
+#define SCTP_LOG_EVENT_WAKE 14
+#define SCTP_LOG_MISC_EVENT 15
+#define SCTP_LOG_EVENT_CLOSE 16
+#define SCTP_LOG_EVENT_MBUF 17
+#define SCTP_LOG_CHUNK_PROC 18
+#define SCTP_LOG_ERROR_RET 19
+
+#define SCTP_LOG_MAX_EVENT 20
+
+#define SCTP_LOCK_UNKNOWN 2
+
+
+/* number of associations by default for zone allocation */
+#define SCTP_MAX_NUM_OF_ASOC 40000
+/* how many addresses per assoc remote and local */
+#define SCTP_SCALE_FOR_ADDR 2
+
+/* default AUTO_ASCONF mode enable(1)/disable(0) value (sysctl) */
+#define SCTP_DEFAULT_AUTO_ASCONF 1
+
+/* default MULTIPLE_ASCONF mode enable(1)/disable(0) value (sysctl) */
+#define SCTP_DEFAULT_MULTIPLE_ASCONFS 0
+
+/* default MOBILITY_BASE mode enable(1)/disable(0) value (sysctl) */
+#define SCTP_DEFAULT_MOBILITY_BASE 0
+
+/* default MOBILITY_FASTHANDOFF mode enable(1)/disable(0) value (sysctl) */
+#define SCTP_DEFAULT_MOBILITY_FASTHANDOFF 0
+
+/*
+ * Threshold for rwnd updates: we have to read (sb_hiwat >>
+ * SCTP_RWND_HIWAT_SHIFT) before we will look to see if we need to send a
+ * window update sack. When we look, we compare the last rwnd we sent vs the
+ * current rwnd; the difference too must be greater than this value. Using 3 divides the
+ * hiwat by 8, so for a 200k rwnd we need to read 24k. For a 64k rwnd we need
+ * to read 8k. This seems about right.. I hope :-D.. we do set a
+ * min of an MTU on it so if the rwnd is real small we will insist
+ * on a full MTU of 1500 bytes.
+ */
+#define SCTP_RWND_HIWAT_SHIFT 3
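+/*
+ * Worked example (illustrative only, not part of the original source):
+ * with a shift of 3 and sb_hiwat = 65536 bytes, the application must have
+ * read at least 65536 >> 3 = 8192 bytes, and the advertised rwnd must have
+ * grown by more than that same amount, before a window update SACK is
+ * considered (subject to the one-MTU minimum described above).
+ */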
+
+/* How much of the rwnd must a
+ * message be taking up to start partial delivery.
+ * We calculate this by shifting the hi_water (recv_win)
+ * by the following amount: set to 1, partial delivery starts when
+ * a message holds 1/2 the rwnd; set to 2, when a message holds
+ * 1/4 the rwnd... etc.
+ */
+
+#define SCTP_PARTIAL_DELIVERY_SHIFT 1
+
+/*
+ * default HMAC for cookies, etc... use one of the AUTH HMAC id's
+ * SCTP_HMAC is the HMAC_ID to use
+ * SCTP_SIGNATURE_SIZE is the digest length
+ */
+#define SCTP_HMAC SCTP_AUTH_HMAC_ID_SHA1
+#define SCTP_SIGNATURE_SIZE SCTP_AUTH_DIGEST_LEN_SHA1
+#define SCTP_SIGNATURE_ALOC_SIZE SCTP_SIGNATURE_SIZE
+
+/*
+ * the SCTP protocol signature this includes the version number encoded in
+ * the last 4 bits of the signature.
+ */
+#define PROTO_SIGNATURE_A 0x30000000
+#define SCTP_VERSION_NUMBER 0x3
+
+#define MAX_TSN 0xffffffff
+#define MAX_SEQ 0xffff
+
+/* how many executions every N ticks */
+#define SCTP_ITERATOR_MAX_AT_ONCE 20
+
+/* number of clock ticks between iterator executions */
+#define SCTP_ITERATOR_TICKS 1
+
+/*
+ * option: If you comment out the following you will receive the old behavior
+ * of obeying cwnd for the fast retransmit algorithm. With this defined, an FR
+ * happens right away without waiting for the flightsize to drop below the
+ * cwnd value (which is reduced by the FR to 1/2 the inflight packets).
+ */
+#define SCTP_IGNORE_CWND_ON_FR 1
+
+/*
+ * Adds implementor's guide behavior to only use the newest highest update in SACK
+ * gap acks to figure out if you need to strike a chunk for FR.
+ */
+#define SCTP_NO_FR_UNLESS_SEGMENT_SMALLER 1
+
+/* default max I can burst out after a fast retransmit */
+#define SCTP_DEF_MAX_BURST 4
+/* IP hdr (20/40) + 12+2+2 (enet) + sctp common 12 */
+#define SCTP_FIRST_MBUF_RESV 68
+/* Packet transmit states in the sent field */
+#define SCTP_DATAGRAM_UNSENT 0
+#define SCTP_DATAGRAM_SENT 1
+#define SCTP_DATAGRAM_RESEND1 2 /* not used (in code, but may
+ * hit this value) */
+#define SCTP_DATAGRAM_RESEND2 3 /* not used (in code, but may
+ * hit this value) */
+#define SCTP_DATAGRAM_RESEND 4
+#define SCTP_DATAGRAM_ACKED 10010
+#define SCTP_DATAGRAM_MARKED 20010
+#define SCTP_FORWARD_TSN_SKIP 30010
+
+/* chunk output send from locations */
+#define SCTP_OUTPUT_FROM_USR_SEND 0
+#define SCTP_OUTPUT_FROM_T3 1
+#define SCTP_OUTPUT_FROM_INPUT_ERROR 2
+#define SCTP_OUTPUT_FROM_CONTROL_PROC 3
+#define SCTP_OUTPUT_FROM_SACK_TMR 4
+#define SCTP_OUTPUT_FROM_SHUT_TMR 5
+#define SCTP_OUTPUT_FROM_HB_TMR 6
+#define SCTP_OUTPUT_FROM_SHUT_ACK_TMR 7
+#define SCTP_OUTPUT_FROM_ASCONF_TMR 8
+#define SCTP_OUTPUT_FROM_STRRST_TMR 9
+#define SCTP_OUTPUT_FROM_AUTOCLOSE_TMR 10
+#define SCTP_OUTPUT_FROM_EARLY_FR_TMR 11
+#define SCTP_OUTPUT_FROM_STRRST_REQ 12
+#define SCTP_OUTPUT_FROM_USR_RCVD 13
+#define SCTP_OUTPUT_FROM_COOKIE_ACK 14
+#define SCTP_OUTPUT_FROM_DRAIN 15
+#define SCTP_OUTPUT_FROM_CLOSING 16
+/* SCTP chunk types are moved sctp.h for application (NAT, FW) use */
+
+/* align to 32-bit sizes */
+#define SCTP_SIZE32(x) ((((x)+3) >> 2) << 2)
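+/*
+ * Illustrative example (not part of the original source): SCTP_SIZE32()
+ * rounds a length up to the next multiple of 4, e.g.
+ * SCTP_SIZE32(13) = ((13 + 3) >> 2) << 2 = 16 and SCTP_SIZE32(16) = 16,
+ * which is how TLV parameters are padded to 32-bit boundaries.
+ */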
+
+#define IS_SCTP_CONTROL(a) ((a)->chunk_type != SCTP_DATA)
+#define IS_SCTP_DATA(a) ((a)->chunk_type == SCTP_DATA)
+
+
+/* SCTP parameter types */
+/*************0x0000 series*************/
+#define SCTP_HEARTBEAT_INFO 0x0001
+#define SCTP_IPV4_ADDRESS 0x0005
+#define SCTP_IPV6_ADDRESS 0x0006
+#define SCTP_STATE_COOKIE 0x0007
+#define SCTP_UNRECOG_PARAM 0x0008
+#define SCTP_COOKIE_PRESERVE 0x0009
+#define SCTP_HOSTNAME_ADDRESS 0x000b
+#define SCTP_SUPPORTED_ADDRTYPE 0x000c
+
+/* draft-ietf-stewart-tsvwg-strreset-xxx */
+#define SCTP_STR_RESET_OUT_REQUEST 0x000d
+#define SCTP_STR_RESET_IN_REQUEST 0x000e
+#define SCTP_STR_RESET_TSN_REQUEST 0x000f
+#define SCTP_STR_RESET_RESPONSE 0x0010
+#define SCTP_STR_RESET_ADD_STREAMS 0x0011
+
+#define SCTP_MAX_RESET_PARAMS 2
+#define SCTP_STREAM_RESET_TSN_DELTA 0x1000
+
+/*************0x4000 series*************/
+
+/*************0x8000 series*************/
+#define SCTP_ECN_CAPABLE 0x8000
+/* ECN Nonce: draft-ladha-sctp-ecn-nonce */
+#define SCTP_ECN_NONCE_SUPPORTED 0x8001
+/* draft-ietf-tsvwg-auth-xxx */
+#define SCTP_RANDOM 0x8002
+#define SCTP_CHUNK_LIST 0x8003
+#define SCTP_HMAC_LIST 0x8004
+/*
+ * draft-ietf-tsvwg-addip-sctp-xx param=0x8008 len=0xNNNN Byte | Byte | Byte
+ * | Byte Byte | Byte ...
+ *
+ * Where each byte is a chunk type extension supported. For example, to support
+ * all chunks one would have (in hex):
+ *
+ * 80 01 00 09 C0 C1 80 81 82 00 00 00
+ *
+ * Has the parameter. C0 = PR-SCTP (RFC3758) C1, 80 = ASCONF (addip draft) 81
+ * = Packet Drop 82 = Stream Reset 83 = Authentication
+ */
+#define SCTP_SUPPORTED_CHUNK_EXT 0x8008
+
+/*************0xC000 series*************/
+#define SCTP_PRSCTP_SUPPORTED 0xc000
+/* draft-ietf-tsvwg-addip-sctp */
+#define SCTP_ADD_IP_ADDRESS 0xc001
+#define SCTP_DEL_IP_ADDRESS 0xc002
+#define SCTP_ERROR_CAUSE_IND 0xc003
+#define SCTP_SET_PRIM_ADDR 0xc004
+#define SCTP_SUCCESS_REPORT 0xc005
+#define SCTP_ULP_ADAPTATION 0xc006
+/* behave-nat-draft */
+#define SCTP_HAS_NAT_SUPPORT 0xc007
+#define SCTP_NAT_VTAGS 0xc008
+
+/* Notification error codes */
+#define SCTP_NOTIFY_DATAGRAM_UNSENT 0x0001
+#define SCTP_NOTIFY_DATAGRAM_SENT 0x0002
+#define SCTP_FAILED_THRESHOLD 0x0004
+#define SCTP_HEARTBEAT_SUCCESS 0x0008
+#define SCTP_RESPONSE_TO_USER_REQ 0x0010
+#define SCTP_INTERNAL_ERROR 0x0020
+#define SCTP_SHUTDOWN_GUARD_EXPIRES 0x0040
+#define SCTP_RECEIVED_SACK 0x0080
+#define SCTP_PEER_FAULTY 0x0100
+#define SCTP_ICMP_REFUSED 0x0200
+
+/* bits for TOS field */
+#define SCTP_ECT0_BIT 0x02
+#define SCTP_ECT1_BIT 0x01
+#define SCTP_CE_BITS 0x03
+
+/* below turns off above */
+#define SCTP_FLEXIBLE_ADDRESS 0x20
+#define SCTP_NO_HEARTBEAT 0x40
+
+/* mask to get sticky */
+#define SCTP_STICKY_OPTIONS_MASK 0x0c
+
+
+/*
+ * SCTP states for internal state machine XXX (should match "user" values)
+ */
+#define SCTP_STATE_EMPTY 0x0000
+#define SCTP_STATE_INUSE 0x0001
+#define SCTP_STATE_COOKIE_WAIT 0x0002
+#define SCTP_STATE_COOKIE_ECHOED 0x0004
+#define SCTP_STATE_OPEN 0x0008
+#define SCTP_STATE_SHUTDOWN_SENT 0x0010
+#define SCTP_STATE_SHUTDOWN_RECEIVED 0x0020
+#define SCTP_STATE_SHUTDOWN_ACK_SENT 0x0040
+#define SCTP_STATE_SHUTDOWN_PENDING 0x0080
+#define SCTP_STATE_CLOSED_SOCKET 0x0100
+#define SCTP_STATE_ABOUT_TO_BE_FREED 0x0200
+#define SCTP_STATE_PARTIAL_MSG_LEFT 0x0400
+#define SCTP_STATE_WAS_ABORTED 0x0800
+#define SCTP_STATE_IN_ACCEPT_QUEUE 0x1000
+#define SCTP_STATE_MASK 0x007f
+
+#define SCTP_GET_STATE(asoc) ((asoc)->state & SCTP_STATE_MASK)
+#define SCTP_SET_STATE(asoc, newstate) ((asoc)->state = ((asoc)->state & ~SCTP_STATE_MASK) | newstate)
+#define SCTP_CLEAR_SUBSTATE(asoc, substate) ((asoc)->state &= ~substate)
+#define SCTP_ADD_SUBSTATE(asoc, substate) ((asoc)->state |= substate)
+
+/* SCTP reachability state for each address */
+#define SCTP_ADDR_REACHABLE 0x001
+#define SCTP_ADDR_NOT_REACHABLE 0x002
+#define SCTP_ADDR_NOHB 0x004
+#define SCTP_ADDR_BEING_DELETED 0x008
+#define SCTP_ADDR_NOT_IN_ASSOC 0x010
+#define SCTP_ADDR_WAS_PRIMARY 0x020
+#define SCTP_ADDR_SWITCH_PRIMARY 0x040
+#define SCTP_ADDR_OUT_OF_SCOPE 0x080
+#define SCTP_ADDR_DOUBLE_SWITCH 0x100
+#define SCTP_ADDR_UNCONFIRMED 0x200
+#define SCTP_ADDR_REQ_PRIMARY 0x400
+/* JRS 5/13/07 - Added potentially failed state for CMT PF */
+#define SCTP_ADDR_PF 0x800
+#define SCTP_REACHABLE_MASK 0x203
+
+/* bound address types (e.g. valid address types to allow) */
+#define SCTP_BOUND_V6 0x01
+#define SCTP_BOUND_V4 0x02
+
+/*
+ * what is the default number of mbufs in a chain I allow before switching to
+ * a cluster
+ */
+#define SCTP_DEFAULT_MBUFS_IN_CHAIN 5
+
+/* How long a cookie lives in milliseconds */
+#define SCTP_DEFAULT_COOKIE_LIFE 60000
+
+/* resource limit of streams */
+#define MAX_SCTP_STREAMS 2048
+
+/* Maximum the mapping array will grow to (TSN mapping array) */
+#define SCTP_MAPPING_ARRAY 512
+
+/* size of the initial malloc on the mapping array */
+#define SCTP_INITIAL_MAPPING_ARRAY 16
+/* how much we grow the mapping array each call */
+#define SCTP_MAPPING_ARRAY_INCR 32
+
+/*
+ * Here we define the timer types used by the implementation as arguments in
+ * the set/get timer type calls.
+ */
+#define SCTP_TIMER_INIT 0
+#define SCTP_TIMER_RECV 1
+#define SCTP_TIMER_SEND 2
+#define SCTP_TIMER_HEARTBEAT 3
+#define SCTP_TIMER_PMTU 4
+#define SCTP_TIMER_MAXSHUTDOWN 5
+#define SCTP_TIMER_SIGNATURE 6
+/*
+ * number of timer types in the base SCTP structure used in the set/get calls,
+ * each of which has a base default.
+ */
+#define SCTP_NUM_TMRS 7
+
+/* timer types */
+#define SCTP_TIMER_TYPE_NONE 0
+#define SCTP_TIMER_TYPE_SEND 1
+#define SCTP_TIMER_TYPE_INIT 2
+#define SCTP_TIMER_TYPE_RECV 3
+#define SCTP_TIMER_TYPE_SHUTDOWN 4
+#define SCTP_TIMER_TYPE_HEARTBEAT 5
+#define SCTP_TIMER_TYPE_COOKIE 6
+#define SCTP_TIMER_TYPE_NEWCOOKIE 7
+#define SCTP_TIMER_TYPE_PATHMTURAISE 8
+#define SCTP_TIMER_TYPE_SHUTDOWNACK 9
+#define SCTP_TIMER_TYPE_ASCONF 10
+#define SCTP_TIMER_TYPE_SHUTDOWNGUARD 11
+#define SCTP_TIMER_TYPE_AUTOCLOSE 12
+#define SCTP_TIMER_TYPE_EVENTWAKE 13
+#define SCTP_TIMER_TYPE_STRRESET 14
+#define SCTP_TIMER_TYPE_INPKILL 15
+#define SCTP_TIMER_TYPE_EARLYFR 17
+#define SCTP_TIMER_TYPE_ASOCKILL 18
+#define SCTP_TIMER_TYPE_ADDR_WQ 19
+#define SCTP_TIMER_TYPE_ZERO_COPY 20
+#define SCTP_TIMER_TYPE_ZCOPY_SENDQ 21
+#define SCTP_TIMER_TYPE_PRIM_DELETED 22
+/* add new timers here - and increment LAST */
+#define SCTP_TIMER_TYPE_LAST 23
+
+#define SCTP_IS_TIMER_TYPE_VALID(t) (((t) > SCTP_TIMER_TYPE_NONE) && \
+ ((t) < SCTP_TIMER_TYPE_LAST))
+
+
+
+/* max number of TSN's dup'd that I will hold */
+#define SCTP_MAX_DUP_TSNS 20
+
+/*
+ * Here we define the types used when setting the retry amounts.
+ */
+/* How many drop re-attempts we make on INIT/COOKIE-ECHO */
+#define SCTP_RETRY_DROPPED_THRESH 4
+
+/*
+ * Maximum number of chunks a single association can have on it. Note that
+ * this is a squishy number since the count can run over this if the user
+ * sends a large message down .. the fragmented chunks don't count until
+ * AFTER the message is on queue.. it would be the next send that blocks
+ * things. This number will get tuned up at boot in sctp_init() using the
+ * number of clusters as a base. This way high bandwidth environments will
+ * not get impacted by lower bandwidth peers sending a bunch of 1 byte chunks.
+ */
+#define SCTP_ASOC_MAX_CHUNKS_ON_QUEUE 512
+
+
+/* The conversion from time to ticks and vice versa is done by rounding
+ * upwards. This way we can test in the code that the time is positive and
+ * know that this corresponds to a positive number of ticks.
+ */
+#define MSEC_TO_TICKS(x) ((hz == 1000) ? x : ((((x) * hz) + 999) / 1000))
+#define TICKS_TO_MSEC(x) ((hz == 1000) ? x : ((((x) * 1000) + (hz - 1)) / hz))
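+/*
+ * Worked example (illustrative only, not part of the original source):
+ * with hz = 100, MSEC_TO_TICKS(15) = ((15 * 100) + 999) / 1000 = 2 ticks
+ * (1.5 ticks rounded up) and TICKS_TO_MSEC(2) = ((2 * 1000) + 99) / 100 = 20 ms,
+ * so any positive time always maps to a positive number of ticks.
+ */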
+
+#define SEC_TO_TICKS(x) ((x) * hz)
+#define TICKS_TO_SEC(x) (((x) + (hz - 1)) / hz)
+
+/*
+ * Basically the minimum amount of time before I do an early FR. Making this
+ * value too low will cause duplicate retransmissions.
+ */
+#define SCTP_MINFR_MSEC_TIMER 250
+/* The floor this value is allowed to fall to when starting a timer. */
+#define SCTP_MINFR_MSEC_FLOOR 20
+
+/* init timer def = 1 sec */
+#define SCTP_INIT_SEC 1
+
+/* send timer def = 1 seconds */
+#define SCTP_SEND_SEC 1
+
+/* recv timer def = 200ms */
+#define SCTP_RECV_MSEC 200
+
+/* 30 seconds + RTO (in ms) */
+#define SCTP_HB_DEFAULT_MSEC 30000
+
+/* Max time I will wait for Shutdown to complete */
+#define SCTP_DEF_MAX_SHUTDOWN_SEC 180
+
+
+/*
+ * This is how long a secret lives (NOT how long a cookie lives): how many
+ * ticks the current secret will live.
+ */
+#define SCTP_DEFAULT_SECRET_LIFE_SEC 3600
+
+#define SCTP_RTO_UPPER_BOUND (60000) /* 60 sec in ms */
+#define SCTP_RTO_UPPER_BOUND_SEC 60 /* for the init timer */
+#define SCTP_RTO_LOWER_BOUND (1000) /* 1 sec in ms */
+#define SCTP_RTO_INITIAL (3000) /* 3 sec in ms */
+
+
+#define SCTP_INP_KILL_TIMEOUT 20/* number of ms to retry kill of inpcb */
+#define SCTP_ASOC_KILL_TIMEOUT 10 /* number of ms to retry kill of an association */
+
+#define SCTP_DEF_MAX_INIT 8
+#define SCTP_DEF_MAX_SEND 10
+#define SCTP_DEF_MAX_PATH_RTX 5
+
+#define SCTP_DEF_PMTU_RAISE_SEC 600 /* 10 min between raise attempts */
+
+
+/* How many streams I request initially by default */
+#define SCTP_OSTREAM_INITIAL 10
+
+/*
+ * How many smallest_mtu's need to increase before a window update sack is
+ * sent (should be a power of 2).
+ */
+/* Send window update (incr * this > hiwat). Should be a power of 2 */
+#define SCTP_MINIMAL_RWND (4096) /* minimal rwnd */
+
+#define SCTP_ADDRMAX 24
+
+/* SCTP DEBUG Switch parameters */
+#define SCTP_DEBUG_TIMER1 0x00000001
+#define SCTP_DEBUG_TIMER2 0x00000002 /* unused */
+#define SCTP_DEBUG_TIMER3 0x00000004 /* unused */
+#define SCTP_DEBUG_TIMER4 0x00000008
+#define SCTP_DEBUG_OUTPUT1 0x00000010
+#define SCTP_DEBUG_OUTPUT2 0x00000020
+#define SCTP_DEBUG_OUTPUT3 0x00000040
+#define SCTP_DEBUG_OUTPUT4 0x00000080
+#define SCTP_DEBUG_UTIL1 0x00000100
+#define SCTP_DEBUG_UTIL2 0x00000200 /* unused */
+#define SCTP_DEBUG_AUTH1 0x00000400
+#define SCTP_DEBUG_AUTH2 0x00000800 /* unused */
+#define SCTP_DEBUG_INPUT1 0x00001000
+#define SCTP_DEBUG_INPUT2 0x00002000
+#define SCTP_DEBUG_INPUT3 0x00004000
+#define SCTP_DEBUG_INPUT4 0x00008000 /* unused */
+#define SCTP_DEBUG_ASCONF1 0x00010000
+#define SCTP_DEBUG_ASCONF2 0x00020000
+#define SCTP_DEBUG_OUTPUT5 0x00040000 /* unused */
+#define SCTP_DEBUG_XXX 0x00080000 /* unused */
+#define SCTP_DEBUG_PCB1 0x00100000
+#define SCTP_DEBUG_PCB2 0x00200000 /* unused */
+#define SCTP_DEBUG_PCB3 0x00400000
+#define SCTP_DEBUG_PCB4 0x00800000
+#define SCTP_DEBUG_INDATA1 0x01000000
+#define SCTP_DEBUG_INDATA2 0x02000000 /* unused */
+#define SCTP_DEBUG_INDATA3 0x04000000 /* unused */
+#define SCTP_DEBUG_CRCOFFLOAD 0x08000000 /* unused */
+#define SCTP_DEBUG_USRREQ1 0x10000000 /* unused */
+#define SCTP_DEBUG_USRREQ2 0x20000000 /* unused */
+#define SCTP_DEBUG_PEEL1 0x40000000
+#define SCTP_DEBUG_XXXXX 0x80000000 /* unused */
+#define SCTP_DEBUG_ALL 0x7ff3ffff
+#define SCTP_DEBUG_NOISY 0x00040000
+
+/* What sender needs to see to avoid SWS or we consider the peer's rwnd 0 */
+#define SCTP_SWS_SENDER_DEF 1420
+
+/*
+ * SWS is scaled to the sb_hiwat of the socket. A value of 2 is hiwat/4, 1
+ * would be hiwat/2 etc.
+ */
+/* What receiver needs to see in sockbuf or we tell the peer it is 1 */
+#define SCTP_SWS_RECEIVER_DEF 3000
+
+#define SCTP_INITIAL_CWND 4380
+
+#define SCTP_DEFAULT_MTU 1500 /* emergency default MTU */
+/* amount peer is obligated to have in rwnd or I will abort */
+#define SCTP_MIN_RWND 1500
+
+#define SCTP_DEFAULT_MAXSEGMENT 65535
+
+#define SCTP_CHUNK_BUFFER_SIZE 512
+#define SCTP_PARAM_BUFFER_SIZE 512
+
+/* small chunk store for looking at chunk_list in auth */
+#define SCTP_SMALL_CHUNK_STORE 260
+
+#define SCTP_DEFAULT_MINSEGMENT 512 /* MTU size ... if no mtu disc */
+#define SCTP_HOW_MANY_SECRETS 2 /* how many secrets I keep */
+
+#define SCTP_NUMBER_OF_SECRETS 8 /* or 8 * 4 = 32 octets */
+#define SCTP_SECRET_SIZE 32 /* number of octets in 256 bits */
+
+
+/*
+ * SCTP upper layer notifications
+ */
+#define SCTP_NOTIFY_ASSOC_UP 1
+#define SCTP_NOTIFY_ASSOC_DOWN 2
+#define SCTP_NOTIFY_INTERFACE_DOWN 3
+#define SCTP_NOTIFY_INTERFACE_UP 4
+#define SCTP_NOTIFY_DG_FAIL 5
+#define SCTP_NOTIFY_STRDATA_ERR 6
+#define SCTP_NOTIFY_ASSOC_ABORTED 7
+#define SCTP_NOTIFY_PEER_OPENED_STREAM 8
+#define SCTP_NOTIFY_STREAM_OPENED_OK 9
+#define SCTP_NOTIFY_ASSOC_RESTART 10
+#define SCTP_NOTIFY_HB_RESP 11
+#define SCTP_NOTIFY_ASCONF_SUCCESS 12
+#define SCTP_NOTIFY_ASCONF_FAILED 13
+#define SCTP_NOTIFY_PEER_SHUTDOWN 14
+#define SCTP_NOTIFY_ASCONF_ADD_IP 15
+#define SCTP_NOTIFY_ASCONF_DELETE_IP 16
+#define SCTP_NOTIFY_ASCONF_SET_PRIMARY 17
+#define SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION 18
+#define SCTP_NOTIFY_INTERFACE_CONFIRMED 20
+#define SCTP_NOTIFY_STR_RESET_RECV 21
+#define SCTP_NOTIFY_STR_RESET_SEND 22
+#define SCTP_NOTIFY_STR_RESET_FAILED_OUT 23
+#define SCTP_NOTIFY_STR_RESET_FAILED_IN 24
+#define SCTP_NOTIFY_AUTH_NEW_KEY 25
+#define SCTP_NOTIFY_AUTH_FREE_KEY 26
+#define SCTP_NOTIFY_SPECIAL_SP_FAIL 27
+#define SCTP_NOTIFY_NO_PEER_AUTH 28
+#define SCTP_NOTIFY_SENDER_DRY 29
+#define SCTP_NOTIFY_STR_RESET_ADD_OK 30
+#define SCTP_NOTIFY_STR_RESET_ADD_FAIL 31
+#define SCTP_NOTIFY_STR_RESET_INSTREAM_ADD_OK 32
+#define SCTP_NOTIFY_MAX 32
+
+
+/* This is the value for messages that are NOT completely
+ * copied down where we will start to split the message.
+ * So, with our default, we split only if the piece we
+ * want to take will fill up a full MTU (assuming
+ * a 1500 byte MTU).
+ */
+#define SCTP_DEFAULT_SPLIT_POINT_MIN 2904
+
+/* ABORT CODES and other tell-tale location
+ * codes are generated by adding the below
+ * to the instance id.
+ */
+
+/* File defines */
+#define SCTP_FROM_SCTP_INPUT 0x10000000
+#define SCTP_FROM_SCTP_PCB 0x20000000
+#define SCTP_FROM_SCTP_INDATA 0x30000000
+#define SCTP_FROM_SCTP_TIMER 0x40000000
+#define SCTP_FROM_SCTP_USRREQ 0x50000000
+#define SCTP_FROM_SCTPUTIL 0x60000000
+#define SCTP_FROM_SCTP6_USRREQ 0x70000000
+#define SCTP_FROM_SCTP_ASCONF 0x80000000
+#define SCTP_FROM_SCTP_OUTPUT 0x90000000
+#define SCTP_FROM_SCTP_PEELOFF 0xa0000000
+#define SCTP_FROM_SCTP_PANDA 0xb0000000
+#define SCTP_FROM_SCTP_SYSCTL 0xc0000000
+
+/* Location ID's */
+#define SCTP_LOC_1 0x00000001
+#define SCTP_LOC_2 0x00000002
+#define SCTP_LOC_3 0x00000003
+#define SCTP_LOC_4 0x00000004
+#define SCTP_LOC_5 0x00000005
+#define SCTP_LOC_6 0x00000006
+#define SCTP_LOC_7 0x00000007
+#define SCTP_LOC_8 0x00000008
+#define SCTP_LOC_9 0x00000009
+#define SCTP_LOC_10 0x0000000a
+#define SCTP_LOC_11 0x0000000b
+#define SCTP_LOC_12 0x0000000c
+#define SCTP_LOC_13 0x0000000d
+#define SCTP_LOC_14 0x0000000e
+#define SCTP_LOC_15 0x0000000f
+#define SCTP_LOC_16 0x00000010
+#define SCTP_LOC_17 0x00000011
+#define SCTP_LOC_18 0x00000012
+#define SCTP_LOC_19 0x00000013
+#define SCTP_LOC_20 0x00000014
+#define SCTP_LOC_21 0x00000015
+#define SCTP_LOC_22 0x00000016
+#define SCTP_LOC_23 0x00000017
+#define SCTP_LOC_24 0x00000018
+#define SCTP_LOC_25 0x00000019
+#define SCTP_LOC_26 0x0000001a
+#define SCTP_LOC_27 0x0000001b
+#define SCTP_LOC_28 0x0000001c
+#define SCTP_LOC_29 0x0000001d
+#define SCTP_LOC_30 0x0000001e
+#define SCTP_LOC_31 0x0000001f
+#define SCTP_LOC_32 0x00000020
+#define SCTP_LOC_33 0x00000021
+
+
+/* Free assoc codes */
+#define SCTP_NORMAL_PROC 0
+#define SCTP_PCBFREE_NOFORCE 1
+#define SCTP_PCBFREE_FORCE 2
+
+/* From codes for adding addresses */
+#define SCTP_ADDR_IS_CONFIRMED 8
+#define SCTP_ADDR_DYNAMIC_ADDED 6
+#define SCTP_IN_COOKIE_PROC 100
+#define SCTP_ALLOC_ASOC 1
+#define SCTP_LOAD_ADDR_2 2
+#define SCTP_LOAD_ADDR_3 3
+#define SCTP_LOAD_ADDR_4 4
+#define SCTP_LOAD_ADDR_5 5
+
+#define SCTP_DONOT_SETSCOPE 0
+#define SCTP_DO_SETSCOPE 1
+
+
+/* This value determines the default for when
+ * we try to add more on the send queue, if
+ * there is room. This prevents us from cycling
+ * into the copy_resume routine too often if
+ * we do not have enough space to add a decently
+ * sized message. Note that if we have enough
+ * space to complete the message copy we will always
+ * add to the message, no matter what the size. It is
+ * only when we reach the point that we have some left
+ * to add, and there is only room for part of it, that we
+ * will use this threshold. It is also a sysctl.
+ */
+#define SCTP_DEFAULT_ADD_MORE 1452
+
+#ifndef SCTP_PCBHASHSIZE
+/* default number of association hash buckets in each endpoint */
+#define SCTP_PCBHASHSIZE 256
+#endif
+#ifndef SCTP_TCBHASHSIZE
+#define SCTP_TCBHASHSIZE 1024
+#endif
+
+#ifndef SCTP_CHUNKQUEUE_SCALE
+#define SCTP_CHUNKQUEUE_SCALE 10
+#endif
+
+/* clock variance is 1 ms */
+#define SCTP_CLOCK_GRANULARITY 1
+#define IP_HDR_SIZE 40 /* we use the size of an IPv6 header here; this
+ * detracts a small amount for IPv4 but it
+ * simplifies the IPv6 addition */
+
+/* Argument magic number for sctp_inpcb_free() */
+
+/* third argument */
+#define SCTP_CALLED_DIRECTLY_NOCMPSET 0
+#define SCTP_CALLED_AFTER_CMPSET_OFCLOSE 1
+#define SCTP_CALLED_FROM_INPKILL_TIMER 2
+/* second argument */
+#define SCTP_FREE_SHOULD_USE_ABORT 1
+#define SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE 0
+
+#ifndef IPPROTO_SCTP
+#define IPPROTO_SCTP 132 /* the Official IANA number :-) */
+#endif /* !IPPROTO_SCTP */
+
+#define SCTP_MAX_DATA_BUNDLING 256
+
+/* modular comparison */
+/* True if a > b (mod = M) */
+#define compare_with_wrap(a, b, M) (((a > b) && ((a - b) < ((M >> 1) + 1))) || \
+ ((b > a) && ((b - a) > ((M >> 1) + 1))))
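+/*
+ * Worked example (illustrative only, not part of the original source):
+ * with M = MAX_TSN, compare_with_wrap(10, 5, MAX_TSN) is true since
+ * 10 - 5 = 5 < 0x80000000, and compare_with_wrap(5, 0xfffffffe, MAX_TSN)
+ * is also true because 0xfffffffe - 5 = 0xfffffff9 > 0x80000000, i.e.
+ * TSN 5 is treated as "newer" than 0xfffffffe once the 32-bit TSN space
+ * has wrapped.
+ */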
+
+
+/* Mapping array manipulation routines */
+#define SCTP_IS_TSN_PRESENT(arry, gap) ((arry[(gap >> 3)] >> (gap & 0x07)) & 0x01)
+#define SCTP_SET_TSN_PRESENT(arry, gap) (arry[(gap >> 3)] |= (0x01 << ((gap & 0x07))))
+#define SCTP_UNSET_TSN_PRESENT(arry, gap) (arry[(gap >> 3)] &= ((~(0x01 << ((gap & 0x07)))) & 0xff))
+#define SCTP_CALC_TSN_TO_GAP(gap, tsn, mapping_tsn) do { \
+ if (tsn >= mapping_tsn) { \
+ gap = tsn - mapping_tsn; \
+ } else { \
+ gap = (MAX_TSN - mapping_tsn) + tsn + 1; \
+ } \
+ } while(0)
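+/*
+ * Worked example (illustrative only, not part of the original source):
+ * if the mapping array base TSN is 0xfffffff0 and TSN 0x00000005 arrives
+ * after a wrap, then gap = (MAX_TSN - 0xfffffff0) + 0x5 + 1 = 21, and
+ * SCTP_SET_TSN_PRESENT(map, 21) sets bit (21 & 0x07) = 5 of map[21 >> 3],
+ * i.e. bit 5 of map[2].
+ */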
+
+
+#define SCTP_RETRAN_DONE -1
+#define SCTP_RETRAN_EXIT -2
+
+/*
+ * This value defines the number of vtag block time wait entries per list
+ * element. Each entry will take two 4 byte ints (and of course the overhead
+ * of the next pointer as well). Using 15 as an example will yield ((8 *
+ * 15) + 8) or 128 bytes of overhead for each timewait block that gets
+ * initialized. Increasing it to 31 would yield 256 bytes per block.
+ */
+#define SCTP_NUMBER_IN_VTAG_BLOCK 15
+/*
+ * If we use the STACK option, we have an array of this many head pointers.
+ * The tag is mod'd with this size to find the bucket, and then all
+ * entries must be searched to see if the tag is in timed wait. If so, we
+ * reject it.
+ */
+#define SCTP_STACK_VTAG_HASH_SIZE 32
+
+/*
+ * Number of seconds of time wait for a vtag.
+ */
+#define SCTP_TIME_WAIT 60
+
+#define SCTP_SEND_BUFFER_SPLITTING 0x00000001
+#define SCTP_RECV_BUFFER_SPLITTING 0x00000002
+
+/* The system retains a cache of free chunks so as to
+ * cut down on calls to the memory allocation system. There
+ * is a per-association limit of free items and an overall
+ * system limit. If either one gets hit then the resource
+ * stops being cached.
+ */
+
+#define SCTP_DEF_ASOC_RESC_LIMIT 10
+#define SCTP_DEF_SYSTEM_RESC_LIMIT 1000
+
+/*-
+ * defines for socket lock states.
+ * Used by __APPLE__ and SCTP_SO_LOCK_TESTING
+ */
+#define SCTP_SO_LOCKED 1
+#define SCTP_SO_NOT_LOCKED 0
+
+
+#define SCTP_HOLDS_LOCK 1
+#define SCTP_NOT_LOCKED 0
+
+/*-
+ * For address locks, do we hold the lock?
+ */
+#define SCTP_ADDR_LOCKED 1
+#define SCTP_ADDR_NOT_LOCKED 0
+
+#define IN4_ISPRIVATE_ADDRESS(a) \
+ ((((uint8_t *)&(a)->s_addr)[0] == 10) || \
+ ((((uint8_t *)&(a)->s_addr)[0] == 172) && \
+ (((uint8_t *)&(a)->s_addr)[1] >= 16) && \
+ (((uint8_t *)&(a)->s_addr)[1] <= 31)) || \
+ ((((uint8_t *)&(a)->s_addr)[0] == 192) && \
+ (((uint8_t *)&(a)->s_addr)[1] == 168)))
+
+#define IN4_ISLOOPBACK_ADDRESS(a) \
+ ((((uint8_t *)&(a)->s_addr)[0] == 127) && \
+ (((uint8_t *)&(a)->s_addr)[1] == 0) && \
+ (((uint8_t *)&(a)->s_addr)[2] == 0) && \
+ (((uint8_t *)&(a)->s_addr)[3] == 1))
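+/*
+ * Illustrative examples (not part of the original source):
+ * IN4_ISPRIVATE_ADDRESS() is true for 10.1.2.3 and 192.168.5.9 and false
+ * for 8.8.8.8, while IN4_ISLOOPBACK_ADDRESS() matches only the exact
+ * address 127.0.0.1, not the whole 127.0.0.0/8 block.
+ */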
+
+
+#if defined(_KERNEL)
+
+#define SCTP_GETTIME_TIMEVAL(x) (getmicrouptime(x))
+#define SCTP_GETPTIME_TIMEVAL(x) (microuptime(x))
+#endif
+/*#if defined(__FreeBSD__) || defined(__APPLE__)*/
+/*#define SCTP_GETTIME_TIMEVAL(x) { \*/
+/* (x)->tv_sec = ticks / 1000; \*/
+/* (x)->tv_usec = (ticks % 1000) * 1000; \*/
+/*}*/
+
+/*#else*/
+/*#define SCTP_GETTIME_TIMEVAL(x) (microtime(x))*/
+/*#endif __FreeBSD__ */
+
+#if defined(_KERNEL) || defined(__Userspace__)
+#define sctp_sowwakeup(inp, so) \
+do { \
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { \
+ inp->sctp_flags |= SCTP_PCB_FLAGS_WAKEOUTPUT; \
+ } else { \
+ sowwakeup(so); \
+ } \
+} while (0)
+
+#define sctp_sowwakeup_locked(inp, so) \
+do { \
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { \
+ SOCKBUF_UNLOCK(&((so)->so_snd)); \
+ inp->sctp_flags |= SCTP_PCB_FLAGS_WAKEOUTPUT; \
+ } else { \
+ sowwakeup_locked(so); \
+ } \
+} while (0)
+
+#define sctp_sorwakeup(inp, so) \
+do { \
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { \
+ inp->sctp_flags |= SCTP_PCB_FLAGS_WAKEINPUT; \
+ } else { \
+ sorwakeup(so); \
+ } \
+} while (0)
+
+#define sctp_sorwakeup_locked(inp, so) \
+do { \
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { \
+ inp->sctp_flags |= SCTP_PCB_FLAGS_WAKEINPUT; \
+ SOCKBUF_UNLOCK(&((so)->so_rcv)); \
+ } else { \
+ sorwakeup_locked(so); \
+ } \
+} while (0)
+
+#endif /* _KERNEL || __Userspace__ */
+#endif
diff --git a/freebsd/sys/netinet/sctp_crc32.c b/freebsd/sys/netinet/sctp_crc32.c
new file mode 100644
index 00000000..aa4c08cf
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_crc32.c
@@ -0,0 +1,148 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_crc32.c,v 1.12 2005/03/06 16:04:17 itojun Exp $ */
+
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp.h>
+#include <freebsd/netinet/sctp_crc32.h>
+#include <freebsd/netinet/sctp_pcb.h>
+
+
+#if !defined(SCTP_WITH_NO_CSUM)
+
+static uint32_t
+sctp_finalize_crc32c(uint32_t crc32c)
+{
+ uint32_t result;
+
+#if BYTE_ORDER == BIG_ENDIAN
+ uint8_t byte0, byte1, byte2, byte3;
+
+#endif
+ /* Complement the result */
+ result = ~crc32c;
+#if BYTE_ORDER == BIG_ENDIAN
+ /*
+ * For BIG-ENDIAN.. aka Motorola byte order the result is in
+ * little-endian form. So we must manually swap the bytes. Then we
+ * can call htonl() which does nothing...
+ */
+ byte0 = result & 0x000000ff;
+ byte1 = (result >> 8) & 0x000000ff;
+ byte2 = (result >> 16) & 0x000000ff;
+ byte3 = (result >> 24) & 0x000000ff;
+ crc32c = ((byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3);
+#else
+ /*
+ * For INTEL platforms the result comes out in network order. No
+ * htonl is required or the swap above. So we optimize out both the
+ * htonl and the manual swap above.
+ */
+ crc32c = result;
+#endif
+ return (crc32c);
+}
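+
+/*
+ * Worked example (illustrative only, not part of the original source):
+ * if the accumulated CRC32c value is 0x12345678, the complement is
+ * 0xedcba987. On a little-endian machine that value is returned as-is
+ * (it is already in the on-wire byte order), while on a big-endian
+ * machine the bytes are swapped to give 0x87a9cbed.
+ */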
+
+uint32_t
+sctp_calculate_cksum(struct mbuf *m, uint32_t offset)
+{
+ /*
+ * given an mbuf chain with a packetheader offset by 'offset'
+ * pointing at an sctphdr (with csum set to 0), go through the chain
+ * of SCTP_BUF_NEXT()'s and calculate the SCTP checksum. This also
+ * has a side bonus as it will calculate the total length of the
+ * mbuf chain. Note: if offset is greater than the total mbuf
+ * length, checksum=1, pktlen=0 is returned (i.e. no real error code)
+ */
+ uint32_t base = 0xffffffff;
+ struct mbuf *at;
+
+ at = m;
+ /* find the correct mbuf and offset into mbuf */
+ while ((at != NULL) && (offset > (uint32_t) SCTP_BUF_LEN(at))) {
+ offset -= SCTP_BUF_LEN(at); /* update remaining offset
+ * left */
+ at = SCTP_BUF_NEXT(at);
+ }
+ while (at != NULL) {
+ if ((SCTP_BUF_LEN(at) - offset) > 0) {
+ base = calculate_crc32c(base,
+ (unsigned char *)(SCTP_BUF_AT(at, offset)),
+ (unsigned int)(SCTP_BUF_LEN(at) - offset));
+ }
+ if (offset) {
+ /* we only offset once into the first mbuf */
+ if (offset < (uint32_t) SCTP_BUF_LEN(at))
+ offset = 0;
+ else
+ offset -= SCTP_BUF_LEN(at);
+ }
+ at = SCTP_BUF_NEXT(at);
+ }
+ base = sctp_finalize_crc32c(base);
+ return (base);
+}
+
+#endif /* !defined(SCTP_WITH_NO_CSUM) */
+
+
+void
+sctp_delayed_cksum(struct mbuf *m, uint32_t offset)
+{
+#if defined(SCTP_WITH_NO_CSUM)
+ panic("sctp_delayed_cksum() called when using no SCTP CRC.");
+#else
+ uint32_t checksum;
+
+ checksum = sctp_calculate_cksum(m, offset);
+ SCTP_STAT_DECR(sctps_sendhwcrc);
+ SCTP_STAT_INCR(sctps_sendswcrc);
+ offset += offsetof(struct sctphdr, checksum);
+
+ if (offset + sizeof(uint32_t) > (uint32_t) (m->m_len)) {
+ printf("sctp_delayed_cksum(): m->len: %d, off: %d.\n",
+ (uint32_t) m->m_len, offset);
+ /*
+ * XXX this shouldn't happen, but if it does, the correct
+ * behavior may be to insert the checksum in the appropriate
+ * next mbuf in the chain.
+ */
+ return;
+ }
+ *(uint32_t *) (m->m_data + offset) = checksum;
+#endif
+}
diff --git a/freebsd/sys/netinet/sctp_crc32.h b/freebsd/sys/netinet/sctp_crc32.h
new file mode 100644
index 00000000..768b25d5
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_crc32.h
@@ -0,0 +1,47 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_crc32.h,v 1.5 2004/08/17 04:06:16 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __crc32c_h__
+#define __crc32c_h__
+
+#if defined(_KERNEL)
+#if !defined(SCTP_WITH_NO_CSUM)
+uint32_t sctp_calculate_cksum(struct mbuf *, uint32_t);
+
+#endif
+void sctp_delayed_cksum(struct mbuf *, uint32_t offset);
+
+#endif /* _KERNEL */
+#endif /* __crc32c_h__ */
diff --git a/freebsd/sys/netinet/sctp_header.h b/freebsd/sys/netinet/sctp_header.h
new file mode 100644
index 00000000..141bfcda
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_header.h
@@ -0,0 +1,624 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_header.h,v 1.14 2005/03/06 16:04:17 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_header_h__
+#define __sctp_header_h__
+
+#include <freebsd/sys/time.h>
+#include <freebsd/netinet/sctp.h>
+#include <freebsd/netinet/sctp_constants.h>
+
+#define SCTP_PACKED __attribute__((packed))
+
+/*
+ * Parameter structures
+ */
+struct sctp_ipv4addr_param {
+ struct sctp_paramhdr ph;/* type=SCTP_IPV4_PARAM_TYPE, len=8 */
+ uint32_t addr; /* IPV4 address */
+} SCTP_PACKED;
+
+#define SCTP_V6_ADDR_BYTES 16
+
+
+struct sctp_ipv6addr_param {
+ struct sctp_paramhdr ph;/* type=SCTP_IPV6_PARAM_TYPE, len=20 */
+ uint8_t addr[SCTP_V6_ADDR_BYTES]; /* IPV6 address */
+} SCTP_PACKED;
+
+/* Cookie Preservative */
+struct sctp_cookie_perserve_param {
+ struct sctp_paramhdr ph;/* type=SCTP_COOKIE_PRESERVE, len=8 */
+ uint32_t time; /* time in ms to extend cookie */
+} SCTP_PACKED;
+
+#define SCTP_ARRAY_MIN_LEN 1
+/* Host Name Address */
+struct sctp_host_name_param {
+ struct sctp_paramhdr ph;/* type=SCTP_HOSTNAME_ADDRESS */
+ char name[SCTP_ARRAY_MIN_LEN]; /* host name */
+} SCTP_PACKED;
+
+/*
+ * This is the maximum padded size of an s-a-p (supported address parameter),
+ * so param header + 3 address types (6 bytes) + 2 byte pad = 12
+ */
+#define SCTP_MAX_ADDR_PARAMS_SIZE 12
+/* supported address type */
+struct sctp_supported_addr_param {
+ struct sctp_paramhdr ph;/* type=SCTP_SUPPORTED_ADDRTYPE */
+ uint16_t addr_type[SCTP_ARRAY_MIN_LEN]; /* array of supported address
+ * types */
+} SCTP_PACKED;
+
+/* ECN parameter */
+struct sctp_ecn_supported_param {
+ struct sctp_paramhdr ph;/* type=SCTP_ECN_CAPABLE */
+} SCTP_PACKED;
+
+
+/* heartbeat info parameter */
+struct sctp_heartbeat_info_param {
+ struct sctp_paramhdr ph;
+ uint32_t time_value_1;
+ uint32_t time_value_2;
+ uint32_t random_value1;
+ uint32_t random_value2;
+ uint16_t user_req;
+ uint8_t addr_family;
+ uint8_t addr_len;
+ char address[SCTP_ADDRMAX];
+} SCTP_PACKED;
+
+
+/* draft-ietf-tsvwg-prsctp */
+/* PR-SCTP supported parameter */
+struct sctp_prsctp_supported_param {
+ struct sctp_paramhdr ph;
+} SCTP_PACKED;
+
+
+/* draft-ietf-tsvwg-addip-sctp */
+struct sctp_asconf_paramhdr { /* an ASCONF "parameter" */
+ struct sctp_paramhdr ph;/* a SCTP parameter header */
+ uint32_t correlation_id;/* correlation id for this param */
+} SCTP_PACKED;
+
+struct sctp_asconf_addr_param { /* an ASCONF address parameter */
+ struct sctp_asconf_paramhdr aph; /* asconf "parameter" */
+ struct sctp_ipv6addr_param addrp; /* max storage size */
+} SCTP_PACKED;
+
+
+struct sctp_asconf_tag_param { /* an ASCONF NAT-Vtag parameter */
+ struct sctp_asconf_paramhdr aph; /* asconf "parameter" */
+ uint32_t local_vtag;
+ uint32_t remote_vtag;
+} SCTP_PACKED;
+
+
+struct sctp_asconf_addrv4_param { /* an ASCONF address (v4) parameter */
+ struct sctp_asconf_paramhdr aph; /* asconf "parameter" */
+ struct sctp_ipv4addr_param addrp; /* max storage size */
+} SCTP_PACKED;
+
+#define SCTP_MAX_SUPPORTED_EXT 256
+
+struct sctp_supported_chunk_types_param {
+ struct sctp_paramhdr ph;/* type = 0x8008 len = x */
+ uint8_t chunk_types[];
+} SCTP_PACKED;
+
+
+/* ECN Nonce: draft-ladha-sctp-ecn-nonce */
+struct sctp_ecn_nonce_supported_param {
+ struct sctp_paramhdr ph;/* type = 0x8001 len = 4 */
+} SCTP_PACKED;
+
+
+/*
+ * Structures for DATA chunks
+ */
+struct sctp_data {
+ uint32_t tsn;
+ uint16_t stream_id;
+ uint16_t stream_sequence;
+ uint32_t protocol_id;
+ /* user data follows */
+} SCTP_PACKED;
+
+struct sctp_data_chunk {
+ struct sctp_chunkhdr ch;
+ struct sctp_data dp;
+} SCTP_PACKED;
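+
+/*
+ * Size note derived from the two structures above (assuming no compiler
+ * padding, which SCTP_PACKED prevents): a DATA chunk header is 16 bytes --
+ * 4-byte chunk header, 4-byte TSN, 2-byte stream id, 2-byte stream sequence
+ * and 4-byte payload protocol id -- with the user data following directly.
+ */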
+
+/*
+ * Structures for the control chunks
+ */
+
+/* Initiate (INIT)/Initiate Ack (INIT ACK) */
+struct sctp_init {
+ uint32_t initiate_tag; /* initiate tag */
+ uint32_t a_rwnd; /* a_rwnd */
+ uint16_t num_outbound_streams; /* OS */
+ uint16_t num_inbound_streams; /* MIS */
+ uint32_t initial_tsn; /* I-TSN */
+ /* optional param's follow */
+} SCTP_PACKED;
+
+#define SCTP_IDENTIFICATION_SIZE 16
+#define SCTP_ADDRESS_SIZE 4
+#define SCTP_RESERVE_SPACE 6
+/* state cookie header */
+struct sctp_state_cookie { /* this is our definition... */
+ uint8_t identification[SCTP_IDENTIFICATION_SIZE]; /* id of who we are */
+ struct timeval time_entered; /* the time I built cookie */
+ uint32_t cookie_life; /* life I will award this cookie */
+ uint32_t tie_tag_my_vtag; /* my tag in old association */
+
+ uint32_t tie_tag_peer_vtag; /* peers tag in old association */
+ uint32_t peers_vtag; /* peers tag in INIT (for quick ref) */
+
+ uint32_t my_vtag; /* my tag in INIT-ACK (for quick ref) */
+ uint32_t address[SCTP_ADDRESS_SIZE]; /* 4 ints/128 bits */
+ uint32_t addr_type; /* address type */
+ uint32_t laddress[SCTP_ADDRESS_SIZE]; /* my local from address */
+ uint32_t laddr_type; /* my local from address type */
+ uint32_t scope_id; /* v6 scope id for link-locals */
+
+ uint16_t peerport; /* port address of the peer in the INIT */
+ uint16_t myport; /* my port address used in the INIT */
+ uint8_t ipv4_addr_legal;/* Are V4 addr legal? */
+ uint8_t ipv6_addr_legal;/* Are V6 addr legal? */
+ uint8_t local_scope; /* IPv6 local scope flag */
+ uint8_t site_scope; /* IPv6 site scope flag */
+
+ uint8_t ipv4_scope; /* IPv4 private addr scope */
+ uint8_t loopback_scope; /* loopback scope information */
+ uint8_t reserved[SCTP_RESERVE_SPACE]; /* Align to 64 bits */
+ /*
+ * at the end is tacked on the INIT chunk and the INIT-ACK chunk
+ * (minus the cookie).
+ */
+} SCTP_PACKED;
+
+
+/* Used for NAT state error cause */
+struct sctp_missing_nat_state {
+ uint16_t cause;
+ uint16_t length;
+ uint8_t data[];
+} SCTP_PACKED;
+
+
+struct sctp_inv_mandatory_param {
+ uint16_t cause;
+ uint16_t length;
+ uint32_t num_param;
+ uint16_t param;
+ /*
+	 * We include this field only so it can be zeroed, since only a
+	 * missing cookie will cause this error.
+ */
+ uint16_t resv;
+} SCTP_PACKED;
+
+struct sctp_unresolv_addr {
+ uint16_t cause;
+ uint16_t length;
+ uint16_t addr_type;
+ uint16_t reserved; /* Only one invalid addr type */
+} SCTP_PACKED;
+
+/* state cookie parameter */
+struct sctp_state_cookie_param {
+ struct sctp_paramhdr ph;
+ struct sctp_state_cookie cookie;
+} SCTP_PACKED;
+
+struct sctp_init_chunk {
+ struct sctp_chunkhdr ch;
+ struct sctp_init init;
+} SCTP_PACKED;
+
+struct sctp_init_msg {
+ struct sctphdr sh;
+ struct sctp_init_chunk msg;
+} SCTP_PACKED;
+
+/* ... used for both INIT and INIT ACK */
+#define sctp_init_ack sctp_init
+#define sctp_init_ack_chunk sctp_init_chunk
+#define sctp_init_ack_msg sctp_init_msg
+
+
+/* Selective Ack (SACK) */
+struct sctp_gap_ack_block {
+ uint16_t start; /* Gap Ack block start */
+ uint16_t end; /* Gap Ack block end */
+} SCTP_PACKED;
+
+struct sctp_sack {
+ uint32_t cum_tsn_ack; /* cumulative TSN Ack */
+ uint32_t a_rwnd; /* updated a_rwnd of sender */
+ uint16_t num_gap_ack_blks; /* number of Gap Ack blocks */
+ uint16_t num_dup_tsns; /* number of duplicate TSNs */
+ /* struct sctp_gap_ack_block's follow */
+ /* uint32_t duplicate_tsn's follow */
+} SCTP_PACKED;
+
+struct sctp_sack_chunk {
+ struct sctp_chunkhdr ch;
+ struct sctp_sack sack;
+} SCTP_PACKED;
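+
+/*
+ * Worked example (illustrative only, the values are hypothetical): gap ack
+ * block offsets are relative to cum_tsn_ack.  With cum_tsn_ack = 1000 and a
+ * single block {start = 2, end = 4}, the peer is reporting that TSNs 1002
+ * through 1004 have arrived while TSN 1001 is still missing.
+ */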
+
+struct sctp_nr_sack {
+ uint32_t cum_tsn_ack; /* cumulative TSN Ack */
+ uint32_t a_rwnd; /* updated a_rwnd of sender */
+ uint16_t num_gap_ack_blks; /* number of Gap Ack blocks */
+ uint16_t num_nr_gap_ack_blks; /* number of NR Gap Ack blocks */
+ uint16_t num_dup_tsns; /* number of duplicate TSNs */
+ uint16_t reserved; /* not currently used */
+ /* struct sctp_gap_ack_block's follow */
+ /* uint32_t duplicate_tsn's follow */
+} SCTP_PACKED;
+
+struct sctp_nr_sack_chunk {
+ struct sctp_chunkhdr ch;
+ struct sctp_nr_sack nr_sack;
+} SCTP_PACKED;
+
+
+/* Heartbeat Request (HEARTBEAT) */
+struct sctp_heartbeat {
+ struct sctp_heartbeat_info_param hb_info;
+} SCTP_PACKED;
+
+struct sctp_heartbeat_chunk {
+ struct sctp_chunkhdr ch;
+ struct sctp_heartbeat heartbeat;
+} SCTP_PACKED;
+
+/* ... used for Heartbeat Ack (HEARTBEAT ACK) */
+#define sctp_heartbeat_ack sctp_heartbeat
+#define sctp_heartbeat_ack_chunk sctp_heartbeat_chunk
+
+
+/* Abort Association (ABORT) */
+struct sctp_abort_chunk {
+ struct sctp_chunkhdr ch;
+ /* optional error cause may follow */
+} SCTP_PACKED;
+
+struct sctp_abort_msg {
+ struct sctphdr sh;
+ struct sctp_abort_chunk msg;
+} SCTP_PACKED;
+
+
+/* Shutdown Association (SHUTDOWN) */
+struct sctp_shutdown_chunk {
+ struct sctp_chunkhdr ch;
+ uint32_t cumulative_tsn_ack;
+} SCTP_PACKED;
+
+
+/* Shutdown Acknowledgment (SHUTDOWN ACK) */
+struct sctp_shutdown_ack_chunk {
+ struct sctp_chunkhdr ch;
+} SCTP_PACKED;
+
+
+/* Operation Error (ERROR) */
+struct sctp_error_chunk {
+ struct sctp_chunkhdr ch;
+ /* optional error causes follow */
+} SCTP_PACKED;
+
+
+/* Cookie Echo (COOKIE ECHO) */
+struct sctp_cookie_echo_chunk {
+ struct sctp_chunkhdr ch;
+ struct sctp_state_cookie cookie;
+} SCTP_PACKED;
+
+/* Cookie Acknowledgment (COOKIE ACK) */
+struct sctp_cookie_ack_chunk {
+ struct sctp_chunkhdr ch;
+} SCTP_PACKED;
+
+/* Explicit Congestion Notification Echo (ECNE) */
+struct sctp_ecne_chunk {
+ struct sctp_chunkhdr ch;
+ uint32_t tsn;
+} SCTP_PACKED;
+
+/* Congestion Window Reduced (CWR) */
+struct sctp_cwr_chunk {
+ struct sctp_chunkhdr ch;
+ uint32_t tsn;
+} SCTP_PACKED;
+
+/* Shutdown Complete (SHUTDOWN COMPLETE) */
+struct sctp_shutdown_complete_chunk {
+ struct sctp_chunkhdr ch;
+} SCTP_PACKED;
+
+/* Oper error holding a stale cookie */
+struct sctp_stale_cookie_msg {
+ struct sctp_paramhdr ph;/* really an error cause */
+ uint32_t time_usec;
+} SCTP_PACKED;
+
+struct sctp_adaptation_layer_indication {
+ struct sctp_paramhdr ph;
+ uint32_t indication;
+} SCTP_PACKED;
+
+struct sctp_cookie_while_shutting_down {
+ struct sctphdr sh;
+ struct sctp_chunkhdr ch;
+ struct sctp_paramhdr ph;/* really an error cause */
+} SCTP_PACKED;
+
+struct sctp_shutdown_complete_msg {
+ struct sctphdr sh;
+ struct sctp_shutdown_complete_chunk shut_cmp;
+} SCTP_PACKED;
+
+/*
+ * draft-ietf-tsvwg-addip-sctp
+ */
+/* Address/Stream Configuration Change (ASCONF) */
+struct sctp_asconf_chunk {
+ struct sctp_chunkhdr ch;
+ uint32_t serial_number;
+ /* lookup address parameter (mandatory) */
+ /* asconf parameters follow */
+} SCTP_PACKED;
+
+/* Address/Stream Configuration Acknowledge (ASCONF ACK) */
+struct sctp_asconf_ack_chunk {
+ struct sctp_chunkhdr ch;
+ uint32_t serial_number;
+ /* asconf parameters follow */
+} SCTP_PACKED;
+
+/* draft-ietf-tsvwg-prsctp */
+/* Forward Cumulative TSN (FORWARD TSN) */
+struct sctp_forward_tsn_chunk {
+ struct sctp_chunkhdr ch;
+ uint32_t new_cumulative_tsn;
+ /* stream/sequence pairs (sctp_strseq) follow */
+} SCTP_PACKED;
+
+struct sctp_strseq {
+ uint16_t stream;
+ uint16_t sequence;
+} SCTP_PACKED;
+
+struct sctp_forward_tsn_msg {
+ struct sctphdr sh;
+ struct sctp_forward_tsn_chunk msg;
+} SCTP_PACKED;
+
+/* should be one less than a multiple of 4, i.e. 3/7/11 etc. */
+
+#define SCTP_NUM_DB_TO_VERIFY 31
+
+struct sctp_chunk_desc {
+ uint8_t chunk_type;
+ uint8_t data_bytes[SCTP_NUM_DB_TO_VERIFY];
+ uint32_t tsn_ifany;
+} SCTP_PACKED;
+
+
+struct sctp_pktdrop_chunk {
+ struct sctp_chunkhdr ch;
+ uint32_t bottle_bw;
+ uint32_t current_onq;
+ uint16_t trunc_len;
+ uint16_t reserved;
+ uint8_t data[];
+} SCTP_PACKED;
+
+/**********STREAM RESET STUFF ******************/
+
+struct sctp_stream_reset_out_request {
+ struct sctp_paramhdr ph;
+ uint32_t request_seq; /* monotonically increasing seq no */
+ uint32_t response_seq; /* if a response, the resp seq no */
+ uint32_t send_reset_at_tsn; /* last TSN I assigned outbound */
+ uint16_t list_of_streams[]; /* if not all list of streams */
+} SCTP_PACKED;
+
+struct sctp_stream_reset_in_request {
+ struct sctp_paramhdr ph;
+ uint32_t request_seq;
+ uint16_t list_of_streams[]; /* if not all list of streams */
+} SCTP_PACKED;
+
+
+struct sctp_stream_reset_tsn_request {
+ struct sctp_paramhdr ph;
+ uint32_t request_seq;
+} SCTP_PACKED;
+
+struct sctp_stream_reset_response {
+ struct sctp_paramhdr ph;
+ uint32_t response_seq; /* if a response, the resp seq no */
+ uint32_t result;
+} SCTP_PACKED;
+
+struct sctp_stream_reset_response_tsn {
+ struct sctp_paramhdr ph;
+ uint32_t response_seq; /* if a response, the resp seq no */
+ uint32_t result;
+ uint32_t senders_next_tsn;
+ uint32_t receivers_next_tsn;
+} SCTP_PACKED;
+
+struct sctp_stream_reset_add_strm {
+ struct sctp_paramhdr ph;
+ uint32_t request_seq;
+ uint16_t number_of_streams;
+ uint16_t reserved;
+} SCTP_PACKED;
+
+#define SCTP_STREAM_RESET_NOTHING 0x00000000 /* Nothing for me to do */
+#define SCTP_STREAM_RESET_PERFORMED 0x00000001 /* Did it */
+#define SCTP_STREAM_RESET_DENIED 0x00000002 /* refused to do it */
+#define SCTP_STREAM_RESET_ERROR_STR 0x00000003 /* bad Stream no */
+#define SCTP_STREAM_RESET_TRY_LATER 0x00000004 /* collision, try again */
+#define SCTP_STREAM_RESET_BAD_SEQNO 0x00000005 /* bad str-reset seq no */
+
+/*
+ * convenience structures; note that if you are making a request for specific
+ * streams then the request will need to be an overlay structure.
+ */
+
+struct sctp_stream_reset_out_req {
+ struct sctp_chunkhdr ch;
+ struct sctp_stream_reset_out_request sr_req;
+} SCTP_PACKED;
+
+struct sctp_stream_reset_in_req {
+ struct sctp_chunkhdr ch;
+ struct sctp_stream_reset_in_request sr_req;
+} SCTP_PACKED;
+
+struct sctp_stream_reset_tsn_req {
+ struct sctp_chunkhdr ch;
+ struct sctp_stream_reset_tsn_request sr_req;
+} SCTP_PACKED;
+
+struct sctp_stream_reset_resp {
+ struct sctp_chunkhdr ch;
+ struct sctp_stream_reset_response sr_resp;
+} SCTP_PACKED;
+
+/* response only valid with a TSN request */
+struct sctp_stream_reset_resp_tsn {
+ struct sctp_chunkhdr ch;
+ struct sctp_stream_reset_response_tsn sr_resp;
+} SCTP_PACKED;
+
+/****************************************************/
+
+/*
+ * Authenticated chunks support draft-ietf-tsvwg-sctp-auth
+ */
+
+/* Should we make the max be 32? */
+#define SCTP_RANDOM_MAX_SIZE 256
+struct sctp_auth_random {
+ struct sctp_paramhdr ph;/* type = 0x8002 */
+ uint8_t random_data[];
+} SCTP_PACKED;
+
+struct sctp_auth_chunk_list {
+ struct sctp_paramhdr ph;/* type = 0x8003 */
+ uint8_t chunk_types[];
+} SCTP_PACKED;
+
+struct sctp_auth_hmac_algo {
+ struct sctp_paramhdr ph;/* type = 0x8004 */
+ uint16_t hmac_ids[];
+} SCTP_PACKED;
+
+struct sctp_auth_chunk {
+ struct sctp_chunkhdr ch;
+ uint16_t shared_key_id;
+ uint16_t hmac_id;
+ uint8_t hmac[];
+} SCTP_PACKED;
+
+struct sctp_auth_invalid_hmac {
+ struct sctp_paramhdr ph;
+ uint16_t hmac_id;
+ uint16_t padding;
+} SCTP_PACKED;
+
+/*
+ * We pre-reserve enough room for an ECNE or CWR AND a SACK with no missing
+ * pieces. If the ECNE is missing we could have a couple of blocks. This way
+ * we optimize so we MOST likely can bundle a SACK/ECN with the smallest size
+ * data chunk I will split into. We could increase throughput slightly by
+ * taking these two out, but I feel the 24 bytes for the SACK and 8 for the
+ * CWR (the 32 bytes I pre-reserve) are worth it for now.
+ */
+#ifndef SCTP_MAX_OVERHEAD
+#ifdef INET6
+#define SCTP_MAX_OVERHEAD (sizeof(struct sctp_data_chunk) + \
+ sizeof(struct sctphdr) + \
+ sizeof(struct sctp_ecne_chunk) + \
+ sizeof(struct sctp_sack_chunk) + \
+ sizeof(struct ip6_hdr))
+
+#define SCTP_MED_OVERHEAD (sizeof(struct sctp_data_chunk) + \
+ sizeof(struct sctphdr) + \
+ sizeof(struct ip6_hdr))
+
+
+#define SCTP_MIN_OVERHEAD (sizeof(struct ip6_hdr) + \
+ sizeof(struct sctphdr))
+
+#else
+#define SCTP_MAX_OVERHEAD (sizeof(struct sctp_data_chunk) + \
+ sizeof(struct sctphdr) + \
+ sizeof(struct sctp_ecne_chunk) + \
+ sizeof(struct sctp_sack_chunk) + \
+ sizeof(struct ip))
+
+#define SCTP_MED_OVERHEAD (sizeof(struct sctp_data_chunk) + \
+ sizeof(struct sctphdr) + \
+ sizeof(struct ip))
+
+
+#define SCTP_MIN_OVERHEAD (sizeof(struct ip) + \
+ sizeof(struct sctphdr))
+
+#endif /* INET6 */
+#endif /* !SCTP_MAX_OVERHEAD */
+
+#define SCTP_MED_V4_OVERHEAD (sizeof(struct sctp_data_chunk) + \
+ sizeof(struct sctphdr) + \
+ sizeof(struct ip))
+
+#define SCTP_MIN_V4_OVERHEAD (sizeof(struct ip) + \
+ sizeof(struct sctphdr))
+
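+/*
+ * Rough worked example (assuming a 20-byte IPv4 header with no options, a
+ * 12-byte SCTP common header and a 16-byte DATA chunk header):
+ * SCTP_MIN_V4_OVERHEAD = 20 + 12 = 32 bytes, and SCTP_MED_V4_OVERHEAD =
+ * 20 + 12 + 16 = 48 bytes of overhead per packet carrying user data.
+ */
+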
+#undef SCTP_PACKED
+#endif /* !__sctp_header_h__ */
diff --git a/freebsd/sys/netinet/sctp_indata.c b/freebsd/sys/netinet/sctp_indata.c
new file mode 100644
index 00000000..963b3205
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_indata.c
@@ -0,0 +1,5800 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_indata.c,v 1.36 2005/03/06 16:04:17 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp_var.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctp_header.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_output.h>
+#include <freebsd/netinet/sctp_input.h>
+#include <freebsd/netinet/sctp_indata.h>
+#include <freebsd/netinet/sctp_uio.h>
+#include <freebsd/netinet/sctp_timer.h>
+
+
+/*
+ * NOTES: On the outbound side of things I need to check the sack timer to
+ * see if I should generate a SACK into the chunk queue (if I have data to
+ * send, that is, and will be sending it) for bundling.
+ *
+ * The callback in sctp_usrreq.c will get called when the socket is read from.
+ * This will cause sctp_service_queues() to get called on the top entry in
+ * the list.
+ */
+
+void
+sctp_set_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc)
+{
+ asoc->my_rwnd = sctp_calc_rwnd(stcb, asoc);
+}
+
+/* Calculate what the rwnd would be */
+uint32_t
+sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc)
+{
+ uint32_t calc = 0;
+
+ /*
+	 * This is really set wrong with respect to a one-to-many socket,
+	 * since sb_cc is the count that everyone has put up. When we
+	 * re-write sctp_soreceive we will fix this so that ONLY this
+	 * association's data is taken into account.
+ */
+ if (stcb->sctp_socket == NULL)
+ return (calc);
+
+ if (stcb->asoc.sb_cc == 0 &&
+ asoc->size_on_reasm_queue == 0 &&
+ asoc->size_on_all_streams == 0) {
+ /* Full rwnd granted */
+ calc = max(SCTP_SB_LIMIT_RCV(stcb->sctp_socket), SCTP_MINIMAL_RWND);
+ return (calc);
+ }
+ /* get actual space */
+ calc = (uint32_t) sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv);
+
+ /*
+	 * take out what has NOT been put on the socket queue but that we
+	 * still hold for putting up.
+ */
+ calc = sctp_sbspace_sub(calc, (uint32_t) (asoc->size_on_reasm_queue +
+ asoc->cnt_on_reasm_queue * MSIZE));
+ calc = sctp_sbspace_sub(calc, (uint32_t) (asoc->size_on_all_streams +
+ asoc->cnt_on_all_streams * MSIZE));
+
+ if (calc == 0) {
+ /* out of space */
+ return (calc);
+ }
+ /* what is the overhead of all these rwnd's */
+ calc = sctp_sbspace_sub(calc, stcb->asoc.my_rwnd_control_len);
+ /*
+	 * If the window gets too small due to ctrl-stuff, reduce it to 1,
+	 * even if it is 0, so SWS avoidance stays engaged.
+ */
+ if (calc < stcb->asoc.my_rwnd_control_len) {
+ calc = 1;
+ }
+ return (calc);
+}
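+
+/*
+ * Worked example for sctp_calc_rwnd() with hypothetical numbers (the buffer
+ * limit, queue contents and MSIZE below are assumptions, not values taken
+ * from this code): with 65536 bytes of receive-buffer space, 8192 bytes held
+ * on the reassembly queue in 4 chunks, nothing on the stream queues and
+ * MSIZE = 256, calc = 65536 - (8192 + 4 * 256) = 56320 before the control
+ * overhead (my_rwnd_control_len) is subtracted.
+ */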
+
+
+
+/*
+ * Build out our readq entry based on the incoming packet.
+ */
+struct sctp_queued_to_read *
+sctp_build_readq_entry(struct sctp_tcb *stcb,
+ struct sctp_nets *net,
+ uint32_t tsn, uint32_t ppid,
+ uint32_t context, uint16_t stream_no,
+ uint16_t stream_seq, uint8_t flags,
+ struct mbuf *dm)
+{
+ struct sctp_queued_to_read *read_queue_e = NULL;
+
+ sctp_alloc_a_readq(stcb, read_queue_e);
+ if (read_queue_e == NULL) {
+ goto failed_build;
+ }
+ read_queue_e->sinfo_stream = stream_no;
+ read_queue_e->sinfo_ssn = stream_seq;
+ read_queue_e->sinfo_flags = (flags << 8);
+ read_queue_e->sinfo_ppid = ppid;
+ read_queue_e->sinfo_context = stcb->asoc.context;
+ read_queue_e->sinfo_timetolive = 0;
+ read_queue_e->sinfo_tsn = tsn;
+ read_queue_e->sinfo_cumtsn = tsn;
+ read_queue_e->sinfo_assoc_id = sctp_get_associd(stcb);
+ read_queue_e->whoFrom = net;
+ read_queue_e->length = 0;
+ atomic_add_int(&net->ref_count, 1);
+ read_queue_e->data = dm;
+ read_queue_e->spec_flags = 0;
+ read_queue_e->tail_mbuf = NULL;
+ read_queue_e->aux_data = NULL;
+ read_queue_e->stcb = stcb;
+ read_queue_e->port_from = stcb->rport;
+ read_queue_e->do_not_ref_stcb = 0;
+ read_queue_e->end_added = 0;
+ read_queue_e->some_taken = 0;
+ read_queue_e->pdapi_aborted = 0;
+failed_build:
+ return (read_queue_e);
+}
+
+
+/*
+ * Build out our readq entry based on the incoming packet.
+ */
+static struct sctp_queued_to_read *
+sctp_build_readq_entry_chk(struct sctp_tcb *stcb,
+ struct sctp_tmit_chunk *chk)
+{
+ struct sctp_queued_to_read *read_queue_e = NULL;
+
+ sctp_alloc_a_readq(stcb, read_queue_e);
+ if (read_queue_e == NULL) {
+ goto failed_build;
+ }
+ read_queue_e->sinfo_stream = chk->rec.data.stream_number;
+ read_queue_e->sinfo_ssn = chk->rec.data.stream_seq;
+ read_queue_e->sinfo_flags = (chk->rec.data.rcv_flags << 8);
+ read_queue_e->sinfo_ppid = chk->rec.data.payloadtype;
+ read_queue_e->sinfo_context = stcb->asoc.context;
+ read_queue_e->sinfo_timetolive = 0;
+ read_queue_e->sinfo_tsn = chk->rec.data.TSN_seq;
+ read_queue_e->sinfo_cumtsn = chk->rec.data.TSN_seq;
+ read_queue_e->sinfo_assoc_id = sctp_get_associd(stcb);
+ read_queue_e->whoFrom = chk->whoTo;
+ read_queue_e->aux_data = NULL;
+ read_queue_e->length = 0;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ read_queue_e->data = chk->data;
+ read_queue_e->tail_mbuf = NULL;
+ read_queue_e->stcb = stcb;
+ read_queue_e->port_from = stcb->rport;
+ read_queue_e->spec_flags = 0;
+ read_queue_e->do_not_ref_stcb = 0;
+ read_queue_e->end_added = 0;
+ read_queue_e->some_taken = 0;
+ read_queue_e->pdapi_aborted = 0;
+failed_build:
+ return (read_queue_e);
+}
+
+
+struct mbuf *
+sctp_build_ctl_nchunk(struct sctp_inpcb *inp,
+ struct sctp_sndrcvinfo *sinfo)
+{
+ struct sctp_sndrcvinfo *outinfo;
+ struct cmsghdr *cmh;
+ struct mbuf *ret;
+ int len;
+ int use_extended = 0;
+
+ if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) {
+ /* user does not want the sndrcv ctl */
+ return (NULL);
+ }
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) {
+ use_extended = 1;
+ len = CMSG_LEN(sizeof(struct sctp_extrcvinfo));
+ } else {
+ len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+ }
+
+
+ ret = sctp_get_mbuf_for_msg(len,
+ 0, M_DONTWAIT, 1, MT_DATA);
+
+ if (ret == NULL) {
+ /* No space */
+ return (ret);
+ }
+ /* We need a CMSG header followed by the struct */
+ cmh = mtod(ret, struct cmsghdr *);
+ outinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmh);
+ cmh->cmsg_level = IPPROTO_SCTP;
+ if (use_extended) {
+ cmh->cmsg_type = SCTP_EXTRCV;
+ cmh->cmsg_len = len;
+ memcpy(outinfo, sinfo, len);
+ } else {
+ cmh->cmsg_type = SCTP_SNDRCV;
+ cmh->cmsg_len = len;
+ *outinfo = *sinfo;
+ }
+ SCTP_BUF_LEN(ret) = cmh->cmsg_len;
+ return (ret);
+}
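+
+/*
+ * For illustration only -- a user-space sketch, not part of this file: the
+ * ancillary data built above is what an application walks after recvmsg()
+ * (assuming it has subscribed to the data I/O event), e.g.:
+ *
+ *	struct cmsghdr *c;
+ *	struct sctp_sndrcvinfo *info;
+ *
+ *	for (c = CMSG_FIRSTHDR(&msg); c != NULL; c = CMSG_NXTHDR(&msg, c)) {
+ *		if (c->cmsg_level == IPPROTO_SCTP &&
+ *		    c->cmsg_type == SCTP_SNDRCV) {
+ *			info = (struct sctp_sndrcvinfo *)CMSG_DATA(c);
+ *			... read info->sinfo_stream, info->sinfo_ppid, etc.
+ *		}
+ *	}
+ */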
+
+
+char *
+sctp_build_ctl_cchunk(struct sctp_inpcb *inp,
+ int *control_len,
+ struct sctp_sndrcvinfo *sinfo)
+{
+ struct sctp_sndrcvinfo *outinfo;
+ struct cmsghdr *cmh;
+ char *buf;
+ int len;
+ int use_extended = 0;
+
+ if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) {
+ /* user does not want the sndrcv ctl */
+ return (NULL);
+ }
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) {
+ use_extended = 1;
+ len = CMSG_LEN(sizeof(struct sctp_extrcvinfo));
+ } else {
+ len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+ }
+ SCTP_MALLOC(buf, char *, len, SCTP_M_CMSG);
+ if (buf == NULL) {
+ /* No space */
+ return (buf);
+ }
+ /* We need a CMSG header followed by the struct */
+ cmh = (struct cmsghdr *)buf;
+ outinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmh);
+ cmh->cmsg_level = IPPROTO_SCTP;
+ if (use_extended) {
+ cmh->cmsg_type = SCTP_EXTRCV;
+ cmh->cmsg_len = len;
+ memcpy(outinfo, sinfo, len);
+ } else {
+ cmh->cmsg_type = SCTP_SNDRCV;
+ cmh->cmsg_len = len;
+ *outinfo = *sinfo;
+ }
+ *control_len = len;
+ return (buf);
+}
+
+static void
+sctp_mark_non_revokable(struct sctp_association *asoc, uint32_t tsn)
+{
+ uint32_t gap, i, cumackp1;
+ int fnd = 0;
+
+ if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) {
+ return;
+ }
+ cumackp1 = asoc->cumulative_tsn + 1;
+ if (compare_with_wrap(cumackp1, tsn, MAX_TSN)) {
+ /*
+ * this tsn is behind the cum ack and thus we don't need to
+ * worry about it being moved from one to the other.
+ */
+ return;
+ }
+ SCTP_CALC_TSN_TO_GAP(gap, tsn, asoc->mapping_array_base_tsn);
+ if (!SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) {
+ printf("gap:%x tsn:%x\n", gap, tsn);
+ sctp_print_mapping_array(asoc);
+#ifdef INVARIANTS
+ panic("Things are really messed up now!!");
+#endif
+ }
+ SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
+ SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
+ if (compare_with_wrap(tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) {
+ asoc->highest_tsn_inside_nr_map = tsn;
+ }
+ if (tsn == asoc->highest_tsn_inside_map) {
+ /* We must back down to see what the new highest is */
+ for (i = tsn - 1; (compare_with_wrap(i, asoc->mapping_array_base_tsn, MAX_TSN) ||
+ (i == asoc->mapping_array_base_tsn)); i--) {
+ SCTP_CALC_TSN_TO_GAP(gap, i, asoc->mapping_array_base_tsn);
+ if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) {
+ asoc->highest_tsn_inside_map = i;
+ fnd = 1;
+ break;
+ }
+ }
+ if (!fnd) {
+ asoc->highest_tsn_inside_map = asoc->mapping_array_base_tsn - 1;
+ }
+ }
+}
+
+
+/*
+ * We are delivering currently from the reassembly queue. We must continue to
+ * deliver until we either: 1) run out of space. 2) run out of sequential
+ * TSN's 3) hit the SCTP_DATA_LAST_FRAG flag.
+ */
+static void
+sctp_service_reassembly(struct sctp_tcb *stcb, struct sctp_association *asoc)
+{
+ struct sctp_tmit_chunk *chk;
+ uint16_t nxt_todel;
+ uint16_t stream_no;
+ int end = 0;
+ int cntDel;
+
+ struct sctp_queued_to_read *control, *ctl, *ctlat;
+
+ if (stcb == NULL)
+ return;
+
+ cntDel = stream_no = 0;
+ if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
+ (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) ||
+ (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) {
+ /* socket above is long gone or going.. */
+abandon:
+ asoc->fragmented_delivery_inprogress = 0;
+ chk = TAILQ_FIRST(&asoc->reasmqueue);
+ while (chk) {
+ TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next);
+ asoc->size_on_reasm_queue -= chk->send_size;
+ sctp_ucount_decr(asoc->cnt_on_reasm_queue);
+ /*
+			 * Lose the data pointer, since it's in the socket
+ * buffer
+ */
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ /* Now free the address and data */
+ sctp_free_a_chunk(stcb, chk);
+ /* sa_ignore FREED_MEMORY */
+ chk = TAILQ_FIRST(&asoc->reasmqueue);
+ }
+ return;
+ }
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ do {
+ chk = TAILQ_FIRST(&asoc->reasmqueue);
+ if (chk == NULL) {
+ return;
+ }
+ if (chk->rec.data.TSN_seq != (asoc->tsn_last_delivered + 1)) {
+ /* Can't deliver more :< */
+ return;
+ }
+ stream_no = chk->rec.data.stream_number;
+ nxt_todel = asoc->strmin[stream_no].last_sequence_delivered + 1;
+ if (nxt_todel != chk->rec.data.stream_seq &&
+ (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) {
+ /*
+			 * Not the next sequence to deliver in its stream and
+			 * not unordered, so we can't deliver it yet
+ */
+ return;
+ }
+ if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
+
+ control = sctp_build_readq_entry_chk(stcb, chk);
+ if (control == NULL) {
+ /* out of memory? */
+ return;
+ }
+ /* save it off for our future deliveries */
+ stcb->asoc.control_pdapi = control;
+ if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG)
+ end = 1;
+ else
+ end = 0;
+ sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq);
+ sctp_add_to_readq(stcb->sctp_ep,
+ stcb, control, &stcb->sctp_socket->so_rcv, end,
+ SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
+ cntDel++;
+ } else {
+ if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG)
+ end = 1;
+ else
+ end = 0;
+ sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq);
+ if (sctp_append_to_readq(stcb->sctp_ep, stcb,
+ stcb->asoc.control_pdapi,
+ chk->data, end, chk->rec.data.TSN_seq,
+ &stcb->sctp_socket->so_rcv)) {
+ /*
+ * something is very wrong, either
+ * control_pdapi is NULL, or the tail_mbuf
+				 * is corrupt, or there is an EOM already on
+ * the mbuf chain.
+ */
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ goto abandon;
+ } else {
+#ifdef INVARIANTS
+ if ((stcb->asoc.control_pdapi == NULL) || (stcb->asoc.control_pdapi->tail_mbuf == NULL)) {
+ panic("This should not happen control_pdapi NULL?");
+ }
+					/* if we did not panic, it was an EOM */
+ panic("Bad chunking ??");
+#else
+ if ((stcb->asoc.control_pdapi == NULL) || (stcb->asoc.control_pdapi->tail_mbuf == NULL)) {
+ SCTP_PRINTF("This should not happen control_pdapi NULL?\n");
+ }
+ SCTP_PRINTF("Bad chunking ??\n");
+ SCTP_PRINTF("Dumping re-assembly queue this will probably hose the association\n");
+
+#endif
+ goto abandon;
+ }
+ }
+ cntDel++;
+ }
+ /* pull it we did it */
+ TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next);
+ if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
+ asoc->fragmented_delivery_inprogress = 0;
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) {
+ asoc->strmin[stream_no].last_sequence_delivered++;
+ }
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0) {
+ SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs);
+ }
+ } else if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
+ /*
+ * turn the flag back on since we just delivered
+ * yet another one.
+ */
+ asoc->fragmented_delivery_inprogress = 1;
+ }
+ asoc->tsn_of_pdapi_last_delivered = chk->rec.data.TSN_seq;
+ asoc->last_flags_delivered = chk->rec.data.rcv_flags;
+ asoc->last_strm_seq_delivered = chk->rec.data.stream_seq;
+ asoc->last_strm_no_delivered = chk->rec.data.stream_number;
+
+ asoc->tsn_last_delivered = chk->rec.data.TSN_seq;
+ asoc->size_on_reasm_queue -= chk->send_size;
+ sctp_ucount_decr(asoc->cnt_on_reasm_queue);
+ /* free up the chk */
+ chk->data = NULL;
+ sctp_free_a_chunk(stcb, chk);
+
+ if (asoc->fragmented_delivery_inprogress == 0) {
+ /*
+ * Now lets see if we can deliver the next one on
+ * the stream
+ */
+ struct sctp_stream_in *strm;
+
+ strm = &asoc->strmin[stream_no];
+ nxt_todel = strm->last_sequence_delivered + 1;
+ ctl = TAILQ_FIRST(&strm->inqueue);
+ if (ctl && (nxt_todel == ctl->sinfo_ssn)) {
+ while (ctl != NULL) {
+ /* Deliver more if we can. */
+ if (nxt_todel == ctl->sinfo_ssn) {
+ ctlat = TAILQ_NEXT(ctl, next);
+ TAILQ_REMOVE(&strm->inqueue, ctl, next);
+ asoc->size_on_all_streams -= ctl->length;
+ sctp_ucount_decr(asoc->cnt_on_all_streams);
+ strm->last_sequence_delivered++;
+ sctp_mark_non_revokable(asoc, ctl->sinfo_tsn);
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ ctl,
+ &stcb->sctp_socket->so_rcv, 1,
+ SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
+ ctl = ctlat;
+ } else {
+ break;
+ }
+ nxt_todel = strm->last_sequence_delivered + 1;
+ }
+ }
+ break;
+ }
+ /* sa_ignore FREED_MEMORY */
+ chk = TAILQ_FIRST(&asoc->reasmqueue);
+ } while (chk);
+}
+
+/*
+ * Queue the chunk either right into the socket buffer if it is the next one
+ * to go OR put it in the correct place in the delivery queue. If we do
+ * append to the so_buf, keep doing so until we are out of order. One big
+ * question still remains, what to do when the socket buffer is FULL??
+ */
+static void
+sctp_queue_data_to_stream(struct sctp_tcb *stcb, struct sctp_association *asoc,
+ struct sctp_queued_to_read *control, int *abort_flag)
+{
+ /*
+	 * FIX-ME maybe? What happens when the ssn wraps? If we are getting
+	 * all the data in one stream this could happen quite rapidly. One
+	 * could use the TSN to keep track of things, but this scheme breaks
+	 * down in the other type of stream usage that could occur. Send a
+	 * single msg to stream 0, send 4 billion messages to stream 1, now
+	 * send a message to stream 0. You have a situation where the TSN
+	 * has wrapped but not in the stream. Is this worth worrying about,
+	 * or should we just change our queue sort at the bottom to be by
+	 * TSN?
+	 *
+	 * Could it also be legal for a peer to send ssn 1 with TSN 2 and ssn 2
+	 * with TSN 1? If the peer is doing some sort of funky TSN/SSN
+	 * assignment this could happen... and I don't see how this would be
+	 * a violation. So for now I am undecided and will leave the sort by
+	 * SSN alone. Maybe a hybrid approach is the answer.
+ *
+ */
+ struct sctp_stream_in *strm;
+ struct sctp_queued_to_read *at;
+ int queue_needed;
+ uint16_t nxt_todel;
+ struct mbuf *oper;
+
+ queue_needed = 1;
+ asoc->size_on_all_streams += control->length;
+ sctp_ucount_incr(asoc->cnt_on_all_streams);
+ strm = &asoc->strmin[control->sinfo_stream];
+ nxt_todel = strm->last_sequence_delivered + 1;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
+ sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_INTO_STRD);
+ }
+ SCTPDBG(SCTP_DEBUG_INDATA1,
+ "queue to stream called for ssn:%u lastdel:%u nxt:%u\n",
+ (uint32_t) control->sinfo_stream,
+ (uint32_t) strm->last_sequence_delivered,
+ (uint32_t) nxt_todel);
+ if (compare_with_wrap(strm->last_sequence_delivered,
+ control->sinfo_ssn, MAX_SEQ) ||
+ (strm->last_sequence_delivered == control->sinfo_ssn)) {
+ /* The incoming sseq is behind where we last delivered? */
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Duplicate S-SEQ:%d delivered:%d from peer, Abort association\n",
+ control->sinfo_ssn, strm->last_sequence_delivered);
+protocol_error:
+ /*
+ * throw it in the stream so it gets cleaned up in
+ * association destruction
+ */
+ TAILQ_INSERT_HEAD(&strm->inqueue, control, next);
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
+ (sizeof(uint32_t) * 3);
+ ph = mtod(oper, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length = htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_1);
+ ippp++;
+ *ippp = control->sinfo_tsn;
+ ippp++;
+ *ippp = ((control->sinfo_stream << 16) | control->sinfo_ssn);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_1;
+ sctp_abort_an_association(stcb->sctp_ep, stcb,
+ SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+
+ *abort_flag = 1;
+ return;
+
+ }
+ if (nxt_todel == control->sinfo_ssn) {
+ /* can be delivered right away? */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
+ sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_IMMED_DEL);
+ }
+		/* EY it won't be queued if it could be delivered directly */
+ queue_needed = 0;
+ asoc->size_on_all_streams -= control->length;
+ sctp_ucount_decr(asoc->cnt_on_all_streams);
+ strm->last_sequence_delivered++;
+
+ sctp_mark_non_revokable(asoc, control->sinfo_tsn);
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, 1,
+ SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
+ control = TAILQ_FIRST(&strm->inqueue);
+ while (control != NULL) {
+ /* all delivered */
+ nxt_todel = strm->last_sequence_delivered + 1;
+ if (nxt_todel == control->sinfo_ssn) {
+ at = TAILQ_NEXT(control, next);
+ TAILQ_REMOVE(&strm->inqueue, control, next);
+ asoc->size_on_all_streams -= control->length;
+ sctp_ucount_decr(asoc->cnt_on_all_streams);
+ strm->last_sequence_delivered++;
+ /*
+ * We ignore the return of deliver_data here
+ * since we always can hold the chunk on the
+ * d-queue. And we have a finite number that
+ * can be delivered from the strq.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
+ sctp_log_strm_del(control, NULL,
+ SCTP_STR_LOG_FROM_IMMED_DEL);
+ }
+ sctp_mark_non_revokable(asoc, control->sinfo_tsn);
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, 1,
+ SCTP_READ_LOCK_NOT_HELD,
+ SCTP_SO_NOT_LOCKED);
+ control = at;
+ continue;
+ }
+ break;
+ }
+ }
+ if (queue_needed) {
+ /*
+ * Ok, we did not deliver this guy, find the correct place
+ * to put it on the queue.
+ */
+ if ((compare_with_wrap(asoc->cumulative_tsn,
+ control->sinfo_tsn, MAX_TSN)) ||
+ (control->sinfo_tsn == asoc->cumulative_tsn)) {
+ goto protocol_error;
+ }
+ if (TAILQ_EMPTY(&strm->inqueue)) {
+ /* Empty queue */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
+ sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_INSERT_HD);
+ }
+ TAILQ_INSERT_HEAD(&strm->inqueue, control, next);
+ } else {
+ TAILQ_FOREACH(at, &strm->inqueue, next) {
+ if (compare_with_wrap(at->sinfo_ssn,
+ control->sinfo_ssn, MAX_SEQ)) {
+ /*
+ * one in queue is bigger than the
+ * new one, insert before this one
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
+ sctp_log_strm_del(control, at,
+ SCTP_STR_LOG_FROM_INSERT_MD);
+ }
+ TAILQ_INSERT_BEFORE(at, control, next);
+ break;
+ } else if (at->sinfo_ssn == control->sinfo_ssn) {
+ /*
+ * Gak, He sent me a duplicate str
+ * seq number
+ */
+ /*
+ * foo bar, I guess I will just free
+ * this new guy, should we abort
+ * too? FIX ME MAYBE? Or it COULD be
+ * that the SSN's have wrapped.
+ * Maybe I should compare to TSN
+ * somehow... sigh for now just blow
+ * away the chunk!
+ */
+
+ if (control->data)
+ sctp_m_freem(control->data);
+ control->data = NULL;
+ asoc->size_on_all_streams -= control->length;
+ sctp_ucount_decr(asoc->cnt_on_all_streams);
+ if (control->whoFrom) {
+ sctp_free_remote_addr(control->whoFrom);
+ control->whoFrom = NULL;
+ }
+ sctp_free_a_readq(stcb, control);
+ return;
+ } else {
+ if (TAILQ_NEXT(at, next) == NULL) {
+ /*
+ * We are at the end, insert
+ * it after this one
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
+ sctp_log_strm_del(control, at,
+ SCTP_STR_LOG_FROM_INSERT_TL);
+ }
+ TAILQ_INSERT_AFTER(&strm->inqueue,
+ at, control, next);
+ break;
+ }
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Returns two things: You get the total size of the deliverable parts of the
+ * first fragmented message on the reassembly queue. And you get a 1 back if
+ * all of the message is ready or a 0 back if the message is still incomplete
+ */
+static int
+sctp_is_all_msg_on_reasm(struct sctp_association *asoc, uint32_t * t_size)
+{
+ struct sctp_tmit_chunk *chk;
+ uint32_t tsn;
+
+ *t_size = 0;
+ chk = TAILQ_FIRST(&asoc->reasmqueue);
+ if (chk == NULL) {
+ /* nothing on the queue */
+ return (0);
+ }
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0) {
+ /* Not a first on the queue */
+ return (0);
+ }
+ tsn = chk->rec.data.TSN_seq;
+ while (chk) {
+ if (tsn != chk->rec.data.TSN_seq) {
+ return (0);
+ }
+ *t_size += chk->send_size;
+ if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
+ return (1);
+ }
+ tsn++;
+ chk = TAILQ_NEXT(chk, sctp_next);
+ }
+ return (0);
+}
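+
+/*
+ * Example with hypothetical TSNs: if the reassembly queue holds TSN 10
+ * (FIRST), 11 (MIDDLE) and 12 (LAST), this returns 1 with *t_size set to
+ * the sum of the three send_sizes.  If TSN 11 were missing, the walk stops
+ * at the gap and returns 0 with *t_size covering only TSN 10.
+ */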
+
+static void
+sctp_deliver_reasm_check(struct sctp_tcb *stcb, struct sctp_association *asoc)
+{
+ struct sctp_tmit_chunk *chk;
+ uint16_t nxt_todel;
+ uint32_t tsize, pd_point;
+
+doit_again:
+ chk = TAILQ_FIRST(&asoc->reasmqueue);
+ if (chk == NULL) {
+ /* Huh? */
+ asoc->size_on_reasm_queue = 0;
+ asoc->cnt_on_reasm_queue = 0;
+ return;
+ }
+ if (asoc->fragmented_delivery_inprogress == 0) {
+ nxt_todel =
+ asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered + 1;
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) &&
+ (nxt_todel == chk->rec.data.stream_seq ||
+ (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED))) {
+ /*
+ * Yep the first one is here and its ok to deliver
+ * but should we?
+ */
+ if (stcb->sctp_socket) {
+ pd_point = min(SCTP_SB_LIMIT_RCV(stcb->sctp_socket),
+ stcb->sctp_ep->partial_delivery_point);
+ } else {
+ pd_point = stcb->sctp_ep->partial_delivery_point;
+ }
+ if (sctp_is_all_msg_on_reasm(asoc, &tsize) || (tsize >= pd_point)) {
+
+ /*
+				 * Yes, we set up to start reception by
+				 * backing down the TSN just in case we
+				 * can't deliver.
+ */
+ asoc->fragmented_delivery_inprogress = 1;
+ asoc->tsn_last_delivered =
+ chk->rec.data.TSN_seq - 1;
+ asoc->str_of_pdapi =
+ chk->rec.data.stream_number;
+ asoc->ssn_of_pdapi = chk->rec.data.stream_seq;
+ asoc->pdapi_ppid = chk->rec.data.payloadtype;
+ asoc->fragment_flags = chk->rec.data.rcv_flags;
+ sctp_service_reassembly(stcb, asoc);
+ }
+ }
+ } else {
+ /*
+		 * Service re-assembly will deliver stream data queued at
+		 * the end of fragmented delivery, but it won't know to go
+		 * back and call itself again... we do that here with the
+		 * goto doit_again.
+ */
+ sctp_service_reassembly(stcb, asoc);
+ if (asoc->fragmented_delivery_inprogress == 0) {
+ /*
+ * finished our Fragmented delivery, could be more
+ * waiting?
+ */
+ goto doit_again;
+ }
+ }
+}
+
+/*
+ * Dump onto the re-assembly queue, in its proper place. After dumping on the
+ * queue, see if anything can be delivered. If so, pull it off (or as much as
+ * we can). If we run out of space then we must dump what we can and set the
+ * appropriate flag to say we queued what we could.
+ */
+static void
+sctp_queue_data_for_reasm(struct sctp_tcb *stcb, struct sctp_association *asoc,
+ struct sctp_tmit_chunk *chk, int *abort_flag)
+{
+ struct mbuf *oper;
+ uint32_t cum_ackp1, last_tsn, prev_tsn, post_tsn;
+ u_char last_flags;
+ struct sctp_tmit_chunk *at, *prev, *next;
+
+ prev = next = NULL;
+ cum_ackp1 = asoc->tsn_last_delivered + 1;
+ if (TAILQ_EMPTY(&asoc->reasmqueue)) {
+ /* This is the first one on the queue */
+ TAILQ_INSERT_HEAD(&asoc->reasmqueue, chk, sctp_next);
+ /*
+ * we do not check for delivery of anything when only one
+ * fragment is here
+ */
+ asoc->size_on_reasm_queue = chk->send_size;
+ sctp_ucount_incr(asoc->cnt_on_reasm_queue);
+ if (chk->rec.data.TSN_seq == cum_ackp1) {
+ if (asoc->fragmented_delivery_inprogress == 0 &&
+ (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) !=
+ SCTP_DATA_FIRST_FRAG) {
+ /*
+				 * An empty queue, no delivery in progress,
+ * we hit the next one and it does NOT have
+ * a FIRST fragment mark.
+ */
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, its not first, no fragmented delivery in progress\n");
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (sizeof(uint32_t) * 3);
+ ph = mtod(oper, struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length = htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_2);
+ ippp++;
+ *ippp = chk->rec.data.TSN_seq;
+ ippp++;
+ *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
+
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_2;
+ sctp_abort_an_association(stcb->sctp_ep, stcb,
+ SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+ *abort_flag = 1;
+ } else if (asoc->fragmented_delivery_inprogress &&
+ (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == SCTP_DATA_FIRST_FRAG) {
+ /*
+ * We are doing a partial delivery and the
+ * NEXT chunk MUST be either the LAST or
+ * MIDDLE fragment NOT a FIRST
+ */
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS a first and fragmented delivery in progress\n");
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper, struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length = htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_3);
+ ippp++;
+ *ippp = chk->rec.data.TSN_seq;
+ ippp++;
+ *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_3;
+ sctp_abort_an_association(stcb->sctp_ep, stcb,
+ SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+ *abort_flag = 1;
+ } else if (asoc->fragmented_delivery_inprogress) {
+ /*
+ * Here we are ok with a MIDDLE or LAST
+ * piece
+ */
+ if (chk->rec.data.stream_number !=
+ asoc->str_of_pdapi) {
+ /* Got to be the right STR No */
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS not same stream number %d vs %d\n",
+ chk->rec.data.stream_number,
+ asoc->str_of_pdapi);
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (sizeof(uint32_t) * 3);
+ ph = mtod(oper,
+ struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length =
+ htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_4);
+ ippp++;
+ *ippp = chk->rec.data.TSN_seq;
+ ippp++;
+ *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_4;
+ sctp_abort_an_association(stcb->sctp_ep,
+ stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+ *abort_flag = 1;
+ } else if ((asoc->fragment_flags & SCTP_DATA_UNORDERED) !=
+ SCTP_DATA_UNORDERED &&
+ chk->rec.data.stream_seq != asoc->ssn_of_pdapi) {
+ /* Got to be the right STR Seq */
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS not same stream seq %d vs %d\n",
+ chk->rec.data.stream_seq,
+ asoc->ssn_of_pdapi);
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper,
+ struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length =
+ htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_5);
+ ippp++;
+ *ippp = chk->rec.data.TSN_seq;
+ ippp++;
+ *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
+
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_5;
+ sctp_abort_an_association(stcb->sctp_ep,
+ stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+ *abort_flag = 1;
+ }
+ }
+ }
+ return;
+ }
+ /* Find its place */
+ TAILQ_FOREACH(at, &asoc->reasmqueue, sctp_next) {
+ if (compare_with_wrap(at->rec.data.TSN_seq,
+ chk->rec.data.TSN_seq, MAX_TSN)) {
+ /*
+ * one in queue is bigger than the new one, insert
+ * before this one
+ */
+ /* A check */
+ asoc->size_on_reasm_queue += chk->send_size;
+ sctp_ucount_incr(asoc->cnt_on_reasm_queue);
+ next = at;
+ TAILQ_INSERT_BEFORE(at, chk, sctp_next);
+ break;
+ } else if (at->rec.data.TSN_seq == chk->rec.data.TSN_seq) {
+ /* Gak, He sent me a duplicate str seq number */
+ /*
+ * foo bar, I guess I will just free this new guy,
+ * should we abort too? FIX ME MAYBE? Or it COULD be
+ * that the SSN's have wrapped. Maybe I should
+ * compare to TSN somehow... sigh for now just blow
+ * away the chunk!
+ */
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ sctp_free_a_chunk(stcb, chk);
+ return;
+ } else {
+ last_flags = at->rec.data.rcv_flags;
+ last_tsn = at->rec.data.TSN_seq;
+ prev = at;
+ if (TAILQ_NEXT(at, sctp_next) == NULL) {
+ /*
+ * We are at the end, insert it after this
+ * one
+ */
+ /* check it first */
+ asoc->size_on_reasm_queue += chk->send_size;
+ sctp_ucount_incr(asoc->cnt_on_reasm_queue);
+ TAILQ_INSERT_AFTER(&asoc->reasmqueue, at, chk, sctp_next);
+ break;
+ }
+ }
+ }
+ /* Now the audits */
+ if (prev) {
+ prev_tsn = chk->rec.data.TSN_seq - 1;
+ if (prev_tsn == prev->rec.data.TSN_seq) {
+ /*
+ * Ok the one I am dropping onto the end is the
+		 * NEXT. A bit of validation here.
+ */
+ if ((prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
+ SCTP_DATA_FIRST_FRAG ||
+ (prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
+ SCTP_DATA_MIDDLE_FRAG) {
+ /*
+ * Insert chk MUST be a MIDDLE or LAST
+ * fragment
+ */
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
+ SCTP_DATA_FIRST_FRAG) {
+					SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - It can be a middle or last but not a first\n");
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it's a FIRST!\n");
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper,
+ struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length =
+ htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_6);
+ ippp++;
+ *ippp = chk->rec.data.TSN_seq;
+ ippp++;
+ *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
+
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_6;
+ sctp_abort_an_association(stcb->sctp_ep,
+ stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+ *abort_flag = 1;
+ return;
+ }
+ if (chk->rec.data.stream_number !=
+ prev->rec.data.stream_number) {
+ /*
+ * Huh, need the correct STR here,
+ * they must be the same.
+ */
+ SCTP_PRINTF("Prev check - Gak, Evil plot, ssn:%d not the same as at:%d\n",
+ chk->rec.data.stream_number,
+ prev->rec.data.stream_number);
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper,
+ struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length =
+ htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_7);
+ ippp++;
+ *ippp = chk->rec.data.TSN_seq;
+ ippp++;
+ *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_7;
+ sctp_abort_an_association(stcb->sctp_ep,
+ stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+
+ *abort_flag = 1;
+ return;
+ }
+ if ((prev->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0 &&
+ chk->rec.data.stream_seq !=
+ prev->rec.data.stream_seq) {
+ /*
+ * Huh, need the correct STR here,
+ * they must be the same.
+ */
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, Evil plot, sseq:%d not the same as at:%d\n",
+ chk->rec.data.stream_seq,
+ prev->rec.data.stream_seq);
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper,
+ struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length =
+ htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_8);
+ ippp++;
+ *ippp = chk->rec.data.TSN_seq;
+ ippp++;
+ *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_8;
+ sctp_abort_an_association(stcb->sctp_ep,
+ stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+
+ *abort_flag = 1;
+ return;
+ }
+ } else if ((prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
+ SCTP_DATA_LAST_FRAG) {
+ /* Insert chk MUST be a FIRST */
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) !=
+ SCTP_DATA_FIRST_FRAG) {
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, evil plot, its not FIRST and it must be!\n");
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper,
+ struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length =
+ htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_9);
+ ippp++;
+ *ippp = chk->rec.data.TSN_seq;
+ ippp++;
+ *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
+
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_9;
+ sctp_abort_an_association(stcb->sctp_ep,
+ stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+
+ *abort_flag = 1;
+ return;
+ }
+ }
+ }
+ }
+ if (next) {
+ post_tsn = chk->rec.data.TSN_seq + 1;
+ if (post_tsn == next->rec.data.TSN_seq) {
+ /*
+ * Ok the one I am inserting ahead of is my NEXT
+			 * one. A bit of validation here.
+ */
+ if (next->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
+ /* Insert chk MUST be a last fragment */
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK)
+ != SCTP_DATA_LAST_FRAG) {
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Next is FIRST, we must be LAST\n");
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, its not a last!\n");
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper,
+ struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length =
+ htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_10);
+ ippp++;
+ *ippp = chk->rec.data.TSN_seq;
+ ippp++;
+ *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_10;
+ sctp_abort_an_association(stcb->sctp_ep,
+ stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+
+ *abort_flag = 1;
+ return;
+ }
+ } else if ((next->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
+ SCTP_DATA_MIDDLE_FRAG ||
+ (next->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
+ SCTP_DATA_LAST_FRAG) {
+ /*
+ * Insert chk CAN be MIDDLE or FIRST NOT
+ * LAST
+ */
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
+ SCTP_DATA_LAST_FRAG) {
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Next is a MIDDLE/LAST\n");
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, new prev chunk is a LAST\n");
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper,
+ struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length =
+ htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_11);
+ ippp++;
+ *ippp = chk->rec.data.TSN_seq;
+ ippp++;
+ *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
+
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_11;
+ sctp_abort_an_association(stcb->sctp_ep,
+ stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+
+ *abort_flag = 1;
+ return;
+ }
+ if (chk->rec.data.stream_number !=
+ next->rec.data.stream_number) {
+ /*
+ * Huh, need the correct STR here,
+ * they must be the same.
+ */
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Gak, Evil plot, ssn:%d not the same as at:%d\n",
+ chk->rec.data.stream_number,
+ next->rec.data.stream_number);
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper,
+ struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length =
+ htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_12);
+ ippp++;
+ *ippp = chk->rec.data.TSN_seq;
+ ippp++;
+ *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
+
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_12;
+ sctp_abort_an_association(stcb->sctp_ep,
+ stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+
+ *abort_flag = 1;
+ return;
+ }
+ if ((next->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0 &&
+ chk->rec.data.stream_seq !=
+ next->rec.data.stream_seq) {
+ /*
+ * Huh, need the correct STR here,
+ * they must be the same.
+ */
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Gak, Evil plot, sseq:%d not the same as at:%d\n",
+ chk->rec.data.stream_seq,
+ next->rec.data.stream_seq);
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper,
+ struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length =
+ htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_13);
+ ippp++;
+ *ippp = chk->rec.data.TSN_seq;
+ ippp++;
+ *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_13;
+ sctp_abort_an_association(stcb->sctp_ep,
+ stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+
+ *abort_flag = 1;
+ return;
+ }
+ }
+ }
+ }
+ /* Do we need to do some delivery? check */
+ sctp_deliver_reasm_check(stcb, asoc);
+}
+
+/*
+ * This is an unfortunate routine. It checks to make sure an evil guy is not
+ * stuffing us full of bad packet fragments. A broken peer could also do this,
+ * but that is doubtful. It is too bad I must worry about evil crackers, sigh
+ * :< more cycles.
+ */
+static int
+sctp_does_tsn_belong_to_reasm(struct sctp_association *asoc,
+ uint32_t TSN_seq)
+{
+ struct sctp_tmit_chunk *at;
+ uint32_t tsn_est;
+
+ TAILQ_FOREACH(at, &asoc->reasmqueue, sctp_next) {
+ if (compare_with_wrap(TSN_seq,
+ at->rec.data.TSN_seq, MAX_TSN)) {
+ /* is it one bigger? */
+ tsn_est = at->rec.data.TSN_seq + 1;
+ if (tsn_est == TSN_seq) {
+ /* yep. It better be a last then */
+ if ((at->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) !=
+ SCTP_DATA_LAST_FRAG) {
+ /*
+ * Ok this guy belongs next to a guy
+ * that is NOT last, it should be a
+ * middle/last, not a complete
+ * chunk.
+ */
+ return (1);
+ } else {
+ /*
+ * This guy is ok since its a LAST
+					 * This guy is ok since it's a LAST
+					 * and the new chunk is a fully
+					 * self-contained one.
+ return (0);
+ }
+ }
+ } else if (TSN_seq == at->rec.data.TSN_seq) {
+ /* Software error since I have a dup? */
+ return (1);
+ } else {
+ /*
+ * Ok, 'at' is larger than new chunk but does it
+ * need to be right before it.
+ */
+ tsn_est = TSN_seq + 1;
+ if (tsn_est == at->rec.data.TSN_seq) {
+ /* Yep, It better be a first */
+ if ((at->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) !=
+ SCTP_DATA_FIRST_FRAG) {
+ return (1);
+ } else {
+ return (0);
+ }
+ }
+ }
+ }
+ return (0);
+}
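+
+/*
+ * Example with hypothetical TSNs: if the reassembly queue holds a MIDDLE
+ * fragment at TSN 20, an arriving chunk with TSN 21 "belongs" to the queue
+ * (this returns 1), because whatever follows a non-LAST fragment must itself
+ * be a MIDDLE or LAST fragment, not a complete chunk.
+ */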
+
+
+static int
+sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc,
+ struct mbuf **m, int offset, struct sctp_data_chunk *ch, int chk_length,
+ struct sctp_nets *net, uint32_t * high_tsn, int *abort_flag,
+ int *break_flag, int last_chunk)
+{
+ /* Process a data chunk */
+ /* struct sctp_tmit_chunk *chk; */
+ struct sctp_tmit_chunk *chk;
+ uint32_t tsn, gap;
+ struct mbuf *dmbuf;
+ int indx, the_len;
+ int need_reasm_check = 0;
+ uint16_t strmno, strmseq;
+ struct mbuf *oper;
+ struct sctp_queued_to_read *control;
+ int ordered;
+ uint32_t protocol_id;
+ uint8_t chunk_flags;
+ struct sctp_stream_reset_list *liste;
+
+ chk = NULL;
+ tsn = ntohl(ch->dp.tsn);
+ chunk_flags = ch->ch.chunk_flags;
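+ /*
+ * The SACK-IMMEDIATELY bit (I-bit) asks us to SACK this packet
+ * right away rather than waiting on the delayed SACK timer.
+ */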
+ if ((chunk_flags & SCTP_DATA_SACK_IMMEDIATELY) == SCTP_DATA_SACK_IMMEDIATELY) {
+ asoc->send_sack = 1;
+ }
+ protocol_id = ch->dp.protocol_id;
+ ordered = ((chunk_flags & SCTP_DATA_UNORDERED) == 0);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
+ sctp_log_map(tsn, asoc->cumulative_tsn, asoc->highest_tsn_inside_map, SCTP_MAP_TSN_ENTERS);
+ }
+ if (stcb == NULL) {
+ return (0);
+ }
+ SCTP_LTRACE_CHK(stcb->sctp_ep, stcb, ch->ch.chunk_type, tsn);
+ if (compare_with_wrap(asoc->cumulative_tsn, tsn, MAX_TSN) ||
+ asoc->cumulative_tsn == tsn) {
+ /* It is a duplicate */
+ SCTP_STAT_INCR(sctps_recvdupdata);
+ if (asoc->numduptsns < SCTP_MAX_DUP_TSNS) {
+ /* Record a dup for the next outbound sack */
+ asoc->dup_tsns[asoc->numduptsns] = tsn;
+ asoc->numduptsns++;
+ }
+ asoc->send_sack = 1;
+ return (0);
+ }
+ /* Calculate the number of TSN's between the base and this TSN */
+ SCTP_CALC_TSN_TO_GAP(gap, tsn, asoc->mapping_array_base_tsn);
+ if (gap >= (SCTP_MAPPING_ARRAY << 3)) {
+ /* Can't hold the bit in the mapping at max array, toss it */
+ return (0);
+ }
+ if (gap >= (uint32_t) (asoc->mapping_array_size << 3)) {
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ if (sctp_expand_mapping_array(asoc, gap)) {
+ /* Can't expand, drop it */
+ return (0);
+ }
+ }
+ if (compare_with_wrap(tsn, *high_tsn, MAX_TSN)) {
+ *high_tsn = tsn;
+ }
+ /* See if we have received this one already */
+ if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap) ||
+ SCTP_IS_TSN_PRESENT(asoc->nr_mapping_array, gap)) {
+ SCTP_STAT_INCR(sctps_recvdupdata);
+ if (asoc->numduptsns < SCTP_MAX_DUP_TSNS) {
+ /* Record a dup for the next outbound sack */
+ asoc->dup_tsns[asoc->numduptsns] = tsn;
+ asoc->numduptsns++;
+ }
+ asoc->send_sack = 1;
+ return (0);
+ }
+ /*
+ * Check to see about the GONE flag, duplicates would cause a sack
+ * to be sent up above
+ */
+ if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
+ (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET))
+ ) {
+ /*
+ * wait a minute, this guy is gone, there is no longer a
+ * receiver. Send peer an ABORT!
+ */
+ struct mbuf *op_err;
+
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_OUT_OF_RESC);
+ sctp_abort_an_association(stcb->sctp_ep, stcb, 0, op_err, SCTP_SO_NOT_LOCKED);
+ *abort_flag = 1;
+ return (0);
+ }
+ /*
+ * Now before going further we see if there is room. If NOT then we
+ * MAY let one through only IF this TSN is the one we are waiting
+ * for on a partial delivery API.
+ */
+
+ /* now do the tests */
+ if (((asoc->cnt_on_all_streams +
+ asoc->cnt_on_reasm_queue +
+ asoc->cnt_msg_on_sb) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)) ||
+ (((int)asoc->my_rwnd) <= 0)) {
+ /*
+ * When we have NO room in the rwnd we check to make sure
+ * the reader is doing its job...
+ */
+ if (stcb->sctp_socket->so_rcv.sb_cc) {
+ /* some to read, wake-up */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ /* assoc was freed while we were unlocked */
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return (0);
+ }
+#endif
+ sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ }
+ /* now is it in the mapping array of what we have accepted? */
+ if (compare_with_wrap(tsn, asoc->highest_tsn_inside_map, MAX_TSN) &&
+ compare_with_wrap(tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) {
+ /* Nope not in the valid range dump it */
+ sctp_set_rwnd(stcb, asoc);
+ if ((asoc->cnt_on_all_streams +
+ asoc->cnt_on_reasm_queue +
+ asoc->cnt_msg_on_sb) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)) {
+ SCTP_STAT_INCR(sctps_datadropchklmt);
+ } else {
+ SCTP_STAT_INCR(sctps_datadroprwnd);
+ }
+ indx = *break_flag;
+ *break_flag = 1;
+ return (0);
+ }
+ }
+ strmno = ntohs(ch->dp.stream_id);
+ if (strmno >= asoc->streamincnt) {
+ struct sctp_paramhdr *phdr;
+ struct mbuf *mb;
+
+ mb = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) * 2),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (mb != NULL) {
+ /* add some space up front so prepend will work well */
+ SCTP_BUF_RESV_UF(mb, sizeof(struct sctp_chunkhdr));
+ phdr = mtod(mb, struct sctp_paramhdr *);
+ /*
+ * Error causes are just params and this one has
+ * two back-to-back phdrs: one with the error type
+ * and size, the other with the stream id and a rsvd
+ */
+ SCTP_BUF_LEN(mb) = (sizeof(struct sctp_paramhdr) * 2);
+ phdr->param_type = htons(SCTP_CAUSE_INVALID_STREAM);
+ phdr->param_length =
+ htons(sizeof(struct sctp_paramhdr) * 2);
+ phdr++;
+ /* We insert the stream in the type field */
+ phdr->param_type = ch->dp.stream_id;
+ /* And set the length to 0 for the rsvd field */
+ phdr->param_length = 0;
+ sctp_queue_op_err(stcb, mb);
+ }
+ SCTP_STAT_INCR(sctps_badsid);
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
+ if (compare_with_wrap(tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) {
+ asoc->highest_tsn_inside_nr_map = tsn;
+ }
+ if (tsn == (asoc->cumulative_tsn + 1)) {
+ /* Update cum-ack */
+ asoc->cumulative_tsn = tsn;
+ }
+ return (0);
+ }
+ /*
+ * Before we continue let's validate that we are not being fooled by
+ * an evil attacker. We can only have 4k chunks based on our TSN
+ * spread allowed by the mapping array (512 * 8 bits), so there is no
+ * way our stream sequence numbers could have wrapped. We of course
+ * only validate the FIRST fragment, so the bit must be set.
+ */
+ strmseq = ntohs(ch->dp.stream_sequence);
+#ifdef SCTP_ASOCLOG_OF_TSNS
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ if (asoc->tsn_in_at >= SCTP_TSN_LOG_SIZE) {
+ asoc->tsn_in_at = 0;
+ asoc->tsn_in_wrapped = 1;
+ }
+ asoc->in_tsnlog[asoc->tsn_in_at].tsn = tsn;
+ asoc->in_tsnlog[asoc->tsn_in_at].strm = strmno;
+ asoc->in_tsnlog[asoc->tsn_in_at].seq = strmseq;
+ asoc->in_tsnlog[asoc->tsn_in_at].sz = chk_length;
+ asoc->in_tsnlog[asoc->tsn_in_at].flgs = chunk_flags;
+ asoc->in_tsnlog[asoc->tsn_in_at].stcb = (void *)stcb;
+ asoc->in_tsnlog[asoc->tsn_in_at].in_pos = asoc->tsn_in_at;
+ asoc->in_tsnlog[asoc->tsn_in_at].in_out = 1;
+ asoc->tsn_in_at++;
+#endif
+ if ((chunk_flags & SCTP_DATA_FIRST_FRAG) &&
+ (TAILQ_EMPTY(&asoc->resetHead)) &&
+ (chunk_flags & SCTP_DATA_UNORDERED) == 0 &&
+ (compare_with_wrap(asoc->strmin[strmno].last_sequence_delivered,
+ strmseq, MAX_SEQ) ||
+ asoc->strmin[strmno].last_sequence_delivered == strmseq)) {
+ /* The incoming sseq is behind where we last delivered? */
+ SCTPDBG(SCTP_DEBUG_INDATA1, "EVIL/Broken-Dup S-SEQ:%d delivered:%d from peer, Abort!\n",
+ strmseq, asoc->strmin[strmno].last_sequence_delivered);
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length = htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_14);
+ ippp++;
+ *ippp = tsn;
+ ippp++;
+ *ippp = ((strmno << 16) | strmseq);
+
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_14;
+ sctp_abort_an_association(stcb->sctp_ep, stcb,
+ SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+ *abort_flag = 1;
+ return (0);
+ }
+ /************************************
+ * From here down we may find ch-> invalid
+ * so it's a good idea NOT to use it.
+ *************************************/
+
+ the_len = (chk_length - sizeof(struct sctp_data_chunk));
+ if (last_chunk == 0) {
+ dmbuf = SCTP_M_COPYM(*m,
+ (offset + sizeof(struct sctp_data_chunk)),
+ the_len, M_DONTWAIT);
+#ifdef SCTP_MBUF_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ struct mbuf *mat;
+
+ mat = dmbuf;
+ while (mat) {
+ if (SCTP_BUF_IS_EXTENDED(mat)) {
+ sctp_log_mb(mat, SCTP_MBUF_ICOPY);
+ }
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ }
+#endif
+ } else {
+ /* We can steal the last chunk */
+ int l_len;
+
+ dmbuf = *m;
+ /* lop off the top part */
+ m_adj(dmbuf, (offset + sizeof(struct sctp_data_chunk)));
+ if (SCTP_BUF_NEXT(dmbuf) == NULL) {
+ l_len = SCTP_BUF_LEN(dmbuf);
+ } else {
+ /*
+ * need to count up the size; hopefully we do not hit
+ * this too often :-0
+ */
+ struct mbuf *lat;
+
+ l_len = 0;
+ lat = dmbuf;
+ while (lat) {
+ l_len += SCTP_BUF_LEN(lat);
+ lat = SCTP_BUF_NEXT(lat);
+ }
+ }
+ if (l_len > the_len) {
+ /* Trim the end round bytes off too */
+ m_adj(dmbuf, -(l_len - the_len));
+ }
+ }
+ if (dmbuf == NULL) {
+ SCTP_STAT_INCR(sctps_nomem);
+ return (0);
+ }
+ if ((chunk_flags & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG &&
+ asoc->fragmented_delivery_inprogress == 0 &&
+ TAILQ_EMPTY(&asoc->resetHead) &&
+ ((ordered == 0) ||
+ ((uint16_t) (asoc->strmin[strmno].last_sequence_delivered + 1) == strmseq &&
+ TAILQ_EMPTY(&asoc->strmin[strmno].inqueue)))) {
+ /* Candidate for express delivery */
+ /*
+ * It's not fragmented, no PD-API is up, nothing in the
+ * delivery queue, it's un-ordered OR ordered and the next to
+ * deliver AND nothing else is stuck on the stream queue,
+ * and there is room for it in the socket buffer. Let's just
+ * stuff it up the buffer....
+ */
+
+ /* It would be nice to avoid this copy if we could :< */
+ sctp_alloc_a_readq(stcb, control);
+ sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn,
+ protocol_id,
+ stcb->asoc.context,
+ strmno, strmseq,
+ chunk_flags,
+ dmbuf);
+ if (control == NULL) {
+ goto failed_express_del;
+ }
+ SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
+ if (compare_with_wrap(tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) {
+ asoc->highest_tsn_inside_nr_map = tsn;
+ }
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control, &stcb->sctp_socket->so_rcv,
+ 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
+
+ if ((chunk_flags & SCTP_DATA_UNORDERED) == 0) {
+ /* for ordered, bump what we delivered */
+ asoc->strmin[strmno].last_sequence_delivered++;
+ }
+ SCTP_STAT_INCR(sctps_recvexpress);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
+ sctp_log_strm_del_alt(stcb, tsn, strmseq, strmno,
+ SCTP_STR_LOG_FROM_EXPRS_DEL);
+ }
+ control = NULL;
+
+ goto finish_express_del;
+ }
+failed_express_del:
+ /* If we reach here this is a new chunk */
+ chk = NULL;
+ control = NULL;
+ /* Express for fragmented delivery? */
+ if ((asoc->fragmented_delivery_inprogress) &&
+ (stcb->asoc.control_pdapi) &&
+ (asoc->str_of_pdapi == strmno) &&
+ (asoc->ssn_of_pdapi == strmseq)
+ ) {
+ control = stcb->asoc.control_pdapi;
+ if ((chunk_flags & SCTP_DATA_FIRST_FRAG) == SCTP_DATA_FIRST_FRAG) {
+ /* Can't be another first? */
+ goto failed_pdapi_express_del;
+ }
+ if (tsn == (control->sinfo_tsn + 1)) {
+ /* Yep, we can add it on */
+ int end = 0;
+ uint32_t cumack;
+
+ if (chunk_flags & SCTP_DATA_LAST_FRAG) {
+ end = 1;
+ }
+ cumack = asoc->cumulative_tsn;
+ if ((cumack + 1) == tsn)
+ cumack = tsn;
+
+ if (sctp_append_to_readq(stcb->sctp_ep, stcb, control, dmbuf, end,
+ tsn,
+ &stcb->sctp_socket->so_rcv)) {
+ SCTP_PRINTF("Append fails end:%d\n", end);
+ goto failed_pdapi_express_del;
+ }
+ SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
+ if (compare_with_wrap(tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) {
+ asoc->highest_tsn_inside_nr_map = tsn;
+ }
+ SCTP_STAT_INCR(sctps_recvexpressm);
+ control->sinfo_tsn = tsn;
+ asoc->tsn_last_delivered = tsn;
+ asoc->fragment_flags = chunk_flags;
+ asoc->tsn_of_pdapi_last_delivered = tsn;
+ asoc->last_flags_delivered = chunk_flags;
+ asoc->last_strm_seq_delivered = strmseq;
+ asoc->last_strm_no_delivered = strmno;
+ if (end) {
+ /* clean up the flags and such */
+ asoc->fragmented_delivery_inprogress = 0;
+ if ((chunk_flags & SCTP_DATA_UNORDERED) == 0) {
+ asoc->strmin[strmno].last_sequence_delivered++;
+ }
+ stcb->asoc.control_pdapi = NULL;
+ if (TAILQ_EMPTY(&asoc->reasmqueue) == 0) {
+ /*
+ * There could be another message
+ * ready
+ */
+ need_reasm_check = 1;
+ }
+ }
+ control = NULL;
+ goto finish_express_del;
+ }
+ }
+failed_pdapi_express_del:
+ control = NULL;
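+ /*
+ * Record the TSN in the proper map: with draining disabled this
+ * TSN will never be reneged, so it goes in the non-renegable (nr)
+ * mapping array; otherwise it stays renegable in the regular
+ * mapping array.
+ */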
+ if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) {
+ SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
+ if (compare_with_wrap(tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) {
+ asoc->highest_tsn_inside_nr_map = tsn;
+ }
+ } else {
+ SCTP_SET_TSN_PRESENT(asoc->mapping_array, gap);
+ if (compare_with_wrap(tsn, asoc->highest_tsn_inside_map, MAX_TSN)) {
+ asoc->highest_tsn_inside_map = tsn;
+ }
+ }
+ if ((chunk_flags & SCTP_DATA_NOT_FRAG) != SCTP_DATA_NOT_FRAG) {
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ /* No memory so we drop the chunk */
+ SCTP_STAT_INCR(sctps_nomem);
+ if (last_chunk == 0) {
+ /* we copied it, free the copy */
+ sctp_m_freem(dmbuf);
+ }
+ return (0);
+ }
+ chk->rec.data.TSN_seq = tsn;
+ chk->no_fr_allowed = 0;
+ chk->rec.data.stream_seq = strmseq;
+ chk->rec.data.stream_number = strmno;
+ chk->rec.data.payloadtype = protocol_id;
+ chk->rec.data.context = stcb->asoc.context;
+ chk->rec.data.doing_fast_retransmit = 0;
+ chk->rec.data.rcv_flags = chunk_flags;
+ chk->asoc = asoc;
+ chk->send_size = the_len;
+ chk->whoTo = net;
+ atomic_add_int(&net->ref_count, 1);
+ chk->data = dmbuf;
+ } else {
+ sctp_alloc_a_readq(stcb, control);
+ sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn,
+ protocol_id,
+ stcb->asoc.context,
+ strmno, strmseq,
+ chunk_flags,
+ dmbuf);
+ if (control == NULL) {
+ /* No memory so we drop the chunk */
+ SCTP_STAT_INCR(sctps_nomem);
+ if (last_chunk == 0) {
+ /* we copied it, free the copy */
+ sctp_m_freem(dmbuf);
+ }
+ return (0);
+ }
+ control->length = the_len;
+ }
+
+ /* Mark it as received */
+ /* Now queue it where it belongs */
+ if (control != NULL) {
+ /* First a sanity check */
+ if (asoc->fragmented_delivery_inprogress) {
+ /*
+ * Ok, we have a fragmented delivery in progress; if
+ * this chunk is next to deliver OR belongs, in our
+ * view, to the reassembly, the peer is evil or
+ * broken.
+ */
+ uint32_t estimate_tsn;
+
+ estimate_tsn = asoc->tsn_last_delivered + 1;
+ if (TAILQ_EMPTY(&asoc->reasmqueue) &&
+ (estimate_tsn == control->sinfo_tsn)) {
+ /* Evil/Broken peer */
+ sctp_m_freem(control->data);
+ control->data = NULL;
+ if (control->whoFrom) {
+ sctp_free_remote_addr(control->whoFrom);
+ control->whoFrom = NULL;
+ }
+ sctp_free_a_readq(stcb, control);
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper, struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length = htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_15);
+ ippp++;
+ *ippp = tsn;
+ ippp++;
+ *ippp = ((strmno << 16) | strmseq);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_15;
+ sctp_abort_an_association(stcb->sctp_ep, stcb,
+ SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+
+ *abort_flag = 1;
+ return (0);
+ } else {
+ if (sctp_does_tsn_belong_to_reasm(asoc, control->sinfo_tsn)) {
+ sctp_m_freem(control->data);
+ control->data = NULL;
+ if (control->whoFrom) {
+ sctp_free_remote_addr(control->whoFrom);
+ control->whoFrom = NULL;
+ }
+ sctp_free_a_readq(stcb, control);
+
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper,
+ struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length =
+ htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_16);
+ ippp++;
+ *ippp = tsn;
+ ippp++;
+ *ippp = ((strmno << 16) | strmseq);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_16;
+ sctp_abort_an_association(stcb->sctp_ep,
+ stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+
+ *abort_flag = 1;
+ return (0);
+ }
+ }
+ } else {
+ /* No PDAPI running */
+ if (!TAILQ_EMPTY(&asoc->reasmqueue)) {
+ /*
+ * Reassembly queue is NOT empty; validate
+ * that this tsn does not need to be in the
+ * reassembly queue. If it does then our peer
+ * is broken or evil.
+ */
+ if (sctp_does_tsn_belong_to_reasm(asoc, control->sinfo_tsn)) {
+ sctp_m_freem(control->data);
+ control->data = NULL;
+ if (control->whoFrom) {
+ sctp_free_remote_addr(control->whoFrom);
+ control->whoFrom = NULL;
+ }
+ sctp_free_a_readq(stcb, control);
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr) +
+ (3 * sizeof(uint32_t));
+ ph = mtod(oper,
+ struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length =
+ htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_17);
+ ippp++;
+ *ippp = tsn;
+ ippp++;
+ *ippp = ((strmno << 16) | strmseq);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_17;
+ sctp_abort_an_association(stcb->sctp_ep,
+ stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+
+ *abort_flag = 1;
+ return (0);
+ }
+ }
+ }
+ /* ok, if we reach here we have passed the sanity checks */
+ if (chunk_flags & SCTP_DATA_UNORDERED) {
+ /* queue directly into socket buffer */
+ sctp_mark_non_revokable(asoc, control->sinfo_tsn);
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
+ } else {
+ /*
+ * Special check for when streams are resetting. We
+ * could be smarter about this and check the
+ * actual stream to see if it is not being reset..
+ * that way we would not create a HOLB when amongst
+ * streams being reset and those not being reset.
+ *
+ * We take complete messages that have a stream reset
+ * intervening (aka the TSN is after where our
+ * cum-ack needs to be) off and put them on a
+ * pending_reply_queue. The reassembly ones we do
+ * not have to worry about since they are all sorted
+ * and processed by TSN order. It is only the
+ * singletons I must worry about.
+ */
+ if (((liste = TAILQ_FIRST(&asoc->resetHead)) != NULL) &&
+ ((compare_with_wrap(tsn, liste->tsn, MAX_TSN)))
+ ) {
+ /*
+ * yep, it's past where we need to reset... go
+ * ahead and queue it.
+ */
+ if (TAILQ_EMPTY(&asoc->pending_reply_queue)) {
+ /* first one on */
+ TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next);
+ } else {
+ struct sctp_queued_to_read *ctlOn;
+ unsigned char inserted = 0;
+
+ ctlOn = TAILQ_FIRST(&asoc->pending_reply_queue);
+ while (ctlOn) {
+ if (compare_with_wrap(control->sinfo_tsn,
+ ctlOn->sinfo_tsn, MAX_TSN)) {
+ ctlOn = TAILQ_NEXT(ctlOn, next);
+ } else {
+ /* found it */
+ TAILQ_INSERT_BEFORE(ctlOn, control, next);
+ inserted = 1;
+ break;
+ }
+ }
+ if (inserted == 0) {
+ /*
+ * must be put at the end;
+ * everything on the queue
+ * has a smaller TSN.
+ */
+ TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next);
+ }
+ }
+ } else {
+ sctp_queue_data_to_stream(stcb, asoc, control, abort_flag);
+ if (*abort_flag) {
+ return (0);
+ }
+ }
+ }
+ } else {
+ /* Into the re-assembly queue */
+ sctp_queue_data_for_reasm(stcb, asoc, chk, abort_flag);
+ if (*abort_flag) {
+ /*
+ * the assoc is now gone and chk was put onto the
+ * reasm queue, which has all been freed.
+ */
+ *m = NULL;
+ return (0);
+ }
+ }
+finish_express_del:
+ if (tsn == (asoc->cumulative_tsn + 1)) {
+ /* Update cum-ack */
+ asoc->cumulative_tsn = tsn;
+ }
+ if (last_chunk) {
+ *m = NULL;
+ }
+ if (ordered) {
+ SCTP_STAT_INCR_COUNTER64(sctps_inorderchunks);
+ } else {
+ SCTP_STAT_INCR_COUNTER64(sctps_inunorderchunks);
+ }
+ SCTP_STAT_INCR(sctps_recvdata);
+ /* Set it present please */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
+ sctp_log_strm_del_alt(stcb, tsn, strmseq, strmno, SCTP_STR_LOG_FROM_MARK_TSN);
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
+ sctp_log_map(asoc->mapping_array_base_tsn, asoc->cumulative_tsn,
+ asoc->highest_tsn_inside_map, SCTP_MAP_PREPARE_SLIDE);
+ }
+ /* check the special flag for stream resets */
+ if (((liste = TAILQ_FIRST(&asoc->resetHead)) != NULL) &&
+ ((compare_with_wrap(asoc->cumulative_tsn, liste->tsn, MAX_TSN)) ||
+ (asoc->cumulative_tsn == liste->tsn))
+ ) {
+ /*
+ * we have finished working through the backlogged TSNs; now it is
+ * time to reset streams. 1: call the reset function. 2: free the
+ * pending_reply space. 3: distribute any chunks in the
+ * pending_reply_queue.
+ */
+ struct sctp_queued_to_read *ctl;
+
+ sctp_reset_in_stream(stcb, liste->number_entries, liste->req.list_of_streams);
+ TAILQ_REMOVE(&asoc->resetHead, liste, next_resp);
+ SCTP_FREE(liste, SCTP_M_STRESET);
+ /* sa_ignore FREED_MEMORY */
+ liste = TAILQ_FIRST(&asoc->resetHead);
+ ctl = TAILQ_FIRST(&asoc->pending_reply_queue);
+ if (ctl && (liste == NULL)) {
+ /* All can be removed */
+ while (ctl) {
+ TAILQ_REMOVE(&asoc->pending_reply_queue, ctl, next);
+ sctp_queue_data_to_stream(stcb, asoc, ctl, abort_flag);
+ if (*abort_flag) {
+ return (0);
+ }
+ ctl = TAILQ_FIRST(&asoc->pending_reply_queue);
+ }
+ } else if (ctl) {
+ /* more than one in queue */
+ while (!compare_with_wrap(ctl->sinfo_tsn, liste->tsn, MAX_TSN)) {
+ /*
+ * if ctl->sinfo_tsn is <= liste->tsn we can
+ * process it which is the NOT of
+ * ctl->sinfo_tsn > liste->tsn
+ */
+ TAILQ_REMOVE(&asoc->pending_reply_queue, ctl, next);
+ sctp_queue_data_to_stream(stcb, asoc, ctl, abort_flag);
+ if (*abort_flag) {
+ return (0);
+ }
+ ctl = TAILQ_FIRST(&asoc->pending_reply_queue);
+ }
+ }
+ /*
+ * Now service reassembly to pick up anything that has been
+ * held on the reassembly queue.
+ */
+ sctp_deliver_reasm_check(stcb, asoc);
+ need_reasm_check = 0;
+ }
+ if (need_reasm_check) {
+ /* Another one waits ? */
+ sctp_deliver_reasm_check(stcb, asoc);
+ }
+ return (1);
+}
+
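+ /*
+ * For a byte taken from the OR of the mapping arrays, this table
+ * gives the number of consecutive 1 bits starting at the least
+ * significant bit, i.e. how many in-sequence TSNs that byte covers.
+ */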
+int8_t sctp_map_lookup_tab[256] = {
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 4,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 5,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 4,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 6,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 4,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 5,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 4,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 7,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 4,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 5,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 4,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 6,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 4,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 5,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 4,
+ 0, 1, 0, 2, 0, 1, 0, 3,
+ 0, 1, 0, 2, 0, 1, 0, 8
+};
+
+
+void
+sctp_slide_mapping_arrays(struct sctp_tcb *stcb)
+{
+ /*
+ * Now we also need to check the mapping array in a couple of ways.
+ * 1) Did we move the cum-ack point?
+ *
+ * When you first glance at this you might think that all entries that
+ * make up the position of the cum-ack would be in the nr-mapping
+ * array only.. i.e. things up to the cum-ack are always
+ * deliverable. That's true with one exception, when it's a fragmented
+ * message we may not deliver the data until some threshold (or all
+ * of it) is in place. So we must OR the nr_mapping_array and
+ * mapping_array to get a true picture of the cum-ack.
+ */
+ struct sctp_association *asoc;
+ int at;
+ uint8_t val;
+ int slide_from, slide_end, lgap, distance;
+ uint32_t old_cumack, old_base, old_highest, highest_tsn;
+
+ asoc = &stcb->asoc;
+ at = 0;
+
+ old_cumack = asoc->cumulative_tsn;
+ old_base = asoc->mapping_array_base_tsn;
+ old_highest = asoc->highest_tsn_inside_map;
+ /*
+ * We could probably improve this a small bit by calculating the
+ * offset of the current cum-ack as the starting point.
+ */
+ at = 0;
+ for (slide_from = 0; slide_from < stcb->asoc.mapping_array_size; slide_from++) {
+ val = asoc->nr_mapping_array[slide_from] | asoc->mapping_array[slide_from];
+ if (val == 0xff) {
+ at += 8;
+ } else {
+ /* there is a 0 bit */
+ at += sctp_map_lookup_tab[val];
+ break;
+ }
+ }
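+ /* 'at' counts the TSNs received in sequence from the array base. */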
+ asoc->cumulative_tsn = asoc->mapping_array_base_tsn + (at - 1);
+
+ if (compare_with_wrap(asoc->cumulative_tsn, asoc->highest_tsn_inside_map, MAX_TSN) &&
+ compare_with_wrap(asoc->cumulative_tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) {
+#ifdef INVARIANTS
+ panic("huh, cumack 0x%x greater than high-tsn 0x%x in map",
+ asoc->cumulative_tsn, asoc->highest_tsn_inside_map);
+#else
+ SCTP_PRINTF("huh, cumack 0x%x greater than high-tsn 0x%x in map - should panic?\n",
+ asoc->cumulative_tsn, asoc->highest_tsn_inside_map);
+ sctp_print_mapping_array(asoc);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
+ sctp_log_map(0, 6, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT);
+ }
+ asoc->highest_tsn_inside_map = asoc->cumulative_tsn;
+ asoc->highest_tsn_inside_nr_map = asoc->cumulative_tsn;
+#endif
+ }
+ if (compare_with_wrap(asoc->highest_tsn_inside_nr_map,
+ asoc->highest_tsn_inside_map,
+ MAX_TSN)) {
+ highest_tsn = asoc->highest_tsn_inside_nr_map;
+ } else {
+ highest_tsn = asoc->highest_tsn_inside_map;
+ }
+ if ((asoc->cumulative_tsn == highest_tsn) && (at >= 8)) {
+ /* The complete array was completed by a single FR */
+ /* highest becomes the cum-ack */
+ int clr;
+
+#ifdef INVARIANTS
+ unsigned int i;
+
+#endif
+
+ /* clear the array */
+ clr = ((at + 7) >> 3);
+ if (clr > asoc->mapping_array_size) {
+ clr = asoc->mapping_array_size;
+ }
+ memset(asoc->mapping_array, 0, clr);
+ memset(asoc->nr_mapping_array, 0, clr);
+#ifdef INVARIANTS
+ for (i = 0; i < asoc->mapping_array_size; i++) {
+ if ((asoc->mapping_array[i]) || (asoc->nr_mapping_array[i])) {
+ printf("Error Mapping array's not clean at clear\n");
+ sctp_print_mapping_array(asoc);
+ }
+ }
+#endif
+ asoc->mapping_array_base_tsn = asoc->cumulative_tsn + 1;
+ asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map = asoc->cumulative_tsn;
+ } else if (at >= 8) {
+ /* we can slide the mapping array down */
+ /* slide_from holds where we hit the first NON 0xff byte */
+
+ /*
+ * now calculate the ceiling of the move using our highest
+ * TSN value
+ */
+ SCTP_CALC_TSN_TO_GAP(lgap, highest_tsn, asoc->mapping_array_base_tsn);
+ slide_end = (lgap >> 3);
+ if (slide_end < slide_from) {
+ sctp_print_mapping_array(asoc);
+#ifdef INVARIANTS
+ panic("impossible slide");
+#else
+ printf("impossible slide lgap:%x slide_end:%x slide_from:%x? at:%d\n",
+ lgap, slide_end, slide_from, at);
+ return;
+#endif
+ }
+ if (slide_end > asoc->mapping_array_size) {
+#ifdef INVARIANTS
+ panic("would overrun buffer");
+#else
+ printf("Gak, would have overrun map end:%d slide_end:%d\n",
+ asoc->mapping_array_size, slide_end);
+ slide_end = asoc->mapping_array_size;
+#endif
+ }
+ distance = (slide_end - slide_from) + 1;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
+ sctp_log_map(old_base, old_cumack, old_highest,
+ SCTP_MAP_PREPARE_SLIDE);
+ sctp_log_map((uint32_t) slide_from, (uint32_t) slide_end,
+ (uint32_t) lgap, SCTP_MAP_SLIDE_FROM);
+ }
+ if (distance + slide_from > asoc->mapping_array_size ||
+ distance < 0) {
+ /*
+ * Here we do NOT slide forward the array so that
+ * hopefully when more data comes in to fill it up
+ * we will be able to slide it forward. Really I
+ * don't think this should happen :-0
+ */
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
+ sctp_log_map((uint32_t) distance, (uint32_t) slide_from,
+ (uint32_t) asoc->mapping_array_size,
+ SCTP_MAP_SLIDE_NONE);
+ }
+ } else {
+ int ii;
+
+ for (ii = 0; ii < distance; ii++) {
+ asoc->mapping_array[ii] = asoc->mapping_array[slide_from + ii];
+ asoc->nr_mapping_array[ii] = asoc->nr_mapping_array[slide_from + ii];
+
+ }
+ for (ii = distance; ii < asoc->mapping_array_size; ii++) {
+ asoc->mapping_array[ii] = 0;
+ asoc->nr_mapping_array[ii] = 0;
+ }
+ if (asoc->highest_tsn_inside_map + 1 == asoc->mapping_array_base_tsn) {
+ asoc->highest_tsn_inside_map += (slide_from << 3);
+ }
+ if (asoc->highest_tsn_inside_nr_map + 1 == asoc->mapping_array_base_tsn) {
+ asoc->highest_tsn_inside_nr_map += (slide_from << 3);
+ }
+ asoc->mapping_array_base_tsn += (slide_from << 3);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
+ sctp_log_map(asoc->mapping_array_base_tsn,
+ asoc->cumulative_tsn, asoc->highest_tsn_inside_map,
+ SCTP_MAP_SLIDE_RESULT);
+ }
+ }
+ }
+}
+
+
+void
+sctp_sack_check(struct sctp_tcb *stcb, int was_a_gap, int *abort_flag)
+{
+ struct sctp_association *asoc;
+ uint32_t highest_tsn;
+
+ asoc = &stcb->asoc;
+ if (compare_with_wrap(asoc->highest_tsn_inside_nr_map,
+ asoc->highest_tsn_inside_map,
+ MAX_TSN)) {
+ highest_tsn = asoc->highest_tsn_inside_nr_map;
+ } else {
+ highest_tsn = asoc->highest_tsn_inside_map;
+ }
+
+ /*
+ * Now we need to see if we need to queue a sack or just start the
+ * timer (if allowed).
+ */
+ if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) {
+ /*
+ * Ok, special case: in the SHUTDOWN-SENT case, here we make
+ * sure the SACK timer is off and instead send a SHUTDOWN and a
+ * SACK
+ */
+ if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_RECV,
+ stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_INDATA + SCTP_LOC_18);
+ }
+ sctp_send_shutdown(stcb, stcb->asoc.primary_destination);
+ sctp_send_sack(stcb);
+ } else {
+ int is_a_gap;
+
+ /* is there a gap now ? */
+ is_a_gap = compare_with_wrap(highest_tsn, stcb->asoc.cumulative_tsn, MAX_TSN);
+
+ /*
+ * CMT DAC algorithm: increase number of packets received
+ * since last ack
+ */
+ stcb->asoc.cmt_dac_pkts_rcvd++;
+
+ if ((stcb->asoc.send_sack == 1) || /* We need to send a
+ * SACK */
+ ((was_a_gap) && (is_a_gap == 0)) || /* was a gap, but no
+ * longer is one */
+ (stcb->asoc.numduptsns) || /* we have dup's */
+ (is_a_gap) || /* is still a gap */
+ (stcb->asoc.delayed_ack == 0) || /* Delayed sack disabled */
+ (stcb->asoc.data_pkts_seen >= stcb->asoc.sack_freq) /* hit limit of pkts */
+ ) {
+
+ if ((stcb->asoc.sctp_cmt_on_off == 1) &&
+ (SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) &&
+ (stcb->asoc.send_sack == 0) &&
+ (stcb->asoc.numduptsns == 0) &&
+ (stcb->asoc.delayed_ack) &&
+ (!SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer))) {
+
+ /*
+ * CMT DAC algorithm: with CMT, delay acks
+ * even in the face of reordering.
+ * Therefore, acks that do not have to be
+ * sent because of the above reasons will be
+ * delayed. That is, acks that would have
+ * been sent due to gap reports will be
+ * delayed with DAC. Start the delayed ack
+ * timer.
+ */
+ sctp_timer_start(SCTP_TIMER_TYPE_RECV,
+ stcb->sctp_ep, stcb, NULL);
+ } else {
+ /*
+ * Ok we must build a SACK since the timer
+ * is pending, we got our first packet OR
+ * there are gaps or duplicates.
+ */
+ (void)SCTP_OS_TIMER_STOP(&stcb->asoc.dack_timer.timer);
+ sctp_send_sack(stcb);
+ }
+ } else {
+ if (!SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) {
+ sctp_timer_start(SCTP_TIMER_TYPE_RECV,
+ stcb->sctp_ep, stcb, NULL);
+ }
+ }
+ }
+}
+
+void
+sctp_service_queues(struct sctp_tcb *stcb, struct sctp_association *asoc)
+{
+ struct sctp_tmit_chunk *chk;
+ uint32_t tsize, pd_point;
+ uint16_t nxt_todel;
+
+ if (asoc->fragmented_delivery_inprogress) {
+ sctp_service_reassembly(stcb, asoc);
+ }
+ /* Can we proceed further, i.e. the PD-API is complete */
+ if (asoc->fragmented_delivery_inprogress) {
+ /* no */
+ return;
+ }
+ /*
+ * Now is there some other chunk I can deliver from the reassembly
+ * queue?
+ */
+doit_again:
+ chk = TAILQ_FIRST(&asoc->reasmqueue);
+ if (chk == NULL) {
+ asoc->size_on_reasm_queue = 0;
+ asoc->cnt_on_reasm_queue = 0;
+ return;
+ }
+ nxt_todel = asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered + 1;
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) &&
+ ((nxt_todel == chk->rec.data.stream_seq) ||
+ (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED))) {
+ /*
+ * Yep, the first one is here. We set up to start reception
+ * by backing down the TSN just in case we can't deliver.
+ */
+
+ /*
+ * Before we start, though, either all of the message should
+ * be here or at least enough of it (the lesser of the socket
+ * buffer max and the partial delivery point) so that
+ * something can be delivered.
+ */
+ if (stcb->sctp_socket) {
+ pd_point = min(SCTP_SB_LIMIT_RCV(stcb->sctp_socket),
+ stcb->sctp_ep->partial_delivery_point);
+ } else {
+ pd_point = stcb->sctp_ep->partial_delivery_point;
+ }
+ if (sctp_is_all_msg_on_reasm(asoc, &tsize) || (tsize >= pd_point)) {
+ asoc->fragmented_delivery_inprogress = 1;
+ asoc->tsn_last_delivered = chk->rec.data.TSN_seq - 1;
+ asoc->str_of_pdapi = chk->rec.data.stream_number;
+ asoc->ssn_of_pdapi = chk->rec.data.stream_seq;
+ asoc->pdapi_ppid = chk->rec.data.payloadtype;
+ asoc->fragment_flags = chk->rec.data.rcv_flags;
+ sctp_service_reassembly(stcb, asoc);
+ if (asoc->fragmented_delivery_inprogress == 0) {
+ goto doit_again;
+ }
+ }
+ }
+}
+
+int
+sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
+ struct sctphdr *sh, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net, uint32_t * high_tsn)
+{
+ struct sctp_data_chunk *ch, chunk_buf;
+ struct sctp_association *asoc;
+ int num_chunks = 0; /* number of data chunks processed */
+ int stop_proc = 0;
+ int chk_length, break_flag, last_chunk;
+ int abort_flag = 0, was_a_gap;
+ struct mbuf *m;
+ uint32_t highest_tsn;
+
+ /* set the rwnd */
+ sctp_set_rwnd(stcb, &stcb->asoc);
+
+ m = *mm;
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ asoc = &stcb->asoc;
+ if (compare_with_wrap(asoc->highest_tsn_inside_nr_map, asoc->highest_tsn_inside_map, MAX_TSN)) {
+ highest_tsn = asoc->highest_tsn_inside_nr_map;
+ } else {
+ highest_tsn = asoc->highest_tsn_inside_map;
+ }
+ was_a_gap = compare_with_wrap(highest_tsn, stcb->asoc.cumulative_tsn, MAX_TSN);
+ /*
+ * setup where we got the last DATA packet from for any SACK that
+ * may need to go out. Don't bump the net. This is done ONLY when a
+ * chunk is assigned.
+ */
+ asoc->last_data_chunk_from = net;
+
+ /*-
+ * Now before we proceed we must figure out if this is a wasted
+ * cluster... i.e. it is a small packet sent in and yet the driver
+ * underneath allocated a full cluster for it. If so we must copy it
+ * to a smaller mbuf and free up the cluster mbuf. This will help
+ * with cluster starvation. Note for __Panda__ we don't do this
+ * since it has clusters all the way down to 64 bytes.
+ */
+ if (SCTP_BUF_LEN(m) < (long)MLEN && SCTP_BUF_NEXT(m) == NULL) {
+ /* we only handle mbufs that are singletons.. not chains */
+ m = sctp_get_mbuf_for_msg(SCTP_BUF_LEN(m), 0, M_DONTWAIT, 1, MT_DATA);
+ if (m) {
+ /* ok, let's see if we can copy the data up */
+ caddr_t *from, *to;
+
+ /* get the pointers and copy */
+ to = mtod(m, caddr_t *);
+ from = mtod((*mm), caddr_t *);
+ memcpy(to, from, SCTP_BUF_LEN((*mm)));
+ /* copy the length and free up the old */
+ SCTP_BUF_LEN(m) = SCTP_BUF_LEN((*mm));
+ sctp_m_freem(*mm);
+ /* success, back copy */
+ *mm = m;
+ } else {
+ /* We are in trouble in the mbuf world .. yikes */
+ m = *mm;
+ }
+ }
+ /* get pointer to the first chunk header */
+ ch = (struct sctp_data_chunk *)sctp_m_getptr(m, *offset,
+ sizeof(struct sctp_data_chunk), (uint8_t *) & chunk_buf);
+ if (ch == NULL) {
+ return (1);
+ }
+ /*
+ * process all DATA chunks...
+ */
+ *high_tsn = asoc->cumulative_tsn;
+ break_flag = 0;
+ asoc->data_pkts_seen++;
+ while (stop_proc == 0) {
+ /* validate chunk length */
+ chk_length = ntohs(ch->ch.chunk_length);
+ if (length - *offset < chk_length) {
+ /* all done, mutilated chunk */
+ stop_proc = 1;
+ break;
+ }
+ if (ch->ch.chunk_type == SCTP_DATA) {
+ if ((size_t)chk_length < sizeof(struct sctp_data_chunk) + 1) {
+ /*
+ * Need to send an abort since we had an
+ * invalid data chunk.
+ */
+ struct mbuf *op_err;
+
+ op_err = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 2 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+
+ if (op_err) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(op_err) = sizeof(struct sctp_paramhdr) +
+ (2 * sizeof(uint32_t));
+ ph = mtod(op_err, struct sctp_paramhdr *);
+ ph->param_type =
+ htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length = htons(SCTP_BUF_LEN(op_err));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_19);
+ ippp++;
+ *ippp = asoc->cumulative_tsn;
+
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_19;
+ sctp_abort_association(inp, stcb, m, iphlen, sh,
+ op_err, 0, net->port);
+ return (2);
+ }
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xB1, 0);
+#endif
+ if (SCTP_SIZE32(chk_length) == (length - *offset)) {
+ last_chunk = 1;
+ } else {
+ last_chunk = 0;
+ }
+ if (sctp_process_a_data_chunk(stcb, asoc, mm, *offset, ch,
+ chk_length, net, high_tsn, &abort_flag, &break_flag,
+ last_chunk)) {
+ num_chunks++;
+ }
+ if (abort_flag)
+ return (2);
+
+ if (break_flag) {
+ /*
+ * Set because of out of rwnd space and no
+ * drop rep space left.
+ */
+ stop_proc = 1;
+ break;
+ }
+ } else {
+ /* not a data chunk in the data region */
+ switch (ch->ch.chunk_type) {
+ case SCTP_INITIATION:
+ case SCTP_INITIATION_ACK:
+ case SCTP_SELECTIVE_ACK:
+ case SCTP_NR_SELECTIVE_ACK: /* EY */
+ case SCTP_HEARTBEAT_REQUEST:
+ case SCTP_HEARTBEAT_ACK:
+ case SCTP_ABORT_ASSOCIATION:
+ case SCTP_SHUTDOWN:
+ case SCTP_SHUTDOWN_ACK:
+ case SCTP_OPERATION_ERROR:
+ case SCTP_COOKIE_ECHO:
+ case SCTP_COOKIE_ACK:
+ case SCTP_ECN_ECHO:
+ case SCTP_ECN_CWR:
+ case SCTP_SHUTDOWN_COMPLETE:
+ case SCTP_AUTHENTICATION:
+ case SCTP_ASCONF_ACK:
+ case SCTP_PACKET_DROPPED:
+ case SCTP_STREAM_RESET:
+ case SCTP_FORWARD_CUM_TSN:
+ case SCTP_ASCONF:
+ /*
+ * Now, what do we do with KNOWN chunks that
+ * are NOT in the right place?
+ *
+ * For now, I do nothing but ignore them. We
+ * may later want to add sysctl stuff to
+ * switch out and do either an ABORT() or
+ * possibly process them.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_strict_data_order)) {
+ struct mbuf *op_err;
+
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ sctp_abort_association(inp, stcb, m, iphlen, sh, op_err, 0, net->port);
+ return (2);
+ }
+ break;
+ default:
+ /* unknown chunk type, use bit rules */
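+ /*
+ * The two high-order bits of an unrecognized chunk type
+ * tell us what to do: if 0x40 is set we report it in an
+ * ERROR chunk, and if 0x80 is clear we stop processing
+ * the rest of the packet.
+ */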
+ if (ch->ch.chunk_type & 0x40) {
+ /* Add a error report to the queue */
+ struct mbuf *merr;
+ struct sctp_paramhdr *phd;
+
+ merr = sctp_get_mbuf_for_msg(sizeof(*phd), 0, M_DONTWAIT, 1, MT_DATA);
+ if (merr) {
+ phd = mtod(merr, struct sctp_paramhdr *);
+ /*
+ * We cheat and use param
+ * type since we did not
+ * bother to define an error
+ * cause struct. They are
+ * the same basic format
+ * with different names.
+ */
+ phd->param_type =
+ htons(SCTP_CAUSE_UNRECOG_CHUNK);
+ phd->param_length =
+ htons(chk_length + sizeof(*phd));
+ SCTP_BUF_LEN(merr) = sizeof(*phd);
+ SCTP_BUF_NEXT(merr) = SCTP_M_COPYM(m, *offset,
+ SCTP_SIZE32(chk_length),
+ M_DONTWAIT);
+ if (SCTP_BUF_NEXT(merr)) {
+ sctp_queue_op_err(stcb, merr);
+ } else {
+ sctp_m_freem(merr);
+ }
+ }
+ }
+ if ((ch->ch.chunk_type & 0x80) == 0) {
+ /* discard the rest of this packet */
+ stop_proc = 1;
+ } /* else skip this bad chunk and
+ * continue... */
+ break;
+ }; /* switch of chunk type */
+ }
+ *offset += SCTP_SIZE32(chk_length);
+ if ((*offset >= length) || stop_proc) {
+ /* no more data left in the mbuf chain */
+ stop_proc = 1;
+ continue;
+ }
+ ch = (struct sctp_data_chunk *)sctp_m_getptr(m, *offset,
+ sizeof(struct sctp_data_chunk), (uint8_t *) & chunk_buf);
+ if (ch == NULL) {
+ *offset = length;
+ stop_proc = 1;
+ break;
+
+ }
+ } /* while */
+ if (break_flag) {
+ /*
+ * we need to report rwnd overrun drops.
+ */
+ sctp_send_packet_dropped(stcb, net, *mm, iphlen, 0);
+ }
+ if (num_chunks) {
+ /*
+ * Did we get data? If so, update the time for auto-close and
+ * give the peer credit for being alive.
+ */
+ SCTP_STAT_INCR(sctps_recvpktwithdata);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INDATA,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_last_rcvd);
+ }
+ /* now service all of the reassm queue if needed */
+ if (!(TAILQ_EMPTY(&asoc->reasmqueue)))
+ sctp_service_queues(stcb, asoc);
+
+ if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) {
+ /* Assure that we ack right away */
+ stcb->asoc.send_sack = 1;
+ }
+ /* Start a sack timer or QUEUE a SACK for sending */
+ sctp_sack_check(stcb, was_a_gap, &abort_flag);
+ if (abort_flag)
+ return (2);
+
+ return (0);
+}
+
+static int
+sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1, uint32_t last_tsn,
+ uint16_t frag_strt, uint16_t frag_end, int nr_sacking,
+ int *num_frs,
+ uint32_t * biggest_newly_acked_tsn,
+ uint32_t * this_sack_lowest_newack,
+ int *ecn_seg_sums)
+{
+ struct sctp_tmit_chunk *tp1;
+ unsigned int theTSN;
+ int j, wake_him = 0, circled = 0;
+
+ /* Recover the tp1 we last saw */
+ tp1 = *p_tp1;
+ if (tp1 == NULL) {
+ tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue);
+ }
+ for (j = frag_strt; j <= frag_end; j++) {
+ theTSN = j + last_tsn;
+ while (tp1) {
+ if (tp1->rec.data.doing_fast_retransmit)
+ (*num_frs) += 1;
+
+ /*-
+ * CMT: CUCv2 algorithm. For each TSN being
+ * processed from the sent queue, track the
+ * next expected pseudo-cumack, or
+ * rtx_pseudo_cumack, if required. Separate
+ * cumack trackers for first transmissions,
+ * and retransmissions.
+ */
+ if ((tp1->whoTo->find_pseudo_cumack == 1) && (tp1->sent < SCTP_DATAGRAM_RESEND) &&
+ (tp1->snd_count == 1)) {
+ tp1->whoTo->pseudo_cumack = tp1->rec.data.TSN_seq;
+ tp1->whoTo->find_pseudo_cumack = 0;
+ }
+ if ((tp1->whoTo->find_rtx_pseudo_cumack == 1) && (tp1->sent < SCTP_DATAGRAM_RESEND) &&
+ (tp1->snd_count > 1)) {
+ tp1->whoTo->rtx_pseudo_cumack = tp1->rec.data.TSN_seq;
+ tp1->whoTo->find_rtx_pseudo_cumack = 0;
+ }
+ if (tp1->rec.data.TSN_seq == theTSN) {
+ if (tp1->sent != SCTP_DATAGRAM_UNSENT) {
+ /*-
+ * must be held until
+ * cum-ack passes
+ */
+ /*-
+ * ECN Nonce: Add the nonce
+ * value to the sender's
+ * nonce sum
+ */
+ if (tp1->sent < SCTP_DATAGRAM_RESEND) {
+ /*-
+ * If it is less than RESEND, it is
+ * now no longer in flight.
+ * Higher values may already be set
+ * via previous Gap Ack Blocks...
+ * i.e. ACKED or RESEND.
+ */
+ if (compare_with_wrap(tp1->rec.data.TSN_seq,
+ *biggest_newly_acked_tsn, MAX_TSN)) {
+ *biggest_newly_acked_tsn = tp1->rec.data.TSN_seq;
+ }
+ /*-
+ * CMT: SFR algo (and HTNA) - set
+ * saw_newack to 1 for dest being
+ * newly acked. update
+ * this_sack_highest_newack if
+ * appropriate.
+ */
+ if (tp1->rec.data.chunk_was_revoked == 0)
+ tp1->whoTo->saw_newack = 1;
+
+ if (compare_with_wrap(tp1->rec.data.TSN_seq,
+ tp1->whoTo->this_sack_highest_newack,
+ MAX_TSN)) {
+ tp1->whoTo->this_sack_highest_newack =
+ tp1->rec.data.TSN_seq;
+ }
+ /*-
+ * CMT DAC algo: also update
+ * this_sack_lowest_newack
+ */
+ if (*this_sack_lowest_newack == 0) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) {
+ sctp_log_sack(*this_sack_lowest_newack,
+ last_tsn,
+ tp1->rec.data.TSN_seq,
+ 0,
+ 0,
+ SCTP_LOG_TSN_ACKED);
+ }
+ *this_sack_lowest_newack = tp1->rec.data.TSN_seq;
+ }
+ /*-
+ * CMT: CUCv2 algorithm. If (rtx-)pseudo-cumack for corresp
+ * dest is being acked, then we have a new (rtx-)pseudo-cumack. Set
+ * new_(rtx_)pseudo_cumack to TRUE so that the cwnd for this dest can be
+ * updated. Also trigger search for the next expected (rtx-)pseudo-cumack.
+ * Separate pseudo_cumack trackers for first transmissions and
+ * retransmissions.
+ */
+ if (tp1->rec.data.TSN_seq == tp1->whoTo->pseudo_cumack) {
+ if (tp1->rec.data.chunk_was_revoked == 0) {
+ tp1->whoTo->new_pseudo_cumack = 1;
+ }
+ tp1->whoTo->find_pseudo_cumack = 1;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, tp1->whoTo, tp1->rec.data.TSN_seq, SCTP_CWND_LOG_FROM_SACK);
+ }
+ if (tp1->rec.data.TSN_seq == tp1->whoTo->rtx_pseudo_cumack) {
+ if (tp1->rec.data.chunk_was_revoked == 0) {
+ tp1->whoTo->new_pseudo_cumack = 1;
+ }
+ tp1->whoTo->find_rtx_pseudo_cumack = 1;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) {
+ sctp_log_sack(*biggest_newly_acked_tsn,
+ last_tsn,
+ tp1->rec.data.TSN_seq,
+ frag_strt,
+ frag_end,
+ SCTP_LOG_TSN_ACKED);
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_GAP,
+ tp1->whoTo->flight_size,
+ tp1->book_size,
+ (uintptr_t) tp1->whoTo,
+ tp1->rec.data.TSN_seq);
+ }
+ sctp_flight_size_decrease(tp1);
+ sctp_total_flight_decrease(stcb, tp1);
+
+ tp1->whoTo->net_ack += tp1->send_size;
+ if (tp1->snd_count < 2) {
+ /*-
+ * True non-retransmitted chunk
+ */
+ tp1->whoTo->net_ack2 += tp1->send_size;
+
+ /*-
+ * update RTO too ?
+ */
+ if (tp1->do_rtt) {
+ tp1->whoTo->RTO =
+ sctp_calculate_rto(stcb,
+ &stcb->asoc,
+ tp1->whoTo,
+ &tp1->sent_rcv_time,
+ sctp_align_safe_nocopy);
+ tp1->do_rtt = 0;
+ }
+ }
+ }
+ if (tp1->sent <= SCTP_DATAGRAM_RESEND) {
+ (*ecn_seg_sums) += tp1->rec.data.ect_nonce;
+ (*ecn_seg_sums) &= SCTP_SACK_NONCE_SUM;
+ if (compare_with_wrap(tp1->rec.data.TSN_seq,
+ stcb->asoc.this_sack_highest_gap,
+ MAX_TSN)) {
+ stcb->asoc.this_sack_highest_gap =
+ tp1->rec.data.TSN_seq;
+ }
+ if (tp1->sent == SCTP_DATAGRAM_RESEND) {
+ sctp_ucount_decr(stcb->asoc.sent_queue_retran_cnt);
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xB2,
+ (stcb->asoc.sent_queue_retran_cnt & 0x000000ff));
+#endif
+ }
+ }
+ /*-
+ * All chunks NOT UNSENT fall through here and are marked
+ * (leave PR-SCTP ones that are to skip alone though)
+ */
+ if (tp1->sent != SCTP_FORWARD_TSN_SKIP)
+ tp1->sent = SCTP_DATAGRAM_MARKED;
+
+ if (tp1->rec.data.chunk_was_revoked) {
+ /* deflate the cwnd */
+ tp1->whoTo->cwnd -= tp1->book_size;
+ tp1->rec.data.chunk_was_revoked = 0;
+ }
+ /* NR Sack code here */
+ if (nr_sacking) {
+ if (tp1->data) {
+ /*
+ * sa_ignore
+ * NO_NULL_CHK
+ */
+ sctp_free_bufspace(stcb, &stcb->asoc, tp1, 1);
+ sctp_m_freem(tp1->data);
+ tp1->data = NULL;
+ }
+ wake_him++;
+ }
+ }
+ break;
+ } /* if (tp1->TSN_seq == theTSN) */
+ if (compare_with_wrap(tp1->rec.data.TSN_seq, theTSN,
+ MAX_TSN))
+ break;
+
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ if ((tp1 == NULL) && (circled == 0)) {
+ circled++;
+ tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue);
+ }
+ } /* end while (tp1) */
+ if (tp1 == NULL) {
+ circled = 0;
+ tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue);
+ }
+ /* In case the fragments were not in order we must reset */
+ } /* end for (j = fragStart */
+ *p_tp1 = tp1;
+ return (wake_him); /* Return value only used for nr-sack */
+}
+
+
+static int
+sctp_handle_segments(struct mbuf *m, int *offset, struct sctp_tcb *stcb, struct sctp_association *asoc,
+ uint32_t last_tsn, uint32_t * biggest_tsn_acked,
+ uint32_t * biggest_newly_acked_tsn, uint32_t * this_sack_lowest_newack,
+ int num_seg, int num_nr_seg, int *ecn_seg_sums)
+{
+ struct sctp_gap_ack_block *frag, block;
+ struct sctp_tmit_chunk *tp1;
+ int i;
+ int num_frs = 0;
+ int chunk_freed;
+ int non_revocable;
+ uint16_t frag_strt, frag_end, prev_frag_end;
+
+ tp1 = TAILQ_FIRST(&asoc->sent_queue);
+ prev_frag_end = 0;
+ chunk_freed = 0;
+
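+ /*
+ * Walk the regular Gap Ack Blocks first and then the NR
+ * (non-renegable) Gap Ack Blocks appended after them.
+ */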
+ for (i = 0; i < (num_seg + num_nr_seg); i++) {
+ if (i == num_seg) {
+ prev_frag_end = 0;
+ tp1 = TAILQ_FIRST(&asoc->sent_queue);
+ }
+ frag = (struct sctp_gap_ack_block *)sctp_m_getptr(m, *offset,
+ sizeof(struct sctp_gap_ack_block), (uint8_t *) & block);
+ *offset += sizeof(block);
+ if (frag == NULL) {
+ return (chunk_freed);
+ }
+ frag_strt = ntohs(frag->start);
+ frag_end = ntohs(frag->end);
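+ /*
+ * Gap Ack Block boundaries are offsets relative to the
+ * cumulative TSN ack (last_tsn), so this block reports TSNs
+ * last_tsn + frag_strt through last_tsn + frag_end as received.
+ */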
+
+ if (frag_strt > frag_end) {
+ /* This gap report is malformed, skip it. */
+ continue;
+ }
+ if (frag_strt <= prev_frag_end) {
+ /* This gap report is not in order, so restart. */
+ tp1 = TAILQ_FIRST(&asoc->sent_queue);
+ }
+ if (compare_with_wrap((last_tsn + frag_end), *biggest_tsn_acked, MAX_TSN)) {
+ *biggest_tsn_acked = last_tsn + frag_end;
+ }
+ if (i < num_seg) {
+ non_revocable = 0;
+ } else {
+ non_revocable = 1;
+ }
+ if (sctp_process_segment_range(stcb, &tp1, last_tsn, frag_strt, frag_end,
+ non_revocable, &num_frs, biggest_newly_acked_tsn,
+ this_sack_lowest_newack, ecn_seg_sums)) {
+ chunk_freed = 1;
+ }
+ prev_frag_end = frag_end;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
+ if (num_frs)
+ sctp_log_fr(*biggest_tsn_acked,
+ *biggest_newly_acked_tsn,
+ last_tsn, SCTP_FR_LOG_BIGGEST_TSNS);
+ }
+ return (chunk_freed);
+}
+
+static void
+sctp_check_for_revoked(struct sctp_tcb *stcb,
+ struct sctp_association *asoc, uint32_t cumack,
+ uint32_t biggest_tsn_acked)
+{
+ struct sctp_tmit_chunk *tp1;
+ int tot_revoked = 0;
+
+ tp1 = TAILQ_FIRST(&asoc->sent_queue);
+ while (tp1) {
+ if (compare_with_wrap(tp1->rec.data.TSN_seq, cumack,
+ MAX_TSN)) {
+ /*
+ * ok, this guy is either ACKED or MARKED. If it is
+ * ACKED it has been previously acked but not this
+ * time, i.e. it was revoked. If it is MARKED it was ACK'ed
+ * again.
+ */
+ if (compare_with_wrap(tp1->rec.data.TSN_seq, biggest_tsn_acked,
+ MAX_TSN))
+ break;
+
+
+ if (tp1->sent == SCTP_DATAGRAM_ACKED) {
+ /* it has been revoked */
+ tp1->sent = SCTP_DATAGRAM_SENT;
+ tp1->rec.data.chunk_was_revoked = 1;
+ /*
+ * We must add this stuff back in to assure
+ * timers and such get started.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_UP_REVOKE,
+ tp1->whoTo->flight_size,
+ tp1->book_size,
+ (uintptr_t) tp1->whoTo,
+ tp1->rec.data.TSN_seq);
+ }
+ sctp_flight_size_increase(tp1);
+ sctp_total_flight_increase(stcb, tp1);
+ /*
+ * We inflate the cwnd to compensate for our
+ * artificial inflation of the flight_size.
+ */
+ tp1->whoTo->cwnd += tp1->book_size;
+ tot_revoked++;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) {
+ sctp_log_sack(asoc->last_acked_seq,
+ cumack,
+ tp1->rec.data.TSN_seq,
+ 0,
+ 0,
+ SCTP_LOG_TSN_REVOKED);
+ }
+ } else if (tp1->sent == SCTP_DATAGRAM_MARKED) {
+ /* it has been re-acked in this SACK */
+ tp1->sent = SCTP_DATAGRAM_ACKED;
+ }
+ }
+ if (tp1->sent == SCTP_DATAGRAM_UNSENT)
+ break;
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ }
+ if (tot_revoked > 0) {
+ /*
+ * Setup the ecn nonce re-sync point. We do this since once
+ * data is revoked we begin to retransmit things, which do
+ * NOT have the ECN bits set. This means we are now out of
+ * sync and must wait until we get back in sync with the
+ * peer to check ECN bits.
+ */
+ tp1 = TAILQ_FIRST(&asoc->send_queue);
+ if (tp1 == NULL) {
+ asoc->nonce_resync_tsn = asoc->sending_seq;
+ } else {
+ asoc->nonce_resync_tsn = tp1->rec.data.TSN_seq;
+ }
+ asoc->nonce_wait_for_ecne = 0;
+ asoc->nonce_sum_check = 0;
+ }
+}
+
+
+static void
+sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc,
+ uint32_t biggest_tsn_acked, uint32_t biggest_tsn_newly_acked, uint32_t this_sack_lowest_newack, int accum_moved)
+{
+ struct sctp_tmit_chunk *tp1;
+ int strike_flag = 0;
+ struct timeval now;
+ int tot_retrans = 0;
+ uint32_t sending_seq;
+ struct sctp_nets *net;
+ int num_dests_sacked = 0;
+
+ /*
+ * select the sending_seq, this is either the next thing ready to be
+ * sent but not transmitted, OR, the next seq we assign.
+ */
+ tp1 = TAILQ_FIRST(&stcb->asoc.send_queue);
+ if (tp1 == NULL) {
+ sending_seq = asoc->sending_seq;
+ } else {
+ sending_seq = tp1->rec.data.TSN_seq;
+ }
+
+ /* CMT DAC algo: finding out if SACK is a mixed SACK */
+ if ((asoc->sctp_cmt_on_off == 1) &&
+ SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) {
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ if (net->saw_newack)
+ num_dests_sacked++;
+ }
+ }
+ if (stcb->asoc.peer_supports_prsctp) {
+ (void)SCTP_GETTIME_TIMEVAL(&now);
+ }
+ tp1 = TAILQ_FIRST(&asoc->sent_queue);
+ while (tp1) {
+ strike_flag = 0;
+ if (tp1->no_fr_allowed) {
+ /* this one had a timeout or something */
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ continue;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
+ if (tp1->sent < SCTP_DATAGRAM_RESEND)
+ sctp_log_fr(biggest_tsn_newly_acked,
+ tp1->rec.data.TSN_seq,
+ tp1->sent,
+ SCTP_FR_LOG_CHECK_STRIKE);
+ }
+ if (compare_with_wrap(tp1->rec.data.TSN_seq, biggest_tsn_acked,
+ MAX_TSN) ||
+ tp1->sent == SCTP_DATAGRAM_UNSENT) {
+ /* done */
+ break;
+ }
+ if (stcb->asoc.peer_supports_prsctp) {
+ if ((PR_SCTP_TTL_ENABLED(tp1->flags)) && tp1->sent < SCTP_DATAGRAM_ACKED) {
+ /* Is it expired? */
+ if (timevalcmp(&now, &tp1->rec.data.timetodrop, >)) {
+ /* Yes so drop it */
+ if (tp1->data != NULL) {
+ (void)sctp_release_pr_sctp_chunk(stcb, tp1,
+ (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
+ SCTP_SO_NOT_LOCKED);
+ }
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ continue;
+ }
+ }
+ }
+ if (compare_with_wrap(tp1->rec.data.TSN_seq,
+ asoc->this_sack_highest_gap, MAX_TSN)) {
+ /* we are beyond the tsn in the sack */
+ break;
+ }
+ if (tp1->sent >= SCTP_DATAGRAM_RESEND) {
+ /* either a RESEND, ACKED, or MARKED */
+ /* skip */
+ if (tp1->sent == SCTP_FORWARD_TSN_SKIP) {
+ /* Continue striking FWD-TSN chunks */
+ tp1->rec.data.fwd_tsn_cnt++;
+ }
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ continue;
+ }
+ /*
+ * CMT : SFR algo (covers part of DAC and HTNA as well)
+ */
+ if (tp1->whoTo && tp1->whoTo->saw_newack == 0) {
+ /*
+ * No new acks were received for data sent to this
+ * dest. Therefore, according to the SFR algo for
+ * CMT, no data sent to this dest can be marked for
+ * FR using this SACK.
+ */
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ continue;
+ } else if (tp1->whoTo && compare_with_wrap(tp1->rec.data.TSN_seq,
+ tp1->whoTo->this_sack_highest_newack, MAX_TSN)) {
+ /*
+ * CMT: New acks were received for data sent to
+ * this dest. But no new acks were seen for data
+ * sent after tp1. Therefore, according to the SFR
+ * algo for CMT, tp1 cannot be marked for FR using
+ * this SACK. This step covers part of the DAC algo
+ * and the HTNA algo as well.
+ */
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ continue;
+ }
+ /*
+ * Here we check to see if we have already done a FR
+ * and if so we see if the biggest TSN we saw in the sack is
+ * smaller than the recovery point. If so we don't strike
+ * the tsn... otherwise we CAN strike the TSN.
+ */
+ /*
+ * @@@ JRI: Check for CMT if (accum_moved &&
+ * asoc->fast_retran_loss_recovery && (sctp_cmt_on_off ==
+ * 0)) {
+ */
+ if (accum_moved && asoc->fast_retran_loss_recovery) {
+ /*
+ * Strike the TSN if in fast-recovery and cum-ack
+ * moved.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
+ sctp_log_fr(biggest_tsn_newly_acked,
+ tp1->rec.data.TSN_seq,
+ tp1->sent,
+ SCTP_FR_LOG_STRIKE_CHUNK);
+ }
+ if (tp1->sent < SCTP_DATAGRAM_RESEND) {
+ tp1->sent++;
+ }
+ if ((asoc->sctp_cmt_on_off == 1) &&
+ SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) {
+ /*
+ * CMT DAC algorithm: If SACK flag is set to
+ * 0, then lowest_newack test will not pass
+ * because it would have been set to the
+ * cumack earlier. If it is not already to
+ * be rtx'd, if this is not a mixed sack
+ * and if tp1 is not between two sacked
+ * TSNs, then mark it by
+ * one more. NOTE that we are marking by one
+ * additional time since the SACK DAC flag
+ * indicates that two packets have been
+ * received after this missing TSN.
+ */
+ if ((tp1->sent < SCTP_DATAGRAM_RESEND) && (num_dests_sacked == 1) &&
+ compare_with_wrap(this_sack_lowest_newack, tp1->rec.data.TSN_seq, MAX_TSN)) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
+ sctp_log_fr(16 + num_dests_sacked,
+ tp1->rec.data.TSN_seq,
+ tp1->sent,
+ SCTP_FR_LOG_STRIKE_CHUNK);
+ }
+ tp1->sent++;
+ }
+ }
+ } else if ((tp1->rec.data.doing_fast_retransmit) &&
+ (asoc->sctp_cmt_on_off == 0)) {
+ /*
+ * For those that have done a FR we must take
+ * special consideration if we strike. I.e. the
+ * biggest_newly_acked must be higher than the
+ * sending_seq at the time we did the FR.
+ */
+ if (
+#ifdef SCTP_FR_TO_ALTERNATE
+ /*
+ * If FR's go to new networks, then we must only do
+ * this for singly homed asoc's. However if the FR's
+ * go to the same network (Armando's work) then it's
+ * ok to FR multiple times.
+ */
+ (asoc->numnets < 2)
+#else
+ (1)
+#endif
+ ) {
+
+ if ((compare_with_wrap(biggest_tsn_newly_acked,
+ tp1->rec.data.fast_retran_tsn, MAX_TSN)) ||
+ (biggest_tsn_newly_acked ==
+ tp1->rec.data.fast_retran_tsn)) {
+ /*
+ * Strike the TSN, since this ack is
+ * beyond where things were when we
+ * did a FR.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
+ sctp_log_fr(biggest_tsn_newly_acked,
+ tp1->rec.data.TSN_seq,
+ tp1->sent,
+ SCTP_FR_LOG_STRIKE_CHUNK);
+ }
+ if (tp1->sent < SCTP_DATAGRAM_RESEND) {
+ tp1->sent++;
+ }
+ strike_flag = 1;
+ if ((asoc->sctp_cmt_on_off == 1) &&
+ SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) {
+ /*
+ * CMT DAC algorithm: If the
+ * SACK flag is set to 0,
+ * then the lowest_newack
+ * test will not pass because
+ * it would have been set to
+ * the cumack earlier. If it
+ * is not already to be
+ * rtx'd, not a mixed sack
+ * and tp1 is not between two
+ * sacked TSNs, then mark it
+ * by one more. NOTE that we
+ * are marking by one
+ * additional time since the
+ * SACK DAC flag indicates
+ * that two packets have
+ * been received after this
+ * missing TSN.
+ */
+ if ((tp1->sent < SCTP_DATAGRAM_RESEND) &&
+ (num_dests_sacked == 1) &&
+ compare_with_wrap(this_sack_lowest_newack,
+ tp1->rec.data.TSN_seq, MAX_TSN)) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
+ sctp_log_fr(32 + num_dests_sacked,
+ tp1->rec.data.TSN_seq,
+ tp1->sent,
+ SCTP_FR_LOG_STRIKE_CHUNK);
+ }
+ if (tp1->sent < SCTP_DATAGRAM_RESEND) {
+ tp1->sent++;
+ }
+ }
+ }
+ }
+ }
+ /*
+ * JRI: TODO: remove code for HTNA algo. CMT's SFR
+ * algo covers HTNA.
+ */
+ } else if (compare_with_wrap(tp1->rec.data.TSN_seq,
+ biggest_tsn_newly_acked, MAX_TSN)) {
+ /*
+ * We don't strike these: This is the HTNA
+ * algorithm, i.e. we don't strike if our TSN is
+ * larger than the Highest TSN Newly Acked.
+ */
+ ;
+ } else {
+ /* Strike the TSN */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
+ sctp_log_fr(biggest_tsn_newly_acked,
+ tp1->rec.data.TSN_seq,
+ tp1->sent,
+ SCTP_FR_LOG_STRIKE_CHUNK);
+ }
+ if (tp1->sent < SCTP_DATAGRAM_RESEND) {
+ tp1->sent++;
+ }
+ if ((asoc->sctp_cmt_on_off == 1) &&
+ SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) {
+ /*
+ * CMT DAC algorithm: If the SACK flag is set
+ * to 0, then the lowest_newack test will not
+ * pass because it would have been set to the
+ * cumack earlier. If it is not already to be
+ * rtx'd, not a mixed sack and tp1 is not
+ * between two sacked TSNs, then mark it by
+ * one more. NOTE that we are marking by one
+ * additional time since the SACK DAC flag
+ * indicates that two packets have been
+ * received after this missing TSN.
+ */
+ if ((tp1->sent < SCTP_DATAGRAM_RESEND) && (num_dests_sacked == 1) &&
+ compare_with_wrap(this_sack_lowest_newack, tp1->rec.data.TSN_seq, MAX_TSN)) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
+ sctp_log_fr(48 + num_dests_sacked,
+ tp1->rec.data.TSN_seq,
+ tp1->sent,
+ SCTP_FR_LOG_STRIKE_CHUNK);
+ }
+ tp1->sent++;
+ }
+ }
+ }
+ if (tp1->sent == SCTP_DATAGRAM_RESEND) {
+ struct sctp_nets *alt;
+
+ /* fix counts and things */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_RSND,
+ (tp1->whoTo ? (tp1->whoTo->flight_size) : 0),
+ tp1->book_size,
+ (uintptr_t) tp1->whoTo,
+ tp1->rec.data.TSN_seq);
+ }
+ if (tp1->whoTo) {
+ tp1->whoTo->net_ack++;
+ sctp_flight_size_decrease(tp1);
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) {
+ sctp_log_rwnd(SCTP_INCREASE_PEER_RWND,
+ asoc->peers_rwnd, tp1->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh));
+ }
+ /* add back to the rwnd */
+ asoc->peers_rwnd += (tp1->send_size + SCTP_BASE_SYSCTL(sctp_peer_chunk_oh));
+
+ /* remove from the total flight */
+ sctp_total_flight_decrease(stcb, tp1);
+
+ if ((stcb->asoc.peer_supports_prsctp) &&
+ (PR_SCTP_RTX_ENABLED(tp1->flags))) {
+ /*
+ * Has it been retransmitted tv_sec times? -
+ * we store the retran count there.
+ */
+ if (tp1->snd_count > tp1->rec.data.timetodrop.tv_sec) {
+ /* Yes, so drop it */
+ if (tp1->data != NULL) {
+ (void)sctp_release_pr_sctp_chunk(stcb, tp1,
+ (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
+ SCTP_SO_NOT_LOCKED);
+ }
+ /* Make sure to flag we had a FR */
+ tp1->whoTo->net_ack++;
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ continue;
+ }
+ }
+ /* printf("OK, we are now ready to FR this guy\n"); */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
+ sctp_log_fr(tp1->rec.data.TSN_seq, tp1->snd_count,
+ 0, SCTP_FR_MARKED);
+ }
+ if (strike_flag) {
+ /* This is a subsequent FR */
+ SCTP_STAT_INCR(sctps_sendmultfastretrans);
+ }
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ if (asoc->sctp_cmt_on_off == 1) {
+ /*
+ * CMT: Using RTX_SSTHRESH policy for CMT.
+ * If CMT is being used, then pick dest with
+ * largest ssthresh for any retransmission.
+ */
+ tp1->no_fr_allowed = 1;
+ alt = tp1->whoTo;
+ /* sa_ignore NO_NULL_CHK */
+ if (asoc->sctp_cmt_pf > 0) {
+ /*
+ * JRS 5/18/07 - If CMT PF is on,
+ * use the PF version of
+ * find_alt_net()
+ */
+ alt = sctp_find_alternate_net(stcb, alt, 2);
+ } else {
+ /*
+ * JRS 5/18/07 - If only CMT is on,
+ * use the CMT version of
+ * find_alt_net()
+ */
+ /* sa_ignore NO_NULL_CHK */
+ alt = sctp_find_alternate_net(stcb, alt, 1);
+ }
+ if (alt == NULL) {
+ alt = tp1->whoTo;
+ }
+ /*
+ * CUCv2: If a different dest is picked for
+ * the retransmission, then new
+ * (rtx-)pseudo_cumack needs to be tracked
+ * for orig dest. Let CUCv2 track new (rtx-)
+ * pseudo-cumack always.
+ */
+ if (tp1->whoTo) {
+ tp1->whoTo->find_pseudo_cumack = 1;
+ tp1->whoTo->find_rtx_pseudo_cumack = 1;
+ }
+ } else {/* CMT is OFF */
+
+#ifdef SCTP_FR_TO_ALTERNATE
+ /* Can we find an alternate? */
+ alt = sctp_find_alternate_net(stcb, tp1->whoTo, 0);
+#else
+ /*
+ * default behavior is to NOT retransmit
+ * FR's to an alternate. Armando Caro's
+ * paper details why.
+ */
+ alt = tp1->whoTo;
+#endif
+ }
+
+ tp1->rec.data.doing_fast_retransmit = 1;
+ tot_retrans++;
+ /* mark the sending seq for possible subsequent FR's */
+ /*
+ * printf("Marking TSN for FR new value %x\n",
+ * (uint32_t)tpi->rec.data.TSN_seq);
+ */
+ if (TAILQ_EMPTY(&asoc->send_queue)) {
+ /*
+ * If the send queue is empty then this is
+ * the next sequence number that will be
+ * assigned, so we subtract one from it to
+ * get the one we last sent.
+ */
+ tp1->rec.data.fast_retran_tsn = sending_seq;
+ } else {
+ /*
+ * If there are chunks on the send queue
+ * (unsent data that has made it from the
+ * stream queues but not out the door), we
+ * take the first one (which will have the
+ * lowest TSN) and subtract one to get the
+ * one we last sent.
+ */
+ struct sctp_tmit_chunk *ttt;
+
+ ttt = TAILQ_FIRST(&asoc->send_queue);
+ tp1->rec.data.fast_retran_tsn =
+ ttt->rec.data.TSN_seq;
+ }
+
+ if (tp1->do_rtt) {
+ /*
+ * this guy had an RTO calculation pending on
+ * it, cancel it
+ */
+ tp1->do_rtt = 0;
+ }
+ if (alt != tp1->whoTo) {
+ /* yes, there is an alternate. */
+ sctp_free_remote_addr(tp1->whoTo);
+ /* sa_ignore FREED_MEMORY */
+ tp1->whoTo = alt;
+ atomic_add_int(&alt->ref_count, 1);
+ }
+ }
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ } /* while (tp1) */
+
+ if (tot_retrans > 0) {
+ /*
+ * Set up the ECN nonce re-sync point. We do this since once
+ * we fast-retransmit something we introduce a Karn's rule
+ * scenario and won't know the totals for the ECN bits.
+ */
+ asoc->nonce_resync_tsn = sending_seq;
+ asoc->nonce_wait_for_ecne = 0;
+ asoc->nonce_sum_check = 0;
+ }
+}
+
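+ /*
+ * Walk the sent queue and try to move the PR-SCTP advanced
+ * peer ack point forward over chunks marked to be skipped by a
+ * FORWARD-TSN, releasing expired RESEND chunks it encounters.
+ * Returns the last chunk the point was advanced to, or NULL if
+ * the peer does not support PR-SCTP.
+ */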
+struct sctp_tmit_chunk *
+sctp_try_advance_peer_ack_point(struct sctp_tcb *stcb,
+ struct sctp_association *asoc)
+{
+ struct sctp_tmit_chunk *tp1, *tp2, *a_adv = NULL;
+ struct timeval now;
+ int now_filled = 0;
+
+ if (asoc->peer_supports_prsctp == 0) {
+ return (NULL);
+ }
+ tp1 = TAILQ_FIRST(&asoc->sent_queue);
+ while (tp1) {
+ if (tp1->sent != SCTP_FORWARD_TSN_SKIP &&
+ tp1->sent != SCTP_DATAGRAM_RESEND) {
+ /* no chance to advance, out of here */
+ break;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
+ if (tp1->sent == SCTP_FORWARD_TSN_SKIP) {
+ sctp_misc_ints(SCTP_FWD_TSN_CHECK,
+ asoc->advanced_peer_ack_point,
+ tp1->rec.data.TSN_seq, 0, 0);
+ }
+ }
+ if (!PR_SCTP_ENABLED(tp1->flags)) {
+ /*
+ * We can't fwd-tsn past any that are reliable,
+ * i.e. retransmitted until the asoc fails.
+ */
+ break;
+ }
+ if (!now_filled) {
+ (void)SCTP_GETTIME_TIMEVAL(&now);
+ now_filled = 1;
+ }
+ tp2 = TAILQ_NEXT(tp1, sctp_next);
+ /*
+ * Now we have a chunk which is marked for another
+ * retransmission to a PR-stream but has either run out of
+ * its chances already or has been marked to skip now. Can
+ * we skip it if it's a resend?
+ */
+ if (tp1->sent == SCTP_DATAGRAM_RESEND &&
+ (PR_SCTP_TTL_ENABLED(tp1->flags))) {
+ /*
+ * Now is this one marked for resend and its time is
+ * now up?
+ */
+ if (timevalcmp(&now, &tp1->rec.data.timetodrop, >)) {
+ /* Yes so drop it */
+ if (tp1->data) {
+ (void)sctp_release_pr_sctp_chunk(stcb, tp1,
+ (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
+ SCTP_SO_NOT_LOCKED);
+ }
+ } else {
+ /*
+ * No, we are done when we hit one marked for
+ * resend whose time has not expired.
+ */
+ break;
+ }
+ }
+ /*
+ * Ok now if this chunk is marked to drop it we can clean up
+ * the chunk, advance our peer ack point and we can check
+ * the next chunk.
+ */
+ if (tp1->sent == SCTP_FORWARD_TSN_SKIP) {
+ /* advance PeerAckPoint goes forward */
+ if (compare_with_wrap(tp1->rec.data.TSN_seq,
+ asoc->advanced_peer_ack_point,
+ MAX_TSN)) {
+
+ asoc->advanced_peer_ack_point = tp1->rec.data.TSN_seq;
+ a_adv = tp1;
+ } else if (tp1->rec.data.TSN_seq == asoc->advanced_peer_ack_point) {
+ /* No update but we do save the chk */
+ a_adv = tp1;
+ }
+ } else {
+ /*
+ * If it is still in RESEND we can advance no
+ * further
+ */
+ break;
+ }
+ /*
+ * If we hit here we just dumped tp1, move to next tsn on
+ * sent queue.
+ */
+ tp1 = tp2;
+ }
+ return (a_adv);
+}
+
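+ /*
+ * Audit the flight size bookkeeping against the sent queue: count
+ * the chunks in each send state and, if anything is still counted
+ * as in flight or sits between RESEND and ACKED, panic under
+ * INVARIANTS or log the discrepancy and return non-zero so the
+ * caller can rebuild the counters.
+ */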
+static int
+sctp_fs_audit(struct sctp_association *asoc)
+{
+ struct sctp_tmit_chunk *chk;
+ int inflight = 0, resend = 0, inbetween = 0, acked = 0, above = 0;
+ int entry_flight, entry_cnt, ret;
+
+ entry_flight = asoc->total_flight;
+ entry_cnt = asoc->total_flight_count;
+ ret = 0;
+
+ if (asoc->pr_sctp_cnt >= asoc->sent_queue_cnt)
+ return (0);
+
+ TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
+ if (chk->sent < SCTP_DATAGRAM_RESEND) {
+ printf("Chk TSN:%u size:%d inflight cnt:%d\n",
+ chk->rec.data.TSN_seq,
+ chk->send_size,
+ chk->snd_count
+ );
+ inflight++;
+ } else if (chk->sent == SCTP_DATAGRAM_RESEND) {
+ resend++;
+ } else if (chk->sent < SCTP_DATAGRAM_ACKED) {
+ inbetween++;
+ } else if (chk->sent > SCTP_DATAGRAM_ACKED) {
+ above++;
+ } else {
+ acked++;
+ }
+ }
+
+ if ((inflight > 0) || (inbetween > 0)) {
+#ifdef INVARIANTS
+ panic("Flight size-express incorrect? \n");
+#else
+ printf("asoc->total_flight:%d cnt:%d\n",
+ entry_flight, entry_cnt);
+
+ SCTP_PRINTF("Flight size-express incorrect F:%d I:%d R:%d Ab:%d ACK:%d\n",
+ inflight, inbetween, resend, above, acked);
+ ret = 1;
+#endif
+ }
+ return (ret);
+}
+
+
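+ /*
+ * Recover a chunk that was used as a window probe: unless it has
+ * already been acked (or its data freed), take it back out of the
+ * flight size accounting and mark it for retransmission.
+ */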
+static void
+sctp_window_probe_recovery(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ struct sctp_nets *net,
+ struct sctp_tmit_chunk *tp1)
+{
+ tp1->window_probe = 0;
+ if ((tp1->sent >= SCTP_DATAGRAM_ACKED) || (tp1->data == NULL)) {
+ /* TSNs skipped, we do NOT move back. */
+ sctp_misc_ints(SCTP_FLIGHT_LOG_DWN_WP_FWD,
+ tp1->whoTo->flight_size,
+ tp1->book_size,
+ (uintptr_t) tp1->whoTo,
+ tp1->rec.data.TSN_seq);
+ return;
+ }
+ /* First setup this by shrinking flight */
+ sctp_flight_size_decrease(tp1);
+ sctp_total_flight_decrease(stcb, tp1);
+ /* Now mark for resend */
+ tp1->sent = SCTP_DATAGRAM_RESEND;
+ sctp_ucount_incr(asoc->sent_queue_retran_cnt);
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_WP,
+ tp1->whoTo->flight_size,
+ tp1->book_size,
+ (uintptr_t) tp1->whoTo,
+ tp1->rec.data.TSN_seq);
+ }
+}
+
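+ /*
+ * Express SACK handling: process a SACK that carries only a
+ * cumulative ack and rwnd (no gap-ack blocks), updating the sent
+ * queue, congestion control, PR-SCTP state and timers on this
+ * fast path.
+ */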
+void
+sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack,
+ uint32_t rwnd, int nonce_sum_flag, int *abort_now)
+{
+ struct sctp_nets *net;
+ struct sctp_association *asoc;
+ struct sctp_tmit_chunk *tp1, *tp2;
+ uint32_t old_rwnd;
+ int win_probe_recovery = 0;
+ int win_probe_recovered = 0;
+ int j, done_once = 0;
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_SACK_ARRIVALS_ENABLE) {
+ sctp_misc_ints(SCTP_SACK_LOG_EXPRESS, cumack,
+ rwnd, stcb->asoc.last_acked_seq, stcb->asoc.peers_rwnd);
+ }
+ SCTP_TCB_LOCK_ASSERT(stcb);
+#ifdef SCTP_ASOCLOG_OF_TSNS
+ stcb->asoc.cumack_log[stcb->asoc.cumack_log_at] = cumack;
+ stcb->asoc.cumack_log_at++;
+ if (stcb->asoc.cumack_log_at > SCTP_TSN_LOG_SIZE) {
+ stcb->asoc.cumack_log_at = 0;
+ }
+#endif
+ asoc = &stcb->asoc;
+ old_rwnd = asoc->peers_rwnd;
+ if (compare_with_wrap(asoc->last_acked_seq, cumack, MAX_TSN)) {
+ /* old ack */
+ return;
+ } else if (asoc->last_acked_seq == cumack) {
+ /* Window update sack */
+ asoc->peers_rwnd = sctp_sbspace_sub(rwnd,
+ (uint32_t) (asoc->total_flight + (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh))));
+ if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) {
+ /* SWS sender side engages */
+ asoc->peers_rwnd = 0;
+ }
+ if (asoc->peers_rwnd > old_rwnd) {
+ goto again;
+ }
+ return;
+ }
+ /* First setup for CC stuff */
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ net->prev_cwnd = net->cwnd;
+ net->net_ack = 0;
+ net->net_ack2 = 0;
+
+ /*
+ * CMT: Reset CUC and Fast recovery algo variables before
+ * SACK processing
+ */
+ net->new_pseudo_cumack = 0;
+ net->will_exit_fast_recovery = 0;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) {
+ uint32_t send_s;
+
+ if (!TAILQ_EMPTY(&asoc->sent_queue)) {
+ tp1 = TAILQ_LAST(&asoc->sent_queue,
+ sctpchunk_listhead);
+ send_s = tp1->rec.data.TSN_seq + 1;
+ } else {
+ send_s = asoc->sending_seq;
+ }
+ if ((cumack == send_s) ||
+ compare_with_wrap(cumack, send_s, MAX_TSN)) {
+#ifndef INVARIANTS
+ struct mbuf *oper;
+
+#endif
+#ifdef INVARIANTS
+ panic("Impossible sack 1");
+#else
+
+ *abort_now = 1;
+ /* XXX */
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
+ sizeof(uint32_t);
+ ph = mtod(oper, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length = htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_25);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25;
+ sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+ return;
+#endif
+ }
+ }
+ asoc->this_sack_highest_gap = cumack;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INDATA,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ if (compare_with_wrap(cumack, asoc->last_acked_seq, MAX_TSN)) {
+ /* process the new consecutive TSN first */
+ tp1 = TAILQ_FIRST(&asoc->sent_queue);
+ while (tp1) {
+ tp2 = TAILQ_NEXT(tp1, sctp_next);
+ if (compare_with_wrap(cumack, tp1->rec.data.TSN_seq,
+ MAX_TSN) ||
+ cumack == tp1->rec.data.TSN_seq) {
+ if (tp1->sent == SCTP_DATAGRAM_UNSENT) {
+ printf("Warning, an unsent is now acked?\n");
+ }
+ /*
+ * ECN Nonce: Add the nonce to the sender's
+ * nonce sum
+ */
+ asoc->nonce_sum_expect_base += tp1->rec.data.ect_nonce;
+ if (tp1->sent < SCTP_DATAGRAM_ACKED) {
+ /*
+ * If it is less than ACKED, it is
+ * now no longer in flight. Higher
+ * values may occur during marking
+ */
+ if (tp1->sent < SCTP_DATAGRAM_RESEND) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_CA,
+ tp1->whoTo->flight_size,
+ tp1->book_size,
+ (uintptr_t) tp1->whoTo,
+ tp1->rec.data.TSN_seq);
+ }
+ sctp_flight_size_decrease(tp1);
+ /* sa_ignore NO_NULL_CHK */
+ sctp_total_flight_decrease(stcb, tp1);
+ }
+ tp1->whoTo->net_ack += tp1->send_size;
+ if (tp1->snd_count < 2) {
+ /*
+ * True non-retransmitted
+ * chunk
+ */
+ tp1->whoTo->net_ack2 +=
+ tp1->send_size;
+
+ /* update RTO too? */
+ if (tp1->do_rtt) {
+ tp1->whoTo->RTO =
+ /*
+ * sa_ignore
+ * NO_NULL_CHK
+ */
+ sctp_calculate_rto(stcb,
+ asoc, tp1->whoTo,
+ &tp1->sent_rcv_time,
+ sctp_align_safe_nocopy);
+ tp1->do_rtt = 0;
+ }
+ }
+ /*
+ * CMT: CUCv2 algorithm. From the
+ * cumack'd TSNs, for each TSN being
+ * acked for the first time, set the
+ * following variables for the
+ * corresp destination.
+ * new_pseudo_cumack will trigger a
+ * cwnd update.
+ * find_(rtx_)pseudo_cumack will
+ * trigger search for the next
+ * expected (rtx-)pseudo-cumack.
+ */
+ tp1->whoTo->new_pseudo_cumack = 1;
+ tp1->whoTo->find_pseudo_cumack = 1;
+ tp1->whoTo->find_rtx_pseudo_cumack = 1;
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ /* sa_ignore NO_NULL_CHK */
+ sctp_log_cwnd(stcb, tp1->whoTo, tp1->rec.data.TSN_seq, SCTP_CWND_LOG_FROM_SACK);
+ }
+ }
+ if (tp1->sent == SCTP_DATAGRAM_RESEND) {
+ sctp_ucount_decr(asoc->sent_queue_retran_cnt);
+ }
+ if (tp1->rec.data.chunk_was_revoked) {
+ /* deflate the cwnd */
+ tp1->whoTo->cwnd -= tp1->book_size;
+ tp1->rec.data.chunk_was_revoked = 0;
+ }
+ tp1->sent = SCTP_DATAGRAM_ACKED;
+ TAILQ_REMOVE(&asoc->sent_queue, tp1, sctp_next);
+ if (tp1->data) {
+ /* sa_ignore NO_NULL_CHK */
+ sctp_free_bufspace(stcb, asoc, tp1, 1);
+ sctp_m_freem(tp1->data);
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) {
+ sctp_log_sack(asoc->last_acked_seq,
+ cumack,
+ tp1->rec.data.TSN_seq,
+ 0,
+ 0,
+ SCTP_LOG_FREE_SENT);
+ }
+ tp1->data = NULL;
+ asoc->sent_queue_cnt--;
+ sctp_free_a_chunk(stcb, tp1);
+ tp1 = tp2;
+ } else {
+ break;
+ }
+ }
+
+ }
+ /* sa_ignore NO_NULL_CHK */
+ if (stcb->sctp_socket) {
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+ SOCKBUF_LOCK(&stcb->sctp_socket->so_snd);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_WAKE_LOGGING_ENABLE) {
+ /* sa_ignore NO_NULL_CHK */
+ sctp_wakeup_log(stcb, cumack, 1, SCTP_WAKESND_FROM_SACK);
+ }
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ /* assoc was freed while we were unlocked */
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return;
+ }
+#endif
+ sctp_sowwakeup_locked(stcb->sctp_ep, stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_WAKE_LOGGING_ENABLE) {
+ sctp_wakeup_log(stcb, cumack, 1, SCTP_NOWAKE_FROM_SACK);
+ }
+ }
+
+ /* JRS - Use the congestion control given in the CC module */
+ if (asoc->last_acked_seq != cumack)
+ asoc->cc_functions.sctp_cwnd_update_after_sack(stcb, asoc, 1, 0, 0);
+
+ asoc->last_acked_seq = cumack;
+
+ if (TAILQ_EMPTY(&asoc->sent_queue)) {
+ /* nothing left in-flight */
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ net->flight_size = 0;
+ net->partial_bytes_acked = 0;
+ }
+ asoc->total_flight = 0;
+ asoc->total_flight_count = 0;
+ }
+ /* ECN Nonce updates */
+ if (asoc->ecn_nonce_allowed) {
+ if (asoc->nonce_sum_check) {
+ if (nonce_sum_flag != ((asoc->nonce_sum_expect_base) & SCTP_SACK_NONCE_SUM)) {
+ if (asoc->nonce_wait_for_ecne == 0) {
+ struct sctp_tmit_chunk *lchk;
+
+ lchk = TAILQ_FIRST(&asoc->send_queue);
+ asoc->nonce_wait_for_ecne = 1;
+ if (lchk) {
+ asoc->nonce_wait_tsn = lchk->rec.data.TSN_seq;
+ } else {
+ asoc->nonce_wait_tsn = asoc->sending_seq;
+ }
+ } else {
+ if (compare_with_wrap(asoc->last_acked_seq, asoc->nonce_wait_tsn, MAX_TSN) ||
+ (asoc->last_acked_seq == asoc->nonce_wait_tsn)) {
+ /*
+ * Misbehaving peer. We need
+ * to react to this guy
+ */
+ asoc->ecn_allowed = 0;
+ asoc->ecn_nonce_allowed = 0;
+ }
+ }
+ }
+ } else {
+ /* See if Resynchronization Possible */
+ if (compare_with_wrap(asoc->last_acked_seq, asoc->nonce_resync_tsn, MAX_TSN)) {
+ asoc->nonce_sum_check = 1;
+ /*
+ * Now we must calculate what the base is.
+ * We do this based on two things: we know
+ * the totals for all the segments
+ * gap-acked in the SACK (none). We also
+ * know the SACK's nonce sum, it is in
+ * nonce_sum_flag. So we can build a truth
+ * table to back-calculate the new value of
+ * asoc->nonce_sum_expect_base:
+ *
+ * SACK-flag-Value  Seg-Sums  Base
+ *        0            0       0
+ *        1            0       1
+ *        0            1       1
+ *        1            1       0
+ */
+ asoc->nonce_sum_expect_base = (0 ^ nonce_sum_flag) & SCTP_SACK_NONCE_SUM;
+ }
+ }
+ }
+ /* RWND update */
+ asoc->peers_rwnd = sctp_sbspace_sub(rwnd,
+ (uint32_t) (asoc->total_flight + (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh))));
+ if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) {
+ /* SWS sender side engages */
+ asoc->peers_rwnd = 0;
+ }
+ if (asoc->peers_rwnd > old_rwnd) {
+ win_probe_recovery = 1;
+ }
+ /* Now assure a timer where data is queued at */
+again:
+ j = 0;
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ int to_ticks;
+
+ if (win_probe_recovery && (net->window_probe)) {
+ win_probe_recovered = 1;
+ /*
+ * Find the first chunk that was used with the window probe
+ * and clear its window probe flag, marking it for resend
+ */
+ /* sa_ignore FREED_MEMORY */
+ TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) {
+ if (tp1->window_probe) {
+ /* move back to data send queue */
+ sctp_window_probe_recovery(stcb, asoc, net, tp1);
+ break;
+ }
+ }
+ }
+ if (net->RTO == 0) {
+ to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto);
+ } else {
+ to_ticks = MSEC_TO_TICKS(net->RTO);
+ }
+ if (net->flight_size) {
+ j++;
+ (void)SCTP_OS_TIMER_START(&net->rxt_timer.timer, to_ticks,
+ sctp_timeout_handler, &net->rxt_timer);
+ if (net->window_probe) {
+ net->window_probe = 0;
+ }
+ } else {
+ if (net->window_probe) {
+ /*
+ * In window probes we must assure a timer
+ * is still running there
+ */
+ net->window_probe = 0;
+ if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
+ SCTP_OS_TIMER_START(&net->rxt_timer.timer, to_ticks,
+ sctp_timeout_handler, &net->rxt_timer);
+ }
+ } else if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+ stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_22);
+ }
+ if (SCTP_BASE_SYSCTL(sctp_early_fr)) {
+ if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+ SCTP_STAT_INCR(sctps_earlyfrstpidsck4);
+ sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_23);
+ }
+ }
+ }
+ }
+ if ((j == 0) &&
+ (!TAILQ_EMPTY(&asoc->sent_queue)) &&
+ (asoc->sent_queue_retran_cnt == 0) &&
+ (win_probe_recovered == 0) &&
+ (done_once == 0)) {
+ /*
+ * huh, this should not happen unless all packets are
+ * PR-SCTP and marked to skip of course.
+ */
+ if (sctp_fs_audit(asoc)) {
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ net->flight_size = 0;
+ }
+ asoc->total_flight = 0;
+ asoc->total_flight_count = 0;
+ asoc->sent_queue_retran_cnt = 0;
+ TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) {
+ if (tp1->sent < SCTP_DATAGRAM_RESEND) {
+ sctp_flight_size_increase(tp1);
+ sctp_total_flight_increase(stcb, tp1);
+ } else if (tp1->sent == SCTP_DATAGRAM_RESEND) {
+ sctp_ucount_incr(asoc->sent_queue_retran_cnt);
+ }
+ }
+ }
+ done_once = 1;
+ goto again;
+ }
+ /**********************************/
+ /* Now what about shutdown issues */
+ /**********************************/
+ if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue)) {
+ /* nothing left on sendqueue.. consider done */
+ /* clean up */
+ if ((asoc->stream_queue_cnt == 1) &&
+ ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) ||
+ (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED)) &&
+ (asoc->locked_on_sending)
+ ) {
+ struct sctp_stream_queue_pending *sp;
+
+ /*
+ * I may be in a state where we got it all across... but
+ * cannot write more due to a shutdown... we abort
+ * since the user did not indicate EOR in this case.
+ * The sp will be cleaned during free of the asoc.
+ */
+ sp = TAILQ_LAST(&((asoc->locked_on_sending)->outqueue),
+ sctp_streamhead);
+ if ((sp) && (sp->length == 0)) {
+ /* Let cleanup code purge it */
+ if (sp->msg_is_complete) {
+ asoc->stream_queue_cnt--;
+ } else {
+ asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
+ asoc->locked_on_sending = NULL;
+ asoc->stream_queue_cnt--;
+ }
+ }
+ }
+ if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) &&
+ (asoc->stream_queue_cnt == 0)) {
+ if (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT) {
+ /* Need to abort here */
+ struct mbuf *oper;
+
+ abort_out_now:
+ *abort_now = 1;
+ /* XXX */
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
+ sizeof(uint32_t);
+ ph = mtod(oper, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT);
+ ph->param_length = htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_24);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_24;
+ sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_RESPONSE_TO_USER_REQ, oper, SCTP_SO_NOT_LOCKED);
+ } else {
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT);
+ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ sctp_stop_timers_for_shutdown(stcb);
+ sctp_send_shutdown(stcb,
+ stcb->asoc.primary_destination);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN,
+ stcb->sctp_ep, stcb, asoc->primary_destination);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
+ stcb->sctp_ep, stcb, asoc->primary_destination);
+ }
+ } else if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) &&
+ (asoc->stream_queue_cnt == 0)) {
+ if (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT) {
+ goto abort_out_now;
+ }
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT);
+ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ sctp_send_shutdown_ack(stcb,
+ stcb->asoc.primary_destination);
+ sctp_stop_timers_for_shutdown(stcb);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK,
+ stcb->sctp_ep, stcb, asoc->primary_destination);
+ }
+ }
+ /*********************************************/
+ /* Here we perform PR-SCTP procedures */
+ /* (section 4.2) */
+ /*********************************************/
+ /* C1. update advancedPeerAckPoint */
+ if (compare_with_wrap(cumack, asoc->advanced_peer_ack_point, MAX_TSN)) {
+ asoc->advanced_peer_ack_point = cumack;
+ }
+ /* PR-SCTP issues need to be addressed too */
+ if ((asoc->peer_supports_prsctp) && (asoc->pr_sctp_cnt > 0)) {
+ struct sctp_tmit_chunk *lchk;
+ uint32_t old_adv_peer_ack_point;
+
+ old_adv_peer_ack_point = asoc->advanced_peer_ack_point;
+ lchk = sctp_try_advance_peer_ack_point(stcb, asoc);
+ /* C3. See if we need to send a Fwd-TSN */
+ if (compare_with_wrap(asoc->advanced_peer_ack_point, cumack,
+ MAX_TSN)) {
+ /*
+ * ISSUE with ECN, see FWD-TSN processing for notes
+ * on issues that will occur when the ECN NONCE
+ * stuff is put into SCTP for cross checking.
+ */
+ if (compare_with_wrap(asoc->advanced_peer_ack_point, old_adv_peer_ack_point,
+ MAX_TSN)) {
+ send_forward_tsn(stcb, asoc);
+ /*
+ * ECN Nonce: Disable Nonce Sum check when
+ * FWD TSN is sent and store resync tsn
+ */
+ asoc->nonce_sum_check = 0;
+ asoc->nonce_resync_tsn = asoc->advanced_peer_ack_point;
+ } else if (lchk) {
+ /* try to FR fwd-tsn's that get lost too */
+ if (lchk->rec.data.fwd_tsn_cnt >= 3) {
+ send_forward_tsn(stcb, asoc);
+ }
+ }
+ }
+ if (lchk) {
+ /* Assure a timer is up */
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND,
+ stcb->sctp_ep, stcb, lchk->whoTo);
+ }
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_RWND_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_SACK_RWND_UPDATE,
+ rwnd,
+ stcb->asoc.peers_rwnd,
+ stcb->asoc.total_flight,
+ stcb->asoc.total_output_queue_size);
+ }
+}
+
+void
+sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
+ struct sctp_tcb *stcb, struct sctp_nets *net_from,
+ uint16_t num_seg, uint16_t num_nr_seg, uint16_t num_dup,
+ int *abort_now, uint8_t flags,
+ uint32_t cum_ack, uint32_t rwnd)
+{
+ struct sctp_association *asoc;
+ struct sctp_tmit_chunk *tp1, *tp2;
+ uint32_t last_tsn, biggest_tsn_acked, biggest_tsn_newly_acked, this_sack_lowest_newack;
+ uint32_t sav_cum_ack;
+ uint16_t wake_him = 0;
+ uint32_t send_s = 0;
+ long j;
+ int accum_moved = 0;
+ int will_exit_fast_recovery = 0;
+ uint32_t a_rwnd, old_rwnd;
+ int win_probe_recovery = 0;
+ int win_probe_recovered = 0;
+ struct sctp_nets *net = NULL;
+ int nonce_sum_flag, ecn_seg_sums = 0;
+ int done_once;
+ uint8_t reneged_all = 0;
+ uint8_t cmt_dac_flag;
+
+ /*
+ * we take any chance we can to service our queues since we cannot
+ * get awoken when the socket is read from :<
+ */
+ /*
+ * Now perform the actual SACK handling:
+ * 1) Verify that it is not an old sack, if so discard.
+ * 2) If there is nothing left in the send queue (cum-ack is equal
+ *    to last acked) then you have a duplicate too; update any rwnd
+ *    change, verify no timers are running, then return.
+ * 3) Process any new consecutive data, i.e. cum-ack moved; process
+ *    these first and note that it moved.
+ * 4) Process any sack blocks.
+ * 5) Drop any acked chunks from the queue.
+ * 6) Check for any revoked blocks and mark them.
+ * 7) Update the cwnd.
+ * 8) Nothing left, sync up flightsizes and things, stop all timers
+ *    and also check for shutdown_pending state. If so then go ahead
+ *    and send off the shutdown. If in shutdown recv, send off the
+ *    shutdown-ack and start that timer, then return.
+ * 9) Strike any non-acked things and do FR procedure if needed,
+ *    being sure to set the FR flag.
+ * 10) Do PR-SCTP procedures.
+ * 11) Apply any FR penalties.
+ * 12) Assure we will SACK if in shutdown_recv state.
+ */
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ /* CMT DAC algo */
+ this_sack_lowest_newack = 0;
+ j = 0;
+ SCTP_STAT_INCR(sctps_slowpath_sack);
+ last_tsn = cum_ack;
+ nonce_sum_flag = flags & SCTP_SACK_NONCE_SUM;
+ cmt_dac_flag = flags & SCTP_SACK_CMT_DAC;
+#ifdef SCTP_ASOCLOG_OF_TSNS
+ stcb->asoc.cumack_log[stcb->asoc.cumack_log_at] = cum_ack;
+ stcb->asoc.cumack_log_at++;
+ if (stcb->asoc.cumack_log_at > SCTP_TSN_LOG_SIZE) {
+ stcb->asoc.cumack_log_at = 0;
+ }
+#endif
+ a_rwnd = rwnd;
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_SACK_ARRIVALS_ENABLE) {
+ sctp_misc_ints(SCTP_SACK_LOG_NORMAL, cum_ack,
+ rwnd, stcb->asoc.last_acked_seq, stcb->asoc.peers_rwnd);
+ }
+ old_rwnd = stcb->asoc.peers_rwnd;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INDATA,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ asoc = &stcb->asoc;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) {
+ sctp_log_sack(asoc->last_acked_seq,
+ cum_ack,
+ 0,
+ num_seg,
+ num_dup,
+ SCTP_LOG_NEW_SACK);
+ }
+ if ((num_dup) && (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_FR_LOGGING_ENABLE | SCTP_EARLYFR_LOGGING_ENABLE))) {
+ uint16_t i;
+ uint32_t *dupdata, dblock;
+
+ for (i = 0; i < num_dup; i++) {
+ dupdata = (uint32_t *) sctp_m_getptr(m, offset_dup + i * sizeof(uint32_t),
+ sizeof(uint32_t), (uint8_t *) & dblock);
+ if (dupdata == NULL) {
+ break;
+ }
+ sctp_log_fr(*dupdata, 0, 0, SCTP_FR_DUPED);
+ }
+ }
+ if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) {
+ /* reality check */
+ if (!TAILQ_EMPTY(&asoc->sent_queue)) {
+ tp1 = TAILQ_LAST(&asoc->sent_queue,
+ sctpchunk_listhead);
+ send_s = tp1->rec.data.TSN_seq + 1;
+ } else {
+ tp1 = NULL;
+ send_s = asoc->sending_seq;
+ }
+ if (cum_ack == send_s ||
+ compare_with_wrap(cum_ack, send_s, MAX_TSN)) {
+ struct mbuf *oper;
+
+ /*
+ * no way, we have not even sent this TSN out yet.
+ * Peer is hopelessly messed up with us.
+ */
+ printf("NEW cum_ack:%x send_s:%x is smaller or equal\n",
+ cum_ack, send_s);
+ if (tp1) {
+ printf("Got send_s from tsn:%x + 1 of tp1:%p\n",
+ tp1->rec.data.TSN_seq, tp1);
+ }
+ hopeless_peer:
+ *abort_now = 1;
+ /* XXX */
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
+ sizeof(uint32_t);
+ ph = mtod(oper, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length = htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_25);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25;
+ sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+ return;
+ }
+ }
+ /**********************/
+ /* 1) check the range */
+ /**********************/
+ if (compare_with_wrap(asoc->last_acked_seq, last_tsn, MAX_TSN)) {
+ /* acking something behind */
+ return;
+ }
+ sav_cum_ack = asoc->last_acked_seq;
+
+ /* update the Rwnd of the peer */
+ if (TAILQ_EMPTY(&asoc->sent_queue) &&
+ TAILQ_EMPTY(&asoc->send_queue) &&
+ (asoc->stream_queue_cnt == 0)) {
+ /* nothing left on send/sent and strmq */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) {
+ sctp_log_rwnd_set(SCTP_SET_PEER_RWND_VIA_SACK,
+ asoc->peers_rwnd, 0, 0, a_rwnd);
+ }
+ asoc->peers_rwnd = a_rwnd;
+ if (asoc->sent_queue_retran_cnt) {
+ asoc->sent_queue_retran_cnt = 0;
+ }
+ if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) {
+ /* SWS sender side engages */
+ asoc->peers_rwnd = 0;
+ }
+ /* stop any timers */
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+ stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_26);
+ if (SCTP_BASE_SYSCTL(sctp_early_fr)) {
+ if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+ SCTP_STAT_INCR(sctps_earlyfrstpidsck1);
+ sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_26);
+ }
+ }
+ net->partial_bytes_acked = 0;
+ net->flight_size = 0;
+ }
+ asoc->total_flight = 0;
+ asoc->total_flight_count = 0;
+ return;
+ }
+ /*
+ * We init netAckSz and netAckSz2 to 0. These are used to track 2
+ * things. The total byte count acked is tracked in netAckSz AND
+ * netAck2 is used to track the total bytes acked that are
+ * unambiguous and were never retransmitted. We track these on a
+ * per destination address basis.
+ */
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ net->prev_cwnd = net->cwnd;
+ net->net_ack = 0;
+ net->net_ack2 = 0;
+
+ /*
+ * CMT: Reset CUC and Fast recovery algo variables before
+ * SACK processing
+ */
+ net->new_pseudo_cumack = 0;
+ net->will_exit_fast_recovery = 0;
+ }
+ /* process the new consecutive TSN first */
+ tp1 = TAILQ_FIRST(&asoc->sent_queue);
+ while (tp1) {
+ if (compare_with_wrap(last_tsn, tp1->rec.data.TSN_seq,
+ MAX_TSN) ||
+ last_tsn == tp1->rec.data.TSN_seq) {
+ if (tp1->sent != SCTP_DATAGRAM_UNSENT) {
+ /*
+ * ECN Nonce: Add the nonce to the sender's
+ * nonce sum
+ */
+ asoc->nonce_sum_expect_base += tp1->rec.data.ect_nonce;
+ accum_moved = 1;
+ if (tp1->sent < SCTP_DATAGRAM_ACKED) {
+ /*
+ * If it is less than ACKED, it is
+ * now no longer in flight. Higher
+ * values may occur during marking
+ */
+ if ((tp1->whoTo->dest_state &
+ SCTP_ADDR_UNCONFIRMED) &&
+ (tp1->snd_count < 2)) {
+ /*
+ * If there was no retran
+ * and the address is
+ * un-confirmed and we sent
+ * there and are now
+ * sacked... it's confirmed,
+ * mark it so.
+ */
+ tp1->whoTo->dest_state &=
+ ~SCTP_ADDR_UNCONFIRMED;
+ }
+ if (tp1->sent < SCTP_DATAGRAM_RESEND) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_CA,
+ tp1->whoTo->flight_size,
+ tp1->book_size,
+ (uintptr_t) tp1->whoTo,
+ tp1->rec.data.TSN_seq);
+ }
+ sctp_flight_size_decrease(tp1);
+ sctp_total_flight_decrease(stcb, tp1);
+ }
+ tp1->whoTo->net_ack += tp1->send_size;
+
+ /* CMT SFR and DAC algos */
+ this_sack_lowest_newack = tp1->rec.data.TSN_seq;
+ tp1->whoTo->saw_newack = 1;
+
+ if (tp1->snd_count < 2) {
+ /*
+ * True non-retransmitted
+ * chunk
+ */
+ tp1->whoTo->net_ack2 +=
+ tp1->send_size;
+
+ /* update RTO too? */
+ if (tp1->do_rtt) {
+ tp1->whoTo->RTO =
+ sctp_calculate_rto(stcb,
+ asoc, tp1->whoTo,
+ &tp1->sent_rcv_time,
+ sctp_align_safe_nocopy);
+ tp1->do_rtt = 0;
+ }
+ }
+ /*
+ * CMT: CUCv2 algorithm. From the
+ * cumack'd TSNs, for each TSN being
+ * acked for the first time, set the
+ * following variables for the
+ * corresp destination.
+ * new_pseudo_cumack will trigger a
+ * cwnd update.
+ * find_(rtx_)pseudo_cumack will
+ * trigger search for the next
+ * expected (rtx-)pseudo-cumack.
+ */
+ tp1->whoTo->new_pseudo_cumack = 1;
+ tp1->whoTo->find_pseudo_cumack = 1;
+ tp1->whoTo->find_rtx_pseudo_cumack = 1;
+
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) {
+ sctp_log_sack(asoc->last_acked_seq,
+ cum_ack,
+ tp1->rec.data.TSN_seq,
+ 0,
+ 0,
+ SCTP_LOG_TSN_ACKED);
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, tp1->whoTo, tp1->rec.data.TSN_seq, SCTP_CWND_LOG_FROM_SACK);
+ }
+ }
+ if (tp1->sent == SCTP_DATAGRAM_RESEND) {
+ sctp_ucount_decr(asoc->sent_queue_retran_cnt);
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xB3,
+ (asoc->sent_queue_retran_cnt & 0x000000ff));
+#endif
+ }
+ if (tp1->rec.data.chunk_was_revoked) {
+ /* deflate the cwnd */
+ tp1->whoTo->cwnd -= tp1->book_size;
+ tp1->rec.data.chunk_was_revoked = 0;
+ }
+ tp1->sent = SCTP_DATAGRAM_ACKED;
+ }
+ } else {
+ break;
+ }
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ }
+ biggest_tsn_newly_acked = biggest_tsn_acked = last_tsn;
+ /* always set this up to cum-ack */
+ asoc->this_sack_highest_gap = last_tsn;
+
+ if ((num_seg > 0) || (num_nr_seg > 0)) {
+
+ /*
+ * CMT: SFR algo (and HTNA) - this_sack_highest_newack has
+ * to be greater than the cumack. Also reset saw_newack to 0
+ * for all dests.
+ */
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ net->saw_newack = 0;
+ net->this_sack_highest_newack = last_tsn;
+ }
+
+ /*
+ * thisSackHighestGap will increase while handling NEW
+ * segments; this_sack_highest_newack will increase while
+ * handling NEWLY ACKED chunks. this_sack_lowest_newack is
+ * used for the CMT DAC algo. saw_newack will also change.
+ */
+ if (sctp_handle_segments(m, &offset_seg, stcb, asoc, last_tsn, &biggest_tsn_acked,
+ &biggest_tsn_newly_acked, &this_sack_lowest_newack,
+ num_seg, num_nr_seg, &ecn_seg_sums)) {
+ wake_him++;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) {
+ /*
+ * validate the biggest_tsn_acked in the gap acks if
+ * strict adherence is wanted.
+ */
+ if ((biggest_tsn_acked == send_s) ||
+ (compare_with_wrap(biggest_tsn_acked, send_s, MAX_TSN))) {
+ /*
+ * peer is either confused or we are under
+ * attack. We must abort.
+ */
+ printf("Hopeless peer! biggest_tsn_acked:%x largest seq:%x\n",
+ biggest_tsn_acked,
+ send_s);
+
+ goto hopeless_peer;
+ }
+ }
+ }
+ /*******************************************/
+ /* cancel ALL T3-send timer if accum moved */
+ /*******************************************/
+ if (asoc->sctp_cmt_on_off == 1) {
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ if (net->new_pseudo_cumack)
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+ stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_27);
+
+ }
+ } else {
+ if (accum_moved) {
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+ stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_28);
+ }
+ }
+ }
+ /********************************************/
+ /* drop the acked chunks from the sentqueue */
+ /********************************************/
+ asoc->last_acked_seq = cum_ack;
+
+ tp1 = TAILQ_FIRST(&asoc->sent_queue);
+ if (tp1 == NULL)
+ goto done_with_it;
+ do {
+ if (compare_with_wrap(tp1->rec.data.TSN_seq, cum_ack,
+ MAX_TSN)) {
+ break;
+ }
+ if (tp1->sent == SCTP_DATAGRAM_UNSENT) {
+ /* no more sent on list */
+ printf("Warning, tp1->sent == %d and its now acked?\n",
+ tp1->sent);
+ }
+ tp2 = TAILQ_NEXT(tp1, sctp_next);
+ TAILQ_REMOVE(&asoc->sent_queue, tp1, sctp_next);
+ if (tp1->pr_sctp_on) {
+ if (asoc->pr_sctp_cnt != 0)
+ asoc->pr_sctp_cnt--;
+ }
+ if (TAILQ_EMPTY(&asoc->sent_queue) &&
+ (asoc->total_flight > 0)) {
+#ifdef INVARIANTS
+ panic("Warning flight size is postive and should be 0");
+#else
+ SCTP_PRINTF("Warning flight size incorrect should be 0 is %d\n",
+ asoc->total_flight);
+#endif
+ asoc->total_flight = 0;
+ }
+ if (tp1->data) {
+ /* sa_ignore NO_NULL_CHK */
+ sctp_free_bufspace(stcb, asoc, tp1, 1);
+ sctp_m_freem(tp1->data);
+ if (asoc->peer_supports_prsctp && PR_SCTP_BUF_ENABLED(tp1->flags)) {
+ asoc->sent_queue_cnt_removeable--;
+ }
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) {
+ sctp_log_sack(asoc->last_acked_seq,
+ cum_ack,
+ tp1->rec.data.TSN_seq,
+ 0,
+ 0,
+ SCTP_LOG_FREE_SENT);
+ }
+ tp1->data = NULL;
+ asoc->sent_queue_cnt--;
+ sctp_free_a_chunk(stcb, tp1);
+ wake_him++;
+ tp1 = tp2;
+ } while (tp1 != NULL);
+
+done_with_it:
+ /* sa_ignore NO_NULL_CHK */
+ if ((wake_him) && (stcb->sctp_socket)) {
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+ SOCKBUF_LOCK(&stcb->sctp_socket->so_snd);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_WAKE_LOGGING_ENABLE) {
+ sctp_wakeup_log(stcb, cum_ack, wake_him, SCTP_WAKESND_FROM_SACK);
+ }
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ /* assoc was freed while we were unlocked */
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return;
+ }
+#endif
+ sctp_sowwakeup_locked(stcb->sctp_ep, stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_WAKE_LOGGING_ENABLE) {
+ sctp_wakeup_log(stcb, cum_ack, wake_him, SCTP_NOWAKE_FROM_SACK);
+ }
+ }
+
+ if (asoc->fast_retran_loss_recovery && accum_moved) {
+ if (compare_with_wrap(asoc->last_acked_seq,
+ asoc->fast_recovery_tsn, MAX_TSN) ||
+ asoc->last_acked_seq == asoc->fast_recovery_tsn) {
+ /* Setup so we will exit RFC2582 fast recovery */
+ will_exit_fast_recovery = 1;
+ }
+ }
+ /*
+ * Check for revoked fragments:
+ *
+ * If the previous sack had no frags then we cannot have any revoked.
+ * If the previous sack had frags then:
+ * - if we now have frags (num_seg > 0), call sctp_check_for_revoked()
+ *   to tell if the peer revoked some of them;
+ * - else the peer revoked all ACKED fragments, since we had some
+ *   before and now we have NONE.
+ */
+
+ if (num_seg) {
+ sctp_check_for_revoked(stcb, asoc, cum_ack, biggest_tsn_acked);
+ asoc->saw_sack_with_frags = 1;
+ } else if (asoc->saw_sack_with_frags) {
+ int cnt_revoked = 0;
+
+ tp1 = TAILQ_FIRST(&asoc->sent_queue);
+ if (tp1 != NULL) {
+ /* Peer revoked all dg's marked or acked */
+ TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) {
+ if (tp1->sent == SCTP_DATAGRAM_ACKED) {
+ tp1->sent = SCTP_DATAGRAM_SENT;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_UP_REVOKE,
+ tp1->whoTo->flight_size,
+ tp1->book_size,
+ (uintptr_t) tp1->whoTo,
+ tp1->rec.data.TSN_seq);
+ }
+ sctp_flight_size_increase(tp1);
+ sctp_total_flight_increase(stcb, tp1);
+ tp1->rec.data.chunk_was_revoked = 1;
+ /*
+ * To ensure that this increase in
+ * flightsize, which is artificial,
+ * does not throttle the sender, we
+ * also increase the cwnd
+ * artificially.
+ */
+ tp1->whoTo->cwnd += tp1->book_size;
+ cnt_revoked++;
+ }
+ }
+ if (cnt_revoked) {
+ reneged_all = 1;
+ }
+ }
+ asoc->saw_sack_with_frags = 0;
+ }
+ if (num_nr_seg > 0)
+ asoc->saw_sack_with_nr_frags = 1;
+ else
+ asoc->saw_sack_with_nr_frags = 0;
+
+ /* JRS - Use the congestion control given in the CC module */
+ asoc->cc_functions.sctp_cwnd_update_after_sack(stcb, asoc, accum_moved, reneged_all, will_exit_fast_recovery);
+
+ if (TAILQ_EMPTY(&asoc->sent_queue)) {
+ /* nothing left in-flight */
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ /* stop all timers */
+ if (SCTP_BASE_SYSCTL(sctp_early_fr)) {
+ if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+ SCTP_STAT_INCR(sctps_earlyfrstpidsck4);
+ sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_29);
+ }
+ }
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+ stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_30);
+ net->flight_size = 0;
+ net->partial_bytes_acked = 0;
+ }
+ asoc->total_flight = 0;
+ asoc->total_flight_count = 0;
+ }
+ /**********************************/
+ /* Now what about shutdown issues */
+ /**********************************/
+ if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue)) {
+ /* nothing left on sendqueue.. consider done */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) {
+ sctp_log_rwnd_set(SCTP_SET_PEER_RWND_VIA_SACK,
+ asoc->peers_rwnd, 0, 0, a_rwnd);
+ }
+ asoc->peers_rwnd = a_rwnd;
+ if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) {
+ /* SWS sender side engages */
+ asoc->peers_rwnd = 0;
+ }
+ /* clean up */
+ if ((asoc->stream_queue_cnt == 1) &&
+ ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) ||
+ (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED)) &&
+ (asoc->locked_on_sending)
+ ) {
+ struct sctp_stream_queue_pending *sp;
+
+ /*
+ * I may be in a state where we got it all across... but
+ * cannot write more due to a shutdown... we abort
+ * since the user did not indicate EOR in this case.
+ */
+ sp = TAILQ_LAST(&((asoc->locked_on_sending)->outqueue),
+ sctp_streamhead);
+ if ((sp) && (sp->length == 0)) {
+ asoc->locked_on_sending = NULL;
+ if (sp->msg_is_complete) {
+ asoc->stream_queue_cnt--;
+ } else {
+ asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
+ asoc->stream_queue_cnt--;
+ }
+ }
+ }
+ if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) &&
+ (asoc->stream_queue_cnt == 0)) {
+ if (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT) {
+ /* Need to abort here */
+ struct mbuf *oper;
+
+ abort_out_now:
+ *abort_now = 1;
+ /* XXX */
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
+ sizeof(uint32_t);
+ ph = mtod(oper, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT);
+ ph->param_length = htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_31);
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_31;
+ sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_RESPONSE_TO_USER_REQ, oper, SCTP_SO_NOT_LOCKED);
+ return;
+ } else {
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT);
+ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ sctp_stop_timers_for_shutdown(stcb);
+ sctp_send_shutdown(stcb,
+ stcb->asoc.primary_destination);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN,
+ stcb->sctp_ep, stcb, asoc->primary_destination);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
+ stcb->sctp_ep, stcb, asoc->primary_destination);
+ }
+ return;
+ } else if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) &&
+ (asoc->stream_queue_cnt == 0)) {
+ if (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT) {
+ goto abort_out_now;
+ }
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT);
+ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ sctp_send_shutdown_ack(stcb,
+ stcb->asoc.primary_destination);
+ sctp_stop_timers_for_shutdown(stcb);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK,
+ stcb->sctp_ep, stcb, asoc->primary_destination);
+ return;
+ }
+ }
+ /*
+ * Now here we are going to recycle net_ack for a different use...
+ * HEADS UP.
+ */
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ net->net_ack = 0;
+ }
+
+ /*
+ * CMT DAC algorithm: If SACK DAC flag was 0, then no extra marking
+ * to be done. Setting this_sack_lowest_newack to the cum_ack will
+ * automatically ensure that.
+ */
+ if ((asoc->sctp_cmt_on_off == 1) &&
+ SCTP_BASE_SYSCTL(sctp_cmt_use_dac) &&
+ (cmt_dac_flag == 0)) {
+ this_sack_lowest_newack = cum_ack;
+ }
+ if ((num_seg > 0) || (num_nr_seg > 0)) {
+ sctp_strike_gap_ack_chunks(stcb, asoc, biggest_tsn_acked,
+ biggest_tsn_newly_acked, this_sack_lowest_newack, accum_moved);
+ }
+ /* JRS - Use the congestion control given in the CC module */
+ asoc->cc_functions.sctp_cwnd_update_after_fr(stcb, asoc);
+
+ /******************************************************************
+ * Here we do the stuff with ECN Nonce checking.
+ * We basically check to see if the nonce sum flag was incorrect
+ * or if resynchronization needs to be done. Also if we catch a
+ * misbehaving receiver we give him the kick.
+ ******************************************************************/
+
+ if (asoc->ecn_nonce_allowed) {
+ if (asoc->nonce_sum_check) {
+ if (nonce_sum_flag != ((asoc->nonce_sum_expect_base + ecn_seg_sums) & SCTP_SACK_NONCE_SUM)) {
+ if (asoc->nonce_wait_for_ecne == 0) {
+ struct sctp_tmit_chunk *lchk;
+
+ lchk = TAILQ_FIRST(&asoc->send_queue);
+ asoc->nonce_wait_for_ecne = 1;
+ if (lchk) {
+ asoc->nonce_wait_tsn = lchk->rec.data.TSN_seq;
+ } else {
+ asoc->nonce_wait_tsn = asoc->sending_seq;
+ }
+ } else {
+ if (compare_with_wrap(asoc->last_acked_seq, asoc->nonce_wait_tsn, MAX_TSN) ||
+ (asoc->last_acked_seq == asoc->nonce_wait_tsn)) {
+ /*
+ * Misbehaving peer. We need
+ * to react to this guy
+ */
+ asoc->ecn_allowed = 0;
+ asoc->ecn_nonce_allowed = 0;
+ }
+ }
+ }
+ } else {
+ /* See if Resynchronization Possible */
+ if (compare_with_wrap(asoc->last_acked_seq, asoc->nonce_resync_tsn, MAX_TSN)) {
+ asoc->nonce_sum_check = 1;
+ /*
+ * Now we must calculate what the base is.
+ * We do this based on two things: we know
+ * the totals for all the segments
+ * gap-acked in the SACK, stored in
+ * ecn_seg_sums. We also know the SACK's
+ * nonce sum, it is in nonce_sum_flag. So we
+ * can build a truth table to back-calculate
+ * the new value of
+ * asoc->nonce_sum_expect_base:
+ *
+ * SACK-flag-Value  Seg-Sums  Base
+ *        0            0       0
+ *        1            0       1
+ *        0            1       1
+ *        1            1       0
+ */
+ asoc->nonce_sum_expect_base = (ecn_seg_sums ^ nonce_sum_flag) & SCTP_SACK_NONCE_SUM;
+ }
+ }
+ }
+ /* Now are we exiting loss recovery ? */
+ if (will_exit_fast_recovery) {
+ /* Ok, we must exit fast recovery */
+ asoc->fast_retran_loss_recovery = 0;
+ }
+ if ((asoc->sat_t3_loss_recovery) &&
+ ((compare_with_wrap(asoc->last_acked_seq, asoc->sat_t3_recovery_tsn,
+ MAX_TSN) ||
+ (asoc->last_acked_seq == asoc->sat_t3_recovery_tsn)))) {
+ /* end satellite t3 loss recovery */
+ asoc->sat_t3_loss_recovery = 0;
+ }
+ /*
+ * CMT Fast recovery
+ */
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ if (net->will_exit_fast_recovery) {
+ /* Ok, we must exit fast recovery */
+ net->fast_retran_loss_recovery = 0;
+ }
+ }
+
+ /* Adjust and set the new rwnd value */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) {
+ sctp_log_rwnd_set(SCTP_SET_PEER_RWND_VIA_SACK,
+ asoc->peers_rwnd, asoc->total_flight, (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)), a_rwnd);
+ }
+ asoc->peers_rwnd = sctp_sbspace_sub(a_rwnd,
+ (uint32_t) (asoc->total_flight + (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh))));
+ if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) {
+ /* SWS sender side engages */
+ asoc->peers_rwnd = 0;
+ }
+ if (asoc->peers_rwnd > old_rwnd) {
+ win_probe_recovery = 1;
+ }
+ /*
+ * Now we must setup so we have a timer up for anyone with
+ * outstanding data.
+ */
+ done_once = 0;
+again:
+ j = 0;
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ if (win_probe_recovery && (net->window_probe)) {
+ win_probe_recovered = 1;
+ /*-
+ * Find first chunk that was used with
+ * window probe and clear the event. Put
+ * it back into the send queue as if it has
+ * not been sent.
+ */
+ TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) {
+ if (tp1->window_probe) {
+ sctp_window_probe_recovery(stcb, asoc, net, tp1);
+ break;
+ }
+ }
+ }
+ if (net->flight_size) {
+ j++;
+ if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND,
+ stcb->sctp_ep, stcb, net);
+ }
+ if (net->window_probe) {
+ net->window_probe = 0;
+ }
+ } else {
+ if (net->window_probe) {
+ /*
+ * In window probes we must assure a timer
+ * is still running there
+ */
+ if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND,
+ stcb->sctp_ep, stcb, net);
+
+ }
+ } else if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+ stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_22);
+ }
+ if (SCTP_BASE_SYSCTL(sctp_early_fr)) {
+ if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+ SCTP_STAT_INCR(sctps_earlyfrstpidsck4);
+ sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_23);
+ }
+ }
+ }
+ }
+ if ((j == 0) &&
+ (!TAILQ_EMPTY(&asoc->sent_queue)) &&
+ (asoc->sent_queue_retran_cnt == 0) &&
+ (win_probe_recovered == 0) &&
+ (done_once == 0)) {
+ /*
+ * huh, this should not happen unless all packets are
+ * PR-SCTP and marked to skip of course.
+ */
+ if (sctp_fs_audit(asoc)) {
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ net->flight_size = 0;
+ }
+ asoc->total_flight = 0;
+ asoc->total_flight_count = 0;
+ asoc->sent_queue_retran_cnt = 0;
+ TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) {
+ if (tp1->sent < SCTP_DATAGRAM_RESEND) {
+ sctp_flight_size_increase(tp1);
+ sctp_total_flight_increase(stcb, tp1);
+ } else if (tp1->sent == SCTP_DATAGRAM_RESEND) {
+ sctp_ucount_incr(asoc->sent_queue_retran_cnt);
+ }
+ }
+ }
+ done_once = 1;
+ goto again;
+ }
+ /*********************************************/
+ /* Here we perform PR-SCTP procedures */
+ /* (section 4.2) */
+ /*********************************************/
+ /* C1. update advancedPeerAckPoint */
+ if (compare_with_wrap(cum_ack, asoc->advanced_peer_ack_point, MAX_TSN)) {
+ asoc->advanced_peer_ack_point = cum_ack;
+ }
+ /* C2. try to further move advancedPeerAckPoint ahead */
+ if ((asoc->peer_supports_prsctp) && (asoc->pr_sctp_cnt > 0)) {
+ struct sctp_tmit_chunk *lchk;
+ uint32_t old_adv_peer_ack_point;
+
+ old_adv_peer_ack_point = asoc->advanced_peer_ack_point;
+ lchk = sctp_try_advance_peer_ack_point(stcb, asoc);
+ /* C3. See if we need to send a Fwd-TSN */
+ if (compare_with_wrap(asoc->advanced_peer_ack_point, cum_ack,
+ MAX_TSN)) {
+ /*
+ * ISSUE with ECN, see FWD-TSN processing for notes
+ * on issues that will occur when the ECN NONCE
+ * stuff is put into SCTP for cross checking.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
+ sctp_misc_ints(SCTP_FWD_TSN_CHECK,
+ 0xee, cum_ack, asoc->advanced_peer_ack_point,
+ old_adv_peer_ack_point);
+ }
+ if (compare_with_wrap(asoc->advanced_peer_ack_point, old_adv_peer_ack_point,
+ MAX_TSN)) {
+
+ send_forward_tsn(stcb, asoc);
+ /*
+ * ECN Nonce: Disable Nonce Sum check when
+ * FWD TSN is sent and store resync tsn
+ */
+ asoc->nonce_sum_check = 0;
+ asoc->nonce_resync_tsn = asoc->advanced_peer_ack_point;
+ } else if (lchk) {
+ /* try to FR fwd-tsn's that get lost too */
+ if (lchk->rec.data.fwd_tsn_cnt >= 3) {
+ send_forward_tsn(stcb, asoc);
+ }
+ }
+ }
+ if (lchk) {
+ /* Assure a timer is up */
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND,
+ stcb->sctp_ep, stcb, lchk->whoTo);
+ }
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_RWND_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_SACK_RWND_UPDATE,
+ a_rwnd,
+ stcb->asoc.peers_rwnd,
+ stcb->asoc.total_flight,
+ stcb->asoc.total_output_queue_size);
+ }
+}
+
+void
+sctp_update_acked(struct sctp_tcb *stcb, struct sctp_shutdown_chunk *cp,
+ struct sctp_nets *netp, int *abort_flag)
+{
+ /* Copy cum-ack */
+ uint32_t cum_ack, a_rwnd;
+
+ cum_ack = ntohl(cp->cumulative_tsn_ack);
+ /* Arrange so a_rwnd does NOT change */
+ a_rwnd = stcb->asoc.peers_rwnd + stcb->asoc.total_flight;
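+ /*
+ * Note: the express SACK handler recomputes peers_rwnd by
+ * subtracting the flight size again, so adding total_flight
+ * here is presumably what keeps the advertised rwnd unchanged.
+ */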
+
+ /* Now call the express sack handling */
+ sctp_express_handle_sack(stcb, cum_ack, a_rwnd, 0, abort_flag);
+}
+
+static void
+sctp_kick_prsctp_reorder_queue(struct sctp_tcb *stcb,
+ struct sctp_stream_in *strmin)
+{
+ struct sctp_queued_to_read *ctl, *nctl;
+ struct sctp_association *asoc;
+ uint16_t tt;
+
+ asoc = &stcb->asoc;
+ tt = strmin->last_sequence_delivered;
+ /*
+ * First deliver anything prior to and including the stream
+ * sequence number that came in.
+ */
+ ctl = TAILQ_FIRST(&strmin->inqueue);
+ while (ctl) {
+ nctl = TAILQ_NEXT(ctl, next);
+ if (compare_with_wrap(tt, ctl->sinfo_ssn, MAX_SEQ) ||
+ (tt == ctl->sinfo_ssn)) {
+ /* this is deliverable now */
+ TAILQ_REMOVE(&strmin->inqueue, ctl, next);
+ /* subtract pending on streams */
+ asoc->size_on_all_streams -= ctl->length;
+ sctp_ucount_decr(asoc->cnt_on_all_streams);
+ /* deliver it to at least the delivery-q */
+ if (stcb->sctp_socket) {
+ sctp_mark_non_revokable(asoc, ctl->sinfo_tsn);
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ ctl,
+ &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED);
+ }
+ } else {
+ /* no more delivery now. */
+ break;
+ }
+ ctl = nctl;
+ }
+ /*
+ * Now we must deliver things in the queue the normal way, if
+ * any are now ready.
+ */
+ tt = strmin->last_sequence_delivered + 1;
+ ctl = TAILQ_FIRST(&strmin->inqueue);
+ while (ctl) {
+ nctl = TAILQ_NEXT(ctl, next);
+ if (tt == ctl->sinfo_ssn) {
+ /* this is deliverable now */
+ TAILQ_REMOVE(&strmin->inqueue, ctl, next);
+ /* subtract pending on streams */
+ asoc->size_on_all_streams -= ctl->length;
+ sctp_ucount_decr(asoc->cnt_on_all_streams);
+ /* deliver it to at least the delivery-q */
+ strmin->last_sequence_delivered = ctl->sinfo_ssn;
+ if (stcb->sctp_socket) {
+ sctp_mark_non_revokable(asoc, ctl->sinfo_tsn);
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ ctl,
+ &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED);
+
+ }
+ tt = strmin->last_sequence_delivered + 1;
+ } else {
+ break;
+ }
+ ctl = nctl;
+ }
+}
+
+static void
+sctp_flush_reassm_for_str_seq(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ uint16_t stream, uint16_t seq)
+{
+ struct sctp_tmit_chunk *chk, *at;
+
+ if (!TAILQ_EMPTY(&asoc->reasmqueue)) {
+ /* For each one on here see if we need to toss it */
+ /*
+ * For now, large messages held on the reasmqueue that are
+ * complete will be tossed too. We could in theory do more
+ * work, spinning through and stopping after dumping one msg
+ * (i.e. on seeing the start of a new msg at the head) and
+ * calling the delivery function to see if it can be
+ * delivered. But for now we just dump everything on the
+ * queue.
+ */
+ chk = TAILQ_FIRST(&asoc->reasmqueue);
+ while (chk) {
+ at = TAILQ_NEXT(chk, sctp_next);
+ /*
+ * Do not toss it if on a different stream or marked
+ * for unordered delivery in which case the stream
+ * sequence number has no meaning.
+ */
+ if ((chk->rec.data.stream_number != stream) ||
+ ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == SCTP_DATA_UNORDERED)) {
+ chk = at;
+ continue;
+ }
+ if (chk->rec.data.stream_seq == seq) {
+ /* It needs to be tossed */
+ TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next);
+ if (compare_with_wrap(chk->rec.data.TSN_seq,
+ asoc->tsn_last_delivered, MAX_TSN)) {
+ asoc->tsn_last_delivered =
+ chk->rec.data.TSN_seq;
+ asoc->str_of_pdapi =
+ chk->rec.data.stream_number;
+ asoc->ssn_of_pdapi =
+ chk->rec.data.stream_seq;
+ asoc->fragment_flags =
+ chk->rec.data.rcv_flags;
+ }
+ asoc->size_on_reasm_queue -= chk->send_size;
+ sctp_ucount_decr(asoc->cnt_on_reasm_queue);
+
+ /* Clear up any stream problem */
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) !=
+ SCTP_DATA_UNORDERED &&
+ (compare_with_wrap(chk->rec.data.stream_seq,
+ asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered,
+ MAX_SEQ))) {
+ /*
+ * We must move forward this stream's
+ * sequence number if the chunk being
+ * skipped is not unordered. There is
+ * a chance that if the peer does not
+ * include the last fragment in its
+ * FWD-TSN we WILL have a problem
+ * here, since you would have a
+ * partial chunk in the queue that may
+ * not be deliverable. Also, if a
+ * partial delivery API has started,
+ * the user may get a partial chunk
+ * with the next read returning a new
+ * chunk... really ugly, but I see no
+ * way around it! Maybe a notify??
+ */
+ asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered =
+ chk->rec.data.stream_seq;
+ }
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ sctp_free_a_chunk(stcb, chk);
+ } else if (compare_with_wrap(chk->rec.data.stream_seq, seq, MAX_SEQ)) {
+ /*
+ * If the stream_seq is > than the purging
+ * one, we are done
+ */
+ break;
+ }
+ chk = at;
+ }
+ }
+}
+
+
+void
+sctp_handle_forward_tsn(struct sctp_tcb *stcb,
+ struct sctp_forward_tsn_chunk *fwd,
+ int *abort_flag, struct mbuf *m, int offset)
+{
+ /*
+ * ISSUES that MUST be fixed for ECN! When we are the sender of the
+ * forward TSN and the SACK comes back that acknowledges the
+ * FWD-TSN, we must reset the NONCE sum to match correctly. This will
+ * get quite tricky since we may have sent more data intervening
+ * and must carefully account for what the SACK says on the nonce
+ * and any gaps that are reported. This work will NOT be done here,
+ * but I note it here since it is really related to PR-SCTP and
+ * FWD-TSNs.
+ */
+
+ /* The PR-SCTP FWD-TSN */
+ /*
+ * Here we will perform all the data receiver side steps for
+ * processing FwdTSN, as required by the PR-SCTP draft.
+ *
+ * Assume we get FwdTSN(x):
+ *
+ * 1) update local cumTSN to x
+ * 2) try to further advance cumTSN to x + others we have
+ * 3) examine and update re-ordering queue on pr-in-streams
+ * 4) clean up re-assembly queue
+ * 5) send a SACK to report where we are.
+ */
+ struct sctp_association *asoc;
+ uint32_t new_cum_tsn, gap;
+ unsigned int i, fwd_sz, cumack_set_flag, m_size;
+ uint32_t str_seq;
+ struct sctp_stream_in *strm;
+ struct sctp_tmit_chunk *chk, *at;
+ struct sctp_queued_to_read *ctl, *sv;
+
+ cumack_set_flag = 0;
+ asoc = &stcb->asoc;
+ if ((fwd_sz = ntohs(fwd->ch.chunk_length)) < sizeof(struct sctp_forward_tsn_chunk)) {
+ SCTPDBG(SCTP_DEBUG_INDATA1,
+ "Bad size too small/big fwd-tsn\n");
+ return;
+ }
+ m_size = (stcb->asoc.mapping_array_size << 3);
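+ /* m_size is the mapping array size in bits, i.e. how many TSNs it can cover */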
+ /*************************************************************/
+ /* 1. Here we update local cumTSN and shift the bitmap array */
+ /*************************************************************/
+ new_cum_tsn = ntohl(fwd->new_cumulative_tsn);
+
+ if (compare_with_wrap(asoc->cumulative_tsn, new_cum_tsn, MAX_TSN) ||
+ asoc->cumulative_tsn == new_cum_tsn) {
+ /* Already got there ... */
+ return;
+ }
+ /*
+ * now we know the new TSN is more advanced, let's find the actual
+ * gap
+ */
+ SCTP_CALC_TSN_TO_GAP(gap, new_cum_tsn, asoc->mapping_array_base_tsn);
+ asoc->cumulative_tsn = new_cum_tsn;
+ if (gap >= m_size) {
+ if ((long)gap > sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv)) {
+ struct mbuf *oper;
+
+ /*
+ * out of range (of single byte chunks in the rwnd I
+ * give out). This must be an attacker.
+ */
+ *abort_flag = 1;
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
+ (sizeof(uint32_t) * 3);
+ ph = mtod(oper, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length = htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_33);
+ ippp++;
+ *ippp = asoc->highest_tsn_inside_map;
+ ippp++;
+ *ippp = new_cum_tsn;
+ }
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_33;
+ sctp_abort_an_association(stcb->sctp_ep, stcb,
+ SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+ return;
+ }
+ SCTP_STAT_INCR(sctps_fwdtsn_map_over);
+
+ memset(stcb->asoc.mapping_array, 0, stcb->asoc.mapping_array_size);
+ asoc->mapping_array_base_tsn = new_cum_tsn + 1;
+ asoc->highest_tsn_inside_map = new_cum_tsn;
+
+ memset(stcb->asoc.nr_mapping_array, 0, stcb->asoc.mapping_array_size);
+ asoc->highest_tsn_inside_nr_map = new_cum_tsn;
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
+ sctp_log_map(0, 3, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT);
+ }
+ asoc->last_echo_tsn = asoc->highest_tsn_inside_map;
+ } else {
+ SCTP_TCB_LOCK_ASSERT(stcb);
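+ /*
+ * Mark every TSN up to the new cumulative ack as received in
+ * the non-renegable map so the later mapping-array slide
+ * treats them as delivered.
+ */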
+ for (i = 0; i <= gap; i++) {
+ if (!SCTP_IS_TSN_PRESENT(asoc->mapping_array, i) &&
+ !SCTP_IS_TSN_PRESENT(asoc->nr_mapping_array, i)) {
+ SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, i);
+ if (compare_with_wrap(asoc->mapping_array_base_tsn + i, asoc->highest_tsn_inside_nr_map, MAX_TSN)) {
+ asoc->highest_tsn_inside_nr_map = asoc->mapping_array_base_tsn + i;
+ }
+ }
+ }
+ }
+ /*************************************************************/
+ /* 2. Clear up re-assembly queue */
+ /*************************************************************/
+ /*
+ * First service it if pd-api is up, just in case we can progress it
+ * forward
+ */
+ if (asoc->fragmented_delivery_inprogress) {
+ sctp_service_reassembly(stcb, asoc);
+ }
+ if (!TAILQ_EMPTY(&asoc->reasmqueue)) {
+ /* For each one on here see if we need to toss it */
+ /*
+ * For now, large messages held on the reasmqueue that are
+ * complete will be tossed too. We could in theory do more
+ * work, spinning through and stopping after dumping one msg
+ * (i.e. on seeing the start of a new msg at the head) and
+ * calling the delivery function to see if it can be
+ * delivered. But for now we just dump everything on the
+ * queue.
+ */
+ chk = TAILQ_FIRST(&asoc->reasmqueue);
+ while (chk) {
+ at = TAILQ_NEXT(chk, sctp_next);
+ if ((compare_with_wrap(new_cum_tsn,
+ chk->rec.data.TSN_seq, MAX_TSN)) ||
+ (new_cum_tsn == chk->rec.data.TSN_seq)) {
+ /* It needs to be tossed */
+ TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next);
+ if (compare_with_wrap(chk->rec.data.TSN_seq,
+ asoc->tsn_last_delivered, MAX_TSN)) {
+ asoc->tsn_last_delivered =
+ chk->rec.data.TSN_seq;
+ asoc->str_of_pdapi =
+ chk->rec.data.stream_number;
+ asoc->ssn_of_pdapi =
+ chk->rec.data.stream_seq;
+ asoc->fragment_flags =
+ chk->rec.data.rcv_flags;
+ }
+ asoc->size_on_reasm_queue -= chk->send_size;
+ sctp_ucount_decr(asoc->cnt_on_reasm_queue);
+
+ /* Clear up any stream problem */
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) !=
+ SCTP_DATA_UNORDERED &&
+ (compare_with_wrap(chk->rec.data.stream_seq,
+ asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered,
+ MAX_SEQ))) {
+ /*
+ * We must move forward this stream's
+ * sequence number if the chunk being
+ * skipped is not unordered. There is
+ * a chance that if the peer does not
+ * include the last fragment in its
+ * FWD-TSN we WILL have a problem
+ * here, since you would have a
+ * partial chunk in the queue that may
+ * not be deliverable. Also, if a
+ * partial delivery API has started,
+ * the user may get a partial chunk
+ * with the next read returning a new
+ * chunk... really ugly, but I see no
+ * way around it! Maybe a notify??
+ */
+ asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered =
+ chk->rec.data.stream_seq;
+ }
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ sctp_free_a_chunk(stcb, chk);
+ } else {
+ /*
+ * Ok we have gone beyond the end of the
+ * fwd-tsn's mark.
+ */
+ break;
+ }
+ chk = at;
+ }
+ }
+ /*******************************************************/
+ /* 3. Update the PR-stream re-ordering queues and fix */
+ /* delivery issues as needed. */
+ /*******************************************************/
+ fwd_sz -= sizeof(*fwd);
+ if (m && fwd_sz) {
+ /* New method. */
+ unsigned int num_str;
+ struct sctp_strseq *stseq, strseqbuf;
+
+ offset += sizeof(*fwd);
+
+ SCTP_INP_READ_LOCK(stcb->sctp_ep);
+ num_str = fwd_sz / sizeof(struct sctp_strseq);
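+ /* the remainder of the FWD-TSN chunk is an array of (stream, sequence) pairs */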
+ for (i = 0; i < num_str; i++) {
+ uint16_t st;
+
+ stseq = (struct sctp_strseq *)sctp_m_getptr(m, offset,
+ sizeof(struct sctp_strseq),
+ (uint8_t *) & strseqbuf);
+ offset += sizeof(struct sctp_strseq);
+ if (stseq == NULL) {
+ break;
+ }
+ /* Convert */
+ st = ntohs(stseq->stream);
+ stseq->stream = st;
+ st = ntohs(stseq->sequence);
+ stseq->sequence = st;
+
+ /* now process */
+
+ /*
+ * Ok, we now look for the stream/seq on the read
+ * queue where it's not all delivered. If we find it
+ * we transmute the read entry into a PDI_ABORTED.
+ */
+ if (stseq->stream >= asoc->streamincnt) {
+ /* screwed up streams, stop! */
+ break;
+ }
+ if ((asoc->str_of_pdapi == stseq->stream) &&
+ (asoc->ssn_of_pdapi == stseq->sequence)) {
+ /*
+ * If this is the one we were partially
+ * delivering, then we no longer are.
+ * Note this will change with the reassembly
+ * re-write.
+ */
+ asoc->fragmented_delivery_inprogress = 0;
+ }
+ sctp_flush_reassm_for_str_seq(stcb, asoc, stseq->stream, stseq->sequence);
+ TAILQ_FOREACH(ctl, &stcb->sctp_ep->read_queue, next) {
+ if ((ctl->sinfo_stream == stseq->stream) &&
+ (ctl->sinfo_ssn == stseq->sequence)) {
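+ /* pack stream number and SSN into one word for the PD-aborted notification */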
+ str_seq = (stseq->stream << 16) | stseq->sequence;
+ ctl->end_added = 1;
+ ctl->pdapi_aborted = 1;
+ sv = stcb->asoc.control_pdapi;
+ stcb->asoc.control_pdapi = ctl;
+ sctp_ulp_notify(SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION,
+ stcb,
+ SCTP_PARTIAL_DELIVERY_ABORTED,
+ (void *)&str_seq,
+ SCTP_SO_NOT_LOCKED);
+ stcb->asoc.control_pdapi = sv;
+ break;
+ } else if ((ctl->sinfo_stream == stseq->stream) &&
+ (compare_with_wrap(ctl->sinfo_ssn, stseq->sequence, MAX_SEQ))) {
+ /* We are past our victim SSN */
+ break;
+ }
+ }
+ strm = &asoc->strmin[stseq->stream];
+ if (compare_with_wrap(stseq->sequence,
+ strm->last_sequence_delivered, MAX_SEQ)) {
+ /* Update the sequence number */
+ strm->last_sequence_delivered =
+ stseq->sequence;
+ }
+ /* now kick the stream the new way */
+ /* sa_ignore NO_NULL_CHK */
+ sctp_kick_prsctp_reorder_queue(stcb, strm);
+ }
+ SCTP_INP_READ_UNLOCK(stcb->sctp_ep);
+ }
+ /*
+ * Now slide thing forward.
+ */
+ sctp_slide_mapping_arrays(stcb);
+
+ if (!TAILQ_EMPTY(&asoc->reasmqueue)) {
+ /* now lets kick out and check for more fragmented delivery */
+ /* sa_ignore NO_NULL_CHK */
+ sctp_deliver_reasm_check(stcb, &stcb->asoc);
+ }
+}
diff --git a/freebsd/sys/netinet/sctp_indata.h b/freebsd/sys/netinet/sctp_indata.h
new file mode 100644
index 00000000..a231ecaf
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_indata.h
@@ -0,0 +1,129 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_indata.h,v 1.9 2005/03/06 16:04:17 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_indata_h__
+#define __sctp_indata_h__
+
+#if defined(_KERNEL) || defined(__Userspace__)
+
+struct sctp_queued_to_read *
+sctp_build_readq_entry(struct sctp_tcb *stcb,
+ struct sctp_nets *net,
+ uint32_t tsn, uint32_t ppid,
+ uint32_t context, uint16_t stream_no,
+ uint16_t stream_seq, uint8_t flags,
+ struct mbuf *dm);
+
+
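+/*
+ * A note on the macro below: it appears to fill in a caller-supplied
+ * read-queue entry in place (taking a reference on the destination net),
+ * as an inline alternative to allocating one via sctp_build_readq_entry()
+ * above.
+ */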
+#define sctp_build_readq_entry_mac(_ctl, in_it, a, net, tsn, ppid, context, stream_no, stream_seq, flags, dm) do { \
+ if (_ctl) { \
+ atomic_add_int(&((net)->ref_count), 1); \
+ (_ctl)->sinfo_stream = stream_no; \
+ (_ctl)->sinfo_ssn = stream_seq; \
+ (_ctl)->sinfo_flags = (flags << 8); \
+ (_ctl)->sinfo_ppid = ppid; \
+ (_ctl)->sinfo_context = a; \
+ (_ctl)->sinfo_timetolive = 0; \
+ (_ctl)->sinfo_tsn = tsn; \
+ (_ctl)->sinfo_cumtsn = tsn; \
+ (_ctl)->sinfo_assoc_id = sctp_get_associd((in_it)); \
+ (_ctl)->length = 0; \
+ (_ctl)->held_length = 0; \
+ (_ctl)->whoFrom = net; \
+ (_ctl)->data = dm; \
+ (_ctl)->tail_mbuf = NULL; \
+ (_ctl)->aux_data = NULL; \
+ (_ctl)->stcb = (in_it); \
+ (_ctl)->port_from = (in_it)->rport; \
+ (_ctl)->spec_flags = 0; \
+ (_ctl)->do_not_ref_stcb = 0; \
+ (_ctl)->end_added = 0; \
+ (_ctl)->pdapi_aborted = 0; \
+ (_ctl)->some_taken = 0; \
+ } \
+} while (0)
+
+
+
+struct mbuf *
+sctp_build_ctl_nchunk(struct sctp_inpcb *inp,
+ struct sctp_sndrcvinfo *sinfo);
+
+char *
+sctp_build_ctl_cchunk(struct sctp_inpcb *inp,
+ int *control_len,
+ struct sctp_sndrcvinfo *sinfo);
+
+void sctp_set_rwnd(struct sctp_tcb *, struct sctp_association *);
+
+uint32_t
+sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc);
+
+void
+sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack,
+ uint32_t rwnd, int nonce_sum_flag, int *abort_now);
+
+void
+sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
+ struct sctp_tcb *stcb, struct sctp_nets *net_from,
+ uint16_t num_seg, uint16_t num_nr_seg, uint16_t num_dup,
+ int *abort_now, uint8_t flags,
+ uint32_t cum_ack, uint32_t rwnd);
+
+/* draft-ietf-tsvwg-usctp */
+void
+sctp_handle_forward_tsn(struct sctp_tcb *,
+ struct sctp_forward_tsn_chunk *, int *, struct mbuf *, int);
+
+struct sctp_tmit_chunk *
+ sctp_try_advance_peer_ack_point(struct sctp_tcb *, struct sctp_association *);
+
+void sctp_service_queues(struct sctp_tcb *, struct sctp_association *);
+
+void
+sctp_update_acked(struct sctp_tcb *, struct sctp_shutdown_chunk *,
+ struct sctp_nets *, int *);
+
+int
+sctp_process_data(struct mbuf **, int, int *, int, struct sctphdr *,
+ struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *, uint32_t *);
+
+void sctp_slide_mapping_arrays(struct sctp_tcb *stcb);
+
+void sctp_sack_check(struct sctp_tcb *, int, int *);
+
+#endif
+#endif
diff --git a/freebsd/sys/netinet/sctp_input.c b/freebsd/sys/netinet/sctp_input.c
new file mode 100644
index 00000000..080813b4
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_input.c
@@ -0,0 +1,5965 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_input.c,v 1.27 2005/03/06 16:04:17 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp_var.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctp_header.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_output.h>
+#include <freebsd/netinet/sctp_input.h>
+#include <freebsd/netinet/sctp_auth.h>
+#include <freebsd/netinet/sctp_indata.h>
+#include <freebsd/netinet/sctp_asconf.h>
+#include <freebsd/netinet/sctp_bsd_addr.h>
+#include <freebsd/netinet/sctp_timer.h>
+#include <freebsd/netinet/sctp_crc32.h>
+#include <freebsd/netinet/udp.h>
+
+
+
+static void
+sctp_stop_all_cookie_timers(struct sctp_tcb *stcb)
+{
+ struct sctp_nets *net;
+
+ /*
+ * This now not only stops all cookie timers, it also stops any
+ * INIT timers. This makes sure that the timers are stopped in
+ * all collision cases.
+ */
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if (net->rxt_timer.type == SCTP_TIMER_TYPE_COOKIE) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_COOKIE,
+ stcb->sctp_ep,
+ stcb,
+ net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_1);
+ } else if (net->rxt_timer.type == SCTP_TIMER_TYPE_INIT) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_INIT,
+ stcb->sctp_ep,
+ stcb,
+ net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_2);
+ }
+ }
+}
+
+/* INIT handler */
+static void
+sctp_handle_init(struct mbuf *m, int iphlen, int offset, struct sctphdr *sh,
+ struct sctp_init_chunk *cp, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net, int *abort_no_unlock, uint32_t vrf_id, uint16_t port)
+{
+ struct sctp_init *init;
+ struct mbuf *op_err;
+ uint32_t init_limit;
+
+ SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_init: handling INIT tcb:%p\n",
+ stcb);
+ if (stcb == NULL) {
+ SCTP_INP_RLOCK(inp);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ goto outnow;
+ }
+ }
+ op_err = NULL;
+ init = &cp->init;
+ /* First are we accepting? */
+ if ((inp->sctp_socket->so_qlimit == 0) && (stcb == NULL)) {
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "sctp_handle_init: Abort, so_qlimit:%d\n",
+ inp->sctp_socket->so_qlimit);
+ /*
+ * FIX ME ?? What about the TCP model when we have a
+ * match/restart case? Actually no fix is needed: the lookup
+ * will always find the existing assoc, so stcb would not be
+ * NULL. It may be questionable to do this since we COULD
+ * just send back the INIT-ACK and hope that the app did
+ * accept()'s by the time the COOKIE was sent. But there is
+ * a price to pay for COOKIE generation and I don't want to
+ * pay it on the chance that the app will actually do some
+ * accepts(). The app just loses and should NOT be in this
+ * state :-)
+ */
+ sctp_abort_association(inp, stcb, m, iphlen, sh, op_err,
+ vrf_id, port);
+ if (stcb)
+ *abort_no_unlock = 1;
+ goto outnow;
+ }
+ if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_chunk)) {
+ /* Invalid length */
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM);
+ sctp_abort_association(inp, stcb, m, iphlen, sh, op_err,
+ vrf_id, port);
+ if (stcb)
+ *abort_no_unlock = 1;
+ goto outnow;
+ }
+ /* validate parameters */
+ if (init->initiate_tag == 0) {
+ /* protocol error... send abort */
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM);
+ sctp_abort_association(inp, stcb, m, iphlen, sh, op_err,
+ vrf_id, port);
+ if (stcb)
+ *abort_no_unlock = 1;
+ goto outnow;
+ }
+ if (ntohl(init->a_rwnd) < SCTP_MIN_RWND) {
+ /* invalid parameter... send abort */
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM);
+ sctp_abort_association(inp, stcb, m, iphlen, sh, op_err,
+ vrf_id, port);
+ if (stcb)
+ *abort_no_unlock = 1;
+ goto outnow;
+ }
+ if (init->num_inbound_streams == 0) {
+ /* protocol error... send abort */
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM);
+ sctp_abort_association(inp, stcb, m, iphlen, sh, op_err,
+ vrf_id, port);
+ if (stcb)
+ *abort_no_unlock = 1;
+ goto outnow;
+ }
+ if (init->num_outbound_streams == 0) {
+ /* protocol error... send abort */
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM);
+ sctp_abort_association(inp, stcb, m, iphlen, sh, op_err,
+ vrf_id, port);
+ if (stcb)
+ *abort_no_unlock = 1;
+ goto outnow;
+ }
+ init_limit = offset + ntohs(cp->ch.chunk_length);
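+ /*
+ * init_limit marks the end of this INIT chunk within the packet;
+ * it bounds the AUTH parameter validation below.
+ */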
+ if (sctp_validate_init_auth_params(m, offset + sizeof(*cp),
+ init_limit)) {
+ /* auth parameter(s) error... send abort */
+ sctp_abort_association(inp, stcb, m, iphlen, sh, NULL, vrf_id, port);
+ if (stcb)
+ *abort_no_unlock = 1;
+ goto outnow;
+ }
+ /* send an INIT-ACK w/cookie */
+ SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending INIT-ACK\n");
+ sctp_send_initiate_ack(inp, stcb, m, iphlen, offset, sh, cp, vrf_id, port,
+ ((stcb == NULL) ? SCTP_HOLDS_LOCK : SCTP_NOT_LOCKED));
+outnow:
+ if (stcb == NULL) {
+ SCTP_INP_RUNLOCK(inp);
+ }
+}
+
+/*
+ * process peer "INIT/INIT-ACK" chunk returns value < 0 on error
+ */
+
+int
+sctp_is_there_unsent_data(struct sctp_tcb *stcb)
+{
+ int unsent_data = 0;
+ struct sctp_stream_queue_pending *sp;
+ struct sctp_stream_out *strq;
+ struct sctp_association *asoc;
+
+ /*
+ * This function returns the number of streams that have true unsent
+ * data on them. Note that as it looks through, it will clean up any
+ * places that have old data that has been sent but left at the top
+ * of the stream queue.
+ */
+ asoc = &stcb->asoc;
+ SCTP_TCB_SEND_LOCK(stcb);
+ if (!TAILQ_EMPTY(&asoc->out_wheel)) {
+ /* Check to see if some data queued */
+ TAILQ_FOREACH(strq, &asoc->out_wheel, next_spoke) {
+ is_there_another:
+ /* sa_ignore FREED_MEMORY */
+ sp = TAILQ_FIRST(&strq->outqueue);
+ if (sp == NULL) {
+ continue;
+ }
+ if ((sp->msg_is_complete) &&
+ (sp->length == 0) &&
+ (sp->sender_all_done)) {
+ /*
+ * We are doing deferred cleanup. Last time
+ * through, when we took all the data,
+ * sender_all_done was not set.
+ */
+ if (sp->put_last_out == 0) {
+ SCTP_PRINTF("Gak, put out entire msg with NO end!-1\n");
+ SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d\n",
+ sp->sender_all_done,
+ sp->length,
+ sp->msg_is_complete,
+ sp->put_last_out);
+ }
+ atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1);
+ TAILQ_REMOVE(&strq->outqueue, sp, next);
+ if (sp->net) {
+ sctp_free_remote_addr(sp->net);
+ sp->net = NULL;
+ }
+ if (sp->data) {
+ sctp_m_freem(sp->data);
+ sp->data = NULL;
+ }
+ sctp_free_a_strmoq(stcb, sp);
+ goto is_there_another;
+ } else {
+ unsent_data++;
+ continue;
+ }
+ }
+ }
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ return (unsent_data);
+}
+
+static int
+sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ struct sctp_init *init;
+ struct sctp_association *asoc;
+ struct sctp_nets *lnet;
+ unsigned int i;
+
+ init = &cp->init;
+ asoc = &stcb->asoc;
+ /* save off parameters */
+ asoc->peer_vtag = ntohl(init->initiate_tag);
+ asoc->peers_rwnd = ntohl(init->a_rwnd);
+ if (!TAILQ_EMPTY(&asoc->nets)) {
+ /* update any ssthresh's that may have a default */
+ TAILQ_FOREACH(lnet, &asoc->nets, sctp_next) {
+ lnet->ssthresh = asoc->peers_rwnd;
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) {
+ sctp_log_cwnd(stcb, lnet, 0, SCTP_CWND_INITIALIZATION);
+ }
+ }
+ }
+ SCTP_TCB_SEND_LOCK(stcb);
+ if (asoc->pre_open_streams > ntohs(init->num_inbound_streams)) {
+ unsigned int newcnt;
+ struct sctp_stream_out *outs;
+ struct sctp_stream_queue_pending *sp;
+ struct sctp_tmit_chunk *chk, *chk_next;
+
+ /* abandon the upper streams */
+ newcnt = ntohs(init->num_inbound_streams);
+ if (!TAILQ_EMPTY(&asoc->send_queue)) {
+ chk = TAILQ_FIRST(&asoc->send_queue);
+ while (chk) {
+ chk_next = TAILQ_NEXT(chk, sctp_next);
+ if (chk->rec.data.stream_number >= newcnt) {
+ TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next);
+ asoc->send_queue_cnt--;
+ if (chk->data != NULL) {
+ sctp_free_bufspace(stcb, asoc, chk, 1);
+ sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb,
+ SCTP_NOTIFY_DATAGRAM_UNSENT, chk, SCTP_SO_NOT_LOCKED);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ }
+ sctp_free_a_chunk(stcb, chk);
+ /* sa_ignore FREED_MEMORY */
+ }
+ chk = chk_next;
+ }
+ }
+ if (asoc->strmout) {
+ for (i = newcnt; i < asoc->pre_open_streams; i++) {
+ outs = &asoc->strmout[i];
+ sp = TAILQ_FIRST(&outs->outqueue);
+ while (sp) {
+ TAILQ_REMOVE(&outs->outqueue, sp, next);
+ asoc->stream_queue_cnt--;
+ sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL,
+ stcb, SCTP_NOTIFY_DATAGRAM_UNSENT,
+ sp, SCTP_SO_NOT_LOCKED);
+ if (sp->data) {
+ sctp_m_freem(sp->data);
+ sp->data = NULL;
+ }
+ if (sp->net) {
+ sctp_free_remote_addr(sp->net);
+ sp->net = NULL;
+ }
+ /* Free the chunk */
+ sctp_free_a_strmoq(stcb, sp);
+ /* sa_ignore FREED_MEMORY */
+ sp = TAILQ_FIRST(&outs->outqueue);
+ }
+ }
+ }
+ /* cut back the count */
+ asoc->pre_open_streams = newcnt;
+ }
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams;
+ /* init tsn's */
+ asoc->highest_tsn_inside_map = asoc->asconf_seq_in = ntohl(init->initial_tsn) - 1;
+ /* EY - nr_sack: initialize highest tsn in nr_mapping_array */
+ asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
+ sctp_log_map(0, 5, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT);
+ }
+ /* This is the next one we expect */
+ asoc->str_reset_seq_in = asoc->asconf_seq_in + 1;
+
+ asoc->mapping_array_base_tsn = ntohl(init->initial_tsn);
+ asoc->tsn_last_delivered = asoc->cumulative_tsn = asoc->asconf_seq_in;
+ asoc->last_echo_tsn = asoc->asconf_seq_in;
+ asoc->advanced_peer_ack_point = asoc->last_acked_seq;
+ /* open the requested streams */
+
+ if (asoc->strmin != NULL) {
+ /* Free the old ones */
+ struct sctp_queued_to_read *ctl;
+
+ for (i = 0; i < asoc->streamincnt; i++) {
+ ctl = TAILQ_FIRST(&asoc->strmin[i].inqueue);
+ while (ctl) {
+ TAILQ_REMOVE(&asoc->strmin[i].inqueue, ctl, next);
+ sctp_free_remote_addr(ctl->whoFrom);
+ ctl->whoFrom = NULL;
+ sctp_m_freem(ctl->data);
+ ctl->data = NULL;
+ sctp_free_a_readq(stcb, ctl);
+ ctl = TAILQ_FIRST(&asoc->strmin[i].inqueue);
+ }
+ }
+ SCTP_FREE(asoc->strmin, SCTP_M_STRMI);
+ }
+ asoc->streamincnt = ntohs(init->num_outbound_streams);
+ if (asoc->streamincnt > MAX_SCTP_STREAMS) {
+ asoc->streamincnt = MAX_SCTP_STREAMS;
+ }
+ SCTP_MALLOC(asoc->strmin, struct sctp_stream_in *, asoc->streamincnt *
+ sizeof(struct sctp_stream_in), SCTP_M_STRMI);
+ if (asoc->strmin == NULL) {
+ /* we didn't get memory for the streams! */
+ SCTPDBG(SCTP_DEBUG_INPUT2, "process_init: couldn't get memory for the streams!\n");
+ return (-1);
+ }
+ for (i = 0; i < asoc->streamincnt; i++) {
+ asoc->strmin[i].stream_no = i;
+ asoc->strmin[i].last_sequence_delivered = 0xffff;
+ /*
+ * U-stream ranges will be set when the cookie is unpacked.
+ * Or, for the INIT sender, they are unset (if PR-SCTP is not
+ * supported) when the INIT-ACK arrives.
+ */
+ TAILQ_INIT(&asoc->strmin[i].inqueue);
+ asoc->strmin[i].delivery_started = 0;
+ }
+ /*
+ * load_address_from_init will put the addresses into the
+ * association when the COOKIE is processed or the INIT-ACK is
+ * processed. Both types of COOKIE's existing and new call this
+ * routine. It will remove addresses that are no longer in the
+ * association (for the restarting case where addresses are
+ * removed). Up front when the INIT arrives we will discard it if it
+ * is a restart and new addresses have been added.
+ */
+ /* sa_ignore MEMLEAK */
+ return (0);
+}
+
+/*
+ * INIT-ACK message processing/consumption returns value < 0 on error
+ */
+static int
+sctp_process_init_ack(struct mbuf *m, int iphlen, int offset,
+ struct sctphdr *sh, struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb,
+ struct sctp_nets *net, int *abort_no_unlock, uint32_t vrf_id)
+{
+ struct sctp_association *asoc;
+ struct mbuf *op_err;
+ int retval, abort_flag;
+ uint32_t initack_limit;
+ int nat_friendly = 0;
+
+ /* First verify that we have no illegal param's */
+ abort_flag = 0;
+ op_err = NULL;
+
+ op_err = sctp_arethere_unrecognized_parameters(m,
+ (offset + sizeof(struct sctp_init_chunk)),
+ &abort_flag, (struct sctp_chunkhdr *)cp, &nat_friendly);
+ if (abort_flag) {
+ /* Send an abort and notify peer */
+ sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_CAUSE_PROTOCOL_VIOLATION, op_err, SCTP_SO_NOT_LOCKED);
+ *abort_no_unlock = 1;
+ return (-1);
+ }
+ asoc = &stcb->asoc;
+ asoc->peer_supports_nat = (uint8_t) nat_friendly;
+ /* process the peer's parameters in the INIT-ACK */
+ retval = sctp_process_init((struct sctp_init_chunk *)cp, stcb, net);
+ if (retval < 0) {
+ return (retval);
+ }
+ initack_limit = offset + ntohs(cp->ch.chunk_length);
+ /* load all addresses */
+ if ((retval = sctp_load_addresses_from_init(stcb, m, iphlen,
+ (offset + sizeof(struct sctp_init_chunk)), initack_limit, sh,
+ NULL))) {
+ /* Huh, we should abort */
+ SCTPDBG(SCTP_DEBUG_INPUT1,
+ "Load addresses from INIT causes an abort %d\n",
+ retval);
+ sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, sh,
+ NULL, 0, net->port);
+ *abort_no_unlock = 1;
+ return (-1);
+ }
+ /* if the peer doesn't support asconf, flush the asconf queue */
+ if (asoc->peer_supports_asconf == 0) {
+ struct sctp_asconf_addr *aparam;
+
+ while (!TAILQ_EMPTY(&asoc->asconf_queue)) {
+ /* sa_ignore FREED_MEMORY */
+ aparam = TAILQ_FIRST(&asoc->asconf_queue);
+ TAILQ_REMOVE(&asoc->asconf_queue, aparam, next);
+ SCTP_FREE(aparam, SCTP_M_ASC_ADDR);
+ }
+ }
+ stcb->asoc.peer_hmac_id = sctp_negotiate_hmacid(stcb->asoc.peer_hmacs,
+ stcb->asoc.local_hmacs);
+ if (op_err) {
+ sctp_queue_op_err(stcb, op_err);
+ /* queuing will steal away the mbuf chain to the out queue */
+ op_err = NULL;
+ }
+ /* extract the cookie and queue it to "echo" it back... */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ net->error_count = 0;
+
+ /*
+ * Cancel the INIT timer. We do this first, before queueing the
+ * cookie. We always cancel at the primary to ensure that we are
+ * canceling the timer started by the INIT, which always goes to
+ * the primary.
+ */
+ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb,
+ asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_4);
+
+ /* calculate the RTO */
+ net->RTO = sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, sctp_align_safe_nocopy);
+
+ retval = sctp_send_cookie_echo(m, offset, stcb, net);
+ if (retval < 0) {
+ /*
+ * No cookie, we probably should send an op error. But in any
+ * case, if there is no cookie in the INIT-ACK, we can
+ * abandon the peer; it's broken.
+ */
+ if (retval == -3) {
+ /* We abort with an error of missing mandatory param */
+ op_err =
+ sctp_generate_invmanparam(SCTP_CAUSE_MISSING_PARAM);
+ if (op_err) {
+ /*
+ * Expand beyond to include the mandatory
+ * param cookie
+ */
+ struct sctp_inv_mandatory_param *mp;
+
+ SCTP_BUF_LEN(op_err) =
+ sizeof(struct sctp_inv_mandatory_param);
+ mp = mtod(op_err,
+ struct sctp_inv_mandatory_param *);
+ /* Subtract the reserved param */
+ mp->length =
+ htons(sizeof(struct sctp_inv_mandatory_param) - 2);
+ mp->num_param = htonl(1);
+ mp->param = htons(SCTP_STATE_COOKIE);
+ mp->resv = 0;
+ }
+ sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
+ sh, op_err, 0, net->port);
+ *abort_no_unlock = 1;
+ }
+ return (retval);
+ }
+ return (0);
+}
+
+static void
+sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp,
+ struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ struct sockaddr_storage store;
+ struct sockaddr_in *sin;
+ struct sockaddr_in6 *sin6;
+ struct sctp_nets *r_net, *f_net;
+ struct timeval tv;
+ int req_prim = 0;
+
+ if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_heartbeat_chunk)) {
+ /* Invalid length */
+ return;
+ }
+ sin = (struct sockaddr_in *)&store;
+ sin6 = (struct sockaddr_in6 *)&store;
+
+ memset(&store, 0, sizeof(store));
+ if (cp->heartbeat.hb_info.addr_family == AF_INET &&
+ cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in)) {
+ sin->sin_family = cp->heartbeat.hb_info.addr_family;
+ sin->sin_len = cp->heartbeat.hb_info.addr_len;
+ sin->sin_port = stcb->rport;
+ memcpy(&sin->sin_addr, cp->heartbeat.hb_info.address,
+ sizeof(sin->sin_addr));
+ } else if (cp->heartbeat.hb_info.addr_family == AF_INET6 &&
+ cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in6)) {
+ sin6->sin6_family = cp->heartbeat.hb_info.addr_family;
+ sin6->sin6_len = cp->heartbeat.hb_info.addr_len;
+ sin6->sin6_port = stcb->rport;
+ memcpy(&sin6->sin6_addr, cp->heartbeat.hb_info.address,
+ sizeof(sin6->sin6_addr));
+ } else {
+ return;
+ }
+ r_net = sctp_findnet(stcb, (struct sockaddr *)sin);
+ if (r_net == NULL) {
+ SCTPDBG(SCTP_DEBUG_INPUT1, "Huh? I can't find the address I sent it to, discard\n");
+ return;
+ }
+ if ((r_net && (r_net->dest_state & SCTP_ADDR_UNCONFIRMED)) &&
+ (r_net->heartbeat_random1 == cp->heartbeat.hb_info.random_value1) &&
+ (r_net->heartbeat_random2 == cp->heartbeat.hb_info.random_value2)) {
+ /*
+ * If it's an HB and its random value is correct, we can
+ * confirm the destination.
+ */
+ r_net->dest_state &= ~SCTP_ADDR_UNCONFIRMED;
+ if (r_net->dest_state & SCTP_ADDR_REQ_PRIMARY) {
+ stcb->asoc.primary_destination = r_net;
+ r_net->dest_state &= ~SCTP_ADDR_WAS_PRIMARY;
+ r_net->dest_state &= ~SCTP_ADDR_REQ_PRIMARY;
+ f_net = TAILQ_FIRST(&stcb->asoc.nets);
+ if (f_net != r_net) {
+ /*
+ * The first one on the list is NOT the primary.
+ * sctp_cmpaddr() is much more efficient if
+ * the primary is the first on the list, so
+ * make it so.
+ */
+ TAILQ_REMOVE(&stcb->asoc.nets, r_net, sctp_next);
+ TAILQ_INSERT_HEAD(&stcb->asoc.nets, r_net, sctp_next);
+ }
+ req_prim = 1;
+ }
+ sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED,
+ stcb, 0, (void *)r_net, SCTP_SO_NOT_LOCKED);
+ }
+ r_net->error_count = 0;
+ r_net->hb_responded = 1;
+ tv.tv_sec = cp->heartbeat.hb_info.time_value_1;
+ tv.tv_usec = cp->heartbeat.hb_info.time_value_2;
+ if (r_net->dest_state & SCTP_ADDR_NOT_REACHABLE) {
+ r_net->dest_state &= ~SCTP_ADDR_NOT_REACHABLE;
+ r_net->dest_state |= SCTP_ADDR_REACHABLE;
+ sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb,
+ SCTP_HEARTBEAT_SUCCESS, (void *)r_net, SCTP_SO_NOT_LOCKED);
+ /* now was it the primary? if so restore */
+ if (r_net->dest_state & SCTP_ADDR_WAS_PRIMARY) {
+ (void)sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, r_net);
+ }
+ }
+ /*
+ * JRS 5/14/07 - If CMT PF is on and the destination is in PF state,
+ * set the destination to active state and set the cwnd to one or
+ * two MTUs based on whether PF1 or PF2 is being used. If a T3
+ * timer is running for the destination, stop the timer because a
+ * PF-heartbeat was received.
+ */
+ if ((stcb->asoc.sctp_cmt_on_off == 1) &&
+ (stcb->asoc.sctp_cmt_pf > 0) &&
+ ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF)) {
+ if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+ stcb, net,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_5);
+ }
+ net->dest_state &= ~SCTP_ADDR_PF;
+ net->cwnd = net->mtu * stcb->asoc.sctp_cmt_pf;
+ SCTPDBG(SCTP_DEBUG_INPUT1, "Destination %p moved from PF to reachable with cwnd %d.\n",
+ net, net->cwnd);
+ }
+ /* Now lets do a RTO with this */
+ r_net->RTO = sctp_calculate_rto(stcb, &stcb->asoc, r_net, &tv, sctp_align_safe_nocopy);
+ /* Mobility adaptation */
+ if (req_prim) {
+ if ((sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_BASE) ||
+ sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_FASTHANDOFF)) &&
+ sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_PRIM_DELETED)) {
+
+ sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_TIMER + SCTP_LOC_7);
+ if (sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_FASTHANDOFF)) {
+ sctp_assoc_immediate_retrans(stcb,
+ stcb->asoc.primary_destination);
+ }
+ if (sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_BASE)) {
+ sctp_move_chunks_from_net(stcb,
+ stcb->asoc.deleted_primary);
+ }
+ sctp_delete_prim_timer(stcb->sctp_ep, stcb,
+ stcb->asoc.deleted_primary);
+ }
+ }
+}
+
+static int
+sctp_handle_nat_colliding_state(struct sctp_tcb *stcb)
+{
+ /*
+ * Return 0 means we want you to proceed with the abort; non-zero
+ * means no abort processing.
+ */
+ struct sctpasochead *head;
+
+ if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_WAIT) {
+ /* generate a new vtag and send init */
+ LIST_REMOVE(stcb, sctp_asocs);
+ stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1);
+ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))];
+ /*
+ * put it in the bucket in the vtag hash of assoc's for the
+ * system
+ */
+ LIST_INSERT_HEAD(head, stcb, sctp_asocs);
+ sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED);
+ return (1);
+ }
+ if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) {
+ /*
+ * treat like a case where the cookie expired i.e.: - dump
+ * current cookie. - generate a new vtag. - resend init.
+ */
+ /* generate a new vtag and send init */
+ LIST_REMOVE(stcb, sctp_asocs);
+ stcb->asoc.state &= ~SCTP_STATE_COOKIE_ECHOED;
+ stcb->asoc.state |= SCTP_STATE_COOKIE_WAIT;
+ sctp_stop_all_cookie_timers(stcb);
+ sctp_toss_old_cookies(stcb, &stcb->asoc);
+ stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1);
+ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))];
+ /*
+ * put it in the bucket in the vtag hash of assoc's for the
+ * system
+ */
+ LIST_INSERT_HEAD(head, stcb, sctp_asocs);
+ sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED);
+ return (1);
+ }
+ return (0);
+}
+
+static int
+sctp_handle_nat_missing_state(struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ /*
+ * Return 0 means we want you to proceed with the abort; non-zero
+ * means no abort processing.
+ */
+ if (stcb->asoc.peer_supports_auth == 0) {
+ SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_nat_missing_state: Peer does not support AUTH, cannot send an asconf\n");
+ return (0);
+ }
+ sctp_asconf_send_nat_state_update(stcb, net);
+ return (1);
+}
+
+
+static void
+sctp_handle_abort(struct sctp_abort_chunk *cp,
+ struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+ uint16_t len;
+
+ SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_abort: handling ABORT\n");
+ if (stcb == NULL)
+ return;
+
+ len = ntohs(cp->ch.chunk_length);
+ if (len > sizeof(struct sctp_chunkhdr)) {
+ /*
+ * Need to check the cause codes for our two magic nat
+ * aborts which don't kill the assoc necessarily.
+ */
+ struct sctp_abort_chunk *cpnext;
+ struct sctp_missing_nat_state *natc;
+ uint16_t cause;
+
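+ /*
+ * Incrementing the chunk pointer steps over the ABORT chunk
+ * header, so natc points at the first error cause carried in
+ * the ABORT.
+ */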
+ cpnext = cp;
+ cpnext++;
+ natc = (struct sctp_missing_nat_state *)cpnext;
+ cause = ntohs(natc->cause);
+ if (cause == SCTP_CAUSE_NAT_COLLIDING_STATE) {
+ SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state abort flags:%x\n",
+ cp->ch.chunk_flags);
+ if (sctp_handle_nat_colliding_state(stcb)) {
+ return;
+ }
+ } else if (cause == SCTP_CAUSE_NAT_MISSING_STATE) {
+ SCTPDBG(SCTP_DEBUG_INPUT2, "Received missing state abort flags:%x\n",
+ cp->ch.chunk_flags);
+ if (sctp_handle_nat_missing_state(stcb, net)) {
+ return;
+ }
+ }
+ }
+ /* stop any receive timers */
+ sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_6);
+ /* notify user of the abort and clean up... */
+ sctp_abort_notification(stcb, 0, SCTP_SO_NOT_LOCKED);
+ /* free the tcb */
+#if defined(SCTP_PANIC_ON_ABORT)
+ printf("stcb:%p state:%d rport:%d net:%p\n",
+ stcb, stcb->asoc.state, stcb->rport, net);
+ if (!(stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) {
+ panic("Received an ABORT");
+ } else {
+ printf("No panic its in state %x closed\n", stcb->asoc.state);
+ }
+#endif
+ SCTP_STAT_INCR_COUNTER32(sctps_aborted);
+ if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+#ifdef SCTP_ASOCLOG_OF_TSNS
+ sctp_print_out_track_log(stcb);
+#endif
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+#endif
+ stcb->asoc.state |= SCTP_STATE_WAS_ABORTED;
+ (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_6);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_abort: finished\n");
+}
+
+static void
+sctp_handle_shutdown(struct sctp_shutdown_chunk *cp,
+ struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_flag)
+{
+ struct sctp_association *asoc;
+ int some_on_streamwheel;
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "sctp_handle_shutdown: handling SHUTDOWN\n");
+ if (stcb == NULL)
+ return;
+ asoc = &stcb->asoc;
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) {
+ return;
+ }
+ if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_shutdown_chunk)) {
+ /* Shutdown NOT the expected size */
+ return;
+ } else {
+ sctp_update_acked(stcb, cp, net, abort_flag);
+ if (*abort_flag) {
+ return;
+ }
+ }
+ if (asoc->control_pdapi) {
+ /*
+ * With a normal shutdown we assume the end of last record.
+ */
+ SCTP_INP_READ_LOCK(stcb->sctp_ep);
+ asoc->control_pdapi->end_added = 1;
+ asoc->control_pdapi->pdapi_aborted = 1;
+ asoc->control_pdapi = NULL;
+ SCTP_INP_READ_UNLOCK(stcb->sctp_ep);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ /* assoc was freed while we were unlocked */
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return;
+ }
+#endif
+ sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ }
+ /* goto SHUTDOWN_RECEIVED state to block new requests */
+ if (stcb->sctp_socket) {
+ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT)) {
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_RECEIVED);
+ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ /*
+ * notify upper layer that peer has initiated a
+ * shutdown
+ */
+ sctp_ulp_notify(SCTP_NOTIFY_PEER_SHUTDOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
+
+ /* reset time */
+ (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered);
+ }
+ }
+ if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) {
+ /*
+ * stop the shutdown timer, since we WILL move to
+ * SHUTDOWN-ACK-SENT.
+ */
+ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_8);
+ }
+ /* Now is there unsent data on a stream somewhere? */
+ some_on_streamwheel = sctp_is_there_unsent_data(stcb);
+
+ if (!TAILQ_EMPTY(&asoc->send_queue) ||
+ !TAILQ_EMPTY(&asoc->sent_queue) ||
+ some_on_streamwheel) {
+ /* By returning we will push more data out */
+ return;
+ } else {
+ /* no outstanding data to send, so move on... */
+ /* send SHUTDOWN-ACK */
+ sctp_send_shutdown_ack(stcb, stcb->asoc.primary_destination);
+ /* move to SHUTDOWN-ACK-SENT state */
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT);
+ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ sctp_stop_timers_for_shutdown(stcb);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep,
+ stcb, net);
+ }
+}
+
+static void
+sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ struct sctp_association *asoc;
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+ so = SCTP_INP_SO(stcb->sctp_ep);
+#endif
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "sctp_handle_shutdown_ack: handling SHUTDOWN ACK\n");
+ if (stcb == NULL)
+ return;
+
+ asoc = &stcb->asoc;
+ /* process according to association state */
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) {
+ /* unexpected SHUTDOWN-ACK... do OOTB handling... */
+ sctp_send_shutdown_complete(stcb, net, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
+ /* unexpected SHUTDOWN-ACK... so ignore... */
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ if (asoc->control_pdapi) {
+ /*
+ * With a normal shutdown we assume the end of last record.
+ */
+ SCTP_INP_READ_LOCK(stcb->sctp_ep);
+ asoc->control_pdapi->end_added = 1;
+ asoc->control_pdapi->pdapi_aborted = 1;
+ asoc->control_pdapi = NULL;
+ SCTP_INP_READ_UNLOCK(stcb->sctp_ep);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ /* assoc was freed while we were unlocked */
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return;
+ }
+#endif
+ sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ }
+ /* are the queues empty? */
+ if (!TAILQ_EMPTY(&asoc->send_queue) ||
+ !TAILQ_EMPTY(&asoc->sent_queue) ||
+ !TAILQ_EMPTY(&asoc->out_wheel)) {
+ sctp_report_all_outbound(stcb, 0, SCTP_SO_NOT_LOCKED);
+ }
+ /* stop the timer */
+ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_9);
+ /* send SHUTDOWN-COMPLETE */
+ sctp_send_shutdown_complete(stcb, net, 0);
+ /* notify upper layer protocol */
+ if (stcb->sctp_socket) {
+ sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
+ if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ /* Set the connected flag to disconnected */
+ stcb->sctp_ep->sctp_socket->so_snd.sb_cc = 0;
+ }
+ }
+ SCTP_STAT_INCR_COUNTER32(sctps_shutdown);
+ /* free the TCB but first save off the ep */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+#endif
+ (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_10);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+}
+
+/*
+ * Skip past the param header and then we will find the chunk that caused the
+ * problem. There are two possibilities, ASCONF or FWD-TSN; other than that,
+ * our peer must be broken.
+ */
+static void
+sctp_process_unrecog_chunk(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr,
+ struct sctp_nets *net)
+{
+ struct sctp_chunkhdr *chk;
+
+ chk = (struct sctp_chunkhdr *)((caddr_t)phdr + sizeof(*phdr));
+ switch (chk->chunk_type) {
+ case SCTP_ASCONF_ACK:
+ case SCTP_ASCONF:
+ sctp_asconf_cleanup(stcb, net);
+ break;
+ case SCTP_FORWARD_CUM_TSN:
+ stcb->asoc.peer_supports_prsctp = 0;
+ break;
+ default:
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "Peer does not support chunk type %d(%x)??\n",
+ chk->chunk_type, (uint32_t) chk->chunk_type);
+ break;
+ }
+}
+
+/*
+ * Skip past the param header and then we will find the param that caused the
+ * problem. There are a number of params in an ASCONF, OR the PR-SCTP param;
+ * these will turn off specific features.
+ */
+static void
+sctp_process_unrecog_param(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr)
+{
+ struct sctp_paramhdr *pbad;
+
+ pbad = phdr + 1;
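+ /* the offending parameter immediately follows the error cause header */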
+ switch (ntohs(pbad->param_type)) {
+ /* pr-sctp draft */
+ case SCTP_PRSCTP_SUPPORTED:
+ stcb->asoc.peer_supports_prsctp = 0;
+ break;
+ case SCTP_SUPPORTED_CHUNK_EXT:
+ break;
+ /* draft-ietf-tsvwg-addip-sctp */
+ case SCTP_HAS_NAT_SUPPORT:
+ stcb->asoc.peer_supports_nat = 0;
+ break;
+ case SCTP_ECN_NONCE_SUPPORTED:
+ stcb->asoc.peer_supports_ecn_nonce = 0;
+ stcb->asoc.ecn_nonce_allowed = 0;
+ stcb->asoc.ecn_allowed = 0;
+ break;
+ case SCTP_ADD_IP_ADDRESS:
+ case SCTP_DEL_IP_ADDRESS:
+ case SCTP_SET_PRIM_ADDR:
+ stcb->asoc.peer_supports_asconf = 0;
+ break;
+ case SCTP_SUCCESS_REPORT:
+ case SCTP_ERROR_CAUSE_IND:
+ SCTPDBG(SCTP_DEBUG_INPUT2, "Huh, the peer does not support success? or error cause?\n");
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "Turning off ASCONF to this strange peer\n");
+ stcb->asoc.peer_supports_asconf = 0;
+ break;
+ default:
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "Peer does not support param type %d(%x)??\n",
+ pbad->param_type, (uint32_t) pbad->param_type);
+ break;
+ }
+}
+
+static int
+sctp_handle_error(struct sctp_chunkhdr *ch,
+ struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ int chklen;
+ struct sctp_paramhdr *phdr;
+ uint16_t error_type;
+ uint16_t error_len;
+ struct sctp_association *asoc;
+ int adjust;
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+
+ /* parse through all of the errors and process */
+ asoc = &stcb->asoc;
+ phdr = (struct sctp_paramhdr *)((caddr_t)ch +
+ sizeof(struct sctp_chunkhdr));
+ chklen = ntohs(ch->chunk_length) - sizeof(struct sctp_chunkhdr);
+ while ((size_t)chklen >= sizeof(struct sctp_paramhdr)) {
+ /* Process an Error Cause */
+ error_type = ntohs(phdr->param_type);
+ error_len = ntohs(phdr->param_length);
+ if ((error_len > chklen) || (error_len == 0)) {
+ /* invalid param length for this param */
+ SCTPDBG(SCTP_DEBUG_INPUT1, "Bogus length in error param- chunk left:%d errorlen:%d\n",
+ chklen, error_len);
+ return (0);
+ }
+ switch (error_type) {
+ case SCTP_CAUSE_INVALID_STREAM:
+ case SCTP_CAUSE_MISSING_PARAM:
+ case SCTP_CAUSE_INVALID_PARAM:
+ case SCTP_CAUSE_NO_USER_DATA:
+ SCTPDBG(SCTP_DEBUG_INPUT1, "Software error we got a %d back? We have a bug :/ (or do they?)\n",
+ error_type);
+ break;
+ case SCTP_CAUSE_NAT_COLLIDING_STATE:
+ SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state abort flags:%x\n",
+ ch->chunk_flags);
+ if (sctp_handle_nat_colliding_state(stcb)) {
+ return (0);
+ }
+ break;
+ case SCTP_CAUSE_NAT_MISSING_STATE:
+ SCTPDBG(SCTP_DEBUG_INPUT2, "Received missing state abort flags:%x\n",
+ ch->chunk_flags);
+ if (sctp_handle_nat_missing_state(stcb, net)) {
+ return (0);
+ }
+ break;
+ case SCTP_CAUSE_STALE_COOKIE:
+ /*
+ * We only act if we have echoed a cookie and are
+ * waiting.
+ */
+ if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) {
+ int *p;
+
+ p = (int *)((caddr_t)phdr + sizeof(*phdr));
+ /* Save the time doubled */
+ asoc->cookie_preserve_req = ntohl(*p) << 1;
+ asoc->stale_cookie_count++;
+ if (asoc->stale_cookie_count >
+ asoc->max_init_times) {
+ sctp_abort_notification(stcb, 0, SCTP_SO_NOT_LOCKED);
+ /* now free the asoc */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+#endif
+ (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_11);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ return (-1);
+ }
+ /* blast back to INIT state */
+ sctp_toss_old_cookies(stcb, &stcb->asoc);
+ asoc->state &= ~SCTP_STATE_COOKIE_ECHOED;
+ asoc->state |= SCTP_STATE_COOKIE_WAIT;
+ sctp_stop_all_cookie_timers(stcb);
+ sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED);
+ }
+ break;
+ case SCTP_CAUSE_UNRESOLVABLE_ADDR:
+ /*
+ * Nothing we can do here, we don't do hostname
+ * addresses so if the peer does not like my IPv6
+ * (or IPv4 for that matter) it does not matter. If
+ * they don't support that type of address, they can
+ * NOT possibly get that packet type... i.e. with no
+ * IPv6 you can't receive an IPv6 packet. So we can
+ * safely ignore this one. If we ever added support
+ * for HOSTNAME Addresses, then we would need to do
+ * something here.
+ */
+ break;
+ case SCTP_CAUSE_UNRECOG_CHUNK:
+ sctp_process_unrecog_chunk(stcb, phdr, net);
+ break;
+ case SCTP_CAUSE_UNRECOG_PARAM:
+ sctp_process_unrecog_param(stcb, phdr);
+ break;
+ case SCTP_CAUSE_COOKIE_IN_SHUTDOWN:
+ /*
+ * We ignore this since the timer will drive out a
+ * new cookie anyway and their timer will drive us
+ * to send a SHUTDOWN_COMPLETE. We can't send one
+ * here since we don't have their tag.
+ */
+ break;
+ case SCTP_CAUSE_DELETING_LAST_ADDR:
+ case SCTP_CAUSE_RESOURCE_SHORTAGE:
+ case SCTP_CAUSE_DELETING_SRC_ADDR:
+ /*
+ * We should NOT get these here, but in a
+ * ASCONF-ACK.
+ */
+ SCTPDBG(SCTP_DEBUG_INPUT2, "Peer sends ASCONF errors in an Operational Error?<%d>?\n",
+ error_type);
+ break;
+ case SCTP_CAUSE_OUT_OF_RESC:
+ /*
+ * And what, pray tell, do we do with the fact that
+ * the peer is out of resources? Not really sure we
+ * could do anything but abort. I suspect this
+ * should have come WITH an abort instead of in an
+ * OP-ERROR.
+ */
+ break;
+ default:
+ SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_handle_error: unknown error type = 0x%xh\n",
+ error_type);
+ break;
+ }
+ adjust = SCTP_SIZE32(error_len);
+ chklen -= adjust;
+ phdr = (struct sctp_paramhdr *)((caddr_t)phdr + adjust);
+ }
+ return (0);
+}
+
+static int
+sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset,
+ struct sctphdr *sh, struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb,
+ struct sctp_nets *net, int *abort_no_unlock, uint32_t vrf_id)
+{
+ struct sctp_init_ack *init_ack;
+ struct mbuf *op_err;
+
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "sctp_handle_init_ack: handling INIT-ACK\n");
+
+ if (stcb == NULL) {
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "sctp_handle_init_ack: TCB is null\n");
+ return (-1);
+ }
+ if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_ack_chunk)) {
+ /* Invalid length */
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM);
+ sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, sh,
+ op_err, 0, net->port);
+ *abort_no_unlock = 1;
+ return (-1);
+ }
+ init_ack = &cp->init;
+ /* validate parameters */
+ if (init_ack->initiate_tag == 0) {
+ /* protocol error... send an abort */
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM);
+ sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, sh,
+ op_err, 0, net->port);
+ *abort_no_unlock = 1;
+ return (-1);
+ }
+ if (ntohl(init_ack->a_rwnd) < SCTP_MIN_RWND) {
+ /* protocol error... send an abort */
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM);
+ sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, sh,
+ op_err, 0, net->port);
+ *abort_no_unlock = 1;
+ return (-1);
+ }
+ if (init_ack->num_inbound_streams == 0) {
+ /* protocol error... send an abort */
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM);
+ sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, sh,
+ op_err, 0, net->port);
+ *abort_no_unlock = 1;
+ return (-1);
+ }
+ if (init_ack->num_outbound_streams == 0) {
+ /* protocol error... send an abort */
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM);
+ sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, sh,
+ op_err, 0, net->port);
+ *abort_no_unlock = 1;
+ return (-1);
+ }
+ /* process according to association state... */
+ switch (stcb->asoc.state & SCTP_STATE_MASK) {
+ case SCTP_STATE_COOKIE_WAIT:
+ /* this is the expected state for this chunk */
+ /* process the INIT-ACK parameters */
+ if (stcb->asoc.primary_destination->dest_state &
+ SCTP_ADDR_UNCONFIRMED) {
+ /*
+ * The primary is where we sent the INIT, we can
+ * always consider it confirmed when the INIT-ACK is
+ * returned. Do this before we load addresses
+ * though.
+ */
+ stcb->asoc.primary_destination->dest_state &=
+ ~SCTP_ADDR_UNCONFIRMED;
+ sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED,
+ stcb, 0, (void *)stcb->asoc.primary_destination, SCTP_SO_NOT_LOCKED);
+ }
+ if (sctp_process_init_ack(m, iphlen, offset, sh, cp, stcb,
+ net, abort_no_unlock, vrf_id) < 0) {
+ /* error in parsing parameters */
+ return (-1);
+ }
+ /* update our state */
+ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to COOKIE-ECHOED state\n");
+ SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_ECHOED);
+
+ /* reset the RTO calc */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
+ /*
+ * collapse the init timer back in case of an exponential
+ * backoff
+ */
+ sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, stcb->sctp_ep,
+ stcb, net);
+ /*
+ * the send at the end of the inbound data processing will
+ * cause the cookie to be sent
+ */
+ break;
+ case SCTP_STATE_SHUTDOWN_SENT:
+ /* incorrect state... discard */
+ break;
+ case SCTP_STATE_COOKIE_ECHOED:
+ /* incorrect state... discard */
+ break;
+ case SCTP_STATE_OPEN:
+ /* incorrect state... discard */
+ break;
+ case SCTP_STATE_EMPTY:
+ case SCTP_STATE_INUSE:
+ default:
+ /* incorrect state... discard */
+ return (-1);
+ break;
+ }
+ SCTPDBG(SCTP_DEBUG_INPUT1, "Leaving handle-init-ack end\n");
+ return (0);
+}
+
+static struct sctp_tcb *
+sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
+ struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len,
+ struct sctp_inpcb *inp, struct sctp_nets **netp,
+ struct sockaddr *init_src, int *notification,
+ int auth_skipped, uint32_t auth_offset, uint32_t auth_len,
+ uint32_t vrf_id, uint16_t port);
+
+
+/*
+ * handle a state cookie for an existing association
+ * m: input packet mbuf chain -- assumes a pullup on IP/SCTP/COOKIE-ECHO
+ *    chunk; note: this is a "split" mbuf and the cookie signature does
+ *    not exist
+ * offset: offset into mbuf to the cookie-echo chunk
+ */
+static struct sctp_tcb *
+sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
+ struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len,
+ struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets **netp,
+ struct sockaddr *init_src, int *notification, sctp_assoc_t * sac_assoc_id,
+ uint32_t vrf_id, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, uint16_t port)
+{
+ struct sctp_association *asoc;
+ struct sctp_init_chunk *init_cp, init_buf;
+ struct sctp_init_ack_chunk *initack_cp, initack_buf;
+ struct sctp_nets *net;
+ struct mbuf *op_err;
+ struct sctp_paramhdr *ph;
+ int chk_length;
+ int init_offset, initack_offset, i;
+ int retval;
+ int spec_flag = 0;
+ uint32_t how_indx;
+
+ net = *netp;
+ /* I know that the TCB is non-NULL from the caller */
+ asoc = &stcb->asoc;
+ for (how_indx = 0; how_indx < sizeof(asoc->cookie_how); how_indx++) {
+ if (asoc->cookie_how[how_indx] == 0)
+ break;
+ }
+ if (how_indx < sizeof(asoc->cookie_how)) {
+ asoc->cookie_how[how_indx] = 1;
+ }
+ if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) {
+ /* SHUTDOWN came in after sending INIT-ACK */
+ sctp_send_shutdown_ack(stcb, stcb->asoc.primary_destination);
+ op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (op_err == NULL) {
+ /* FOOBAR */
+ return (NULL);
+ }
+ /* Set the len */
+ SCTP_BUF_LEN(op_err) = sizeof(struct sctp_paramhdr);
+ ph = mtod(op_err, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_COOKIE_IN_SHUTDOWN);
+ ph->param_length = htons(sizeof(struct sctp_paramhdr));
+ sctp_send_operr_to(m, iphlen, op_err, cookie->peers_vtag,
+ vrf_id, net->port);
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 2;
+ return (NULL);
+ }
+ /*
+ * find and validate the INIT chunk in the cookie (peer's info); the
+ * INIT should start after the cookie-echo header struct (chunk
+ * header, state cookie header struct)
+ */
+ init_offset = offset += sizeof(struct sctp_cookie_echo_chunk);
+
+ init_cp = (struct sctp_init_chunk *)
+ sctp_m_getptr(m, init_offset, sizeof(struct sctp_init_chunk),
+ (uint8_t *) & init_buf);
+ if (init_cp == NULL) {
+ /* could not pull a INIT chunk in cookie */
+ return (NULL);
+ }
+ chk_length = ntohs(init_cp->ch.chunk_length);
+ if (init_cp->ch.chunk_type != SCTP_INITIATION) {
+ return (NULL);
+ }
+ /*
+ * find and validate the INIT-ACK chunk in the cookie (my info); the
+ * INIT-ACK follows the INIT chunk
+ */
+ initack_offset = init_offset + SCTP_SIZE32(chk_length);
+ initack_cp = (struct sctp_init_ack_chunk *)
+ sctp_m_getptr(m, initack_offset, sizeof(struct sctp_init_ack_chunk),
+ (uint8_t *) & initack_buf);
+ if (initack_cp == NULL) {
+ /* could not pull INIT-ACK chunk in cookie */
+ return (NULL);
+ }
+ chk_length = ntohs(initack_cp->ch.chunk_length);
+ if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) {
+ return (NULL);
+ }
+ if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) &&
+ (ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag)) {
+ /*
+ * case D in Section 5.2.4 Table 2: MMAA; process accordingly
+ * to get into the OPEN state
+ */
+ if (ntohl(initack_cp->init.initial_tsn) != asoc->init_seq_number) {
+ /*-
+ * Oops, this means that we somehow generated two vtags
+ * the same. I.e. we did:
+ * Us Peer
+ * <---INIT(tag=a)------
+ * ----INIT-ACK(tag=t)-->
+ * ----INIT(tag=t)------> *1
+ * <---INIT-ACK(tag=a)---
+ * <----CE(tag=t)------------- *2
+ *
+ * At point *1 we should be generating a different
+ * tag t'. Which means we would throw away the CE and send
+ * ours instead. Basically this is case C (throw away side).
+ */
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 17;
+ return (NULL);
+
+ }
+ switch (SCTP_GET_STATE(asoc)) {
+ case SCTP_STATE_COOKIE_WAIT:
+ case SCTP_STATE_COOKIE_ECHOED:
+ /*
+ * INIT was sent but got a COOKIE_ECHO with the
+ * correct tags... just accept it... but we must
+ * process the init so that we can make sure we have
+ * the right seq no's.
+ */
+ /* First we must process the INIT !! */
+ retval = sctp_process_init(init_cp, stcb, net);
+ if (retval < 0) {
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 3;
+ return (NULL);
+ }
+ /* we have already processed the INIT so no problem */
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb,
+ net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_12);
+ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_13);
+ /* update current state */
+ if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)
+ SCTP_STAT_INCR_COUNTER32(sctps_activeestab);
+ else
+ SCTP_STAT_INCR_COUNTER32(sctps_collisionestab);
+
+ SCTP_SET_STATE(asoc, SCTP_STATE_OPEN);
+ if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) {
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
+ stcb->sctp_ep, stcb, asoc->primary_destination);
+ }
+ SCTP_STAT_INCR_GAUGE32(sctps_currestab);
+ sctp_stop_all_cookie_timers(stcb);
+ if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) &&
+ (inp->sctp_socket->so_qlimit == 0)
+ ) {
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+ /*
+ * Here is where collision would go if we
+ * did a connect() and instead got an
+ * init/init-ack/cookie done before the
+ * init-ack came back..
+ */
+ stcb->sctp_ep->sctp_flags |=
+ SCTP_PCB_FLAGS_CONNECTED;
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return (NULL);
+ }
+#endif
+ soisconnected(stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ }
+ /* notify upper layer */
+ *notification = SCTP_NOTIFY_ASSOC_UP;
+ /*
+ * since we did not send a HB make sure we don't
+ * double things
+ */
+ net->hb_responded = 1;
+ net->RTO = sctp_calculate_rto(stcb, asoc, net,
+ &cookie->time_entered, sctp_align_unsafe_makecopy);
+
+ if (stcb->asoc.sctp_autoclose_ticks &&
+ (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE))) {
+ sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE,
+ inp, stcb, NULL);
+ }
+ break;
+ default:
+ /*
+ * we're in the OPEN state (or beyond), so peer must
+ * have simply lost the COOKIE-ACK
+ */
+ break;
+ } /* end switch */
+ sctp_stop_all_cookie_timers(stcb);
+ /*
+ * We ignore the return code here.. not sure if we should
+ * somehow abort.. but we do have an existing asoc. This
+ * really should not fail.
+ */
+ if (sctp_load_addresses_from_init(stcb, m, iphlen,
+ init_offset + sizeof(struct sctp_init_chunk),
+ initack_offset, sh, init_src)) {
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 4;
+ return (NULL);
+ }
+ /* respond with a COOKIE-ACK */
+ sctp_toss_old_cookies(stcb, asoc);
+ sctp_send_cookie_ack(stcb);
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 5;
+ return (stcb);
+ }
+ if (ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag &&
+ ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag &&
+ cookie->tie_tag_my_vtag == 0 &&
+ cookie->tie_tag_peer_vtag == 0) {
+ /*
+ * case C in Section 5.2.4 Table 2: XMOO silently discard
+ */
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 6;
+ return (NULL);
+ }
+ /*
+ * If the peer supports NAT, the checks below hold, and the stcb is
+ * established, send back an ABORT (colliding state).
+ */
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) &&
+ (asoc->peer_supports_nat) &&
+ ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) &&
+ ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) ||
+ (asoc->peer_vtag == 0)))) {
+ /*
+ * Special case - Peers support NAT. We may have two INITs
+ * that we gave out the same tag on since one was not
+ * established.. i.e. we get INIT from host-1 behind the NAT
+ * and we respond tag-a, we get an INIT from host-2 behind
+ * the NAT and we get tag-a again. Then we bring up host-1
+ * (or 2's) assoc, then comes the cookie from host-2 (or 1).
+ * Now we have colliding state. We must send an abort here
+ * with colliding state indication.
+ */
+ op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (op_err == NULL) {
+ /* FOOBAR */
+ return (NULL);
+ }
+ /* pre-reserve some space */
+#ifdef INET6
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr));
+#else
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct ip));
+#endif
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr));
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr));
+ /* Set the len */
+ SCTP_BUF_LEN(op_err) = sizeof(struct sctp_paramhdr);
+ ph = mtod(op_err, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_NAT_COLLIDING_STATE);
+ ph->param_length = htons(sizeof(struct sctp_paramhdr));
+ sctp_send_abort(m, iphlen, sh, 0, op_err, vrf_id, port);
+ return (NULL);
+ }
+ if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) &&
+ ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) ||
+ (asoc->peer_vtag == 0))) {
+ /*
+ * case B in Section 5.2.4 Table 2: MXAA or MOAA; my info
+ * should be ok, re-accept peer info
+ */
+ if (ntohl(initack_cp->init.initial_tsn) != asoc->init_seq_number) {
+ /*
+ * Extension of case C. If we hit this, then the
+ * random number generator returned the same vtag
+ * when we first sent our INIT-ACK and when we later
+ * sent our INIT. The side with the seq numbers that
+ * are different will be the one that normally
+ * would have hit case C. This in effect "extends"
+ * our vtags in this collision case to be 64 bits.
+ * The same collision could occur aka you get both
+ * vtag and seq number the same twice in a row.. but
+ * is much less likely. If it did happen then we
+ * would proceed through and bring up the assoc.. we
+ * may end up with the wrong stream setup however..
+ * which would be bad.. but there is no way to
+ * tell.. until we send on a stream that does not
+ * exist :-)
+ */
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 7;
+
+ return (NULL);
+ }
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 8;
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_14);
+ sctp_stop_all_cookie_timers(stcb);
+ /*
+ * since we did not send a HB make sure we don't double
+ * things
+ */
+ net->hb_responded = 1;
+ if (stcb->asoc.sctp_autoclose_ticks &&
+ sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) {
+ sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb,
+ NULL);
+ }
+ asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd);
+ asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams);
+
+ /* Note last_cwr_tsn? where is this used? */
+ asoc->last_cwr_tsn = asoc->init_seq_number - 1;
+ if (ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) {
+ /*
+ * Ok the peer probably discarded our data (if we
+ * echoed a cookie+data). So anything on the
+ * sent_queue should be marked for retransmit, we
+ * may not get something to kick us so it COULD
+ * still take a timeout to move these.. but it can't
+ * hurt to mark them.
+ */
+ struct sctp_tmit_chunk *chk;
+
+ TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
+ if (chk->sent < SCTP_DATAGRAM_RESEND) {
+ chk->sent = SCTP_DATAGRAM_RESEND;
+ sctp_flight_size_decrease(chk);
+ sctp_total_flight_decrease(stcb, chk);
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ spec_flag++;
+ }
+ }
+
+ }
+ /* process the INIT info (peer's info) */
+ retval = sctp_process_init(init_cp, stcb, net);
+ if (retval < 0) {
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 9;
+ return (NULL);
+ }
+ if (sctp_load_addresses_from_init(stcb, m, iphlen,
+ init_offset + sizeof(struct sctp_init_chunk),
+ initack_offset, sh, init_src)) {
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 10;
+ return (NULL);
+ }
+ if ((asoc->state & SCTP_STATE_COOKIE_WAIT) ||
+ (asoc->state & SCTP_STATE_COOKIE_ECHOED)) {
+ *notification = SCTP_NOTIFY_ASSOC_UP;
+
+ if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) &&
+ (inp->sctp_socket->so_qlimit == 0)) {
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+ stcb->sctp_ep->sctp_flags |=
+ SCTP_PCB_FLAGS_CONNECTED;
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return (NULL);
+ }
+#endif
+ soisconnected(stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ }
+ if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)
+ SCTP_STAT_INCR_COUNTER32(sctps_activeestab);
+ else
+ SCTP_STAT_INCR_COUNTER32(sctps_collisionestab);
+ SCTP_STAT_INCR_GAUGE32(sctps_currestab);
+ } else if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) {
+ SCTP_STAT_INCR_COUNTER32(sctps_restartestab);
+ } else {
+ SCTP_STAT_INCR_COUNTER32(sctps_collisionestab);
+ }
+ SCTP_SET_STATE(asoc, SCTP_STATE_OPEN);
+ if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) {
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
+ stcb->sctp_ep, stcb, asoc->primary_destination);
+ }
+ sctp_stop_all_cookie_timers(stcb);
+ sctp_toss_old_cookies(stcb, asoc);
+ sctp_send_cookie_ack(stcb);
+ if (spec_flag) {
+ /*
+ * only if we have retrans set do we do this. What
+ * this call does is get only the COOKIE-ACK out and
+ * then when we return the normal call to
+ * sctp_chunk_output will get the retrans out behind
+ * this.
+ */
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_COOKIE_ACK, SCTP_SO_NOT_LOCKED);
+ }
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 11;
+
+ return (stcb);
+ }
+ if ((ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag &&
+ ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) &&
+ cookie->tie_tag_my_vtag == asoc->my_vtag_nonce &&
+ cookie->tie_tag_peer_vtag == asoc->peer_vtag_nonce &&
+ cookie->tie_tag_peer_vtag != 0) {
+ struct sctpasochead *head;
+
+ if (asoc->peer_supports_nat) {
+ /*
+ * This is a gross gross hack. just call the
+ * cookie_new code since we are allowing a duplicate
+ * association. I hope this works...
+ */
+ return (sctp_process_cookie_new(m, iphlen, offset, sh, cookie, cookie_len,
+ inp, netp, init_src, notification,
+ auth_skipped, auth_offset, auth_len,
+ vrf_id, port));
+ }
+ /*
+ * case A in Section 5.2.4 Table 2: XXMM (peer restarted)
+ */
+ /* temp code */
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 12;
+ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_15);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_16);
+
+ *sac_assoc_id = sctp_get_associd(stcb);
+ /* notify upper layer */
+ *notification = SCTP_NOTIFY_ASSOC_RESTART;
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_OPEN) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT)) {
+ SCTP_STAT_INCR_GAUGE32(sctps_currestab);
+ }
+ if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) {
+ SCTP_STAT_INCR_GAUGE32(sctps_restartestab);
+ } else if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) {
+ SCTP_STAT_INCR_GAUGE32(sctps_collisionestab);
+ }
+ if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) {
+ SCTP_SET_STATE(asoc, SCTP_STATE_OPEN);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
+ stcb->sctp_ep, stcb, asoc->primary_destination);
+
+ } else if (!(asoc->state & SCTP_STATE_SHUTDOWN_SENT)) {
+ /* move to OPEN state, if not in SHUTDOWN_SENT */
+ SCTP_SET_STATE(asoc, SCTP_STATE_OPEN);
+ }
+ asoc->pre_open_streams =
+ ntohs(initack_cp->init.num_outbound_streams);
+ asoc->init_seq_number = ntohl(initack_cp->init.initial_tsn);
+ asoc->sending_seq = asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number;
+ asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1;
+
+ asoc->last_cwr_tsn = asoc->init_seq_number - 1;
+ asoc->asconf_seq_in = asoc->last_acked_seq = asoc->init_seq_number - 1;
+
+ asoc->str_reset_seq_in = asoc->init_seq_number;
+
+ asoc->advanced_peer_ack_point = asoc->last_acked_seq;
+ if (asoc->mapping_array) {
+ memset(asoc->mapping_array, 0,
+ asoc->mapping_array_size);
+ }
+ if (asoc->nr_mapping_array) {
+ memset(asoc->nr_mapping_array, 0,
+ asoc->mapping_array_size);
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_INP_INFO_WLOCK();
+ SCTP_INP_WLOCK(stcb->sctp_ep);
+ SCTP_TCB_LOCK(stcb);
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ /* send up all the data */
+ SCTP_TCB_SEND_LOCK(stcb);
+
+ sctp_report_all_outbound(stcb, 1, SCTP_SO_NOT_LOCKED);
+ for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+ stcb->asoc.strmout[i].stream_no = i;
+ stcb->asoc.strmout[i].next_sequence_sent = 0;
+ stcb->asoc.strmout[i].last_msg_incomplete = 0;
+ }
+ /* process the INIT-ACK info (my info) */
+ asoc->my_vtag = ntohl(initack_cp->init.initiate_tag);
+ asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd);
+
+ /* pull from vtag hash */
+ LIST_REMOVE(stcb, sctp_asocs);
+ /* re-insert to new vtag position */
+ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag,
+ SCTP_BASE_INFO(hashasocmark))];
+ /*
+ * put it in the bucket in the vtag hash of assoc's for the
+ * system
+ */
+ LIST_INSERT_HEAD(head, stcb, sctp_asocs);
+
+ /* process the INIT info (peer's info) */
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ SCTP_INP_WUNLOCK(stcb->sctp_ep);
+ SCTP_INP_INFO_WUNLOCK();
+
+ retval = sctp_process_init(init_cp, stcb, net);
+ if (retval < 0) {
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 13;
+
+ return (NULL);
+ }
+ /*
+ * since we did not send a HB make sure we don't double
+ * things
+ */
+ net->hb_responded = 1;
+
+ if (sctp_load_addresses_from_init(stcb, m, iphlen,
+ init_offset + sizeof(struct sctp_init_chunk),
+ initack_offset, sh, init_src)) {
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 14;
+
+ return (NULL);
+ }
+ /* respond with a COOKIE-ACK */
+ sctp_stop_all_cookie_timers(stcb);
+ sctp_toss_old_cookies(stcb, asoc);
+ sctp_send_cookie_ack(stcb);
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 15;
+
+ return (stcb);
+ }
+ if (how_indx < sizeof(asoc->cookie_how))
+ asoc->cookie_how[how_indx] = 16;
+ /* all other cases... */
+ return (NULL);
+}
+
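+/*
+ * For reference, a compact restatement of how the Section 5.2.4 Table 2
+ * cases map onto the checks in sctp_process_cookie_existing() above; this
+ * only summarizes the code above (the numbers are the cookie_how codes
+ * recorded in each branch), it adds no new rule:
+ *
+ *   case D (MMAA): both embedded tags match my_vtag/peer_vtag
+ *       -> process the INIT, move to OPEN if needed, send COOKIE-ACK (5)
+ *   case C (XMOO): my tag differs, peer tag matches, tie-tags are zero
+ *       -> silently discard (6)
+ *   case B (MXAA/MOAA): my tag matches, peer tag differs or is zero
+ *       -> re-accept peer info, send COOKIE-ACK (11)
+ *   case A (XXMM): both tags differ, tie-tags match our nonces
+ *       -> peer restart; reset the association, notify ASSOC_RESTART (15)
+ */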
+
+/*
+ * handle a state cookie for a new association
+ * m: input packet mbuf chain -- assumes a pullup on IP/SCTP/COOKIE-ECHO
+ *    chunk; note: this is a "split" mbuf and the cookie signature does
+ *    not exist
+ * offset: offset into mbuf to the cookie-echo chunk
+ * length: length of the cookie chunk
+ * to: where the init was from
+ * returns a new TCB
+ */
+struct sctp_tcb *
+sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
+ struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len,
+ struct sctp_inpcb *inp, struct sctp_nets **netp,
+ struct sockaddr *init_src, int *notification,
+ int auth_skipped, uint32_t auth_offset, uint32_t auth_len,
+ uint32_t vrf_id, uint16_t port)
+{
+ struct sctp_tcb *stcb;
+ struct sctp_init_chunk *init_cp, init_buf;
+ struct sctp_init_ack_chunk *initack_cp, initack_buf;
+ struct sockaddr_storage sa_store;
+ struct sockaddr *initack_src = (struct sockaddr *)&sa_store;
+ struct sockaddr_in *sin;
+ struct sockaddr_in6 *sin6;
+ struct sctp_association *asoc;
+ int chk_length;
+ int init_offset, initack_offset, initack_limit;
+ int retval;
+ int error = 0;
+ uint32_t old_tag;
+ uint8_t auth_chunk_buf[SCTP_PARAM_BUFFER_SIZE];
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+ so = SCTP_INP_SO(inp);
+#endif
+
+ /*
+ * find and validate the INIT chunk in the cookie (peer's info); the
+ * INIT should start after the cookie-echo header struct (chunk
+ * header, state cookie header struct)
+ */
+ init_offset = offset + sizeof(struct sctp_cookie_echo_chunk);
+ init_cp = (struct sctp_init_chunk *)
+ sctp_m_getptr(m, init_offset, sizeof(struct sctp_init_chunk),
+ (uint8_t *) & init_buf);
+ if (init_cp == NULL) {
+ /* could not pull a INIT chunk in cookie */
+ SCTPDBG(SCTP_DEBUG_INPUT1,
+ "process_cookie_new: could not pull INIT chunk hdr\n");
+ return (NULL);
+ }
+ chk_length = ntohs(init_cp->ch.chunk_length);
+ if (init_cp->ch.chunk_type != SCTP_INITIATION) {
+ SCTPDBG(SCTP_DEBUG_INPUT1, "HUH? process_cookie_new: could not find INIT chunk!\n");
+ return (NULL);
+ }
+ initack_offset = init_offset + SCTP_SIZE32(chk_length);
+ /*
+ * find and validate the INIT-ACK chunk in the cookie (my info); the
+ * INIT-ACK follows the INIT chunk
+ */
+ initack_cp = (struct sctp_init_ack_chunk *)
+ sctp_m_getptr(m, initack_offset, sizeof(struct sctp_init_ack_chunk),
+ (uint8_t *) & initack_buf);
+ if (initack_cp == NULL) {
+ /* could not pull INIT-ACK chunk in cookie */
+ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: could not pull INIT-ACK chunk hdr\n");
+ return (NULL);
+ }
+ chk_length = ntohs(initack_cp->ch.chunk_length);
+ if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) {
+ return (NULL);
+ }
+ /*
+ * NOTE: We can't use the INIT_ACK's chk_length to determine the
+ * "initack_limit" value. This is because the chk_length field
+ * includes the length of the cookie, but the cookie is omitted when
+ * the INIT and INIT_ACK are tacked onto the cookie...
+ */
+ initack_limit = offset + cookie_len;
+
+ /*
+ * now that we know the INIT/INIT-ACK are in place, create a new TCB
+ * and populate
+ */
+
+ /*
+ * Here we do a trick: we pass in NULL for the proc/thread argument.
+ * We do this since in effect we only use the p argument when the
+ * socket is unbound and we must do an implicit bind. Since we are
+ * getting a cookie, we cannot be unbound.
+ */
+ stcb = sctp_aloc_assoc(inp, init_src, &error,
+ ntohl(initack_cp->init.initiate_tag), vrf_id,
+ (struct thread *)NULL
+ );
+ if (stcb == NULL) {
+ struct mbuf *op_err;
+
+ /* memory problem? */
+ SCTPDBG(SCTP_DEBUG_INPUT1,
+ "process_cookie_new: no room for another TCB!\n");
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_OUT_OF_RESC);
+
+ sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen,
+ sh, op_err, vrf_id, port);
+ return (NULL);
+ }
+ /* get the correct sctp_nets */
+ if (netp)
+ *netp = sctp_findnet(stcb, init_src);
+
+ asoc = &stcb->asoc;
+ /* get scope variables out of cookie */
+ asoc->ipv4_local_scope = cookie->ipv4_scope;
+ asoc->site_scope = cookie->site_scope;
+ asoc->local_scope = cookie->local_scope;
+ asoc->loopback_scope = cookie->loopback_scope;
+
+ if ((asoc->ipv4_addr_legal != cookie->ipv4_addr_legal) ||
+ (asoc->ipv6_addr_legal != cookie->ipv6_addr_legal)) {
+ struct mbuf *op_err;
+
+ /*
+ * Houston we have a problem. The EP changed while the
+ * cookie was in flight. Only recourse is to abort the
+ * association.
+ */
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_OUT_OF_RESC);
+ sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen,
+ sh, op_err, vrf_id, port);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+#endif
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_16);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ return (NULL);
+ }
+ /* process the INIT-ACK info (my info) */
+ old_tag = asoc->my_vtag;
+ asoc->my_vtag = ntohl(initack_cp->init.initiate_tag);
+ asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd);
+ asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams);
+ asoc->init_seq_number = ntohl(initack_cp->init.initial_tsn);
+ asoc->sending_seq = asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number;
+ asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1;
+ asoc->last_cwr_tsn = asoc->init_seq_number - 1;
+ asoc->asconf_seq_in = asoc->last_acked_seq = asoc->init_seq_number - 1;
+ asoc->str_reset_seq_in = asoc->init_seq_number;
+
+ asoc->advanced_peer_ack_point = asoc->last_acked_seq;
+
+ /* process the INIT info (peer's info) */
+ if (netp)
+ retval = sctp_process_init(init_cp, stcb, *netp);
+ else
+ retval = 0;
+ if (retval < 0) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+#endif
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_16);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ return (NULL);
+ }
+ /* load all addresses */
+ if (sctp_load_addresses_from_init(stcb, m, iphlen,
+ init_offset + sizeof(struct sctp_init_chunk), initack_offset, sh,
+ init_src)) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+#endif
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_17);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ return (NULL);
+ }
+ /*
+ * verify any preceding AUTH chunk that was skipped
+ */
+ /* pull the local authentication parameters from the cookie/init-ack */
+ sctp_auth_get_cookie_params(stcb, m,
+ initack_offset + sizeof(struct sctp_init_ack_chunk),
+ initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk)));
+ if (auth_skipped) {
+ struct sctp_auth_chunk *auth;
+
+ auth = (struct sctp_auth_chunk *)
+ sctp_m_getptr(m, auth_offset, auth_len, auth_chunk_buf);
+ if ((auth == NULL) || sctp_handle_auth(stcb, auth, m, auth_offset)) {
+ /* auth HMAC failed, dump the assoc and packet */
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "COOKIE-ECHO: AUTH failed\n");
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+#endif
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_18);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ return (NULL);
+ } else {
+ /* remaining chunks checked... good to go */
+ stcb->asoc.authenticated = 1;
+ }
+ }
+ /* update current state */
+ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n");
+ SCTP_SET_STATE(asoc, SCTP_STATE_OPEN);
+ if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) {
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
+ stcb->sctp_ep, stcb, asoc->primary_destination);
+ }
+ sctp_stop_all_cookie_timers(stcb);
+ SCTP_STAT_INCR_COUNTER32(sctps_passiveestab);
+ SCTP_STAT_INCR_GAUGE32(sctps_currestab);
+
+ /*
+ * if we're doing ASCONFs, check to see if we have any new local
+ * addresses that need to get added to the peer (e.g. addresses
+ * changed while cookie echo in flight). This needs to be done
+ * after we go to the OPEN state to do the correct asconf
+ * processing. Else, make sure we have the correct addresses in our
+ * lists
+ */
+
+ /* warning, we re-use sin, sin6, sa_store here! */
+ /* pull in local_address (our "from" address) */
+ if (cookie->laddr_type == SCTP_IPV4_ADDRESS) {
+ /* source addr is IPv4 */
+ sin = (struct sockaddr_in *)initack_src;
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(struct sockaddr_in);
+ sin->sin_addr.s_addr = cookie->laddress[0];
+ } else if (cookie->laddr_type == SCTP_IPV6_ADDRESS) {
+ /* source addr is IPv6 */
+ sin6 = (struct sockaddr_in6 *)initack_src;
+ memset(sin6, 0, sizeof(*sin6));
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(struct sockaddr_in6);
+ sin6->sin6_scope_id = cookie->scope_id;
+ memcpy(&sin6->sin6_addr, cookie->laddress,
+ sizeof(sin6->sin6_addr));
+ } else {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+#endif
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_19);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ return (NULL);
+ }
+
+ /* set up to notify upper layer */
+ *notification = SCTP_NOTIFY_ASSOC_UP;
+ if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) &&
+ (inp->sctp_socket->so_qlimit == 0)) {
+ /*
+ * This is an endpoint that called connect(); how it got a
+ * cookie that is NEW is a bit of a mystery. It must be that
+ * the INIT was sent, but before it got there.. a complete
+ * INIT/INIT-ACK/COOKIE arrived. But of course then it
+ * should have gone to the other code.. not here.. oh well..
+ * a bit of protection is worth having..
+ */
+ stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED;
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return (NULL);
+ }
+#endif
+ soisconnected(stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ } else if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
+ (inp->sctp_socket->so_qlimit)) {
+ /*
+ * We don't want to do anything with this one, since it is
+ * the listening guy. The timer will get started for
+ * accepted connections in the caller.
+ */
+ ;
+ }
+ /* since we did not send a HB make sure we don't double things */
+ if ((netp) && (*netp))
+ (*netp)->hb_responded = 1;
+
+ if (stcb->asoc.sctp_autoclose_ticks &&
+ sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) {
+ sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL);
+ }
+ /* calculate the RTT */
+ (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
+ if ((netp) && (*netp)) {
+ (*netp)->RTO = sctp_calculate_rto(stcb, asoc, *netp,
+ &cookie->time_entered, sctp_align_unsafe_makecopy);
+ }
+ /* respond with a COOKIE-ACK */
+ sctp_send_cookie_ack(stcb);
+
+ /*
+ * check the address lists for any ASCONFs that need to be sent
+ * AFTER the cookie-ack is sent
+ */
+ sctp_check_address_list(stcb, m,
+ initack_offset + sizeof(struct sctp_init_ack_chunk),
+ initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk)),
+ initack_src, cookie->local_scope, cookie->site_scope,
+ cookie->ipv4_scope, cookie->loopback_scope);
+
+
+ return (stcb);
+}
+
+/*
+ * CODE LIKE THIS NEEDS TO RUN IF the peer supports the NAT extension, i.e.
+ * we NEED to make sure we are not already using the vtag. If so we
+ * need to send back an ABORT-TRY-AGAIN-WITH-NEW-TAG. No middle box bit!
+ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag,
+ SCTP_BASE_INFO(hashasocmark))];
+ LIST_FOREACH(stcb, head, sctp_asocs) {
+ if ((stcb->asoc.my_vtag == tag) && (stcb->rport == rport) && (inp == stcb->sctp_ep)) {
+ -- SEND ABORT - TRY AGAIN --
+ }
+ }
+*/
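+
+/*
+ * A minimal sketch of what the "-- SEND ABORT - TRY AGAIN --" step above
+ * could look like, kept commented out like the note itself.  It assumes
+ * the SCTP_CAUSE_NAT_COLLIDING_STATE cause used in
+ * sctp_process_cookie_existing() is the intended one, and that m, iphlen,
+ * sh, vrf_id and port are available from the caller:
+ *
+ *	struct sctp_paramhdr *ph;
+ *
+ *	op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr),
+ *	    0, M_DONTWAIT, 1, MT_DATA);
+ *	if (op_err != NULL) {
+ *		SCTP_BUF_LEN(op_err) = sizeof(struct sctp_paramhdr);
+ *		ph = mtod(op_err, struct sctp_paramhdr *);
+ *		ph->param_type = htons(SCTP_CAUSE_NAT_COLLIDING_STATE);
+ *		ph->param_length = htons(sizeof(struct sctp_paramhdr));
+ *		sctp_send_abort(m, iphlen, sh, 0, op_err, vrf_id, port);
+ *	}
+ */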
+
+/*
+ * handles a COOKIE-ECHO message
+ * stcb: modified to either a new or left as existing (non-NULL) TCB
+ */
+static struct mbuf *
+sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
+ struct sctphdr *sh, struct sctp_cookie_echo_chunk *cp,
+ struct sctp_inpcb **inp_p, struct sctp_tcb **stcb, struct sctp_nets **netp,
+ int auth_skipped, uint32_t auth_offset, uint32_t auth_len,
+ struct sctp_tcb **locked_tcb, uint32_t vrf_id, uint16_t port)
+{
+ struct sctp_state_cookie *cookie;
+ struct sockaddr_in6 sin6;
+ struct sockaddr_in sin;
+ struct sctp_tcb *l_stcb = *stcb;
+ struct sctp_inpcb *l_inp;
+ struct sockaddr *to;
+ sctp_assoc_t sac_restart_id;
+ struct sctp_pcb *ep;
+ struct mbuf *m_sig;
+ uint8_t calc_sig[SCTP_SIGNATURE_SIZE], tmp_sig[SCTP_SIGNATURE_SIZE];
+ uint8_t *sig;
+ uint8_t cookie_ok = 0;
+ unsigned int size_of_pkt, sig_offset, cookie_offset;
+ unsigned int cookie_len;
+ struct timeval now;
+ struct timeval time_expires;
+ struct sockaddr_storage dest_store;
+ struct sockaddr *localep_sa = (struct sockaddr *)&dest_store;
+ struct ip *iph;
+ int notification = 0;
+ struct sctp_nets *netl;
+ int had_a_existing_tcb = 0;
+
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "sctp_handle_cookie: handling COOKIE-ECHO\n");
+
+ if (inp_p == NULL) {
+ return (NULL);
+ }
+ /* First get the destination address setup too. */
+ iph = mtod(m, struct ip *);
+ switch (iph->ip_v) {
+ case IPVERSION:
+ {
+ /* its IPv4 */
+ struct sockaddr_in *lsin;
+
+ lsin = (struct sockaddr_in *)(localep_sa);
+ memset(lsin, 0, sizeof(*lsin));
+ lsin->sin_family = AF_INET;
+ lsin->sin_len = sizeof(*lsin);
+ lsin->sin_port = sh->dest_port;
+ lsin->sin_addr.s_addr = iph->ip_dst.s_addr;
+ size_of_pkt = SCTP_GET_IPV4_LENGTH(iph);
+ break;
+ }
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ {
+ /* its IPv6 */
+ struct ip6_hdr *ip6;
+ struct sockaddr_in6 *lsin6;
+
+ lsin6 = (struct sockaddr_in6 *)(localep_sa);
+ memset(lsin6, 0, sizeof(*lsin6));
+ lsin6->sin6_family = AF_INET6;
+ lsin6->sin6_len = sizeof(struct sockaddr_in6);
+ ip6 = mtod(m, struct ip6_hdr *);
+ lsin6->sin6_port = sh->dest_port;
+ lsin6->sin6_addr = ip6->ip6_dst;
+ size_of_pkt = SCTP_GET_IPV6_LENGTH(ip6) + iphlen;
+ break;
+ }
+#endif
+ default:
+ return (NULL);
+ }
+
+ cookie = &cp->cookie;
+ cookie_offset = offset + sizeof(struct sctp_chunkhdr);
+ cookie_len = ntohs(cp->ch.chunk_length);
+
+ if ((cookie->peerport != sh->src_port) &&
+ (cookie->myport != sh->dest_port) &&
+ (cookie->my_vtag != sh->v_tag)) {
+ /*
+ * invalid ports or bad tag. Note that we always leave the
+ * v_tag in the header in network order and when we stored
+ * it in the my_vtag slot we also left it in network order.
+ * This maintains the match even though it may be in the
+ * opposite byte order of the machine :->
+ */
+ return (NULL);
+ }
+ if (cookie_len > size_of_pkt ||
+ cookie_len < sizeof(struct sctp_cookie_echo_chunk) +
+ sizeof(struct sctp_init_chunk) +
+ sizeof(struct sctp_init_ack_chunk) + SCTP_SIGNATURE_SIZE) {
+ /* cookie too long! or too small */
+ return (NULL);
+ }
+ /*
+ * split off the signature into its own mbuf (since it should not be
+ * calculated in the sctp_hmac_m() call).
+ */
+ sig_offset = offset + cookie_len - SCTP_SIGNATURE_SIZE;
+ if (sig_offset > size_of_pkt) {
+ /* packet not correct size! */
+ /* XXX this may already be accounted for earlier... */
+ return (NULL);
+ }
+ m_sig = m_split(m, sig_offset, M_DONTWAIT);
+ if (m_sig == NULL) {
+ /* out of memory or ?? */
+ return (NULL);
+ }
+#ifdef SCTP_MBUF_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ struct mbuf *mat;
+
+ mat = m_sig;
+ while (mat) {
+ if (SCTP_BUF_IS_EXTENDED(mat)) {
+ sctp_log_mb(mat, SCTP_MBUF_SPLIT);
+ }
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ }
+#endif
+
+ /*
+ * compute the signature/digest for the cookie
+ */
+ ep = &(*inp_p)->sctp_ep;
+ l_inp = *inp_p;
+ if (l_stcb) {
+ SCTP_TCB_UNLOCK(l_stcb);
+ }
+ SCTP_INP_RLOCK(l_inp);
+ if (l_stcb) {
+ SCTP_TCB_LOCK(l_stcb);
+ }
+ /* which cookie is it? */
+ if ((cookie->time_entered.tv_sec < (long)ep->time_of_secret_change) &&
+ (ep->current_secret_number != ep->last_secret_number)) {
+ /* it's the old cookie */
+ (void)sctp_hmac_m(SCTP_HMAC,
+ (uint8_t *) ep->secret_key[(int)ep->last_secret_number],
+ SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0);
+ } else {
+ /* it's the current cookie */
+ (void)sctp_hmac_m(SCTP_HMAC,
+ (uint8_t *) ep->secret_key[(int)ep->current_secret_number],
+ SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0);
+ }
+ /* get the signature */
+ SCTP_INP_RUNLOCK(l_inp);
+ sig = (uint8_t *) sctp_m_getptr(m_sig, 0, SCTP_SIGNATURE_SIZE, (uint8_t *) & tmp_sig);
+ if (sig == NULL) {
+ /* couldn't find signature */
+ sctp_m_freem(m_sig);
+ return (NULL);
+ }
+ /* compare the received digest with the computed digest */
+ if (memcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) != 0) {
+ /* try the old cookie? */
+ if ((cookie->time_entered.tv_sec == (long)ep->time_of_secret_change) &&
+ (ep->current_secret_number != ep->last_secret_number)) {
+ /* compute digest with old */
+ (void)sctp_hmac_m(SCTP_HMAC,
+ (uint8_t *) ep->secret_key[(int)ep->last_secret_number],
+ SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0);
+ /* compare */
+ if (memcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) == 0)
+ cookie_ok = 1;
+ }
+ } else {
+ cookie_ok = 1;
+ }
+
+ /*
+ * Now before we continue we must reconstruct our mbuf so that
+ * normal processing of any other chunks will work.
+ */
+ {
+ struct mbuf *m_at;
+
+ m_at = m;
+ while (SCTP_BUF_NEXT(m_at) != NULL) {
+ m_at = SCTP_BUF_NEXT(m_at);
+ }
+ SCTP_BUF_NEXT(m_at) = m_sig;
+ }
+
+ if (cookie_ok == 0) {
+ SCTPDBG(SCTP_DEBUG_INPUT2, "handle_cookie_echo: cookie signature validation failed!\n");
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "offset = %u, cookie_offset = %u, sig_offset = %u\n",
+ (uint32_t) offset, cookie_offset, sig_offset);
+ return (NULL);
+ }
+ /*
+ * check the cookie timestamps to be sure it's not stale
+ */
+ (void)SCTP_GETTIME_TIMEVAL(&now);
+ /* Expire time is in Ticks, so we convert to seconds */
+ time_expires.tv_sec = cookie->time_entered.tv_sec + TICKS_TO_SEC(cookie->cookie_life);
+ time_expires.tv_usec = cookie->time_entered.tv_usec;
+ /*
+ * TODO sctp_constants.h needs alternative time macros when _KERNEL
+ * is undefined.
+ */
+ if (timevalcmp(&now, &time_expires, >)) {
+ /* cookie is stale! */
+ struct mbuf *op_err;
+ struct sctp_stale_cookie_msg *scm;
+ uint32_t tim;
+
+ op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_stale_cookie_msg),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (op_err == NULL) {
+ /* FOOBAR */
+ return (NULL);
+ }
+ /* Set the len */
+ SCTP_BUF_LEN(op_err) = sizeof(struct sctp_stale_cookie_msg);
+ scm = mtod(op_err, struct sctp_stale_cookie_msg *);
+ scm->ph.param_type = htons(SCTP_CAUSE_STALE_COOKIE);
+ scm->ph.param_length = htons((sizeof(struct sctp_paramhdr) +
+ (sizeof(uint32_t))));
+ /* seconds to usec */
+ tim = (now.tv_sec - time_expires.tv_sec) * 1000000;
+ /* add in usec */
+ if (tim == 0)
+ tim = now.tv_usec - cookie->time_entered.tv_usec;
+ scm->time_usec = htonl(tim);
+ sctp_send_operr_to(m, iphlen, op_err, cookie->peers_vtag,
+ vrf_id, port);
+ return (NULL);
+ }
+ /*
+ * Now we must see with the lookup address if we have an existing
+ * asoc. This will only happen if we were in the COOKIE-WAIT state
+ * and an INIT collided with us and somewhere the peer sent the
+ * cookie on another address besides the single address our assoc
+ * had for him. In this case we will have one of the tie-tags set at
+ * least AND the address field in the cookie can be used to look it
+ * up.
+ */
+ to = NULL;
+ if (cookie->addr_type == SCTP_IPV6_ADDRESS) {
+ memset(&sin6, 0, sizeof(sin6));
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_len = sizeof(sin6);
+ sin6.sin6_port = sh->src_port;
+ sin6.sin6_scope_id = cookie->scope_id;
+ memcpy(&sin6.sin6_addr.s6_addr, cookie->address,
+ sizeof(sin6.sin6_addr.s6_addr));
+ to = (struct sockaddr *)&sin6;
+ } else if (cookie->addr_type == SCTP_IPV4_ADDRESS) {
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_len = sizeof(sin);
+ sin.sin_port = sh->src_port;
+ sin.sin_addr.s_addr = cookie->address[0];
+ to = (struct sockaddr *)&sin;
+ } else {
+ /* This should not happen */
+ return (NULL);
+ }
+ if ((*stcb == NULL) && to) {
+ /* Yep, lets check */
+ *stcb = sctp_findassociation_ep_addr(inp_p, to, netp, localep_sa, NULL);
+ if (*stcb == NULL) {
+ /*
+ * We should have only got back the same inp. If we
+ * got back a different ep we have a problem. The
+ * original findep got back l_inp and now
+ */
+ if (l_inp != *inp_p) {
+ SCTP_PRINTF("Bad problem find_ep got a diff inp then special_locate?\n");
+ }
+ } else {
+ if (*locked_tcb == NULL) {
+ /*
+ * In this case we found the assoc only
+ * after we locked the create lock. This
+ * means we are in a colliding case and we
+ * must make sure that we unlock the tcb if
+ * its one of the cases where we throw away
+ * the incoming packets.
+ */
+ *locked_tcb = *stcb;
+
+ /*
+ * We must also increment the inp ref count
+ * since the ref_count flag was set when we
+ * did not find the TCB, now we found it
+ * which reduces the refcount.. we must
+ * raise it back out to balance it all :-)
+ */
+ SCTP_INP_INCR_REF((*stcb)->sctp_ep);
+ if ((*stcb)->sctp_ep != l_inp) {
+ SCTP_PRINTF("Huh? ep:%p diff then l_inp:%p?\n",
+ (*stcb)->sctp_ep, l_inp);
+ }
+ }
+ }
+ }
+ if (to == NULL) {
+ return (NULL);
+ }
+ cookie_len -= SCTP_SIGNATURE_SIZE;
+ if (*stcb == NULL) {
+ /* this is the "normal" case... get a new TCB */
+ *stcb = sctp_process_cookie_new(m, iphlen, offset, sh, cookie,
+ cookie_len, *inp_p, netp, to, &notification,
+ auth_skipped, auth_offset, auth_len, vrf_id, port);
+ } else {
+ /* this is abnormal... cookie-echo on existing TCB */
+ had_a_existing_tcb = 1;
+ *stcb = sctp_process_cookie_existing(m, iphlen, offset, sh,
+ cookie, cookie_len, *inp_p, *stcb, netp, to,
+ &notification, &sac_restart_id, vrf_id, auth_skipped, auth_offset, auth_len, port);
+ }
+
+ if (*stcb == NULL) {
+ /* still no TCB... must be bad cookie-echo */
+ return (NULL);
+ }
+ /*
+ * Ok, we built an association so confirm the address we sent the
+ * INIT-ACK to.
+ */
+ netl = sctp_findnet(*stcb, to);
+ /*
+ * This code should in theory NOT run but
+ */
+ if (netl == NULL) {
+ /* TSNH! Huh, why do I need to add this address here? */
+ int ret;
+
+ ret = sctp_add_remote_addr(*stcb, to, SCTP_DONOT_SETSCOPE,
+ SCTP_IN_COOKIE_PROC);
+ netl = sctp_findnet(*stcb, to);
+ }
+ if (netl) {
+ if (netl->dest_state & SCTP_ADDR_UNCONFIRMED) {
+ netl->dest_state &= ~SCTP_ADDR_UNCONFIRMED;
+ (void)sctp_set_primary_addr((*stcb), (struct sockaddr *)NULL,
+ netl);
+ sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED,
+ (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED);
+ }
+ }
+ if (*stcb) {
+ sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, *inp_p,
+ *stcb, NULL);
+ }
+ if ((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) {
+ if (!had_a_existing_tcb ||
+ (((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) {
+ /*
+ * If we have a NEW cookie or the connect never
+ * reached the connected state during collision we
+ * must do the TCP accept thing.
+ */
+ struct socket *so, *oso;
+ struct sctp_inpcb *inp;
+
+ if (notification == SCTP_NOTIFY_ASSOC_RESTART) {
+ /*
+ * For a restart we will keep the same
+ * socket, no need to do anything. I THINK!!
+ */
+ sctp_ulp_notify(notification, *stcb, 0, (void *)&sac_restart_id, SCTP_SO_NOT_LOCKED);
+ return (m);
+ }
+ oso = (*inp_p)->sctp_socket;
+ atomic_add_int(&(*stcb)->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK((*stcb));
+ so = sonewconn(oso, 0
+ );
+ SCTP_TCB_LOCK((*stcb));
+ atomic_subtract_int(&(*stcb)->asoc.refcnt, 1);
+
+ if (so == NULL) {
+ struct mbuf *op_err;
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *pcb_so;
+
+#endif
+ /* Too many sockets */
+ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: no room for another socket!\n");
+ op_err = sctp_generate_invmanparam(SCTP_CAUSE_OUT_OF_RESC);
+ sctp_abort_association(*inp_p, NULL, m, iphlen,
+ sh, op_err, vrf_id, port);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ pcb_so = SCTP_INP_SO(*inp_p);
+ atomic_add_int(&(*stcb)->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK((*stcb));
+ SCTP_SOCKET_LOCK(pcb_so, 1);
+ SCTP_TCB_LOCK((*stcb));
+ atomic_subtract_int(&(*stcb)->asoc.refcnt, 1);
+#endif
+ (void)sctp_free_assoc(*inp_p, *stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_20);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(pcb_so, 1);
+#endif
+ return (NULL);
+ }
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ SCTP_INP_INCR_REF(inp);
+ /*
+ * We add the unbound flag here so that if we get an
+ * soabort() before we get the move_pcb done, we
+ * will properly clean up.
+ */
+ inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE |
+ SCTP_PCB_FLAGS_CONNECTED |
+ SCTP_PCB_FLAGS_IN_TCPPOOL |
+ SCTP_PCB_FLAGS_UNBOUND |
+ (SCTP_PCB_COPY_FLAGS & (*inp_p)->sctp_flags) |
+ SCTP_PCB_FLAGS_DONT_WAKE);
+ inp->sctp_features = (*inp_p)->sctp_features;
+ inp->sctp_mobility_features = (*inp_p)->sctp_mobility_features;
+ inp->sctp_socket = so;
+ inp->sctp_frag_point = (*inp_p)->sctp_frag_point;
+ inp->sctp_cmt_on_off = (*inp_p)->sctp_cmt_on_off;
+ inp->partial_delivery_point = (*inp_p)->partial_delivery_point;
+ inp->sctp_context = (*inp_p)->sctp_context;
+ inp->inp_starting_point_for_iterator = NULL;
+ /*
+ * copy in the authentication parameters from the
+ * original endpoint
+ */
+ if (inp->sctp_ep.local_hmacs)
+ sctp_free_hmaclist(inp->sctp_ep.local_hmacs);
+ inp->sctp_ep.local_hmacs =
+ sctp_copy_hmaclist((*inp_p)->sctp_ep.local_hmacs);
+ if (inp->sctp_ep.local_auth_chunks)
+ sctp_free_chunklist(inp->sctp_ep.local_auth_chunks);
+ inp->sctp_ep.local_auth_chunks =
+ sctp_copy_chunklist((*inp_p)->sctp_ep.local_auth_chunks);
+
+ /*
+ * Now we must move it from one hash table to
+ * another and get the tcb in the right place.
+ */
+
+ /*
+ * This is where the one-2-one socket is put into
+ * the accept state waiting for the accept!
+ */
+ if (*stcb) {
+ (*stcb)->asoc.state |= SCTP_STATE_IN_ACCEPT_QUEUE;
+ }
+ sctp_move_pcb_and_assoc(*inp_p, inp, *stcb);
+
+ atomic_add_int(&(*stcb)->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK((*stcb));
+
+ sctp_pull_off_control_to_new_inp((*inp_p), inp, *stcb,
+ 0);
+ SCTP_TCB_LOCK((*stcb));
+ atomic_subtract_int(&(*stcb)->asoc.refcnt, 1);
+
+
+ /*
+ * now we must check to see if we were aborted while
+ * the move was going on and the lock/unlock
+ * happened.
+ */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ /*
+ * yep it was, we leave the assoc attached
+ * to the socket since the sctp_inpcb_free()
+ * call will send an abort for us.
+ */
+ SCTP_INP_DECR_REF(inp);
+ return (NULL);
+ }
+ SCTP_INP_DECR_REF(inp);
+ /* Switch over to the new guy */
+ *inp_p = inp;
+ sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
+
+ /*
+ * Pull it from the incomplete queue and wake the
+ * guy
+ */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ atomic_add_int(&(*stcb)->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK((*stcb));
+ SCTP_SOCKET_LOCK(so, 1);
+#endif
+ soisconnected(so);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_TCB_LOCK((*stcb));
+ atomic_subtract_int(&(*stcb)->asoc.refcnt, 1);
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ return (m);
+ }
+ }
+ if ((notification) && ((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) {
+ sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
+ }
+ return (m);
+}
+
+static void
+sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp,
+ struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ /* cp must not be used, others call this without a c-ack :-) */
+ struct sctp_association *asoc;
+
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "sctp_handle_cookie_ack: handling COOKIE-ACK\n");
+ if (stcb == NULL)
+ return;
+
+ asoc = &stcb->asoc;
+
+ sctp_stop_all_cookie_timers(stcb);
+ /* process according to association state */
+ if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) {
+ /* state change only needed when I am in right state */
+ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n");
+ SCTP_SET_STATE(asoc, SCTP_STATE_OPEN);
+ if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) {
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
+ stcb->sctp_ep, stcb, asoc->primary_destination);
+
+ }
+ /* update RTO */
+ SCTP_STAT_INCR_COUNTER32(sctps_activeestab);
+ SCTP_STAT_INCR_GAUGE32(sctps_currestab);
+ if (asoc->overall_error_count == 0) {
+ net->RTO = sctp_calculate_rto(stcb, asoc, net,
+ &asoc->time_entered, sctp_align_safe_nocopy);
+ }
+ (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered);
+ sctp_ulp_notify(SCTP_NOTIFY_ASSOC_UP, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
+ if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+ stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED;
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return;
+ }
+#endif
+ soisconnected(stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ }
+ sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep,
+ stcb, net);
+ /*
+ * since we did not send a HB make sure we don't double
+ * things
+ */
+ net->hb_responded = 1;
+
+ if (stcb->asoc.sctp_autoclose_ticks &&
+ sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_AUTOCLOSE)) {
+ sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE,
+ stcb->sctp_ep, stcb, NULL);
+ }
+ /*
+ * send ASCONF if parameters are pending and ASCONFs are
+ * allowed (eg. addresses changed when init/cookie echo were
+ * in flight)
+ */
+ if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF)) &&
+ (stcb->asoc.peer_supports_asconf) &&
+ (!TAILQ_EMPTY(&stcb->asoc.asconf_queue))) {
+#ifdef SCTP_TIMER_BASED_ASCONF
+ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF,
+ stcb->sctp_ep, stcb,
+ stcb->asoc.primary_destination);
+#else
+ sctp_send_asconf(stcb, stcb->asoc.primary_destination,
+ SCTP_ADDR_NOT_LOCKED);
+#endif
+ }
+ }
+ /* Toss the cookie if I can */
+ sctp_toss_old_cookies(stcb, asoc);
+ if (!TAILQ_EMPTY(&asoc->sent_queue)) {
+ /* Restart the timer if we have pending data */
+ struct sctp_tmit_chunk *chk;
+
+ chk = TAILQ_FIRST(&asoc->sent_queue);
+ if (chk) {
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+ stcb, chk->whoTo);
+ }
+ }
+}
+
+static void
+sctp_handle_ecn_echo(struct sctp_ecne_chunk *cp,
+ struct sctp_tcb *stcb)
+{
+ struct sctp_nets *net;
+ struct sctp_tmit_chunk *lchk;
+ uint32_t tsn;
+
+ if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_ecne_chunk)) {
+ return;
+ }
+ SCTP_STAT_INCR(sctps_recvecne);
+ tsn = ntohl(cp->tsn);
+ /* ECN Nonce stuff: need a resync and disable the nonce sum check */
+ /* Also we make sure we disable the nonce_wait */
+ lchk = TAILQ_FIRST(&stcb->asoc.send_queue);
+ if (lchk == NULL) {
+ stcb->asoc.nonce_resync_tsn = stcb->asoc.sending_seq;
+ } else {
+ stcb->asoc.nonce_resync_tsn = lchk->rec.data.TSN_seq;
+ }
+ stcb->asoc.nonce_wait_for_ecne = 0;
+ stcb->asoc.nonce_sum_check = 0;
+
+ /* Find where it was sent, if possible */
+ net = NULL;
+ lchk = TAILQ_FIRST(&stcb->asoc.sent_queue);
+ while (lchk) {
+ if (lchk->rec.data.TSN_seq == tsn) {
+ net = lchk->whoTo;
+ break;
+ }
+ if (compare_with_wrap(lchk->rec.data.TSN_seq, tsn, MAX_SEQ))
+ break;
+ lchk = TAILQ_NEXT(lchk, sctp_next);
+ }
+ if (net == NULL)
+ /* default is we use the primary */
+ net = stcb->asoc.primary_destination;
+
+ if (compare_with_wrap(tsn, stcb->asoc.last_cwr_tsn, MAX_TSN)) {
+ /*
+ * JRS - Use the congestion control given in the pluggable
+ * CC module
+ */
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo(stcb, net);
+ /*
+ * we reduce once every RTT. So we will only lower cwnd at
+ * the next sending seq i.e. the resync_tsn.
+ */
+ stcb->asoc.last_cwr_tsn = stcb->asoc.nonce_resync_tsn;
+ }
+ /*
+ * We always send a CWR this way: if our previous one was lost, our
+ * peer still gets an update, and if it is not yet time to reduce
+ * again, the peer still gets the CWR.
+ */
+ sctp_send_cwr(stcb, net, tsn);
+}
+
+static void
+sctp_handle_ecn_cwr(struct sctp_cwr_chunk *cp, struct sctp_tcb *stcb)
+{
+ /*
+ * Here we get a CWR from the peer. We must look in the outqueue and
+ * make sure that we have a covered ECNE in the control chunk part.
+ * If so, remove it.
+ */
+ struct sctp_tmit_chunk *chk;
+ struct sctp_ecne_chunk *ecne;
+
+ TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) {
+ if (chk->rec.chunk_id.id != SCTP_ECN_ECHO) {
+ continue;
+ }
+ /*
+ * Look for and remove it if it is the right TSN. Since there
+ * is only ONE ECNE on the control queue at any one time, we
+ * don't need to worry about more than one!
+ */
+ ecne = mtod(chk->data, struct sctp_ecne_chunk *);
+ if (compare_with_wrap(ntohl(cp->tsn), ntohl(ecne->tsn),
+ MAX_TSN) || (cp->tsn == ecne->tsn)) {
+ /* this covers this ECNE, we can remove it */
+ stcb->asoc.ecn_echo_cnt_onq--;
+ TAILQ_REMOVE(&stcb->asoc.control_send_queue, chk,
+ sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ stcb->asoc.ctrl_queue_cnt--;
+ sctp_free_a_chunk(stcb, chk);
+ break;
+ }
+ }
+}
+
+static void
+sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp,
+ struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ struct sctp_association *asoc;
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "sctp_handle_shutdown_complete: handling SHUTDOWN-COMPLETE\n");
+ if (stcb == NULL)
+ return;
+
+ asoc = &stcb->asoc;
+ /* process according to association state */
+ if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) {
+ /* unexpected SHUTDOWN-COMPLETE... so ignore... */
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "sctp_handle_shutdown_complete: not in SCTP_STATE_SHUTDOWN_ACK_SENT --- ignore\n");
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ /* notify upper layer protocol */
+ if (stcb->sctp_socket) {
+ sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
+ /* are the queues empty? they should be */
+ if (!TAILQ_EMPTY(&asoc->send_queue) ||
+ !TAILQ_EMPTY(&asoc->sent_queue) ||
+ !TAILQ_EMPTY(&asoc->out_wheel)) {
+ sctp_report_all_outbound(stcb, 0, SCTP_SO_NOT_LOCKED);
+ }
+ }
+ /* stop the timer */
+ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_22);
+ SCTP_STAT_INCR_COUNTER32(sctps_shutdown);
+ /* free the TCB */
+ SCTPDBG(SCTP_DEBUG_INPUT2,
+ "sctp_handle_shutdown_complete: calls free-asoc\n");
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+#endif
+ (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_23);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ return;
+}
+
+static int
+process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc,
+ struct sctp_nets *net, uint8_t flg)
+{
+ switch (desc->chunk_type) {
+ case SCTP_DATA:
+ /* find the tsn to resend (possibly) */
+ {
+ uint32_t tsn;
+ struct sctp_tmit_chunk *tp1;
+
+ tsn = ntohl(desc->tsn_ifany);
+ tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue);
+ while (tp1) {
+ if (tp1->rec.data.TSN_seq == tsn) {
+ /* found it */
+ break;
+ }
+ if (compare_with_wrap(tp1->rec.data.TSN_seq, tsn,
+ MAX_TSN)) {
+ /* not found */
+ tp1 = NULL;
+ break;
+ }
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ }
+ if (tp1 == NULL) {
+ /*
+ * Do it the other way, aka without paying
+ * attention to queue seq order.
+ */
+ SCTP_STAT_INCR(sctps_pdrpdnfnd);
+ tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue);
+ while (tp1) {
+ if (tp1->rec.data.TSN_seq == tsn) {
+ /* found it */
+ break;
+ }
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ }
+ }
+ if (tp1 == NULL) {
+ SCTP_STAT_INCR(sctps_pdrptsnnf);
+ }
+ if ((tp1) && (tp1->sent < SCTP_DATAGRAM_ACKED)) {
+ uint8_t *ddp;
+
+ if (((flg & SCTP_BADCRC) == 0) &&
+ ((flg & SCTP_FROM_MIDDLE_BOX) == 0)) {
+ return (0);
+ }
+ if ((stcb->asoc.peers_rwnd == 0) &&
+ ((flg & SCTP_FROM_MIDDLE_BOX) == 0)) {
+ SCTP_STAT_INCR(sctps_pdrpdiwnp);
+ return (0);
+ }
+ if (stcb->asoc.peers_rwnd == 0 &&
+ (flg & SCTP_FROM_MIDDLE_BOX)) {
+ SCTP_STAT_INCR(sctps_pdrpdizrw);
+ return (0);
+ }
+ ddp = (uint8_t *) (mtod(tp1->data, caddr_t)+
+ sizeof(struct sctp_data_chunk));
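+ /*
+ * Verify the payload bytes echoed in the drop report against
+ * our own copy of the chunk before acting on the report.
+ */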
+ {
+ unsigned int iii;
+
+ for (iii = 0; iii < sizeof(desc->data_bytes);
+ iii++) {
+ if (ddp[iii] != desc->data_bytes[iii]) {
+ SCTP_STAT_INCR(sctps_pdrpbadd);
+ return (-1);
+ }
+ }
+ }
+ /*
+ * We zero out the nonce so a resync is not needed
+ */
+ tp1->rec.data.ect_nonce = 0;
+
+ if (tp1->do_rtt) {
+ /*
+ * this guy had an RTO calculation
+ * pending on it; cancel it
+ */
+ tp1->do_rtt = 0;
+ }
+ SCTP_STAT_INCR(sctps_pdrpmark);
+ if (tp1->sent != SCTP_DATAGRAM_RESEND)
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ /*
+ * mark it as if we were doing a FR, since
+ * we will be getting gap ack reports behind
+ * the info from the router.
+ */
+ tp1->rec.data.doing_fast_retransmit = 1;
+ /*
+ * mark the tsn with what sequences can
+ * cause a new FR.
+ */
+ if (TAILQ_EMPTY(&stcb->asoc.send_queue)) {
+ tp1->rec.data.fast_retran_tsn = stcb->asoc.sending_seq;
+ } else {
+ tp1->rec.data.fast_retran_tsn = (TAILQ_FIRST(&stcb->asoc.send_queue))->rec.data.TSN_seq;
+ }
+
+ /* restart the timer */
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+ stcb, tp1->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_24);
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+ stcb, tp1->whoTo);
+
+ /* fix counts and things */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PDRP,
+ tp1->whoTo->flight_size,
+ tp1->book_size,
+ (uintptr_t) stcb,
+ tp1->rec.data.TSN_seq);
+ }
+ if (tp1->sent < SCTP_DATAGRAM_RESEND) {
+ sctp_flight_size_decrease(tp1);
+ sctp_total_flight_decrease(stcb, tp1);
+ }
+ tp1->sent = SCTP_DATAGRAM_RESEND;
+ } {
+ /* audit code */
+ unsigned int audit;
+
+ audit = 0;
+ TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) {
+ if (tp1->sent == SCTP_DATAGRAM_RESEND)
+ audit++;
+ }
+ TAILQ_FOREACH(tp1, &stcb->asoc.control_send_queue,
+ sctp_next) {
+ if (tp1->sent == SCTP_DATAGRAM_RESEND)
+ audit++;
+ }
+ if (audit != stcb->asoc.sent_queue_retran_cnt) {
+ SCTP_PRINTF("**Local Audit finds cnt:%d asoc cnt:%d\n",
+ audit, stcb->asoc.sent_queue_retran_cnt);
+#ifndef SCTP_AUDITING_ENABLED
+ stcb->asoc.sent_queue_retran_cnt = audit;
+#endif
+ }
+ }
+ }
+ break;
+ case SCTP_ASCONF:
+ {
+ struct sctp_tmit_chunk *asconf;
+
+ TAILQ_FOREACH(asconf, &stcb->asoc.control_send_queue,
+ sctp_next) {
+ if (asconf->rec.chunk_id.id == SCTP_ASCONF) {
+ break;
+ }
+ }
+ if (asconf) {
+ if (asconf->sent != SCTP_DATAGRAM_RESEND)
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ asconf->sent = SCTP_DATAGRAM_RESEND;
+ asconf->snd_count--;
+ }
+ }
+ break;
+ case SCTP_INITIATION:
+ /* resend the INIT */
+ stcb->asoc.dropped_special_cnt++;
+ if (stcb->asoc.dropped_special_cnt < SCTP_RETRY_DROPPED_THRESH) {
+ /*
+ * If we can get it in within a few attempts, we do
+ * this; otherwise we let the timer fire.
+ */
+ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep,
+ stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_25);
+ sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED);
+ }
+ break;
+ case SCTP_SELECTIVE_ACK:
+ case SCTP_NR_SELECTIVE_ACK:
+ /* resend the sack */
+ sctp_send_sack(stcb);
+ break;
+ case SCTP_HEARTBEAT_REQUEST:
+ /* resend a demand HB */
+ if ((stcb->asoc.overall_error_count + 3) < stcb->asoc.max_send_times) {
+ /*
+ * Only retransmit if we KNOW we won't destroy the
+ * tcb
+ */
+ (void)sctp_send_hb(stcb, 1, net);
+ }
+ break;
+ case SCTP_SHUTDOWN:
+ sctp_send_shutdown(stcb, net);
+ break;
+ case SCTP_SHUTDOWN_ACK:
+ sctp_send_shutdown_ack(stcb, net);
+ break;
+ case SCTP_COOKIE_ECHO:
+ {
+ struct sctp_tmit_chunk *cookie;
+
+ cookie = NULL;
+ TAILQ_FOREACH(cookie, &stcb->asoc.control_send_queue,
+ sctp_next) {
+ if (cookie->rec.chunk_id.id == SCTP_COOKIE_ECHO) {
+ break;
+ }
+ }
+ if (cookie) {
+ if (cookie->sent != SCTP_DATAGRAM_RESEND)
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ cookie->sent = SCTP_DATAGRAM_RESEND;
+ sctp_stop_all_cookie_timers(stcb);
+ }
+ }
+ break;
+ case SCTP_COOKIE_ACK:
+ sctp_send_cookie_ack(stcb);
+ break;
+ case SCTP_ASCONF_ACK:
+ /* resend last asconf ack */
+ sctp_send_asconf_ack(stcb);
+ break;
+ case SCTP_FORWARD_CUM_TSN:
+ send_forward_tsn(stcb, &stcb->asoc);
+ break;
+ /* can't do anything with these */
+ case SCTP_PACKET_DROPPED:
+ case SCTP_INITIATION_ACK: /* this should not happen */
+ case SCTP_HEARTBEAT_ACK:
+ case SCTP_ABORT_ASSOCIATION:
+ case SCTP_OPERATION_ERROR:
+ case SCTP_SHUTDOWN_COMPLETE:
+ case SCTP_ECN_ECHO:
+ case SCTP_ECN_CWR:
+ default:
+ break;
+ }
+ return (0);
+}
+
+void
+sctp_reset_in_stream(struct sctp_tcb *stcb, int number_entries, uint16_t * list)
+{
+ int i;
+ uint16_t temp;
+
+ /*
+ * We set things to 0xffff since this is the last delivered sequence
+ * and we will be sending in 0 after the reset.
+ */
+
+ if (number_entries) {
+ for (i = 0; i < number_entries; i++) {
+ temp = ntohs(list[i]);
+ if (temp >= stcb->asoc.streamincnt) {
+ continue;
+ }
+ stcb->asoc.strmin[temp].last_sequence_delivered = 0xffff;
+ }
+ } else {
+ list = NULL;
+ for (i = 0; i < stcb->asoc.streamincnt; i++) {
+ stcb->asoc.strmin[i].last_sequence_delivered = 0xffff;
+ }
+ }
+ sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_RECV, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED);
+}
+
+static void
+sctp_reset_out_streams(struct sctp_tcb *stcb, int number_entries, uint16_t * list)
+{
+ int i;
+
+ if (number_entries == 0) {
+ for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+ stcb->asoc.strmout[i].next_sequence_sent = 0;
+ }
+ } else if (number_entries) {
+ for (i = 0; i < number_entries; i++) {
+ uint16_t temp;
+
+ temp = ntohs(list[i]);
+ if (temp >= stcb->asoc.streamoutcnt) {
+ /* no such stream */
+ continue;
+ }
+ stcb->asoc.strmout[temp].next_sequence_sent = 0;
+ }
+ }
+ sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_SEND, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED);
+}
+
+
+struct sctp_stream_reset_out_request *
+sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chunk **bchk)
+{
+ struct sctp_association *asoc;
+ struct sctp_stream_reset_out_req *req;
+ struct sctp_stream_reset_out_request *r;
+ struct sctp_tmit_chunk *chk;
+ int len, clen;
+
+ asoc = &stcb->asoc;
+ if (TAILQ_EMPTY(&stcb->asoc.control_send_queue)) {
+ asoc->stream_reset_outstanding = 0;
+ return (NULL);
+ }
+ if (stcb->asoc.str_reset == NULL) {
+ asoc->stream_reset_outstanding = 0;
+ return (NULL);
+ }
+ chk = stcb->asoc.str_reset;
+ if (chk->data == NULL) {
+ return (NULL);
+ }
+ if (bchk) {
+ /* he wants a copy of the chk pointer */
+ *bchk = chk;
+ }
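+ /*
+ * The chunk holds at most two out-request params: check the
+ * first one and, if the chunk is long enough, the one packed
+ * behind it.
+ */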
+ clen = chk->send_size;
+ req = mtod(chk->data, struct sctp_stream_reset_out_req *);
+ r = &req->sr_req;
+ if (ntohl(r->request_seq) == seq) {
+ /* found it */
+ return (r);
+ }
+ len = SCTP_SIZE32(ntohs(r->ph.param_length));
+ if (clen > (len + (int)sizeof(struct sctp_chunkhdr))) {
+ /* move to the next one, there can only be a max of two */
+ r = (struct sctp_stream_reset_out_request *)((caddr_t)r + len);
+ if (ntohl(r->request_seq) == seq) {
+ return (r);
+ }
+ }
+ /* that seq is not here */
+ return (NULL);
+}
+
+static void
+sctp_clean_up_stream_reset(struct sctp_tcb *stcb)
+{
+ struct sctp_association *asoc;
+ struct sctp_tmit_chunk *chk = stcb->asoc.str_reset;
+
+ if (stcb->asoc.str_reset == NULL) {
+ return;
+ }
+ asoc = &stcb->asoc;
+
+ sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_26);
+ TAILQ_REMOVE(&asoc->control_send_queue,
+ chk,
+ sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ asoc->ctrl_queue_cnt--;
+ sctp_free_a_chunk(stcb, chk);
+ /* sa_ignore NO_NULL_CHK */
+ stcb->asoc.str_reset = NULL;
+}
+
+
+static int
+sctp_handle_stream_reset_response(struct sctp_tcb *stcb,
+ uint32_t seq, uint32_t action,
+ struct sctp_stream_reset_response *respin)
+{
+ uint16_t type;
+ int lparm_len;
+ struct sctp_association *asoc = &stcb->asoc;
+ struct sctp_tmit_chunk *chk;
+ struct sctp_stream_reset_out_request *srparam;
+ int number_entries;
+
+ if (asoc->stream_reset_outstanding == 0) {
+ /* duplicate */
+ return (0);
+ }
+ if (seq == stcb->asoc.str_reset_seq_out) {
+ srparam = sctp_find_stream_reset(stcb, seq, &chk);
+ if (srparam) {
+ stcb->asoc.str_reset_seq_out++;
+ type = ntohs(srparam->ph.param_type);
+ lparm_len = ntohs(srparam->ph.param_length);
+ if (type == SCTP_STR_RESET_OUT_REQUEST) {
+ number_entries = (lparm_len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t);
+ asoc->stream_reset_out_is_outstanding = 0;
+ if (asoc->stream_reset_outstanding)
+ asoc->stream_reset_outstanding--;
+ if (action == SCTP_STREAM_RESET_PERFORMED) {
+ /* do it */
+ sctp_reset_out_streams(stcb, number_entries, srparam->list_of_streams);
+ } else {
+ sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_OUT, stcb, number_entries, srparam->list_of_streams, SCTP_SO_NOT_LOCKED);
+ }
+ } else if (type == SCTP_STR_RESET_IN_REQUEST) {
+ /* Answered my request */
+ number_entries = (lparm_len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t);
+ if (asoc->stream_reset_outstanding)
+ asoc->stream_reset_outstanding--;
+ if (action != SCTP_STREAM_RESET_PERFORMED) {
+ sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_IN, stcb, number_entries, srparam->list_of_streams, SCTP_SO_NOT_LOCKED);
+ }
+ } else if (type == SCTP_STR_RESET_ADD_STREAMS) {
+ /* Ok we now may have more streams */
+ if (asoc->stream_reset_outstanding)
+ asoc->stream_reset_outstanding--;
+ if (action == SCTP_STREAM_RESET_PERFORMED) {
+ /* Put the new streams into effect */
+ stcb->asoc.streamoutcnt = stcb->asoc.strm_realoutsize;
+ sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_ADD_OK, stcb,
+ (uint32_t) stcb->asoc.streamoutcnt, NULL, SCTP_SO_NOT_LOCKED);
+ } else {
+ sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_ADD_FAIL, stcb,
+ (uint32_t) stcb->asoc.streamoutcnt, NULL, SCTP_SO_NOT_LOCKED);
+ }
+ } else if (type == SCTP_STR_RESET_TSN_REQUEST) {
+ /**
+ * a) Adopt the new in tsn.
+ * b) reset the map
+ * c) Adopt the new out-tsn
+ */
+ struct sctp_stream_reset_response_tsn *resp;
+ struct sctp_forward_tsn_chunk fwdtsn;
+ int abort_flag = 0;
+
+ if (respin == NULL) {
+ /* huh ? */
+ return (0);
+ }
+ if (action == SCTP_STREAM_RESET_PERFORMED) {
+ resp = (struct sctp_stream_reset_response_tsn *)respin;
+ asoc->stream_reset_outstanding--;
+ fwdtsn.ch.chunk_length = htons(sizeof(struct sctp_forward_tsn_chunk));
+ fwdtsn.ch.chunk_type = SCTP_FORWARD_CUM_TSN;
+ fwdtsn.new_cumulative_tsn = htonl(ntohl(resp->senders_next_tsn) - 1);
+ sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0);
+ if (abort_flag) {
+ return (1);
+ }
+ stcb->asoc.highest_tsn_inside_map = (ntohl(resp->senders_next_tsn) - 1);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
+ sctp_log_map(0, 7, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT);
+ }
+ stcb->asoc.tsn_last_delivered = stcb->asoc.cumulative_tsn = stcb->asoc.highest_tsn_inside_map;
+ stcb->asoc.mapping_array_base_tsn = ntohl(resp->senders_next_tsn);
+ memset(stcb->asoc.mapping_array, 0, stcb->asoc.mapping_array_size);
+
+ stcb->asoc.highest_tsn_inside_nr_map = stcb->asoc.highest_tsn_inside_map;
+ memset(stcb->asoc.nr_mapping_array, 0, stcb->asoc.mapping_array_size);
+
+ stcb->asoc.sending_seq = ntohl(resp->receivers_next_tsn);
+ stcb->asoc.last_acked_seq = stcb->asoc.cumulative_tsn;
+
+ sctp_reset_out_streams(stcb, 0, (uint16_t *) NULL);
+ sctp_reset_in_stream(stcb, 0, (uint16_t *) NULL);
+
+ }
+ }
+ /* get rid of the request and get the request flags */
+ if (asoc->stream_reset_outstanding == 0) {
+ sctp_clean_up_stream_reset(stcb);
+ }
+ }
+ }
+ return (0);
+}
+
+static void
+sctp_handle_str_reset_request_in(struct sctp_tcb *stcb,
+ struct sctp_tmit_chunk *chk,
+ struct sctp_stream_reset_in_request *req, int trunc)
+{
+ uint32_t seq;
+ int len, i;
+ int number_entries;
+ uint16_t temp;
+
+ /*
+ * peer wants me to send a str-reset to him for my outgoing seq's if
+ * seq_in is right.
+ */
+ struct sctp_association *asoc = &stcb->asoc;
+
+ seq = ntohl(req->request_seq);
+ if (asoc->str_reset_seq_in == seq) {
+ if (trunc) {
+ /* Can't do it, since they exceeded our buffer size */
+ asoc->last_reset_action[1] = asoc->last_reset_action[0];
+ asoc->last_reset_action[0] = SCTP_STREAM_RESET_DENIED;
+ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
+ } else if (stcb->asoc.stream_reset_out_is_outstanding == 0) {
+ len = ntohs(req->ph.param_length);
+ number_entries = ((len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t));
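+ /* convert the requested stream numbers to host order in place */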
+ for (i = 0; i < number_entries; i++) {
+ temp = ntohs(req->list_of_streams[i]);
+ req->list_of_streams[i] = temp;
+ }
+ /* move the reset action back one */
+ asoc->last_reset_action[1] = asoc->last_reset_action[0];
+ asoc->last_reset_action[0] = SCTP_STREAM_RESET_PERFORMED;
+ sctp_add_stream_reset_out(chk, number_entries, req->list_of_streams,
+ asoc->str_reset_seq_out,
+ seq, (asoc->sending_seq - 1));
+ asoc->stream_reset_out_is_outstanding = 1;
+ asoc->str_reset = chk;
+ sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo);
+ stcb->asoc.stream_reset_outstanding++;
+ } else {
+ /* Can't do it, since we have sent one out */
+ asoc->last_reset_action[1] = asoc->last_reset_action[0];
+ asoc->last_reset_action[0] = SCTP_STREAM_RESET_TRY_LATER;
+ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
+ }
+ asoc->str_reset_seq_in++;
+ } else if (asoc->str_reset_seq_in - 1 == seq) {
+ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
+ } else if (asoc->str_reset_seq_in - 2 == seq) {
+ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]);
+ } else {
+ sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_BAD_SEQNO);
+ }
+}
+
+static int
+sctp_handle_str_reset_request_tsn(struct sctp_tcb *stcb,
+ struct sctp_tmit_chunk *chk,
+ struct sctp_stream_reset_tsn_request *req)
+{
+ /* reset all in and out and update the tsn */
+ /*
+ * A) reset my str-seq's on in and out. B) Select a receive next,
+ * and set cum-ack to it. Also process this selected number as a
+ * fwd-tsn as well. C) set in the response my next sending seq.
+ */
+ struct sctp_forward_tsn_chunk fwdtsn;
+ struct sctp_association *asoc = &stcb->asoc;
+ int abort_flag = 0;
+ uint32_t seq;
+
+ seq = ntohl(req->request_seq);
+ if (asoc->str_reset_seq_in == seq) {
+ fwdtsn.ch.chunk_length = htons(sizeof(struct sctp_forward_tsn_chunk));
+ fwdtsn.ch.chunk_type = SCTP_FORWARD_CUM_TSN;
+ fwdtsn.ch.chunk_flags = 0;
+ fwdtsn.new_cumulative_tsn = htonl(stcb->asoc.highest_tsn_inside_map + 1);
+ sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0);
+ if (abort_flag) {
+ return (1);
+ }
+ stcb->asoc.highest_tsn_inside_map += SCTP_STREAM_RESET_TSN_DELTA;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
+ sctp_log_map(0, 10, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT);
+ }
+ stcb->asoc.tsn_last_delivered = stcb->asoc.cumulative_tsn = stcb->asoc.highest_tsn_inside_map;
+ stcb->asoc.mapping_array_base_tsn = stcb->asoc.highest_tsn_inside_map + 1;
+ memset(stcb->asoc.mapping_array, 0, stcb->asoc.mapping_array_size);
+ stcb->asoc.highest_tsn_inside_nr_map = stcb->asoc.highest_tsn_inside_map;
+ memset(stcb->asoc.nr_mapping_array, 0, stcb->asoc.mapping_array_size);
+ atomic_add_int(&stcb->asoc.sending_seq, 1);
+ /* save off historical data for retrans */
+ stcb->asoc.last_sending_seq[1] = stcb->asoc.last_sending_seq[0];
+ stcb->asoc.last_sending_seq[0] = stcb->asoc.sending_seq;
+ stcb->asoc.last_base_tsnsent[1] = stcb->asoc.last_base_tsnsent[0];
+ stcb->asoc.last_base_tsnsent[0] = stcb->asoc.mapping_array_base_tsn;
+
+ sctp_add_stream_reset_result_tsn(chk,
+ ntohl(req->request_seq),
+ SCTP_STREAM_RESET_PERFORMED,
+ stcb->asoc.sending_seq,
+ stcb->asoc.mapping_array_base_tsn);
+ sctp_reset_out_streams(stcb, 0, (uint16_t *) NULL);
+ sctp_reset_in_stream(stcb, 0, (uint16_t *) NULL);
+ stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0];
+ stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_PERFORMED;
+
+ asoc->str_reset_seq_in++;
+ } else if (asoc->str_reset_seq_in - 1 == seq) {
+ sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[0],
+ stcb->asoc.last_sending_seq[0],
+ stcb->asoc.last_base_tsnsent[0]
+ );
+ } else if (asoc->str_reset_seq_in - 2 == seq) {
+ sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[1],
+ stcb->asoc.last_sending_seq[1],
+ stcb->asoc.last_base_tsnsent[1]
+ );
+ } else {
+ sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_BAD_SEQNO);
+ }
+ return (0);
+}
+
+static void
+sctp_handle_str_reset_request_out(struct sctp_tcb *stcb,
+ struct sctp_tmit_chunk *chk,
+ struct sctp_stream_reset_out_request *req, int trunc)
+{
+ uint32_t seq, tsn;
+ int number_entries, len;
+ struct sctp_association *asoc = &stcb->asoc;
+
+ seq = ntohl(req->request_seq);
+
+ /* now if it's not a duplicate we process it */
+ if (asoc->str_reset_seq_in == seq) {
+ len = ntohs(req->ph.param_length);
+ number_entries = ((len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t));
+ /*
+ * the sender is resetting; handle the list issue: we must
+ * a) verify whether we can do the reset; if so, no problem. b) If
+ * we can't do the reset now, we must copy the request. c) Queue
+ * it, and set up the data-in processor to trigger it off
+ * when needed and dequeue all the queued data.
+ */
+ tsn = ntohl(req->send_reset_at_tsn);
+
+ /* move the reset action back one */
+ asoc->last_reset_action[1] = asoc->last_reset_action[0];
+ if (trunc) {
+ sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_DENIED);
+ asoc->last_reset_action[0] = SCTP_STREAM_RESET_DENIED;
+ } else if ((tsn == asoc->cumulative_tsn) ||
+ (compare_with_wrap(asoc->cumulative_tsn, tsn, MAX_TSN))) {
+ /* we can do it now */
+ sctp_reset_in_stream(stcb, number_entries, req->list_of_streams);
+ sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_PERFORMED);
+ asoc->last_reset_action[0] = SCTP_STREAM_RESET_PERFORMED;
+ } else {
+ /*
+ * we must queue it up and thus wait for the TSN's
+ * to arrive that are at or before tsn
+ */
+ struct sctp_stream_reset_list *liste;
+ int siz;
+
+ siz = sizeof(struct sctp_stream_reset_list) + (number_entries * sizeof(uint16_t));
+ SCTP_MALLOC(liste, struct sctp_stream_reset_list *,
+ siz, SCTP_M_STRESET);
+ if (liste == NULL) {
+ /* gak out of memory */
+ sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_DENIED);
+ asoc->last_reset_action[0] = SCTP_STREAM_RESET_DENIED;
+ return;
+ }
+ liste->tsn = tsn;
+ liste->number_entries = number_entries;
+ memcpy(&liste->req, req,
+ (sizeof(struct sctp_stream_reset_out_request) + (number_entries * sizeof(uint16_t))));
+ TAILQ_INSERT_TAIL(&asoc->resetHead, liste, next_resp);
+ sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_PERFORMED);
+ asoc->last_reset_action[0] = SCTP_STREAM_RESET_PERFORMED;
+ }
+ asoc->str_reset_seq_in++;
+ } else if ((asoc->str_reset_seq_in - 1) == seq) {
+ /*
+ * one seq back, just echo back last action since my
+ * response was lost.
+ */
+ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
+ } else if ((asoc->str_reset_seq_in - 2) == seq) {
+ /*
+ * two seq back, just echo back last action since my
+ * response was lost.
+ */
+ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]);
+ } else {
+ sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_BAD_SEQNO);
+ }
+}
+
+static void
+sctp_handle_str_reset_add_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk,
+ struct sctp_stream_reset_add_strm *str_add)
+{
+ /*
+ * Peer is requesting to add more streams. If it's within our
+ * max-streams, we will allow it.
+ */
+ uint16_t num_stream, i;
+ uint32_t seq;
+ struct sctp_association *asoc = &stcb->asoc;
+ struct sctp_queued_to_read *ctl;
+
+ /* Get the number. */
+ seq = ntohl(str_add->request_seq);
+ num_stream = ntohs(str_add->number_of_streams);
+ /* Now what would be the new total? */
+ if (asoc->str_reset_seq_in == seq) {
+ num_stream += stcb->asoc.streamincnt;
+ if (num_stream > stcb->asoc.max_inbound_streams) {
+ /* We must reject it, they asked for too many */
+ denied:
+ sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_DENIED);
+ stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0];
+ stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_DENIED;
+ } else {
+ /* Ok, we can do that :-) */
+ struct sctp_stream_in *oldstrm;
+
+ /* save off the old */
+ oldstrm = stcb->asoc.strmin;
+ SCTP_MALLOC(stcb->asoc.strmin, struct sctp_stream_in *,
+ (num_stream * sizeof(struct sctp_stream_in)),
+ SCTP_M_STRMI);
+ if (stcb->asoc.strmin == NULL) {
+ stcb->asoc.strmin = oldstrm;
+ goto denied;
+ }
+ /* copy off the old data */
+ for (i = 0; i < stcb->asoc.streamincnt; i++) {
+ TAILQ_INIT(&stcb->asoc.strmin[i].inqueue);
+ stcb->asoc.strmin[i].stream_no = i;
+ stcb->asoc.strmin[i].last_sequence_delivered = oldstrm[i].last_sequence_delivered;
+ stcb->asoc.strmin[i].delivery_started = oldstrm[i].delivery_started;
+ /* now anything on those queues? */
+ while (TAILQ_EMPTY(&oldstrm[i].inqueue) == 0) {
+ ctl = TAILQ_FIRST(&oldstrm[i].inqueue);
+ TAILQ_REMOVE(&oldstrm[i].inqueue, ctl, next);
+ TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].inqueue, ctl, next);
+ }
+ }
+ /* Init the new streams */
+ for (i = stcb->asoc.streamincnt; i < num_stream; i++) {
+ TAILQ_INIT(&stcb->asoc.strmin[i].inqueue);
+ stcb->asoc.strmin[i].stream_no = i;
+ stcb->asoc.strmin[i].last_sequence_delivered = 0xffff;
+ stcb->asoc.strmin[i].delivery_started = 0;
+ }
+ SCTP_FREE(oldstrm, SCTP_M_STRMI);
+ /* update the size */
+ stcb->asoc.streamincnt = num_stream;
+ /* Send the ack */
+ sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_PERFORMED);
+ stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0];
+ stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_PERFORMED;
+ sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_INSTREAM_ADD_OK, stcb,
+ (uint32_t) stcb->asoc.streamincnt, NULL, SCTP_SO_NOT_LOCKED);
+ }
+ } else if ((asoc->str_reset_seq_in - 1) == seq) {
+ /*
+ * one seq back, just echo back last action since my
+ * response was lost.
+ */
+ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
+ } else if ((asoc->str_reset_seq_in - 2) == seq) {
+ /*
+ * two seq back, just echo back last action since my
+ * response was lost.
+ */
+ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]);
+ } else {
+ sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_BAD_SEQNO);
+
+ }
+}
+
+#ifdef __GNUC__
+__attribute__((noinline))
+#endif
+ static int
+ sctp_handle_stream_reset(struct sctp_tcb *stcb, struct mbuf *m, int offset,
+ struct sctp_stream_reset_out_req *sr_req)
+{
+ int chk_length, param_len, ptype;
+ struct sctp_paramhdr pstore;
+ uint8_t cstore[SCTP_CHUNK_BUFFER_SIZE];
+
+ uint32_t seq;
+ int num_req = 0;
+ int trunc = 0;
+ struct sctp_tmit_chunk *chk;
+ struct sctp_chunkhdr *ch;
+ struct sctp_paramhdr *ph;
+ int ret_code = 0;
+ int num_param = 0;
+
+ /* now it may be a reset or a reset-response */
+ chk_length = ntohs(sr_req->ch.chunk_length);
+
+ /* setup for adding the response */
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ return (ret_code);
+ }
+ chk->rec.chunk_id.id = SCTP_STREAM_RESET;
+ chk->rec.chunk_id.can_take_data = 0;
+ chk->asoc = &stcb->asoc;
+ chk->no_fr_allowed = 0;
+ chk->book_size = chk->send_size = sizeof(struct sctp_chunkhdr);
+ chk->book_size_scale = 0;
+ chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ if (chk->data == NULL) {
+strres_nochunk:
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ sctp_free_a_chunk(stcb, chk);
+ return (ret_code);
+ }
+ SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
+
+ /* setup chunk parameters */
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->whoTo = stcb->asoc.primary_destination;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+
+ ch = mtod(chk->data, struct sctp_chunkhdr *);
+ ch->chunk_type = SCTP_STREAM_RESET;
+ ch->chunk_flags = 0;
+ ch->chunk_length = htons(chk->send_size);
+ SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size);
+ offset += sizeof(struct sctp_chunkhdr);
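+ /*
+ * Walk each stream-reset parameter embedded in the chunk,
+ * copying its front into a local buffer for inspection.
+ */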
+ while ((size_t)chk_length >= sizeof(struct sctp_stream_reset_tsn_request)) {
+ ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(pstore), (uint8_t *) & pstore);
+ if (ph == NULL)
+ break;
+ param_len = ntohs(ph->param_length);
+ if (param_len < (int)sizeof(struct sctp_stream_reset_tsn_request)) {
+ /* bad param */
+ break;
+ }
+ ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, min(param_len, (int)sizeof(cstore)),
+ (uint8_t *) & cstore);
+ ptype = ntohs(ph->param_type);
+ num_param++;
+ if (param_len > (int)sizeof(cstore)) {
+ trunc = 1;
+ } else {
+ trunc = 0;
+ }
+
+ if (num_param > SCTP_MAX_RESET_PARAMS) {
+ /* hit the max number of parameters already, sorry */
+ break;
+ }
+ if (ptype == SCTP_STR_RESET_OUT_REQUEST) {
+ struct sctp_stream_reset_out_request *req_out;
+
+ req_out = (struct sctp_stream_reset_out_request *)ph;
+ num_req++;
+ if (stcb->asoc.stream_reset_outstanding) {
+ seq = ntohl(req_out->response_seq);
+ if (seq == stcb->asoc.str_reset_seq_out) {
+ /* implicit ack */
+ (void)sctp_handle_stream_reset_response(stcb, seq, SCTP_STREAM_RESET_PERFORMED, NULL);
+ }
+ }
+ sctp_handle_str_reset_request_out(stcb, chk, req_out, trunc);
+ } else if (ptype == SCTP_STR_RESET_ADD_STREAMS) {
+ struct sctp_stream_reset_add_strm *str_add;
+
+ str_add = (struct sctp_stream_reset_add_strm *)ph;
+ num_req++;
+ sctp_handle_str_reset_add_strm(stcb, chk, str_add);
+ } else if (ptype == SCTP_STR_RESET_IN_REQUEST) {
+ struct sctp_stream_reset_in_request *req_in;
+
+ num_req++;
+
+ req_in = (struct sctp_stream_reset_in_request *)ph;
+
+ sctp_handle_str_reset_request_in(stcb, chk, req_in, trunc);
+ } else if (ptype == SCTP_STR_RESET_TSN_REQUEST) {
+ struct sctp_stream_reset_tsn_request *req_tsn;
+
+ num_req++;
+ req_tsn = (struct sctp_stream_reset_tsn_request *)ph;
+
+ if (sctp_handle_str_reset_request_tsn(stcb, chk, req_tsn)) {
+ ret_code = 1;
+ goto strres_nochunk;
+ }
+ /* no more */
+ break;
+ } else if (ptype == SCTP_STR_RESET_RESPONSE) {
+ struct sctp_stream_reset_response *resp;
+ uint32_t result;
+
+ resp = (struct sctp_stream_reset_response *)ph;
+ seq = ntohl(resp->response_seq);
+ result = ntohl(resp->result);
+ if (sctp_handle_stream_reset_response(stcb, seq, result, resp)) {
+ ret_code = 1;
+ goto strres_nochunk;
+ }
+ } else {
+ break;
+ }
+ offset += SCTP_SIZE32(param_len);
+ chk_length -= SCTP_SIZE32(param_len);
+ }
+ if (num_req == 0) {
+ /* we have no response, free the stuff */
+ goto strres_nochunk;
+ }
+ /* ok we have a chunk to link in */
+ TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue,
+ chk,
+ sctp_next);
+ stcb->asoc.ctrl_queue_cnt++;
+ return (ret_code);
+}
+
+/*
+ * Handle a router's or endpoint's report of a packet loss. There are two
+ * ways to handle this: either we get the whole packet and must dissect it
+ * ourselves (possibly with truncation and/or corruption), or it is a summary
+ * from a middle box that did the dissecting for us.
+ */
+static void
+sctp_handle_packet_dropped(struct sctp_pktdrop_chunk *cp,
+ struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t limit)
+{
+ uint32_t bottle_bw, on_queue;
+ uint16_t trunc_len;
+ unsigned int chlen;
+ unsigned int at;
+ struct sctp_chunk_desc desc;
+ struct sctp_chunkhdr *ch;
+
+ chlen = ntohs(cp->ch.chunk_length);
+ chlen -= sizeof(struct sctp_pktdrop_chunk);
+ /* XXX possible chlen underflow */
+ if (chlen == 0) {
+ ch = NULL;
+ if (cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX)
+ SCTP_STAT_INCR(sctps_pdrpbwrpt);
+ } else {
+ ch = (struct sctp_chunkhdr *)(cp->data + sizeof(struct sctphdr));
+ chlen -= sizeof(struct sctphdr);
+ /* XXX possible chlen underflow */
+ memset(&desc, 0, sizeof(desc));
+ }
+ trunc_len = (uint16_t) ntohs(cp->trunc_len);
+ if (trunc_len > limit) {
+ trunc_len = limit;
+ }
+ /* now the chunks themselves */
+ while ((ch != NULL) && (chlen >= sizeof(struct sctp_chunkhdr))) {
+ desc.chunk_type = ch->chunk_type;
+ /* get amount we need to move */
+ at = ntohs(ch->chunk_length);
+ if (at < sizeof(struct sctp_chunkhdr)) {
+ /* corrupt chunk, maybe at the end? */
+ SCTP_STAT_INCR(sctps_pdrpcrupt);
+ break;
+ }
+ if (trunc_len == 0) {
+ /* we are supposed to have all of it */
+ if (at > chlen) {
+ /* corrupt, skip it */
+ SCTP_STAT_INCR(sctps_pdrpcrupt);
+ break;
+ }
+ } else {
+ /* is there enough of it left? */
+ if (desc.chunk_type == SCTP_DATA) {
+ if (chlen < (sizeof(struct sctp_data_chunk) +
+ sizeof(desc.data_bytes))) {
+ break;
+ }
+ } else {
+ if (chlen < sizeof(struct sctp_chunkhdr)) {
+ break;
+ }
+ }
+ }
+ if (desc.chunk_type == SCTP_DATA) {
+ /* can we get out the tsn? */
+ if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX))
+ SCTP_STAT_INCR(sctps_pdrpmbda);
+
+ if (chlen >= (sizeof(struct sctp_data_chunk) + sizeof(uint32_t))) {
+ /* yep */
+ struct sctp_data_chunk *dcp;
+ uint8_t *ddp;
+ unsigned int iii;
+
+ dcp = (struct sctp_data_chunk *)ch;
+ ddp = (uint8_t *) (dcp + 1);
+ for (iii = 0; iii < sizeof(desc.data_bytes); iii++) {
+ desc.data_bytes[iii] = ddp[iii];
+ }
+ desc.tsn_ifany = dcp->dp.tsn;
+ } else {
+ /* nope we are done. */
+ SCTP_STAT_INCR(sctps_pdrpnedat);
+ break;
+ }
+ } else {
+ if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX))
+ SCTP_STAT_INCR(sctps_pdrpmbct);
+ }
+
+ if (process_chunk_drop(stcb, &desc, net, cp->ch.chunk_flags)) {
+ SCTP_STAT_INCR(sctps_pdrppdbrk);
+ break;
+ }
+ if (SCTP_SIZE32(at) > chlen) {
+ break;
+ }
+ chlen -= SCTP_SIZE32(at);
+ if (chlen < sizeof(struct sctp_chunkhdr)) {
+ /* done, none left */
+ break;
+ }
+ ch = (struct sctp_chunkhdr *)((caddr_t)ch + SCTP_SIZE32(at));
+ }
+ /* Now update any rwnd --- possibly */
+ if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX) == 0) {
+ /* From a peer, we get a rwnd report */
+ uint32_t a_rwnd;
+
+ SCTP_STAT_INCR(sctps_pdrpfehos);
+
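+ /*
+ * Derive an advertised rwnd from the peer's reported
+ * bottleneck bandwidth and bytes currently on queue.
+ */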
+ bottle_bw = ntohl(cp->bottle_bw);
+ on_queue = ntohl(cp->current_onq);
+ if (bottle_bw && on_queue) {
+ /* a rwnd report is in here */
+ if (bottle_bw > on_queue)
+ a_rwnd = bottle_bw - on_queue;
+ else
+ a_rwnd = 0;
+
+ if (a_rwnd == 0)
+ stcb->asoc.peers_rwnd = 0;
+ else {
+ if (a_rwnd > stcb->asoc.total_flight) {
+ stcb->asoc.peers_rwnd =
+ a_rwnd - stcb->asoc.total_flight;
+ } else {
+ stcb->asoc.peers_rwnd = 0;
+ }
+ if (stcb->asoc.peers_rwnd <
+ stcb->sctp_ep->sctp_ep.sctp_sws_sender) {
+ /* SWS sender side engages */
+ stcb->asoc.peers_rwnd = 0;
+ }
+ }
+ }
+ } else {
+ SCTP_STAT_INCR(sctps_pdrpfmbox);
+ }
+
+ /* now middle boxes in sat networks get a cwnd bump */
+ if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX) &&
+ (stcb->asoc.sat_t3_loss_recovery == 0) &&
+ (stcb->asoc.sat_network)) {
+ /*
+ * This is debatable, but for sat networks it makes sense.
+ * Note that if a T3 timer has gone off, we will prohibit any
+ * changes to cwnd until we exit the T3 loss recovery.
+ */
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped(stcb,
+ net, cp, &bottle_bw, &on_queue);
+ }
+}
+
+/*
+ * Handles all control chunks in a packet.
+ * inputs:
+ * - m: mbuf chain, assumed to still contain the IP/SCTP header
+ * - stcb: the tcb found for this packet
+ * - offset: offset into the mbuf chain to the first chunkhdr
+ * - length: the length of the complete packet
+ * outputs:
+ * - length: modified to the remaining length after control processing
+ * - netp: modified to the new sctp_nets after cookie-echo processing
+ * - return NULL to discard the packet (i.e. no asoc, bad packet, ...),
+ *   otherwise return the tcb for this packet
+ */
+#ifdef __GNUC__
+__attribute__((noinline))
+#endif
+ static struct sctp_tcb *
+ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length,
+ struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb, struct sctp_nets **netp, int *fwd_tsn_seen,
+ uint32_t vrf_id, uint16_t port)
+{
+ struct sctp_association *asoc;
+ uint32_t vtag_in;
+ int num_chunks = 0; /* number of control chunks processed */
+ uint32_t chk_length;
+ int ret;
+ int abort_no_unlock = 0;
+
+ /*
+ * How big should this be, and should it be alloc'd? Let's try the
+ * d-mtu-ceiling for now (2k) and that should hopefully work ...
+ * until we get into jumbo grams and such..
+ */
+ uint8_t chunk_buf[SCTP_CHUNK_BUFFER_SIZE];
+ struct sctp_tcb *locked_tcb = stcb;
+ int got_auth = 0;
+ uint32_t auth_offset = 0, auth_len = 0;
+ int auth_skipped = 0;
+ int asconf_cnt = 0;
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+
+ SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_control: iphlen=%u, offset=%u, length=%u stcb:%p\n",
+ iphlen, *offset, length, stcb);
+
+ /* validate chunk header length... */
+ if (ntohs(ch->chunk_length) < sizeof(*ch)) {
+ SCTPDBG(SCTP_DEBUG_INPUT1, "Invalid header length %d\n",
+ ntohs(ch->chunk_length));
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ /*
+ * validate the verification tag
+ */
+ vtag_in = ntohl(sh->v_tag);
+
+ if (locked_tcb) {
+ SCTP_TCB_LOCK_ASSERT(locked_tcb);
+ }
+ if (ch->chunk_type == SCTP_INITIATION) {
+ SCTPDBG(SCTP_DEBUG_INPUT1, "Its an INIT of len:%d vtag:%x\n",
+ ntohs(ch->chunk_length), vtag_in);
+ if (vtag_in != 0) {
+ /* protocol error- silently discard... */
+ SCTP_STAT_INCR(sctps_badvtag);
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ } else if (ch->chunk_type != SCTP_COOKIE_ECHO) {
+ /*
+ * If there is no stcb, skip the AUTH chunk and process it
+ * later, after a stcb is found (to validate that the lookup
+ * was valid).
+ */
+ if ((ch->chunk_type == SCTP_AUTHENTICATION) &&
+ (stcb == NULL) &&
+ !SCTP_BASE_SYSCTL(sctp_auth_disable)) {
+ /* save this chunk for later processing */
+ auth_skipped = 1;
+ auth_offset = *offset;
+ auth_len = ntohs(ch->chunk_length);
+
+ /* (temporarily) move past this chunk */
+ *offset += SCTP_SIZE32(auth_len);
+ if (*offset >= length) {
+ /* no more data left in the mbuf chain */
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset,
+ sizeof(struct sctp_chunkhdr), chunk_buf);
+ }
+ if (ch == NULL) {
+ /* Help */
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ if (ch->chunk_type == SCTP_COOKIE_ECHO) {
+ goto process_control_chunks;
+ }
+ /*
+ * first check if it's an ASCONF with an unknown src addr; we
+ * need to look inside to find the association
+ */
+ if (ch->chunk_type == SCTP_ASCONF && stcb == NULL) {
+ struct sctp_chunkhdr *asconf_ch = ch;
+ uint32_t asconf_offset = 0, asconf_len = 0;
+
+ /* inp's refcount may be reduced */
+ SCTP_INP_INCR_REF(inp);
+
+ asconf_offset = *offset;
+ do {
+ asconf_len = ntohs(asconf_ch->chunk_length);
+ if (asconf_len < sizeof(struct sctp_asconf_paramhdr))
+ break;
+ stcb = sctp_findassociation_ep_asconf(m, iphlen,
+ *offset, sh, &inp, netp, vrf_id);
+ if (stcb != NULL)
+ break;
+ asconf_offset += SCTP_SIZE32(asconf_len);
+ asconf_ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, asconf_offset,
+ sizeof(struct sctp_chunkhdr), chunk_buf);
+ } while (asconf_ch != NULL && asconf_ch->chunk_type == SCTP_ASCONF);
+ if (stcb == NULL) {
+ /*
+ * reduce inp's refcount if not reduced in
+ * sctp_findassociation_ep_asconf().
+ */
+ SCTP_INP_DECR_REF(inp);
+ } else {
+ locked_tcb = stcb;
+ }
+
+ /* now go back and verify any auth chunk to be sure */
+ if (auth_skipped && (stcb != NULL)) {
+ struct sctp_auth_chunk *auth;
+
+ auth = (struct sctp_auth_chunk *)
+ sctp_m_getptr(m, auth_offset,
+ auth_len, chunk_buf);
+ got_auth = 1;
+ auth_skipped = 0;
+ if ((auth == NULL) || sctp_handle_auth(stcb, auth, m,
+ auth_offset)) {
+ /* auth HMAC failed so dump it */
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ } else {
+ /* remaining chunks are HMAC checked */
+ stcb->asoc.authenticated = 1;
+ }
+ }
+ }
+ if (stcb == NULL) {
+ /* no association, so it's out of the blue... */
+ sctp_handle_ootb(m, iphlen, *offset, sh, inp, NULL,
+ vrf_id, port);
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ asoc = &stcb->asoc;
+ /* ABORT and SHUTDOWN can use either v_tag... */
+ if ((ch->chunk_type == SCTP_ABORT_ASSOCIATION) ||
+ (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) ||
+ (ch->chunk_type == SCTP_PACKET_DROPPED)) {
+ if ((vtag_in == asoc->my_vtag) ||
+ ((ch->chunk_flags & SCTP_HAD_NO_TCB) &&
+ (vtag_in == asoc->peer_vtag))) {
+ /* this is valid */
+ } else {
+ /* drop this packet... */
+ SCTP_STAT_INCR(sctps_badvtag);
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ } else if (ch->chunk_type == SCTP_SHUTDOWN_ACK) {
+ if (vtag_in != asoc->my_vtag) {
+ /*
+ * this could be a stale SHUTDOWN-ACK or the
+ * peer never got the SHUTDOWN-COMPLETE and
+ * is still hung; we have started a new asoc
+ * but it won't complete until the shutdown
+ * is completed
+ */
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ sctp_handle_ootb(m, iphlen, *offset, sh, inp,
+ NULL, vrf_id, port);
+ return (NULL);
+ }
+ } else {
+ /* for all other chunks, vtag must match */
+ if (vtag_in != asoc->my_vtag) {
+ /* invalid vtag... */
+ SCTPDBG(SCTP_DEBUG_INPUT3,
+ "invalid vtag: %xh, expect %xh\n",
+ vtag_in, asoc->my_vtag);
+ SCTP_STAT_INCR(sctps_badvtag);
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ *offset = length;
+ return (NULL);
+ }
+ }
+ } /* end if !SCTP_COOKIE_ECHO */
+ /*
+ * process all control chunks...
+ */
+ if (((ch->chunk_type == SCTP_SELECTIVE_ACK) ||
+ /* EY */
+ (ch->chunk_type == SCTP_NR_SELECTIVE_ACK) ||
+ (ch->chunk_type == SCTP_HEARTBEAT_REQUEST)) &&
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED)) {
+ /* implied cookie-ack.. we must have lost the ack */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb,
+ *netp);
+ }
+process_control_chunks:
+ while (IS_SCTP_CONTROL(ch)) {
+ /* validate chunk length */
+ chk_length = ntohs(ch->chunk_length);
+ SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_process_control: processing a chunk type=%u, len=%u\n",
+ ch->chunk_type, chk_length);
+ SCTP_LTRACE_CHK(inp, stcb, ch->chunk_type, chk_length);
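+ /*
+ * A chunk shorter than a chunk header, or one that runs past
+ * the end of the packet, ends processing of this packet.
+ */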
+ if (chk_length < sizeof(*ch) ||
+ (*offset + (int)chk_length) > length) {
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ SCTP_STAT_INCR_COUNTER64(sctps_incontrolchunks);
+ /*
+ * INIT-ACK only gets the init-ack "header" portion
+ * because we don't have to process the peer's COOKIE. All
+ * others get a complete chunk.
+ */
+ if ((ch->chunk_type == SCTP_INITIATION_ACK) ||
+ (ch->chunk_type == SCTP_INITIATION)) {
+ /* get an init-ack chunk */
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset,
+ sizeof(struct sctp_init_ack_chunk), chunk_buf);
+ if (ch == NULL) {
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ } else {
+ /* For cookies and all other chunks. */
+ if (chk_length > sizeof(chunk_buf)) {
+ /*
+ * use just the size of the chunk buffer so
+ * the front part of our chunks fit in
+ * contiguous space up to the chunk buffer
+ * size (508 bytes). For chunks that need
+ * more than that, they must use the
+ * sctp_m_getptr() function or other means
+ * (e.g. know how to parse mbuf chains).
+ * Cookies do this already.
+ */
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset,
+ (sizeof(chunk_buf) - 4),
+ chunk_buf);
+ if (ch == NULL) {
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ } else {
+ /* We can fit it all */
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset,
+ chk_length, chunk_buf);
+ if (ch == NULL) {
+ SCTP_PRINTF("sctp_process_control: Can't get the all data....\n");
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ }
+ }
+ num_chunks++;
+ /* Save off the last place we got a control from */
+ if (stcb != NULL) {
+ if (((netp != NULL) && (*netp != NULL)) || (ch->chunk_type == SCTP_ASCONF)) {
+ /*
+ * allow last_control to be NULL if
+ * ASCONF... ASCONF processing will find the
+ * right net later
+ */
+ if ((netp != NULL) && (*netp != NULL))
+ stcb->asoc.last_control_chunk_from = *netp;
+ }
+ }
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xB0, ch->chunk_type);
+#endif
+
+ /* check to see if this chunk required auth, but isn't */
+ if ((stcb != NULL) &&
+ !SCTP_BASE_SYSCTL(sctp_auth_disable) &&
+ sctp_auth_is_required_chunk(ch->chunk_type, stcb->asoc.local_auth_chunks) &&
+ !stcb->asoc.authenticated) {
+ /* "silently" ignore */
+ SCTP_STAT_INCR(sctps_recvauthmissing);
+ goto next_chunk;
+ }
+ switch (ch->chunk_type) {
+ case SCTP_INITIATION:
+ /* must be first and only chunk */
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_INIT\n");
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ /* We are not interested anymore? */
+ if ((stcb) && (stcb->asoc.total_output_queue_size)) {
+ /*
+ * collision case where we are
+ * sending to them too
+ */
+ ;
+ } else {
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ *offset = length;
+ return (NULL);
+ }
+ }
+ if ((chk_length > SCTP_LARGEST_INIT_ACCEPTED) ||
+ (num_chunks > 1) ||
+ (SCTP_BASE_SYSCTL(sctp_strict_init) && (length - *offset > (int)SCTP_SIZE32(chk_length)))) {
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ if ((stcb != NULL) &&
+ (SCTP_GET_STATE(&stcb->asoc) ==
+ SCTP_STATE_SHUTDOWN_ACK_SENT)) {
+ sctp_send_shutdown_ack(stcb,
+ stcb->asoc.primary_destination);
+ *offset = length;
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED);
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ if (netp) {
+ sctp_handle_init(m, iphlen, *offset, sh,
+ (struct sctp_init_chunk *)ch, inp,
+ stcb, *netp, &abort_no_unlock, vrf_id, port);
+ }
+ if (abort_no_unlock)
+ return (NULL);
+
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ break;
+ case SCTP_PAD_CHUNK:
+ break;
+ case SCTP_INITIATION_ACK:
+ /* must be first and only chunk */
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_INIT-ACK\n");
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ /* We are not interested anymore */
+ if ((stcb) && (stcb->asoc.total_output_queue_size)) {
+ ;
+ } else {
+ if (locked_tcb != stcb) {
+ /* Very unlikely */
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ *offset = length;
+ if (stcb) {
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(inp);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+#endif
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_27);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ }
+ return (NULL);
+ }
+ }
+ if ((num_chunks > 1) ||
+ (SCTP_BASE_SYSCTL(sctp_strict_init) && (length - *offset > (int)SCTP_SIZE32(chk_length)))) {
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ if ((netp) && (*netp)) {
+ ret = sctp_handle_init_ack(m, iphlen, *offset, sh,
+ (struct sctp_init_ack_chunk *)ch, stcb, *netp, &abort_no_unlock, vrf_id);
+ } else {
+ ret = -1;
+ }
+ /*
+ * Special case, I must call the output routine to
+ * get the cookie echoed
+ */
+ if (abort_no_unlock)
+ return (NULL);
+
+ if ((stcb) && ret == 0)
+ sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED);
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ break;
+ case SCTP_SELECTIVE_ACK:
+ {
+ struct sctp_sack_chunk *sack;
+ int abort_now = 0;
+ uint32_t a_rwnd, cum_ack;
+ uint16_t num_seg, num_dup;
+ uint8_t flags;
+ int offset_seg, offset_dup;
+ int nonce_sum_flag;
+
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SACK\n");
+ SCTP_STAT_INCR(sctps_recvsacks);
+ if (stcb == NULL) {
+ SCTPDBG(SCTP_DEBUG_INDATA1, "No stcb when processing SACK chunk\n");
+ break;
+ }
+ if (chk_length < sizeof(struct sctp_sack_chunk)) {
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size on SACK chunk, too small\n");
+ break;
+ }
+ if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) {
+ /*-
+ * If we have sent a shutdown-ack, we will pay no
+ * attention to a sack sent in to us since
+ * we don't care anymore.
+ */
+ break;
+ }
+ sack = (struct sctp_sack_chunk *)ch;
+ flags = ch->chunk_flags;
+ nonce_sum_flag = flags & SCTP_SACK_NONCE_SUM;
+ cum_ack = ntohl(sack->sack.cum_tsn_ack);
+ num_seg = ntohs(sack->sack.num_gap_ack_blks);
+ num_dup = ntohs(sack->sack.num_dup_tsns);
+ a_rwnd = (uint32_t) ntohl(sack->sack.a_rwnd);
+ if (sizeof(struct sctp_sack_chunk) +
+ num_seg * sizeof(struct sctp_gap_ack_block) +
+ num_dup * sizeof(uint32_t) != chk_length) {
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size of SACK chunk\n");
+ break;
+ }
+ offset_seg = *offset + sizeof(struct sctp_sack_chunk);
+ offset_dup = offset_seg + num_seg * sizeof(struct sctp_gap_ack_block);
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SACK process cum_ack:%x num_seg:%d a_rwnd:%d\n",
+ cum_ack, num_seg, a_rwnd);
+ stcb->asoc.seen_a_sack_this_pkt = 1;
+ if ((stcb->asoc.pr_sctp_cnt == 0) &&
+ (num_seg == 0) &&
+ ((compare_with_wrap(cum_ack, stcb->asoc.last_acked_seq, MAX_TSN)) ||
+ (cum_ack == stcb->asoc.last_acked_seq)) &&
+ (stcb->asoc.saw_sack_with_frags == 0) &&
+ (stcb->asoc.saw_sack_with_nr_frags == 0) &&
+ (!TAILQ_EMPTY(&stcb->asoc.sent_queue))
+ ) {
+ /*
+ * We have a SIMPLE sack having no
+ * prior segments and data on sent
+ * queue to be acked. Use the
+ * faster path sack processing. We
+ * also allow window update sacks
+ * with no missing segments to go
+ * this way too.
+ */
+ sctp_express_handle_sack(stcb, cum_ack, a_rwnd, nonce_sum_flag,
+ &abort_now);
+ } else {
+ if (netp && *netp)
+ sctp_handle_sack(m, offset_seg, offset_dup,
+ stcb, *netp,
+ num_seg, 0, num_dup, &abort_now, flags,
+ cum_ack, a_rwnd);
+ }
+ if (abort_now) {
+ /* ABORT signal from sack processing */
+ *offset = length;
+ return (NULL);
+ }
+ if (TAILQ_EMPTY(&stcb->asoc.send_queue) &&
+ TAILQ_EMPTY(&stcb->asoc.sent_queue) &&
+ (stcb->asoc.stream_queue_cnt == 0)) {
+ sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
+ }
+ }
+ break;
+ /*
+ * EY - nr_sack: If the received chunk is an
+ * nr_sack chunk
+ */
+ case SCTP_NR_SELECTIVE_ACK:
+ {
+ struct sctp_nr_sack_chunk *nr_sack;
+ int abort_now = 0;
+ uint32_t a_rwnd, cum_ack;
+ uint16_t num_seg, num_nr_seg, num_dup;
+ uint8_t flags;
+ int offset_seg, offset_dup;
+ int nonce_sum_flag;
+
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_NR_SACK\n");
+ SCTP_STAT_INCR(sctps_recvsacks);
+ if (stcb == NULL) {
+ SCTPDBG(SCTP_DEBUG_INDATA1, "No stcb when processing NR-SACK chunk\n");
+ break;
+ }
+ if ((stcb->asoc.sctp_nr_sack_on_off == 0) ||
+ (stcb->asoc.peer_supports_nr_sack == 0)) {
+ goto unknown_chunk;
+ }
+ if (chk_length < sizeof(struct sctp_nr_sack_chunk)) {
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size on NR-SACK chunk, too small\n");
+ break;
+ }
+ if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) {
+ /*-
+ * If we have sent a shutdown-ack, we will pay no
+ * attention to a sack sent in to us since
+ * we don't care anymore.
+ */
+ break;
+ }
+ nr_sack = (struct sctp_nr_sack_chunk *)ch;
+ flags = ch->chunk_flags;
+ nonce_sum_flag = flags & SCTP_SACK_NONCE_SUM;
+
+ cum_ack = ntohl(nr_sack->nr_sack.cum_tsn_ack);
+ num_seg = ntohs(nr_sack->nr_sack.num_gap_ack_blks);
+ num_nr_seg = ntohs(nr_sack->nr_sack.num_nr_gap_ack_blks);
+ num_dup = ntohs(nr_sack->nr_sack.num_dup_tsns);
+ a_rwnd = (uint32_t) ntohl(nr_sack->nr_sack.a_rwnd);
+ if (sizeof(struct sctp_nr_sack_chunk) +
+ (num_seg + num_nr_seg) * sizeof(struct sctp_gap_ack_block) +
+ num_dup * sizeof(uint32_t) != chk_length) {
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size of NR_SACK chunk\n");
+ break;
+ }
+ offset_seg = *offset + sizeof(struct sctp_nr_sack_chunk);
+ offset_dup = offset_seg + num_seg * sizeof(struct sctp_gap_ack_block);
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_NR_SACK process cum_ack:%x num_seg:%d a_rwnd:%d\n",
+ cum_ack, num_seg, a_rwnd);
+ stcb->asoc.seen_a_sack_this_pkt = 1;
+ if ((stcb->asoc.pr_sctp_cnt == 0) &&
+ (num_seg == 0) && (num_nr_seg == 0) &&
+ ((compare_with_wrap(cum_ack, stcb->asoc.last_acked_seq, MAX_TSN)) ||
+ (cum_ack == stcb->asoc.last_acked_seq)) &&
+ (stcb->asoc.saw_sack_with_frags == 0) &&
+ (stcb->asoc.saw_sack_with_nr_frags == 0) &&
+ (!TAILQ_EMPTY(&stcb->asoc.sent_queue))) {
+ /*
+ * We have a SIMPLE sack having no
+ * prior segments and data on sent
+ * queue to be acked. Use the faster
+ * path sack processing. We also
+ * allow window update sacks with no
+ * missing segments to go this way
+ * too.
+ */
+ sctp_express_handle_sack(stcb, cum_ack, a_rwnd, nonce_sum_flag,
+ &abort_now);
+ } else {
+ if (netp && *netp)
+ sctp_handle_sack(m, offset_seg, offset_dup,
+ stcb, *netp,
+ num_seg, num_nr_seg, num_dup, &abort_now, flags,
+ cum_ack, a_rwnd);
+ }
+ if (abort_now) {
+ /* ABORT signal from sack processing */
+ *offset = length;
+ return (NULL);
+ }
+ if (TAILQ_EMPTY(&stcb->asoc.send_queue) &&
+ TAILQ_EMPTY(&stcb->asoc.sent_queue) &&
+ (stcb->asoc.stream_queue_cnt == 0)) {
+ sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
+ }
+ }
+ break;
+
+ case SCTP_HEARTBEAT_REQUEST:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT\n");
+ if ((stcb) && netp && *netp) {
+ SCTP_STAT_INCR(sctps_recvheartbeat);
+ sctp_send_heartbeat_ack(stcb, m, *offset,
+ chk_length, *netp);
+
+ /* He's alive so give him credit */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ }
+ break;
+ case SCTP_HEARTBEAT_ACK:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT-ACK\n");
+ if ((stcb == NULL) || (chk_length != sizeof(struct sctp_heartbeat_chunk))) {
+ /* It's not ours */
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ /* He's alive so give him credit */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ SCTP_STAT_INCR(sctps_recvheartbeatack);
+ if (netp && *netp)
+ sctp_handle_heartbeat_ack((struct sctp_heartbeat_chunk *)ch,
+ stcb, *netp);
+ break;
+ case SCTP_ABORT_ASSOCIATION:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ABORT, stcb %p\n",
+ stcb);
+ if ((stcb) && netp && *netp)
+ sctp_handle_abort((struct sctp_abort_chunk *)ch,
+ stcb, *netp);
+ *offset = length;
+ return (NULL);
+ break;
+ case SCTP_SHUTDOWN:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN, stcb %p\n",
+ stcb);
+ if ((stcb == NULL) || (chk_length != sizeof(struct sctp_shutdown_chunk))) {
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ if (netp && *netp) {
+ int abort_flag = 0;
+
+ sctp_handle_shutdown((struct sctp_shutdown_chunk *)ch,
+ stcb, *netp, &abort_flag);
+ if (abort_flag) {
+ *offset = length;
+ return (NULL);
+ }
+ }
+ break;
+ case SCTP_SHUTDOWN_ACK:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN-ACK, stcb %p\n", stcb);
+ if ((stcb) && (netp) && (*netp))
+ sctp_handle_shutdown_ack((struct sctp_shutdown_ack_chunk *)ch, stcb, *netp);
+ *offset = length;
+ return (NULL);
+ break;
+
+ case SCTP_OPERATION_ERROR:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_OP-ERR\n");
+ if ((stcb) && netp && *netp && sctp_handle_error(ch, stcb, *netp) < 0) {
+
+ *offset = length;
+ return (NULL);
+ }
+ break;
+ case SCTP_COOKIE_ECHO:
+ SCTPDBG(SCTP_DEBUG_INPUT3,
+ "SCTP_COOKIE-ECHO, stcb %p\n", stcb);
+ if ((stcb) && (stcb->asoc.total_output_queue_size)) {
+ ;
+ } else {
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ /* We are not interested anymore */
+ abend:
+ if (stcb) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ *offset = length;
+ return (NULL);
+ }
+ }
+ /*
+ * First, are we accepting? We do this again here
+ * since it is possible that a previous endpoint WAS
+ * listening, responded to an INIT-ACK and then
+ * closed. We opened and bound... and are now no
+ * longer listening.
+ */
+
+ if ((stcb == NULL) && (inp->sctp_socket->so_qlen >= inp->sctp_socket->so_qlimit)) {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
+ (SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit))) {
+ struct mbuf *oper;
+ struct sctp_paramhdr *phdr;
+
+ oper = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ SCTP_BUF_LEN(oper) =
+ sizeof(struct sctp_paramhdr);
+ phdr = mtod(oper,
+ struct sctp_paramhdr *);
+ phdr->param_type =
+ htons(SCTP_CAUSE_OUT_OF_RESC);
+ phdr->param_length =
+ htons(sizeof(struct sctp_paramhdr));
+ }
+ sctp_abort_association(inp, stcb, m,
+ iphlen, sh, oper, vrf_id, port);
+ }
+ *offset = length;
+ return (NULL);
+ } else {
+ struct mbuf *ret_buf;
+ struct sctp_inpcb *linp;
+
+ if (stcb) {
+ linp = NULL;
+ } else {
+ linp = inp;
+ }
+
+ if (linp) {
+ SCTP_ASOC_CREATE_LOCK(linp);
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) {
+ SCTP_ASOC_CREATE_UNLOCK(linp);
+ goto abend;
+ }
+ }
+ if (netp) {
+ ret_buf =
+ sctp_handle_cookie_echo(m, iphlen,
+ *offset, sh,
+ (struct sctp_cookie_echo_chunk *)ch,
+ &inp, &stcb, netp,
+ auth_skipped,
+ auth_offset,
+ auth_len,
+ &locked_tcb,
+ vrf_id,
+ port);
+ } else {
+ ret_buf = NULL;
+ }
+ if (linp) {
+ SCTP_ASOC_CREATE_UNLOCK(linp);
+ }
+ if (ret_buf == NULL) {
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ SCTPDBG(SCTP_DEBUG_INPUT3,
+ "GAK, null buffer\n");
+ auth_skipped = 0;
+ *offset = length;
+ return (NULL);
+ }
+ /* if AUTH skipped, see if it verified... */
+ if (auth_skipped) {
+ got_auth = 1;
+ auth_skipped = 0;
+ }
+ if (!TAILQ_EMPTY(&stcb->asoc.sent_queue)) {
+ /*
+ * Restart the timer if we have
+ * pending data
+ */
+ struct sctp_tmit_chunk *chk;
+
+ chk = TAILQ_FIRST(&stcb->asoc.sent_queue);
+ if (chk) {
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND,
+ stcb->sctp_ep, stcb,
+ chk->whoTo);
+ }
+ }
+ }
+ break;
+ case SCTP_COOKIE_ACK:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_COOKIE-ACK, stcb %p\n", stcb);
+ if ((stcb == NULL) || chk_length != sizeof(struct sctp_cookie_ack_chunk)) {
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ /* We are not interested anymore */
+ if ((stcb) && (stcb->asoc.total_output_queue_size)) {
+ ;
+ } else if (stcb) {
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(inp);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+#endif
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_27);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ *offset = length;
+ return (NULL);
+ }
+ }
+ /* He's alive so give him credit */
+ if ((stcb) && netp && *netp) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, *netp);
+ }
+ break;
+ case SCTP_ECN_ECHO:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN-ECHO\n");
+ /* He's alive so give him credit */
+ if ((stcb == NULL) || (chk_length != sizeof(struct sctp_ecne_chunk))) {
+ /* It's not ours */
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ *offset = length;
+ return (NULL);
+ }
+ if (stcb) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ sctp_handle_ecn_echo((struct sctp_ecne_chunk *)ch,
+ stcb);
+ }
+ break;
+ case SCTP_ECN_CWR:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN-CWR\n");
+ /* He's alive so give him credit */
+ if ((stcb == NULL) || (chk_length != sizeof(struct sctp_cwr_chunk))) {
+ /* It's not ours */
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ *offset = length;
+ return (NULL);
+ }
+ if (stcb) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ sctp_handle_ecn_cwr((struct sctp_cwr_chunk *)ch, stcb);
+ }
+ break;
+ case SCTP_SHUTDOWN_COMPLETE:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN-COMPLETE, stcb %p\n", stcb);
+ /* must be first and only chunk */
+ if ((num_chunks > 1) ||
+ (length - *offset > (int)SCTP_SIZE32(chk_length))) {
+ *offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ return (NULL);
+ }
+ if ((stcb) && netp && *netp) {
+ sctp_handle_shutdown_complete((struct sctp_shutdown_complete_chunk *)ch,
+ stcb, *netp);
+ }
+ *offset = length;
+ return (NULL);
+ break;
+ case SCTP_ASCONF:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF\n");
+ /* He's alive so give him credit */
+ if (stcb) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ sctp_handle_asconf(m, *offset,
+ (struct sctp_asconf_chunk *)ch, stcb, asconf_cnt == 0);
+ asconf_cnt++;
+ }
+ break;
+ case SCTP_ASCONF_ACK:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF-ACK\n");
+ if (chk_length < sizeof(struct sctp_asconf_ack_chunk)) {
+ /* It's not ours */
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ *offset = length;
+ return (NULL);
+ }
+ if ((stcb) && netp && *netp) {
+ /* He's alive so give him credit */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ sctp_handle_asconf_ack(m, *offset,
+ (struct sctp_asconf_ack_chunk *)ch, stcb, *netp, &abort_no_unlock);
+ if (abort_no_unlock)
+ return (NULL);
+ }
+ break;
+ case SCTP_FORWARD_CUM_TSN:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_FWD-TSN\n");
+ if (chk_length < sizeof(struct sctp_forward_tsn_chunk)) {
+ /* It's not ours */
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ *offset = length;
+ return (NULL);
+ }
+ /* He's alive so give him credit */
+ if (stcb) {
+ int abort_flag = 0;
+
+ stcb->asoc.overall_error_count = 0;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ *fwd_tsn_seen = 1;
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ /* We are not interested anymore */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(inp);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+#endif
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_29);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ *offset = length;
+ return (NULL);
+ }
+ sctp_handle_forward_tsn(stcb,
+ (struct sctp_forward_tsn_chunk *)ch, &abort_flag, m, *offset);
+ if (abort_flag) {
+ *offset = length;
+ return (NULL);
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ }
+
+ }
+ break;
+ case SCTP_STREAM_RESET:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_STREAM_RESET\n");
+ if (((stcb == NULL) || (ch == NULL) || (chk_length < sizeof(struct sctp_stream_reset_tsn_req)))) {
+ /* It's not ours */
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ *offset = length;
+ return (NULL);
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ /* We are not interested anymore */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(inp);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+#endif
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_30);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ *offset = length;
+ return (NULL);
+ }
+ if (stcb->asoc.peer_supports_strreset == 0) {
+ /*
+ * hmm, peer should have announced this, but
+ * we will turn it on since he is sending us
+ * a stream reset.
+ */
+ stcb->asoc.peer_supports_strreset = 1;
+ }
+ if (sctp_handle_stream_reset(stcb, m, *offset, (struct sctp_stream_reset_out_req *)ch)) {
+ /* stop processing */
+ *offset = length;
+ return (NULL);
+ }
+ break;
+ case SCTP_PACKET_DROPPED:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_PACKET_DROPPED\n");
+ /* re-get it all please */
+ if (chk_length < sizeof(struct sctp_pktdrop_chunk)) {
+ /* It's not ours */
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ *offset = length;
+ return (NULL);
+ }
+ if (ch && (stcb) && netp && (*netp)) {
+ sctp_handle_packet_dropped((struct sctp_pktdrop_chunk *)ch,
+ stcb, *netp,
+ min(chk_length, (sizeof(chunk_buf) - 4)));
+
+ }
+ break;
+
+ case SCTP_AUTHENTICATION:
+ SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_AUTHENTICATION\n");
+ if (SCTP_BASE_SYSCTL(sctp_auth_disable))
+ goto unknown_chunk;
+
+ if (stcb == NULL) {
+ /* save the first AUTH for later processing */
+ if (auth_skipped == 0) {
+ auth_offset = *offset;
+ auth_len = chk_length;
+ auth_skipped = 1;
+ }
+ /* skip this chunk (temporarily) */
+ goto next_chunk;
+ }
+ if ((chk_length < (sizeof(struct sctp_auth_chunk))) ||
+ (chk_length > (sizeof(struct sctp_auth_chunk) +
+ SCTP_AUTH_DIGEST_LEN_MAX))) {
+ /* It's not ours */
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ *offset = length;
+ return (NULL);
+ }
+ if (got_auth == 1) {
+ /* skip this chunk... it's already auth'd */
+ goto next_chunk;
+ }
+ got_auth = 1;
+ if ((ch == NULL) || sctp_handle_auth(stcb, (struct sctp_auth_chunk *)ch,
+ m, *offset)) {
+ /* auth HMAC failed so dump the packet */
+ *offset = length;
+ return (stcb);
+ } else {
+ /* remaining chunks are HMAC checked */
+ stcb->asoc.authenticated = 1;
+ }
+ break;
+
+ default:
+ unknown_chunk:
+ /* it's an unknown chunk! */
+ if ((ch->chunk_type & 0x40) && (stcb != NULL)) {
+ struct mbuf *mm;
+ struct sctp_paramhdr *phd;
+
+ mm = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (mm) {
+ phd = mtod(mm, struct sctp_paramhdr *);
+ /*
+ * We cheat and use the param type since
+ * we did not bother to define an
+ * error cause struct. They are the
+ * same basic format with different
+ * names.
+ */
+ phd->param_type = htons(SCTP_CAUSE_UNRECOG_CHUNK);
+ phd->param_length = htons(chk_length + sizeof(*phd));
+ SCTP_BUF_LEN(mm) = sizeof(*phd);
+ SCTP_BUF_NEXT(mm) = SCTP_M_COPYM(m, *offset, SCTP_SIZE32(chk_length),
+ M_DONTWAIT);
+ if (SCTP_BUF_NEXT(mm)) {
+#ifdef SCTP_MBUF_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ struct mbuf *mat;
+
+ mat = SCTP_BUF_NEXT(mm);
+ while (mat) {
+ if (SCTP_BUF_IS_EXTENDED(mat)) {
+ sctp_log_mb(mat, SCTP_MBUF_ICOPY);
+ }
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ }
+#endif
+ sctp_queue_op_err(stcb, mm);
+ } else {
+ sctp_m_freem(mm);
+ }
+ }
+ }
+ if ((ch->chunk_type & 0x80) == 0) {
+ /* discard this packet */
+ *offset = length;
+ return (stcb);
+ } /* else skip this bad chunk and continue... */
+ break;
+ } /* switch (ch->chunk_type) */
+
+
+next_chunk:
+ /* get the next chunk */
+ *offset += SCTP_SIZE32(chk_length);
+ if (*offset >= length) {
+ /* no more data left in the mbuf chain */
+ break;
+ }
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset,
+ sizeof(struct sctp_chunkhdr), chunk_buf);
+ if (ch == NULL) {
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ *offset = length;
+ return (NULL);
+ }
+ } /* while */
+
+ if (asconf_cnt > 0 && stcb != NULL) {
+ sctp_send_asconf_ack(stcb);
+ }
+ return (stcb);
+}
+
+
+/*
+ * Process the ECN bits. We have something set, so we must look to see if it
+ * is ECN(0), ECN(1), or CE.
+ */
+static void
+sctp_process_ecn_marked_a(struct sctp_tcb *stcb, struct sctp_nets *net,
+ uint8_t ecn_bits)
+{
+ if ((ecn_bits & SCTP_CE_BITS) == SCTP_CE_BITS) {
+ ;
+ } else if ((ecn_bits & SCTP_ECT1_BIT) == SCTP_ECT1_BIT) {
+ /*
+ * we only add to the nonce sum for ECT1; ECT0 does not
+ * change the NS bit (which we have yet to find a way to
+ * send).
+ */
+
+ /* ECN Nonce stuff */
+ stcb->asoc.receiver_nonce_sum++;
+ stcb->asoc.receiver_nonce_sum &= SCTP_SACK_NONCE_SUM;
+
+ /*
+ * Drag up the last_echo point if cumack is larger since we
+ * don't want the point falling way behind by more than
+ * 2^31 and then having it be incorrect.
+ */
+ if (compare_with_wrap(stcb->asoc.cumulative_tsn,
+ stcb->asoc.last_echo_tsn, MAX_TSN)) {
+ stcb->asoc.last_echo_tsn = stcb->asoc.cumulative_tsn;
+ }
+ } else if ((ecn_bits & SCTP_ECT0_BIT) == SCTP_ECT0_BIT) {
+ /*
+ * Drag up the last_echo point if cumack is larger since we
+ * don't want the point falling way behind by more than
+ * 2^31 and then having it be incorrect.
+ */
+ if (compare_with_wrap(stcb->asoc.cumulative_tsn,
+ stcb->asoc.last_echo_tsn, MAX_TSN)) {
+ stcb->asoc.last_echo_tsn = stcb->asoc.cumulative_tsn;
+ }
+ }
+}
+
+static void
+sctp_process_ecn_marked_b(struct sctp_tcb *stcb, struct sctp_nets *net,
+ uint32_t high_tsn, uint8_t ecn_bits)
+{
+ if ((ecn_bits & SCTP_CE_BITS) == SCTP_CE_BITS) {
+ /*
+ * we possibly must notify the sender that a congestion
+ * window reduction is in order. We do this by adding an ECNE
+ * chunk to the output chunk queue. The incoming CWR will
+ * remove this chunk.
+ */
+ if (compare_with_wrap(high_tsn, stcb->asoc.last_echo_tsn,
+ MAX_TSN)) {
+ /* Yep, we need to add an ECNE */
+ sctp_send_ecn_echo(stcb, net, high_tsn);
+ stcb->asoc.last_echo_tsn = high_tsn;
+ }
+ }
+}
+
+#ifdef INVARIANTS
+#ifdef __GNUC__
+__attribute__((noinline))
+#endif
+ void
+ sctp_validate_no_locks(struct sctp_inpcb *inp)
+{
+ struct sctp_tcb *lstcb;
+
+ LIST_FOREACH(lstcb, &inp->sctp_asoc_list, sctp_tcblist) {
+ if (mtx_owned(&lstcb->tcb_mtx)) {
+ panic("Own lock on stcb at return from input");
+ }
+ }
+ if (mtx_owned(&inp->inp_create_mtx)) {
+ panic("Own create lock on inp");
+ }
+ if (mtx_owned(&inp->inp_mtx)) {
+ panic("Own inp lock on inp");
+ }
+}
+
+#endif
+
+/*
+ * common input chunk processing (v4 and v6)
+ */
+void
+sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset,
+ int length, struct sctphdr *sh, struct sctp_chunkhdr *ch,
+ struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net,
+ uint8_t ecn_bits, uint32_t vrf_id, uint16_t port)
+{
+ /*
+ * Control chunk processing
+ */
+ uint32_t high_tsn;
+ int fwd_tsn_seen = 0, data_processed = 0;
+ struct mbuf *m = *mm;
+ int abort_flag = 0;
+ int un_sent;
+
+ SCTP_STAT_INCR(sctps_recvdatagrams);
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xE0, 1);
+ sctp_auditing(0, inp, stcb, net);
+#endif
+
+ SCTPDBG(SCTP_DEBUG_INPUT1, "Ok, Common input processing called, m:%p iphlen:%d offset:%d length:%d stcb:%p\n",
+ m, iphlen, offset, length, stcb);
+ if (stcb) {
+ /* always clear this before beginning a packet */
+ stcb->asoc.authenticated = 0;
+ stcb->asoc.seen_a_sack_this_pkt = 0;
+ SCTPDBG(SCTP_DEBUG_INPUT1, "stcb:%p state:%x\n",
+ stcb, stcb->asoc.state);
+
+ if ((stcb->asoc.state & SCTP_STATE_WAS_ABORTED) ||
+ (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED)) {
+ /*-
+ * If we hit here, we had a ref count
+ * up when the assoc was aborted and the
+ * timer is clearing out the assoc; we should
+ * NOT respond to any packet... it's OOTB.
+ */
+ SCTP_TCB_UNLOCK(stcb);
+ sctp_handle_ootb(m, iphlen, offset, sh, inp, NULL,
+ vrf_id, port);
+ goto out_now;
+ }
+ }
+ if (IS_SCTP_CONTROL(ch)) {
+ /* process the control portion of the SCTP packet */
+ /* sa_ignore NO_NULL_CHK */
+ stcb = sctp_process_control(m, iphlen, &offset, length, sh, ch,
+ inp, stcb, &net, &fwd_tsn_seen, vrf_id, port);
+ if (stcb) {
+ /*
+ * This covers us if the cookie-echo was there and
+ * it changes our INP.
+ */
+ inp = stcb->sctp_ep;
+ if ((net) && (port)) {
+ if (net->port == 0) {
+ sctp_pathmtu_adjustment(inp, stcb, net, net->mtu - sizeof(struct udphdr));
+ }
+ net->port = port;
+ }
+ }
+ } else {
+ /*
+ * no control chunks, so pre-process DATA chunks (these
+ * checks are taken care of by control processing)
+ */
+
+ /*
+ * if DATA only packet, and auth is required, then punt...
+ * can't have authenticated without any AUTH (control)
+ * chunks
+ */
+ if ((stcb != NULL) &&
+ !SCTP_BASE_SYSCTL(sctp_auth_disable) &&
+ sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks)) {
+ /* "silently" ignore */
+ SCTP_STAT_INCR(sctps_recvauthmissing);
+ SCTP_TCB_UNLOCK(stcb);
+ goto out_now;
+ }
+ if (stcb == NULL) {
+ /* out of the blue DATA chunk */
+ sctp_handle_ootb(m, iphlen, offset, sh, inp, NULL,
+ vrf_id, port);
+ goto out_now;
+ }
+ if (stcb->asoc.my_vtag != ntohl(sh->v_tag)) {
+ /* v_tag mismatch! */
+ SCTP_STAT_INCR(sctps_badvtag);
+ SCTP_TCB_UNLOCK(stcb);
+ goto out_now;
+ }
+ }
+
+ if (stcb == NULL) {
+ /*
+ * no valid TCB for this packet, or we found it's a bad
+ * packet while processing control, or we're done with this
+ * packet (done or skip rest of data), so we drop it...
+ */
+ goto out_now;
+ }
+ /*
+ * DATA chunk processing
+ */
+ /* plow through the data chunks while length > offset */
+
+ /*
+ * Rest should be DATA only. Check authentication state if AUTH for
+ * DATA is required.
+ */
+ if ((length > offset) &&
+ (stcb != NULL) &&
+ !SCTP_BASE_SYSCTL(sctp_auth_disable) &&
+ sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks) &&
+ !stcb->asoc.authenticated) {
+ /* "silently" ignore */
+ SCTP_STAT_INCR(sctps_recvauthmissing);
+ SCTPDBG(SCTP_DEBUG_AUTH1,
+ "Data chunk requires AUTH, skipped\n");
+ goto trigger_send;
+ }
+ if (length > offset) {
+ int retval;
+
+ /*
+ * First check to make sure our state is correct. We would
+ * not get here unless we really did have a tag, so we don't
+ * abort if this happens, just dump the chunk silently.
+ */
+ switch (SCTP_GET_STATE(&stcb->asoc)) {
+ case SCTP_STATE_COOKIE_ECHOED:
+ /*
+ * we consider that data with valid tags in this state
+ * shows us the cookie-ack was lost. Imply that it was
+ * there.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
+ stcb->asoc.overall_error_count,
+ 0,
+ SCTP_FROM_SCTP_INPUT,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count = 0;
+ sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, net);
+ break;
+ case SCTP_STATE_COOKIE_WAIT:
+ /*
+ * We consider OOTB any data sent during asoc setup.
+ */
+ sctp_handle_ootb(m, iphlen, offset, sh, inp, NULL,
+ vrf_id, port);
+ SCTP_TCB_UNLOCK(stcb);
+ goto out_now;
+ /* sa_ignore NOTREACHED */
+ break;
+ case SCTP_STATE_EMPTY: /* should not happen */
+ case SCTP_STATE_INUSE: /* should not happen */
+ case SCTP_STATE_SHUTDOWN_RECEIVED: /* This is a peer error */
+ case SCTP_STATE_SHUTDOWN_ACK_SENT:
+ default:
+ SCTP_TCB_UNLOCK(stcb);
+ goto out_now;
+ /* sa_ignore NOTREACHED */
+ break;
+ case SCTP_STATE_OPEN:
+ case SCTP_STATE_SHUTDOWN_SENT:
+ break;
+ }
+ /* take care of ECN, part 1. */
+ if (stcb->asoc.ecn_allowed &&
+ (ecn_bits & (SCTP_ECT0_BIT | SCTP_ECT1_BIT))) {
+ sctp_process_ecn_marked_a(stcb, net, ecn_bits);
+ }
+ /* plow through the data chunks while length > offset */
+ retval = sctp_process_data(mm, iphlen, &offset, length, sh,
+ inp, stcb, net, &high_tsn);
+ if (retval == 2) {
+ /*
+ * The association aborted, NO UNLOCK needed since
+ * the association is destroyed.
+ */
+ goto out_now;
+ }
+ data_processed = 1;
+ if (retval == 0) {
+ /* take care of ecn part 2. */
+ if (stcb->asoc.ecn_allowed &&
+ (ecn_bits & (SCTP_ECT0_BIT | SCTP_ECT1_BIT))) {
+ sctp_process_ecn_marked_b(stcb, net, high_tsn,
+ ecn_bits);
+ }
+ }
+ /*
+ * Anything important needs to have been m_copy'ed in
+ * process_data
+ */
+ }
+ if ((data_processed == 0) && (fwd_tsn_seen)) {
+ int was_a_gap;
+ uint32_t highest_tsn;
+
+ if (compare_with_wrap(stcb->asoc.highest_tsn_inside_nr_map, stcb->asoc.highest_tsn_inside_map, MAX_TSN)) {
+ highest_tsn = stcb->asoc.highest_tsn_inside_nr_map;
+ } else {
+ highest_tsn = stcb->asoc.highest_tsn_inside_map;
+ }
+ was_a_gap = compare_with_wrap(highest_tsn, stcb->asoc.cumulative_tsn, MAX_TSN);
+ stcb->asoc.send_sack = 1;
+ sctp_sack_check(stcb, was_a_gap, &abort_flag);
+ if (abort_flag) {
+ /* Again, we aborted so NO UNLOCK needed */
+ goto out_now;
+ }
+ } else if (fwd_tsn_seen) {
+ stcb->asoc.send_sack = 1;
+ }
+ /* trigger send of any chunks in queue... */
+trigger_send:
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xE0, 2);
+ sctp_auditing(1, inp, stcb, net);
+#endif
+ SCTPDBG(SCTP_DEBUG_INPUT1,
+ "Check for chunk output prw:%d tqe:%d tf=%d\n",
+ stcb->asoc.peers_rwnd,
+ TAILQ_EMPTY(&stcb->asoc.control_send_queue),
+ stcb->asoc.total_flight);
+ un_sent = (stcb->asoc.total_output_queue_size - stcb->asoc.total_flight);
+
+ if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue) ||
+ ((un_sent) &&
+ (stcb->asoc.peers_rwnd > 0 ||
+ (stcb->asoc.peers_rwnd <= 0 && stcb->asoc.total_flight == 0)))) {
+ SCTPDBG(SCTP_DEBUG_INPUT3, "Calling chunk OUTPUT\n");
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED);
+ SCTPDBG(SCTP_DEBUG_INPUT3, "chunk OUTPUT returns\n");
+ }
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xE0, 3);
+ sctp_auditing(2, inp, stcb, net);
+#endif
+ SCTP_TCB_UNLOCK(stcb);
+out_now:
+#ifdef INVARIANTS
+ sctp_validate_no_locks(inp);
+#endif
+ return;
+}
+
+#if 0
+static void
+sctp_print_mbuf_chain(struct mbuf *m)
+{
+ for (; m; m = SCTP_BUF_NEXT(m)) {
+ printf("%p: m_len = %ld\n", m, SCTP_BUF_LEN(m));
+ if (SCTP_BUF_IS_EXTENDED(m))
+ printf("%p: extend_size = %d\n", m, SCTP_BUF_EXTEND_SIZE(m));
+ }
+}
+
+#endif
+
+void
+sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port)
+{
+#ifdef SCTP_MBUF_LOGGING
+ struct mbuf *mat;
+
+#endif
+ struct mbuf *m;
+ int iphlen;
+ uint32_t vrf_id = 0;
+ uint8_t ecn_bits;
+ struct ip *ip;
+ struct sctphdr *sh;
+ struct sctp_inpcb *inp = NULL;
+ struct sctp_nets *net;
+ struct sctp_tcb *stcb = NULL;
+ struct sctp_chunkhdr *ch;
+ int refcount_up = 0;
+ int length, mlen, offset;
+
+#if !defined(SCTP_WITH_NO_CSUM)
+ uint32_t check, calc_check;
+
+#endif
+
+ if (SCTP_GET_PKT_VRFID(i_pak, vrf_id)) {
+ SCTP_RELEASE_PKT(i_pak);
+ return;
+ }
+ mlen = SCTP_HEADER_LEN(i_pak);
+ iphlen = off;
+ m = SCTP_HEADER_TO_CHAIN(i_pak);
+
+ net = NULL;
+ SCTP_STAT_INCR(sctps_recvpackets);
+ SCTP_STAT_INCR_COUNTER64(sctps_inpackets);
+
+
+#ifdef SCTP_MBUF_LOGGING
+ /* Log in any input mbufs */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ mat = m;
+ while (mat) {
+ if (SCTP_BUF_IS_EXTENDED(mat)) {
+ sctp_log_mb(mat, SCTP_MBUF_INPUT);
+ }
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ }
+#endif
+#ifdef SCTP_PACKET_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING)
+ sctp_packet_log(m, mlen);
+#endif
+ /*
+ * Must take out the iphlen, since mlen expects this (only affects the
+ * lb case)
+ */
+ mlen -= iphlen;
+
+ /*
+ * Get IP, SCTP, and first chunk header together in first mbuf.
+ */
+ ip = mtod(m, struct ip *);
+ offset = iphlen + sizeof(*sh) + sizeof(*ch);
+ if (SCTP_BUF_LEN(m) < offset) {
+ if ((m = m_pullup(m, offset)) == 0) {
+ SCTP_STAT_INCR(sctps_hdrops);
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ }
+ /* validate mbuf chain length with IP payload length */
+ if (mlen < (SCTP_GET_IPV4_LENGTH(ip) - iphlen)) {
+ SCTP_STAT_INCR(sctps_hdrops);
+ goto bad;
+ }
+ sh = (struct sctphdr *)((caddr_t)ip + iphlen);
+ ch = (struct sctp_chunkhdr *)((caddr_t)sh + sizeof(*sh));
+ SCTPDBG(SCTP_DEBUG_INPUT1,
+ "sctp_input() length:%d iphlen:%d\n", mlen, iphlen);
+
+ /* SCTP does not allow broadcasts or multicasts */
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
+ goto bad;
+ }
+ if (SCTP_IS_IT_BROADCAST(ip->ip_dst, m)) {
+ /*
+ * We only look at broadcast if it's a front state. All
+ * others we will not have a tcb for anyway.
+ */
+ goto bad;
+ }
+ /* validate SCTP checksum */
+ SCTPDBG(SCTP_DEBUG_CRCOFFLOAD,
+ "sctp_input(): Packet of length %d received on %s with csum_flags 0x%x.\n",
+ m->m_pkthdr.len,
+ if_name(m->m_pkthdr.rcvif),
+ m->m_pkthdr.csum_flags);
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_recvnocrc);
+#else
+ if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) {
+ SCTP_STAT_INCR(sctps_recvhwcrc);
+ goto sctp_skip_csum_4;
+ }
+ check = sh->checksum; /* save incoming checksum */
+ sh->checksum = 0; /* prepare for calc */
+ calc_check = sctp_calculate_cksum(m, iphlen);
+ sh->checksum = check;
+ SCTP_STAT_INCR(sctps_recvswcrc);
+ if (calc_check != check) {
+ SCTPDBG(SCTP_DEBUG_INPUT1, "Bad CSUM on SCTP packet calc_check:%x check:%x m:%p mlen:%d iphlen:%d\n",
+ calc_check, check, m, mlen, iphlen);
+
+ stcb = sctp_findassociation_addr(m, iphlen,
+ offset - sizeof(*ch),
+ sh, ch, &inp, &net,
+ vrf_id);
+ if ((net) && (port)) {
+ if (net->port == 0) {
+ sctp_pathmtu_adjustment(inp, stcb, net, net->mtu - sizeof(struct udphdr));
+ }
+ net->port = port;
+ }
+ if ((inp) && (stcb)) {
+ sctp_send_packet_dropped(stcb, net, m, iphlen, 1);
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_INPUT_ERROR, SCTP_SO_NOT_LOCKED);
+ } else if ((inp != NULL) && (stcb == NULL)) {
+ refcount_up = 1;
+ }
+ SCTP_STAT_INCR(sctps_badsum);
+ SCTP_STAT_INCR_COUNTER32(sctps_checksumerrors);
+ goto bad;
+ }
+sctp_skip_csum_4:
+#endif
+ /* destination port of 0 is illegal, based on RFC2960. */
+ if (sh->dest_port == 0) {
+ SCTP_STAT_INCR(sctps_hdrops);
+ goto bad;
+ }
+ /*
+ * Locate the pcb and tcb for the datagram; sctp_findassociation_addr() wants
+ * the IP/SCTP/first chunk header...
+ */
+ stcb = sctp_findassociation_addr(m, iphlen, offset - sizeof(*ch),
+ sh, ch, &inp, &net, vrf_id);
+ if ((net) && (port)) {
+ if (net->port == 0) {
+ sctp_pathmtu_adjustment(inp, stcb, net, net->mtu - sizeof(struct udphdr));
+ }
+ net->port = port;
+ }
+ /* inp's ref-count increased && stcb locked */
+ if (inp == NULL) {
+ struct sctp_init_chunk *init_chk, chunk_buf;
+
+ SCTP_STAT_INCR(sctps_noport);
+#ifdef ICMP_BANDLIM
+ /*
+ * we use the bandwidth limiting to protect against sending
+ * too many ABORTS all at once. In this case these count the
+ * same as an ICMP message.
+ */
+ if (badport_bandlim(0) < 0)
+ goto bad;
+#endif /* ICMP_BANDLIM */
+ SCTPDBG(SCTP_DEBUG_INPUT1,
+ "Sending a ABORT from packet entry!\n");
+ if (ch->chunk_type == SCTP_INITIATION) {
+ /*
+ * we do a trick here to get the INIT tag, dig in
+ * and get the tag from the INIT and put it in the
+ * common header.
+ */
+ init_chk = (struct sctp_init_chunk *)sctp_m_getptr(m,
+ iphlen + sizeof(*sh), sizeof(*init_chk),
+ (uint8_t *) & chunk_buf);
+ if (init_chk != NULL)
+ sh->v_tag = init_chk->init.initiate_tag;
+ }
+ if (ch->chunk_type == SCTP_SHUTDOWN_ACK) {
+ sctp_send_shutdown_complete2(m, iphlen, sh, vrf_id, port);
+ goto bad;
+ }
+ if (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) {
+ goto bad;
+ }
+ if (ch->chunk_type != SCTP_ABORT_ASSOCIATION)
+ sctp_send_abort(m, iphlen, sh, 0, NULL, vrf_id, port);
+ goto bad;
+ } else if (stcb == NULL) {
+ refcount_up = 1;
+ }
+#ifdef IPSEC
+ /*
+ * I very much doubt any of the IPSEC stuff will work but I have no
+ * idea, so I will leave it in place.
+ */
+ if (inp && ipsec4_in_reject(m, &inp->ip_inp.inp)) {
+ MODULE_GLOBAL(ipsec4stat).in_polvio++;
+ SCTP_STAT_INCR(sctps_hdrops);
+ goto bad;
+ }
+#endif /* IPSEC */
+
+ /*
+ * common chunk processing
+ */
+ length = ip->ip_len + iphlen;
+ offset -= sizeof(struct sctp_chunkhdr);
+
+ ecn_bits = ip->ip_tos;
+
+ /* sa_ignore NO_NULL_CHK */
+ sctp_common_input_processing(&m, iphlen, offset, length, sh, ch,
+ inp, stcb, net, ecn_bits, vrf_id, port);
+ /* inp's ref-count reduced && stcb unlocked */
+ if (m) {
+ sctp_m_freem(m);
+ }
+ if ((inp) && (refcount_up)) {
+ /* reduce ref-count */
+ SCTP_INP_DECR_REF(inp);
+ }
+ return;
+bad:
+ if (stcb) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ if ((inp) && (refcount_up)) {
+ /* reduce ref-count */
+ SCTP_INP_DECR_REF(inp);
+ }
+ if (m) {
+ sctp_m_freem(m);
+ }
+ return;
+}
+void
+sctp_input(i_pak, off)
+ struct mbuf *i_pak;
+ int off;
+{
+ sctp_input_with_port(i_pak, off, 0);
+}
diff --git a/freebsd/sys/netinet/sctp_input.h b/freebsd/sys/netinet/sctp_input.h
new file mode 100644
index 00000000..90cd098a
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_input.h
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_input.h,v 1.6 2005/03/06 16:04:17 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_input_h__
+#define __sctp_input_h__
+
+#if defined(_KERNEL) || defined(__Userspace__)
+void
+sctp_common_input_processing(struct mbuf **, int, int, int,
+ struct sctphdr *, struct sctp_chunkhdr *, struct sctp_inpcb *,
+ struct sctp_tcb *, struct sctp_nets *, uint8_t, uint32_t, uint16_t);
+
+struct sctp_stream_reset_out_request *
+sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq,
+ struct sctp_tmit_chunk **bchk);
+
+void
+sctp_reset_in_stream(struct sctp_tcb *stcb, int number_entries,
+ uint16_t * list);
+
+
+int sctp_is_there_unsent_data(struct sctp_tcb *stcb);
+
+#endif
+#endif
diff --git a/freebsd/sys/netinet/sctp_lock_bsd.h b/freebsd/sys/netinet/sctp_lock_bsd.h
new file mode 100644
index 00000000..81e4a35f
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_lock_bsd.h
@@ -0,0 +1,430 @@
+#ifndef __sctp_lock_bsd_h__
+#define __sctp_lock_bsd_h__
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * General locking concepts: The goal of our locking is to of course provide
+ * consistency and yet minimize overhead. We will attempt to use
+ * non-recursive locks which are supposed to be quite inexpensive. Now in
+ * order to do this the goal is that most functions are not aware of locking.
+ * Once we have a TCB we lock it and unlock when we are through. This means
+ * that the TCB lock is kind-of a "global" lock when working on an
+ * association. Caution must be used when asserting a TCB_LOCK since if we
+ * recurse we deadlock.
+ *
+ * Most other locks (INP and INFO) attempt to localize the locking i.e. we try
+ * to contain the lock and unlock within the function that needs to lock it.
+ * This sometimes means we do extra locks and unlocks and lose a bit of
+ * efficiency, but if the performance statements about non-recursive locks
+ * are true this should not be a problem. One issue that arises with this
+ * lock-only-when-needed approach is that if an implicit association setup
+ * is done we have a problem. If at the time I look up an association I get
+ * NULL back for the tcb, by the time I call to create the association some
+ * other processor could have created it. This is what the CREATE lock on
+ * the endpoint is for. Places where we will be implicitly creating the
+ * association OR just creating an association (the connect call) will
+ * assert the CREATE_INP lock. This assures us that during all the lookups
+ * of INP and INFO, if another creator is also locking/looking up, we can
+ * gate the two to synchronize. So the CREATE_INP lock is another one we
+ * must lock with extreme caution to make sure we don't hit a re-entrancy
+ * issue.
+ *
+ * For non FreeBSD 5.x we provide a bunch of EMPTY lock macros so we can
+ * blatantly put locks everywhere and they reduce to nothing on
+ * NetBSD/OpenBSD and FreeBSD 4.x
+ *
+ */
+
+/*
+ * When working with the global SCTP lists we lock and unlock the INP_INFO
+ * lock. So when we go to lookup an association we will want to do a
+ * SCTP_INP_INFO_RLOCK() and then when we want to add a new association to
+ * the SCTP_BASE_INFO() list's we will do a SCTP_INP_INFO_WLOCK().
+ */
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+
+extern struct sctp_foo_stuff sctp_logoff[];
+extern int sctp_logoff_stuff;
+
+#define SCTP_IPI_COUNT_INIT()
+
+#define SCTP_STATLOG_INIT_LOCK()
+#define SCTP_STATLOG_LOCK()
+#define SCTP_STATLOG_UNLOCK()
+#define SCTP_STATLOG_DESTROY()
+
+#define SCTP_INP_INFO_LOCK_DESTROY() do { \
+ if(rw_wowned(&SCTP_BASE_INFO(ipi_ep_mtx))) { \
+ rw_wunlock(&SCTP_BASE_INFO(ipi_ep_mtx)); \
+ } \
+ rw_destroy(&SCTP_BASE_INFO(ipi_ep_mtx)); \
+ } while (0)
+
+#define SCTP_INP_INFO_LOCK_INIT() \
+ rw_init(&SCTP_BASE_INFO(ipi_ep_mtx), "sctp-info");
+
+
+#define SCTP_INP_INFO_RLOCK() do { \
+ rw_rlock(&SCTP_BASE_INFO(ipi_ep_mtx)); \
+} while (0)
+
+
+#define SCTP_INP_INFO_WLOCK() do { \
+ rw_wlock(&SCTP_BASE_INFO(ipi_ep_mtx)); \
+} while (0)
+
+
+#define SCTP_INP_INFO_RUNLOCK() rw_runlock(&SCTP_BASE_INFO(ipi_ep_mtx))
+#define SCTP_INP_INFO_WUNLOCK() rw_wunlock(&SCTP_BASE_INFO(ipi_ep_mtx))
+
+
+#define SCTP_IPI_ADDR_INIT() \
+ rw_init(&SCTP_BASE_INFO(ipi_addr_mtx), "sctp-addr")
+#define SCTP_IPI_ADDR_DESTROY() do { \
+ if(rw_wowned(&SCTP_BASE_INFO(ipi_addr_mtx))) { \
+ rw_wunlock(&SCTP_BASE_INFO(ipi_addr_mtx)); \
+ } \
+ rw_destroy(&SCTP_BASE_INFO(ipi_addr_mtx)); \
+ } while (0)
+#define SCTP_IPI_ADDR_RLOCK() do { \
+ rw_rlock(&SCTP_BASE_INFO(ipi_addr_mtx)); \
+} while (0)
+#define SCTP_IPI_ADDR_WLOCK() do { \
+ rw_wlock(&SCTP_BASE_INFO(ipi_addr_mtx)); \
+} while (0)
+
+#define SCTP_IPI_ADDR_RUNLOCK() rw_runlock(&SCTP_BASE_INFO(ipi_addr_mtx))
+#define SCTP_IPI_ADDR_WUNLOCK() rw_wunlock(&SCTP_BASE_INFO(ipi_addr_mtx))
+
+
+#define SCTP_IPI_ITERATOR_WQ_INIT() \
+ mtx_init(&sctp_it_ctl.ipi_iterator_wq_mtx, "sctp-it-wq", "sctp_it_wq", MTX_DEF)
+
+#define SCTP_IPI_ITERATOR_WQ_DESTROY() \
+ mtx_destroy(&sctp_it_ctl.ipi_iterator_wq_mtx)
+
+#define SCTP_IPI_ITERATOR_WQ_LOCK() do { \
+ mtx_lock(&sctp_it_ctl.ipi_iterator_wq_mtx); \
+} while (0)
+
+#define SCTP_IPI_ITERATOR_WQ_UNLOCK() mtx_unlock(&sctp_it_ctl.ipi_iterator_wq_mtx)
+
+
+#define SCTP_IP_PKTLOG_INIT() \
+ mtx_init(&SCTP_BASE_INFO(ipi_pktlog_mtx), "sctp-pktlog", "packetlog", MTX_DEF)
+
+
+#define SCTP_IP_PKTLOG_LOCK() do { \
+ mtx_lock(&SCTP_BASE_INFO(ipi_pktlog_mtx)); \
+} while (0)
+
+#define SCTP_IP_PKTLOG_UNLOCK() mtx_unlock(&SCTP_BASE_INFO(ipi_pktlog_mtx))
+
+#define SCTP_IP_PKTLOG_DESTROY() \
+ mtx_destroy(&SCTP_BASE_INFO(ipi_pktlog_mtx))
+
+
+
+
+
+/*
+ * The INP locks are what we use for locking an SCTP endpoint; for example,
+ * if we want to change something at the endpoint level, such as random_store
+ * or the cookie secrets, we lock at the INP level.
+ */
+
+#define SCTP_INP_READ_INIT(_inp) \
+ mtx_init(&(_inp)->inp_rdata_mtx, "sctp-read", "inpr", MTX_DEF | MTX_DUPOK)
+
+#define SCTP_INP_READ_DESTROY(_inp) \
+ mtx_destroy(&(_inp)->inp_rdata_mtx)
+
+#define SCTP_INP_READ_LOCK(_inp) do { \
+ mtx_lock(&(_inp)->inp_rdata_mtx); \
+} while (0)
+
+
+#define SCTP_INP_READ_UNLOCK(_inp) mtx_unlock(&(_inp)->inp_rdata_mtx)
+
+
+#define SCTP_INP_LOCK_INIT(_inp) \
+ mtx_init(&(_inp)->inp_mtx, "sctp-inp", "inp", MTX_DEF | MTX_DUPOK)
+#define SCTP_ASOC_CREATE_LOCK_INIT(_inp) \
+ mtx_init(&(_inp)->inp_create_mtx, "sctp-create", "inp_create", \
+ MTX_DEF | MTX_DUPOK)
+
+#define SCTP_INP_LOCK_DESTROY(_inp) \
+ mtx_destroy(&(_inp)->inp_mtx)
+
+#define SCTP_INP_LOCK_CONTENDED(_inp) ((_inp)->inp_mtx.mtx_lock & MTX_CONTESTED)
+
+#define SCTP_INP_READ_CONTENDED(_inp) ((_inp)->inp_rdata_mtx.mtx_lock & MTX_CONTESTED)
+
+#define SCTP_ASOC_CREATE_LOCK_CONTENDED(_inp) ((_inp)->inp_create_mtx.mtx_lock & MTX_CONTESTED)
+
+
+#define SCTP_ASOC_CREATE_LOCK_DESTROY(_inp) \
+ mtx_destroy(&(_inp)->inp_create_mtx)
+
+
+#ifdef SCTP_LOCK_LOGGING
+#define SCTP_INP_RLOCK(_inp) do { \
+ if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) sctp_log_lock(_inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_INP);\
+ mtx_lock(&(_inp)->inp_mtx); \
+} while (0)
+
+#define SCTP_INP_WLOCK(_inp) do { \
+ if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) sctp_log_lock(_inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_INP);\
+ mtx_lock(&(_inp)->inp_mtx); \
+} while (0)
+
+#else
+
+#define SCTP_INP_RLOCK(_inp) do { \
+ mtx_lock(&(_inp)->inp_mtx); \
+} while (0)
+
+#define SCTP_INP_WLOCK(_inp) do { \
+ mtx_lock(&(_inp)->inp_mtx); \
+} while (0)
+
+#endif
+
+
+#define SCTP_TCB_SEND_LOCK_INIT(_tcb) \
+ mtx_init(&(_tcb)->tcb_send_mtx, "sctp-send-tcb", "tcbs", MTX_DEF | MTX_DUPOK)
+
+#define SCTP_TCB_SEND_LOCK_DESTROY(_tcb) mtx_destroy(&(_tcb)->tcb_send_mtx)
+
+#define SCTP_TCB_SEND_LOCK(_tcb) do { \
+ mtx_lock(&(_tcb)->tcb_send_mtx); \
+} while (0)
+
+#define SCTP_TCB_SEND_UNLOCK(_tcb) mtx_unlock(&(_tcb)->tcb_send_mtx)
+
+#define SCTP_INP_INCR_REF(_inp) atomic_add_int(&((_inp)->refcount), 1)
+#define SCTP_INP_DECR_REF(_inp) atomic_add_int(&((_inp)->refcount), -1)
+
+
+#ifdef SCTP_LOCK_LOGGING
+#define SCTP_ASOC_CREATE_LOCK(_inp) \
+ do { \
+ if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) sctp_log_lock(_inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_CREATE); \
+ mtx_lock(&(_inp)->inp_create_mtx); \
+ } while (0)
+#else
+
+#define SCTP_ASOC_CREATE_LOCK(_inp) \
+ do { \
+ mtx_lock(&(_inp)->inp_create_mtx); \
+ } while (0)
+#endif
+
+#define SCTP_INP_RUNLOCK(_inp) mtx_unlock(&(_inp)->inp_mtx)
+#define SCTP_INP_WUNLOCK(_inp) mtx_unlock(&(_inp)->inp_mtx)
+#define SCTP_ASOC_CREATE_UNLOCK(_inp) mtx_unlock(&(_inp)->inp_create_mtx)
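+
+/*
+ * A minimal sketch, kept out of the build with #if 0, of how the lookup
+ * and create locks described above are meant to pair up: readers take the
+ * INP_INFO read lock to search, and implicit association creation is gated
+ * by the endpoint CREATE lock. The example_find_assoc()/example_create_assoc()
+ * helpers are hypothetical placeholders, not part of this header.
+ */
+#if 0
+static struct sctp_tcb *
+example_lookup_or_create(struct sctp_inpcb *inp)
+{
+        struct sctp_tcb *stcb;
+
+        SCTP_INP_INFO_RLOCK();
+        stcb = example_find_assoc(inp);        /* hypothetical lookup helper */
+        SCTP_INP_INFO_RUNLOCK();
+        if (stcb == NULL) {
+                /*
+                 * Gate creation so two processors cannot both build the
+                 * same association behind each other's back.
+                 */
+                SCTP_ASOC_CREATE_LOCK(inp);
+                stcb = example_find_assoc(inp);        /* re-check under the lock */
+                if (stcb == NULL)
+                        stcb = example_create_assoc(inp);        /* hypothetical */
+                SCTP_ASOC_CREATE_UNLOCK(inp);
+        }
+        return (stcb);
+}
+#endif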
+
+/*
+ * For the majority of things (once we have found the association) we will
+ * lock the actual association mutex. This will protect all the association
+ * level queues and streams and such. We will need to lock the socket layer
+ * when we stuff data up into the receiving sb_mb. I.e. we will need to do an
+ * extra SOCKBUF_LOCK(&so->so_rcv) even though the association is locked.
+ */
+
+#define SCTP_TCB_LOCK_INIT(_tcb) \
+ mtx_init(&(_tcb)->tcb_mtx, "sctp-tcb", "tcb", MTX_DEF | MTX_DUPOK)
+
+#define SCTP_TCB_LOCK_DESTROY(_tcb) mtx_destroy(&(_tcb)->tcb_mtx)
+
+#ifdef SCTP_LOCK_LOGGING
+#define SCTP_TCB_LOCK(_tcb) do { \
+ if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) sctp_log_lock(_tcb->sctp_ep, _tcb, SCTP_LOG_LOCK_TCB); \
+ mtx_lock(&(_tcb)->tcb_mtx); \
+} while (0)
+
+#else
+#define SCTP_TCB_LOCK(_tcb) do { \
+ mtx_lock(&(_tcb)->tcb_mtx); \
+} while (0)
+
+#endif
+
+
+#define SCTP_TCB_TRYLOCK(_tcb) mtx_trylock(&(_tcb)->tcb_mtx)
+
+#define SCTP_TCB_UNLOCK(_tcb) mtx_unlock(&(_tcb)->tcb_mtx)
+
+#define SCTP_TCB_UNLOCK_IFOWNED(_tcb) do { \
+ if (mtx_owned(&(_tcb)->tcb_mtx)) \
+ mtx_unlock(&(_tcb)->tcb_mtx); \
+ } while (0)
+
+
+
+#ifdef INVARIANTS
+#define SCTP_TCB_LOCK_ASSERT(_tcb) do { \
+ if (mtx_owned(&(_tcb)->tcb_mtx) == 0) \
+ panic("Don't own TCB lock"); \
+ } while (0)
+#else
+#define SCTP_TCB_LOCK_ASSERT(_tcb)
+#endif
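+
+/*
+ * Illustrative sketch (disabled with #if 0) of the rule stated above: the
+ * TCB lock protects the association, and an extra SOCKBUF_LOCK on the
+ * receive buffer is still required when data is handed to the socket.
+ * example_append_to_readq() is a hypothetical helper, not a real function.
+ */
+#if 0
+static void
+example_deliver(struct sctp_tcb *stcb, struct socket *so, struct mbuf *m)
+{
+        SCTP_TCB_LOCK(stcb);
+        SCTP_TCB_LOCK_ASSERT(stcb);
+        SOCKBUF_LOCK(&so->so_rcv);        /* extra lock for the receive sockbuf */
+        example_append_to_readq(stcb, m);
+        SOCKBUF_UNLOCK(&so->so_rcv);
+        SCTP_TCB_UNLOCK(stcb);
+}
+#endif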
+
+#define SCTP_ITERATOR_LOCK_INIT() \
+ mtx_init(&sctp_it_ctl.it_mtx, "sctp-it", "iterator", MTX_DEF)
+
+#ifdef INVARIANTS
+#define SCTP_ITERATOR_LOCK() \
+ do { \
+ if (mtx_owned(&sctp_it_ctl.it_mtx)) \
+ panic("Iterator Lock"); \
+ mtx_lock(&sctp_it_ctl.it_mtx); \
+ } while (0)
+#else
+#define SCTP_ITERATOR_LOCK() \
+ do { \
+ mtx_lock(&sctp_it_ctl.it_mtx); \
+ } while (0)
+
+#endif
+
+#define SCTP_ITERATOR_UNLOCK() mtx_unlock(&sctp_it_ctl.it_mtx)
+#define SCTP_ITERATOR_LOCK_DESTROY() mtx_destroy(&sctp_it_ctl.it_mtx)
+
+
+#define SCTP_WQ_ADDR_INIT() do { \
+ mtx_init(&SCTP_BASE_INFO(wq_addr_mtx), "sctp-addr-wq","sctp_addr_wq",MTX_DEF); \
+ } while (0)
+
+#define SCTP_WQ_ADDR_DESTROY() do { \
+ if(mtx_owned(&SCTP_BASE_INFO(wq_addr_mtx))) { \
+ mtx_unlock(&SCTP_BASE_INFO(wq_addr_mtx)); \
+ } \
+ mtx_destroy(&SCTP_BASE_INFO(wq_addr_mtx)); \
+ } while (0)
+
+#define SCTP_WQ_ADDR_LOCK() do { \
+ mtx_lock(&SCTP_BASE_INFO(wq_addr_mtx)); \
+} while (0)
+#define SCTP_WQ_ADDR_UNLOCK() do { \
+ mtx_unlock(&SCTP_BASE_INFO(wq_addr_mtx)); \
+} while (0)
+
+
+
+#define SCTP_INCR_EP_COUNT() \
+ do { \
+ atomic_add_int(&SCTP_BASE_INFO(ipi_count_ep), 1); \
+ } while (0)
+
+#define SCTP_DECR_EP_COUNT() \
+ do { \
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ep), 1); \
+ } while (0)
+
+#define SCTP_INCR_ASOC_COUNT() \
+ do { \
+ atomic_add_int(&SCTP_BASE_INFO(ipi_count_asoc), 1); \
+ } while (0)
+
+#define SCTP_DECR_ASOC_COUNT() \
+ do { \
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_asoc), 1); \
+ } while (0)
+
+#define SCTP_INCR_LADDR_COUNT() \
+ do { \
+ atomic_add_int(&SCTP_BASE_INFO(ipi_count_laddr), 1); \
+ } while (0)
+
+#define SCTP_DECR_LADDR_COUNT() \
+ do { \
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_laddr), 1); \
+ } while (0)
+
+#define SCTP_INCR_RADDR_COUNT() \
+ do { \
+ atomic_add_int(&SCTP_BASE_INFO(ipi_count_raddr), 1); \
+ } while (0)
+
+#define SCTP_DECR_RADDR_COUNT() \
+ do { \
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_raddr),1); \
+ } while (0)
+
+#define SCTP_INCR_CHK_COUNT() \
+ do { \
+ atomic_add_int(&SCTP_BASE_INFO(ipi_count_chunk), 1); \
+ } while (0)
+#ifdef INVARIANTS
+#define SCTP_DECR_CHK_COUNT() \
+ do { \
+ if(SCTP_BASE_INFO(ipi_count_chunk) == 0) \
+ panic("chunk count to 0?"); \
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_chunk), 1); \
+ } while (0)
+#else
+#define SCTP_DECR_CHK_COUNT() \
+ do { \
+ if(SCTP_BASE_INFO(ipi_count_chunk) != 0) \
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_chunk), 1); \
+ } while (0)
+#endif
+#define SCTP_INCR_READQ_COUNT() \
+ do { \
+ atomic_add_int(&SCTP_BASE_INFO(ipi_count_readq),1); \
+ } while (0)
+
+#define SCTP_DECR_READQ_COUNT() \
+ do { \
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_readq), 1); \
+ } while (0)
+
+#define SCTP_INCR_STRMOQ_COUNT() \
+ do { \
+ atomic_add_int(&SCTP_BASE_INFO(ipi_count_strmoq), 1); \
+ } while (0)
+
+#define SCTP_DECR_STRMOQ_COUNT() \
+ do { \
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_strmoq), 1); \
+ } while (0)
+
+
+#if defined(SCTP_SO_LOCK_TESTING)
+#define SCTP_INP_SO(sctpinp) (sctpinp)->ip_inp.inp.inp_socket
+#define SCTP_SOCKET_LOCK(so, refcnt)
+#define SCTP_SOCKET_UNLOCK(so, refcnt)
+#endif
+
+#endif
diff --git a/freebsd/sys/netinet/sctp_os.h b/freebsd/sys/netinet/sctp_os.h
new file mode 100644
index 00000000..c1a392f0
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_os.h
@@ -0,0 +1,72 @@
+/*-
+ * Copyright (c) 2006-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#ifndef __sctp_os_h__
+#define __sctp_os_h__
+
+/*
+ * General kernel memory allocation:
+ * SCTP_MALLOC(element, type, size, name)
+ * SCTP_FREE(element)
+ * Kernel memory allocation for "soname"- memory must be zeroed.
+ * SCTP_MALLOC_SONAME(name, type, size)
+ * SCTP_FREE_SONAME(name)
+ */
+
+/*
+ * Zone(pool) allocation routines: MUST be defined for each OS.
+ * zone = zone/pool pointer.
+ * name = string name of the zone/pool.
+ * size = size of each zone/pool element.
+ * number = number of elements in zone/pool.
+ * type = structure type to allocate
+ *
+ * sctp_zone_t
+ * SCTP_ZONE_INIT(zone, name, size, number)
+ * SCTP_ZONE_GET(zone, type)
+ * SCTP_ZONE_FREE(zone, element)
+ * SCTP_ZONE_DESTROY(zone)
+ */
+
+#include <freebsd/netinet/sctp_os_bsd.h>
+
+
+
+
+
+/* All OSes must implement this address gatherer. If
+ * no VRFs exist, then vrf 0 is the only one and all
+ * addresses and ifn's live here.
+ */
+#define SCTP_DEFAULT_VRF 0
+void sctp_init_vrf_list(int vrfid);
+
+#endif
diff --git a/freebsd/sys/netinet/sctp_os_bsd.h b/freebsd/sys/netinet/sctp_os_bsd.h
new file mode 100644
index 00000000..cf29776f
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_os_bsd.h
@@ -0,0 +1,503 @@
+/*-
+ * Copyright (c) 2006-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#ifndef __sctp_os_bsd_h__
+#define __sctp_os_bsd_h__
+/*
+ * includes
+ */
+#include <freebsd/local/opt_ipsec.h>
+#include <freebsd/local/opt_compat.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_sctp.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/ktr.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/jail.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/resourcevar.h>
+#include <freebsd/sys/uio.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/rwlock.h>
+#include <freebsd/sys/kthread.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/random.h>
+#include <freebsd/sys/limits.h>
+#include <freebsd/sys/queue.h>
+#include <freebsd/machine/cpu.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_types.h>
+#include <freebsd/net/if_var.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_icmp.h>
+#include <freebsd/netinet/icmp_var.h>
+
+#ifdef IPSEC
+#include <freebsd/netipsec/ipsec.h>
+#include <freebsd/netipsec/key.h>
+#endif /* IPSEC */
+
+#ifdef INET6
+#include <freebsd/sys/domain.h>
+#ifdef IPSEC
+#include <freebsd/netipsec/ipsec6.h>
+#endif
+#include <freebsd/netinet/ip6.h>
+#include <freebsd/netinet6/ip6_var.h>
+#include <freebsd/netinet6/in6_pcb.h>
+#include <freebsd/netinet/icmp6.h>
+#include <freebsd/netinet6/ip6protosw.h>
+#include <freebsd/netinet6/nd6.h>
+#include <freebsd/netinet6/scope6_var.h>
+#endif /* INET6 */
+
+
+#include <freebsd/netinet/ip_options.h>
+
+#ifndef in6pcb
+#define in6pcb inpcb
+#endif
+/* Declare all the malloc names for all the various mallocs */
+MALLOC_DECLARE(SCTP_M_MAP);
+MALLOC_DECLARE(SCTP_M_STRMI);
+MALLOC_DECLARE(SCTP_M_STRMO);
+MALLOC_DECLARE(SCTP_M_ASC_ADDR);
+MALLOC_DECLARE(SCTP_M_ASC_IT);
+MALLOC_DECLARE(SCTP_M_AUTH_CL);
+MALLOC_DECLARE(SCTP_M_AUTH_KY);
+MALLOC_DECLARE(SCTP_M_AUTH_HL);
+MALLOC_DECLARE(SCTP_M_AUTH_IF);
+MALLOC_DECLARE(SCTP_M_STRESET);
+MALLOC_DECLARE(SCTP_M_CMSG);
+MALLOC_DECLARE(SCTP_M_COPYAL);
+MALLOC_DECLARE(SCTP_M_VRF);
+MALLOC_DECLARE(SCTP_M_IFA);
+MALLOC_DECLARE(SCTP_M_IFN);
+MALLOC_DECLARE(SCTP_M_TIMW);
+MALLOC_DECLARE(SCTP_M_MVRF);
+MALLOC_DECLARE(SCTP_M_ITER);
+MALLOC_DECLARE(SCTP_M_SOCKOPT);
+
+#if defined(SCTP_LOCAL_TRACE_BUF)
+
+#define SCTP_GET_CYCLECOUNT get_cyclecount()
+#define SCTP_CTR6 sctp_log_trace
+
+#else
+#define SCTP_CTR6 CTR6
+#endif
+
+/*
+ * Macros to expand out globals defined by various modules
+ * to either a real global or a virtualized instance of one,
+ * depending on whether VIMAGE is defined.
+ */
+/* then define the macro(s) that hook into the vimage macros */
+#define MODULE_GLOBAL(__SYMBOL) V_##__SYMBOL
+
+#define V_system_base_info VNET(system_base_info)
+#define SCTP_BASE_INFO(__m) V_system_base_info.sctppcbinfo.__m
+#define SCTP_BASE_STATS V_system_base_info.sctpstat
+#define SCTP_BASE_STATS_SYSCTL VNET_NAME(system_base_info.sctpstat)
+#define SCTP_BASE_STAT(__m) V_system_base_info.sctpstat.__m
+#define SCTP_BASE_SYSCTL(__m) VNET_NAME(system_base_info.sctpsysctl.__m)
+#define SCTP_BASE_VAR(__m) V_system_base_info.__m
+
+/*
+ *
+ */
+#define USER_ADDR_NULL (NULL) /* FIX ME: temp */
+
+#if defined(SCTP_DEBUG)
+#define SCTPDBG(level, params...) \
+{ \
+ do { \
+ if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \
+ printf(params); \
+ } \
+ } while (0); \
+}
+#define SCTPDBG_ADDR(level, addr) \
+{ \
+ do { \
+ if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \
+ sctp_print_address(addr); \
+ } \
+ } while (0); \
+}
+#define SCTPDBG_PKT(level, iph, sh) \
+{ \
+ do { \
+ if (SCTP_BASE_SYSCTL(sctp_debug_on) & level) { \
+ sctp_print_address_pkt(iph, sh); \
+ } \
+ } while (0); \
+}
+#else
+#define SCTPDBG(level, params...)
+#define SCTPDBG_ADDR(level, addr)
+#define SCTPDBG_PKT(level, iph, sh)
+#endif
+#define SCTP_PRINTF(params...) printf(params)
+
+#ifdef SCTP_LTRACE_CHUNKS
+#define SCTP_LTRACE_CHK(a, b, c, d) if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_CHUNK_ENABLE) SCTP_CTR6(KTR_SUBSYS, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_CHUNK_PROC, 0, a, b, c, d)
+#else
+#define SCTP_LTRACE_CHK(a, b, c, d)
+#endif
+
+#ifdef SCTP_LTRACE_ERRORS
+#define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err) if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \
+ printf("mbuf:%p inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \
+ m, inp, stcb, net, file, __LINE__, err);
+#define SCTP_LTRACE_ERR_RET(inp, stcb, net, file, err) if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \
+ printf("inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \
+ inp, stcb, net, file, __LINE__, err);
+#else
+#define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err)
+#define SCTP_LTRACE_ERR_RET(inp, stcb, net, file, err)
+#endif
+
+
+/*
+ * Local address and interface list handling
+ */
+#define SCTP_MAX_VRF_ID 0
+#define SCTP_SIZE_OF_VRF_HASH 3
+#define SCTP_IFNAMSIZ IFNAMSIZ
+#define SCTP_DEFAULT_VRFID 0
+#define SCTP_VRF_ADDR_HASH_SIZE 16
+#define SCTP_VRF_IFN_HASH_SIZE 3
+#define SCTP_INIT_VRF_TABLEID(vrf)
+
+#define SCTP_IFN_IS_IFT_LOOP(ifn) ((ifn)->ifn_type == IFT_LOOP)
+#define SCTP_ROUTE_IS_REAL_LOOP(ro) ((ro)->ro_rt && (ro)->ro_rt->rt_ifa && (ro)->ro_rt->rt_ifa->ifa_ifp && (ro)->ro_rt->rt_ifa->ifa_ifp->if_type == IFT_LOOP)
+
+/*
+ * Access to IFN's to help with src-addr-selection
+ */
+/* This could return VOID if the index works but for BSD we provide both. */
+#define SCTP_GET_IFN_VOID_FROM_ROUTE(ro) (void *)ro->ro_rt->rt_ifp
+#define SCTP_GET_IF_INDEX_FROM_ROUTE(ro) (ro)->ro_rt->rt_ifp->if_index
+#define SCTP_ROUTE_HAS_VALID_IFN(ro) ((ro)->ro_rt && (ro)->ro_rt->rt_ifp)
+
+/*
+ * general memory allocation
+ */
+#define SCTP_MALLOC(var, type, size, name) \
+ do { \
+ var = (type)malloc(size, name, M_NOWAIT); \
+ } while (0)
+
+#define SCTP_FREE(var, type) free(var, type)
+
+#define SCTP_MALLOC_SONAME(var, type, size) \
+ do { \
+ var = (type)malloc(size, M_SONAME, M_WAITOK | M_ZERO); \
+ } while (0)
+
+#define SCTP_FREE_SONAME(var) free(var, M_SONAME)
+
+#define SCTP_PROCESS_STRUCT struct proc *
+
+/*
+ * zone allocation functions
+ */
+#include <freebsd/vm/uma.h>
+
+/* SCTP_ZONE_INIT: initialize the zone */
+typedef struct uma_zone *sctp_zone_t;
+
+#define SCTP_ZONE_INIT(zone, name, size, number) { \
+ zone = uma_zcreate(name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,\
+ 0); \
+ uma_zone_set_max(zone, number); \
+}
+
+#define SCTP_ZONE_DESTROY(zone) uma_zdestroy(zone)
+
+/* SCTP_ZONE_GET: allocate element from the zone */
+#define SCTP_ZONE_GET(zone, type) \
+ (type *)uma_zalloc(zone, M_NOWAIT);
+
+/* SCTP_ZONE_FREE: free element from the zone */
+#define SCTP_ZONE_FREE(zone, element) \
+ uma_zfree(zone, element);
+
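+/*
+ * Usage sketch (illustrative only; "sctp_example_zone" and
+ * "struct sctp_example_item" are assumed names, not part of the stack):
+ * the wrappers above map directly onto uma(9).
+ *
+ *	static sctp_zone_t sctp_example_zone;
+ *	struct sctp_example_item *item;
+ *
+ *	SCTP_ZONE_INIT(sctp_example_zone, "sctp_example",
+ *	    sizeof(struct sctp_example_item), 1024);
+ *	item = SCTP_ZONE_GET(sctp_example_zone, struct sctp_example_item);
+ *	if (item != NULL)
+ *		SCTP_ZONE_FREE(sctp_example_zone, item);
+ *	SCTP_ZONE_DESTROY(sctp_example_zone);
+ */
+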
+#define SCTP_HASH_INIT(size, hashmark) hashinit_flags(size, M_PCB, hashmark, HASH_NOWAIT)
+#define SCTP_HASH_FREE(table, hashmark) hashdestroy(table, M_PCB, hashmark)
+
+#define SCTP_M_COPYM m_copym
+
+/*
+ * timers
+ */
+#include <freebsd/sys/callout.h>
+typedef struct callout sctp_os_timer_t;
+
+
+#define SCTP_OS_TIMER_INIT(tmr) callout_init(tmr, 1)
+#define SCTP_OS_TIMER_START callout_reset
+#define SCTP_OS_TIMER_STOP callout_stop
+#define SCTP_OS_TIMER_STOP_DRAIN callout_drain
+#define SCTP_OS_TIMER_PENDING callout_pending
+#define SCTP_OS_TIMER_ACTIVE callout_active
+#define SCTP_OS_TIMER_DEACTIVATE callout_deactivate
+
+#define sctp_get_tick_count() (ticks)
+
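+/*
+ * Usage sketch (illustrative only; "example_timeout" and "ctx" are assumed
+ * names): the wrappers above map directly onto callout(9), so arming a
+ * one-second timer, normally embedded in a per-association structure,
+ * looks like
+ *
+ *	sctp_os_timer_t tmr;
+ *
+ *	SCTP_OS_TIMER_INIT(&tmr);
+ *	SCTP_OS_TIMER_START(&tmr, hz, example_timeout, ctx);
+ *	...
+ *	(void)SCTP_OS_TIMER_STOP(&tmr);
+ */
+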
+#define SCTP_UNUSED __attribute__((unused))
+
+/*
+ * Functions
+ */
+/* Mbuf manipulation and access macros */
+#define SCTP_BUF_LEN(m) (m->m_len)
+#define SCTP_BUF_NEXT(m) (m->m_next)
+#define SCTP_BUF_NEXT_PKT(m) (m->m_nextpkt)
+#define SCTP_BUF_RESV_UF(m, size) m->m_data += size
+#define SCTP_BUF_AT(m, size) m->m_data + size
+#define SCTP_BUF_IS_EXTENDED(m) (m->m_flags & M_EXT)
+#define SCTP_BUF_EXTEND_SIZE(m) (m->m_ext.ext_size)
+#define SCTP_BUF_TYPE(m) (m->m_type)
+#define SCTP_BUF_RECVIF(m) (m->m_pkthdr.rcvif)
+#define SCTP_BUF_PREPEND M_PREPEND
+
+#define SCTP_ALIGN_TO_END(m, len) if(m->m_flags & M_PKTHDR) { \
+ MH_ALIGN(m, len); \
+ } else if ((m->m_flags & M_EXT) == 0) { \
+ M_ALIGN(m, len); \
+ }
+
+/* We size this so that, with the default 65k
+ * packet log, up to 4 threads can be writing
+ * at once; that is four 16k packets before
+ * we would hit a problem.
+ */
+#define SCTP_PKTLOG_WRITERS_NEED_LOCK 3
+
+/*************************/
+/* MTU */
+/*************************/
+#define SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index, af) ((struct ifnet *)ifn)->if_mtu
+#define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, rt) ((rt != NULL) ? rt->rt_rmx.rmx_mtu : 0)
+#define SCTP_GATHER_MTU_FROM_INTFC(sctp_ifn) ((sctp_ifn->ifn_p != NULL) ? ((struct ifnet *)(sctp_ifn->ifn_p))->if_mtu : 0)
+#define SCTP_SET_MTU_OF_ROUTE(sa, rt, mtu) do { \
+ if (rt != NULL) \
+ rt->rt_rmx.rmx_mtu = mtu; \
+ } while(0)
+
+/* (de-)register interface event notifications */
+#define SCTP_REGISTER_INTERFACE(ifhandle, af)
+#define SCTP_DEREGISTER_INTERFACE(ifhandle, af)
+
+
+/*************************/
+/* These are for logging */
+/*************************/
+/* return the base ext data pointer */
+#define SCTP_BUF_EXTEND_BASE(m) (m->m_ext.ext_buf)
+ /* return the refcnt of the data pointer */
+#define SCTP_BUF_EXTEND_REFCNT(m) (*m->m_ext.ref_cnt)
+/* return any buffer related flags, this is
+ * used beyond logging for apple only.
+ */
+#define SCTP_BUF_GET_FLAGS(m) (m->m_flags)
+
+/* For BSD this just accesses the M_PKTHDR length
+ * so it operates on an mbuf with hdr flag. Other
+ * O/S's may have separate packet header and mbuf
+ * chain pointers.. thus the macro.
+ */
+#define SCTP_HEADER_TO_CHAIN(m) (m)
+#define SCTP_DETACH_HEADER_FROM_CHAIN(m)
+#define SCTP_HEADER_LEN(m) (m->m_pkthdr.len)
+#define SCTP_GET_HEADER_FOR_OUTPUT(o_pak) 0
+#define SCTP_RELEASE_HEADER(m)
+#define SCTP_RELEASE_PKT(m) sctp_m_freem(m)
+#define SCTP_ENABLE_UDP_CSUM(m) do { \
+ m->m_pkthdr.csum_flags = CSUM_UDP; \
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); \
+ } while (0)
+
+#define SCTP_GET_PKT_VRFID(m, vrf_id) ((vrf_id = SCTP_DEFAULT_VRFID) != SCTP_DEFAULT_VRFID)
+
+
+
+/* Attach the chain of data into the sendable packet. */
+#define SCTP_ATTACH_CHAIN(pak, m, packet_length) do { \
+ pak = m; \
+ pak->m_pkthdr.len = packet_length; \
+ } while(0)
+
+/* Other m_pkthdr type things */
+#define SCTP_IS_IT_BROADCAST(dst, m) ((m->m_flags & M_PKTHDR) ? in_broadcast(dst, m->m_pkthdr.rcvif) : 0)
+#define SCTP_IS_IT_LOOPBACK(m) ((m->m_flags & M_PKTHDR) && ((m->m_pkthdr.rcvif == NULL) || (m->m_pkthdr.rcvif->if_type == IFT_LOOP)))
+
+
+/* This converts any input packet header
+ * into the chain of data holders, for BSD
+ * it's a NOP.
+ */
+
+/* Macros for getting length from V6/V4 header */
+#define SCTP_GET_IPV4_LENGTH(iph) (iph->ip_len)
+#define SCTP_GET_IPV6_LENGTH(ip6) (ntohs(ip6->ip6_plen))
+
+/* get the v6 hop limit */
+#define SCTP_GET_HLIM(inp, ro) in6_selecthlim((struct in6pcb *)&inp->ip_inp.inp, (ro ? (ro->ro_rt ? (ro->ro_rt->rt_ifp) : (NULL)) : (NULL)));
+
+/* is the endpoint v6only? */
+#define SCTP_IPV6_V6ONLY(inp) (((struct inpcb *)inp)->inp_flags & IN6P_IPV6_V6ONLY)
+/* is the socket non-blocking? */
+#define SCTP_SO_IS_NBIO(so) ((so)->so_state & SS_NBIO)
+#define SCTP_SET_SO_NBIO(so) ((so)->so_state |= SS_NBIO)
+#define SCTP_CLEAR_SO_NBIO(so) ((so)->so_state &= ~SS_NBIO)
+/* get the socket type */
+#define SCTP_SO_TYPE(so) ((so)->so_type)
+/* reserve sb space for a socket */
+#define SCTP_SORESERVE(so, send, recv) soreserve(so, send, recv)
+/* wakeup a socket */
+#define SCTP_SOWAKEUP(so) wakeup(&(so)->so_timeo)
+/* clear the socket buffer state */
+#define SCTP_SB_CLEAR(sb) \
+ (sb).sb_cc = 0; \
+ (sb).sb_mb = NULL; \
+ (sb).sb_mbcnt = 0;
+
+#define SCTP_SB_LIMIT_RCV(so) so->so_rcv.sb_hiwat
+#define SCTP_SB_LIMIT_SND(so) so->so_snd.sb_hiwat
+
+/*
+ * routes, output, etc.
+ */
+typedef struct route sctp_route_t;
+typedef struct rtentry sctp_rtentry_t;
+
+#define SCTP_RTALLOC(ro, vrf_id) rtalloc_ign((struct route *)ro, 0UL)
+
+/* Future zero copy wakeup/send function */
+#define SCTP_ZERO_COPY_EVENT(inp, so)
+/* This is re-pulse ourselves for sendbuf */
+#define SCTP_ZERO_COPY_SENDQ_EVENT(inp, so)
+
+/*
+ * IP output routines
+ */
+#define SCTP_IP_OUTPUT(result, o_pak, ro, stcb, vrf_id) \
+{ \
+ int o_flgs = IP_RAWOUTPUT; \
+ struct sctp_tcb *local_stcb = stcb; \
+ if (local_stcb && \
+ local_stcb->sctp_ep && \
+ local_stcb->sctp_ep->sctp_socket) \
+ o_flgs |= local_stcb->sctp_ep->sctp_socket->so_options & SO_DONTROUTE; \
+ result = ip_output(o_pak, NULL, ro, o_flgs, 0, NULL); \
+}
+
+#define SCTP_IP6_OUTPUT(result, o_pak, ro, ifp, stcb, vrf_id) \
+{ \
+ struct sctp_tcb *local_stcb = stcb; \
+ if (local_stcb && local_stcb->sctp_ep) \
+ result = ip6_output(o_pak, \
+ ((struct in6pcb *)(local_stcb->sctp_ep))->in6p_outputopts, \
+ (ro), 0, 0, ifp, NULL); \
+ else \
+ result = ip6_output(o_pak, NULL, (ro), 0, 0, ifp, NULL); \
+}
+
+struct mbuf *
+sctp_get_mbuf_for_msg(unsigned int space_needed,
+ int want_header, int how, int allonebuf, int type);
+
+
+/*
+ * SCTP AUTH
+ */
+#define HAVE_SHA2
+
+#define SCTP_READ_RANDOM(buf, len) read_random(buf, len)
+
+#ifdef USE_SCTP_SHA1
+#include <freebsd/netinet/sctp_sha1.h>
+#else
+#include <freebsd/crypto/sha1.h>
+/* map standard crypto API names */
+#define SHA1_Init SHA1Init
+#define SHA1_Update SHA1Update
+#define SHA1_Final(x,y) SHA1Final((caddr_t)x, y)
+#endif
+
+#if defined(HAVE_SHA2)
+#include <freebsd/crypto/sha2/sha2.h>
+#endif
+
+#endif
+
+#define SCTP_DECREMENT_AND_CHECK_REFCOUNT(addr) (atomic_fetchadd_int(addr, -1) == 1)
+#if defined(INVARIANTS)
+#define SCTP_SAVE_ATOMIC_DECREMENT(addr, val) \
+{ \
+ int32_t oldval; \
+ oldval = atomic_fetchadd_int(addr, -val); \
+ if (oldval < val) { \
+ panic("Counter goes negative"); \
+ } \
+}
+#else
+#define SCTP_SAVE_ATOMIC_DECREMENT(addr, val) \
+{ \
+ int32_t oldval; \
+ oldval = atomic_fetchadd_int(addr, -val); \
+ if (oldval < val) { \
+ *addr = 0; \
+ } \
+}
+#endif
diff --git a/freebsd/sys/netinet/sctp_output.c b/freebsd/sys/netinet/sctp_output.c
new file mode 100644
index 00000000..9acd3288
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_output.c
@@ -0,0 +1,13539 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_output.c,v 1.46 2005/03/06 16:04:17 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/netinet/sctp_var.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+#include <freebsd/netinet/sctp_header.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_output.h>
+#include <freebsd/netinet/sctp_uio.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_auth.h>
+#include <freebsd/netinet/sctp_timer.h>
+#include <freebsd/netinet/sctp_asconf.h>
+#include <freebsd/netinet/sctp_indata.h>
+#include <freebsd/netinet/sctp_bsd_addr.h>
+#include <freebsd/netinet/sctp_input.h>
+#include <freebsd/netinet/sctp_crc32.h>
+#include <freebsd/netinet/udp.h>
+#include <freebsd/machine/in_cksum.h>
+
+
+
+#define SCTP_MAX_GAPS_INARRAY 4
+struct sack_track {
+ uint8_t right_edge; /* mergable on the right edge */
+ uint8_t left_edge; /* mergable on the left edge */
+ uint8_t num_entries;
+ uint8_t spare;
+ struct sctp_gap_ack_block gaps[SCTP_MAX_GAPS_INARRAY];
+};
+
+struct sack_track sack_array[256] = {
+ {0, 0, 0, 0, /* 0x00 */
+ {{0, 0},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 1, 0, /* 0x01 */
+ {{0, 0},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x02 */
+ {{1, 1},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 1, 0, /* 0x03 */
+ {{0, 1},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x04 */
+ {{2, 2},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x05 */
+ {{0, 0},
+ {2, 2},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x06 */
+ {{1, 2},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 1, 0, /* 0x07 */
+ {{0, 2},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x08 */
+ {{3, 3},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x09 */
+ {{0, 0},
+ {3, 3},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x0a */
+ {{1, 1},
+ {3, 3},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x0b */
+ {{0, 1},
+ {3, 3},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x0c */
+ {{2, 3},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x0d */
+ {{0, 0},
+ {2, 3},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x0e */
+ {{1, 3},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 1, 0, /* 0x0f */
+ {{0, 3},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x10 */
+ {{4, 4},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x11 */
+ {{0, 0},
+ {4, 4},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x12 */
+ {{1, 1},
+ {4, 4},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x13 */
+ {{0, 1},
+ {4, 4},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x14 */
+ {{2, 2},
+ {4, 4},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x15 */
+ {{0, 0},
+ {2, 2},
+ {4, 4},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x16 */
+ {{1, 2},
+ {4, 4},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x17 */
+ {{0, 2},
+ {4, 4},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x18 */
+ {{3, 4},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x19 */
+ {{0, 0},
+ {3, 4},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x1a */
+ {{1, 1},
+ {3, 4},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x1b */
+ {{0, 1},
+ {3, 4},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x1c */
+ {{2, 4},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x1d */
+ {{0, 0},
+ {2, 4},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x1e */
+ {{1, 4},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 1, 0, /* 0x1f */
+ {{0, 4},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x20 */
+ {{5, 5},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x21 */
+ {{0, 0},
+ {5, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x22 */
+ {{1, 1},
+ {5, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x23 */
+ {{0, 1},
+ {5, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x24 */
+ {{2, 2},
+ {5, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x25 */
+ {{0, 0},
+ {2, 2},
+ {5, 5},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x26 */
+ {{1, 2},
+ {5, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x27 */
+ {{0, 2},
+ {5, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x28 */
+ {{3, 3},
+ {5, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x29 */
+ {{0, 0},
+ {3, 3},
+ {5, 5},
+ {0, 0}
+ }
+ },
+ {0, 0, 3, 0, /* 0x2a */
+ {{1, 1},
+ {3, 3},
+ {5, 5},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x2b */
+ {{0, 1},
+ {3, 3},
+ {5, 5},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x2c */
+ {{2, 3},
+ {5, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x2d */
+ {{0, 0},
+ {2, 3},
+ {5, 5},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x2e */
+ {{1, 3},
+ {5, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x2f */
+ {{0, 3},
+ {5, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x30 */
+ {{4, 5},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x31 */
+ {{0, 0},
+ {4, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x32 */
+ {{1, 1},
+ {4, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x33 */
+ {{0, 1},
+ {4, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x34 */
+ {{2, 2},
+ {4, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x35 */
+ {{0, 0},
+ {2, 2},
+ {4, 5},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x36 */
+ {{1, 2},
+ {4, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x37 */
+ {{0, 2},
+ {4, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x38 */
+ {{3, 5},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x39 */
+ {{0, 0},
+ {3, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x3a */
+ {{1, 1},
+ {3, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x3b */
+ {{0, 1},
+ {3, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x3c */
+ {{2, 5},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x3d */
+ {{0, 0},
+ {2, 5},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x3e */
+ {{1, 5},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 1, 0, /* 0x3f */
+ {{0, 5},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x40 */
+ {{6, 6},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x41 */
+ {{0, 0},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x42 */
+ {{1, 1},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x43 */
+ {{0, 1},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x44 */
+ {{2, 2},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x45 */
+ {{0, 0},
+ {2, 2},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x46 */
+ {{1, 2},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x47 */
+ {{0, 2},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x48 */
+ {{3, 3},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x49 */
+ {{0, 0},
+ {3, 3},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 3, 0, /* 0x4a */
+ {{1, 1},
+ {3, 3},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x4b */
+ {{0, 1},
+ {3, 3},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x4c */
+ {{2, 3},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x4d */
+ {{0, 0},
+ {2, 3},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x4e */
+ {{1, 3},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x4f */
+ {{0, 3},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x50 */
+ {{4, 4},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x51 */
+ {{0, 0},
+ {4, 4},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 3, 0, /* 0x52 */
+ {{1, 1},
+ {4, 4},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x53 */
+ {{0, 1},
+ {4, 4},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 3, 0, /* 0x54 */
+ {{2, 2},
+ {4, 4},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {1, 0, 4, 0, /* 0x55 */
+ {{0, 0},
+ {2, 2},
+ {4, 4},
+ {6, 6}
+ }
+ },
+ {0, 0, 3, 0, /* 0x56 */
+ {{1, 2},
+ {4, 4},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x57 */
+ {{0, 2},
+ {4, 4},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x58 */
+ {{3, 4},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x59 */
+ {{0, 0},
+ {3, 4},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 3, 0, /* 0x5a */
+ {{1, 1},
+ {3, 4},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x5b */
+ {{0, 1},
+ {3, 4},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x5c */
+ {{2, 4},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x5d */
+ {{0, 0},
+ {2, 4},
+ {6, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x5e */
+ {{1, 4},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x5f */
+ {{0, 4},
+ {6, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x60 */
+ {{5, 6},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x61 */
+ {{0, 0},
+ {5, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x62 */
+ {{1, 1},
+ {5, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x63 */
+ {{0, 1},
+ {5, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x64 */
+ {{2, 2},
+ {5, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x65 */
+ {{0, 0},
+ {2, 2},
+ {5, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x66 */
+ {{1, 2},
+ {5, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x67 */
+ {{0, 2},
+ {5, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x68 */
+ {{3, 3},
+ {5, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x69 */
+ {{0, 0},
+ {3, 3},
+ {5, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 3, 0, /* 0x6a */
+ {{1, 1},
+ {3, 3},
+ {5, 6},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x6b */
+ {{0, 1},
+ {3, 3},
+ {5, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x6c */
+ {{2, 3},
+ {5, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x6d */
+ {{0, 0},
+ {2, 3},
+ {5, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x6e */
+ {{1, 3},
+ {5, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x6f */
+ {{0, 3},
+ {5, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x70 */
+ {{4, 6},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x71 */
+ {{0, 0},
+ {4, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x72 */
+ {{1, 1},
+ {4, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x73 */
+ {{0, 1},
+ {4, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x74 */
+ {{2, 2},
+ {4, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 3, 0, /* 0x75 */
+ {{0, 0},
+ {2, 2},
+ {4, 6},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x76 */
+ {{1, 2},
+ {4, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x77 */
+ {{0, 2},
+ {4, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x78 */
+ {{3, 6},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x79 */
+ {{0, 0},
+ {3, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 2, 0, /* 0x7a */
+ {{1, 1},
+ {3, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x7b */
+ {{0, 1},
+ {3, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x7c */
+ {{2, 6},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 2, 0, /* 0x7d */
+ {{0, 0},
+ {2, 6},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 0, 1, 0, /* 0x7e */
+ {{1, 6},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 0, 1, 0, /* 0x7f */
+ {{0, 6},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 1, 0, /* 0x80 */
+ {{7, 7},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0x81 */
+ {{0, 0},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0x82 */
+ {{1, 1},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0x83 */
+ {{0, 1},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0x84 */
+ {{2, 2},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0x85 */
+ {{0, 0},
+ {2, 2},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0x86 */
+ {{1, 2},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0x87 */
+ {{0, 2},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0x88 */
+ {{3, 3},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0x89 */
+ {{0, 0},
+ {3, 3},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0x8a */
+ {{1, 1},
+ {3, 3},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0x8b */
+ {{0, 1},
+ {3, 3},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0x8c */
+ {{2, 3},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0x8d */
+ {{0, 0},
+ {2, 3},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0x8e */
+ {{1, 3},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0x8f */
+ {{0, 3},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0x90 */
+ {{4, 4},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0x91 */
+ {{0, 0},
+ {4, 4},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0x92 */
+ {{1, 1},
+ {4, 4},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0x93 */
+ {{0, 1},
+ {4, 4},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0x94 */
+ {{2, 2},
+ {4, 4},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 4, 0, /* 0x95 */
+ {{0, 0},
+ {2, 2},
+ {4, 4},
+ {7, 7}
+ }
+ },
+ {0, 1, 3, 0, /* 0x96 */
+ {{1, 2},
+ {4, 4},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0x97 */
+ {{0, 2},
+ {4, 4},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0x98 */
+ {{3, 4},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0x99 */
+ {{0, 0},
+ {3, 4},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0x9a */
+ {{1, 1},
+ {3, 4},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0x9b */
+ {{0, 1},
+ {3, 4},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0x9c */
+ {{2, 4},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0x9d */
+ {{0, 0},
+ {2, 4},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0x9e */
+ {{1, 4},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0x9f */
+ {{0, 4},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xa0 */
+ {{5, 5},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xa1 */
+ {{0, 0},
+ {5, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0xa2 */
+ {{1, 1},
+ {5, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xa3 */
+ {{0, 1},
+ {5, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0xa4 */
+ {{2, 2},
+ {5, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 4, 0, /* 0xa5 */
+ {{0, 0},
+ {2, 2},
+ {5, 5},
+ {7, 7}
+ }
+ },
+ {0, 1, 3, 0, /* 0xa6 */
+ {{1, 2},
+ {5, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xa7 */
+ {{0, 2},
+ {5, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0xa8 */
+ {{3, 3},
+ {5, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 4, 0, /* 0xa9 */
+ {{0, 0},
+ {3, 3},
+ {5, 5},
+ {7, 7}
+ }
+ },
+ {0, 1, 4, 0, /* 0xaa */
+ {{1, 1},
+ {3, 3},
+ {5, 5},
+ {7, 7}
+ }
+ },
+ {1, 1, 4, 0, /* 0xab */
+ {{0, 1},
+ {3, 3},
+ {5, 5},
+ {7, 7}
+ }
+ },
+ {0, 1, 3, 0, /* 0xac */
+ {{2, 3},
+ {5, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 4, 0, /* 0xad */
+ {{0, 0},
+ {2, 3},
+ {5, 5},
+ {7, 7}
+ }
+ },
+ {0, 1, 3, 0, /* 0xae */
+ {{1, 3},
+ {5, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xaf */
+ {{0, 3},
+ {5, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xb0 */
+ {{4, 5},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xb1 */
+ {{0, 0},
+ {4, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0xb2 */
+ {{1, 1},
+ {4, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xb3 */
+ {{0, 1},
+ {4, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0xb4 */
+ {{2, 2},
+ {4, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 4, 0, /* 0xb5 */
+ {{0, 0},
+ {2, 2},
+ {4, 5},
+ {7, 7}
+ }
+ },
+ {0, 1, 3, 0, /* 0xb6 */
+ {{1, 2},
+ {4, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xb7 */
+ {{0, 2},
+ {4, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xb8 */
+ {{3, 5},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xb9 */
+ {{0, 0},
+ {3, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0xba */
+ {{1, 1},
+ {3, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xbb */
+ {{0, 1},
+ {3, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xbc */
+ {{2, 5},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xbd */
+ {{0, 0},
+ {2, 5},
+ {7, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xbe */
+ {{1, 5},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xbf */
+ {{0, 5},
+ {7, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 1, 0, /* 0xc0 */
+ {{6, 7},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xc1 */
+ {{0, 0},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xc2 */
+ {{1, 1},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xc3 */
+ {{0, 1},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xc4 */
+ {{2, 2},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xc5 */
+ {{0, 0},
+ {2, 2},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xc6 */
+ {{1, 2},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xc7 */
+ {{0, 2},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xc8 */
+ {{3, 3},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xc9 */
+ {{0, 0},
+ {3, 3},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0xca */
+ {{1, 1},
+ {3, 3},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xcb */
+ {{0, 1},
+ {3, 3},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xcc */
+ {{2, 3},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xcd */
+ {{0, 0},
+ {2, 3},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xce */
+ {{1, 3},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xcf */
+ {{0, 3},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xd0 */
+ {{4, 4},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xd1 */
+ {{0, 0},
+ {4, 4},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0xd2 */
+ {{1, 1},
+ {4, 4},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xd3 */
+ {{0, 1},
+ {4, 4},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0xd4 */
+ {{2, 2},
+ {4, 4},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 4, 0, /* 0xd5 */
+ {{0, 0},
+ {2, 2},
+ {4, 4},
+ {6, 7}
+ }
+ },
+ {0, 1, 3, 0, /* 0xd6 */
+ {{1, 2},
+ {4, 4},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xd7 */
+ {{0, 2},
+ {4, 4},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xd8 */
+ {{3, 4},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xd9 */
+ {{0, 0},
+ {3, 4},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0xda */
+ {{1, 1},
+ {3, 4},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xdb */
+ {{0, 1},
+ {3, 4},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xdc */
+ {{2, 4},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xdd */
+ {{0, 0},
+ {2, 4},
+ {6, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xde */
+ {{1, 4},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xdf */
+ {{0, 4},
+ {6, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 1, 0, /* 0xe0 */
+ {{5, 7},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xe1 */
+ {{0, 0},
+ {5, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xe2 */
+ {{1, 1},
+ {5, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xe3 */
+ {{0, 1},
+ {5, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xe4 */
+ {{2, 2},
+ {5, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xe5 */
+ {{0, 0},
+ {2, 2},
+ {5, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xe6 */
+ {{1, 2},
+ {5, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xe7 */
+ {{0, 2},
+ {5, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xe8 */
+ {{3, 3},
+ {5, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xe9 */
+ {{0, 0},
+ {3, 3},
+ {5, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 3, 0, /* 0xea */
+ {{1, 1},
+ {3, 3},
+ {5, 7},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xeb */
+ {{0, 1},
+ {3, 3},
+ {5, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xec */
+ {{2, 3},
+ {5, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xed */
+ {{0, 0},
+ {2, 3},
+ {5, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xee */
+ {{1, 3},
+ {5, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xef */
+ {{0, 3},
+ {5, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 1, 0, /* 0xf0 */
+ {{4, 7},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xf1 */
+ {{0, 0},
+ {4, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xf2 */
+ {{1, 1},
+ {4, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xf3 */
+ {{0, 1},
+ {4, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xf4 */
+ {{2, 2},
+ {4, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 3, 0, /* 0xf5 */
+ {{0, 0},
+ {2, 2},
+ {4, 7},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xf6 */
+ {{1, 2},
+ {4, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xf7 */
+ {{0, 2},
+ {4, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 1, 0, /* 0xf8 */
+ {{3, 7},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xf9 */
+ {{0, 0},
+ {3, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 2, 0, /* 0xfa */
+ {{1, 1},
+ {3, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xfb */
+ {{0, 1},
+ {3, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 1, 0, /* 0xfc */
+ {{2, 7},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 2, 0, /* 0xfd */
+ {{0, 0},
+ {2, 7},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {0, 1, 1, 0, /* 0xfe */
+ {{1, 7},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ },
+ {1, 1, 1, 0, /* 0xff */
+ {{0, 7},
+ {0, 0},
+ {0, 0},
+ {0, 0}
+ }
+ }
+};
+
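+/*
+ * Reading the table above: each entry is indexed by one byte of the
+ * received-TSN bitmap and pre-computes the gap-ack blocks for that bit
+ * pattern.  For example, entry 0x05 (bits 0 and 2 set) has num_entries = 2
+ * with gaps {0, 0} and {2, 2}; right_edge is set exactly when bit 0 is set
+ * and left_edge when bit 7 is set, marking runs that can merge with the
+ * neighboring byte.
+ */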
+
+int
+sctp_is_address_in_scope(struct sctp_ifa *ifa,
+ int ipv4_addr_legal,
+ int ipv6_addr_legal,
+ int loopback_scope,
+ int ipv4_local_scope,
+ int local_scope,
+ int site_scope,
+ int do_update)
+{
+ if ((loopback_scope == 0) &&
+ (ifa->ifn_p) && SCTP_IFN_IS_IFT_LOOP(ifa->ifn_p)) {
+ /*
+		 * skip loopback if not in scope
+ */
+ return (0);
+ }
+ switch (ifa->address.sa.sa_family) {
+ case AF_INET:
+ if (ipv4_addr_legal) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&ifa->address.sin;
+ if (sin->sin_addr.s_addr == 0) {
+ /* not in scope , unspecified */
+ return (0);
+ }
+ if ((ipv4_local_scope == 0) &&
+ (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) {
+ /* private address not in scope */
+ return (0);
+ }
+ } else {
+ return (0);
+ }
+ break;
+#ifdef INET6
+ case AF_INET6:
+ if (ipv6_addr_legal) {
+ struct sockaddr_in6 *sin6;
+
+ /*
+ * Must update the flags, bummer, which means any
+ * IFA locks must now be applied HERE <->
+ */
+ if (do_update) {
+ sctp_gather_internal_ifa_flags(ifa);
+ }
+ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
+ return (0);
+ }
+ /* ok to use deprecated addresses? */
+ sin6 = (struct sockaddr_in6 *)&ifa->address.sin6;
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+			/* skip unspecified addresses */
+ return (0);
+ }
+ if ( /* (local_scope == 0) && */
+ (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))) {
+ return (0);
+ }
+ if ((site_scope == 0) &&
+ (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) {
+ return (0);
+ }
+ } else {
+ return (0);
+ }
+ break;
+#endif
+ default:
+ return (0);
+ }
+ return (1);
+}
+
+static struct mbuf *
+sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa)
+{
+ struct sctp_paramhdr *parmh;
+ struct mbuf *mret;
+ int len;
+
+ if (ifa->address.sa.sa_family == AF_INET) {
+ len = sizeof(struct sctp_ipv4addr_param);
+ } else if (ifa->address.sa.sa_family == AF_INET6) {
+ len = sizeof(struct sctp_ipv6addr_param);
+ } else {
+ /* unknown type */
+ return (m);
+ }
+ if (M_TRAILINGSPACE(m) >= len) {
+ /* easy side we just drop it on the end */
+ parmh = (struct sctp_paramhdr *)(SCTP_BUF_AT(m, SCTP_BUF_LEN(m)));
+ mret = m;
+ } else {
+ /* Need more space */
+ mret = m;
+ while (SCTP_BUF_NEXT(mret) != NULL) {
+ mret = SCTP_BUF_NEXT(mret);
+ }
+ SCTP_BUF_NEXT(mret) = sctp_get_mbuf_for_msg(len, 0, M_DONTWAIT, 1, MT_DATA);
+ if (SCTP_BUF_NEXT(mret) == NULL) {
+ /* We are hosed, can't add more addresses */
+ return (m);
+ }
+ mret = SCTP_BUF_NEXT(mret);
+ parmh = mtod(mret, struct sctp_paramhdr *);
+ }
+ /* now add the parameter */
+ switch (ifa->address.sa.sa_family) {
+ case AF_INET:
+ {
+ struct sctp_ipv4addr_param *ipv4p;
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&ifa->address.sin;
+ ipv4p = (struct sctp_ipv4addr_param *)parmh;
+ parmh->param_type = htons(SCTP_IPV4_ADDRESS);
+ parmh->param_length = htons(len);
+ ipv4p->addr = sin->sin_addr.s_addr;
+ SCTP_BUF_LEN(mret) += len;
+ break;
+ }
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sctp_ipv6addr_param *ipv6p;
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&ifa->address.sin6;
+ ipv6p = (struct sctp_ipv6addr_param *)parmh;
+ parmh->param_type = htons(SCTP_IPV6_ADDRESS);
+ parmh->param_length = htons(len);
+ memcpy(ipv6p->addr, &sin6->sin6_addr,
+ sizeof(ipv6p->addr));
+ /* clear embedded scope in the address */
+ in6_clearscope((struct in6_addr *)ipv6p->addr);
+ SCTP_BUF_LEN(mret) += len;
+ break;
+ }
+#endif
+ default:
+ return (m);
+ }
+ return (mret);
+}
+
+
+struct mbuf *
+sctp_add_addresses_to_i_ia(struct sctp_inpcb *inp, struct sctp_scoping *scope,
+ struct mbuf *m_at, int cnt_inits_to)
+{
+ struct sctp_vrf *vrf = NULL;
+ int cnt, limit_out = 0, total_count;
+ uint32_t vrf_id;
+
+ vrf_id = inp->def_vrf_id;
+ SCTP_IPI_ADDR_RLOCK();
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL) {
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (m_at);
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ struct sctp_ifa *sctp_ifap;
+ struct sctp_ifn *sctp_ifnp;
+
+ cnt = cnt_inits_to;
+ if (vrf->total_ifa_count > SCTP_COUNT_LIMIT) {
+ limit_out = 1;
+ cnt = SCTP_ADDRESS_LIMIT;
+ goto skip_count;
+ }
+ LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) {
+ if ((scope->loopback_scope == 0) &&
+ SCTP_IFN_IS_IFT_LOOP(sctp_ifnp)) {
+ /*
+ * Skip loopback devices if loopback_scope
+ * not set
+ */
+ continue;
+ }
+ LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) {
+ if (sctp_is_address_in_scope(sctp_ifap,
+ scope->ipv4_addr_legal,
+ scope->ipv6_addr_legal,
+ scope->loopback_scope,
+ scope->ipv4_local_scope,
+ scope->local_scope,
+ scope->site_scope, 1) == 0) {
+ continue;
+ }
+ cnt++;
+ if (cnt > SCTP_ADDRESS_LIMIT) {
+ break;
+ }
+ }
+ if (cnt > SCTP_ADDRESS_LIMIT) {
+ break;
+ }
+ }
+skip_count:
+ if (cnt > 1) {
+ total_count = 0;
+ LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) {
+ cnt = 0;
+ if ((scope->loopback_scope == 0) &&
+ SCTP_IFN_IS_IFT_LOOP(sctp_ifnp)) {
+ /*
+ * Skip loopback devices if
+ * loopback_scope not set
+ */
+ continue;
+ }
+ LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) {
+ if (sctp_is_address_in_scope(sctp_ifap,
+ scope->ipv4_addr_legal,
+ scope->ipv6_addr_legal,
+ scope->loopback_scope,
+ scope->ipv4_local_scope,
+ scope->local_scope,
+ scope->site_scope, 0) == 0) {
+ continue;
+ }
+ m_at = sctp_add_addr_to_mbuf(m_at, sctp_ifap);
+ if (limit_out) {
+ cnt++;
+ total_count++;
+ if (cnt >= 2) {
+ /*
+						 * two addresses from
+						 * each interface
+ */
+ break;
+ }
+ if (total_count > SCTP_ADDRESS_LIMIT) {
+ /* No more addresses */
+ break;
+ }
+ }
+ }
+ }
+ }
+ } else {
+ struct sctp_laddr *laddr;
+
+ cnt = cnt_inits_to;
+ /* First, how many ? */
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ if (laddr->ifa == NULL) {
+ continue;
+ }
+ if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED)
+ /*
+				 * Address being deleted by the system, don't
+ * list.
+ */
+ continue;
+ if (laddr->action == SCTP_DEL_IP_ADDRESS) {
+ /*
+ * Address being deleted on this ep don't
+ * list.
+ */
+ continue;
+ }
+ if (sctp_is_address_in_scope(laddr->ifa,
+ scope->ipv4_addr_legal,
+ scope->ipv6_addr_legal,
+ scope->loopback_scope,
+ scope->ipv4_local_scope,
+ scope->local_scope,
+ scope->site_scope, 1) == 0) {
+ continue;
+ }
+ cnt++;
+ }
+ if (cnt > SCTP_ADDRESS_LIMIT) {
+ limit_out = 1;
+ }
+ /*
+ * To get through a NAT we only list addresses if we have
+ * more than one. That way if you just bind a single address
+ * we let the source of the init dictate our address.
+ */
+ if (cnt > 1) {
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ cnt = 0;
+ if (laddr->ifa == NULL) {
+ continue;
+ }
+ if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED)
+ continue;
+
+ if (sctp_is_address_in_scope(laddr->ifa,
+ scope->ipv4_addr_legal,
+ scope->ipv6_addr_legal,
+ scope->loopback_scope,
+ scope->ipv4_local_scope,
+ scope->local_scope,
+ scope->site_scope, 0) == 0) {
+ continue;
+ }
+ m_at = sctp_add_addr_to_mbuf(m_at, laddr->ifa);
+ cnt++;
+ if (cnt >= SCTP_ADDRESS_LIMIT) {
+ break;
+ }
+ }
+ }
+ }
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (m_at);
+}
+
+static struct sctp_ifa *
+sctp_is_ifa_addr_preferred(struct sctp_ifa *ifa,
+ uint8_t dest_is_loop,
+ uint8_t dest_is_priv,
+ sa_family_t fam)
+{
+ uint8_t dest_is_global = 0;
+
+ /* dest_is_priv is true if destination is a private address */
+ /* dest_is_loop is true if destination is a loopback addresses */
+
+ /**
+	 * Here we determine if it is a preferred address. A preferred address
+	 * means it is the same scope or higher scope than the destination.
+ * L = loopback, P = private, G = global
+ * -----------------------------------------
+ * src | dest | result
+ * ----------------------------------------
+ * L | L | yes
+ * -----------------------------------------
+ * P | L | yes-v4 no-v6
+ * -----------------------------------------
+ * G | L | yes-v4 no-v6
+ * -----------------------------------------
+ * L | P | no
+ * -----------------------------------------
+ * P | P | yes
+ * -----------------------------------------
+ * G | P | no
+ * -----------------------------------------
+ * L | G | no
+ * -----------------------------------------
+ * P | G | no
+ * -----------------------------------------
+ * G | G | yes
+ * -----------------------------------------
+ */
+
+ if (ifa->address.sa.sa_family != fam) {
+ /* forget mis-matched family */
+ return (NULL);
+ }
+ if ((dest_is_priv == 0) && (dest_is_loop == 0)) {
+ dest_is_global = 1;
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Is destination preferred:");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ifa->address.sa);
+ /* Ok the address may be ok */
+ if (fam == AF_INET6) {
+ /* ok to use deprecated addresses? no lets not! */
+ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:1\n");
+ return (NULL);
+ }
+ if (ifa->src_is_priv && !ifa->src_is_loop) {
+ if (dest_is_loop) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:2\n");
+ return (NULL);
+ }
+ }
+ if (ifa->src_is_glob) {
+ if (dest_is_loop) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:3\n");
+ return (NULL);
+ }
+ }
+ }
+ /*
+	 * Now that we know what is what, implement our table. This could in
+	 * theory be done slicker (it used to be), but this is
+ * straightforward and easier to validate :-)
+ */
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "src_loop:%d src_priv:%d src_glob:%d\n",
+ ifa->src_is_loop, ifa->src_is_priv, ifa->src_is_glob);
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "dest_loop:%d dest_priv:%d dest_glob:%d\n",
+ dest_is_loop, dest_is_priv, dest_is_global);
+
+ if ((ifa->src_is_loop) && (dest_is_priv)) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:4\n");
+ return (NULL);
+ }
+ if ((ifa->src_is_glob) && (dest_is_priv)) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:5\n");
+ return (NULL);
+ }
+ if ((ifa->src_is_loop) && (dest_is_global)) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:6\n");
+ return (NULL);
+ }
+ if ((ifa->src_is_priv) && (dest_is_global)) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:7\n");
+ return (NULL);
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "YES\n");
+ /* its a preferred address */
+ return (ifa);
+}
+
+static struct sctp_ifa *
+sctp_is_ifa_addr_acceptable(struct sctp_ifa *ifa,
+ uint8_t dest_is_loop,
+ uint8_t dest_is_priv,
+ sa_family_t fam)
+{
+ uint8_t dest_is_global = 0;
+
+ /*
+	 * Here we determine if it is an acceptable address. An acceptable
+	 * address means it is the same scope or higher scope, but we can
+	 * allow for NAT, which means it is ok to have a global dest and a
+	 * private src.
+	 *
+	 * L = loopback, P = private, G = global
+	 * -----------------------------------------
+	 *  src  |  dest | result
+	 * -----------------------------------------
+	 *   L   |   L   | yes
+	 * -----------------------------------------
+	 *   P   |   L   | yes-v4 no-v6
+	 * -----------------------------------------
+	 *   G   |   L   | yes
+	 * -----------------------------------------
+	 *   L   |   P   | no
+	 * -----------------------------------------
+	 *   P   |   P   | yes
+	 * -----------------------------------------
+	 *   G   |   P   | yes - May not work
+	 * -----------------------------------------
+	 *   L   |   G   | no
+	 * -----------------------------------------
+	 *   P   |   G   | yes - May not work
+	 * -----------------------------------------
+	 *   G   |   G   | yes
+	 * -----------------------------------------
+ */
+
+ if (ifa->address.sa.sa_family != fam) {
+ /* forget non matching family */
+ return (NULL);
+ }
+ /* Ok the address may be ok */
+ if ((dest_is_loop == 0) && (dest_is_priv == 0)) {
+ dest_is_global = 1;
+ }
+ if (fam == AF_INET6) {
+ /* ok to use deprecated addresses? */
+ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
+ return (NULL);
+ }
+ if (ifa->src_is_priv) {
+ /* Special case, linklocal to loop */
+ if (dest_is_loop)
+ return (NULL);
+ }
+ }
+ /*
+ * Now that we know what is what, implement our table. This could in
+ * theory be done slicker (it used to be), but this is
+ * straightforward and easier to validate :-)
+ */
+ if ((ifa->src_is_loop == 1) && (dest_is_priv)) {
+ return (NULL);
+ }
+ if ((ifa->src_is_loop == 1) && (dest_is_global)) {
+ return (NULL);
+ }
+ /* its an acceptable address */
+ return (ifa);
+}
+
+int
+sctp_is_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa)
+{
+ struct sctp_laddr *laddr;
+
+ if (stcb == NULL) {
+ /* There are no restrictions, no TCB :-) */
+ return (0);
+ }
+ LIST_FOREACH(laddr, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) {
+ if (laddr->ifa == NULL) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n",
+ __FUNCTION__);
+ continue;
+ }
+ if (laddr->ifa == ifa) {
+ /* Yes it is on the list */
+ return (1);
+ }
+ }
+ return (0);
+}
+
+
+int
+sctp_is_addr_in_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa)
+{
+ struct sctp_laddr *laddr;
+
+ if (ifa == NULL)
+ return (0);
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ if (laddr->ifa == NULL) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n",
+ __FUNCTION__);
+ continue;
+ }
+ if ((laddr->ifa == ifa) && laddr->action == 0)
+ /* same pointer */
+ return (1);
+ }
+ return (0);
+}
+
+
+
+static struct sctp_ifa *
+sctp_choose_boundspecific_inp(struct sctp_inpcb *inp,
+ sctp_route_t * ro,
+ uint32_t vrf_id,
+ int non_asoc_addr_ok,
+ uint8_t dest_is_priv,
+ uint8_t dest_is_loop,
+ sa_family_t fam)
+{
+ struct sctp_laddr *laddr, *starting_point;
+ void *ifn;
+ int resettotop = 0;
+ struct sctp_ifn *sctp_ifn;
+ struct sctp_ifa *sctp_ifa, *sifa;
+ struct sctp_vrf *vrf;
+ uint32_t ifn_index;
+
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL)
+ return (NULL);
+
+ ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro);
+ ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro);
+ sctp_ifn = sctp_find_ifn(ifn, ifn_index);
+ /*
+	 * first question: is the ifn we will emit on in our list? If so, we
+	 * want such an address. Note that we first look for a preferred
+ * address.
+ */
+ if (sctp_ifn) {
+ /* is a preferred one on the interface we route out? */
+ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+ if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) &&
+ (non_asoc_addr_ok == 0))
+ continue;
+ sifa = sctp_is_ifa_addr_preferred(sctp_ifa,
+ dest_is_loop,
+ dest_is_priv, fam);
+ if (sifa == NULL)
+ continue;
+ if (sctp_is_addr_in_ep(inp, sifa)) {
+ atomic_add_int(&sifa->refcount, 1);
+ return (sifa);
+ }
+ }
+ }
+ /*
+	 * ok, now we need to find one on the list of addresses. We
+	 * can't get one on the emitting interface, so let's first find a
+	 * preferred one. If not that, an acceptable one; otherwise we
+	 * return NULL.
+ */
+ starting_point = inp->next_addr_touse;
+once_again:
+ if (inp->next_addr_touse == NULL) {
+ inp->next_addr_touse = LIST_FIRST(&inp->sctp_addr_list);
+ resettotop = 1;
+ }
+ for (laddr = inp->next_addr_touse; laddr;
+ laddr = LIST_NEXT(laddr, sctp_nxt_addr)) {
+ if (laddr->ifa == NULL) {
+ /* address has been removed */
+ continue;
+ }
+ if (laddr->action == SCTP_DEL_IP_ADDRESS) {
+ /* address is being deleted */
+ continue;
+ }
+ sifa = sctp_is_ifa_addr_preferred(laddr->ifa, dest_is_loop,
+ dest_is_priv, fam);
+ if (sifa == NULL)
+ continue;
+ atomic_add_int(&sifa->refcount, 1);
+ return (sifa);
+ }
+ if (resettotop == 0) {
+ inp->next_addr_touse = NULL;
+ goto once_again;
+ }
+ inp->next_addr_touse = starting_point;
+ resettotop = 0;
+once_again_too:
+ if (inp->next_addr_touse == NULL) {
+ inp->next_addr_touse = LIST_FIRST(&inp->sctp_addr_list);
+ resettotop = 1;
+ }
+ /* ok, what about an acceptable address in the inp */
+ for (laddr = inp->next_addr_touse; laddr;
+ laddr = LIST_NEXT(laddr, sctp_nxt_addr)) {
+ if (laddr->ifa == NULL) {
+ /* address has been removed */
+ continue;
+ }
+ if (laddr->action == SCTP_DEL_IP_ADDRESS) {
+ /* address is being deleted */
+ continue;
+ }
+ sifa = sctp_is_ifa_addr_acceptable(laddr->ifa, dest_is_loop,
+ dest_is_priv, fam);
+ if (sifa == NULL)
+ continue;
+ atomic_add_int(&sifa->refcount, 1);
+ return (sifa);
+ }
+ if (resettotop == 0) {
+ inp->next_addr_touse = NULL;
+ goto once_again_too;
+ }
+ /*
+	 * no bound address can be a source for the destination; we are in
+	 * trouble
+ */
+ return (NULL);
+}
+
+
+
+static struct sctp_ifa *
+sctp_choose_boundspecific_stcb(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net,
+ sctp_route_t * ro,
+ uint32_t vrf_id,
+ uint8_t dest_is_priv,
+ uint8_t dest_is_loop,
+ int non_asoc_addr_ok,
+ sa_family_t fam)
+{
+ struct sctp_laddr *laddr, *starting_point;
+ void *ifn;
+ struct sctp_ifn *sctp_ifn;
+ struct sctp_ifa *sctp_ifa, *sifa;
+ uint8_t start_at_beginning = 0;
+ struct sctp_vrf *vrf;
+ uint32_t ifn_index;
+
+ /*
+	 * first question: is the ifn we will emit on in our list? If so, we
+ * want that one.
+ */
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL)
+ return (NULL);
+
+ ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro);
+ ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro);
+ sctp_ifn = sctp_find_ifn(ifn, ifn_index);
+
+ /*
+ * first question, is the ifn we will emit on in our list? If so,
+ * we want that one. First we look for a preferred. Second, we go
+ * for an acceptable.
+ */
+ if (sctp_ifn) {
+ /* first try for a preferred address on the ep */
+ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+ if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0))
+ continue;
+ if (sctp_is_addr_in_ep(inp, sctp_ifa)) {
+ sifa = sctp_is_ifa_addr_preferred(sctp_ifa, dest_is_loop, dest_is_priv, fam);
+ if (sifa == NULL)
+ continue;
+ if (((non_asoc_addr_ok == 0) &&
+ (sctp_is_addr_restricted(stcb, sifa))) ||
+ (non_asoc_addr_ok &&
+ (sctp_is_addr_restricted(stcb, sifa)) &&
+ (!sctp_is_addr_pending(stcb, sifa)))) {
+ /* on the no-no list */
+ continue;
+ }
+ atomic_add_int(&sifa->refcount, 1);
+ return (sifa);
+ }
+ }
+ /* next try for an acceptable address on the ep */
+ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+ if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0))
+ continue;
+ if (sctp_is_addr_in_ep(inp, sctp_ifa)) {
+ sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop, dest_is_priv, fam);
+ if (sifa == NULL)
+ continue;
+ if (((non_asoc_addr_ok == 0) &&
+ (sctp_is_addr_restricted(stcb, sifa))) ||
+ (non_asoc_addr_ok &&
+ (sctp_is_addr_restricted(stcb, sifa)) &&
+ (!sctp_is_addr_pending(stcb, sifa)))) {
+ /* on the no-no list */
+ continue;
+ }
+ atomic_add_int(&sifa->refcount, 1);
+ return (sifa);
+ }
+ }
+
+ }
+ /*
+	 * if we can't find one like that then we must look at all bound
+	 * addresses and pick one: first a preferred one, then an acceptable one.
+ */
+ starting_point = stcb->asoc.last_used_address;
+sctp_from_the_top:
+ if (stcb->asoc.last_used_address == NULL) {
+ start_at_beginning = 1;
+ stcb->asoc.last_used_address = LIST_FIRST(&inp->sctp_addr_list);
+ }
+ /* search beginning with the last used address */
+ for (laddr = stcb->asoc.last_used_address; laddr;
+ laddr = LIST_NEXT(laddr, sctp_nxt_addr)) {
+ if (laddr->ifa == NULL) {
+ /* address has been removed */
+ continue;
+ }
+ if (laddr->action == SCTP_DEL_IP_ADDRESS) {
+ /* address is being deleted */
+ continue;
+ }
+ sifa = sctp_is_ifa_addr_preferred(laddr->ifa, dest_is_loop, dest_is_priv, fam);
+ if (sifa == NULL)
+ continue;
+ if (((non_asoc_addr_ok == 0) &&
+ (sctp_is_addr_restricted(stcb, sifa))) ||
+ (non_asoc_addr_ok &&
+ (sctp_is_addr_restricted(stcb, sifa)) &&
+ (!sctp_is_addr_pending(stcb, sifa)))) {
+ /* on the no-no list */
+ continue;
+ }
+ stcb->asoc.last_used_address = laddr;
+ atomic_add_int(&sifa->refcount, 1);
+ return (sifa);
+ }
+ if (start_at_beginning == 0) {
+ stcb->asoc.last_used_address = NULL;
+ goto sctp_from_the_top;
+ }
+ /* now try for any higher scope than the destination */
+ stcb->asoc.last_used_address = starting_point;
+ start_at_beginning = 0;
+sctp_from_the_top2:
+ if (stcb->asoc.last_used_address == NULL) {
+ start_at_beginning = 1;
+ stcb->asoc.last_used_address = LIST_FIRST(&inp->sctp_addr_list);
+ }
+ /* search beginning with the last used address */
+ for (laddr = stcb->asoc.last_used_address; laddr;
+ laddr = LIST_NEXT(laddr, sctp_nxt_addr)) {
+ if (laddr->ifa == NULL) {
+ /* address has been removed */
+ continue;
+ }
+ if (laddr->action == SCTP_DEL_IP_ADDRESS) {
+ /* address is being deleted */
+ continue;
+ }
+ sifa = sctp_is_ifa_addr_acceptable(laddr->ifa, dest_is_loop,
+ dest_is_priv, fam);
+ if (sifa == NULL)
+ continue;
+ if (((non_asoc_addr_ok == 0) &&
+ (sctp_is_addr_restricted(stcb, sifa))) ||
+ (non_asoc_addr_ok &&
+ (sctp_is_addr_restricted(stcb, sifa)) &&
+ (!sctp_is_addr_pending(stcb, sifa)))) {
+ /* on the no-no list */
+ continue;
+ }
+ stcb->asoc.last_used_address = laddr;
+ atomic_add_int(&sifa->refcount, 1);
+ return (sifa);
+ }
+ if (start_at_beginning == 0) {
+ stcb->asoc.last_used_address = NULL;
+ goto sctp_from_the_top2;
+ }
+ return (NULL);
+}
+
+static struct sctp_ifa *
+sctp_select_nth_preferred_addr_from_ifn_boundall(struct sctp_ifn *ifn,
+ struct sctp_tcb *stcb,
+ int non_asoc_addr_ok,
+ uint8_t dest_is_loop,
+ uint8_t dest_is_priv,
+ int addr_wanted,
+ sa_family_t fam,
+ sctp_route_t * ro
+)
+{
+ struct sctp_ifa *ifa, *sifa;
+ int num_eligible_addr = 0;
+
+#ifdef INET6
+ struct sockaddr_in6 sin6, lsa6;
+
+ if (fam == AF_INET6) {
+ memcpy(&sin6, &ro->ro_dst, sizeof(struct sockaddr_in6));
+ (void)sa6_recoverscope(&sin6);
+ }
+#endif /* INET6 */
+ LIST_FOREACH(ifa, &ifn->ifalist, next_ifa) {
+ if ((ifa->localifa_flags & SCTP_ADDR_DEFER_USE) &&
+ (non_asoc_addr_ok == 0))
+ continue;
+ sifa = sctp_is_ifa_addr_preferred(ifa, dest_is_loop,
+ dest_is_priv, fam);
+ if (sifa == NULL)
+ continue;
+#ifdef INET6
+ if (fam == AF_INET6 &&
+ dest_is_loop &&
+ sifa->src_is_loop && sifa->src_is_priv) {
+ /*
+ * don't allow fe80::1 to be a src on loop ::1, we
+ * don't list it to the peer so we will get an
+ * abort.
+ */
+ continue;
+ }
+ if (fam == AF_INET6 &&
+ IN6_IS_ADDR_LINKLOCAL(&sifa->address.sin6.sin6_addr) &&
+ IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr)) {
+ /*
+ * link-local <-> link-local must belong to the same
+ * scope.
+ */
+ memcpy(&lsa6, &sifa->address.sin6, sizeof(struct sockaddr_in6));
+ (void)sa6_recoverscope(&lsa6);
+ if (sin6.sin6_scope_id != lsa6.sin6_scope_id) {
+ continue;
+ }
+ }
+#endif /* INET6 */
+
+ /*
+		 * Check if the IPv6 address matches the next-hop. In the
+		 * mobile case, an old IPv6 address may not have been deleted
+		 * from the interface, so the interface has both previous and
+		 * new addresses. We should use the one corresponding to the
+ * next-hop. (by micchie)
+ */
+#ifdef INET6
+ if (stcb && fam == AF_INET6 &&
+ sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) {
+ if (sctp_v6src_match_nexthop(&sifa->address.sin6, ro)
+ == 0) {
+ continue;
+ }
+ }
+#endif
+ /* Avoid topologically incorrect IPv4 address */
+ if (stcb && fam == AF_INET &&
+ sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) {
+ if (sctp_v4src_match_nexthop(sifa, ro) == 0) {
+ continue;
+ }
+ }
+ if (stcb) {
+ if (sctp_is_address_in_scope(ifa,
+ stcb->asoc.ipv4_addr_legal,
+ stcb->asoc.ipv6_addr_legal,
+ stcb->asoc.loopback_scope,
+ stcb->asoc.ipv4_local_scope,
+ stcb->asoc.local_scope,
+ stcb->asoc.site_scope, 0) == 0) {
+ continue;
+ }
+ if (((non_asoc_addr_ok == 0) &&
+ (sctp_is_addr_restricted(stcb, sifa))) ||
+ (non_asoc_addr_ok &&
+ (sctp_is_addr_restricted(stcb, sifa)) &&
+ (!sctp_is_addr_pending(stcb, sifa)))) {
+ /*
+ * It is restricted for some reason..
+ * probably not yet added.
+ */
+ continue;
+ }
+ }
+ if (num_eligible_addr >= addr_wanted) {
+ return (sifa);
+ }
+ num_eligible_addr++;
+ }
+ return (NULL);
+}
+
+
+static int
+sctp_count_num_preferred_boundall(struct sctp_ifn *ifn,
+ struct sctp_tcb *stcb,
+ int non_asoc_addr_ok,
+ uint8_t dest_is_loop,
+ uint8_t dest_is_priv,
+ sa_family_t fam)
+{
+ struct sctp_ifa *ifa, *sifa;
+ int num_eligible_addr = 0;
+
+ LIST_FOREACH(ifa, &ifn->ifalist, next_ifa) {
+ if ((ifa->localifa_flags & SCTP_ADDR_DEFER_USE) &&
+ (non_asoc_addr_ok == 0)) {
+ continue;
+ }
+ sifa = sctp_is_ifa_addr_preferred(ifa, dest_is_loop,
+ dest_is_priv, fam);
+ if (sifa == NULL) {
+ continue;
+ }
+ if (stcb) {
+ if (sctp_is_address_in_scope(ifa,
+ stcb->asoc.ipv4_addr_legal,
+ stcb->asoc.ipv6_addr_legal,
+ stcb->asoc.loopback_scope,
+ stcb->asoc.ipv4_local_scope,
+ stcb->asoc.local_scope,
+ stcb->asoc.site_scope, 0) == 0) {
+ continue;
+ }
+ if (((non_asoc_addr_ok == 0) &&
+ (sctp_is_addr_restricted(stcb, sifa))) ||
+ (non_asoc_addr_ok &&
+ (sctp_is_addr_restricted(stcb, sifa)) &&
+ (!sctp_is_addr_pending(stcb, sifa)))) {
+ /*
+ * It is restricted for some reason..
+ * probably not yet added.
+ */
+ continue;
+ }
+ }
+ num_eligible_addr++;
+ }
+ return (num_eligible_addr);
+}
+
+static struct sctp_ifa *
+sctp_choose_boundall(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net,
+ sctp_route_t * ro,
+ uint32_t vrf_id,
+ uint8_t dest_is_priv,
+ uint8_t dest_is_loop,
+ int non_asoc_addr_ok,
+ sa_family_t fam)
+{
+ int cur_addr_num = 0, num_preferred = 0;
+ void *ifn;
+ struct sctp_ifn *sctp_ifn, *looked_at = NULL, *emit_ifn;
+ struct sctp_ifa *sctp_ifa, *sifa;
+ uint32_t ifn_index;
+ struct sctp_vrf *vrf;
+
+ /*-
+ * For boundall we can use any address in the association.
+ * If non_asoc_addr_ok is set we can use any address (at least in
+ * theory). So we look for preferred addresses first. If we find one,
+ * we use it. Otherwise we next try to get an address on the
+ * interface, which we should be able to do (unless non_asoc_addr_ok
+ * is false and we are routed out that way). In these cases where we
+ * can't use the address of the interface we go through all the
+ * ifn's looking for an address we can use and fill that in. Punting
+ * means we send back address 0, which will probably cause problems
+ * actually since then IP will fill in the address of the route ifn,
+ * which means we probably already rejected it.. i.e. here comes an
+ * abort :-<.
+ */
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL)
+ return (NULL);
+
+ ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro);
+ ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro);
+ emit_ifn = looked_at = sctp_ifn = sctp_find_ifn(ifn, ifn_index);
+ if (sctp_ifn == NULL) {
+ /* ?? We don't have this guy ?? */
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "No ifn emit interface?\n");
+ goto bound_all_plan_b;
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "ifn_index:%d name:%s is emit interface\n",
+ ifn_index, sctp_ifn->ifn_name);
+
+ if (net) {
+ cur_addr_num = net->indx_of_eligible_next_to_use;
+ }
+ num_preferred = sctp_count_num_preferred_boundall(sctp_ifn,
+ stcb,
+ non_asoc_addr_ok,
+ dest_is_loop,
+ dest_is_priv, fam);
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Found %d preferred source addresses for intf:%s\n",
+ num_preferred, sctp_ifn->ifn_name);
+ if (num_preferred == 0) {
+ /*
+ * no eligible addresses, we must use some other interface
+ * address if we can find one.
+ */
+ goto bound_all_plan_b;
+ }
+ /*
+ * Ok we have num_eligible_addr set with how many we can use, this
+ * may vary from call to call due to addresses being deprecated
+ * etc..
+ */
+ if (cur_addr_num >= num_preferred) {
+ cur_addr_num = 0;
+ }
+ /*
+ * select the nth address from the list (where cur_addr_num is the
+ * nth) and 0 is the first one, 1 is the second one etc...
+ */
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "cur_addr_num:%d\n", cur_addr_num);
+
+ sctp_ifa = sctp_select_nth_preferred_addr_from_ifn_boundall(sctp_ifn, stcb, non_asoc_addr_ok, dest_is_loop,
+ dest_is_priv, cur_addr_num, fam, ro);
+
+	/* if sctp_ifa is NULL something changed?? Fall to plan b. */
+ if (sctp_ifa) {
+ atomic_add_int(&sctp_ifa->refcount, 1);
+ if (net) {
+ /* save off where the next one we will want */
+ net->indx_of_eligible_next_to_use = cur_addr_num + 1;
+ }
+ return (sctp_ifa);
+ }
+ /*
+ * plan_b: Look at all interfaces and find a preferred address. If
+ * no preferred fall through to plan_c.
+ */
+bound_all_plan_b:
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan B\n");
+ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Examine interface %s\n",
+ sctp_ifn->ifn_name);
+ if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
+ /* wrong base scope */
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "skip\n");
+ continue;
+ }
+ if ((sctp_ifn == looked_at) && looked_at) {
+ /* already looked at this guy */
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "already seen\n");
+ continue;
+ }
+ num_preferred = sctp_count_num_preferred_boundall(sctp_ifn, stcb, non_asoc_addr_ok,
+ dest_is_loop, dest_is_priv, fam);
+ SCTPDBG(SCTP_DEBUG_OUTPUT2,
+ "Found ifn:%p %d preferred source addresses\n",
+ ifn, num_preferred);
+ if (num_preferred == 0) {
+ /* None on this interface. */
+			SCTPDBG(SCTP_DEBUG_OUTPUT2, "No preferred -- skipping to next\n");
+ continue;
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT2,
+ "num preferred:%d on interface:%p cur_addr_num:%d\n",
+ num_preferred, sctp_ifn, cur_addr_num);
+
+ /*
+ * Ok we have num_eligible_addr set with how many we can
+ * use, this may vary from call to call due to addresses
+ * being deprecated etc..
+ */
+ if (cur_addr_num >= num_preferred) {
+ cur_addr_num = 0;
+ }
+ sifa = sctp_select_nth_preferred_addr_from_ifn_boundall(sctp_ifn, stcb, non_asoc_addr_ok, dest_is_loop,
+ dest_is_priv, cur_addr_num, fam, ro);
+ if (sifa == NULL)
+ continue;
+ if (net) {
+ net->indx_of_eligible_next_to_use = cur_addr_num + 1;
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "we selected %d\n",
+ cur_addr_num);
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Source:");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa);
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Dest:");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &net->ro._l_addr.sa);
+ }
+ atomic_add_int(&sifa->refcount, 1);
+ return (sifa);
+
+ }
+
+ /* plan_c: do we have an acceptable address on the emit interface */
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan C: find acceptable on interface\n");
+ if (emit_ifn == NULL) {
+ goto plan_d;
+ }
+ LIST_FOREACH(sctp_ifa, &emit_ifn->ifalist, next_ifa) {
+ if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) &&
+ (non_asoc_addr_ok == 0))
+ continue;
+ sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop,
+ dest_is_priv, fam);
+ if (sifa == NULL)
+ continue;
+ if (stcb) {
+ if (sctp_is_address_in_scope(sifa,
+ stcb->asoc.ipv4_addr_legal,
+ stcb->asoc.ipv6_addr_legal,
+ stcb->asoc.loopback_scope,
+ stcb->asoc.ipv4_local_scope,
+ stcb->asoc.local_scope,
+ stcb->asoc.site_scope, 0) == 0) {
+ continue;
+ }
+ if (((non_asoc_addr_ok == 0) &&
+ (sctp_is_addr_restricted(stcb, sifa))) ||
+ (non_asoc_addr_ok &&
+ (sctp_is_addr_restricted(stcb, sifa)) &&
+ (!sctp_is_addr_pending(stcb, sifa)))) {
+ /*
+ * It is restricted for some reason..
+ * probably not yet added.
+ */
+ continue;
+ }
+ }
+ atomic_add_int(&sifa->refcount, 1);
+ return (sifa);
+ }
+plan_d:
+	/*
+	 * plan_d: We are in trouble. No preferred address on the emit
+	 * interface, no preferred address on any interface, and not even an
+	 * acceptable address on the emit interface. Go out and see if we
+	 * can find an acceptable address somewhere amongst all interfaces.
+	 */
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan D\n");
+ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
+ if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
+ /* wrong base scope */
+ continue;
+ }
+ if ((sctp_ifn == looked_at) && looked_at)
+ /* already looked at this guy */
+ continue;
+
+ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+ if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) &&
+ (non_asoc_addr_ok == 0))
+ continue;
+ sifa = sctp_is_ifa_addr_acceptable(sctp_ifa,
+ dest_is_loop,
+ dest_is_priv, fam);
+ if (sifa == NULL)
+ continue;
+ if (stcb) {
+ if (sctp_is_address_in_scope(sifa,
+ stcb->asoc.ipv4_addr_legal,
+ stcb->asoc.ipv6_addr_legal,
+ stcb->asoc.loopback_scope,
+ stcb->asoc.ipv4_local_scope,
+ stcb->asoc.local_scope,
+ stcb->asoc.site_scope, 0) == 0) {
+ continue;
+ }
+ if (((non_asoc_addr_ok == 0) &&
+ (sctp_is_addr_restricted(stcb, sifa))) ||
+ (non_asoc_addr_ok &&
+ (sctp_is_addr_restricted(stcb, sifa)) &&
+ (!sctp_is_addr_pending(stcb, sifa)))) {
+ /*
+ * It is restricted for some
+ * reason.. probably not yet added.
+ */
+ continue;
+ }
+ }
+ atomic_add_int(&sifa->refcount, 1);
+ return (sifa);
+ }
+ }
+ /*
+ * Ok we can find NO address to source from that is not on our
+ * restricted list and non_asoc_address is NOT ok, or it is on our
+ * restricted list. We can't source to it :-(
+ */
+ return (NULL);
+}
+
+
+
+/* tcb may be NULL */
+struct sctp_ifa *
+sctp_source_address_selection(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ sctp_route_t * ro,
+ struct sctp_nets *net,
+ int non_asoc_addr_ok, uint32_t vrf_id)
+{
+ struct sockaddr_in *to = (struct sockaddr_in *)&ro->ro_dst;
+
+#ifdef INET6
+ struct sockaddr_in6 *to6 = (struct sockaddr_in6 *)&ro->ro_dst;
+
+#endif
+ struct sctp_ifa *answer;
+ uint8_t dest_is_priv, dest_is_loop;
+ sa_family_t fam;
+
+ /*-
+ * Rules: - Find the route if needed, cache if I can. - Look at
+ * interface address in route, Is it in the bound list. If so we
+ * have the best source. - If not we must rotate amongst the
+ * addresses.
+ *
+	 * Caveats and issues
+ *
+ * Do we need to pay attention to scope. We can have a private address
+ * or a global address we are sourcing or sending to. So if we draw
+ * it out
+ * zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
+ * For V4
+ * ------------------------------------------
+ * source * dest * result
+ * -----------------------------------------
+ * <a> Private * Global * NAT
+ * -----------------------------------------
+ * <b> Private * Private * No problem
+ * -----------------------------------------
+ * <c> Global * Private * Huh, How will this work?
+ * -----------------------------------------
+ * <d> Global * Global * No Problem
+ *------------------------------------------
+ * zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
+ * For V6
+ *------------------------------------------
+ * source * dest * result
+ * -----------------------------------------
+ * <a> Linklocal * Global *
+ * -----------------------------------------
+ * <b> Linklocal * Linklocal * No problem
+ * -----------------------------------------
+ * <c> Global * Linklocal * Huh, How will this work?
+ * -----------------------------------------
+ * <d> Global * Global * No Problem
+ *------------------------------------------
+ * zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
+ *
+	 * And then we add to that what happens if there are multiple addresses
+	 * assigned to an interface. Remember the ifa on an ifn is a linked
+	 * list of addresses, so one interface can have more than one IP
+	 * address. What happens if we have both a private and a global
+	 * address? Do we then use the context of the destination to sort out
+	 * which one is best? And what about NATs: sending P->G may get you a
+	 * NAT translation, or should you prefer the G that is on the
+	 * interface?
+ *
+ * Decisions:
+ *
+ * - count the number of addresses on the interface.
+ * - if it is one, no problem except case <c>.
+ * For <a> we will assume a NAT out there.
+ * - if there are more than one, then we need to worry about scope P
+ * or G. We should prefer G -> G and P -> P if possible.
+ * Then as a secondary fall back to mixed types G->P being a last
+ * ditch one.
+ * - The above all works for bound all, but bound specific we need to
+ * use the same concept but instead only consider the bound
+ * addresses. If the bound set is NOT assigned to the interface then
+ * we must use rotation amongst the bound addresses..
+ */
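+	/*-
+	 * Dispatch summary: once the route and destination scope are set up
+	 * below, bound-all endpoints go through sctp_choose_boundall();
+	 * bound-specific endpoints use sctp_choose_boundspecific_stcb() when
+	 * an association exists and sctp_choose_boundspecific_inp() otherwise.
+	 */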
+ if (ro->ro_rt == NULL) {
+ /*
+ * Need a route to cache.
+ */
+ SCTP_RTALLOC(ro, vrf_id);
+ }
+ if (ro->ro_rt == NULL) {
+ return (NULL);
+ }
+ fam = to->sin_family;
+ dest_is_priv = dest_is_loop = 0;
+ /* Setup our scopes for the destination */
+ switch (fam) {
+ case AF_INET:
+ /* Scope based on outbound address */
+ if (IN4_ISLOOPBACK_ADDRESS(&to->sin_addr)) {
+ dest_is_loop = 1;
+ if (net != NULL) {
+ /* mark it as local */
+ net->addr_is_local = 1;
+ }
+ } else if ((IN4_ISPRIVATE_ADDRESS(&to->sin_addr))) {
+ dest_is_priv = 1;
+ }
+ break;
+#ifdef INET6
+ case AF_INET6:
+ /* Scope based on outbound address */
+ if (IN6_IS_ADDR_LOOPBACK(&to6->sin6_addr) ||
+ SCTP_ROUTE_IS_REAL_LOOP(ro)) {
+ /*
+ * If the address is a loopback address, which
+ * consists of "::1" OR "fe80::1%lo0", we are
+ * loopback scope. But we don't use dest_is_priv
+ * (link local addresses).
+ */
+ dest_is_loop = 1;
+ if (net != NULL) {
+ /* mark it as local */
+ net->addr_is_local = 1;
+ }
+ } else if (IN6_IS_ADDR_LINKLOCAL(&to6->sin6_addr)) {
+ dest_is_priv = 1;
+ }
+ break;
+#endif
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Select source addr for:");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)to);
+ SCTP_IPI_ADDR_RLOCK();
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ /*
+ * Bound all case
+ */
+ answer = sctp_choose_boundall(inp, stcb, net, ro, vrf_id,
+ dest_is_priv, dest_is_loop,
+ non_asoc_addr_ok, fam);
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (answer);
+ }
+ /*
+ * Subset bound case
+ */
+ if (stcb) {
+ answer = sctp_choose_boundspecific_stcb(inp, stcb, net, ro,
+ vrf_id, dest_is_priv,
+ dest_is_loop,
+ non_asoc_addr_ok, fam);
+ } else {
+ answer = sctp_choose_boundspecific_inp(inp, ro, vrf_id,
+ non_asoc_addr_ok,
+ dest_is_priv,
+ dest_is_loop, fam);
+ }
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (answer);
+}
+
+static int
+sctp_find_cmsg(int c_type, void *data, struct mbuf *control, int cpsize)
+{
+ struct cmsghdr cmh;
+ int tlen, at;
+
+ tlen = SCTP_BUF_LEN(control);
+ at = 0;
+ /*
+ * Independent of how many mbufs, find the c_type inside the control
+ * structure and copy out the data.
+ */
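+	/*
+	 * Each entry in the control data is a struct cmsghdr followed by
+	 * CMSG_ALIGN()ed data; cmsg_len covers the header plus the data, so
+	 * the walk below advances by CMSG_ALIGN(cmsg_len) per entry.
+	 */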
+ while (at < tlen) {
+ if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) {
+			/* not enough room for one more; we are done. */
+ return (0);
+ }
+ m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh);
+ if (((int)cmh.cmsg_len + at) > tlen) {
+ /*
+			 * this is really messed up since there is not enough
+ * data here to cover the cmsg header. We are done.
+ */
+ return (0);
+ }
+ if ((cmh.cmsg_level == IPPROTO_SCTP) &&
+ (c_type == cmh.cmsg_type)) {
+ /* found the one we want, copy it out */
+ at += CMSG_ALIGN(sizeof(struct cmsghdr));
+ if ((int)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < cpsize) {
+ /*
+ * space of cmsg_len after header not big
+ * enough
+ */
+ return (0);
+ }
+ m_copydata(control, at, cpsize, data);
+ return (1);
+ } else {
+ at += CMSG_ALIGN(cmh.cmsg_len);
+ if (cmh.cmsg_len == 0) {
+ break;
+ }
+ }
+ }
+ /* not found */
+ return (0);
+}
+
+static struct mbuf *
+sctp_add_cookie(struct sctp_inpcb *inp, struct mbuf *init, int init_offset,
+ struct mbuf *initack, int initack_offset, struct sctp_state_cookie *stc_in, uint8_t ** signature)
+{
+ struct mbuf *copy_init, *copy_initack, *m_at, *sig, *mret;
+ struct sctp_state_cookie *stc;
+ struct sctp_paramhdr *ph;
+ uint8_t *foo;
+ int sig_offset;
+ uint16_t cookie_sz;
+
+ mret = NULL;
+ mret = sctp_get_mbuf_for_msg((sizeof(struct sctp_state_cookie) +
+ sizeof(struct sctp_paramhdr)), 0,
+ M_DONTWAIT, 1, MT_DATA);
+ if (mret == NULL) {
+ return (NULL);
+ }
+ copy_init = SCTP_M_COPYM(init, init_offset, M_COPYALL, M_DONTWAIT);
+ if (copy_init == NULL) {
+ sctp_m_freem(mret);
+ return (NULL);
+ }
+#ifdef SCTP_MBUF_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ struct mbuf *mat;
+
+ mat = copy_init;
+ while (mat) {
+ if (SCTP_BUF_IS_EXTENDED(mat)) {
+ sctp_log_mb(mat, SCTP_MBUF_ICOPY);
+ }
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ }
+#endif
+ copy_initack = SCTP_M_COPYM(initack, initack_offset, M_COPYALL,
+ M_DONTWAIT);
+ if (copy_initack == NULL) {
+ sctp_m_freem(mret);
+ sctp_m_freem(copy_init);
+ return (NULL);
+ }
+#ifdef SCTP_MBUF_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ struct mbuf *mat;
+
+ mat = copy_initack;
+ while (mat) {
+ if (SCTP_BUF_IS_EXTENDED(mat)) {
+ sctp_log_mb(mat, SCTP_MBUF_ICOPY);
+ }
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ }
+#endif
+ /* easy side we just drop it on the end */
+ ph = mtod(mret, struct sctp_paramhdr *);
+ SCTP_BUF_LEN(mret) = sizeof(struct sctp_state_cookie) +
+ sizeof(struct sctp_paramhdr);
+ stc = (struct sctp_state_cookie *)((caddr_t)ph +
+ sizeof(struct sctp_paramhdr));
+ ph->param_type = htons(SCTP_STATE_COOKIE);
+ ph->param_length = 0; /* fill in at the end */
+ /* Fill in the stc cookie data */
+ memcpy(stc, stc_in, sizeof(struct sctp_state_cookie));
+
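+	/*-
+	 * Resulting layout of the COOKIE parameter assembled here:
+	 *
+	 *   paramhdr | state cookie | copy of INIT | copy of INIT-ACK | signature
+	 *
+	 * The signature space is zeroed below and filled in by the caller;
+	 * ph->param_length is set to the total length once the pieces are
+	 * chained together.
+	 */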
+ /* tack the INIT and then the INIT-ACK onto the chain */
+ cookie_sz = 0;
+ m_at = mret;
+ for (m_at = mret; m_at; m_at = SCTP_BUF_NEXT(m_at)) {
+ cookie_sz += SCTP_BUF_LEN(m_at);
+ if (SCTP_BUF_NEXT(m_at) == NULL) {
+ SCTP_BUF_NEXT(m_at) = copy_init;
+ break;
+ }
+ }
+
+ for (m_at = copy_init; m_at; m_at = SCTP_BUF_NEXT(m_at)) {
+ cookie_sz += SCTP_BUF_LEN(m_at);
+ if (SCTP_BUF_NEXT(m_at) == NULL) {
+ SCTP_BUF_NEXT(m_at) = copy_initack;
+ break;
+ }
+ }
+
+ for (m_at = copy_initack; m_at; m_at = SCTP_BUF_NEXT(m_at)) {
+ cookie_sz += SCTP_BUF_LEN(m_at);
+ if (SCTP_BUF_NEXT(m_at) == NULL) {
+ break;
+ }
+ }
+ sig = sctp_get_mbuf_for_msg(SCTP_SECRET_SIZE, 0, M_DONTWAIT, 1, MT_DATA);
+ if (sig == NULL) {
+ /* no space, so free the entire chain */
+ sctp_m_freem(mret);
+ return (NULL);
+ }
+ SCTP_BUF_LEN(sig) = 0;
+ SCTP_BUF_NEXT(m_at) = sig;
+ sig_offset = 0;
+ foo = (uint8_t *) (mtod(sig, caddr_t)+sig_offset);
+ memset(foo, 0, SCTP_SIGNATURE_SIZE);
+ *signature = foo;
+ SCTP_BUF_LEN(sig) += SCTP_SIGNATURE_SIZE;
+ cookie_sz += SCTP_SIGNATURE_SIZE;
+ ph->param_length = htons(cookie_sz);
+ return (mret);
+}
+
+
+static uint8_t
+sctp_get_ect(struct sctp_tcb *stcb,
+ struct sctp_tmit_chunk *chk)
+{
+ uint8_t this_random;
+
+ /* Huh? */
+ if (SCTP_BASE_SYSCTL(sctp_ecn_enable) == 0)
+ return (0);
+
+ if (SCTP_BASE_SYSCTL(sctp_ecn_nonce) == 0)
+ /* no nonce, always return ECT0 */
+ return (SCTP_ECT0_BIT);
+
+ if (stcb->asoc.peer_supports_ecn_nonce == 0) {
+ /* Peer does NOT support it, so we send a ECT0 only */
+ return (SCTP_ECT0_BIT);
+ }
+ if (chk == NULL)
+ return (SCTP_ECT0_BIT);
+
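+	/*-
+	 * One bit of the 32-bit random pool (asoc.hb_random_values) is
+	 * consumed per call: hb_random_idx selects the byte and
+	 * hb_ect_randombit the bit within it; the pool is refilled below
+	 * once it is exhausted.
+	 */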
+ if ((stcb->asoc.hb_random_idx > 3) ||
+ ((stcb->asoc.hb_random_idx == 3) &&
+ (stcb->asoc.hb_ect_randombit > 7))) {
+ uint32_t rndval;
+
+warp_drive_sa:
+ rndval = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep);
+ memcpy(stcb->asoc.hb_random_values, &rndval,
+ sizeof(stcb->asoc.hb_random_values));
+ this_random = stcb->asoc.hb_random_values[0];
+ stcb->asoc.hb_random_idx = 0;
+ stcb->asoc.hb_ect_randombit = 0;
+ } else {
+ if (stcb->asoc.hb_ect_randombit > 7) {
+ stcb->asoc.hb_ect_randombit = 0;
+ stcb->asoc.hb_random_idx++;
+ if (stcb->asoc.hb_random_idx > 3) {
+ goto warp_drive_sa;
+ }
+ }
+ this_random = stcb->asoc.hb_random_values[stcb->asoc.hb_random_idx];
+ }
+ if ((this_random >> stcb->asoc.hb_ect_randombit) & 0x01) {
+ if (chk != NULL)
+ /* ECN Nonce stuff */
+ chk->rec.data.ect_nonce = SCTP_ECT1_BIT;
+ stcb->asoc.hb_ect_randombit++;
+ return (SCTP_ECT1_BIT);
+ } else {
+ stcb->asoc.hb_ect_randombit++;
+ return (SCTP_ECT0_BIT);
+ }
+}
+
+static int
+sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb, /* may be NULL */
+ struct sctp_nets *net,
+ struct sockaddr *to,
+ struct mbuf *m,
+ uint32_t auth_offset,
+ struct sctp_auth_chunk *auth,
+ uint16_t auth_keyid,
+ int nofragment_flag,
+ int ecn_ok,
+ struct sctp_tmit_chunk *chk,
+ int out_of_asoc_ok,
+ uint16_t src_port,
+ uint16_t dest_port,
+ uint32_t v_tag,
+ uint16_t port,
+ int so_locked,
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+ union sctp_sockstore *over_addr
+)
+/* nofragment_flag to tell if IP_DF should be set (IPv4 only) */
+{
+	/*-
+	 * Given an mbuf chain (via SCTP_BUF_NEXT()) that holds a packet
+	 * header WITH an SCTPHDR but no IP header, endpoint inp and sa
+	 * structure:
+	 * - fill in the HMAC digest of any AUTH chunk in the packet.
+	 * - calculate and fill in the SCTP checksum.
+	 * - prepend an IP header.
+	 * - if boundall use INADDR_ANY.
+	 * - if boundspecific do source address selection.
+	 * - set the fragmentation option for IPv4.
+	 * - on return from IP output, check/adjust the MTU size of the
+	 *   output interface and the smallest_mtu size as well.
+	 */
+ /* Will need ifdefs around this */
+ struct mbuf *o_pak;
+ struct mbuf *newm;
+ struct sctphdr *sctphdr;
+ int packet_length;
+ int ret;
+ uint32_t vrf_id;
+ sctp_route_t *ro = NULL;
+ struct udphdr *udp = NULL;
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so = NULL;
+
+#endif
+
+ if ((net) && (net->dest_state & SCTP_ADDR_OUT_OF_SCOPE)) {
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT);
+ sctp_m_freem(m);
+ return (EFAULT);
+ }
+ if (stcb) {
+ vrf_id = stcb->asoc.vrf_id;
+ } else {
+ vrf_id = inp->def_vrf_id;
+ }
+
+ /* fill in the HMAC digest for any AUTH chunk in the packet */
+ if ((auth != NULL) && (stcb != NULL)) {
+ sctp_fill_hmac_digest_m(m, auth_offset, auth, stcb, auth_keyid);
+ }
+ if (to->sa_family == AF_INET) {
+ struct ip *ip = NULL;
+ sctp_route_t iproute;
+ uint8_t tos_value;
+ int len;
+
+ len = sizeof(struct ip) + sizeof(struct sctphdr);
+ if (port) {
+ len += sizeof(struct udphdr);
+ }
+ newm = sctp_get_mbuf_for_msg(len, 1, M_DONTWAIT, 1, MT_DATA);
+ if (newm == NULL) {
+ sctp_m_freem(m);
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ SCTP_ALIGN_TO_END(newm, len);
+ SCTP_BUF_LEN(newm) = len;
+ SCTP_BUF_NEXT(newm) = m;
+ m = newm;
+ packet_length = sctp_calculate_len(m);
+ ip = mtod(m, struct ip *);
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = (sizeof(struct ip) >> 2);
+ if (net) {
+ tos_value = net->tos_flowlabel & 0x000000ff;
+ } else {
+ tos_value = inp->ip_inp.inp.inp_ip_tos;
+ }
+ if ((nofragment_flag) && (port == 0)) {
+ ip->ip_off = IP_DF;
+ } else
+ ip->ip_off = 0;
+
+ /* FreeBSD has a function for ip_id's */
+ ip->ip_id = ip_newid();
+
+ ip->ip_ttl = inp->ip_inp.inp.inp_ip_ttl;
+ ip->ip_len = packet_length;
+ if (stcb) {
+ if ((stcb->asoc.ecn_allowed) && ecn_ok) {
+ /* Enable ECN */
+ ip->ip_tos = ((u_char)(tos_value & 0xfc) | sctp_get_ect(stcb, chk));
+ } else {
+ /* No ECN */
+ ip->ip_tos = (u_char)(tos_value & 0xfc);
+ }
+ } else {
+ /* no association at all */
+ ip->ip_tos = (tos_value & 0xfc);
+ }
+ if (port) {
+ ip->ip_p = IPPROTO_UDP;
+ } else {
+ ip->ip_p = IPPROTO_SCTP;
+ }
+ ip->ip_sum = 0;
+ if (net == NULL) {
+ ro = &iproute;
+ memset(&iproute, 0, sizeof(iproute));
+ memcpy(&ro->ro_dst, to, to->sa_len);
+ } else {
+ ro = (sctp_route_t *) & net->ro;
+ }
+ /* Now the address selection part */
+ ip->ip_dst.s_addr = ((struct sockaddr_in *)to)->sin_addr.s_addr;
+
+ /* call the routine to select the src address */
+ if (net && out_of_asoc_ok == 0) {
+ if (net->ro._s_addr && (net->ro._s_addr->localifa_flags & (SCTP_BEING_DELETED | SCTP_ADDR_IFA_UNUSEABLE))) {
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ net->src_addr_selected = 0;
+ if (ro->ro_rt) {
+ RTFREE(ro->ro_rt);
+ ro->ro_rt = NULL;
+ }
+ }
+ if (net->src_addr_selected == 0) {
+ /* Cache the source address */
+ net->ro._s_addr = sctp_source_address_selection(inp, stcb,
+ ro, net, 0,
+ vrf_id);
+ net->src_addr_selected = 1;
+ }
+ if (net->ro._s_addr == NULL) {
+ /* No route to host */
+ net->src_addr_selected = 0;
+ goto no_route;
+ }
+ ip->ip_src = net->ro._s_addr->address.sin.sin_addr;
+ } else {
+ if (over_addr == NULL) {
+ struct sctp_ifa *_lsrc;
+
+ _lsrc = sctp_source_address_selection(inp, stcb, ro,
+ net,
+ out_of_asoc_ok,
+ vrf_id);
+ if (_lsrc == NULL) {
+ goto no_route;
+ }
+ ip->ip_src = _lsrc->address.sin.sin_addr;
+ sctp_free_ifa(_lsrc);
+ } else {
+ ip->ip_src = over_addr->sin.sin_addr;
+ SCTP_RTALLOC(ro, vrf_id);
+ }
+ }
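+		/*
+		 * For SCTP-over-UDP encapsulation (port != 0) a UDP header is
+		 * inserted between the IP header and the SCTP common header;
+		 * the source port is the configured tunneling port and the
+		 * destination port is the peer's encapsulation port.
+		 */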
+ if (port) {
+ udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
+ udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port));
+ udp->uh_dport = port;
+ udp->uh_ulen = htons(packet_length - sizeof(struct ip));
+ udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP));
+ sctphdr = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr));
+ } else {
+ sctphdr = (struct sctphdr *)((caddr_t)ip + sizeof(struct ip));
+ }
+
+ sctphdr->src_port = src_port;
+ sctphdr->dest_port = dest_port;
+ sctphdr->v_tag = v_tag;
+ sctphdr->checksum = 0;
+
+ /*
+ * If source address selection fails and we find no route
+ * then the ip_output should fail as well with a
+ * NO_ROUTE_TO_HOST type error. We probably should catch
+ * that somewhere and abort the association right away
+ * (assuming this is an INIT being sent).
+ */
+ if ((ro->ro_rt == NULL)) {
+ /*
+ * src addr selection failed to find a route (or
+ * valid source addr), so we can't get there from
+ * here (yet)!
+ */
+ no_route:
+ SCTPDBG(SCTP_DEBUG_OUTPUT1,
+ "%s: dropped packet - no valid source addr\n",
+ __FUNCTION__);
+ if (net) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1,
+ "Destination was ");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT1,
+ &net->ro._l_addr.sa);
+ if (net->dest_state & SCTP_ADDR_CONFIRMED) {
+ if ((net->dest_state & SCTP_ADDR_REACHABLE) && stcb) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "no route takes interface %p down\n", net);
+ sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN,
+ stcb,
+ SCTP_FAILED_THRESHOLD,
+ (void *)net,
+ so_locked);
+ net->dest_state &= ~SCTP_ADDR_REACHABLE;
+ net->dest_state |= SCTP_ADDR_NOT_REACHABLE;
+ /*
+ * JRS 5/14/07 - If a
+ * destination is
+ * unreachable, the PF bit
+ * is turned off. This
+ * allows an unambiguous use
+ * of the PF bit for
+ * destinations that are
+ * reachable but potentially
+ * failed. If the
+ * destination is set to the
+ * unreachable state, also
+ * set the destination to
+ * the PF state.
+ */
+ /*
+ * Add debug message here if
+ * destination is not in PF
+ * state.
+ */
+ /*
+ * Stop any running T3
+ * timers here?
+ */
+ if ((stcb->asoc.sctp_cmt_on_off == 1) &&
+ (stcb->asoc.sctp_cmt_pf > 0)) {
+ net->dest_state &= ~SCTP_ADDR_PF;
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Destination %p moved from PF to unreachable.\n",
+ net);
+ }
+ }
+ }
+ if (stcb) {
+ if (net == stcb->asoc.primary_destination) {
+ /* need a new primary */
+ struct sctp_nets *alt;
+
+ alt = sctp_find_alternate_net(stcb, net, 0);
+ if (alt != net) {
+ if (sctp_set_primary_addr(stcb,
+ (struct sockaddr *)NULL,
+ alt) == 0) {
+ net->dest_state |= SCTP_ADDR_WAS_PRIMARY;
+ if (net->ro._s_addr) {
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ }
+ net->src_addr_selected = 0;
+ }
+ }
+ }
+ }
+ }
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH);
+ sctp_m_freem(m);
+ return (EHOSTUNREACH);
+ }
+ if (ro != &iproute) {
+ memcpy(&iproute, ro, sizeof(*ro));
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Calling ipv4 output routine from low level src addr:%x\n",
+ (uint32_t) (ntohl(ip->ip_src.s_addr)));
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Destination is %x\n",
+ (uint32_t) (ntohl(ip->ip_dst.s_addr)));
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "RTP route is %p through\n",
+ ro->ro_rt);
+
+ if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) {
+ /* failed to prepend data, give up */
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ sctp_m_freem(m);
+ return (ENOMEM);
+ }
+#ifdef SCTP_PACKET_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING)
+ sctp_packet_log(m, packet_length);
+#endif
+ SCTP_ATTACH_CHAIN(o_pak, m, packet_length);
+ if (port) {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ if (!(SCTP_BASE_SYSCTL(sctp_no_csum_on_loopback) &&
+ (stcb) &&
+ (stcb->asoc.loopback_scope))) {
+ sctphdr->checksum = sctp_calculate_cksum(m, sizeof(struct ip) + sizeof(struct udphdr));
+ SCTP_STAT_INCR(sctps_sendswcrc);
+ } else {
+ SCTP_STAT_INCR(sctps_sendnocrc);
+ }
+#endif
+ SCTP_ENABLE_UDP_CSUM(o_pak);
+ } else {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ m->m_pkthdr.csum_flags = CSUM_SCTP;
+ m->m_pkthdr.csum_data = 0;
+ SCTP_STAT_INCR(sctps_sendhwcrc);
+#endif
+ }
+ /* send it out. table id is taken from stcb */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) {
+ so = SCTP_INP_SO(inp);
+ SCTP_SOCKET_UNLOCK(so, 0);
+ }
+#endif
+ SCTP_IP_OUTPUT(ret, o_pak, ro, stcb, vrf_id);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 0);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ }
+#endif
+ SCTP_STAT_INCR(sctps_sendpackets);
+ SCTP_STAT_INCR_COUNTER64(sctps_outpackets);
+ if (ret)
+ SCTP_STAT_INCR(sctps_senderrors);
+
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "IP output returns %d\n", ret);
+ if (net == NULL) {
+		/* free temporary routes */
+ if (ro->ro_rt) {
+ RTFREE(ro->ro_rt);
+ ro->ro_rt = NULL;
+ }
+ } else {
+ /* PMTU check versus smallest asoc MTU goes here */
+ if ((ro->ro_rt != NULL) &&
+ (net->ro._s_addr)) {
+ uint32_t mtu;
+
+ mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_rt);
+ if (net->port) {
+ mtu -= sizeof(struct udphdr);
+ }
+ if (mtu && (stcb->asoc.smallest_mtu > mtu)) {
+ sctp_mtu_size_reset(inp, &stcb->asoc, mtu);
+ net->mtu = mtu;
+ }
+ } else if (ro->ro_rt == NULL) {
+ /* route was freed */
+ if (net->ro._s_addr &&
+ net->src_addr_selected) {
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ }
+ net->src_addr_selected = 0;
+ }
+ }
+ return (ret);
+ }
+#ifdef INET6
+ else if (to->sa_family == AF_INET6) {
+ uint32_t flowlabel;
+ struct ip6_hdr *ip6h;
+ struct route_in6 ip6route;
+ struct ifnet *ifp;
+ u_char flowTop;
+ uint16_t flowBottom;
+ u_char tosBottom, tosTop;
+ struct sockaddr_in6 *sin6, tmp, *lsa6, lsa6_tmp;
+ int prev_scope = 0;
+ struct sockaddr_in6 lsa6_storage;
+ int error;
+ u_short prev_port = 0;
+ int len;
+
+ if (net != NULL) {
+ flowlabel = net->tos_flowlabel;
+ } else {
+ flowlabel = ((struct in6pcb *)inp)->in6p_flowinfo;
+ }
+
+ len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr);
+ if (port) {
+ len += sizeof(struct udphdr);
+ }
+ newm = sctp_get_mbuf_for_msg(len, 1, M_DONTWAIT, 1, MT_DATA);
+ if (newm == NULL) {
+ sctp_m_freem(m);
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ SCTP_ALIGN_TO_END(newm, len);
+ SCTP_BUF_LEN(newm) = len;
+ SCTP_BUF_NEXT(newm) = m;
+ m = newm;
+ packet_length = sctp_calculate_len(m);
+
+ ip6h = mtod(m, struct ip6_hdr *);
+ /*
+ * We assume here that inp_flow is in host byte order within
+ * the TCB!
+ */
+ flowBottom = flowlabel & 0x0000ffff;
+ flowTop = ((flowlabel & 0x000f0000) >> 16);
+ tosTop = (((flowlabel & 0xf0) >> 4) | IPV6_VERSION);
+ /* protect *sin6 from overwrite */
+ sin6 = (struct sockaddr_in6 *)to;
+ tmp = *sin6;
+ sin6 = &tmp;
+
+ /* KAME hack: embed scopeid */
+ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) {
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ return (EINVAL);
+ }
+ if (net == NULL) {
+ memset(&ip6route, 0, sizeof(ip6route));
+ ro = (sctp_route_t *) & ip6route;
+ memcpy(&ro->ro_dst, sin6, sin6->sin6_len);
+ } else {
+ ro = (sctp_route_t *) & net->ro;
+ }
+ if (stcb != NULL) {
+ if ((stcb->asoc.ecn_allowed) && ecn_ok) {
+ /* Enable ECN */
+ tosBottom = (((((struct in6pcb *)inp)->in6p_flowinfo & 0x0c) | sctp_get_ect(stcb, chk)) << 4);
+ } else {
+ /* No ECN */
+ tosBottom = ((((struct in6pcb *)inp)->in6p_flowinfo & 0x0c) << 4);
+ }
+ } else {
+			/* we could get no asoc if it is an O-O-T-B packet */
+ tosBottom = ((((struct in6pcb *)inp)->in6p_flowinfo & 0x0c) << 4);
+ }
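+		/*-
+		 * ip6_flow is assembled below (before htonl) roughly as:
+		 *   bits 31-24: tosTop (IPv6 version nibble and upper TC bits)
+		 *   bits 23-16: tosBottom | flowTop (lower TC/ECN bits and the
+		 *               top 4 bits of the flow label)
+		 *   bits 15-0 : flowBottom (low 16 bits of the flow label)
+		 */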
+ ip6h->ip6_flow = htonl(((tosTop << 24) | ((tosBottom | flowTop) << 16) | flowBottom));
+ if (port) {
+ ip6h->ip6_nxt = IPPROTO_UDP;
+ } else {
+ ip6h->ip6_nxt = IPPROTO_SCTP;
+ }
+ ip6h->ip6_plen = (packet_length - sizeof(struct ip6_hdr));
+ ip6h->ip6_dst = sin6->sin6_addr;
+
+ /*
+		 * Add SRC address selection here: we can only reuse the KAME
+		 * src-addr-sel to a limited degree, since we can try their
+		 * selection but it may not be bound.
+ */
+ bzero(&lsa6_tmp, sizeof(lsa6_tmp));
+ lsa6_tmp.sin6_family = AF_INET6;
+ lsa6_tmp.sin6_len = sizeof(lsa6_tmp);
+ lsa6 = &lsa6_tmp;
+ if (net && out_of_asoc_ok == 0) {
+ if (net->ro._s_addr && (net->ro._s_addr->localifa_flags & (SCTP_BEING_DELETED | SCTP_ADDR_IFA_UNUSEABLE))) {
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ net->src_addr_selected = 0;
+ if (ro->ro_rt) {
+ RTFREE(ro->ro_rt);
+ ro->ro_rt = NULL;
+ }
+ }
+ if (net->src_addr_selected == 0) {
+ sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
+ /* KAME hack: embed scopeid */
+ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) {
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ return (EINVAL);
+ }
+ /* Cache the source address */
+ net->ro._s_addr = sctp_source_address_selection(inp,
+ stcb,
+ ro,
+ net,
+ 0,
+ vrf_id);
+ (void)sa6_recoverscope(sin6);
+ net->src_addr_selected = 1;
+ }
+ if (net->ro._s_addr == NULL) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "V6:No route to host\n");
+ net->src_addr_selected = 0;
+ goto no_route;
+ }
+ lsa6->sin6_addr = net->ro._s_addr->address.sin6.sin6_addr;
+ } else {
+ sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
+ /* KAME hack: embed scopeid */
+ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) {
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ return (EINVAL);
+ }
+ if (over_addr == NULL) {
+ struct sctp_ifa *_lsrc;
+
+ _lsrc = sctp_source_address_selection(inp, stcb, ro,
+ net,
+ out_of_asoc_ok,
+ vrf_id);
+ if (_lsrc == NULL) {
+ goto no_route;
+ }
+ lsa6->sin6_addr = _lsrc->address.sin6.sin6_addr;
+ sctp_free_ifa(_lsrc);
+ } else {
+ lsa6->sin6_addr = over_addr->sin6.sin6_addr;
+ SCTP_RTALLOC(ro, vrf_id);
+ }
+ (void)sa6_recoverscope(sin6);
+ }
+ lsa6->sin6_port = inp->sctp_lport;
+
+ if (ro->ro_rt == NULL) {
+ /*
+ * src addr selection failed to find a route (or
+ * valid source addr), so we can't get there from
+ * here!
+ */
+ goto no_route;
+ }
+ /*
+ * XXX: sa6 may not have a valid sin6_scope_id in the
+ * non-SCOPEDROUTING case.
+ */
+ bzero(&lsa6_storage, sizeof(lsa6_storage));
+ lsa6_storage.sin6_family = AF_INET6;
+ lsa6_storage.sin6_len = sizeof(lsa6_storage);
+ lsa6_storage.sin6_addr = lsa6->sin6_addr;
+ if ((error = sa6_recoverscope(&lsa6_storage)) != 0) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "recover scope fails error %d\n", error);
+ sctp_m_freem(m);
+ return (error);
+ }
+ /* XXX */
+ lsa6_storage.sin6_addr = lsa6->sin6_addr;
+ lsa6_storage.sin6_port = inp->sctp_lport;
+ lsa6 = &lsa6_storage;
+ ip6h->ip6_src = lsa6->sin6_addr;
+
+ if (port) {
+ udp = (struct udphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr));
+ udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port));
+ udp->uh_dport = port;
+ udp->uh_ulen = htons(packet_length - sizeof(struct ip6_hdr));
+ udp->uh_sum = 0;
+ sctphdr = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr));
+ } else {
+ sctphdr = (struct sctphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr));
+ }
+
+ sctphdr->src_port = src_port;
+ sctphdr->dest_port = dest_port;
+ sctphdr->v_tag = v_tag;
+ sctphdr->checksum = 0;
+
+ /*
+ * We set the hop limit now since there is a good chance
+ * that our ro pointer is now filled
+ */
+ ip6h->ip6_hlim = SCTP_GET_HLIM(inp, ro);
+ ifp = SCTP_GET_IFN_VOID_FROM_ROUTE(ro);
+
+#ifdef SCTP_DEBUG
+ /* Copy to be sure something bad is not happening */
+ sin6->sin6_addr = ip6h->ip6_dst;
+ lsa6->sin6_addr = ip6h->ip6_src;
+#endif
+
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Calling ipv6 output routine from low level\n");
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "src: ");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, (struct sockaddr *)lsa6);
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "dst: ");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, (struct sockaddr *)sin6);
+ if (net) {
+ sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
+ /* preserve the port and scope for link local send */
+ prev_scope = sin6->sin6_scope_id;
+ prev_port = sin6->sin6_port;
+ }
+ if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) {
+ /* failed to prepend data, give up */
+ sctp_m_freem(m);
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+#ifdef SCTP_PACKET_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING)
+ sctp_packet_log(m, packet_length);
+#endif
+ SCTP_ATTACH_CHAIN(o_pak, m, packet_length);
+ if (port) {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ if (!(SCTP_BASE_SYSCTL(sctp_no_csum_on_loopback) &&
+ (stcb) &&
+ (stcb->asoc.loopback_scope))) {
+ sctphdr->checksum = sctp_calculate_cksum(m, sizeof(struct ip6_hdr) + sizeof(struct udphdr));
+ SCTP_STAT_INCR(sctps_sendswcrc);
+ } else {
+ SCTP_STAT_INCR(sctps_sendnocrc);
+ }
+#endif
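+			/*
+			 * A UDP checksum of 0 is not allowed over IPv6, so a
+			 * computed value of 0 is transmitted as 0xffff.
+			 */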
+ if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), packet_length - sizeof(struct ip6_hdr))) == 0) {
+ udp->uh_sum = 0xffff;
+ }
+ } else {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ m->m_pkthdr.csum_flags = CSUM_SCTP;
+ m->m_pkthdr.csum_data = 0;
+ SCTP_STAT_INCR(sctps_sendhwcrc);
+#endif
+ }
+ /* send it out. table id is taken from stcb */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) {
+ so = SCTP_INP_SO(inp);
+ SCTP_SOCKET_UNLOCK(so, 0);
+ }
+#endif
+ SCTP_IP6_OUTPUT(ret, o_pak, (struct route_in6 *)ro, &ifp, stcb, vrf_id);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 0);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ }
+#endif
+ if (net) {
+ /* for link local this must be done */
+ sin6->sin6_scope_id = prev_scope;
+ sin6->sin6_port = prev_port;
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret);
+ SCTP_STAT_INCR(sctps_sendpackets);
+ SCTP_STAT_INCR_COUNTER64(sctps_outpackets);
+ if (ret) {
+ SCTP_STAT_INCR(sctps_senderrors);
+ }
+ if (net == NULL) {
+ /* Now if we had a temp route free it */
+ if (ro->ro_rt) {
+ RTFREE(ro->ro_rt);
+ }
+ } else {
+ /* PMTU check versus smallest asoc MTU goes here */
+ if (ro->ro_rt == NULL) {
+ /* Route was freed */
+ if (net->ro._s_addr &&
+ net->src_addr_selected) {
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ }
+ net->src_addr_selected = 0;
+ }
+ if ((ro->ro_rt != NULL) &&
+ (net->ro._s_addr)) {
+ uint32_t mtu;
+
+ mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_rt);
+ if (mtu &&
+ (stcb->asoc.smallest_mtu > mtu)) {
+ sctp_mtu_size_reset(inp, &stcb->asoc, mtu);
+ net->mtu = mtu;
+ if (net->port) {
+ net->mtu -= sizeof(struct udphdr);
+ }
+ }
+ } else if (ifp) {
+ if (ND_IFINFO(ifp)->linkmtu &&
+ (stcb->asoc.smallest_mtu > ND_IFINFO(ifp)->linkmtu)) {
+ sctp_mtu_size_reset(inp,
+ &stcb->asoc,
+ ND_IFINFO(ifp)->linkmtu);
+ }
+ }
+ }
+ return (ret);
+ }
+#endif
+ else {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Unknown protocol (TSNH) type %d\n",
+ ((struct sockaddr *)to)->sa_family);
+ sctp_m_freem(m);
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT);
+ return (EFAULT);
+ }
+}
+
+
+void
+sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ struct mbuf *m, *m_at, *mp_last;
+ struct sctp_nets *net;
+ struct sctp_init_chunk *init;
+ struct sctp_supported_addr_param *sup_addr;
+ struct sctp_adaptation_layer_indication *ali;
+ struct sctp_ecn_supported_param *ecn;
+ struct sctp_prsctp_supported_param *prsctp;
+ struct sctp_ecn_nonce_supported_param *ecn_nonce;
+ struct sctp_supported_chunk_types_param *pr_supported;
+ int cnt_inits_to = 0;
+ int padval, ret;
+ int num_ext;
+ int p_len;
+
+	/* INITs always go to the primary (and usually the ONLY) address */
+ mp_last = NULL;
+ net = stcb->asoc.primary_destination;
+ if (net == NULL) {
+ net = TAILQ_FIRST(&stcb->asoc.nets);
+ if (net == NULL) {
+ /* TSNH */
+ return;
+ }
+ /* we confirm any address we send an INIT to */
+ net->dest_state &= ~SCTP_ADDR_UNCONFIRMED;
+ (void)sctp_set_primary_addr(stcb, NULL, net);
+ } else {
+ /* we confirm any address we send an INIT to */
+ net->dest_state &= ~SCTP_ADDR_UNCONFIRMED;
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT\n");
+#ifdef INET6
+ if (((struct sockaddr *)&(net->ro._l_addr))->sa_family == AF_INET6) {
+ /*
+		 * special hook: if we are sending to a link-local address it
+		 * will not show up in our private address count.
+ */
+ struct sockaddr_in6 *sin6l;
+
+ sin6l = &net->ro._l_addr.sin6;
+ if (IN6_IS_ADDR_LINKLOCAL(&sin6l->sin6_addr))
+ cnt_inits_to = 1;
+ }
+#endif
+ if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
+ /* This case should not happen */
+ SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - failed timer?\n");
+ return;
+ }
+ /* start the INIT timer */
+ sctp_timer_start(SCTP_TIMER_TYPE_INIT, inp, stcb, net);
+
+ m = sctp_get_mbuf_for_msg(MCLBYTES, 1, M_DONTWAIT, 1, MT_DATA);
+ if (m == NULL) {
+ /* No memory, INIT timer will re-attempt. */
+ SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - mbuf?\n");
+ return;
+ }
+ SCTP_BUF_LEN(m) = sizeof(struct sctp_init_chunk);
+ /*
+ * assume peer supports asconf in order to be able to queue local
+ * address changes while an INIT is in flight and before the assoc
+ * is established.
+ */
+ stcb->asoc.peer_supports_asconf = 1;
+ /* Now lets put the SCTP header in place */
+ init = mtod(m, struct sctp_init_chunk *);
+ /* now the chunk header */
+ init->ch.chunk_type = SCTP_INITIATION;
+ init->ch.chunk_flags = 0;
+ /* fill in later from mbuf we build */
+ init->ch.chunk_length = 0;
+ /* place in my tag */
+ init->init.initiate_tag = htonl(stcb->asoc.my_vtag);
+ /* set up some of the credits. */
+ init->init.a_rwnd = htonl(max(inp->sctp_socket ? SCTP_SB_LIMIT_RCV(inp->sctp_socket) : 0,
+ SCTP_MINIMAL_RWND));
+
+ init->init.num_outbound_streams = htons(stcb->asoc.pre_open_streams);
+ init->init.num_inbound_streams = htons(stcb->asoc.max_inbound_streams);
+ init->init.initial_tsn = htonl(stcb->asoc.init_seq_number);
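+	/*-
+	 * The optional parameters appended below, in order: supported
+	 * address types, adaptation layer indication, NAT-friendly
+	 * indication (sysctl controlled), cookie preservative (if
+	 * requested), ECN capable, PR-SCTP supported, supported chunk
+	 * extensions, ECN nonce supported (sysctl controlled), the AUTH
+	 * random/HMAC/chunk-list parameters, and finally the local
+	 * addresses.
+	 */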
+ /* now the address restriction */
+ sup_addr = (struct sctp_supported_addr_param *)((caddr_t)init +
+ sizeof(*init));
+ sup_addr->ph.param_type = htons(SCTP_SUPPORTED_ADDRTYPE);
+#ifdef INET6
+ /* we support 2 types: IPv6/IPv4 */
+ sup_addr->ph.param_length = htons(sizeof(*sup_addr) + sizeof(uint16_t));
+ sup_addr->addr_type[0] = htons(SCTP_IPV4_ADDRESS);
+ sup_addr->addr_type[1] = htons(SCTP_IPV6_ADDRESS);
+#else
+ /* we support 1 type: IPv4 */
+ sup_addr->ph.param_length = htons(sizeof(*sup_addr) + sizeof(uint8_t));
+ sup_addr->addr_type[0] = htons(SCTP_IPV4_ADDRESS);
+ sup_addr->addr_type[1] = htons(0); /* this is the padding */
+#endif
+ SCTP_BUF_LEN(m) += sizeof(*sup_addr) + sizeof(uint16_t);
+ /* adaptation layer indication parameter */
+ ali = (struct sctp_adaptation_layer_indication *)((caddr_t)sup_addr + sizeof(*sup_addr) + sizeof(uint16_t));
+ ali->ph.param_type = htons(SCTP_ULP_ADAPTATION);
+ ali->ph.param_length = htons(sizeof(*ali));
+ ali->indication = ntohl(inp->sctp_ep.adaptation_layer_indicator);
+ SCTP_BUF_LEN(m) += sizeof(*ali);
+ ecn = (struct sctp_ecn_supported_param *)((caddr_t)ali + sizeof(*ali));
+
+ if (SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly)) {
+ /* Add NAT friendly parameter */
+ struct sctp_paramhdr *ph;
+
+ ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
+ ph->param_type = htons(SCTP_HAS_NAT_SUPPORT);
+ ph->param_length = htons(sizeof(struct sctp_paramhdr));
+ SCTP_BUF_LEN(m) += sizeof(struct sctp_paramhdr);
+ ecn = (struct sctp_ecn_supported_param *)((caddr_t)ph + sizeof(*ph));
+ }
+ /* now any cookie time extensions */
+ if (stcb->asoc.cookie_preserve_req) {
+ struct sctp_cookie_perserve_param *cookie_preserve;
+
+ cookie_preserve = (struct sctp_cookie_perserve_param *)(ecn);
+ cookie_preserve->ph.param_type = htons(SCTP_COOKIE_PRESERVE);
+ cookie_preserve->ph.param_length = htons(
+ sizeof(*cookie_preserve));
+ cookie_preserve->time = htonl(stcb->asoc.cookie_preserve_req);
+ SCTP_BUF_LEN(m) += sizeof(*cookie_preserve);
+ ecn = (struct sctp_ecn_supported_param *)(
+ (caddr_t)cookie_preserve + sizeof(*cookie_preserve));
+ stcb->asoc.cookie_preserve_req = 0;
+ }
+ /* ECN parameter */
+ if (SCTP_BASE_SYSCTL(sctp_ecn_enable) == 1) {
+ ecn->ph.param_type = htons(SCTP_ECN_CAPABLE);
+ ecn->ph.param_length = htons(sizeof(*ecn));
+ SCTP_BUF_LEN(m) += sizeof(*ecn);
+ prsctp = (struct sctp_prsctp_supported_param *)((caddr_t)ecn +
+ sizeof(*ecn));
+ } else {
+ prsctp = (struct sctp_prsctp_supported_param *)((caddr_t)ecn);
+ }
+ /* And now tell the peer we do pr-sctp */
+ prsctp->ph.param_type = htons(SCTP_PRSCTP_SUPPORTED);
+ prsctp->ph.param_length = htons(sizeof(*prsctp));
+ SCTP_BUF_LEN(m) += sizeof(*prsctp);
+
+ /* And now tell the peer we do all the extensions */
+ pr_supported = (struct sctp_supported_chunk_types_param *)
+ ((caddr_t)prsctp + sizeof(*prsctp));
+ pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT);
+ num_ext = 0;
+ pr_supported->chunk_types[num_ext++] = SCTP_ASCONF;
+ pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK;
+ pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN;
+ pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED;
+ pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET;
+ if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) {
+ pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION;
+ }
+ if (stcb->asoc.sctp_nr_sack_on_off == 1) {
+ pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK;
+ }
+ p_len = sizeof(*pr_supported) + num_ext;
+ pr_supported->ph.param_length = htons(p_len);
+ bzero((caddr_t)pr_supported + p_len, SCTP_SIZE32(p_len) - p_len);
+ SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len);
+
+
+ /* ECN nonce: And now tell the peer we support ECN nonce */
+ if (SCTP_BASE_SYSCTL(sctp_ecn_nonce)) {
+ ecn_nonce = (struct sctp_ecn_nonce_supported_param *)
+ ((caddr_t)pr_supported + SCTP_SIZE32(p_len));
+ ecn_nonce->ph.param_type = htons(SCTP_ECN_NONCE_SUPPORTED);
+ ecn_nonce->ph.param_length = htons(sizeof(*ecn_nonce));
+ SCTP_BUF_LEN(m) += sizeof(*ecn_nonce);
+ }
+ /* add authentication parameters */
+ if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) {
+ struct sctp_auth_random *randp;
+ struct sctp_auth_hmac_algo *hmacs;
+ struct sctp_auth_chunk_list *chunks;
+
+ /* attach RANDOM parameter, if available */
+ if (stcb->asoc.authinfo.random != NULL) {
+ randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
+ p_len = sizeof(*randp) + stcb->asoc.authinfo.random_len;
+ /* random key already contains the header */
+ bcopy(stcb->asoc.authinfo.random->key, randp, p_len);
+ /* zero out any padding required */
+ bzero((caddr_t)randp + p_len, SCTP_SIZE32(p_len) - p_len);
+ SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len);
+ }
+ /* add HMAC_ALGO parameter */
+ hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
+ p_len = sctp_serialize_hmaclist(stcb->asoc.local_hmacs,
+ (uint8_t *) hmacs->hmac_ids);
+ if (p_len > 0) {
+ p_len += sizeof(*hmacs);
+ hmacs->ph.param_type = htons(SCTP_HMAC_LIST);
+ hmacs->ph.param_length = htons(p_len);
+ /* zero out any padding required */
+ bzero((caddr_t)hmacs + p_len, SCTP_SIZE32(p_len) - p_len);
+ SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len);
+ }
+ /* add CHUNKS parameter */
+ chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
+ p_len = sctp_serialize_auth_chunks(stcb->asoc.local_auth_chunks,
+ chunks->chunk_types);
+ if (p_len > 0) {
+ p_len += sizeof(*chunks);
+ chunks->ph.param_type = htons(SCTP_CHUNK_LIST);
+ chunks->ph.param_length = htons(p_len);
+ /* zero out any padding required */
+ bzero((caddr_t)chunks + p_len, SCTP_SIZE32(p_len) - p_len);
+ SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len);
+ }
+ }
+ m_at = m;
+ /* now the addresses */
+ {
+ struct sctp_scoping scp;
+
+ /*
+		 * To optimize this we could put the scoping stuff into a
+		 * structure and remove the individual uint8_t's from the
+		 * assoc structure. Then we could just pass in the address
+		 * within the stcb, but for now this is a quick hack to get
+		 * the address stuff teased apart.
+ */
+ scp.ipv4_addr_legal = stcb->asoc.ipv4_addr_legal;
+ scp.ipv6_addr_legal = stcb->asoc.ipv6_addr_legal;
+ scp.loopback_scope = stcb->asoc.loopback_scope;
+ scp.ipv4_local_scope = stcb->asoc.ipv4_local_scope;
+ scp.local_scope = stcb->asoc.local_scope;
+ scp.site_scope = stcb->asoc.site_scope;
+
+ m_at = sctp_add_addresses_to_i_ia(inp, &scp, m_at, cnt_inits_to);
+ }
+
+	/* calculate the size and update pkt header and chunk header */
+ p_len = 0;
+ for (m_at = m; m_at; m_at = SCTP_BUF_NEXT(m_at)) {
+ if (SCTP_BUF_NEXT(m_at) == NULL)
+ mp_last = m_at;
+ p_len += SCTP_BUF_LEN(m_at);
+ }
+ init->ch.chunk_length = htons(p_len);
+ /*
+	 * We pass 0 here to NOT set IP_DF if it is IPv4; we ignore the return
+	 * here since the timer will drive a retransmission.
+ */
+
+ /* I don't expect this to execute but we will be safe here */
+ padval = p_len % 4;
+ if ((padval) && (mp_last)) {
+ /*
+ * The compiler worries that mp_last may not be set even
+ * though I think it is impossible :-> however we add
+ * mp_last here just in case.
+ */
+ ret = sctp_add_pad_tombuf(mp_last, (4 - padval));
+ if (ret) {
+ /* Houston we have a problem, no space */
+ sctp_m_freem(m);
+ return;
+ }
+ p_len += padval;
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - calls lowlevel_output\n");
+ ret = sctp_lowlevel_chunk_output(inp, stcb, net,
+ (struct sockaddr *)&net->ro._l_addr,
+ m, 0, NULL, 0, 0, 0, NULL, 0,
+ inp->sctp_lport, stcb->rport, htonl(0),
+ net->port, so_locked, NULL);
+ SCTPDBG(SCTP_DEBUG_OUTPUT4, "lowlevel_output - %d\n", ret);
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
+}
+
+struct mbuf *
+sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt,
+ int param_offset, int *abort_processing, struct sctp_chunkhdr *cp, int *nat_friendly)
+{
+ /*
+	 * Given an mbuf containing an INIT or INIT-ACK, with param_offset
+	 * equal to the beginning of the params (i.e. iphlen +
+	 * sizeof(struct sctp_init_msg)), parse through the parameters to the
+	 * end of the mbuf, verifying that all parameters are known.
+	 *
+	 * For unknown parameters build and return an mbuf with
+	 * UNRECOGNIZED_PARAMETER errors. If the flags indicate to stop
+	 * processing this chunk, stop, and set *abort_processing to 1.
+ *
+ * By having param_offset be pre-set to where parameters begin it is
+ * hoped that this routine may be reused in the future by new
+ * features.
+ */
+ struct sctp_paramhdr *phdr, params;
+
+ struct mbuf *mat, *op_err;
+ char tempbuf[SCTP_PARAM_BUFFER_SIZE];
+ int at, limit, pad_needed;
+ uint16_t ptype, plen, padded_size;
+ int err_at;
+
+ *abort_processing = 0;
+ mat = in_initpkt;
+ err_at = 0;
+ limit = ntohs(cp->chunk_length) - sizeof(struct sctp_init_chunk);
+ at = param_offset;
+ op_err = NULL;
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Check for unrecognized param's\n");
+ phdr = sctp_get_next_param(mat, at, &params, sizeof(params));
+ while ((phdr != NULL) && ((size_t)limit >= sizeof(struct sctp_paramhdr))) {
+ ptype = ntohs(phdr->param_type);
+ plen = ntohs(phdr->param_length);
+ if ((plen > limit) || (plen < sizeof(struct sctp_paramhdr))) {
+			/* whacked parameter */
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error %d\n", plen);
+ goto invalid_size;
+ }
+ limit -= SCTP_SIZE32(plen);
+ /*-
+		 * All parameters for all chunks that we know/understand are
+		 * listed here. We process them in other places and take
+		 * appropriate stop actions per the upper bits. However this
+		 * is the generic routine that processors can call to get back
+		 * an operr, to either incorporate (init-ack) or send.
+ */
+ padded_size = SCTP_SIZE32(plen);
+ switch (ptype) {
+ /* Param's with variable size */
+ case SCTP_HEARTBEAT_INFO:
+ case SCTP_STATE_COOKIE:
+ case SCTP_UNRECOG_PARAM:
+ case SCTP_ERROR_CAUSE_IND:
+ /* ok skip fwd */
+ at += padded_size;
+ break;
+ /* Param's with variable size within a range */
+ case SCTP_CHUNK_LIST:
+ case SCTP_SUPPORTED_CHUNK_EXT:
+ if (padded_size > (sizeof(struct sctp_supported_chunk_types_param) + (sizeof(uint8_t) * SCTP_MAX_SUPPORTED_EXT))) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error chklist %d\n", plen);
+ goto invalid_size;
+ }
+ at += padded_size;
+ break;
+ case SCTP_SUPPORTED_ADDRTYPE:
+ if (padded_size > SCTP_MAX_ADDR_PARAMS_SIZE) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error supaddrtype %d\n", plen);
+ goto invalid_size;
+ }
+ at += padded_size;
+ break;
+ case SCTP_RANDOM:
+ if (padded_size > (sizeof(struct sctp_auth_random) + SCTP_RANDOM_MAX_SIZE)) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error random %d\n", plen);
+ goto invalid_size;
+ }
+ at += padded_size;
+ break;
+ case SCTP_SET_PRIM_ADDR:
+ case SCTP_DEL_IP_ADDRESS:
+ case SCTP_ADD_IP_ADDRESS:
+ if ((padded_size != sizeof(struct sctp_asconf_addrv4_param)) &&
+ (padded_size != sizeof(struct sctp_asconf_addr_param))) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error setprim %d\n", plen);
+ goto invalid_size;
+ }
+ at += padded_size;
+ break;
+ /* Param's with a fixed size */
+ case SCTP_IPV4_ADDRESS:
+ if (padded_size != sizeof(struct sctp_ipv4addr_param)) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ipv4 addr %d\n", plen);
+ goto invalid_size;
+ }
+ at += padded_size;
+ break;
+ case SCTP_IPV6_ADDRESS:
+ if (padded_size != sizeof(struct sctp_ipv6addr_param)) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ipv6 addr %d\n", plen);
+ goto invalid_size;
+ }
+ at += padded_size;
+ break;
+ case SCTP_COOKIE_PRESERVE:
+ if (padded_size != sizeof(struct sctp_cookie_perserve_param)) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error cookie-preserve %d\n", plen);
+ goto invalid_size;
+ }
+ at += padded_size;
+ break;
+ case SCTP_HAS_NAT_SUPPORT:
+ *nat_friendly = 1;
+ /* fall through */
+ case SCTP_ECN_NONCE_SUPPORTED:
+ case SCTP_PRSCTP_SUPPORTED:
+
+ if (padded_size != sizeof(struct sctp_paramhdr)) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ecnnonce/prsctp/nat support %d\n", plen);
+ goto invalid_size;
+ }
+ at += padded_size;
+ break;
+ case SCTP_ECN_CAPABLE:
+ if (padded_size != sizeof(struct sctp_ecn_supported_param)) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ecn %d\n", plen);
+ goto invalid_size;
+ }
+ at += padded_size;
+ break;
+ case SCTP_ULP_ADAPTATION:
+ if (padded_size != sizeof(struct sctp_adaptation_layer_indication)) {
+				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error adaptation %d\n", plen);
+ goto invalid_size;
+ }
+ at += padded_size;
+ break;
+ case SCTP_SUCCESS_REPORT:
+ if (padded_size != sizeof(struct sctp_asconf_paramhdr)) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error success %d\n", plen);
+ goto invalid_size;
+ }
+ at += padded_size;
+ break;
+ case SCTP_HOSTNAME_ADDRESS:
+ {
+ /* We can NOT handle HOST NAME addresses!! */
+ int l_len;
+
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Can't handle hostname addresses.. abort processing\n");
+ *abort_processing = 1;
+ if (op_err == NULL) {
+				/* Ok need to try to get an mbuf */
+#ifdef INET6
+ l_len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
+#else
+ l_len = sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
+#endif
+ l_len += plen;
+ l_len += sizeof(struct sctp_paramhdr);
+ op_err = sctp_get_mbuf_for_msg(l_len, 0, M_DONTWAIT, 1, MT_DATA);
+ if (op_err) {
+ SCTP_BUF_LEN(op_err) = 0;
+ /*
+ * pre-reserve space for ip
+ * and sctp header and
+ * chunk hdr
+ */
+#ifdef INET6
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr));
+#else
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct ip));
+#endif
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr));
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr));
+ }
+ }
+ if (op_err) {
+ /* If we have space */
+ struct sctp_paramhdr s;
+
+ if (err_at % 4) {
+ uint32_t cpthis = 0;
+
+ pad_needed = 4 - (err_at % 4);
+ m_copyback(op_err, err_at, pad_needed, (caddr_t)&cpthis);
+ err_at += pad_needed;
+ }
+ s.param_type = htons(SCTP_CAUSE_UNRESOLVABLE_ADDR);
+ s.param_length = htons(sizeof(s) + plen);
+ m_copyback(op_err, err_at, sizeof(s), (caddr_t)&s);
+ err_at += sizeof(s);
+ phdr = sctp_get_next_param(mat, at, (struct sctp_paramhdr *)tempbuf, min(sizeof(tempbuf), plen));
+ if (phdr == NULL) {
+ sctp_m_freem(op_err);
+ /*
+ * we are out of memory but
+ * we still need to have a
+ * look at what to do (the
+ * system is in trouble
+ * though).
+ */
+ return (NULL);
+ }
+ m_copyback(op_err, err_at, plen, (caddr_t)phdr);
+ err_at += plen;
+ }
+ return (op_err);
+ break;
+ }
+ default:
+ /*
+			 * we do not recognize the parameter; figure out
+			 * what we do.
+ */
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Hit default param %x\n", ptype);
+ if ((ptype & 0x4000) == 0x4000) {
+ /* Report bit is set?? */
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "report op err\n");
+ if (op_err == NULL) {
+ int l_len;
+
+ /* Ok need to try to get an mbuf */
+#ifdef INET6
+ l_len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
+#else
+ l_len = sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
+#endif
+ l_len += plen;
+ l_len += sizeof(struct sctp_paramhdr);
+ op_err = sctp_get_mbuf_for_msg(l_len, 0, M_DONTWAIT, 1, MT_DATA);
+ if (op_err) {
+ SCTP_BUF_LEN(op_err) = 0;
+#ifdef INET6
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr));
+#else
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct ip));
+#endif
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr));
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr));
+ }
+ }
+ if (op_err) {
+ /* If we have space */
+ struct sctp_paramhdr s;
+
+ if (err_at % 4) {
+ uint32_t cpthis = 0;
+
+ pad_needed = 4 - (err_at % 4);
+ m_copyback(op_err, err_at, pad_needed, (caddr_t)&cpthis);
+ err_at += pad_needed;
+ }
+ s.param_type = htons(SCTP_UNRECOG_PARAM);
+ s.param_length = htons(sizeof(s) + plen);
+ m_copyback(op_err, err_at, sizeof(s), (caddr_t)&s);
+ err_at += sizeof(s);
+ if (plen > sizeof(tempbuf)) {
+ plen = sizeof(tempbuf);
+ }
+ phdr = sctp_get_next_param(mat, at, (struct sctp_paramhdr *)tempbuf, min(sizeof(tempbuf), plen));
+ if (phdr == NULL) {
+ sctp_m_freem(op_err);
+ /*
+ * we are out of memory but
+ * we still need to have a
+ * look at what to do (the
+ * system is in trouble
+ * though).
+ */
+ op_err = NULL;
+ goto more_processing;
+ }
+ m_copyback(op_err, err_at, plen, (caddr_t)phdr);
+ err_at += plen;
+ }
+ }
+ more_processing:
+ if ((ptype & 0x8000) == 0x0000) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "stop proc\n");
+ return (op_err);
+ } else {
+ /* skip this chunk and continue processing */
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "move on\n");
+ at += SCTP_SIZE32(plen);
+ }
+ break;
+
+ }
+ phdr = sctp_get_next_param(mat, at, &params, sizeof(params));
+ }
+ return (op_err);
+invalid_size:
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "abort flag set\n");
+ *abort_processing = 1;
+ if ((op_err == NULL) && phdr) {
+ int l_len;
+
+#ifdef INET6
+ l_len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
+#else
+ l_len = sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
+#endif
+ l_len += (2 * sizeof(struct sctp_paramhdr));
+ op_err = sctp_get_mbuf_for_msg(l_len, 0, M_DONTWAIT, 1, MT_DATA);
+ if (op_err) {
+ SCTP_BUF_LEN(op_err) = 0;
+#ifdef INET6
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr));
+#else
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct ip));
+#endif
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr));
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr));
+ }
+ }
+ if ((op_err) && phdr) {
+ struct sctp_paramhdr s;
+
+ if (err_at % 4) {
+ uint32_t cpthis = 0;
+
+ pad_needed = 4 - (err_at % 4);
+ m_copyback(op_err, err_at, pad_needed, (caddr_t)&cpthis);
+ err_at += pad_needed;
+ }
+ s.param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ s.param_length = htons(sizeof(s) + sizeof(struct sctp_paramhdr));
+ m_copyback(op_err, err_at, sizeof(s), (caddr_t)&s);
+ err_at += sizeof(s);
+ /* Only copy back the p-hdr that caused the issue */
+ m_copyback(op_err, err_at, sizeof(struct sctp_paramhdr), (caddr_t)phdr);
+ }
+ return (op_err);
+}
+
+static int
+sctp_are_there_new_addresses(struct sctp_association *asoc,
+ struct mbuf *in_initpkt, int iphlen, int offset)
+{
+ /*
+ * Given an INIT packet, look through the packet to verify that there
+ * are NO new addresses. As we go through the parameters add reports
+ * of any un-understood parameters that require an error. Also we
+ * must return (1) to drop the packet if we see an un-understood
+ * parameter that tells us to drop the chunk.
+ */
+ struct sockaddr_in sin4, *sa4;
+
+#ifdef INET6
+ struct sockaddr_in6 sin6, *sa6;
+
+#endif
+ struct sockaddr *sa_touse;
+ struct sockaddr *sa;
+ struct sctp_paramhdr *phdr, params;
+ struct ip *iph;
+
+#ifdef INET6
+ struct ip6_hdr *ip6h;
+
+#endif
+ struct mbuf *mat;
+ uint16_t ptype, plen;
+ int err_at;
+ uint8_t fnd;
+ struct sctp_nets *net;
+
+ memset(&sin4, 0, sizeof(sin4));
+#ifdef INET6
+ memset(&sin6, 0, sizeof(sin6));
+#endif
+ sin4.sin_family = AF_INET;
+ sin4.sin_len = sizeof(sin4);
+#ifdef INET6
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_len = sizeof(sin6);
+#endif
+ sa_touse = NULL;
+ /* First what about the src address of the pkt ? */
+ iph = mtod(in_initpkt, struct ip *);
+ switch (iph->ip_v) {
+ case IPVERSION:
+ /* source addr is IPv4 */
+ sin4.sin_addr = iph->ip_src;
+ sa_touse = (struct sockaddr *)&sin4;
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ /* source addr is IPv6 */
+ ip6h = mtod(in_initpkt, struct ip6_hdr *);
+ sin6.sin6_addr = ip6h->ip6_src;
+ sa_touse = (struct sockaddr *)&sin6;
+ break;
+#endif
+ default:
+ return (1);
+ }
+
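+ /*
+ * Check whether the packet's source address is already one of
+ * the association's known peer addresses.
+ */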
+ fnd = 0;
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ sa = (struct sockaddr *)&net->ro._l_addr;
+ if (sa->sa_family == sa_touse->sa_family) {
+ if (sa->sa_family == AF_INET) {
+ sa4 = (struct sockaddr_in *)sa;
+ if (sa4->sin_addr.s_addr ==
+ sin4.sin_addr.s_addr) {
+ fnd = 1;
+ break;
+ }
+ }
+#ifdef INET6
+ if (sa->sa_family == AF_INET6) {
+ sa6 = (struct sockaddr_in6 *)sa;
+ if (SCTP6_ARE_ADDR_EQUAL(sa6,
+ &sin6)) {
+ fnd = 1;
+ break;
+ }
+ }
+#endif
+ }
+ }
+ if (fnd == 0) {
+ /* New address added! No need to look further. */
+ return (1);
+ }
+ /* Ok so far; let's munge through the rest of the packet */
+ mat = in_initpkt;
+ err_at = 0;
+ sa_touse = NULL;
+ offset += sizeof(struct sctp_init_chunk);
+ phdr = sctp_get_next_param(mat, offset, &params, sizeof(params));
+ while (phdr) {
+ ptype = ntohs(phdr->param_type);
+ plen = ntohs(phdr->param_length);
+ if (ptype == SCTP_IPV4_ADDRESS) {
+ struct sctp_ipv4addr_param *p4, p4_buf;
+
+ phdr = sctp_get_next_param(mat, offset,
+ (struct sctp_paramhdr *)&p4_buf, sizeof(p4_buf));
+ if (plen != sizeof(struct sctp_ipv4addr_param) ||
+ phdr == NULL) {
+ return (1);
+ }
+ p4 = (struct sctp_ipv4addr_param *)phdr;
+ sin4.sin_addr.s_addr = p4->addr;
+ sa_touse = (struct sockaddr *)&sin4;
+ } else if (ptype == SCTP_IPV6_ADDRESS) {
+ struct sctp_ipv6addr_param *p6, p6_buf;
+
+ phdr = sctp_get_next_param(mat, offset,
+ (struct sctp_paramhdr *)&p6_buf, sizeof(p6_buf));
+ if (plen != sizeof(struct sctp_ipv6addr_param) ||
+ phdr == NULL) {
+ return (1);
+ }
+ p6 = (struct sctp_ipv6addr_param *)phdr;
+#ifdef INET6
+ memcpy((caddr_t)&sin6.sin6_addr, p6->addr,
+ sizeof(p6->addr));
+ sa_touse = (struct sockaddr *)&sin6;
+#else
+ /* Without INET6 we cannot check an IPv6 parameter. */
+ sa_touse = NULL;
+#endif
+ }
+ if (sa_touse) {
+ /* ok, sa_touse points to one to check */
+ fnd = 0;
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ sa = (struct sockaddr *)&net->ro._l_addr;
+ if (sa->sa_family != sa_touse->sa_family) {
+ continue;
+ }
+ if (sa->sa_family == AF_INET) {
+ sa4 = (struct sockaddr_in *)sa;
+ if (sa4->sin_addr.s_addr ==
+ sin4.sin_addr.s_addr) {
+ fnd = 1;
+ break;
+ }
+ }
+#ifdef INET6
+ if (sa->sa_family == AF_INET6) {
+ sa6 = (struct sockaddr_in6 *)sa;
+ if (SCTP6_ARE_ADDR_EQUAL(
+ sa6, &sin6)) {
+ fnd = 1;
+ break;
+ }
+ }
+#endif
+ }
+ if (!fnd) {
+ /* New addr added! no need to look further */
+ return (1);
+ }
+ }
+ offset += SCTP_SIZE32(plen);
+ phdr = sctp_get_next_param(mat, offset, &params, sizeof(params));
+ }
+ return (0);
+}
+
+/*
+ * Given an mbuf chain that was sent into us containing an INIT, build an
+ * INIT-ACK with COOKIE and send it back. We assume that the in_initpkt has
+ * been pulled up to include the IPv6/IPv4 header, the SCTP header and the
+ * initial part of the INIT message (i.e. the struct sctp_init_msg).
+ */
+void
+sctp_send_initiate_ack(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct mbuf *init_pkt, int iphlen, int offset, struct sctphdr *sh,
+ struct sctp_init_chunk *init_chk, uint32_t vrf_id, uint16_t port, int hold_inp_lock)
+{
+ struct sctp_association *asoc;
+ struct mbuf *m, *m_at, *m_tmp, *m_cookie, *op_err, *mp_last;
+ struct sctp_init_ack_chunk *initack;
+ struct sctp_adaptation_layer_indication *ali;
+ struct sctp_ecn_supported_param *ecn;
+ struct sctp_prsctp_supported_param *prsctp;
+ struct sctp_ecn_nonce_supported_param *ecn_nonce;
+ struct sctp_supported_chunk_types_param *pr_supported;
+ union sctp_sockstore store, store1, *over_addr;
+ struct sockaddr_in *sin, *to_sin;
+
+#ifdef INET6
+ struct sockaddr_in6 *sin6, *to_sin6;
+
+#endif
+ struct ip *iph;
+
+#ifdef INET6
+ struct ip6_hdr *ip6;
+
+#endif
+ struct sockaddr *to;
+ struct sctp_state_cookie stc;
+ struct sctp_nets *net = NULL;
+ uint8_t *signature = NULL;
+ int cnt_inits_to = 0;
+ uint16_t his_limit, i_want;
+ int abort_flag, padval;
+ int num_ext;
+ int p_len;
+ int nat_friendly = 0;
+ struct socket *so;
+
+ if (stcb)
+ asoc = &stcb->asoc;
+ else
+ asoc = NULL;
+ mp_last = NULL;
+ if ((asoc != NULL) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) &&
+ (sctp_are_there_new_addresses(asoc, init_pkt, iphlen, offset))) {
+ /* new addresses, out of here in non-cookie-wait states */
+ /*
+ * Send an ABORT. We don't add the new-address error cause,
+ * though we even set the T bit and copy in the 0 tag; this
+ * looks no different than if no listener was present.
+ */
+ sctp_send_abort(init_pkt, iphlen, sh, 0, NULL, vrf_id, port);
+ return;
+ }
+ abort_flag = 0;
+ op_err = sctp_arethere_unrecognized_parameters(init_pkt,
+ (offset + sizeof(struct sctp_init_chunk)),
+ &abort_flag, (struct sctp_chunkhdr *)init_chk, &nat_friendly);
+ if (abort_flag) {
+do_a_abort:
+ sctp_send_abort(init_pkt, iphlen, sh,
+ init_chk->init.initiate_tag, op_err, vrf_id, port);
+ return;
+ }
+ m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ if (m == NULL) {
+ /* No memory, INIT timer will re-attempt. */
+ if (op_err)
+ sctp_m_freem(op_err);
+ return;
+ }
+ SCTP_BUF_LEN(m) = sizeof(struct sctp_init_chunk);
+
+ /* the time I built cookie */
+ (void)SCTP_GETTIME_TIMEVAL(&stc.time_entered);
+
+ /* populate any tie tags */
+ if (asoc != NULL) {
+ /* unlock before tag selections */
+ stc.tie_tag_my_vtag = asoc->my_vtag_nonce;
+ stc.tie_tag_peer_vtag = asoc->peer_vtag_nonce;
+ stc.cookie_life = asoc->cookie_life;
+ net = asoc->primary_destination;
+ } else {
+ stc.tie_tag_my_vtag = 0;
+ stc.tie_tag_peer_vtag = 0;
+ /* life I will award this cookie */
+ stc.cookie_life = inp->sctp_ep.def_cookie_life;
+ }
+
+ /* copy in the ports for later check */
+ stc.myport = sh->dest_port;
+ stc.peerport = sh->src_port;
+
+ /*
+ * If we wanted to honor cookie life extensions, we would add to
+ * stc.cookie_life. For now we should NOT honor any extension.
+ */
+ stc.site_scope = stc.local_scope = stc.loopback_scope = 0;
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ struct inpcb *in_inp;
+
+ /* It's a V6 socket */
+ in_inp = (struct inpcb *)inp;
+ stc.ipv6_addr_legal = 1;
+ /* Now look at the binding flag to see if V4 will be legal */
+ if (SCTP_IPV6_V6ONLY(in_inp) == 0) {
+ stc.ipv4_addr_legal = 1;
+ } else {
+ /* V4 addresses are NOT legal on the association */
+ stc.ipv4_addr_legal = 0;
+ }
+ } else {
+ /* It's a V4 socket, no V6 */
+ stc.ipv4_addr_legal = 1;
+ stc.ipv6_addr_legal = 0;
+ }
+
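+ /*
+ * With SCTP_DONT_DO_PRIVADDR_SCOPE defined, private (RFC 1918)
+ * addresses get no special scoping, so ipv4_scope starts out set;
+ * otherwise it is only set later when a private address is seen.
+ */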
+#ifdef SCTP_DONT_DO_PRIVADDR_SCOPE
+ stc.ipv4_scope = 1;
+#else
+ stc.ipv4_scope = 0;
+#endif
+ /* now for scope setup */
+ memset((caddr_t)&store, 0, sizeof(store));
+ memset((caddr_t)&store1, 0, sizeof(store1));
+ sin = &store.sin;
+ to_sin = &store1.sin;
+#ifdef INET6
+ sin6 = &store.sin6;
+ to_sin6 = &store1.sin6;
+#endif
+ iph = mtod(init_pkt, struct ip *);
+ /* establish the to_addr's */
+ switch (iph->ip_v) {
+ case IPVERSION:
+ to_sin->sin_port = sh->dest_port;
+ to_sin->sin_family = AF_INET;
+ to_sin->sin_len = sizeof(struct sockaddr_in);
+ to_sin->sin_addr = iph->ip_dst;
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ ip6 = mtod(init_pkt, struct ip6_hdr *);
+ to_sin6->sin6_addr = ip6->ip6_dst;
+ to_sin6->sin6_scope_id = 0;
+ to_sin6->sin6_port = sh->dest_port;
+ to_sin6->sin6_family = AF_INET6;
+ to_sin6->sin6_len = sizeof(struct sockaddr_in6);
+ break;
+#endif
+ default:
+ goto do_a_abort;
+ break;
+ };
+
+ if (net == NULL) {
+ to = (struct sockaddr *)&store;
+ switch (iph->ip_v) {
+ case IPVERSION:
+ {
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(struct sockaddr_in);
+ sin->sin_port = sh->src_port;
+ sin->sin_addr = iph->ip_src;
+ /* lookup address */
+ stc.address[0] = sin->sin_addr.s_addr;
+ stc.address[1] = 0;
+ stc.address[2] = 0;
+ stc.address[3] = 0;
+ stc.addr_type = SCTP_IPV4_ADDRESS;
+ /* local from address */
+ stc.laddress[0] = to_sin->sin_addr.s_addr;
+ stc.laddress[1] = 0;
+ stc.laddress[2] = 0;
+ stc.laddress[3] = 0;
+ stc.laddr_type = SCTP_IPV4_ADDRESS;
+ /* scope_id is only for v6 */
+ stc.scope_id = 0;
+#ifndef SCTP_DONT_DO_PRIVADDR_SCOPE
+ if (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) {
+ stc.ipv4_scope = 1;
+ }
+#else
+ stc.ipv4_scope = 1;
+#endif /* SCTP_DONT_DO_PRIVADDR_SCOPE */
+ /* Must use the address in this case */
+ if (sctp_is_address_on_local_host((struct sockaddr *)sin, vrf_id)) {
+ stc.loopback_scope = 1;
+ stc.ipv4_scope = 1;
+ stc.site_scope = 1;
+ stc.local_scope = 0;
+ }
+ break;
+ }
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ {
+ ip6 = mtod(init_pkt, struct ip6_hdr *);
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(struct sockaddr_in6);
+ sin6->sin6_port = sh->src_port;
+ sin6->sin6_addr = ip6->ip6_src;
+ /* lookup address */
+ memcpy(&stc.address, &sin6->sin6_addr,
+ sizeof(struct in6_addr));
+ sin6->sin6_scope_id = 0;
+ stc.addr_type = SCTP_IPV6_ADDRESS;
+ stc.scope_id = 0;
+ if (sctp_is_address_on_local_host((struct sockaddr *)sin6, vrf_id)) {
+ /*
+ * FIX ME: does this have scope from
+ * rcvif?
+ */
+ (void)sa6_recoverscope(sin6);
+ stc.scope_id = sin6->sin6_scope_id;
+ sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone));
+ stc.loopback_scope = 1;
+ stc.local_scope = 0;
+ stc.site_scope = 1;
+ stc.ipv4_scope = 1;
+ } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+ /*
+ * If the new destination is
+ * LINK_LOCAL we must have both
+ * site and local scope in common.
+ * Don't set local scope though,
+ * since we must depend on the
+ * source to be added implicitly.
+ * We cannot assume that, just
+ * because we share one link, all
+ * links are common.
+ */
+ stc.local_scope = 0;
+ stc.site_scope = 1;
+ stc.ipv4_scope = 1;
+ /*
+ * We start counting the private
+ * address stuff at 1, since the
+ * link-local address we source
+ * from won't show up in our
+ * scoped count.
+ */
+ cnt_inits_to = 1;
+ /*
+ * pull out the scope_id from
+ * incoming pkt
+ */
+ /*
+ * FIX ME: does this have scope from
+ * rcvif?
+ */
+ (void)sa6_recoverscope(sin6);
+ stc.scope_id = sin6->sin6_scope_id;
+ sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone));
+ } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) {
+ /*
+ * If the new destination is
+ * SITE_LOCAL then we must have site
+ * scope in common.
+ */
+ stc.site_scope = 1;
+ }
+ memcpy(&stc.laddress, &to_sin6->sin6_addr, sizeof(struct in6_addr));
+ stc.laddr_type = SCTP_IPV6_ADDRESS;
+ break;
+ }
+#endif
+ default:
+ /* TSNH */
+ goto do_a_abort;
+ break;
+ }
+ } else {
+ /* set the scope per the existing tcb */
+
+#ifdef INET6
+ struct sctp_nets *lnet;
+
+#endif
+
+ stc.loopback_scope = asoc->loopback_scope;
+ stc.ipv4_scope = asoc->ipv4_local_scope;
+ stc.site_scope = asoc->site_scope;
+ stc.local_scope = asoc->local_scope;
+#ifdef INET6
+ /* Why do we not consider IPv4 LL addresses? */
+ TAILQ_FOREACH(lnet, &asoc->nets, sctp_next) {
+ if (lnet->ro._l_addr.sin6.sin6_family == AF_INET6) {
+ if (IN6_IS_ADDR_LINKLOCAL(&lnet->ro._l_addr.sin6.sin6_addr)) {
+ /*
+ * if we have a LL address, start
+ * counting at 1.
+ */
+ cnt_inits_to = 1;
+ }
+ }
+ }
+#endif
+ /* use the net pointer */
+ to = (struct sockaddr *)&net->ro._l_addr;
+ switch (to->sa_family) {
+ case AF_INET:
+ sin = (struct sockaddr_in *)to;
+ stc.address[0] = sin->sin_addr.s_addr;
+ stc.address[1] = 0;
+ stc.address[2] = 0;
+ stc.address[3] = 0;
+ stc.addr_type = SCTP_IPV4_ADDRESS;
+ if (net->src_addr_selected == 0) {
+ /*
+ * strange case here, the INIT should have
+ * done the selection.
+ */
+ net->ro._s_addr = sctp_source_address_selection(inp,
+ stcb, (sctp_route_t *) & net->ro,
+ net, 0, vrf_id);
+ if (net->ro._s_addr == NULL)
+ return;
+
+ net->src_addr_selected = 1;
+
+ }
+ stc.laddress[0] = net->ro._s_addr->address.sin.sin_addr.s_addr;
+ stc.laddress[1] = 0;
+ stc.laddress[2] = 0;
+ stc.laddress[3] = 0;
+ stc.laddr_type = SCTP_IPV4_ADDRESS;
+ break;
+#ifdef INET6
+ case AF_INET6:
+ sin6 = (struct sockaddr_in6 *)to;
+ memcpy(&stc.address, &sin6->sin6_addr,
+ sizeof(struct in6_addr));
+ stc.addr_type = SCTP_IPV6_ADDRESS;
+ if (net->src_addr_selected == 0) {
+ /*
+ * strange case here, the INIT should have
+ * done the selection.
+ */
+ net->ro._s_addr = sctp_source_address_selection(inp,
+ stcb, (sctp_route_t *) & net->ro,
+ net, 0, vrf_id);
+ if (net->ro._s_addr == NULL)
+ return;
+
+ net->src_addr_selected = 1;
+ }
+ memcpy(&stc.laddress, &net->ro._s_addr->address.sin6.sin6_addr,
+ sizeof(struct in6_addr));
+ stc.laddr_type = SCTP_IPV6_ADDRESS;
+ break;
+#endif
+ }
+ }
+ /* Now lets put the SCTP header in place */
+ initack = mtod(m, struct sctp_init_ack_chunk *);
+ /* Save it off for quick ref */
+ stc.peers_vtag = init_chk->init.initiate_tag;
+ /* who are we */
+ memcpy(stc.identification, SCTP_VERSION_STRING,
+ min(strlen(SCTP_VERSION_STRING), sizeof(stc.identification)));
+ /* now the chunk header */
+ initack->ch.chunk_type = SCTP_INITIATION_ACK;
+ initack->ch.chunk_flags = 0;
+ /* fill in later from mbuf we build */
+ initack->ch.chunk_length = 0;
+ /* place in my tag */
+ if ((asoc != NULL) &&
+ ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_INUSE) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED))) {
+ /* re-use the v-tags and init-seq here */
+ initack->init.initiate_tag = htonl(asoc->my_vtag);
+ initack->init.initial_tsn = htonl(asoc->init_seq_number);
+ } else {
+ uint32_t vtag, itsn;
+
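+ /*
+ * Presumably we drop the INP read lock (keeping a refcount)
+ * around tag selection so sctp_select_a_tag can take its own
+ * locks without a lock-order problem; re-acquired below.
+ */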
+ if (hold_inp_lock) {
+ SCTP_INP_INCR_REF(inp);
+ SCTP_INP_RUNLOCK(inp);
+ }
+ if (asoc) {
+ atomic_add_int(&asoc->refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ new_tag:
+ vtag = sctp_select_a_tag(inp, inp->sctp_lport, sh->src_port, 1);
+ if ((asoc->peer_supports_nat) && (vtag == asoc->my_vtag)) {
+ /*
+ * Got a duplicate vtag on some guy behind a
+ * NAT; make sure we don't use it.
+ */
+ goto new_tag;
+ }
+ initack->init.initiate_tag = htonl(vtag);
+ /* get a TSN to use too */
+ itsn = sctp_select_initial_TSN(&inp->sctp_ep);
+ initack->init.initial_tsn = htonl(itsn);
+ SCTP_TCB_LOCK(stcb);
+ atomic_add_int(&asoc->refcnt, -1);
+ } else {
+ vtag = sctp_select_a_tag(inp, inp->sctp_lport, sh->src_port, 1);
+ initack->init.initiate_tag = htonl(vtag);
+ /* get a TSN to use too */
+ initack->init.initial_tsn = htonl(sctp_select_initial_TSN(&inp->sctp_ep));
+ }
+ if (hold_inp_lock) {
+ SCTP_INP_RLOCK(inp);
+ SCTP_INP_DECR_REF(inp);
+ }
+ }
+ /* save away my tag too */
+ stc.my_vtag = initack->init.initiate_tag;
+
+ /* set up some of the credits. */
+ so = inp->sctp_socket;
+ if (so == NULL) {
+ /* memory problem */
+ sctp_m_freem(m);
+ return;
+ } else {
+ initack->init.a_rwnd = htonl(max(SCTP_SB_LIMIT_RCV(so), SCTP_MINIMAL_RWND));
+ }
+ /* set what I want */
+ his_limit = ntohs(init_chk->init.num_inbound_streams);
+ /* choose what I want */
+ if (asoc != NULL) {
+ if (asoc->streamoutcnt > inp->sctp_ep.pre_open_stream_count) {
+ i_want = asoc->streamoutcnt;
+ } else {
+ i_want = inp->sctp_ep.pre_open_stream_count;
+ }
+ } else {
+ i_want = inp->sctp_ep.pre_open_stream_count;
+ }
+ if (his_limit < i_want) {
+ /* I Want more :< */
+ initack->init.num_outbound_streams = init_chk->init.num_inbound_streams;
+ } else {
+ /* I can have what I want :> */
+ initack->init.num_outbound_streams = htons(i_want);
+ }
+ /* tell him his limit. */
+ initack->init.num_inbound_streams =
+ htons(inp->sctp_ep.max_open_streams_intome);
+
+ /* adaptation layer indication parameter */
+ ali = (struct sctp_adaptation_layer_indication *)((caddr_t)initack + sizeof(*initack));
+ ali->ph.param_type = htons(SCTP_ULP_ADAPTATION);
+ ali->ph.param_length = htons(sizeof(*ali));
+ ali->indication = ntohl(inp->sctp_ep.adaptation_layer_indicator);
+ SCTP_BUF_LEN(m) += sizeof(*ali);
+ ecn = (struct sctp_ecn_supported_param *)((caddr_t)ali + sizeof(*ali));
+
+ /* ECN parameter */
+ if (SCTP_BASE_SYSCTL(sctp_ecn_enable) == 1) {
+ ecn->ph.param_type = htons(SCTP_ECN_CAPABLE);
+ ecn->ph.param_length = htons(sizeof(*ecn));
+ SCTP_BUF_LEN(m) += sizeof(*ecn);
+
+ prsctp = (struct sctp_prsctp_supported_param *)((caddr_t)ecn +
+ sizeof(*ecn));
+ } else {
+ prsctp = (struct sctp_prsctp_supported_param *)((caddr_t)ecn);
+ }
+ /* And now tell the peer we do pr-sctp */
+ prsctp->ph.param_type = htons(SCTP_PRSCTP_SUPPORTED);
+ prsctp->ph.param_length = htons(sizeof(*prsctp));
+ SCTP_BUF_LEN(m) += sizeof(*prsctp);
+ if (nat_friendly) {
+ /* Add NAT friendly parameter */
+ struct sctp_paramhdr *ph;
+
+ ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
+ ph->param_type = htons(SCTP_HAS_NAT_SUPPORT);
+ ph->param_length = htons(sizeof(struct sctp_paramhdr));
+ SCTP_BUF_LEN(m) += sizeof(struct sctp_paramhdr);
+ }
+ /* And now tell the peer we do all the extensions */
+ pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
+ pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT);
+ num_ext = 0;
+ pr_supported->chunk_types[num_ext++] = SCTP_ASCONF;
+ pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK;
+ pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN;
+ pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED;
+ pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET;
+ if (!SCTP_BASE_SYSCTL(sctp_auth_disable))
+ pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION;
+ if (SCTP_BASE_SYSCTL(sctp_nr_sack_on_off))
+ pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK;
+ p_len = sizeof(*pr_supported) + num_ext;
+ pr_supported->ph.param_length = htons(p_len);
+ bzero((caddr_t)pr_supported + p_len, SCTP_SIZE32(p_len) - p_len);
+ SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len);
+
+ /* ECN nonce: And now tell the peer we support ECN nonce */
+ if (SCTP_BASE_SYSCTL(sctp_ecn_nonce)) {
+ ecn_nonce = (struct sctp_ecn_nonce_supported_param *)
+ ((caddr_t)pr_supported + SCTP_SIZE32(p_len));
+ ecn_nonce->ph.param_type = htons(SCTP_ECN_NONCE_SUPPORTED);
+ ecn_nonce->ph.param_length = htons(sizeof(*ecn_nonce));
+ SCTP_BUF_LEN(m) += sizeof(*ecn_nonce);
+ }
+ /* add authentication parameters */
+ if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) {
+ struct sctp_auth_random *randp;
+ struct sctp_auth_hmac_algo *hmacs;
+ struct sctp_auth_chunk_list *chunks;
+ uint16_t random_len;
+
+ /* generate and add RANDOM parameter */
+ random_len = SCTP_AUTH_RANDOM_SIZE_DEFAULT;
+ randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
+ randp->ph.param_type = htons(SCTP_RANDOM);
+ p_len = sizeof(*randp) + random_len;
+ randp->ph.param_length = htons(p_len);
+ SCTP_READ_RANDOM(randp->random_data, random_len);
+ /* zero out any padding required */
+ bzero((caddr_t)randp + p_len, SCTP_SIZE32(p_len) - p_len);
+ SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len);
+
+ /* add HMAC_ALGO parameter */
+ hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
+ p_len = sctp_serialize_hmaclist(inp->sctp_ep.local_hmacs,
+ (uint8_t *) hmacs->hmac_ids);
+ if (p_len > 0) {
+ p_len += sizeof(*hmacs);
+ hmacs->ph.param_type = htons(SCTP_HMAC_LIST);
+ hmacs->ph.param_length = htons(p_len);
+ /* zero out any padding required */
+ bzero((caddr_t)hmacs + p_len, SCTP_SIZE32(p_len) - p_len);
+ SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len);
+ }
+ /* add CHUNKS parameter */
+ chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
+ p_len = sctp_serialize_auth_chunks(inp->sctp_ep.local_auth_chunks,
+ chunks->chunk_types);
+ if (p_len > 0) {
+ p_len += sizeof(*chunks);
+ chunks->ph.param_type = htons(SCTP_CHUNK_LIST);
+ chunks->ph.param_length = htons(p_len);
+ /* zero out any padding required */
+ bzero((caddr_t)chunks + p_len, SCTP_SIZE32(p_len) - p_len);
+ SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len);
+ }
+ }
+ m_at = m;
+ /* now the addresses */
+ {
+ struct sctp_scoping scp;
+
+ /*
+ * To optimize this we could put the scoping stuff into a
+ * structure and remove the individual uint8's from the stc
+ * structure. Then we could just copy the address in within
+ * the stc, but for now this is a quick hack to get the
+ * address stuff teased apart.
+ */
+ scp.ipv4_addr_legal = stc.ipv4_addr_legal;
+ scp.ipv6_addr_legal = stc.ipv6_addr_legal;
+ scp.loopback_scope = stc.loopback_scope;
+ scp.ipv4_local_scope = stc.ipv4_scope;
+ scp.local_scope = stc.local_scope;
+ scp.site_scope = stc.site_scope;
+ m_at = sctp_add_addresses_to_i_ia(inp, &scp, m_at, cnt_inits_to);
+ }
+
+ /* tack on the operational error if present */
+ if (op_err) {
+ struct mbuf *ol;
+ int llen;
+
+ llen = 0;
+ ol = op_err;
+ while (ol) {
+ llen += SCTP_BUF_LEN(ol);
+ ol = SCTP_BUF_NEXT(ol);
+ }
+ if (llen % 4) {
+ /* must add a pad to the param */
+ uint32_t cpthis = 0;
+ int padlen;
+
+ padlen = 4 - (llen % 4);
+ m_copyback(op_err, llen, padlen, (caddr_t)&cpthis);
+ }
+ while (SCTP_BUF_NEXT(m_at) != NULL) {
+ m_at = SCTP_BUF_NEXT(m_at);
+ }
+ SCTP_BUF_NEXT(m_at) = op_err;
+ while (SCTP_BUF_NEXT(m_at) != NULL) {
+ m_at = SCTP_BUF_NEXT(m_at);
+ }
+ }
+ /* pre-calculate the size and update pkt header and chunk header */
+ p_len = 0;
+ for (m_tmp = m; m_tmp; m_tmp = SCTP_BUF_NEXT(m_tmp)) {
+ p_len += SCTP_BUF_LEN(m_tmp);
+ if (SCTP_BUF_NEXT(m_tmp) == NULL) {
+ /* m_tmp should now point to last one */
+ break;
+ }
+ }
+
+ /* Now we must build a cookie */
+ m_cookie = sctp_add_cookie(inp, init_pkt, offset, m, 0, &stc, &signature);
+ if (m_cookie == NULL) {
+ /* memory problem */
+ sctp_m_freem(m);
+ return;
+ }
+ /* Now append the cookie to the end and update the space/size */
+ SCTP_BUF_NEXT(m_tmp) = m_cookie;
+
+ for (m_tmp = m_cookie; m_tmp; m_tmp = SCTP_BUF_NEXT(m_tmp)) {
+ p_len += SCTP_BUF_LEN(m_tmp);
+ if (SCTP_BUF_NEXT(m_tmp) == NULL) {
+ /* m_tmp should now point to last one */
+ mp_last = m_tmp;
+ break;
+ }
+ }
+ /*
+ * Place in the size, but we don't include the last pad (if any) in
+ * the INIT-ACK.
+ */
+ initack->ch.chunk_length = htons(p_len);
+
+ /*
+ * Time to sign the cookie. We don't sign over the cookie signature
+ * itself, thus we set the trailer length.
+ */
+ (void)sctp_hmac_m(SCTP_HMAC,
+ (uint8_t *) inp->sctp_ep.secret_key[(int)(inp->sctp_ep.current_secret_number)],
+ SCTP_SECRET_SIZE, m_cookie, sizeof(struct sctp_paramhdr),
+ (uint8_t *) signature, SCTP_SIGNATURE_SIZE);
+ /*
+ * We pass 0 here to NOT set IP_DF if it's IPv4; we ignore the return
+ * here since the timer will drive a retransmission.
+ */
+ padval = p_len % 4;
+ if ((padval) && (mp_last)) {
+ /* see my previous comments on mp_last */
+ int ret;
+
+ ret = sctp_add_pad_tombuf(mp_last, (4 - padval));
+ if (ret) {
+ /* Houston we have a problem, no space */
+ sctp_m_freem(m);
+ return;
+ }
+ p_len += padval;
+ }
+ if (stc.loopback_scope) {
+ over_addr = &store1;
+ } else {
+ over_addr = NULL;
+ }
+
+ (void)sctp_lowlevel_chunk_output(inp, NULL, NULL, to, m, 0, NULL, 0, 0,
+ 0, NULL, 0,
+ inp->sctp_lport, sh->src_port, init_chk->init.initiate_tag,
+ port, SCTP_SO_NOT_LOCKED, over_addr);
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+}
+
+
+void
+sctp_insert_on_wheel(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ struct sctp_stream_out *strq, int holds_lock)
+{
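+ /*
+ * The "wheel" is the list of streams with queued output; a
+ * stream whose tailq entry pointers are both NULL is not yet
+ * linked on it, so only then do we insert it.
+ */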
+ if (holds_lock == 0) {
+ SCTP_TCB_SEND_LOCK(stcb);
+ }
+ if ((strq->next_spoke.tqe_next == NULL) &&
+ (strq->next_spoke.tqe_prev == NULL)) {
+ TAILQ_INSERT_TAIL(&asoc->out_wheel, strq, next_spoke);
+ }
+ if (holds_lock == 0) {
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ }
+}
+
+void
+sctp_remove_from_wheel(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ struct sctp_stream_out *strq,
+ int holds_lock)
+{
+ /* take off and then setup so we know it is not on the wheel */
+ if (holds_lock == 0) {
+ SCTP_TCB_SEND_LOCK(stcb);
+ }
+ if (TAILQ_EMPTY(&strq->outqueue)) {
+ if (asoc->last_out_stream == strq) {
+ asoc->last_out_stream = TAILQ_PREV(asoc->last_out_stream, sctpwheel_listhead, next_spoke);
+ if (asoc->last_out_stream == NULL) {
+ asoc->last_out_stream = TAILQ_LAST(&asoc->out_wheel, sctpwheel_listhead);
+ }
+ if (asoc->last_out_stream == strq) {
+ asoc->last_out_stream = NULL;
+ }
+ }
+ TAILQ_REMOVE(&asoc->out_wheel, strq, next_spoke);
+ strq->next_spoke.tqe_next = NULL;
+ strq->next_spoke.tqe_prev = NULL;
+ }
+ if (holds_lock == 0) {
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ }
+}
+
+static void
+sctp_prune_prsctp(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ struct sctp_sndrcvinfo *srcv,
+ int dataout)
+{
+ int freed_spc = 0;
+ struct sctp_tmit_chunk *chk, *nchk;
+
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ if ((asoc->peer_supports_prsctp) &&
+ (asoc->sent_queue_cnt_removeable > 0)) {
+ TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
+ /*
+ * Look for chunks marked with the PR_SCTP flag AND
+ * the buffer space flag. If the one being sent is of
+ * equal or greater priority then purge the old one
+ * and free some space.
+ */
+ if (PR_SCTP_BUF_ENABLED(chk->flags)) {
+ /*
+ * This one is PR-SCTP AND buffer space
+ * limited type
+ */
+ if (chk->rec.data.timetodrop.tv_sec >= (long)srcv->sinfo_timetolive) {
+ /*
+ * Lower numbers equate to higher
+ * priority, so if the one we are
+ * looking at has a larger or equal
+ * priority we want to drop the data
+ * and NOT retransmit it.
+ */
+ if (chk->data) {
+ /*
+ * We release the book_size
+ * if the mbuf is here
+ */
+ int ret_spc;
+ int cause;
+
+ if (chk->sent > SCTP_DATAGRAM_UNSENT)
+ cause = SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT;
+ else
+ cause = SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_UNSENT;
+ ret_spc = sctp_release_pr_sctp_chunk(stcb, chk,
+ cause,
+ SCTP_SO_LOCKED);
+ freed_spc += ret_spc;
+ if (freed_spc >= dataout) {
+ return;
+ }
+ } /* if chunk was present */
+ } /* if of sufficient priority */
+ } /* if chunk has enabled */
+ } /* tailqforeach */
+
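+ /*
+ * Second pass: apply the same buffer-space pruning to chunks
+ * still waiting on the send queue (not yet transmitted).
+ */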
+ chk = TAILQ_FIRST(&asoc->send_queue);
+ while (chk) {
+ nchk = TAILQ_NEXT(chk, sctp_next);
+ /* Here we must move to the sent queue and mark */
+ if (PR_SCTP_BUF_ENABLED(chk->flags)) {
+ if (chk->rec.data.timetodrop.tv_sec >= (long)srcv->sinfo_timetolive) {
+ if (chk->data) {
+ /*
+ * We release the book_size
+ * if the mbuf is here
+ */
+ int ret_spc;
+
+ ret_spc = sctp_release_pr_sctp_chunk(stcb, chk,
+ SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_UNSENT,
+ SCTP_SO_LOCKED);
+
+ freed_spc += ret_spc;
+ if (freed_spc >= dataout) {
+ return;
+ }
+ } /* end if chk->data */
+ } /* end if right class */
+ } /* end if chk pr-sctp */
+ chk = nchk;
+ } /* end while (chk) */
+ } /* if enabled in asoc */
+}
+
+int
+sctp_get_frag_point(struct sctp_tcb *stcb,
+ struct sctp_association *asoc)
+{
+ int siz, ovh;
+
+ /*
+ * For endpoints that have both v6 and v4 addresses we must reserve
+ * room for the ipv6 header, for those that are only dealing with V4
+ * we use a larger frag point.
+ */
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ ovh = SCTP_MED_OVERHEAD;
+ } else {
+ ovh = SCTP_MED_V4_OVERHEAD;
+ }
+
+ if (stcb->asoc.sctp_frag_point > asoc->smallest_mtu)
+ siz = asoc->smallest_mtu - ovh;
+ else
+ siz = (stcb->asoc.sctp_frag_point - ovh);
+ /*
+ * if (siz > (MCLBYTES-sizeof(struct sctp_data_chunk))) {
+ */
+ /* A data chunk MUST fit in a cluster */
+ /* siz = (MCLBYTES - sizeof(struct sctp_data_chunk)); */
+ /* } */
+
+ /* adjust for an AUTH chunk if DATA requires auth */
+ if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks))
+ siz -= sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
+
+ if (siz % 4) {
+ /* make it an even word boundary please */
+ siz -= (siz % 4);
+ }
+ return (siz);
+}
+
+static void
+sctp_set_prsctp_policy(struct sctp_stream_queue_pending *sp)
+{
+ sp->pr_sctp_on = 0;
+ /*
+ * We assume that the user wants PR_SCTP_TTL if the user provides a
+ * positive lifetime but does not specify any PR_SCTP policy. This
+ * is a BAD assumption and causes problems at least with the
+ * U-Vancouver MPI folks. I will change this to be no policy means
+ * NO PR-SCTP.
+ */
+ if (PR_SCTP_ENABLED(sp->sinfo_flags)) {
+ sp->act_flags |= PR_SCTP_POLICY(sp->sinfo_flags);
+ sp->pr_sctp_on = 1;
+ } else {
+ return;
+ }
+ switch (PR_SCTP_POLICY(sp->sinfo_flags)) {
+ case CHUNK_FLAGS_PR_SCTP_BUF:
+ /*
+ * Time to live is a priority stored in tv_sec when doing
+ * the buffer drop thing.
+ */
+ sp->ts.tv_sec = sp->timetolive;
+ sp->ts.tv_usec = 0;
+ break;
+ case CHUNK_FLAGS_PR_SCTP_TTL:
+ {
+ struct timeval tv;
+
+ (void)SCTP_GETTIME_TIMEVAL(&sp->ts);
+ tv.tv_sec = sp->timetolive / 1000;
+ tv.tv_usec = (sp->timetolive * 1000) % 1000000;
+ /*
+ * TODO sctp_constants.h needs alternative time
+ * macros when _KERNEL is undefined.
+ */
+ timevaladd(&sp->ts, &tv);
+ }
+ break;
+ case CHUNK_FLAGS_PR_SCTP_RTX:
+ /*
+ * Time to live is the number of retransmissions stored in
+ * tv_sec.
+ */
+ sp->ts.tv_sec = sp->timetolive;
+ sp->ts.tv_usec = 0;
+ break;
+ default:
+ SCTPDBG(SCTP_DEBUG_USRREQ1,
+ "Unknown PR_SCTP policy %u.\n",
+ PR_SCTP_POLICY(sp->sinfo_flags));
+ break;
+ }
+}
+
+static int
+sctp_msg_append(struct sctp_tcb *stcb,
+ struct sctp_nets *net,
+ struct mbuf *m,
+ struct sctp_sndrcvinfo *srcv, int hold_stcb_lock)
+{
+ int error = 0, holds_lock;
+ struct mbuf *at;
+ struct sctp_stream_queue_pending *sp = NULL;
+ struct sctp_stream_out *strm;
+
+ /*
+ * Given an mbuf chain, put it into the association send queue and
+ * place it on the wheel
+ */
+ holds_lock = hold_stcb_lock;
+ if (srcv->sinfo_stream >= stcb->asoc.streamoutcnt) {
+ /* Invalid stream number */
+ SCTP_LTRACE_ERR_RET_PKT(m, NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out_now;
+ }
+ if ((stcb->asoc.stream_locked) &&
+ (stcb->asoc.stream_locked_on != srcv->sinfo_stream)) {
+ SCTP_LTRACE_ERR_RET_PKT(m, NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out_now;
+ }
+ strm = &stcb->asoc.strmout[srcv->sinfo_stream];
+ /* Now can we send this? */
+ if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_SENT) ||
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) ||
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) ||
+ (stcb->asoc.state & SCTP_STATE_SHUTDOWN_PENDING)) {
+ /* got data while shutting down */
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET);
+ error = ECONNRESET;
+ goto out_now;
+ }
+ sctp_alloc_a_strmoq(stcb, sp);
+ if (sp == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ error = ENOMEM;
+ goto out_now;
+ }
+ sp->sinfo_flags = srcv->sinfo_flags;
+ sp->timetolive = srcv->sinfo_timetolive;
+ sp->ppid = srcv->sinfo_ppid;
+ sp->context = srcv->sinfo_context;
+ sp->strseq = 0;
+ if (sp->sinfo_flags & SCTP_ADDR_OVER) {
+ sp->net = net;
+ atomic_add_int(&sp->net->ref_count, 1);
+ } else {
+ sp->net = NULL;
+ }
+ (void)SCTP_GETTIME_TIMEVAL(&sp->ts);
+ sp->stream = srcv->sinfo_stream;
+ sp->msg_is_complete = 1;
+ sp->sender_all_done = 1;
+ sp->some_taken = 0;
+ sp->data = m;
+ sp->tail_mbuf = NULL;
+ sp->length = 0;
+ at = m;
+ sctp_set_prsctp_policy(sp);
+ /*
+ * We could in theory (for sendall) pass the length in, but we would
+ * still have to hunt through the chain since we need to set up the
+ * tail_mbuf.
+ */
+ while (at) {
+ if (SCTP_BUF_NEXT(at) == NULL)
+ sp->tail_mbuf = at;
+ sp->length += SCTP_BUF_LEN(at);
+ at = SCTP_BUF_NEXT(at);
+ }
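+ /*
+ * Under the send lock: charge the data against the association,
+ * queue it on the stream, assign a stream sequence number for
+ * ordered sends, and put the stream on the wheel if needed.
+ */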
+ SCTP_TCB_SEND_LOCK(stcb);
+ sctp_snd_sb_alloc(stcb, sp->length);
+ atomic_add_int(&stcb->asoc.stream_queue_cnt, 1);
+ TAILQ_INSERT_TAIL(&strm->outqueue, sp, next);
+ if ((srcv->sinfo_flags & SCTP_UNORDERED) == 0) {
+ sp->strseq = strm->next_sequence_sent;
+ strm->next_sequence_sent++;
+ }
+ if ((strm->next_spoke.tqe_next == NULL) &&
+ (strm->next_spoke.tqe_prev == NULL)) {
+ /* Not on wheel, insert */
+ sctp_insert_on_wheel(stcb, &stcb->asoc, strm, 1);
+ }
+ m = NULL;
+ SCTP_TCB_SEND_UNLOCK(stcb);
+out_now:
+ if (m) {
+ sctp_m_freem(m);
+ }
+ return (error);
+}
+
+
+static struct mbuf *
+sctp_copy_mbufchain(struct mbuf *clonechain,
+ struct mbuf *outchain,
+ struct mbuf **endofchain,
+ int can_take_mbuf,
+ int sizeofcpy,
+ uint8_t copy_by_ref)
+{
+ struct mbuf *m;
+ struct mbuf *appendchain;
+ caddr_t cp;
+ int len;
+
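+ /*
+ * Append clonechain onto outchain: either take the chain as-is,
+ * copy the data into the tail mbuf(s) for small sends, or m_copym
+ * it by reference; *endofchain tracks the last mbuf of the result.
+ */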
+ if (endofchain == NULL) {
+ /* error */
+error_out:
+ if (outchain)
+ sctp_m_freem(outchain);
+ return (NULL);
+ }
+ if (can_take_mbuf) {
+ appendchain = clonechain;
+ } else {
+ if (!copy_by_ref &&
+ (sizeofcpy <= (int)((((SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count) - 1) * MLEN) + MHLEN)))
+ ) {
+ /* It's not in a cluster */
+ if (*endofchain == NULL) {
+ /* let's get an mbuf cluster */
+ if (outchain == NULL) {
+ /* This is the general case */
+ new_mbuf:
+ outchain = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_HEADER);
+ if (outchain == NULL) {
+ goto error_out;
+ }
+ SCTP_BUF_LEN(outchain) = 0;
+ *endofchain = outchain;
+ /* get the prepend space */
+ SCTP_BUF_RESV_UF(outchain, (SCTP_FIRST_MBUF_RESV + 4));
+ } else {
+ /*
+ * We really should not get a NULL
+ * in endofchain
+ */
+ /* find end */
+ m = outchain;
+ while (m) {
+ if (SCTP_BUF_NEXT(m) == NULL) {
+ *endofchain = m;
+ break;
+ }
+ m = SCTP_BUF_NEXT(m);
+ }
+ /* sanity */
+ if (*endofchain == NULL) {
+ /*
+ * huh, TSNH XXX maybe we
+ * should panic
+ */
+ sctp_m_freem(outchain);
+ goto new_mbuf;
+ }
+ }
+ /* get the new end of length */
+ len = M_TRAILINGSPACE(*endofchain);
+ } else {
+ /* how much is left at the end? */
+ len = M_TRAILINGSPACE(*endofchain);
+ }
+ /* Find the end of the data, for appending */
+ cp = (mtod((*endofchain), caddr_t)+SCTP_BUF_LEN((*endofchain)));
+
+ /* Now lets copy it out */
+ if (len >= sizeofcpy) {
+ /* It all fits, copy it in */
+ m_copydata(clonechain, 0, sizeofcpy, cp);
+ SCTP_BUF_LEN((*endofchain)) += sizeofcpy;
+ } else {
+ /* fill up the end of the chain */
+ if (len > 0) {
+ m_copydata(clonechain, 0, len, cp);
+ SCTP_BUF_LEN((*endofchain)) += len;
+ /* now we need another one */
+ sizeofcpy -= len;
+ }
+ m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_HEADER);
+ if (m == NULL) {
+ /* We failed */
+ goto error_out;
+ }
+ SCTP_BUF_NEXT((*endofchain)) = m;
+ *endofchain = m;
+ cp = mtod((*endofchain), caddr_t);
+ m_copydata(clonechain, len, sizeofcpy, cp);
+ SCTP_BUF_LEN((*endofchain)) += sizeofcpy;
+ }
+ return (outchain);
+ } else {
+ /* copy the old-fashioned way */
+ appendchain = SCTP_M_COPYM(clonechain, 0, M_COPYALL, M_DONTWAIT);
+#ifdef SCTP_MBUF_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ struct mbuf *mat;
+
+ mat = appendchain;
+ while (mat) {
+ if (SCTP_BUF_IS_EXTENDED(mat)) {
+ sctp_log_mb(mat, SCTP_MBUF_ICOPY);
+ }
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ }
+#endif
+ }
+ }
+ if (appendchain == NULL) {
+ /* error */
+ if (outchain)
+ sctp_m_freem(outchain);
+ return (NULL);
+ }
+ if (outchain) {
+ /* tack on to the end */
+ if (*endofchain != NULL) {
+ SCTP_BUF_NEXT(((*endofchain))) = appendchain;
+ } else {
+ m = outchain;
+ while (m) {
+ if (SCTP_BUF_NEXT(m) == NULL) {
+ SCTP_BUF_NEXT(m) = appendchain;
+ break;
+ }
+ m = SCTP_BUF_NEXT(m);
+ }
+ }
+ /*
+ * save off the end and update the end-chain position
+ */
+ m = appendchain;
+ while (m) {
+ if (SCTP_BUF_NEXT(m) == NULL) {
+ *endofchain = m;
+ break;
+ }
+ m = SCTP_BUF_NEXT(m);
+ }
+ return (outchain);
+ } else {
+ /* save off the end and update the end-chain position */
+ m = appendchain;
+ while (m) {
+ if (SCTP_BUF_NEXT(m) == NULL) {
+ *endofchain = m;
+ break;
+ }
+ m = SCTP_BUF_NEXT(m);
+ }
+ return (appendchain);
+ }
+}
+
+int
+sctp_med_chunk_output(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ int *num_out,
+ int *reason_code,
+ int control_only, int from_where,
+ struct timeval *now, int *now_filled, int frag_point, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+);
+
+static void
+sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr,
+ uint32_t val)
+{
+ struct sctp_copy_all *ca;
+ struct mbuf *m;
+ int ret = 0;
+ int added_control = 0;
+ int un_sent, do_chunk_output = 1;
+ struct sctp_association *asoc;
+
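+ /*
+ * Called by the PCB iterator once per association for an
+ * SCTP_SENDALL request: copy the user data and append it (or
+ * abort/shut down the association, per the sinfo flags).
+ */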
+ ca = (struct sctp_copy_all *)ptr;
+ if (ca->m == NULL) {
+ return;
+ }
+ if (ca->inp != inp) {
+ /* TSNH */
+ return;
+ }
+ if ((ca->m) && ca->sndlen) {
+ m = SCTP_M_COPYM(ca->m, 0, M_COPYALL, M_DONTWAIT);
+ if (m == NULL) {
+ /* can't copy so we are done */
+ ca->cnt_failed++;
+ return;
+ }
+#ifdef SCTP_MBUF_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ struct mbuf *mat;
+
+ mat = m;
+ while (mat) {
+ if (SCTP_BUF_IS_EXTENDED(mat)) {
+ sctp_log_mb(mat, SCTP_MBUF_ICOPY);
+ }
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ }
+#endif
+ } else {
+ m = NULL;
+ }
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ if (ca->sndrcv.sinfo_flags & SCTP_ABORT) {
+ /* Abort this assoc with m as the user defined reason */
+ if (m) {
+ struct sctp_paramhdr *ph;
+
+ SCTP_BUF_PREPEND(m, sizeof(struct sctp_paramhdr), M_DONTWAIT);
+ if (m) {
+ ph = mtod(m, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT);
+ ph->param_length = htons(ca->sndlen);
+ }
+ /*
+ * We add one here to keep the assoc from
+ * disappearing on us.
+ */
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ sctp_abort_an_association(inp, stcb,
+ SCTP_RESPONSE_TO_USER_REQ,
+ m, SCTP_SO_NOT_LOCKED);
+ /*
+ * sctp_abort_an_association calls sctp_free_asoc();
+ * free_asoc will NOT free the association since we
+ * incremented the refcnt. We do this to prevent it
+ * being freed and things getting tricky, since we
+ * could end up (from free_asoc) calling inpcb_free,
+ * which would take a recursive lock call on the
+ * iterator lock. But as a consequence of that the
+ * stcb will return to us un-locked: since free_asoc
+ * returns with either no TCB or the TCB unlocked, we
+ * must relock to unlock in the iterator timer :-0
+ */
+ SCTP_TCB_LOCK(stcb);
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ goto no_chunk_output;
+ }
+ } else {
+ if (m) {
+ ret = sctp_msg_append(stcb, stcb->asoc.primary_destination, m,
+ &ca->sndrcv, 1);
+ }
+ asoc = &stcb->asoc;
+ if (ca->sndrcv.sinfo_flags & SCTP_EOF) {
+ /* shutdown this assoc */
+ int cnt;
+
+ cnt = sctp_is_there_unsent_data(stcb);
+
+ if (TAILQ_EMPTY(&asoc->send_queue) &&
+ TAILQ_EMPTY(&asoc->sent_queue) &&
+ (cnt == 0)) {
+ if (asoc->locked_on_sending) {
+ goto abort_anyway;
+ }
+ /*
+ * there is nothing queued to send, so I'm
+ * done...
+ */
+ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
+ /*
+ * only send SHUTDOWN the first time
+ * through
+ */
+ sctp_send_shutdown(stcb, stcb->asoc.primary_destination);
+ if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT);
+ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ added_control = 1;
+ do_chunk_output = 0;
+ }
+ } else {
+ /*
+ * we still got (or just got) data to send,
+ * so set SHUTDOWN_PENDING
+ */
+ /*
+ * XXX sockets draft says that SCTP_EOF
+ * should be sent with no data. currently,
+ * we will allow user data to be sent first
+ * and move to SHUTDOWN-PENDING
+ */
+ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
+ if (asoc->locked_on_sending) {
+ /*
+ * Locked to send out the
+ * data
+ */
+ struct sctp_stream_queue_pending *sp;
+
+ sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead);
+ if (sp) {
+ if ((sp->length == 0) && (sp->msg_is_complete == 0))
+ asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
+ }
+ }
+ asoc->state |= SCTP_STATE_SHUTDOWN_PENDING;
+ if (TAILQ_EMPTY(&asoc->send_queue) &&
+ TAILQ_EMPTY(&asoc->sent_queue) &&
+ (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) {
+ abort_anyway:
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ sctp_abort_an_association(stcb->sctp_ep, stcb,
+ SCTP_RESPONSE_TO_USER_REQ,
+ NULL, SCTP_SO_NOT_LOCKED);
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ goto no_chunk_output;
+ }
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ }
+ }
+
+ }
+ }
+ un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) +
+ (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk)));
+
+ if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) &&
+ (stcb->asoc.total_flight > 0) &&
+ (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))
+ ) {
+ do_chunk_output = 0;
+ }
+ if (do_chunk_output)
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_NOT_LOCKED);
+ else if (added_control) {
+ int num_out = 0, reason = 0, now_filled = 0;
+ struct timeval now;
+ int frag_point;
+
+ frag_point = sctp_get_frag_point(stcb, &stcb->asoc);
+ (void)sctp_med_chunk_output(inp, stcb, &stcb->asoc, &num_out,
+ &reason, 1, 1, &now, &now_filled, frag_point, SCTP_SO_NOT_LOCKED);
+ }
+no_chunk_output:
+ if (ret) {
+ ca->cnt_failed++;
+ } else {
+ ca->cnt_sent++;
+ }
+}
+
+static void
+sctp_sendall_completes(void *ptr, uint32_t val)
+{
+ struct sctp_copy_all *ca;
+
+ ca = (struct sctp_copy_all *)ptr;
+ /*
+ * Do a notify here? Kacheong suggests that the notify be done at
+ * the send time, so you would push up a notification if any send
+ * failed. Don't know if this is feasible since the only failures we
+ * have are "memory" related, and if you cannot get an mbuf to send
+ * the data you surely can't get an mbuf to send up a notification
+ * that you can't send the data :->
+ */
+
+ /* now free everything */
+ sctp_m_freem(ca->m);
+ SCTP_FREE(ca, SCTP_M_COPYAL);
+}
+
+
+#define MC_ALIGN(m, len) do { \
+ SCTP_BUF_RESV_UF(m, ((MCLBYTES - (len)) & ~(sizeof(long) - 1))); \
+} while (0)
+
+
+
+static struct mbuf *
+sctp_copy_out_all(struct uio *uio, int len)
+{
+ struct mbuf *ret, *at;
+ int left, willcpy, cancpy, error;
+
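+ /*
+ * Copy len bytes from the user's uio into a freshly allocated
+ * mbuf chain; used by the sendall path below.
+ */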
+ ret = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_WAIT, 1, MT_DATA);
+ if (ret == NULL) {
+ /* TSNH */
+ return (NULL);
+ }
+ left = len;
+ SCTP_BUF_LEN(ret) = 0;
+ /* save space for the data chunk header */
+ cancpy = M_TRAILINGSPACE(ret);
+ willcpy = min(cancpy, left);
+ at = ret;
+ while (left > 0) {
+ /* Align data to the end */
+ error = uiomove(mtod(at, caddr_t), willcpy, uio);
+ if (error) {
+ err_out_now:
+ sctp_m_freem(at);
+ return (NULL);
+ }
+ SCTP_BUF_LEN(at) = willcpy;
+ SCTP_BUF_NEXT_PKT(at) = SCTP_BUF_NEXT(at) = 0;
+ left -= willcpy;
+ if (left > 0) {
+ SCTP_BUF_NEXT(at) = sctp_get_mbuf_for_msg(left, 0, M_WAIT, 1, MT_DATA);
+ if (SCTP_BUF_NEXT(at) == NULL) {
+ goto err_out_now;
+ }
+ at = SCTP_BUF_NEXT(at);
+ SCTP_BUF_LEN(at) = 0;
+ cancpy = M_TRAILINGSPACE(at);
+ willcpy = min(cancpy, left);
+ }
+ }
+ return (ret);
+}
+
+static int
+sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m,
+ struct sctp_sndrcvinfo *srcv)
+{
+ int ret;
+ struct sctp_copy_all *ca;
+
+ SCTP_MALLOC(ca, struct sctp_copy_all *, sizeof(struct sctp_copy_all),
+ SCTP_M_COPYAL);
+ if (ca == NULL) {
+ sctp_m_freem(m);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ memset(ca, 0, sizeof(struct sctp_copy_all));
+
+ ca->inp = inp;
+ memcpy(&ca->sndrcv, srcv, sizeof(struct sctp_nonpad_sndrcvinfo));
+ /*
+ * take off the sendall flag, it would be bad if we failed to do
+ * this :-0
+ */
+ ca->sndrcv.sinfo_flags &= ~SCTP_SENDALL;
+ /* get length and mbuf chain */
+ if (uio) {
+ ca->sndlen = uio->uio_resid;
+ ca->m = sctp_copy_out_all(uio, ca->sndlen);
+ if (ca->m == NULL) {
+ SCTP_FREE(ca, SCTP_M_COPYAL);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ } else {
+ /* Gather the length of the send */
+ struct mbuf *mat;
+
+ mat = m;
+ ca->sndlen = 0;
+ while (m) {
+ ca->sndlen += SCTP_BUF_LEN(m);
+ m = SCTP_BUF_NEXT(m);
+ }
+ ca->m = mat;
+ }
+ ret = sctp_initiate_iterator(NULL, sctp_sendall_iterator, NULL,
+ SCTP_PCB_ANY_FLAGS, SCTP_PCB_ANY_FEATURES,
+ SCTP_ASOC_ANY_STATE,
+ (void *)ca, 0,
+ sctp_sendall_completes, inp, 1);
+ if (ret) {
+ SCTP_PRINTF("Failed to initiate iterator for sendall\n");
+ SCTP_FREE(ca, SCTP_M_COPYAL);
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EFAULT);
+ return (EFAULT);
+ }
+ return (0);
+}
+
+
+void
+sctp_toss_old_cookies(struct sctp_tcb *stcb, struct sctp_association *asoc)
+{
+ struct sctp_tmit_chunk *chk, *nchk;
+
+ chk = TAILQ_FIRST(&asoc->control_send_queue);
+ while (chk) {
+ nchk = TAILQ_NEXT(chk, sctp_next);
+ if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) {
+ TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ asoc->ctrl_queue_cnt--;
+ sctp_free_a_chunk(stcb, chk);
+ }
+ chk = nchk;
+ }
+}
+
+void
+sctp_toss_old_asconf(struct sctp_tcb *stcb)
+{
+ struct sctp_association *asoc;
+ struct sctp_tmit_chunk *chk, *chk_tmp;
+ struct sctp_asconf_chunk *acp;
+
+ asoc = &stcb->asoc;
+ for (chk = TAILQ_FIRST(&asoc->asconf_send_queue); chk != NULL;
+ chk = chk_tmp) {
+ /* get next chk */
+ chk_tmp = TAILQ_NEXT(chk, sctp_next);
+ /* find SCTP_ASCONF chunk in queue */
+ if (chk->rec.chunk_id.id == SCTP_ASCONF) {
+ if (chk->data) {
+ acp = mtod(chk->data, struct sctp_asconf_chunk *);
+ if (compare_with_wrap(ntohl(acp->serial_number), stcb->asoc.asconf_seq_out_acked, MAX_SEQ)) {
+ /* Not Acked yet */
+ break;
+ }
+ }
+ TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ asoc->ctrl_queue_cnt--;
+ sctp_free_a_chunk(stcb, chk);
+ }
+ }
+}
+
+
+static void
+sctp_clean_up_datalist(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ struct sctp_tmit_chunk **data_list,
+ int bundle_at,
+ struct sctp_nets *net)
+{
+ int i;
+ struct sctp_tmit_chunk *tp1;
+
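+ /*
+ * Move the chunks just bundled into a packet from the send queue
+ * to the sent queue (kept in TSN order), mark them sent, and
+ * update flight size and the peer's advertised rwnd.
+ */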
+ for (i = 0; i < bundle_at; i++) {
+ /* off of the send queue */
+ TAILQ_REMOVE(&asoc->send_queue, data_list[i], sctp_next);
+ asoc->send_queue_cnt--;
+ if (i > 0) {
+ /*
+ * Any chunk NOT 0 you zap the time; chunk 0 gets
+ * zapped or set based on whether an RTO measurement
+ * is needed.
+ */
+ data_list[i]->do_rtt = 0;
+ }
+ /* record time */
+ data_list[i]->sent_rcv_time = net->last_sent_time;
+ data_list[i]->rec.data.fast_retran_tsn = data_list[i]->rec.data.TSN_seq;
+ if (data_list[i]->whoTo == NULL) {
+ data_list[i]->whoTo = net;
+ atomic_add_int(&net->ref_count, 1);
+ }
+ /* on to the sent queue */
+ tp1 = TAILQ_LAST(&asoc->sent_queue, sctpchunk_listhead);
+ if ((tp1) && (compare_with_wrap(tp1->rec.data.TSN_seq,
+ data_list[i]->rec.data.TSN_seq, MAX_TSN))) {
+ struct sctp_tmit_chunk *tpp;
+
+ /* need to move back */
+ back_up_more:
+ tpp = TAILQ_PREV(tp1, sctpchunk_listhead, sctp_next);
+ if (tpp == NULL) {
+ TAILQ_INSERT_BEFORE(tp1, data_list[i], sctp_next);
+ goto all_done;
+ }
+ tp1 = tpp;
+ if (compare_with_wrap(tp1->rec.data.TSN_seq,
+ data_list[i]->rec.data.TSN_seq, MAX_TSN)) {
+ goto back_up_more;
+ }
+ TAILQ_INSERT_AFTER(&asoc->sent_queue, tp1, data_list[i], sctp_next);
+ } else {
+ TAILQ_INSERT_TAIL(&asoc->sent_queue,
+ data_list[i],
+ sctp_next);
+ }
+all_done:
+ /* This does not lower until the cum-ack passes it */
+ asoc->sent_queue_cnt++;
+ if ((asoc->peers_rwnd <= 0) &&
+ (asoc->total_flight == 0) &&
+ (bundle_at == 1)) {
+ /* Mark the chunk as being a window probe */
+ SCTP_STAT_INCR(sctps_windowprobed);
+ }
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xC2, 3);
+#endif
+ data_list[i]->sent = SCTP_DATAGRAM_SENT;
+ data_list[i]->snd_count = 1;
+ data_list[i]->rec.data.chunk_was_revoked = 0;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_UP,
+ data_list[i]->whoTo->flight_size,
+ data_list[i]->book_size,
+ (uintptr_t) data_list[i]->whoTo,
+ data_list[i]->rec.data.TSN_seq);
+ }
+ sctp_flight_size_increase(data_list[i]);
+ sctp_total_flight_increase(stcb, data_list[i]);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) {
+ sctp_log_rwnd(SCTP_DECREASE_PEER_RWND,
+ asoc->peers_rwnd, data_list[i]->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh));
+ }
+ asoc->peers_rwnd = sctp_sbspace_sub(asoc->peers_rwnd,
+ (uint32_t) (data_list[i]->send_size + SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)));
+ if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) {
+ /* SWS sender side engages */
+ asoc->peers_rwnd = 0;
+ }
+ }
+}
+
+static void
+sctp_clean_up_ctl(struct sctp_tcb *stcb, struct sctp_association *asoc)
+{
+ struct sctp_tmit_chunk *chk, *nchk;
+
+ for (chk = TAILQ_FIRST(&asoc->control_send_queue);
+ chk; chk = nchk) {
+ nchk = TAILQ_NEXT(chk, sctp_next);
+ if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) ||
+ (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK) || /* EY */
+ (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) ||
+ (chk->rec.chunk_id.id == SCTP_HEARTBEAT_ACK) ||
+ (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) ||
+ (chk->rec.chunk_id.id == SCTP_SHUTDOWN) ||
+ (chk->rec.chunk_id.id == SCTP_SHUTDOWN_ACK) ||
+ (chk->rec.chunk_id.id == SCTP_OPERATION_ERROR) ||
+ (chk->rec.chunk_id.id == SCTP_PACKET_DROPPED) ||
+ (chk->rec.chunk_id.id == SCTP_COOKIE_ACK) ||
+ (chk->rec.chunk_id.id == SCTP_ECN_CWR) ||
+ (chk->rec.chunk_id.id == SCTP_ASCONF_ACK)) {
+ /* Stray chunks must be cleaned up */
+ clean_up_anyway:
+ TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ asoc->ctrl_queue_cnt--;
+ if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN)
+ asoc->fwd_tsn_cnt--;
+ sctp_free_a_chunk(stcb, chk);
+ } else if (chk->rec.chunk_id.id == SCTP_STREAM_RESET) {
+ /* special handling, we must look into the param */
+ if (chk != asoc->str_reset) {
+ goto clean_up_anyway;
+ }
+ }
+ }
+}
+
+
+static int
+sctp_can_we_split_this(struct sctp_tcb *stcb,
+ uint32_t length,
+ uint32_t goal_mtu, uint32_t frag_point, int eeor_on)
+{
+ /*
+ * Make a decision on if I should split a msg into multiple parts.
+ * This is only asked of incomplete messages.
+ */
+ if (eeor_on) {
+ /*
+ * If we are doing EEOR we need to always send it if it's the
+ * entire thing, since it might be all the sender is putting in
+ * the hopper.
+ */
+ if (goal_mtu >= length) {
+ /*-
+ * If we have data outstanding,
+ * we get another chance when the sack
+ * arrives to transmit - wait for more data
+ */
+ if (stcb->asoc.total_flight == 0) {
+ /*
+ * If nothing is in flight, we zero the
+ * packet counter.
+ */
+ return (length);
+ }
+ return (0);
+
+ } else {
+ /* You can fill the rest */
+ return (goal_mtu);
+ }
+ }
+ /*-
+ * For those strange folk that make the send buffer
+ * smaller than our fragmentation point, we can't
+ * get a full msg in so we have to allow splitting.
+ */
+ if (SCTP_SB_LIMIT_SND(stcb->sctp_socket) < frag_point) {
+ return (length);
+ }
+ if ((length <= goal_mtu) ||
+ ((length - goal_mtu) < SCTP_BASE_SYSCTL(sctp_min_residual))) {
+ /* Sub-optimal residual; don't split in non-eeor mode. */
+ return (0);
+ }
+ /*
+ * If we reach here, length is larger than the goal_mtu. Do we wish
+ * to split it for the sake of packing chunks together?
+ */
+ if (goal_mtu >= min(SCTP_BASE_SYSCTL(sctp_min_split_point), frag_point)) {
+ /* It's ok to split it */
+ return (min(goal_mtu, frag_point));
+ }
+ /* Nope, can't split */
+ return (0);
+
+}
+
+static uint32_t
+sctp_move_to_outqueue(struct sctp_tcb *stcb,
+ struct sctp_stream_out *strq,
+ uint32_t goal_mtu,
+ uint32_t frag_point,
+ int *locked,
+ int *giveup,
+ int eeor_mode,
+ int *bail)
+{
+ /* Move from the stream to the send_queue keeping track of the total */
+ struct sctp_association *asoc;
+ struct sctp_stream_queue_pending *sp;
+ struct sctp_tmit_chunk *chk;
+ struct sctp_data_chunk *dchkh;
+ uint32_t to_move, length;
+ uint8_t rcv_flags = 0;
+ uint8_t some_taken;
+ uint8_t send_lock_up = 0;
+
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ asoc = &stcb->asoc;
+one_more_time:
+ /* sa_ignore FREED_MEMORY */
+ sp = TAILQ_FIRST(&strq->outqueue);
+ if (sp == NULL) {
+ *locked = 0;
+ if (send_lock_up == 0) {
+ SCTP_TCB_SEND_LOCK(stcb);
+ send_lock_up = 1;
+ }
+ sp = TAILQ_FIRST(&strq->outqueue);
+ if (sp) {
+ goto one_more_time;
+ }
+ if (strq->last_msg_incomplete) {
+ SCTP_PRINTF("Huh? Stream:%d lm_in_c=%d but queue is NULL\n",
+ strq->stream_no,
+ strq->last_msg_incomplete);
+ strq->last_msg_incomplete = 0;
+ }
+ to_move = 0;
+ if (send_lock_up) {
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ send_lock_up = 0;
+ }
+ goto out_of;
+ }
+ if ((sp->msg_is_complete) && (sp->length == 0)) {
+ if (sp->sender_all_done) {
+ /*
+ * We are doing deferred cleanup. Last time through,
+ * when we took all the data, the sender_all_done was
+ * not set.
+ */
+ if ((sp->put_last_out == 0) && (sp->discard_rest == 0)) {
+ SCTP_PRINTF("Gak, put out entire msg with NO end!-1\n");
+ SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d send_lock:%d\n",
+ sp->sender_all_done,
+ sp->length,
+ sp->msg_is_complete,
+ sp->put_last_out,
+ send_lock_up);
+ }
+ if ((TAILQ_NEXT(sp, next) == NULL) && (send_lock_up == 0)) {
+ SCTP_TCB_SEND_LOCK(stcb);
+ send_lock_up = 1;
+ }
+ atomic_subtract_int(&asoc->stream_queue_cnt, 1);
+ TAILQ_REMOVE(&strq->outqueue, sp, next);
+ if (sp->net) {
+ sctp_free_remote_addr(sp->net);
+ sp->net = NULL;
+ }
+ if (sp->data) {
+ sctp_m_freem(sp->data);
+ sp->data = NULL;
+ }
+ sctp_free_a_strmoq(stcb, sp);
+ /* we can't be locked to it */
+ *locked = 0;
+ stcb->asoc.locked_on_sending = NULL;
+ if (send_lock_up) {
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ send_lock_up = 0;
+ }
+ /* back to get the next msg */
+ goto one_more_time;
+ } else {
+ /*
+ * sender just finished this but still holds a
+ * reference
+ */
+ *locked = 1;
+ *giveup = 1;
+ to_move = 0;
+ goto out_of;
+ }
+ } else {
+ /* is there some to get */
+ if (sp->length == 0) {
+ /* no */
+ *locked = 1;
+ *giveup = 1;
+ to_move = 0;
+ goto out_of;
+ } else if (sp->discard_rest) {
+ if (send_lock_up == 0) {
+ SCTP_TCB_SEND_LOCK(stcb);
+ send_lock_up = 1;
+ }
+ /* Whack down the size */
+ atomic_subtract_int(&stcb->asoc.total_output_queue_size, sp->length);
+ if ((stcb->sctp_socket != NULL) && \
+ ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) {
+ atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc, sp->length);
+ }
+ if (sp->data) {
+ sctp_m_freem(sp->data);
+ sp->data = NULL;
+ sp->tail_mbuf = NULL;
+ }
+ sp->length = 0;
+ sp->some_taken = 1;
+ *locked = 1;
+ *giveup = 1;
+ to_move = 0;
+ goto out_of;
+ }
+ }
+ some_taken = sp->some_taken;
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ sp->msg_is_complete = 1;
+ }
+re_look:
+ length = sp->length;
+ if (sp->msg_is_complete) {
+ /* The message is complete */
+ to_move = min(length, frag_point);
+ if (to_move == length) {
+ /* All of it fits in the MTU */
+ if (sp->some_taken) {
+ rcv_flags |= SCTP_DATA_LAST_FRAG;
+ sp->put_last_out = 1;
+ } else {
+ rcv_flags |= SCTP_DATA_NOT_FRAG;
+ sp->put_last_out = 1;
+ }
+ } else {
+ /* Not all of it fits, we fragment */
+ if (sp->some_taken == 0) {
+ rcv_flags |= SCTP_DATA_FIRST_FRAG;
+ }
+ sp->some_taken = 1;
+ }
+ } else {
+ to_move = sctp_can_we_split_this(stcb, length, goal_mtu, frag_point, eeor_mode);
+ if (to_move) {
+ /*-
+ * We use a snapshot of length in case it
+ * is expanding during the compare.
+ */
+ uint32_t llen;
+
+ llen = length;
+ if (to_move >= llen) {
+ to_move = llen;
+ if (send_lock_up == 0) {
+ /*-
+ * We are taking all of an incomplete msg
+ * thus we need a send lock.
+ */
+ SCTP_TCB_SEND_LOCK(stcb);
+ send_lock_up = 1;
+ if (sp->msg_is_complete) {
+ /*
+ * the sender finished the
+ * msg
+ */
+ goto re_look;
+ }
+ }
+ }
+ if (sp->some_taken == 0) {
+ rcv_flags |= SCTP_DATA_FIRST_FRAG;
+ sp->some_taken = 1;
+ }
+ } else {
+ /* Nothing to take. */
+ if (sp->some_taken) {
+ *locked = 1;
+ }
+ *giveup = 1;
+ to_move = 0;
+ goto out_of;
+ }
+ }
+
+ /* If we reach here, we can copy out a chunk */
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ /* No chunk memory */
+ *giveup = 1;
+ to_move = 0;
+ goto out_of;
+ }
+ /*
+ * Setup for unordered if needed by looking at the user sent info
+ * flags.
+ */
+ if (sp->sinfo_flags & SCTP_UNORDERED) {
+ rcv_flags |= SCTP_DATA_UNORDERED;
+ }
+ if ((SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) && ((sp->sinfo_flags & SCTP_EOF) == SCTP_EOF)) ||
+ ((sp->sinfo_flags & SCTP_SACK_IMMEDIATELY) == SCTP_SACK_IMMEDIATELY)) {
+ rcv_flags |= SCTP_DATA_SACK_IMMEDIATELY;
+ }
+ /* clear out the chunk before setting up */
+ memset(chk, 0, sizeof(*chk));
+ chk->rec.data.rcv_flags = rcv_flags;
+
+ if (to_move >= length) {
+ /* we think we can steal the whole thing */
+ if ((sp->sender_all_done == 0) && (send_lock_up == 0)) {
+ SCTP_TCB_SEND_LOCK(stcb);
+ send_lock_up = 1;
+ }
+ if (to_move < sp->length) {
+ /* bail, it changed */
+ goto dont_do_it;
+ }
+ chk->data = sp->data;
+ chk->last_mbuf = sp->tail_mbuf;
+ /* register the stealing */
+ sp->data = sp->tail_mbuf = NULL;
+ } else {
+ struct mbuf *m;
+
+dont_do_it:
+ chk->data = SCTP_M_COPYM(sp->data, 0, to_move, M_DONTWAIT);
+ chk->last_mbuf = NULL;
+ if (chk->data == NULL) {
+ sp->some_taken = some_taken;
+ sctp_free_a_chunk(stcb, chk);
+ *bail = 1;
+ to_move = 0;
+ goto out_of;
+ }
+#ifdef SCTP_MBUF_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ struct mbuf *mat;
+
+ mat = chk->data;
+ while (mat) {
+ if (SCTP_BUF_IS_EXTENDED(mat)) {
+ sctp_log_mb(mat, SCTP_MBUF_ICOPY);
+ }
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ }
+#endif
+ /* Pull off the data */
+ m_adj(sp->data, to_move);
+		/* Now let's work our way down and compact it */
+ m = sp->data;
+ while (m && (SCTP_BUF_LEN(m) == 0)) {
+ sp->data = SCTP_BUF_NEXT(m);
+ SCTP_BUF_NEXT(m) = NULL;
+ if (sp->tail_mbuf == m) {
+ /*-
+ * Freeing tail? TSNH since
+ * we supposedly were taking less
+ * than the sp->length.
+ */
+#ifdef INVARIANTS
+ panic("Huh, freing tail? - TSNH");
+#else
+ SCTP_PRINTF("Huh, freeing tail? - TSNH\n");
+ sp->tail_mbuf = sp->data = NULL;
+ sp->length = 0;
+#endif
+
+ }
+ sctp_m_free(m);
+ m = sp->data;
+ }
+ }
+ if (SCTP_BUF_IS_EXTENDED(chk->data)) {
+ chk->copy_by_ref = 1;
+ } else {
+ chk->copy_by_ref = 0;
+ }
+ /*
+	 * get last_mbuf and counts of mb usage. This is ugly but hopefully
+	 * it's only one mbuf.
+ */
+ if (chk->last_mbuf == NULL) {
+ chk->last_mbuf = chk->data;
+ while (SCTP_BUF_NEXT(chk->last_mbuf) != NULL) {
+ chk->last_mbuf = SCTP_BUF_NEXT(chk->last_mbuf);
+ }
+ }
+ if (to_move > length) {
+ /*- This should not happen either
+ * since we always lower to_move to the size
+ * of sp->length if its larger.
+ */
+#ifdef INVARIANTS
+ panic("Huh, how can to_move be larger?");
+#else
+ SCTP_PRINTF("Huh, how can to_move be larger?\n");
+ sp->length = 0;
+#endif
+ } else {
+ atomic_subtract_int(&sp->length, to_move);
+ }
+ if (M_LEADINGSPACE(chk->data) < (int)sizeof(struct sctp_data_chunk)) {
+ /* Not enough room for a chunk header, get some */
+ struct mbuf *m;
+
+ m = sctp_get_mbuf_for_msg(1, 0, M_DONTWAIT, 0, MT_DATA);
+ if (m == NULL) {
+ /*
+ * we're in trouble here. _PREPEND below will free
+ * all the data if there is no leading space, so we
+ * must put the data back and restore.
+ */
+ if (send_lock_up == 0) {
+ SCTP_TCB_SEND_LOCK(stcb);
+ send_lock_up = 1;
+ }
+ if (chk->data == NULL) {
+ /* unsteal the data */
+ sp->data = chk->data;
+ sp->tail_mbuf = chk->last_mbuf;
+ } else {
+ struct mbuf *m_tmp;
+
+ /* reassemble the data */
+ m_tmp = sp->data;
+ sp->data = chk->data;
+ SCTP_BUF_NEXT(chk->last_mbuf) = m_tmp;
+ }
+ sp->some_taken = some_taken;
+ atomic_add_int(&sp->length, to_move);
+ chk->data = NULL;
+ *bail = 1;
+ sctp_free_a_chunk(stcb, chk);
+ to_move = 0;
+ goto out_of;
+ } else {
+ SCTP_BUF_LEN(m) = 0;
+ SCTP_BUF_NEXT(m) = chk->data;
+ chk->data = m;
+ M_ALIGN(chk->data, 4);
+ }
+ }
+ SCTP_BUF_PREPEND(chk->data, sizeof(struct sctp_data_chunk), M_DONTWAIT);
+ if (chk->data == NULL) {
+		/* HELP, TSNH since we assured above that it would not fail? */
+#ifdef INVARIANTS
+ panic("prepend failes HELP?");
+#else
+ SCTP_PRINTF("prepend fails HELP?\n");
+ sctp_free_a_chunk(stcb, chk);
+#endif
+ *bail = 1;
+ to_move = 0;
+ goto out_of;
+ }
+ sctp_snd_sb_alloc(stcb, sizeof(struct sctp_data_chunk));
+ chk->book_size = chk->send_size = (to_move + sizeof(struct sctp_data_chunk));
+ chk->book_size_scale = 0;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+
+ chk->flags = 0;
+ chk->asoc = &stcb->asoc;
+ chk->pad_inplace = 0;
+ chk->no_fr_allowed = 0;
+ chk->rec.data.stream_seq = sp->strseq;
+ chk->rec.data.stream_number = sp->stream;
+ chk->rec.data.payloadtype = sp->ppid;
+ chk->rec.data.context = sp->context;
+ chk->rec.data.doing_fast_retransmit = 0;
+ chk->rec.data.ect_nonce = 0; /* ECN Nonce */
+
+ chk->rec.data.timetodrop = sp->ts;
+ chk->flags = sp->act_flags;
+
+ if (sp->net) {
+ chk->whoTo = sp->net;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ } else
+ chk->whoTo = NULL;
+
+ if (sp->holds_key_ref) {
+ chk->auth_keyid = sp->auth_keyid;
+ sctp_auth_key_acquire(stcb, chk->auth_keyid);
+ chk->holds_key_ref = 1;
+ }
+ chk->rec.data.TSN_seq = atomic_fetchadd_int(&asoc->sending_seq, 1);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_AT_SEND_2_OUTQ) {
+ sctp_misc_ints(SCTP_STRMOUT_LOG_SEND,
+ (uintptr_t) stcb, sp->length,
+ (uint32_t) ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq),
+ chk->rec.data.TSN_seq);
+ }
+ dchkh = mtod(chk->data, struct sctp_data_chunk *);
+ /*
+ * Put the rest of the things in place now. Size was done earlier in
+ * previous loop prior to padding.
+ */
+
+#ifdef SCTP_ASOCLOG_OF_TSNS
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ if (asoc->tsn_out_at >= SCTP_TSN_LOG_SIZE) {
+ asoc->tsn_out_at = 0;
+ asoc->tsn_out_wrapped = 1;
+ }
+ asoc->out_tsnlog[asoc->tsn_out_at].tsn = chk->rec.data.TSN_seq;
+ asoc->out_tsnlog[asoc->tsn_out_at].strm = chk->rec.data.stream_number;
+ asoc->out_tsnlog[asoc->tsn_out_at].seq = chk->rec.data.stream_seq;
+ asoc->out_tsnlog[asoc->tsn_out_at].sz = chk->send_size;
+ asoc->out_tsnlog[asoc->tsn_out_at].flgs = chk->rec.data.rcv_flags;
+ asoc->out_tsnlog[asoc->tsn_out_at].stcb = (void *)stcb;
+ asoc->out_tsnlog[asoc->tsn_out_at].in_pos = asoc->tsn_out_at;
+ asoc->out_tsnlog[asoc->tsn_out_at].in_out = 2;
+ asoc->tsn_out_at++;
+#endif
+
+ dchkh->ch.chunk_type = SCTP_DATA;
+ dchkh->ch.chunk_flags = chk->rec.data.rcv_flags;
+ dchkh->dp.tsn = htonl(chk->rec.data.TSN_seq);
+ dchkh->dp.stream_id = htons(strq->stream_no);
+ dchkh->dp.stream_sequence = htons(chk->rec.data.stream_seq);
+ dchkh->dp.protocol_id = chk->rec.data.payloadtype;
+ dchkh->ch.chunk_length = htons(chk->send_size);
+ /* Now advance the chk->send_size by the actual pad needed. */
+ if (chk->send_size < SCTP_SIZE32(chk->book_size)) {
+ /* need a pad */
+ struct mbuf *lm;
+ int pads;
+
+ pads = SCTP_SIZE32(chk->book_size) - chk->send_size;
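+		/*
+		 * For example (illustrative, assuming the usual 16-byte DATA
+		 * chunk header): a 13-byte payload gives book_size = 13 + 16 =
+		 * 29, SCTP_SIZE32(29) = 32, so pads = 3 and send_size grows
+		 * to 32 once the pad is added.
+		 */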
+ if (sctp_pad_lastmbuf(chk->data, pads, chk->last_mbuf) == 0) {
+ chk->pad_inplace = 1;
+ }
+ if ((lm = SCTP_BUF_NEXT(chk->last_mbuf)) != NULL) {
+ /* pad added an mbuf */
+ chk->last_mbuf = lm;
+ }
+ chk->send_size += pads;
+ }
+ /* We only re-set the policy if it is on */
+ if (sp->pr_sctp_on) {
+ sctp_set_prsctp_policy(sp);
+ asoc->pr_sctp_cnt++;
+ chk->pr_sctp_on = 1;
+ } else {
+ chk->pr_sctp_on = 0;
+ }
+ if (sp->msg_is_complete && (sp->length == 0) && (sp->sender_all_done)) {
+ /* All done pull and kill the message */
+ atomic_subtract_int(&asoc->stream_queue_cnt, 1);
+ if (sp->put_last_out == 0) {
+ SCTP_PRINTF("Gak, put out entire msg with NO end!-2\n");
+ SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d send_lock:%d\n",
+ sp->sender_all_done,
+ sp->length,
+ sp->msg_is_complete,
+ sp->put_last_out,
+ send_lock_up);
+ }
+ if ((send_lock_up == 0) && (TAILQ_NEXT(sp, next) == NULL)) {
+ SCTP_TCB_SEND_LOCK(stcb);
+ send_lock_up = 1;
+ }
+ TAILQ_REMOVE(&strq->outqueue, sp, next);
+ if (sp->net) {
+ sctp_free_remote_addr(sp->net);
+ sp->net = NULL;
+ }
+ if (sp->data) {
+ sctp_m_freem(sp->data);
+ sp->data = NULL;
+ }
+ sctp_free_a_strmoq(stcb, sp);
+
+ /* we can't be locked to it */
+ *locked = 0;
+ stcb->asoc.locked_on_sending = NULL;
+ } else {
+ /* more to go, we are locked */
+ *locked = 1;
+ }
+ asoc->chunks_on_out_queue++;
+ TAILQ_INSERT_TAIL(&asoc->send_queue, chk, sctp_next);
+ asoc->send_queue_cnt++;
+out_of:
+ if (send_lock_up) {
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ send_lock_up = 0;
+ }
+ return (to_move);
+}
+
+
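+/*
+ * Round-robin stream selection: continue on the out_wheel after the stream
+ * we used last, wrapping back to the front of the wheel when we fall off
+ * the end (or when nothing has been sent yet).
+ */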
+static struct sctp_stream_out *
+sctp_select_a_stream(struct sctp_tcb *stcb, struct sctp_association *asoc)
+{
+ struct sctp_stream_out *strq;
+
+ /* Find the next stream to use */
+ if (asoc->last_out_stream == NULL) {
+ strq = TAILQ_FIRST(&asoc->out_wheel);
+ } else {
+ strq = TAILQ_NEXT(asoc->last_out_stream, next_spoke);
+ if (strq == NULL) {
+ strq = TAILQ_FIRST(&asoc->out_wheel);
+ }
+ }
+ return (strq);
+}
+
+
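+/*
+ * Pull data from the per-stream send queues onto the association's
+ * send_queue for one destination, up to roughly one MTU (goal_mtu) worth,
+ * honoring any per-message destination choice and any stream the
+ * association is currently locked on.
+ */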
+static void
+sctp_fill_outqueue(struct sctp_tcb *stcb,
+ struct sctp_nets *net, int frag_point, int eeor_mode, int *quit_now)
+{
+ struct sctp_association *asoc;
+ struct sctp_stream_out *strq, *strqn;
+ int goal_mtu, moved_how_much, total_moved = 0, bail = 0;
+ int locked, giveup;
+ struct sctp_stream_queue_pending *sp;
+
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ asoc = &stcb->asoc;
+#ifdef INET6
+ if (net->ro._l_addr.sin6.sin6_family == AF_INET6) {
+ goal_mtu = net->mtu - SCTP_MIN_OVERHEAD;
+ } else {
+ /* ?? not sure what else to do */
+ goal_mtu = net->mtu - SCTP_MIN_V4_OVERHEAD;
+ }
+#else
+ goal_mtu = net->mtu - SCTP_MIN_OVERHEAD;
+#endif
+ /* Need an allowance for the data chunk header too */
+ goal_mtu -= sizeof(struct sctp_data_chunk);
+
+ /* must make even word boundary */
+ goal_mtu &= 0xfffffffc;
+ if (asoc->locked_on_sending) {
+ /* We are stuck on one stream until the message completes. */
+ strq = asoc->locked_on_sending;
+ locked = 1;
+ } else {
+ strq = sctp_select_a_stream(stcb, asoc);
+ locked = 0;
+ }
+ strqn = strq;
+ while ((goal_mtu > 0) && strq) {
+ sp = TAILQ_FIRST(&strq->outqueue);
+ if (sp == NULL) {
+ break;
+ }
+ /**
+ * Honor the users' choice if given. If not given,
+		 * pull it only to the primary path when CMT is
+		 * not in use.
+ */
+ if (((sp->net != NULL) &&
+ (sp->net != net)) ||
+ ((sp->net == NULL) &&
+ (asoc->sctp_cmt_on_off == 0) &&
+ (asoc->primary_destination != net))) {
+ /* Do not pull to this network */
+ if (locked) {
+ break;
+ } else {
+ strq = sctp_select_a_stream(stcb, asoc);
+ if (strq == NULL)
+ /* none left */
+ break;
+ if (strqn == strq) {
+ /* I have circled */
+ break;
+ }
+ continue;
+ }
+ }
+ giveup = 0;
+ bail = 0;
+ moved_how_much = sctp_move_to_outqueue(stcb, strq, goal_mtu, frag_point, &locked,
+ &giveup, eeor_mode, &bail);
+ if (moved_how_much)
+ asoc->last_out_stream = strq;
+
+ if (locked) {
+ asoc->locked_on_sending = strq;
+ if ((moved_how_much == 0) || (giveup) || bail)
+ /* no more to move for now */
+ break;
+ } else {
+ asoc->locked_on_sending = NULL;
+ if (TAILQ_EMPTY(&strq->outqueue)) {
+ if (strq == strqn) {
+ /* Must move start to next one */
+ strqn = TAILQ_NEXT(strq, next_spoke);
+ if (strqn == NULL) {
+ strqn = TAILQ_FIRST(&asoc->out_wheel);
+ if (strqn == NULL) {
+ break;
+ }
+ }
+ }
+ sctp_remove_from_wheel(stcb, asoc, strq, 0);
+ }
+ if ((giveup) || bail) {
+ break;
+ }
+ strq = sctp_select_a_stream(stcb, asoc);
+ if (strq == NULL) {
+ break;
+ }
+ }
+ total_moved += moved_how_much;
+ goal_mtu -= (moved_how_much + sizeof(struct sctp_data_chunk));
+ goal_mtu &= 0xfffffffc;
+ }
+ if (bail)
+ *quit_now = 1;
+
+ if (total_moved == 0) {
+ if ((stcb->asoc.sctp_cmt_on_off == 0) &&
+ (net == stcb->asoc.primary_destination)) {
+ /* ran dry for primary network net */
+ SCTP_STAT_INCR(sctps_primary_randry);
+ } else if (stcb->asoc.sctp_cmt_on_off == 1) {
+ /* ran dry with CMT on */
+ SCTP_STAT_INCR(sctps_cmt_randry);
+ }
+ }
+}
+
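+/*
+ * Walk the control queue and mark any queued ECN-ECHO chunks as unsent
+ * again so that they will be (re)transmitted on the next output pass.
+ */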
+void
+sctp_fix_ecn_echo(struct sctp_association *asoc)
+{
+ struct sctp_tmit_chunk *chk;
+
+ TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
+ if (chk->rec.chunk_id.id == SCTP_ECN_ECHO) {
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ }
+ }
+}
+
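+/*
+ * Detach pending stream data and queued-but-unsent chunks from the given
+ * net so they can be reassigned to another destination (used e.g. when a
+ * destination becomes unreachable during a send).
+ */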
+void
+sctp_move_chunks_from_net(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ struct sctp_association *asoc;
+ struct sctp_stream_out *outs;
+ struct sctp_tmit_chunk *chk;
+ struct sctp_stream_queue_pending *sp;
+
+ if (net == NULL) {
+ return;
+ }
+ asoc = &stcb->asoc;
+ TAILQ_FOREACH(outs, &asoc->out_wheel, next_spoke) {
+ TAILQ_FOREACH(sp, &outs->outqueue, next) {
+ if (sp->net == net) {
+ sctp_free_remote_addr(sp->net);
+ sp->net = NULL;
+ }
+ }
+ }
+ TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) {
+ if (chk->whoTo == net) {
+ sctp_free_remote_addr(chk->whoTo);
+ chk->whoTo = NULL;
+ }
+ }
+}
+
+int
+sctp_med_chunk_output(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ int *num_out,
+ int *reason_code,
+ int control_only, int from_where,
+ struct timeval *now, int *now_filled, int frag_point, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ /*
+	 * OK, this is the generic chunk service queue. We must do the
+	 * following:
+	 * - Service the stream queue that is next, moving any message (note
+	 *   we must get a complete message, i.e. FIRST/MIDDLE and LAST, to
+	 *   the out queue in one pass) and assigning TSNs.
+	 * - Check to see if the cwnd/rwnd allows any output; if so, go ahead
+	 *   and formulate and send the low-level chunks, making sure to
+	 *   combine any control in the control chunk queue also.
+ */
+ struct sctp_nets *net, *start_at, *old_start_at = NULL;
+ struct mbuf *outchain, *endoutchain;
+ struct sctp_tmit_chunk *chk, *nchk;
+
+ /* temp arrays for unlinking */
+ struct sctp_tmit_chunk *data_list[SCTP_MAX_DATA_BUNDLING];
+ int no_fragmentflg, error;
+ unsigned int max_rwnd_per_dest, max_send_per_dest;
+ int one_chunk, hbflag, skip_data_for_this_net;
+ int asconf, cookie, no_out_cnt;
+ int bundle_at, ctl_cnt, no_data_chunks, eeor_mode;
+ unsigned int mtu, r_mtu, omtu, mx_mtu, to_out;
+ int tsns_sent = 0;
+ uint32_t auth_offset = 0;
+ struct sctp_auth_chunk *auth = NULL;
+ uint16_t auth_keyid;
+ int override_ok = 1;
+ int data_auth_reqd = 0;
+
+ /*
+ * JRS 5/14/07 - Add flag for whether a heartbeat is sent to the
+ * destination.
+ */
+ int pf_hbflag = 0;
+ int quit_now = 0;
+
+ *num_out = 0;
+ auth_keyid = stcb->asoc.authinfo.active_keyid;
+
+ if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) ||
+ (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED) ||
+ (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) {
+ eeor_mode = 1;
+ } else {
+ eeor_mode = 0;
+ }
+ ctl_cnt = no_out_cnt = asconf = cookie = 0;
+ /*
+	 * First let's prime the pump. For each destination, if there is room
+ * in the flight size, attempt to pull an MTU's worth out of the
+ * stream queues into the general send_queue
+ */
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xC2, 2);
+#endif
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ hbflag = 0;
+ if ((control_only) || (asoc->stream_reset_outstanding))
+ no_data_chunks = 1;
+ else
+ no_data_chunks = 0;
+
+	/* Nothing possible to send? */
+ if (TAILQ_EMPTY(&asoc->control_send_queue) &&
+ TAILQ_EMPTY(&asoc->asconf_send_queue) &&
+ TAILQ_EMPTY(&asoc->send_queue) &&
+ TAILQ_EMPTY(&asoc->out_wheel)) {
+ *reason_code = 9;
+ return (0);
+ }
+ if (asoc->peers_rwnd == 0) {
+ /* No room in peers rwnd */
+ *reason_code = 1;
+ if (asoc->total_flight > 0) {
+ /* we are allowed one chunk in flight */
+ no_data_chunks = 1;
+ }
+ }
+ max_rwnd_per_dest = ((asoc->peers_rwnd + asoc->total_flight) / asoc->numnets);
+ if (stcb->sctp_socket)
+ max_send_per_dest = SCTP_SB_LIMIT_SND(stcb->sctp_socket) / asoc->numnets;
+ else
+ max_send_per_dest = 0;
+ if ((no_data_chunks == 0) && (!TAILQ_EMPTY(&asoc->out_wheel))) {
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ /*
+ * This for loop we are in takes in each net, if
+			 * it's got space in cwnd and has data sent to it
+ * (when CMT is off) then it calls
+ * sctp_fill_outqueue for the net. This gets data on
+ * the send queue for that network.
+ *
+ * In sctp_fill_outqueue TSN's are assigned and data is
+ * copied out of the stream buffers. Note mostly
+ * copy by reference (we hope).
+ */
+ net->window_probe = 0;
+ if ((net->dest_state & SCTP_ADDR_NOT_REACHABLE) ||
+ (net->dest_state & SCTP_ADDR_UNCONFIRMED)) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, 1,
+ SCTP_CWND_LOG_FILL_OUTQ_CALLED);
+ }
+ continue;
+ }
+ if ((asoc->sctp_cmt_on_off == 0) &&
+ (asoc->primary_destination != net) &&
+ (net->ref_count < 2)) {
+ /* nothing can be in queue for this guy */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, 2,
+ SCTP_CWND_LOG_FILL_OUTQ_CALLED);
+ }
+ continue;
+ }
+ if (net->flight_size >= net->cwnd) {
+ /* skip this network, no room - can't fill */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, 3,
+ SCTP_CWND_LOG_FILL_OUTQ_CALLED);
+ }
+ continue;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, 4, SCTP_CWND_LOG_FILL_OUTQ_CALLED);
+ }
+ sctp_fill_outqueue(stcb, net, frag_point, eeor_mode, &quit_now);
+ if (quit_now) {
+ /* memory alloc failure */
+ no_data_chunks = 1;
+ break;
+ }
+ }
+ }
+ /* now service each destination and send out what we can for it */
+ /* Nothing to send? */
+ if (TAILQ_EMPTY(&asoc->control_send_queue) &&
+ TAILQ_EMPTY(&asoc->asconf_send_queue) &&
+ TAILQ_EMPTY(&asoc->send_queue)) {
+ *reason_code = 8;
+ return (0);
+ }
+ if (asoc->sctp_cmt_on_off == 1) {
+ /* get the last start point */
+ start_at = asoc->last_net_cmt_send_started;
+ if (start_at == NULL) {
+ /* null so to beginning */
+ start_at = TAILQ_FIRST(&asoc->nets);
+ } else {
+ start_at = TAILQ_NEXT(asoc->last_net_cmt_send_started, sctp_next);
+ if (start_at == NULL) {
+ start_at = TAILQ_FIRST(&asoc->nets);
+ }
+ }
+ asoc->last_net_cmt_send_started = start_at;
+ } else {
+ start_at = TAILQ_FIRST(&asoc->nets);
+ }
+ old_start_at = NULL;
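+	/*
+	 * The loop below may start in the middle of the nets list (the CMT
+	 * round-robin start point), so it runs at most twice: once from
+	 * start_at to the end of the list, then again from the head back up
+	 * to the original start_at (see the old_start_at handling below).
+	 */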
+again_one_more_time:
+ for (net = start_at; net != NULL; net = TAILQ_NEXT(net, sctp_next)) {
+ /* how much can we send? */
+ /* SCTPDBG("Examine for sending net:%x\n", (uint32_t)net); */
+ if (old_start_at && (old_start_at == net)) {
+			/* went through the list completely. */
+ break;
+ }
+ tsns_sent = 0xa;
+ if ((asoc->sctp_cmt_on_off == 0) &&
+ (asoc->primary_destination != net) &&
+ (net->ref_count < 2)) {
+ /*
+ * Ref-count of 1 so we cannot have data or control
+ * queued to this address. Skip it (non-CMT).
+ */
+ continue;
+ }
+ if (TAILQ_EMPTY(&asoc->control_send_queue) &&
+ TAILQ_EMPTY(&asoc->asconf_send_queue) &&
+ (net->flight_size >= net->cwnd)) {
+ /*
+ * Nothing on control or asconf and flight is full,
+ * we can skip even in the CMT case.
+ */
+ continue;
+ }
+ ctl_cnt = bundle_at = 0;
+ endoutchain = outchain = NULL;
+ no_fragmentflg = 1;
+ one_chunk = 0;
+ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) {
+ skip_data_for_this_net = 1;
+ } else {
+ skip_data_for_this_net = 0;
+ }
+ if ((net->ro.ro_rt) && (net->ro.ro_rt->rt_ifp)) {
+ /*
+ * if we have a route and an ifp check to see if we
+ * have room to send to this guy
+ */
+ struct ifnet *ifp;
+
+ ifp = net->ro.ro_rt->rt_ifp;
+ if ((ifp->if_snd.ifq_len + 2) >= ifp->if_snd.ifq_maxlen) {
+ SCTP_STAT_INCR(sctps_ifnomemqueued);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) {
+ sctp_log_maxburst(stcb, net, ifp->if_snd.ifq_len, ifp->if_snd.ifq_maxlen, SCTP_MAX_IFP_APPLIED);
+ }
+ continue;
+ }
+ }
+ switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) {
+ case AF_INET:
+ mtu = net->mtu - (sizeof(struct ip) + sizeof(struct sctphdr));
+ break;
+#ifdef INET6
+ case AF_INET6:
+ mtu = net->mtu - (sizeof(struct ip6_hdr) + sizeof(struct sctphdr));
+ break;
+#endif
+ default:
+ /* TSNH */
+ mtu = net->mtu;
+ break;
+ }
+ mx_mtu = mtu;
+ to_out = 0;
+ if (mtu > asoc->peers_rwnd) {
+ if (asoc->total_flight > 0) {
+ /* We have a packet in flight somewhere */
+ r_mtu = asoc->peers_rwnd;
+ } else {
+ /* We are always allowed to send one MTU out */
+ one_chunk = 1;
+ r_mtu = mtu;
+ }
+ } else {
+ r_mtu = mtu;
+ }
+ /************************/
+ /* ASCONF transmission */
+ /************************/
+		/* First, let's go through the asconf queue */
+ for (chk = TAILQ_FIRST(&asoc->asconf_send_queue);
+ chk; chk = nchk) {
+ nchk = TAILQ_NEXT(chk, sctp_next);
+ if (chk->rec.chunk_id.id != SCTP_ASCONF) {
+ continue;
+ }
+ if (chk->whoTo != net) {
+ /*
+ * No, not sent to the network we are
+ * looking at
+ */
+ break;
+ }
+ if (chk->data == NULL) {
+ break;
+ }
+ if (chk->sent != SCTP_DATAGRAM_UNSENT &&
+ chk->sent != SCTP_DATAGRAM_RESEND) {
+ break;
+ }
+ /*
+ * if no AUTH is yet included and this chunk
+ * requires it, make sure to account for it. We
+ * don't apply the size until the AUTH chunk is
+ * actually added below in case there is no room for
+ * this chunk. NOTE: we overload the use of "omtu"
+ * here
+ */
+ if ((auth == NULL) &&
+ sctp_auth_is_required_chunk(chk->rec.chunk_id.id,
+ stcb->asoc.peer_auth_chunks)) {
+ omtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
+ } else
+ omtu = 0;
+ /* Here we do NOT factor the r_mtu */
+ if ((chk->send_size < (int)(mtu - omtu)) ||
+ (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) {
+ /*
+ * We probably should glom the mbuf chain
+ * from the chk->data for control but the
+ * problem is it becomes yet one more level
+ * of tracking to do if for some reason
+ * output fails. Then I have got to
+ * reconstruct the merged control chain.. el
+ * yucko.. for now we take the easy way and
+ * do the copy
+ */
+ /*
+ * Add an AUTH chunk, if chunk requires it
+ * save the offset into the chain for AUTH
+ */
+ if ((auth == NULL) &&
+ (sctp_auth_is_required_chunk(chk->rec.chunk_id.id,
+ stcb->asoc.peer_auth_chunks))) {
+ outchain = sctp_add_auth_chunk(outchain,
+ &endoutchain,
+ &auth,
+ &auth_offset,
+ stcb,
+ chk->rec.chunk_id.id);
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ }
+ outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain,
+ (int)chk->rec.chunk_id.can_take_data,
+ chk->send_size, chk->copy_by_ref);
+ if (outchain == NULL) {
+ *reason_code = 8;
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ /* update our MTU size */
+ if (mtu > (chk->send_size + omtu))
+ mtu -= (chk->send_size + omtu);
+ else
+ mtu = 0;
+ to_out += (chk->send_size + omtu);
+ /* Do clear IP_DF ? */
+ if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) {
+ no_fragmentflg = 0;
+ }
+ if (chk->rec.chunk_id.can_take_data)
+ chk->data = NULL;
+ /*
+ * set hb flag since we can use these for
+ * RTO
+ */
+ hbflag = 1;
+ asconf = 1;
+ /*
+ * should sysctl this: don't bundle data
+ * with ASCONF since it requires AUTH
+ */
+ no_data_chunks = 1;
+ chk->sent = SCTP_DATAGRAM_SENT;
+ chk->snd_count++;
+ if (mtu == 0) {
+ /*
+ * Ok we are out of room but we can
+					 * output without affecting the
+ * flight size since this little guy
+ * is a control only packet.
+ */
+ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, net);
+ /*
+ * do NOT clear the asconf flag as
+ * it is used to do appropriate
+ * source address selection.
+ */
+ if ((error = sctp_lowlevel_chunk_output(inp, stcb, net,
+ (struct sockaddr *)&net->ro._l_addr,
+ outchain, auth_offset, auth,
+ stcb->asoc.authinfo.active_keyid,
+ no_fragmentflg, 0, NULL, asconf,
+ inp->sctp_lport, stcb->rport,
+ htonl(stcb->asoc.peer_vtag),
+ net->port, so_locked, NULL))) {
+ if (error == ENOBUFS) {
+ asoc->ifp_had_enobuf = 1;
+ SCTP_STAT_INCR(sctps_lowlevelerr);
+ }
+ if (from_where == 0) {
+ SCTP_STAT_INCR(sctps_lowlevelerrusr);
+ }
+ if (*now_filled == 0) {
+ (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
+ *now_filled = 1;
+ *now = net->last_sent_time;
+ } else {
+ net->last_sent_time = *now;
+ }
+ hbflag = 0;
+ /* error, could not output */
+ if (error == EHOSTUNREACH) {
+ /*
+ * Destination went
+ * unreachable
+ * during this send
+ */
+ sctp_move_chunks_from_net(stcb, net);
+ }
+ *reason_code = 7;
+ continue;
+ } else
+ asoc->ifp_had_enobuf = 0;
+ if (*now_filled == 0) {
+ (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
+ *now_filled = 1;
+ *now = net->last_sent_time;
+ } else {
+ net->last_sent_time = *now;
+ }
+ hbflag = 0;
+ /*
+					 * increase the number we sent; if a
+ * cookie is sent we don't tell them
+ * any was sent out.
+ */
+ outchain = endoutchain = NULL;
+ auth = NULL;
+ auth_offset = 0;
+ if (!no_out_cnt)
+ *num_out += ctl_cnt;
+ /* recalc a clean slate and setup */
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ mtu = (net->mtu - SCTP_MIN_OVERHEAD);
+ } else {
+ mtu = (net->mtu - SCTP_MIN_V4_OVERHEAD);
+ }
+ to_out = 0;
+ no_fragmentflg = 1;
+ }
+ }
+ }
+ /************************/
+ /* Control transmission */
+ /************************/
+		/* Now let's go through the control queue */
+ for (chk = TAILQ_FIRST(&asoc->control_send_queue);
+ chk; chk = nchk) {
+ nchk = TAILQ_NEXT(chk, sctp_next);
+ if (chk->whoTo != net) {
+ /*
+ * No, not sent to the network we are
+ * looking at
+ */
+ continue;
+ }
+ if (chk->data == NULL) {
+ continue;
+ }
+ if (chk->sent != SCTP_DATAGRAM_UNSENT) {
+ /*
+ * It must be unsent. Cookies and ASCONF's
+				 * hang around but their timers will force a send
+ * when marked for resend.
+ */
+ continue;
+ }
+ /*
+ * if no AUTH is yet included and this chunk
+ * requires it, make sure to account for it. We
+ * don't apply the size until the AUTH chunk is
+ * actually added below in case there is no room for
+ * this chunk. NOTE: we overload the use of "omtu"
+ * here
+ */
+ if ((auth == NULL) &&
+ sctp_auth_is_required_chunk(chk->rec.chunk_id.id,
+ stcb->asoc.peer_auth_chunks)) {
+ omtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
+ } else
+ omtu = 0;
+ /* Here we do NOT factor the r_mtu */
+ if ((chk->send_size <= (int)(mtu - omtu)) ||
+ (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) {
+ /*
+ * We probably should glom the mbuf chain
+ * from the chk->data for control but the
+ * problem is it becomes yet one more level
+ * of tracking to do if for some reason
+ * output fails. Then I have got to
+ * reconstruct the merged control chain.. el
+ * yucko.. for now we take the easy way and
+ * do the copy
+ */
+ /*
+ * Add an AUTH chunk, if chunk requires it
+ * save the offset into the chain for AUTH
+ */
+ if ((auth == NULL) &&
+ (sctp_auth_is_required_chunk(chk->rec.chunk_id.id,
+ stcb->asoc.peer_auth_chunks))) {
+ outchain = sctp_add_auth_chunk(outchain,
+ &endoutchain,
+ &auth,
+ &auth_offset,
+ stcb,
+ chk->rec.chunk_id.id);
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ }
+ outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain,
+ (int)chk->rec.chunk_id.can_take_data,
+ chk->send_size, chk->copy_by_ref);
+ if (outchain == NULL) {
+ *reason_code = 8;
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ /* update our MTU size */
+ if (mtu > (chk->send_size + omtu))
+ mtu -= (chk->send_size + omtu);
+ else
+ mtu = 0;
+ to_out += (chk->send_size + omtu);
+ /* Do clear IP_DF ? */
+ if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) {
+ no_fragmentflg = 0;
+ }
+ if (chk->rec.chunk_id.can_take_data)
+ chk->data = NULL;
+ /* Mark things to be removed, if needed */
+ if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) ||
+ (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK) || /* EY */
+ (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) ||
+ (chk->rec.chunk_id.id == SCTP_HEARTBEAT_ACK) ||
+ (chk->rec.chunk_id.id == SCTP_SHUTDOWN) ||
+ (chk->rec.chunk_id.id == SCTP_SHUTDOWN_ACK) ||
+ (chk->rec.chunk_id.id == SCTP_OPERATION_ERROR) ||
+ (chk->rec.chunk_id.id == SCTP_COOKIE_ACK) ||
+ (chk->rec.chunk_id.id == SCTP_ECN_CWR) ||
+ (chk->rec.chunk_id.id == SCTP_PACKET_DROPPED) ||
+ (chk->rec.chunk_id.id == SCTP_ASCONF_ACK)) {
+
+ if (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) {
+ hbflag = 1;
+ /*
+ * JRS 5/14/07 - Set the
+ * flag to say a heartbeat
+ * is being sent.
+ */
+ pf_hbflag = 1;
+ }
+ /* remove these chunks at the end */
+ if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) ||
+ (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK)) {
+ /* turn off the timer */
+ if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_RECV,
+ inp, stcb, net, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_1);
+ }
+ }
+ ctl_cnt++;
+ } else {
+ /*
+ * Other chunks, since they have
+ * timers running (i.e. COOKIE) we
+ * just "trust" that it gets sent or
+ * retransmitted.
+ */
+ ctl_cnt++;
+ if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) {
+ cookie = 1;
+ no_out_cnt = 1;
+ }
+ chk->sent = SCTP_DATAGRAM_SENT;
+ chk->snd_count++;
+ }
+ if (mtu == 0) {
+ /*
+ * Ok we are out of room but we can
+					 * output without affecting the
+ * flight size since this little guy
+ * is a control only packet.
+ */
+ if (asconf) {
+ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, net);
+ /*
+ * do NOT clear the asconf
+ * flag as it is used to do
+ * appropriate source
+ * address selection.
+ */
+ }
+ if (cookie) {
+ sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net);
+ cookie = 0;
+ }
+ if ((error = sctp_lowlevel_chunk_output(inp, stcb, net,
+ (struct sockaddr *)&net->ro._l_addr,
+ outchain,
+ auth_offset, auth,
+ stcb->asoc.authinfo.active_keyid,
+ no_fragmentflg, 0, NULL, asconf,
+ inp->sctp_lport, stcb->rport,
+ htonl(stcb->asoc.peer_vtag),
+ net->port, so_locked, NULL))) {
+ if (error == ENOBUFS) {
+ asoc->ifp_had_enobuf = 1;
+ SCTP_STAT_INCR(sctps_lowlevelerr);
+ }
+ if (from_where == 0) {
+ SCTP_STAT_INCR(sctps_lowlevelerrusr);
+ }
+ /* error, could not output */
+ if (hbflag) {
+ if (*now_filled == 0) {
+ (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
+ *now_filled = 1;
+ *now = net->last_sent_time;
+ } else {
+ net->last_sent_time = *now;
+ }
+ hbflag = 0;
+ }
+ if (error == EHOSTUNREACH) {
+ /*
+ * Destination went
+ * unreachable
+ * during this send
+ */
+ sctp_move_chunks_from_net(stcb, net);
+ }
+ *reason_code = 7;
+ continue;
+ } else
+ asoc->ifp_had_enobuf = 0;
+ /* Only HB or ASCONF advances time */
+ if (hbflag) {
+ if (*now_filled == 0) {
+ (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
+ *now_filled = 1;
+ *now = net->last_sent_time;
+ } else {
+ net->last_sent_time = *now;
+ }
+ hbflag = 0;
+ }
+ /*
+				 * increase the number we sent; if a
+ * cookie is sent we don't tell them
+ * any was sent out.
+ */
+ outchain = endoutchain = NULL;
+ auth = NULL;
+ auth_offset = 0;
+ if (!no_out_cnt)
+ *num_out += ctl_cnt;
+ /* recalc a clean slate and setup */
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ mtu = (net->mtu - SCTP_MIN_OVERHEAD);
+ } else {
+ mtu = (net->mtu - SCTP_MIN_V4_OVERHEAD);
+ }
+ to_out = 0;
+ no_fragmentflg = 1;
+ }
+ }
+ }
+ /* JRI: if dest is in PF state, do not send data to it */
+ if ((asoc->sctp_cmt_on_off == 1) &&
+ (asoc->sctp_cmt_pf > 0) &&
+ (net->dest_state & SCTP_ADDR_PF)) {
+ goto no_data_fill;
+ }
+ if (net->flight_size >= net->cwnd) {
+ goto no_data_fill;
+ }
+ if ((asoc->sctp_cmt_on_off == 1) &&
+ (SCTP_BASE_SYSCTL(sctp_buffer_splitting) & SCTP_RECV_BUFFER_SPLITTING) &&
+ (net->flight_size > max_rwnd_per_dest)) {
+ goto no_data_fill;
+ }
+ /*
+ * We need a specific accounting for the usage of the send
+ * buffer. We also need to check the number of messages per
+		 * net. For now, this is better than nothing and it is
+		 * disabled by default...
+ */
+ if ((asoc->sctp_cmt_on_off == 1) &&
+ (SCTP_BASE_SYSCTL(sctp_buffer_splitting) & SCTP_SEND_BUFFER_SPLITTING) &&
+ (max_send_per_dest > 0) &&
+ (net->flight_size > max_send_per_dest)) {
+ goto no_data_fill;
+ }
+ /*********************/
+ /* Data transmission */
+ /*********************/
+ /*
+ * if AUTH for DATA is required and no AUTH has been added
+ * yet, account for this in the mtu now... if no data can be
+ * bundled, this adjustment won't matter anyways since the
+ * packet will be going out...
+ */
+ data_auth_reqd = sctp_auth_is_required_chunk(SCTP_DATA,
+ stcb->asoc.peer_auth_chunks);
+ if (data_auth_reqd && (auth == NULL)) {
+ mtu -= sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
+ }
+		/* now let's add any data within the MTU constraints */
+ switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) {
+ case AF_INET:
+ if (net->mtu > (sizeof(struct ip) + sizeof(struct sctphdr)))
+ omtu = net->mtu - (sizeof(struct ip) + sizeof(struct sctphdr));
+ else
+ omtu = 0;
+ break;
+#ifdef INET6
+ case AF_INET6:
+ if (net->mtu > (sizeof(struct ip6_hdr) + sizeof(struct sctphdr)))
+ omtu = net->mtu - (sizeof(struct ip6_hdr) + sizeof(struct sctphdr));
+ else
+ omtu = 0;
+ break;
+#endif
+ default:
+ /* TSNH */
+ omtu = 0;
+ break;
+ }
+ if ((((asoc->state & SCTP_STATE_OPEN) == SCTP_STATE_OPEN) &&
+ (skip_data_for_this_net == 0)) ||
+ (cookie)) {
+ for (chk = TAILQ_FIRST(&asoc->send_queue); chk; chk = nchk) {
+ if (no_data_chunks) {
+ /* let only control go out */
+ *reason_code = 1;
+ break;
+ }
+ if (net->flight_size >= net->cwnd) {
+ /* skip this net, no room for data */
+ *reason_code = 2;
+ break;
+ }
+ nchk = TAILQ_NEXT(chk, sctp_next);
+ if ((chk->whoTo != NULL) &&
+ (chk->whoTo != net)) {
+ /* Don't send the chunk on this net */
+ continue;
+ }
+ if ((chk->send_size > omtu) && ((chk->flags & CHUNK_FLAGS_FRAGMENT_OK) == 0)) {
+ /*-
+ * strange, we have a chunk that is
+					 * too big for its destination and
+ * yet no fragment ok flag.
+ * Something went wrong when the
+ * PMTU changed...we did not mark
+ * this chunk for some reason?? I
+ * will fix it here by letting IP
+ * fragment it for now and printing
+ * a warning. This really should not
+ * happen ...
+ */
+ SCTP_PRINTF("Warning chunk of %d bytes > mtu:%d and yet PMTU disc missed\n",
+ chk->send_size, mtu);
+ chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) &&
+ ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) == SCTP_STATE_SHUTDOWN_PENDING)) {
+ struct sctp_data_chunk *dchkh;
+
+ dchkh = mtod(chk->data, struct sctp_data_chunk *);
+ dchkh->ch.chunk_flags |= SCTP_DATA_SACK_IMMEDIATELY;
+ }
+ if (((chk->send_size <= mtu) && (chk->send_size <= r_mtu)) ||
+ ((chk->flags & CHUNK_FLAGS_FRAGMENT_OK) && (chk->send_size <= asoc->peers_rwnd))) {
+ /* ok we will add this one */
+
+ /*
+ * Add an AUTH chunk, if chunk
+ * requires it, save the offset into
+ * the chain for AUTH
+ */
+ if (data_auth_reqd) {
+ if (auth == NULL) {
+ outchain = sctp_add_auth_chunk(outchain,
+ &endoutchain,
+ &auth,
+ &auth_offset,
+ stcb,
+ SCTP_DATA);
+ auth_keyid = chk->auth_keyid;
+ override_ok = 0;
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ } else if (override_ok) {
+ /*
+ * use this data's
+ * keyid
+ */
+ auth_keyid = chk->auth_keyid;
+ override_ok = 0;
+ } else if (auth_keyid != chk->auth_keyid) {
+ /*
+ * different keyid,
+ * so done bundling
+ */
+ break;
+ }
+ }
+ outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain, 0,
+ chk->send_size, chk->copy_by_ref);
+ if (outchain == NULL) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "No memory?\n");
+ if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
+ }
+ *reason_code = 3;
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+					/* update our MTU size */
+ /* Do clear IP_DF ? */
+ if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) {
+ no_fragmentflg = 0;
+ }
+ /* unsigned subtraction of mtu */
+ if (mtu > chk->send_size)
+ mtu -= chk->send_size;
+ else
+ mtu = 0;
+ /* unsigned subtraction of r_mtu */
+ if (r_mtu > chk->send_size)
+ r_mtu -= chk->send_size;
+ else
+ r_mtu = 0;
+
+ to_out += chk->send_size;
+ if ((to_out > mx_mtu) && no_fragmentflg) {
+#ifdef INVARIANTS
+ panic("Exceeding mtu of %d out size is %d", mx_mtu, to_out);
+#else
+ SCTP_PRINTF("Exceeding mtu of %d out size is %d\n",
+ mx_mtu, to_out);
+#endif
+ }
+ chk->window_probe = 0;
+ data_list[bundle_at++] = chk;
+ if (bundle_at >= SCTP_MAX_DATA_BUNDLING) {
+ mtu = 0;
+ break;
+ }
+ if (chk->sent == SCTP_DATAGRAM_UNSENT) {
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) {
+ SCTP_STAT_INCR_COUNTER64(sctps_outorderchunks);
+ } else {
+ SCTP_STAT_INCR_COUNTER64(sctps_outunorderchunks);
+ }
+ if (((chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) == SCTP_DATA_LAST_FRAG) &&
+ ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0))
+ /*
+ * Count number of
+ * user msg's that
+ * were fragmented
+ * we do this by
+ * counting when we
+ * see a LAST
+ * fragment only.
+ */
+ SCTP_STAT_INCR_COUNTER64(sctps_fragusrmsgs);
+ }
+ if ((mtu == 0) || (r_mtu == 0) || (one_chunk)) {
+ if ((one_chunk) && (stcb->asoc.total_flight == 0)) {
+ data_list[0]->window_probe = 1;
+ net->window_probe = 1;
+ }
+ break;
+ }
+ } else {
+ /*
+ * Must be sent in order of the
+ * TSN's (on a network)
+ */
+ break;
+ }
+ } /* for (chunk gather loop for this net) */
+ } /* if asoc.state OPEN */
+no_data_fill:
+ /* Is there something to send for this destination? */
+ if (outchain) {
+ /* We may need to start a control timer or two */
+ if (asconf) {
+ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp,
+ stcb, net);
+ /*
+ * do NOT clear the asconf flag as it is
+ * used to do appropriate source address
+ * selection.
+ */
+ }
+ if (cookie) {
+ sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net);
+ cookie = 0;
+ }
+ /* must start a send timer if data is being sent */
+ if (bundle_at && (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer))) {
+ /*
+ * no timer running on this destination
+ * restart it.
+ */
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
+ } else if ((asoc->sctp_cmt_on_off == 1) &&
+ (asoc->sctp_cmt_pf > 0) &&
+ pf_hbflag &&
+ ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF) &&
+ (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer))) {
+ /*
+ * JRS 5/14/07 - If a HB has been sent to a
+ * PF destination and no T3 timer is
+ * currently running, start the T3 timer to
+ * track the HBs that were sent.
+ */
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
+ }
+ /* Now send it, if there is anything to send :> */
+ if ((error = sctp_lowlevel_chunk_output(inp,
+ stcb,
+ net,
+ (struct sockaddr *)&net->ro._l_addr,
+ outchain,
+ auth_offset,
+ auth,
+ auth_keyid,
+ no_fragmentflg,
+ bundle_at,
+ data_list[0],
+ asconf,
+ inp->sctp_lport, stcb->rport,
+ htonl(stcb->asoc.peer_vtag),
+ net->port, so_locked, NULL))) {
+ /* error, we could not output */
+ if (error == ENOBUFS) {
+ SCTP_STAT_INCR(sctps_lowlevelerr);
+ asoc->ifp_had_enobuf = 1;
+ }
+ if (from_where == 0) {
+ SCTP_STAT_INCR(sctps_lowlevelerrusr);
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error);
+ if (hbflag) {
+ if (*now_filled == 0) {
+ (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
+ *now_filled = 1;
+ *now = net->last_sent_time;
+ } else {
+ net->last_sent_time = *now;
+ }
+ hbflag = 0;
+ }
+ if (error == EHOSTUNREACH) {
+ /*
+ * Destination went unreachable
+ * during this send
+ */
+ sctp_move_chunks_from_net(stcb, net);
+ }
+ *reason_code = 6;
+ /*-
+ * I add this line to be paranoid. As far as
+			 * I can tell the continue takes us back to
+ * the top of the for, but just to make sure
+ * I will reset these again here.
+ */
+ ctl_cnt = bundle_at = 0;
+ continue; /* This takes us back to the
+ * for() for the nets. */
+ } else {
+ asoc->ifp_had_enobuf = 0;
+ }
+ outchain = endoutchain = NULL;
+ auth = NULL;
+ auth_offset = 0;
+ if (bundle_at || hbflag) {
+ /* For data/asconf and hb set time */
+ if (*now_filled == 0) {
+ (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
+ *now_filled = 1;
+ *now = net->last_sent_time;
+ } else {
+ net->last_sent_time = *now;
+ }
+ }
+ if (!no_out_cnt) {
+ *num_out += (ctl_cnt + bundle_at);
+ }
+ if (bundle_at) {
+ /* setup for a RTO measurement */
+ tsns_sent = data_list[0]->rec.data.TSN_seq;
+ /* fill time if not already filled */
+ if (*now_filled == 0) {
+ (void)SCTP_GETTIME_TIMEVAL(&asoc->time_last_sent);
+ *now_filled = 1;
+ *now = asoc->time_last_sent;
+ } else {
+ asoc->time_last_sent = *now;
+ }
+ data_list[0]->do_rtt = 1;
+ SCTP_STAT_INCR_BY(sctps_senddata, bundle_at);
+ sctp_clean_up_datalist(stcb, asoc, data_list, bundle_at, net);
+ if (SCTP_BASE_SYSCTL(sctp_early_fr)) {
+ if (net->flight_size < net->cwnd) {
+ /* start or restart it */
+ if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, inp, stcb, net,
+ SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_2);
+ }
+ SCTP_STAT_INCR(sctps_earlyfrstrout);
+ sctp_timer_start(SCTP_TIMER_TYPE_EARLYFR, inp, stcb, net);
+ } else {
+ /* stop it if its running */
+ if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+ SCTP_STAT_INCR(sctps_earlyfrstpout);
+ sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, inp, stcb, net,
+ SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_3);
+ }
+ }
+ }
+ }
+ if (one_chunk) {
+ break;
+ }
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, tsns_sent, SCTP_CWND_LOG_FROM_SEND);
+ }
+ }
+ if (old_start_at == NULL) {
+ old_start_at = start_at;
+ start_at = TAILQ_FIRST(&asoc->nets);
+ if (old_start_at)
+ goto again_one_more_time;
+ }
+ /*
+ * At the end there should be no NON timed chunks hanging on this
+ * queue.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, *num_out, SCTP_CWND_LOG_FROM_SEND);
+ }
+ if ((*num_out == 0) && (*reason_code == 0)) {
+ *reason_code = 4;
+ } else {
+ *reason_code = 5;
+ }
+ sctp_clean_up_ctl(stcb, asoc);
+ return (0);
+}
+
+void
+sctp_queue_op_err(struct sctp_tcb *stcb, struct mbuf *op_err)
+{
+ /*-
+	 * Prepend an OPERATION_ERROR chunk header and put it on the end of
+ * the control chunk queue.
+ */
+ struct sctp_chunkhdr *hdr;
+ struct sctp_tmit_chunk *chk;
+ struct mbuf *mat;
+
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ /* no memory */
+ sctp_m_freem(op_err);
+ return;
+ }
+ chk->copy_by_ref = 0;
+ SCTP_BUF_PREPEND(op_err, sizeof(struct sctp_chunkhdr), M_DONTWAIT);
+ if (op_err == NULL) {
+ sctp_free_a_chunk(stcb, chk);
+ return;
+ }
+ chk->send_size = 0;
+ mat = op_err;
+ while (mat != NULL) {
+ chk->send_size += SCTP_BUF_LEN(mat);
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ chk->rec.chunk_id.id = SCTP_OPERATION_ERROR;
+ chk->rec.chunk_id.can_take_data = 1;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->flags = 0;
+ chk->asoc = &stcb->asoc;
+ chk->data = op_err;
+ chk->whoTo = chk->asoc->primary_destination;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ hdr = mtod(op_err, struct sctp_chunkhdr *);
+ hdr->chunk_type = SCTP_OPERATION_ERROR;
+ hdr->chunk_flags = 0;
+ hdr->chunk_length = htons(chk->send_size);
+ TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue,
+ chk,
+ sctp_next);
+ chk->asoc->ctrl_queue_cnt++;
+}
+
+int
+sctp_send_cookie_echo(struct mbuf *m,
+ int offset,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ /*-
+ * pull out the cookie and put it at the front of the control chunk
+ * queue.
+ */
+ int at;
+ struct mbuf *cookie;
+ struct sctp_paramhdr parm, *phdr;
+ struct sctp_chunkhdr *hdr;
+ struct sctp_tmit_chunk *chk;
+ uint16_t ptype, plen;
+
+ /* First find the cookie in the param area */
+ cookie = NULL;
+ at = offset + sizeof(struct sctp_init_chunk);
+
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ do {
+ phdr = sctp_get_next_param(m, at, &parm, sizeof(parm));
+ if (phdr == NULL) {
+ return (-3);
+ }
+ ptype = ntohs(phdr->param_type);
+ plen = ntohs(phdr->param_length);
+ if (ptype == SCTP_STATE_COOKIE) {
+ int pad;
+
+ /* found the cookie */
+ if ((pad = (plen % 4))) {
+ plen += 4 - pad;
+ }
+ cookie = SCTP_M_COPYM(m, at, plen, M_DONTWAIT);
+ if (cookie == NULL) {
+ /* No memory */
+ return (-2);
+ }
+#ifdef SCTP_MBUF_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ struct mbuf *mat;
+
+ mat = cookie;
+ while (mat) {
+ if (SCTP_BUF_IS_EXTENDED(mat)) {
+ sctp_log_mb(mat, SCTP_MBUF_ICOPY);
+ }
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ }
+#endif
+ break;
+ }
+ at += SCTP_SIZE32(plen);
+ } while (phdr);
+ if (cookie == NULL) {
+ /* Did not find the cookie */
+ return (-3);
+ }
+	/* OK, we got the cookie; let's change it into a cookie echo chunk */
+
+ /* first the change from param to cookie */
+ hdr = mtod(cookie, struct sctp_chunkhdr *);
+ hdr->chunk_type = SCTP_COOKIE_ECHO;
+ hdr->chunk_flags = 0;
+ /* get the chunk stuff now and place it in the FRONT of the queue */
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ /* no memory */
+ sctp_m_freem(cookie);
+ return (-5);
+ }
+ chk->copy_by_ref = 0;
+ chk->send_size = plen;
+ chk->rec.chunk_id.id = SCTP_COOKIE_ECHO;
+ chk->rec.chunk_id.can_take_data = 0;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->flags = CHUNK_FLAGS_FRAGMENT_OK;
+ chk->asoc = &stcb->asoc;
+ chk->data = cookie;
+ chk->whoTo = chk->asoc->primary_destination;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ TAILQ_INSERT_HEAD(&chk->asoc->control_send_queue, chk, sctp_next);
+ chk->asoc->ctrl_queue_cnt++;
+ return (0);
+}
+
+void
+sctp_send_heartbeat_ack(struct sctp_tcb *stcb,
+ struct mbuf *m,
+ int offset,
+ int chk_length,
+ struct sctp_nets *net)
+{
+ /*
+	 * take an HB request, turn it into an HB ack, and queue it to be sent.
+ */
+ struct mbuf *outchain;
+ struct sctp_chunkhdr *chdr;
+ struct sctp_tmit_chunk *chk;
+
+
+ if (net == NULL)
+ /* must have a net pointer */
+ return;
+
+ outchain = SCTP_M_COPYM(m, offset, chk_length, M_DONTWAIT);
+ if (outchain == NULL) {
+ /* gak out of memory */
+ return;
+ }
+#ifdef SCTP_MBUF_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ struct mbuf *mat;
+
+ mat = outchain;
+ while (mat) {
+ if (SCTP_BUF_IS_EXTENDED(mat)) {
+ sctp_log_mb(mat, SCTP_MBUF_ICOPY);
+ }
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ }
+#endif
+ chdr = mtod(outchain, struct sctp_chunkhdr *);
+ chdr->chunk_type = SCTP_HEARTBEAT_ACK;
+ chdr->chunk_flags = 0;
+ if (chk_length % 4) {
+ /* need pad */
+ uint32_t cpthis = 0;
+ int padlen;
+
+ padlen = 4 - (chk_length % 4);
+ m_copyback(outchain, chk_length, padlen, (caddr_t)&cpthis);
+ }
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ /* no memory */
+ sctp_m_freem(outchain);
+ return;
+ }
+ chk->copy_by_ref = 0;
+ chk->send_size = chk_length;
+ chk->rec.chunk_id.id = SCTP_HEARTBEAT_ACK;
+ chk->rec.chunk_id.can_take_data = 1;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->flags = 0;
+ chk->asoc = &stcb->asoc;
+ chk->data = outchain;
+ chk->whoTo = net;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
+ chk->asoc->ctrl_queue_cnt++;
+}
+
+void
+sctp_send_cookie_ack(struct sctp_tcb *stcb)
+{
+ /* formulate and queue a cookie-ack back to sender */
+ struct mbuf *cookie_ack;
+ struct sctp_chunkhdr *hdr;
+ struct sctp_tmit_chunk *chk;
+
+ cookie_ack = NULL;
+ SCTP_TCB_LOCK_ASSERT(stcb);
+
+ cookie_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_DONTWAIT, 1, MT_HEADER);
+ if (cookie_ack == NULL) {
+ /* no mbuf's */
+ return;
+ }
+ SCTP_BUF_RESV_UF(cookie_ack, SCTP_MIN_OVERHEAD);
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ /* no memory */
+ sctp_m_freem(cookie_ack);
+ return;
+ }
+ chk->copy_by_ref = 0;
+ chk->send_size = sizeof(struct sctp_chunkhdr);
+ chk->rec.chunk_id.id = SCTP_COOKIE_ACK;
+ chk->rec.chunk_id.can_take_data = 1;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->flags = 0;
+ chk->asoc = &stcb->asoc;
+ chk->data = cookie_ack;
+ if (chk->asoc->last_control_chunk_from != NULL) {
+ chk->whoTo = chk->asoc->last_control_chunk_from;
+ } else {
+ chk->whoTo = chk->asoc->primary_destination;
+ }
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ hdr = mtod(cookie_ack, struct sctp_chunkhdr *);
+ hdr->chunk_type = SCTP_COOKIE_ACK;
+ hdr->chunk_flags = 0;
+ hdr->chunk_length = htons(chk->send_size);
+ SCTP_BUF_LEN(cookie_ack) = chk->send_size;
+ TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
+ chk->asoc->ctrl_queue_cnt++;
+ return;
+}
+
+
+void
+sctp_send_shutdown_ack(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ /* formulate and queue a SHUTDOWN-ACK back to the sender */
+ struct mbuf *m_shutdown_ack;
+ struct sctp_shutdown_ack_chunk *ack_cp;
+ struct sctp_tmit_chunk *chk;
+
+ m_shutdown_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_ack_chunk), 0, M_DONTWAIT, 1, MT_HEADER);
+ if (m_shutdown_ack == NULL) {
+ /* no mbuf's */
+ return;
+ }
+ SCTP_BUF_RESV_UF(m_shutdown_ack, SCTP_MIN_OVERHEAD);
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ /* no memory */
+ sctp_m_freem(m_shutdown_ack);
+ return;
+ }
+ chk->copy_by_ref = 0;
+ chk->send_size = sizeof(struct sctp_chunkhdr);
+ chk->rec.chunk_id.id = SCTP_SHUTDOWN_ACK;
+ chk->rec.chunk_id.can_take_data = 1;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->flags = 0;
+ chk->asoc = &stcb->asoc;
+ chk->data = m_shutdown_ack;
+ chk->whoTo = net;
+ atomic_add_int(&net->ref_count, 1);
+
+ ack_cp = mtod(m_shutdown_ack, struct sctp_shutdown_ack_chunk *);
+ ack_cp->ch.chunk_type = SCTP_SHUTDOWN_ACK;
+ ack_cp->ch.chunk_flags = 0;
+ ack_cp->ch.chunk_length = htons(chk->send_size);
+ SCTP_BUF_LEN(m_shutdown_ack) = chk->send_size;
+ TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
+ chk->asoc->ctrl_queue_cnt++;
+ return;
+}
+
+void
+sctp_send_shutdown(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ /* formulate and queue a SHUTDOWN to the sender */
+ struct mbuf *m_shutdown;
+ struct sctp_shutdown_chunk *shutdown_cp;
+ struct sctp_tmit_chunk *chk;
+
+ m_shutdown = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_chunk), 0, M_DONTWAIT, 1, MT_HEADER);
+ if (m_shutdown == NULL) {
+ /* no mbuf's */
+ return;
+ }
+ SCTP_BUF_RESV_UF(m_shutdown, SCTP_MIN_OVERHEAD);
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ /* no memory */
+ sctp_m_freem(m_shutdown);
+ return;
+ }
+ chk->copy_by_ref = 0;
+ chk->send_size = sizeof(struct sctp_shutdown_chunk);
+ chk->rec.chunk_id.id = SCTP_SHUTDOWN;
+ chk->rec.chunk_id.can_take_data = 1;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->flags = 0;
+ chk->asoc = &stcb->asoc;
+ chk->data = m_shutdown;
+ chk->whoTo = net;
+ atomic_add_int(&net->ref_count, 1);
+
+ shutdown_cp = mtod(m_shutdown, struct sctp_shutdown_chunk *);
+ shutdown_cp->ch.chunk_type = SCTP_SHUTDOWN;
+ shutdown_cp->ch.chunk_flags = 0;
+ shutdown_cp->ch.chunk_length = htons(chk->send_size);
+ shutdown_cp->cumulative_tsn_ack = htonl(stcb->asoc.cumulative_tsn);
+ SCTP_BUF_LEN(m_shutdown) = chk->send_size;
+ TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
+ chk->asoc->ctrl_queue_cnt++;
+ return;
+}
+
+void
+sctp_send_asconf(struct sctp_tcb *stcb, struct sctp_nets *net, int addr_locked)
+{
+ /*
+ * formulate and queue an ASCONF to the peer. ASCONF parameters
+ * should be queued on the assoc queue.
+ */
+ struct sctp_tmit_chunk *chk;
+ struct mbuf *m_asconf;
+ int len;
+
+ SCTP_TCB_LOCK_ASSERT(stcb);
+
+ if ((!TAILQ_EMPTY(&stcb->asoc.asconf_send_queue)) &&
+ (!sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS))) {
+ /* can't send a new one if there is one in flight already */
+ return;
+ }
+ /* compose an ASCONF chunk, maximum length is PMTU */
+ m_asconf = sctp_compose_asconf(stcb, &len, addr_locked);
+ if (m_asconf == NULL) {
+ return;
+ }
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ /* no memory */
+ sctp_m_freem(m_asconf);
+ return;
+ }
+ chk->copy_by_ref = 0;
+ chk->data = m_asconf;
+ chk->send_size = len;
+ chk->rec.chunk_id.id = SCTP_ASCONF;
+ chk->rec.chunk_id.can_take_data = 0;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->flags = CHUNK_FLAGS_FRAGMENT_OK;
+ chk->asoc = &stcb->asoc;
+ chk->whoTo = net;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ TAILQ_INSERT_TAIL(&chk->asoc->asconf_send_queue, chk, sctp_next);
+ chk->asoc->ctrl_queue_cnt++;
+ return;
+}
+
+void
+sctp_send_asconf_ack(struct sctp_tcb *stcb)
+{
+ /*
+	 * formulate and queue an asconf-ack back to the sender. The asconf-ack
+ * must be stored in the tcb.
+ */
+ struct sctp_tmit_chunk *chk;
+ struct sctp_asconf_ack *ack, *latest_ack;
+ struct mbuf *m_ack, *m;
+ struct sctp_nets *net = NULL;
+
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ /* Get the latest ASCONF-ACK */
+ latest_ack = TAILQ_LAST(&stcb->asoc.asconf_ack_sent, sctp_asconf_ackhead);
+ if (latest_ack == NULL) {
+ return;
+ }
+ if (latest_ack->last_sent_to != NULL &&
+ latest_ack->last_sent_to == stcb->asoc.last_control_chunk_from) {
+ /* we're doing a retransmission */
+ net = sctp_find_alternate_net(stcb, stcb->asoc.last_control_chunk_from, 0);
+ if (net == NULL) {
+ /* no alternate */
+ if (stcb->asoc.last_control_chunk_from == NULL)
+ net = stcb->asoc.primary_destination;
+ else
+ net = stcb->asoc.last_control_chunk_from;
+ }
+ } else {
+ /* normal case */
+ if (stcb->asoc.last_control_chunk_from == NULL)
+ net = stcb->asoc.primary_destination;
+ else
+ net = stcb->asoc.last_control_chunk_from;
+ }
+ latest_ack->last_sent_to = net;
+
+ TAILQ_FOREACH(ack, &stcb->asoc.asconf_ack_sent, next) {
+ if (ack->data == NULL) {
+ continue;
+ }
+ /* copy the asconf_ack */
+ m_ack = SCTP_M_COPYM(ack->data, 0, M_COPYALL, M_DONTWAIT);
+ if (m_ack == NULL) {
+ /* couldn't copy it */
+ return;
+ }
+#ifdef SCTP_MBUF_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ struct mbuf *mat;
+
+ mat = m_ack;
+ while (mat) {
+ if (SCTP_BUF_IS_EXTENDED(mat)) {
+ sctp_log_mb(mat, SCTP_MBUF_ICOPY);
+ }
+ mat = SCTP_BUF_NEXT(mat);
+ }
+ }
+#endif
+
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ /* no memory */
+ if (m_ack)
+ sctp_m_freem(m_ack);
+ return;
+ }
+ chk->copy_by_ref = 0;
+
+ chk->whoTo = net;
+ chk->data = m_ack;
+ chk->send_size = 0;
+ /* Get size */
+ m = m_ack;
+ chk->send_size = ack->len;
+ chk->rec.chunk_id.id = SCTP_ASCONF_ACK;
+ chk->rec.chunk_id.can_take_data = 1;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; /* XXX */
+ chk->asoc = &stcb->asoc;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+
+ TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
+ chk->asoc->ctrl_queue_cnt++;
+ }
+ return;
+}
+
+
+static int
+sctp_chunk_retransmission(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ int *cnt_out, struct timeval *now, int *now_filled, int *fr_done, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ /*-
+ * send out one MTU of retransmission. If fast_retransmit is
+ * happening we ignore the cwnd. Otherwise we obey the cwnd and
+ * rwnd. For a Cookie or Asconf in the control chunk queue we
+ * retransmit them by themselves.
+ *
+ * For data chunks we will pick out the lowest TSN's in the sent_queue
+ * marked for resend and bundle them all together (up to a MTU of
+ * destination). The address to send to should have been
+ * selected/changed where the retransmission was marked (i.e. in FR
+ * or t3-timeout routines).
+ */
+ struct sctp_tmit_chunk *data_list[SCTP_MAX_DATA_BUNDLING];
+ struct sctp_tmit_chunk *chk, *fwd;
+ struct mbuf *m, *endofchain;
+ struct sctp_nets *net = NULL;
+ uint32_t tsns_sent = 0;
+ int no_fragmentflg, bundle_at, cnt_thru;
+ unsigned int mtu;
+ int error, i, one_chunk, fwd_tsn, ctl_cnt, tmr_started;
+ struct sctp_auth_chunk *auth = NULL;
+ uint32_t auth_offset = 0;
+ uint16_t auth_keyid;
+ int override_ok = 1;
+ int data_auth_reqd = 0;
+ uint32_t dmtu = 0;
+
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ tmr_started = ctl_cnt = bundle_at = error = 0;
+ no_fragmentflg = 1;
+ fwd_tsn = 0;
+ *cnt_out = 0;
+ fwd = NULL;
+ endofchain = m = NULL;
+ auth_keyid = stcb->asoc.authinfo.active_keyid;
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xC3, 1);
+#endif
+ if ((TAILQ_EMPTY(&asoc->sent_queue)) &&
+ (TAILQ_EMPTY(&asoc->control_send_queue))) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "SCTP hits empty queue with cnt set to %d?\n",
+ asoc->sent_queue_retran_cnt);
+ asoc->sent_queue_cnt = 0;
+ asoc->sent_queue_cnt_removeable = 0;
+ /* send back 0/0 so we enter normal transmission */
+ *cnt_out = 0;
+ return (0);
+ }
+ TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
+ if ((chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) ||
+ (chk->rec.chunk_id.id == SCTP_STREAM_RESET) ||
+ (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN)) {
+ if (chk->sent != SCTP_DATAGRAM_RESEND) {
+ continue;
+ }
+ if (chk->rec.chunk_id.id == SCTP_STREAM_RESET) {
+ if (chk != asoc->str_reset) {
+ /*
+ * not eligible for retran if it's
+ * not ours
+ */
+ continue;
+ }
+ }
+ ctl_cnt++;
+ if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) {
+ fwd_tsn = 1;
+ fwd = chk;
+ }
+ /*
+ * Add an AUTH chunk, if the chunk requires it, and save the
+ * offset into the chain for AUTH
+ */
+ if ((auth == NULL) &&
+ (sctp_auth_is_required_chunk(chk->rec.chunk_id.id,
+ stcb->asoc.peer_auth_chunks))) {
+ m = sctp_add_auth_chunk(m, &endofchain,
+ &auth, &auth_offset,
+ stcb,
+ chk->rec.chunk_id.id);
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ }
+ m = sctp_copy_mbufchain(chk->data, m, &endofchain, 0, chk->send_size, chk->copy_by_ref);
+ break;
+ }
+ }
+ one_chunk = 0;
+ cnt_thru = 0;
+ /* do we have control chunks to retransmit? */
+ if (m != NULL) {
+ /* Start a timer no matter if we succeed or fail */
+ if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) {
+ sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, chk->whoTo);
+ } else if (chk->rec.chunk_id.id == SCTP_ASCONF)
+ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, chk->whoTo);
+ chk->snd_count++; /* update our count */
+ if ((error = sctp_lowlevel_chunk_output(inp, stcb, chk->whoTo,
+ (struct sockaddr *)&chk->whoTo->ro._l_addr, m,
+ auth_offset, auth, stcb->asoc.authinfo.active_keyid,
+ no_fragmentflg, 0, NULL, 0,
+ inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag),
+ chk->whoTo->port, so_locked, NULL))) {
+ SCTP_STAT_INCR(sctps_lowlevelerr);
+ return (error);
+ }
+ m = endofchain = NULL;
+ auth = NULL;
+ auth_offset = 0;
+ /*
+ * We don't want to mark the net->sent time here since we
+ * use this for HB and retransmissions cannot measure RTT
+ */
+ /* (void)SCTP_GETTIME_TIMEVAL(&chk->whoTo->last_sent_time); */
+ *cnt_out += 1;
+ chk->sent = SCTP_DATAGRAM_SENT;
+ sctp_ucount_decr(stcb->asoc.sent_queue_retran_cnt);
+ if (fwd_tsn == 0) {
+ return (0);
+ } else {
+ /* Clean up the fwd-tsn list */
+ sctp_clean_up_ctl(stcb, asoc);
+ return (0);
+ }
+ }
+ /*
+ * Ok, it is just data retransmission we need to do or that and a
+ * fwd-tsn with it all.
+ */
+ if (TAILQ_EMPTY(&asoc->sent_queue)) {
+ return (SCTP_RETRAN_DONE);
+ }
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT)) {
+ /* not yet open, resend the cookie and that is it */
+ return (1);
+ }
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(20, inp, stcb, NULL);
+#endif
+ data_auth_reqd = sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks);
+ TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
+ if (chk->sent != SCTP_DATAGRAM_RESEND) {
+ /* No, not sent to this net or not ready for rtx */
+ continue;
+ }
+ if (chk->data == NULL) {
+ printf("TSN:%x chk->snd_count:%d chk->sent:%d can't retran - no data\n",
+ chk->rec.data.TSN_seq, chk->snd_count, chk->sent);
+ continue;
+ }
+ if ((SCTP_BASE_SYSCTL(sctp_max_retran_chunk)) &&
+ (chk->snd_count >= SCTP_BASE_SYSCTL(sctp_max_retran_chunk))) {
+ /* Gak, we have exceeded max unlucky retran, abort! */
+ SCTP_PRINTF("Gak, chk->snd_count:%d >= max:%d - send abort\n",
+ chk->snd_count,
+ SCTP_BASE_SYSCTL(sctp_max_retran_chunk));
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ sctp_abort_an_association(stcb->sctp_ep, stcb, 0, NULL, so_locked);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ return (SCTP_RETRAN_EXIT);
+ }
+ /* pick up the net */
+ net = chk->whoTo;
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ mtu = (net->mtu - SCTP_MIN_OVERHEAD);
+ } else {
+ mtu = net->mtu - SCTP_MIN_V4_OVERHEAD;
+ }
+
+ if ((asoc->peers_rwnd < mtu) && (asoc->total_flight > 0)) {
+ /* No room in peers rwnd */
+ uint32_t tsn;
+
+ tsn = asoc->last_acked_seq + 1;
+ if (tsn == chk->rec.data.TSN_seq) {
+ /*
+ * we make a special exception for this
+ * case. The peer has no rwnd but is missing
+ * the lowest chunk.. which is probably what
+ * is holding up the rwnd.
+ */
+ goto one_chunk_around;
+ }
+ return (1);
+ }
+one_chunk_around:
+ if (asoc->peers_rwnd < mtu) {
+ one_chunk = 1;
+ if ((asoc->peers_rwnd == 0) &&
+ (asoc->total_flight == 0)) {
+ chk->window_probe = 1;
+ chk->whoTo->window_probe = 1;
+ }
+ }
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xC3, 2);
+#endif
+ bundle_at = 0;
+ m = NULL;
+ net->fast_retran_ip = 0;
+ if (chk->rec.data.doing_fast_retransmit == 0) {
+ /*
+ * if no FR is in progress, skip destinations that have
+ * flight_size > cwnd.
+ */
+ if (net->flight_size >= net->cwnd) {
+ continue;
+ }
+ } else {
+ /*
+ * Mark the destination net to have FR recovery
+ * limits put on it.
+ */
+ *fr_done = 1;
+ net->fast_retran_ip = 1;
+ }
+
+ /*
+ * if no AUTH is yet included and this chunk requires it,
+ * make sure to account for it. We don't apply the size
+ * until the AUTH chunk is actually added below in case
+ * there is no room for this chunk.
+ */
+ if (data_auth_reqd && (auth == NULL)) {
+ dmtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
+ } else
+ dmtu = 0;
+
+ if ((chk->send_size <= (mtu - dmtu)) ||
+ (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) {
+ /* ok we will add this one */
+ if (data_auth_reqd) {
+ if (auth == NULL) {
+ m = sctp_add_auth_chunk(m,
+ &endofchain,
+ &auth,
+ &auth_offset,
+ stcb,
+ SCTP_DATA);
+ auth_keyid = chk->auth_keyid;
+ override_ok = 0;
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ } else if (override_ok) {
+ auth_keyid = chk->auth_keyid;
+ override_ok = 0;
+ } else if (chk->auth_keyid != auth_keyid) {
+ /* different keyid, so done bundling */
+ break;
+ }
+ }
+ m = sctp_copy_mbufchain(chk->data, m, &endofchain, 0, chk->send_size, chk->copy_by_ref);
+ if (m == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ /* Do clear IP_DF ? */
+ if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) {
+ no_fragmentflg = 0;
+ }
+ /* update our MTU size */
+ if (mtu > (chk->send_size + dmtu))
+ mtu -= (chk->send_size + dmtu);
+ else
+ mtu = 0;
+ data_list[bundle_at++] = chk;
+ if (one_chunk && (asoc->total_flight <= 0)) {
+ SCTP_STAT_INCR(sctps_windowprobed);
+ }
+ }
+ if (one_chunk == 0) {
+ /*
+ * now, are there any more chunks forward of chk to
+ * pick up?
+ */
+ fwd = TAILQ_NEXT(chk, sctp_next);
+ while (fwd) {
+ if (fwd->sent != SCTP_DATAGRAM_RESEND) {
+ /* Nope, not for retran */
+ fwd = TAILQ_NEXT(fwd, sctp_next);
+ continue;
+ }
+ if (fwd->whoTo != net) {
+ /* Nope, not the net in question */
+ fwd = TAILQ_NEXT(fwd, sctp_next);
+ continue;
+ }
+ if (data_auth_reqd && (auth == NULL)) {
+ dmtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
+ } else
+ dmtu = 0;
+ if (fwd->send_size <= (mtu - dmtu)) {
+ if (data_auth_reqd) {
+ if (auth == NULL) {
+ m = sctp_add_auth_chunk(m,
+ &endofchain,
+ &auth,
+ &auth_offset,
+ stcb,
+ SCTP_DATA);
+ auth_keyid = fwd->auth_keyid;
+ override_ok = 0;
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ } else if (override_ok) {
+ auth_keyid = fwd->auth_keyid;
+ override_ok = 0;
+ } else if (fwd->auth_keyid != auth_keyid) {
+ /*
+ * different keyid,
+ * so done bundling
+ */
+ break;
+ }
+ }
+ m = sctp_copy_mbufchain(fwd->data, m, &endofchain, 0, fwd->send_size, fwd->copy_by_ref);
+ if (m == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ /* Do clear IP_DF ? */
+ if (fwd->flags & CHUNK_FLAGS_FRAGMENT_OK) {
+ no_fragmentflg = 0;
+ }
+ /* update our MTU size */
+ if (mtu > (fwd->send_size + dmtu))
+ mtu -= (fwd->send_size + dmtu);
+ else
+ mtu = 0;
+ data_list[bundle_at++] = fwd;
+ if (bundle_at >= SCTP_MAX_DATA_BUNDLING) {
+ break;
+ }
+ fwd = TAILQ_NEXT(fwd, sctp_next);
+ } else {
+ /* can't fit so we are done */
+ break;
+ }
+ }
+ }
+ /* Is there something to send for this destination? */
+ if (m) {
+ /*
+ * No matter if we fail or succeed we should start a
+ * timer. A failure is like a lost IP packet :-)
+ */
+ if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
+ /*
+ * no timer running on this destination
+ * restart it.
+ */
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
+ tmr_started = 1;
+ }
+ /* Now lets send it, if there is anything to send :> */
+ if ((error = sctp_lowlevel_chunk_output(inp, stcb, net,
+ (struct sockaddr *)&net->ro._l_addr, m,
+ auth_offset, auth, auth_keyid,
+ no_fragmentflg, 0, NULL, 0,
+ inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag),
+ net->port, so_locked, NULL))) {
+ /* error, we could not output */
+ SCTP_STAT_INCR(sctps_lowlevelerr);
+ return (error);
+ }
+ m = endofchain = NULL;
+ auth = NULL;
+ auth_offset = 0;
+ /* For HB's */
+ /*
+ * We don't want to mark the net->sent time here
+ * since we use this for HB and retransmissions cannot
+ * measure RTT
+ */
+ /* (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); */
+
+ /* For auto-close */
+ cnt_thru++;
+ if (*now_filled == 0) {
+ (void)SCTP_GETTIME_TIMEVAL(&asoc->time_last_sent);
+ *now = asoc->time_last_sent;
+ *now_filled = 1;
+ } else {
+ asoc->time_last_sent = *now;
+ }
+ *cnt_out += bundle_at;
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xC4, bundle_at);
+#endif
+ if (bundle_at) {
+ tsns_sent = data_list[0]->rec.data.TSN_seq;
+ }
+ for (i = 0; i < bundle_at; i++) {
+ SCTP_STAT_INCR(sctps_sendretransdata);
+ data_list[i]->sent = SCTP_DATAGRAM_SENT;
+ /*
+ * When we have revoked data and we
+ * retransmit it, we clear the revoked
+ * flag since this flag dictates whether we
+ * subtracted it from the flight size
+ */
+ if (data_list[i]->rec.data.chunk_was_revoked) {
+ /* Deflate the cwnd */
+ data_list[i]->whoTo->cwnd -= data_list[i]->book_size;
+ data_list[i]->rec.data.chunk_was_revoked = 0;
+ }
+ data_list[i]->snd_count++;
+ sctp_ucount_decr(asoc->sent_queue_retran_cnt);
+ /* record the time */
+ data_list[i]->sent_rcv_time = asoc->time_last_sent;
+ if (data_list[i]->book_size_scale) {
+ /*
+ * need to double the book size on
+ * this one
+ */
+ data_list[i]->book_size_scale = 0;
+ /*
+ * Since we double the booksize, we
+ * must also double the output queue
+ * size, since it gets shrunk by
+ * this amount when we free.
+ */
+ atomic_add_int(&((asoc)->total_output_queue_size), data_list[i]->book_size);
+ data_list[i]->book_size *= 2;
+
+
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) {
+ sctp_log_rwnd(SCTP_DECREASE_PEER_RWND,
+ asoc->peers_rwnd, data_list[i]->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh));
+ }
+ asoc->peers_rwnd = sctp_sbspace_sub(asoc->peers_rwnd,
+ (uint32_t) (data_list[i]->send_size +
+ SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)));
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_UP_RSND,
+ data_list[i]->whoTo->flight_size,
+ data_list[i]->book_size,
+ (uintptr_t) data_list[i]->whoTo,
+ data_list[i]->rec.data.TSN_seq);
+ }
+ sctp_flight_size_increase(data_list[i]);
+ sctp_total_flight_increase(stcb, data_list[i]);
+ if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) {
+ /* SWS sender side engages */
+ asoc->peers_rwnd = 0;
+ }
+ if ((i == 0) &&
+ (data_list[i]->rec.data.doing_fast_retransmit)) {
+ SCTP_STAT_INCR(sctps_sendfastretrans);
+ if ((data_list[i] == TAILQ_FIRST(&asoc->sent_queue)) &&
+ (tmr_started == 0)) {
+ /*-
+ * ok we just fast-retrans'd
+ * the lowest TSN, i.e the
+ * first on the list. In
+ * this case we want to give
+ * some more time to get a
+ * SACK back without a
+ * t3-expiring.
+ */
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, inp, stcb, net,
+ SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_4);
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
+ }
+ }
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, net, tsns_sent, SCTP_CWND_LOG_FROM_RESEND);
+ }
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(21, inp, stcb, NULL);
+#endif
+ } else {
+ /* None will fit */
+ return (1);
+ }
+ if (asoc->sent_queue_retran_cnt <= 0) {
+ /* all done we have no more to retran */
+ asoc->sent_queue_retran_cnt = 0;
+ break;
+ }
+ if (one_chunk) {
+ /* No more room in rwnd */
+ return (1);
+ }
+ /* stop the for loop here. we sent out a packet */
+ break;
+ }
+ return (0);
+}
+
+
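+/*
+ * Make sure a retransmission timer is pending on at least one destination;
+ * if none is, start one on the primary path before returning ret.
+ */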
+static int
+sctp_timer_validation(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ int ret)
+{
+ struct sctp_nets *net;
+
+ /* Validate that a timer is running somewhere */
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
+ /* Here is a timer */
+ return (ret);
+ }
+ }
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ /* Gak, we did not have a timer somewhere */
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Deadlock avoided starting timer on a dest at retran\n");
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, asoc->primary_destination);
+ return (ret);
+}
+
+void
+sctp_chunk_output(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ int from_where,
+ int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ /*-
+ * Ok, this is the generic chunk service queue. We must do the
+ * following:
+ * - See if there are retransmits pending; if so we must
+ * do these first.
+ * - Service the stream queue that is next, moving any
+ * message (note I must get a complete message, i.e.
+ * FIRST/MIDDLE and LAST, to the out queue in one pass) and assigning
+ * TSNs
+ * - Check to see if the cwnd/rwnd allows any output; if so we
+ * go ahead and formulate and send the low level chunks, making sure
+ * to also combine any control in the control chunk queue.
+ */
+ struct sctp_association *asoc;
+ struct sctp_nets *net;
+ int error = 0, num_out = 0, tot_out = 0, ret = 0, reason_code = 0,
+ burst_cnt = 0, burst_limit = 0;
+ struct timeval now;
+ int now_filled = 0;
+ int nagle_on = 0;
+ int frag_point = sctp_get_frag_point(stcb, &stcb->asoc);
+ int un_sent = 0;
+ int fr_done, tot_frs = 0;
+
+ asoc = &stcb->asoc;
+ if (from_where == SCTP_OUTPUT_FROM_USR_SEND) {
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NODELAY)) {
+ nagle_on = 0;
+ } else {
+ nagle_on = 1;
+ }
+ }
+ SCTP_TCB_LOCK_ASSERT(stcb);
+
+ un_sent = (stcb->asoc.total_output_queue_size - stcb->asoc.total_flight);
+
+ if ((un_sent <= 0) &&
+ (TAILQ_EMPTY(&asoc->control_send_queue)) &&
+ (TAILQ_EMPTY(&asoc->asconf_send_queue)) &&
+ (asoc->sent_queue_retran_cnt == 0)) {
+ /* Nothing to do unless there is something left to be sent */
+ return;
+ }
+ /*
+ * If we have something to send, data or control, AND a SACK timer is
+ * running, piggy-back the SACK.
+ */
+ if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) {
+ sctp_send_sack(stcb);
+ (void)SCTP_OS_TIMER_STOP(&stcb->asoc.dack_timer.timer);
+ }
+ while (asoc->sent_queue_retran_cnt) {
+ /*-
+ * Ok, it is retransmission time only, we send out only ONE
+ * packet with a single call off to the retran code.
+ */
+ if (from_where == SCTP_OUTPUT_FROM_COOKIE_ACK) {
+ /*-
+ * Special hook for handling cookies discarded
+ * by the peer that carried data. Send the cookie-ack only
+ * and then the next call will get the retransmissions.
+ */
+ (void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1,
+ from_where,
+ &now, &now_filled, frag_point, so_locked);
+ return;
+ } else if (from_where != SCTP_OUTPUT_FROM_HB_TMR) {
+ /* if it's not from an HB then do it */
+ fr_done = 0;
+ ret = sctp_chunk_retransmission(inp, stcb, asoc, &num_out, &now, &now_filled, &fr_done, so_locked);
+ if (fr_done) {
+ tot_frs++;
+ }
+ } else {
+ /*
+ * it's from any other place, we don't allow retransmission
+ * output (only control)
+ */
+ ret = 1;
+ }
+ if (ret > 0) {
+ /* Can't send anymore */
+ /*-
+ * now let's push out control by calling med-level
+ * output once. This assures that we WILL send HBs
+ * if queued too.
+ */
+ (void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1,
+ from_where,
+ &now, &now_filled, frag_point, so_locked);
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(8, inp, stcb, NULL);
+#endif
+ (void)sctp_timer_validation(inp, stcb, asoc, ret);
+ return;
+ }
+ if (ret < 0) {
+ /*-
+ * The count was off; retransmission is not happening, so
+ * fall through to the normal transmission path.
+ */
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(9, inp, stcb, NULL);
+#endif
+ if (ret == SCTP_RETRAN_EXIT) {
+ return;
+ }
+ break;
+ }
+ if (from_where == SCTP_OUTPUT_FROM_T3) {
+ /* Only one transmission allowed out of a timeout */
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(10, inp, stcb, NULL);
+#endif
+ /* Push out any control */
+ (void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1, from_where,
+ &now, &now_filled, frag_point, so_locked);
+ return;
+ }
+ if (tot_frs > asoc->max_burst) {
+ /* Hit FR burst limit */
+ return;
+ }
+ if ((num_out == 0) && (ret == 0)) {
+
+ /* No more retrans to send */
+ break;
+ }
+ }
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(12, inp, stcb, NULL);
+#endif
+ /* Check for bad destinations, if they exist move chunks around. */
+ burst_limit = asoc->max_burst;
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ if ((net->dest_state & SCTP_ADDR_NOT_REACHABLE) ==
+ SCTP_ADDR_NOT_REACHABLE) {
+ /*-
+ * if possible move things off of this address. We
+ * still may send below due to the dormant state, but
+ * we try to find an alternate address to send to,
+ * and if we have one we move all queued data on the
+ * out wheel to this alternate address.
+ */
+ if (net->ref_count > 1)
+ sctp_move_chunks_from_net(stcb, net);
+ } else if ((asoc->sctp_cmt_on_off == 1) &&
+ (asoc->sctp_cmt_pf > 0) &&
+ ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF)) {
+ /*
+ * JRS 5/14/07 - If CMT PF is on and the current
+ * destination is in PF state, move all queued data
+ * to an alternate destination.
+ */
+ if (net->ref_count > 1)
+ sctp_move_chunks_from_net(stcb, net);
+ } else {
+ /*-
+ * if ((asoc->sat_network) || (net->addr_is_local))
+ * { burst_limit = asoc->max_burst *
+ * SCTP_SAT_NETWORK_BURST_INCR; }
+ */
+ if (SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst)) {
+ if ((net->flight_size + (burst_limit * net->mtu)) < net->cwnd) {
+ /*
+ * JRS - Use the congestion control
+ * given in the congestion control
+ * module
+ */
+ asoc->cc_functions.sctp_cwnd_update_after_output(stcb, net, burst_limit);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) {
+ sctp_log_maxburst(stcb, net, 0, burst_limit, SCTP_MAX_BURST_APPLIED);
+ }
+ SCTP_STAT_INCR(sctps_maxburstqueued);
+ }
+ net->fast_retran_ip = 0;
+ } else {
+ if (net->flight_size == 0) {
+ /* Should be decaying the cwnd here */
+ ;
+ }
+ }
+ }
+
+ }
+ burst_cnt = 0;
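+ /*
+ * Keep calling the medium-level output routine until nothing more
+ * goes out or the max-burst limit stops us.
+ */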
+ do {
+ error = sctp_med_chunk_output(inp, stcb, asoc, &num_out,
+ &reason_code, 0, from_where,
+ &now, &now_filled, frag_point, so_locked);
+ if (error) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Error %d was returned from med-c-op\n", error);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) {
+ sctp_log_maxburst(stcb, asoc->primary_destination, error, burst_cnt, SCTP_MAX_BURST_ERROR_STOP);
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, NULL, error, SCTP_SEND_NOW_COMPLETES);
+ sctp_log_cwnd(stcb, NULL, 0xdeadbeef, SCTP_SEND_NOW_COMPLETES);
+ }
+ break;
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "m-c-o put out %d\n", num_out);
+
+ tot_out += num_out;
+ burst_cnt++;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, NULL, num_out, SCTP_SEND_NOW_COMPLETES);
+ if (num_out == 0) {
+ sctp_log_cwnd(stcb, NULL, reason_code, SCTP_SEND_NOW_COMPLETES);
+ }
+ }
+ if (nagle_on) {
+ /*-
+ * When nagle is on, we look at how much is un_sent, then
+ * if it is smaller than an MTU and we have data in
+ * flight we stop.
+ */
+ un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) +
+ (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk)));
+ if ((un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD)) &&
+ (stcb->asoc.total_flight > 0)) {
+ break;
+ }
+ }
+ if (TAILQ_EMPTY(&asoc->control_send_queue) &&
+ TAILQ_EMPTY(&asoc->send_queue) &&
+ TAILQ_EMPTY(&asoc->out_wheel)) {
+ /* Nothing left to send */
+ break;
+ }
+ if ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) <= 0) {
+ /* Nothing left to send */
+ break;
+ }
+ } while (num_out && (SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) ||
+ (burst_cnt < burst_limit)));
+
+ if (SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) == 0) {
+ if (burst_cnt >= burst_limit) {
+ SCTP_STAT_INCR(sctps_maxburstqueued);
+ asoc->burst_limit_applied = 1;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) {
+ sctp_log_maxburst(stcb, asoc->primary_destination, 0, burst_cnt, SCTP_MAX_BURST_APPLIED);
+ }
+ } else {
+ asoc->burst_limit_applied = 0;
+ }
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ sctp_log_cwnd(stcb, NULL, tot_out, SCTP_SEND_NOW_COMPLETES);
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, we have put out %d chunks\n",
+ tot_out);
+
+ /*-
+ * Now we need to clean up the control chunk chain if an ECNE is on
+ * it. It must be marked as UNSENT again so the next call will continue
+ * to send it until such time as we get a CWR to remove it.
+ */
+ if (stcb->asoc.ecn_echo_cnt_onq)
+ sctp_fix_ecn_echo(asoc);
+ return;
+}
+
+
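+/*
+ * Socket-layer entry point: validate the endpoint and hand the
+ * message off to sctp_sosend().
+ */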
+int
+sctp_output(inp, m, addr, control, p, flags)
+ struct sctp_inpcb *inp;
+ struct mbuf *m;
+ struct sockaddr *addr;
+ struct mbuf *control;
+ struct thread *p;
+ int flags;
+{
+ if (inp == NULL) {
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ return (EINVAL);
+ }
+ if (inp->sctp_socket == NULL) {
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ return (EINVAL);
+ }
+ return (sctp_sosend(inp->sctp_socket,
+ addr,
+ (struct uio *)NULL,
+ m,
+ control,
+ flags, p
+ ));
+}
+
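+/*
+ * Build (or reuse) a FORWARD-TSN chunk on the control queue that tells the
+ * peer how far it may advance its cumulative TSN past skipped chunks.
+ */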
+void
+send_forward_tsn(struct sctp_tcb *stcb,
+ struct sctp_association *asoc)
+{
+ struct sctp_tmit_chunk *chk;
+ struct sctp_forward_tsn_chunk *fwdtsn;
+ uint32_t advance_peer_ack_point;
+
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
+ if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) {
+ /* mark it as unsent */
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ /* Do we correct its output location? */
+ if (chk->whoTo != asoc->primary_destination) {
+ sctp_free_remote_addr(chk->whoTo);
+ chk->whoTo = asoc->primary_destination;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ }
+ goto sctp_fill_in_rest;
+ }
+ }
+ /* Ok if we reach here we must build one */
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ return;
+ }
+ asoc->fwd_tsn_cnt++;
+ chk->copy_by_ref = 0;
+ chk->rec.chunk_id.id = SCTP_FORWARD_CUM_TSN;
+ chk->rec.chunk_id.can_take_data = 0;
+ chk->asoc = asoc;
+ chk->whoTo = NULL;
+ chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ if (chk->data == NULL) {
+ sctp_free_a_chunk(stcb, chk);
+ return;
+ }
+ SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->whoTo = asoc->primary_destination;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next);
+ asoc->ctrl_queue_cnt++;
+sctp_fill_in_rest:
+ /*-
+ * Here we go through and fill out the part that deals with
+ * stream/seq of the ones we skip.
+ */
+ SCTP_BUF_LEN(chk->data) = 0;
+ {
+ struct sctp_tmit_chunk *at, *tp1, *last;
+ struct sctp_strseq *strseq;
+ unsigned int cnt_of_space, i, ovh;
+ unsigned int space_needed;
+ unsigned int cnt_of_skipped = 0;
+
+ TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) {
+ if (at->sent != SCTP_FORWARD_TSN_SKIP) {
+ /* no more to look at */
+ break;
+ }
+ if (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED) {
+ /* We don't report these */
+ continue;
+ }
+ cnt_of_skipped++;
+ }
+ space_needed = (sizeof(struct sctp_forward_tsn_chunk) +
+ (cnt_of_skipped * sizeof(struct sctp_strseq)));
+
+ cnt_of_space = M_TRAILINGSPACE(chk->data);
+
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ ovh = SCTP_MIN_OVERHEAD;
+ } else {
+ ovh = SCTP_MIN_V4_OVERHEAD;
+ }
+ if (cnt_of_space > (asoc->smallest_mtu - ovh)) {
+ /* trim to a mtu size */
+ cnt_of_space = asoc->smallest_mtu - ovh;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
+ sctp_misc_ints(SCTP_FWD_TSN_CHECK,
+ 0xff, 0, cnt_of_skipped,
+ asoc->advanced_peer_ack_point);
+
+ }
+ advance_peer_ack_point = asoc->advanced_peer_ack_point;
+ if (cnt_of_space < space_needed) {
+ /*-
+ * ok we must trim down the chunk by lowering the
+ * advance peer ack point.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
+ sctp_misc_ints(SCTP_FWD_TSN_CHECK,
+ 0xff, 0xff, cnt_of_space,
+ space_needed);
+ }
+ cnt_of_skipped = cnt_of_space - sizeof(struct sctp_forward_tsn_chunk);
+ cnt_of_skipped /= sizeof(struct sctp_strseq);
+ /*-
+ * Go through and find the TSN that will be the one
+ * we report.
+ */
+ at = TAILQ_FIRST(&asoc->sent_queue);
+ for (i = 0; i < cnt_of_skipped; i++) {
+ tp1 = TAILQ_NEXT(at, sctp_next);
+ if (tp1 == NULL) {
+ break;
+ }
+ at = tp1;
+ }
+ if (at && SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
+ sctp_misc_ints(SCTP_FWD_TSN_CHECK,
+ 0xff, cnt_of_skipped, at->rec.data.TSN_seq,
+ asoc->advanced_peer_ack_point);
+ }
+ last = at;
+ /*-
+ * last now points to last one I can report, update
+ * peer ack point
+ */
+ if (last)
+ advance_peer_ack_point = last->rec.data.TSN_seq;
+ space_needed = sizeof(struct sctp_forward_tsn_chunk) +
+ cnt_of_skipped * sizeof(struct sctp_strseq);
+ }
+ chk->send_size = space_needed;
+ /* Setup the chunk */
+ fwdtsn = mtod(chk->data, struct sctp_forward_tsn_chunk *);
+ fwdtsn->ch.chunk_length = htons(chk->send_size);
+ fwdtsn->ch.chunk_flags = 0;
+ fwdtsn->ch.chunk_type = SCTP_FORWARD_CUM_TSN;
+ fwdtsn->new_cumulative_tsn = htonl(advance_peer_ack_point);
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+ fwdtsn++;
+ /*-
+ * Move pointer to after the fwdtsn and transfer to the
+ * strseq pointer.
+ */
+ strseq = (struct sctp_strseq *)fwdtsn;
+ /*-
+ * Now populate the strseq list. This is done blindly
+ * without pulling out duplicate stream info. This is
+ * inefficient but won't harm the process since the peer will
+ * look at these in sequence and will thus release anything.
+ * It could mean we exceed the PMTU and chop off some that
+ * we could have included.. but this is unlikely (aka 1432/4
+ * would mean 300+ stream seq's would have to be reported in
+ * one FWD-TSN). With a bit of work we can later FIX this to
+ * optimize and pull out duplicates.. but it does add more
+ * overhead. So for now... not!
+ */
+ at = TAILQ_FIRST(&asoc->sent_queue);
+ for (i = 0; i < cnt_of_skipped; i++) {
+ tp1 = TAILQ_NEXT(at, sctp_next);
+ if (tp1 == NULL)
+ break;
+ if (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED) {
+ /* We don't report these */
+ i--;
+ at = tp1;
+ continue;
+ }
+ if (at->rec.data.TSN_seq == advance_peer_ack_point) {
+ at->rec.data.fwd_tsn_cnt = 0;
+ }
+ strseq->stream = ntohs(at->rec.data.stream_number);
+ strseq->sequence = ntohs(at->rec.data.stream_seq);
+ strseq++;
+ at = tp1;
+ }
+ }
+ return;
+
+}
+
+void
+sctp_send_sack(struct sctp_tcb *stcb)
+{
+ /*-
+ * Queue up a SACK or NR-SACK in the control queue.
+ * We must first check to see if a SACK or NR-SACK is
+ * somehow on the control queue.
+ * If so, we will take and remove the old one.
+ */
+ struct sctp_association *asoc;
+ struct sctp_tmit_chunk *chk, *a_chk;
+ struct sctp_sack_chunk *sack;
+ struct sctp_nr_sack_chunk *nr_sack;
+ struct sctp_gap_ack_block *gap_descriptor;
+ struct sack_track *selector;
+ int mergeable = 0;
+ int offset;
+ caddr_t limit;
+ uint32_t *dup;
+ int limit_reached = 0;
+ unsigned int i, siz, j;
+ unsigned int num_gap_blocks = 0, num_nr_gap_blocks = 0, space;
+ int num_dups = 0;
+ int space_req;
+ uint32_t highest_tsn;
+ uint8_t flags;
+ uint8_t type;
+ uint8_t tsn_map;
+
+ if ((stcb->asoc.sctp_nr_sack_on_off == 1) &&
+ (stcb->asoc.peer_supports_nr_sack == 1)) {
+ type = SCTP_NR_SELECTIVE_ACK;
+ } else {
+ type = SCTP_SELECTIVE_ACK;
+ }
+ a_chk = NULL;
+ asoc = &stcb->asoc;
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ if (asoc->last_data_chunk_from == NULL) {
+ /* Hmm we never received anything */
+ return;
+ }
+ sctp_slide_mapping_arrays(stcb);
+ sctp_set_rwnd(stcb, asoc);
+ TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
+ if (chk->rec.chunk_id.id == type) {
+ /* Hmm, found a sack already on queue, remove it */
+ TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next);
+ asoc->ctrl_queue_cnt--;
+ a_chk = chk;
+ if (a_chk->data) {
+ sctp_m_freem(a_chk->data);
+ a_chk->data = NULL;
+ }
+ sctp_free_remote_addr(a_chk->whoTo);
+ a_chk->whoTo = NULL;
+ break;
+ }
+ }
+ if (a_chk == NULL) {
+ sctp_alloc_a_chunk(stcb, a_chk);
+ if (a_chk == NULL) {
+ /* No memory so we drop the idea, and set a timer */
+ if (stcb->asoc.delayed_ack) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_RECV,
+ stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_5);
+ sctp_timer_start(SCTP_TIMER_TYPE_RECV,
+ stcb->sctp_ep, stcb, NULL);
+ } else {
+ stcb->asoc.send_sack = 1;
+ }
+ return;
+ }
+ a_chk->copy_by_ref = 0;
+ a_chk->rec.chunk_id.id = type;
+ a_chk->rec.chunk_id.can_take_data = 1;
+ }
+ /* Clear our pkt counts */
+ asoc->data_pkts_seen = 0;
+
+ a_chk->asoc = asoc;
+ a_chk->snd_count = 0;
+ a_chk->send_size = 0; /* fill in later */
+ a_chk->sent = SCTP_DATAGRAM_UNSENT;
+ a_chk->whoTo = NULL;
+
+ if ((asoc->numduptsns) ||
+ (asoc->last_data_chunk_from->dest_state & SCTP_ADDR_NOT_REACHABLE)) {
+ /*-
+ * Ok, we have some duplicates or the destination for the
+ * sack is unreachable, let's see if we can select an
+ * alternate to asoc->last_data_chunk_from
+ */
+ if ((!(asoc->last_data_chunk_from->dest_state & SCTP_ADDR_NOT_REACHABLE)) &&
+ (asoc->used_alt_onsack > asoc->numnets)) {
+ /* We used an alt last time, don't use one this time */
+ a_chk->whoTo = NULL;
+ } else {
+ asoc->used_alt_onsack++;
+ a_chk->whoTo = sctp_find_alternate_net(stcb, asoc->last_data_chunk_from, 0);
+ }
+ if (a_chk->whoTo == NULL) {
+ /* Nope, no alternate */
+ a_chk->whoTo = asoc->last_data_chunk_from;
+ asoc->used_alt_onsack = 0;
+ }
+ } else {
+ /*
+ * No duplicates so we use the last place we received data
+ * from.
+ */
+ asoc->used_alt_onsack = 0;
+ a_chk->whoTo = asoc->last_data_chunk_from;
+ }
+ if (a_chk->whoTo) {
+ atomic_add_int(&a_chk->whoTo->ref_count, 1);
+ }
+ if (compare_with_wrap(asoc->highest_tsn_inside_map, asoc->highest_tsn_inside_nr_map, MAX_TSN)) {
+ highest_tsn = asoc->highest_tsn_inside_map;
+ } else {
+ highest_tsn = asoc->highest_tsn_inside_nr_map;
+ }
+ if (highest_tsn == asoc->cumulative_tsn) {
+ /* no gaps */
+ if (type == SCTP_SELECTIVE_ACK) {
+ space_req = sizeof(struct sctp_sack_chunk);
+ } else {
+ space_req = sizeof(struct sctp_nr_sack_chunk);
+ }
+ } else {
+ /* gaps get a cluster */
+ space_req = MCLBYTES;
+ }
+ /* Ok now lets formulate a MBUF with our sack */
+ a_chk->data = sctp_get_mbuf_for_msg(space_req, 0, M_DONTWAIT, 1, MT_DATA);
+ if ((a_chk->data == NULL) ||
+ (a_chk->whoTo == NULL)) {
+ /* rats, no mbuf memory */
+ if (a_chk->data) {
+ /* was a problem with the destination */
+ sctp_m_freem(a_chk->data);
+ a_chk->data = NULL;
+ }
+ sctp_free_a_chunk(stcb, a_chk);
+ /* sa_ignore NO_NULL_CHK */
+ if (stcb->asoc.delayed_ack) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_RECV,
+ stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_6);
+ sctp_timer_start(SCTP_TIMER_TYPE_RECV,
+ stcb->sctp_ep, stcb, NULL);
+ } else {
+ stcb->asoc.send_sack = 1;
+ }
+ return;
+ }
+ /* ok, lets go through and fill it in */
+ SCTP_BUF_RESV_UF(a_chk->data, SCTP_MIN_OVERHEAD);
+ space = M_TRAILINGSPACE(a_chk->data);
+ if (space > (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD)) {
+ space = (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD);
+ }
+ limit = mtod(a_chk->data, caddr_t);
+ limit += space;
+
+ /* 0x01 is used by nonce for ecn */
+ if ((SCTP_BASE_SYSCTL(sctp_ecn_enable)) &&
+ (SCTP_BASE_SYSCTL(sctp_ecn_nonce)) &&
+ (asoc->peer_supports_ecn_nonce))
+ flags = (asoc->receiver_nonce_sum & SCTP_SACK_NONCE_SUM);
+ else
+ flags = 0;
+
+ if ((asoc->sctp_cmt_on_off == 1) &&
+ SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) {
+ /*-
+ * CMT DAC algorithm: If 2 packets (i.e., binary 10) have been
+ * received, then set high bit to 1, else 0. Reset
+ * pkts_rcvd.
+ */
+ flags |= (asoc->cmt_dac_pkts_rcvd << 6);
+ asoc->cmt_dac_pkts_rcvd = 0;
+ }
+#ifdef SCTP_ASOCLOG_OF_TSNS
+ stcb->asoc.cumack_logsnt[stcb->asoc.cumack_log_atsnt] = asoc->cumulative_tsn;
+ stcb->asoc.cumack_log_atsnt++;
+ if (stcb->asoc.cumack_log_atsnt >= SCTP_TSN_LOG_SIZE) {
+ stcb->asoc.cumack_log_atsnt = 0;
+ }
+#endif
+ /* reset the readers interpretation */
+ stcb->freed_by_sorcv_sincelast = 0;
+
+ if (type == SCTP_SELECTIVE_ACK) {
+ sack = mtod(a_chk->data, struct sctp_sack_chunk *);
+ nr_sack = NULL;
+ gap_descriptor = (struct sctp_gap_ack_block *)((caddr_t)sack + sizeof(struct sctp_sack_chunk));
+ if (highest_tsn > asoc->mapping_array_base_tsn) {
+ siz = (((highest_tsn - asoc->mapping_array_base_tsn) + 1) + 7) / 8;
+ } else {
+ siz = (((MAX_TSN - highest_tsn) + 1) + highest_tsn + 7) / 8;
+ }
+ } else {
+ sack = NULL;
+ nr_sack = mtod(a_chk->data, struct sctp_nr_sack_chunk *);
+ gap_descriptor = (struct sctp_gap_ack_block *)((caddr_t)nr_sack + sizeof(struct sctp_nr_sack_chunk));
+ if (asoc->highest_tsn_inside_map > asoc->mapping_array_base_tsn) {
+ siz = (((asoc->highest_tsn_inside_map - asoc->mapping_array_base_tsn) + 1) + 7) / 8;
+ } else {
+ siz = (((MAX_TSN - asoc->mapping_array_base_tsn) + 1) + asoc->highest_tsn_inside_map + 7) / 8;
+ }
+ }
+
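+ /*
+ * offset translates bit positions in the mapping array into gap-block
+ * offsets relative to the cumulative TSN (the mapping array base TSN
+ * is the TSN represented by bit 0).
+ */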
+ if (compare_with_wrap(asoc->mapping_array_base_tsn, asoc->cumulative_tsn, MAX_TSN)) {
+ offset = 1;
+ } else {
+ offset = asoc->mapping_array_base_tsn - asoc->cumulative_tsn;
+ }
+ if (((type == SCTP_SELECTIVE_ACK) &&
+ compare_with_wrap(highest_tsn, asoc->cumulative_tsn, MAX_TSN)) ||
+ ((type == SCTP_NR_SELECTIVE_ACK) &&
+ compare_with_wrap(asoc->highest_tsn_inside_map, asoc->cumulative_tsn, MAX_TSN))) {
+ /* we have a gap .. maybe */
+ for (i = 0; i < siz; i++) {
+ tsn_map = asoc->mapping_array[i];
+ if (type == SCTP_SELECTIVE_ACK) {
+ tsn_map |= asoc->nr_mapping_array[i];
+ }
+ if (i == 0) {
+ /*
+ * Clear all bits corresponding to TSNs
+ * smaller or equal to the cumulative TSN.
+ */
+ tsn_map &= (~0 << (1 - offset));
+ }
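+ /*
+ * sack_array is a 256-entry lookup table indexed by one byte of
+ * the mapping array; it yields the precomputed gap start/end
+ * pairs within those 8 TSNs plus edge flags used to merge gaps
+ * that span byte boundaries.
+ */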
+ selector = &sack_array[tsn_map];
+ if (mergeable && selector->right_edge) {
+ /*
+ * Backup, left and right edges were ok to
+ * merge.
+ */
+ num_gap_blocks--;
+ gap_descriptor--;
+ }
+ if (selector->num_entries == 0)
+ mergeable = 0;
+ else {
+ for (j = 0; j < selector->num_entries; j++) {
+ if (mergeable && selector->right_edge) {
+ /*
+ * do a merge by NOT setting
+ * the left side
+ */
+ mergeable = 0;
+ } else {
+ /*
+ * no merge, set the left
+ * side
+ */
+ mergeable = 0;
+ gap_descriptor->start = htons((selector->gaps[j].start + offset));
+ }
+ gap_descriptor->end = htons((selector->gaps[j].end + offset));
+ num_gap_blocks++;
+ gap_descriptor++;
+ if (((caddr_t)gap_descriptor + sizeof(struct sctp_gap_ack_block)) > limit) {
+ /* no more room */
+ limit_reached = 1;
+ break;
+ }
+ }
+ if (selector->left_edge) {
+ mergeable = 1;
+ }
+ }
+ if (limit_reached) {
+ /* Reached the limit stop */
+ break;
+ }
+ offset += 8;
+ }
+ }
+ if ((type == SCTP_NR_SELECTIVE_ACK) &&
+ (limit_reached == 0)) {
+
+ mergeable = 0;
+
+ if (asoc->highest_tsn_inside_nr_map > asoc->mapping_array_base_tsn) {
+ siz = (((asoc->highest_tsn_inside_nr_map - asoc->mapping_array_base_tsn) + 1) + 7) / 8;
+ } else {
+ siz = (((MAX_TSN - asoc->mapping_array_base_tsn) + 1) + asoc->highest_tsn_inside_nr_map + 7) / 8;
+ }
+
+ if (compare_with_wrap(asoc->mapping_array_base_tsn, asoc->cumulative_tsn, MAX_TSN)) {
+ offset = 1;
+ } else {
+ offset = asoc->mapping_array_base_tsn - asoc->cumulative_tsn;
+ }
+ if (compare_with_wrap(asoc->highest_tsn_inside_nr_map, asoc->cumulative_tsn, MAX_TSN)) {
+ /* we have a gap .. maybe */
+ for (i = 0; i < siz; i++) {
+ tsn_map = asoc->nr_mapping_array[i];
+ if (i == 0) {
+ /*
+ * Clear all bits corresponding to
+ * TSNs smaller or equal to the
+ * cumulative TSN.
+ */
+ tsn_map &= (~0 << (1 - offset));
+ }
+ selector = &sack_array[tsn_map];
+ if (mergeable && selector->right_edge) {
+ /*
+ * Backup, left and right edges were
+ * ok to merge.
+ */
+ num_nr_gap_blocks--;
+ gap_descriptor--;
+ }
+ if (selector->num_entries == 0)
+ mergeable = 0;
+ else {
+ for (j = 0; j < selector->num_entries; j++) {
+ if (mergeable && selector->right_edge) {
+ /*
+ * do a merge by NOT
+ * setting the left
+ * side
+ */
+ mergeable = 0;
+ } else {
+ /*
+ * no merge, set the
+ * left side
+ */
+ mergeable = 0;
+ gap_descriptor->start = htons((selector->gaps[j].start + offset));
+ }
+ gap_descriptor->end = htons((selector->gaps[j].end + offset));
+ num_nr_gap_blocks++;
+ gap_descriptor++;
+ if (((caddr_t)gap_descriptor + sizeof(struct sctp_gap_ack_block)) > limit) {
+ /* no more room */
+ limit_reached = 1;
+ break;
+ }
+ }
+ if (selector->left_edge) {
+ mergeable = 1;
+ }
+ }
+ if (limit_reached) {
+ /* Reached the limit stop */
+ break;
+ }
+ offset += 8;
+ }
+ }
+ }
+ /* now we must add any dups we are going to report. */
+ if ((limit_reached == 0) && (asoc->numduptsns)) {
+ dup = (uint32_t *) gap_descriptor;
+ for (i = 0; i < asoc->numduptsns; i++) {
+ *dup = htonl(asoc->dup_tsns[i]);
+ dup++;
+ num_dups++;
+ if (((caddr_t)dup + sizeof(uint32_t)) > limit) {
+ /* no more room */
+ break;
+ }
+ }
+ asoc->numduptsns = 0;
+ }
+ /*
+ * now that the chunk is prepared queue it to the control chunk
+ * queue.
+ */
+ if (type == SCTP_SELECTIVE_ACK) {
+ a_chk->send_size = sizeof(struct sctp_sack_chunk) +
+ (num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) +
+ num_dups * sizeof(int32_t);
+ SCTP_BUF_LEN(a_chk->data) = a_chk->send_size;
+ sack->sack.cum_tsn_ack = htonl(asoc->cumulative_tsn);
+ sack->sack.a_rwnd = htonl(asoc->my_rwnd);
+ sack->sack.num_gap_ack_blks = htons(num_gap_blocks);
+ sack->sack.num_dup_tsns = htons(num_dups);
+ sack->ch.chunk_type = type;
+ sack->ch.chunk_flags = flags;
+ sack->ch.chunk_length = htons(a_chk->send_size);
+ } else {
+ a_chk->send_size = sizeof(struct sctp_nr_sack_chunk) +
+ (num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) +
+ num_dups * sizeof(int32_t);
+ SCTP_BUF_LEN(a_chk->data) = a_chk->send_size;
+ nr_sack->nr_sack.cum_tsn_ack = htonl(asoc->cumulative_tsn);
+ nr_sack->nr_sack.a_rwnd = htonl(asoc->my_rwnd);
+ nr_sack->nr_sack.num_gap_ack_blks = htons(num_gap_blocks);
+ nr_sack->nr_sack.num_nr_gap_ack_blks = htons(num_nr_gap_blocks);
+ nr_sack->nr_sack.num_dup_tsns = htons(num_dups);
+ nr_sack->nr_sack.reserved = 0;
+ nr_sack->ch.chunk_type = type;
+ nr_sack->ch.chunk_flags = flags;
+ nr_sack->ch.chunk_length = htons(a_chk->send_size);
+ }
+ TAILQ_INSERT_TAIL(&asoc->control_send_queue, a_chk, sctp_next);
+ asoc->my_last_reported_rwnd = asoc->my_rwnd;
+ asoc->ctrl_queue_cnt++;
+ asoc->send_sack = 0;
+ SCTP_STAT_INCR(sctps_sendsacks);
+ return;
+}
+
+void
+sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ struct mbuf *m_abort;
+ struct mbuf *m_out = NULL, *m_end = NULL;
+ struct sctp_abort_chunk *abort = NULL;
+ int sz;
+ uint32_t auth_offset = 0;
+ struct sctp_auth_chunk *auth = NULL;
+
+ /*-
+ * Add an AUTH chunk, if the chunk requires it, and save the offset into
+ * the chain for AUTH
+ */
+ if (sctp_auth_is_required_chunk(SCTP_ABORT_ASSOCIATION,
+ stcb->asoc.peer_auth_chunks)) {
+ m_out = sctp_add_auth_chunk(m_out, &m_end, &auth, &auth_offset,
+ stcb, SCTP_ABORT_ASSOCIATION);
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ }
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ m_abort = sctp_get_mbuf_for_msg(sizeof(struct sctp_abort_chunk), 0, M_DONTWAIT, 1, MT_HEADER);
+ if (m_abort == NULL) {
+ /* no mbuf's */
+ if (m_out)
+ sctp_m_freem(m_out);
+ return;
+ }
+ /* link in any error */
+ SCTP_BUF_NEXT(m_abort) = operr;
+ sz = 0;
+ if (operr) {
+ struct mbuf *n;
+
+ n = operr;
+ while (n) {
+ sz += SCTP_BUF_LEN(n);
+ n = SCTP_BUF_NEXT(n);
+ }
+ }
+ SCTP_BUF_LEN(m_abort) = sizeof(*abort);
+ if (m_out == NULL) {
+ /* NO Auth chunk prepended, so reserve space in front */
+ SCTP_BUF_RESV_UF(m_abort, SCTP_MIN_OVERHEAD);
+ m_out = m_abort;
+ } else {
+ /* Put AUTH chunk at the front of the chain */
+ SCTP_BUF_NEXT(m_end) = m_abort;
+ }
+
+ /* fill in the ABORT chunk */
+ abort = mtod(m_abort, struct sctp_abort_chunk *);
+ abort->ch.chunk_type = SCTP_ABORT_ASSOCIATION;
+ abort->ch.chunk_flags = 0;
+ abort->ch.chunk_length = htons(sizeof(*abort) + sz);
+
+ (void)sctp_lowlevel_chunk_output(stcb->sctp_ep, stcb,
+ stcb->asoc.primary_destination,
+ (struct sockaddr *)&stcb->asoc.primary_destination->ro._l_addr,
+ m_out, auth_offset, auth, stcb->asoc.authinfo.active_keyid, 1, 0, NULL, 0,
+ stcb->sctp_ep->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag),
+ stcb->asoc.primary_destination->port, so_locked, NULL);
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+}
+
+void
+sctp_send_shutdown_complete(struct sctp_tcb *stcb,
+ struct sctp_nets *net,
+ int reflect_vtag)
+{
+ /* formulate and SEND a SHUTDOWN-COMPLETE */
+ struct mbuf *m_shutdown_comp;
+ struct sctp_shutdown_complete_chunk *shutdown_complete;
+ uint32_t vtag;
+ uint8_t flags;
+
+ m_shutdown_comp = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_DONTWAIT, 1, MT_HEADER);
+ if (m_shutdown_comp == NULL) {
+ /* no mbuf's */
+ return;
+ }
+ if (reflect_vtag) {
+ flags = SCTP_HAD_NO_TCB;
+ vtag = stcb->asoc.my_vtag;
+ } else {
+ flags = 0;
+ vtag = stcb->asoc.peer_vtag;
+ }
+ shutdown_complete = mtod(m_shutdown_comp, struct sctp_shutdown_complete_chunk *);
+ shutdown_complete->ch.chunk_type = SCTP_SHUTDOWN_COMPLETE;
+ shutdown_complete->ch.chunk_flags = flags;
+ shutdown_complete->ch.chunk_length = htons(sizeof(struct sctp_shutdown_complete_chunk));
+ SCTP_BUF_LEN(m_shutdown_comp) = sizeof(struct sctp_shutdown_complete_chunk);
+ (void)sctp_lowlevel_chunk_output(stcb->sctp_ep, stcb, net,
+ (struct sockaddr *)&net->ro._l_addr,
+ m_shutdown_comp, 0, NULL, 0, 1, 0, NULL, 0,
+ stcb->sctp_ep->sctp_lport, stcb->rport,
+ htonl(vtag),
+ net->port, SCTP_SO_NOT_LOCKED, NULL);
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ return;
+}
+
+void
+sctp_send_shutdown_complete2(struct mbuf *m, int iphlen, struct sctphdr *sh,
+ uint32_t vrf_id, uint16_t port)
+{
+ /* formulate and SEND a SHUTDOWN-COMPLETE */
+ struct mbuf *o_pak;
+ struct mbuf *mout;
+ struct ip *iph, *iph_out;
+ struct udphdr *udp = NULL;
+
+#ifdef INET6
+ struct ip6_hdr *ip6, *ip6_out;
+
+#endif
+ int offset_out, len, mlen;
+ struct sctp_shutdown_complete_msg *comp_cp;
+
+ iph = mtod(m, struct ip *);
+ switch (iph->ip_v) {
+ case IPVERSION:
+ len = (sizeof(struct ip) + sizeof(struct sctp_shutdown_complete_msg));
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ len = (sizeof(struct ip6_hdr) + sizeof(struct sctp_shutdown_complete_msg));
+ break;
+#endif
+ default:
+ return;
+ }
+ if (port) {
+ len += sizeof(struct udphdr);
+ }
+ mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_DONTWAIT, 1, MT_DATA);
+ if (mout == NULL) {
+ return;
+ }
+ SCTP_BUF_RESV_UF(mout, max_linkhdr);
+ SCTP_BUF_LEN(mout) = len;
+ SCTP_BUF_NEXT(mout) = NULL;
+ iph_out = NULL;
+#ifdef INET6
+ ip6_out = NULL;
+#endif
+ offset_out = 0;
+
+ switch (iph->ip_v) {
+ case IPVERSION:
+ iph_out = mtod(mout, struct ip *);
+
+ /* Fill in the IP header for the ABORT */
+ iph_out->ip_v = IPVERSION;
+ iph_out->ip_hl = (sizeof(struct ip) / 4);
+ iph_out->ip_tos = (u_char)0;
+ iph_out->ip_id = 0;
+ iph_out->ip_off = 0;
+ iph_out->ip_ttl = MAXTTL;
+ if (port) {
+ iph_out->ip_p = IPPROTO_UDP;
+ } else {
+ iph_out->ip_p = IPPROTO_SCTP;
+ }
+ iph_out->ip_src.s_addr = iph->ip_dst.s_addr;
+ iph_out->ip_dst.s_addr = iph->ip_src.s_addr;
+
+ /* let IP layer calculate this */
+ iph_out->ip_sum = 0;
+ offset_out += sizeof(*iph_out);
+ comp_cp = (struct sctp_shutdown_complete_msg *)(
+ (caddr_t)iph_out + offset_out);
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ ip6 = (struct ip6_hdr *)iph;
+ ip6_out = mtod(mout, struct ip6_hdr *);
+
+ /* Fill in the IPv6 header for the ABORT */
+ ip6_out->ip6_flow = ip6->ip6_flow;
+ ip6_out->ip6_hlim = MODULE_GLOBAL(ip6_defhlim);
+ if (port) {
+ ip6_out->ip6_nxt = IPPROTO_UDP;
+ } else {
+ ip6_out->ip6_nxt = IPPROTO_SCTP;
+ }
+ ip6_out->ip6_src = ip6->ip6_dst;
+ ip6_out->ip6_dst = ip6->ip6_src;
+ /*
+ * ?? The old code had both the iph len + payload, I think
+ * this is wrong and would never have worked
+ */
+ ip6_out->ip6_plen = sizeof(struct sctp_shutdown_complete_msg);
+ offset_out += sizeof(*ip6_out);
+ comp_cp = (struct sctp_shutdown_complete_msg *)(
+ (caddr_t)ip6_out + offset_out);
+ break;
+#endif /* INET6 */
+ default:
+ /* Currently not supported. */
+ sctp_m_freem(mout);
+ return;
+ }
+ if (port) {
+ udp = (struct udphdr *)comp_cp;
+ udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port));
+ udp->uh_dport = port;
+ udp->uh_ulen = htons(sizeof(struct sctp_shutdown_complete_msg) + sizeof(struct udphdr));
+ if (iph_out)
+ udp->uh_sum = in_pseudo(iph_out->ip_src.s_addr, iph_out->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP));
+ offset_out += sizeof(struct udphdr);
+ comp_cp = (struct sctp_shutdown_complete_msg *)((caddr_t)comp_cp + sizeof(struct udphdr));
+ }
+ if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) {
+ /* no mbuf's */
+ sctp_m_freem(mout);
+ return;
+ }
+ /* Now copy in and fill in the ABORT tags etc. */
+ comp_cp->sh.src_port = sh->dest_port;
+ comp_cp->sh.dest_port = sh->src_port;
+ comp_cp->sh.checksum = 0;
+ comp_cp->sh.v_tag = sh->v_tag;
+ comp_cp->shut_cmp.ch.chunk_flags = SCTP_HAD_NO_TCB;
+ comp_cp->shut_cmp.ch.chunk_type = SCTP_SHUTDOWN_COMPLETE;
+ comp_cp->shut_cmp.ch.chunk_length = htons(sizeof(struct sctp_shutdown_complete_chunk));
+
+ if (iph_out != NULL) {
+ sctp_route_t ro;
+ int ret;
+
+ mlen = SCTP_BUF_LEN(mout);
+ bzero(&ro, sizeof ro);
+ /* set IPv4 length */
+ iph_out->ip_len = mlen;
+#ifdef SCTP_PACKET_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING)
+ sctp_packet_log(mout, mlen);
+#endif
+ if (port) {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ comp_cp->sh.checksum = sctp_calculate_cksum(mout, offset_out);
+ SCTP_STAT_INCR(sctps_sendswcrc);
+#endif
+ SCTP_ENABLE_UDP_CSUM(mout);
+ } else {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ mout->m_pkthdr.csum_flags = CSUM_SCTP;
+ mout->m_pkthdr.csum_data = 0;
+ SCTP_STAT_INCR(sctps_sendhwcrc);
+#endif
+ }
+ SCTP_ATTACH_CHAIN(o_pak, mout, mlen);
+ /* out it goes */
+ SCTP_IP_OUTPUT(ret, o_pak, &ro, NULL, vrf_id);
+
+ /* Free the route if we got one back */
+ if (ro.ro_rt)
+ RTFREE(ro.ro_rt);
+ }
+#ifdef INET6
+ if (ip6_out != NULL) {
+ struct route_in6 ro;
+ int ret;
+ struct ifnet *ifp = NULL;
+
+ bzero(&ro, sizeof(ro));
+ mlen = SCTP_BUF_LEN(mout);
+#ifdef SCTP_PACKET_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING)
+ sctp_packet_log(mout, mlen);
+#endif
+ SCTP_ATTACH_CHAIN(o_pak, mout, mlen);
+ if (port) {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ comp_cp->sh.checksum = sctp_calculate_cksum(mout, sizeof(struct ip6_hdr) + sizeof(struct udphdr));
+ SCTP_STAT_INCR(sctps_sendswcrc);
+#endif
+ if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), mlen - sizeof(struct ip6_hdr))) == 0) {
+ udp->uh_sum = 0xffff;
+ }
+ } else {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ mout->m_pkthdr.csum_flags = CSUM_SCTP;
+ mout->m_pkthdr.csum_data = 0;
+ SCTP_STAT_INCR(sctps_sendhwcrc);
+#endif
+ }
+ SCTP_IP6_OUTPUT(ret, o_pak, &ro, &ifp, NULL, vrf_id);
+
+ /* Free the route if we got one back */
+ if (ro.ro_rt)
+ RTFREE(ro.ro_rt);
+ }
+#endif
+ SCTP_STAT_INCR(sctps_sendpackets);
+ SCTP_STAT_INCR_COUNTER64(sctps_outpackets);
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ return;
+
+}
+
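+/*
+ * Pick the reachable destination that has gone longest without a send as the
+ * next heartbeat target; unconfirmed addresses are heartbeated more
+ * aggressively. Returns NULL if nothing needs a heartbeat right now.
+ */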
+static struct sctp_nets *
+sctp_select_hb_destination(struct sctp_tcb *stcb, struct timeval *now)
+{
+ struct sctp_nets *net, *hnet;
+ int ms_goneby, highest_ms, state_overide = 0;
+
+ (void)SCTP_GETTIME_TIMEVAL(now);
+ highest_ms = 0;
+ hnet = NULL;
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if (
+ ((net->dest_state & SCTP_ADDR_NOHB) && ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0)) ||
+ (net->dest_state & SCTP_ADDR_OUT_OF_SCOPE)
+ ) {
+ /*
+ * Skip this guy from consideration if HB is off AND
+ * it's confirmed
+ */
+ continue;
+ }
+ if (sctp_destination_is_reachable(stcb, (struct sockaddr *)&net->ro._l_addr) == 0) {
+ /* skip this dest net from consideration */
+ continue;
+ }
+ if (net->last_sent_time.tv_sec) {
+ /* Sent to so we subtract */
+ ms_goneby = (now->tv_sec - net->last_sent_time.tv_sec) * 1000;
+ } else
+ /* Never been sent to */
+ ms_goneby = 0x7fffffff;
+ /*-
+ * When the address state is unconfirmed but still
+ * considered reachable, we HB at a higher rate. Once it
+ * goes confirmed OR reaches the "unreachable" state, then
+ * we cut it back to HB at a more normal pace.
+ */
+ if ((net->dest_state & (SCTP_ADDR_UNCONFIRMED | SCTP_ADDR_NOT_REACHABLE)) == SCTP_ADDR_UNCONFIRMED) {
+ state_overide = 1;
+ } else {
+ state_overide = 0;
+ }
+
+ if ((((unsigned int)ms_goneby >= net->RTO) || (state_overide)) &&
+ (ms_goneby > highest_ms)) {
+ highest_ms = ms_goneby;
+ hnet = net;
+ }
+ }
+ if (hnet &&
+ ((hnet->dest_state & (SCTP_ADDR_UNCONFIRMED | SCTP_ADDR_NOT_REACHABLE)) == SCTP_ADDR_UNCONFIRMED)) {
+ state_overide = 1;
+ } else {
+ state_overide = 0;
+ }
+
+ if (hnet && highest_ms && (((unsigned int)highest_ms >= hnet->RTO) || state_overide)) {
+ /*-
+ * Found the one with longest delay bounds OR it is
+ * unconfirmed and still not marked unreachable.
+ */
+ SCTPDBG(SCTP_DEBUG_OUTPUT4, "net:%p is the hb winner -", hnet);
+#ifdef SCTP_DEBUG
+ if (hnet) {
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT4,
+ (struct sockaddr *)&hnet->ro._l_addr);
+ } else {
+ SCTPDBG(SCTP_DEBUG_OUTPUT4, " none\n");
+ }
+#endif
+ /* update the timer now */
+ hnet->last_sent_time = *now;
+ return (hnet);
+ }
+ /* Nothing to HB */
+ return (NULL);
+}
+
+int
+sctp_send_hb(struct sctp_tcb *stcb, int user_req, struct sctp_nets *u_net)
+{
+ struct sctp_tmit_chunk *chk;
+ struct sctp_nets *net;
+ struct sctp_heartbeat_chunk *hb;
+ struct timeval now;
+ struct sockaddr_in *sin;
+ struct sockaddr_in6 *sin6;
+
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ if (user_req == 0) {
+ net = sctp_select_hb_destination(stcb, &now);
+ if (net == NULL) {
+ /*-
+ * All are busy, none to send to; just start the
+ * timer again.
+ */
+ if (stcb->asoc.state == 0) {
+ return (0);
+ }
+ sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT,
+ stcb->sctp_ep,
+ stcb,
+ net);
+ return (0);
+ }
+ } else {
+ net = u_net;
+ if (net == NULL) {
+ return (0);
+ }
+ (void)SCTP_GETTIME_TIMEVAL(&now);
+ }
+ sin = (struct sockaddr_in *)&net->ro._l_addr;
+ if (sin->sin_family != AF_INET) {
+ if (sin->sin_family != AF_INET6) {
+ /* huh */
+ return (0);
+ }
+ }
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT4, "Gak, can't get a chunk for hb\n");
+ return (0);
+ }
+ chk->copy_by_ref = 0;
+ chk->rec.chunk_id.id = SCTP_HEARTBEAT_REQUEST;
+ chk->rec.chunk_id.can_take_data = 1;
+ chk->asoc = &stcb->asoc;
+ chk->send_size = sizeof(struct sctp_heartbeat_chunk);
+
+ chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_DONTWAIT, 1, MT_HEADER);
+ if (chk->data == NULL) {
+ sctp_free_a_chunk(stcb, chk);
+ return (0);
+ }
+ SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->whoTo = net;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ /* Now we have a mbuf that we can fill in with the details */
+ hb = mtod(chk->data, struct sctp_heartbeat_chunk *);
+ memset(hb, 0, sizeof(struct sctp_heartbeat_chunk));
+ /* fill out chunk header */
+ hb->ch.chunk_type = SCTP_HEARTBEAT_REQUEST;
+ hb->ch.chunk_flags = 0;
+ hb->ch.chunk_length = htons(chk->send_size);
+ /* Fill out hb parameter */
+ hb->heartbeat.hb_info.ph.param_type = htons(SCTP_HEARTBEAT_INFO);
+ hb->heartbeat.hb_info.ph.param_length = htons(sizeof(struct sctp_heartbeat_info_param));
+ hb->heartbeat.hb_info.time_value_1 = now.tv_sec;
+ hb->heartbeat.hb_info.time_value_2 = now.tv_usec;
+ /* Did our user request this one, put it in */
+ hb->heartbeat.hb_info.user_req = user_req;
+ hb->heartbeat.hb_info.addr_family = sin->sin_family;
+ hb->heartbeat.hb_info.addr_len = sin->sin_len;
+ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) {
+ /*
+ * we only take from the entropy pool if the address is not
+ * confirmed.
+ */
+ net->heartbeat_random1 = hb->heartbeat.hb_info.random_value1 = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep);
+ net->heartbeat_random2 = hb->heartbeat.hb_info.random_value2 = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep);
+ } else {
+ net->heartbeat_random1 = hb->heartbeat.hb_info.random_value1 = 0;
+ net->heartbeat_random2 = hb->heartbeat.hb_info.random_value2 = 0;
+ }
+ if (sin->sin_family == AF_INET) {
+ memcpy(hb->heartbeat.hb_info.address, &sin->sin_addr, sizeof(sin->sin_addr));
+ } else if (sin->sin_family == AF_INET6) {
+ /* We leave the scope the way it is in our lookup table. */
+ sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
+ memcpy(hb->heartbeat.hb_info.address, &sin6->sin6_addr, sizeof(sin6->sin6_addr));
+ } else {
+ /* huh compiler bug */
+ return (0);
+ }
+
+ /*
+ * JRS 5/14/07 - In CMT PF, the T3 timer is used to track
+ * PF-heartbeats. Because of this, threshold management is done by
+ * the t3 timer handler, and does not need to be done upon the send
+ * of a PF-heartbeat. If CMT PF is on and the destination to which a
+ * heartbeat is being sent is in PF state, do NOT do threshold
+ * management.
+ */
+ if ((stcb->asoc.sctp_cmt_pf == 0) ||
+ ((net->dest_state & SCTP_ADDR_PF) != SCTP_ADDR_PF)) {
+ /* ok we have a destination that needs a beat */
+ /* let's do the threshold management Qiaobing style */
+ if (sctp_threshold_management(stcb->sctp_ep, stcb, net,
+ stcb->asoc.max_send_times)) {
+ /*-
+ * we have lost the association, in a way this is
+ * quite bad since we really are one less time since
+ * we really did not send yet. This is the down side
+ * to the Q's style as defined in the RFC and not my
+ * alternate style defined in the RFC.
+ */
+ if (chk->data != NULL) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ /*
+ * Here we do NOT use the macro since the
+ * association is now gone.
+ */
+ if (chk->whoTo) {
+ sctp_free_remote_addr(chk->whoTo);
+ chk->whoTo = NULL;
+ }
+ sctp_free_a_chunk((struct sctp_tcb *)NULL, chk);
+ return (-1);
+ }
+ }
+ net->hb_responded = 0;
+ TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next);
+ stcb->asoc.ctrl_queue_cnt++;
+ SCTP_STAT_INCR(sctps_sendheartbeat);
+ /*-
+ * Call directly med level routine to put out the chunk. It will
+ * always tumble out control chunks aka HB but it may even tumble
+ * out data too.
+ */
+ return (1);
+}
+
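+/*
+ * Queue an ECN-ECHO chunk carrying high_tsn; if one is already on the
+ * control queue, just update its TSN instead of adding a new chunk.
+ */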
+void
+sctp_send_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net,
+ uint32_t high_tsn)
+{
+ struct sctp_association *asoc;
+ struct sctp_ecne_chunk *ecne;
+ struct sctp_tmit_chunk *chk;
+
+ asoc = &stcb->asoc;
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
+ if (chk->rec.chunk_id.id == SCTP_ECN_ECHO) {
+ /* found a previous ECN_ECHO update it if needed */
+ ecne = mtod(chk->data, struct sctp_ecne_chunk *);
+ ecne->tsn = htonl(high_tsn);
+ return;
+ }
+ }
+ /* nope could not find one to update so we must build one */
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ return;
+ }
+ chk->copy_by_ref = 0;
+ SCTP_STAT_INCR(sctps_sendecne);
+ chk->rec.chunk_id.id = SCTP_ECN_ECHO;
+ chk->rec.chunk_id.can_take_data = 0;
+ chk->asoc = &stcb->asoc;
+ chk->send_size = sizeof(struct sctp_ecne_chunk);
+ chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_DONTWAIT, 1, MT_HEADER);
+ if (chk->data == NULL) {
+ sctp_free_a_chunk(stcb, chk);
+ return;
+ }
+ SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->whoTo = net;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ stcb->asoc.ecn_echo_cnt_onq++;
+ ecne = mtod(chk->data, struct sctp_ecne_chunk *);
+ ecne->ch.chunk_type = SCTP_ECN_ECHO;
+ ecne->ch.chunk_flags = 0;
+ ecne->ch.chunk_length = htons(sizeof(struct sctp_ecne_chunk));
+ ecne->tsn = htonl(high_tsn);
+ TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next);
+ asoc->ctrl_queue_cnt++;
+}
+
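+/*
+ * Build a PACKET-DROPPED chunk that echoes back (possibly truncated) the
+ * received packet, so the peer can retransmit without waiting for a timeout.
+ */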
+void
+sctp_send_packet_dropped(struct sctp_tcb *stcb, struct sctp_nets *net,
+ struct mbuf *m, int iphlen, int bad_crc)
+{
+ struct sctp_association *asoc;
+ struct sctp_pktdrop_chunk *drp;
+ struct sctp_tmit_chunk *chk;
+ uint8_t *datap;
+ int len;
+ int was_trunc = 0;
+ struct ip *iph;
+
+#ifdef INET6
+ struct ip6_hdr *ip6h;
+
+#endif
+ int fullsz = 0, extra = 0;
+ long spc;
+ int offset;
+ struct sctp_chunkhdr *ch, chunk_buf;
+ unsigned int chk_length;
+
+ if (!stcb) {
+ return;
+ }
+ asoc = &stcb->asoc;
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ if (asoc->peer_supports_pktdrop == 0) {
+ /*-
+ * peer must declare support before I send one.
+ */
+ return;
+ }
+ if (stcb->sctp_socket == NULL) {
+ return;
+ }
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ return;
+ }
+ chk->copy_by_ref = 0;
+ iph = mtod(m, struct ip *);
+ if (iph == NULL) {
+ sctp_free_a_chunk(stcb, chk);
+ return;
+ }
+ switch (iph->ip_v) {
+ case IPVERSION:
+ /* IPv4 */
+ len = chk->send_size = iph->ip_len;
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ /* IPv6 */
+ ip6h = mtod(m, struct ip6_hdr *);
+		len = chk->send_size = ntohs(ip6h->ip6_plen);
+ break;
+#endif
+ default:
+ return;
+ }
+ /* Validate that we do not have an ABORT in here. */
+ offset = iphlen + sizeof(struct sctphdr);
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset,
+ sizeof(*ch), (uint8_t *) & chunk_buf);
+ while (ch != NULL) {
+ chk_length = ntohs(ch->chunk_length);
+ if (chk_length < sizeof(*ch)) {
+ /* break to abort land */
+ break;
+ }
+ switch (ch->chunk_type) {
+ case SCTP_PACKET_DROPPED:
+ case SCTP_ABORT_ASSOCIATION:
+ case SCTP_INITIATION_ACK:
+ /**
+			 * We don't respond with a PKT-DROP to an ABORT
+ * or PKT-DROP. We also do not respond to an
+ * INIT-ACK, because we can't know if the initiation
+ * tag is correct or not.
+ */
+ sctp_free_a_chunk(stcb, chk);
+ return;
+ default:
+ break;
+ }
+ offset += SCTP_SIZE32(chk_length);
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset,
+ sizeof(*ch), (uint8_t *) & chunk_buf);
+ }
+
+ if ((len + SCTP_MAX_OVERHEAD + sizeof(struct sctp_pktdrop_chunk)) >
+ min(stcb->asoc.smallest_mtu, MCLBYTES)) {
+ /*
+ * only send 1 mtu worth, trim off the excess on the end.
+ */
+ fullsz = len - extra;
+ len = min(stcb->asoc.smallest_mtu, MCLBYTES) - SCTP_MAX_OVERHEAD;
+ was_trunc = 1;
+ }
+ chk->asoc = &stcb->asoc;
+ chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ if (chk->data == NULL) {
+jump_out:
+ sctp_free_a_chunk(stcb, chk);
+ return;
+ }
+ SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
+ drp = mtod(chk->data, struct sctp_pktdrop_chunk *);
+ if (drp == NULL) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ goto jump_out;
+ }
+ chk->book_size = SCTP_SIZE32((chk->send_size + sizeof(struct sctp_pktdrop_chunk) +
+ sizeof(struct sctphdr) + SCTP_MED_OVERHEAD));
+ chk->book_size_scale = 0;
+ if (was_trunc) {
+ drp->ch.chunk_flags = SCTP_PACKET_TRUNCATED;
+ drp->trunc_len = htons(fullsz);
+ /*
+		 * Len is already adjusted to size minus overhead above;
+		 * take the pkt_drop chunk itself out of it.
+ */
+ chk->send_size = len - sizeof(struct sctp_pktdrop_chunk);
+ len = chk->send_size;
+ } else {
+ /* no truncation needed */
+ drp->ch.chunk_flags = 0;
+ drp->trunc_len = htons(0);
+ }
+ if (bad_crc) {
+ drp->ch.chunk_flags |= SCTP_BADCRC;
+ }
+ chk->send_size += sizeof(struct sctp_pktdrop_chunk);
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ if (net) {
+ /* we should hit here */
+ chk->whoTo = net;
+ } else {
+ chk->whoTo = asoc->primary_destination;
+ }
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ chk->rec.chunk_id.id = SCTP_PACKET_DROPPED;
+ chk->rec.chunk_id.can_take_data = 1;
+ drp->ch.chunk_type = SCTP_PACKET_DROPPED;
+ drp->ch.chunk_length = htons(chk->send_size);
+ spc = SCTP_SB_LIMIT_RCV(stcb->sctp_socket);
+ if (spc < 0) {
+ spc = 0;
+ }
+ drp->bottle_bw = htonl(spc);
+ if (asoc->my_rwnd) {
+ drp->current_onq = htonl(asoc->size_on_reasm_queue +
+ asoc->size_on_all_streams +
+ asoc->my_rwnd_control_len +
+ stcb->sctp_socket->so_rcv.sb_cc);
+ } else {
+ /*-
+ * If my rwnd is 0, possibly from mbuf depletion as well as
+ * space used, tell the peer there is NO space aka onq == bw
+ */
+ drp->current_onq = htonl(spc);
+ }
+ drp->reserved = 0;
+ datap = drp->data;
+ m_copydata(m, iphlen, len, (caddr_t)datap);
+ TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next);
+ asoc->ctrl_queue_cnt++;
+}
+
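+/*
+ * Queue a CWR chunk acknowledging that the congestion window has been
+ * reduced up to the given TSN.  An already queued CWR is reused and its
+ * TSN advanced only if the new value is larger.
+ */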
+void
+sctp_send_cwr(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t high_tsn)
+{
+ struct sctp_association *asoc;
+ struct sctp_cwr_chunk *cwr;
+ struct sctp_tmit_chunk *chk;
+
+ asoc = &stcb->asoc;
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
+ if (chk->rec.chunk_id.id == SCTP_ECN_CWR) {
+			/* found a previous ECN_CWR, update it if needed */
+ cwr = mtod(chk->data, struct sctp_cwr_chunk *);
+ if (compare_with_wrap(high_tsn, ntohl(cwr->tsn),
+ MAX_TSN)) {
+ cwr->tsn = htonl(high_tsn);
+ }
+ return;
+ }
+ }
+ /* nope could not find one to update so we must build one */
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ return;
+ }
+ chk->copy_by_ref = 0;
+ chk->rec.chunk_id.id = SCTP_ECN_CWR;
+ chk->rec.chunk_id.can_take_data = 1;
+ chk->asoc = &stcb->asoc;
+ chk->send_size = sizeof(struct sctp_cwr_chunk);
+ chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_DONTWAIT, 1, MT_HEADER);
+ if (chk->data == NULL) {
+ sctp_free_a_chunk(stcb, chk);
+ return;
+ }
+ SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->whoTo = net;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ cwr = mtod(chk->data, struct sctp_cwr_chunk *);
+ cwr->ch.chunk_type = SCTP_ECN_CWR;
+ cwr->ch.chunk_flags = 0;
+ cwr->ch.chunk_length = htons(sizeof(struct sctp_cwr_chunk));
+ cwr->tsn = htonl(high_tsn);
+ TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next);
+ asoc->ctrl_queue_cnt++;
+}
+
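+/*
+ * Append an outgoing stream reset request parameter to the stream-reset
+ * chunk in chk, listing the streams to reset, and update the chunk and
+ * bookkeeping lengths.
+ */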
+void
+sctp_add_stream_reset_out(struct sctp_tmit_chunk *chk,
+ int number_entries, uint16_t * list,
+ uint32_t seq, uint32_t resp_seq, uint32_t last_sent)
+{
+ int len, old_len, i;
+ struct sctp_stream_reset_out_request *req_out;
+ struct sctp_chunkhdr *ch;
+
+ ch = mtod(chk->data, struct sctp_chunkhdr *);
+
+
+ old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length));
+
+ /* get to new offset for the param. */
+ req_out = (struct sctp_stream_reset_out_request *)((caddr_t)ch + len);
+ /* now how long will this param be? */
+ len = (sizeof(struct sctp_stream_reset_out_request) + (sizeof(uint16_t) * number_entries));
+ req_out->ph.param_type = htons(SCTP_STR_RESET_OUT_REQUEST);
+ req_out->ph.param_length = htons(len);
+ req_out->request_seq = htonl(seq);
+ req_out->response_seq = htonl(resp_seq);
+ req_out->send_reset_at_tsn = htonl(last_sent);
+ if (number_entries) {
+ for (i = 0; i < number_entries; i++) {
+ req_out->list_of_streams[i] = htons(list[i]);
+ }
+ }
+ if (SCTP_SIZE32(len) > len) {
+ /*-
+ * Need to worry about the pad we may end up adding to the
+ * end. This is easy since the struct is either aligned to 4
+ * bytes or 2 bytes off.
+ */
+ req_out->list_of_streams[number_entries] = 0;
+ }
+ /* now fix the chunk length */
+ ch->chunk_length = htons(len + old_len);
+ chk->book_size = len + old_len;
+ chk->book_size_scale = 0;
+ chk->send_size = SCTP_SIZE32(chk->book_size);
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+ return;
+}
+
+
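+/*
+ * Append an incoming stream reset request parameter (asking the peer to
+ * reset its outgoing streams towards us) to the stream-reset chunk in chk.
+ */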
+void
+sctp_add_stream_reset_in(struct sctp_tmit_chunk *chk,
+ int number_entries, uint16_t * list,
+ uint32_t seq)
+{
+ int len, old_len, i;
+ struct sctp_stream_reset_in_request *req_in;
+ struct sctp_chunkhdr *ch;
+
+ ch = mtod(chk->data, struct sctp_chunkhdr *);
+
+
+ old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length));
+
+ /* get to new offset for the param. */
+ req_in = (struct sctp_stream_reset_in_request *)((caddr_t)ch + len);
+ /* now how long will this param be? */
+ len = (sizeof(struct sctp_stream_reset_in_request) + (sizeof(uint16_t) * number_entries));
+ req_in->ph.param_type = htons(SCTP_STR_RESET_IN_REQUEST);
+ req_in->ph.param_length = htons(len);
+ req_in->request_seq = htonl(seq);
+ if (number_entries) {
+ for (i = 0; i < number_entries; i++) {
+ req_in->list_of_streams[i] = htons(list[i]);
+ }
+ }
+ if (SCTP_SIZE32(len) > len) {
+ /*-
+ * Need to worry about the pad we may end up adding to the
+ * end. This is easy since the struct is either aligned to 4
+ * bytes or 2 bytes off.
+ */
+ req_in->list_of_streams[number_entries] = 0;
+ }
+ /* now fix the chunk length */
+ ch->chunk_length = htons(len + old_len);
+ chk->book_size = len + old_len;
+ chk->book_size_scale = 0;
+ chk->send_size = SCTP_SIZE32(chk->book_size);
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+ return;
+}
+
+
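+/*
+ * Append an SSN/TSN reset request parameter to the stream-reset chunk in
+ * chk and adjust the chunk length accordingly.
+ */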
+void
+sctp_add_stream_reset_tsn(struct sctp_tmit_chunk *chk,
+ uint32_t seq)
+{
+ int len, old_len;
+ struct sctp_stream_reset_tsn_request *req_tsn;
+ struct sctp_chunkhdr *ch;
+
+ ch = mtod(chk->data, struct sctp_chunkhdr *);
+
+
+ old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length));
+
+ /* get to new offset for the param. */
+ req_tsn = (struct sctp_stream_reset_tsn_request *)((caddr_t)ch + len);
+ /* now how long will this param be? */
+ len = sizeof(struct sctp_stream_reset_tsn_request);
+ req_tsn->ph.param_type = htons(SCTP_STR_RESET_TSN_REQUEST);
+ req_tsn->ph.param_length = htons(len);
+ req_tsn->request_seq = htonl(seq);
+
+ /* now fix the chunk length */
+ ch->chunk_length = htons(len + old_len);
+ chk->send_size = len + old_len;
+ chk->book_size = SCTP_SIZE32(chk->send_size);
+ chk->book_size_scale = 0;
+ SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size);
+ return;
+}
+
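+/*
+ * Append a stream reset response parameter carrying the result code for
+ * the request identified by resp_seq.
+ */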
+void
+sctp_add_stream_reset_result(struct sctp_tmit_chunk *chk,
+ uint32_t resp_seq, uint32_t result)
+{
+ int len, old_len;
+ struct sctp_stream_reset_response *resp;
+ struct sctp_chunkhdr *ch;
+
+ ch = mtod(chk->data, struct sctp_chunkhdr *);
+
+
+ old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length));
+
+ /* get to new offset for the param. */
+ resp = (struct sctp_stream_reset_response *)((caddr_t)ch + len);
+ /* now how long will this param be? */
+ len = sizeof(struct sctp_stream_reset_response);
+ resp->ph.param_type = htons(SCTP_STR_RESET_RESPONSE);
+ resp->ph.param_length = htons(len);
+ resp->response_seq = htonl(resp_seq);
+	resp->result = htonl(result);
+
+ /* now fix the chunk length */
+ ch->chunk_length = htons(len + old_len);
+ chk->book_size = len + old_len;
+ chk->book_size_scale = 0;
+ chk->send_size = SCTP_SIZE32(chk->book_size);
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+ return;
+
+}
+
+
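+/*
+ * Append a stream reset response parameter that, besides the result code,
+ * reports the sender's and receiver's next TSNs after an SSN/TSN reset.
+ */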
+void
+sctp_add_stream_reset_result_tsn(struct sctp_tmit_chunk *chk,
+ uint32_t resp_seq, uint32_t result,
+ uint32_t send_una, uint32_t recv_next)
+{
+ int len, old_len;
+ struct sctp_stream_reset_response_tsn *resp;
+ struct sctp_chunkhdr *ch;
+
+ ch = mtod(chk->data, struct sctp_chunkhdr *);
+
+
+ old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length));
+
+ /* get to new offset for the param. */
+ resp = (struct sctp_stream_reset_response_tsn *)((caddr_t)ch + len);
+ /* now how long will this param be? */
+ len = sizeof(struct sctp_stream_reset_response_tsn);
+ resp->ph.param_type = htons(SCTP_STR_RESET_RESPONSE);
+ resp->ph.param_length = htons(len);
+ resp->response_seq = htonl(resp_seq);
+ resp->result = htonl(result);
+ resp->senders_next_tsn = htonl(send_una);
+ resp->receivers_next_tsn = htonl(recv_next);
+
+ /* now fix the chunk length */
+ ch->chunk_length = htons(len + old_len);
+ chk->book_size = len + old_len;
+ chk->send_size = SCTP_SIZE32(chk->book_size);
+ chk->book_size_scale = 0;
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+ return;
+}
+
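+/*
+ * Append an "add outgoing streams" parameter requesting that 'adding'
+ * additional streams be opened, and adjust the chunk length.
+ */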
+static void
+sctp_add_a_stream(struct sctp_tmit_chunk *chk,
+ uint32_t seq,
+ uint16_t adding)
+{
+ int len, old_len;
+ struct sctp_chunkhdr *ch;
+ struct sctp_stream_reset_add_strm *addstr;
+
+ ch = mtod(chk->data, struct sctp_chunkhdr *);
+ old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length));
+
+ /* get to new offset for the param. */
+ addstr = (struct sctp_stream_reset_add_strm *)((caddr_t)ch + len);
+ /* now how long will this param be? */
+ len = sizeof(struct sctp_stream_reset_add_strm);
+
+ /* Fill it out. */
+ addstr->ph.param_type = htons(SCTP_STR_RESET_ADD_STREAMS);
+ addstr->ph.param_length = htons(len);
+ addstr->request_seq = htonl(seq);
+ addstr->number_of_streams = htons(adding);
+ addstr->reserved = 0;
+
+ /* now fix the chunk length */
+ ch->chunk_length = htons(len + old_len);
+ chk->send_size = len + old_len;
+ chk->book_size = SCTP_SIZE32(chk->send_size);
+ chk->book_size_scale = 0;
+ SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size);
+ return;
+}
+
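+/*
+ * Build and queue a STREAM-RESET chunk containing any combination of
+ * outgoing-reset, incoming-reset, SSN/TSN-reset and add-stream requests.
+ * Only one stream-reset operation may be outstanding at a time; EBUSY is
+ * returned while a previous request is still unacknowledged.
+ */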
+int
+sctp_send_str_reset_req(struct sctp_tcb *stcb,
+ int number_entries, uint16_t * list,
+ uint8_t send_out_req,
+ uint32_t resp_seq,
+ uint8_t send_in_req,
+ uint8_t send_tsn_req,
+ uint8_t add_stream,
+ uint16_t adding
+)
+{
+
+ struct sctp_association *asoc;
+ struct sctp_tmit_chunk *chk;
+ struct sctp_chunkhdr *ch;
+ uint32_t seq;
+
+ asoc = &stcb->asoc;
+ if (asoc->stream_reset_outstanding) {
+ /*-
+ * Already one pending, must get ACK back to clear the flag.
+ */
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EBUSY);
+ return (EBUSY);
+ }
+ if ((send_out_req == 0) && (send_in_req == 0) && (send_tsn_req == 0) &&
+ (add_stream == 0)) {
+ /* nothing to do */
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ return (EINVAL);
+ }
+ if (send_tsn_req && (send_out_req || send_in_req)) {
+ /* error, can't do that */
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ return (EINVAL);
+ }
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ chk->copy_by_ref = 0;
+ chk->rec.chunk_id.id = SCTP_STREAM_RESET;
+ chk->rec.chunk_id.can_take_data = 0;
+ chk->asoc = &stcb->asoc;
+ chk->book_size = sizeof(struct sctp_chunkhdr);
+ chk->send_size = SCTP_SIZE32(chk->book_size);
+ chk->book_size_scale = 0;
+
+ chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ if (chk->data == NULL) {
+ sctp_free_a_chunk(stcb, chk);
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
+
+ /* setup chunk parameters */
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ chk->whoTo = asoc->primary_destination;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+
+ ch = mtod(chk->data, struct sctp_chunkhdr *);
+ ch->chunk_type = SCTP_STREAM_RESET;
+ ch->chunk_flags = 0;
+ ch->chunk_length = htons(chk->book_size);
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+
+ seq = stcb->asoc.str_reset_seq_out;
+ if (send_out_req) {
+ sctp_add_stream_reset_out(chk, number_entries, list,
+ seq, resp_seq, (stcb->asoc.sending_seq - 1));
+ asoc->stream_reset_out_is_outstanding = 1;
+ seq++;
+ asoc->stream_reset_outstanding++;
+ }
+ if (add_stream) {
+ sctp_add_a_stream(chk, seq, adding);
+ seq++;
+ asoc->stream_reset_outstanding++;
+ }
+ if (send_in_req) {
+ sctp_add_stream_reset_in(chk, number_entries, list, seq);
+ asoc->stream_reset_outstanding++;
+ }
+ if (send_tsn_req) {
+ sctp_add_stream_reset_tsn(chk, seq);
+ asoc->stream_reset_outstanding++;
+ }
+ asoc->str_reset = chk;
+
+ /* insert the chunk for sending */
+ TAILQ_INSERT_TAIL(&asoc->control_send_queue,
+ chk,
+ sctp_next);
+ asoc->ctrl_queue_cnt++;
+ sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo);
+ return (0);
+}
+
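+/*
+ * Build and send an ABORT in reply to a received packet, swapping the
+ * addresses and ports of the incoming IP/IPv6 and SCTP headers.  If vtag
+ * is zero the peer's verification tag is reflected and the chunk is
+ * flagged SCTP_HAD_NO_TCB.  err_cause, if given, is chained on as the
+ * error cause(s); a non-zero port selects UDP encapsulation.
+ */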
+void
+sctp_send_abort(struct mbuf *m, int iphlen, struct sctphdr *sh, uint32_t vtag,
+ struct mbuf *err_cause, uint32_t vrf_id, uint16_t port)
+{
+ /*-
+ * Formulate the abort message, and send it back down.
+ */
+ struct mbuf *o_pak;
+ struct mbuf *mout;
+ struct sctp_abort_msg *abm;
+ struct ip *iph, *iph_out;
+ struct udphdr *udp;
+
+#ifdef INET6
+ struct ip6_hdr *ip6, *ip6_out;
+
+#endif
+ int iphlen_out, len;
+
+ /* don't respond to ABORT with ABORT */
+ if (sctp_is_there_an_abort_here(m, iphlen, &vtag)) {
+ if (err_cause)
+ sctp_m_freem(err_cause);
+ return;
+ }
+ iph = mtod(m, struct ip *);
+ switch (iph->ip_v) {
+ case IPVERSION:
+ len = (sizeof(struct ip) + sizeof(struct sctp_abort_msg));
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ len = (sizeof(struct ip6_hdr) + sizeof(struct sctp_abort_msg));
+ break;
+#endif
+ default:
+ if (err_cause) {
+ sctp_m_freem(err_cause);
+ }
+ return;
+ }
+ if (port) {
+ len += sizeof(struct udphdr);
+ }
+ mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_DONTWAIT, 1, MT_DATA);
+ if (mout == NULL) {
+ if (err_cause) {
+ sctp_m_freem(err_cause);
+ }
+ return;
+ }
+ SCTP_BUF_RESV_UF(mout, max_linkhdr);
+ SCTP_BUF_LEN(mout) = len;
+ SCTP_BUF_NEXT(mout) = err_cause;
+ iph_out = NULL;
+#ifdef INET6
+ ip6_out = NULL;
+#endif
+ switch (iph->ip_v) {
+ case IPVERSION:
+ iph_out = mtod(mout, struct ip *);
+
+ /* Fill in the IP header for the ABORT */
+ iph_out->ip_v = IPVERSION;
+ iph_out->ip_hl = (sizeof(struct ip) / 4);
+ iph_out->ip_tos = (u_char)0;
+ iph_out->ip_id = 0;
+ iph_out->ip_off = 0;
+ iph_out->ip_ttl = MAXTTL;
+ if (port) {
+ iph_out->ip_p = IPPROTO_UDP;
+ } else {
+ iph_out->ip_p = IPPROTO_SCTP;
+ }
+ iph_out->ip_src.s_addr = iph->ip_dst.s_addr;
+ iph_out->ip_dst.s_addr = iph->ip_src.s_addr;
+ /* let IP layer calculate this */
+ iph_out->ip_sum = 0;
+
+ iphlen_out = sizeof(*iph_out);
+ abm = (struct sctp_abort_msg *)((caddr_t)iph_out + iphlen_out);
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ ip6 = (struct ip6_hdr *)iph;
+ ip6_out = mtod(mout, struct ip6_hdr *);
+
+ /* Fill in the IP6 header for the ABORT */
+ ip6_out->ip6_flow = ip6->ip6_flow;
+ ip6_out->ip6_hlim = MODULE_GLOBAL(ip6_defhlim);
+ if (port) {
+ ip6_out->ip6_nxt = IPPROTO_UDP;
+ } else {
+ ip6_out->ip6_nxt = IPPROTO_SCTP;
+ }
+ ip6_out->ip6_src = ip6->ip6_dst;
+ ip6_out->ip6_dst = ip6->ip6_src;
+
+ iphlen_out = sizeof(*ip6_out);
+ abm = (struct sctp_abort_msg *)((caddr_t)ip6_out + iphlen_out);
+ break;
+#endif /* INET6 */
+ default:
+ /* Currently not supported */
+ sctp_m_freem(mout);
+ return;
+ }
+
+ udp = (struct udphdr *)abm;
+ if (port) {
+ udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port));
+ udp->uh_dport = port;
+ /* set udp->uh_ulen later */
+ udp->uh_sum = 0;
+ iphlen_out += sizeof(struct udphdr);
+ abm = (struct sctp_abort_msg *)((caddr_t)abm + sizeof(struct udphdr));
+ }
+ abm->sh.src_port = sh->dest_port;
+ abm->sh.dest_port = sh->src_port;
+ abm->sh.checksum = 0;
+ if (vtag == 0) {
+ abm->sh.v_tag = sh->v_tag;
+ abm->msg.ch.chunk_flags = SCTP_HAD_NO_TCB;
+ } else {
+ abm->sh.v_tag = htonl(vtag);
+ abm->msg.ch.chunk_flags = 0;
+ }
+ abm->msg.ch.chunk_type = SCTP_ABORT_ASSOCIATION;
+
+ if (err_cause) {
+ struct mbuf *m_tmp = err_cause;
+ int err_len = 0;
+
+ /* get length of the err_cause chain */
+ while (m_tmp != NULL) {
+ err_len += SCTP_BUF_LEN(m_tmp);
+ m_tmp = SCTP_BUF_NEXT(m_tmp);
+ }
+ len = SCTP_BUF_LEN(mout) + err_len;
+ if (err_len % 4) {
+ /* need pad at end of chunk */
+ uint32_t cpthis = 0;
+ int padlen;
+
+ padlen = 4 - (len % 4);
+ m_copyback(mout, len, padlen, (caddr_t)&cpthis);
+ len += padlen;
+ }
+ abm->msg.ch.chunk_length = htons(sizeof(abm->msg.ch) + err_len);
+ } else {
+ len = SCTP_BUF_LEN(mout);
+ abm->msg.ch.chunk_length = htons(sizeof(abm->msg.ch));
+ }
+
+ if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) {
+ /* no mbuf's */
+ sctp_m_freem(mout);
+ return;
+ }
+ if (iph_out != NULL) {
+ sctp_route_t ro;
+ int ret;
+
+ /* zap the stack pointer to the route */
+ bzero(&ro, sizeof ro);
+ if (port) {
+ udp->uh_ulen = htons(len - sizeof(struct ip));
+ udp->uh_sum = in_pseudo(iph_out->ip_src.s_addr, iph_out->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP));
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "sctp_send_abort calling ip_output:\n");
+ SCTPDBG_PKT(SCTP_DEBUG_OUTPUT2, iph_out, &abm->sh);
+ /* set IPv4 length */
+ iph_out->ip_len = len;
+ /* out it goes */
+#ifdef SCTP_PACKET_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING)
+ sctp_packet_log(mout, len);
+#endif
+ SCTP_ATTACH_CHAIN(o_pak, mout, len);
+ if (port) {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ abm->sh.checksum = sctp_calculate_cksum(mout, iphlen_out);
+ SCTP_STAT_INCR(sctps_sendswcrc);
+#endif
+ SCTP_ENABLE_UDP_CSUM(o_pak);
+ } else {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ mout->m_pkthdr.csum_flags = CSUM_SCTP;
+ mout->m_pkthdr.csum_data = 0;
+ SCTP_STAT_INCR(sctps_sendhwcrc);
+#endif
+ }
+ SCTP_IP_OUTPUT(ret, o_pak, &ro, NULL, vrf_id);
+
+ /* Free the route if we got one back */
+ if (ro.ro_rt)
+ RTFREE(ro.ro_rt);
+ }
+#ifdef INET6
+ if (ip6_out != NULL) {
+ struct route_in6 ro;
+ int ret;
+ struct ifnet *ifp = NULL;
+
+ /* zap the stack pointer to the route */
+ bzero(&ro, sizeof(ro));
+ if (port) {
+ udp->uh_ulen = htons(len - sizeof(struct ip6_hdr));
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "sctp_send_abort calling ip6_output:\n");
+ SCTPDBG_PKT(SCTP_DEBUG_OUTPUT2, (struct ip *)ip6_out, &abm->sh);
+ ip6_out->ip6_plen = len - sizeof(*ip6_out);
+#ifdef SCTP_PACKET_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING)
+ sctp_packet_log(mout, len);
+#endif
+ SCTP_ATTACH_CHAIN(o_pak, mout, len);
+ if (port) {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ abm->sh.checksum = sctp_calculate_cksum(mout, sizeof(struct ip6_hdr) + sizeof(struct udphdr));
+ SCTP_STAT_INCR(sctps_sendswcrc);
+#endif
+ if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), len - sizeof(struct ip6_hdr))) == 0) {
+ udp->uh_sum = 0xffff;
+ }
+ } else {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ mout->m_pkthdr.csum_flags = CSUM_SCTP;
+ mout->m_pkthdr.csum_data = 0;
+ SCTP_STAT_INCR(sctps_sendhwcrc);
+#endif
+ }
+ SCTP_IP6_OUTPUT(ret, o_pak, &ro, &ifp, NULL, vrf_id);
+
+ /* Free the route if we got one back */
+ if (ro.ro_rt)
+ RTFREE(ro.ro_rt);
+ }
+#endif
+ SCTP_STAT_INCR(sctps_sendpackets);
+ SCTP_STAT_INCR_COUNTER64(sctps_outpackets);
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+}
+
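+/*
+ * Send an OPERATION-ERROR chunk back to the originator of a received
+ * packet, mirroring its addresses and ports.  The mbuf chain scm, if any,
+ * supplies the error cause(s); a non-zero port selects UDP encapsulation.
+ */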
+void
+sctp_send_operr_to(struct mbuf *m, int iphlen, struct mbuf *scm, uint32_t vtag,
+ uint32_t vrf_id, uint16_t port)
+{
+ struct mbuf *o_pak;
+ struct sctphdr *sh, *sh_out;
+ struct sctp_chunkhdr *ch;
+ struct ip *iph, *iph_out;
+ struct udphdr *udp = NULL;
+ struct mbuf *mout;
+
+#ifdef INET6
+ struct ip6_hdr *ip6, *ip6_out;
+
+#endif
+ int iphlen_out, len;
+
+ iph = mtod(m, struct ip *);
+ sh = (struct sctphdr *)((caddr_t)iph + iphlen);
+ switch (iph->ip_v) {
+ case IPVERSION:
+ len = (sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr));
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ len = (sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr));
+ break;
+#endif
+ default:
+ if (scm) {
+ sctp_m_freem(scm);
+ }
+ return;
+ }
+ if (port) {
+ len += sizeof(struct udphdr);
+ }
+ mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_DONTWAIT, 1, MT_DATA);
+ if (mout == NULL) {
+ if (scm) {
+ sctp_m_freem(scm);
+ }
+ return;
+ }
+ SCTP_BUF_RESV_UF(mout, max_linkhdr);
+ SCTP_BUF_LEN(mout) = len;
+ SCTP_BUF_NEXT(mout) = scm;
+ iph_out = NULL;
+#ifdef INET6
+ ip6_out = NULL;
+#endif
+ switch (iph->ip_v) {
+ case IPVERSION:
+ iph_out = mtod(mout, struct ip *);
+
+		/* Fill in the IP header for the OPERATION-ERROR */
+ iph_out->ip_v = IPVERSION;
+ iph_out->ip_hl = (sizeof(struct ip) / 4);
+ iph_out->ip_tos = (u_char)0;
+ iph_out->ip_id = 0;
+ iph_out->ip_off = 0;
+ iph_out->ip_ttl = MAXTTL;
+ if (port) {
+ iph_out->ip_p = IPPROTO_UDP;
+ } else {
+ iph_out->ip_p = IPPROTO_SCTP;
+ }
+ iph_out->ip_src.s_addr = iph->ip_dst.s_addr;
+ iph_out->ip_dst.s_addr = iph->ip_src.s_addr;
+ /* let IP layer calculate this */
+ iph_out->ip_sum = 0;
+
+ iphlen_out = sizeof(struct ip);
+ sh_out = (struct sctphdr *)((caddr_t)iph_out + iphlen_out);
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ ip6 = (struct ip6_hdr *)iph;
+ ip6_out = mtod(mout, struct ip6_hdr *);
+
+		/* Fill in the IP6 header for the OPERATION-ERROR */
+ ip6_out->ip6_flow = ip6->ip6_flow;
+ ip6_out->ip6_hlim = MODULE_GLOBAL(ip6_defhlim);
+ if (port) {
+ ip6_out->ip6_nxt = IPPROTO_UDP;
+ } else {
+ ip6_out->ip6_nxt = IPPROTO_SCTP;
+ }
+ ip6_out->ip6_src = ip6->ip6_dst;
+ ip6_out->ip6_dst = ip6->ip6_src;
+
+ iphlen_out = sizeof(struct ip6_hdr);
+ sh_out = (struct sctphdr *)((caddr_t)ip6_out + iphlen_out);
+ break;
+#endif /* INET6 */
+ default:
+ /* Currently not supported */
+ sctp_m_freem(mout);
+ return;
+ }
+
+ udp = (struct udphdr *)sh_out;
+ if (port) {
+ udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port));
+ udp->uh_dport = port;
+ /* set udp->uh_ulen later */
+ udp->uh_sum = 0;
+ iphlen_out += sizeof(struct udphdr);
+ sh_out = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr));
+ }
+ sh_out->src_port = sh->dest_port;
+ sh_out->dest_port = sh->src_port;
+ sh_out->v_tag = vtag;
+ sh_out->checksum = 0;
+
+ ch = (struct sctp_chunkhdr *)((caddr_t)sh_out + sizeof(struct sctphdr));
+ ch->chunk_type = SCTP_OPERATION_ERROR;
+ ch->chunk_flags = 0;
+
+ if (scm) {
+ struct mbuf *m_tmp = scm;
+ int cause_len = 0;
+
+ /* get length of the err_cause chain */
+ while (m_tmp != NULL) {
+ cause_len += SCTP_BUF_LEN(m_tmp);
+ m_tmp = SCTP_BUF_NEXT(m_tmp);
+ }
+ len = SCTP_BUF_LEN(mout) + cause_len;
+ if (cause_len % 4) {
+ /* need pad at end of chunk */
+ uint32_t cpthis = 0;
+ int padlen;
+
+ padlen = 4 - (len % 4);
+ m_copyback(mout, len, padlen, (caddr_t)&cpthis);
+ len += padlen;
+ }
+ ch->chunk_length = htons(sizeof(struct sctp_chunkhdr) + cause_len);
+ } else {
+ len = SCTP_BUF_LEN(mout);
+ ch->chunk_length = htons(sizeof(struct sctp_chunkhdr));
+ }
+
+ if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) {
+ /* no mbuf's */
+ sctp_m_freem(mout);
+ return;
+ }
+ if (iph_out != NULL) {
+ sctp_route_t ro;
+ int ret;
+
+ /* zap the stack pointer to the route */
+ bzero(&ro, sizeof ro);
+ if (port) {
+ udp->uh_ulen = htons(len - sizeof(struct ip));
+ udp->uh_sum = in_pseudo(iph_out->ip_src.s_addr, iph_out->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP));
+ }
+ /* set IPv4 length */
+ iph_out->ip_len = len;
+ /* out it goes */
+#ifdef SCTP_PACKET_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING)
+ sctp_packet_log(mout, len);
+#endif
+ SCTP_ATTACH_CHAIN(o_pak, mout, len);
+ if (port) {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ sh_out->checksum = sctp_calculate_cksum(mout, iphlen_out);
+ SCTP_STAT_INCR(sctps_sendswcrc);
+#endif
+ SCTP_ENABLE_UDP_CSUM(o_pak);
+ } else {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ mout->m_pkthdr.csum_flags = CSUM_SCTP;
+ mout->m_pkthdr.csum_data = 0;
+ SCTP_STAT_INCR(sctps_sendhwcrc);
+#endif
+ }
+ SCTP_IP_OUTPUT(ret, o_pak, &ro, NULL, vrf_id);
+
+ /* Free the route if we got one back */
+ if (ro.ro_rt)
+ RTFREE(ro.ro_rt);
+ }
+#ifdef INET6
+ if (ip6_out != NULL) {
+ struct route_in6 ro;
+ int ret;
+ struct ifnet *ifp = NULL;
+
+ /* zap the stack pointer to the route */
+ bzero(&ro, sizeof(ro));
+ if (port) {
+ udp->uh_ulen = htons(len - sizeof(struct ip6_hdr));
+ }
+ ip6_out->ip6_plen = len - sizeof(*ip6_out);
+#ifdef SCTP_PACKET_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING)
+ sctp_packet_log(mout, len);
+#endif
+ SCTP_ATTACH_CHAIN(o_pak, mout, len);
+ if (port) {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ sh_out->checksum = sctp_calculate_cksum(mout, sizeof(struct ip6_hdr) + sizeof(struct udphdr));
+ SCTP_STAT_INCR(sctps_sendswcrc);
+#endif
+ if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), len - sizeof(struct ip6_hdr))) == 0) {
+ udp->uh_sum = 0xffff;
+ }
+ } else {
+#if defined(SCTP_WITH_NO_CSUM)
+ SCTP_STAT_INCR(sctps_sendnocrc);
+#else
+ mout->m_pkthdr.csum_flags = CSUM_SCTP;
+ mout->m_pkthdr.csum_data = 0;
+ SCTP_STAT_INCR(sctps_sendhwcrc);
+#endif
+ }
+ SCTP_IP6_OUTPUT(ret, o_pak, &ro, &ifp, NULL, vrf_id);
+
+ /* Free the route if we got one back */
+ if (ro.ro_rt)
+ RTFREE(ro.ro_rt);
+ }
+#endif
+ SCTP_STAT_INCR(sctps_sendpackets);
+ SCTP_STAT_INCR_COUNTER64(sctps_outpackets);
+ SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+}
+
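+/*
+ * Copy up to max_send_len bytes of user data from the uio into a fresh
+ * mbuf chain.  The number of bytes copied and the last mbuf of the chain
+ * are reported through sndout and new_tail.
+ */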
+static struct mbuf *
+sctp_copy_resume(struct sctp_stream_queue_pending *sp,
+ struct uio *uio,
+ struct sctp_sndrcvinfo *srcv,
+ int max_send_len,
+ int user_marks_eor,
+ int *error,
+ uint32_t * sndout,
+ struct mbuf **new_tail)
+{
+ struct mbuf *m;
+
+ m = m_uiotombuf(uio, M_WAITOK, max_send_len, 0,
+ (M_PKTHDR | (user_marks_eor ? M_EOR : 0)));
+ if (m == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ *error = ENOMEM;
+ } else {
+ *sndout = m_length(m, NULL);
+ *new_tail = m_last(m);
+ }
+ return (m);
+}
+
+static int
+sctp_copy_one(struct sctp_stream_queue_pending *sp,
+ struct uio *uio,
+ int resv_upfront)
+{
+ int left;
+
+ left = sp->length;
+ sp->data = m_uiotombuf(uio, M_WAITOK, sp->length,
+ resv_upfront, 0);
+ if (sp->data == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ sp->tail_mbuf = m_last(sp->data);
+ return (0);
+}
+
+
+
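+/*
+ * Allocate a stream queue pending entry and copy the next portion of the
+ * user's message (up to max_send_len) into it from the uio.  The entry
+ * records the send parameters from srcv and is marked complete only when
+ * the whole message, or an explicit EOR, has been copied.
+ */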
+static struct sctp_stream_queue_pending *
+sctp_copy_it_in(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ struct sctp_sndrcvinfo *srcv,
+ struct uio *uio,
+ struct sctp_nets *net,
+ int max_send_len,
+ int user_marks_eor,
+ int *error,
+ int non_blocking)
+{
+ /*-
+ * This routine must be very careful in its work. Protocol
+ * processing is up and running so care must be taken to spl...()
+	 * when you need to do something that may affect the stcb/asoc. The
+ * sb is locked however. When data is copied the protocol processing
+ * should be enabled since this is a slower operation...
+ */
+ struct sctp_stream_queue_pending *sp = NULL;
+ int resv_in_first;
+
+ *error = 0;
+ /* Now can we send this? */
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) ||
+ (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) {
+ /* got data while shutting down */
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET);
+ *error = ECONNRESET;
+ goto out_now;
+ }
+ sctp_alloc_a_strmoq(stcb, sp);
+ if (sp == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ *error = ENOMEM;
+ goto out_now;
+ }
+ sp->act_flags = 0;
+ sp->sender_all_done = 0;
+ sp->sinfo_flags = srcv->sinfo_flags;
+ sp->timetolive = srcv->sinfo_timetolive;
+ sp->ppid = srcv->sinfo_ppid;
+ sp->context = srcv->sinfo_context;
+ sp->strseq = 0;
+ (void)SCTP_GETTIME_TIMEVAL(&sp->ts);
+
+ sp->stream = srcv->sinfo_stream;
+ sp->length = min(uio->uio_resid, max_send_len);
+ if ((sp->length == (uint32_t) uio->uio_resid) &&
+ ((user_marks_eor == 0) ||
+ (srcv->sinfo_flags & SCTP_EOF) ||
+ (user_marks_eor && (srcv->sinfo_flags & SCTP_EOR)))) {
+ sp->msg_is_complete = 1;
+ } else {
+ sp->msg_is_complete = 0;
+ }
+ sp->sender_all_done = 0;
+ sp->some_taken = 0;
+ sp->put_last_out = 0;
+ resv_in_first = sizeof(struct sctp_data_chunk);
+ sp->data = sp->tail_mbuf = NULL;
+ if (sp->length == 0) {
+ *error = 0;
+ goto skip_copy;
+ }
+ sp->auth_keyid = stcb->asoc.authinfo.active_keyid;
+ if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) {
+ sctp_auth_key_acquire(stcb, stcb->asoc.authinfo.active_keyid);
+ sp->holds_key_ref = 1;
+ }
+ *error = sctp_copy_one(sp, uio, resv_in_first);
+skip_copy:
+ if (*error) {
+ sctp_free_a_strmoq(stcb, sp);
+ sp = NULL;
+ } else {
+ if (sp->sinfo_flags & SCTP_ADDR_OVER) {
+ sp->net = net;
+ atomic_add_int(&sp->net->ref_count, 1);
+ } else {
+ sp->net = NULL;
+ }
+ sctp_set_prsctp_policy(sp);
+ }
+out_now:
+ return (sp);
+}
+
+
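+/*
+ * Socket layer send entry point.  Any SCTP_SNDRCV cmsg found in the
+ * control mbuf is used as the send parameters, IPv4-mapped IPv6
+ * destination addresses are converted to plain IPv4 (when both INET and
+ * INET6 are configured), and the request is handed to sctp_lower_sosend().
+ */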
+int
+sctp_sosend(struct socket *so,
+ struct sockaddr *addr,
+ struct uio *uio,
+ struct mbuf *top,
+ struct mbuf *control,
+ int flags,
+ struct thread *p
+)
+{
+ int error, use_rcvinfo = 0;
+ struct sctp_sndrcvinfo srcv;
+ struct sockaddr *addr_to_use;
+
+#if defined(INET) && defined(INET6)
+ struct sockaddr_in sin;
+
+#endif
+
+ if (control) {
+		/* process cmsg snd/rcv info (maybe an assoc-id) */
+ if (sctp_find_cmsg(SCTP_SNDRCV, (void *)&srcv, control,
+ sizeof(srcv))) {
+ /* got one */
+ use_rcvinfo = 1;
+ }
+ }
+ addr_to_use = addr;
+#if defined(INET) && defined(INET6)
+ if ((addr) && (addr->sa_family == AF_INET6)) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)addr;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ in6_sin6_2_sin(&sin, sin6);
+ addr_to_use = (struct sockaddr *)&sin;
+ }
+ }
+#endif
+ error = sctp_lower_sosend(so, addr_to_use, uio, top,
+ control,
+ flags,
+ use_rcvinfo ? &srcv : NULL
+ ,p
+ );
+ return (error);
+}
+
+
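+/*
+ * Main send path.  Locates the association (implicitly setting one up for
+ * UDP-style sockets when needed), validates the send parameters, blocks
+ * until socket-buffer space is available unless non-blocking I/O was
+ * requested, copies the user data onto the stream queues and, when
+ * allowed, kicks off chunk output.
+ */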
+int
+sctp_lower_sosend(struct socket *so,
+ struct sockaddr *addr,
+ struct uio *uio,
+ struct mbuf *i_pak,
+ struct mbuf *control,
+ int flags,
+ struct sctp_sndrcvinfo *srcv
+ ,
+ struct thread *p
+)
+{
+ unsigned int sndlen = 0, max_len;
+ int error, len;
+ struct mbuf *top = NULL;
+ int queue_only = 0, queue_only_for_init = 0;
+ int free_cnt_applied = 0;
+ int un_sent;
+ int now_filled = 0;
+ unsigned int inqueue_bytes = 0;
+ struct sctp_block_entry be;
+ struct sctp_inpcb *inp;
+ struct sctp_tcb *stcb = NULL;
+ struct timeval now;
+ struct sctp_nets *net;
+ struct sctp_association *asoc;
+ struct sctp_inpcb *t_inp;
+ int user_marks_eor;
+ int create_lock_applied = 0;
+ int nagle_applies = 0;
+ int some_on_control = 0;
+ int got_all_of_the_send = 0;
+ int hold_tcblock = 0;
+ int non_blocking = 0;
+ uint32_t local_add_more, local_soresv = 0;
+ uint16_t port;
+ uint16_t sinfo_flags;
+ sctp_assoc_t sinfo_assoc_id;
+
+ error = 0;
+ net = NULL;
+ stcb = NULL;
+ asoc = NULL;
+
+ t_inp = inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ if (i_pak) {
+ SCTP_RELEASE_PKT(i_pak);
+ }
+ return (error);
+ }
+ if ((uio == NULL) && (i_pak == NULL)) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ return (EINVAL);
+ }
+ user_marks_eor = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR);
+ atomic_add_int(&inp->total_sends, 1);
+ if (uio) {
+ if (uio->uio_resid < 0) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ return (EINVAL);
+ }
+ sndlen = uio->uio_resid;
+ } else {
+ top = SCTP_HEADER_TO_CHAIN(i_pak);
+ sndlen = SCTP_HEADER_LEN(i_pak);
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Send called addr:%p send length %d\n",
+ addr,
+ sndlen);
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
+ (inp->sctp_socket->so_qlimit)) {
+ /* The listener can NOT send */
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOTCONN);
+ error = ENOTCONN;
+ goto out_unlocked;
+ }
+ /**
+	 * Pre-screen the address: if one is given, its length field
+	 * (sin_len/sin6_len) must be set correctly!
+ */
+ if (addr) {
+ union sctp_sockstore *raddr = (union sctp_sockstore *)addr;
+
+ switch (raddr->sa.sa_family) {
+#if defined(INET)
+ case AF_INET:
+ if (raddr->sin.sin_len != sizeof(struct sockaddr_in)) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out_unlocked;
+ }
+ port = raddr->sin.sin_port;
+ break;
+#endif
+#if defined(INET6)
+ case AF_INET6:
+ if (raddr->sin6.sin6_len != sizeof(struct sockaddr_in6)) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out_unlocked;
+ }
+ port = raddr->sin6.sin6_port;
+ break;
+#endif
+ default:
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EAFNOSUPPORT);
+ error = EAFNOSUPPORT;
+ goto out_unlocked;
+ }
+ } else
+ port = 0;
+
+ if (srcv) {
+ sinfo_flags = srcv->sinfo_flags;
+ sinfo_assoc_id = srcv->sinfo_assoc_id;
+ if (INVALID_SINFO_FLAG(sinfo_flags) ||
+ PR_SCTP_INVALID_POLICY(sinfo_flags)) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out_unlocked;
+ }
+ if (srcv->sinfo_flags)
+ SCTP_STAT_INCR(sctps_sends_with_flags);
+ } else {
+ sinfo_flags = inp->def_send.sinfo_flags;
+ sinfo_assoc_id = inp->def_send.sinfo_assoc_id;
+ }
+ if (sinfo_flags & SCTP_SENDALL) {
+		/* it's a sendall */
+ error = sctp_sendall(inp, uio, top, srcv);
+ top = NULL;
+ goto out_unlocked;
+ }
+ if ((sinfo_flags & SCTP_ADDR_OVER) && (addr == NULL)) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out_unlocked;
+ }
+ /* now we must find the assoc */
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ SCTP_INP_RLOCK(inp);
+ stcb = LIST_FIRST(&inp->sctp_asoc_list);
+ if (stcb == NULL) {
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOTCONN);
+ error = ENOTCONN;
+ goto out_unlocked;
+ }
+ SCTP_TCB_LOCK(stcb);
+ hold_tcblock = 1;
+ SCTP_INP_RUNLOCK(inp);
+ } else if (sinfo_assoc_id) {
+ stcb = sctp_findassociation_ep_asocid(inp, sinfo_assoc_id, 0);
+ } else if (addr) {
+ /*-
+ * Since we did not use findep we must
+ * increment it, and if we don't find a tcb
+ * decrement it.
+ */
+ SCTP_INP_WLOCK(inp);
+ SCTP_INP_INCR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ stcb = sctp_findassociation_ep_addr(&t_inp, addr, &net, NULL, NULL);
+ if (stcb == NULL) {
+ SCTP_INP_WLOCK(inp);
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ } else {
+ hold_tcblock = 1;
+ }
+ }
+ if ((stcb == NULL) && (addr)) {
+ /* Possible implicit send? */
+ SCTP_ASOC_CREATE_LOCK(inp);
+ create_lock_applied = 1;
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) {
+ /* Should I really unlock ? */
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out_unlocked;
+
+ }
+ if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) &&
+ (addr->sa_family == AF_INET6)) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out_unlocked;
+ }
+ SCTP_INP_WLOCK(inp);
+ SCTP_INP_INCR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ /* With the lock applied look again */
+ stcb = sctp_findassociation_ep_addr(&t_inp, addr, &net, NULL, NULL);
+ if (stcb == NULL) {
+ SCTP_INP_WLOCK(inp);
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ } else {
+ hold_tcblock = 1;
+ }
+ if (t_inp != inp) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOTCONN);
+ error = ENOTCONN;
+ goto out_unlocked;
+ }
+ }
+ if (stcb == NULL) {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOTCONN);
+ error = ENOTCONN;
+ goto out_unlocked;
+ }
+ if (addr == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOENT);
+ error = ENOENT;
+ goto out_unlocked;
+ } else {
+ /*
+ * UDP style, we must go ahead and start the INIT
+ * process
+ */
+ uint32_t vrf_id;
+
+ if ((sinfo_flags & SCTP_ABORT) ||
+ ((sinfo_flags & SCTP_EOF) && (sndlen == 0))) {
+ /*-
+				 * User asks to abort a non-existent assoc,
+				 * or EOF a non-existent assoc with no data
+ */
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOENT);
+ error = ENOENT;
+ goto out_unlocked;
+ }
+ /* get an asoc/stcb struct */
+ vrf_id = inp->def_vrf_id;
+#ifdef INVARIANTS
+ if (create_lock_applied == 0) {
+ panic("Error, should hold create lock and I don't?");
+ }
+#endif
+ stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id,
+ p
+ );
+ if (stcb == NULL) {
+ /* Error is setup for us in the call */
+ goto out_unlocked;
+ }
+ if (create_lock_applied) {
+ SCTP_ASOC_CREATE_UNLOCK(inp);
+ create_lock_applied = 0;
+ } else {
+ SCTP_PRINTF("Huh-3? create lock should have been on??\n");
+ }
+ /*
+ * Turn on queue only flag to prevent data from
+ * being sent
+ */
+ queue_only = 1;
+ asoc = &stcb->asoc;
+ SCTP_SET_STATE(asoc, SCTP_STATE_COOKIE_WAIT);
+ (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered);
+
+ /* initialize authentication params for the assoc */
+ sctp_initialize_auth_params(inp, stcb);
+
+ if (control) {
+ /*
+				 * see if an init structure exists in cmsg
+ * headers
+ */
+ struct sctp_initmsg initm;
+ int i;
+
+ if (sctp_find_cmsg(SCTP_INIT, (void *)&initm, control,
+ sizeof(initm))) {
+ /*
+ * we have an INIT override of the
+ * default
+ */
+ if (initm.sinit_max_attempts)
+ asoc->max_init_times = initm.sinit_max_attempts;
+ if (initm.sinit_num_ostreams)
+ asoc->pre_open_streams = initm.sinit_num_ostreams;
+ if (initm.sinit_max_instreams)
+ asoc->max_inbound_streams = initm.sinit_max_instreams;
+ if (initm.sinit_max_init_timeo)
+ asoc->initial_init_rto_max = initm.sinit_max_init_timeo;
+ if (asoc->streamoutcnt < asoc->pre_open_streams) {
+ struct sctp_stream_out *tmp_str;
+ int had_lock = 0;
+
+ /* Default is NOT correct */
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, defout:%d pre_open:%d\n",
+ asoc->streamoutcnt, asoc->pre_open_streams);
+ /*
+					 * What happens if this
+					 * fails? We fall back to
+					 * the default stream count.
+ */
+
+ if (hold_tcblock) {
+ had_lock = 1;
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ SCTP_MALLOC(tmp_str,
+ struct sctp_stream_out *,
+ (asoc->pre_open_streams *
+ sizeof(struct sctp_stream_out)),
+ SCTP_M_STRMO);
+ if (had_lock) {
+ SCTP_TCB_LOCK(stcb);
+ }
+ if (tmp_str != NULL) {
+ SCTP_FREE(asoc->strmout, SCTP_M_STRMO);
+ asoc->strmout = tmp_str;
+ asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams;
+ } else {
+ asoc->pre_open_streams = asoc->streamoutcnt;
+ }
+ for (i = 0; i < asoc->streamoutcnt; i++) {
+ /*-
+						 * The inbound side must be
+						 * set to 0xffff. Also NOTE:
+						 * when we get the INIT-ACK
+						 * back (for the INIT sender)
+						 * we MUST reduce the count
+						 * (streamoutcnt), but first
+						 * check whether we sent on
+						 * any of the upper streams
+						 * that were dropped (if any
+						 * were). Those that were
+						 * dropped must be reported
+						 * to the upper layer as
+						 * failed to send.
+ */
+ asoc->strmout[i].next_sequence_sent = 0x0;
+ TAILQ_INIT(&asoc->strmout[i].outqueue);
+ asoc->strmout[i].stream_no = i;
+ asoc->strmout[i].last_msg_incomplete = 0;
+ asoc->strmout[i].next_spoke.tqe_next = 0;
+ asoc->strmout[i].next_spoke.tqe_prev = 0;
+ }
+ }
+ }
+ }
+ hold_tcblock = 1;
+ /* out with the INIT */
+ queue_only_for_init = 1;
+ /*-
+ * we may want to dig in after this call and adjust the MTU
+ * value. It defaulted to 1500 (constant) but the ro
+ * structure may now have an update and thus we may need to
+ * change it BEFORE we append the message.
+ */
+ }
+ } else
+ asoc = &stcb->asoc;
+ if (srcv == NULL)
+ srcv = (struct sctp_sndrcvinfo *)&asoc->def_send;
+ if (srcv->sinfo_flags & SCTP_ADDR_OVER) {
+ if (addr)
+ net = sctp_findnet(stcb, addr);
+ else
+ net = NULL;
+ if ((net == NULL) ||
+ ((port != 0) && (port != stcb->rport))) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out_unlocked;
+ }
+ } else {
+ net = stcb->asoc.primary_destination;
+ }
+ atomic_add_int(&stcb->total_sends, 1);
+ /* Keep the stcb from being freed under our feet */
+ atomic_add_int(&asoc->refcnt, 1);
+ free_cnt_applied = 1;
+
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NO_FRAGMENT)) {
+ if (sndlen > asoc->smallest_mtu) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE);
+ error = EMSGSIZE;
+ goto out_unlocked;
+ }
+ }
+ if ((SCTP_SO_IS_NBIO(so)
+ || (flags & MSG_NBIO)
+ )) {
+ non_blocking = 1;
+ }
+ /* would we block? */
+ if (non_blocking) {
+ if (hold_tcblock == 0) {
+ SCTP_TCB_LOCK(stcb);
+ hold_tcblock = 1;
+ }
+ inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk));
+ if ((SCTP_SB_LIMIT_SND(so) < (sndlen + inqueue_bytes + stcb->asoc.sb_send_resv)) ||
+ (stcb->asoc.chunks_on_out_queue >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EWOULDBLOCK);
+ if (sndlen > SCTP_SB_LIMIT_SND(so))
+ error = EMSGSIZE;
+ else
+ error = EWOULDBLOCK;
+ goto out_unlocked;
+ }
+ stcb->asoc.sb_send_resv += sndlen;
+ SCTP_TCB_UNLOCK(stcb);
+ hold_tcblock = 0;
+ } else {
+ atomic_add_int(&stcb->asoc.sb_send_resv, sndlen);
+ }
+ local_soresv = sndlen;
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET);
+ error = ECONNRESET;
+ goto out_unlocked;
+ }
+ if (create_lock_applied) {
+ SCTP_ASOC_CREATE_UNLOCK(inp);
+ create_lock_applied = 0;
+ }
+ if (asoc->stream_reset_outstanding) {
+ /*
+ * Can't queue any data while stream reset is underway.
+ */
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EAGAIN);
+ error = EAGAIN;
+ goto out_unlocked;
+ }
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) {
+ queue_only = 1;
+ }
+ /* we are now done with all control */
+ if (control) {
+ sctp_m_freem(control);
+ control = NULL;
+ }
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) ||
+ (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) {
+ if (srcv->sinfo_flags & SCTP_ABORT) {
+ ;
+ } else {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET);
+ error = ECONNRESET;
+ goto out_unlocked;
+ }
+ }
+#ifndef __rtems__
+ /* Ok, we will attempt a msgsnd :> */
+ if (p) {
+ p->td_ru.ru_msgsnd++;
+ }
+#endif /* __rtems__ */
+ /* Are we aborting? */
+ if (srcv->sinfo_flags & SCTP_ABORT) {
+ struct mbuf *mm;
+ int tot_demand, tot_out = 0, max_out;
+
+ SCTP_STAT_INCR(sctps_sends_with_abort);
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) {
+ /* It has to be up before we abort */
+ /* how big is the user initiated abort? */
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out;
+ }
+ if (hold_tcblock) {
+ SCTP_TCB_UNLOCK(stcb);
+ hold_tcblock = 0;
+ }
+ if (top) {
+ struct mbuf *cntm = NULL;
+
+ mm = sctp_get_mbuf_for_msg(1, 0, M_WAIT, 1, MT_DATA);
+ if (sndlen != 0) {
+ cntm = top;
+ while (cntm) {
+ tot_out += SCTP_BUF_LEN(cntm);
+ cntm = SCTP_BUF_NEXT(cntm);
+ }
+ }
+ tot_demand = (tot_out + sizeof(struct sctp_paramhdr));
+ } else {
+ /* Must fit in a MTU */
+ tot_out = sndlen;
+ tot_demand = (tot_out + sizeof(struct sctp_paramhdr));
+ if (tot_demand > SCTP_DEFAULT_ADD_MORE) {
+				/* Too big */
+ SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE);
+ error = EMSGSIZE;
+ goto out;
+ }
+ mm = sctp_get_mbuf_for_msg(tot_demand, 0, M_WAIT, 1, MT_DATA);
+ }
+ if (mm == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ error = ENOMEM;
+ goto out;
+ }
+ max_out = asoc->smallest_mtu - sizeof(struct sctp_paramhdr);
+ max_out -= sizeof(struct sctp_abort_msg);
+ if (tot_out > max_out) {
+ tot_out = max_out;
+ }
+ if (mm) {
+ struct sctp_paramhdr *ph;
+
+ /* now move forward the data pointer */
+ ph = mtod(mm, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT);
+ ph->param_length = htons((sizeof(struct sctp_paramhdr) + tot_out));
+ ph++;
+ SCTP_BUF_LEN(mm) = tot_out + sizeof(struct sctp_paramhdr);
+ if (top == NULL) {
+ error = uiomove((caddr_t)ph, (int)tot_out, uio);
+ if (error) {
+ /*-
+					 * Here, if we can't get his data we
+					 * still abort; we just don't get to
+					 * send the user's note :-0
+ */
+ sctp_m_freem(mm);
+ mm = NULL;
+ }
+ } else {
+ if (sndlen != 0) {
+ SCTP_BUF_NEXT(mm) = top;
+ }
+ }
+ }
+ if (hold_tcblock == 0) {
+ SCTP_TCB_LOCK(stcb);
+ hold_tcblock = 1;
+ }
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ free_cnt_applied = 0;
+ /* release this lock, otherwise we hang on ourselves */
+ sctp_abort_an_association(stcb->sctp_ep, stcb,
+ SCTP_RESPONSE_TO_USER_REQ,
+ mm, SCTP_SO_LOCKED);
+ /* now relock the stcb so everything is sane */
+ hold_tcblock = 0;
+ stcb = NULL;
+ /*
+		 * In this case top is already chained to mm; avoid a double
+		 * free, since we free it below if top != NULL and the driver
+		 * would free it after sending the packet out.
+ */
+ if (sndlen != 0) {
+ top = NULL;
+ }
+ goto out_unlocked;
+ }
+ /* Calculate the maximum we can send */
+ inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk));
+ if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) {
+ if (non_blocking) {
+ /* we already checked for non-blocking above. */
+ max_len = sndlen;
+ } else {
+ max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes;
+ }
+ } else {
+ max_len = 0;
+ }
+ if (hold_tcblock) {
+ SCTP_TCB_UNLOCK(stcb);
+ hold_tcblock = 0;
+ }
+ /* Is the stream no. valid? */
+ if (srcv->sinfo_stream >= asoc->streamoutcnt) {
+ /* Invalid stream number */
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out_unlocked;
+ }
+ if (asoc->strmout == NULL) {
+ /* huh? software error */
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT);
+ error = EFAULT;
+ goto out_unlocked;
+ }
+ /* Unless E_EOR mode is on, we must make a send FIT in one call. */
+ if ((user_marks_eor == 0) &&
+ (sndlen > SCTP_SB_LIMIT_SND(stcb->sctp_socket))) {
+ /* It will NEVER fit */
+ SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE);
+ error = EMSGSIZE;
+ goto out_unlocked;
+ }
+ if ((uio == NULL) && user_marks_eor) {
+ /*-
+ * We do not support eeor mode for
+ * sending with mbuf chains (like sendfile).
+ */
+ SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out_unlocked;
+ }
+ if (user_marks_eor) {
+ local_add_more = min(SCTP_SB_LIMIT_SND(so), SCTP_BASE_SYSCTL(sctp_add_more_threshold));
+ } else {
+ /*-
+ * For non-eeor the whole message must fit in
+ * the socket send buffer.
+ */
+ local_add_more = sndlen;
+ }
+ len = 0;
+ if (non_blocking) {
+ goto skip_preblock;
+ }
+ if (((max_len <= local_add_more) &&
+ (SCTP_SB_LIMIT_SND(so) >= local_add_more)) ||
+ (max_len == 0) ||
+ ((stcb->asoc.chunks_on_out_queue + stcb->asoc.stream_queue_cnt) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) {
+ /* No room right now ! */
+ SOCKBUF_LOCK(&so->so_snd);
+ inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk));
+ while ((SCTP_SB_LIMIT_SND(so) < (inqueue_bytes + local_add_more)) ||
+ ((stcb->asoc.stream_queue_cnt + stcb->asoc.chunks_on_out_queue) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "pre_block limit:%u <(inq:%d + %d) || (%d+%d > %d)\n",
+ (unsigned int)SCTP_SB_LIMIT_SND(so),
+ inqueue_bytes,
+ local_add_more,
+ stcb->asoc.stream_queue_cnt,
+ stcb->asoc.chunks_on_out_queue,
+ SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue));
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
+ sctp_log_block(SCTP_BLOCK_LOG_INTO_BLKA, so, asoc, sndlen);
+ }
+ be.error = 0;
+ stcb->block_entry = &be;
+ error = sbwait(&so->so_snd);
+ stcb->block_entry = NULL;
+ if (error || so->so_error || be.error) {
+ if (error == 0) {
+ if (so->so_error)
+ error = so->so_error;
+ if (be.error) {
+ error = be.error;
+ }
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto out_unlocked;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
+ sctp_log_block(SCTP_BLOCK_LOG_OUTOF_BLK,
+ so, asoc, stcb->asoc.total_output_queue_size);
+ }
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ goto out_unlocked;
+ }
+ inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk));
+ }
+ if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) {
+ max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes;
+ } else {
+ max_len = 0;
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+ }
+skip_preblock:
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ goto out_unlocked;
+ }
+ /*
+	 * sndlen covers the mbuf case; uio_resid covers the non-mbuf
+	 * case. NOTE: uio will be NULL when top/mbuf is passed.
+ */
+ if (sndlen == 0) {
+ if (srcv->sinfo_flags & SCTP_EOF) {
+ got_all_of_the_send = 1;
+ goto dataless_eof;
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out;
+ }
+ }
+ if (top == NULL) {
+ struct sctp_stream_queue_pending *sp;
+ struct sctp_stream_out *strm;
+ uint32_t sndout;
+
+ SCTP_TCB_SEND_LOCK(stcb);
+ if ((asoc->stream_locked) &&
+ (asoc->stream_locked_on != srcv->sinfo_stream)) {
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out;
+ }
+ SCTP_TCB_SEND_UNLOCK(stcb);
+
+ strm = &stcb->asoc.strmout[srcv->sinfo_stream];
+ if (strm->last_msg_incomplete == 0) {
+ do_a_copy_in:
+ sp = sctp_copy_it_in(stcb, asoc, srcv, uio, net, max_len, user_marks_eor, &error, non_blocking);
+ if ((sp == NULL) || (error)) {
+ goto out;
+ }
+ SCTP_TCB_SEND_LOCK(stcb);
+ if (sp->msg_is_complete) {
+ strm->last_msg_incomplete = 0;
+ asoc->stream_locked = 0;
+ } else {
+ /*
+ * Just got locked to this guy in case of an
+ * interrupt.
+ */
+ strm->last_msg_incomplete = 1;
+ asoc->stream_locked = 1;
+ asoc->stream_locked_on = srcv->sinfo_stream;
+ sp->sender_all_done = 0;
+ }
+ sctp_snd_sb_alloc(stcb, sp->length);
+ atomic_add_int(&asoc->stream_queue_cnt, 1);
+ if ((srcv->sinfo_flags & SCTP_UNORDERED) == 0) {
+ sp->strseq = strm->next_sequence_sent;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_AT_SEND_2_SCTP) {
+ sctp_misc_ints(SCTP_STRMOUT_LOG_ASSIGN,
+ (uintptr_t) stcb, sp->length,
+ (uint32_t) ((srcv->sinfo_stream << 16) | sp->strseq), 0);
+ }
+ strm->next_sequence_sent++;
+ } else {
+ SCTP_STAT_INCR(sctps_sends_with_unord);
+ }
+ TAILQ_INSERT_TAIL(&strm->outqueue, sp, next);
+ if ((strm->next_spoke.tqe_next == NULL) &&
+ (strm->next_spoke.tqe_prev == NULL)) {
+ /* Not on wheel, insert */
+ sctp_insert_on_wheel(stcb, asoc, strm, 1);
+ }
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ } else {
+ SCTP_TCB_SEND_LOCK(stcb);
+ sp = TAILQ_LAST(&strm->outqueue, sctp_streamhead);
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ if (sp == NULL) {
+ /* ???? Huh ??? last msg is gone */
+#ifdef INVARIANTS
+ panic("Warning: Last msg marked incomplete, yet nothing left?");
+#else
+ SCTP_PRINTF("Warning: Last msg marked incomplete, yet nothing left?\n");
+ strm->last_msg_incomplete = 0;
+#endif
+ goto do_a_copy_in;
+
+ }
+ }
+ while (uio->uio_resid > 0) {
+ /* How much room do we have? */
+ struct mbuf *new_tail, *mm;
+
+ if (SCTP_SB_LIMIT_SND(so) > stcb->asoc.total_output_queue_size)
+ max_len = SCTP_SB_LIMIT_SND(so) - stcb->asoc.total_output_queue_size;
+ else
+ max_len = 0;
+
+ if ((max_len > SCTP_BASE_SYSCTL(sctp_add_more_threshold)) ||
+ (max_len && (SCTP_SB_LIMIT_SND(so) < SCTP_BASE_SYSCTL(sctp_add_more_threshold))) ||
+ (uio->uio_resid && (uio->uio_resid <= (int)max_len))) {
+ sndout = 0;
+ new_tail = NULL;
+ if (hold_tcblock) {
+ SCTP_TCB_UNLOCK(stcb);
+ hold_tcblock = 0;
+ }
+ mm = sctp_copy_resume(sp, uio, srcv, max_len, user_marks_eor, &error, &sndout, &new_tail);
+ if ((mm == NULL) || error) {
+ if (mm) {
+ sctp_m_freem(mm);
+ }
+ goto out;
+ }
+ /* Update the mbuf and count */
+ SCTP_TCB_SEND_LOCK(stcb);
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ /*
+ * we need to get out. Peer probably
+ * aborted.
+ */
+ sctp_m_freem(mm);
+ if (stcb->asoc.state & SCTP_PCB_FLAGS_WAS_ABORTED) {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET);
+ error = ECONNRESET;
+ }
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ goto out;
+ }
+ if (sp->tail_mbuf) {
+ /* tack it to the end */
+ SCTP_BUF_NEXT(sp->tail_mbuf) = mm;
+ sp->tail_mbuf = new_tail;
+ } else {
+ /* A stolen mbuf */
+ sp->data = mm;
+ sp->tail_mbuf = new_tail;
+ }
+ sctp_snd_sb_alloc(stcb, sndout);
+ atomic_add_int(&sp->length, sndout);
+ len += sndout;
+
+ /* Did we reach EOR? */
+ if ((uio->uio_resid == 0) &&
+ ((user_marks_eor == 0) ||
+ (srcv->sinfo_flags & SCTP_EOF) ||
+ (user_marks_eor && (srcv->sinfo_flags & SCTP_EOR)))) {
+ sp->msg_is_complete = 1;
+ } else {
+ sp->msg_is_complete = 0;
+ }
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ }
+ if (uio->uio_resid == 0) {
+ /* got it all? */
+ continue;
+ }
+ /* PR-SCTP? */
+ if ((asoc->peer_supports_prsctp) && (asoc->sent_queue_cnt_removeable > 0)) {
+ /*
+ * This is ugly but we must assure locking
+ * order
+ */
+ if (hold_tcblock == 0) {
+ SCTP_TCB_LOCK(stcb);
+ hold_tcblock = 1;
+ }
+ sctp_prune_prsctp(stcb, asoc, srcv, sndlen);
+ inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk));
+ if (SCTP_SB_LIMIT_SND(so) > stcb->asoc.total_output_queue_size)
+ max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes;
+ else
+ max_len = 0;
+ if (max_len > 0) {
+ continue;
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ hold_tcblock = 0;
+ }
+ /* wait for space now */
+ if (non_blocking) {
+			/* Non-blocking I/O in place, get out */
+ goto skip_out_eof;
+ }
+ /* What about the INIT, send it maybe */
+ if (queue_only_for_init) {
+ if (hold_tcblock == 0) {
+ SCTP_TCB_LOCK(stcb);
+ hold_tcblock = 1;
+ }
+ if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) {
+ /* a collision took us forward? */
+ queue_only = 0;
+ } else {
+ sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
+ SCTP_SET_STATE(asoc, SCTP_STATE_COOKIE_WAIT);
+ queue_only = 1;
+ }
+ }
+ if ((net->flight_size > net->cwnd) &&
+ (asoc->sctp_cmt_on_off == 0)) {
+ SCTP_STAT_INCR(sctps_send_cwnd_avoid);
+ queue_only = 1;
+ } else if (asoc->ifp_had_enobuf) {
+ SCTP_STAT_INCR(sctps_ifnomemqueued);
+ if (net->flight_size > (2 * net->mtu)) {
+ queue_only = 1;
+ }
+ asoc->ifp_had_enobuf = 0;
+ }
+ un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) +
+ (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk)));
+ if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) &&
+ (stcb->asoc.total_flight > 0) &&
+ (stcb->asoc.stream_queue_cnt < SCTP_MAX_DATA_BUNDLING) &&
+ (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) {
+
+ /*-
+ * Ok, Nagle is set on and we have data outstanding.
+ * Don't send anything and let SACKs drive out the
+ * data unless we have a "full" segment to send.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) {
+ sctp_log_nagle_event(stcb, SCTP_NAGLE_APPLIED);
+ }
+ SCTP_STAT_INCR(sctps_naglequeued);
+ nagle_applies = 1;
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) {
+ if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY))
+ sctp_log_nagle_event(stcb, SCTP_NAGLE_SKIPPED);
+ }
+ SCTP_STAT_INCR(sctps_naglesent);
+ nagle_applies = 0;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
+
+ sctp_misc_ints(SCTP_CWNDLOG_PRESEND, queue_only_for_init, queue_only,
+ nagle_applies, un_sent);
+ sctp_misc_ints(SCTP_CWNDLOG_PRESEND, stcb->asoc.total_output_queue_size,
+ stcb->asoc.total_flight,
+ stcb->asoc.chunks_on_out_queue, stcb->asoc.total_flight_count);
+ }
+ if (queue_only_for_init)
+ queue_only_for_init = 0;
+ if ((queue_only == 0) && (nagle_applies == 0)) {
+ /*-
+ * need to start chunk output
+ * before blocking.. note that if
+ * a lock is already applied, then
+ * the input via the net is happening
+ * and I don't need to start output :-D
+ */
+ if (hold_tcblock == 0) {
+ if (SCTP_TCB_TRYLOCK(stcb)) {
+ hold_tcblock = 1;
+ sctp_chunk_output(inp,
+ stcb,
+ SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED);
+ }
+ } else {
+ sctp_chunk_output(inp,
+ stcb,
+ SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED);
+ }
+ if (hold_tcblock == 1) {
+ SCTP_TCB_UNLOCK(stcb);
+ hold_tcblock = 0;
+ }
+ }
+ SOCKBUF_LOCK(&so->so_snd);
+ /*-
+ * This is a bit strange, but I think it will
+ * work. The total_output_queue_size is locked and
+ * protected by the TCB_LOCK, which we just released.
+ * There is a race that can occur between releasing it
+ * above, and me getting the socket lock, where sacks
+ * come in but we have not put the SB_WAIT on the
+ * so_snd buffer to get the wakeup. After the LOCK
+ * is applied the sack_processing will also need to
+ * LOCK the so->so_snd to do the actual sowwakeup(). So
+ * once we have the socket buffer lock if we recheck the
+ * size we KNOW we will get to sleep safely with the
+ * wakeup flag in place.
+ */
+ if (SCTP_SB_LIMIT_SND(so) <= (stcb->asoc.total_output_queue_size +
+ min(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTP_SB_LIMIT_SND(so)))) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
+ sctp_log_block(SCTP_BLOCK_LOG_INTO_BLK,
+ so, asoc, uio->uio_resid);
+ }
+ be.error = 0;
+ stcb->block_entry = &be;
+ error = sbwait(&so->so_snd);
+ stcb->block_entry = NULL;
+
+ if (error || so->so_error || be.error) {
+ if (error == 0) {
+ if (so->so_error)
+ error = so->so_error;
+ if (be.error) {
+ error = be.error;
+ }
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto out_unlocked;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
+ sctp_log_block(SCTP_BLOCK_LOG_OUTOF_BLK,
+ so, asoc, stcb->asoc.total_output_queue_size);
+ }
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ goto out_unlocked;
+ }
+ }
+ SCTP_TCB_SEND_LOCK(stcb);
+ if (sp) {
+ if (sp->msg_is_complete == 0) {
+ strm->last_msg_incomplete = 1;
+ asoc->stream_locked = 1;
+ asoc->stream_locked_on = srcv->sinfo_stream;
+ } else {
+ sp->sender_all_done = 1;
+ strm->last_msg_incomplete = 0;
+ asoc->stream_locked = 0;
+ }
+ } else {
+ SCTP_PRINTF("Huh no sp TSNH?\n");
+ strm->last_msg_incomplete = 0;
+ asoc->stream_locked = 0;
+ }
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ if (uio->uio_resid == 0) {
+ got_all_of_the_send = 1;
+ }
+ } else {
+ /* We send in a 0, since we do NOT have any locks */
+ error = sctp_msg_append(stcb, net, top, srcv, 0);
+ top = NULL;
+ if (srcv->sinfo_flags & SCTP_EOF) {
+ /*
+ * This should only happen for Panda in the mbuf
+ * send case, which does NOT yet support EEOR mode.
+ * Thus, we can just set this flag to do the proper
+ * EOF handling.
+ */
+ got_all_of_the_send = 1;
+ }
+ }
+ if (error) {
+ goto out;
+ }
+dataless_eof:
+ /* EOF thing ? */
+ if ((srcv->sinfo_flags & SCTP_EOF) &&
+ (got_all_of_the_send == 1) &&
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) {
+ int cnt;
+
+ SCTP_STAT_INCR(sctps_sends_with_eof);
+ error = 0;
+ if (hold_tcblock == 0) {
+ SCTP_TCB_LOCK(stcb);
+ hold_tcblock = 1;
+ }
+ cnt = sctp_is_there_unsent_data(stcb);
+ if (TAILQ_EMPTY(&asoc->send_queue) &&
+ TAILQ_EMPTY(&asoc->sent_queue) &&
+ (cnt == 0)) {
+ if (asoc->locked_on_sending) {
+ goto abort_anyway;
+ }
+ /* there is nothing queued to send, so I'm done... */
+ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
+ /* only send SHUTDOWN the first time through */
+ sctp_send_shutdown(stcb, stcb->asoc.primary_destination);
+ if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT);
+ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ }
+ } else {
+ /*-
+ * we still got (or just got) data to send, so set
+ * SHUTDOWN_PENDING
+ */
+ /*-
+ * XXX sockets draft says that SCTP_EOF should be
+ * sent with no data. currently, we will allow user
+ * data to be sent first and move to
+ * SHUTDOWN-PENDING
+ */
+ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
+ if (hold_tcblock == 0) {
+ SCTP_TCB_LOCK(stcb);
+ hold_tcblock = 1;
+ }
+ if (asoc->locked_on_sending) {
+ /* Locked to send out the data */
+ struct sctp_stream_queue_pending *sp;
+
+ sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead);
+ if (sp) {
+ if ((sp->length == 0) && (sp->msg_is_complete == 0))
+ asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
+ }
+ }
+ asoc->state |= SCTP_STATE_SHUTDOWN_PENDING;
+ if (TAILQ_EMPTY(&asoc->send_queue) &&
+ TAILQ_EMPTY(&asoc->sent_queue) &&
+ (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) {
+ abort_anyway:
+ if (free_cnt_applied) {
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ free_cnt_applied = 0;
+ }
+ sctp_abort_an_association(stcb->sctp_ep, stcb,
+ SCTP_RESPONSE_TO_USER_REQ,
+ NULL, SCTP_SO_LOCKED);
+ /*
+ * now relock the stcb so everything
+ * is sane
+ */
+ hold_tcblock = 0;
+ stcb = NULL;
+ goto out;
+ }
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_NODELAY);
+ }
+ }
+ }
+skip_out_eof:
+ if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue)) {
+ some_on_control = 1;
+ }
+ if (queue_only_for_init) {
+ if (hold_tcblock == 0) {
+ SCTP_TCB_LOCK(stcb);
+ hold_tcblock = 1;
+ }
+ if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) {
+ /* a collision took us forward? */
+ queue_only = 0;
+ } else {
+ sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
+ SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT);
+ queue_only = 1;
+ }
+ }
+ if ((net->flight_size > net->cwnd) &&
+ (stcb->asoc.sctp_cmt_on_off == 0)) {
+ SCTP_STAT_INCR(sctps_send_cwnd_avoid);
+ queue_only = 1;
+ } else if (asoc->ifp_had_enobuf) {
+ SCTP_STAT_INCR(sctps_ifnomemqueued);
+ if (net->flight_size > (2 * net->mtu)) {
+ queue_only = 1;
+ }
+ asoc->ifp_had_enobuf = 0;
+ }
+ un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) +
+ (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk)));
+ if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) &&
+ (stcb->asoc.total_flight > 0) &&
+ (stcb->asoc.stream_queue_cnt < SCTP_MAX_DATA_BUNDLING) &&
+ (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) {
+ /*-
+ * Ok, Nagle is set on and we have data outstanding.
+ * Don't send anything and let SACKs drive out the
+ * data unless we have a "full" segment to send.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) {
+ sctp_log_nagle_event(stcb, SCTP_NAGLE_APPLIED);
+ }
+ SCTP_STAT_INCR(sctps_naglequeued);
+ nagle_applies = 1;
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) {
+ if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY))
+ sctp_log_nagle_event(stcb, SCTP_NAGLE_SKIPPED);
+ }
+ SCTP_STAT_INCR(sctps_naglesent);
+ nagle_applies = 0;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_CWNDLOG_PRESEND, queue_only_for_init, queue_only,
+ nagle_applies, un_sent);
+ sctp_misc_ints(SCTP_CWNDLOG_PRESEND, stcb->asoc.total_output_queue_size,
+ stcb->asoc.total_flight,
+ stcb->asoc.chunks_on_out_queue, stcb->asoc.total_flight_count);
+ }
+ if (queue_only_for_init)
+ queue_only_for_init = 0;
+ if ((queue_only == 0) && (nagle_applies == 0) && (stcb->asoc.peers_rwnd && un_sent)) {
+ /* we can attempt to send too. */
+ if (hold_tcblock == 0) {
+ /*
+ * If there is activity recv'ing sacks no need to
+ * send
+ */
+ if (SCTP_TCB_TRYLOCK(stcb)) {
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED);
+ hold_tcblock = 1;
+ }
+ } else {
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED);
+ }
+ } else if ((queue_only == 0) &&
+ (stcb->asoc.peers_rwnd == 0) &&
+ (stcb->asoc.total_flight == 0)) {
+ /* We get to have a probe outstanding */
+ if (hold_tcblock == 0) {
+ hold_tcblock = 1;
+ SCTP_TCB_LOCK(stcb);
+ }
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED);
+ } else if (some_on_control) {
+ int num_out, reason, frag_point;
+
+ /* Here we do control only */
+ if (hold_tcblock == 0) {
+ hold_tcblock = 1;
+ SCTP_TCB_LOCK(stcb);
+ }
+ frag_point = sctp_get_frag_point(stcb, &stcb->asoc);
+ (void)sctp_med_chunk_output(inp, stcb, &stcb->asoc, &num_out,
+ &reason, 1, 1, &now, &now_filled, frag_point, SCTP_SO_LOCKED);
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "USR Send complete qo:%d prw:%d unsent:%d tf:%d cooq:%d toqs:%d err:%d\n",
+ queue_only, stcb->asoc.peers_rwnd, un_sent,
+ stcb->asoc.total_flight, stcb->asoc.chunks_on_out_queue,
+ stcb->asoc.total_output_queue_size, error);
+
+out:
+out_unlocked:
+
+ if (local_soresv && stcb) {
+ atomic_subtract_int(&stcb->asoc.sb_send_resv, sndlen);
+ local_soresv = 0;
+ }
+ if (create_lock_applied) {
+ SCTP_ASOC_CREATE_UNLOCK(inp);
+ create_lock_applied = 0;
+ }
+ if ((stcb) && hold_tcblock) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ if (stcb && free_cnt_applied) {
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ }
+#ifdef INVARIANTS
+ if (stcb) {
+ if (mtx_owned(&stcb->tcb_mtx)) {
+ panic("Leaving with tcb mtx owned?");
+ }
+ if (mtx_owned(&stcb->tcb_send_mtx)) {
+ panic("Leaving with tcb send mtx owned?");
+ }
+ }
+#endif
+#ifdef INVARIANTS
+ if (inp) {
+ sctp_validate_no_locks(inp);
+ } else {
+ printf("Warning - inp is NULL so cant validate locks\n");
+ }
+#endif
+ if (top) {
+ sctp_m_freem(top);
+ }
+ if (control) {
+ sctp_m_freem(control);
+ }
+ return (error);
+}
+
+
+/*
+ * generate an AUTHentication chunk, if required
+ */
+struct mbuf *
+sctp_add_auth_chunk(struct mbuf *m, struct mbuf **m_end,
+ struct sctp_auth_chunk **auth_ret, uint32_t * offset,
+ struct sctp_tcb *stcb, uint8_t chunk)
+{
+ struct mbuf *m_auth;
+ struct sctp_auth_chunk *auth;
+ int chunk_len;
+
+ if ((m_end == NULL) || (auth_ret == NULL) || (offset == NULL) ||
+ (stcb == NULL))
+ return (m);
+
+ /* sysctl disabled auth? */
+ if (SCTP_BASE_SYSCTL(sctp_auth_disable))
+ return (m);
+
+ /* peer doesn't do auth... */
+ if (!stcb->asoc.peer_supports_auth) {
+ return (m);
+ }
+ /* does the requested chunk require auth? */
+ if (!sctp_auth_is_required_chunk(chunk, stcb->asoc.peer_auth_chunks)) {
+ return (m);
+ }
+ m_auth = sctp_get_mbuf_for_msg(sizeof(*auth), 0, M_DONTWAIT, 1, MT_HEADER);
+ if (m_auth == NULL) {
+ /* no mbuf's */
+ return (m);
+ }
+ /* reserve some space if this will be the first mbuf */
+ if (m == NULL)
+ SCTP_BUF_RESV_UF(m_auth, SCTP_MIN_OVERHEAD);
+ /* fill in the AUTH chunk details */
+ auth = mtod(m_auth, struct sctp_auth_chunk *);
+ bzero(auth, sizeof(*auth));
+ auth->ch.chunk_type = SCTP_AUTHENTICATION;
+ auth->ch.chunk_flags = 0;
+ chunk_len = sizeof(*auth) +
+ sctp_get_hmac_digest_len(stcb->asoc.peer_hmac_id);
+ auth->ch.chunk_length = htons(chunk_len);
+ auth->hmac_id = htons(stcb->asoc.peer_hmac_id);
+ /* key id and hmac digest will be computed and filled in upon send */
+
+ /* save the offset where the auth was inserted into the chain */
+ if (m != NULL) {
+ struct mbuf *cn;
+
+ *offset = 0;
+ cn = m;
+ while (cn) {
+ *offset += SCTP_BUF_LEN(cn);
+ cn = SCTP_BUF_NEXT(cn);
+ }
+ } else
+ *offset = 0;
+
+ /* update length and return pointer to the auth chunk */
+ SCTP_BUF_LEN(m_auth) = chunk_len;
+ m = sctp_copy_mbufchain(m_auth, m, m_end, 1, chunk_len, 0);
+ if (auth_ret != NULL)
+ *auth_ret = auth;
+
+ return (m);
+}
+
+#ifdef INET6
+int
+sctp_v6src_match_nexthop(struct sockaddr_in6 *src6, sctp_route_t * ro)
+{
+ struct nd_prefix *pfx = NULL;
+ struct nd_pfxrouter *pfxrtr = NULL;
+ struct sockaddr_in6 gw6;
+
+ if (ro == NULL || ro->ro_rt == NULL || src6->sin6_family != AF_INET6)
+ return (0);
+
+ /* get prefix entry of address */
+ LIST_FOREACH(pfx, &MODULE_GLOBAL(nd_prefix), ndpr_entry) {
+ if (pfx->ndpr_stateflags & NDPRF_DETACHED)
+ continue;
+ if (IN6_ARE_MASKED_ADDR_EQUAL(&pfx->ndpr_prefix.sin6_addr,
+ &src6->sin6_addr, &pfx->ndpr_mask))
+ break;
+ }
+ /* no prefix entry in the prefix list */
+ if (pfx == NULL) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "No prefix entry for ");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)src6);
+ return (0);
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "v6src_match_nexthop(), Prefix entry is ");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)src6);
+
+ /* search installed gateway from prefix entry */
+ for (pfxrtr = pfx->ndpr_advrtrs.lh_first; pfxrtr; pfxrtr =
+ pfxrtr->pfr_next) {
+ memset(&gw6, 0, sizeof(struct sockaddr_in6));
+ gw6.sin6_family = AF_INET6;
+ gw6.sin6_len = sizeof(struct sockaddr_in6);
+ memcpy(&gw6.sin6_addr, &pfxrtr->router->rtaddr,
+ sizeof(struct in6_addr));
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "prefix router is ");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)&gw6);
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "installed router is ");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, ro->ro_rt->rt_gateway);
+ if (sctp_cmpaddr((struct sockaddr *)&gw6,
+ ro->ro_rt->rt_gateway)) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "pfxrouter is installed\n");
+ return (1);
+ }
+ }
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "pfxrouter is not installed\n");
+ return (0);
+}
+
+#endif
+
+int
+sctp_v4src_match_nexthop(struct sctp_ifa *sifa, sctp_route_t * ro)
+{
+ struct sockaddr_in *sin, *mask;
+ struct ifaddr *ifa;
+ struct in_addr srcnetaddr, gwnetaddr;
+
+ if (ro == NULL || ro->ro_rt == NULL ||
+ sifa->address.sa.sa_family != AF_INET) {
+ return (0);
+ }
+ ifa = (struct ifaddr *)sifa->ifa;
+ mask = (struct sockaddr_in *)(ifa->ifa_netmask);
+ sin = (struct sockaddr_in *)&sifa->address.sin;
+ srcnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr);
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: src address is ");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa);
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", srcnetaddr.s_addr);
+
+ sin = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
+ gwnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr);
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: nexthop is ");
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, ro->ro_rt->rt_gateway);
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", gwnetaddr.s_addr);
+ if (srcnetaddr.s_addr == gwnetaddr.s_addr) {
+ return (1);
+ }
+ return (0);
+}
diff --git a/freebsd/sys/netinet/sctp_output.h b/freebsd/sys/netinet/sctp_output.h
new file mode 100644
index 00000000..d9051ee7
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_output.h
@@ -0,0 +1,229 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_output.h,v 1.14 2005/03/06 16:04:18 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_output_h__
+#define __sctp_output_h__
+
+#include <freebsd/netinet/sctp_header.h>
+
+#if defined(_KERNEL) || defined(__Userspace__)
+
+
+struct mbuf *
+sctp_add_addresses_to_i_ia(struct sctp_inpcb *inp,
+ struct sctp_scoping *scope,
+ struct mbuf *m_at,
+ int cnt_inits_to);
+
+
+int sctp_is_addr_restricted(struct sctp_tcb *, struct sctp_ifa *);
+
+
+int
+sctp_is_address_in_scope(struct sctp_ifa *ifa,
+ int ipv4_addr_legal,
+ int ipv6_addr_legal,
+ int loopback_scope,
+ int ipv4_local_scope,
+ int local_scope,
+ int site_scope,
+ int do_update);
+int
+ sctp_is_addr_in_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa);
+
+struct sctp_ifa *
+sctp_source_address_selection(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ sctp_route_t * ro, struct sctp_nets *net,
+ int non_asoc_addr_ok, uint32_t vrf_id);
+
+int
+ sctp_v6src_match_nexthop(struct sockaddr_in6 *src6, sctp_route_t * ro);
+int
+ sctp_v4src_match_nexthop(struct sctp_ifa *sifa, sctp_route_t * ro);
+
+void
+sctp_send_initiate(struct sctp_inpcb *, struct sctp_tcb *, int
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+);
+
+void
+sctp_send_initiate_ack(struct sctp_inpcb *, struct sctp_tcb *,
+ struct mbuf *, int, int, struct sctphdr *, struct sctp_init_chunk *,
+ uint32_t, uint16_t, int);
+
+struct mbuf *
+sctp_arethere_unrecognized_parameters(struct mbuf *, int, int *,
+ struct sctp_chunkhdr *, int *);
+void sctp_queue_op_err(struct sctp_tcb *, struct mbuf *);
+
+int
+sctp_send_cookie_echo(struct mbuf *, int, struct sctp_tcb *,
+ struct sctp_nets *);
+
+void sctp_send_cookie_ack(struct sctp_tcb *);
+
+void
+sctp_send_heartbeat_ack(struct sctp_tcb *, struct mbuf *, int, int,
+ struct sctp_nets *);
+
+void
+sctp_remove_from_wheel(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ struct sctp_stream_out *strq, int holds_lock);
+
+
+void sctp_send_shutdown(struct sctp_tcb *, struct sctp_nets *);
+
+void sctp_send_shutdown_ack(struct sctp_tcb *, struct sctp_nets *);
+
+void sctp_send_shutdown_complete(struct sctp_tcb *, struct sctp_nets *, int);
+
+void
+sctp_send_shutdown_complete2(struct mbuf *, int, struct sctphdr *,
+ uint32_t, uint16_t);
+
+void sctp_send_asconf(struct sctp_tcb *, struct sctp_nets *, int addr_locked);
+
+void sctp_send_asconf_ack(struct sctp_tcb *);
+
+int sctp_get_frag_point(struct sctp_tcb *, struct sctp_association *);
+
+void sctp_toss_old_cookies(struct sctp_tcb *, struct sctp_association *);
+
+void sctp_toss_old_asconf(struct sctp_tcb *);
+
+void sctp_fix_ecn_echo(struct sctp_association *);
+
+void sctp_move_chunks_from_net(struct sctp_tcb *stcb, struct sctp_nets *net);
+
+int
+sctp_output(struct sctp_inpcb *, struct mbuf *, struct sockaddr *,
+ struct mbuf *, struct thread *, int);
+
+void
+sctp_insert_on_wheel(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ struct sctp_stream_out *strq, int holdslock);
+
+void
+sctp_chunk_output(struct sctp_inpcb *, struct sctp_tcb *, int, int
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+);
+void
+sctp_send_abort_tcb(struct sctp_tcb *, struct mbuf *, int
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+);
+
+void send_forward_tsn(struct sctp_tcb *, struct sctp_association *);
+
+void sctp_send_sack(struct sctp_tcb *);
+
+int sctp_send_hb(struct sctp_tcb *, int, struct sctp_nets *);
+
+void sctp_send_ecn_echo(struct sctp_tcb *, struct sctp_nets *, uint32_t);
+
+
+void
+sctp_send_packet_dropped(struct sctp_tcb *, struct sctp_nets *, struct mbuf *,
+ int, int);
+
+
+
+void sctp_send_cwr(struct sctp_tcb *, struct sctp_nets *, uint32_t);
+
+
+void
+sctp_add_stream_reset_out(struct sctp_tmit_chunk *chk,
+ int number_entries, uint16_t * list,
+ uint32_t seq, uint32_t resp_seq, uint32_t last_sent);
+
+void
+sctp_add_stream_reset_in(struct sctp_tmit_chunk *chk,
+ int number_entries, uint16_t * list,
+ uint32_t seq);
+
+void
+sctp_add_stream_reset_tsn(struct sctp_tmit_chunk *chk,
+ uint32_t seq);
+
+void
+sctp_add_stream_reset_result(struct sctp_tmit_chunk *chk,
+ uint32_t resp_seq, uint32_t result);
+
+void
+sctp_add_stream_reset_result_tsn(struct sctp_tmit_chunk *chk,
+ uint32_t resp_seq, uint32_t result,
+ uint32_t send_una, uint32_t recv_next);
+
+int
+sctp_send_str_reset_req(struct sctp_tcb *stcb,
+ int number_entries,
+ uint16_t * list,
+ uint8_t send_out_req,
+ uint32_t resp_seq,
+ uint8_t send_in_req,
+ uint8_t send_tsn_req,
+ uint8_t add_str,
+ uint16_t adding);
+
+
+void
+sctp_send_abort(struct mbuf *, int, struct sctphdr *, uint32_t,
+ struct mbuf *, uint32_t, uint16_t);
+
+void sctp_send_operr_to(struct mbuf *, int, struct mbuf *, uint32_t, uint32_t, uint16_t);
+
+#endif /* _KERNEL || __Userspace__ */
+
+#if defined(_KERNEL) || defined (__Userspace__)
+int
+sctp_sosend(struct socket *so,
+ struct sockaddr *addr,
+ struct uio *uio,
+ struct mbuf *top,
+ struct mbuf *control,
+ int flags,
+ struct thread *p
+);
+
+#endif
+#endif
diff --git a/freebsd/sys/netinet/sctp_pcb.c b/freebsd/sys/netinet/sctp_pcb.c
new file mode 100644
index 00000000..fccbda00
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_pcb.c
@@ -0,0 +1,6810 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_pcb.c,v 1.38 2005/03/06 16:04:18 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/netinet/sctp_var.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp.h>
+#include <freebsd/netinet/sctp_header.h>
+#include <freebsd/netinet/sctp_asconf.h>
+#include <freebsd/netinet/sctp_output.h>
+#include <freebsd/netinet/sctp_timer.h>
+#include <freebsd/netinet/sctp_bsd_addr.h>
+#include <freebsd/netinet/udp.h>
+
+
+VNET_DEFINE(struct sctp_base_info, system_base_info);
+
+/* FIX: we don't handle multiple link local scopes */
+/* "scopeless" replacement IN6_ARE_ADDR_EQUAL */
+#ifdef INET6
+int
+SCTP6_ARE_ADDR_EQUAL(struct sockaddr_in6 *a, struct sockaddr_in6 *b)
+{
+ struct sockaddr_in6 tmp_a, tmp_b;
+
+ memcpy(&tmp_a, a, sizeof(struct sockaddr_in6));
+ if (sa6_embedscope(&tmp_a, MODULE_GLOBAL(ip6_use_defzone)) != 0) {
+ return 0;
+ }
+ memcpy(&tmp_b, b, sizeof(struct sockaddr_in6));
+ if (sa6_embedscope(&tmp_b, MODULE_GLOBAL(ip6_use_defzone)) != 0) {
+ return 0;
+ }
+ return (IN6_ARE_ADDR_EQUAL(&tmp_a.sin6_addr, &tmp_b.sin6_addr));
+}
+
+#endif
+
+void
+sctp_fill_pcbinfo(struct sctp_pcbinfo *spcb)
+{
+ /*
+ * We really don't need to lock this, but I will just because it
+ * does not hurt.
+ */
+ SCTP_INP_INFO_RLOCK();
+ spcb->ep_count = SCTP_BASE_INFO(ipi_count_ep);
+ spcb->asoc_count = SCTP_BASE_INFO(ipi_count_asoc);
+ spcb->laddr_count = SCTP_BASE_INFO(ipi_count_laddr);
+ spcb->raddr_count = SCTP_BASE_INFO(ipi_count_raddr);
+ spcb->chk_count = SCTP_BASE_INFO(ipi_count_chunk);
+ spcb->readq_count = SCTP_BASE_INFO(ipi_count_readq);
+ spcb->stream_oque = SCTP_BASE_INFO(ipi_count_strmoq);
+ spcb->free_chunks = SCTP_BASE_INFO(ipi_free_chunks);
+
+ SCTP_INP_INFO_RUNLOCK();
+}
+
+/*
+ * Addresses are added to VRF's (Virtual Routers). For BSD we
+ * have only the default VRF 0. We maintain a hash list of
+ * VRF's. Each VRF has its own list of sctp_ifn's. Each of
+ * these has a list of addresses. When we add a new address
+ * to a VRF we lookup the ifn/ifn_index, if the ifn does
+ * not exist we create it and add it to the list of IFN's
+ * within the VRF. Once we have the sctp_ifn, we add the
+ * address to the list. So we look something like:
+ *
+ * hash-vrf-table
+ * vrf-> ifn-> ifn -> ifn
+ * vrf |
+ * ... +--ifa-> ifa -> ifa
+ * vrf
+ *
+ * We keep these separate lists since the SCTP subsystem will
+ * point to these from its source address selection nets structure.
+ * When an address is deleted it does not happen right away on
+ * the SCTP side, it gets scheduled. What we do when a
+ * delete happens is immediately remove the address from
+ * the master list and decrement the refcount. As our
+ * addip iterator works through and frees the src address
+ * selection pointing to the sctp_ifa, eventually the refcount
+ * will reach 0 and we will delete it. Note that it is assumed
+ * that any locking on system level ifn/ifa is done at the
+ * caller of these functions and these routines will only
+ * lock the SCTP structures as they add or delete things.
+ *
+ * Other notes on VRF concepts.
+ * - An endpoint can be in multiple VRF's
+ * - An association lives within a VRF and only one VRF.
+ * - For any incoming packet we can deduce the VRF by
+ * looking at the inbound mbuf/pak (for BSD it's VRF=0 :D)
+ * - Any downward send call or connect call must supply the
+ * VRF via ancillary data or via some sort of set default
+ * VRF socket option call (again for BSD no brainer since
+ * the VRF is always 0).
+ * - An endpoint may add multiple VRF's to it.
+ * - Listening sockets can accept associations in any
+ * of the VRF's they are in but the assoc will end up
+ * in only one VRF (gotten from the packet or connect/send).
+ *
+ */
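+
+/*
+ * Illustrative sketch only - NOT part of the original source, and the
+ * helper name below is hypothetical.  It shows a linear walk of the
+ * layout described above (vrf hash -> ifn list -> ifa list); the real
+ * lookup, sctp_find_ifa_by_addr(), uses the per-VRF address hash
+ * instead of scanning every ifn.
+ */
+#if 0
+static struct sctp_ifa *
+sctp_walk_vrf_for_addr(uint32_t vrf_id, struct sockaddr *addr)
+{
+ struct sctp_vrf *vrf;
+ struct sctp_ifn *sctp_ifnp;
+ struct sctp_ifa *sctp_ifap;
+
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL)
+ return (NULL);
+ LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) {
+ LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) {
+ if (sctp_cmpaddr(addr, &sctp_ifap->address.sa))
+ return (sctp_ifap);
+ }
+ }
+ return (NULL);
+}
+#endif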
+
+struct sctp_vrf *
+sctp_allocate_vrf(int vrf_id)
+{
+ struct sctp_vrf *vrf = NULL;
+ struct sctp_vrflist *bucket;
+
+ /* First allocate the VRF structure */
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf) {
+ /* Already allocated */
+ return (vrf);
+ }
+ SCTP_MALLOC(vrf, struct sctp_vrf *, sizeof(struct sctp_vrf),
+ SCTP_M_VRF);
+ if (vrf == NULL) {
+ /* No memory */
+#ifdef INVARIANTS
+ panic("No memory for VRF:%d", vrf_id);
+#endif
+ return (NULL);
+ }
+ /* setup the VRF */
+ memset(vrf, 0, sizeof(struct sctp_vrf));
+ vrf->vrf_id = vrf_id;
+ LIST_INIT(&vrf->ifnlist);
+ vrf->total_ifa_count = 0;
+ vrf->refcount = 0;
+ /* now also setup table ids */
+ SCTP_INIT_VRF_TABLEID(vrf);
+ /* Init the HASH of addresses */
+ vrf->vrf_addr_hash = SCTP_HASH_INIT(SCTP_VRF_ADDR_HASH_SIZE,
+ &vrf->vrf_addr_hashmark);
+ if (vrf->vrf_addr_hash == NULL) {
+ /* No memory */
+#ifdef INVARIANTS
+ panic("No memory for VRF:%d", vrf_id);
+#endif
+ SCTP_FREE(vrf, SCTP_M_VRF);
+ return (NULL);
+ }
+ /* Add it to the hash table */
+ bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))];
+ LIST_INSERT_HEAD(bucket, vrf, next_vrf);
+ atomic_add_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1);
+ return (vrf);
+}
+
+
+struct sctp_ifn *
+sctp_find_ifn(void *ifn, uint32_t ifn_index)
+{
+ struct sctp_ifn *sctp_ifnp;
+ struct sctp_ifnlist *hash_ifn_head;
+
+ /*
+ * We assume the lock is held for the addresses; if that's wrong,
+ * problems could occur :-)
+ */
+ hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))];
+ LIST_FOREACH(sctp_ifnp, hash_ifn_head, next_bucket) {
+ if (sctp_ifnp->ifn_index == ifn_index) {
+ return (sctp_ifnp);
+ }
+ if (sctp_ifnp->ifn_p && ifn && (sctp_ifnp->ifn_p == ifn)) {
+ return (sctp_ifnp);
+ }
+ }
+ return (NULL);
+}
+
+
+
+struct sctp_vrf *
+sctp_find_vrf(uint32_t vrf_id)
+{
+ struct sctp_vrflist *bucket;
+ struct sctp_vrf *liste;
+
+ bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))];
+ LIST_FOREACH(liste, bucket, next_vrf) {
+ if (vrf_id == liste->vrf_id) {
+ return (liste);
+ }
+ }
+ return (NULL);
+}
+
+void
+sctp_free_vrf(struct sctp_vrf *vrf)
+{
+ if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&vrf->refcount)) {
+ if (vrf->vrf_addr_hash) {
+ SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark);
+ vrf->vrf_addr_hash = NULL;
+ }
+ /* We zero'd the count */
+ LIST_REMOVE(vrf, next_vrf);
+ SCTP_FREE(vrf, SCTP_M_VRF);
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1);
+ }
+}
+
+void
+sctp_free_ifn(struct sctp_ifn *sctp_ifnp)
+{
+ if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifnp->refcount)) {
+ /* We zero'd the count */
+ if (sctp_ifnp->vrf) {
+ sctp_free_vrf(sctp_ifnp->vrf);
+ }
+ SCTP_FREE(sctp_ifnp, SCTP_M_IFN);
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifns), 1);
+ }
+}
+
+void
+sctp_update_ifn_mtu(uint32_t ifn_index, uint32_t mtu)
+{
+ struct sctp_ifn *sctp_ifnp;
+
+ sctp_ifnp = sctp_find_ifn((void *)NULL, ifn_index);
+ if (sctp_ifnp != NULL) {
+ sctp_ifnp->ifn_mtu = mtu;
+ }
+}
+
+
+void
+sctp_free_ifa(struct sctp_ifa *sctp_ifap)
+{
+ if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifap->refcount)) {
+ /* We zero'd the count */
+ if (sctp_ifap->ifn_p) {
+ sctp_free_ifn(sctp_ifap->ifn_p);
+ }
+ SCTP_FREE(sctp_ifap, SCTP_M_IFA);
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifas), 1);
+ }
+}
+
+static void
+sctp_delete_ifn(struct sctp_ifn *sctp_ifnp, int hold_addr_lock)
+{
+ struct sctp_ifn *found;
+
+ found = sctp_find_ifn(sctp_ifnp->ifn_p, sctp_ifnp->ifn_index);
+ if (found == NULL) {
+ /* Not in the list.. sorry */
+ return;
+ }
+ if (hold_addr_lock == 0)
+ SCTP_IPI_ADDR_WLOCK();
+ LIST_REMOVE(sctp_ifnp, next_bucket);
+ LIST_REMOVE(sctp_ifnp, next_ifn);
+ SCTP_DEREGISTER_INTERFACE(sctp_ifnp->ifn_index,
+ sctp_ifnp->registered_af);
+ if (hold_addr_lock == 0)
+ SCTP_IPI_ADDR_WUNLOCK();
+ /* Take away the reference, and possibly free it */
+ sctp_free_ifn(sctp_ifnp);
+}
+
+void
+sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr,
+ const char *if_name, uint32_t ifn_index)
+{
+ struct sctp_vrf *vrf;
+ struct sctp_ifa *sctp_ifap = NULL;
+
+ SCTP_IPI_ADDR_RLOCK();
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id);
+ goto out;
+
+ }
+ sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED);
+ if (sctp_ifap == NULL) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n");
+ goto out;
+ }
+ if (sctp_ifap->ifn_p == NULL) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unuseable\n");
+ goto out;
+ }
+ if (if_name) {
+ int len1, len2;
+
+ len1 = strlen(if_name);
+ len2 = strlen(sctp_ifap->ifn_p->ifn_name);
+ if (len1 != len2) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "IFN of ifa names different length %d vs %d - ignored\n",
+ len1, len2);
+ goto out;
+ }
+ if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, len1) != 0) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n",
+ sctp_ifap->ifn_p->ifn_name,
+ if_name);
+ goto out;
+ }
+ } else {
+ if (sctp_ifap->ifn_p->ifn_index != ifn_index) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d down command for ifn_index:%d - ignored\n",
+ sctp_ifap->ifn_p->ifn_index, ifn_index);
+ goto out;
+ }
+ }
+
+ sctp_ifap->localifa_flags &= (~SCTP_ADDR_VALID);
+ sctp_ifap->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE;
+out:
+ SCTP_IPI_ADDR_RUNLOCK();
+}
+
+void
+sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr,
+ const char *if_name, uint32_t ifn_index)
+{
+ struct sctp_vrf *vrf;
+ struct sctp_ifa *sctp_ifap = NULL;
+
+ SCTP_IPI_ADDR_RLOCK();
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id);
+ goto out;
+
+ }
+ sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED);
+ if (sctp_ifap == NULL) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n");
+ goto out;
+ }
+ if (sctp_ifap->ifn_p == NULL) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unuseable\n");
+ goto out;
+ }
+ if (if_name) {
+ int len1, len2;
+
+ len1 = strlen(if_name);
+ len2 = strlen(sctp_ifap->ifn_p->ifn_name);
+ if (len1 != len2) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "IFN of ifa names different length %d vs %d - ignored\n",
+ len1, len2);
+ goto out;
+ }
+ if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, len1) != 0) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n",
+ sctp_ifap->ifn_p->ifn_name,
+ if_name);
+ goto out;
+ }
+ } else {
+ if (sctp_ifap->ifn_p->ifn_index != ifn_index) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d down command for ifn_index:%d - ignored\n",
+ sctp_ifap->ifn_p->ifn_index, ifn_index);
+ goto out;
+ }
+ }
+
+ sctp_ifap->localifa_flags &= (~SCTP_ADDR_IFA_UNUSEABLE);
+ sctp_ifap->localifa_flags |= SCTP_ADDR_VALID;
+out:
+ SCTP_IPI_ADDR_RUNLOCK();
+}
+
+/*-
+ * Add an ifa to an ifn.
+ * Register the interface as necessary.
+ * NOTE: ADDR write lock MUST be held.
+ */
+static void
+sctp_add_ifa_to_ifn(struct sctp_ifn *sctp_ifnp, struct sctp_ifa *sctp_ifap)
+{
+ int ifa_af;
+
+ LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa);
+ sctp_ifap->ifn_p = sctp_ifnp;
+ atomic_add_int(&sctp_ifap->ifn_p->refcount, 1);
+ /* update address counts */
+ sctp_ifnp->ifa_count++;
+ ifa_af = sctp_ifap->address.sa.sa_family;
+ if (ifa_af == AF_INET)
+ sctp_ifnp->num_v4++;
+ else
+ sctp_ifnp->num_v6++;
+ if (sctp_ifnp->ifa_count == 1) {
+ /* register the new interface */
+ SCTP_REGISTER_INTERFACE(sctp_ifnp->ifn_index, ifa_af);
+ sctp_ifnp->registered_af = ifa_af;
+ }
+}
+
+/*-
+ * Remove an ifa from its ifn.
+ * If no more addresses exist, remove the ifn too. Otherwise, re-register
+ * the interface based on the remaining address families left.
+ * NOTE: ADDR write lock MUST be held.
+ */
+static void
+sctp_remove_ifa_from_ifn(struct sctp_ifa *sctp_ifap)
+{
+ uint32_t ifn_index;
+
+ LIST_REMOVE(sctp_ifap, next_ifa);
+ if (sctp_ifap->ifn_p) {
+ /* update address counts */
+ sctp_ifap->ifn_p->ifa_count--;
+ if (sctp_ifap->address.sa.sa_family == AF_INET6)
+ sctp_ifap->ifn_p->num_v6--;
+ else if (sctp_ifap->address.sa.sa_family == AF_INET)
+ sctp_ifap->ifn_p->num_v4--;
+
+ ifn_index = sctp_ifap->ifn_p->ifn_index;
+ if (LIST_EMPTY(&sctp_ifap->ifn_p->ifalist)) {
+ /* remove the ifn, possibly freeing it */
+ sctp_delete_ifn(sctp_ifap->ifn_p, SCTP_ADDR_LOCKED);
+ } else {
+ /* re-register address family type, if needed */
+ if ((sctp_ifap->ifn_p->num_v6 == 0) &&
+ (sctp_ifap->ifn_p->registered_af == AF_INET6)) {
+ SCTP_DEREGISTER_INTERFACE(ifn_index, AF_INET6);
+ SCTP_REGISTER_INTERFACE(ifn_index, AF_INET);
+ sctp_ifap->ifn_p->registered_af = AF_INET;
+ } else if ((sctp_ifap->ifn_p->num_v4 == 0) &&
+ (sctp_ifap->ifn_p->registered_af == AF_INET)) {
+ SCTP_DEREGISTER_INTERFACE(ifn_index, AF_INET);
+ SCTP_REGISTER_INTERFACE(ifn_index, AF_INET6);
+ sctp_ifap->ifn_p->registered_af = AF_INET6;
+ }
+ /* free the ifn refcount */
+ sctp_free_ifn(sctp_ifap->ifn_p);
+ }
+ sctp_ifap->ifn_p = NULL;
+ }
+}
+
+struct sctp_ifa *
+sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index,
+ uint32_t ifn_type, const char *if_name, void *ifa,
+ struct sockaddr *addr, uint32_t ifa_flags,
+ int dynamic_add)
+{
+ struct sctp_vrf *vrf;
+ struct sctp_ifn *sctp_ifnp = NULL;
+ struct sctp_ifa *sctp_ifap = NULL;
+ struct sctp_ifalist *hash_addr_head;
+ struct sctp_ifnlist *hash_ifn_head;
+ uint32_t hash_of_addr;
+ int new_ifn_af = 0;
+
+#ifdef SCTP_DEBUG
+ SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: adding address: ", vrf_id);
+ SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr);
+#endif
+ SCTP_IPI_ADDR_WLOCK();
+ sctp_ifnp = sctp_find_ifn(ifn, ifn_index);
+ if (sctp_ifnp) {
+ vrf = sctp_ifnp->vrf;
+ } else {
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL) {
+ vrf = sctp_allocate_vrf(vrf_id);
+ if (vrf == NULL) {
+ SCTP_IPI_ADDR_WUNLOCK();
+ return (NULL);
+ }
+ }
+ }
+ if (sctp_ifnp == NULL) {
+ /*
+ * build one and add it; can't hold the lock until after the
+ * malloc is done, though.
+ */
+ SCTP_IPI_ADDR_WUNLOCK();
+ SCTP_MALLOC(sctp_ifnp, struct sctp_ifn *,
+ sizeof(struct sctp_ifn), SCTP_M_IFN);
+ if (sctp_ifnp == NULL) {
+#ifdef INVARIANTS
+ panic("No memory for IFN");
+#endif
+ return (NULL);
+ }
+ memset(sctp_ifnp, 0, sizeof(struct sctp_ifn));
+ sctp_ifnp->ifn_index = ifn_index;
+ sctp_ifnp->ifn_p = ifn;
+ sctp_ifnp->ifn_type = ifn_type;
+ sctp_ifnp->refcount = 0;
+ sctp_ifnp->vrf = vrf;
+ atomic_add_int(&vrf->refcount, 1);
+ sctp_ifnp->ifn_mtu = SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index, addr->sa_family);
+ if (if_name != NULL) {
+ memcpy(sctp_ifnp->ifn_name, if_name, SCTP_IFNAMSIZ);
+ } else {
+ memcpy(sctp_ifnp->ifn_name, "unknown", min(7, SCTP_IFNAMSIZ));
+ }
+ hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))];
+ LIST_INIT(&sctp_ifnp->ifalist);
+ SCTP_IPI_ADDR_WLOCK();
+ LIST_INSERT_HEAD(hash_ifn_head, sctp_ifnp, next_bucket);
+ LIST_INSERT_HEAD(&vrf->ifnlist, sctp_ifnp, next_ifn);
+ atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifns), 1);
+ new_ifn_af = 1;
+ }
+ sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED);
+ if (sctp_ifap) {
+ /* Hmm, it already exists? */
+ if ((sctp_ifap->ifn_p) &&
+ (sctp_ifap->ifn_p->ifn_index == ifn_index)) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "Using existing ifn %s (0x%x) for ifa %p\n",
+ sctp_ifap->ifn_p->ifn_name, ifn_index,
+ sctp_ifap);
+ if (new_ifn_af) {
+ /* Remove the created one that we don't want */
+ sctp_delete_ifn(sctp_ifnp, SCTP_ADDR_LOCKED);
+ }
+ if (sctp_ifap->localifa_flags & SCTP_BEING_DELETED) {
+ /* easy to solve, just switch back to active */
+ SCTPDBG(SCTP_DEBUG_PCB4, "Clearing deleted ifa flag\n");
+ sctp_ifap->localifa_flags = SCTP_ADDR_VALID;
+ sctp_ifap->ifn_p = sctp_ifnp;
+ atomic_add_int(&sctp_ifap->ifn_p->refcount, 1);
+ }
+ exit_stage_left:
+ SCTP_IPI_ADDR_WUNLOCK();
+ return (sctp_ifap);
+ } else {
+ if (sctp_ifap->ifn_p) {
+ /*
+ * The last IFN gets the address, remove the
+ * old one
+ */
+ SCTPDBG(SCTP_DEBUG_PCB4, "Moving ifa %p from %s (0x%x) to %s (0x%x)\n",
+ sctp_ifap, sctp_ifap->ifn_p->ifn_name,
+ sctp_ifap->ifn_p->ifn_index, if_name,
+ ifn_index);
+ /* remove the address from the old ifn */
+ sctp_remove_ifa_from_ifn(sctp_ifap);
+ /* move the address over to the new ifn */
+ sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap);
+ goto exit_stage_left;
+ } else {
+ /* repair ifnp which was NULL ? */
+ sctp_ifap->localifa_flags = SCTP_ADDR_VALID;
+ SCTPDBG(SCTP_DEBUG_PCB4, "Repairing ifn %p for ifa %p\n",
+ sctp_ifnp, sctp_ifap);
+ sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap);
+ }
+ goto exit_stage_left;
+ }
+ }
+ SCTP_IPI_ADDR_WUNLOCK();
+ SCTP_MALLOC(sctp_ifap, struct sctp_ifa *, sizeof(struct sctp_ifa), SCTP_M_IFA);
+ if (sctp_ifap == NULL) {
+#ifdef INVARIANTS
+ panic("No memory for IFA");
+#endif
+ return (NULL);
+ }
+ memset(sctp_ifap, 0, sizeof(struct sctp_ifa));
+ sctp_ifap->ifn_p = sctp_ifnp;
+ atomic_add_int(&sctp_ifnp->refcount, 1);
+ sctp_ifap->vrf_id = vrf_id;
+ sctp_ifap->ifa = ifa;
+ memcpy(&sctp_ifap->address, addr, addr->sa_len);
+ sctp_ifap->localifa_flags = SCTP_ADDR_VALID | SCTP_ADDR_DEFER_USE;
+ sctp_ifap->flags = ifa_flags;
+ /* Set scope */
+ switch (sctp_ifap->address.sa.sa_family) {
+ case AF_INET:
+ {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&sctp_ifap->address.sin;
+ if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) ||
+ (IN4_ISLOOPBACK_ADDRESS(&sin->sin_addr))) {
+ sctp_ifap->src_is_loop = 1;
+ }
+ if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) {
+ sctp_ifap->src_is_priv = 1;
+ }
+ sctp_ifnp->num_v4++;
+ if (new_ifn_af)
+ new_ifn_af = AF_INET;
+ break;
+ }
+#ifdef INET6
+ case AF_INET6:
+ {
+ /* ok to use deprecated addresses? */
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&sctp_ifap->address.sin6;
+ if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) ||
+ (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))) {
+ sctp_ifap->src_is_loop = 1;
+ }
+ if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+ sctp_ifap->src_is_priv = 1;
+ }
+ sctp_ifnp->num_v6++;
+ if (new_ifn_af)
+ new_ifn_af = AF_INET6;
+ break;
+ }
+#endif
+ default:
+ new_ifn_af = 0;
+ break;
+ }
+ hash_of_addr = sctp_get_ifa_hash_val(&sctp_ifap->address.sa);
+
+ if ((sctp_ifap->src_is_priv == 0) &&
+ (sctp_ifap->src_is_loop == 0)) {
+ sctp_ifap->src_is_glob = 1;
+ }
+ SCTP_IPI_ADDR_WLOCK();
+ hash_addr_head = &vrf->vrf_addr_hash[(hash_of_addr & vrf->vrf_addr_hashmark)];
+ LIST_INSERT_HEAD(hash_addr_head, sctp_ifap, next_bucket);
+ sctp_ifap->refcount = 1;
+ LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa);
+ sctp_ifnp->ifa_count++;
+ vrf->total_ifa_count++;
+ atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifas), 1);
+ if (new_ifn_af) {
+ SCTP_REGISTER_INTERFACE(ifn_index, new_ifn_af);
+ sctp_ifnp->registered_af = new_ifn_af;
+ }
+ SCTP_IPI_ADDR_WUNLOCK();
+ if (dynamic_add) {
+ /*
+ * Bump up the refcount so that when the timer completes it
+ * will drop back down.
+ */
+ struct sctp_laddr *wi;
+
+ atomic_add_int(&sctp_ifap->refcount, 1);
+ wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
+ if (wi == NULL) {
+ /*
+ * Gak, what can we do? We have lost an address
+ * change. Can you say HOSED?
+ */
+ SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n");
+ /* Oops, must decrement the count */
+ sctp_del_addr_from_vrf(vrf_id, addr, ifn_index,
+ if_name);
+ return (NULL);
+ }
+ SCTP_INCR_LADDR_COUNT();
+ bzero(wi, sizeof(*wi));
+ (void)SCTP_GETTIME_TIMEVAL(&wi->start_time);
+ wi->ifa = sctp_ifap;
+ wi->action = SCTP_ADD_IP_ADDRESS;
+
+ SCTP_WQ_ADDR_LOCK();
+ LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr);
+ SCTP_WQ_ADDR_UNLOCK();
+
+ sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ,
+ (struct sctp_inpcb *)NULL,
+ (struct sctp_tcb *)NULL,
+ (struct sctp_nets *)NULL);
+ } else {
+ /* it's ready for use */
+ sctp_ifap->localifa_flags &= ~SCTP_ADDR_DEFER_USE;
+ }
+ return (sctp_ifap);
+}
+
+void
+sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr,
+ uint32_t ifn_index, const char *if_name)
+{
+ struct sctp_vrf *vrf;
+ struct sctp_ifa *sctp_ifap = NULL;
+
+ SCTP_IPI_ADDR_WLOCK();
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id);
+ goto out_now;
+ }
+#ifdef SCTP_DEBUG
+ SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: deleting address:", vrf_id);
+ SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr);
+#endif
+ sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED);
+ if (sctp_ifap) {
+ /* Validate the delete */
+ if (sctp_ifap->ifn_p) {
+ int valid = 0;
+
+ /*-
+ * The name has priority over the ifn_index
+ * if it's given. We do this especially for
+ * panda who might recycle indexes fast.
+ */
+ if (if_name) {
+ int len1, len2;
+
+ len1 = min(SCTP_IFNAMSIZ, strlen(if_name));
+ len2 = min(SCTP_IFNAMSIZ, strlen(sctp_ifap->ifn_p->ifn_name));
+ if (len1 && len2 && (len1 == len2)) {
+ /* we can compare them */
+ if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, len1) == 0) {
+ /*
+ * They match; it's a correct
+ * delete
+ */
+ valid = 1;
+ }
+ }
+ }
+ if (!valid) {
+ /* last ditch check ifn_index */
+ if (ifn_index == sctp_ifap->ifn_p->ifn_index) {
+ valid = 1;
+ }
+ }
+ if (!valid) {
+ SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s does not match addresses\n",
+ ifn_index, ((if_name == NULL) ? "NULL" : if_name));
+ SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s - ignoring delete\n",
+ sctp_ifap->ifn_p->ifn_index, sctp_ifap->ifn_p->ifn_name);
+ SCTP_IPI_ADDR_WUNLOCK();
+ return;
+ }
+ }
+ SCTPDBG(SCTP_DEBUG_PCB4, "Deleting ifa %p\n", sctp_ifap);
+ sctp_ifap->localifa_flags &= SCTP_ADDR_VALID;
+ sctp_ifap->localifa_flags |= SCTP_BEING_DELETED;
+ vrf->total_ifa_count--;
+ LIST_REMOVE(sctp_ifap, next_bucket);
+ sctp_remove_ifa_from_ifn(sctp_ifap);
+ }
+#ifdef SCTP_DEBUG
+ else {
+ SCTPDBG(SCTP_DEBUG_PCB4, "Del Addr-ifn:%d Could not find address:",
+ ifn_index);
+ SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr);
+ }
+#endif
+
+out_now:
+ SCTP_IPI_ADDR_WUNLOCK();
+ if (sctp_ifap) {
+ struct sctp_laddr *wi;
+
+ wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
+ if (wi == NULL) {
+ /*
+ * Gak, what can we do? We have lost an address
+ * change. Can you say HOSED?
+ */
+ SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n");
+
+ /* Oops, must decrement the count */
+ sctp_free_ifa(sctp_ifap);
+ return;
+ }
+ SCTP_INCR_LADDR_COUNT();
+ bzero(wi, sizeof(*wi));
+ (void)SCTP_GETTIME_TIMEVAL(&wi->start_time);
+ wi->ifa = sctp_ifap;
+ wi->action = SCTP_DEL_IP_ADDRESS;
+ SCTP_WQ_ADDR_LOCK();
+ /*
+ * Should this really be a tailq? As it is we will process
+ * the newest first :-0
+ */
+ LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr);
+ SCTP_WQ_ADDR_UNLOCK();
+
+ sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ,
+ (struct sctp_inpcb *)NULL,
+ (struct sctp_tcb *)NULL,
+ (struct sctp_nets *)NULL);
+ }
+ return;
+}
+
+
+static struct sctp_tcb *
+sctp_tcb_special_locate(struct sctp_inpcb **inp_p, struct sockaddr *from,
+ struct sockaddr *to, struct sctp_nets **netp, uint32_t vrf_id)
+{
+ /**** ASSUMES THE CALLER holds the INP_INFO_RLOCK */
+ /*
+ * If we support the TCP model, then we must now dig through to see
+ * if we can find our endpoint in the list of tcp ep's.
+ */
+ uint16_t lport, rport;
+ struct sctppcbhead *ephead;
+ struct sctp_inpcb *inp;
+ struct sctp_laddr *laddr;
+ struct sctp_tcb *stcb;
+ struct sctp_nets *net;
+
+ if ((to == NULL) || (from == NULL)) {
+ return (NULL);
+ }
+ if (to->sa_family == AF_INET && from->sa_family == AF_INET) {
+ lport = ((struct sockaddr_in *)to)->sin_port;
+ rport = ((struct sockaddr_in *)from)->sin_port;
+ } else if (to->sa_family == AF_INET6 && from->sa_family == AF_INET6) {
+ lport = ((struct sockaddr_in6 *)to)->sin6_port;
+ rport = ((struct sockaddr_in6 *)from)->sin6_port;
+ } else {
+ return NULL;
+ }
+ ephead = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))];
+ /*
+ * Ok now for each of the guys in this bucket we must look and see:
+ * - Does the remote port match. - Does their single association's
+ * addresses match this address (to). If so we update p_ep to point
+ * to this ep and return the tcb from it.
+ */
+ LIST_FOREACH(inp, ephead, sctp_hash) {
+ SCTP_INP_RLOCK(inp);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ if (lport != inp->sctp_lport) {
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ if (inp->def_vrf_id != vrf_id) {
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ /* check to see if the ep has one of the addresses */
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) {
+ /* We are NOT bound all, so look further */
+ int match = 0;
+
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+
+ if (laddr->ifa == NULL) {
+ SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __FUNCTION__);
+ continue;
+ }
+ if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) {
+ SCTPDBG(SCTP_DEBUG_PCB1, "ifa being deleted\n");
+ continue;
+ }
+ if (laddr->ifa->address.sa.sa_family ==
+ to->sa_family) {
+ /* see if it matches */
+ struct sockaddr_in *intf_addr, *sin;
+
+ intf_addr = &laddr->ifa->address.sin;
+ sin = (struct sockaddr_in *)to;
+ if (from->sa_family == AF_INET) {
+ if (sin->sin_addr.s_addr ==
+ intf_addr->sin_addr.s_addr) {
+ match = 1;
+ break;
+ }
+ }
+#ifdef INET6
+ if (from->sa_family == AF_INET6) {
+ struct sockaddr_in6 *intf_addr6;
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)
+ to;
+ intf_addr6 = &laddr->ifa->address.sin6;
+
+ if (SCTP6_ARE_ADDR_EQUAL(sin6,
+ intf_addr6)) {
+ match = 1;
+ break;
+ }
+ }
+#endif
+ }
+ }
+ if (match == 0) {
+ /* This endpoint does not have this address */
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ }
+ /*
+ * Ok if we hit here the ep has the address, does it hold
+ * the tcb?
+ */
+
+ stcb = LIST_FIRST(&inp->sctp_asoc_list);
+ if (stcb == NULL) {
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ SCTP_TCB_LOCK(stcb);
+ if (stcb->rport != rport) {
+ /* remote port does not match. */
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ /* Does this TCB have a matching address? */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+
+ if (net->ro._l_addr.sa.sa_family != from->sa_family) {
+ /* not the same family, can't be a match */
+ continue;
+ }
+ switch (from->sa_family) {
+ case AF_INET:
+ {
+ struct sockaddr_in *sin, *rsin;
+
+ sin = (struct sockaddr_in *)&net->ro._l_addr;
+ rsin = (struct sockaddr_in *)from;
+ if (sin->sin_addr.s_addr ==
+ rsin->sin_addr.s_addr) {
+ /* found it */
+ if (netp != NULL) {
+ *netp = net;
+ }
+ /*
+ * Update the endpoint
+ * pointer
+ */
+ *inp_p = inp;
+ SCTP_INP_RUNLOCK(inp);
+ return (stcb);
+ }
+ break;
+ }
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sockaddr_in6 *sin6, *rsin6;
+
+ sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
+ rsin6 = (struct sockaddr_in6 *)from;
+ if (SCTP6_ARE_ADDR_EQUAL(sin6,
+ rsin6)) {
+ /* found it */
+ if (netp != NULL) {
+ *netp = net;
+ }
+ /*
+ * Update the endpoint
+ * pointer
+ */
+ *inp_p = inp;
+ SCTP_INP_RUNLOCK(inp);
+ return (stcb);
+ }
+ break;
+ }
+#endif
+ default:
+ /* TSNH */
+ break;
+ }
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_INP_RUNLOCK(inp);
+ }
+ return (NULL);
+}
+
+static int
+sctp_does_stcb_own_this_addr(struct sctp_tcb *stcb, struct sockaddr *to)
+{
+ int loopback_scope, ipv4_local_scope, local_scope, site_scope;
+ int ipv4_addr_legal, ipv6_addr_legal;
+ struct sctp_vrf *vrf;
+ struct sctp_ifn *sctp_ifn;
+ struct sctp_ifa *sctp_ifa;
+
+ loopback_scope = stcb->asoc.loopback_scope;
+ ipv4_local_scope = stcb->asoc.ipv4_local_scope;
+ local_scope = stcb->asoc.local_scope;
+ site_scope = stcb->asoc.site_scope;
+ ipv4_addr_legal = ipv6_addr_legal = 0;
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ ipv6_addr_legal = 1;
+ if (SCTP_IPV6_V6ONLY(stcb->sctp_ep) == 0) {
+ ipv4_addr_legal = 1;
+ }
+ } else {
+ ipv4_addr_legal = 1;
+ }
+
+ SCTP_IPI_ADDR_RLOCK();
+ vrf = sctp_find_vrf(stcb->asoc.vrf_id);
+ if (vrf == NULL) {
+ /* no vrf, no addresses */
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (0);
+ }
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
+ if ((loopback_scope == 0) &&
+ SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
+ continue;
+ }
+ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+ if (sctp_is_addr_restricted(stcb, sctp_ifa))
+ continue;
+ switch (sctp_ifa->address.sa.sa_family) {
+#ifdef INET
+ case AF_INET:
+ if (ipv4_addr_legal) {
+ struct sockaddr_in *sin,
+ *rsin;
+
+ sin = &sctp_ifa->address.sin;
+ rsin = (struct sockaddr_in *)to;
+ if ((ipv4_local_scope == 0) &&
+ IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) {
+ continue;
+ }
+ if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) {
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (1);
+ }
+ }
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ if (ipv6_addr_legal) {
+ struct sockaddr_in6 *sin6,
+ *rsin6;
+
+ sin6 = &sctp_ifa->address.sin6;
+ rsin6 = (struct sockaddr_in6 *)to;
+ if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+ if (local_scope == 0)
+ continue;
+ if (sin6->sin6_scope_id == 0) {
+ if (sa6_recoverscope(sin6) != 0)
+ continue;
+ }
+ }
+ if ((site_scope == 0) &&
+ (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) {
+ continue;
+ }
+ if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) {
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (1);
+ }
+ }
+ break;
+#endif
+ default:
+ /* TSNH */
+ break;
+ }
+ }
+ }
+ } else {
+ struct sctp_laddr *laddr;
+
+ LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) {
+ if (sctp_is_addr_restricted(stcb, laddr->ifa)) {
+ continue;
+ }
+ if (laddr->ifa->address.sa.sa_family != to->sa_family) {
+ continue;
+ }
+ switch (to->sa_family) {
+#ifdef INET
+ case AF_INET:
+ {
+ struct sockaddr_in *sin, *rsin;
+
+ sin = (struct sockaddr_in *)&laddr->ifa->address.sin;
+ rsin = (struct sockaddr_in *)to;
+ if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) {
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (1);
+ }
+ break;
+ }
+#endif
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sockaddr_in6 *sin6, *rsin6;
+
+ sin6 = (struct sockaddr_in6 *)&laddr->ifa->address.sin6;
+ rsin6 = (struct sockaddr_in6 *)to;
+ if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) {
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (1);
+ }
+ break;
+ }
+
+#endif
+ default:
+ /* TSNH */
+ break;
+ }
+
+ }
+ }
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (0);
+}
+
+/*
+ * rules for use
+ *
+ * 1) If I return a NULL you must decrement any INP ref cnt. 2) If I find an
+ * stcb, both will be locked (locked_tcb and stcb) but decrement will be done
+ * (if locked == NULL). 3) Decrement happens on return ONLY if locked ==
+ * NULL.
+ */
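+
+/*
+ * Illustrative caller sketch only - NOT part of the original source; the
+ * wrapper name below is hypothetical.  It shows rules 1 and 2 above for
+ * a call made with locked_tcb == NULL.
+ */
+#if 0
+static struct sctp_tcb *
+sctp_lookup_example(struct sctp_inpcb *inp, struct sockaddr *remote)
+{
+ struct sctp_nets *net = NULL;
+ struct sctp_tcb *stcb;
+
+ stcb = sctp_findassociation_ep_addr(&inp, remote, &net, NULL, NULL);
+ if (stcb == NULL) {
+ /* rule 1: lookup failed, so drop the INP reference we hold */
+ SCTP_INP_DECR_REF(inp);
+ return (NULL);
+ }
+ /* rule 2: stcb comes back locked; the caller must SCTP_TCB_UNLOCK() it */
+ return (stcb);
+}
+#endif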
+
+struct sctp_tcb *
+sctp_findassociation_ep_addr(struct sctp_inpcb **inp_p, struct sockaddr *remote,
+ struct sctp_nets **netp, struct sockaddr *local, struct sctp_tcb *locked_tcb)
+{
+ struct sctpasochead *head;
+ struct sctp_inpcb *inp;
+ struct sctp_tcb *stcb = NULL;
+ struct sctp_nets *net;
+ uint16_t rport;
+
+ inp = *inp_p;
+ if (remote->sa_family == AF_INET) {
+ rport = (((struct sockaddr_in *)remote)->sin_port);
+ } else if (remote->sa_family == AF_INET6) {
+ rport = (((struct sockaddr_in6 *)remote)->sin6_port);
+ } else {
+ return (NULL);
+ }
+ if (locked_tcb) {
+ /*
+ * UN-lock so we can do proper locking here; this occurs when
+ * called from load_addresses_from_init.
+ */
+ atomic_add_int(&locked_tcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
+ SCTP_INP_INFO_RLOCK();
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) {
+ /*-
+ * Now either this guy is our listener or it's the
+ * connector. If it is the one that issued the connect, then
+ * its only chance is to be the first TCB in the list. If
+ * it is the acceptor, then do the special_lookup to hash
+ * and find the real inp.
+ */
+ if ((inp->sctp_socket) && (inp->sctp_socket->so_qlimit)) {
+ /* to is peer addr, from is my addr */
+ stcb = sctp_tcb_special_locate(inp_p, remote, local,
+ netp, inp->def_vrf_id);
+ if ((stcb != NULL) && (locked_tcb == NULL)) {
+ /* we have a locked tcb, lower refcount */
+ SCTP_INP_DECR_REF(inp);
+ }
+ if ((locked_tcb != NULL) && (locked_tcb != stcb)) {
+ SCTP_INP_RLOCK(locked_tcb->sctp_ep);
+ SCTP_TCB_LOCK(locked_tcb);
+ atomic_subtract_int(&locked_tcb->asoc.refcnt, 1);
+ SCTP_INP_RUNLOCK(locked_tcb->sctp_ep);
+ }
+ SCTP_INP_INFO_RUNLOCK();
+ return (stcb);
+ } else {
+ SCTP_INP_WLOCK(inp);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
+ goto null_return;
+ }
+ stcb = LIST_FIRST(&inp->sctp_asoc_list);
+ if (stcb == NULL) {
+ goto null_return;
+ }
+ SCTP_TCB_LOCK(stcb);
+
+ if (stcb->rport != rport) {
+ /* remote port does not match. */
+ SCTP_TCB_UNLOCK(stcb);
+ goto null_return;
+ }
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ SCTP_TCB_UNLOCK(stcb);
+ goto null_return;
+ }
+ if (local && !sctp_does_stcb_own_this_addr(stcb, local)) {
+ SCTP_TCB_UNLOCK(stcb);
+ goto null_return;
+ }
+ /* now look at the list of remote addresses */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+#ifdef INVARIANTS
+ if (net == (TAILQ_NEXT(net, sctp_next))) {
+ panic("Corrupt net list");
+ }
+#endif
+ if (net->ro._l_addr.sa.sa_family !=
+ remote->sa_family) {
+ /* not the same family */
+ continue;
+ }
+ switch (remote->sa_family) {
+ case AF_INET:
+ {
+ struct sockaddr_in *sin,
+ *rsin;
+
+ sin = (struct sockaddr_in *)
+ &net->ro._l_addr;
+ rsin = (struct sockaddr_in *)remote;
+ if (sin->sin_addr.s_addr ==
+ rsin->sin_addr.s_addr) {
+ /* found it */
+ if (netp != NULL) {
+ *netp = net;
+ }
+ if (locked_tcb == NULL) {
+ SCTP_INP_DECR_REF(inp);
+ } else if (locked_tcb != stcb) {
+ SCTP_TCB_LOCK(locked_tcb);
+ }
+ if (locked_tcb) {
+ atomic_subtract_int(&locked_tcb->asoc.refcnt, 1);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ return (stcb);
+ }
+ break;
+ }
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sockaddr_in6 *sin6,
+ *rsin6;
+
+ sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
+ rsin6 = (struct sockaddr_in6 *)remote;
+ if (SCTP6_ARE_ADDR_EQUAL(sin6,
+ rsin6)) {
+ /* found it */
+ if (netp != NULL) {
+ *netp = net;
+ }
+ if (locked_tcb == NULL) {
+ SCTP_INP_DECR_REF(inp);
+ } else if (locked_tcb != stcb) {
+ SCTP_TCB_LOCK(locked_tcb);
+ }
+ if (locked_tcb) {
+ atomic_subtract_int(&locked_tcb->asoc.refcnt, 1);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ return (stcb);
+ }
+ break;
+ }
+#endif
+ default:
+ /* TSNH */
+ break;
+ }
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ } else {
+ SCTP_INP_WLOCK(inp);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
+ goto null_return;
+ }
+ head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(rport,
+ inp->sctp_hashmark)];
+ if (head == NULL) {
+ goto null_return;
+ }
+ LIST_FOREACH(stcb, head, sctp_tcbhash) {
+ if (stcb->rport != rport) {
+ /* remote port does not match */
+ continue;
+ }
+ SCTP_TCB_LOCK(stcb);
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ SCTP_TCB_UNLOCK(stcb);
+ continue;
+ }
+ if (local && !sctp_does_stcb_own_this_addr(stcb, local)) {
+ SCTP_TCB_UNLOCK(stcb);
+ continue;
+ }
+ /* now look at the list of remote addresses */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+#ifdef INVARIANTS
+ if (net == (TAILQ_NEXT(net, sctp_next))) {
+ panic("Corrupt net list");
+ }
+#endif
+ if (net->ro._l_addr.sa.sa_family !=
+ remote->sa_family) {
+ /* not the same family */
+ continue;
+ }
+ switch (remote->sa_family) {
+ case AF_INET:
+ {
+ struct sockaddr_in *sin,
+ *rsin;
+
+ sin = (struct sockaddr_in *)
+ &net->ro._l_addr;
+ rsin = (struct sockaddr_in *)remote;
+ if (sin->sin_addr.s_addr ==
+ rsin->sin_addr.s_addr) {
+ /* found it */
+ if (netp != NULL) {
+ *netp = net;
+ }
+ if (locked_tcb == NULL) {
+ SCTP_INP_DECR_REF(inp);
+ } else if (locked_tcb != stcb) {
+ SCTP_TCB_LOCK(locked_tcb);
+ }
+ if (locked_tcb) {
+ atomic_subtract_int(&locked_tcb->asoc.refcnt, 1);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ return (stcb);
+ }
+ break;
+ }
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sockaddr_in6 *sin6,
+ *rsin6;
+
+ sin6 = (struct sockaddr_in6 *)
+ &net->ro._l_addr;
+ rsin6 = (struct sockaddr_in6 *)remote;
+ if (SCTP6_ARE_ADDR_EQUAL(sin6,
+ rsin6)) {
+ /* found it */
+ if (netp != NULL) {
+ *netp = net;
+ }
+ if (locked_tcb == NULL) {
+ SCTP_INP_DECR_REF(inp);
+ } else if (locked_tcb != stcb) {
+ SCTP_TCB_LOCK(locked_tcb);
+ }
+ if (locked_tcb) {
+ atomic_subtract_int(&locked_tcb->asoc.refcnt, 1);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ return (stcb);
+ }
+ break;
+ }
+#endif
+ default:
+ /* TSNH */
+ break;
+ }
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ }
+null_return:
+ /* clean up for returning null */
+ if (locked_tcb) {
+ SCTP_TCB_LOCK(locked_tcb);
+ atomic_subtract_int(&locked_tcb->asoc.refcnt, 1);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ /* not found */
+ return (NULL);
+}
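+
+/*
+ * Illustrative caller sketch for the rules above (not any specific call
+ * site; 'to', 'from', 'net' and 'vrf_id' stand for hypothetical caller
+ * variables).  On success the returned stcb is TCB-locked (rule 2); on a
+ * NULL return the caller drops the INP reference itself (rule 1):
+ *
+ *	inp = sctp_pcb_findep(to, 0, 0, vrf_id);
+ *	if (inp != NULL) {
+ *		stcb = sctp_findassociation_ep_addr(&inp, from, &net,
+ *		    to, NULL);
+ *		if (stcb == NULL)
+ *			SCTP_INP_DECR_REF(inp);
+ *	}
+ */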
+
+/*
+ * Find an association for a specific endpoint using the association id given
+ * out in the COMM_UP notification
+ */
+
+struct sctp_tcb *
+sctp_findasoc_ep_asocid_locked(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock)
+{
+ /*
+	 * Use the assoc_id to find the association.
+ */
+ struct sctpasochead *head;
+ struct sctp_tcb *stcb;
+ uint32_t id;
+
+ if (inp == NULL) {
+ SCTP_PRINTF("TSNH ep_associd\n");
+ return (NULL);
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
+ SCTP_PRINTF("TSNH ep_associd0\n");
+ return (NULL);
+ }
+ id = (uint32_t) asoc_id;
+ head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)];
+ if (head == NULL) {
+ /* invalid id TSNH */
+ SCTP_PRINTF("TSNH ep_associd1\n");
+ return (NULL);
+ }
+ LIST_FOREACH(stcb, head, sctp_tcbasocidhash) {
+ if (stcb->asoc.assoc_id == id) {
+ if (inp != stcb->sctp_ep) {
+ /*
+ * some other guy has the same id active (id
+ * collision ??).
+ */
+ SCTP_PRINTF("TSNH ep_associd2\n");
+ continue;
+ }
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ continue;
+ }
+ if (want_lock) {
+ SCTP_TCB_LOCK(stcb);
+ }
+ return (stcb);
+ }
+ }
+ return (NULL);
+}
+
+
+struct sctp_tcb *
+sctp_findassociation_ep_asocid(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock)
+{
+ struct sctp_tcb *stcb;
+
+ SCTP_INP_RLOCK(inp);
+ stcb = sctp_findasoc_ep_asocid_locked(inp, asoc_id, want_lock);
+ SCTP_INP_RUNLOCK(inp);
+ return (stcb);
+}
+
+
+static struct sctp_inpcb *
+sctp_endpoint_probe(struct sockaddr *nam, struct sctppcbhead *head,
+ uint16_t lport, uint32_t vrf_id)
+{
+ struct sctp_inpcb *inp;
+ struct sockaddr_in *sin;
+
+#ifdef INET6
+ struct sockaddr_in6 *sin6;
+
+#endif
+ struct sctp_laddr *laddr;
+
+#ifdef INET6
+ struct sockaddr_in6 *intf_addr6;
+
+#endif
+
+ int fnd;
+
+ /*
+ * Endpoint probe expects that the INP_INFO is locked.
+ */
+ sin = NULL;
+#ifdef INET6
+ sin6 = NULL;
+#endif
+ switch (nam->sa_family) {
+ case AF_INET:
+ sin = (struct sockaddr_in *)nam;
+ break;
+#ifdef INET6
+ case AF_INET6:
+ sin6 = (struct sockaddr_in6 *)nam;
+ break;
+#endif
+ default:
+ /* unsupported family */
+ return (NULL);
+ }
+
+ if (head == NULL)
+ return (NULL);
+
+ LIST_FOREACH(inp, head, sctp_hash) {
+ SCTP_INP_RLOCK(inp);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) &&
+ (inp->sctp_lport == lport)) {
+ /* got it */
+ if ((nam->sa_family == AF_INET) &&
+ (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+ SCTP_IPV6_V6ONLY(inp)) {
+ /* IPv4 on a IPv6 socket with ONLY IPv6 set */
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ /* A V6 address and the endpoint is NOT bound V6 */
+ if (nam->sa_family == AF_INET6 &&
+ (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) {
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ /* does a VRF id match? */
+ fnd = 0;
+ if (inp->def_vrf_id == vrf_id)
+ fnd = 1;
+
+ SCTP_INP_RUNLOCK(inp);
+ if (!fnd)
+ continue;
+ return (inp);
+ }
+ SCTP_INP_RUNLOCK(inp);
+ }
+ if ((nam->sa_family == AF_INET) &&
+ (sin->sin_addr.s_addr == INADDR_ANY)) {
+ /* Can't hunt for one that has no address specified */
+ return (NULL);
+ }
+#ifdef INET6
+ if ((nam->sa_family == AF_INET6) &&
+ (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))) {
+ /* Can't hunt for one that has no address specified */
+ return (NULL);
+ }
+#endif
+ /*
+	 * ok, not bound to all so see if we can find an EP bound to this
+ * address.
+ */
+ LIST_FOREACH(inp, head, sctp_hash) {
+ SCTP_INP_RLOCK(inp);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL)) {
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ /*
+ * Ok this could be a likely candidate, look at all of its
+ * addresses
+ */
+ if (inp->sctp_lport != lport) {
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ /* does a VRF id match? */
+ fnd = 0;
+ if (inp->def_vrf_id == vrf_id)
+ fnd = 1;
+
+ if (!fnd) {
+ SCTP_INP_RUNLOCK(inp);
+ continue;
+ }
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ if (laddr->ifa == NULL) {
+ SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n",
+ __FUNCTION__);
+ continue;
+ }
+ SCTPDBG(SCTP_DEBUG_PCB1, "Ok laddr->ifa:%p is possible, ",
+ laddr->ifa);
+ if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) {
+ SCTPDBG(SCTP_DEBUG_PCB1, "Huh IFA being deleted\n");
+ continue;
+ }
+ if (laddr->ifa->address.sa.sa_family == nam->sa_family) {
+ /* possible, see if it matches */
+ struct sockaddr_in *intf_addr;
+
+ intf_addr = &laddr->ifa->address.sin;
+ switch (nam->sa_family) {
+ case AF_INET:
+ if (sin->sin_addr.s_addr ==
+ intf_addr->sin_addr.s_addr) {
+ SCTP_INP_RUNLOCK(inp);
+ return (inp);
+ }
+ break;
+#ifdef INET6
+ case AF_INET6:
+ intf_addr6 = &laddr->ifa->address.sin6;
+ if (SCTP6_ARE_ADDR_EQUAL(sin6,
+ intf_addr6)) {
+ SCTP_INP_RUNLOCK(inp);
+ return (inp);
+ }
+ break;
+#endif
+ }
+ }
+ }
+ SCTP_INP_RUNLOCK(inp);
+ }
+ return (NULL);
+}
+
+
+static struct sctp_inpcb *
+sctp_isport_inuse(struct sctp_inpcb *inp, uint16_t lport, uint32_t vrf_id)
+{
+ struct sctppcbhead *head;
+ struct sctp_inpcb *t_inp;
+ int fnd;
+
+ head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport,
+ SCTP_BASE_INFO(hashmark))];
+ LIST_FOREACH(t_inp, head, sctp_hash) {
+ if (t_inp->sctp_lport != lport) {
+ continue;
+ }
+ /* is it in the VRF in question */
+ fnd = 0;
+ if (t_inp->def_vrf_id == vrf_id)
+ fnd = 1;
+ if (!fnd)
+ continue;
+
+ /* This one is in use. */
+ /* check the v6/v4 binding issue */
+ if ((t_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+ SCTP_IPV6_V6ONLY(t_inp)) {
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ /* collision in V6 space */
+ return (t_inp);
+ } else {
+ /* inp is BOUND_V4 no conflict */
+ continue;
+ }
+ } else if (t_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ /* t_inp is bound v4 and v6, conflict always */
+ return (t_inp);
+ } else {
+ /* t_inp is bound only V4 */
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+ SCTP_IPV6_V6ONLY(inp)) {
+ /* no conflict */
+ continue;
+ }
+ /* else fall through to conflict */
+ }
+ return (t_inp);
+ }
+ return (NULL);
+}
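+
+/*
+ * Informal summary of the v4/v6 conflict checks above, where t_inp is the
+ * endpoint already holding the port and inp is the one asking:
+ *
+ *	t_inp bound as		inp bound as		result
+ *	V6 only			V6 (any)		conflict
+ *	V6 only			V4 only			no conflict
+ *	V4 and V6		anything		conflict
+ *	V4 only			V6 only			no conflict
+ *	V4 only			V4 (any)		conflict
+ */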
+
+
+int
+sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp)
+{
+ /* For 1-2-1 with port reuse */
+ struct sctppcbhead *head;
+ struct sctp_inpcb *tinp;
+
+ if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) {
+ /* only works with port reuse on */
+ return (-1);
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) == 0) {
+ return (0);
+ }
+ SCTP_INP_RUNLOCK(inp);
+ head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport,
+ SCTP_BASE_INFO(hashmark))];
+ /* Kick out all non-listeners to the TCP hash */
+ LIST_FOREACH(tinp, head, sctp_hash) {
+ if (tinp->sctp_lport != inp->sctp_lport) {
+ continue;
+ }
+ if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
+ continue;
+ }
+ if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ continue;
+ }
+ if (tinp->sctp_socket->so_qlimit) {
+ continue;
+ }
+ SCTP_INP_WLOCK(tinp);
+ LIST_REMOVE(tinp, sctp_hash);
+ head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(tinp->sctp_lport, SCTP_BASE_INFO(hashtcpmark))];
+ tinp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL;
+ LIST_INSERT_HEAD(head, tinp, sctp_hash);
+ SCTP_INP_WUNLOCK(tinp);
+ }
+ SCTP_INP_WLOCK(inp);
+ /* Pull from where he was */
+ LIST_REMOVE(inp, sctp_hash);
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_IN_TCPPOOL;
+ head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, SCTP_BASE_INFO(hashmark))];
+ LIST_INSERT_HEAD(head, inp, sctp_hash);
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_RLOCK(inp);
+ return (0);
+}
+
+
+struct sctp_inpcb *
+sctp_pcb_findep(struct sockaddr *nam, int find_tcp_pool, int have_lock,
+ uint32_t vrf_id)
+{
+ /*
+ * First we check the hash table to see if someone has this port
+ * bound with just the port.
+ */
+ struct sctp_inpcb *inp;
+ struct sctppcbhead *head;
+ struct sockaddr_in *sin;
+ struct sockaddr_in6 *sin6;
+ int lport;
+ unsigned int i;
+
+ if (nam->sa_family == AF_INET) {
+ sin = (struct sockaddr_in *)nam;
+ lport = ((struct sockaddr_in *)nam)->sin_port;
+ } else if (nam->sa_family == AF_INET6) {
+ sin6 = (struct sockaddr_in6 *)nam;
+ lport = ((struct sockaddr_in6 *)nam)->sin6_port;
+ } else {
+ /* unsupported family */
+ return (NULL);
+ }
+ /*
+ * I could cheat here and just cast to one of the types but we will
+	 * do it right. It also provides the check against an unsupported
+	 * type.
+ */
+ /* Find the head of the ALLADDR chain */
+ if (have_lock == 0) {
+ SCTP_INP_INFO_RLOCK();
+ }
+ head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport,
+ SCTP_BASE_INFO(hashmark))];
+ inp = sctp_endpoint_probe(nam, head, lport, vrf_id);
+
+ /*
+ * If the TCP model exists it could be that the main listening
+ * endpoint is gone but there still exists a connected socket for
+ * this guy. If so we can return the first one that we find. This
+	 * may NOT be the correct one so the caller should be wary of the
+ * returned INP. Currently the only caller that sets find_tcp_pool
+ * is in bindx where we are verifying that a user CAN bind the
+ * address. He either has bound it already, or someone else has, or
+	 * it's open to bind, so this is good enough.
+ */
+ if (inp == NULL && find_tcp_pool) {
+ for (i = 0; i < SCTP_BASE_INFO(hashtcpmark) + 1; i++) {
+ head = &SCTP_BASE_INFO(sctp_tcpephash)[i];
+ inp = sctp_endpoint_probe(nam, head, lport, vrf_id);
+ if (inp) {
+ break;
+ }
+ }
+ }
+ if (inp) {
+ SCTP_INP_INCR_REF(inp);
+ }
+ if (have_lock == 0) {
+ SCTP_INP_INFO_RUNLOCK();
+ }
+ return (inp);
+}
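+
+/*
+ * Minimal usage sketch (hypothetical caller): a successful lookup above has
+ * already done SCTP_INP_INCR_REF() on the endpoint, so a caller that only
+ * needs a transient reference must drop it again when done:
+ *
+ *	inp = sctp_pcb_findep(addr, 0, 0, vrf_id);
+ *	if (inp != NULL) {
+ *		... examine inp ...
+ *		SCTP_INP_DECR_REF(inp);
+ *	}
+ */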
+
+/*
+ * Find an association for an endpoint given a pointer to the address you
+ * want to send to and the endpoint pointer. The address can be IPv4 or
+ * IPv6. We may need to change the *to to some other struct like an mbuf...
+ */
+struct sctp_tcb *
+sctp_findassociation_addr_sa(struct sockaddr *to, struct sockaddr *from,
+ struct sctp_inpcb **inp_p, struct sctp_nets **netp, int find_tcp_pool,
+ uint32_t vrf_id)
+{
+ struct sctp_inpcb *inp = NULL;
+ struct sctp_tcb *retval;
+
+ SCTP_INP_INFO_RLOCK();
+ if (find_tcp_pool) {
+ if (inp_p != NULL) {
+ retval = sctp_tcb_special_locate(inp_p, from, to, netp,
+ vrf_id);
+ } else {
+ retval = sctp_tcb_special_locate(&inp, from, to, netp,
+ vrf_id);
+ }
+ if (retval != NULL) {
+ SCTP_INP_INFO_RUNLOCK();
+ return (retval);
+ }
+ }
+ inp = sctp_pcb_findep(to, 0, 1, vrf_id);
+ if (inp_p != NULL) {
+ *inp_p = inp;
+ }
+ SCTP_INP_INFO_RUNLOCK();
+
+ if (inp == NULL) {
+ return (NULL);
+ }
+ /*
+	 * ok, we have an endpoint, now let's find the assoc for it (if
+	 * any). We now place the source address (from) in the 'to' of the
+	 * find endpoint call, since in reality this chain is used from the
+	 * inbound packet side.
+ */
+ if (inp_p != NULL) {
+ retval = sctp_findassociation_ep_addr(inp_p, from, netp, to,
+ NULL);
+ } else {
+ retval = sctp_findassociation_ep_addr(&inp, from, netp, to,
+ NULL);
+ }
+ return retval;
+}
+
+
+/*
+ * This routine will grub through the mbuf that is an INIT or INIT-ACK and
+ * find all addresses that the sender has specified in any address list. Each
+ * address will be used to look up the TCB and see if one exists.
+ */
+static struct sctp_tcb *
+sctp_findassociation_special_addr(struct mbuf *m, int iphlen, int offset,
+ struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp,
+ struct sockaddr *dest)
+{
+ struct sockaddr_in sin4;
+ struct sockaddr_in6 sin6;
+ struct sctp_paramhdr *phdr, parm_buf;
+ struct sctp_tcb *retval;
+ uint32_t ptype, plen;
+
+ memset(&sin4, 0, sizeof(sin4));
+ memset(&sin6, 0, sizeof(sin6));
+ sin4.sin_len = sizeof(sin4);
+ sin4.sin_family = AF_INET;
+ sin4.sin_port = sh->src_port;
+ sin6.sin6_len = sizeof(sin6);
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_port = sh->src_port;
+
+ retval = NULL;
+ offset += sizeof(struct sctp_init_chunk);
+
+ phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf));
+ while (phdr != NULL) {
+ /* now we must see if we want the parameter */
+ ptype = ntohs(phdr->param_type);
+ plen = ntohs(phdr->param_length);
+ if (plen == 0) {
+ break;
+ }
+ if (ptype == SCTP_IPV4_ADDRESS &&
+ plen == sizeof(struct sctp_ipv4addr_param)) {
+ /* Get the rest of the address */
+ struct sctp_ipv4addr_param ip4_parm, *p4;
+
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)&ip4_parm, min(plen, sizeof(ip4_parm)));
+ if (phdr == NULL) {
+ return (NULL);
+ }
+ p4 = (struct sctp_ipv4addr_param *)phdr;
+ memcpy(&sin4.sin_addr, &p4->addr, sizeof(p4->addr));
+ /* look it up */
+ retval = sctp_findassociation_ep_addr(inp_p,
+ (struct sockaddr *)&sin4, netp, dest, NULL);
+ if (retval != NULL) {
+ return (retval);
+ }
+ } else if (ptype == SCTP_IPV6_ADDRESS &&
+ plen == sizeof(struct sctp_ipv6addr_param)) {
+ /* Get the rest of the address */
+ struct sctp_ipv6addr_param ip6_parm, *p6;
+
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)&ip6_parm, min(plen, sizeof(ip6_parm)));
+ if (phdr == NULL) {
+ return (NULL);
+ }
+ p6 = (struct sctp_ipv6addr_param *)phdr;
+ memcpy(&sin6.sin6_addr, &p6->addr, sizeof(p6->addr));
+ /* look it up */
+ retval = sctp_findassociation_ep_addr(inp_p,
+ (struct sockaddr *)&sin6, netp, dest, NULL);
+ if (retval != NULL) {
+ return (retval);
+ }
+ }
+ offset += SCTP_SIZE32(plen);
+ phdr = sctp_get_next_param(m, offset, &parm_buf,
+ sizeof(parm_buf));
+ }
+ return (NULL);
+}
+
+static struct sctp_tcb *
+sctp_findassoc_by_vtag(struct sockaddr *from, struct sockaddr *to, uint32_t vtag,
+ struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint16_t rport,
+ uint16_t lport, int skip_src_check, uint32_t vrf_id, uint32_t remote_tag)
+{
+ /*
+ * Use my vtag to hash. If we find it we then verify the source addr
+	 * is in the assoc. If all goes well we save a bit of work on
+	 * receipt of a packet.
+ */
+ struct sctpasochead *head;
+ struct sctp_nets *net;
+ struct sctp_tcb *stcb;
+
+ *netp = NULL;
+ *inp_p = NULL;
+ SCTP_INP_INFO_RLOCK();
+ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(vtag,
+ SCTP_BASE_INFO(hashasocmark))];
+ if (head == NULL) {
+ /* invalid vtag */
+ SCTP_INP_INFO_RUNLOCK();
+ return (NULL);
+ }
+ LIST_FOREACH(stcb, head, sctp_asocs) {
+ SCTP_INP_RLOCK(stcb->sctp_ep);
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
+ SCTP_INP_RUNLOCK(stcb->sctp_ep);
+ continue;
+ }
+ SCTP_TCB_LOCK(stcb);
+ SCTP_INP_RUNLOCK(stcb->sctp_ep);
+ if (stcb->asoc.my_vtag == vtag) {
+ /* candidate */
+ if (stcb->rport != rport) {
+ SCTP_TCB_UNLOCK(stcb);
+ continue;
+ }
+ if (stcb->sctp_ep->sctp_lport != lport) {
+ SCTP_TCB_UNLOCK(stcb);
+ continue;
+ }
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ SCTP_TCB_UNLOCK(stcb);
+ continue;
+ }
+ /* RRS:Need toaddr check here */
+ if (sctp_does_stcb_own_this_addr(stcb, to) == 0) {
+ /* Endpoint does not own this address */
+ SCTP_TCB_UNLOCK(stcb);
+ continue;
+ }
+ if (remote_tag) {
+ /*
+ * If we have both vtags that's all we match
+ * on
+ */
+ if (stcb->asoc.peer_vtag == remote_tag) {
+ /*
+ * If both tags match we consider it
+ * conclusive and check NO
+ * source/destination addresses
+ */
+ goto conclusive;
+ }
+ }
+ if (skip_src_check) {
+ conclusive:
+ if (from) {
+ net = sctp_findnet(stcb, from);
+ } else {
+ *netp = NULL; /* unknown */
+ }
+ if (inp_p)
+ *inp_p = stcb->sctp_ep;
+ SCTP_INP_INFO_RUNLOCK();
+ return (stcb);
+ }
+ net = sctp_findnet(stcb, from);
+ if (net) {
+ /* yep its him. */
+ *netp = net;
+ SCTP_STAT_INCR(sctps_vtagexpress);
+ *inp_p = stcb->sctp_ep;
+ SCTP_INP_INFO_RUNLOCK();
+ return (stcb);
+ } else {
+ /*
+ * not him, this should only happen in rare
+ * cases so I peg it.
+ */
+ SCTP_STAT_INCR(sctps_vtagbogus);
+ }
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ SCTP_INP_INFO_RUNLOCK();
+ return (NULL);
+}
+
+/*
+ * Find an association with the pointer to the inbound IP packet. This can be
+ * an IPv4 or IPv6 packet.
+ */
+struct sctp_tcb *
+sctp_findassociation_addr(struct mbuf *m, int iphlen, int offset,
+ struct sctphdr *sh, struct sctp_chunkhdr *ch,
+ struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id)
+{
+ int find_tcp_pool;
+ struct ip *iph;
+ struct sctp_tcb *retval;
+ struct sockaddr_storage to_store, from_store;
+ struct sockaddr *to = (struct sockaddr *)&to_store;
+ struct sockaddr *from = (struct sockaddr *)&from_store;
+ struct sctp_inpcb *inp;
+
+ iph = mtod(m, struct ip *);
+ switch (iph->ip_v) {
+ case IPVERSION:
+ {
+			/* it's IPv4 */
+ struct sockaddr_in *from4;
+
+ from4 = (struct sockaddr_in *)&from_store;
+ bzero(from4, sizeof(*from4));
+ from4->sin_family = AF_INET;
+ from4->sin_len = sizeof(struct sockaddr_in);
+ from4->sin_addr.s_addr = iph->ip_src.s_addr;
+ from4->sin_port = sh->src_port;
+ break;
+ }
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ {
+			/* it's IPv6 */
+ struct ip6_hdr *ip6;
+ struct sockaddr_in6 *from6;
+
+ ip6 = mtod(m, struct ip6_hdr *);
+ from6 = (struct sockaddr_in6 *)&from_store;
+ bzero(from6, sizeof(*from6));
+ from6->sin6_family = AF_INET6;
+ from6->sin6_len = sizeof(struct sockaddr_in6);
+ from6->sin6_addr = ip6->ip6_src;
+ from6->sin6_port = sh->src_port;
+ /* Get the scopes in properly to the sin6 addr's */
+ /* we probably don't need these operations */
+ (void)sa6_recoverscope(from6);
+ sa6_embedscope(from6, MODULE_GLOBAL(ip6_use_defzone));
+ break;
+ }
+#endif
+ default:
+ /* Currently not supported. */
+ return (NULL);
+ }
+
+
+ switch (iph->ip_v) {
+ case IPVERSION:
+ {
+			/* it's IPv4 */
+ struct sockaddr_in *to4;
+
+ to4 = (struct sockaddr_in *)&to_store;
+ bzero(to4, sizeof(*to4));
+ to4->sin_family = AF_INET;
+ to4->sin_len = sizeof(struct sockaddr_in);
+ to4->sin_addr.s_addr = iph->ip_dst.s_addr;
+ to4->sin_port = sh->dest_port;
+ break;
+ }
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ {
+			/* it's IPv6 */
+ struct ip6_hdr *ip6;
+ struct sockaddr_in6 *to6;
+
+ ip6 = mtod(m, struct ip6_hdr *);
+ to6 = (struct sockaddr_in6 *)&to_store;
+ bzero(to6, sizeof(*to6));
+ to6->sin6_family = AF_INET6;
+ to6->sin6_len = sizeof(struct sockaddr_in6);
+ to6->sin6_addr = ip6->ip6_dst;
+ to6->sin6_port = sh->dest_port;
+ /* Get the scopes in properly to the sin6 addr's */
+ /* we probably don't need these operations */
+ (void)sa6_recoverscope(to6);
+ sa6_embedscope(to6, MODULE_GLOBAL(ip6_use_defzone));
+ break;
+ }
+#endif
+ default:
+ /* TSNH */
+ break;
+ }
+ if (sh->v_tag) {
+ /* we only go down this path if vtag is non-zero */
+ retval = sctp_findassoc_by_vtag(from, to, ntohl(sh->v_tag),
+ inp_p, netp, sh->src_port, sh->dest_port, 0, vrf_id, 0);
+ if (retval) {
+ return (retval);
+ }
+ }
+ find_tcp_pool = 0;
+ if ((ch->chunk_type != SCTP_INITIATION) &&
+ (ch->chunk_type != SCTP_INITIATION_ACK) &&
+ (ch->chunk_type != SCTP_COOKIE_ACK) &&
+ (ch->chunk_type != SCTP_COOKIE_ECHO)) {
+ /* Other chunk types go to the tcp pool. */
+ find_tcp_pool = 1;
+ }
+ if (inp_p) {
+ retval = sctp_findassociation_addr_sa(to, from, inp_p, netp,
+ find_tcp_pool, vrf_id);
+ inp = *inp_p;
+ } else {
+ retval = sctp_findassociation_addr_sa(to, from, &inp, netp,
+ find_tcp_pool, vrf_id);
+ }
+ SCTPDBG(SCTP_DEBUG_PCB1, "retval:%p inp:%p\n", retval, inp);
+ if (retval == NULL && inp) {
+		/* Found an EP but not this address */
+ if ((ch->chunk_type == SCTP_INITIATION) ||
+ (ch->chunk_type == SCTP_INITIATION_ACK)) {
+ /*-
+ * special hook, we do NOT return linp or an
+ * association that is linked to an existing
+ * association that is under the TCP pool (i.e. no
+ * listener exists). The endpoint finding routine
+ * will always find a listener before examining the
+ * TCP pool.
+ */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) {
+ if (inp_p) {
+ *inp_p = NULL;
+ }
+ return (NULL);
+ }
+ retval = sctp_findassociation_special_addr(m, iphlen,
+ offset, sh, &inp, netp, to);
+ if (inp_p != NULL) {
+ *inp_p = inp;
+ }
+ }
+ }
+ SCTPDBG(SCTP_DEBUG_PCB1, "retval is %p\n", retval);
+ return (retval);
+}
+
+/*
+ * lookup an association by an ASCONF lookup address.
+ * if the lookup address is 0.0.0.0 or ::0, use the vtag to do the lookup
+ */
+struct sctp_tcb *
+sctp_findassociation_ep_asconf(struct mbuf *m, int iphlen, int offset,
+ struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id)
+{
+ struct sctp_tcb *stcb;
+ struct sockaddr_in *sin;
+
+#ifdef INET6
+ struct sockaddr_in6 *sin6;
+
+#endif
+ struct sockaddr_storage local_store, remote_store;
+ struct sockaddr *to;
+ struct ip *iph;
+
+#ifdef INET6
+ struct ip6_hdr *ip6;
+
+#endif
+ struct sctp_paramhdr parm_buf, *phdr;
+ int ptype;
+ int zero_address = 0;
+
+
+ memset(&local_store, 0, sizeof(local_store));
+ memset(&remote_store, 0, sizeof(remote_store));
+ to = (struct sockaddr *)&local_store;
+ /* First get the destination address setup too. */
+ iph = mtod(m, struct ip *);
+ switch (iph->ip_v) {
+ case IPVERSION:
+		/* it's IPv4 */
+ sin = (struct sockaddr_in *)&local_store;
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_port = sh->dest_port;
+ sin->sin_addr.s_addr = iph->ip_dst.s_addr;
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+		/* it's IPv6 */
+ ip6 = mtod(m, struct ip6_hdr *);
+ sin6 = (struct sockaddr_in6 *)&local_store;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(*sin6);
+ sin6->sin6_port = sh->dest_port;
+ sin6->sin6_addr = ip6->ip6_dst;
+ break;
+#endif
+ default:
+ return NULL;
+ }
+
+ phdr = sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk),
+ &parm_buf, sizeof(struct sctp_paramhdr));
+ if (phdr == NULL) {
+ SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf lookup addr\n",
+ __FUNCTION__);
+ return NULL;
+ }
+ ptype = (int)((uint32_t) ntohs(phdr->param_type));
+ /* get the correlation address */
+ switch (ptype) {
+#ifdef INET6
+ case SCTP_IPV6_ADDRESS:
+ {
+ /* ipv6 address param */
+ struct sctp_ipv6addr_param *p6, p6_buf;
+
+ if (ntohs(phdr->param_length) != sizeof(struct sctp_ipv6addr_param)) {
+ return NULL;
+ }
+ p6 = (struct sctp_ipv6addr_param *)sctp_get_next_param(m,
+ offset + sizeof(struct sctp_asconf_chunk),
+ &p6_buf.ph, sizeof(*p6));
+ if (p6 == NULL) {
+ SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v6 lookup addr\n",
+ __FUNCTION__);
+ return (NULL);
+ }
+ sin6 = (struct sockaddr_in6 *)&remote_store;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(*sin6);
+ sin6->sin6_port = sh->src_port;
+ memcpy(&sin6->sin6_addr, &p6->addr, sizeof(struct in6_addr));
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+ zero_address = 1;
+ break;
+ }
+#endif
+ case SCTP_IPV4_ADDRESS:
+ {
+ /* ipv4 address param */
+ struct sctp_ipv4addr_param *p4, p4_buf;
+
+ if (ntohs(phdr->param_length) != sizeof(struct sctp_ipv4addr_param)) {
+ return NULL;
+ }
+ p4 = (struct sctp_ipv4addr_param *)sctp_get_next_param(m,
+ offset + sizeof(struct sctp_asconf_chunk),
+ &p4_buf.ph, sizeof(*p4));
+ if (p4 == NULL) {
+ SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v4 lookup addr\n",
+ __FUNCTION__);
+ return (NULL);
+ }
+ sin = (struct sockaddr_in *)&remote_store;
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_port = sh->src_port;
+ memcpy(&sin->sin_addr, &p4->addr, sizeof(struct in_addr));
+ if (sin->sin_addr.s_addr == INADDR_ANY)
+ zero_address = 1;
+ break;
+ }
+ default:
+ /* invalid address param type */
+ return NULL;
+ }
+
+ if (zero_address) {
+ stcb = sctp_findassoc_by_vtag(NULL, to, ntohl(sh->v_tag), inp_p,
+ netp, sh->src_port, sh->dest_port, 1, vrf_id, 0);
+ /*
+ * printf("findassociation_ep_asconf: zero lookup address
+ * finds stcb 0x%x\n", (uint32_t)stcb);
+ */
+ } else {
+ stcb = sctp_findassociation_ep_addr(inp_p,
+ (struct sockaddr *)&remote_store, netp,
+ to, NULL);
+ }
+ return (stcb);
+}
+
+
+/*
+ * allocate an sctp_inpcb and set up a temporary binding to a port/all
+ * addresses. This way, if we don't get a bind, we by default pick an
+ * ephemeral port with all addresses bound.
+ */
+int
+sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id)
+{
+ /*
+ * we get called when a new endpoint starts up. We need to allocate
+ * the sctp_inpcb structure from the zone and init it. Mark it as
+ * unbound and find a port that we can use as an ephemeral with
+	 * INADDR_ANY. If the user binds later, no problem; we can then add
+	 * in the specific addresses. And set up the default parameters for
+	 * the EP.
+ */
+ int i, error;
+ struct sctp_inpcb *inp;
+ struct sctp_pcb *m;
+ struct timeval time;
+ sctp_sharedkey_t *null_key;
+
+ error = 0;
+
+ SCTP_INP_INFO_WLOCK();
+ inp = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_ep), struct sctp_inpcb);
+ if (inp == NULL) {
+ SCTP_PRINTF("Out of SCTP-INPCB structures - no resources\n");
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS);
+ return (ENOBUFS);
+ }
+ /* zap it */
+ bzero(inp, sizeof(*inp));
+
+ /* bump generations */
+ /* setup socket pointers */
+ inp->sctp_socket = so;
+ inp->ip_inp.inp.inp_socket = so;
+ inp->sctp_associd_counter = 1;
+ inp->partial_delivery_point = SCTP_SB_LIMIT_RCV(so) >> SCTP_PARTIAL_DELIVERY_SHIFT;
+ inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT;
+ inp->sctp_cmt_on_off = SCTP_BASE_SYSCTL(sctp_cmt_on_off);
+ /* init the small hash table we use to track asocid <-> tcb */
+ inp->sctp_asocidhash = SCTP_HASH_INIT(SCTP_STACK_VTAG_HASH_SIZE, &inp->hashasocidmark);
+ if (inp->sctp_asocidhash == NULL) {
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp);
+ SCTP_INP_INFO_WUNLOCK();
+ return (ENOBUFS);
+ }
+#ifdef IPSEC
+ {
+ struct inpcbpolicy *pcb_sp = NULL;
+
+ error = ipsec_init_policy(so, &pcb_sp);
+ /* Arrange to share the policy */
+ inp->ip_inp.inp.inp_sp = pcb_sp;
+ ((struct in6pcb *)(&inp->ip_inp.inp))->in6p_sp = pcb_sp;
+ }
+ if (error != 0) {
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp);
+ SCTP_INP_INFO_WUNLOCK();
+ return error;
+ }
+#endif /* IPSEC */
+ SCTP_INCR_EP_COUNT();
+ inp->ip_inp.inp.inp_ip_ttl = MODULE_GLOBAL(ip_defttl);
+ SCTP_INP_INFO_WUNLOCK();
+
+ so->so_pcb = (caddr_t)inp;
+
+ if ((SCTP_SO_TYPE(so) == SOCK_DGRAM) ||
+ (SCTP_SO_TYPE(so) == SOCK_SEQPACKET)) {
+ /* UDP style socket */
+ inp->sctp_flags = (SCTP_PCB_FLAGS_UDPTYPE |
+ SCTP_PCB_FLAGS_UNBOUND);
+ /* Be sure it is NON-BLOCKING IO for UDP */
+ /* SCTP_SET_SO_NBIO(so); */
+ } else if (SCTP_SO_TYPE(so) == SOCK_STREAM) {
+ /* TCP style socket */
+ inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE |
+ SCTP_PCB_FLAGS_UNBOUND);
+ /* Be sure we have blocking IO by default */
+ SCTP_CLEAR_SO_NBIO(so);
+ } else {
+ /*
+ * unsupported socket type (RAW, etc)- in case we missed it
+ * in protosw
+ */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EOPNOTSUPP);
+ so->so_pcb = NULL;
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp);
+ return (EOPNOTSUPP);
+ }
+ if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_1) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
+ } else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_2) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
+ } else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_0) {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
+ }
+ inp->sctp_tcbhash = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_pcbtblsize),
+ &inp->sctp_hashmark);
+ if (inp->sctp_tcbhash == NULL) {
+ SCTP_PRINTF("Out of SCTP-INPCB->hashinit - no resources\n");
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS);
+ so->so_pcb = NULL;
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp);
+ return (ENOBUFS);
+ }
+ inp->def_vrf_id = vrf_id;
+
+ SCTP_INP_INFO_WLOCK();
+ SCTP_INP_LOCK_INIT(inp);
+ INP_LOCK_INIT(&inp->ip_inp.inp, "inp", "sctpinp");
+ SCTP_INP_READ_INIT(inp);
+ SCTP_ASOC_CREATE_LOCK_INIT(inp);
+ /* lock the new ep */
+ SCTP_INP_WLOCK(inp);
+
+ /* add it to the info area */
+ LIST_INSERT_HEAD(&SCTP_BASE_INFO(listhead), inp, sctp_list);
+ SCTP_INP_INFO_WUNLOCK();
+
+ TAILQ_INIT(&inp->read_queue);
+ LIST_INIT(&inp->sctp_addr_list);
+
+ LIST_INIT(&inp->sctp_asoc_list);
+
+#ifdef SCTP_TRACK_FREED_ASOCS
+ /* TEMP CODE */
+ LIST_INIT(&inp->sctp_asoc_free_list);
+#endif
+ /* Init the timer structure for signature change */
+ SCTP_OS_TIMER_INIT(&inp->sctp_ep.signature_change.timer);
+ inp->sctp_ep.signature_change.type = SCTP_TIMER_TYPE_NEWCOOKIE;
+
+ /* now init the actual endpoint default data */
+ m = &inp->sctp_ep;
+
+ /* setup the base timeout information */
+ m->sctp_timeoutticks[SCTP_TIMER_SEND] = SEC_TO_TICKS(SCTP_SEND_SEC); /* needed ? */
+ m->sctp_timeoutticks[SCTP_TIMER_INIT] = SEC_TO_TICKS(SCTP_INIT_SEC); /* needed ? */
+ m->sctp_timeoutticks[SCTP_TIMER_RECV] = MSEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default));
+ m->sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default));
+ m->sctp_timeoutticks[SCTP_TIMER_PMTU] = SEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default));
+ m->sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN] = SEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default));
+ m->sctp_timeoutticks[SCTP_TIMER_SIGNATURE] = SEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_secret_lifetime_default));
+ /* all max/min max are in ms */
+ m->sctp_maxrto = SCTP_BASE_SYSCTL(sctp_rto_max_default);
+ m->sctp_minrto = SCTP_BASE_SYSCTL(sctp_rto_min_default);
+ m->initial_rto = SCTP_BASE_SYSCTL(sctp_rto_initial_default);
+ m->initial_init_rto_max = SCTP_BASE_SYSCTL(sctp_init_rto_max_default);
+ m->sctp_sack_freq = SCTP_BASE_SYSCTL(sctp_sack_freq_default);
+
+ m->max_open_streams_intome = MAX_SCTP_STREAMS;
+
+ m->max_init_times = SCTP_BASE_SYSCTL(sctp_init_rtx_max_default);
+ m->max_send_times = SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default);
+ m->def_net_failure = SCTP_BASE_SYSCTL(sctp_path_rtx_max_default);
+ m->sctp_sws_sender = SCTP_SWS_SENDER_DEF;
+ m->sctp_sws_receiver = SCTP_SWS_RECEIVER_DEF;
+ m->max_burst = SCTP_BASE_SYSCTL(sctp_max_burst_default);
+ if ((SCTP_BASE_SYSCTL(sctp_default_cc_module) >= SCTP_CC_RFC2581) &&
+ (SCTP_BASE_SYSCTL(sctp_default_cc_module) <= SCTP_CC_HTCP)) {
+ m->sctp_default_cc_module = SCTP_BASE_SYSCTL(sctp_default_cc_module);
+ } else {
+ /* sysctl done with invalid value, set to 2581 */
+ m->sctp_default_cc_module = SCTP_CC_RFC2581;
+ }
+	/* number of streams to pre-open on an association */
+ m->pre_open_stream_count = SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default);
+
+ /* Add adaptation cookie */
+ m->adaptation_layer_indicator = 0x504C5253;
+
+ /* seed random number generator */
+ m->random_counter = 1;
+ m->store_at = SCTP_SIGNATURE_SIZE;
+ SCTP_READ_RANDOM(m->random_numbers, sizeof(m->random_numbers));
+ sctp_fill_random_store(m);
+
+ /* Minimum cookie size */
+ m->size_of_a_cookie = (sizeof(struct sctp_init_msg) * 2) +
+ sizeof(struct sctp_state_cookie);
+ m->size_of_a_cookie += SCTP_SIGNATURE_SIZE;
+
+ /* Setup the initial secret */
+ (void)SCTP_GETTIME_TIMEVAL(&time);
+ m->time_of_secret_change = time.tv_sec;
+
+ for (i = 0; i < SCTP_NUMBER_OF_SECRETS; i++) {
+ m->secret_key[0][i] = sctp_select_initial_TSN(m);
+ }
+ sctp_timer_start(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL);
+
+ /* How long is a cookie good for ? */
+ m->def_cookie_life = MSEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default));
+ /*
+ * Initialize authentication parameters
+ */
+ m->local_hmacs = sctp_default_supported_hmaclist();
+ m->local_auth_chunks = sctp_alloc_chunklist();
+ sctp_auth_set_default_chunks(m->local_auth_chunks);
+ LIST_INIT(&m->shared_keys);
+ /* add default NULL key as key id 0 */
+ null_key = sctp_alloc_sharedkey();
+ sctp_insert_sharedkey(&m->shared_keys, null_key);
+ SCTP_INP_WUNLOCK(inp);
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 12);
+#endif
+ return (error);
+}
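+
+/*
+ * Minimal call sketch (hypothetical attach-style caller): on success the
+ * new endpoint hangs off so->so_pcb and remains unbound until
+ * sctp_inpcb_bind() is called:
+ *
+ *	error = sctp_inpcb_alloc(so, vrf_id);
+ *	if (error == 0)
+ *		inp = (struct sctp_inpcb *)so->so_pcb;
+ */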
+
+
+void
+sctp_move_pcb_and_assoc(struct sctp_inpcb *old_inp, struct sctp_inpcb *new_inp,
+ struct sctp_tcb *stcb)
+{
+ struct sctp_nets *net;
+ uint16_t lport, rport;
+ struct sctppcbhead *head;
+ struct sctp_laddr *laddr, *oladdr;
+
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_INP_INFO_WLOCK();
+ SCTP_INP_WLOCK(old_inp);
+ SCTP_INP_WLOCK(new_inp);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+
+ new_inp->sctp_ep.time_of_secret_change =
+ old_inp->sctp_ep.time_of_secret_change;
+ memcpy(new_inp->sctp_ep.secret_key, old_inp->sctp_ep.secret_key,
+ sizeof(old_inp->sctp_ep.secret_key));
+ new_inp->sctp_ep.current_secret_number =
+ old_inp->sctp_ep.current_secret_number;
+ new_inp->sctp_ep.last_secret_number =
+ old_inp->sctp_ep.last_secret_number;
+ new_inp->sctp_ep.size_of_a_cookie = old_inp->sctp_ep.size_of_a_cookie;
+
+ /* make it so new data pours into the new socket */
+ stcb->sctp_socket = new_inp->sctp_socket;
+ stcb->sctp_ep = new_inp;
+
+ /* Copy the port across */
+ lport = new_inp->sctp_lport = old_inp->sctp_lport;
+ rport = stcb->rport;
+ /* Pull the tcb from the old association */
+ LIST_REMOVE(stcb, sctp_tcbhash);
+ LIST_REMOVE(stcb, sctp_tcblist);
+ if (stcb->asoc.in_asocid_hash) {
+ LIST_REMOVE(stcb, sctp_tcbasocidhash);
+ }
+ /* Now insert the new_inp into the TCP connected hash */
+ head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))];
+
+ LIST_INSERT_HEAD(head, new_inp, sctp_hash);
+	/* It's safe to access */
+ new_inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND;
+
+ /* Now move the tcb into the endpoint list */
+ LIST_INSERT_HEAD(&new_inp->sctp_asoc_list, stcb, sctp_tcblist);
+ /*
+ * Question, do we even need to worry about the ep-hash since we
+	 * only have one connection? Probably not :> so let's get rid of it
+ * and not suck up any kernel memory in that.
+ */
+ if (stcb->asoc.in_asocid_hash) {
+ struct sctpasochead *lhd;
+
+ lhd = &new_inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(stcb->asoc.assoc_id,
+ new_inp->hashasocidmark)];
+ LIST_INSERT_HEAD(lhd, stcb, sctp_tcbasocidhash);
+ }
+ /* Ok. Let's restart timer. */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, new_inp,
+ stcb, net);
+ }
+
+ SCTP_INP_INFO_WUNLOCK();
+ if (new_inp->sctp_tcbhash != NULL) {
+ SCTP_HASH_FREE(new_inp->sctp_tcbhash, new_inp->sctp_hashmark);
+ new_inp->sctp_tcbhash = NULL;
+ }
+ if ((new_inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) {
+ /* Subset bound, so copy in the laddr list from the old_inp */
+ LIST_FOREACH(oladdr, &old_inp->sctp_addr_list, sctp_nxt_addr) {
+ laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
+ if (laddr == NULL) {
+ /*
+ * Gak, what can we do? This assoc is really
+ * HOSED. We probably should send an abort
+ * here.
+ */
+ SCTPDBG(SCTP_DEBUG_PCB1, "Association hosed in TCP model, out of laddr memory\n");
+ continue;
+ }
+ SCTP_INCR_LADDR_COUNT();
+ bzero(laddr, sizeof(*laddr));
+ (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time);
+ laddr->ifa = oladdr->ifa;
+ atomic_add_int(&laddr->ifa->refcount, 1);
+ LIST_INSERT_HEAD(&new_inp->sctp_addr_list, laddr,
+ sctp_nxt_addr);
+ new_inp->laddr_count++;
+ }
+ }
+ /*
+ * Now any running timers need to be adjusted since we really don't
+	 * care if they are running or not; just blast the new_inp into
+	 * all of them.
+ */
+
+ stcb->asoc.hb_timer.ep = (void *)new_inp;
+ stcb->asoc.dack_timer.ep = (void *)new_inp;
+ stcb->asoc.asconf_timer.ep = (void *)new_inp;
+ stcb->asoc.strreset_timer.ep = (void *)new_inp;
+ stcb->asoc.shut_guard_timer.ep = (void *)new_inp;
+ stcb->asoc.autoclose_timer.ep = (void *)new_inp;
+ stcb->asoc.delayed_event_timer.ep = (void *)new_inp;
+ stcb->asoc.delete_prim_timer.ep = (void *)new_inp;
+ /* now what about the nets? */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ net->pmtu_timer.ep = (void *)new_inp;
+ net->rxt_timer.ep = (void *)new_inp;
+ net->fr_timer.ep = (void *)new_inp;
+ }
+ SCTP_INP_WUNLOCK(new_inp);
+ SCTP_INP_WUNLOCK(old_inp);
+}
+
+
+
+
+/* sctp_ifap is used to bypass normal local address validation checks */
+int
+sctp_inpcb_bind(struct socket *so, struct sockaddr *addr,
+ struct sctp_ifa *sctp_ifap, struct thread *p)
+{
+ /* bind a ep to a socket address */
+ struct sctppcbhead *head;
+ struct sctp_inpcb *inp, *inp_tmp;
+ struct inpcb *ip_inp;
+ int port_reuse_active = 0;
+ int bindall;
+ uint16_t lport;
+ int error;
+ uint32_t vrf_id;
+
+ lport = 0;
+ error = 0;
+ bindall = 1;
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ ip_inp = (struct inpcb *)so->so_pcb;
+#ifdef SCTP_DEBUG
+ if (addr) {
+ SCTPDBG(SCTP_DEBUG_PCB1, "Bind called port:%d\n",
+ ntohs(((struct sockaddr_in *)addr)->sin_port));
+ SCTPDBG(SCTP_DEBUG_PCB1, "Addr :");
+ SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr);
+ }
+#endif
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) {
+ /* already did a bind, subsequent binds NOT allowed ! */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ return (EINVAL);
+ }
+#ifdef INVARIANTS
+ if (p == NULL)
+ panic("null proc/thread");
+#endif
+ if (addr != NULL) {
+ switch (addr->sa_family) {
+ case AF_INET:
+ {
+ struct sockaddr_in *sin;
+
+ /* IPV6_V6ONLY socket? */
+ if (SCTP_IPV6_V6ONLY(ip_inp)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ return (EINVAL);
+ }
+ if (addr->sa_len != sizeof(*sin)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ return (EINVAL);
+ }
+ sin = (struct sockaddr_in *)addr;
+ lport = sin->sin_port;
+ /*
+ * For LOOPBACK the prison_local_ip4() call
+ * will transmute the ip address to the
+ * proper value.
+ */
+ if (p && (error = prison_local_ip4(p->td_ucred, &sin->sin_addr)) != 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error);
+ return (error);
+ }
+ if (sin->sin_addr.s_addr != INADDR_ANY) {
+ bindall = 0;
+ }
+ break;
+ }
+#ifdef INET6
+ case AF_INET6:
+ {
+ /*
+ * Only for pure IPv6 Address. (No IPv4
+ * Mapped!)
+ */
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)addr;
+
+ if (addr->sa_len != sizeof(*sin6)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ return (EINVAL);
+ }
+ lport = sin6->sin6_port;
+
+ /*
+ * For LOOPBACK the prison_local_ip6() call
+ * will transmute the ipv6 address to the
+ * proper value.
+ */
+ if (p && (error = prison_local_ip6(p->td_ucred, &sin6->sin6_addr,
+ (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error);
+ return (error);
+ }
+ if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+ bindall = 0;
+ /* KAME hack: embed scopeid */
+ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ return (EINVAL);
+ }
+ }
+ /* this must be cleared for ifa_ifwithaddr() */
+ sin6->sin6_scope_id = 0;
+ break;
+ }
+#endif
+ default:
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EAFNOSUPPORT);
+ return (EAFNOSUPPORT);
+ }
+ }
+ SCTP_INP_INFO_WLOCK();
+ SCTP_INP_WLOCK(inp);
+ /* Setup a vrf_id to be the default for the non-bind-all case. */
+ vrf_id = inp->def_vrf_id;
+
+ /* increase our count due to the unlock we do */
+ SCTP_INP_INCR_REF(inp);
+ if (lport) {
+ /*
+		 * Did the caller specify a port? If so we must see if an ep
+ * already has this one bound.
+ */
+ /* got to be root to get at low ports */
+ if (ntohs(lport) < IPPORT_RESERVED) {
+ if (p && (error =
+ priv_check(p, PRIV_NETINET_RESERVEDPORT)
+ )) {
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ return (error);
+ }
+ }
+ if (p == NULL) {
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error);
+ return (error);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ if (bindall) {
+ vrf_id = inp->def_vrf_id;
+ inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id);
+ if (inp_tmp != NULL) {
+ /*
+				 * The lock guy returned; lower the count. Note
+				 * that we are not bound, so inp_tmp should
+ * NEVER be inp. And it is this inp
+ * (inp_tmp) that gets the reference bump,
+ * so we must lower it.
+ */
+ SCTP_INP_DECR_REF(inp_tmp);
+ /* unlock info */
+ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) &&
+ (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) {
+ /*
+ * Ok, must be one-2-one and
+ * allowing port re-use
+ */
+ port_reuse_active = 1;
+ goto continue_anyway;
+ }
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE);
+ return (EADDRINUSE);
+ }
+ } else {
+ inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id);
+ if (inp_tmp != NULL) {
+ /*
+				 * The lock guy returned; lower the count. Note
+				 * that we are not bound, so inp_tmp should
+ * NEVER be inp. And it is this inp
+ * (inp_tmp) that gets the reference bump,
+ * so we must lower it.
+ */
+ SCTP_INP_DECR_REF(inp_tmp);
+ /* unlock info */
+ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) &&
+ (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) {
+ /*
+ * Ok, must be one-2-one and
+ * allowing port re-use
+ */
+ port_reuse_active = 1;
+ goto continue_anyway;
+ }
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE);
+ return (EADDRINUSE);
+ }
+ }
+continue_anyway:
+ SCTP_INP_WLOCK(inp);
+ if (bindall) {
+			/* verify that the lport is not used by a singleton */
+ if ((port_reuse_active == 0) &&
+ (inp_tmp = sctp_isport_inuse(inp, lport, vrf_id))
+ ) {
+ /* Sorry someone already has this one bound */
+ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) &&
+ (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) {
+ port_reuse_active = 1;
+ } else {
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE);
+ return (EADDRINUSE);
+ }
+ }
+ }
+ } else {
+ uint16_t first, last, candidate;
+ uint16_t count;
+ int done;
+
+ if (ip_inp->inp_flags & INP_HIGHPORT) {
+ first = MODULE_GLOBAL(ipport_hifirstauto);
+ last = MODULE_GLOBAL(ipport_hilastauto);
+ } else if (ip_inp->inp_flags & INP_LOWPORT) {
+ if (p && (error =
+ priv_check(p, PRIV_NETINET_RESERVEDPORT)
+ )) {
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error);
+ return (error);
+ }
+ first = MODULE_GLOBAL(ipport_lowfirstauto);
+ last = MODULE_GLOBAL(ipport_lowlastauto);
+ } else {
+ first = MODULE_GLOBAL(ipport_firstauto);
+ last = MODULE_GLOBAL(ipport_lastauto);
+ }
+ if (first > last) {
+ uint16_t temp;
+
+ temp = first;
+ first = last;
+ last = temp;
+ }
+ count = last - first + 1; /* number of candidates */
+ candidate = first + sctp_select_initial_TSN(&inp->sctp_ep) % (count);
+
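+		/*
+		 * Worked example (assumed range, for illustration only):
+		 * with first = 49152 and last = 65535, count = 16384.  If
+		 * the pseudo-random value modulo count is 100, the scan
+		 * below starts at candidate = 49252 and walks forward,
+		 * wrapping back to first after last, until
+		 * sctp_isport_inuse() reports the port free or all count
+		 * candidates are exhausted (EADDRINUSE).
+		 */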
+ done = 0;
+ while (!done) {
+ if (sctp_isport_inuse(inp, htons(candidate), inp->def_vrf_id) == NULL) {
+ done = 1;
+ }
+ if (!done) {
+ if (--count == 0) {
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE);
+ return (EADDRINUSE);
+ }
+ if (candidate == last)
+ candidate = first;
+ else
+ candidate = candidate + 1;
+ }
+ }
+ lport = htons(candidate);
+ }
+ SCTP_INP_DECR_REF(inp);
+ if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE |
+ SCTP_PCB_FLAGS_SOCKET_ALLGONE)) {
+ /*
+ * this really should not happen. The guy did a non-blocking
+ * bind and then did a close at the same time.
+ */
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ return (EINVAL);
+ }
+	/* ok we look clear to give out this port, so let's set up the binding */
+ if (bindall) {
+ /* binding to all addresses, so just set in the proper flags */
+ inp->sctp_flags |= SCTP_PCB_FLAGS_BOUNDALL;
+ /* set the automatic addr changes from kernel flag */
+ if (SCTP_BASE_SYSCTL(sctp_auto_asconf) == 0) {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF);
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF);
+ } else {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF);
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_AUTO_ASCONF);
+ }
+ if (SCTP_BASE_SYSCTL(sctp_multiple_asconfs) == 0) {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS);
+ } else {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS);
+ }
+ /*
+ * set the automatic mobility_base from kernel flag (by
+ * micchie)
+ */
+ if (SCTP_BASE_SYSCTL(sctp_mobility_base) == 0) {
+ sctp_mobility_feature_off(inp, SCTP_MOBILITY_BASE);
+ sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED);
+ } else {
+ sctp_mobility_feature_on(inp, SCTP_MOBILITY_BASE);
+ sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED);
+ }
+ /*
+ * set the automatic mobility_fasthandoff from kernel flag
+ * (by micchie)
+ */
+ if (SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff) == 0) {
+ sctp_mobility_feature_off(inp, SCTP_MOBILITY_FASTHANDOFF);
+ sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED);
+ } else {
+ sctp_mobility_feature_on(inp, SCTP_MOBILITY_FASTHANDOFF);
+ sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED);
+ }
+ } else {
+ /*
+		 * bind specific: make sure the BOUNDALL flag is off and add a new
+ * address structure to the sctp_addr_list inside the ep
+ * structure.
+ *
+ * We will need to allocate one and insert it at the head. The
+ * socketopt call can just insert new addresses in there as
+ * well. It will also have to do the embed scope kame hack
+ * too (before adding).
+ */
+ struct sctp_ifa *ifa;
+ struct sockaddr_storage store_sa;
+
+ memset(&store_sa, 0, sizeof(store_sa));
+ if (addr->sa_family == AF_INET) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&store_sa;
+ memcpy(sin, addr, sizeof(struct sockaddr_in));
+ sin->sin_port = 0;
+ } else if (addr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&store_sa;
+ memcpy(sin6, addr, sizeof(struct sockaddr_in6));
+ sin6->sin6_port = 0;
+ }
+ /*
+		 * first find the interface with the bound address; we need
+		 * to zero out the port to find the address (yuck!). We can't
+		 * do this earlier since we need the port for sctp_pcb_findep().
+ */
+ if (sctp_ifap != NULL)
+ ifa = sctp_ifap;
+ else {
+ /*
+			 * Note: for BSD we always hit here; other O/S's will
+ * pass things in via the sctp_ifap argument
+ * (Panda).
+ */
+ ifa = sctp_find_ifa_by_addr((struct sockaddr *)&store_sa,
+ vrf_id, SCTP_ADDR_NOT_LOCKED);
+ }
+ if (ifa == NULL) {
+ /* Can't find an interface with that address */
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRNOTAVAIL);
+ return (EADDRNOTAVAIL);
+ }
+ if (addr->sa_family == AF_INET6) {
+ /* GAK, more FIXME IFA lock? */
+ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
+ /* Can't bind a non-existent addr. */
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ return (EINVAL);
+ }
+ }
+ /* we're not bound all */
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_BOUNDALL;
+ /* allow bindx() to send ASCONF's for binding changes */
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF);
+ /* clear automatic addr changes from kernel flag */
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF);
+
+ /* add this address to the endpoint list */
+ error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, 0);
+ if (error != 0) {
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ return (error);
+ }
+ inp->laddr_count++;
+ }
+ /* find the bucket */
+ if (port_reuse_active) {
+ /* Put it into tcp 1-2-1 hash */
+ head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashtcpmark))];
+ inp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL;
+ } else {
+ head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))];
+ }
+ /* put it in the bucket */
+ LIST_INSERT_HEAD(head, inp, sctp_hash);
+ SCTPDBG(SCTP_DEBUG_PCB1, "Main hash to bind at head:%p, bound port:%d - in tcp_pool=%d\n",
+ head, ntohs(lport), port_reuse_active);
+ /* set in the port */
+ inp->sctp_lport = lport;
+
+ /* turn off just the unbound flag */
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND;
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ return (0);
+}
+
+
+static void
+sctp_iterator_inp_being_freed(struct sctp_inpcb *inp)
+{
+ struct sctp_iterator *it, *nit;
+
+ /*
+	 * We enter with only the ITERATOR_LOCK in place and a write
+ * lock on the inp_info stuff.
+ */
+ it = sctp_it_ctl.cur_it;
+ if (it && (it->vn != curvnet)) {
+		/* It's not looking at our VNET */
+ return;
+ }
+ if (it && (it->inp == inp)) {
+ /*
+ * This is tricky and we hold the iterator lock, but when it
+ * returns and gets the lock (when we release it) the
+ * iterator will try to operate on inp. We need to stop that
+ * from happening. But of course the iterator has a
+ * reference on the stcb and inp. We can mark it and it will
+ * stop.
+ *
+		 * If it's a single iterator situation, we set the end iterator
+ * flag. Otherwise we set the iterator to go to the next
+ * inp.
+ *
+ */
+ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) {
+ sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT;
+ } else {
+ sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_INP;
+ }
+ }
+ /*
+ * Now go through and remove any single reference to our inp that
+	 * may still be pending on the list
+ */
+ SCTP_IPI_ITERATOR_WQ_LOCK();
+ it = TAILQ_FIRST(&sctp_it_ctl.iteratorhead);
+ while (it) {
+ nit = TAILQ_NEXT(it, sctp_nxt_itr);
+ if (it->vn != curvnet) {
+ it = nit;
+ continue;
+ }
+ if (it->inp == inp) {
+			/* This one points to me; is it inp specific? */
+ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) {
+ /* Remove and free this one */
+ TAILQ_REMOVE(&sctp_it_ctl.iteratorhead,
+ it, sctp_nxt_itr);
+ if (it->function_atend != NULL) {
+ (*it->function_atend) (it->pointer, it->val);
+ }
+ SCTP_FREE(it, SCTP_M_ITER);
+ } else {
+ it->inp = LIST_NEXT(it->inp, sctp_list);
+ if (it->inp) {
+ SCTP_INP_INCR_REF(it->inp);
+ }
+ }
+ /*
+			 * When it's put in, the refcnt is incremented, so
+			 * decrement it.
+ */
+ SCTP_INP_DECR_REF(inp);
+ }
+ it = nit;
+ }
+ SCTP_IPI_ITERATOR_WQ_UNLOCK();
+}
+
+/* release sctp_inpcb unbind the port */
+void
+sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from)
+{
+ /*
+	 * Here we free an endpoint. We must find it (if it is in the Hash
+ * table) and remove it from there. Then we must also find it in the
+ * overall list and remove it from there. After all removals are
+	 * complete, any timer has to be stopped. Then start the actual
+ * freeing. a) Any local lists. b) Any associations. c) The hash of
+ * all associations. d) finally the ep itself.
+ */
+ struct sctp_pcb *m;
+ struct sctp_tcb *asoc, *nasoc;
+ struct sctp_laddr *laddr, *nladdr;
+ struct inpcb *ip_pcb;
+ struct socket *so;
+ int being_refed = 0;
+ struct sctp_queued_to_read *sq;
+
+
+ int cnt;
+ sctp_sharedkey_t *shared_key;
+
+
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 0);
+#endif
+ SCTP_ITERATOR_LOCK();
+ /* mark any iterators on the list or being processed */
+ sctp_iterator_inp_being_freed(inp);
+ SCTP_ITERATOR_UNLOCK();
+ so = inp->sctp_socket;
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
+ /* been here before.. eeks.. get out of here */
+ SCTP_PRINTF("This conflict in free SHOULD not be happening! from %d, imm %d\n", from, immediate);
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 1);
+#endif
+ return;
+ }
+ SCTP_ASOC_CREATE_LOCK(inp);
+ SCTP_INP_INFO_WLOCK();
+
+ SCTP_INP_WLOCK(inp);
+ if (from == SCTP_CALLED_AFTER_CMPSET_OFCLOSE) {
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_CLOSE_IP;
+ /* socket is gone, so no more wakeups allowed */
+ inp->sctp_flags |= SCTP_PCB_FLAGS_DONT_WAKE;
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT;
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT;
+
+ }
+ /* First time through we have the socket lock, after that no more. */
+ sctp_timer_stop(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL,
+ SCTP_FROM_SCTP_PCB + SCTP_LOC_1);
+
+ if (inp->control) {
+ sctp_m_freem(inp->control);
+ inp->control = NULL;
+ }
+ if (inp->pkt) {
+ sctp_m_freem(inp->pkt);
+ inp->pkt = NULL;
+ }
+ m = &inp->sctp_ep;
+ ip_pcb = &inp->ip_inp.inp; /* we could just cast the main pointer
+ * here but I will be nice :> (i.e.
+ * ip_pcb = ep;) */
+ if (immediate == SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE) {
+ int cnt_in_sd;
+
+ cnt_in_sd = 0;
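+ /*
+ * Walk every association on this endpoint: let those that still
+ * need to drain or shut down proceed (counting them in cnt_in_sd)
+ * and free the rest now.
+ */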
+ for ((asoc = LIST_FIRST(&inp->sctp_asoc_list)); asoc != NULL;
+ asoc = nasoc) {
+ SCTP_TCB_LOCK(asoc);
+ nasoc = LIST_NEXT(asoc, sctp_tcblist);
+ if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ /* Skip guys being freed */
+ cnt_in_sd++;
+ if (asoc->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) {
+ /*
+ * Special case - we did not start a
+ * kill timer on the asoc because it
+ * was not closed. So go ahead and
+ * start it now.
+ */
+ asoc->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE;
+ sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, asoc, NULL);
+ }
+ SCTP_TCB_UNLOCK(asoc);
+ continue;
+ }
+ if (((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_COOKIE_WAIT) ||
+ (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_COOKIE_ECHOED)) &&
+ (asoc->asoc.total_output_queue_size == 0)) {
+ /*
+ * If we have data in the queue, we don't want
+ * to just free it, since the app may have done
+ * send()/close or connect/send/close and
+ * wants the data to get across first.
+ */
+ /* Just abandon things in the front states */
+ if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_NOFORCE,
+ SCTP_FROM_SCTP_PCB + SCTP_LOC_2) == 0) {
+ cnt_in_sd++;
+ }
+ continue;
+ }
+ /* Disconnect the socket please */
+ asoc->sctp_socket = NULL;
+ asoc->asoc.state |= SCTP_STATE_CLOSED_SOCKET;
+ if ((asoc->asoc.size_on_reasm_queue > 0) ||
+ (asoc->asoc.control_pdapi) ||
+ (asoc->asoc.size_on_all_streams > 0) ||
+ (so && (so->so_rcv.sb_cc > 0))
+ ) {
+ /* Left with Data unread */
+ struct mbuf *op_err;
+
+ op_err = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (op_err) {
+ /* Fill in the user initiated abort */
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(op_err) =
+ sizeof(struct sctp_paramhdr) + sizeof(uint32_t);
+ ph = mtod(op_err,
+ struct sctp_paramhdr *);
+ ph->param_type = htons(
+ SCTP_CAUSE_USER_INITIATED_ABT);
+ ph->param_length = htons(SCTP_BUF_LEN(op_err));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_PCB + SCTP_LOC_3);
+ }
+ asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_3;
+#if defined(SCTP_PANIC_ON_ABORT)
+ panic("inpcb_free does an abort");
+#endif
+ sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED);
+ SCTP_STAT_INCR_COUNTER32(sctps_aborted);
+ if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ if (sctp_free_assoc(inp, asoc,
+ SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_4) == 0) {
+ cnt_in_sd++;
+ }
+ continue;
+ } else if (TAILQ_EMPTY(&asoc->asoc.send_queue) &&
+ TAILQ_EMPTY(&asoc->asoc.sent_queue) &&
+ (asoc->asoc.stream_queue_cnt == 0)
+ ) {
+ if (asoc->asoc.locked_on_sending) {
+ goto abort_anyway;
+ }
+ if ((SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_SHUTDOWN_SENT) &&
+ (SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
+ /*
+ * there is nothing queued to send,
+ * so we send a SHUTDOWN
+ */
+ sctp_send_shutdown(asoc, asoc->asoc.primary_destination);
+ if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ SCTP_SET_STATE(&asoc->asoc, SCTP_STATE_SHUTDOWN_SENT);
+ SCTP_CLEAR_SUBSTATE(&asoc->asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, asoc->sctp_ep, asoc,
+ asoc->asoc.primary_destination);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc,
+ asoc->asoc.primary_destination);
+ sctp_chunk_output(inp, asoc, SCTP_OUTPUT_FROM_SHUT_TMR, SCTP_SO_LOCKED);
+ }
+ } else {
+ /* mark into shutdown pending */
+ struct sctp_stream_queue_pending *sp;
+
+ asoc->asoc.state |= SCTP_STATE_SHUTDOWN_PENDING;
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc,
+ asoc->asoc.primary_destination);
+ if (asoc->asoc.locked_on_sending) {
+ sp = TAILQ_LAST(&((asoc->asoc.locked_on_sending)->outqueue),
+ sctp_streamhead);
+ if (sp == NULL) {
+ SCTP_PRINTF("Error, sp is NULL, locked on sending is %p strm:%d\n",
+ asoc->asoc.locked_on_sending,
+ asoc->asoc.locked_on_sending->stream_no);
+ } else {
+ if ((sp->length == 0) && (sp->msg_is_complete == 0))
+ asoc->asoc.state |= SCTP_STATE_PARTIAL_MSG_LEFT;
+ }
+ }
+ if (TAILQ_EMPTY(&asoc->asoc.send_queue) &&
+ TAILQ_EMPTY(&asoc->asoc.sent_queue) &&
+ (asoc->asoc.state & SCTP_STATE_PARTIAL_MSG_LEFT)) {
+ struct mbuf *op_err;
+
+ abort_anyway:
+ op_err = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (op_err) {
+ /*
+ * Fill in the user
+ * initiated abort
+ */
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(op_err) =
+ (sizeof(struct sctp_paramhdr) +
+ sizeof(uint32_t));
+ ph = mtod(op_err,
+ struct sctp_paramhdr *);
+ ph->param_type = htons(
+ SCTP_CAUSE_USER_INITIATED_ABT);
+ ph->param_length = htons(SCTP_BUF_LEN(op_err));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_PCB + SCTP_LOC_5);
+ }
+ asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_5;
+#if defined(SCTP_PANIC_ON_ABORT)
+ panic("inpcb_free does an abort");
+#endif
+
+ sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED);
+ SCTP_STAT_INCR_COUNTER32(sctps_aborted);
+ if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ if (sctp_free_assoc(inp, asoc,
+ SCTP_PCBFREE_NOFORCE,
+ SCTP_FROM_SCTP_PCB + SCTP_LOC_6) == 0) {
+ cnt_in_sd++;
+ }
+ continue;
+ } else {
+ sctp_chunk_output(inp, asoc, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED);
+ }
+ }
+ cnt_in_sd++;
+ SCTP_TCB_UNLOCK(asoc);
+ }
+ /* now, are there any still left in our SHUTDOWN state? */
+ if (cnt_in_sd) {
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 2);
+#endif
+ inp->sctp_socket = NULL;
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_ASOC_CREATE_UNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ return;
+ }
+ }
+ inp->sctp_socket = NULL;
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) !=
+ SCTP_PCB_FLAGS_UNBOUND) {
+ /*
+ * ok, this guy has been bound. Its port is somewhere in
+ * the SCTP_BASE_INFO(hash table). Remove it!
+ */
+ LIST_REMOVE(inp, sctp_hash);
+ inp->sctp_flags |= SCTP_PCB_FLAGS_UNBOUND;
+ }
+ /*
+ * If there is a timer running to kill us, forget it, since it may
+ * be contending for the INP lock, which would cause us to die...
+ */
+ cnt = 0;
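+ /*
+ * Second pass: abort and forcefully free any associations that
+ * remain; those already marked for freeing are only counted.
+ */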
+ for ((asoc = LIST_FIRST(&inp->sctp_asoc_list)); asoc != NULL;
+ asoc = nasoc) {
+ SCTP_TCB_LOCK(asoc);
+ nasoc = LIST_NEXT(asoc, sctp_tcblist);
+ if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ if (asoc->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) {
+ asoc->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE;
+ sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, asoc, NULL);
+ }
+ cnt++;
+ SCTP_TCB_UNLOCK(asoc);
+ continue;
+ }
+ /* Free associations that are NOT killing us */
+ if ((SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_COOKIE_WAIT) &&
+ ((asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) {
+ struct mbuf *op_err;
+ uint32_t *ippp;
+
+ op_err = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (op_err) {
+ /* Fill in the user initiated abort */
+ struct sctp_paramhdr *ph;
+
+ SCTP_BUF_LEN(op_err) = (sizeof(struct sctp_paramhdr) +
+ sizeof(uint32_t));
+ ph = mtod(op_err, struct sctp_paramhdr *);
+ ph->param_type = htons(
+ SCTP_CAUSE_USER_INITIATED_ABT);
+ ph->param_length = htons(SCTP_BUF_LEN(op_err));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_PCB + SCTP_LOC_7);
+
+ }
+ asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_7;
+#if defined(SCTP_PANIC_ON_ABORT)
+ panic("inpcb_free does an abort");
+#endif
+ sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED);
+ SCTP_STAT_INCR_COUNTER32(sctps_aborted);
+ } else if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ cnt++;
+ SCTP_TCB_UNLOCK(asoc);
+ continue;
+ }
+ if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_8) == 0) {
+ cnt++;
+ }
+ }
+ if (cnt) {
+ /* Ok we have someone out there that will kill us */
+ (void)SCTP_OS_TIMER_STOP(&inp->sctp_ep.signature_change.timer);
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 3);
+#endif
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_ASOC_CREATE_UNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ return;
+ }
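+ /*
+ * If anyone still references the endpoint, is contending for one of
+ * its locks, or a close is still pending, defer the final free to
+ * the INPKILL timer.
+ */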
+#ifndef __rtems__
+ if (SCTP_INP_LOCK_CONTENDED(inp))
+ being_refed++;
+ if (SCTP_INP_READ_CONTENDED(inp))
+ being_refed++;
+ if (SCTP_ASOC_CREATE_LOCK_CONTENDED(inp))
+ being_refed++;
+#endif
+
+ if ((inp->refcount) ||
+ (being_refed) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_CLOSE_IP)) {
+ (void)SCTP_OS_TIMER_STOP(&inp->sctp_ep.signature_change.timer);
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 4);
+#endif
+ sctp_timer_start(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL);
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_ASOC_CREATE_UNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ return;
+ }
+ inp->sctp_ep.signature_change.type = 0;
+ inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_ALLGONE;
+ /*
+ * Remove it from the list .. last thing we need a lock for.
+ */
+ LIST_REMOVE(inp, sctp_list);
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_ASOC_CREATE_UNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ /*
+ * Now we release all locks. This INP cannot be found anymore,
+ * except possibly by the kill timer that might be running, so we
+ * call the drain function here. It should hit the case where it
+ * sees the ACTIVE flag cleared and exit, freeing us to proceed and
+ * destroy everything.
+ */
+ if (from != SCTP_CALLED_FROM_INPKILL_TIMER) {
+ (void)SCTP_OS_TIMER_STOP_DRAIN(&inp->sctp_ep.signature_change.timer);
+ } else {
+ /* Probably un-needed */
+ (void)SCTP_OS_TIMER_STOP(&inp->sctp_ep.signature_change.timer);
+ }
+
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 5);
+#endif
+
+
+ if ((inp->sctp_asocidhash) != NULL) {
+ SCTP_HASH_FREE(inp->sctp_asocidhash, inp->hashasocidmark);
+ inp->sctp_asocidhash = NULL;
+ }
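+ /* Drain whatever is still sitting on the endpoint's read queue. */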
+ /* sa_ignore FREED_MEMORY */
+ while ((sq = TAILQ_FIRST(&inp->read_queue)) != NULL) {
+ /* It is only abandoned if it had data left */
+ if (sq->length)
+ SCTP_STAT_INCR(sctps_left_abandon);
+
+ TAILQ_REMOVE(&inp->read_queue, sq, next);
+ sctp_free_remote_addr(sq->whoFrom);
+ if (so)
+ so->so_rcv.sb_cc -= sq->length;
+ if (sq->data) {
+ sctp_m_freem(sq->data);
+ sq->data = NULL;
+ }
+ /*
+ * no need to free the net count, since at this point all
+ * assoc's are gone.
+ */
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), sq);
+ SCTP_DECR_READQ_COUNT();
+ }
+ /* Now the sctp_pcb things */
+ /*
+ * Free each asoc if it is not already closed/freed. We can't use
+ * the macro here since le_next will get freed as part of the
+ * sctp_free_assoc() call.
+ */
+ cnt = 0;
+ if (so) {
+#ifdef IPSEC
+ ipsec_delete_pcbpolicy(ip_pcb);
+#endif /* IPSEC */
+
+ /* Unlocks not needed since the socket is gone now */
+ }
+ if (ip_pcb->inp_options) {
+ (void)sctp_m_free(ip_pcb->inp_options);
+ ip_pcb->inp_options = 0;
+ }
+ if (ip_pcb->inp_moptions) {
+ inp_freemoptions(ip_pcb->inp_moptions);
+ ip_pcb->inp_moptions = 0;
+ }
+#ifdef INET6
+ if (ip_pcb->inp_vflag & INP_IPV6) {
+ struct in6pcb *in6p;
+
+ in6p = (struct in6pcb *)inp;
+ ip6_freepcbopts(in6p->in6p_outputopts);
+ }
+#endif /* INET6 */
+ ip_pcb->inp_vflag = 0;
+ /* free up authentication fields */
+ if (inp->sctp_ep.local_auth_chunks != NULL)
+ sctp_free_chunklist(inp->sctp_ep.local_auth_chunks);
+ if (inp->sctp_ep.local_hmacs != NULL)
+ sctp_free_hmaclist(inp->sctp_ep.local_hmacs);
+
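+ /* Release the endpoint's shared authentication keys. */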
+ shared_key = LIST_FIRST(&inp->sctp_ep.shared_keys);
+ while (shared_key) {
+ LIST_REMOVE(shared_key, next);
+ sctp_free_sharedkey(shared_key);
+ /* sa_ignore FREED_MEMORY */
+ shared_key = LIST_FIRST(&inp->sctp_ep.shared_keys);
+ }
+
+ /*
+ * If we have an address list, the following frees the list of
+ * ifaddrs that are set into this ep. Again, macro limitations here,
+ * since using LIST_FOREACH could be a bad idea.
+ */
+ for ((laddr = LIST_FIRST(&inp->sctp_addr_list)); laddr != NULL;
+ laddr = nladdr) {
+ nladdr = LIST_NEXT(laddr, sctp_nxt_addr);
+ sctp_remove_laddr(laddr);
+ }
+
+#ifdef SCTP_TRACK_FREED_ASOCS
+ /* TEMP CODE */
+ for ((asoc = LIST_FIRST(&inp->sctp_asoc_free_list)); asoc != NULL;
+ asoc = nasoc) {
+ nasoc = LIST_NEXT(asoc, sctp_tcblist);
+ LIST_REMOVE(asoc, sctp_tcblist);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), asoc);
+ SCTP_DECR_ASOC_COUNT();
+ }
+ /* *** END TEMP CODE *** */
+#endif
+ /* Now let's see about freeing the EP hash table. */
+ if (inp->sctp_tcbhash != NULL) {
+ SCTP_HASH_FREE(inp->sctp_tcbhash, inp->sctp_hashmark);
+ inp->sctp_tcbhash = NULL;
+ }
+ /* Now we must put the ep memory back into the zone pool */
+ INP_LOCK_DESTROY(&inp->ip_inp.inp);
+ SCTP_INP_LOCK_DESTROY(inp);
+ SCTP_INP_READ_DESTROY(inp);
+ SCTP_ASOC_CREATE_LOCK_DESTROY(inp);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp);
+ SCTP_DECR_EP_COUNT();
+}
+
+
+struct sctp_nets *
+sctp_findnet(struct sctp_tcb *stcb, struct sockaddr *addr)
+{
+ struct sctp_nets *net;
+
+ /* locate the address */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if (sctp_cmpaddr(addr, (struct sockaddr *)&net->ro._l_addr))
+ return (net);
+ }
+ return (NULL);
+}
+
+
+int
+sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id)
+{
+ struct sctp_ifa *sctp_ifa;
+
+ sctp_ifa = sctp_find_ifa_by_addr(addr, vrf_id, SCTP_ADDR_NOT_LOCKED);
+ if (sctp_ifa) {
+ return (1);
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Adds a remote endpoint address, done with the INIT/INIT-ACK as well as
+ * when an ASCONF arrives that adds it. It will also initialize all the
+ * cwnd stats.
+ */
+int
+sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr,
+ int set_scope, int from)
+{
+ /*
+ * The following is redundant to the same lines in the
+ * sctp_aloc_assoc() but is needed since other callers use this
+ * add-address function as well.
+ */
+ struct sctp_nets *net, *netfirst;
+ int addr_inscope;
+
+ SCTPDBG(SCTP_DEBUG_PCB1, "Adding an address (from:%d) to the peer: ",
+ from);
+ SCTPDBG_ADDR(SCTP_DEBUG_PCB1, newaddr);
+
+ netfirst = sctp_findnet(stcb, newaddr);
+ if (netfirst) {
+ /*
+ * Lie and return ok; we don't want to make the association
+ * go away because of this behavior. It will happen in the TCP
+ * model in a connected socket. It does not reach the hash
+ * table until after the association is built so it can't be
+ * found. Mark as reachable, since the initial creation will
+ * have been cleared and the NOT_IN_ASSOC flag will have
+ * been added... and we don't want to end up removing it
+ * back out.
+ */
+ if (netfirst->dest_state & SCTP_ADDR_UNCONFIRMED) {
+ netfirst->dest_state = (SCTP_ADDR_REACHABLE |
+ SCTP_ADDR_UNCONFIRMED);
+ } else {
+ netfirst->dest_state = SCTP_ADDR_REACHABLE;
+ }
+
+ return (0);
+ }
+ addr_inscope = 1;
+ if (newaddr->sa_family == AF_INET) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)newaddr;
+ if (sin->sin_addr.s_addr == 0) {
+ /* Invalid address */
+ return (-1);
+ }
+ /* zero out the bzero area */
+ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+
+ /* assure len is set */
+ sin->sin_len = sizeof(struct sockaddr_in);
+ if (set_scope) {
+#ifdef SCTP_DONT_DO_PRIVADDR_SCOPE
+ stcb->asoc.ipv4_local_scope = 1;
+#else
+ if (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) {
+ stcb->asoc.ipv4_local_scope = 1;
+ }
+#endif /* SCTP_DONT_DO_PRIVADDR_SCOPE */
+ } else {
+ /* Validate the address is in scope */
+ if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) &&
+ (stcb->asoc.ipv4_local_scope == 0)) {
+ addr_inscope = 0;
+ }
+ }
+#ifdef INET6
+ } else if (newaddr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)newaddr;
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+ /* Invalid address */
+ return (-1);
+ }
+ /* assure len is set */
+ sin6->sin6_len = sizeof(struct sockaddr_in6);
+ if (set_scope) {
+ if (sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id)) {
+ stcb->asoc.loopback_scope = 1;
+ stcb->asoc.local_scope = 0;
+ stcb->asoc.ipv4_local_scope = 1;
+ stcb->asoc.site_scope = 1;
+ } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+ /*
+ * If the new destination is a LINK_LOCAL we
+ * must have common site scope. Don't set
+ * the local scope since we may not share
+ * all links, only loopback can do this.
+ * Links on the local network would also be
+ * on our private network for v4 too.
+ */
+ stcb->asoc.ipv4_local_scope = 1;
+ stcb->asoc.site_scope = 1;
+ } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) {
+ /*
+ * If the new destination is SITE_LOCAL then
+ * we must have site scope in common.
+ */
+ stcb->asoc.site_scope = 1;
+ }
+ } else {
+ /* Validate the address is in scope */
+ if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr) &&
+ (stcb->asoc.loopback_scope == 0)) {
+ addr_inscope = 0;
+ } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) &&
+ (stcb->asoc.local_scope == 0)) {
+ addr_inscope = 0;
+ } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) &&
+ (stcb->asoc.site_scope == 0)) {
+ addr_inscope = 0;
+ }
+ }
+#endif
+ } else {
+ /* not supported family type */
+ return (-1);
+ }
+ net = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_net), struct sctp_nets);
+ if (net == NULL) {
+ return (-1);
+ }
+ SCTP_INCR_RADDR_COUNT();
+ bzero(net, sizeof(*net));
+ (void)SCTP_GETTIME_TIMEVAL(&net->start_time);
+ memcpy(&net->ro._l_addr, newaddr, newaddr->sa_len);
+ if (newaddr->sa_family == AF_INET) {
+ ((struct sockaddr_in *)&net->ro._l_addr)->sin_port = stcb->rport;
+ } else if (newaddr->sa_family == AF_INET6) {
+ ((struct sockaddr_in6 *)&net->ro._l_addr)->sin6_port = stcb->rport;
+ }
+ net->addr_is_local = sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id);
+ if (net->addr_is_local && ((set_scope || (from == SCTP_ADDR_IS_CONFIRMED)))) {
+ stcb->asoc.loopback_scope = 1;
+ stcb->asoc.ipv4_local_scope = 1;
+ stcb->asoc.local_scope = 0;
+ stcb->asoc.site_scope = 1;
+ addr_inscope = 1;
+ }
+ net->failure_threshold = stcb->asoc.def_net_failure;
+ if (addr_inscope == 0) {
+ net->dest_state = (SCTP_ADDR_REACHABLE |
+ SCTP_ADDR_OUT_OF_SCOPE);
+ } else {
+ if (from == SCTP_ADDR_IS_CONFIRMED)
+ /* SCTP_ADDR_IS_CONFIRMED is passed by connect_x */
+ net->dest_state = SCTP_ADDR_REACHABLE;
+ else
+ net->dest_state = SCTP_ADDR_REACHABLE |
+ SCTP_ADDR_UNCONFIRMED;
+ }
+ /*
+ * We set this to 0; the timer code knows that this means it is an
+ * initial value.
+ */
+ net->RTO = 0;
+ net->RTO_measured = 0;
+ stcb->asoc.numnets++;
+ *(&net->ref_count) = 1;
+ net->tos_flowlabel = 0;
+ if (SCTP_BASE_SYSCTL(sctp_udp_tunneling_for_client_enable)) {
+ net->port = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port));
+ } else {
+ net->port = 0;
+ }
+#ifdef INET
+ if (newaddr->sa_family == AF_INET)
+ net->tos_flowlabel = stcb->asoc.default_tos;
+#endif
+#ifdef INET6
+ if (newaddr->sa_family == AF_INET6)
+ net->tos_flowlabel = stcb->asoc.default_flowlabel;
+#endif
+ /* Init the timer structure */
+ SCTP_OS_TIMER_INIT(&net->rxt_timer.timer);
+ SCTP_OS_TIMER_INIT(&net->fr_timer.timer);
+ SCTP_OS_TIMER_INIT(&net->pmtu_timer.timer);
+
+ /* Now generate a route for this guy */
+#ifdef INET6
+ /* KAME hack: embed scopeid */
+ if (newaddr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
+ (void)sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone));
+ sin6->sin6_scope_id = 0;
+ }
+#endif
+ SCTP_RTALLOC((sctp_route_t *) & net->ro, stcb->asoc.vrf_id);
+
+ if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro)) {
+ /* Get source address */
+ net->ro._s_addr = sctp_source_address_selection(stcb->sctp_ep,
+ stcb,
+ (sctp_route_t *) & net->ro,
+ net,
+ 0,
+ stcb->asoc.vrf_id);
+ /* Now get the interface MTU */
+ if (net->ro._s_addr && net->ro._s_addr->ifn_p) {
+ net->mtu = SCTP_GATHER_MTU_FROM_INTFC(net->ro._s_addr->ifn_p);
+ } else {
+ net->mtu = 0;
+ }
+ if (net->mtu == 0) {
+ /* Huh ?? */
+ net->mtu = SCTP_DEFAULT_MTU;
+ } else {
+ uint32_t rmtu;
+
+ rmtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_rt);
+ if (rmtu == 0) {
+ /*
+ * Start things off to match the MTU of
+ * the interface.
+ */
+ SCTP_SET_MTU_OF_ROUTE(&net->ro._l_addr.sa,
+ net->ro.ro_rt, net->mtu);
+ } else {
+ /*
+ * We take the route MTU over the interface MTU,
+ * since the route may be leading out the
+ * loopback or a different interface.
+ */
+ net->mtu = rmtu;
+ }
+ }
+ if (from == SCTP_ALLOC_ASOC) {
+ stcb->asoc.smallest_mtu = net->mtu;
+ }
+ } else {
+ net->mtu = stcb->asoc.smallest_mtu;
+ }
+#ifdef INET6
+ if (newaddr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
+ (void)sa6_recoverscope(sin6);
+ }
+#endif
+ if (net->port) {
+ net->mtu -= sizeof(struct udphdr);
+ }
+ if (stcb->asoc.smallest_mtu > net->mtu) {
+ stcb->asoc.smallest_mtu = net->mtu;
+ }
+ /* JRS - Use the congestion control given in the CC module */
+ stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net);
+
+ /*
+ * CMT: CUC algo - set find_pseudo_cumack to TRUE (1) at beginning
+ * of assoc (2005/06/27, iyengar@cis.udel.edu)
+ */
+ net->find_pseudo_cumack = 1;
+ net->find_rtx_pseudo_cumack = 1;
+ net->src_addr_selected = 0;
+ netfirst = TAILQ_FIRST(&stcb->asoc.nets);
+ if (net->ro.ro_rt == NULL) {
+ /* Since we have no route put it at the back */
+ TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next);
+ } else if (netfirst == NULL) {
+ /* We are the first one in the pool. */
+ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next);
+ } else if (netfirst->ro.ro_rt == NULL) {
+ /*
+ * First one has NO route. Place this one ahead of the first
+ * one.
+ */
+ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next);
+ } else if (net->ro.ro_rt->rt_ifp != netfirst->ro.ro_rt->rt_ifp) {
+ /*
+ * This one has a different interface than the one at the
+ * top of the list. Place it ahead.
+ */
+ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next);
+ } else {
+ /*
+ * Ok, we have the same interface as the first one. Move
+ * forward until we find either a) one with a NULL route, and
+ * insert ahead of that; b) one with a different ifp, and insert
+ * after that; or c) the end of the list, and insert at the tail.
+ */
+ struct sctp_nets *netlook;
+
+ do {
+ netlook = TAILQ_NEXT(netfirst, sctp_next);
+ if (netlook == NULL) {
+ /* End of the list */
+ TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next);
+ break;
+ } else if (netlook->ro.ro_rt == NULL) {
+ /* next one has NO route */
+ TAILQ_INSERT_BEFORE(netfirst, net, sctp_next);
+ break;
+ } else if (netlook->ro.ro_rt->rt_ifp != net->ro.ro_rt->rt_ifp) {
+ TAILQ_INSERT_AFTER(&stcb->asoc.nets, netlook,
+ net, sctp_next);
+ break;
+ }
+ /* Shift forward */
+ netfirst = netlook;
+ } while (netlook != NULL);
+ }
+
+ /* got to have a primary set */
+ if (stcb->asoc.primary_destination == 0) {
+ stcb->asoc.primary_destination = net;
+ } else if ((stcb->asoc.primary_destination->ro.ro_rt == NULL) &&
+ (net->ro.ro_rt) &&
+ ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0)) {
+ /* No route to the current primary; adopt a new primary */
+ stcb->asoc.primary_destination = net;
+ }
+ sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, stcb->sctp_ep, stcb,
+ net);
+ /* Validate primary is first */
+ net = TAILQ_FIRST(&stcb->asoc.nets);
+ if ((net != stcb->asoc.primary_destination) &&
+ (stcb->asoc.primary_destination)) {
+ /*
+ * The first one on the list is NOT the primary. sctp_cmpaddr()
+ * is much more efficient if the primary is first on the
+ * list, so make it so.
+ */
+ TAILQ_REMOVE(&stcb->asoc.nets,
+ stcb->asoc.primary_destination, sctp_next);
+ TAILQ_INSERT_HEAD(&stcb->asoc.nets,
+ stcb->asoc.primary_destination, sctp_next);
+ }
+ return (0);
+}
+
+
+static uint32_t
+sctp_aloc_a_assoc_id(struct sctp_inpcb *inp, struct sctp_tcb *stcb)
+{
+ uint32_t id;
+ struct sctpasochead *head;
+ struct sctp_tcb *lstcb;
+
+ SCTP_INP_WLOCK(inp);
+try_again:
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
+ /* TSNH */
+ SCTP_INP_WUNLOCK(inp);
+ return (0);
+ }
+ /*
+ * We don't allow the assoc id to be 0; otherwise, if the id
+ * were to wrap, we would have issues with some socket options.
+ */
+ if (inp->sctp_associd_counter == 0) {
+ inp->sctp_associd_counter++;
+ }
+ id = inp->sctp_associd_counter;
+ inp->sctp_associd_counter++;
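+ /* If the candidate id is already in use (possible after a wrap), try again. */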
+ lstcb = sctp_findasoc_ep_asocid_locked(inp, (sctp_assoc_t) id, 0);
+ if (lstcb) {
+ goto try_again;
+ }
+ head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)];
+ LIST_INSERT_HEAD(head, stcb, sctp_tcbasocidhash);
+ stcb->asoc.in_asocid_hash = 1;
+ SCTP_INP_WUNLOCK(inp);
+ return id;
+}
+
+/*
+ * Allocate an association and add it to the endpoint. The caller must be
+ * careful to add all additional addresses right away, once they are known,
+ * or else the assoc may experience a blackout scenario.
+ */
+struct sctp_tcb *
+sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr,
+ int *error, uint32_t override_tag, uint32_t vrf_id,
+ struct thread *p
+)
+{
+ /* note the p argument is only valid in unbound sockets */
+
+ struct sctp_tcb *stcb;
+ struct sctp_association *asoc;
+ struct sctpasochead *head;
+ uint16_t rport;
+ int err;
+
+ /*
+ * Assumption made here: Caller has done a
+ * sctp_findassociation_ep_addr(ep, addr's); to make sure the
+ * address does not exist already.
+ */
+ if (SCTP_BASE_INFO(ipi_count_asoc) >= SCTP_MAX_NUM_OF_ASOC) {
+ /* Hit max assoc, sorry no more */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS);
+ *error = ENOBUFS;
+ return (NULL);
+ }
+ if (firstaddr == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ *error = EINVAL;
+ return (NULL);
+ }
+ SCTP_INP_RLOCK(inp);
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) &&
+ ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED))) {
+ /*
+ * If it is in the TCP pool, it is NOT allowed to create an
+ * association. The parent listener (or the one-to-many
+ * socket) needs to call sctp_aloc_assoc; if a peeled-off or
+ * connected one does this, it is an error.
+ */
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ *error = EINVAL;
+ return (NULL);
+ }
+ SCTPDBG(SCTP_DEBUG_PCB3, "Allocate an association for peer:");
+#ifdef SCTP_DEBUG
+ if (firstaddr) {
+ SCTPDBG_ADDR(SCTP_DEBUG_PCB3, firstaddr);
+ SCTPDBG(SCTP_DEBUG_PCB3, "Port:%d\n",
+ ntohs(((struct sockaddr_in *)firstaddr)->sin_port));
+ } else {
+ SCTPDBG(SCTP_DEBUG_PCB3, "None\n");
+ }
+#endif /* SCTP_DEBUG */
+ if (firstaddr->sa_family == AF_INET) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)firstaddr;
+ if ((sin->sin_port == 0) || (sin->sin_addr.s_addr == 0)) {
+ /* Invalid address */
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ *error = EINVAL;
+ return (NULL);
+ }
+ rport = sin->sin_port;
+ } else if (firstaddr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)firstaddr;
+ if ((sin6->sin6_port == 0) ||
+ (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))) {
+ /* Invalid address */
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ *error = EINVAL;
+ return (NULL);
+ }
+ rport = sin6->sin6_port;
+ } else {
+ /* not supported family type */
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ *error = EINVAL;
+ return (NULL);
+ }
+ SCTP_INP_RUNLOCK(inp);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) {
+ /*
+ * If you have not performed a bind, then we need to do the
+ * ephemeral bind for you.
+ */
+ if ((err = sctp_inpcb_bind(inp->sctp_socket,
+ (struct sockaddr *)NULL,
+ (struct sctp_ifa *)NULL,
+ p
+ ))) {
+ /* bind error, probably perm */
+ *error = err;
+ return (NULL);
+ }
+ }
+ stcb = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_asoc), struct sctp_tcb);
+ if (stcb == NULL) {
+ /* out of memory? */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM);
+ *error = ENOMEM;
+ return (NULL);
+ }
+ SCTP_INCR_ASOC_COUNT();
+
+ bzero(stcb, sizeof(*stcb));
+ asoc = &stcb->asoc;
+
+ asoc->assoc_id = sctp_aloc_a_assoc_id(inp, stcb);
+ SCTP_TCB_LOCK_INIT(stcb);
+ SCTP_TCB_SEND_LOCK_INIT(stcb);
+ stcb->rport = rport;
+ /* set up back pointers */
+ stcb->sctp_ep = inp;
+ stcb->sctp_socket = inp->sctp_socket;
+ if ((err = sctp_init_asoc(inp, stcb, override_tag, vrf_id))) {
+ /* failed */
+ SCTP_TCB_LOCK_DESTROY(stcb);
+ SCTP_TCB_SEND_LOCK_DESTROY(stcb);
+ LIST_REMOVE(stcb, sctp_tcbasocidhash);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb);
+ SCTP_DECR_ASOC_COUNT();
+ *error = err;
+ return (NULL);
+ }
+ /* and the port */
+ SCTP_INP_INFO_WLOCK();
+ SCTP_INP_WLOCK(inp);
+ if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) {
+ /* inpcb freed while alloc going on */
+ SCTP_TCB_LOCK_DESTROY(stcb);
+ SCTP_TCB_SEND_LOCK_DESTROY(stcb);
+ LIST_REMOVE(stcb, sctp_tcbasocidhash);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb);
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_DECR_ASOC_COUNT();
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ *error = EINVAL;
+ return (NULL);
+ }
+ SCTP_TCB_LOCK(stcb);
+
+ /* now that my_vtag is set, add it to the hash */
+ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))];
+ /* put it in the bucket in the vtag hash of assoc's for the system */
+ LIST_INSERT_HEAD(head, stcb, sctp_asocs);
+ SCTP_INP_INFO_WUNLOCK();
+
+ if ((err = sctp_add_remote_addr(stcb, firstaddr, SCTP_DO_SETSCOPE, SCTP_ALLOC_ASOC))) {
+ /* failure.. memory error? */
+ if (asoc->strmout) {
+ SCTP_FREE(asoc->strmout, SCTP_M_STRMO);
+ asoc->strmout = NULL;
+ }
+ if (asoc->mapping_array) {
+ SCTP_FREE(asoc->mapping_array, SCTP_M_MAP);
+ asoc->mapping_array = NULL;
+ }
+ if (asoc->nr_mapping_array) {
+ SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP);
+ asoc->nr_mapping_array = NULL;
+ }
+ SCTP_DECR_ASOC_COUNT();
+ SCTP_TCB_LOCK_DESTROY(stcb);
+ SCTP_TCB_SEND_LOCK_DESTROY(stcb);
+ LIST_REMOVE(stcb, sctp_tcbasocidhash);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb);
+ SCTP_INP_WUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS);
+ *error = ENOBUFS;
+ return (NULL);
+ }
+ /* Init all the timers */
+ SCTP_OS_TIMER_INIT(&asoc->hb_timer.timer);
+ SCTP_OS_TIMER_INIT(&asoc->dack_timer.timer);
+ SCTP_OS_TIMER_INIT(&asoc->strreset_timer.timer);
+ SCTP_OS_TIMER_INIT(&asoc->asconf_timer.timer);
+ SCTP_OS_TIMER_INIT(&asoc->shut_guard_timer.timer);
+ SCTP_OS_TIMER_INIT(&asoc->autoclose_timer.timer);
+ SCTP_OS_TIMER_INIT(&asoc->delayed_event_timer.timer);
+ SCTP_OS_TIMER_INIT(&asoc->delete_prim_timer.timer);
+
+ LIST_INSERT_HEAD(&inp->sctp_asoc_list, stcb, sctp_tcblist);
+ /* now file the port under the hash as well */
+ if (inp->sctp_tcbhash != NULL) {
+ head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(stcb->rport,
+ inp->sctp_hashmark)];
+ LIST_INSERT_HEAD(head, stcb, sctp_tcbhash);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ SCTPDBG(SCTP_DEBUG_PCB1, "Association %p now allocated\n", stcb);
+ return (stcb);
+}
+
+
+void
+sctp_remove_net(struct sctp_tcb *stcb, struct sctp_nets *net)
+{
+ struct sctp_association *asoc;
+
+ asoc = &stcb->asoc;
+ asoc->numnets--;
+ TAILQ_REMOVE(&asoc->nets, net, sctp_next);
+ if (net == asoc->primary_destination) {
+ /* Reset primary */
+ struct sctp_nets *lnet;
+
+ lnet = TAILQ_FIRST(&asoc->nets);
+ /*
+ * Mobility adaptation: ideally, if the deleted destination is
+ * the primary, it becomes a fast retransmission trigger by
+ * the subsequent SET PRIMARY. (by micchie)
+ */
+ if (sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_BASE) ||
+ sctp_is_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_FASTHANDOFF)) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: primary dst is deleting\n");
+ if (asoc->deleted_primary != NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: deleted primary may be already stored\n");
+ goto out;
+ }
+ asoc->deleted_primary = net;
+ atomic_add_int(&net->ref_count, 1);
+ memset(&net->lastsa, 0, sizeof(net->lastsa));
+ memset(&net->lastsv, 0, sizeof(net->lastsv));
+ sctp_mobility_feature_on(stcb->sctp_ep,
+ SCTP_MOBILITY_PRIM_DELETED);
+ sctp_timer_start(SCTP_TIMER_TYPE_PRIM_DELETED,
+ stcb->sctp_ep, stcb, NULL);
+ }
+out:
+ /* Try to find a confirmed primary */
+ asoc->primary_destination = sctp_find_alternate_net(stcb, lnet, 0);
+ }
+ if (net == asoc->last_data_chunk_from) {
+ /* Reset last_data_chunk_from */
+ asoc->last_data_chunk_from = TAILQ_FIRST(&asoc->nets);
+ }
+ if (net == asoc->last_control_chunk_from) {
+ /* Clear net */
+ asoc->last_control_chunk_from = NULL;
+ }
+ sctp_free_remote_addr(net);
+}
+
+/*
+ * remove a remote endpoint address from an association, it will fail if the
+ * address does not exist.
+ */
+int
+sctp_del_remote_addr(struct sctp_tcb *stcb, struct sockaddr *remaddr)
+{
+ /*
+ * Here we need to remove a remote address. This is quite simple: we
+ * first find it in the list of addresses for the association
+ * (stcb->asoc.nets) and then, if it is there, we remove that entry.
+ * Note we do not allow it to be removed if there are no other
+ * addresses.
+ */
+ struct sctp_association *asoc;
+ struct sctp_nets *net, *net_tmp;
+
+ asoc = &stcb->asoc;
+
+ /* locate the address */
+ for (net = TAILQ_FIRST(&asoc->nets); net != NULL; net = net_tmp) {
+ net_tmp = TAILQ_NEXT(net, sctp_next);
+ if (net->ro._l_addr.sa.sa_family != remaddr->sa_family) {
+ continue;
+ }
+ if (sctp_cmpaddr((struct sockaddr *)&net->ro._l_addr,
+ remaddr)) {
+ /* we found the guy */
+ if (asoc->numnets < 2) {
+ /* Must have at LEAST two remote addresses */
+ return (-1);
+ } else {
+ sctp_remove_net(stcb, net);
+ return (0);
+ }
+ }
+ }
+ /* not found. */
+ return (-2);
+}
+
+void
+sctp_delete_from_timewait(uint32_t tag, uint16_t lport, uint16_t rport)
+{
+ struct sctpvtaghead *chain;
+ struct sctp_tagblock *twait_block;
+ int found = 0;
+ int i;
+
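+ /* Find the matching (tag, lport, rport) entry in the time-wait hash chain and clear it. */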
+ chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
+ if (!LIST_EMPTY(chain)) {
+ LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
+ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
+ if ((twait_block->vtag_block[i].v_tag == tag) &&
+ (twait_block->vtag_block[i].lport == lport) &&
+ (twait_block->vtag_block[i].rport == rport)) {
+ twait_block->vtag_block[i].tv_sec_at_expire = 0;
+ twait_block->vtag_block[i].v_tag = 0;
+ twait_block->vtag_block[i].lport = 0;
+ twait_block->vtag_block[i].rport = 0;
+ found = 1;
+ break;
+ }
+ }
+ if (found)
+ break;
+ }
+ }
+}
+
+int
+sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport)
+{
+ struct sctpvtaghead *chain;
+ struct sctp_tagblock *twait_block;
+ int found = 0;
+ int i;
+
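+ /* Search the time-wait hash chain for a matching (tag, lport, rport) entry. */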
+ SCTP_INP_INFO_WLOCK();
+ chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
+ if (!LIST_EMPTY(chain)) {
+ LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
+ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
+ if ((twait_block->vtag_block[i].v_tag == tag) &&
+ (twait_block->vtag_block[i].lport == lport) &&
+ (twait_block->vtag_block[i].rport == rport)) {
+ found = 1;
+ break;
+ }
+ }
+ if (found)
+ break;
+ }
+ }
+ SCTP_INP_INFO_WUNLOCK();
+ return (found);
+}
+
+
+void
+sctp_add_vtag_to_timewait(uint32_t tag, uint32_t time, uint16_t lport, uint16_t rport)
+{
+ struct sctpvtaghead *chain;
+ struct sctp_tagblock *twait_block;
+ struct timeval now;
+ int set, i;
+
+ if (time == 0) {
+ /* It is disabled */
+ return;
+ }
+ (void)SCTP_GETTIME_TIMEVAL(&now);
+ chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
+ set = 0;
+ if (!LIST_EMPTY(chain)) {
+ /* Block(s) present; let's find space, expiring stale entries on the fly */
+ LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
+ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
+ if ((twait_block->vtag_block[i].v_tag == 0) &&
+ !set) {
+ twait_block->vtag_block[i].tv_sec_at_expire =
+ now.tv_sec + time;
+ twait_block->vtag_block[i].v_tag = tag;
+ twait_block->vtag_block[i].lport = lport;
+ twait_block->vtag_block[i].rport = rport;
+ set = 1;
+ } else if ((twait_block->vtag_block[i].v_tag) &&
+ ((long)twait_block->vtag_block[i].tv_sec_at_expire < now.tv_sec)) {
+ /* Audit expires this guy */
+ twait_block->vtag_block[i].tv_sec_at_expire = 0;
+ twait_block->vtag_block[i].v_tag = 0;
+ twait_block->vtag_block[i].lport = 0;
+ twait_block->vtag_block[i].rport = 0;
+ if (set == 0) {
+ /* Reuse it for my new tag */
+ twait_block->vtag_block[i].tv_sec_at_expire = now.tv_sec + time;
+ twait_block->vtag_block[i].v_tag = tag;
+ twait_block->vtag_block[i].lport = lport;
+ twait_block->vtag_block[i].rport = rport;
+ set = 1;
+ }
+ }
+ }
+ if (set) {
+ /*
+ * We only do up to the block where we can
+ * place our tag for audits
+ */
+ break;
+ }
+ }
+ }
+ /* Need to add a new block to chain */
+ if (!set) {
+ SCTP_MALLOC(twait_block, struct sctp_tagblock *,
+ sizeof(struct sctp_tagblock), SCTP_M_TIMW);
+ if (twait_block == NULL) {
+#ifdef INVARIANTS
+ panic("Can not alloc tagblock");
+#endif
+ return;
+ }
+ memset(twait_block, 0, sizeof(struct sctp_tagblock));
+ LIST_INSERT_HEAD(chain, twait_block, sctp_nxt_tagblock);
+ twait_block->vtag_block[0].tv_sec_at_expire = now.tv_sec + time;
+ twait_block->vtag_block[0].v_tag = tag;
+ twait_block->vtag_block[0].lport = lport;
+ twait_block->vtag_block[0].rport = rport;
+ }
+}
+
+
+
+/*-
+ * Free the association after un-hashing the remote port. This
+ * function ALWAYS returns holding NO LOCK on the stcb. It DOES
+ * expect that the input to this function IS a locked TCB.
+ * It will return 0 if it did NOT destroy the association (instead
+ * it unlocks it). It will return NON-zero if it either destroyed
+ * the association OR the association was already destroyed.
+ */
+int
+sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfree, int from_location)
+{
+ int i;
+ struct sctp_association *asoc;
+ struct sctp_nets *net, *prev;
+ struct sctp_laddr *laddr;
+ struct sctp_tmit_chunk *chk;
+ struct sctp_asconf_addr *aparam;
+ struct sctp_asconf_ack *aack;
+ struct sctp_stream_reset_list *liste;
+ struct sctp_queued_to_read *sq;
+ struct sctp_stream_queue_pending *sp;
+ sctp_sharedkey_t *shared_key;
+ struct socket *so;
+ int ccnt = 0;
+ int cnt = 0;
+
+ /* first, let's purge the entry from the hash table. */
+
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, stcb, 6);
+#endif
+ if (stcb->asoc.state == 0) {
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 7);
+#endif
+ /* there is no asoc, really TSNH :-0 */
+ return (1);
+ }
+ /* TEMP CODE */
+ if (stcb->freed_from_where == 0) {
+ /* Only record the first place free happened from */
+ stcb->freed_from_where = from_location;
+ }
+ /* TEMP CODE */
+
+ asoc = &stcb->asoc;
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE))
+ /* nothing around */
+ so = NULL;
+ else
+ so = inp->sctp_socket;
+
+ /*
+ * We use timer-based freeing if a reader or writer is in the way.
+ * So we first check if we are actually being called from the timer;
+ * if so, we abort early if a reader or writer is still in the way.
+ */
+ if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) &&
+ (from_inpcbfree == SCTP_NORMAL_PROC)) {
+ /*
+ * Is it the timer driving us? If so, are the readers/writers
+ * gone?
+ */
+ if (stcb->asoc.refcnt) {
+ /* nope, reader or writer in the way */
+ sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL);
+ /* no asoc destroyed */
+ SCTP_TCB_UNLOCK(stcb);
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, stcb, 8);
+#endif
+ return (0);
+ }
+ }
+ /* now clean up any other timers */
+ (void)SCTP_OS_TIMER_STOP(&asoc->hb_timer.timer);
+ asoc->hb_timer.self = NULL;
+ (void)SCTP_OS_TIMER_STOP(&asoc->dack_timer.timer);
+ asoc->dack_timer.self = NULL;
+ (void)SCTP_OS_TIMER_STOP(&asoc->strreset_timer.timer);
+ /*-
+ * For stream reset we don't blast this unless
+ * it is a str-reset timer; it might be the
+ * free-asoc timer, which we DON'T want to
+ * disturb.
+ */
+ if (asoc->strreset_timer.type == SCTP_TIMER_TYPE_STRRESET)
+ asoc->strreset_timer.self = NULL;
+ (void)SCTP_OS_TIMER_STOP(&asoc->asconf_timer.timer);
+ asoc->asconf_timer.self = NULL;
+ (void)SCTP_OS_TIMER_STOP(&asoc->autoclose_timer.timer);
+ asoc->autoclose_timer.self = NULL;
+ (void)SCTP_OS_TIMER_STOP(&asoc->shut_guard_timer.timer);
+ asoc->shut_guard_timer.self = NULL;
+ (void)SCTP_OS_TIMER_STOP(&asoc->delayed_event_timer.timer);
+ asoc->delayed_event_timer.self = NULL;
+ /* Mobility adaptation */
+ (void)SCTP_OS_TIMER_STOP(&asoc->delete_prim_timer.timer);
+ asoc->delete_prim_timer.self = NULL;
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ (void)SCTP_OS_TIMER_STOP(&net->fr_timer.timer);
+ net->fr_timer.self = NULL;
+ (void)SCTP_OS_TIMER_STOP(&net->rxt_timer.timer);
+ net->rxt_timer.self = NULL;
+ (void)SCTP_OS_TIMER_STOP(&net->pmtu_timer.timer);
+ net->pmtu_timer.self = NULL;
+ }
+ /* Now the read queue needs to be cleaned up (only once) */
+ cnt = 0;
+ if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0) {
+ stcb->asoc.state |= SCTP_STATE_ABOUT_TO_BE_FREED;
+ SCTP_INP_READ_LOCK(inp);
+ TAILQ_FOREACH(sq, &inp->read_queue, next) {
+ if (sq->stcb == stcb) {
+ sq->do_not_ref_stcb = 1;
+ sq->sinfo_cumtsn = stcb->asoc.cumulative_tsn;
+ /*
+ * If there is no end, there never will be
+ * now.
+ */
+ if (sq->end_added == 0) {
+ /* Held for PD-API; clear that. */
+ sq->pdapi_aborted = 1;
+ sq->held_length = 0;
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT) && (so != NULL)) {
+ /*
+ * Need to add a PD-API
+ * aborted indication.
+ * Setting the control_pdapi
+ * assures that it will be
+ * added right after this
+ * msg.
+ */
+ uint32_t strseq;
+
+ stcb->asoc.control_pdapi = sq;
+ strseq = (sq->sinfo_stream << 16) | sq->sinfo_ssn;
+ sctp_ulp_notify(SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION,
+ stcb,
+ SCTP_PARTIAL_DELIVERY_ABORTED,
+ (void *)&strseq,
+ SCTP_SO_LOCKED);
+ stcb->asoc.control_pdapi = NULL;
+ }
+ }
+ /* Add an end to wake them */
+ sq->end_added = 1;
+ cnt++;
+ }
+ }
+ SCTP_INP_READ_UNLOCK(inp);
+ if (stcb->block_entry) {
+ cnt++;
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_PCB, ECONNRESET);
+ stcb->block_entry->error = ECONNRESET;
+ stcb->block_entry = NULL;
+ }
+ }
+ if ((stcb->asoc.refcnt) || (stcb->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE)) {
+ /*
+ * Someone holds a reference OR the socket has not been
+ * accepted yet.
+ */
+ if ((stcb->asoc.refcnt) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
+ stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE;
+ sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL);
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE))
+ /* nothing around */
+ so = NULL;
+ if (so) {
+ /* Wake any reader/writers */
+ sctp_sorwakeup(inp, so);
+ sctp_sowwakeup(inp, so);
+ }
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, stcb, 9);
+#endif
+ /* no asoc destroyed */
+ return (0);
+ }
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, stcb, 10);
+#endif
+ /*
+ * When I reach here, no others want to kill the assoc yet and I
+ * own the lock. Now it is possible an abort comes in while I do the
+ * lock exchange below to grab all the locks for the final take-out.
+ * To prevent this we increment the refcount, which makes any such
+ * attempt start a timer and bail out above, thus assuring us that
+ * we hold exclusive killing of the asoc. Note that after getting
+ * back the TCB lock we will drop that count again and stop any
+ * timer a passing stranger may have started :-S
+ */
+ if (from_inpcbfree == SCTP_NORMAL_PROC) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_INP_INFO_WLOCK();
+ SCTP_INP_WLOCK(inp);
+ SCTP_TCB_LOCK(stcb);
+ }
+ /* Double check the GONE flag */
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE))
+ /* nothing around */
+ so = NULL;
+
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ /*
+ * For TCP type we need special handling when we are
+ * connected. We also include the peeled-off ones too.
+ */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_CONNECTED;
+ inp->sctp_flags |= SCTP_PCB_FLAGS_WAS_CONNECTED;
+ if (so) {
+ SOCK_LOCK(so);
+ if (so->so_rcv.sb_cc == 0) {
+ so->so_state &= ~(SS_ISCONNECTING |
+ SS_ISDISCONNECTING |
+ SS_ISCONFIRMING |
+ SS_ISCONNECTED);
+ }
+ socantrcvmore_locked(so);
+ sctp_sowwakeup(inp, so);
+ sctp_sorwakeup(inp, so);
+ SCTP_SOWAKEUP(so);
+ }
+ }
+ }
+ /*
+ * Make it invalid too; that way, if it is about to run, it will
+ * abort and return.
+ */
+ /* drop the extra reference taken above */
+ if (from_inpcbfree == SCTP_NORMAL_PROC) {
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ }
+ if (stcb->asoc.refcnt) {
+ stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE;
+ sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL);
+ if (from_inpcbfree == SCTP_NORMAL_PROC) {
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_INP_WUNLOCK(inp);
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ return (0);
+ }
+ asoc->state = 0;
+ if (inp->sctp_tcbhash) {
+ LIST_REMOVE(stcb, sctp_tcbhash);
+ }
+ if (stcb->asoc.in_asocid_hash) {
+ LIST_REMOVE(stcb, sctp_tcbasocidhash);
+ }
+ /* Now let's remove it from the list of ALL associations in the EP */
+ LIST_REMOVE(stcb, sctp_tcblist);
+ if (from_inpcbfree == SCTP_NORMAL_PROC) {
+ SCTP_INP_INCR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ }
+ /* pull from vtag hash */
+ LIST_REMOVE(stcb, sctp_asocs);
+ sctp_add_vtag_to_timewait(asoc->my_vtag, SCTP_BASE_SYSCTL(sctp_vtag_time_wait),
+ inp->sctp_lport, stcb->rport);
+
+ /*
+ * Now re-stop the timers to be sure; this is paranoia at its finest!
+ */
+ (void)SCTP_OS_TIMER_STOP(&asoc->strreset_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&asoc->hb_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&asoc->dack_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&asoc->strreset_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&asoc->asconf_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&asoc->shut_guard_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&asoc->autoclose_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&asoc->delayed_event_timer.timer);
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ (void)SCTP_OS_TIMER_STOP(&net->fr_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&net->rxt_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&net->pmtu_timer.timer);
+ }
+
+ asoc->strreset_timer.type = SCTP_TIMER_TYPE_NONE;
+ prev = NULL;
+ /*
+ * The chunk lists and such SHOULD be empty but we check them just
+ * in case.
+ */
+ /* anything on the wheel needs to be removed */
+ for (i = 0; i < asoc->streamoutcnt; i++) {
+ struct sctp_stream_out *outs;
+
+ outs = &asoc->strmout[i];
+ /* now clean up any chunks here */
+ sp = TAILQ_FIRST(&outs->outqueue);
+ while (sp) {
+ TAILQ_REMOVE(&outs->outqueue, sp, next);
+ if (sp->data) {
+ if (so) {
+ /* Still an open socket - report */
+ sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb,
+ SCTP_NOTIFY_DATAGRAM_UNSENT,
+ (void *)sp, SCTP_SO_LOCKED);
+ }
+ if (sp->data) {
+ sctp_m_freem(sp->data);
+ sp->data = NULL;
+ sp->tail_mbuf = NULL;
+ }
+ }
+ if (sp->net) {
+ sctp_free_remote_addr(sp->net);
+ sp->net = NULL;
+ }
+ sctp_free_spbufspace(stcb, asoc, sp);
+ if (sp->holds_key_ref)
+ sctp_auth_key_release(stcb, sp->auth_keyid);
+ /* Free the zone stuff */
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_strmoq), sp);
+ SCTP_DECR_STRMOQ_COUNT();
+ /* sa_ignore FREED_MEMORY */
+ sp = TAILQ_FIRST(&outs->outqueue);
+ }
+ }
+
+ /* sa_ignore FREED_MEMORY */
+ while ((liste = TAILQ_FIRST(&asoc->resetHead)) != NULL) {
+ TAILQ_REMOVE(&asoc->resetHead, liste, next_resp);
+ SCTP_FREE(liste, SCTP_M_STRESET);
+ }
+
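+ /* Release any queued-to-read entries still held on the pending reply queue. */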
+ sq = TAILQ_FIRST(&asoc->pending_reply_queue);
+ while (sq) {
+ TAILQ_REMOVE(&asoc->pending_reply_queue, sq, next);
+ if (sq->data) {
+ sctp_m_freem(sq->data);
+ sq->data = NULL;
+ }
+ sctp_free_remote_addr(sq->whoFrom);
+ sq->whoFrom = NULL;
+ sq->stcb = NULL;
+ /* Free the ctl entry */
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), sq);
+ SCTP_DECR_READQ_COUNT();
+ /* sa_ignore FREED_MEMORY */
+ sq = TAILQ_FIRST(&asoc->pending_reply_queue);
+ }
+
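+ /* Return any chunks cached on the association's free-chunk list to the zone. */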
+ chk = TAILQ_FIRST(&asoc->free_chunks);
+ while (chk) {
+ TAILQ_REMOVE(&asoc->free_chunks, chk, sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ if (chk->holds_key_ref)
+ sctp_auth_key_release(stcb, chk->auth_keyid);
+ ccnt++;
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
+ SCTP_DECR_CHK_COUNT();
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1);
+ asoc->free_chunk_cnt--;
+ /* sa_ignore FREED_MEMORY */
+ chk = TAILQ_FIRST(&asoc->free_chunks);
+ }
+ /* pending send queue SHOULD be empty */
+ if (!TAILQ_EMPTY(&asoc->send_queue)) {
+ chk = TAILQ_FIRST(&asoc->send_queue);
+ while (chk) {
+ TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next);
+ if (chk->data) {
+ if (so) {
+ /* Still a socket? */
+ sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb,
+ SCTP_NOTIFY_DATAGRAM_UNSENT, chk, SCTP_SO_LOCKED);
+ }
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ }
+ if (chk->holds_key_ref)
+ sctp_auth_key_release(stcb, chk->auth_keyid);
+ ccnt++;
+ if (chk->whoTo) {
+ sctp_free_remote_addr(chk->whoTo);
+ chk->whoTo = NULL;
+ }
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
+ SCTP_DECR_CHK_COUNT();
+ /* sa_ignore FREED_MEMORY */
+ chk = TAILQ_FIRST(&asoc->send_queue);
+ }
+ }
+/*
+ if (ccnt) {
+ printf("Freed %d from send_queue\n", ccnt);
+ ccnt = 0;
+ }
+*/
+ /* sent queue SHOULD be empty */
+ if (!TAILQ_EMPTY(&asoc->sent_queue)) {
+ chk = TAILQ_FIRST(&asoc->sent_queue);
+ while (chk) {
+ TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next);
+ if (chk->data) {
+ if (so) {
+ /* Still a socket? */
+ sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb,
+ SCTP_NOTIFY_DATAGRAM_SENT, chk, SCTP_SO_LOCKED);
+ }
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ }
+ if (chk->holds_key_ref)
+ sctp_auth_key_release(stcb, chk->auth_keyid);
+ ccnt++;
+ sctp_free_remote_addr(chk->whoTo);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
+ SCTP_DECR_CHK_COUNT();
+ /* sa_ignore FREED_MEMORY */
+ chk = TAILQ_FIRST(&asoc->sent_queue);
+ }
+ }
+/*
+ if (ccnt) {
+ printf("Freed %d from sent_queue\n", ccnt);
+ ccnt = 0;
+ }
+*/
+ /* control queue MAY not be empty */
+ if (!TAILQ_EMPTY(&asoc->control_send_queue)) {
+ chk = TAILQ_FIRST(&asoc->control_send_queue);
+ while (chk) {
+ TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ if (chk->holds_key_ref)
+ sctp_auth_key_release(stcb, chk->auth_keyid);
+ ccnt++;
+ sctp_free_remote_addr(chk->whoTo);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
+ SCTP_DECR_CHK_COUNT();
+ /* sa_ignore FREED_MEMORY */
+ chk = TAILQ_FIRST(&asoc->control_send_queue);
+ }
+ }
+/*
+ if (ccnt) {
+ printf("Freed %d from ctrl_queue\n", ccnt);
+ ccnt = 0;
+ }
+*/
+
+ /* ASCONF queue MAY not be empty */
+ if (!TAILQ_EMPTY(&asoc->asconf_send_queue)) {
+ chk = TAILQ_FIRST(&asoc->asconf_send_queue);
+ while (chk) {
+ TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ if (chk->holds_key_ref)
+ sctp_auth_key_release(stcb, chk->auth_keyid);
+ ccnt++;
+ sctp_free_remote_addr(chk->whoTo);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
+ SCTP_DECR_CHK_COUNT();
+ /* sa_ignore FREED_MEMORY */
+ chk = TAILQ_FIRST(&asoc->asconf_send_queue);
+ }
+ }
+/*
+ if (ccnt) {
+ printf("Freed %d from asconf_queue\n", ccnt);
+ ccnt = 0;
+ }
+*/
+ if (!TAILQ_EMPTY(&asoc->reasmqueue)) {
+ chk = TAILQ_FIRST(&asoc->reasmqueue);
+ while (chk) {
+ TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ if (chk->holds_key_ref)
+ sctp_auth_key_release(stcb, chk->auth_keyid);
+ sctp_free_remote_addr(chk->whoTo);
+ ccnt++;
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
+ SCTP_DECR_CHK_COUNT();
+ /* sa_ignore FREED_MEMORY */
+ chk = TAILQ_FIRST(&asoc->reasmqueue);
+ }
+ }
+/*
+ if (ccnt) {
+ printf("Freed %d from reasm_queue\n", ccnt);
+ ccnt = 0;
+ }
+*/
+ if (asoc->mapping_array) {
+ SCTP_FREE(asoc->mapping_array, SCTP_M_MAP);
+ asoc->mapping_array = NULL;
+ }
+ if (asoc->nr_mapping_array) {
+ SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP);
+ asoc->nr_mapping_array = NULL;
+ }
+ /* the stream outs */
+ if (asoc->strmout) {
+ SCTP_FREE(asoc->strmout, SCTP_M_STRMO);
+ asoc->strmout = NULL;
+ }
+ asoc->strm_realoutsize = asoc->streamoutcnt = 0;
+ if (asoc->strmin) {
+ struct sctp_queued_to_read *ctl;
+
+ for (i = 0; i < asoc->streamincnt; i++) {
+ if (!TAILQ_EMPTY(&asoc->strmin[i].inqueue)) {
+ /* We have something on the stream-in queue */
+ ctl = TAILQ_FIRST(&asoc->strmin[i].inqueue);
+ while (ctl) {
+ TAILQ_REMOVE(&asoc->strmin[i].inqueue,
+ ctl, next);
+ sctp_free_remote_addr(ctl->whoFrom);
+ if (ctl->data) {
+ sctp_m_freem(ctl->data);
+ ctl->data = NULL;
+ }
+ /*
+ * We don't free the address here
+ * since all the net's were freed
+ * above.
+ */
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), ctl);
+ SCTP_DECR_READQ_COUNT();
+ ctl = TAILQ_FIRST(&asoc->strmin[i].inqueue);
+ }
+ }
+ }
+ SCTP_FREE(asoc->strmin, SCTP_M_STRMI);
+ asoc->strmin = NULL;
+ }
+ asoc->streamincnt = 0;
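+ /* Finally, release every remote address (net) still attached to the association. */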
+ while (!TAILQ_EMPTY(&asoc->nets)) {
+ /* sa_ignore FREED_MEMORY */
+ net = TAILQ_FIRST(&asoc->nets);
+ /* pull from list */
+ if ((SCTP_BASE_INFO(ipi_count_raddr) == 0) || (prev == net)) {
+#ifdef INVARIANTS
+ panic("no net's left alloc'ed, or list points to itself");
+#endif
+ break;
+ }
+ prev = net;
+ TAILQ_REMOVE(&asoc->nets, net, sctp_next);
+ sctp_free_remote_addr(net);
+ }
+
+ while (!LIST_EMPTY(&asoc->sctp_restricted_addrs)) {
+ /* sa_ignore FREED_MEMORY */
+ laddr = LIST_FIRST(&asoc->sctp_restricted_addrs);
+ sctp_remove_laddr(laddr);
+ }
+
+ /* pending asconf (address) parameters */
+ while (!TAILQ_EMPTY(&asoc->asconf_queue)) {
+ /* sa_ignore FREED_MEMORY */
+ aparam = TAILQ_FIRST(&asoc->asconf_queue);
+ TAILQ_REMOVE(&asoc->asconf_queue, aparam, next);
+ SCTP_FREE(aparam, SCTP_M_ASC_ADDR);
+ }
+ while (!TAILQ_EMPTY(&asoc->asconf_ack_sent)) {
+ /* sa_ignore FREED_MEMORY */
+ aack = TAILQ_FIRST(&asoc->asconf_ack_sent);
+ TAILQ_REMOVE(&asoc->asconf_ack_sent, aack, next);
+ if (aack->data != NULL) {
+ sctp_m_freem(aack->data);
+ }
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asconf_ack), aack);
+ }
+ /* clean up auth stuff */
+ if (asoc->local_hmacs)
+ sctp_free_hmaclist(asoc->local_hmacs);
+ if (asoc->peer_hmacs)
+ sctp_free_hmaclist(asoc->peer_hmacs);
+
+ if (asoc->local_auth_chunks)
+ sctp_free_chunklist(asoc->local_auth_chunks);
+ if (asoc->peer_auth_chunks)
+ sctp_free_chunklist(asoc->peer_auth_chunks);
+
+ sctp_free_authinfo(&asoc->authinfo);
+
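+ /* Release all shared keys installed on the association. */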
+ shared_key = LIST_FIRST(&asoc->shared_keys);
+ while (shared_key) {
+ LIST_REMOVE(shared_key, next);
+ sctp_free_sharedkey(shared_key);
+ /* sa_ignore FREED_MEMORY */
+ shared_key = LIST_FIRST(&asoc->shared_keys);
+ }
+
+ /* Insert new items here :> */
+
+ /* Get rid of LOCK */
+ SCTP_TCB_LOCK_DESTROY(stcb);
+ SCTP_TCB_SEND_LOCK_DESTROY(stcb);
+ if (from_inpcbfree == SCTP_NORMAL_PROC) {
+ SCTP_INP_INFO_WUNLOCK();
+ SCTP_INP_RLOCK(inp);
+ }
+#ifdef SCTP_TRACK_FREED_ASOCS
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ /* now clean up the asoc itself */
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb);
+ SCTP_DECR_ASOC_COUNT();
+ } else {
+ LIST_INSERT_HEAD(&inp->sctp_asoc_free_list, stcb, sctp_tcblist);
+ }
+#else
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb);
+ SCTP_DECR_ASOC_COUNT();
+#endif
+ if (from_inpcbfree == SCTP_NORMAL_PROC) {
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ /*
+ * If it is NOT the inp_free calling us AND sctp_close
+ * has been called, we call back...
+ */
+ SCTP_INP_RUNLOCK(inp);
+ /*
+ * This will start the kill timer (if we are the
+ * last one) since we still hold an increment. But
+ * this is the only safe way to do this since
+ * otherwise if the socket closes at the same time
+ * we are here we might collide in the cleanup.
+ */
+ sctp_inpcb_free(inp,
+ SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE,
+ SCTP_CALLED_DIRECTLY_NOCMPSET);
+ SCTP_INP_DECR_REF(inp);
+ goto out_of;
+ } else {
+ /* The socket is still open. */
+ SCTP_INP_DECR_REF(inp);
+ }
+ }
+ if (from_inpcbfree == SCTP_NORMAL_PROC) {
+ SCTP_INP_RUNLOCK(inp);
+ }
+out_of:
+ /* destroyed the asoc */
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 11);
+#endif
+ return (1);
+}
+
+
+
+/*
+ * determine if a destination is "reachable" based upon the addresses bound
+ * to the current endpoint (e.g. only v4 or v6 currently bound)
+ */
+/*
+ * FIX: if we allow assoc-level bindx(), then this needs to be fixed to use
+ * assoc level v4/v6 flags, as the assoc *may* not have the same address
+ * types bound as its endpoint
+ */
+int
+sctp_destination_is_reachable(struct sctp_tcb *stcb, struct sockaddr *destaddr)
+{
+ struct sctp_inpcb *inp;
+ int answer;
+
+ /*
+	 * No locks here; in all cases the TCB is already locked and an
+	 * assoc is up. Either an INP lock is applied by the caller (in the
+	 * asconf case when deleting an address) or NOT (in the HB case);
+	 * however, in the HB case the INP refcount is up and the INP will
+	 * not be removed (on top of the fact that we have a TCB lock). We
+	 * only want to read sctp_flags, which is either bound-all or not;
+	 * no protection is needed since once an assoc is up you can't
+	 * change your binding.
+ */
+ inp = stcb->sctp_ep;
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ /* if bound all, destination is not restricted */
+ /*
+ * RRS: Question during lock work: Is this correct? If you
+ * are bound-all you still might need to obey the V4--V6
+ * flags??? IMO this bound-all stuff needs to be removed!
+ */
+ return (1);
+ }
+ /* NOTE: all "scope" checks are done when local addresses are added */
+ if (destaddr->sa_family == AF_INET6) {
+ answer = inp->ip_inp.inp.inp_vflag & INP_IPV6;
+ } else if (destaddr->sa_family == AF_INET) {
+ answer = inp->ip_inp.inp.inp_vflag & INP_IPV4;
+ } else {
+ /* invalid family, so it's unreachable */
+ answer = 0;
+ }
+ return (answer);
+}
+
+/*
+ * update the inp_vflags on an endpoint
+ */
+static void
+sctp_update_ep_vflag(struct sctp_inpcb *inp)
+{
+ struct sctp_laddr *laddr;
+
+ /* first clear the flag */
+ inp->ip_inp.inp.inp_vflag = 0;
+ /* set the flag based on addresses on the ep list */
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ if (laddr->ifa == NULL) {
+ SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n",
+ __FUNCTION__);
+ continue;
+ }
+ if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) {
+ continue;
+ }
+ if (laddr->ifa->address.sa.sa_family == AF_INET6) {
+ inp->ip_inp.inp.inp_vflag |= INP_IPV6;
+ } else if (laddr->ifa->address.sa.sa_family == AF_INET) {
+ inp->ip_inp.inp.inp_vflag |= INP_IPV4;
+ }
+ }
+}
+
+/*
+ * Add the address to the endpoint local address list. There is nothing to be
+ * done if we are bound to all addresses.
+ */
+void
+sctp_add_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa, uint32_t action)
+{
+ struct sctp_laddr *laddr;
+ int fnd, error = 0;
+
+ fnd = 0;
+
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ /* You are already bound to all. You have it already */
+ return;
+ }
+ if (ifa->address.sa.sa_family == AF_INET6) {
+ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
+ /* Can't bind a non-useable addr. */
+ return;
+ }
+ }
+ /* first, is it already present? */
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ if (laddr->ifa == ifa) {
+ fnd = 1;
+ break;
+ }
+ }
+
+ if (fnd == 0) {
+ /* Not in the ep list */
+ error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, action);
+ if (error != 0)
+ return;
+ inp->laddr_count++;
+ /* update inp_vflag flags */
+ if (ifa->address.sa.sa_family == AF_INET6) {
+ inp->ip_inp.inp.inp_vflag |= INP_IPV6;
+ } else if (ifa->address.sa.sa_family == AF_INET) {
+ inp->ip_inp.inp.inp_vflag |= INP_IPV4;
+ }
+ }
+ return;
+}
+
+
+/*
+ * select a new (hopefully reachable) destination net (should only be used
+ * when we deleted an ep addr that is the only usable source address to reach
+ * the destination net)
+ */
+static void
+sctp_select_primary_destination(struct sctp_tcb *stcb)
+{
+ struct sctp_nets *net;
+
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ /* for now, we'll just pick the first reachable one we find */
+ if (net->dest_state & SCTP_ADDR_UNCONFIRMED)
+ continue;
+ if (sctp_destination_is_reachable(stcb,
+ (struct sockaddr *)&net->ro._l_addr)) {
+ /* found a reachable destination */
+ stcb->asoc.primary_destination = net;
+ }
+ }
+	/* I can't get there from here! ...we're gonna die shortly... */
+}
+
+
+/*
+ * Delete the address from the endpoint local address list. There is nothing
+ * to be done if we are bound to all addresses.
+ */
+void
+sctp_del_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa)
+{
+ struct sctp_laddr *laddr;
+ int fnd;
+
+ fnd = 0;
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ /* You are already bound to all. You have it already */
+ return;
+ }
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ if (laddr->ifa == ifa) {
+ fnd = 1;
+ break;
+ }
+ }
+ if (fnd && (inp->laddr_count < 2)) {
+ /* can't delete unless there are at LEAST 2 addresses */
+ return;
+ }
+ if (fnd) {
+ /*
+		 * Clean up any use of this address: go through our
+		 * associations and clear any last_used_address that matches
+		 * this one; for each assoc, see if a new primary_destination
+		 * is needed.
+ */
+ struct sctp_tcb *stcb;
+
+ /* clean up "next_addr_touse" */
+ if (inp->next_addr_touse == laddr)
+ /* delete this address */
+ inp->next_addr_touse = NULL;
+
+ /* clean up "last_used_address" */
+ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
+ struct sctp_nets *net;
+
+ SCTP_TCB_LOCK(stcb);
+ if (stcb->asoc.last_used_address == laddr)
+ /* delete this address */
+ stcb->asoc.last_used_address = NULL;
+ /*
+ * Now spin through all the nets and purge any ref
+ * to laddr
+ */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if (net->ro._s_addr &&
+ (net->ro._s_addr->ifa == laddr->ifa)) {
+ /* Yep, purge src address selected */
+ sctp_rtentry_t *rt;
+
+ /* delete this address if cached */
+ rt = net->ro.ro_rt;
+ if (rt != NULL) {
+ RTFREE(rt);
+ net->ro.ro_rt = NULL;
+ }
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ net->src_addr_selected = 0;
+ }
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } /* for each tcb */
+ /* remove it from the ep list */
+ sctp_remove_laddr(laddr);
+ inp->laddr_count--;
+ /* update inp_vflag flags */
+ sctp_update_ep_vflag(inp);
+ }
+ return;
+}
+
+/*
+ * Add the address to the TCB local address restricted list.
+ * This is a "pending" address list (eg. addresses waiting for an
+ * ASCONF-ACK response) and cannot be used as a valid source address.
+ */
+void
+sctp_add_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa)
+{
+ struct sctp_inpcb *inp;
+ struct sctp_laddr *laddr;
+ struct sctpladdr *list;
+
+ /*
+	 * Assumes the TCB is locked (and possibly the INP). May need to
+	 * confirm/fix that if we need it and that is not the case.
+ */
+ list = &stcb->asoc.sctp_restricted_addrs;
+
+ inp = stcb->sctp_ep;
+ if (ifa->address.sa.sa_family == AF_INET6) {
+ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
+ /* Can't bind a non-existent addr. */
+ return;
+ }
+ }
+ /* does the address already exist? */
+ LIST_FOREACH(laddr, list, sctp_nxt_addr) {
+ if (laddr->ifa == ifa) {
+ return;
+ }
+ }
+
+ /* add to the list */
+ (void)sctp_insert_laddr(list, ifa, 0);
+ return;
+}
+
+/*
+ * insert an laddr entry with the given ifa for the desired list
+ */
+int
+sctp_insert_laddr(struct sctpladdr *list, struct sctp_ifa *ifa, uint32_t act)
+{
+ struct sctp_laddr *laddr;
+
+ laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
+ if (laddr == NULL) {
+ /* out of memory? */
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ return (EINVAL);
+ }
+ SCTP_INCR_LADDR_COUNT();
+ bzero(laddr, sizeof(*laddr));
+ (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time);
+ laddr->ifa = ifa;
+ laddr->action = act;
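+	/* hold a reference on the ifa for as long as it sits on this list */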
+ atomic_add_int(&ifa->refcount, 1);
+ /* insert it */
+ LIST_INSERT_HEAD(list, laddr, sctp_nxt_addr);
+
+ return (0);
+}
+
+/*
+ * Remove an laddr entry from the local address list (on an assoc)
+ */
+void
+sctp_remove_laddr(struct sctp_laddr *laddr)
+{
+
+ /* remove from the list */
+ LIST_REMOVE(laddr, sctp_nxt_addr);
+ sctp_free_ifa(laddr->ifa);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), laddr);
+ SCTP_DECR_LADDR_COUNT();
+}
+
+/*
+ * Remove a local address from the TCB local address restricted list
+ */
+void
+sctp_del_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa)
+{
+ struct sctp_inpcb *inp;
+ struct sctp_laddr *laddr;
+
+ /*
+	 * This is called by the asconf work. It is assumed that a) the TCB
+	 * is locked and b) the INP is locked. This is true insofar as I can
+	 * trace through the asconf entry code where these locks are taken.
+	 * Again, the ASCONF code is a bit different in that it often does
+	 * lock the INP during its work. This must be so, since we don't
+	 * want other processes looking up things while what they are
+	 * looking up is changing :-D
+ */
+
+ inp = stcb->sctp_ep;
+ /* if subset bound and don't allow ASCONF's, can't delete last */
+ if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) &&
+ sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF)) {
+ if (stcb->sctp_ep->laddr_count < 2) {
+ /* can't delete last address */
+ return;
+ }
+ }
+ LIST_FOREACH(laddr, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) {
+ /* remove the address if it exists */
+ if (laddr->ifa == NULL)
+ continue;
+ if (laddr->ifa == ifa) {
+ sctp_remove_laddr(laddr);
+ return;
+ }
+ }
+
+ /* address not found! */
+ return;
+}
+
+/*
+ * Temporarily remove for __APPLE__ until we use the Tiger equivalents
+ */
+/* sysctl */
+static int sctp_max_number_of_assoc = SCTP_MAX_NUM_OF_ASOC;
+static int sctp_scale_up_for_address = SCTP_SCALE_FOR_ADDR;
+
+void
+sctp_pcb_init()
+{
+ /*
+	 * SCTP initialization for the PCB structures; should be called by
+	 * the sctp_init() function.
+ */
+ int i;
+ struct timeval tv;
+
+ if (SCTP_BASE_VAR(sctp_pcb_initialized) != 0) {
+ /* error I was called twice */
+ return;
+ }
+ SCTP_BASE_VAR(sctp_pcb_initialized) = 1;
+
+#if defined(SCTP_LOCAL_TRACE_BUF)
+ bzero(&SCTP_BASE_SYSCTL(sctp_log), sizeof(struct sctp_log));
+#endif
+ (void)SCTP_GETTIME_TIMEVAL(&tv);
+#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
+ SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_sec = (uint32_t) tv.tv_sec;
+ SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_usec = (uint32_t) tv.tv_usec;
+#else
+ SCTP_BASE_STAT(sctps_discontinuitytime).tv_sec = (uint32_t) tv.tv_sec;
+ SCTP_BASE_STAT(sctps_discontinuitytime).tv_usec = (uint32_t) tv.tv_usec;
+#endif
+ /* init the empty list of (All) Endpoints */
+ LIST_INIT(&SCTP_BASE_INFO(listhead));
+
+
+ /* init the hash table of endpoints */
+ TUNABLE_INT_FETCH("net.inet.sctp.tcbhashsize", &SCTP_BASE_SYSCTL(sctp_hashtblsize));
+ TUNABLE_INT_FETCH("net.inet.sctp.pcbhashsize", &SCTP_BASE_SYSCTL(sctp_pcbtblsize));
+ TUNABLE_INT_FETCH("net.inet.sctp.chunkscale", &SCTP_BASE_SYSCTL(sctp_chunkscale));
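+	/*
+	 * Three lookup tables: associations hashed by vtag, endpoints hashed
+	 * by local port, and a separate hash for TCP-model (connected)
+	 * sockets.
+	 */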
+ SCTP_BASE_INFO(sctp_asochash) = SCTP_HASH_INIT((SCTP_BASE_SYSCTL(sctp_hashtblsize) * 31),
+ &SCTP_BASE_INFO(hashasocmark));
+ SCTP_BASE_INFO(sctp_ephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize),
+ &SCTP_BASE_INFO(hashmark));
+ SCTP_BASE_INFO(sctp_tcpephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize),
+ &SCTP_BASE_INFO(hashtcpmark));
+ SCTP_BASE_INFO(hashtblsize) = SCTP_BASE_SYSCTL(sctp_hashtblsize);
+
+
+ SCTP_BASE_INFO(sctp_vrfhash) = SCTP_HASH_INIT(SCTP_SIZE_OF_VRF_HASH,
+ &SCTP_BASE_INFO(hashvrfmark));
+
+ SCTP_BASE_INFO(vrf_ifn_hash) = SCTP_HASH_INIT(SCTP_VRF_IFN_HASH_SIZE,
+ &SCTP_BASE_INFO(vrf_ifn_hashmark));
+ /* init the zones */
+ /*
+ * FIX ME: Should check for NULL returns, but if it does fail we are
+ * doomed to panic anyways... add later maybe.
+ */
+ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_ep), "sctp_ep",
+ sizeof(struct sctp_inpcb), maxsockets);
+
+ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asoc), "sctp_asoc",
+ sizeof(struct sctp_tcb), sctp_max_number_of_assoc);
+
+ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_laddr), "sctp_laddr",
+ sizeof(struct sctp_laddr),
+ (sctp_max_number_of_assoc * sctp_scale_up_for_address));
+
+ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_net), "sctp_raddr",
+ sizeof(struct sctp_nets),
+ (sctp_max_number_of_assoc * sctp_scale_up_for_address));
+
+ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_chunk), "sctp_chunk",
+ sizeof(struct sctp_tmit_chunk),
+ (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale)));
+
+ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_readq), "sctp_readq",
+ sizeof(struct sctp_queued_to_read),
+ (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale)));
+
+ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_strmoq), "sctp_stream_msg_out",
+ sizeof(struct sctp_stream_queue_pending),
+ (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale)));
+
+ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf), "sctp_asconf",
+ sizeof(struct sctp_asconf),
+ (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale)));
+
+ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf_ack), "sctp_asconf_ack",
+ sizeof(struct sctp_asconf_ack),
+ (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale)));
+
+
+ /* Master Lock INIT for info structure */
+ SCTP_INP_INFO_LOCK_INIT();
+ SCTP_STATLOG_INIT_LOCK();
+
+ SCTP_IPI_COUNT_INIT();
+ SCTP_IPI_ADDR_INIT();
+#ifdef SCTP_PACKET_LOGGING
+ SCTP_IP_PKTLOG_INIT();
+#endif
+ LIST_INIT(&SCTP_BASE_INFO(addr_wq));
+
+ SCTP_WQ_ADDR_INIT();
+ /* not sure if we need all the counts */
+ SCTP_BASE_INFO(ipi_count_ep) = 0;
+ /* assoc/tcb zone info */
+ SCTP_BASE_INFO(ipi_count_asoc) = 0;
+ /* local addrlist zone info */
+ SCTP_BASE_INFO(ipi_count_laddr) = 0;
+ /* remote addrlist zone info */
+ SCTP_BASE_INFO(ipi_count_raddr) = 0;
+ /* chunk info */
+ SCTP_BASE_INFO(ipi_count_chunk) = 0;
+
+ /* socket queue zone info */
+ SCTP_BASE_INFO(ipi_count_readq) = 0;
+
+ /* stream out queue cont */
+ SCTP_BASE_INFO(ipi_count_strmoq) = 0;
+
+ SCTP_BASE_INFO(ipi_free_strmoq) = 0;
+ SCTP_BASE_INFO(ipi_free_chunks) = 0;
+
+ SCTP_OS_TIMER_INIT(&SCTP_BASE_INFO(addr_wq_timer.timer));
+
+ /* Init the TIMEWAIT list */
+ for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) {
+ LIST_INIT(&SCTP_BASE_INFO(vtag_timewait)[i]);
+ }
+
+ sctp_startup_iterator();
+
+ /*
+ * INIT the default VRF which for BSD is the only one, other O/S's
+ * may have more. But initially they must start with one and then
+ * add the VRF's as addresses are added.
+ */
+ sctp_init_vrf_list(SCTP_DEFAULT_VRF);
+
+}
+
+/*
+ * Assumes that the SCTP_BASE_INFO() lock is NOT held.
+ */
+void
+sctp_pcb_finish(void)
+{
+ struct sctp_vrflist *vrf_bucket;
+ struct sctp_vrf *vrf;
+ struct sctp_ifn *ifn;
+ struct sctp_ifa *ifa;
+ struct sctpvtaghead *chain;
+ struct sctp_tagblock *twait_block, *prev_twait_block;
+ struct sctp_laddr *wi;
+ int i;
+
+ /*
+	 * On FreeBSD the iterator thread never exits, but we do clean up
+	 * here. The only way FreeBSD reaches here is if we have VRF's, but
+	 * we still add the ifdef to make it compile on old versions.
+ */
+ {
+ struct sctp_iterator *it, *nit;
+
+ SCTP_IPI_ITERATOR_WQ_LOCK();
+ it = TAILQ_FIRST(&sctp_it_ctl.iteratorhead);
+ while (it) {
+ nit = TAILQ_NEXT(it, sctp_nxt_itr);
+ if (it->vn != curvnet) {
+ it = nit;
+ continue;
+ }
+ TAILQ_REMOVE(&sctp_it_ctl.iteratorhead,
+ it, sctp_nxt_itr);
+ if (it->function_atend != NULL) {
+ (*it->function_atend) (it->pointer, it->val);
+ }
+ SCTP_FREE(it, SCTP_M_ITER);
+ it = nit;
+ }
+ SCTP_IPI_ITERATOR_WQ_UNLOCK();
+ SCTP_ITERATOR_LOCK();
+ if ((sctp_it_ctl.cur_it) &&
+ (sctp_it_ctl.cur_it->vn == curvnet)) {
+ sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT;
+ }
+ SCTP_ITERATOR_UNLOCK();
+ }
+
+ SCTP_OS_TIMER_STOP(&SCTP_BASE_INFO(addr_wq_timer.timer));
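+	/* drain any pending entries off the address work queue */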
+ SCTP_WQ_ADDR_LOCK();
+ while ((wi = LIST_FIRST(&SCTP_BASE_INFO(addr_wq))) != NULL) {
+ LIST_REMOVE(wi, sctp_nxt_addr);
+ SCTP_DECR_LADDR_COUNT();
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), wi);
+ }
+ SCTP_WQ_ADDR_UNLOCK();
+
+ /*
+ * free the vrf/ifn/ifa lists and hashes (be sure address monitor is
+ * destroyed first).
+ */
+ vrf_bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(SCTP_DEFAULT_VRFID & SCTP_BASE_INFO(hashvrfmark))];
+ while ((vrf = LIST_FIRST(vrf_bucket)) != NULL) {
+ while ((ifn = LIST_FIRST(&vrf->ifnlist)) != NULL) {
+ while ((ifa = LIST_FIRST(&ifn->ifalist)) != NULL) {
+ /* free the ifa */
+ LIST_REMOVE(ifa, next_bucket);
+ LIST_REMOVE(ifa, next_ifa);
+ SCTP_FREE(ifa, SCTP_M_IFA);
+ }
+ /* free the ifn */
+ LIST_REMOVE(ifn, next_bucket);
+ LIST_REMOVE(ifn, next_ifn);
+ SCTP_FREE(ifn, SCTP_M_IFN);
+ }
+ SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark);
+ /* free the vrf */
+ LIST_REMOVE(vrf, next_vrf);
+ SCTP_FREE(vrf, SCTP_M_VRF);
+ }
+ /* free the vrf hashes */
+ SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_vrfhash), SCTP_BASE_INFO(hashvrfmark));
+ SCTP_HASH_FREE(SCTP_BASE_INFO(vrf_ifn_hash), SCTP_BASE_INFO(vrf_ifn_hashmark));
+
+ /*
+ * free the TIMEWAIT list elements malloc'd in the function
+ * sctp_add_vtag_to_timewait()...
+ */
+ for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) {
+ chain = &SCTP_BASE_INFO(vtag_timewait)[i];
+ if (!LIST_EMPTY(chain)) {
+ prev_twait_block = NULL;
+ LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
+ if (prev_twait_block) {
+ SCTP_FREE(prev_twait_block, SCTP_M_TIMW);
+ }
+ prev_twait_block = twait_block;
+ }
+ SCTP_FREE(prev_twait_block, SCTP_M_TIMW);
+ }
+ }
+
+ /* free the locks and mutexes */
+#ifdef SCTP_PACKET_LOGGING
+ SCTP_IP_PKTLOG_DESTROY();
+#endif
+ SCTP_IPI_ADDR_DESTROY();
+ SCTP_STATLOG_DESTROY();
+ SCTP_INP_INFO_LOCK_DESTROY();
+
+ SCTP_WQ_ADDR_DESTROY();
+
+ SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_ep));
+ SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asoc));
+ SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_laddr));
+ SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_net));
+ SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_chunk));
+ SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_readq));
+ SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_strmoq));
+ SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf));
+ SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf_ack));
+	/* Get rid of other stuff too */
+ if (SCTP_BASE_INFO(sctp_asochash) != NULL)
+ SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_asochash), SCTP_BASE_INFO(hashasocmark));
+ if (SCTP_BASE_INFO(sctp_ephash) != NULL)
+ SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_ephash), SCTP_BASE_INFO(hashmark));
+ if (SCTP_BASE_INFO(sctp_tcpephash) != NULL)
+ SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_tcpephash), SCTP_BASE_INFO(hashtcpmark));
+
+}
+
+
+int
+sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
+ int iphlen, int offset, int limit, struct sctphdr *sh,
+ struct sockaddr *altsa)
+{
+ /*
+ * grub through the INIT pulling addresses and loading them to the
+ * nets structure in the asoc. The from address in the mbuf should
+ * also be loaded (if it is not already). This routine can be called
+ * with either INIT or INIT-ACK's as long as the m points to the IP
+ * packet and the offset points to the beginning of the parameters.
+ */
+ struct sctp_inpcb *inp, *l_inp;
+ struct sctp_nets *net, *net_tmp;
+ struct ip *iph;
+ struct sctp_paramhdr *phdr, parm_buf;
+ struct sctp_tcb *stcb_tmp;
+ uint16_t ptype, plen;
+ struct sockaddr *sa;
+ struct sockaddr_storage dest_store;
+ struct sockaddr *local_sa = (struct sockaddr *)&dest_store;
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ uint8_t random_store[SCTP_PARAM_BUFFER_SIZE];
+ struct sctp_auth_random *p_random = NULL;
+ uint16_t random_len = 0;
+ uint8_t hmacs_store[SCTP_PARAM_BUFFER_SIZE];
+ struct sctp_auth_hmac_algo *hmacs = NULL;
+ uint16_t hmacs_len = 0;
+ uint8_t saw_asconf = 0;
+ uint8_t saw_asconf_ack = 0;
+ uint8_t chunks_store[SCTP_PARAM_BUFFER_SIZE];
+ struct sctp_auth_chunk_list *chunks = NULL;
+ uint16_t num_chunks = 0;
+ sctp_key_t *new_key;
+ uint32_t keylen;
+ int got_random = 0, got_hmacs = 0, got_chklist = 0;
+
+ /* First get the destination address setup too. */
+ memset(&sin, 0, sizeof(sin));
+ memset(&sin6, 0, sizeof(sin6));
+
+ sin.sin_family = AF_INET;
+ sin.sin_len = sizeof(sin);
+ sin.sin_port = stcb->rport;
+
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_len = sizeof(struct sockaddr_in6);
+ sin6.sin6_port = stcb->rport;
+ if (altsa == NULL) {
+ iph = mtod(m, struct ip *);
+ switch (iph->ip_v) {
+ case IPVERSION:
+ {
+ /* its IPv4 */
+ struct sockaddr_in *sin_2;
+
+ sin_2 = (struct sockaddr_in *)(local_sa);
+ memset(sin_2, 0, sizeof(sin));
+ sin_2->sin_family = AF_INET;
+ sin_2->sin_len = sizeof(sin);
+ sin_2->sin_port = sh->dest_port;
+ sin_2->sin_addr.s_addr = iph->ip_dst.s_addr;
+ sin.sin_addr = iph->ip_src;
+ sa = (struct sockaddr *)&sin;
+ break;
+ }
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ {
+ /* its IPv6 */
+ struct ip6_hdr *ip6;
+ struct sockaddr_in6 *sin6_2;
+
+ ip6 = mtod(m, struct ip6_hdr *);
+ sin6_2 = (struct sockaddr_in6 *)(local_sa);
+ memset(sin6_2, 0, sizeof(sin6));
+ sin6_2->sin6_family = AF_INET6;
+ sin6_2->sin6_len = sizeof(struct sockaddr_in6);
+ sin6_2->sin6_port = sh->dest_port;
+ sin6.sin6_addr = ip6->ip6_src;
+ sa = (struct sockaddr *)&sin6;
+ break;
+ }
+#endif
+ default:
+ return (-1);
+ break;
+ }
+ } else {
+ /*
+ * For cookies we use the src address NOT from the packet
+ * but from the original INIT
+ */
+ sa = altsa;
+ }
+ /* Turn off ECN until we get through all params */
+ stcb->asoc.ecn_allowed = 0;
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ /* mark all addresses that we have currently on the list */
+ net->dest_state |= SCTP_ADDR_NOT_IN_ASSOC;
+ }
+ /* does the source address already exist? if so skip it */
+ l_inp = inp = stcb->sctp_ep;
+
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net_tmp, local_sa, stcb);
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+
+ if ((stcb_tmp == NULL && inp == stcb->sctp_ep) || inp == NULL) {
+ /* we must add the source address */
+ /* no scope set here since we have a tcb already. */
+ if ((sa->sa_family == AF_INET) &&
+ (stcb->asoc.ipv4_addr_legal)) {
+ if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_2)) {
+ return (-1);
+ }
+ } else if ((sa->sa_family == AF_INET6) &&
+ (stcb->asoc.ipv6_addr_legal)) {
+ if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_3)) {
+ return (-2);
+ }
+ }
+ } else {
+ if (net_tmp != NULL && stcb_tmp == stcb) {
+ net_tmp->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC;
+ } else if (stcb_tmp != stcb) {
+ /* It belongs to another association? */
+ if (stcb_tmp)
+ SCTP_TCB_UNLOCK(stcb_tmp);
+ return (-3);
+ }
+ }
+ if (stcb->asoc.state == 0) {
+ /* the assoc was freed? */
+ return (-4);
+ }
+ /*
+ * peer must explicitly turn this on. This may have been initialized
+ * to be "on" in order to allow local addr changes while INIT's are
+ * in flight.
+ */
+ stcb->asoc.peer_supports_asconf = 0;
+ /* now we must go through each of the params. */
+ phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf));
+ while (phdr) {
+ ptype = ntohs(phdr->param_type);
+ plen = ntohs(phdr->param_length);
+ /*
+ * printf("ptype => %0x, plen => %d\n", (uint32_t)ptype,
+ * (int)plen);
+ */
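+		/* stop on a truncated or zero-length parameter */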
+ if (offset + plen > limit) {
+ break;
+ }
+ if (plen == 0) {
+ break;
+ }
+ if (ptype == SCTP_IPV4_ADDRESS) {
+ if (stcb->asoc.ipv4_addr_legal) {
+ struct sctp_ipv4addr_param *p4, p4_buf;
+
+ /* ok get the v4 address and check/add */
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)&p4_buf,
+ sizeof(p4_buf));
+ if (plen != sizeof(struct sctp_ipv4addr_param) ||
+ phdr == NULL) {
+ return (-5);
+ }
+ p4 = (struct sctp_ipv4addr_param *)phdr;
+ sin.sin_addr.s_addr = p4->addr;
+ if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
+ /* Skip multi-cast addresses */
+ goto next_param;
+ }
+ if ((sin.sin_addr.s_addr == INADDR_BROADCAST) ||
+ (sin.sin_addr.s_addr == INADDR_ANY)) {
+ goto next_param;
+ }
+ sa = (struct sockaddr *)&sin;
+ inp = stcb->sctp_ep;
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net,
+ local_sa, stcb);
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+
+ if ((stcb_tmp == NULL && inp == stcb->sctp_ep) ||
+ inp == NULL) {
+ /* we must add the source address */
+ /*
+ * no scope set since we have a tcb
+ * already
+ */
+
+ /*
+ * we must validate the state again
+ * here
+ */
+ add_it_now:
+ if (stcb->asoc.state == 0) {
+ /* the assoc was freed? */
+ return (-7);
+ }
+ if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_4)) {
+ return (-8);
+ }
+ } else if (stcb_tmp == stcb) {
+ if (stcb->asoc.state == 0) {
+ /* the assoc was freed? */
+ return (-10);
+ }
+ if (net != NULL) {
+ /* clear flag */
+ net->dest_state &=
+ ~SCTP_ADDR_NOT_IN_ASSOC;
+ }
+ } else {
+ /*
+ * strange, address is in another
+ * assoc? straighten out locks.
+ */
+ if (stcb_tmp) {
+ if (SCTP_GET_STATE(&stcb_tmp->asoc) & SCTP_STATE_COOKIE_WAIT) {
+ /*
+ * in setup state we
+ * abort this guy
+ */
+ sctp_abort_an_association(stcb_tmp->sctp_ep,
+ stcb_tmp, 1, NULL, 0);
+ goto add_it_now;
+ }
+ SCTP_TCB_UNLOCK(stcb_tmp);
+ }
+ if (stcb->asoc.state == 0) {
+ /* the assoc was freed? */
+ return (-12);
+ }
+ return (-13);
+ }
+ }
+ } else if (ptype == SCTP_IPV6_ADDRESS) {
+ if (stcb->asoc.ipv6_addr_legal) {
+ /* ok get the v6 address and check/add */
+ struct sctp_ipv6addr_param *p6, p6_buf;
+
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)&p6_buf,
+ sizeof(p6_buf));
+ if (plen != sizeof(struct sctp_ipv6addr_param) ||
+ phdr == NULL) {
+ return (-14);
+ }
+ p6 = (struct sctp_ipv6addr_param *)phdr;
+ memcpy((caddr_t)&sin6.sin6_addr, p6->addr,
+ sizeof(p6->addr));
+ if (IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) {
+ /* Skip multi-cast addresses */
+ goto next_param;
+ }
+ if (IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr)) {
+ /*
+					 * Link local makes no sense without
+ * scope
+ */
+ goto next_param;
+ }
+ sa = (struct sockaddr *)&sin6;
+ inp = stcb->sctp_ep;
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net,
+ local_sa, stcb);
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ if (stcb_tmp == NULL &&
+ (inp == stcb->sctp_ep || inp == NULL)) {
+ /*
+ * we must validate the state again
+ * here
+ */
+ add_it_now6:
+ if (stcb->asoc.state == 0) {
+ /* the assoc was freed? */
+ return (-16);
+ }
+ /*
+ * we must add the address, no scope
+ * set
+ */
+ if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_5)) {
+ return (-17);
+ }
+ } else if (stcb_tmp == stcb) {
+ /*
+ * we must validate the state again
+ * here
+ */
+ if (stcb->asoc.state == 0) {
+ /* the assoc was freed? */
+ return (-19);
+ }
+ if (net != NULL) {
+ /* clear flag */
+ net->dest_state &=
+ ~SCTP_ADDR_NOT_IN_ASSOC;
+ }
+ } else {
+ /*
+ * strange, address is in another
+ * assoc? straighten out locks.
+ */
+ if (stcb_tmp)
+ if (SCTP_GET_STATE(&stcb_tmp->asoc) & SCTP_STATE_COOKIE_WAIT) {
+ /*
+ * in setup state we
+ * abort this guy
+ */
+ sctp_abort_an_association(stcb_tmp->sctp_ep,
+ stcb_tmp, 1, NULL, 0);
+ goto add_it_now6;
+ }
+ SCTP_TCB_UNLOCK(stcb_tmp);
+
+ if (stcb->asoc.state == 0) {
+ /* the assoc was freed? */
+ return (-21);
+ }
+ return (-22);
+ }
+ }
+ } else if (ptype == SCTP_ECN_CAPABLE) {
+ stcb->asoc.ecn_allowed = 1;
+ } else if (ptype == SCTP_ULP_ADAPTATION) {
+ if (stcb->asoc.state != SCTP_STATE_OPEN) {
+ struct sctp_adaptation_layer_indication ai,
+ *aip;
+
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)&ai, sizeof(ai));
+ aip = (struct sctp_adaptation_layer_indication *)phdr;
+ if (aip) {
+ stcb->asoc.peers_adaptation = ntohl(aip->indication);
+ stcb->asoc.adaptation_needed = 1;
+ }
+ }
+ } else if (ptype == SCTP_SET_PRIM_ADDR) {
+ struct sctp_asconf_addr_param lstore, *fee;
+ struct sctp_asconf_addrv4_param *fii;
+ int lptype;
+ struct sockaddr *lsa = NULL;
+
+ stcb->asoc.peer_supports_asconf = 1;
+ if (plen > sizeof(lstore)) {
+ return (-23);
+ }
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)&lstore,
+ min(plen, sizeof(lstore)));
+ if (phdr == NULL) {
+ return (-24);
+ }
+ fee = (struct sctp_asconf_addr_param *)phdr;
+ lptype = ntohs(fee->addrp.ph.param_type);
+ if (lptype == SCTP_IPV4_ADDRESS) {
+ if (plen !=
+ sizeof(struct sctp_asconf_addrv4_param)) {
+ SCTP_PRINTF("Sizeof setprim in init/init ack not %d but %d - ignored\n",
+ (int)sizeof(struct sctp_asconf_addrv4_param),
+ plen);
+ } else {
+ fii = (struct sctp_asconf_addrv4_param *)fee;
+ sin.sin_addr.s_addr = fii->addrp.addr;
+ lsa = (struct sockaddr *)&sin;
+ }
+ } else if (lptype == SCTP_IPV6_ADDRESS) {
+ if (plen !=
+ sizeof(struct sctp_asconf_addr_param)) {
+ SCTP_PRINTF("Sizeof setprim (v6) in init/init ack not %d but %d - ignored\n",
+ (int)sizeof(struct sctp_asconf_addr_param),
+ plen);
+ } else {
+ memcpy(sin6.sin6_addr.s6_addr,
+ fee->addrp.addr,
+ sizeof(fee->addrp.addr));
+ lsa = (struct sockaddr *)&sin6;
+ }
+ }
+ if (lsa) {
+ (void)sctp_set_primary_addr(stcb, sa, NULL);
+ }
+ } else if (ptype == SCTP_HAS_NAT_SUPPORT) {
+ stcb->asoc.peer_supports_nat = 1;
+ } else if (ptype == SCTP_PRSCTP_SUPPORTED) {
+ /* Peer supports pr-sctp */
+ stcb->asoc.peer_supports_prsctp = 1;
+ } else if (ptype == SCTP_SUPPORTED_CHUNK_EXT) {
+ /* A supported extension chunk */
+ struct sctp_supported_chunk_types_param *pr_supported;
+ uint8_t local_store[SCTP_PARAM_BUFFER_SIZE];
+ int num_ent, i;
+
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)&local_store, min(sizeof(local_store), plen));
+ if (phdr == NULL) {
+ return (-25);
+ }
+ stcb->asoc.peer_supports_asconf = 0;
+ stcb->asoc.peer_supports_prsctp = 0;
+ stcb->asoc.peer_supports_pktdrop = 0;
+ stcb->asoc.peer_supports_strreset = 0;
+ stcb->asoc.peer_supports_nr_sack = 0;
+ stcb->asoc.peer_supports_auth = 0;
+ pr_supported = (struct sctp_supported_chunk_types_param *)phdr;
+ num_ent = plen - sizeof(struct sctp_paramhdr);
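+			/*
+			 * each supported chunk type is a single byte, so the
+			 * remaining parameter length equals the entry count
+			 */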
+ for (i = 0; i < num_ent; i++) {
+ switch (pr_supported->chunk_types[i]) {
+ case SCTP_ASCONF:
+ case SCTP_ASCONF_ACK:
+ stcb->asoc.peer_supports_asconf = 1;
+ break;
+ case SCTP_FORWARD_CUM_TSN:
+ stcb->asoc.peer_supports_prsctp = 1;
+ break;
+ case SCTP_PACKET_DROPPED:
+ stcb->asoc.peer_supports_pktdrop = 1;
+ break;
+ case SCTP_NR_SELECTIVE_ACK:
+ stcb->asoc.peer_supports_nr_sack = 1;
+ break;
+ case SCTP_STREAM_RESET:
+ stcb->asoc.peer_supports_strreset = 1;
+ break;
+ case SCTP_AUTHENTICATION:
+ stcb->asoc.peer_supports_auth = 1;
+ break;
+ default:
+ /* one I have not learned yet */
+ break;
+
+ }
+ }
+ } else if (ptype == SCTP_ECN_NONCE_SUPPORTED) {
+ /* Peer supports ECN-nonce */
+ stcb->asoc.peer_supports_ecn_nonce = 1;
+ stcb->asoc.ecn_nonce_allowed = 1;
+ } else if (ptype == SCTP_RANDOM) {
+ if (plen > sizeof(random_store))
+ break;
+ if (got_random) {
+ /* already processed a RANDOM */
+ goto next_param;
+ }
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)random_store,
+ min(sizeof(random_store), plen));
+ if (phdr == NULL)
+ return (-26);
+ p_random = (struct sctp_auth_random *)phdr;
+ random_len = plen - sizeof(*p_random);
+ /* enforce the random length */
+ if (random_len != SCTP_AUTH_RANDOM_SIZE_REQUIRED) {
+ SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: invalid RANDOM len\n");
+ return (-27);
+ }
+ got_random = 1;
+ } else if (ptype == SCTP_HMAC_LIST) {
+ int num_hmacs;
+ int i;
+
+ if (plen > sizeof(hmacs_store))
+ break;
+ if (got_hmacs) {
+ /* already processed a HMAC list */
+ goto next_param;
+ }
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)hmacs_store,
+ min(plen, sizeof(hmacs_store)));
+ if (phdr == NULL)
+ return (-28);
+ hmacs = (struct sctp_auth_hmac_algo *)phdr;
+ hmacs_len = plen - sizeof(*hmacs);
+ num_hmacs = hmacs_len / sizeof(hmacs->hmac_ids[0]);
+ /* validate the hmac list */
+ if (sctp_verify_hmac_param(hmacs, num_hmacs)) {
+ return (-29);
+ }
+ if (stcb->asoc.peer_hmacs != NULL)
+ sctp_free_hmaclist(stcb->asoc.peer_hmacs);
+ stcb->asoc.peer_hmacs = sctp_alloc_hmaclist(num_hmacs);
+ if (stcb->asoc.peer_hmacs != NULL) {
+ for (i = 0; i < num_hmacs; i++) {
+ (void)sctp_auth_add_hmacid(stcb->asoc.peer_hmacs,
+ ntohs(hmacs->hmac_ids[i]));
+ }
+ }
+ got_hmacs = 1;
+ } else if (ptype == SCTP_CHUNK_LIST) {
+ int i;
+
+ if (plen > sizeof(chunks_store))
+ break;
+ if (got_chklist) {
+ /* already processed a Chunks list */
+ goto next_param;
+ }
+ phdr = sctp_get_next_param(m, offset,
+ (struct sctp_paramhdr *)chunks_store,
+ min(plen, sizeof(chunks_store)));
+ if (phdr == NULL)
+ return (-30);
+ chunks = (struct sctp_auth_chunk_list *)phdr;
+ num_chunks = plen - sizeof(*chunks);
+ if (stcb->asoc.peer_auth_chunks != NULL)
+ sctp_clear_chunklist(stcb->asoc.peer_auth_chunks);
+ else
+ stcb->asoc.peer_auth_chunks = sctp_alloc_chunklist();
+ for (i = 0; i < num_chunks; i++) {
+ (void)sctp_auth_add_chunk(chunks->chunk_types[i],
+ stcb->asoc.peer_auth_chunks);
+ /* record asconf/asconf-ack if listed */
+ if (chunks->chunk_types[i] == SCTP_ASCONF)
+ saw_asconf = 1;
+ if (chunks->chunk_types[i] == SCTP_ASCONF_ACK)
+ saw_asconf_ack = 1;
+
+ }
+ got_chklist = 1;
+ } else if ((ptype == SCTP_HEARTBEAT_INFO) ||
+ (ptype == SCTP_STATE_COOKIE) ||
+ (ptype == SCTP_UNRECOG_PARAM) ||
+ (ptype == SCTP_COOKIE_PRESERVE) ||
+ (ptype == SCTP_SUPPORTED_ADDRTYPE) ||
+ (ptype == SCTP_ADD_IP_ADDRESS) ||
+ (ptype == SCTP_DEL_IP_ADDRESS) ||
+ (ptype == SCTP_ERROR_CAUSE_IND) ||
+ (ptype == SCTP_SUCCESS_REPORT)) {
+ /* don't care */ ;
+ } else {
+ if ((ptype & 0x8000) == 0x0000) {
+ /*
+ * must stop processing the rest of the
+ * param's. Any report bits were handled
+ * with the call to
+ * sctp_arethere_unrecognized_parameters()
+ * when the INIT or INIT-ACK was first seen.
+ */
+ break;
+ }
+ }
+
+next_param:
+ offset += SCTP_SIZE32(plen);
+ if (offset >= limit) {
+ break;
+ }
+ phdr = sctp_get_next_param(m, offset, &parm_buf,
+ sizeof(parm_buf));
+ }
+ /* Now check to see if we need to purge any addresses */
+ for (net = TAILQ_FIRST(&stcb->asoc.nets); net != NULL; net = net_tmp) {
+ net_tmp = TAILQ_NEXT(net, sctp_next);
+ if ((net->dest_state & SCTP_ADDR_NOT_IN_ASSOC) ==
+ SCTP_ADDR_NOT_IN_ASSOC) {
+ /* This address has been removed from the asoc */
+ /* remove and free it */
+ stcb->asoc.numnets--;
+ TAILQ_REMOVE(&stcb->asoc.nets, net, sctp_next);
+ sctp_free_remote_addr(net);
+ if (net == stcb->asoc.primary_destination) {
+ stcb->asoc.primary_destination = NULL;
+ sctp_select_primary_destination(stcb);
+ }
+ }
+ }
+ /* validate authentication required parameters */
+ if (got_random && got_hmacs) {
+ stcb->asoc.peer_supports_auth = 1;
+ } else {
+ stcb->asoc.peer_supports_auth = 0;
+ }
+ if (!stcb->asoc.peer_supports_auth && got_chklist) {
+ /* peer does not support auth but sent a chunks list? */
+ return (-31);
+ }
+ if (!SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk) && stcb->asoc.peer_supports_asconf &&
+ !stcb->asoc.peer_supports_auth) {
+ /* peer supports asconf but not auth? */
+ return (-32);
+ } else if ((stcb->asoc.peer_supports_asconf) && (stcb->asoc.peer_supports_auth) &&
+ ((saw_asconf == 0) || (saw_asconf_ack == 0))) {
+ return (-33);
+ }
+ /* concatenate the full random key */
+ keylen = sizeof(*p_random) + random_len + sizeof(*hmacs) + hmacs_len;
+ if (chunks != NULL) {
+ keylen += sizeof(*chunks) + num_chunks;
+ }
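+	/*
+	 * key layout: the RANDOM parameter, then the peer's chunk list (if
+	 * any), then the HMAC list, matching the copies below
+	 */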
+ new_key = sctp_alloc_key(keylen);
+ if (new_key != NULL) {
+ /* copy in the RANDOM */
+ if (p_random != NULL) {
+ keylen = sizeof(*p_random) + random_len;
+ bcopy(p_random, new_key->key, keylen);
+ }
+ /* append in the AUTH chunks */
+ if (chunks != NULL) {
+ bcopy(chunks, new_key->key + keylen,
+ sizeof(*chunks) + num_chunks);
+ keylen += sizeof(*chunks) + num_chunks;
+ }
+ /* append in the HMACs */
+ if (hmacs != NULL) {
+ bcopy(hmacs, new_key->key + keylen,
+ sizeof(*hmacs) + hmacs_len);
+ }
+ } else {
+ /* failed to get memory for the key */
+ return (-34);
+ }
+ if (stcb->asoc.authinfo.peer_random != NULL)
+ sctp_free_key(stcb->asoc.authinfo.peer_random);
+ stcb->asoc.authinfo.peer_random = new_key;
+ sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.assoc_keyid);
+ sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.recv_keyid);
+
+ return (0);
+}
+
+int
+sctp_set_primary_addr(struct sctp_tcb *stcb, struct sockaddr *sa,
+ struct sctp_nets *net)
+{
+ /* make sure the requested primary address exists in the assoc */
+ if (net == NULL && sa)
+ net = sctp_findnet(stcb, sa);
+
+ if (net == NULL) {
+ /* didn't find the requested primary address! */
+ return (-1);
+ } else {
+ /* set the primary address */
+ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) {
+ /* Must be confirmed, so queue to set */
+ net->dest_state |= SCTP_ADDR_REQ_PRIMARY;
+ return (0);
+ }
+ stcb->asoc.primary_destination = net;
+ net->dest_state &= ~SCTP_ADDR_WAS_PRIMARY;
+ net = TAILQ_FIRST(&stcb->asoc.nets);
+ if (net != stcb->asoc.primary_destination) {
+ /*
+ * first one on the list is NOT the primary
+ * sctp_cmpaddr() is much more efficient if the
+ * primary is the first on the list, make it so.
+ */
+ TAILQ_REMOVE(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next);
+ TAILQ_INSERT_HEAD(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next);
+ }
+ return (0);
+ }
+}
+
+int
+sctp_is_vtag_good(struct sctp_inpcb *inp, uint32_t tag, uint16_t lport, uint16_t rport, struct timeval *now, int save_in_twait)
+{
+ /*
+	 * This function serves two purposes. It will check whether a TAG
+	 * can be re-used and return 1 for "yes, it is ok" and 0 for "don't
+	 * use that tag". Secondarily, it will purge out old tags that can
+	 * be removed.
+ */
+ struct sctpvtaghead *chain;
+ struct sctp_tagblock *twait_block;
+ struct sctpasochead *head;
+ struct sctp_tcb *stcb;
+ int i;
+
+ SCTP_INP_INFO_RLOCK();
+ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag,
+ SCTP_BASE_INFO(hashasocmark))];
+ if (head == NULL) {
+ /* invalid vtag */
+ goto skip_vtag_check;
+ }
+ LIST_FOREACH(stcb, head, sctp_asocs) {
+ /*
+ * We choose not to lock anything here. TCB's can't be
+ * removed since we have the read lock, so they can't be
+ * freed on us, same thing for the INP. I may be wrong with
+ * this assumption, but we will go with it for now :-)
+ */
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
+ continue;
+ }
+ if (stcb->asoc.my_vtag == tag) {
+ /* candidate */
+ if (stcb->rport != rport) {
+ continue;
+ }
+ if (stcb->sctp_ep->sctp_lport != lport) {
+ continue;
+ }
+ /* Its a used tag set */
+ SCTP_INP_INFO_RUNLOCK();
+ return (0);
+ }
+ }
+skip_vtag_check:
+
+ chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
+ /* Now what about timed wait ? */
+ if (!LIST_EMPTY(chain)) {
+ /*
+		 * Block(s) are present, let's see if we have this tag in the
+ * list
+ */
+ LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
+ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
+ if (twait_block->vtag_block[i].v_tag == 0) {
+ /* not used */
+ continue;
+ } else if ((long)twait_block->vtag_block[i].tv_sec_at_expire <
+ now->tv_sec) {
+ /* Audit expires this guy */
+ twait_block->vtag_block[i].tv_sec_at_expire = 0;
+ twait_block->vtag_block[i].v_tag = 0;
+ twait_block->vtag_block[i].lport = 0;
+ twait_block->vtag_block[i].rport = 0;
+ } else if ((twait_block->vtag_block[i].v_tag == tag) &&
+ (twait_block->vtag_block[i].lport == lport) &&
+ (twait_block->vtag_block[i].rport == rport)) {
+ /* Bad tag, sorry :< */
+ SCTP_INP_INFO_RUNLOCK();
+ return (0);
+ }
+ }
+ }
+ }
+ SCTP_INP_INFO_RUNLOCK();
+ return (1);
+}
+
+
+static sctp_assoc_t reneged_asoc_ids[256];
+static uint8_t reneged_at = 0;
+
+
+static void
+sctp_drain_mbufs(struct sctp_inpcb *inp, struct sctp_tcb *stcb)
+{
+ /*
+ * We must hunt this association for MBUF's past the cumack (i.e.
+ * out of order data that we can renege on).
+ */
+ struct sctp_association *asoc;
+ struct sctp_tmit_chunk *chk, *nchk;
+ uint32_t cumulative_tsn_p1;
+ struct sctp_queued_to_read *ctl, *nctl;
+ int cnt, strmat;
+ uint32_t gap, i;
+ int fnd = 0;
+
+ /* We look for anything larger than the cum-ack + 1 */
+
+ asoc = &stcb->asoc;
+ if (asoc->cumulative_tsn == asoc->highest_tsn_inside_map) {
+ /* none we can reneg on. */
+ return;
+ }
+ SCTP_STAT_INCR(sctps_protocol_drains_done);
+ cumulative_tsn_p1 = asoc->cumulative_tsn + 1;
+ cnt = 0;
+ /* First look in the re-assembly queue */
+ chk = TAILQ_FIRST(&asoc->reasmqueue);
+ while (chk) {
+ /* Get the next one */
+ nchk = TAILQ_NEXT(chk, sctp_next);
+ if (compare_with_wrap(chk->rec.data.TSN_seq,
+ cumulative_tsn_p1, MAX_TSN)) {
+ /* Yep it is above cum-ack */
+ cnt++;
+ SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.TSN_seq, asoc->mapping_array_base_tsn);
+ asoc->size_on_reasm_queue = sctp_sbspace_sub(asoc->size_on_reasm_queue, chk->send_size);
+ sctp_ucount_decr(asoc->cnt_on_reasm_queue);
+ SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
+ TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ sctp_free_a_chunk(stcb, chk);
+ }
+ chk = nchk;
+ }
+ /* Ok that was fun, now we will drain all the inbound streams? */
+ for (strmat = 0; strmat < asoc->streamincnt; strmat++) {
+ ctl = TAILQ_FIRST(&asoc->strmin[strmat].inqueue);
+ while (ctl) {
+ nctl = TAILQ_NEXT(ctl, next);
+ if (compare_with_wrap(ctl->sinfo_tsn,
+ cumulative_tsn_p1, MAX_TSN)) {
+ /* Yep it is above cum-ack */
+ cnt++;
+ SCTP_CALC_TSN_TO_GAP(gap, ctl->sinfo_tsn, asoc->mapping_array_base_tsn);
+ asoc->size_on_all_streams = sctp_sbspace_sub(asoc->size_on_all_streams, ctl->length);
+ sctp_ucount_decr(asoc->cnt_on_all_streams);
+ SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
+ TAILQ_REMOVE(&asoc->strmin[strmat].inqueue, ctl, next);
+ if (ctl->data) {
+ sctp_m_freem(ctl->data);
+ ctl->data = NULL;
+ }
+ sctp_free_remote_addr(ctl->whoFrom);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), ctl);
+ SCTP_DECR_READQ_COUNT();
+ }
+ ctl = nctl;
+ }
+ }
+ if (cnt) {
+ /* We must back down to see what the new highest is */
+ for (i = asoc->highest_tsn_inside_map;
+ (compare_with_wrap(i, asoc->mapping_array_base_tsn, MAX_TSN) || (i == asoc->mapping_array_base_tsn));
+ i--) {
+ SCTP_CALC_TSN_TO_GAP(gap, i, asoc->mapping_array_base_tsn);
+ if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) {
+ asoc->highest_tsn_inside_map = i;
+ fnd = 1;
+ break;
+ }
+ }
+ if (!fnd) {
+ asoc->highest_tsn_inside_map = asoc->mapping_array_base_tsn - 1;
+ }
+ /*
+ * Question, should we go through the delivery queue? The
+ * only reason things are on here is the app not reading OR
+ * a p-d-api up. An attacker COULD send enough in to
+ * initiate the PD-API and then send a bunch of stuff to
+ * other streams... these would wind up on the delivery
+ * queue.. and then we would not get to them. But in order
+ * to do this I then have to back-track and un-deliver
+ * sequence numbers in streams.. el-yucko. I think for now
+ * we will NOT look at the delivery queue and leave it to be
+ * something to consider later. An alternative would be to
+ * abort the P-D-API with a notification and then deliver
+ * the data.... Or another method might be to keep track of
+ * how many times the situation occurs and if we see a
+ * possible attack underway just abort the association.
+ */
+#ifdef SCTP_DEBUG
+ SCTPDBG(SCTP_DEBUG_PCB1, "Freed %d chunks from reneg harvest\n", cnt);
+#endif
+ /*
+ * Now do we need to find a new
+ * asoc->highest_tsn_inside_map?
+ */
+ asoc->last_revoke_count = cnt;
+ (void)SCTP_OS_TIMER_STOP(&stcb->asoc.dack_timer.timer);
+ /* sa_ignore NO_NULL_CHK */
+ sctp_send_sack(stcb);
+ sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_DRAIN, SCTP_SO_NOT_LOCKED);
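+		/*
+		 * remember which assocs we reneged on in a small debug ring;
+		 * the uint8_t index wraps automatically at 256 entries
+		 */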
+ reneged_asoc_ids[reneged_at] = sctp_get_associd(stcb);
+ reneged_at++;
+ }
+ /*
+ * Another issue, in un-setting the TSN's in the mapping array we
+ * DID NOT adjust the highest_tsn marker. This will cause one of
+ * two things to occur. It may cause us to do extra work in checking
+ * for our mapping array movement. More importantly it may cause us
+ * to SACK every datagram. This may not be a bad thing though since
+ * we will recover once we get our cum-ack above and all this stuff
+ * we dumped recovered.
+ */
+}
+
+void
+sctp_drain()
+{
+ /*
+ * We must walk the PCB lists for ALL associations here. The system
+ * is LOW on MBUF's and needs help. This is where reneging will
+ * occur. We really hope this does NOT happen!
+ */
+ VNET_ITERATOR_DECL(vnet_iter);
+ VNET_LIST_RLOCK_NOSLEEP();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ struct sctp_inpcb *inp;
+ struct sctp_tcb *stcb;
+
+ SCTP_STAT_INCR(sctps_protocol_drain_calls);
+ if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) {
+#ifdef VIMAGE
+ continue;
+#else
+ return;
+#endif
+ }
+ SCTP_INP_INFO_RLOCK();
+ LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) {
+ /* For each endpoint */
+ SCTP_INP_RLOCK(inp);
+ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
+ /* For each association */
+ SCTP_TCB_LOCK(stcb);
+ sctp_drain_mbufs(inp, stcb);
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ SCTP_INP_RUNLOCK(inp);
+ }
+ SCTP_INP_INFO_RUNLOCK();
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK_NOSLEEP();
+}
+
+/*
+ * Start a new iterator. It iterates through all endpoints and associations
+ * based on the pcb_state flags and asoc_state. "af" (mandatory) is executed
+ * for all matching assocs and "ef" (optional) is executed when the iterator
+ * completes. "inpf" (optional) is executed for each new endpoint as it is
+ * being iterated through. "inpe" (optional) is called when the inp completes
+ * its way through all the stcbs.
+ */
+int
+sctp_initiate_iterator(inp_func inpf,
+ asoc_func af,
+ inp_func inpe,
+ uint32_t pcb_state,
+ uint32_t pcb_features,
+ uint32_t asoc_state,
+ void *argp,
+ uint32_t argi,
+ end_func ef,
+ struct sctp_inpcb *s_inp,
+ uint8_t chunk_output_off)
+{
+ struct sctp_iterator *it = NULL;
+
+ if (af == NULL) {
+ return (-1);
+ }
+ SCTP_MALLOC(it, struct sctp_iterator *, sizeof(struct sctp_iterator),
+ SCTP_M_ITER);
+ if (it == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM);
+ return (ENOMEM);
+ }
+ memset(it, 0, sizeof(*it));
+ it->function_assoc = af;
+ it->function_inp = inpf;
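+	/* with no per-endpoint function, mark the endpoint step already done */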
+ if (inpf)
+ it->done_current_ep = 0;
+ else
+ it->done_current_ep = 1;
+ it->function_atend = ef;
+ it->pointer = argp;
+ it->val = argi;
+ it->pcb_flags = pcb_state;
+ it->pcb_features = pcb_features;
+ it->asoc_state = asoc_state;
+ it->function_inp_end = inpe;
+ it->no_chunk_output = chunk_output_off;
+ it->vn = curvnet;
+ if (s_inp) {
+ /* Assume lock is held here */
+ it->inp = s_inp;
+ SCTP_INP_INCR_REF(it->inp);
+ it->iterator_flags = SCTP_ITERATOR_DO_SINGLE_INP;
+ } else {
+ SCTP_INP_INFO_RLOCK();
+ it->inp = LIST_FIRST(&SCTP_BASE_INFO(listhead));
+ if (it->inp) {
+ SCTP_INP_INCR_REF(it->inp);
+ }
+ SCTP_INP_INFO_RUNLOCK();
+ it->iterator_flags = SCTP_ITERATOR_DO_ALL_INP;
+
+ }
+ SCTP_IPI_ITERATOR_WQ_LOCK();
+
+ TAILQ_INSERT_TAIL(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr);
+ if (sctp_it_ctl.iterator_running == 0) {
+ sctp_wakeup_iterator();
+ }
+ SCTP_IPI_ITERATOR_WQ_UNLOCK();
+ /* sa_ignore MEMLEAK {memory is put on the tailq for the iterator} */
+ return (0);
+}
diff --git a/freebsd/sys/netinet/sctp_pcb.h b/freebsd/sys/netinet/sctp_pcb.h
new file mode 100644
index 00000000..a4f4d30c
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_pcb.h
@@ -0,0 +1,632 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_pcb.h,v 1.21 2005/07/16 01:18:47 suz Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_pcb_h__
+#define __sctp_pcb_h__
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp.h>
+#include <freebsd/netinet/sctp_constants.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+
+LIST_HEAD(sctppcbhead, sctp_inpcb);
+LIST_HEAD(sctpasochead, sctp_tcb);
+LIST_HEAD(sctpladdr, sctp_laddr);
+LIST_HEAD(sctpvtaghead, sctp_tagblock);
+LIST_HEAD(sctp_vrflist, sctp_vrf);
+LIST_HEAD(sctp_ifnlist, sctp_ifn);
+LIST_HEAD(sctp_ifalist, sctp_ifa);
+TAILQ_HEAD(sctp_readhead, sctp_queued_to_read);
+TAILQ_HEAD(sctp_streamhead, sctp_stream_queue_pending);
+
+#include <freebsd/netinet/sctp_structs.h>
+#include <freebsd/netinet/sctp_auth.h>
+
+#define SCTP_PCBHASH_ALLADDR(port, mask) (port & mask)
+#define SCTP_PCBHASH_ASOC(tag, mask) (tag & mask)
+
+struct sctp_vrf {
+ LIST_ENTRY(sctp_vrf) next_vrf;
+ struct sctp_ifalist *vrf_addr_hash;
+ struct sctp_ifnlist ifnlist;
+ uint32_t vrf_id;
+ uint32_t tbl_id_v4; /* default v4 table id */
+ uint32_t tbl_id_v6; /* default v6 table id */
+ uint32_t total_ifa_count;
+ u_long vrf_addr_hashmark;
+ uint32_t refcount;
+};
+
+struct sctp_ifn {
+ struct sctp_ifalist ifalist;
+ struct sctp_vrf *vrf;
+ LIST_ENTRY(sctp_ifn) next_ifn;
+ LIST_ENTRY(sctp_ifn) next_bucket;
+ void *ifn_p; /* never access without appropriate lock */
+ uint32_t ifn_mtu;
+ uint32_t ifn_type;
+ uint32_t ifn_index; /* shorthand way to look at ifn for reference */
+	uint32_t refcount;	/* number of references held; should be >=
+				 * ifa_count */
+ uint32_t ifa_count; /* IFA's we hold (in our list - ifalist) */
+ uint32_t num_v6; /* number of v6 addresses */
+ uint32_t num_v4; /* number of v4 addresses */
+ uint32_t registered_af; /* registered address family for i/f events */
+ char ifn_name[SCTP_IFNAMSIZ];
+};
+
+/* SCTP local IFA flags */
+#define SCTP_ADDR_VALID 0x00000001 /* its up and active */
+#define SCTP_BEING_DELETED 0x00000002 /* being deleted, when
+ * refcount = 0. Note that it
+ * is pulled from the ifn list
+ * and ifa_p is nulled right
+ * away but it cannot be freed
+ * until the last *net
+ * pointing to it is deleted. */
+#define SCTP_ADDR_DEFER_USE 0x00000004 /* Hold off using this one */
+#define SCTP_ADDR_IFA_UNUSEABLE 0x00000008
+
+struct sctp_ifa {
+ LIST_ENTRY(sctp_ifa) next_ifa;
+ LIST_ENTRY(sctp_ifa) next_bucket;
+ struct sctp_ifn *ifn_p; /* back pointer to parent ifn */
+	void *ifa;		/* pointer to ifa, needed for flag updates, for
+				 * which we MUST hold the appropriate locks.
+				 * This is for V6. */
+ union sctp_sockstore address;
+	uint32_t refcount;	/* number of folks referring to this */
+ uint32_t flags;
+ uint32_t localifa_flags;
+ uint32_t vrf_id; /* vrf_id of this addr (for deleting) */
+ uint8_t src_is_loop;
+ uint8_t src_is_priv;
+ uint8_t src_is_glob;
+ uint8_t resv;
+};
+
+struct sctp_laddr {
+ LIST_ENTRY(sctp_laddr) sctp_nxt_addr; /* next in list */
+ struct sctp_ifa *ifa;
+	uint32_t action;	/* Used during asconf and adding; if non-zero,
+				 * src-addr selection will not consider this
+				 * address. */
+ struct timeval start_time; /* time when this address was created */
+};
+
+struct sctp_block_entry {
+ int error;
+};
+
+struct sctp_timewait {
+ uint32_t tv_sec_at_expire; /* the seconds from boot to expire */
+ uint32_t v_tag; /* the vtag that can not be reused */
+ uint16_t lport; /* the local port used in vtag */
+ uint16_t rport; /* the remote port used in vtag */
+};
+
+struct sctp_tagblock {
+ LIST_ENTRY(sctp_tagblock) sctp_nxt_tagblock;
+ struct sctp_timewait vtag_block[SCTP_NUMBER_IN_VTAG_BLOCK];
+};
+
+
+struct sctp_epinfo {
+ struct socket *udp_tun_socket;
+ struct sctpasochead *sctp_asochash;
+ u_long hashasocmark;
+
+ struct sctppcbhead *sctp_ephash;
+ u_long hashmark;
+
+ /*-
+ * The TCP model represents a substantial overhead in that we get an
+ * additional hash table to keep explicit connections in. The
+ * listening TCP endpoint will exist in the usual ephash above and
+ * accept only INIT's. It will be incapable of sending off an INIT.
+ * When a dg arrives we must look in the normal ephash. If we find a
+ * TCP endpoint that will tell us to go to the specific endpoint
+ * hash and re-hash to find the right assoc/socket. If we find a UDP
+ * model socket we then must complete the lookup. If this fails,
+ * i.e. no association can be found then we must continue to see if
+ * a sctp_peeloff()'d socket is in the tcpephash (a spun off socket
+ * acts like a TCP model connected socket).
+ */
+ struct sctppcbhead *sctp_tcpephash;
+ u_long hashtcpmark;
+ uint32_t hashtblsize;
+
+ struct sctp_vrflist *sctp_vrfhash;
+ u_long hashvrfmark;
+
+ struct sctp_ifnlist *vrf_ifn_hash;
+ u_long vrf_ifn_hashmark;
+
+ struct sctppcbhead listhead;
+ struct sctpladdr addr_wq;
+
+ /* ep zone info */
+ sctp_zone_t ipi_zone_ep;
+ sctp_zone_t ipi_zone_asoc;
+ sctp_zone_t ipi_zone_laddr;
+ sctp_zone_t ipi_zone_net;
+ sctp_zone_t ipi_zone_chunk;
+ sctp_zone_t ipi_zone_readq;
+ sctp_zone_t ipi_zone_strmoq;
+ sctp_zone_t ipi_zone_asconf;
+ sctp_zone_t ipi_zone_asconf_ack;
+
+ struct rwlock ipi_ep_mtx;
+ struct mtx ipi_iterator_wq_mtx;
+ struct rwlock ipi_addr_mtx;
+ struct mtx ipi_pktlog_mtx;
+ struct mtx wq_addr_mtx;
+ uint32_t ipi_count_ep;
+
+ /* assoc/tcb zone info */
+ uint32_t ipi_count_asoc;
+
+ /* local addrlist zone info */
+ uint32_t ipi_count_laddr;
+
+ /* remote addrlist zone info */
+ uint32_t ipi_count_raddr;
+
+ /* chunk structure list for output */
+ uint32_t ipi_count_chunk;
+
+ /* socket queue zone info */
+ uint32_t ipi_count_readq;
+
+ /* socket queue zone info */
+ uint32_t ipi_count_strmoq;
+
+ /* Number of vrfs */
+ uint32_t ipi_count_vrfs;
+
+ /* Number of ifns */
+ uint32_t ipi_count_ifns;
+
+ /* Number of ifas */
+ uint32_t ipi_count_ifas;
+
+ /* system wide number of free chunks hanging around */
+ uint32_t ipi_free_chunks;
+ uint32_t ipi_free_strmoq;
+
+ struct sctpvtaghead vtag_timewait[SCTP_STACK_VTAG_HASH_SIZE];
+
+ /* address work queue handling */
+ struct sctp_timer addr_wq_timer;
+
+};
+
+
+struct sctp_base_info {
+ /*
+ * All static structures that anchor the system must be here.
+ */
+ struct sctp_epinfo sctppcbinfo;
+#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
+ struct sctpstat sctpstat[MAXCPU];
+#else
+ struct sctpstat sctpstat;
+#endif
+ struct sctp_sysctl sctpsysctl;
+ uint8_t first_time;
+ char sctp_pcb_initialized;
+#if defined(SCTP_PACKET_LOGGING)
+ int packet_log_writers;
+ int packet_log_end;
+ uint8_t packet_log_buffer[SCTP_PACKET_LOG_SIZE];
+#endif
+};
+
+/*-
+ * Here we have all the relevant information for each SCTP entity created. We
+ * will need to modify this as appropriate. We also need to figure out how to
+ * access /dev/random.
+ */
+struct sctp_pcb {
+ unsigned int time_of_secret_change; /* number of seconds from
+ * timeval.tv_sec */
+ uint32_t secret_key[SCTP_HOW_MANY_SECRETS][SCTP_NUMBER_OF_SECRETS];
+ unsigned int size_of_a_cookie;
+
+ unsigned int sctp_timeoutticks[SCTP_NUM_TMRS];
+ unsigned int sctp_minrto;
+ unsigned int sctp_maxrto;
+ unsigned int initial_rto;
+ int initial_init_rto_max;
+
+ unsigned int sctp_sack_freq;
+ uint32_t sctp_sws_sender;
+ uint32_t sctp_sws_receiver;
+
+ uint32_t sctp_default_cc_module;
+ /* authentication related fields */
+ struct sctp_keyhead shared_keys;
+ sctp_auth_chklist_t *local_auth_chunks;
+ sctp_hmaclist_t *local_hmacs;
+ uint16_t default_keyid;
+
+ /* various thresholds */
+ /* Max times I will init at a guy */
+ uint16_t max_init_times;
+
+ /* Max times I will send before we consider someone dead */
+ uint16_t max_send_times;
+
+ uint16_t def_net_failure;
+
+	/* number of streams to pre-open on an association */
+ uint16_t pre_open_stream_count;
+ uint16_t max_open_streams_intome;
+
+ /* random number generator */
+ uint32_t random_counter;
+ uint8_t random_numbers[SCTP_SIGNATURE_ALOC_SIZE];
+ uint8_t random_store[SCTP_SIGNATURE_ALOC_SIZE];
+
+ /*
+ * This timer is kept running per endpoint. When it fires it will
+	 * change the secret key. The default is once an hour.
+ */
+ struct sctp_timer signature_change;
+
+ /* Zero copy full buffer timer */
+ struct sctp_timer zero_copy_timer;
+ /* Zero copy app to transport (sendq) read repulse timer */
+ struct sctp_timer zero_copy_sendq_timer;
+ uint32_t def_cookie_life;
+ /* defaults to 0 */
+ int auto_close_time;
+ uint32_t initial_sequence_debug;
+ uint32_t adaptation_layer_indicator;
+ uint32_t store_at;
+ uint8_t max_burst;
+ char current_secret_number;
+ char last_secret_number;
+};
+
+#ifndef SCTP_ALIGNMENT
+#define SCTP_ALIGNMENT 32
+#endif
+
+#ifndef SCTP_ALIGNM1
+#define SCTP_ALIGNM1 (SCTP_ALIGNMENT-1)
+#endif
+
+#define sctp_lport ip_inp.inp.inp_lport
+
+struct sctp_pcbtsn_rlog {
+ uint32_t vtag;
+ uint16_t strm;
+ uint16_t seq;
+ uint16_t sz;
+ uint16_t flgs;
+};
+
+#define SCTP_READ_LOG_SIZE 135 /* we choose the number to make a pcb a page */
+
+
+struct sctp_inpcb {
+ /*-
+	 * put an inpcb in front of it all, kind of a waste but we need it
+	 * for compatibility with all the other stuff.
+ */
+ union {
+ struct inpcb inp;
+ char align[(sizeof(struct in6pcb) + SCTP_ALIGNM1) &
+ ~SCTP_ALIGNM1];
+ } ip_inp;
+
+
+ /* Socket buffer lock protects read_queue and of course sb_cc */
+ struct sctp_readhead read_queue;
+
+ LIST_ENTRY(sctp_inpcb) sctp_list; /* lists all endpoints */
+ /* hash of all endpoints for model */
+ LIST_ENTRY(sctp_inpcb) sctp_hash;
+ /* count of local addresses bound, 0 if bound all */
+ int laddr_count;
+
+ /* list of addrs in use by the EP, NULL if bound-all */
+ struct sctpladdr sctp_addr_list;
+ /*
+ * used for source address selection rotation when we are subset
+ * bound
+ */
+ struct sctp_laddr *next_addr_touse;
+
+ /* back pointer to our socket */
+ struct socket *sctp_socket;
+ uint32_t sctp_flags; /* INP state flag set */
+ uint32_t sctp_features; /* Feature flags */
+ uint32_t sctp_mobility_features; /* Mobility Feature flags */
+ struct sctp_pcb sctp_ep;/* SCTP ep data */
+ /* head of the hash of all associations */
+ struct sctpasochead *sctp_tcbhash;
+ u_long sctp_hashmark;
+ /* head of the list of all associations */
+ struct sctpasochead sctp_asoc_list;
+#ifdef SCTP_TRACK_FREED_ASOCS
+ struct sctpasochead sctp_asoc_free_list;
+#endif
+ struct sctp_iterator *inp_starting_point_for_iterator;
+ uint32_t sctp_frag_point;
+ uint32_t partial_delivery_point;
+ uint32_t sctp_context;
+ uint32_t sctp_cmt_on_off;
+ struct sctp_nonpad_sndrcvinfo def_send;
+ /*-
+	 * These three (pkt, pkt_last and control)
+	 * are here for the sosend_dgram routine.
+	 * However, I don't think anyone in the
+	 * current FreeBSD kernel calls this, so
+	 * together with sctp_sendm they are
+	 * candidates for de-supporting.
+ */
+ struct mbuf *pkt, *pkt_last;
+ struct mbuf *control;
+ struct mtx inp_mtx;
+ struct mtx inp_create_mtx;
+ struct mtx inp_rdata_mtx;
+ int32_t refcount;
+ uint32_t def_vrf_id;
+ uint32_t total_sends;
+ uint32_t total_recvs;
+ uint32_t last_abort_code;
+ uint32_t total_nospaces;
+ struct sctpasochead *sctp_asocidhash;
+ u_long hashasocidmark;
+ uint32_t sctp_associd_counter;
+
+#ifdef SCTP_ASOCLOG_OF_TSNS
+ struct sctp_pcbtsn_rlog readlog[SCTP_READ_LOG_SIZE];
+ uint32_t readlog_index;
+#endif
+};
+
+struct sctp_tcb {
+ struct socket *sctp_socket; /* back pointer to socket */
+ struct sctp_inpcb *sctp_ep; /* back pointer to ep */
+ LIST_ENTRY(sctp_tcb) sctp_tcbhash; /* next link in hash
+ * table */
+ LIST_ENTRY(sctp_tcb) sctp_tcblist; /* list of all of the
+ * TCB's */
+ LIST_ENTRY(sctp_tcb) sctp_tcbasocidhash; /* next link in asocid
+ * hash table */
+ LIST_ENTRY(sctp_tcb) sctp_asocs; /* vtag hash list */
+ struct sctp_block_entry *block_entry; /* pointer locked by socket
+ * send buffer */
+ struct sctp_association asoc;
+ /*-
+ * freed_by_sorcv_sincelast is protected by the sockbuf_lock NOT the
+	 * tcb_lock. It's special in this way to help avoid extra mutex calls
+ * in the reading of data.
+ */
+ uint32_t freed_by_sorcv_sincelast;
+ uint32_t total_sends;
+ uint32_t total_recvs;
+ int freed_from_where;
+ uint16_t rport; /* remote port in network format */
+ uint16_t resv;
+ struct mtx tcb_mtx;
+ struct mtx tcb_send_mtx;
+};
+
+
+
+#include <freebsd/netinet/sctp_lock_bsd.h>
+
+
+/* TODO where to put non-_KERNEL things for __Userspace__? */
+#if defined(_KERNEL) || defined(__Userspace__)
+
+/* Attention Julian, this is the extern that
+ * goes with the base info. sctp_pcb.c has
+ * the real definition.
+ */
+VNET_DECLARE(struct sctp_base_info, system_base_info);
+
+#ifdef INET6
+int SCTP6_ARE_ADDR_EQUAL(struct sockaddr_in6 *a, struct sockaddr_in6 *b);
+
+#endif
+
+void sctp_fill_pcbinfo(struct sctp_pcbinfo *);
+
+struct sctp_ifn *
+ sctp_find_ifn(void *ifn, uint32_t ifn_index);
+
+struct sctp_vrf *sctp_allocate_vrf(int vrfid);
+struct sctp_vrf *sctp_find_vrf(uint32_t vrfid);
+void sctp_free_vrf(struct sctp_vrf *vrf);
+
+/*-
+ * Change address state, can be used if
+ * O/S supports telling transports about
+ * changes to IFA/IFN's (link layer triggers).
+ * If an ifn goes down, we will do src-addr-selection
+ * and NOT use it as a source address. This does
+ * not stop the routing system from routing out
+ * that interface, but we won't put it as a source.
+ */
+void sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index);
+void sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index);
+
+struct sctp_ifa *
+sctp_add_addr_to_vrf(uint32_t vrfid,
+ void *ifn, uint32_t ifn_index, uint32_t ifn_type,
+ const char *if_name,
+ void *ifa, struct sockaddr *addr, uint32_t ifa_flags,
+ int dynamic_add);
+
+void sctp_update_ifn_mtu(uint32_t ifn_index, uint32_t mtu);
+
+void sctp_free_ifn(struct sctp_ifn *sctp_ifnp);
+void sctp_free_ifa(struct sctp_ifa *sctp_ifap);
+
+
+void
+sctp_del_addr_from_vrf(uint32_t vrfid, struct sockaddr *addr,
+ uint32_t ifn_index, const char *if_name);
+
+
+
+struct sctp_nets *sctp_findnet(struct sctp_tcb *, struct sockaddr *);
+
+struct sctp_inpcb *sctp_pcb_findep(struct sockaddr *, int, int, uint32_t);
+
+int
+sctp_inpcb_bind(struct socket *, struct sockaddr *,
+ struct sctp_ifa *, struct thread *);
+
+struct sctp_tcb *
+sctp_findassociation_addr(struct mbuf *, int, int,
+ struct sctphdr *, struct sctp_chunkhdr *, struct sctp_inpcb **,
+ struct sctp_nets **, uint32_t vrf_id);
+
+struct sctp_tcb *
+sctp_findassociation_addr_sa(struct sockaddr *,
+ struct sockaddr *, struct sctp_inpcb **, struct sctp_nets **, int, uint32_t);
+
+void
+sctp_move_pcb_and_assoc(struct sctp_inpcb *, struct sctp_inpcb *,
+ struct sctp_tcb *);
+
+/*-
+ * For this call ep_addr, the to is the destination endpoint address of the
+ * peer (relative to outbound). The from field is only used if the TCP model
+ * is enabled and helps distinguish amongst the subset bound (non-boundall).
+ * The TCP model MAY change the actual ep field, this is why it is passed.
+ */
+struct sctp_tcb *
+sctp_findassociation_ep_addr(struct sctp_inpcb **,
+ struct sockaddr *, struct sctp_nets **, struct sockaddr *,
+ struct sctp_tcb *);
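+
+/*-
+ * Minimal usage sketch (editor's illustration, not part of the FreeBSD
+ * sources).  'remote' names the peer address; passing NULL for the from
+ * address and for the locked tcb is an assumption covering the simple,
+ * non-TCP-model case.  The endpoint pointer goes by reference because the
+ * TCP model may replace it, as the comment above notes.
+ *
+ *	struct sctp_inpcb *inp = my_endpoint;
+ *	struct sctp_nets *net = NULL;
+ *	struct sctp_tcb *stcb;
+ *
+ *	stcb = sctp_findassociation_ep_addr(&inp, remote, &net, NULL, NULL);
+ *	if (stcb != NULL) {
+ *		... work with stcb/net, then release it as the lookup
+ *		    routines in this file require ...
+ *	}
+ */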
+
+struct sctp_tcb *
+ sctp_findasoc_ep_asocid_locked(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock);
+
+struct sctp_tcb *
+sctp_findassociation_ep_asocid(struct sctp_inpcb *,
+ sctp_assoc_t, int);
+
+struct sctp_tcb *
+sctp_findassociation_ep_asconf(struct mbuf *, int, int,
+ struct sctphdr *, struct sctp_inpcb **, struct sctp_nets **, uint32_t vrf_id);
+
+int sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id);
+
+int sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id);
+
+void sctp_inpcb_free(struct sctp_inpcb *, int, int);
+
+struct sctp_tcb *
+sctp_aloc_assoc(struct sctp_inpcb *, struct sockaddr *,
+ int *, uint32_t, uint32_t, struct thread *);
+
+int sctp_free_assoc(struct sctp_inpcb *, struct sctp_tcb *, int, int);
+
+
+void sctp_delete_from_timewait(uint32_t, uint16_t, uint16_t);
+
+int sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport);
+
+void
+ sctp_add_vtag_to_timewait(uint32_t tag, uint32_t time, uint16_t lport, uint16_t rport);
+
+void sctp_add_local_addr_ep(struct sctp_inpcb *, struct sctp_ifa *, uint32_t);
+
+int sctp_insert_laddr(struct sctpladdr *, struct sctp_ifa *, uint32_t);
+
+void sctp_remove_laddr(struct sctp_laddr *);
+
+void sctp_del_local_addr_ep(struct sctp_inpcb *, struct sctp_ifa *);
+
+int sctp_add_remote_addr(struct sctp_tcb *, struct sockaddr *, int, int);
+
+void sctp_remove_net(struct sctp_tcb *, struct sctp_nets *);
+
+int sctp_del_remote_addr(struct sctp_tcb *, struct sockaddr *);
+
+void sctp_pcb_init(void);
+
+void sctp_pcb_finish(void);
+
+void sctp_add_local_addr_restricted(struct sctp_tcb *, struct sctp_ifa *);
+void sctp_del_local_addr_restricted(struct sctp_tcb *, struct sctp_ifa *);
+
+int
+sctp_load_addresses_from_init(struct sctp_tcb *, struct mbuf *, int, int,
+ int, struct sctphdr *, struct sockaddr *);
+
+int
+sctp_set_primary_addr(struct sctp_tcb *, struct sockaddr *,
+ struct sctp_nets *);
+
+int sctp_is_vtag_good(struct sctp_inpcb *, uint32_t, uint16_t lport, uint16_t rport, struct timeval *, int);
+
+/* void sctp_drain(void); */
+
+int sctp_destination_is_reachable(struct sctp_tcb *, struct sockaddr *);
+
+int sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp);
+
+/*-
+ * A NULL inpcb in the last arg indicates run on ALL ep's. A specific inp in
+ * the last arg indicates run on ONLY the assoc's of that endpoint.
+ */
+int
+sctp_initiate_iterator(inp_func inpf,
+ asoc_func af,
+ inp_func inpe,
+ uint32_t, uint32_t,
+ uint32_t, void *,
+ uint32_t,
+ end_func ef,
+ struct sctp_inpcb *,
+ uint8_t co_off);
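+
+/*-
+ * Illustrative call form only (editor's sketch): my_asoc_func, my_end_func
+ * and my_arg are hypothetical and must match the typedefs in
+ * sctp_structs.h.  With NULL as the inpcb argument every endpoint is
+ * walked; passing a specific inp restricts the walk to that endpoint's
+ * associations.
+ *
+ *	(void)sctp_initiate_iterator(NULL, my_asoc_func, NULL,
+ *	    SCTP_PCB_ANY_FLAGS, SCTP_PCB_ANY_FEATURES, SCTP_ASOC_ANY_STATE,
+ *	    my_arg, 0, my_end_func, NULL, 0);
+ */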
+
+#ifdef INVARIANTS
+void
+ sctp_validate_no_locks(struct sctp_inpcb *inp);
+
+#endif
+
+#endif /* _KERNEL */
+#endif /* !__sctp_pcb_h__ */
diff --git a/freebsd/sys/netinet/sctp_peeloff.c b/freebsd/sys/netinet/sctp_peeloff.c
new file mode 100644
index 00000000..7b859bba
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_peeloff.c
@@ -0,0 +1,240 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+/* $KAME: sctp_peeloff.c,v 1.13 2005/03/06 16:04:18 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_var.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+#include <freebsd/netinet/sctp.h>
+#include <freebsd/netinet/sctp_uio.h>
+#include <freebsd/netinet/sctp_peeloff.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_auth.h>
+
+
+int
+sctp_can_peel_off(struct socket *head, sctp_assoc_t assoc_id)
+{
+ struct sctp_inpcb *inp;
+ struct sctp_tcb *stcb;
+ uint32_t state;
+
+ inp = (struct sctp_inpcb *)head->so_pcb;
+ if (inp == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, EFAULT);
+ return (EFAULT);
+ }
+ stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1);
+ if (stcb == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_PEELOFF, ENOENT);
+ return (ENOENT);
+ }
+ state = SCTP_GET_STATE((&stcb->asoc));
+ if ((state == SCTP_STATE_EMPTY) ||
+ (state == SCTP_STATE_INUSE) ||
+ (state == SCTP_STATE_COOKIE_WAIT) ||
+ (state == SCTP_STATE_COOKIE_ECHOED)) {
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_PEELOFF, ENOTCONN);
+ return (ENOTCONN);
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ /* We are clear to peel this one off */
+ return (0);
+}
+
+int
+sctp_do_peeloff(struct socket *head, struct socket *so, sctp_assoc_t assoc_id)
+{
+ struct sctp_inpcb *inp, *n_inp;
+ struct sctp_tcb *stcb;
+ uint32_t state;
+
+ inp = (struct sctp_inpcb *)head->so_pcb;
+ if (inp == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, EFAULT);
+ return (EFAULT);
+ }
+ stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1);
+ if (stcb == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, ENOTCONN);
+ return (ENOTCONN);
+ }
+ state = SCTP_GET_STATE((&stcb->asoc));
+ if ((state == SCTP_STATE_EMPTY) ||
+ (state == SCTP_STATE_INUSE) ||
+ (state == SCTP_STATE_COOKIE_WAIT) ||
+ (state == SCTP_STATE_COOKIE_ECHOED)) {
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, ENOTCONN);
+ return (ENOTCONN);
+ }
+ n_inp = (struct sctp_inpcb *)so->so_pcb;
+ n_inp->sctp_flags = (SCTP_PCB_FLAGS_UDPTYPE |
+ SCTP_PCB_FLAGS_CONNECTED |
+ SCTP_PCB_FLAGS_IN_TCPPOOL | /* Turn on Blocking IO */
+ (SCTP_PCB_COPY_FLAGS & inp->sctp_flags));
+ n_inp->sctp_socket = so;
+ n_inp->sctp_features = inp->sctp_features;
+ n_inp->sctp_mobility_features = inp->sctp_mobility_features;
+ n_inp->sctp_frag_point = inp->sctp_frag_point;
+ n_inp->sctp_cmt_on_off = inp->sctp_cmt_on_off;
+ n_inp->partial_delivery_point = inp->partial_delivery_point;
+ n_inp->sctp_context = inp->sctp_context;
+ n_inp->inp_starting_point_for_iterator = NULL;
+ /* copy in the authentication parameters from the original endpoint */
+ if (n_inp->sctp_ep.local_hmacs)
+ sctp_free_hmaclist(n_inp->sctp_ep.local_hmacs);
+ n_inp->sctp_ep.local_hmacs =
+ sctp_copy_hmaclist(inp->sctp_ep.local_hmacs);
+ if (n_inp->sctp_ep.local_auth_chunks)
+ sctp_free_chunklist(n_inp->sctp_ep.local_auth_chunks);
+ n_inp->sctp_ep.local_auth_chunks =
+ sctp_copy_chunklist(inp->sctp_ep.local_auth_chunks);
+ (void)sctp_copy_skeylist(&inp->sctp_ep.shared_keys,
+ &n_inp->sctp_ep.shared_keys);
+ /*
+ * Now we must move it from one hash table to another and get the
+ * stcb in the right place.
+ */
+ sctp_move_pcb_and_assoc(inp, n_inp, stcb);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+
+ sctp_pull_off_control_to_new_inp(inp, n_inp, stcb, SBL_WAIT);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+
+ return (0);
+}
+
+
+struct socket *
+sctp_get_peeloff(struct socket *head, sctp_assoc_t assoc_id, int *error)
+{
+ struct socket *newso;
+ struct sctp_inpcb *inp, *n_inp;
+ struct sctp_tcb *stcb;
+
+ SCTPDBG(SCTP_DEBUG_PEEL1, "SCTP peel-off called\n");
+ inp = (struct sctp_inpcb *)head->so_pcb;
+ if (inp == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, EFAULT);
+ *error = EFAULT;
+ return (NULL);
+ }
+ stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1);
+ if (stcb == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, ENOTCONN);
+ *error = ENOTCONN;
+ return (NULL);
+ }
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+	newso = sonewconn(head, SS_ISCONNECTED);
+ if (newso == NULL) {
+ SCTPDBG(SCTP_DEBUG_PEEL1, "sctp_peeloff:sonewconn failed\n");
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_PEELOFF, ENOMEM);
+ *error = ENOMEM;
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ return (NULL);
+
+ }
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ n_inp = (struct sctp_inpcb *)newso->so_pcb;
+ SOCK_LOCK(head);
+ n_inp->sctp_flags = (SCTP_PCB_FLAGS_UDPTYPE |
+ SCTP_PCB_FLAGS_CONNECTED |
+ SCTP_PCB_FLAGS_IN_TCPPOOL | /* Turn on Blocking IO */
+ (SCTP_PCB_COPY_FLAGS & inp->sctp_flags));
+ n_inp->sctp_features = inp->sctp_features;
+ n_inp->sctp_frag_point = inp->sctp_frag_point;
+ n_inp->sctp_cmt_on_off = inp->sctp_cmt_on_off;
+ n_inp->partial_delivery_point = inp->partial_delivery_point;
+ n_inp->sctp_context = inp->sctp_context;
+ n_inp->inp_starting_point_for_iterator = NULL;
+
+ /* copy in the authentication parameters from the original endpoint */
+ if (n_inp->sctp_ep.local_hmacs)
+ sctp_free_hmaclist(n_inp->sctp_ep.local_hmacs);
+ n_inp->sctp_ep.local_hmacs =
+ sctp_copy_hmaclist(inp->sctp_ep.local_hmacs);
+ if (n_inp->sctp_ep.local_auth_chunks)
+ sctp_free_chunklist(n_inp->sctp_ep.local_auth_chunks);
+ n_inp->sctp_ep.local_auth_chunks =
+ sctp_copy_chunklist(inp->sctp_ep.local_auth_chunks);
+ (void)sctp_copy_skeylist(&inp->sctp_ep.shared_keys,
+ &n_inp->sctp_ep.shared_keys);
+
+ n_inp->sctp_socket = newso;
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) {
+ sctp_feature_off(n_inp, SCTP_PCB_FLAGS_AUTOCLOSE);
+ n_inp->sctp_ep.auto_close_time = 0;
+ sctp_timer_stop(SCTP_TIMER_TYPE_AUTOCLOSE, n_inp, stcb, NULL,
+ SCTP_FROM_SCTP_PEELOFF + SCTP_LOC_1);
+ }
+ /* Turn off any non-blocking semantic. */
+ SCTP_CLEAR_SO_NBIO(newso);
+ newso->so_state |= SS_ISCONNECTED;
+ /* We remove it right away */
+
+#ifdef SCTP_LOCK_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) {
+ sctp_log_lock(inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_SOCK);
+ }
+#endif
+ TAILQ_REMOVE(&head->so_comp, newso, so_list);
+ head->so_qlen--;
+ SOCK_UNLOCK(head);
+ /*
+ * Now we must move it from one hash table to another and get the
+ * stcb in the right place.
+ */
+ sctp_move_pcb_and_assoc(inp, n_inp, stcb);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ /*
+ * And now the final hack. We move data in the pending side i.e.
+ * head to the new socket buffer. Let the GRUBBING begin :-0
+ */
+ sctp_pull_off_control_to_new_inp(inp, n_inp, stcb, SBL_WAIT);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ return (newso);
+}
diff --git a/freebsd/sys/netinet/sctp_peeloff.h b/freebsd/sys/netinet/sctp_peeloff.h
new file mode 100644
index 00000000..57fd5fef
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_peeloff.h
@@ -0,0 +1,52 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_peeloff.h,v 1.6 2005/03/06 16:04:18 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_peeloff_h__
+#define __sctp_peeloff_h__
+
+
+
+
+#if defined(_KERNEL)
+
+int sctp_can_peel_off(struct socket *, sctp_assoc_t);
+int sctp_do_peeloff(struct socket *, struct socket *, sctp_assoc_t);
+struct socket *sctp_get_peeloff(struct socket *, sctp_assoc_t, int *);
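+
+/*-
+ * Illustrative flow (editor's sketch, not part of the original header):
+ * a caller holding the one-to-many socket 'head' first checks that the
+ * association may be peeled off and then obtains a dedicated socket for
+ * it.  Error handling is abbreviated.
+ *
+ *	int error;
+ *	struct socket *so = NULL;
+ *
+ *	error = sctp_can_peel_off(head, assoc_id);
+ *	if (error == 0)
+ *		so = sctp_get_peeloff(head, assoc_id, &error);
+ */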
+
+
+
+#endif /* _KERNEL */
+
+#endif
diff --git a/freebsd/sys/netinet/sctp_structs.h b/freebsd/sys/netinet/sctp_structs.h
new file mode 100644
index 00000000..2050c581
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_structs.h
@@ -0,0 +1,1094 @@
+/*-
+ * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_structs.h,v 1.13 2005/03/06 16:04:18 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_structs_h__
+#define __sctp_structs_h__
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp_header.h>
+#include <freebsd/netinet/sctp_auth.h>
+
+struct sctp_timer {
+ sctp_os_timer_t timer;
+
+ int type;
+ /*
+ * Depending on the timer type these will be setup and cast with the
+ * appropriate entity.
+ */
+ void *ep;
+ void *tcb;
+ void *net;
+ void *vnet;
+
+ /* for sanity checking */
+ void *self;
+ uint32_t ticks;
+ uint32_t stopped_from;
+};
+
+
+struct sctp_foo_stuff {
+ struct sctp_inpcb *inp;
+ uint32_t lineno;
+ uint32_t ticks;
+ int updown;
+};
+
+
+/*
+ * This is the information we track on each interface that we know about from
+ * the distant end.
+ */
+TAILQ_HEAD(sctpnetlisthead, sctp_nets);
+
+struct sctp_stream_reset_list {
+ TAILQ_ENTRY(sctp_stream_reset_list) next_resp;
+ uint32_t tsn;
+ int number_entries;
+ struct sctp_stream_reset_out_request req;
+};
+
+TAILQ_HEAD(sctp_resethead, sctp_stream_reset_list);
+
+/*
+ * Users of the iterator need to malloc an iterator with a call to
+ * sctp_initiate_iterator(inp_func, assoc_func, inp_func, pcb_flags, pcb_features,
+ * asoc_state, void-ptr-arg, uint32-arg, end_func, inp);
+ *
+ * Use the following two defines if you don't care what pcb flags are on the EP
+ * and/or you don't care what state the association is in.
+ *
+ * Note that if you specify an INP as the last argument then ONLY each
+ * association of that single INP will be executed upon. Note that the pcb
+ * flags STILL apply, so if the inp you specify has different pcb_flags than
+ * what you put in pcb_flags, nothing will happen. Use SCTP_PCB_ANY_FLAGS to
+ * assure the inp you specify gets treated (a usage sketch follows the
+ * callback typedefs below).
+ */
+#define SCTP_PCB_ANY_FLAGS 0x00000000
+#define SCTP_PCB_ANY_FEATURES 0x00000000
+#define SCTP_ASOC_ANY_STATE 0x00000000
+
+typedef void (*asoc_func) (struct sctp_inpcb *, struct sctp_tcb *, void *ptr,
+ uint32_t val);
+typedef int (*inp_func) (struct sctp_inpcb *, void *ptr, uint32_t val);
+typedef void (*end_func) (void *ptr, uint32_t val);
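+
+/*-
+ * Sketch of matching callbacks (editor's illustration; the names are made
+ * up).  my_asoc_func runs once per association selected by the iterator
+ * and my_end_func runs once when the walk completes; both receive the
+ * pointer and value handed to sctp_initiate_iterator().
+ *
+ *	static void
+ *	my_asoc_func(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ *	    void *ptr, uint32_t val)
+ *	{
+ *		... per-association work ...
+ *	}
+ *
+ *	static void
+ *	my_end_func(void *ptr, uint32_t val)
+ *	{
+ *		... cleanup after the iterator has finished ...
+ *	}
+ *
+ * Pass SCTP_PCB_ANY_FLAGS, SCTP_PCB_ANY_FEATURES and SCTP_ASOC_ANY_STATE
+ * when no filtering is wanted.
+ */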
+
+struct sctp_iterator {
+ TAILQ_ENTRY(sctp_iterator) sctp_nxt_itr;
+ struct vnet *vn;
+ struct sctp_timer tmr;
+ struct sctp_inpcb *inp; /* current endpoint */
+	struct sctp_tcb *stcb;	/* current assoc */
+ struct sctp_inpcb *next_inp; /* special hook to skip to */
+ asoc_func function_assoc; /* per assoc function */
+ inp_func function_inp; /* per endpoint function */
+ inp_func function_inp_end; /* end INP function */
+ end_func function_atend;/* iterator completion function */
+ void *pointer; /* pointer for apply func to use */
+ uint32_t val; /* value for apply func to use */
+ uint32_t pcb_flags; /* endpoint flags being checked */
+ uint32_t pcb_features; /* endpoint features being checked */
+ uint32_t asoc_state; /* assoc state being checked */
+ uint32_t iterator_flags;
+ uint8_t no_chunk_output;
+ uint8_t done_current_ep;
+};
+
+/* iterator_flags values */
+#define SCTP_ITERATOR_DO_ALL_INP 0x00000001
+#define SCTP_ITERATOR_DO_SINGLE_INP 0x00000002
+
+
+TAILQ_HEAD(sctpiterators, sctp_iterator);
+
+struct sctp_copy_all {
+ struct sctp_inpcb *inp; /* ep */
+ struct mbuf *m;
+ struct sctp_sndrcvinfo sndrcv;
+ int sndlen;
+ int cnt_sent;
+ int cnt_failed;
+};
+
+struct sctp_asconf_iterator {
+ struct sctpladdr list_of_work;
+ int cnt;
+};
+
+struct iterator_control {
+ struct mtx ipi_iterator_wq_mtx;
+ struct mtx it_mtx;
+ SCTP_PROCESS_STRUCT thread_proc;
+ struct sctpiterators iteratorhead;
+ struct sctp_iterator *cur_it;
+ uint32_t iterator_running;
+ uint32_t iterator_flags;
+};
+
+#define SCTP_ITERATOR_MUST_EXIT 0x00000001
+#define SCTP_ITERATOR_STOP_CUR_IT 0x00000002
+#define SCTP_ITERATOR_STOP_CUR_INP 0x00000004
+
+struct sctp_net_route {
+ sctp_rtentry_t *ro_rt;
+ void *ro_lle;
+ union sctp_sockstore _l_addr; /* remote peer addr */
+ struct sctp_ifa *_s_addr; /* our selected src addr */
+};
+
+struct htcp {
+ uint16_t alpha; /* Fixed point arith, << 7 */
+ uint8_t beta; /* Fixed point arith, << 7 */
+ uint8_t modeswitch; /* Delay modeswitch until we had at least one
+ * congestion event */
+ uint32_t last_cong; /* Time since last congestion event end */
+ uint32_t undo_last_cong;
+ uint16_t bytes_acked;
+ uint32_t bytecount;
+ uint32_t minRTT;
+ uint32_t maxRTT;
+
+ uint32_t undo_maxRTT;
+ uint32_t undo_old_maxB;
+
+ /* Bandwidth estimation */
+ uint32_t minB;
+ uint32_t maxB;
+ uint32_t old_maxB;
+ uint32_t Bi;
+ uint32_t lasttime;
+};
+
+
+struct sctp_nets {
+ TAILQ_ENTRY(sctp_nets) sctp_next; /* next link */
+
+ /*
+ * Things on the top half may be able to be split into a common
+ * structure shared by all.
+ */
+ struct sctp_timer pmtu_timer;
+
+ /*
+ * The following two in combination equate to a route entry for v6
+ * or v4.
+ */
+ struct sctp_net_route ro;
+
+ /* mtu discovered so far */
+ uint32_t mtu;
+ uint32_t ssthresh; /* not sure about this one for split */
+
+ /* smoothed average things for RTT and RTO itself */
+ int lastsa;
+ int lastsv;
+ int rtt; /* last measured rtt value in ms */
+ unsigned int RTO;
+
+ /* This is used for SHUTDOWN/SHUTDOWN-ACK/SEND or INIT timers */
+ struct sctp_timer rxt_timer;
+ struct sctp_timer fr_timer; /* for early fr */
+
+ /* last time in seconds I sent to it */
+ struct timeval last_sent_time;
+ int ref_count;
+
+ /* Congestion stats per destination */
+ /*
+ * flight size variables and such, sorry Vern, I could not avoid
+ * this if I wanted performance :>
+ */
+ uint32_t flight_size;
+ uint32_t cwnd; /* actual cwnd */
+ uint32_t prev_cwnd; /* cwnd before any processing */
+ uint32_t partial_bytes_acked; /* in CA tracks when to incr a MTU */
+ uint32_t prev_rtt;
+ /* tracking variables to avoid the aloc/free in sack processing */
+ unsigned int net_ack;
+ unsigned int net_ack2;
+
+ /*
+ * JRS - 5/8/07 - Variable to track last time a destination was
+ * active for CMT PF
+ */
+ uint32_t last_active;
+
+ /*
+ * CMT variables (iyengar@cis.udel.edu)
+ */
+ uint32_t this_sack_highest_newack; /* tracks highest TSN newly
+ * acked for a given dest in
+ * the current SACK. Used in
+ * SFR and HTNA algos */
+ uint32_t pseudo_cumack; /* CMT CUC algorithm. Maintains next expected
+ * pseudo-cumack for this destination */
+ uint32_t rtx_pseudo_cumack; /* CMT CUC algorithm. Maintains next
+ * expected pseudo-cumack for this
+ * destination */
+
+ /* CMT fast recovery variables */
+ uint32_t fast_recovery_tsn;
+ uint32_t heartbeat_random1;
+ uint32_t heartbeat_random2;
+ uint32_t tos_flowlabel;
+
+ struct timeval start_time; /* time when this net was created */
+
+	uint32_t marked_retrans;/* number of DATA chunks marked for timer
+ * based retransmissions */
+ uint32_t marked_fastretrans;
+
+ /* if this guy is ok or not ... status */
+ uint16_t dest_state;
+ /* number of transmit failures to down this guy */
+ uint16_t failure_threshold;
+ /* error stats on destination */
+ uint16_t error_count;
+ /* UDP port number in case of UDP tunneling */
+ uint16_t port;
+
+ uint8_t fast_retran_loss_recovery;
+ uint8_t will_exit_fast_recovery;
+ /* Flags that probably can be combined into dest_state */
+ uint8_t fast_retran_ip; /* fast retransmit in progress */
+ uint8_t hb_responded;
+ uint8_t saw_newack; /* CMT's SFR algorithm flag */
+ uint8_t src_addr_selected; /* if we split we move */
+ uint8_t indx_of_eligible_next_to_use;
+	uint8_t addr_is_local;	/* it's a local address (if known) could move
+ * in split */
+
+ /*
+ * CMT variables (iyengar@cis.udel.edu)
+ */
+ uint8_t find_pseudo_cumack; /* CMT CUC algorithm. Flag used to
+ * find a new pseudocumack. This flag
+ * is set after a new pseudo-cumack
+ * has been received and indicates
+ * that the sender should find the
+ * next pseudo-cumack expected for
+ * this destination */
+ uint8_t find_rtx_pseudo_cumack; /* CMT CUCv2 algorithm. Flag used to
+ * find a new rtx-pseudocumack. This
+ * flag is set after a new
+ * rtx-pseudo-cumack has been received
+ * and indicates that the sender
+ * should find the next
+ * rtx-pseudo-cumack expected for this
+ * destination */
+ uint8_t new_pseudo_cumack; /* CMT CUC algorithm. Flag used to
+ * indicate if a new pseudo-cumack or
+ * rtx-pseudo-cumack has been received */
+ uint8_t window_probe; /* Doing a window probe? */
+ uint8_t RTO_measured; /* Have we done the first measure */
+ uint8_t last_hs_used; /* index into the last HS table entry we used */
+ /* JRS - struct used in HTCP algorithm */
+ struct htcp htcp_ca;
+};
+
+
+struct sctp_data_chunkrec {
+ uint32_t TSN_seq; /* the TSN of this transmit */
+ uint16_t stream_seq; /* the stream sequence number of this transmit */
+ uint16_t stream_number; /* the stream number of this guy */
+ uint32_t payloadtype;
+ uint32_t context; /* from send */
+
+ /* ECN Nonce: Nonce Value for this chunk */
+ uint8_t ect_nonce;
+ uint8_t fwd_tsn_cnt;
+ /*
+ * part of the Highest sacked algorithm to be able to stroke counts
+ * on ones that are FR'd.
+ */
+ uint32_t fast_retran_tsn; /* sending_seq at the time of FR */
+ struct timeval timetodrop; /* time we drop it from queue */
+ uint8_t doing_fast_retransmit;
+	uint8_t rcv_flags;	/* flags pulled from data chunk on inbound; for
+				 * outbound it holds sending flags for PR-SCTP. */
+ uint8_t state_flags;
+ uint8_t chunk_was_revoked;
+};
+
+TAILQ_HEAD(sctpchunk_listhead, sctp_tmit_chunk);
+
+/* The lower byte is used to enumerate PR_SCTP policies */
+#define CHUNK_FLAGS_PR_SCTP_TTL SCTP_PR_SCTP_TTL
+#define CHUNK_FLAGS_PR_SCTP_BUF SCTP_PR_SCTP_BUF
+#define CHUNK_FLAGS_PR_SCTP_RTX SCTP_PR_SCTP_RTX
+
+/* The upper byte is used as a bit mask */
+#define CHUNK_FLAGS_FRAGMENT_OK 0x0100
+
+struct chk_id {
+ uint16_t id;
+ uint16_t can_take_data;
+};
+
+
+struct sctp_tmit_chunk {
+ union {
+ struct sctp_data_chunkrec data;
+ struct chk_id chunk_id;
+ } rec;
+ struct sctp_association *asoc; /* bp to asoc this belongs to */
+ struct timeval sent_rcv_time; /* filled in if RTT being calculated */
+ struct mbuf *data; /* pointer to mbuf chain of data */
+ struct mbuf *last_mbuf; /* pointer to last mbuf in chain */
+ struct sctp_nets *whoTo;
+ TAILQ_ENTRY(sctp_tmit_chunk) sctp_next; /* next link */
+ int32_t sent; /* the send status */
+ uint16_t snd_count; /* number of times I sent */
+ uint16_t flags; /* flags, such as FRAGMENT_OK */
+ uint16_t send_size;
+ uint16_t book_size;
+ uint16_t mbcnt;
+ uint16_t auth_keyid;
+ uint8_t holds_key_ref; /* flag if auth keyid refcount is held */
+ uint8_t pad_inplace;
+ uint8_t do_rtt;
+ uint8_t book_size_scale;
+ uint8_t no_fr_allowed;
+ uint8_t pr_sctp_on;
+ uint8_t copy_by_ref;
+ uint8_t window_probe;
+};
+
+/*
+ * The first part of this structure MUST be the entire sinfo structure. Maybe
+ * I should have made it a sub structure... we can circle back later and do
+ * that if we want.
+ */
+struct sctp_queued_to_read {	/* sinfo structure plus more */
+ uint16_t sinfo_stream; /* off the wire */
+ uint16_t sinfo_ssn; /* off the wire */
+ uint16_t sinfo_flags; /* SCTP_UNORDERED from wire use SCTP_EOF for
+ * EOR */
+ uint32_t sinfo_ppid; /* off the wire */
+ uint32_t sinfo_context; /* pick this up from assoc def context? */
+ uint32_t sinfo_timetolive; /* not used by kernel */
+ uint32_t sinfo_tsn; /* Use this in reassembly as first TSN */
+ uint32_t sinfo_cumtsn; /* Use this in reassembly as last TSN */
+ sctp_assoc_t sinfo_assoc_id; /* our assoc id */
+ /* Non sinfo stuff */
+ uint32_t length; /* length of data */
+ uint32_t held_length; /* length held in sb */
+ struct sctp_nets *whoFrom; /* where it came from */
+ struct mbuf *data; /* front of the mbuf chain of data with
+ * PKT_HDR */
+ struct mbuf *tail_mbuf; /* used for multi-part data */
+ struct mbuf *aux_data; /* used to hold/cache control if o/s does not
+ * take it from us */
+ struct sctp_tcb *stcb; /* assoc, used for window update */
+ TAILQ_ENTRY(sctp_queued_to_read) next;
+ uint16_t port_from;
+ uint16_t spec_flags; /* Flags to hold the notification field */
+ uint8_t do_not_ref_stcb;
+ uint8_t end_added;
+ uint8_t pdapi_aborted;
+ uint8_t some_taken;
+};
+
+/* This data structure will be on the outbound
+ * stream queues. Data will be pulled off from
+ * the front of the mbuf data and chunk-ified
+ * by the output routines. We will custom
+ * fit every chunk we pull to the send/sent
+ * queue to make up the next full packet
+ * if we can. An entry cannot be removed
+ * from the stream_out queue until
+ * the msg_is_complete flag is set. This
+ * means at times data/tail_mbuf MIGHT
+ * be NULL.. If that occurs it happens
+ * for one of two reasons. Either the user
+ * is blocked on a send() call and has not
+ * awoken to copy more data down... OR
+ * the user is in the explicit MSG_EOR mode
+ * and wrote some data, but has not completed
+ * sending. (A dequeue sketch follows the
+ * structure below.)
+ */
+struct sctp_stream_queue_pending {
+ struct mbuf *data;
+ struct mbuf *tail_mbuf;
+ struct timeval ts;
+ struct sctp_nets *net;
+ TAILQ_ENTRY(sctp_stream_queue_pending) next;
+ uint32_t length;
+ uint32_t timetolive;
+ uint32_t ppid;
+ uint32_t context;
+ uint16_t sinfo_flags;
+ uint16_t stream;
+ uint16_t strseq;
+ uint16_t act_flags;
+ uint16_t auth_keyid;
+ uint8_t holds_key_ref;
+ uint8_t msg_is_complete;
+ uint8_t some_taken;
+ uint8_t pr_sctp_on;
+ uint8_t sender_all_done;
+ uint8_t put_last_out;
+ uint8_t discard_rest;
+};
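+
+/*-
+ * Illustrative check (editor's sketch), assuming sctp_streamhead is the
+ * TAILQ of these entries hanging off struct sctp_stream_out below: an
+ * output routine must leave the head entry queued while msg_is_complete
+ * is still 0, since data/tail_mbuf may legitimately be NULL at that point.
+ *
+ *	struct sctp_stream_queue_pending *sp;
+ *
+ *	sp = TAILQ_FIRST(&strq->outqueue);
+ *	if (sp != NULL && sp->msg_is_complete) {
+ *		... safe to chunk-ify and move toward the send queue ...
+ *	}
+ */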
+
+/*
+ * this struct contains info that is used to track inbound stream data and
+ * help with ordering.
+ */
+TAILQ_HEAD(sctpwheelunrel_listhead, sctp_stream_in);
+struct sctp_stream_in {
+ struct sctp_readhead inqueue;
+ uint16_t stream_no;
+ uint16_t last_sequence_delivered; /* used for re-order */
+ uint8_t delivery_started;
+};
+
+/* This struct is used to track the traffic on outbound streams */
+TAILQ_HEAD(sctpwheel_listhead, sctp_stream_out);
+struct sctp_stream_out {
+ struct sctp_streamhead outqueue;
+ TAILQ_ENTRY(sctp_stream_out) next_spoke; /* next link in wheel */
+ uint16_t stream_no;
+ uint16_t next_sequence_sent; /* next one I expect to send out */
+ uint8_t last_msg_incomplete;
+};
+
+/* used to keep track of the addresses yet to try to add/delete */
+TAILQ_HEAD(sctp_asconf_addrhead, sctp_asconf_addr);
+struct sctp_asconf_addr {
+ TAILQ_ENTRY(sctp_asconf_addr) next;
+ struct sctp_asconf_addr_param ap;
+ struct sctp_ifa *ifa; /* save the ifa for add/del ip */
+ uint8_t sent; /* has this been sent yet? */
+ uint8_t special_del; /* not to be used in lookup */
+};
+
+struct sctp_scoping {
+ uint8_t ipv4_addr_legal;
+ uint8_t ipv6_addr_legal;
+ uint8_t loopback_scope;
+ uint8_t ipv4_local_scope;
+ uint8_t local_scope;
+ uint8_t site_scope;
+};
+
+#define SCTP_TSN_LOG_SIZE 40
+
+struct sctp_tsn_log {
+ void *stcb;
+ uint32_t tsn;
+ uint16_t strm;
+ uint16_t seq;
+ uint16_t sz;
+ uint16_t flgs;
+ uint16_t in_pos;
+ uint16_t in_out;
+};
+
+#define SCTP_FS_SPEC_LOG_SIZE 200
+struct sctp_fs_spec_log {
+ uint32_t sent;
+ uint32_t total_flight;
+ uint32_t tsn;
+ uint16_t book;
+ uint8_t incr;
+ uint8_t decr;
+};
+
+/* This struct is here to cut out the compatibility
+ * pad that bulks up both the inp and stcb. The non
+ * pad portion MUST stay in complete sync with
+ * sctp_sndrcvinfo... i.e. if sinfo_xxxx is added
+ * this must be done here too.
+ */
+struct sctp_nonpad_sndrcvinfo {
+ uint16_t sinfo_stream;
+ uint16_t sinfo_ssn;
+ uint16_t sinfo_flags;
+ uint32_t sinfo_ppid;
+ uint32_t sinfo_context;
+ uint32_t sinfo_timetolive;
+ uint32_t sinfo_tsn;
+ uint32_t sinfo_cumtsn;
+ sctp_assoc_t sinfo_assoc_id;
+};
+
+/*
+ * JRS - Structure to hold function pointers to the functions responsible
+ * for congestion control.
+ */
+
+struct sctp_cc_functions {
+ void (*sctp_set_initial_cc_param) (struct sctp_tcb *stcb, struct sctp_nets *net);
+ void (*sctp_cwnd_update_after_sack) (struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ int accum_moved, int reneged_all, int will_exit);
+ void (*sctp_cwnd_update_after_fr) (struct sctp_tcb *stcb,
+ struct sctp_association *asoc);
+ void (*sctp_cwnd_update_after_timeout) (struct sctp_tcb *stcb,
+ struct sctp_nets *net);
+ void (*sctp_cwnd_update_after_ecn_echo) (struct sctp_tcb *stcb,
+ struct sctp_nets *net);
+ void (*sctp_cwnd_update_after_packet_dropped) (struct sctp_tcb *stcb,
+ struct sctp_nets *net, struct sctp_pktdrop_chunk *cp,
+ uint32_t * bottle_bw, uint32_t * on_queue);
+ void (*sctp_cwnd_update_after_output) (struct sctp_tcb *stcb,
+ struct sctp_nets *net, int burst_limit);
+ void (*sctp_cwnd_update_after_fr_timer) (struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb, struct sctp_nets *net);
+};
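+
+/*-
+ * Illustrative only (hypothetical module, not part of the original header):
+ * an alternate congestion control implementation provides its behaviour by
+ * filling such a table with its own handlers, which the stack then invokes
+ * through asoc->cc_functions.
+ *
+ *	static struct sctp_cc_functions my_cc = {
+ *		.sctp_set_initial_cc_param = my_set_initial_cc_param,
+ *		.sctp_cwnd_update_after_sack = my_cwnd_update_after_sack,
+ *		.sctp_cwnd_update_after_fr = my_cwnd_update_after_fr,
+ *		.sctp_cwnd_update_after_timeout = my_cwnd_update_after_timeout,
+ *		...
+ *	};
+ */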
+
+/* used to save ASCONF chunks for retransmission */
+TAILQ_HEAD(sctp_asconf_head, sctp_asconf);
+struct sctp_asconf {
+ TAILQ_ENTRY(sctp_asconf) next;
+ uint32_t serial_number;
+ uint16_t snd_count;
+ struct mbuf *data;
+ uint16_t len;
+};
+
+/* used to save ASCONF-ACK chunks for retransmission */
+TAILQ_HEAD(sctp_asconf_ackhead, sctp_asconf_ack);
+struct sctp_asconf_ack {
+ TAILQ_ENTRY(sctp_asconf_ack) next;
+ uint32_t serial_number;
+ struct sctp_nets *last_sent_to;
+ struct mbuf *data;
+ uint16_t len;
+};
+
+/*
+ * Here we have information about each individual association that we track.
+ * In production we would probably be more dynamic. But for ease of
+ * implementation we will have a fixed array that we hunt for in a linear
+ * fashion.
+ */
+struct sctp_association {
+ /* association state */
+ int state;
+
+ /* queue of pending addrs to add/delete */
+ struct sctp_asconf_addrhead asconf_queue;
+
+ struct timeval time_entered; /* time we entered state */
+ struct timeval time_last_rcvd;
+ struct timeval time_last_sent;
+ struct timeval time_last_sat_advance;
+ struct sctp_nonpad_sndrcvinfo def_send;
+
+ /* timers and such */
+ struct sctp_timer hb_timer; /* hb timer */
+ struct sctp_timer dack_timer; /* Delayed ack timer */
+ struct sctp_timer asconf_timer; /* asconf */
+ struct sctp_timer strreset_timer; /* stream reset */
+ struct sctp_timer shut_guard_timer; /* shutdown guard */
+ struct sctp_timer autoclose_timer; /* automatic close timer */
+ struct sctp_timer delayed_event_timer; /* timer for delayed events */
+ struct sctp_timer delete_prim_timer; /* deleting primary dst */
+
+ /* list of restricted local addresses */
+ struct sctpladdr sctp_restricted_addrs;
+
+ /* last local address pending deletion (waiting for an address add) */
+ struct sctp_ifa *asconf_addr_del_pending;
+ /* Deleted primary destination (used to stop timer) */
+ struct sctp_nets *deleted_primary;
+
+ struct sctpnetlisthead nets; /* remote address list */
+
+ /* Free chunk list */
+ struct sctpchunk_listhead free_chunks;
+
+ /* Control chunk queue */
+ struct sctpchunk_listhead control_send_queue;
+
+ /* ASCONF chunk queue */
+ struct sctpchunk_listhead asconf_send_queue;
+
+ /*
+ * Once a TSN hits the wire it is moved to the sent_queue. We
+ * maintain two counts here (don't know if any but retran_cnt is
+ * needed). The idea is that the sent_queue_retran_cnt reflects how
+	 * many chunks have been marked for retransmission by either T3-rxt
+ * or FR.
+ */
+ struct sctpchunk_listhead sent_queue;
+ struct sctpchunk_listhead send_queue;
+
+ /* re-assembly queue for fragmented chunks on the inbound path */
+ struct sctpchunk_listhead reasmqueue;
+
+ /*
+ * this queue is used when we reach a condition that we can NOT put
+ * data into the socket buffer. We track the size of this queue and
+ * set our rwnd to the space in the socket minus also the
+ * size_on_delivery_queue.
+ */
+ struct sctpwheel_listhead out_wheel;
+
+ /*
+ * This pointer will be set to NULL most of the time. But when we
+ * have a fragmented message, where we could not get out all of the
+ * message at the last send then this will point to the stream to go
+ * get data from.
+ */
+ struct sctp_stream_out *locked_on_sending;
+
+ /* If an iterator is looking at me, this is it */
+ struct sctp_iterator *stcb_starting_point_for_iterator;
+
+ /* ASCONF save the last ASCONF-ACK so we can resend it if necessary */
+ struct sctp_asconf_ackhead asconf_ack_sent;
+
+ /*
+ * pointer to last stream reset queued to control queue by us with
+ * requests.
+ */
+ struct sctp_tmit_chunk *str_reset;
+ /*
+ * if Source Address Selection happening, this will rotate through
+ * the link list.
+ */
+ struct sctp_laddr *last_used_address;
+
+ /* stream arrays */
+ struct sctp_stream_in *strmin;
+ struct sctp_stream_out *strmout;
+ uint8_t *mapping_array;
+ /* primary destination to use */
+ struct sctp_nets *primary_destination;
+ /* For CMT */
+ struct sctp_nets *last_net_cmt_send_started;
+ /* last place I got a data chunk from */
+ struct sctp_nets *last_data_chunk_from;
+ /* last place I got a control from */
+ struct sctp_nets *last_control_chunk_from;
+
+ /* circular looking for output selection */
+ struct sctp_stream_out *last_out_stream;
+
+ /*
+ * wait to the point the cum-ack passes req->send_reset_at_tsn for
+ * any req on the list.
+ */
+ struct sctp_resethead resetHead;
+
+ /* queue of chunks waiting to be sent into the local stack */
+ struct sctp_readhead pending_reply_queue;
+
+ /* JRS - the congestion control functions are in this struct */
+ struct sctp_cc_functions cc_functions;
+ /*
+ * JRS - value to store the currently loaded congestion control
+ * module
+ */
+ uint32_t congestion_control_module;
+
+ uint32_t vrf_id;
+
+ uint32_t cookie_preserve_req;
+ /* ASCONF next seq I am sending out, inits at init-tsn */
+ uint32_t asconf_seq_out;
+ uint32_t asconf_seq_out_acked;
+ /* ASCONF last received ASCONF from peer, starts at peer's TSN-1 */
+ uint32_t asconf_seq_in;
+
+ /* next seq I am sending in str reset messages */
+ uint32_t str_reset_seq_out;
+ /* next seq I am expecting in str reset messages */
+ uint32_t str_reset_seq_in;
+
+ /* various verification tag information */
+	uint32_t my_vtag;	/* The tag to be used. If the assoc is re-initiated
+				 * by the remote end and I have unlocked, this
+				 * will be regenerated to a new random value. */
+ uint32_t peer_vtag; /* The peers last tag */
+
+ uint32_t my_vtag_nonce;
+ uint32_t peer_vtag_nonce;
+
+ uint32_t assoc_id;
+
+ /* This is the SCTP fragmentation threshold */
+ uint32_t smallest_mtu;
+
+ /*
+ * Special hook for Fast retransmit, allows us to track the highest
+ * TSN that is NEW in this SACK if gap ack blocks are present.
+ */
+ uint32_t this_sack_highest_gap;
+
+ /*
+ * The highest consecutive TSN that has been acked by peer on my
+ * sends
+ */
+ uint32_t last_acked_seq;
+
+ /* The next TSN that I will use in sending. */
+ uint32_t sending_seq;
+
+ /* Original seq number I used ??questionable to keep?? */
+ uint32_t init_seq_number;
+
+
+ /* The Advanced Peer Ack Point, as required by the PR-SCTP */
+ /* (A1 in Section 4.2) */
+ uint32_t advanced_peer_ack_point;
+
+ /*
+	 * The highest consecutive TSN at the bottom of the mapping array
+ * (for his sends).
+ */
+ uint32_t cumulative_tsn;
+ /*
+ * Used to track the mapping array and its offset bits. This MAY be
+	 * lower than cumulative_tsn.
+ */
+ uint32_t mapping_array_base_tsn;
+ /*
+ * used to track highest TSN we have received and is listed in the
+ * mapping array.
+ */
+ uint32_t highest_tsn_inside_map;
+
+ /* EY - new NR variables used for nr_sack based on mapping_array */
+ uint8_t *nr_mapping_array;
+ uint32_t highest_tsn_inside_nr_map;
+
+ uint32_t last_echo_tsn;
+ uint32_t last_cwr_tsn;
+ uint32_t fast_recovery_tsn;
+ uint32_t sat_t3_recovery_tsn;
+ uint32_t tsn_last_delivered;
+ /*
+	 * For the pd-api we should re-write this a bit more efficiently. We
+ * could have multiple sctp_queued_to_read's that we are building at
+ * once. Now we only do this when we get ready to deliver to the
+ * socket buffer. Note that we depend on the fact that the struct is
+ * "stuck" on the read queue until we finish all the pd-api.
+ */
+ struct sctp_queued_to_read *control_pdapi;
+
+ uint32_t tsn_of_pdapi_last_delivered;
+ uint32_t pdapi_ppid;
+ uint32_t context;
+ uint32_t last_reset_action[SCTP_MAX_RESET_PARAMS];
+ uint32_t last_sending_seq[SCTP_MAX_RESET_PARAMS];
+ uint32_t last_base_tsnsent[SCTP_MAX_RESET_PARAMS];
+#ifdef SCTP_ASOCLOG_OF_TSNS
+ /*
+ * special log - This adds considerable size to the asoc, but
+ * provides a log that you can use to detect problems via kgdb.
+ */
+ struct sctp_tsn_log in_tsnlog[SCTP_TSN_LOG_SIZE];
+ struct sctp_tsn_log out_tsnlog[SCTP_TSN_LOG_SIZE];
+ uint32_t cumack_log[SCTP_TSN_LOG_SIZE];
+ uint32_t cumack_logsnt[SCTP_TSN_LOG_SIZE];
+ uint16_t tsn_in_at;
+ uint16_t tsn_out_at;
+ uint16_t tsn_in_wrapped;
+ uint16_t tsn_out_wrapped;
+ uint16_t cumack_log_at;
+ uint16_t cumack_log_atsnt;
+#endif /* SCTP_ASOCLOG_OF_TSNS */
+#ifdef SCTP_FS_SPEC_LOG
+ struct sctp_fs_spec_log fslog[SCTP_FS_SPEC_LOG_SIZE];
+ uint16_t fs_index;
+#endif
+
+ /*
+ * window state information and smallest MTU that I use to bound
+ * segmentation
+ */
+ uint32_t peers_rwnd;
+ uint32_t my_rwnd;
+ uint32_t my_last_reported_rwnd;
+ uint32_t sctp_frag_point;
+
+ uint32_t total_output_queue_size;
+
+ uint32_t sb_cc; /* shadow of sb_cc */
+ uint32_t sb_send_resv; /* amount reserved on a send */
+ uint32_t my_rwnd_control_len; /* shadow of sb_mbcnt used for rwnd
+ * control */
+ /* 32 bit nonce stuff */
+ uint32_t nonce_resync_tsn;
+ uint32_t nonce_wait_tsn;
+ uint32_t default_flowlabel;
+ uint32_t pr_sctp_cnt;
+ int ctrl_queue_cnt; /* could be removed REM */
+ /*
+ * All outbound datagrams queue into this list from the individual
+ * stream queue. Here they get assigned a TSN and then await
+ * sending. The stream seq comes when it is first put in the
+ * individual str queue
+ */
+ unsigned int stream_queue_cnt;
+ unsigned int send_queue_cnt;
+ unsigned int sent_queue_cnt;
+ unsigned int sent_queue_cnt_removeable;
+ /*
+	 * Number on sent queue that are marked for retran; until this value
+ * is 0 we only send one packet of retran'ed data.
+ */
+ unsigned int sent_queue_retran_cnt;
+
+ unsigned int size_on_reasm_queue;
+ unsigned int cnt_on_reasm_queue;
+ unsigned int fwd_tsn_cnt;
+ /* amount of data (bytes) currently in flight (on all destinations) */
+ unsigned int total_flight;
+ /* Total book size in flight */
+ unsigned int total_flight_count; /* count of chunks used with
+ * book total */
+	/* count of destination nets and list of destination nets */
+ unsigned int numnets;
+
+ /* Total error count on this association */
+ unsigned int overall_error_count;
+
+ unsigned int cnt_msg_on_sb;
+
+ /* All stream count of chunks for delivery */
+ unsigned int size_on_all_streams;
+ unsigned int cnt_on_all_streams;
+
+ /* Heart Beat delay in ticks */
+ unsigned int heart_beat_delay;
+
+ /* autoclose */
+ unsigned int sctp_autoclose_ticks;
+
+ /* how many preopen streams we have */
+ unsigned int pre_open_streams;
+
+ /* How many streams I support coming into me */
+ unsigned int max_inbound_streams;
+
+ /* the cookie life I award for any cookie, in seconds */
+ unsigned int cookie_life;
+ /* time to delay acks for */
+ unsigned int delayed_ack;
+ unsigned int old_delayed_ack;
+ unsigned int sack_freq;
+ unsigned int data_pkts_seen;
+
+ unsigned int numduptsns;
+ int dup_tsns[SCTP_MAX_DUP_TSNS];
+ unsigned int initial_init_rto_max; /* initial RTO for INIT's */
+ unsigned int initial_rto; /* initial send RTO */
+ unsigned int minrto; /* per assoc RTO-MIN */
+ unsigned int maxrto; /* per assoc RTO-MAX */
+
+ /* authentication fields */
+ sctp_auth_chklist_t *local_auth_chunks;
+ sctp_auth_chklist_t *peer_auth_chunks;
+ sctp_hmaclist_t *local_hmacs; /* local HMACs supported */
+ sctp_hmaclist_t *peer_hmacs; /* peer HMACs supported */
+ struct sctp_keyhead shared_keys; /* assoc's shared keys */
+ sctp_authinfo_t authinfo; /* randoms, cached keys */
+ /*
+	 * refcnt to block freeing when a sender or receiver is off copying
+ * user data in.
+ */
+ uint32_t refcnt;
+ uint32_t chunks_on_out_queue; /* total chunks floating around,
+ * locked by send socket buffer */
+ uint32_t peers_adaptation;
+ uint16_t peer_hmac_id; /* peer HMAC id to send */
+
+ /*
+ * Being that we have no bag to collect stale cookies, and that we
+ * really would not want to anyway.. we will count them in this
+ * counter. We of course feed them to the pigeons right away (I have
+ * always thought of pigeons as flying rats).
+ */
+ uint16_t stale_cookie_count;
+
+ /*
+ * For the partial delivery API, if up, invoked this is what last
+ * TSN I delivered
+ */
+ uint16_t str_of_pdapi;
+ uint16_t ssn_of_pdapi;
+
+ /* counts of actual built streams. Allocation may be more however */
+ /* could re-arrange to optimize space here. */
+ uint16_t streamincnt;
+ uint16_t streamoutcnt;
+ uint16_t strm_realoutsize;
+ /* my maximum number of retrans of INIT and SEND */
+	/* copied from SCTP but should be individually settable */
+ uint16_t max_init_times;
+ uint16_t max_send_times;
+
+ uint16_t def_net_failure;
+
+ /*
+ * lock flag: 0 is ok to send, 1+ (duals as a retran count) is
+ * awaiting ACK
+ */
+ uint16_t mapping_array_size;
+
+ uint16_t last_strm_seq_delivered;
+ uint16_t last_strm_no_delivered;
+
+ uint16_t last_revoke_count;
+ int16_t num_send_timers_up;
+
+ uint16_t stream_locked_on;
+ uint16_t ecn_echo_cnt_onq;
+
+ uint16_t free_chunk_cnt;
+
+ uint8_t stream_locked;
+ uint8_t authenticated; /* packet authenticated ok */
+ /*
+	 * This flag indicates that a SACK needs to be sent. Initially this
+	 * is 1 to send the first SACK immediately.
+ */
+ uint8_t send_sack;
+
+ /* max burst after fast retransmit completes */
+ uint8_t max_burst;
+
+ uint8_t sat_network; /* RTT is in range of sat net or greater */
+ uint8_t sat_network_lockout; /* lockout code */
+ uint8_t burst_limit_applied; /* Burst limit in effect at last send? */
+ /* flag goes on when we are doing a partial delivery api */
+ uint8_t hb_random_values[4];
+ uint8_t fragmented_delivery_inprogress;
+ uint8_t fragment_flags;
+ uint8_t last_flags_delivered;
+ uint8_t hb_ect_randombit;
+ uint8_t hb_random_idx;
+ uint8_t hb_is_disabled; /* is the hb disabled? */
+ uint8_t default_tos;
+ uint8_t asconf_del_pending; /* asconf delete last addr pending */
+
+ /* ECN Nonce stuff */
+ uint8_t receiver_nonce_sum; /* nonce I sum and put in my sack */
+ uint8_t ecn_nonce_allowed; /* Tells us if ECN nonce is on */
+ uint8_t nonce_sum_check;/* On off switch used during re-sync */
+ uint8_t nonce_wait_for_ecne; /* flag when we expect a ECN */
+ uint8_t peer_supports_ecn_nonce;
+
+ /*
+ * This value, plus all other ack'd but above cum-ack is added
+ * together to cross check against the bit that we have yet to
+ * define (probably in the SACK). When the cum-ack is updated, this
+ * sum is updated as well.
+ */
+ uint8_t nonce_sum_expect_base;
+ /* Flag to tell if ECN is allowed */
+ uint8_t ecn_allowed;
+
+ /* flag to indicate if peer can do asconf */
+ uint8_t peer_supports_asconf;
+ /* EY - flag to indicate if peer can do nr_sack */
+ uint8_t peer_supports_nr_sack;
+ /* pr-sctp support flag */
+ uint8_t peer_supports_prsctp;
+ /* peer authentication support flag */
+ uint8_t peer_supports_auth;
+ /* stream resets are supported by the peer */
+ uint8_t peer_supports_strreset;
+
+ uint8_t peer_supports_nat;
+ /*
+ * packet drop's are supported by the peer, we don't really care
+ * about this but we bookkeep it anyway.
+ */
+ uint8_t peer_supports_pktdrop;
+
+ /* Do we allow V6/V4? */
+ uint8_t ipv4_addr_legal;
+ uint8_t ipv6_addr_legal;
+ /* Address scoping flags */
+ /* scope value for IPv4 */
+ uint8_t ipv4_local_scope;
+ /* scope values for IPv6 */
+ uint8_t local_scope;
+ uint8_t site_scope;
+ /* loopback scope */
+ uint8_t loopback_scope;
+ /* flags to handle send alternate net tracking */
+ uint8_t used_alt_onsack;
+ uint8_t used_alt_asconfack;
+ uint8_t fast_retran_loss_recovery;
+ uint8_t sat_t3_loss_recovery;
+ uint8_t dropped_special_cnt;
+ uint8_t seen_a_sack_this_pkt;
+ uint8_t stream_reset_outstanding;
+ uint8_t stream_reset_out_is_outstanding;
+ uint8_t delayed_connection;
+ uint8_t ifp_had_enobuf;
+ uint8_t saw_sack_with_frags;
+ uint8_t saw_sack_with_nr_frags;
+ uint8_t in_asocid_hash;
+ uint8_t assoc_up_sent;
+ uint8_t adaptation_needed;
+ uint8_t adaptation_sent;
+ /* CMT variables */
+ uint8_t cmt_dac_pkts_rcvd;
+ uint8_t sctp_cmt_on_off;
+ uint8_t iam_blocking;
+ uint8_t cookie_how[8];
+ /* EY 05/05/08 - NR_SACK variable */
+ uint8_t sctp_nr_sack_on_off;
+ /* JRS 5/21/07 - CMT PF variable */
+ uint8_t sctp_cmt_pf;
+ /*
+ * The mapping array is used to track out of order sequences above
+	 * last_acked_seq. 0 indicates packet missing, 1 indicates packet
+	 * rec'd. We slide it up every time we raise last_acked_seq and zero
+	 * trailing locations out. If I get a TSN above the array
+ * mappingArraySz, I discard the datagram and let retransmit happen.
+ */
+ uint32_t marked_retrans;
+ uint32_t timoinit;
+ uint32_t timodata;
+ uint32_t timosack;
+ uint32_t timoshutdown;
+ uint32_t timoheartbeat;
+ uint32_t timocookie;
+ uint32_t timoshutdownack;
+ struct timeval start_time;
+ struct timeval discontinuity_time;
+};
+
+#endif
diff --git a/freebsd/sys/netinet/sctp_sysctl.c b/freebsd/sys/netinet/sctp_sysctl.c
new file mode 100644
index 00000000..b5700e4e
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_sysctl.c
@@ -0,0 +1,1108 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp.h>
+#include <freebsd/netinet/sctp_constants.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_output.h>
+#include <freebsd/sys/smp.h>
+
+/*
+ * sysctl tunable variables
+ */
+
+void
+sctp_init_sysctls()
+{
+ SCTP_BASE_SYSCTL(sctp_sendspace) = SCTPCTL_MAXDGRAM_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_recvspace) = SCTPCTL_RECVSPACE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_auto_asconf) = SCTPCTL_AUTOASCONF_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_multiple_asconfs) = SCTPCTL_MULTIPLEASCONFS_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_ecn_enable) = SCTPCTL_ECN_ENABLE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_ecn_nonce) = SCTPCTL_ECN_NONCE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_strict_sacks) = SCTPCTL_STRICT_SACKS_DEFAULT;
+#if !defined(SCTP_WITH_NO_CSUM)
+ SCTP_BASE_SYSCTL(sctp_no_csum_on_loopback) = SCTPCTL_LOOPBACK_NOCSUM_DEFAULT;
+#endif
+ SCTP_BASE_SYSCTL(sctp_strict_init) = SCTPCTL_STRICT_INIT_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_peer_chunk_oh) = SCTPCTL_PEER_CHKOH_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_max_burst_default) = SCTPCTL_MAXBURST_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue) = SCTPCTL_MAXCHUNKS_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_hashtblsize) = SCTPCTL_TCBHASHSIZE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_pcbtblsize) = SCTPCTL_PCBHASHSIZE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_min_split_point) = SCTPCTL_MIN_SPLIT_POINT_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_chunkscale) = SCTPCTL_CHUNKSCALE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default) = SCTPCTL_DELAYED_SACK_TIME_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_sack_freq_default) = SCTPCTL_SACK_FREQ_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_system_free_resc_limit) = SCTPCTL_SYS_RESOURCE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit) = SCTPCTL_ASOC_RESOURCE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default) = SCTPCTL_HEARTBEAT_INTERVAL_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default) = SCTPCTL_PMTU_RAISE_TIME_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default) = SCTPCTL_SHUTDOWN_GUARD_TIME_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_secret_lifetime_default) = SCTPCTL_SECRET_LIFETIME_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_rto_max_default) = SCTPCTL_RTO_MAX_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_rto_min_default) = SCTPCTL_RTO_MIN_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_rto_initial_default) = SCTPCTL_RTO_INITIAL_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_init_rto_max_default) = SCTPCTL_INIT_RTO_MAX_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default) = SCTPCTL_VALID_COOKIE_LIFE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_init_rtx_max_default) = SCTPCTL_INIT_RTX_MAX_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default) = SCTPCTL_ASSOC_RTX_MAX_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_path_rtx_max_default) = SCTPCTL_PATH_RTX_MAX_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_add_more_threshold) = SCTPCTL_ADD_MORE_ON_OUTPUT_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default) = SCTPCTL_OUTGOING_STREAMS_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_cmt_on_off) = SCTPCTL_CMT_ON_OFF_DEFAULT;
+ /* EY */
+ SCTP_BASE_SYSCTL(sctp_nr_sack_on_off) = SCTPCTL_NR_SACK_ON_OFF_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_cmt_use_dac) = SCTPCTL_CMT_USE_DAC_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_cmt_pf) = SCTPCTL_CMT_PF_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) = SCTPCTL_CWND_MAXBURST_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_early_fr) = SCTPCTL_EARLY_FAST_RETRAN_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_early_fr_msec) = SCTPCTL_EARLY_FAST_RETRAN_MSEC_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk) = SCTPCTL_ASCONF_AUTH_NOCHK_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_auth_disable) = SCTPCTL_AUTH_DISABLE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_nat_friendly) = SCTPCTL_NAT_FRIENDLY_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_L2_abc_variable) = SCTPCTL_ABC_L_VAR_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count) = SCTPCTL_MAX_CHAINED_MBUFS_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_do_drain) = SCTPCTL_DO_SCTP_DRAIN_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_hb_maxburst) = SCTPCTL_HB_MAX_BURST_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit) = SCTPCTL_ABORT_AT_LIMIT_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_strict_data_order) = SCTPCTL_STRICT_DATA_ORDER_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_min_residual) = SCTPCTL_MIN_RESIDUAL_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_max_retran_chunk) = SCTPCTL_MAX_RETRAN_CHUNK_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_logging_level) = SCTPCTL_LOGGING_LEVEL_DEFAULT;
+ /* JRS - Variable for default congestion control module */
+ SCTP_BASE_SYSCTL(sctp_default_cc_module) = SCTPCTL_DEFAULT_CC_MODULE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_default_frag_interleave) = SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_mobility_base) = SCTPCTL_MOBILITY_BASE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff) = SCTPCTL_MOBILITY_FASTHANDOFF_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_vtag_time_wait) = SCTPCTL_TIME_WAIT_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_buffer_splitting) = SCTPCTL_BUFFER_SPLITTING_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_initial_cwnd) = SCTPCTL_INITIAL_CWND_DEFAULT;
+#if defined(SCTP_LOCAL_TRACE_BUF)
+ memset(&SCTP_BASE_SYSCTL(sctp_log), 0, sizeof(struct sctp_log));
+#endif
+ SCTP_BASE_SYSCTL(sctp_udp_tunneling_for_client_enable) = SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_udp_tunneling_port) = SCTPCTL_UDP_TUNNELING_PORT_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) = SCTPCTL_SACK_IMMEDIATELY_ENABLE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly) = SCTPCTL_NAT_FRIENDLY_INITS_DEFAULT;
+#if defined(SCTP_DEBUG)
+ SCTP_BASE_SYSCTL(sctp_debug_on) = SCTPCTL_DEBUG_DEFAULT;
+#endif
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_BASE_SYSCTL(sctp_output_unlocked) = SCTPCTL_OUTPUT_UNLOCKED_DEFAULT;
+#endif
+}
+
+
+/* It returns an upper limit. No filtering is done here */
+static unsigned int
+number_of_addresses(struct sctp_inpcb *inp)
+{
+ int cnt;
+ struct sctp_vrf *vrf;
+ struct sctp_ifn *sctp_ifn;
+ struct sctp_ifa *sctp_ifa;
+ struct sctp_laddr *laddr;
+
+ cnt = 0;
+ /* neither Mac OS X nor FreeBSD supports multiple routing functions */
+ if ((vrf = sctp_find_vrf(inp->def_vrf_id)) == NULL) {
+ return (0);
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
+ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+ if ((sctp_ifa->address.sa.sa_family == AF_INET) ||
+ (sctp_ifa->address.sa.sa_family == AF_INET6)) {
+ cnt++;
+ }
+ }
+ }
+ } else {
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ if ((laddr->ifa->address.sa.sa_family == AF_INET) ||
+ (laddr->ifa->address.sa.sa_family == AF_INET6)) {
+ cnt++;
+ }
+ }
+ }
+ return (cnt);
+}
+
+static int
+copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sysctl_req *req)
+{
+ struct sctp_ifn *sctp_ifn;
+ struct sctp_ifa *sctp_ifa;
+ int loopback_scope, ipv4_local_scope, local_scope, site_scope;
+ int ipv4_addr_legal, ipv6_addr_legal;
+ struct sctp_vrf *vrf;
+ struct xsctp_laddr xladdr;
+ struct sctp_laddr *laddr;
+ int error;
+
+ /* Turn on all the appropriate scope flags */
+ if (stcb) {
+ /* use association specific values */
+ loopback_scope = stcb->asoc.loopback_scope;
+ ipv4_local_scope = stcb->asoc.ipv4_local_scope;
+ local_scope = stcb->asoc.local_scope;
+ site_scope = stcb->asoc.site_scope;
+ } else {
+ /* use generic values for endpoints */
+ loopback_scope = 1;
+ ipv4_local_scope = 1;
+ local_scope = 1;
+ site_scope = 1;
+ }
+
+ /* use only address families of interest */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ ipv6_addr_legal = 1;
+ if (SCTP_IPV6_V6ONLY(inp)) {
+ ipv4_addr_legal = 0;
+ } else {
+ ipv4_addr_legal = 1;
+ }
+ } else {
+ ipv4_addr_legal = 1;
+ ipv6_addr_legal = 0;
+ }
+
+ /* neither Mac OS X nor FreeBSD supports multiple routing functions */
+ if ((vrf = sctp_find_vrf(inp->def_vrf_id)) == NULL) {
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ return (-1);
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
+ if ((loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifn))
+ /* Skip loopback if loopback_scope not set */
+ continue;
+ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+ if (stcb) {
+ /*
+ * ignore if blacklisted at
+ * association level
+ */
+ if (sctp_is_addr_restricted(stcb, sctp_ifa))
+ continue;
+ }
+ switch (sctp_ifa->address.sa.sa_family) {
+ case AF_INET:
+ if (ipv4_addr_legal) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&sctp_ifa->address.sa;
+ if (sin->sin_addr.s_addr == 0)
+ continue;
+ if ((ipv4_local_scope == 0) && (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)))
+ continue;
+ } else {
+ continue;
+ }
+ break;
+#ifdef INET6
+ case AF_INET6:
+ if (ipv6_addr_legal) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa;
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+ continue;
+ if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+ if (local_scope == 0)
+ continue;
+ if (sin6->sin6_scope_id == 0) {
+ /*
+ * bad link
+ * local
+ * address
+ */
+ if (sa6_recoverscope(sin6) != 0)
+ continue;
+ }
+ }
+ if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)))
+ continue;
+ } else {
+ continue;
+ }
+ break;
+#endif
+ default:
+ continue;
+ }
+ memset((void *)&xladdr, 0, sizeof(struct xsctp_laddr));
+ memcpy((void *)&xladdr.address, (const void *)&sctp_ifa->address, sizeof(union sctp_sockstore));
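+ /*
+ * Release the INP and INP_INFO locks before SYSCTL_OUT copies
+ * the entry out to user space (which may block); they are
+ * reacquired below unless the copy fails.
+ */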
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ error = SYSCTL_OUT(req, &xladdr, sizeof(struct xsctp_laddr));
+ if (error) {
+ return (error);
+ } else {
+ SCTP_INP_INFO_RLOCK();
+ SCTP_INP_RLOCK(inp);
+ }
+ }
+ }
+ } else {
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ /* ignore if blacklisted at association level */
+ if (stcb && sctp_is_addr_restricted(stcb, laddr->ifa))
+ continue;
+ memset((void *)&xladdr, 0, sizeof(struct xsctp_laddr));
+ memcpy((void *)&xladdr.address, (const void *)&laddr->ifa->address, sizeof(union sctp_sockstore));
+ xladdr.start_time.tv_sec = (uint32_t) laddr->start_time.tv_sec;
+ xladdr.start_time.tv_usec = (uint32_t) laddr->start_time.tv_usec;
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ error = SYSCTL_OUT(req, &xladdr, sizeof(struct xsctp_laddr));
+ if (error) {
+ return (error);
+ } else {
+ SCTP_INP_INFO_RLOCK();
+ SCTP_INP_RLOCK(inp);
+ }
+ }
+ }
+ memset((void *)&xladdr, 0, sizeof(struct xsctp_laddr));
+ xladdr.last = 1;
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ error = SYSCTL_OUT(req, &xladdr, sizeof(struct xsctp_laddr));
+
+ if (error) {
+ return (error);
+ } else {
+ SCTP_INP_INFO_RLOCK();
+ SCTP_INP_RLOCK(inp);
+ return (0);
+ }
+}
+
+/*
+ * sysctl functions
+ */
+static int
+sctp_assoclist(SYSCTL_HANDLER_ARGS)
+{
+ unsigned int number_of_endpoints;
+ unsigned int number_of_local_addresses;
+ unsigned int number_of_associations;
+ unsigned int number_of_remote_addresses;
+ unsigned int n;
+ int error;
+ struct sctp_inpcb *inp;
+ struct sctp_tcb *stcb;
+ struct sctp_nets *net;
+ struct xsctp_inpcb xinpcb;
+ struct xsctp_tcb xstcb;
+ struct xsctp_raddr xraddr;
+ struct socket *so;
+
+ number_of_endpoints = 0;
+ number_of_local_addresses = 0;
+ number_of_associations = 0;
+ number_of_remote_addresses = 0;
+
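+ /*
+ * This handler runs in two passes: when the caller supplies no
+ * output buffer (oldptr is NULL), only an upper bound on the
+ * required size is computed and reported via oldidx; otherwise
+ * the endpoint, association and address records are copied out
+ * one by one.
+ */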
+ SCTP_INP_INFO_RLOCK();
+ if (req->oldptr == USER_ADDR_NULL) {
+ LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) {
+ SCTP_INP_RLOCK(inp);
+ number_of_endpoints++;
+ number_of_local_addresses += number_of_addresses(inp);
+ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
+ number_of_associations++;
+ number_of_local_addresses += number_of_addresses(inp);
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ number_of_remote_addresses++;
+ }
+ }
+ SCTP_INP_RUNLOCK(inp);
+ }
+ SCTP_INP_INFO_RUNLOCK();
+ n = (number_of_endpoints + 1) * sizeof(struct xsctp_inpcb) +
+ (number_of_local_addresses + number_of_endpoints + number_of_associations) * sizeof(struct xsctp_laddr) +
+ (number_of_associations + number_of_endpoints) * sizeof(struct xsctp_tcb) +
+ (number_of_remote_addresses + number_of_associations) * sizeof(struct xsctp_raddr);
+
+ /* request some more memory than needed */
+ req->oldidx = (n + n / 8);
+ return 0;
+ }
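+ /* This node is read-only; reject any attempt to write to it. */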
+ if (req->newptr != USER_ADDR_NULL) {
+ SCTP_INP_INFO_RUNLOCK();
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_SYSCTL, EPERM);
+ return EPERM;
+ }
+ LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) {
+ SCTP_INP_RLOCK(inp);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
+ /* if the socket is all gone it is being freed - skip it */
+ goto skip;
+ }
+ xinpcb.last = 0;
+ xinpcb.local_port = ntohs(inp->sctp_lport);
+ xinpcb.flags = inp->sctp_flags;
+ xinpcb.features = inp->sctp_features;
+ xinpcb.total_sends = inp->total_sends;
+ xinpcb.total_recvs = inp->total_recvs;
+ xinpcb.total_nospaces = inp->total_nospaces;
+ xinpcb.fragmentation_point = inp->sctp_frag_point;
+ so = inp->sctp_socket;
+ if ((so == NULL) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
+ xinpcb.qlen = 0;
+ xinpcb.maxqlen = 0;
+ } else {
+ xinpcb.qlen = so->so_qlen;
+ xinpcb.maxqlen = so->so_qlimit;
+ }
+ SCTP_INP_INCR_REF(inp);
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ error = SYSCTL_OUT(req, &xinpcb, sizeof(struct xsctp_inpcb));
+ if (error) {
+ SCTP_INP_DECR_REF(inp);
+ return error;
+ }
+ SCTP_INP_INFO_RLOCK();
+ SCTP_INP_RLOCK(inp);
+ error = copy_out_local_addresses(inp, NULL, req);
+ if (error) {
+ SCTP_INP_DECR_REF(inp);
+ return error;
+ }
+ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
+ SCTP_TCB_LOCK(stcb);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ xstcb.last = 0;
+ xstcb.local_port = ntohs(inp->sctp_lport);
+ xstcb.remote_port = ntohs(stcb->rport);
+ if (stcb->asoc.primary_destination != NULL)
+ xstcb.primary_addr = stcb->asoc.primary_destination->ro._l_addr;
+ xstcb.heartbeat_interval = stcb->asoc.heart_beat_delay;
+ xstcb.state = SCTP_GET_STATE(&stcb->asoc); /* FIXME */
+ /* 7.0 does not support these */
+ xstcb.assoc_id = sctp_get_associd(stcb);
+ xstcb.peers_rwnd = stcb->asoc.peers_rwnd;
+ xstcb.in_streams = stcb->asoc.streamincnt;
+ xstcb.out_streams = stcb->asoc.streamoutcnt;
+ xstcb.max_nr_retrans = stcb->asoc.overall_error_count;
+ xstcb.primary_process = 0; /* not really supported
+ * yet */
+ xstcb.T1_expireries = stcb->asoc.timoinit + stcb->asoc.timocookie;
+ xstcb.T2_expireries = stcb->asoc.timoshutdown + stcb->asoc.timoshutdownack;
+ xstcb.retransmitted_tsns = stcb->asoc.marked_retrans;
+ xstcb.start_time.tv_sec = (uint32_t) stcb->asoc.start_time.tv_sec;
+ xstcb.start_time.tv_usec = (uint32_t) stcb->asoc.start_time.tv_usec;
+ xstcb.discontinuity_time.tv_sec = (uint32_t) stcb->asoc.discontinuity_time.tv_sec;
+ xstcb.discontinuity_time.tv_usec = (uint32_t) stcb->asoc.discontinuity_time.tv_usec;
+ xstcb.total_sends = stcb->total_sends;
+ xstcb.total_recvs = stcb->total_recvs;
+ xstcb.local_tag = stcb->asoc.my_vtag;
+ xstcb.remote_tag = stcb->asoc.peer_vtag;
+ xstcb.initial_tsn = stcb->asoc.init_seq_number;
+ xstcb.highest_tsn = stcb->asoc.sending_seq - 1;
+ xstcb.cumulative_tsn = stcb->asoc.last_acked_seq;
+ xstcb.cumulative_tsn_ack = stcb->asoc.cumulative_tsn;
+ xstcb.mtu = stcb->asoc.smallest_mtu;
+ xstcb.refcnt = stcb->asoc.refcnt;
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ error = SYSCTL_OUT(req, &xstcb, sizeof(struct xsctp_tcb));
+ if (error) {
+ SCTP_INP_DECR_REF(inp);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ return error;
+ }
+ SCTP_INP_INFO_RLOCK();
+ SCTP_INP_RLOCK(inp);
+ error = copy_out_local_addresses(inp, stcb, req);
+ if (error) {
+ SCTP_INP_DECR_REF(inp);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ return error;
+ }
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ xraddr.last = 0;
+ xraddr.address = net->ro._l_addr;
+ xraddr.active = ((net->dest_state & SCTP_ADDR_REACHABLE) == SCTP_ADDR_REACHABLE);
+ xraddr.confirmed = ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0);
+ xraddr.heartbeat_enabled = ((net->dest_state & SCTP_ADDR_NOHB) == 0);
+ xraddr.rto = net->RTO;
+ xraddr.max_path_rtx = net->failure_threshold;
+ xraddr.rtx = net->marked_retrans;
+ xraddr.error_counter = net->error_count;
+ xraddr.cwnd = net->cwnd;
+ xraddr.flight_size = net->flight_size;
+ xraddr.mtu = net->mtu;
+ xraddr.rtt = net->rtt;
+ xraddr.start_time.tv_sec = (uint32_t) net->start_time.tv_sec;
+ xraddr.start_time.tv_usec = (uint32_t) net->start_time.tv_usec;
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ error = SYSCTL_OUT(req, &xraddr, sizeof(struct xsctp_raddr));
+ if (error) {
+ SCTP_INP_DECR_REF(inp);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ return error;
+ }
+ SCTP_INP_INFO_RLOCK();
+ SCTP_INP_RLOCK(inp);
+ }
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ memset((void *)&xraddr, 0, sizeof(struct xsctp_raddr));
+ xraddr.last = 1;
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ error = SYSCTL_OUT(req, &xraddr, sizeof(struct xsctp_raddr));
+ if (error) {
+ SCTP_INP_DECR_REF(inp);
+ return error;
+ }
+ SCTP_INP_INFO_RLOCK();
+ SCTP_INP_RLOCK(inp);
+ }
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_INP_INFO_RUNLOCK();
+ memset((void *)&xstcb, 0, sizeof(struct xsctp_tcb));
+ xstcb.last = 1;
+ error = SYSCTL_OUT(req, &xstcb, sizeof(struct xsctp_tcb));
+ if (error) {
+ return error;
+ }
+skip:
+ SCTP_INP_INFO_RLOCK();
+ }
+ SCTP_INP_INFO_RUNLOCK();
+
+ memset((void *)&xinpcb, 0, sizeof(struct xsctp_inpcb));
+ xinpcb.last = 1;
+ error = SYSCTL_OUT(req, &xinpcb, sizeof(struct xsctp_inpcb));
+ return error;
+}
+
+
+#define RANGECHK(var, min, max) \
+ if ((var) < (min)) { (var) = (min); } \
+ else if ((var) > (max)) { (var) = (max); }
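+
+/*
+ * RANGECHK clamps a variable into [min, max]: for example,
+ * RANGECHK(SCTP_BASE_SYSCTL(sctp_sendspace), SCTPCTL_MAXDGRAM_MIN,
+ * SCTPCTL_MAXDGRAM_MAX) leaves an in-range value untouched and pins an
+ * out-of-range value to the nearest bound.
+ */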
+
+static int
+sysctl_sctp_udp_tunneling_check(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ uint32_t old_sctp_udp_tunneling_port;
+
+ SCTP_INP_INFO_RLOCK();
+ old_sctp_udp_tunneling_port = SCTP_BASE_SYSCTL(sctp_udp_tunneling_port);
+ SCTP_INP_INFO_RUNLOCK();
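+ /*
+ * Remember the old port so that UDP tunneling is only stopped and
+ * restarted below when the value actually changes.
+ */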
+ error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+ if (error == 0) {
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port), SCTPCTL_UDP_TUNNELING_PORT_MIN, SCTPCTL_UDP_TUNNELING_PORT_MAX);
+ if (old_sctp_udp_tunneling_port == SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) {
+ error = 0;
+ goto out;
+ }
+ SCTP_INP_INFO_WLOCK();
+ if (old_sctp_udp_tunneling_port) {
+ sctp_over_udp_stop();
+ }
+ if (SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) {
+ if (sctp_over_udp_start()) {
+ SCTP_BASE_SYSCTL(sctp_udp_tunneling_port) = 0;
+ }
+ }
+ SCTP_INP_INFO_WUNLOCK();
+ }
+out:
+ return (error);
+}
+
+
+static int
+sysctl_sctp_check(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+ if (error == 0) {
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_sendspace), SCTPCTL_MAXDGRAM_MIN, SCTPCTL_MAXDGRAM_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_recvspace), SCTPCTL_RECVSPACE_MIN, SCTPCTL_RECVSPACE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_auto_asconf), SCTPCTL_AUTOASCONF_MIN, SCTPCTL_AUTOASCONF_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_ecn_enable), SCTPCTL_ECN_ENABLE_MIN, SCTPCTL_ECN_ENABLE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_ecn_nonce), SCTPCTL_ECN_NONCE_MIN, SCTPCTL_ECN_NONCE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_strict_sacks), SCTPCTL_STRICT_SACKS_MIN, SCTPCTL_STRICT_SACKS_MAX);
+#if !defined(SCTP_WITH_NO_CSUM)
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_no_csum_on_loopback), SCTPCTL_LOOPBACK_NOCSUM_MIN, SCTPCTL_LOOPBACK_NOCSUM_MAX);
+#endif
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_strict_init), SCTPCTL_STRICT_INIT_MIN, SCTPCTL_STRICT_INIT_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_peer_chunk_oh), SCTPCTL_PEER_CHKOH_MIN, SCTPCTL_PEER_CHKOH_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_max_burst_default), SCTPCTL_MAXBURST_MIN, SCTPCTL_MAXBURST_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue), SCTPCTL_MAXCHUNKS_MIN, SCTPCTL_MAXCHUNKS_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_hashtblsize), SCTPCTL_TCBHASHSIZE_MIN, SCTPCTL_TCBHASHSIZE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_pcbtblsize), SCTPCTL_PCBHASHSIZE_MIN, SCTPCTL_PCBHASHSIZE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_min_split_point), SCTPCTL_MIN_SPLIT_POINT_MIN, SCTPCTL_MIN_SPLIT_POINT_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_chunkscale), SCTPCTL_CHUNKSCALE_MIN, SCTPCTL_CHUNKSCALE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default), SCTPCTL_DELAYED_SACK_TIME_MIN, SCTPCTL_DELAYED_SACK_TIME_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_sack_freq_default), SCTPCTL_SACK_FREQ_MIN, SCTPCTL_SACK_FREQ_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_system_free_resc_limit), SCTPCTL_SYS_RESOURCE_MIN, SCTPCTL_SYS_RESOURCE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit), SCTPCTL_ASOC_RESOURCE_MIN, SCTPCTL_ASOC_RESOURCE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default), SCTPCTL_HEARTBEAT_INTERVAL_MIN, SCTPCTL_HEARTBEAT_INTERVAL_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default), SCTPCTL_PMTU_RAISE_TIME_MIN, SCTPCTL_PMTU_RAISE_TIME_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default), SCTPCTL_SHUTDOWN_GUARD_TIME_MIN, SCTPCTL_SHUTDOWN_GUARD_TIME_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_secret_lifetime_default), SCTPCTL_SECRET_LIFETIME_MIN, SCTPCTL_SECRET_LIFETIME_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_rto_max_default), SCTPCTL_RTO_MAX_MIN, SCTPCTL_RTO_MAX_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_rto_min_default), SCTPCTL_RTO_MIN_MIN, SCTPCTL_RTO_MIN_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_rto_initial_default), SCTPCTL_RTO_INITIAL_MIN, SCTPCTL_RTO_INITIAL_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_init_rto_max_default), SCTPCTL_INIT_RTO_MAX_MIN, SCTPCTL_INIT_RTO_MAX_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default), SCTPCTL_VALID_COOKIE_LIFE_MIN, SCTPCTL_VALID_COOKIE_LIFE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_init_rtx_max_default), SCTPCTL_INIT_RTX_MAX_MIN, SCTPCTL_INIT_RTX_MAX_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default), SCTPCTL_ASSOC_RTX_MAX_MIN, SCTPCTL_ASSOC_RTX_MAX_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_path_rtx_max_default), SCTPCTL_PATH_RTX_MAX_MIN, SCTPCTL_PATH_RTX_MAX_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTPCTL_ADD_MORE_ON_OUTPUT_MIN, SCTPCTL_ADD_MORE_ON_OUTPUT_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), SCTPCTL_OUTGOING_STREAMS_MIN, SCTPCTL_OUTGOING_STREAMS_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_on_off), SCTPCTL_CMT_ON_OFF_MIN, SCTPCTL_CMT_ON_OFF_MAX);
+ /* EY */
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_sack_on_off), SCTPCTL_NR_SACK_ON_OFF_MIN, SCTPCTL_NR_SACK_ON_OFF_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_use_dac), SCTPCTL_CMT_USE_DAC_MIN, SCTPCTL_CMT_USE_DAC_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_pf), SCTPCTL_CMT_PF_MIN, SCTPCTL_CMT_PF_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst), SCTPCTL_CWND_MAXBURST_MIN, SCTPCTL_CWND_MAXBURST_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_early_fr), SCTPCTL_EARLY_FAST_RETRAN_MIN, SCTPCTL_EARLY_FAST_RETRAN_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_early_fr_msec), SCTPCTL_EARLY_FAST_RETRAN_MSEC_MIN, SCTPCTL_EARLY_FAST_RETRAN_MSEC_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk), SCTPCTL_ASCONF_AUTH_NOCHK_MIN, SCTPCTL_ASCONF_AUTH_NOCHK_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_auth_disable), SCTPCTL_AUTH_DISABLE_MIN, SCTPCTL_AUTH_DISABLE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_nat_friendly), SCTPCTL_NAT_FRIENDLY_MIN, SCTPCTL_NAT_FRIENDLY_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_L2_abc_variable), SCTPCTL_ABC_L_VAR_MIN, SCTPCTL_ABC_L_VAR_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count), SCTPCTL_MAX_CHAINED_MBUFS_MIN, SCTPCTL_MAX_CHAINED_MBUFS_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_do_drain), SCTPCTL_DO_SCTP_DRAIN_MIN, SCTPCTL_DO_SCTP_DRAIN_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_hb_maxburst), SCTPCTL_HB_MAX_BURST_MIN, SCTPCTL_HB_MAX_BURST_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit), SCTPCTL_ABORT_AT_LIMIT_MIN, SCTPCTL_ABORT_AT_LIMIT_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_strict_data_order), SCTPCTL_STRICT_DATA_ORDER_MIN, SCTPCTL_STRICT_DATA_ORDER_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_min_residual), SCTPCTL_MIN_RESIDUAL_MIN, SCTPCTL_MIN_RESIDUAL_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_max_retran_chunk), SCTPCTL_MAX_RETRAN_CHUNK_MIN, SCTPCTL_MAX_RETRAN_CHUNK_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_logging_level), SCTPCTL_LOGGING_LEVEL_MIN, SCTPCTL_LOGGING_LEVEL_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_default_cc_module), SCTPCTL_DEFAULT_CC_MODULE_MIN, SCTPCTL_DEFAULT_CC_MODULE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_default_frag_interleave), SCTPCTL_DEFAULT_FRAG_INTERLEAVE_MIN, SCTPCTL_DEFAULT_FRAG_INTERLEAVE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_vtag_time_wait), SCTPCTL_TIME_WAIT_MIN, SCTPCTL_TIME_WAIT_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_buffer_splitting), SCTPCTL_BUFFER_SPLITTING_MIN, SCTPCTL_BUFFER_SPLITTING_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_initial_cwnd), SCTPCTL_INITIAL_CWND_MIN, SCTPCTL_INITIAL_CWND_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_mobility_base), SCTPCTL_MOBILITY_BASE_MIN, SCTPCTL_MOBILITY_BASE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff), SCTPCTL_MOBILITY_FASTHANDOFF_MIN, SCTPCTL_MOBILITY_FASTHANDOFF_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_udp_tunneling_for_client_enable), SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_MIN, SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_enable_sack_immediately), SCTPCTL_SACK_IMMEDIATELY_ENABLE_MIN, SCTPCTL_SACK_IMMEDIATELY_ENABLE_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly), SCTPCTL_NAT_FRIENDLY_INITS_MIN, SCTPCTL_NAT_FRIENDLY_INITS_MAX);
+
+#ifdef SCTP_DEBUG
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_debug_on), SCTPCTL_DEBUG_MIN, SCTPCTL_DEBUG_MAX);
+#endif
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_output_unlocked), SCTPCTL_OUTPUT_UNLOCKED_MIN, SCTPCTL_OUTPUT_UNLOCKED_MAX);
+#endif
+ }
+ return (error);
+}
+
+#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
+static int
+sysctl_stat_get(SYSCTL_HANDLER_ARGS)
+{
+ int cpu, error;
+ struct sctpstat sb, *sarry;
+
+ memset(&sb, 0, sizeof(sb));
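+ /*
+ * Sum the per-CPU statistics into a single struct sctpstat, keeping
+ * the most recent discontinuity time seen on any CPU.
+ */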
+ for (cpu = 0; cpu < mp_ncpus; cpu++) {
+ sarry = &SCTP_BASE_STATS[cpu];
+ if (sarry->sctps_discontinuitytime.tv_sec > sb.sctps_discontinuitytime.tv_sec) {
+ sb.sctps_discontinuitytime.tv_sec = sarry->sctps_discontinuitytime.tv_sec;
+ sb.sctps_discontinuitytime.tv_usec = sarry->sctps_discontinuitytime.tv_usec;
+ }
+ sb.sctps_currestab += sarry->sctps_currestab;
+ sb.sctps_activeestab += sarry->sctps_activeestab;
+ sb.sctps_restartestab += sarry->sctps_restartestab;
+ sb.sctps_collisionestab += sarry->sctps_collisionestab;
+ sb.sctps_passiveestab += sarry->sctps_passiveestab;
+ sb.sctps_aborted += sarry->sctps_aborted;
+ sb.sctps_shutdown += sarry->sctps_shutdown;
+ sb.sctps_outoftheblue += sarry->sctps_outoftheblue;
+ sb.sctps_checksumerrors += sarry->sctps_checksumerrors;
+ sb.sctps_outcontrolchunks += sarry->sctps_outcontrolchunks;
+ sb.sctps_outorderchunks += sarry->sctps_outorderchunks;
+ sb.sctps_outunorderchunks += sarry->sctps_outunorderchunks;
+ sb.sctps_incontrolchunks += sarry->sctps_incontrolchunks;
+ sb.sctps_inorderchunks += sarry->sctps_inorderchunks;
+ sb.sctps_inunorderchunks += sarry->sctps_inunorderchunks;
+ sb.sctps_fragusrmsgs += sarry->sctps_fragusrmsgs;
+ sb.sctps_reasmusrmsgs += sarry->sctps_reasmusrmsgs;
+ sb.sctps_outpackets += sarry->sctps_outpackets;
+ sb.sctps_inpackets += sarry->sctps_inpackets;
+ sb.sctps_recvpackets += sarry->sctps_recvpackets;
+ sb.sctps_recvdatagrams += sarry->sctps_recvdatagrams;
+ sb.sctps_recvpktwithdata += sarry->sctps_recvpktwithdata;
+ sb.sctps_recvsacks += sarry->sctps_recvsacks;
+ sb.sctps_recvdata += sarry->sctps_recvdata;
+ sb.sctps_recvdupdata += sarry->sctps_recvdupdata;
+ sb.sctps_recvheartbeat += sarry->sctps_recvheartbeat;
+ sb.sctps_recvheartbeatack += sarry->sctps_recvheartbeatack;
+ sb.sctps_recvecne += sarry->sctps_recvecne;
+ sb.sctps_recvauth += sarry->sctps_recvauth;
+ sb.sctps_recvauthmissing += sarry->sctps_recvauthmissing;
+ sb.sctps_recvivalhmacid += sarry->sctps_recvivalhmacid;
+ sb.sctps_recvivalkeyid += sarry->sctps_recvivalkeyid;
+ sb.sctps_recvauthfailed += sarry->sctps_recvauthfailed;
+ sb.sctps_recvexpress += sarry->sctps_recvexpress;
+ sb.sctps_recvexpressm += sarry->sctps_recvexpressm;
+ sb.sctps_recvnocrc += sarry->sctps_recvnocrc;
+ sb.sctps_recvswcrc += sarry->sctps_recvswcrc;
+ sb.sctps_recvhwcrc += sarry->sctps_recvhwcrc;
+ sb.sctps_sendpackets += sarry->sctps_sendpackets;
+ sb.sctps_sendsacks += sarry->sctps_sendsacks;
+ sb.sctps_senddata += sarry->sctps_senddata;
+ sb.sctps_sendretransdata += sarry->sctps_sendretransdata;
+ sb.sctps_sendfastretrans += sarry->sctps_sendfastretrans;
+ sb.sctps_sendmultfastretrans += sarry->sctps_sendmultfastretrans;
+ sb.sctps_sendheartbeat += sarry->sctps_sendheartbeat;
+ sb.sctps_sendecne += sarry->sctps_sendecne;
+ sb.sctps_sendauth += sarry->sctps_sendauth;
+ sb.sctps_senderrors += sarry->sctps_senderrors;
+ sb.sctps_sendnocrc += sarry->sctps_sendnocrc;
+ sb.sctps_sendswcrc += sarry->sctps_sendswcrc;
+ sb.sctps_sendhwcrc += sarry->sctps_sendhwcrc;
+ sb.sctps_pdrpfmbox += sarry->sctps_pdrpfmbox;
+ sb.sctps_pdrpfehos += sarry->sctps_pdrpfehos;
+ sb.sctps_pdrpmbda += sarry->sctps_pdrpmbda;
+ sb.sctps_pdrpmbct += sarry->sctps_pdrpmbct;
+ sb.sctps_pdrpbwrpt += sarry->sctps_pdrpbwrpt;
+ sb.sctps_pdrpcrupt += sarry->sctps_pdrpcrupt;
+ sb.sctps_pdrpnedat += sarry->sctps_pdrpnedat;
+ sb.sctps_pdrppdbrk += sarry->sctps_pdrppdbrk;
+ sb.sctps_pdrptsnnf += sarry->sctps_pdrptsnnf;
+ sb.sctps_pdrpdnfnd += sarry->sctps_pdrpdnfnd;
+ sb.sctps_pdrpdiwnp += sarry->sctps_pdrpdiwnp;
+ sb.sctps_pdrpdizrw += sarry->sctps_pdrpdizrw;
+ sb.sctps_pdrpbadd += sarry->sctps_pdrpbadd;
+ sb.sctps_pdrpmark += sarry->sctps_pdrpmark;
+ sb.sctps_timoiterator += sarry->sctps_timoiterator;
+ sb.sctps_timodata += sarry->sctps_timodata;
+ sb.sctps_timowindowprobe += sarry->sctps_timowindowprobe;
+ sb.sctps_timoinit += sarry->sctps_timoinit;
+ sb.sctps_timosack += sarry->sctps_timosack;
+ sb.sctps_timoshutdown += sarry->sctps_timoshutdown;
+ sb.sctps_timoheartbeat += sarry->sctps_timoheartbeat;
+ sb.sctps_timocookie += sarry->sctps_timocookie;
+ sb.sctps_timosecret += sarry->sctps_timosecret;
+ sb.sctps_timopathmtu += sarry->sctps_timopathmtu;
+ sb.sctps_timoshutdownack += sarry->sctps_timoshutdownack;
+ sb.sctps_timoshutdownguard += sarry->sctps_timoshutdownguard;
+ sb.sctps_timostrmrst += sarry->sctps_timostrmrst;
+ sb.sctps_timoearlyfr += sarry->sctps_timoearlyfr;
+ sb.sctps_timoasconf += sarry->sctps_timoasconf;
+ sb.sctps_timodelprim += sarry->sctps_timodelprim;
+ sb.sctps_timoautoclose += sarry->sctps_timoautoclose;
+ sb.sctps_timoassockill += sarry->sctps_timoassockill;
+ sb.sctps_timoinpkill += sarry->sctps_timoinpkill;
+ sb.sctps_earlyfrstart += sarry->sctps_earlyfrstart;
+ sb.sctps_earlyfrstop += sarry->sctps_earlyfrstop;
+ sb.sctps_earlyfrmrkretrans += sarry->sctps_earlyfrmrkretrans;
+ sb.sctps_earlyfrstpout += sarry->sctps_earlyfrstpout;
+ sb.sctps_earlyfrstpidsck1 += sarry->sctps_earlyfrstpidsck1;
+ sb.sctps_earlyfrstpidsck2 += sarry->sctps_earlyfrstpidsck2;
+ sb.sctps_earlyfrstpidsck3 += sarry->sctps_earlyfrstpidsck3;
+ sb.sctps_earlyfrstpidsck4 += sarry->sctps_earlyfrstpidsck4;
+ sb.sctps_earlyfrstrid += sarry->sctps_earlyfrstrid;
+ sb.sctps_earlyfrstrout += sarry->sctps_earlyfrstrout;
+ sb.sctps_earlyfrstrtmr += sarry->sctps_earlyfrstrtmr;
+ sb.sctps_hdrops += sarry->sctps_hdrops;
+ sb.sctps_badsum += sarry->sctps_badsum;
+ sb.sctps_noport += sarry->sctps_noport;
+ sb.sctps_badvtag += sarry->sctps_badvtag;
+ sb.sctps_badsid += sarry->sctps_badsid;
+ sb.sctps_nomem += sarry->sctps_nomem;
+ sb.sctps_fastretransinrtt += sarry->sctps_fastretransinrtt;
+ sb.sctps_markedretrans += sarry->sctps_markedretrans;
+ sb.sctps_naglesent += sarry->sctps_naglesent;
+ sb.sctps_naglequeued += sarry->sctps_naglequeued;
+ sb.sctps_maxburstqueued += sarry->sctps_maxburstqueued;
+ sb.sctps_ifnomemqueued += sarry->sctps_ifnomemqueued;
+ sb.sctps_windowprobed += sarry->sctps_windowprobed;
+ sb.sctps_lowlevelerr += sarry->sctps_lowlevelerr;
+ sb.sctps_lowlevelerrusr += sarry->sctps_lowlevelerrusr;
+ sb.sctps_datadropchklmt += sarry->sctps_datadropchklmt;
+ sb.sctps_datadroprwnd += sarry->sctps_datadroprwnd;
+ sb.sctps_ecnereducedcwnd += sarry->sctps_ecnereducedcwnd;
+ sb.sctps_vtagexpress += sarry->sctps_vtagexpress;
+ sb.sctps_vtagbogus += sarry->sctps_vtagbogus;
+ sb.sctps_primary_randry += sarry->sctps_primary_randry;
+ sb.sctps_cmt_randry += sarry->sctps_cmt_randry;
+ sb.sctps_slowpath_sack += sarry->sctps_slowpath_sack;
+ sb.sctps_wu_sacks_sent += sarry->sctps_wu_sacks_sent;
+ sb.sctps_sends_with_flags += sarry->sctps_sends_with_flags;
+ sb.sctps_sends_with_unord += sarry->sctps_sends_with_unord;
+ sb.sctps_sends_with_eof += sarry->sctps_sends_with_eof;
+ sb.sctps_sends_with_abort += sarry->sctps_sends_with_abort;
+ sb.sctps_protocol_drain_calls += sarry->sctps_protocol_drain_calls;
+ sb.sctps_protocol_drains_done += sarry->sctps_protocol_drains_done;
+ sb.sctps_read_peeks += sarry->sctps_read_peeks;
+ sb.sctps_cached_chk += sarry->sctps_cached_chk;
+ sb.sctps_cached_strmoq += sarry->sctps_cached_strmoq;
+ sb.sctps_left_abandon += sarry->sctps_left_abandon;
+ sb.sctps_send_burst_avoid += sarry->sctps_send_burst_avoid;
+ sb.sctps_send_cwnd_avoid += sarry->sctps_send_cwnd_avoid;
+ sb.sctps_fwdtsn_map_over += sarry->sctps_fwdtsn_map_over;
+ }
+ error = SYSCTL_OUT(req, &sb, sizeof(sb));
+ return (error);
+}
+
+#endif
+
+#if defined(SCTP_LOCAL_TRACE_BUF)
+static int
+sysctl_sctp_cleartrace(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+
+ memset(&SCTP_BASE_SYSCTL(sctp_log), 0, sizeof(struct sctp_log));
+ return (error);
+}
+
+#endif
+
+
+/*
+ * sysctl definitions
+ */
+
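+/*
+ * The SYSCTL_PROC entries below expose the struct sctp_sysctl fields under
+ * the net.inet.sctp tree. Writable tunables route writes through
+ * sysctl_sctp_check (or sysctl_sctp_udp_tunneling_check for the tunneling
+ * port) so the new value is clamped to its SCTPCTL_*_MIN / SCTPCTL_*_MAX
+ * bounds; the stats and assoclist nodes use their own handlers.
+ */
+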
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, sendspace, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_sendspace), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_MAXDGRAM_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, recvspace, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_recvspace), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_RECVSPACE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, auto_asconf, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_auto_asconf), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_AUTOASCONF_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, ecn_enable, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_ecn_enable), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_ECN_ENABLE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, ecn_nonce, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_ecn_nonce), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_ECN_NONCE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, strict_sacks, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_strict_sacks), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_STRICT_SACKS_DESC);
+
+#if !defined(SCTP_WITH_NO_CSUM)
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, loopback_nocsum, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_no_csum_on_loopback), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_LOOPBACK_NOCSUM_DESC);
+#endif
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, strict_init, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_strict_init), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_STRICT_INIT_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, peer_chkoh, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_peer_chunk_oh), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_PEER_CHKOH_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, maxburst, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_max_burst_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_MAXBURST_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, maxchunks, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_MAXCHUNKS_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, tcbhashsize, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_hashtblsize), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_TCBHASHSIZE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, pcbhashsize, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_pcbtblsize), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_PCBHASHSIZE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, min_split_point, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_min_split_point), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_MIN_SPLIT_POINT_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, chunkscale, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_chunkscale), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_CHUNKSCALE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, delayed_sack_time, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_DELAYED_SACK_TIME_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, sack_freq, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_sack_freq_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_SACK_FREQ_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, sys_resource, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_system_free_resc_limit), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_SYS_RESOURCE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, asoc_resource, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_ASOC_RESOURCE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, heartbeat_interval, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_HEARTBEAT_INTERVAL_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, pmtu_raise_time, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_PMTU_RAISE_TIME_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, shutdown_guard_time, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_SHUTDOWN_GUARD_TIME_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, secret_lifetime, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_secret_lifetime_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_SECRET_LIFETIME_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, rto_max, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_rto_max_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_RTO_MAX_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, rto_min, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_rto_min_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_RTO_MIN_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, rto_initial, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_rto_initial_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_RTO_INITIAL_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, init_rto_max, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_init_rto_max_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_INIT_RTO_MAX_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, valid_cookie_life, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_VALID_COOKIE_LIFE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, init_rtx_max, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_init_rtx_max_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_INIT_RTX_MAX_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, assoc_rtx_max, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_ASSOC_RTX_MAX_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, path_rtx_max, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_path_rtx_max_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_PATH_RTX_MAX_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, add_more_on_output, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_add_more_threshold), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_ADD_MORE_ON_OUTPUT_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, outgoing_streams, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_OUTGOING_STREAMS_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, cmt_on_off, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_cmt_on_off), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_CMT_ON_OFF_DESC);
+
+/* EY */
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, nr_sack_on_off, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_nr_sack_on_off), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_NR_SACK_ON_OFF_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, cmt_use_dac, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_cmt_use_dac), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_CMT_USE_DAC_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, cmt_pf, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_cmt_pf), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_CMT_PF_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, cwnd_maxburst, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_CWND_MAXBURST_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, early_fast_retran, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_early_fr), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_EARLY_FAST_RETRAN_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, early_fast_retran_msec, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_early_fr_msec), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_EARLY_FAST_RETRAN_MSEC_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, asconf_auth_nochk, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_ASCONF_AUTH_NOCHK_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, auth_disable, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_auth_disable), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_AUTH_DISABLE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, nat_friendly, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_nat_friendly), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_NAT_FRIENDLY_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, abc_l_var, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_L2_abc_variable), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_ABC_L_VAR_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, max_chained_mbufs, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_MAX_CHAINED_MBUFS_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, do_sctp_drain, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_do_drain), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_DO_SCTP_DRAIN_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, hb_max_burst, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_hb_maxburst), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_HB_MAX_BURST_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, abort_at_limit, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_ABORT_AT_LIMIT_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, strict_data_order, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_strict_data_order), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_STRICT_DATA_ORDER_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, min_residual, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_min_residual), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_MIN_RESIDUAL_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, max_retran_chunk, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_max_retran_chunk), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_MAX_RETRAN_CHUNK_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, log_level, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_logging_level), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_LOGGING_LEVEL_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, default_cc_module, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_default_cc_module), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_DEFAULT_CC_MODULE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, default_frag_interleave, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_default_frag_interleave), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, mobility_base, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_mobility_base), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_MOBILITY_BASE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, mobility_fasthandoff, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_MOBILITY_FASTHANDOFF_DESC);
+
+#if defined(SCTP_LOCAL_TRACE_BUF)
+SYSCTL_STRUCT(_net_inet_sctp, OID_AUTO, log, CTLFLAG_RD,
+ &SCTP_BASE_SYSCTL(sctp_log), sctp_log,
+ "SCTP logging (struct sctp_log)");
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, clear_trace, CTLTYPE_OPAQUE | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_log), 0, sysctl_sctp_cleartrace, "IU",
+ "Clear SCTP Logging buffer");
+
+
+
+#endif
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, udp_tunneling_for_client_enable, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_udp_tunneling_for_client_enable), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, udp_tunneling_port, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_udp_tunneling_port), 0, sysctl_sctp_udp_tunneling_check, "IU",
+ SCTPCTL_UDP_TUNNELING_PORT_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, enable_sack_immediately, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_enable_sack_immediately), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_SACK_IMMEDIATELY_ENABLE_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, nat_friendly_init, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_NAT_FRIENDLY_INITS_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, vtag_time_wait, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_vtag_time_wait), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_TIME_WAIT_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, buffer_splitting, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_buffer_splitting), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_BUFFER_SPLITTING_DESC);
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, initial_cwnd, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_initial_cwnd), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_INITIAL_CWND_DESC);
+
+#ifdef SCTP_DEBUG
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, debug, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_debug_on), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_DEBUG_DESC);
+#endif /* SCTP_DEBUG */
+
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, output_unlocked, CTLTYPE_INT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_output_unlocked), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_OUTPUT_UNLOCKED_DESC);
+#endif
+#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, stats,
+ CTLTYPE_STRUCT | CTLFLAG_RD,
+ 0, 0, sysctl_stat_get, "S,sctpstat",
+ "SCTP statistics (struct sctp_stat)");
+#else
+SYSCTL_STRUCT(_net_inet_sctp, OID_AUTO, stats, CTLFLAG_RW,
+ &SCTP_BASE_STATS_SYSCTL, sctpstat,
+ "SCTP statistics (struct sctp_stat)");
+#endif
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, assoclist, CTLFLAG_RD,
+ 0, 0, sctp_assoclist,
+ "S,xassoc", "List of active SCTP associations");
diff --git a/freebsd/sys/netinet/sctp_sysctl.h b/freebsd/sys/netinet/sctp_sysctl.h
new file mode 100644
index 00000000..5f7f270d
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_sysctl.h
@@ -0,0 +1,532 @@
+/*-
+ * Copyright (c) 2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_sysctl_h__
+#define __sctp_sysctl_h__
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp_constants.h>
+
+struct sctp_sysctl {
+ uint32_t sctp_sendspace;
+ uint32_t sctp_recvspace;
+ uint32_t sctp_auto_asconf;
+ uint32_t sctp_multiple_asconfs;
+ uint32_t sctp_ecn_enable;
+ uint32_t sctp_ecn_nonce;
+ uint32_t sctp_strict_sacks;
+#if !defined(SCTP_WITH_NO_CSUM)
+ uint32_t sctp_no_csum_on_loopback;
+#endif
+ uint32_t sctp_strict_init;
+ uint32_t sctp_peer_chunk_oh;
+ uint32_t sctp_max_burst_default;
+ uint32_t sctp_max_chunks_on_queue;
+ uint32_t sctp_hashtblsize;
+ uint32_t sctp_pcbtblsize;
+ uint32_t sctp_min_split_point;
+ uint32_t sctp_chunkscale;
+ uint32_t sctp_delayed_sack_time_default;
+ uint32_t sctp_sack_freq_default;
+ uint32_t sctp_system_free_resc_limit;
+ uint32_t sctp_asoc_free_resc_limit;
+ uint32_t sctp_heartbeat_interval_default;
+ uint32_t sctp_pmtu_raise_time_default;
+ uint32_t sctp_shutdown_guard_time_default;
+ uint32_t sctp_secret_lifetime_default;
+ uint32_t sctp_rto_max_default;
+ uint32_t sctp_rto_min_default;
+ uint32_t sctp_rto_initial_default;
+ uint32_t sctp_init_rto_max_default;
+ uint32_t sctp_valid_cookie_life_default;
+ uint32_t sctp_init_rtx_max_default;
+ uint32_t sctp_assoc_rtx_max_default;
+ uint32_t sctp_path_rtx_max_default;
+ uint32_t sctp_add_more_threshold;
+ uint32_t sctp_nr_outgoing_streams_default;
+ uint32_t sctp_cmt_on_off;
+ uint32_t sctp_cmt_use_dac;
+ /* EY 5/5/08 - nr_sack flag variable */
+ uint32_t sctp_nr_sack_on_off;
+ uint32_t sctp_cmt_pf;
+ uint32_t sctp_use_cwnd_based_maxburst;
+ uint32_t sctp_early_fr;
+ uint32_t sctp_early_fr_msec;
+ uint32_t sctp_asconf_auth_nochk;
+ uint32_t sctp_auth_disable;
+ uint32_t sctp_nat_friendly;
+ uint32_t sctp_L2_abc_variable;
+ uint32_t sctp_mbuf_threshold_count;
+ uint32_t sctp_do_drain;
+ uint32_t sctp_hb_maxburst;
+ uint32_t sctp_abort_if_one_2_one_hits_limit;
+ uint32_t sctp_strict_data_order;
+ uint32_t sctp_min_residual;
+ uint32_t sctp_max_retran_chunk;
+ uint32_t sctp_logging_level;
+ /* JRS - Variable for default congestion control module */
+ uint32_t sctp_default_cc_module;
+ uint32_t sctp_default_frag_interleave;
+ uint32_t sctp_mobility_base;
+ uint32_t sctp_mobility_fasthandoff;
+ uint32_t sctp_inits_include_nat_friendly;
+#if defined(SCTP_LOCAL_TRACE_BUF)
+ struct sctp_log sctp_log;
+#endif
+ uint32_t sctp_udp_tunneling_for_client_enable;
+ uint32_t sctp_udp_tunneling_port;
+ uint32_t sctp_enable_sack_immediately;
+ uint32_t sctp_vtag_time_wait;
+ uint32_t sctp_buffer_splitting;
+ uint32_t sctp_initial_cwnd;
+#if defined(SCTP_DEBUG)
+ uint32_t sctp_debug_on;
+#endif
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ uint32_t sctp_output_unlocked;
+#endif
+};
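+
+/*
+ * The fields above are read and written through the SCTP_BASE_SYSCTL()
+ * accessor, as used throughout sctp_sysctl.c.
+ */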
+
+/*
+ * limits for the sysctl variables
+ */
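+/*
+ * Each tunable below is described by four macros: a _DESC string used as
+ * the sysctl description, _MIN and _MAX bounds enforced by RANGECHK() in
+ * sctp_sysctl.c, and a _DEFAULT installed by sctp_init_sysctls().
+ */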
+/* maxdgram: Maximum outgoing SCTP buffer size */
+#define SCTPCTL_MAXDGRAM_DESC "Maximum outgoing SCTP buffer size"
+#define SCTPCTL_MAXDGRAM_MIN 0
+#define SCTPCTL_MAXDGRAM_MAX 0xFFFFFFFF
+#define SCTPCTL_MAXDGRAM_DEFAULT 262144 /* 256k */
+
+/* recvspace: Maximum incoming SCTP buffer size */
+#define SCTPCTL_RECVSPACE_DESC "Maximum incoming SCTP buffer size"
+#define SCTPCTL_RECVSPACE_MIN 0
+#define SCTPCTL_RECVSPACE_MAX 0xFFFFFFFF
+#define SCTPCTL_RECVSPACE_DEFAULT 262144 /* 256k */
+
+/* autoasconf: Enable SCTP Auto-ASCONF */
+#define SCTPCTL_AUTOASCONF_DESC "Enable SCTP Auto-ASCONF"
+#define SCTPCTL_AUTOASCONF_MIN 0
+#define SCTPCTL_AUTOASCONF_MAX 1
+#define SCTPCTL_AUTOASCONF_DEFAULT SCTP_DEFAULT_AUTO_ASCONF
+
+/* multiple_asconfs: Enable SCTP Multiple-ASCONFs */
+#define SCTPCTL_MULTIPLEASCONFS_DESC "Enable SCTP Multiple-ASCONFs"
+#define SCTPCTL_MULTIPLEASCONFS_MIN 0
+#define SCTPCTL_MULTIPLEASCONFS_MAX 1
+#define SCTPCTL_MULTIPLEASCONFS_DEFAULT SCTP_DEFAULT_MULTIPLE_ASCONFS
+
+/* ecn_enable: Enable SCTP ECN */
+#define SCTPCTL_ECN_ENABLE_DESC "Enable SCTP ECN"
+#define SCTPCTL_ECN_ENABLE_MIN 0
+#define SCTPCTL_ECN_ENABLE_MAX 1
+#define SCTPCTL_ECN_ENABLE_DEFAULT 1
+
+/* ecn_nonce: Enable SCTP ECN Nonce */
+#define SCTPCTL_ECN_NONCE_DESC "Enable SCTP ECN Nonce"
+#define SCTPCTL_ECN_NONCE_MIN 0
+#define SCTPCTL_ECN_NONCE_MAX 1
+#define SCTPCTL_ECN_NONCE_DEFAULT 0
+
+/* strict_sacks: Enable SCTP Strict SACK checking */
+#define SCTPCTL_STRICT_SACKS_DESC "Enable SCTP Strict SACK checking"
+#define SCTPCTL_STRICT_SACKS_MIN 0
+#define SCTPCTL_STRICT_SACKS_MAX 1
+#define SCTPCTL_STRICT_SACKS_DEFAULT 1
+
+/* loopback_nocsum: Enable NO Csum on packets sent on loopback */
+#define SCTPCTL_LOOPBACK_NOCSUM_DESC "Enable NO Csum on packets sent on loopback"
+#define SCTPCTL_LOOPBACK_NOCSUM_MIN 0
+#define SCTPCTL_LOOPBACK_NOCSUM_MAX 1
+#define SCTPCTL_LOOPBACK_NOCSUM_DEFAULT 1
+
+/* strict_init: Enable strict INIT/INIT-ACK singleton enforcement */
+#define SCTPCTL_STRICT_INIT_DESC "Enable strict INIT/INIT-ACK singleton enforcement"
+#define SCTPCTL_STRICT_INIT_MIN 0
+#define SCTPCTL_STRICT_INIT_MAX 1
+#define SCTPCTL_STRICT_INIT_DEFAULT 1
+
+/* peer_chkoh: Amount to debit peers rwnd per chunk sent */
+#define SCTPCTL_PEER_CHKOH_DESC "Amount to debit peers rwnd per chunk sent"
+#define SCTPCTL_PEER_CHKOH_MIN 0
+#define SCTPCTL_PEER_CHKOH_MAX 0xFFFFFFFF
+#define SCTPCTL_PEER_CHKOH_DEFAULT 256
+
+/* maxburst: Default max burst for sctp endpoints */
+#define SCTPCTL_MAXBURST_DESC "Default max burst for sctp endpoints"
+#define SCTPCTL_MAXBURST_MIN 1
+#define SCTPCTL_MAXBURST_MAX 0xFFFFFFFF
+#define SCTPCTL_MAXBURST_DEFAULT SCTP_DEF_MAX_BURST
+
+/* maxchunks: Default max chunks on queue per asoc */
+#define SCTPCTL_MAXCHUNKS_DESC "Default max chunks on queue per asoc"
+#define SCTPCTL_MAXCHUNKS_MIN 0
+#define SCTPCTL_MAXCHUNKS_MAX 0xFFFFFFFF
+#define SCTPCTL_MAXCHUNKS_DEFAULT SCTP_ASOC_MAX_CHUNKS_ON_QUEUE
+
+/* tcbhashsize: Tuneable for Hash table sizes */
+#define SCTPCTL_TCBHASHSIZE_DESC "Tunable for TCB hash table sizes"
+#define SCTPCTL_TCBHASHSIZE_MIN 1
+#define SCTPCTL_TCBHASHSIZE_MAX 0xFFFFFFFF
+#define SCTPCTL_TCBHASHSIZE_DEFAULT SCTP_TCBHASHSIZE
+
+/* pcbhashsize: Tuneable for PCB Hash table sizes */
+#define SCTPCTL_PCBHASHSIZE_DESC "Tunable for PCB hash table sizes"
+#define SCTPCTL_PCBHASHSIZE_MIN 1
+#define SCTPCTL_PCBHASHSIZE_MAX 0xFFFFFFFF
+#define SCTPCTL_PCBHASHSIZE_DEFAULT SCTP_PCBHASHSIZE
+
+/* min_split_point: Minimum size when splitting a chunk */
+#define SCTPCTL_MIN_SPLIT_POINT_DESC "Minimum size when splitting a chunk"
+#define SCTPCTL_MIN_SPLIT_POINT_MIN 0
+#define SCTPCTL_MIN_SPLIT_POINT_MAX 0xFFFFFFFF
+#define SCTPCTL_MIN_SPLIT_POINT_DEFAULT SCTP_DEFAULT_SPLIT_POINT_MIN
+
+/* chunkscale: Tuneable for Scaling of number of chunks and messages */
+#define SCTPCTL_CHUNKSCALE_DESC "Tuneable for Scaling of number of chunks and messages"
+#define SCTPCTL_CHUNKSCALE_MIN 1
+#define SCTPCTL_CHUNKSCALE_MAX 0xFFFFFFFF
+#define SCTPCTL_CHUNKSCALE_DEFAULT SCTP_CHUNKQUEUE_SCALE
+
+/* delayed_sack_time: Default delayed SACK timer in msec */
+#define SCTPCTL_DELAYED_SACK_TIME_DESC "Default delayed SACK timer in msec"
+#define SCTPCTL_DELAYED_SACK_TIME_MIN 0
+#define SCTPCTL_DELAYED_SACK_TIME_MAX 0xFFFFFFFF
+#define SCTPCTL_DELAYED_SACK_TIME_DEFAULT SCTP_RECV_MSEC
+
+/* sack_freq: Default SACK frequency */
+#define SCTPCTL_SACK_FREQ_DESC "Default SACK frequency"
+#define SCTPCTL_SACK_FREQ_MIN 0
+#define SCTPCTL_SACK_FREQ_MAX 0xFFFFFFFF
+#define SCTPCTL_SACK_FREQ_DEFAULT SCTP_DEFAULT_SACK_FREQ
+
+/* sys_resource: Max number of cached resources in the system */
+#define SCTPCTL_SYS_RESOURCE_DESC "Max number of cached resources in the system"
+#define SCTPCTL_SYS_RESOURCE_MIN 0
+#define SCTPCTL_SYS_RESOURCE_MAX 0xFFFFFFFF
+#define SCTPCTL_SYS_RESOURCE_DEFAULT SCTP_DEF_SYSTEM_RESC_LIMIT
+
+/* asoc_resource: Max number of cached resources in an asoc */
+#define SCTPCTL_ASOC_RESOURCE_DESC "Max number of cached resources in an asoc"
+#define SCTPCTL_ASOC_RESOURCE_MIN 0
+#define SCTPCTL_ASOC_RESOURCE_MAX 0xFFFFFFFF
+#define SCTPCTL_ASOC_RESOURCE_DEFAULT SCTP_DEF_ASOC_RESC_LIMIT
+
+/* heartbeat_interval: Default heartbeat interval in msec */
+#define SCTPCTL_HEARTBEAT_INTERVAL_DESC "Default heartbeat interval in msec"
+#define SCTPCTL_HEARTBEAT_INTERVAL_MIN 0
+#define SCTPCTL_HEARTBEAT_INTERVAL_MAX 0xFFFFFFFF
+#define SCTPCTL_HEARTBEAT_INTERVAL_DEFAULT SCTP_HB_DEFAULT_MSEC
+
+/* pmtu_raise_time: Default PMTU raise timer in sec */
+#define SCTPCTL_PMTU_RAISE_TIME_DESC "Default PMTU raise timer in sec"
+#define SCTPCTL_PMTU_RAISE_TIME_MIN 0
+#define SCTPCTL_PMTU_RAISE_TIME_MAX 0xFFFFFFFF
+#define SCTPCTL_PMTU_RAISE_TIME_DEFAULT SCTP_DEF_PMTU_RAISE_SEC
+
+/* shutdown_guard_time: Default shutdown guard timer in sec */
+#define SCTPCTL_SHUTDOWN_GUARD_TIME_DESC "Default shutdown guard timer in sec"
+#define SCTPCTL_SHUTDOWN_GUARD_TIME_MIN 0
+#define SCTPCTL_SHUTDOWN_GUARD_TIME_MAX 0xFFFFFFFF
+#define SCTPCTL_SHUTDOWN_GUARD_TIME_DEFAULT SCTP_DEF_MAX_SHUTDOWN_SEC
+
+/* secret_lifetime: Default secret lifetime in sec */
+#define SCTPCTL_SECRET_LIFETIME_DESC "Default secret lifetime in sec"
+#define SCTPCTL_SECRET_LIFETIME_MIN 0
+#define SCTPCTL_SECRET_LIFETIME_MAX 0xFFFFFFFF
+#define SCTPCTL_SECRET_LIFETIME_DEFAULT SCTP_DEFAULT_SECRET_LIFE_SEC
+
+/* rto_max: Default maximum retransmission timeout in msec */
+#define SCTPCTL_RTO_MAX_DESC "Default maximum retransmission timeout in msec"
+#define SCTPCTL_RTO_MAX_MIN 0
+#define SCTPCTL_RTO_MAX_MAX 0xFFFFFFFF
+#define SCTPCTL_RTO_MAX_DEFAULT SCTP_RTO_UPPER_BOUND
+
+/* rto_min: Default minimum retransmission timeout in msec */
+#define SCTPCTL_RTO_MIN_DESC "Default minimum retransmission timeout in msec"
+#define SCTPCTL_RTO_MIN_MIN 0
+#define SCTPCTL_RTO_MIN_MAX 0xFFFFFFFF
+#define SCTPCTL_RTO_MIN_DEFAULT SCTP_RTO_LOWER_BOUND
+
+/* rto_initial: Default initial retransmission timeout in msec */
+#define SCTPCTL_RTO_INITIAL_DESC "Default initial retransmission timeout in msec"
+#define SCTPCTL_RTO_INITIAL_MIN 0
+#define SCTPCTL_RTO_INITIAL_MAX 0xFFFFFFFF
+#define SCTPCTL_RTO_INITIAL_DEFAULT SCTP_RTO_INITIAL
+
+/* init_rto_max: Default maximum retransmission timeout during association setup in msec */
+#define SCTPCTL_INIT_RTO_MAX_DESC "Default maximum retransmission timeout during association setup in msec"
+#define SCTPCTL_INIT_RTO_MAX_MIN 0
+#define SCTPCTL_INIT_RTO_MAX_MAX 0xFFFFFFFF
+#define SCTPCTL_INIT_RTO_MAX_DEFAULT SCTP_RTO_UPPER_BOUND
+
+/* valid_cookie_life: Default cookie lifetime in sec */
+#define SCTPCTL_VALID_COOKIE_LIFE_DESC "Default cookie lifetime in sec"
+#define SCTPCTL_VALID_COOKIE_LIFE_MIN 0
+#define SCTPCTL_VALID_COOKIE_LIFE_MAX 0xFFFFFFFF
+#define SCTPCTL_VALID_COOKIE_LIFE_DEFAULT SCTP_DEFAULT_COOKIE_LIFE
+
+/* init_rtx_max: Default maximum number of retransmission for INIT chunks */
+#define SCTPCTL_INIT_RTX_MAX_DESC "Default maximum number of retransmission for INIT chunks"
+#define SCTPCTL_INIT_RTX_MAX_MIN 0
+#define SCTPCTL_INIT_RTX_MAX_MAX 0xFFFFFFFF
+#define SCTPCTL_INIT_RTX_MAX_DEFAULT SCTP_DEF_MAX_INIT
+
+/* assoc_rtx_max: Default maximum number of retransmissions per association */
+#define SCTPCTL_ASSOC_RTX_MAX_DESC "Default maximum number of retransmissions per association"
+#define SCTPCTL_ASSOC_RTX_MAX_MIN 0
+#define SCTPCTL_ASSOC_RTX_MAX_MAX 0xFFFFFFFF
+#define SCTPCTL_ASSOC_RTX_MAX_DEFAULT SCTP_DEF_MAX_SEND
+
+/* path_rtx_max: Default maximum of retransmissions per path */
+#define SCTPCTL_PATH_RTX_MAX_DESC "Default maximum of retransmissions per path"
+#define SCTPCTL_PATH_RTX_MAX_MIN 0
+#define SCTPCTL_PATH_RTX_MAX_MAX 0xFFFFFFFF
+#define SCTPCTL_PATH_RTX_MAX_DEFAULT SCTP_DEF_MAX_PATH_RTX
+
+/* add_more_on_output: When space wise is it worthwhile to try to add more to a socket send buffer */
+#define SCTPCTL_ADD_MORE_ON_OUTPUT_DESC "When space wise is it worthwhile to try to add more to a socket send buffer"
+#define SCTPCTL_ADD_MORE_ON_OUTPUT_MIN 0
+#define SCTPCTL_ADD_MORE_ON_OUTPUT_MAX 0xFFFFFFFF
+#define SCTPCTL_ADD_MORE_ON_OUTPUT_DEFAULT SCTP_DEFAULT_ADD_MORE
+
+/* outgoing_streams: Default number of outgoing streams */
+#define SCTPCTL_OUTGOING_STREAMS_DESC "Default number of outgoing streams"
+#define SCTPCTL_OUTGOING_STREAMS_MIN 1
+#define SCTPCTL_OUTGOING_STREAMS_MAX 65535
+#define SCTPCTL_OUTGOING_STREAMS_DEFAULT SCTP_OSTREAM_INITIAL
+
+/* cmt_on_off: CMT on/off flag */
+#define SCTPCTL_CMT_ON_OFF_DESC "CMT on/off flag"
+#define SCTPCTL_CMT_ON_OFF_MIN 0
+#define SCTPCTL_CMT_ON_OFF_MAX 1
+#define SCTPCTL_CMT_ON_OFF_DEFAULT 0
+
+/* EY - nr_sack_on_off: NR_SACK on/off flag */
+#define SCTPCTL_NR_SACK_ON_OFF_DESC "NR_SACK on/off flag"
+#define SCTPCTL_NR_SACK_ON_OFF_MIN 0
+#define SCTPCTL_NR_SACK_ON_OFF_MAX 1
+#define SCTPCTL_NR_SACK_ON_OFF_DEFAULT 0
+
+/* cmt_use_dac: CMT DAC on/off flag */
+#define SCTPCTL_CMT_USE_DAC_DESC "CMT DAC on/off flag"
+#define SCTPCTL_CMT_USE_DAC_MIN 0
+#define SCTPCTL_CMT_USE_DAC_MAX 1
+#define SCTPCTL_CMT_USE_DAC_DEFAULT 0
+
+/* JRS 5/2007 - CMT PF type flag */
+#define SCTPCTL_CMT_PF_DESC "CMT PF type flag"
+#define SCTPCTL_CMT_PF_MIN 0
+#define SCTPCTL_CMT_PF_MAX 2
+#define SCTPCTL_CMT_PF_DEFAULT 0
+
+/* cwnd_maxburst: Use a CWND adjusting maxburst */
+#define SCTPCTL_CWND_MAXBURST_DESC "Use a CWND adjusting maxburst"
+#define SCTPCTL_CWND_MAXBURST_MIN 0
+#define SCTPCTL_CWND_MAXBURST_MAX 1
+#define SCTPCTL_CWND_MAXBURST_DEFAULT 1
+
+/* early_fast_retran: Early Fast Retransmit with timer */
+#define SCTPCTL_EARLY_FAST_RETRAN_DESC "Early Fast Retransmit with timer"
+#define SCTPCTL_EARLY_FAST_RETRAN_MIN 0
+#define SCTPCTL_EARLY_FAST_RETRAN_MAX 0xFFFFFFFF
+#define SCTPCTL_EARLY_FAST_RETRAN_DEFAULT 0
+
+/* early_fast_retran_msec: Early Fast Retransmit minimum timer value */
+#define SCTPCTL_EARLY_FAST_RETRAN_MSEC_DESC "Early Fast Retransmit minimum timer value"
+#define SCTPCTL_EARLY_FAST_RETRAN_MSEC_MIN 0
+#define SCTPCTL_EARLY_FAST_RETRAN_MSEC_MAX 0xFFFFFFFF
+#define SCTPCTL_EARLY_FAST_RETRAN_MSEC_DEFAULT SCTP_MINFR_MSEC_TIMER
+
+/* asconf_auth_nochk: Disable SCTP ASCONF AUTH requirement */
+#define SCTPCTL_ASCONF_AUTH_NOCHK_DESC "Disable SCTP ASCONF AUTH requirement"
+#define SCTPCTL_ASCONF_AUTH_NOCHK_MIN 0
+#define SCTPCTL_ASCONF_AUTH_NOCHK_MAX 1
+#define SCTPCTL_ASCONF_AUTH_NOCHK_DEFAULT 0
+
+/* auth_disable: Disable SCTP AUTH function */
+#define SCTPCTL_AUTH_DISABLE_DESC "Disable SCTP AUTH function"
+#define SCTPCTL_AUTH_DISABLE_MIN 0
+#define SCTPCTL_AUTH_DISABLE_MAX 1
+#define SCTPCTL_AUTH_DISABLE_DEFAULT 0
+
+/* nat_friendly: SCTP NAT friendly operation */
+#define SCTPCTL_NAT_FRIENDLY_DESC "SCTP NAT friendly operation"
+#define SCTPCTL_NAT_FRIENDLY_MIN 0
+#define SCTPCTL_NAT_FRIENDLY_MAX 1
+#define SCTPCTL_NAT_FRIENDLY_DEFAULT 1
+
+/* abc_l_var: SCTP ABC max increase per SACK (L) */
+#define SCTPCTL_ABC_L_VAR_DESC "SCTP ABC max increase per SACK (L)"
+#define SCTPCTL_ABC_L_VAR_MIN 0
+#define SCTPCTL_ABC_L_VAR_MAX 0xFFFFFFFF
+#define SCTPCTL_ABC_L_VAR_DEFAULT 1
+
+/* max_chained_mbufs: Default max number of small mbufs on a chain */
+#define SCTPCTL_MAX_CHAINED_MBUFS_DESC "Default max number of small mbufs on a chain"
+#define SCTPCTL_MAX_CHAINED_MBUFS_MIN 0
+#define SCTPCTL_MAX_CHAINED_MBUFS_MAX 0xFFFFFFFF
+#define SCTPCTL_MAX_CHAINED_MBUFS_DEFAULT SCTP_DEFAULT_MBUFS_IN_CHAIN
+
+/* do_sctp_drain: Should SCTP respond to the drain calls */
+#define SCTPCTL_DO_SCTP_DRAIN_DESC "Should SCTP respond to the drain calls"
+#define SCTPCTL_DO_SCTP_DRAIN_MIN 0
+#define SCTPCTL_DO_SCTP_DRAIN_MAX 1
+#define SCTPCTL_DO_SCTP_DRAIN_DEFAULT 1
+
+/* hb_max_burst: Confirmation Heartbeat max burst */
+#define SCTPCTL_HB_MAX_BURST_DESC "Confirmation Heartbeat max burst"
+#define SCTPCTL_HB_MAX_BURST_MIN 1
+#define SCTPCTL_HB_MAX_BURST_MAX 0xFFFFFFFF
+#define SCTPCTL_HB_MAX_BURST_DEFAULT SCTP_DEF_MAX_BURST
+
+/* abort_at_limit: When one-2-one hits qlimit abort */
+#define SCTPCTL_ABORT_AT_LIMIT_DESC "When one-2-one hits qlimit abort"
+#define SCTPCTL_ABORT_AT_LIMIT_MIN 0
+#define SCTPCTL_ABORT_AT_LIMIT_MAX 1
+#define SCTPCTL_ABORT_AT_LIMIT_DEFAULT 0
+
+/* strict_data_order: Enforce strict data ordering, abort if control inside data */
+#define SCTPCTL_STRICT_DATA_ORDER_DESC "Enforce strict data ordering, abort if control inside data"
+#define SCTPCTL_STRICT_DATA_ORDER_MIN 0
+#define SCTPCTL_STRICT_DATA_ORDER_MAX 1
+#define SCTPCTL_STRICT_DATA_ORDER_DEFAULT 0
+
+/* min_residual: minimum residual data chunk in the second part of a split */
+#define SCTPCTL_MIN_RESIDUAL_DESC "Minimum residual data chunk in second part of split"
+#define SCTPCTL_MIN_RESIDUAL_MIN 20
+#define SCTPCTL_MIN_RESIDUAL_MAX 65535
+#define SCTPCTL_MIN_RESIDUAL_DEFAULT 1452
+
+/* max_retran_chunk: max chunk retransmissions */
+#define SCTPCTL_MAX_RETRAN_CHUNK_DESC "Maximum times an unlucky chunk can be retran'd before assoc abort"
+#define SCTPCTL_MAX_RETRAN_CHUNK_MIN 0
+#define SCTPCTL_MAX_RETRAN_CHUNK_MAX 65535
+#define SCTPCTL_MAX_RETRAN_CHUNK_DEFAULT 30
+
+/* sctp_logging: This gives us logging when the options are enabled */
+#define SCTPCTL_LOGGING_LEVEL_DESC "Ltrace/KTR trace logging level"
+#define SCTPCTL_LOGGING_LEVEL_MIN 0
+#define SCTPCTL_LOGGING_LEVEL_MAX 0xffffffff
+#define SCTPCTL_LOGGING_LEVEL_DEFAULT 0
+
+/* JRS - default congestion control module sysctl */
+#define SCTPCTL_DEFAULT_CC_MODULE_DESC "Default congestion control module"
+#define SCTPCTL_DEFAULT_CC_MODULE_MIN 0
+#define SCTPCTL_DEFAULT_CC_MODULE_MAX 2
+#define SCTPCTL_DEFAULT_CC_MODULE_DEFAULT 0
+
+/* RRS - default fragment interleave */
+#define SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DESC "Default fragment interleave level"
+#define SCTPCTL_DEFAULT_FRAG_INTERLEAVE_MIN 0
+#define SCTPCTL_DEFAULT_FRAG_INTERLEAVE_MAX 2
+#define SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DEFAULT 1
+
+/* mobility_base: Enable SCTP mobility support */
+#define SCTPCTL_MOBILITY_BASE_DESC "Enable SCTP base mobility"
+#define SCTPCTL_MOBILITY_BASE_MIN 0
+#define SCTPCTL_MOBILITY_BASE_MAX 1
+#define SCTPCTL_MOBILITY_BASE_DEFAULT SCTP_DEFAULT_MOBILITY_BASE
+
+/* mobility_fasthandoff: Enable SCTP fast handoff support */
+#define SCTPCTL_MOBILITY_FASTHANDOFF_DESC "Enable SCTP fast handoff"
+#define SCTPCTL_MOBILITY_FASTHANDOFF_MIN 0
+#define SCTPCTL_MOBILITY_FASTHANDOFF_MAX 1
+#define SCTPCTL_MOBILITY_FASTHANDOFF_DEFAULT SCTP_DEFAULT_MOBILITY_FASTHANDOFF
+
+/* Enable SCTP/UDP tunneling for clients */
+#define SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_DESC "Enable SCTP/UDP tunneling for client"
+#define SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_MIN 0
+#define SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_MAX 1
+#define SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_DEFAULT SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_MIN
+
+/* Set the SCTP/UDP tunneling port */
+#define SCTPCTL_UDP_TUNNELING_PORT_DESC "Set the SCTP/UDP tunneling port"
+#define SCTPCTL_UDP_TUNNELING_PORT_MIN 0
+#define SCTPCTL_UDP_TUNNELING_PORT_MAX 65535
+#define SCTPCTL_UDP_TUNNELING_PORT_DEFAULT SCTP_OVER_UDP_TUNNELING_PORT
+
+/* Enable sending of the SACK-IMMEDIATELY bit */
+#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_DESC "Enable sending of the SACK-IMMEDIATELY-bit."
+#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_MIN 0
+#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_MAX 1
+#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_DEFAULT SCTPCTL_SACK_IMMEDIATELY_ENABLE_MIN
+
+/* Enable sending of the NAT-FRIENDLY message */
+#define SCTPCTL_NAT_FRIENDLY_INITS_DESC "Enable sending of the nat-friendly SCTP option on INITs."
+#define SCTPCTL_NAT_FRIENDLY_INITS_MIN 0
+#define SCTPCTL_NAT_FRIENDLY_INITS_MAX 1
+#define SCTPCTL_NAT_FRIENDLY_INITS_DEFAULT SCTPCTL_NAT_FRIENDLY_INITS_MIN
+
+/* Vtag time wait in seconds */
+#define SCTPCTL_TIME_WAIT_DESC "Vtag time wait time in seconds, 0 disables it."
+#define SCTPCTL_TIME_WAIT_MIN 0
+#define SCTPCTL_TIME_WAIT_MAX 0xffffffff
+#define SCTPCTL_TIME_WAIT_DEFAULT SCTP_TIME_WAIT
+
+/* Enable Send/Receive buffer splitting */
+#define SCTPCTL_BUFFER_SPLITTING_DESC "Enable send/receive buffer splitting."
+#define SCTPCTL_BUFFER_SPLITTING_MIN 0
+#define SCTPCTL_BUFFER_SPLITTING_MAX 0x3
+#define SCTPCTL_BUFFER_SPLITTING_DEFAULT SCTPCTL_BUFFER_SPLITTING_MIN
+
+/* Initial congestion window in MTUs */
+#define SCTPCTL_INITIAL_CWND_DESC "Initial congestion window in MTUs"
+#define SCTPCTL_INITIAL_CWND_MIN 1
+#define SCTPCTL_INITIAL_CWND_MAX 0xffffffff
+#define SCTPCTL_INITIAL_CWND_DEFAULT 3
+
+#if defined(SCTP_DEBUG)
+/* debug: Configure debug output */
+#define SCTPCTL_DEBUG_DESC "Configure debug output"
+#define SCTPCTL_DEBUG_MIN 0
+#define SCTPCTL_DEBUG_MAX 0xFFFFFFFF
+#define SCTPCTL_DEBUG_DEFAULT 0
+#endif
+
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+#define SCTPCTL_OUTPUT_UNLOCKED_DESC "Unlock socket when sending packets down to IP."
+#define SCTPCTL_OUTPUT_UNLOCKED_MIN 0
+#define SCTPCTL_OUTPUT_UNLOCKED_MAX 1
+#define SCTPCTL_OUTPUT_UNLOCKED_DEFAULT SCTPCTL_OUTPUT_UNLOCKED_MIN
+#endif
+
+
+#if defined(_KERNEL) || defined(__Userspace__)
+#if defined(SYSCTL_DECL)
+SYSCTL_DECL(_net_inet_sctp);
+#endif
+
+void sctp_init_sysctls(void);
+
+#endif /* _KERNEL || __Userspace__ */
+#endif /* __sctp_sysctl_h__ */
diff --git a/freebsd/sys/netinet/sctp_timer.c b/freebsd/sys/netinet/sctp_timer.c
new file mode 100644
index 00000000..090689b1
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_timer.c
@@ -0,0 +1,1804 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_timer.c,v 1.29 2005/03/06 16:04:18 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define _IP_VHL
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#ifdef INET6
+#endif
+#include <freebsd/netinet/sctp_var.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+#include <freebsd/netinet/sctp_timer.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_output.h>
+#include <freebsd/netinet/sctp_header.h>
+#include <freebsd/netinet/sctp_indata.h>
+#include <freebsd/netinet/sctp_asconf.h>
+#include <freebsd/netinet/sctp_input.h>
+#include <freebsd/netinet/sctp.h>
+#include <freebsd/netinet/sctp_uio.h>
+#include <freebsd/netinet/udp.h>
+
+
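+/*
+ * Early Fast Retransmit timer for a destination: mark chunks on the sent
+ * queue for this net that have been outstanding longer than the current
+ * RTO (bounded below by sctp_early_fr_msec) for retransmission, limited by
+ * what cwnd allows, then let the congestion control module react (or kick
+ * the output path if resends are already pending) and restart the timer
+ * while the flight size stays below cwnd.
+ */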
+void
+sctp_early_fr_timer(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ struct sctp_tmit_chunk *chk, *tp2;
+ struct timeval now, min_wait, tv;
+ unsigned int cur_rtt, cnt = 0, cnt_resend = 0;
+
+ /* an early FR is occurring. */
+ (void)SCTP_GETTIME_TIMEVAL(&now);
+ /* get cur rto in micro-seconds */
+ if (net->lastsa == 0) {
+ /* Hmm no rtt estimate yet? */
+ cur_rtt = stcb->asoc.initial_rto >> 2;
+ } else {
+
+ cur_rtt = ((net->lastsa >> 2) + net->lastsv) >> 1;
+ }
+ if (cur_rtt < SCTP_BASE_SYSCTL(sctp_early_fr_msec)) {
+ cur_rtt = SCTP_BASE_SYSCTL(sctp_early_fr_msec);
+ }
+ cur_rtt *= 1000;
+ tv.tv_sec = cur_rtt / 1000000;
+ tv.tv_usec = cur_rtt % 1000000;
+ min_wait = now;
+ timevalsub(&min_wait, &tv);
+ if (min_wait.tv_sec < 0 || min_wait.tv_usec < 0) {
+ /*
+ * if we hit here, we don't have enough seconds on the clock
+ * to account for the RTO. We just let the lower seconds be
+ * the bounds and don't worry about it. This may mean we
+ * will mark a lot more than we should.
+ */
+ min_wait.tv_sec = min_wait.tv_usec = 0;
+ }
+ chk = TAILQ_LAST(&stcb->asoc.sent_queue, sctpchunk_listhead);
+ for (; chk != NULL; chk = tp2) {
+ tp2 = TAILQ_PREV(chk, sctpchunk_listhead, sctp_next);
+ if (chk->whoTo != net) {
+ continue;
+ }
+ if (chk->sent == SCTP_DATAGRAM_RESEND)
+ cnt_resend++;
+ else if ((chk->sent > SCTP_DATAGRAM_UNSENT) &&
+ (chk->sent < SCTP_DATAGRAM_RESEND)) {
+ /* pending, may need retran */
+ if (chk->sent_rcv_time.tv_sec > min_wait.tv_sec) {
+ /*
+ * we have reached a chunk that was sent
+ * some seconds past our min; forget it, we
+ * will find no more to send.
+ */
+ continue;
+ } else if (chk->sent_rcv_time.tv_sec == min_wait.tv_sec) {
+ /*
+ * we must look at the micro seconds to
+ * know.
+ */
+ if (chk->sent_rcv_time.tv_usec >= min_wait.tv_usec) {
+ /*
+ * ok it was sent after our boundary
+ * time.
+ */
+ continue;
+ }
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_EARLYFR_LOGGING_ENABLE) {
+ sctp_log_fr(chk->rec.data.TSN_seq, chk->snd_count,
+ 4, SCTP_FR_MARKED_EARLY);
+ }
+ SCTP_STAT_INCR(sctps_earlyfrmrkretrans);
+ chk->sent = SCTP_DATAGRAM_RESEND;
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ /* double book size since we are doing an early FR */
+ chk->book_size_scale++;
+ cnt += chk->send_size;
+ if ((cnt + net->flight_size) > net->cwnd) {
+ /* Mark all we could possibly resend */
+ break;
+ }
+ }
+ }
+ if (cnt) {
+ /*
+ * JRS - Use the congestion control given in the congestion
+ * control module
+ */
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer(inp, stcb, net);
+ } else if (cnt_resend) {
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_EARLY_FR_TMR, SCTP_SO_NOT_LOCKED);
+ }
+ /* Restart it? */
+ if (net->flight_size < net->cwnd) {
+ SCTP_STAT_INCR(sctps_earlyfrstrtmr);
+ sctp_timer_start(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net);
+ }
+}
+
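+/*
+ * Recompute sent_queue_cnt from the sent queue and sent_queue_retran_cnt
+ * from the sent, control and asconf send queues; used to re-sync the
+ * counters when they are suspected to be wrong.
+ */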
+void
+sctp_audit_retranmission_queue(struct sctp_association *asoc)
+{
+ struct sctp_tmit_chunk *chk;
+
+ SCTPDBG(SCTP_DEBUG_TIMER4, "Audit invoked on send queue cnt:%d onqueue:%d\n",
+ asoc->sent_queue_retran_cnt,
+ asoc->sent_queue_cnt);
+ asoc->sent_queue_retran_cnt = 0;
+ asoc->sent_queue_cnt = 0;
+ TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
+ if (chk->sent == SCTP_DATAGRAM_RESEND) {
+ sctp_ucount_incr(asoc->sent_queue_retran_cnt);
+ }
+ asoc->sent_queue_cnt++;
+ }
+ TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
+ if (chk->sent == SCTP_DATAGRAM_RESEND) {
+ sctp_ucount_incr(asoc->sent_queue_retran_cnt);
+ }
+ }
+ TAILQ_FOREACH(chk, &asoc->asconf_send_queue, sctp_next) {
+ if (chk->sent == SCTP_DATAGRAM_RESEND) {
+ sctp_ucount_incr(asoc->sent_queue_retran_cnt);
+ }
+ }
+ SCTPDBG(SCTP_DEBUG_TIMER4, "Audit completes retran:%d onqueue:%d\n",
+ asoc->sent_queue_retran_cnt,
+ asoc->sent_queue_cnt);
+}
+
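+/*
+ * Bump the error counters for the given net (if any) and for the whole
+ * association, mark the destination unreachable once its failure threshold
+ * is crossed, and abort the association once the overall error count
+ * exceeds the supplied threshold. Returns 1 if the association was
+ * destroyed, 0 otherwise.
+ */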
+int
+sctp_threshold_management(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net, uint16_t threshold)
+{
+ if (net) {
+ net->error_count++;
+ SCTPDBG(SCTP_DEBUG_TIMER4, "Error count for %p now %d thresh:%d\n",
+ net, net->error_count,
+ net->failure_threshold);
+ if (net->error_count > net->failure_threshold) {
+ /* We had a threshold failure */
+ if (net->dest_state & SCTP_ADDR_REACHABLE) {
+ net->dest_state &= ~SCTP_ADDR_REACHABLE;
+ net->dest_state |= SCTP_ADDR_NOT_REACHABLE;
+ net->dest_state &= ~SCTP_ADDR_REQ_PRIMARY;
+ if (net == stcb->asoc.primary_destination) {
+ net->dest_state |= SCTP_ADDR_WAS_PRIMARY;
+ }
+ /*
+ * JRS 5/14/07 - If a destination is
+ * unreachable, the PF bit is turned off.
+ * This allows an unambiguous use of the PF
+ * bit for destinations that are reachable
+ * but potentially failed. If the
+ * destination is set to the unreachable
+ * state, also set the destination to the PF
+ * state.
+ */
+ /*
+ * Add debug message here if destination is
+ * not in PF state.
+ */
+ /* Stop any running T3 timers here? */
+ if ((stcb->asoc.sctp_cmt_on_off == 1) &&
+ (stcb->asoc.sctp_cmt_pf > 0)) {
+ net->dest_state &= ~SCTP_ADDR_PF;
+ SCTPDBG(SCTP_DEBUG_TIMER4, "Destination %p moved from PF to unreachable.\n",
+ net);
+ }
+ sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN,
+ stcb,
+ SCTP_FAILED_THRESHOLD,
+ (void *)net, SCTP_SO_NOT_LOCKED);
+ }
+ }
+ /*********HOLD THIS COMMENT FOR PATCH OF ALTERNATE
+ *********ROUTING CODE
+ */
+ /*********HOLD THIS COMMENT FOR END OF PATCH OF ALTERNATE
+ *********ROUTING CODE
+ */
+ }
+ if (stcb == NULL)
+ return (0);
+
+ if (net) {
+ if ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_INCR,
+ stcb->asoc.overall_error_count,
+ (stcb->asoc.overall_error_count + 1),
+ SCTP_FROM_SCTP_TIMER,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count++;
+ }
+ } else {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+ sctp_misc_ints(SCTP_THRESHOLD_INCR,
+ stcb->asoc.overall_error_count,
+ (stcb->asoc.overall_error_count + 1),
+ SCTP_FROM_SCTP_TIMER,
+ __LINE__);
+ }
+ stcb->asoc.overall_error_count++;
+ }
+ SCTPDBG(SCTP_DEBUG_TIMER4, "Overall error count for %p now %d thresh:%u state:%x\n",
+ &stcb->asoc, stcb->asoc.overall_error_count,
+ (uint32_t) threshold,
+ ((net == NULL) ? (uint32_t) 0 : (uint32_t) net->dest_state));
+ /*
+ * We specifically do not do >= to give the assoc one more chance
+ * before we fail it.
+ */
+ if (stcb->asoc.overall_error_count > threshold) {
+ /* Abort notification sends a ULP notify */
+ struct mbuf *oper;
+
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
+ sizeof(uint32_t);
+ ph = mtod(oper, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length = htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_TIMER + SCTP_LOC_1);
+ }
+ inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_1;
+ sctp_abort_an_association(inp, stcb, SCTP_FAILED_THRESHOLD, oper, SCTP_SO_NOT_LOCKED);
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * sctp_find_alternate_net() returns a non-NULL pointer as long as
+ * the argument net is non-NULL.
+ */
+struct sctp_nets *
+sctp_find_alternate_net(struct sctp_tcb *stcb,
+ struct sctp_nets *net,
+ int mode)
+{
+ /* Find and return an alternate network if possible */
+ struct sctp_nets *alt, *mnet, *min_errors_net = NULL, *max_cwnd_net = NULL;
+ int once;
+
+ /* JRS 5/14/07 - Initialize min_errors to an impossible value. */
+ int min_errors = -1;
+ uint32_t max_cwnd = 0;
+
+ if (stcb->asoc.numnets == 1) {
+ /* No others but net */
+ return (TAILQ_FIRST(&stcb->asoc.nets));
+ }
+ /*
+ * JRS 5/14/07 - If mode is set to 2, use the CMT PF find alternate
+ * net algorithm. This algorithm chooses the active destination (not
+ * in PF state) with the largest cwnd value. If all destinations are
+ * in PF state, unreachable, or unconfirmed, choose the destination
+ * that is in PF state with the lowest error count. In case of a
+ * tie, choose the destination that was most recently active.
+ */
+ if (mode == 2) {
+ TAILQ_FOREACH(mnet, &stcb->asoc.nets, sctp_next) {
+ /*
+ * JRS 5/14/07 - If the destination is unreachable
+ * or unconfirmed, skip it.
+ */
+ if (((mnet->dest_state & SCTP_ADDR_REACHABLE) != SCTP_ADDR_REACHABLE) ||
+ (mnet->dest_state & SCTP_ADDR_UNCONFIRMED)) {
+ continue;
+ }
+ /*
+ * JRS 5/14/07 - If the destination is reachable
+ * but in PF state, compare the error count of the
+ * destination to the minimum error count seen thus
+ * far. Store the destination with the lower error
+ * count. If the error counts are equal, store the
+ * destination that was most recently active.
+ */
+ if (mnet->dest_state & SCTP_ADDR_PF) {
+ /*
+ * JRS 5/14/07 - If the destination under
+ * consideration is the current destination,
+ * work as if the error count is one higher.
+ * The actual error count will not be
+ * incremented until later in the t3
+ * handler.
+ */
+ if (mnet == net) {
+ if (min_errors == -1) {
+ min_errors = mnet->error_count + 1;
+ min_errors_net = mnet;
+ } else if (mnet->error_count + 1 < min_errors) {
+ min_errors = mnet->error_count + 1;
+ min_errors_net = mnet;
+ } else if (mnet->error_count + 1 == min_errors
+ && mnet->last_active > min_errors_net->last_active) {
+ min_errors_net = mnet;
+ min_errors = mnet->error_count + 1;
+ }
+ continue;
+ } else {
+ if (min_errors == -1) {
+ min_errors = mnet->error_count;
+ min_errors_net = mnet;
+ } else if (mnet->error_count < min_errors) {
+ min_errors = mnet->error_count;
+ min_errors_net = mnet;
+ } else if (mnet->error_count == min_errors
+ && mnet->last_active > min_errors_net->last_active) {
+ min_errors_net = mnet;
+ min_errors = mnet->error_count;
+ }
+ continue;
+ }
+ }
+ /*
+ * JRS 5/14/07 - If the destination is reachable and
+ * not in PF state, compare the cwnd of the
+ * destination to the highest cwnd seen thus far.
+ * Store the destination with the higher cwnd value.
+ * If the cwnd values are equal, randomly choose one
+ * of the two destinations.
+ */
+ if (max_cwnd < mnet->cwnd) {
+ max_cwnd_net = mnet;
+ max_cwnd = mnet->cwnd;
+ } else if (max_cwnd == mnet->cwnd) {
+ uint32_t rndval;
+ uint8_t this_random;
+
+ if (stcb->asoc.hb_random_idx > 3) {
+ rndval = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep);
+ memcpy(stcb->asoc.hb_random_values, &rndval, sizeof(stcb->asoc.hb_random_values));
+ this_random = stcb->asoc.hb_random_values[0];
+ stcb->asoc.hb_random_idx++;
+ stcb->asoc.hb_ect_randombit = 0;
+ } else {
+ this_random = stcb->asoc.hb_random_values[stcb->asoc.hb_random_idx];
+ stcb->asoc.hb_random_idx++;
+ stcb->asoc.hb_ect_randombit = 0;
+ }
+ if (this_random % 2 == 1) {
+ max_cwnd_net = mnet;
+ max_cwnd = mnet->cwnd; /* Useless? */
+ }
+ }
+ }
+ /*
+ * JRS 5/14/07 - After all destinations have been considered
+ * as alternates, check to see if there was some active
+ * destination (not in PF state). If not, check to see if
+ * there was some PF destination with the minimum number of
+ * errors. If not, return the original destination. If
+ * there is a min_errors_net, remove the PF flag from that
+ * destination, set the cwnd to one or two MTUs, and return
+ * the destination as an alt. If there was some active
+ * destination with a highest cwnd, return the destination
+ * as an alt.
+ */
+ if (max_cwnd_net == NULL) {
+ if (min_errors_net == NULL) {
+ return (net);
+ }
+ min_errors_net->dest_state &= ~SCTP_ADDR_PF;
+ min_errors_net->cwnd = min_errors_net->mtu * stcb->asoc.sctp_cmt_pf;
+ if (SCTP_OS_TIMER_PENDING(&min_errors_net->rxt_timer.timer)) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+ stcb, min_errors_net,
+ SCTP_FROM_SCTP_TIMER + SCTP_LOC_2);
+ }
+ SCTPDBG(SCTP_DEBUG_TIMER4, "Destination %p moved from PF to active with %d errors.\n",
+ min_errors_net, min_errors_net->error_count);
+ return (min_errors_net);
+ } else {
+ return (max_cwnd_net);
+ }
+ }
+ /*
+ * JRS 5/14/07 - If mode is set to 1, use the CMT policy for
+ * choosing an alternate net.
+ */
+ else if (mode == 1) {
+ TAILQ_FOREACH(mnet, &stcb->asoc.nets, sctp_next) {
+ if (((mnet->dest_state & SCTP_ADDR_REACHABLE) != SCTP_ADDR_REACHABLE) ||
+ (mnet->dest_state & SCTP_ADDR_UNCONFIRMED)) {
+ /*
+ * will skip ones that are not-reachable or
+ * unconfirmed
+ */
+ continue;
+ }
+ if (max_cwnd < mnet->cwnd) {
+ max_cwnd_net = mnet;
+ max_cwnd = mnet->cwnd;
+ } else if (max_cwnd == mnet->cwnd) {
+ uint32_t rndval;
+ uint8_t this_random;
+
+ if (stcb->asoc.hb_random_idx > 3) {
+ rndval = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep);
+ memcpy(stcb->asoc.hb_random_values, &rndval,
+ sizeof(stcb->asoc.hb_random_values));
+ this_random = stcb->asoc.hb_random_values[0];
+ stcb->asoc.hb_random_idx = 0;
+ stcb->asoc.hb_ect_randombit = 0;
+ } else {
+ this_random = stcb->asoc.hb_random_values[stcb->asoc.hb_random_idx];
+ stcb->asoc.hb_random_idx++;
+ stcb->asoc.hb_ect_randombit = 0;
+ }
+ if (this_random % 2) {
+ max_cwnd_net = mnet;
+ max_cwnd = mnet->cwnd;
+ }
+ }
+ }
+ if (max_cwnd_net) {
+ return (max_cwnd_net);
+ }
+ }
+ mnet = net;
+ once = 0;
+
+ if (mnet == NULL) {
+ mnet = TAILQ_FIRST(&stcb->asoc.nets);
+ if (mnet == NULL) {
+ return (NULL);
+ }
+ }
+ do {
+ alt = TAILQ_NEXT(mnet, sctp_next);
+ if (alt == NULL) {
+ once++;
+ if (once > 1) {
+ break;
+ }
+ alt = TAILQ_FIRST(&stcb->asoc.nets);
+ if (alt == NULL) {
+ return (NULL);
+ }
+ }
+ if (alt->ro.ro_rt == NULL) {
+ if (alt->ro._s_addr) {
+ sctp_free_ifa(alt->ro._s_addr);
+ alt->ro._s_addr = NULL;
+ }
+ alt->src_addr_selected = 0;
+ }
+ /* sa_ignore NO_NULL_CHK */
+ if (((alt->dest_state & SCTP_ADDR_REACHABLE) == SCTP_ADDR_REACHABLE) &&
+ (alt->ro.ro_rt != NULL) &&
+ (!(alt->dest_state & SCTP_ADDR_UNCONFIRMED))) {
+ /* Found a reachable address */
+ break;
+ }
+ mnet = alt;
+ } while (alt != NULL);
+
+ if (alt == NULL) {
+ /* Case where NO in-service network exists (dormant state) */
+ /* we rotate destinations */
+ once = 0;
+ mnet = net;
+ do {
+ if (mnet == NULL) {
+ return (TAILQ_FIRST(&stcb->asoc.nets));
+ }
+ alt = TAILQ_NEXT(mnet, sctp_next);
+ if (alt == NULL) {
+ once++;
+ if (once > 1) {
+ break;
+ }
+ alt = TAILQ_FIRST(&stcb->asoc.nets);
+ }
+ /* sa_ignore NO_NULL_CHK */
+ if ((!(alt->dest_state & SCTP_ADDR_UNCONFIRMED)) &&
+ (alt != net)) {
+ /* Found an alternate address */
+ break;
+ }
+ mnet = alt;
+ } while (alt != NULL);
+ }
+ if (alt == NULL) {
+ return (net);
+ }
+ return (alt);
+}
+
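+/*
+ * Double the net's RTO (seeded with minrto and bounded by maxrto) and,
+ * unless this was a window probe or nothing was marked or abandoned, let
+ * the congestion control module adjust cwnd after the timeout.
+ */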
+static void
+sctp_backoff_on_timeout(struct sctp_tcb *stcb,
+ struct sctp_nets *net,
+ int win_probe,
+ int num_marked, int num_abandoned)
+{
+ if (net->RTO == 0) {
+ net->RTO = stcb->asoc.minrto;
+ }
+ net->RTO <<= 1;
+ if (net->RTO > stcb->asoc.maxrto) {
+ net->RTO = stcb->asoc.maxrto;
+ }
+ if ((win_probe == 0) && (num_marked || num_abandoned)) {
+ /* We don't apply penalty to window probe scenarios */
+ /* JRS - Use the congestion control given in the CC module */
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout(stcb, net);
+ }
+}
+
+#ifndef INVARIANTS
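+/*
+ * Drop chunks from the sent queue whose TSN is at or below the cumulative
+ * ack point; called from sctp_mark_all_for_resend() when the sent queue is
+ * found to be out of order and INVARIANTS is not compiled in.
+ */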
+static void
+sctp_recover_sent_list(struct sctp_tcb *stcb)
+{
+ struct sctp_tmit_chunk *chk, *tp2;
+ struct sctp_association *asoc;
+
+ asoc = &stcb->asoc;
+ chk = TAILQ_FIRST(&stcb->asoc.sent_queue);
+ for (; chk != NULL; chk = tp2) {
+ tp2 = TAILQ_NEXT(chk, sctp_next);
+ if ((compare_with_wrap(stcb->asoc.last_acked_seq,
+ chk->rec.data.TSN_seq,
+ MAX_TSN)) ||
+ (stcb->asoc.last_acked_seq == chk->rec.data.TSN_seq)) {
+
+ SCTP_PRINTF("Found chk:%p tsn:%x <= last_acked_seq:%x\n",
+ chk, chk->rec.data.TSN_seq, stcb->asoc.last_acked_seq);
+ TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next);
+ if (chk->pr_sctp_on) {
+ if (asoc->pr_sctp_cnt != 0)
+ asoc->pr_sctp_cnt--;
+ }
+ if (chk->data) {
+ /* sa_ignore NO_NULL_CHK */
+ sctp_free_bufspace(stcb, asoc, chk, 1);
+ sctp_m_freem(chk->data);
+ if (asoc->peer_supports_prsctp && PR_SCTP_BUF_ENABLED(chk->flags)) {
+ asoc->sent_queue_cnt_removeable--;
+ }
+ }
+ chk->data = NULL;
+ asoc->sent_queue_cnt--;
+ sctp_free_a_chunk(stcb, chk);
+ }
+ }
+ SCTP_PRINTF("after recover order is as follows\n");
+ chk = TAILQ_FIRST(&stcb->asoc.sent_queue);
+ for (; chk != NULL; chk = tp2) {
+ tp2 = TAILQ_NEXT(chk, sctp_next);
+ SCTP_PRINTF("chk:%p TSN:%x\n", chk, chk->rec.data.TSN_seq);
+ }
+}
+
+#endif
+
+static int
+sctp_mark_all_for_resend(struct sctp_tcb *stcb,
+ struct sctp_nets *net,
+ struct sctp_nets *alt,
+ int window_probe,
+ int *num_marked,
+ int *num_abandoned)
+{
+
+ /*
+ * Mark all chunks (well not all) that were sent to *net for
+ * retransmission. Move them to alt for their destination as well...
+ * We only mark chunks that have been outstanding long enough to
+ * have received feedback.
+ */
+ struct sctp_tmit_chunk *chk, *tp2;
+ struct sctp_nets *lnets;
+ struct timeval now, min_wait, tv;
+ int cur_rtt;
+ int cnt_abandoned;
+ int audit_tf, num_mk, fir;
+ unsigned int cnt_mk;
+ uint32_t orig_flight, orig_tf;
+ uint32_t tsnlast, tsnfirst;
+ int recovery_cnt = 0;
+
+
+ /* none in flight now */
+ audit_tf = 0;
+ fir = 0;
+ /*
+ * figure out how long a data chunk must be pending before we can
+ * mark it.
+ */
+ (void)SCTP_GETTIME_TIMEVAL(&now);
+ /* get cur rto in micro-seconds */
+ cur_rtt = (((net->lastsa >> 2) + net->lastsv) >> 1);
+ cur_rtt *= 1000;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
+ sctp_log_fr(cur_rtt,
+ stcb->asoc.peers_rwnd,
+ window_probe,
+ SCTP_FR_T3_MARK_TIME);
+ sctp_log_fr(net->flight_size,
+ SCTP_OS_TIMER_PENDING(&net->fr_timer.timer),
+ SCTP_OS_TIMER_ACTIVE(&net->fr_timer.timer),
+ SCTP_FR_CWND_REPORT);
+ sctp_log_fr(net->flight_size, net->cwnd, stcb->asoc.total_flight, SCTP_FR_CWND_REPORT);
+ }
+ tv.tv_sec = cur_rtt / 1000000;
+ tv.tv_usec = cur_rtt % 1000000;
+ min_wait = now;
+ timevalsub(&min_wait, &tv);
+ if (min_wait.tv_sec < 0 || min_wait.tv_usec < 0) {
+ /*
+ * if we hit here, we don't have enough seconds on the clock
+ * to account for the RTO. We just let the lower seconds be
+ * the bounds and don't worry about it. This may mean we
+ * will mark a lot more than we should.
+ */
+ min_wait.tv_sec = min_wait.tv_usec = 0;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
+ sctp_log_fr(cur_rtt, now.tv_sec, now.tv_usec, SCTP_FR_T3_MARK_TIME);
+ sctp_log_fr(0, min_wait.tv_sec, min_wait.tv_usec, SCTP_FR_T3_MARK_TIME);
+ }
+ /*
+ * Our rwnd will be incorrect here since we are not adding back the
+ * cnt * mbuf but we will fix that down below.
+ */
+ orig_flight = net->flight_size;
+ orig_tf = stcb->asoc.total_flight;
+
+ net->fast_retran_ip = 0;
+ /* Now on to each chunk */
+ cnt_abandoned = 0;
+ num_mk = cnt_mk = 0;
+ tsnfirst = tsnlast = 0;
+#ifndef INVARIANTS
+start_again:
+#endif
+ chk = TAILQ_FIRST(&stcb->asoc.sent_queue);
+ for (; chk != NULL; chk = tp2) {
+ tp2 = TAILQ_NEXT(chk, sctp_next);
+ if ((compare_with_wrap(stcb->asoc.last_acked_seq,
+ chk->rec.data.TSN_seq,
+ MAX_TSN)) ||
+ (stcb->asoc.last_acked_seq == chk->rec.data.TSN_seq)) {
+ /* Strange case our list got out of order? */
+ SCTP_PRINTF("Our list is out of order? last_acked:%x chk:%x",
+ (unsigned int)stcb->asoc.last_acked_seq, (unsigned int)chk->rec.data.TSN_seq);
+ recovery_cnt++;
+#ifdef INVARIANTS
+ panic("last acked >= chk on sent-Q");
+#else
+ SCTP_PRINTF("Recover attempts a restart cnt:%d\n", recovery_cnt);
+ sctp_recover_sent_list(stcb);
+ if (recovery_cnt < 10) {
+ goto start_again;
+ } else {
+ SCTP_PRINTF("Recovery fails %d times??\n", recovery_cnt);
+ }
+#endif
+ }
+ if ((chk->whoTo == net) && (chk->sent < SCTP_DATAGRAM_ACKED)) {
+ /*
+ * found one to mark: If it is less than
+ * DATAGRAM_ACKED it MUST not be a skipped or marked
+ * TSN but instead one that is either already set
+ * for retransmission OR one that needs
+ * retransmission.
+ */
+
+ /* validate its been outstanding long enough */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
+ sctp_log_fr(chk->rec.data.TSN_seq,
+ chk->sent_rcv_time.tv_sec,
+ chk->sent_rcv_time.tv_usec,
+ SCTP_FR_T3_MARK_TIME);
+ }
+ if ((chk->sent_rcv_time.tv_sec > min_wait.tv_sec) && (window_probe == 0)) {
+ /*
+ * we have reached a chunk that was sent
+ * some seconds past our min; forget it, we
+ * will find no more to send.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
+ sctp_log_fr(0,
+ chk->sent_rcv_time.tv_sec,
+ chk->sent_rcv_time.tv_usec,
+ SCTP_FR_T3_STOPPED);
+ }
+ continue;
+ } else if ((chk->sent_rcv_time.tv_sec == min_wait.tv_sec) &&
+ (window_probe == 0)) {
+ /*
+ * we must look at the microseconds to
+ * know.
+ */
+ if (chk->sent_rcv_time.tv_usec >= min_wait.tv_usec) {
+ /*
+ * ok it was sent after our boundary
+ * time.
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
+ sctp_log_fr(0,
+ chk->sent_rcv_time.tv_sec,
+ chk->sent_rcv_time.tv_usec,
+ SCTP_FR_T3_STOPPED);
+ }
+ continue;
+ }
+ }
+ if (stcb->asoc.peer_supports_prsctp && PR_SCTP_TTL_ENABLED(chk->flags)) {
+ /* Is it expired? */
+ if (timevalcmp(&now, &chk->rec.data.timetodrop, >)) {
+ /* Yes so drop it */
+ if (chk->data) {
+ (void)sctp_release_pr_sctp_chunk(stcb,
+ chk,
+ (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
+ SCTP_SO_NOT_LOCKED);
+ cnt_abandoned++;
+ }
+ continue;
+ }
+ }
+ if (stcb->asoc.peer_supports_prsctp && PR_SCTP_RTX_ENABLED(chk->flags)) {
+ /* Has it been retransmitted tv_sec times? */
+ if (chk->snd_count > chk->rec.data.timetodrop.tv_sec) {
+ if (chk->data) {
+ (void)sctp_release_pr_sctp_chunk(stcb,
+ chk,
+ (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
+ SCTP_SO_NOT_LOCKED);
+ cnt_abandoned++;
+ }
+ continue;
+ }
+ }
+ if (chk->sent < SCTP_DATAGRAM_RESEND) {
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ num_mk++;
+ if (fir == 0) {
+ fir = 1;
+ tsnfirst = chk->rec.data.TSN_seq;
+ }
+ tsnlast = chk->rec.data.TSN_seq;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
+ sctp_log_fr(chk->rec.data.TSN_seq, chk->snd_count,
+ 0, SCTP_FR_T3_MARKED);
+ }
+ if (chk->rec.data.chunk_was_revoked) {
+ /* deflate the cwnd */
+ chk->whoTo->cwnd -= chk->book_size;
+ chk->rec.data.chunk_was_revoked = 0;
+ }
+ net->marked_retrans++;
+ stcb->asoc.marked_retrans++;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_RSND_TO,
+ chk->whoTo->flight_size,
+ chk->book_size,
+ (uintptr_t) chk->whoTo,
+ chk->rec.data.TSN_seq);
+ }
+ sctp_flight_size_decrease(chk);
+ sctp_total_flight_decrease(stcb, chk);
+ stcb->asoc.peers_rwnd += chk->send_size;
+ stcb->asoc.peers_rwnd += SCTP_BASE_SYSCTL(sctp_peer_chunk_oh);
+ }
+ chk->sent = SCTP_DATAGRAM_RESEND;
+ SCTP_STAT_INCR(sctps_markedretrans);
+
+ /* reset the TSN for striking and other FR stuff */
+ chk->rec.data.doing_fast_retransmit = 0;
+ /* Clear any time so NO RTT is being done */
+ chk->do_rtt = 0;
+ if (alt != net) {
+ sctp_free_remote_addr(chk->whoTo);
+ chk->no_fr_allowed = 1;
+ chk->whoTo = alt;
+ atomic_add_int(&alt->ref_count, 1);
+ } else {
+ chk->no_fr_allowed = 0;
+ if (TAILQ_EMPTY(&stcb->asoc.send_queue)) {
+ chk->rec.data.fast_retran_tsn = stcb->asoc.sending_seq;
+ } else {
+ chk->rec.data.fast_retran_tsn = (TAILQ_FIRST(&stcb->asoc.send_queue))->rec.data.TSN_seq;
+ }
+ }
+ /*
+ * CMT: Do not allow FRs on retransmitted TSNs.
+ */
+ if (stcb->asoc.sctp_cmt_on_off == 1) {
+ chk->no_fr_allowed = 1;
+ }
+#ifdef THIS_SHOULD_NOT_BE_DONE
+ } else if (chk->sent == SCTP_DATAGRAM_ACKED) {
+ /* remember highest acked one */
+ could_be_sent = chk;
+#endif
+ }
+ if (chk->sent == SCTP_DATAGRAM_RESEND) {
+ cnt_mk++;
+ }
+ }
+ if ((orig_flight - net->flight_size) != (orig_tf - stcb->asoc.total_flight)) {
+ /* we did not subtract the same things? */
+ audit_tf = 1;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
+ sctp_log_fr(tsnfirst, tsnlast, num_mk, SCTP_FR_T3_TIMEOUT);
+ }
+#ifdef SCTP_DEBUG
+ if (num_mk) {
+ SCTPDBG(SCTP_DEBUG_TIMER1, "LAST TSN marked was %x\n",
+ tsnlast);
+ SCTPDBG(SCTP_DEBUG_TIMER1, "Num marked for retransmission was %d peer-rwd:%ld\n",
+ num_mk, (u_long)stcb->asoc.peers_rwnd);
+ SCTPDBG(SCTP_DEBUG_TIMER1, "LAST TSN marked was %x\n",
+ tsnlast);
+ SCTPDBG(SCTP_DEBUG_TIMER1, "Num marked for retransmission was %d peer-rwd:%d\n",
+ num_mk,
+ (int)stcb->asoc.peers_rwnd);
+ }
+#endif
+ *num_marked = num_mk;
+ *num_abandoned = cnt_abandoned;
+ /*
+ * Now check for an ECN Echo that may be stranded, and include the
+ * cnt_mk'd to have all resends in the control queue.
+ */
+ TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) {
+ if (chk->sent == SCTP_DATAGRAM_RESEND) {
+ cnt_mk++;
+ }
+ if ((chk->whoTo == net) &&
+ (chk->rec.chunk_id.id == SCTP_ECN_ECHO)) {
+ sctp_free_remote_addr(chk->whoTo);
+ chk->whoTo = alt;
+ if (chk->sent != SCTP_DATAGRAM_RESEND) {
+ chk->sent = SCTP_DATAGRAM_RESEND;
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ cnt_mk++;
+ }
+ atomic_add_int(&alt->ref_count, 1);
+ }
+ }
+#ifdef THIS_SHOULD_NOT_BE_DONE
+ if ((stcb->asoc.sent_queue_retran_cnt == 0) && (could_be_sent)) {
+ /* fix it so we retransmit the highest acked anyway */
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ cnt_mk++;
+ could_be_sent->sent = SCTP_DATAGRAM_RESEND;
+ }
+#endif
+ if (stcb->asoc.sent_queue_retran_cnt != cnt_mk) {
+#ifdef INVARIANTS
+ SCTP_PRINTF("Local Audit says there are %d for retran asoc cnt:%d we marked:%d this time\n",
+ cnt_mk, stcb->asoc.sent_queue_retran_cnt, num_mk);
+#endif
+#ifndef SCTP_AUDITING_ENABLED
+ stcb->asoc.sent_queue_retran_cnt = cnt_mk;
+#endif
+ }
+ if (audit_tf) {
+ SCTPDBG(SCTP_DEBUG_TIMER4,
+ "Audit total flight due to negative value net:%p\n",
+ net);
+ stcb->asoc.total_flight = 0;
+ stcb->asoc.total_flight_count = 0;
+ /* Clear all networks flight size */
+ TAILQ_FOREACH(lnets, &stcb->asoc.nets, sctp_next) {
+ lnets->flight_size = 0;
+ SCTPDBG(SCTP_DEBUG_TIMER4,
+ "Net:%p c-f cwnd:%d ssthresh:%d\n",
+ lnets, lnets->cwnd, lnets->ssthresh);
+ }
+ TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
+ if (chk->sent < SCTP_DATAGRAM_RESEND) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_UP,
+ chk->whoTo->flight_size,
+ chk->book_size,
+ (uintptr_t) chk->whoTo,
+ chk->rec.data.TSN_seq);
+ }
+ sctp_flight_size_increase(chk);
+ sctp_total_flight_increase(stcb, chk);
+ }
+ }
+ }
+ /*
+ * Set up the ECN nonce re-sync point. We do this since
+ * retransmissions are NOT set up for ECN. This means that due to
+ * Karn's rule, we don't know the total of the peer's ECN bits.
+ */
+ chk = TAILQ_FIRST(&stcb->asoc.send_queue);
+ if (chk == NULL) {
+ stcb->asoc.nonce_resync_tsn = stcb->asoc.sending_seq;
+ } else {
+ stcb->asoc.nonce_resync_tsn = chk->rec.data.TSN_seq;
+ }
+ stcb->asoc.nonce_wait_for_ecne = 0;
+ stcb->asoc.nonce_sum_check = 0;
+ /* The return value is ignored by the caller; we always return 0 here. */
+ return (0);
+}
+
+
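+/*
+ * T3-rxt timer expiry: pick an alternate destination (CMT/PF aware), mark
+ * outstanding chunks on the timed-out net for retransmission, back off the
+ * RTO and cwnd, and apply threshold management. Returns 1 if the
+ * association was destroyed, 0 otherwise.
+ */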
+int
+sctp_t3rxt_timer(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ struct sctp_nets *alt;
+ int win_probe, num_mk, num_abandoned;
+
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
+ sctp_log_fr(0, 0, 0, SCTP_FR_T3_TIMEOUT);
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+ struct sctp_nets *lnet;
+
+ TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) {
+ if (net == lnet) {
+ sctp_log_cwnd(stcb, lnet, 1, SCTP_CWND_LOG_FROM_T3);
+ } else {
+ sctp_log_cwnd(stcb, lnet, 0, SCTP_CWND_LOG_FROM_T3);
+ }
+ }
+ }
+ /* Find an alternate and mark those for retransmission */
+ if ((stcb->asoc.peers_rwnd == 0) &&
+ (stcb->asoc.total_flight < net->mtu)) {
+ SCTP_STAT_INCR(sctps_timowindowprobe);
+ win_probe = 1;
+ } else {
+ win_probe = 0;
+ }
+
+ /*
+ * JRS 5/14/07 - If CMT PF is on and the destination is not already
+ * in PF state, set the destination to PF state and store the
+ * current time as the time that the destination was last active. In
+ * addition, find an alternate destination with PF-based
+ * find_alt_net().
+ */
+ if ((stcb->asoc.sctp_cmt_on_off == 1) &&
+ (stcb->asoc.sctp_cmt_pf > 0)) {
+ if ((net->dest_state & SCTP_ADDR_PF) != SCTP_ADDR_PF) {
+ net->dest_state |= SCTP_ADDR_PF;
+ net->last_active = sctp_get_tick_count();
+ SCTPDBG(SCTP_DEBUG_TIMER4, "Destination %p moved from active to PF.\n",
+ net);
+ }
+ alt = sctp_find_alternate_net(stcb, net, 2);
+ } else if (stcb->asoc.sctp_cmt_on_off == 1) {
+ /*
+ * CMT: Using RTX_SSTHRESH policy for CMT. If CMT is being
+ * used, then pick dest with largest ssthresh for any
+ * retransmission.
+ */
+ alt = sctp_find_alternate_net(stcb, net, 1);
+ /*
+ * CUCv2: If a different dest is picked for the
+ * retransmission, then new (rtx-)pseudo_cumack needs to be
+ * tracked for orig dest. Let CUCv2 track new (rtx-)
+ * pseudo-cumack always.
+ */
+ net->find_pseudo_cumack = 1;
+ net->find_rtx_pseudo_cumack = 1;
+ } else { /* CMT is OFF */
+ alt = sctp_find_alternate_net(stcb, net, 0);
+ }
+ num_mk = 0;
+ num_abandoned = 0;
+ (void)sctp_mark_all_for_resend(stcb, net, alt, win_probe,
+ &num_mk, &num_abandoned);
+ /* FR Loss recovery just ended with the T3. */
+ stcb->asoc.fast_retran_loss_recovery = 0;
+
+ /* CMT FR loss recovery ended with the T3 */
+ net->fast_retran_loss_recovery = 0;
+
+ /*
+ * setup the sat loss recovery that prevents satellite cwnd advance.
+ */
+ stcb->asoc.sat_t3_loss_recovery = 1;
+ stcb->asoc.sat_t3_recovery_tsn = stcb->asoc.sending_seq;
+
+ /* Backoff the timer and cwnd */
+ sctp_backoff_on_timeout(stcb, net, win_probe, num_mk, num_abandoned);
+ if (win_probe == 0) {
+ /* We don't do normal threshold management on window probes */
+ if (sctp_threshold_management(inp, stcb, net,
+ stcb->asoc.max_send_times)) {
+ /* Association was destroyed */
+ return (1);
+ } else {
+ if (net != stcb->asoc.primary_destination) {
+ /* send an immediate HB if our RTO is stale */
+ struct timeval now;
+ unsigned int ms_goneby;
+
+ (void)SCTP_GETTIME_TIMEVAL(&now);
+ if (net->last_sent_time.tv_sec) {
+ ms_goneby = (now.tv_sec - net->last_sent_time.tv_sec) * 1000;
+ } else {
+ ms_goneby = 0;
+ }
+ if ((ms_goneby > net->RTO) || (net->RTO == 0)) {
+ /*
+ * no recent feedback in an RTO or
+ * more, request an RTT update
+ */
+ if (sctp_send_hb(stcb, 1, net) < 0)
+ /*
+ * Less than 0 means we lost
+ * the assoc
+ */
+ return (1);
+ }
+ }
+ }
+ } else {
+ /*
+ * For a window probe we don't penalize the nets but only
+ * the association. This may fail it if SACKs are not coming
+ * back. If SACKs are coming with the rwnd locked at 0, we will
+ * continue to hold things waiting for the rwnd to rise.
+ */
+ if (sctp_threshold_management(inp, stcb, NULL,
+ stcb->asoc.max_send_times)) {
+ /* Association was destroyed */
+ return (1);
+ }
+ }
+ if (net->dest_state & SCTP_ADDR_NOT_REACHABLE) {
+ /* Move all pending over too */
+ sctp_move_chunks_from_net(stcb, net);
+
+ /*
+ * Get the address that failed, to force a new src address
+ * selection and a route allocation.
+ */
+ if (net->ro._s_addr) {
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ }
+ net->src_addr_selected = 0;
+
+ /* Force a route allocation too */
+ if (net->ro.ro_rt) {
+ RTFREE(net->ro.ro_rt);
+ net->ro.ro_rt = NULL;
+ }
+ /* Was it our primary? */
+ if ((stcb->asoc.primary_destination == net) && (alt != net)) {
+ /*
+ * Yes, note it as such and find an alternate. Note:
+ * this means the HB code must use this to reset the
+ * primary if it goes active AND if someone does a
+ * change-primary then this flag must be cleared
+ * from any net structures.
+ */
+ if (sctp_set_primary_addr(stcb,
+ (struct sockaddr *)NULL,
+ alt) == 0) {
+ net->dest_state |= SCTP_ADDR_WAS_PRIMARY;
+ }
+ }
+ } else if ((stcb->asoc.sctp_cmt_on_off == 1) &&
+ (stcb->asoc.sctp_cmt_pf > 0) &&
+ ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF)) {
+ /*
+ * JRS 5/14/07 - If the destination hasn't failed completely
+ * but is in PF state, a PF-heartbeat needs to be sent
+ * manually.
+ */
+ if (sctp_send_hb(stcb, 1, net) < 0)
+ /* Return less than 0 means we lost the association */
+ return (1);
+ }
+ /*
+ * Special case for the cookie-echoed state: we don't do output but must
+ * await the COOKIE-ACK before retransmission
+ */
+ if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) {
+ /*
+ * Here we just reset the timer and start again since we
+ * have not established the asoc
+ */
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
+ return (0);
+ }
+ if (stcb->asoc.peer_supports_prsctp) {
+ struct sctp_tmit_chunk *lchk;
+
+ lchk = sctp_try_advance_peer_ack_point(stcb, &stcb->asoc);
+ /* C3. See if we need to send a Fwd-TSN */
+ if (compare_with_wrap(stcb->asoc.advanced_peer_ack_point,
+ stcb->asoc.last_acked_seq, MAX_TSN)) {
+ /*
+ * ISSUE with ECN, see FWD-TSN processing for notes
+ * on issues that will occur when the ECN NONCE
+ * stuff is put into SCTP for cross checking.
+ */
+ send_forward_tsn(stcb, &stcb->asoc);
+ if (lchk) {
+ /* Assure a timer is up */
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, lchk->whoTo);
+ }
+ }
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, net->cwnd, SCTP_CWND_LOG_FROM_RTX);
+ }
+ return (0);
+}
+
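+/*
+ * T1-init timer expiry: apply threshold management, back off the RTO
+ * (bounded by initial_init_rto_max), possibly switch the primary
+ * destination, and retransmit the INIT. Returns 1 if the association was
+ * destroyed, 0 otherwise.
+ */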
+int
+sctp_t1init_timer(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ /* bump the thresholds */
+ if (stcb->asoc.delayed_connection) {
+ /*
+ * special hook for delayed connection. The library did NOT
+ * complete the rest of its sends.
+ */
+ stcb->asoc.delayed_connection = 0;
+ sctp_send_initiate(inp, stcb, SCTP_SO_NOT_LOCKED);
+ return (0);
+ }
+ if (SCTP_GET_STATE((&stcb->asoc)) != SCTP_STATE_COOKIE_WAIT) {
+ return (0);
+ }
+ if (sctp_threshold_management(inp, stcb, net,
+ stcb->asoc.max_init_times)) {
+ /* Association was destroyed */
+ return (1);
+ }
+ stcb->asoc.dropped_special_cnt = 0;
+ sctp_backoff_on_timeout(stcb, stcb->asoc.primary_destination, 1, 0, 0);
+ if (stcb->asoc.initial_init_rto_max < net->RTO) {
+ net->RTO = stcb->asoc.initial_init_rto_max;
+ }
+ if (stcb->asoc.numnets > 1) {
+ /* If we have more than one addr use it */
+ struct sctp_nets *alt;
+
+ alt = sctp_find_alternate_net(stcb, stcb->asoc.primary_destination, 0);
+ if (alt != stcb->asoc.primary_destination) {
+ sctp_move_chunks_from_net(stcb, stcb->asoc.primary_destination);
+ stcb->asoc.primary_destination = alt;
+ }
+ }
+ /* Send out a new init */
+ sctp_send_initiate(inp, stcb, SCTP_SO_NOT_LOCKED);
+ return (0);
+}
+
+/*
+ * For cookie and asconf we actually need to find and mark for resend, then
+ * increment the resend counter (after all the threshold management stuff of
+ * course).
+ */
+int
+sctp_cookie_timer(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ struct sctp_nets *alt;
+ struct sctp_tmit_chunk *cookie;
+
+ /* first before all else we must find the cookie */
+ TAILQ_FOREACH(cookie, &stcb->asoc.control_send_queue, sctp_next) {
+ if (cookie->rec.chunk_id.id == SCTP_COOKIE_ECHO) {
+ break;
+ }
+ }
+ if (cookie == NULL) {
+ if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) {
+ /* FOOBAR! */
+ struct mbuf *oper;
+
+ oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (oper) {
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
+ sizeof(uint32_t);
+ ph = mtod(oper, struct sctp_paramhdr *);
+ ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+ ph->param_length = htons(SCTP_BUF_LEN(oper));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_TIMER + SCTP_LOC_3);
+ }
+ inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_4;
+ sctp_abort_an_association(inp, stcb, SCTP_INTERNAL_ERROR,
+ oper, SCTP_SO_NOT_LOCKED);
+ } else {
+#ifdef INVARIANTS
+ panic("Cookie timer expires in wrong state?");
+#else
+ SCTP_PRINTF("Strange in state %d not cookie-echoed yet c-e timer expires?\n", SCTP_GET_STATE(&stcb->asoc));
+ return (0);
+#endif
+ }
+ return (0);
+ }
+ /* Ok we found the cookie, threshold management next */
+ if (sctp_threshold_management(inp, stcb, cookie->whoTo,
+ stcb->asoc.max_init_times)) {
+ /* Assoc is over */
+ return (1);
+ }
+ /*
+ * cleared threshold management, now let's back off the address & select
+ * an alternate
+ */
+ stcb->asoc.dropped_special_cnt = 0;
+ sctp_backoff_on_timeout(stcb, cookie->whoTo, 1, 0, 0);
+ alt = sctp_find_alternate_net(stcb, cookie->whoTo, 0);
+ if (alt != cookie->whoTo) {
+ sctp_free_remote_addr(cookie->whoTo);
+ cookie->whoTo = alt;
+ atomic_add_int(&alt->ref_count, 1);
+ }
+ /* Now mark the retran info */
+ if (cookie->sent != SCTP_DATAGRAM_RESEND) {
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ }
+ cookie->sent = SCTP_DATAGRAM_RESEND;
+ /*
+ * Now call the output routine to kick out the cookie again. Note we
+ * don't mark any chunks for retran so that FR will need to kick in
+ * to move these (or a send timer).
+ */
+ return (0);
+}
+
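+/*
+ * Stream reset timer: locate the outstanding STRRESET chunk, apply
+ * threshold management and RTO backoff, move it (and any stranded ECN Echo)
+ * to an alternate destination, mark it for retransmission and restart the
+ * timer.
+ */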
+int
+sctp_strreset_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ struct sctp_nets *alt;
+ struct sctp_tmit_chunk *strrst = NULL, *chk = NULL;
+
+ if (stcb->asoc.stream_reset_outstanding == 0) {
+ return (0);
+ }
+ /* find the existing STRRESET, we use the seq number we sent out on */
+ (void)sctp_find_stream_reset(stcb, stcb->asoc.str_reset_seq_out, &strrst);
+ if (strrst == NULL) {
+ return (0);
+ }
+ /* do threshold management */
+ if (sctp_threshold_management(inp, stcb, strrst->whoTo,
+ stcb->asoc.max_send_times)) {
+ /* Assoc is over */
+ return (1);
+ }
+ /*
+ * cleared threshold management, now let's back off the address & select
+ * an alternate
+ */
+ sctp_backoff_on_timeout(stcb, strrst->whoTo, 1, 0, 0);
+ alt = sctp_find_alternate_net(stcb, strrst->whoTo, 0);
+ sctp_free_remote_addr(strrst->whoTo);
+ strrst->whoTo = alt;
+ atomic_add_int(&alt->ref_count, 1);
+
+ /* See if a ECN Echo is also stranded */
+ TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) {
+ if ((chk->whoTo == net) &&
+ (chk->rec.chunk_id.id == SCTP_ECN_ECHO)) {
+ sctp_free_remote_addr(chk->whoTo);
+ if (chk->sent != SCTP_DATAGRAM_RESEND) {
+ chk->sent = SCTP_DATAGRAM_RESEND;
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ }
+ chk->whoTo = alt;
+ atomic_add_int(&alt->ref_count, 1);
+ }
+ }
+ if (net->dest_state & SCTP_ADDR_NOT_REACHABLE) {
+ /*
+ * If the address went un-reachable, we need to move to
+ * alternates for ALL chk's in queue
+ */
+ sctp_move_chunks_from_net(stcb, net);
+ }
+ /* mark the retran info */
+ if (strrst->sent != SCTP_DATAGRAM_RESEND)
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ strrst->sent = SCTP_DATAGRAM_RESEND;
+
+ /* restart the timer */
+ sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, inp, stcb, strrst->whoTo);
+ return (0);
+}
+
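+/*
+ * ASCONF timer: send a new ASCONF if none is queued; otherwise apply
+ * threshold management, back off the RTO, move the queued ASCONF chunks
+ * (and any stranded ECN Echo) to an alternate destination and mark them for
+ * retransmission, giving up and marking the peer ASCONF-incapable if it
+ * never responds.
+ */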
+int
+sctp_asconf_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ struct sctp_nets *alt;
+ struct sctp_tmit_chunk *asconf, *chk, *nchk;
+
+ /* is this a first send, or a retransmission? */
+ if (TAILQ_EMPTY(&stcb->asoc.asconf_send_queue)) {
+ /* compose a new ASCONF chunk and send it */
+ sctp_send_asconf(stcb, net, SCTP_ADDR_NOT_LOCKED);
+ } else {
+ /*
+ * Retransmission of the existing ASCONF is needed
+ */
+
+ /* find the existing ASCONF */
+ asconf = TAILQ_FIRST(&stcb->asoc.asconf_send_queue);
+ if (asconf == NULL) {
+ return (0);
+ }
+ /* do threshold management */
+ if (sctp_threshold_management(inp, stcb, asconf->whoTo,
+ stcb->asoc.max_send_times)) {
+ /* Assoc is over */
+ return (1);
+ }
+ if (asconf->snd_count > stcb->asoc.max_send_times) {
+ /*
+ * Something is rotten: our peer is not responding
+ * to ASCONFs but apparently is to other chunks.
+ * i.e. it is not properly handling the chunk type
+ * upper bits. Mark this peer as ASCONF incapable
+ * and cleanup.
+ */
+ SCTPDBG(SCTP_DEBUG_TIMER1, "asconf_timer: Peer has not responded to our repeated ASCONFs\n");
+ sctp_asconf_cleanup(stcb, net);
+ return (0);
+ }
+ /*
+ * cleared threshold management, so now backoff the net and
+ * select an alternate
+ */
+ sctp_backoff_on_timeout(stcb, asconf->whoTo, 1, 0, 0);
+ alt = sctp_find_alternate_net(stcb, asconf->whoTo, 0);
+ if (asconf->whoTo != alt) {
+ sctp_free_remote_addr(asconf->whoTo);
+ asconf->whoTo = alt;
+ atomic_add_int(&alt->ref_count, 1);
+ }
+ /* See if an ECN Echo is also stranded */
+ TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) {
+ if ((chk->whoTo == net) &&
+ (chk->rec.chunk_id.id == SCTP_ECN_ECHO)) {
+ sctp_free_remote_addr(chk->whoTo);
+ chk->whoTo = alt;
+ if (chk->sent != SCTP_DATAGRAM_RESEND) {
+ chk->sent = SCTP_DATAGRAM_RESEND;
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ }
+ atomic_add_int(&alt->ref_count, 1);
+ }
+ }
+ for (chk = asconf; chk; chk = nchk) {
+ nchk = TAILQ_NEXT(chk, sctp_next);
+ if (chk->whoTo != alt) {
+ sctp_free_remote_addr(chk->whoTo);
+ chk->whoTo = alt;
+ atomic_add_int(&alt->ref_count, 1);
+ }
+ if (asconf->sent != SCTP_DATAGRAM_RESEND && chk->sent != SCTP_DATAGRAM_UNSENT)
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ chk->sent = SCTP_DATAGRAM_RESEND;
+ }
+ if (net->dest_state & SCTP_ADDR_NOT_REACHABLE) {
+ /*
+ * If the address went un-reachable, we need to move
+ * to the alternate for ALL chunks in queue
+ */
+ sctp_move_chunks_from_net(stcb, net);
+ }
+ /* mark the retran info */
+ if (asconf->sent != SCTP_DATAGRAM_RESEND)
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ asconf->sent = SCTP_DATAGRAM_RESEND;
+
+ /* send another ASCONF if any are queued and we are able to */
+ sctp_send_asconf(stcb, alt, SCTP_ADDR_NOT_LOCKED);
+ }
+ return (0);
+}
+
+/* Mobility adaptation */
+void
+sctp_delete_prim_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ if (stcb->asoc.deleted_primary == NULL) {
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "delete_prim_timer: deleted_primary is not stored...\n");
+ sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED);
+ return;
+ }
+ SCTPDBG(SCTP_DEBUG_ASCONF1, "delete_prim_timer: finished to keep deleted primary ");
+ SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.deleted_primary->ro._l_addr.sa);
+ sctp_free_remote_addr(stcb->asoc.deleted_primary);
+ stcb->asoc.deleted_primary = NULL;
+ sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED);
+ return;
+}
+
+/*
+ * For the shutdown and shutdown-ack, we do not keep one around on the
+ * control queue. This means we must generate a new one and call the general
+ * chunk output routine, AFTER having done threshold management.
+ * It is assumed that net is non-NULL.
+ */
+int
+sctp_shutdown_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ struct sctp_nets *alt;
+
+ /* first do threshold management */
+ if (sctp_threshold_management(inp, stcb, net, stcb->asoc.max_send_times)) {
+ /* Assoc is over */
+ return (1);
+ }
+ sctp_backoff_on_timeout(stcb, net, 1, 0, 0);
+ /* second select an alternative */
+ alt = sctp_find_alternate_net(stcb, net, 0);
+
+ /* third generate a shutdown into the queue for our net */
+ sctp_send_shutdown(stcb, alt);
+
+ /* fourth restart timer */
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, inp, stcb, alt);
+ return (0);
+}
+
+int
+sctp_shutdownack_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ struct sctp_nets *alt;
+
+ /* first do threshold management */
+ if (sctp_threshold_management(inp, stcb, net, stcb->asoc.max_send_times)) {
+ /* Assoc is over */
+ return (1);
+ }
+ sctp_backoff_on_timeout(stcb, net, 1, 0, 0);
+ /* second select an alternative */
+ alt = sctp_find_alternate_net(stcb, net, 0);
+
+ /* third generate a shutdown into the queue for our net */
+ sctp_send_shutdown_ack(stcb, alt);
+
+ /* fourth restart timer */
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, inp, stcb, alt);
+ return (0);
+}
+
+static void
+sctp_audit_stream_queues_for_size(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb)
+{
+ struct sctp_stream_out *outs;
+ struct sctp_stream_queue_pending *sp;
+ unsigned int chks_in_queue = 0;
+ int being_filled = 0;
+
+ /*
+ * This function is ONLY called when the send/sent queues are empty.
+ */
+ if ((stcb == NULL) || (inp == NULL))
+ return;
+
+ if (stcb->asoc.sent_queue_retran_cnt) {
+ SCTP_PRINTF("Hmm, sent_queue_retran_cnt is non-zero %d\n",
+ stcb->asoc.sent_queue_retran_cnt);
+ stcb->asoc.sent_queue_retran_cnt = 0;
+ }
+ SCTP_TCB_SEND_LOCK(stcb);
+ if (TAILQ_EMPTY(&stcb->asoc.out_wheel)) {
+ int i, cnt = 0;
+
+ /* Check to see if a spoke fell off the wheel */
+ for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+ if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) {
+ sctp_insert_on_wheel(stcb, &stcb->asoc, &stcb->asoc.strmout[i], 1);
+ cnt++;
+ }
+ }
+ if (cnt) {
+ /* yep, we lost a spoke or two */
+ SCTP_PRINTF("Found an additional %d streams NOT on outwheel, corrected\n", cnt);
+ } else {
+ /* no spokes lost, */
+ stcb->asoc.total_output_queue_size = 0;
+ }
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ return;
+ }
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ /* Check to see if some data queued, if so report it */
+ TAILQ_FOREACH(outs, &stcb->asoc.out_wheel, next_spoke) {
+ if (!TAILQ_EMPTY(&outs->outqueue)) {
+ TAILQ_FOREACH(sp, &outs->outqueue, next) {
+ if (sp->msg_is_complete)
+ being_filled++;
+ chks_in_queue++;
+ }
+ }
+ }
+ if (chks_in_queue != stcb->asoc.stream_queue_cnt) {
+ SCTP_PRINTF("Hmm, stream queue cnt at %d I counted %d in stream out wheel\n",
+ stcb->asoc.stream_queue_cnt, chks_in_queue);
+ }
+ if (chks_in_queue) {
+ /* call the output queue function */
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED);
+ if ((TAILQ_EMPTY(&stcb->asoc.send_queue)) &&
+ (TAILQ_EMPTY(&stcb->asoc.sent_queue))) {
+ /*
+ * Probably should go in and make it go back through
+ * and add fragments allowed
+ */
+ if (being_filled == 0) {
+ SCTP_PRINTF("Still nothing moved %d chunks are stuck\n",
+ chks_in_queue);
+ }
+ }
+ } else {
+ SCTP_PRINTF("Found no chunks on any queue tot:%lu\n",
+ (u_long)stcb->asoc.total_output_queue_size);
+ stcb->asoc.total_output_queue_size = 0;
+ }
+}
+
+int
+sctp_heartbeat_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net, int cnt_of_unconf)
+{
+ int ret;
+
+ if (net) {
+ if (net->hb_responded == 0) {
+ if (net->ro._s_addr) {
+ /*
+ * Invalidate the src address if we did not
+ * get a response last time.
+ */
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ net->src_addr_selected = 0;
+ }
+ sctp_backoff_on_timeout(stcb, net, 1, 0, 0);
+ }
+ /* Zero PBA, if it needs it */
+ if (net->partial_bytes_acked) {
+ net->partial_bytes_acked = 0;
+ }
+ }
+ if ((stcb->asoc.total_output_queue_size > 0) &&
+ (TAILQ_EMPTY(&stcb->asoc.send_queue)) &&
+ (TAILQ_EMPTY(&stcb->asoc.sent_queue))) {
+ sctp_audit_stream_queues_for_size(inp, stcb);
+ }
+ /* Send a new HB, this will do threshold management, pick a new dest */
+ if (cnt_of_unconf == 0) {
+ if (sctp_send_hb(stcb, 0, NULL) < 0) {
+ return (1);
+ }
+ } else {
+ /*
+ * this will send out extra hb's up to maxburst if there are
+ * any unconfirmed addresses.
+ */
+ uint32_t cnt_sent = 0;
+
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if ((net->dest_state & SCTP_ADDR_UNCONFIRMED) &&
+ (net->dest_state & SCTP_ADDR_REACHABLE)) {
+ cnt_sent++;
+ if (net->hb_responded == 0) {
+ /* Did we get a response last time? */
+ if (net->ro._s_addr) {
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ net->src_addr_selected = 0;
+ }
+ }
+ ret = sctp_send_hb(stcb, 1, net);
+ if (ret < 0)
+ return 1;
+ else if (ret == 0) {
+ break;
+ }
+ if (cnt_sent >= SCTP_BASE_SYSCTL(sctp_hb_maxburst))
+ break;
+ }
+ }
+ }
+ return (0);
+}
+
+void
+sctp_pathmtu_timer(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ uint32_t next_mtu, mtu;
+
+ next_mtu = sctp_get_next_mtu(inp, net->mtu);
+
+ if ((next_mtu > net->mtu) && (net->port == 0)) {
+ if ((net->src_addr_selected == 0) ||
+ (net->ro._s_addr == NULL) ||
+ (net->ro._s_addr->localifa_flags & SCTP_BEING_DELETED)) {
+ if ((net->ro._s_addr != NULL) && (net->ro._s_addr->localifa_flags & SCTP_BEING_DELETED)) {
+ sctp_free_ifa(net->ro._s_addr);
+ net->ro._s_addr = NULL;
+ net->src_addr_selected = 0;
+ } else if (net->ro._s_addr == NULL) {
+#if defined(INET6) && defined(SCTP_EMBEDDED_V6_SCOPE)
+ if (net->ro._l_addr.sa.sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
+
+ /* KAME hack: embed scopeid */
+ (void)sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone));
+ }
+#endif
+
+ net->ro._s_addr = sctp_source_address_selection(inp,
+ stcb,
+ (sctp_route_t *) & net->ro,
+ net, 0, stcb->asoc.vrf_id);
+#if defined(INET6) && defined(SCTP_EMBEDDED_V6_SCOPE)
+ if (net->ro._l_addr.sa.sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
+
+ (void)sa6_recoverscope(sin6);
+ }
+#endif /* INET6 */
+ }
+ if (net->ro._s_addr)
+ net->src_addr_selected = 1;
+ }
+ if (net->ro._s_addr) {
+ mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_rt);
+ if (net->port) {
+ mtu -= sizeof(struct udphdr);
+ }
+ if (mtu > next_mtu) {
+ net->mtu = next_mtu;
+ }
+ }
+ }
+ /* restart the timer */
+ sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
+}
+
+void
+sctp_autoclose_timer(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ struct timeval tn, *tim_touse;
+ struct sctp_association *asoc;
+ int ticks_gone_by;
+
+ (void)SCTP_GETTIME_TIMEVAL(&tn);
+ if (stcb->asoc.sctp_autoclose_ticks &&
+ sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) {
+ /* Auto close is on */
+ asoc = &stcb->asoc;
+ /* pick the time to use */
+ if (asoc->time_last_rcvd.tv_sec >
+ asoc->time_last_sent.tv_sec) {
+ tim_touse = &asoc->time_last_rcvd;
+ } else {
+ tim_touse = &asoc->time_last_sent;
+ }
+ /* Has enough time transpired to autoclose? */
+ ticks_gone_by = SEC_TO_TICKS(tn.tv_sec - tim_touse->tv_sec);
+ if ((ticks_gone_by > 0) &&
+ (ticks_gone_by >= (int)asoc->sctp_autoclose_ticks)) {
+ /*
+ * autoclose time has hit, call the output routine,
+ * which should do nothing just to be SURE we don't
+ * have hanging data. We can then safely check the
+ * queues and know that we are clear to send
+ * shutdown
+ */
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_AUTOCLOSE_TMR, SCTP_SO_NOT_LOCKED);
+ /* Are we clean? */
+ if (TAILQ_EMPTY(&asoc->send_queue) &&
+ TAILQ_EMPTY(&asoc->sent_queue)) {
+ /*
+ * there is nothing queued to send, so I'm
+ * done...
+ */
+ if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) {
+ /* only send SHUTDOWN 1st time thru */
+ sctp_send_shutdown(stcb, stcb->asoc.primary_destination);
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT);
+ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN,
+ stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
+ stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ }
+ }
+ } else {
+ /*
+ * No auto close at this time, reset t-o to check
+ * later
+ */
+ int tmp;
+
+ /* fool the timer startup to use the time left */
+ tmp = asoc->sctp_autoclose_ticks;
+ asoc->sctp_autoclose_ticks -= ticks_gone_by;
+ sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb,
+ net);
+ /* restore the real tick value */
+ asoc->sctp_autoclose_ticks = tmp;
+ }
+ }
+}
diff --git a/freebsd/sys/netinet/sctp_timer.h b/freebsd/sys/netinet/sctp_timer.h
new file mode 100644
index 00000000..34abbace
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_timer.h
@@ -0,0 +1,101 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_timer.h,v 1.6 2005/03/06 16:04:18 itojun Exp $ */
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_timer_h__
+#define __sctp_timer_h__
+
+#if defined(_KERNEL) || defined(__Userspace__)
+
+#define SCTP_RTT_SHIFT 3
+#define SCTP_RTT_VAR_SHIFT 2
+
+void
+sctp_early_fr_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net);
+
+struct sctp_nets *
+sctp_find_alternate_net(struct sctp_tcb *,
+ struct sctp_nets *, int mode);
+
+int
+sctp_threshold_management(struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *, uint16_t);
+
+int
+sctp_t3rxt_timer(struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *);
+int
+sctp_t1init_timer(struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *);
+int
+sctp_shutdown_timer(struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *);
+int
+sctp_heartbeat_timer(struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *, int);
+
+int
+sctp_cookie_timer(struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *);
+
+void
+sctp_pathmtu_timer(struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *);
+
+int
+sctp_shutdownack_timer(struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *);
+int
+sctp_strreset_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net);
+
+int
+sctp_asconf_timer(struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *);
+
+void
+sctp_delete_prim_timer(struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *);
+
+void
+sctp_autoclose_timer(struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *net);
+
+void sctp_audit_retranmission_queue(struct sctp_association *);
+
+void sctp_iterator_timer(struct sctp_iterator *it);
+
+
+#endif
+#endif
diff --git a/freebsd/sys/netinet/sctp_uio.h b/freebsd/sys/netinet/sctp_uio.h
new file mode 100644
index 00000000..734447ed
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_uio.h
@@ -0,0 +1,1166 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_uio.h,v 1.11 2005/03/06 16:04:18 itojun Exp $ */
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_uio_h__
+#define __sctp_uio_h__
+
+
+#if ! defined(_KERNEL)
+#include <freebsd/stdint.h>
+#endif
+#include <freebsd/sys/types.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/netinet/in.h>
+
+typedef uint32_t sctp_assoc_t;
+
+/* Compatibility to previous define's */
+#define sctp_stream_reset_events sctp_stream_reset_event
+
+/* On/Off setup for subscription to events */
+struct sctp_event_subscribe {
+ uint8_t sctp_data_io_event;
+ uint8_t sctp_association_event;
+ uint8_t sctp_address_event;
+ uint8_t sctp_send_failure_event;
+ uint8_t sctp_peer_error_event;
+ uint8_t sctp_shutdown_event;
+ uint8_t sctp_partial_delivery_event;
+ uint8_t sctp_adaptation_layer_event;
+ uint8_t sctp_authentication_event;
+ uint8_t sctp_sender_dry_event;
+ uint8_t sctp_stream_reset_event;
+};
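+
+/*
+ * Usage sketch (not part of this header): an application typically enables
+ * the notifications it wants via the SCTP_EVENTS socket option, which is
+ * defined in netinet/sctp.h, e.g.
+ *
+ *	struct sctp_event_subscribe ev;
+ *
+ *	memset(&ev, 0, sizeof(ev));
+ *	ev.sctp_association_event = 1;
+ *	ev.sctp_shutdown_event = 1;
+ *	(void)setsockopt(sd, IPPROTO_SCTP, SCTP_EVENTS, &ev, sizeof(ev));
+ */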
+
+/* ancillary data types */
+#define SCTP_INIT 0x0001
+#define SCTP_SNDRCV 0x0002
+#define SCTP_EXTRCV 0x0003
+/*
+ * ancillary data structures
+ */
+struct sctp_initmsg {
+ uint16_t sinit_num_ostreams;
+ uint16_t sinit_max_instreams;
+ uint16_t sinit_max_attempts;
+ uint16_t sinit_max_init_timeo;
+};
+
+/* We add 96 bytes to the size of sctp_sndrcvinfo.
+ * This makes the current structure 128 bytes long
+ * which is nicely 64 bit aligned but also has room
+ * for us to add more and keep ABI compatibility.
+ * For example, already we have the sctp_extrcvinfo
+ * when enabled which is 48 bytes.
+ */
+
+/*
+ * The assoc up needs a verfid;
+ * all sendrcvinfo's need a verfid for SENDING only.
+ */
+
+
+#define SCTP_ALIGN_RESV_PAD 96
+#define SCTP_ALIGN_RESV_PAD_SHORT 80
+
+struct sctp_sndrcvinfo {
+ uint16_t sinfo_stream;
+ uint16_t sinfo_ssn;
+ uint16_t sinfo_flags;
+ uint32_t sinfo_ppid;
+ uint32_t sinfo_context;
+ uint32_t sinfo_timetolive;
+ uint32_t sinfo_tsn;
+ uint32_t sinfo_cumtsn;
+ sctp_assoc_t sinfo_assoc_id;
+ uint8_t __reserve_pad[SCTP_ALIGN_RESV_PAD];
+};
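+
+/*
+ * Size check for the comment above (a sketch, assuming 4-byte alignment of
+ * uint32_t): 3 * 2 bytes of uint16_t fields, 2 bytes of alignment padding,
+ * and 6 * 4 bytes of 32-bit fields give 32 bytes, so adding the
+ * SCTP_ALIGN_RESV_PAD (96) reserve yields the 128 bytes mentioned.
+ */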
+
+struct sctp_extrcvinfo {
+ uint16_t sinfo_stream;
+ uint16_t sinfo_ssn;
+ uint16_t sinfo_flags;
+ uint16_t sinfo_pr_policy;
+ uint32_t sinfo_ppid;
+ uint32_t sinfo_context;
+ uint32_t sinfo_timetolive;
+ uint32_t sinfo_tsn;
+ uint32_t sinfo_cumtsn;
+ sctp_assoc_t sinfo_assoc_id;
+ uint16_t sreinfo_next_flags;
+ uint16_t sreinfo_next_stream;
+ uint32_t sreinfo_next_aid;
+ uint32_t sreinfo_next_length;
+ uint32_t sreinfo_next_ppid;
+ uint8_t __reserve_pad[SCTP_ALIGN_RESV_PAD_SHORT];
+};
+
+#define SCTP_NO_NEXT_MSG 0x0000
+#define SCTP_NEXT_MSG_AVAIL 0x0001
+#define SCTP_NEXT_MSG_ISCOMPLETE 0x0002
+#define SCTP_NEXT_MSG_IS_UNORDERED 0x0004
+#define SCTP_NEXT_MSG_IS_NOTIFICATION 0x0008
+
+struct sctp_snd_all_completes {
+ uint16_t sall_stream;
+ uint16_t sall_flags;
+ uint32_t sall_ppid;
+ uint32_t sall_context;
+ uint32_t sall_num_sent;
+ uint32_t sall_num_failed;
+};
+
+/* Flags that go into the sinfo->sinfo_flags field */
+#define SCTP_EOF 0x0100 /* Start shutdown procedures */
+#define SCTP_ABORT 0x0200 /* Send an ABORT to peer */
+#define SCTP_UNORDERED 0x0400 /* Message is un-ordered */
+#define SCTP_ADDR_OVER 0x0800 /* Override the primary-address */
+#define SCTP_SENDALL 0x1000 /* Send this on all associations */
+#define SCTP_EOR 0x2000 /* end of message signal */
+#define SCTP_SACK_IMMEDIATELY 0x4000 /* Set I-Bit */
+
+#define INVALID_SINFO_FLAG(x) (((x) & 0xffffff00 \
+ & ~(SCTP_EOF | SCTP_ABORT | SCTP_UNORDERED |\
+ SCTP_ADDR_OVER | SCTP_SENDALL | SCTP_EOR |\
+ SCTP_SACK_IMMEDIATELY)) != 0)
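+/*
+ * Worked example (illustrative only): with the flag values above the mask
+ * reduces to ((x) & 0xffff8000), so SCTP_UNORDERED | SCTP_EOR (0x2400) is
+ * accepted while an undefined bit such as 0x8000 is rejected; the low byte
+ * is left alone because it carries the PR-SCTP policy defined below.
+ */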
+/* for the endpoint */
+
+/* The lower byte is an enumeration of PR-SCTP policies */
+#define SCTP_PR_SCTP_TTL 0x0001/* Time based PR-SCTP */
+#define SCTP_PR_SCTP_BUF 0x0002/* Buffer based PR-SCTP */
+#define SCTP_PR_SCTP_RTX 0x0003/* Number of retransmissions based PR-SCTP */
+
+#define PR_SCTP_POLICY(x) ((x) & 0xff)
+#define PR_SCTP_ENABLED(x) (PR_SCTP_POLICY(x) != 0)
+#define PR_SCTP_TTL_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_TTL)
+#define PR_SCTP_BUF_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_BUF)
+#define PR_SCTP_RTX_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_RTX)
+#define PR_SCTP_INVALID_POLICY(x) (PR_SCTP_POLICY(x) > SCTP_PR_SCTP_RTX)
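+/*
+ * Example (a sketch using the fields declared above): a sender asking for
+ * time-based PR-SCTP with a 3 second lifetime on an unordered message could
+ * set
+ *
+ *	sinfo.sinfo_flags = SCTP_UNORDERED | SCTP_PR_SCTP_TTL;
+ *	sinfo.sinfo_timetolive = 3000;		(assumed to be milliseconds)
+ *
+ * after which PR_SCTP_TTL_ENABLED(sinfo.sinfo_flags) evaluates to true.
+ */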
+/* Stat's */
+struct sctp_pcbinfo {
+ uint32_t ep_count;
+ uint32_t asoc_count;
+ uint32_t laddr_count;
+ uint32_t raddr_count;
+ uint32_t chk_count;
+ uint32_t readq_count;
+ uint32_t free_chunks;
+ uint32_t stream_oque;
+};
+
+struct sctp_sockstat {
+ sctp_assoc_t ss_assoc_id;
+ uint32_t ss_total_sndbuf;
+ uint32_t ss_total_recv_buf;
+};
+
+/*
+ * notification event structures
+ */
+
+/*
+ * association change event
+ */
+struct sctp_assoc_change {
+ uint16_t sac_type;
+ uint16_t sac_flags;
+ uint32_t sac_length;
+ uint16_t sac_state;
+ uint16_t sac_error;
+ uint16_t sac_outbound_streams;
+ uint16_t sac_inbound_streams;
+ sctp_assoc_t sac_assoc_id;
+};
+
+/* sac_state values */
+#define SCTP_COMM_UP 0x0001
+#define SCTP_COMM_LOST 0x0002
+#define SCTP_RESTART 0x0003
+#define SCTP_SHUTDOWN_COMP 0x0004
+#define SCTP_CANT_STR_ASSOC 0x0005
+
+
+/*
+ * Address event
+ */
+struct sctp_paddr_change {
+ uint16_t spc_type;
+ uint16_t spc_flags;
+ uint32_t spc_length;
+ struct sockaddr_storage spc_aaddr;
+ uint32_t spc_state;
+ uint32_t spc_error;
+ sctp_assoc_t spc_assoc_id;
+ uint8_t spc_padding[4];
+};
+
+/* paddr state values */
+#define SCTP_ADDR_AVAILABLE 0x0001
+#define SCTP_ADDR_UNREACHABLE 0x0002
+#define SCTP_ADDR_REMOVED 0x0003
+#define SCTP_ADDR_ADDED 0x0004
+#define SCTP_ADDR_MADE_PRIM 0x0005
+#define SCTP_ADDR_CONFIRMED 0x0006
+
+/*
+ * CAUTION: these are user exposed SCTP addr reachability states must be
+ * compatible with SCTP_ADDR states in sctp_constants.h
+ */
+#ifdef SCTP_ACTIVE
+#undef SCTP_ACTIVE
+#endif
+#define SCTP_ACTIVE 0x0001 /* SCTP_ADDR_REACHABLE */
+
+#ifdef SCTP_INACTIVE
+#undef SCTP_INACTIVE
+#endif
+#define SCTP_INACTIVE 0x0002 /* SCTP_ADDR_NOT_REACHABLE */
+
+#ifdef SCTP_UNCONFIRMED
+#undef SCTP_UNCONFIRMED
+#endif
+#define SCTP_UNCONFIRMED 0x0200 /* SCTP_ADDR_UNCONFIRMED */
+
+#ifdef SCTP_NOHEARTBEAT
+#undef SCTP_NOHEARTBEAT
+#endif
+#define SCTP_NOHEARTBEAT 0x0040 /* SCTP_ADDR_NOHB */
+
+
+/* remote error events */
+struct sctp_remote_error {
+ uint16_t sre_type;
+ uint16_t sre_flags;
+ uint32_t sre_length;
+ uint16_t sre_error;
+ sctp_assoc_t sre_assoc_id;
+ uint8_t sre_data[4];
+};
+
+/* data send failure event */
+struct sctp_send_failed {
+ uint16_t ssf_type;
+ uint16_t ssf_flags;
+ uint32_t ssf_length;
+ uint32_t ssf_error;
+ struct sctp_sndrcvinfo ssf_info;
+ sctp_assoc_t ssf_assoc_id;
+ uint8_t ssf_data[];
+};
+
+/* flag that indicates state of data */
+#define SCTP_DATA_UNSENT 0x0001 /* inqueue never on wire */
+#define SCTP_DATA_SENT 0x0002 /* on wire at failure */
+
+/* shutdown event */
+struct sctp_shutdown_event {
+ uint16_t sse_type;
+ uint16_t sse_flags;
+ uint32_t sse_length;
+ sctp_assoc_t sse_assoc_id;
+};
+
+/* Adaptation layer indication stuff */
+struct sctp_adaptation_event {
+ uint16_t sai_type;
+ uint16_t sai_flags;
+ uint32_t sai_length;
+ uint32_t sai_adaptation_ind;
+ sctp_assoc_t sai_assoc_id;
+};
+
+struct sctp_setadaptation {
+ uint32_t ssb_adaptation_ind;
+};
+
+/* compatible old spelling */
+struct sctp_adaption_event {
+ uint16_t sai_type;
+ uint16_t sai_flags;
+ uint32_t sai_length;
+ uint32_t sai_adaption_ind;
+ sctp_assoc_t sai_assoc_id;
+};
+
+struct sctp_setadaption {
+ uint32_t ssb_adaption_ind;
+};
+
+
+/*
+ * Partial Delivery API event
+ */
+struct sctp_pdapi_event {
+ uint16_t pdapi_type;
+ uint16_t pdapi_flags;
+ uint32_t pdapi_length;
+ uint32_t pdapi_indication;
+ uint16_t pdapi_stream;
+ uint16_t pdapi_seq;
+ sctp_assoc_t pdapi_assoc_id;
+};
+
+/* indication values */
+#define SCTP_PARTIAL_DELIVERY_ABORTED 0x0001
+
+
+/*
+ * authentication key event
+ */
+struct sctp_authkey_event {
+ uint16_t auth_type;
+ uint16_t auth_flags;
+ uint32_t auth_length;
+ uint16_t auth_keynumber;
+ uint16_t auth_altkeynumber;
+ uint32_t auth_indication;
+ sctp_assoc_t auth_assoc_id;
+};
+
+/* indication values */
+#define SCTP_AUTH_NEWKEY 0x0001
+#define SCTP_AUTH_NO_AUTH 0x0002
+#define SCTP_AUTH_FREE_KEY 0x0003
+
+
+struct sctp_sender_dry_event {
+ uint16_t sender_dry_type;
+ uint16_t sender_dry_flags;
+ uint32_t sender_dry_length;
+ sctp_assoc_t sender_dry_assoc_id;
+};
+
+
+/*
+ * stream reset event
+ */
+struct sctp_stream_reset_event {
+ uint16_t strreset_type;
+ uint16_t strreset_flags;
+ uint32_t strreset_length;
+ sctp_assoc_t strreset_assoc_id;
+ uint16_t strreset_list[];
+};
+
+/* flags in strreset_flags field */
+#define SCTP_STRRESET_INBOUND_STR 0x0001
+#define SCTP_STRRESET_OUTBOUND_STR 0x0002
+#define SCTP_STRRESET_ALL_STREAMS 0x0004
+#define SCTP_STRRESET_STREAM_LIST 0x0008
+#define SCTP_STRRESET_FAILED 0x0010
+#define SCTP_STRRESET_ADD_STREAM 0x0020
+
+/* SCTP notification event */
+struct sctp_tlv {
+ uint16_t sn_type;
+ uint16_t sn_flags;
+ uint32_t sn_length;
+};
+
+union sctp_notification {
+ struct sctp_tlv sn_header;
+ struct sctp_assoc_change sn_assoc_change;
+ struct sctp_paddr_change sn_paddr_change;
+ struct sctp_remote_error sn_remote_error;
+ struct sctp_send_failed sn_send_failed;
+ struct sctp_shutdown_event sn_shutdown_event;
+ struct sctp_adaptation_event sn_adaptation_event;
+ /* compatibility same as above */
+ struct sctp_adaption_event sn_adaption_event;
+ struct sctp_pdapi_event sn_pdapi_event;
+ struct sctp_authkey_event sn_auth_event;
+ struct sctp_sender_dry_event sn_sender_dry_event;
+ struct sctp_stream_reset_event sn_strreset_event;
+};
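+
+/*
+ * Receive-side sketch (not part of this header): a notification arrives as
+ * a message with MSG_NOTIFICATION set in the returned msg_flags; the buffer
+ * can then be interpreted through this union, using the types defined below:
+ *
+ *	union sctp_notification *snp = (union sctp_notification *)buf;
+ *
+ *	switch (snp->sn_header.sn_type) {
+ *	case SCTP_ASSOC_CHANGE:
+ *		... use snp->sn_assoc_change ...
+ *		break;
+ *	case SCTP_SHUTDOWN_EVENT:
+ *		... use snp->sn_shutdown_event ...
+ *		break;
+ *	}
+ */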
+
+/* notification types */
+#define SCTP_ASSOC_CHANGE 0x0001
+#define SCTP_PEER_ADDR_CHANGE 0x0002
+#define SCTP_REMOTE_ERROR 0x0003
+#define SCTP_SEND_FAILED 0x0004
+#define SCTP_SHUTDOWN_EVENT 0x0005
+#define SCTP_ADAPTATION_INDICATION 0x0006
+/* same as above */
+#define SCTP_ADAPTION_INDICATION 0x0006
+#define SCTP_PARTIAL_DELIVERY_EVENT 0x0007
+#define SCTP_AUTHENTICATION_EVENT 0x0008
+#define SCTP_STREAM_RESET_EVENT 0x0009
+#define SCTP_SENDER_DRY_EVENT 0x000a
+#define SCTP_NOTIFICATIONS_STOPPED_EVENT 0x000b /* we don't send this */
+/*
+ * socket option structs
+ */
+
+struct sctp_paddrparams {
+ struct sockaddr_storage spp_address;
+ sctp_assoc_t spp_assoc_id;
+ uint32_t spp_hbinterval;
+ uint32_t spp_pathmtu;
+ uint32_t spp_flags;
+ uint32_t spp_ipv6_flowlabel;
+ uint16_t spp_pathmaxrxt;
+ uint8_t spp_ipv4_tos;
+};
+
+#define SPP_HB_ENABLE 0x00000001
+#define SPP_HB_DISABLE 0x00000002
+#define SPP_HB_DEMAND 0x00000004
+#define SPP_PMTUD_ENABLE 0x00000008
+#define SPP_PMTUD_DISABLE 0x00000010
+#define SPP_HB_TIME_IS_ZERO 0x00000080
+#define SPP_IPV6_FLOWLABEL 0x00000100
+#define SPP_IPV4_TOS 0x00000200
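+
+/*
+ * Usage sketch (SCTP_PEER_ADDR_PARAMS itself is defined in netinet/sctp.h):
+ * to request a 5 second heartbeat interval on every path of an association
+ * one would typically do
+ *
+ *	struct sctp_paddrparams spp;
+ *
+ *	memset(&spp, 0, sizeof(spp));
+ *	spp.spp_assoc_id = assoc_id;
+ *	spp.spp_flags = SPP_HB_ENABLE;
+ *	spp.spp_hbinterval = 5000;		(milliseconds)
+ *	(void)setsockopt(sd, IPPROTO_SCTP, SCTP_PEER_ADDR_PARAMS,
+ *	    &spp, sizeof(spp));
+ */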
+
+struct sctp_paddrinfo {
+ struct sockaddr_storage spinfo_address;
+ sctp_assoc_t spinfo_assoc_id;
+ int32_t spinfo_state;
+ uint32_t spinfo_cwnd;
+ uint32_t spinfo_srtt;
+ uint32_t spinfo_rto;
+ uint32_t spinfo_mtu;
+};
+
+struct sctp_rtoinfo {
+ sctp_assoc_t srto_assoc_id;
+ uint32_t srto_initial;
+ uint32_t srto_max;
+ uint32_t srto_min;
+};
+
+struct sctp_assocparams {
+ sctp_assoc_t sasoc_assoc_id;
+ uint32_t sasoc_peer_rwnd;
+ uint32_t sasoc_local_rwnd;
+ uint32_t sasoc_cookie_life;
+ uint16_t sasoc_asocmaxrxt;
+ uint16_t sasoc_number_peer_destinations;
+};
+
+struct sctp_setprim {
+ struct sockaddr_storage ssp_addr;
+ sctp_assoc_t ssp_assoc_id;
+ uint8_t ssp_padding[4];
+};
+
+struct sctp_setpeerprim {
+ struct sockaddr_storage sspp_addr;
+ sctp_assoc_t sspp_assoc_id;
+ uint8_t sspp_padding[4];
+};
+
+struct sctp_getaddresses {
+ sctp_assoc_t sget_assoc_id;
+ /* addr is filled in for N * sockaddr_storage */
+ struct sockaddr addr[1];
+};
+
+struct sctp_setstrm_timeout {
+ sctp_assoc_t ssto_assoc_id;
+ uint32_t ssto_timeout;
+ uint32_t ssto_streamid_start;
+ uint32_t ssto_streamid_end;
+};
+
+struct sctp_status {
+ sctp_assoc_t sstat_assoc_id;
+ int32_t sstat_state;
+ uint32_t sstat_rwnd;
+ uint16_t sstat_unackdata;
+ uint16_t sstat_penddata;
+ uint16_t sstat_instrms;
+ uint16_t sstat_outstrms;
+ uint32_t sstat_fragmentation_point;
+ struct sctp_paddrinfo sstat_primary;
+};
+
+/*
+ * AUTHENTICATION support
+ */
+/* SCTP_AUTH_CHUNK */
+struct sctp_authchunk {
+ uint8_t sauth_chunk;
+};
+
+/* SCTP_AUTH_KEY */
+struct sctp_authkey {
+ sctp_assoc_t sca_assoc_id;
+ uint16_t sca_keynumber;
+ uint8_t sca_key[];
+};
+
+/* SCTP_HMAC_IDENT */
+struct sctp_hmacalgo {
+ uint32_t shmac_number_of_idents;
+ uint16_t shmac_idents[];
+};
+
+/* AUTH hmac_id */
+#define SCTP_AUTH_HMAC_ID_RSVD 0x0000
+#define SCTP_AUTH_HMAC_ID_SHA1 0x0001 /* default, mandatory */
+#define SCTP_AUTH_HMAC_ID_SHA256 0x0003
+#define SCTP_AUTH_HMAC_ID_SHA224 0x0004
+#define SCTP_AUTH_HMAC_ID_SHA384 0x0005
+#define SCTP_AUTH_HMAC_ID_SHA512 0x0006
+
+
+/* SCTP_AUTH_ACTIVE_KEY / SCTP_AUTH_DELETE_KEY */
+struct sctp_authkeyid {
+ sctp_assoc_t scact_assoc_id;
+ uint16_t scact_keynumber;
+};
+
+/* SCTP_PEER_AUTH_CHUNKS / SCTP_LOCAL_AUTH_CHUNKS */
+struct sctp_authchunks {
+ sctp_assoc_t gauth_assoc_id;
+ uint8_t gauth_chunks[];
+};
+
+struct sctp_assoc_value {
+ sctp_assoc_t assoc_id;
+ uint32_t assoc_value;
+};
+
+struct sctp_assoc_ids {
+ uint32_t gaids_number_of_ids;
+ sctp_assoc_t gaids_assoc_id[];
+};
+
+struct sctp_sack_info {
+ sctp_assoc_t sack_assoc_id;
+ uint32_t sack_delay;
+ uint32_t sack_freq;
+};
+
+struct sctp_timeouts {
+ sctp_assoc_t stimo_assoc_id;
+ uint32_t stimo_init;
+ uint32_t stimo_data;
+ uint32_t stimo_sack;
+ uint32_t stimo_shutdown;
+ uint32_t stimo_heartbeat;
+ uint32_t stimo_cookie;
+ uint32_t stimo_shutdownack;
+};
+
+struct sctp_cwnd_args {
+ struct sctp_nets *net; /* network to *//* FIXME: LP64 issue */
+ uint32_t cwnd_new_value;/* cwnd in k */
+ uint32_t pseudo_cumack;
+ uint16_t inflight; /* flightsize in k */
+ uint16_t cwnd_augment; /* increment to it */
+ uint8_t meets_pseudo_cumack;
+ uint8_t need_new_pseudo_cumack;
+ uint8_t cnt_in_send;
+ uint8_t cnt_in_str;
+};
+
+struct sctp_blk_args {
+ uint32_t onsb; /* in 1k bytes */
+ uint32_t sndlen; /* len of send being attempted */
+ uint32_t peer_rwnd; /* rwnd of peer */
+ uint16_t send_sent_qcnt;/* chnk cnt */
+ uint16_t stream_qcnt; /* chnk cnt */
+ uint16_t chunks_on_oque;/* chunks out */
+ uint16_t flight_size; /* flight size in k */
+};
+
+/*
+ * Max we can reset in one setting; note this is dictated not by the define
+ * but by the size of an mbuf cluster, so don't change this define and think
+ * you can specify more. You must do multiple resets if you want to reset more
+ * than SCTP_MAX_EXPLICIT_STR_RESET.
+ */
+#define SCTP_MAX_EXPLICT_STR_RESET 1000
+
+#define SCTP_RESET_LOCAL_RECV 0x0001
+#define SCTP_RESET_LOCAL_SEND 0x0002
+#define SCTP_RESET_BOTH 0x0003
+#define SCTP_RESET_TSN 0x0004
+#define SCTP_RESET_ADD_STREAMS 0x0005
+
+struct sctp_stream_reset {
+ sctp_assoc_t strrst_assoc_id;
+ uint16_t strrst_flags;
+ uint16_t strrst_num_streams; /* 0 == ALL */
+ uint16_t strrst_list[]; /* list if strrst_num_streams is not 0 */
+};
+
+
+struct sctp_get_nonce_values {
+ sctp_assoc_t gn_assoc_id;
+ uint32_t gn_peers_tag;
+ uint32_t gn_local_tag;
+};
+
+/* Debugging logs */
+struct sctp_str_log {
+ void *stcb; /* FIXME: LP64 issue */
+ uint32_t n_tsn;
+ uint32_t e_tsn;
+ uint16_t n_sseq;
+ uint16_t e_sseq;
+ uint16_t strm;
+};
+
+struct sctp_sb_log {
+ void *stcb; /* FIXME: LP64 issue */
+ uint32_t so_sbcc;
+ uint32_t stcb_sbcc;
+ uint32_t incr;
+};
+
+struct sctp_fr_log {
+ uint32_t largest_tsn;
+ uint32_t largest_new_tsn;
+ uint32_t tsn;
+};
+
+struct sctp_fr_map {
+ uint32_t base;
+ uint32_t cum;
+ uint32_t high;
+};
+
+struct sctp_rwnd_log {
+ uint32_t rwnd;
+ uint32_t send_size;
+ uint32_t overhead;
+ uint32_t new_rwnd;
+};
+
+struct sctp_mbcnt_log {
+ uint32_t total_queue_size;
+ uint32_t size_change;
+ uint32_t total_queue_mb_size;
+ uint32_t mbcnt_change;
+};
+
+struct sctp_sack_log {
+ uint32_t cumack;
+ uint32_t oldcumack;
+ uint32_t tsn;
+ uint16_t numGaps;
+ uint16_t numDups;
+};
+
+struct sctp_lock_log {
+ void *sock; /* FIXME: LP64 issue */
+ void *inp; /* FIXME: LP64 issue */
+ uint8_t tcb_lock;
+ uint8_t inp_lock;
+ uint8_t info_lock;
+ uint8_t sock_lock;
+ uint8_t sockrcvbuf_lock;
+ uint8_t socksndbuf_lock;
+ uint8_t create_lock;
+ uint8_t resv;
+};
+
+struct sctp_rto_log {
+ void *net; /* FIXME: LP64 issue */
+ uint32_t rtt;
+};
+
+struct sctp_nagle_log {
+ void *stcb; /* FIXME: LP64 issue */
+ uint32_t total_flight;
+ uint32_t total_in_queue;
+ uint16_t count_in_queue;
+ uint16_t count_in_flight;
+};
+
+struct sctp_sbwake_log {
+ void *stcb; /* FIXME: LP64 issue */
+ uint16_t send_q;
+ uint16_t sent_q;
+ uint16_t flight;
+ uint16_t wake_cnt;
+ uint8_t stream_qcnt; /* chnk cnt */
+ uint8_t chunks_on_oque; /* chunks out */
+ uint8_t sbflags;
+ uint8_t sctpflags;
+};
+
+struct sctp_misc_info {
+ uint32_t log1;
+ uint32_t log2;
+ uint32_t log3;
+ uint32_t log4;
+};
+
+struct sctp_log_closing {
+ void *inp; /* FIXME: LP64 issue */
+ void *stcb; /* FIXME: LP64 issue */
+ uint32_t sctp_flags;
+ uint16_t state;
+ int16_t loc;
+};
+
+struct sctp_mbuf_log {
+ struct mbuf *mp; /* FIXME: LP64 issue */
+ caddr_t ext;
+ caddr_t data;
+ uint16_t size;
+ uint8_t refcnt;
+ uint8_t mbuf_flags;
+};
+
+struct sctp_cwnd_log {
+ uint64_t time_event;
+ uint8_t from;
+ uint8_t event_type;
+ uint8_t resv[2];
+ union {
+ struct sctp_log_closing close;
+ struct sctp_blk_args blk;
+ struct sctp_cwnd_args cwnd;
+ struct sctp_str_log strlog;
+ struct sctp_fr_log fr;
+ struct sctp_fr_map map;
+ struct sctp_rwnd_log rwnd;
+ struct sctp_mbcnt_log mbcnt;
+ struct sctp_sack_log sack;
+ struct sctp_lock_log lock;
+ struct sctp_rto_log rto;
+ struct sctp_sb_log sb;
+ struct sctp_nagle_log nagle;
+ struct sctp_sbwake_log wake;
+ struct sctp_mbuf_log mb;
+ struct sctp_misc_info misc;
+ } x;
+};
+
+struct sctp_cwnd_log_req {
+ int32_t num_in_log; /* Number in log */
+ int32_t num_ret; /* Number returned */
+ int32_t start_at; /* start at this one */
+ int32_t end_at; /* end at this one */
+ struct sctp_cwnd_log log[];
+};
+
+struct sctp_timeval {
+ uint32_t tv_sec;
+ uint32_t tv_usec;
+};
+
+struct sctpstat {
+ struct sctp_timeval sctps_discontinuitytime; /* sctpStats 18
+ * (TimeStamp) */
+ /* MIB according to RFC 3873 */
+ uint32_t sctps_currestab; /* sctpStats 1 (Gauge32) */
+ uint32_t sctps_activeestab; /* sctpStats 2 (Counter32) */
+ uint32_t sctps_restartestab;
+ uint32_t sctps_collisionestab;
+ uint32_t sctps_passiveestab; /* sctpStats 3 (Counter32) */
+ uint32_t sctps_aborted; /* sctpStats 4 (Counter32) */
+ uint32_t sctps_shutdown;/* sctpStats 5 (Counter32) */
+ uint32_t sctps_outoftheblue; /* sctpStats 6 (Counter32) */
+ uint32_t sctps_checksumerrors; /* sctpStats 7 (Counter32) */
+ uint32_t sctps_outcontrolchunks; /* sctpStats 8 (Counter64) */
+ uint32_t sctps_outorderchunks; /* sctpStats 9 (Counter64) */
+ uint32_t sctps_outunorderchunks; /* sctpStats 10 (Counter64) */
+ uint32_t sctps_incontrolchunks; /* sctpStats 11 (Counter64) */
+ uint32_t sctps_inorderchunks; /* sctpStats 12 (Counter64) */
+ uint32_t sctps_inunorderchunks; /* sctpStats 13 (Counter64) */
+ uint32_t sctps_fragusrmsgs; /* sctpStats 14 (Counter64) */
+ uint32_t sctps_reasmusrmsgs; /* sctpStats 15 (Counter64) */
+ uint32_t sctps_outpackets; /* sctpStats 16 (Counter64) */
+ uint32_t sctps_inpackets; /* sctpStats 17 (Counter64) */
+
+ /* input statistics: */
+ uint32_t sctps_recvpackets; /* total input packets */
+ uint32_t sctps_recvdatagrams; /* total input datagrams */
+ uint32_t sctps_recvpktwithdata; /* total packets that had data */
+ uint32_t sctps_recvsacks; /* total input SACK chunks */
+ uint32_t sctps_recvdata;/* total input DATA chunks */
+ uint32_t sctps_recvdupdata; /* total input duplicate DATA chunks */
+ uint32_t sctps_recvheartbeat; /* total input HB chunks */
+ uint32_t sctps_recvheartbeatack; /* total input HB-ACK chunks */
+ uint32_t sctps_recvecne;/* total input ECNE chunks */
+ uint32_t sctps_recvauth;/* total input AUTH chunks */
+ uint32_t sctps_recvauthmissing; /* total input chunks missing AUTH */
+ uint32_t sctps_recvivalhmacid; /* total number of invalid HMAC ids
+ * received */
+ uint32_t sctps_recvivalkeyid; /* total number of invalid secret ids
+ * received */
+ uint32_t sctps_recvauthfailed; /* total number of auth failed */
+ uint32_t sctps_recvexpress; /* total fast path receives all one
+ * chunk */
+ uint32_t sctps_recvexpressm; /* total fast path multi-part data */
+ uint32_t sctps_recvnocrc;
+ uint32_t sctps_recvswcrc;
+ uint32_t sctps_recvhwcrc;
+
+ /* output statistics: */
+ uint32_t sctps_sendpackets; /* total output packets */
+ uint32_t sctps_sendsacks; /* total output SACKs */
+ uint32_t sctps_senddata;/* total output DATA chunks */
+ uint32_t sctps_sendretransdata; /* total output retransmitted DATA
+ * chunks */
+ uint32_t sctps_sendfastretrans; /* total output fast retransmitted
+ * DATA chunks */
+ uint32_t sctps_sendmultfastretrans; /* total FR's that happened
+ * more than once to same
+ * chunk (u-del multi-fr
+ * algo). */
+ uint32_t sctps_sendheartbeat; /* total output HB chunks */
+ uint32_t sctps_sendecne;/* total output ECNE chunks */
+ uint32_t sctps_sendauth;/* total output AUTH chunks FIXME */
+ uint32_t sctps_senderrors; /* ip_output error counter */
+ uint32_t sctps_sendnocrc;
+ uint32_t sctps_sendswcrc;
+ uint32_t sctps_sendhwcrc;
+ /* PCKDROPREP statistics: */
+ uint32_t sctps_pdrpfmbox; /* Packet drop from middle box */
+ uint32_t sctps_pdrpfehos; /* P-drop from end host */
+ uint32_t sctps_pdrpmbda;/* P-drops with data */
+ uint32_t sctps_pdrpmbct;/* P-drops, non-data, non-endhost */
+ uint32_t sctps_pdrpbwrpt; /* P-drop, non-endhost, bandwidth rep
+ * only */
+ uint32_t sctps_pdrpcrupt; /* P-drop, not enough for chunk header */
+ uint32_t sctps_pdrpnedat; /* P-drop, not enough data to confirm */
+ uint32_t sctps_pdrppdbrk; /* P-drop, where process_chunk_drop
+ * said break */
+ uint32_t sctps_pdrptsnnf; /* P-drop, could not find TSN */
+ uint32_t sctps_pdrpdnfnd; /* P-drop, attempt reverse TSN lookup */
+ uint32_t sctps_pdrpdiwnp; /* P-drop, e-host confirms zero-rwnd */
+ uint32_t sctps_pdrpdizrw; /* P-drop, midbox confirms no space */
+ uint32_t sctps_pdrpbadd;/* P-drop, data did not match TSN */
+ uint32_t sctps_pdrpmark;/* P-drop, TSN's marked for Fast Retran */
+ /* timeouts */
+ uint32_t sctps_timoiterator; /* Number of iterator timers that
+ * fired */
+ uint32_t sctps_timodata;/* Number of T3 data time outs */
+ uint32_t sctps_timowindowprobe; /* Number of window probe (T3) timers
+ * that fired */
+ uint32_t sctps_timoinit;/* Number of INIT timers that fired */
+ uint32_t sctps_timosack;/* Number of sack timers that fired */
+ uint32_t sctps_timoshutdown; /* Number of shutdown timers that
+ * fired */
+ uint32_t sctps_timoheartbeat; /* Number of heartbeat timers that
+ * fired */
+ uint32_t sctps_timocookie; /* Number of times a cookie timeout
+ * fired */
+ uint32_t sctps_timosecret; /* Number of times an endpoint changed
+ * its cookie secret */
+ uint32_t sctps_timopathmtu; /* Number of PMTU timers that fired */
+ uint32_t sctps_timoshutdownack; /* Number of shutdown ack timers that
+ * fired */
+ uint32_t sctps_timoshutdownguard; /* Number of shutdown guard
+ * timers that fired */
+ uint32_t sctps_timostrmrst; /* Number of stream reset timers that
+ * fired */
+ uint32_t sctps_timoearlyfr; /* Number of early FR timers that
+ * fired */
+ uint32_t sctps_timoasconf; /* Number of times an asconf timer
+ * fired */
+ uint32_t sctps_timodelprim; /* Number of times a prim_deleted
+ * timer fired */
+ uint32_t sctps_timoautoclose; /* Number of times auto close timer
+ * fired */
+ uint32_t sctps_timoassockill; /* Number of asoc free timers expired */
+ uint32_t sctps_timoinpkill; /* Number of inp free timers expired */
+ /* Early fast retransmission counters */
+ uint32_t sctps_earlyfrstart;
+ uint32_t sctps_earlyfrstop;
+ uint32_t sctps_earlyfrmrkretrans;
+ uint32_t sctps_earlyfrstpout;
+ uint32_t sctps_earlyfrstpidsck1;
+ uint32_t sctps_earlyfrstpidsck2;
+ uint32_t sctps_earlyfrstpidsck3;
+ uint32_t sctps_earlyfrstpidsck4;
+ uint32_t sctps_earlyfrstrid;
+ uint32_t sctps_earlyfrstrout;
+ uint32_t sctps_earlyfrstrtmr;
+ /* others */
+ uint32_t sctps_hdrops; /* packet shorter than header */
+ uint32_t sctps_badsum; /* checksum error */
+ uint32_t sctps_noport; /* no endpoint for port */
+ uint32_t sctps_badvtag; /* bad v-tag */
+ uint32_t sctps_badsid; /* bad SID */
+ uint32_t sctps_nomem; /* no memory */
+ uint32_t sctps_fastretransinrtt; /* number of multiple FR in a
+ * RTT window */
+ uint32_t sctps_markedretrans;
+ uint32_t sctps_naglesent; /* nagle allowed sending */
+ uint32_t sctps_naglequeued; /* nagle doesn't allow sending */
+ uint32_t sctps_maxburstqueued; /* max burst doesn't allow sending */
+ uint32_t sctps_ifnomemqueued; /* look ahead tells us no memory in
+ * interface ring buffer OR we had a
+ * send error and are queuing one
+ * send. */
+ uint32_t sctps_windowprobed; /* total number of window probes sent */
+ uint32_t sctps_lowlevelerr; /* total times an output error causes
+ * us to clamp down on next user send. */
+ uint32_t sctps_lowlevelerrusr; /* total times sctp_senderrors were
+ * caused from a user send from a user
+ * invoked send not a sack response */
+ uint32_t sctps_datadropchklmt; /* Number of in data drops due to
+ * chunk limit reached */
+ uint32_t sctps_datadroprwnd; /* Number of in data drops due to rwnd
+ * limit reached */
+ uint32_t sctps_ecnereducedcwnd; /* Number of times a ECN reduced the
+ * cwnd */
+ uint32_t sctps_vtagexpress; /* Used express lookup via vtag */
+ uint32_t sctps_vtagbogus; /* Collision in express lookup. */
+ uint32_t sctps_primary_randry; /* Number of times the sender ran dry
+ * of user data on primary */
+ uint32_t sctps_cmt_randry; /* Same for above */
+ uint32_t sctps_slowpath_sack; /* Sacks the slow way */
+ uint32_t sctps_wu_sacks_sent; /* Window Update only sacks sent */
+ uint32_t sctps_sends_with_flags; /* number of sends with
+ * sinfo_flags !=0 */
+ uint32_t sctps_sends_with_unord; /* number of unordered sends */
+ uint32_t sctps_sends_with_eof; /* number of sends with EOF flag set */
+ uint32_t sctps_sends_with_abort; /* number of sends with ABORT
+ * flag set */
+ uint32_t sctps_protocol_drain_calls; /* number of times protocol
+ * drain called */
+ uint32_t sctps_protocol_drains_done; /* number of times we did a
+ * protocol drain */
+ uint32_t sctps_read_peeks; /* Number of times recv was called
+ * with peek */
+ uint32_t sctps_cached_chk; /* Number of cached chunks used */
+ uint32_t sctps_cached_strmoq; /* Number of cached stream oq's used */
+ uint32_t sctps_left_abandon; /* Number of unread messages abandoned
+ * by close */
+ uint32_t sctps_send_burst_avoid; /* Unused */
+ uint32_t sctps_send_cwnd_avoid; /* Send cwnd full avoidance, already
+ * max burst inflight to net */
+ uint32_t sctps_fwdtsn_map_over; /* number of map array over-runs via
+ * fwd-tsn's */
+
+ uint32_t sctps_reserved[32]; /* Future ABI compat - remove int's
+ * from here when adding new */
+};
+
+#define SCTP_STAT_INCR(_x) SCTP_STAT_INCR_BY(_x,1)
+#define SCTP_STAT_DECR(_x) SCTP_STAT_DECR_BY(_x,1)
+#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
+#define SCTP_STAT_INCR_BY(_x,_d) (SCTP_BASE_STATS[PCPU_GET(cpuid)]._x += _d)
+#define SCTP_STAT_DECR_BY(_x,_d) (SCTP_BASE_STATS[PCPU_GET(cpuid)]._x -= _d)
+#else
+#define SCTP_STAT_INCR_BY(_x,_d) atomic_add_int(&SCTP_BASE_STAT(_x), _d)
+#define SCTP_STAT_DECR_BY(_x,_d) atomic_subtract_int(&SCTP_BASE_STAT(_x), _d)
+#endif
+/* The following macros are for handling MIB values, */
+#define SCTP_STAT_INCR_COUNTER32(_x) SCTP_STAT_INCR(_x)
+#define SCTP_STAT_INCR_COUNTER64(_x) SCTP_STAT_INCR(_x)
+#define SCTP_STAT_INCR_GAUGE32(_x) SCTP_STAT_INCR(_x)
+#define SCTP_STAT_DECR_COUNTER32(_x) SCTP_STAT_DECR(_x)
+#define SCTP_STAT_DECR_COUNTER64(_x) SCTP_STAT_DECR(_x)
+#define SCTP_STAT_DECR_GAUGE32(_x) SCTP_STAT_DECR(_x)
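+
+/*
+ * For example, SCTP_STAT_INCR(sctps_recvdata) expands in the common
+ * (non-per-CPU) case to atomic_add_int(&SCTP_BASE_STAT(sctps_recvdata), 1);
+ * when SCTP_USE_PERCPU_STAT is configured, each CPU updates its own copy of
+ * struct sctpstat instead, avoiding the atomic operation.
+ */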
+
+union sctp_sockstore {
+#if defined(INET) || !defined(_KERNEL)
+ struct sockaddr_in sin;
+#endif
+#if defined(INET6) || !defined(_KERNEL)
+ struct sockaddr_in6 sin6;
+#endif
+ struct sockaddr sa;
+};
+
+
+/***********************************/
+/* And something for us old timers */
+/***********************************/
+
+#ifndef ntohll
+#include <freebsd/sys/endian.h>
+#define ntohll(x) be64toh(x)
+#endif
+
+#ifndef htonll
+#include <freebsd/sys/endian.h>
+#define htonll(x) htobe64(x)
+#endif
+/***********************************/
+
+
+struct xsctp_inpcb {
+ uint32_t last;
+ uint32_t flags;
+ uint32_t features;
+ uint32_t total_sends;
+ uint32_t total_recvs;
+ uint32_t total_nospaces;
+ uint32_t fragmentation_point;
+ uint16_t local_port;
+ uint16_t qlen;
+ uint16_t maxqlen;
+ uint32_t extra_padding[32]; /* future */
+};
+
+struct xsctp_tcb {
+ union sctp_sockstore primary_addr; /* sctpAssocEntry 5/6 */
+ uint32_t last;
+ uint32_t heartbeat_interval; /* sctpAssocEntry 7 */
+ uint32_t state; /* sctpAssocEntry 8 */
+ uint32_t in_streams; /* sctpAssocEntry 9 */
+ uint32_t out_streams; /* sctpAssocEntry 10 */
+ uint32_t max_nr_retrans;/* sctpAssocEntry 11 */
+ uint32_t primary_process; /* sctpAssocEntry 12 */
+ uint32_t T1_expireries; /* sctpAssocEntry 13 */
+ uint32_t T2_expireries; /* sctpAssocEntry 14 */
+ uint32_t retransmitted_tsns; /* sctpAssocEntry 15 */
+ uint32_t total_sends;
+ uint32_t total_recvs;
+ uint32_t local_tag;
+ uint32_t remote_tag;
+ uint32_t initial_tsn;
+ uint32_t highest_tsn;
+ uint32_t cumulative_tsn;
+ uint32_t cumulative_tsn_ack;
+ uint32_t mtu;
+ uint32_t refcnt;
+ uint16_t local_port; /* sctpAssocEntry 3 */
+ uint16_t remote_port; /* sctpAssocEntry 4 */
+ struct sctp_timeval start_time; /* sctpAssocEntry 16 */
+ struct sctp_timeval discontinuity_time; /* sctpAssocEntry 17 */
+ uint32_t peers_rwnd;
+ sctp_assoc_t assoc_id; /* sctpAssocEntry 1 */
+ uint32_t extra_padding[32]; /* future */
+};
+
+struct xsctp_laddr {
+ union sctp_sockstore address; /* sctpAssocLocalAddrEntry 1/2 */
+ uint32_t last;
+ struct sctp_timeval start_time; /* sctpAssocLocalAddrEntry 3 */
+ uint32_t extra_padding[32]; /* future */
+};
+
+struct xsctp_raddr {
+ union sctp_sockstore address; /* sctpAssocLocalRemEntry 1/2 */
+ uint32_t last;
+ uint32_t rto; /* sctpAssocLocalRemEntry 5 */
+ uint32_t max_path_rtx; /* sctpAssocLocalRemEntry 6 */
+ uint32_t rtx; /* sctpAssocLocalRemEntry 7 */
+ uint32_t error_counter; /* */
+ uint32_t cwnd; /* */
+ uint32_t flight_size; /* */
+ uint32_t mtu; /* */
+ uint8_t active; /* sctpAssocLocalRemEntry 3 */
+ uint8_t confirmed; /* */
+ uint8_t heartbeat_enabled; /* sctpAssocLocalRemEntry 4 */
+ struct sctp_timeval start_time; /* sctpAssocLocalRemEntry 8 */
+ uint32_t rtt;
+ uint32_t extra_padding[32]; /* future */
+};
+
+#define SCTP_MAX_LOGGING_SIZE 30000
+#define SCTP_TRACE_PARAMS 6 /* This number MUST be even */
+
+struct sctp_log_entry {
+ uint64_t timestamp;
+ uint32_t subsys;
+ uint32_t padding;
+ uint32_t params[SCTP_TRACE_PARAMS];
+};
+
+struct sctp_log {
+ struct sctp_log_entry entry[SCTP_MAX_LOGGING_SIZE];
+ uint32_t index;
+ uint32_t padding;
+};
+
+/*
+ * Kernel defined for sctp_send
+ */
+#if defined(_KERNEL) || defined(__Userspace__)
+int
+sctp_lower_sosend(struct socket *so,
+ struct sockaddr *addr,
+ struct uio *uio,
+ struct mbuf *i_pak,
+ struct mbuf *control,
+ int flags,
+ struct sctp_sndrcvinfo *srcv
+ ,struct thread *p
+);
+
+int
+sctp_sorecvmsg(struct socket *so,
+ struct uio *uio,
+ struct mbuf **mp,
+ struct sockaddr *from,
+ int fromlen,
+ int *msg_flags,
+ struct sctp_sndrcvinfo *sinfo,
+ int filling_sinfo);
+
+#endif
+
+/*
+ * API system calls
+ */
+#if !(defined(_KERNEL)) && !(defined(__Userspace__))
+
+__BEGIN_DECLS
+int sctp_peeloff __P((int, sctp_assoc_t));
+int sctp_bindx __P((int, struct sockaddr *, int, int));
+int sctp_connectx __P((int, const struct sockaddr *, int, sctp_assoc_t *));
+int sctp_getaddrlen __P((sa_family_t));
+int sctp_getpaddrs __P((int, sctp_assoc_t, struct sockaddr **));
+void sctp_freepaddrs __P((struct sockaddr *));
+int sctp_getladdrs __P((int, sctp_assoc_t, struct sockaddr **));
+void sctp_freeladdrs __P((struct sockaddr *));
+int sctp_opt_info __P((int, sctp_assoc_t, int, void *, socklen_t *));
+
+ssize_t sctp_sendmsg
+__P((int, const void *, size_t,
+ const struct sockaddr *,
+ socklen_t, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t));
+
+ ssize_t sctp_send __P((int sd, const void *msg, size_t len,
+ const struct sctp_sndrcvinfo *sinfo, int flags));
+
+ ssize_t sctp_sendx __P((int sd, const void *msg, size_t len,
+ struct sockaddr *addrs, int addrcnt,
+ struct sctp_sndrcvinfo *sinfo, int flags));
+
+ ssize_t sctp_sendmsgx __P((int sd, const void *, size_t,
+ struct sockaddr *, int,
+ uint32_t, uint32_t, uint16_t, uint32_t, uint32_t));
+
+ sctp_assoc_t sctp_getassocid __P((int sd, struct sockaddr *sa));
+
+ ssize_t sctp_recvmsg __P((int, void *, size_t, struct sockaddr *,
+ socklen_t *, struct sctp_sndrcvinfo *, int *));
+
+__END_DECLS
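+
+/*
+ * Example (a sketch; error handling omitted): sending one ordered message
+ * on stream 5 with payload protocol id "ppid" (network byte order) to a
+ * peer address "peer":
+ *
+ *	ssize_t n = sctp_sendmsg(sd, buf, len,
+ *	    (struct sockaddr *)&peer, sizeof(peer),
+ *	    htonl(ppid), 0, 5, 0, 0);
+ */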
+
+#endif /* !_KERNEL */
+#endif /* !__sctp_uio_h__ */
diff --git a/freebsd/sys/netinet/sctp_usrreq.c b/freebsd/sys/netinet/sctp_usrreq.c
new file mode 100644
index 00000000..bb60795c
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_usrreq.c
@@ -0,0 +1,4918 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_usrreq.c,v 1.48 2005/03/07 23:26:08 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctp_header.h>
+#include <freebsd/netinet/sctp_var.h>
+#if defined(INET6)
+#endif
+#include <freebsd/netinet/sctp_sysctl.h>
+#include <freebsd/netinet/sctp_output.h>
+#include <freebsd/netinet/sctp_uio.h>
+#include <freebsd/netinet/sctp_asconf.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_indata.h>
+#include <freebsd/netinet/sctp_timer.h>
+#include <freebsd/netinet/sctp_auth.h>
+#include <freebsd/netinet/sctp_bsd_addr.h>
+#include <freebsd/netinet/sctp_cc_functions.h>
+#include <freebsd/netinet/udp.h>
+
+
+
+
+void
+sctp_init(void)
+{
+ u_long sb_max_adj;
+
+ bzero(&SCTP_BASE_STATS, sizeof(struct sctpstat));
+
+ /* Initialize and modify the sysctled variables */
+ sctp_init_sysctls();
+ if ((nmbclusters / 8) > SCTP_ASOC_MAX_CHUNKS_ON_QUEUE)
+ SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue) = (nmbclusters / 8);
+ /*
+ * Allow a user to take no more than 1/2 the number of clusters or
+ * the SB_MAX whichever is smaller for the send window.
+ */
+ sb_max_adj = (u_long)((u_quad_t) (SB_MAX) * MCLBYTES / (MSIZE + MCLBYTES));
+ SCTP_BASE_SYSCTL(sctp_sendspace) = min(sb_max_adj,
+ (((uint32_t) nmbclusters / 2) * SCTP_DEFAULT_MAXSEGMENT));
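+ /*
+ * The MCLBYTES / (MSIZE + MCLBYTES) factor above estimates how much of the
+ * socket buffer limit can actually hold user data: every cluster of payload
+ * also costs one mbuf header, so only that fraction of SB_MAX is usable
+ * (the same adjustment other protocols apply to their send spaces).
+ */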
+ /*
+ * Now for the recv window, should we take the same amount? or
+ * should I do 1/2 the SB_MAX instead in the SB_MAX min above. For
+ * now I will just copy.
+ */
+ SCTP_BASE_SYSCTL(sctp_recvspace) = SCTP_BASE_SYSCTL(sctp_sendspace);
+
+ SCTP_BASE_VAR(first_time) = 0;
+ SCTP_BASE_VAR(sctp_pcb_initialized) = 0;
+ sctp_pcb_init();
+#if defined(SCTP_PACKET_LOGGING)
+ SCTP_BASE_VAR(packet_log_writers) = 0;
+ SCTP_BASE_VAR(packet_log_end) = 0;
+ bzero(&SCTP_BASE_VAR(packet_log_buffer), SCTP_PACKET_LOG_SIZE);
+#endif
+
+
+}
+
+void
+sctp_finish(void)
+{
+ sctp_pcb_finish();
+}
+
+
+
+void
+sctp_pathmtu_adjustment(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net,
+ uint16_t nxtsz)
+{
+ struct sctp_tmit_chunk *chk;
+ uint16_t overhead;
+
+ /* Adjust that too */
+ stcb->asoc.smallest_mtu = nxtsz;
+ /* now off to subtract IP_DF flag if needed */
+ overhead = IP_HDR_SIZE;
+ if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) {
+ overhead += sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
+ }
+ TAILQ_FOREACH(chk, &stcb->asoc.send_queue, sctp_next) {
+ if ((chk->send_size + overhead) > nxtsz) {
+ chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
+ }
+ }
+ TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
+ if ((chk->send_size + overhead) > nxtsz) {
+ /*
+ * For this chunk we also mark for immediate resend
+ * since we sent too big a chunk
+ */
+ chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
+ if (chk->sent < SCTP_DATAGRAM_RESEND) {
+ sctp_flight_size_decrease(chk);
+ sctp_total_flight_decrease(stcb, chk);
+ }
+ if (chk->sent != SCTP_DATAGRAM_RESEND) {
+ sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ }
+ chk->sent = SCTP_DATAGRAM_RESEND;
+ chk->rec.data.doing_fast_retransmit = 0;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PMTU,
+ chk->whoTo->flight_size,
+ chk->book_size,
+ (uintptr_t) chk->whoTo,
+ chk->rec.data.TSN_seq);
+ }
+ /* Clear any time so NO RTT is being done */
+ chk->do_rtt = 0;
+ }
+ }
+}
+
+static void
+sctp_notify_mbuf(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net,
+ struct ip *ip,
+ struct sctphdr *sh)
+{
+ struct icmp *icmph;
+ int totsz, tmr_stopped = 0;
+ uint16_t nxtsz;
+
+ /* protection */
+ if ((inp == NULL) || (stcb == NULL) || (net == NULL) ||
+ (ip == NULL) || (sh == NULL)) {
+ if (stcb != NULL) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ return;
+ }
+ /* First job is to verify the vtag matches what I would send */
+ if (ntohl(sh->v_tag) != (stcb->asoc.peer_vtag)) {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ icmph = (struct icmp *)((caddr_t)ip - (sizeof(struct icmp) -
+ sizeof(struct ip)));
+ if (icmph->icmp_type != ICMP_UNREACH) {
+ /* We only care about unreachable */
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ if (icmph->icmp_code != ICMP_UNREACH_NEEDFRAG) {
+ /* not an unreachable message due to fragmentation */
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ totsz = ip->ip_len;
+
+ nxtsz = ntohs(icmph->icmp_nextmtu);
+ if (nxtsz == 0) {
+ /*
+ * old-style router that does not tell us what the next
+ * MTU is, so we will have to guess (in an educated fashion,
+ * of course)
+ */
+ nxtsz = sctp_get_prev_mtu(totsz);
+ }
+ /* Stop any PMTU timer */
+ if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
+ tmr_stopped = 1;
+ sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_1);
+ }
+ /* Adjust destination size limit */
+ if (net->mtu > nxtsz) {
+ net->mtu = nxtsz;
+ if (net->port) {
+ net->mtu -= sizeof(struct udphdr);
+ }
+ }
+ /* now what about the ep? */
+ if (stcb->asoc.smallest_mtu > nxtsz) {
+ sctp_pathmtu_adjustment(inp, stcb, net, nxtsz);
+ }
+ if (tmr_stopped)
+ sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
+
+ SCTP_TCB_UNLOCK(stcb);
+}
+
+
+void
+sctp_notify(struct sctp_inpcb *inp,
+ struct ip *ip,
+ struct sctphdr *sh,
+ struct sockaddr *to,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+ /* protection */
+ int reason;
+ struct icmp *icmph;
+
+
+ if ((inp == NULL) || (stcb == NULL) || (net == NULL) ||
+ (sh == NULL) || (to == NULL)) {
+ if (stcb)
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ /* First job is to verify the vtag matches what I would send */
+ if (ntohl(sh->v_tag) != (stcb->asoc.peer_vtag)) {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ icmph = (struct icmp *)((caddr_t)ip - (sizeof(struct icmp) -
+ sizeof(struct ip)));
+ if (icmph->icmp_type != ICMP_UNREACH) {
+ /* We only care about unreachable */
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ if ((icmph->icmp_code == ICMP_UNREACH_NET) ||
+ (icmph->icmp_code == ICMP_UNREACH_HOST) ||
+ (icmph->icmp_code == ICMP_UNREACH_NET_UNKNOWN) ||
+ (icmph->icmp_code == ICMP_UNREACH_HOST_UNKNOWN) ||
+ (icmph->icmp_code == ICMP_UNREACH_ISOLATED) ||
+ (icmph->icmp_code == ICMP_UNREACH_NET_PROHIB) ||
+ (icmph->icmp_code == ICMP_UNREACH_HOST_PROHIB) ||
+ (icmph->icmp_code == ICMP_UNREACH_FILTER_PROHIB)) {
+
+ /*
+ * Hmm, reachability problems we must examine closely. If it is
+ * not reachable, we may have lost a network. Or if there is
+ * NO protocol at the other end named SCTP, we consider
+ * it an OOTB abort.
+ */
+ if (net->dest_state & SCTP_ADDR_REACHABLE) {
+ /* Ok that destination is NOT reachable */
+ SCTP_PRINTF("ICMP (thresh %d/%d) takes interface %p down\n",
+ net->error_count,
+ net->failure_threshold,
+ net);
+
+ net->dest_state &= ~SCTP_ADDR_REACHABLE;
+ net->dest_state |= SCTP_ADDR_NOT_REACHABLE;
+ /*
+ * JRS 5/14/07 - If a destination is unreachable,
+ * the PF bit is turned off. This allows an
+ * unambiguous use of the PF bit for destinations
+ * that are reachable but potentially failed. If the
+ * destination is set to the unreachable state, also
+ * set the destination to the PF state.
+ */
+ /*
+ * Add debug message here if destination is not in
+ * PF state.
+ */
+ /* Stop any running T3 timers here? */
+ if ((stcb->asoc.sctp_cmt_on_off == 1) &&
+ (stcb->asoc.sctp_cmt_pf > 0)) {
+ net->dest_state &= ~SCTP_ADDR_PF;
+ SCTPDBG(SCTP_DEBUG_TIMER4, "Destination %p moved from PF to unreachable.\n",
+ net);
+ }
+ net->error_count = net->failure_threshold + 1;
+ sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN,
+ stcb, SCTP_FAILED_THRESHOLD,
+ (void *)net, SCTP_SO_NOT_LOCKED);
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else if ((icmph->icmp_code == ICMP_UNREACH_PROTOCOL) ||
+ (icmph->icmp_code == ICMP_UNREACH_PORT)) {
+ /*
+ * Here the peer is either playing tricks on us (including
+ * using an address that belongs to someone who does not
+ * support SCTP), or it was a userland implementation that
+ * shut down and is now dead. In either case treat it like
+ * an OOTB abort with no TCB.
+ */
+ reason = SCTP_PEER_FAULTY;
+ sctp_abort_notification(stcb, reason, SCTP_SO_NOT_LOCKED);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(inp);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+#endif
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_2);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+ /* SCTP_TCB_UNLOCK(stcb); MT: I think this is not needed. */
+#endif
+ /* no need to unlock here, since the TCB is gone */
+ } else {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+}
+
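+/*
+ * Control-input handler called by the IP layer for ICMP errors: look up
+ * the association for the offending packet and dispatch to sctp_notify()
+ * or, for PRC_MSGSIZE, to sctp_notify_mbuf().
+ */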
+void
+sctp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
+{
+ struct ip *ip = vip;
+ struct sctphdr *sh;
+ uint32_t vrf_id;
+
+ /* FIX, for non-bsd is this right? */
+ vrf_id = SCTP_DEFAULT_VRFID;
+ if (sa->sa_family != AF_INET ||
+ ((struct sockaddr_in *)sa)->sin_addr.s_addr == INADDR_ANY) {
+ return;
+ }
+ if (PRC_IS_REDIRECT(cmd)) {
+ ip = 0;
+ } else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) {
+ return;
+ }
+ if (ip) {
+ struct sctp_inpcb *inp = NULL;
+ struct sctp_tcb *stcb = NULL;
+ struct sctp_nets *net = NULL;
+ struct sockaddr_in to, from;
+
+ sh = (struct sctphdr *)((caddr_t)ip + (ip->ip_hl << 2));
+ bzero(&to, sizeof(to));
+ bzero(&from, sizeof(from));
+ from.sin_family = to.sin_family = AF_INET;
+ from.sin_len = to.sin_len = sizeof(to);
+ from.sin_port = sh->src_port;
+ from.sin_addr = ip->ip_src;
+ to.sin_port = sh->dest_port;
+ to.sin_addr = ip->ip_dst;
+
+ /*
+ * 'to' holds the dest of the packet that failed to be sent.
+ * 'from' holds our local endpoint address. Thus we reverse
+ * the to and the from in the lookup.
+ */
+ stcb = sctp_findassociation_addr_sa((struct sockaddr *)&from,
+ (struct sockaddr *)&to,
+ &inp, &net, 1, vrf_id);
+ if (stcb != NULL && inp && (inp->sctp_socket != NULL)) {
+ if (cmd != PRC_MSGSIZE) {
+ sctp_notify(inp, ip, sh,
+ (struct sockaddr *)&to, stcb,
+ net);
+ } else {
+ /* handle possible ICMP size messages */
+ sctp_notify_mbuf(inp, stcb, net, ip, sh);
+ }
+ } else {
+ if ((stcb == NULL) && (inp != NULL)) {
+ /* reduce ref-count */
+ SCTP_INP_WLOCK(inp);
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ }
+ if (stcb) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ }
+ }
+ return;
+}
+
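+/*
+ * Sysctl helper: given a local/remote address pair, look up the matching
+ * association and return the credentials of the owning socket.
+ */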
+static int
+sctp_getcred(SYSCTL_HANDLER_ARGS)
+{
+ struct xucred xuc;
+ struct sockaddr_in addrs[2];
+ struct sctp_inpcb *inp;
+ struct sctp_nets *net;
+ struct sctp_tcb *stcb;
+ int error;
+ uint32_t vrf_id;
+
+ /* FIX, for non-bsd is this right? */
+ vrf_id = SCTP_DEFAULT_VRFID;
+
+ error = priv_check(req->td, PRIV_NETINET_GETCRED);
+
+ if (error)
+ return (error);
+
+ error = SYSCTL_IN(req, addrs, sizeof(addrs));
+ if (error)
+ return (error);
+
+ stcb = sctp_findassociation_addr_sa(sintosa(&addrs[0]),
+ sintosa(&addrs[1]),
+ &inp, &net, 1, vrf_id);
+ if (stcb == NULL || inp == NULL || inp->sctp_socket == NULL) {
+ if ((inp != NULL) && (stcb == NULL)) {
+ /* reduce ref-count */
+ SCTP_INP_WLOCK(inp);
+ SCTP_INP_DECR_REF(inp);
+ goto cred_can_cont;
+ }
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
+ error = ENOENT;
+ goto out;
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ /*
+ * We use the write lock here only because the error leg needs it.
+ * If we used RLOCK, then we would have to
+ * wlock/decr/unlock/rlock, which in theory could create a hole.
+ * Better to take the stronger wlock up front.
+ */
+ SCTP_INP_WLOCK(inp);
+cred_can_cont:
+ error = cr_canseesocket(req->td->td_ucred, inp->sctp_socket);
+ if (error) {
+ SCTP_INP_WUNLOCK(inp);
+ goto out;
+ }
+ cru2x(inp->sctp_socket->so_cred, &xuc);
+ SCTP_INP_WUNLOCK(inp);
+ error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
+out:
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW,
+ 0, 0, sctp_getcred, "S,ucred", "Get the ucred of a SCTP connection");
+
+
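+/*
+ * Abort handling for the socket: free the endpoint with an ABORT, clear
+ * the send and receive buffers, and detach the PCB from the socket.
+ */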
+static void
+sctp_abort(struct socket *so)
+{
+ struct sctp_inpcb *inp;
+ uint32_t flags;
+
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == 0) {
+ return;
+ }
+sctp_must_try_again:
+ flags = inp->sctp_flags;
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 17);
+#endif
+ if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) &&
+ (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) {
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 16);
+#endif
+ sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
+ SCTP_CALLED_AFTER_CMPSET_OFCLOSE);
+ SOCK_LOCK(so);
+ SCTP_SB_CLEAR(so->so_snd);
+ /*
+ * same for the rcv ones, they are only here for the
+ * accounting/select.
+ */
+ SCTP_SB_CLEAR(so->so_rcv);
+
+ /* Now null out the reference, we are completely detached. */
+ so->so_pcb = NULL;
+ SOCK_UNLOCK(so);
+ } else {
+ flags = inp->sctp_flags;
+ if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) {
+ goto sctp_must_try_again;
+ }
+ }
+ return;
+}
+
+static int
+sctp_attach(struct socket *so, int proto, struct thread *p)
+{
+ struct sctp_inpcb *inp;
+ struct inpcb *ip_inp;
+ int error;
+ uint32_t vrf_id = SCTP_DEFAULT_VRFID;
+
+#ifdef IPSEC
+ uint32_t flags;
+
+#endif
+
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp != 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return EINVAL;
+ }
+ if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+ error = SCTP_SORESERVE(so, SCTP_BASE_SYSCTL(sctp_sendspace), SCTP_BASE_SYSCTL(sctp_recvspace));
+ if (error) {
+ return error;
+ }
+ }
+ error = sctp_inpcb_alloc(so, vrf_id);
+ if (error) {
+ return error;
+ }
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ SCTP_INP_WLOCK(inp);
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_BOUND_V6; /* I'm not v6! */
+ ip_inp = &inp->ip_inp.inp;
+ ip_inp->inp_vflag |= INP_IPV4;
+ ip_inp->inp_ip_ttl = MODULE_GLOBAL(ip_defttl);
+#ifdef IPSEC
+ error = ipsec_init_policy(so, &ip_inp->inp_sp);
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 17);
+#endif
+ if (error != 0) {
+try_again:
+ flags = inp->sctp_flags;
+ if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) &&
+ (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) {
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 15);
+#endif
+ SCTP_INP_WUNLOCK(inp);
+ sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
+ SCTP_CALLED_AFTER_CMPSET_OFCLOSE);
+ } else {
+ flags = inp->sctp_flags;
+ if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) {
+ goto try_again;
+ } else {
+ SCTP_INP_WUNLOCK(inp);
+ }
+ }
+ return error;
+ }
+#endif /* IPSEC */
+ SCTP_INP_WUNLOCK(inp);
+ return 0;
+}
+
+static int
+sctp_bind(struct socket *so, struct sockaddr *addr, struct thread *p)
+{
+ struct sctp_inpcb *inp = NULL;
+ int error;
+
+#ifdef INET6
+ if (addr && addr->sa_family != AF_INET) {
+ /* must be a v4 address! */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return EINVAL;
+ }
+#endif /* INET6 */
+ if (addr && (addr->sa_len != sizeof(struct sockaddr_in))) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return EINVAL;
+ }
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return EINVAL;
+ }
+ error = sctp_inpcb_bind(so, addr, NULL, p);
+ return error;
+}
+
+void
+sctp_close(struct socket *so)
+{
+ struct sctp_inpcb *inp;
+ uint32_t flags;
+
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == 0)
+ return;
+
+ /*
+ * Inform all the lower layer assoc that we are done.
+ */
+sctp_must_try_again:
+ flags = inp->sctp_flags;
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 17);
+#endif
+ if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) &&
+ (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) {
+ if (((so->so_options & SO_LINGER) && (so->so_linger == 0)) ||
+ (so->so_rcv.sb_cc > 0)) {
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 13);
+#endif
+ sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
+ SCTP_CALLED_AFTER_CMPSET_OFCLOSE);
+ } else {
+#ifdef SCTP_LOG_CLOSING
+ sctp_log_closing(inp, NULL, 14);
+#endif
+ sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE,
+ SCTP_CALLED_AFTER_CMPSET_OFCLOSE);
+ }
+ /*
+ * The socket is now detached, no matter what the state of
+ * the SCTP association.
+ */
+ SOCK_LOCK(so);
+ SCTP_SB_CLEAR(so->so_snd);
+ /*
+ * same for the rcv ones, they are only here for the
+ * accounting/select.
+ */
+ SCTP_SB_CLEAR(so->so_rcv);
+
+ /* Now null out the reference, we are completely detached. */
+ so->so_pcb = NULL;
+ SOCK_UNLOCK(so);
+ } else {
+ flags = inp->sctp_flags;
+ if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) {
+ goto sctp_must_try_again;
+ }
+ }
+ return;
+}
+
+
+int
+sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
+ struct mbuf *control, struct thread *p);
+
+
+int
+sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
+ struct mbuf *control, struct thread *p)
+{
+ struct sctp_inpcb *inp;
+ int error;
+
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == 0) {
+ if (control) {
+ sctp_m_freem(control);
+ control = NULL;
+ }
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ sctp_m_freem(m);
+ return EINVAL;
+ }
+ /* Must have a destination address if we are NOT a connected socket */
+ if ((addr == NULL) &&
+ ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE))
+ ) {
+ goto connected_type;
+ } else if (addr == NULL) {
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ);
+ error = EDESTADDRREQ;
+ sctp_m_freem(m);
+ if (control) {
+ sctp_m_freem(control);
+ control = NULL;
+ }
+ return (error);
+ }
+#ifdef INET6
+ if (addr->sa_family != AF_INET) {
+ /* must be a v4 address! */
+ SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ);
+ sctp_m_freem(m);
+ if (control) {
+ sctp_m_freem(control);
+ control = NULL;
+ }
+ error = EDESTADDRREQ;
+ return EDESTADDRREQ;
+ }
+#endif /* INET6 */
+connected_type:
+ /* now what about control */
+ if (control) {
+ if (inp->control) {
+ SCTP_PRINTF("huh? control set?\n");
+ sctp_m_freem(inp->control);
+ inp->control = NULL;
+ }
+ inp->control = control;
+ }
+ /* Place the data */
+ if (inp->pkt) {
+ SCTP_BUF_NEXT(inp->pkt_last) = m;
+ inp->pkt_last = m;
+ } else {
+ inp->pkt_last = inp->pkt = m;
+ }
+ if (
+ /* FreeBSD uses a flag passed */
+ ((flags & PRUS_MORETOCOME) == 0)
+ ) {
+ /*
+ * Note: with the current version this code will only be used
+ * by OpenBSD -- NetBSD, FreeBSD, and MacOS have methods for
+ * redefining sosend to use sctp_sosend. One can optionally
+ * switch back to this code (by changing back the
+ * definitions), but this is not advisable. FreeBSD does use
+ * this code when sending a file with sendfile(), though.
+ */
+ int ret;
+
+ ret = sctp_output(inp, inp->pkt, addr, inp->control, p, flags);
+ inp->pkt = NULL;
+ inp->control = NULL;
+ return (ret);
+ } else {
+ return (0);
+ }
+}
+
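+/*
+ * Disconnect a TCP-model socket. With nothing queued the association is
+ * shut down gracefully; unread data or SO_LINGER with a zero timeout
+ * forces an ABORT instead, and pending outbound data moves the
+ * association to SHUTDOWN-PENDING. The UDP model does not support this.
+ */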
+int
+sctp_disconnect(struct socket *so)
+{
+ struct sctp_inpcb *inp;
+
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN);
+ return (ENOTCONN);
+ }
+ SCTP_INP_RLOCK(inp);
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ if (LIST_EMPTY(&inp->sctp_asoc_list)) {
+ /* No connection */
+ SCTP_INP_RUNLOCK(inp);
+ return (0);
+ } else {
+ struct sctp_association *asoc;
+ struct sctp_tcb *stcb;
+
+ stcb = LIST_FIRST(&inp->sctp_asoc_list);
+ if (stcb == NULL) {
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ SCTP_TCB_LOCK(stcb);
+ asoc = &stcb->asoc;
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ /* We are about to be freed, out of here */
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_INP_RUNLOCK(inp);
+ return (0);
+ }
+ if (((so->so_options & SO_LINGER) &&
+ (so->so_linger == 0)) ||
+ (so->so_rcv.sb_cc > 0)) {
+ if (SCTP_GET_STATE(asoc) !=
+ SCTP_STATE_COOKIE_WAIT) {
+ /* Left with Data unread */
+ struct mbuf *err;
+
+ err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_DONTWAIT, 1, MT_DATA);
+ if (err) {
+ /*
+ * Fill in the user
+ * initiated abort
+ */
+ struct sctp_paramhdr *ph;
+
+ ph = mtod(err, struct sctp_paramhdr *);
+ SCTP_BUF_LEN(err) = sizeof(struct sctp_paramhdr);
+ ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT);
+ ph->param_length = htons(SCTP_BUF_LEN(err));
+ }
+#if defined(SCTP_PANIC_ON_ABORT)
+ panic("disconnect does an abort");
+#endif
+ sctp_send_abort_tcb(stcb, err, SCTP_SO_LOCKED);
+ SCTP_STAT_INCR_COUNTER32(sctps_aborted);
+ }
+ SCTP_INP_RUNLOCK(inp);
+ if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_3);
+ /* No unlock tcb assoc is gone */
+ return (0);
+ }
+ if (TAILQ_EMPTY(&asoc->send_queue) &&
+ TAILQ_EMPTY(&asoc->sent_queue) &&
+ (asoc->stream_queue_cnt == 0)) {
+ /* there is nothing queued to send, so done */
+ if (asoc->locked_on_sending) {
+ goto abort_anyway;
+ }
+ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
+ /* only send SHUTDOWN 1st time thru */
+ sctp_stop_timers_for_shutdown(stcb);
+ sctp_send_shutdown(stcb,
+ stcb->asoc.primary_destination);
+ sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_LOCKED);
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT);
+ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN,
+ stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
+ stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ }
+ } else {
+ /*
+ * we still got (or just got) data to send,
+ * so set SHUTDOWN_PENDING
+ */
+ /*
+ * XXX The sockets draft says that SCTP_EOF
+ * should be sent with no data. Currently,
+ * we allow user data to be sent first and
+ * then move to SHUTDOWN-PENDING.
+ */
+ asoc->state |= SCTP_STATE_SHUTDOWN_PENDING;
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ if (asoc->locked_on_sending) {
+ /* Locked to send out the data */
+ struct sctp_stream_queue_pending *sp;
+
+ sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead);
+ if (sp == NULL) {
+ SCTP_PRINTF("Error, sp is NULL, locked on sending is non-null strm:%d\n",
+ asoc->locked_on_sending->stream_no);
+ } else {
+ if ((sp->length == 0) && (sp->msg_is_complete == 0))
+ asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
+ }
+ }
+ if (TAILQ_EMPTY(&asoc->send_queue) &&
+ TAILQ_EMPTY(&asoc->sent_queue) &&
+ (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) {
+ struct mbuf *op_err;
+
+ abort_anyway:
+ op_err = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (op_err) {
+ /*
+ * Fill in the user
+ * initiated abort
+ */
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(op_err) =
+ (sizeof(struct sctp_paramhdr) + sizeof(uint32_t));
+ ph = mtod(op_err,
+ struct sctp_paramhdr *);
+ ph->param_type = htons(
+ SCTP_CAUSE_USER_INITIATED_ABT);
+ ph->param_length = htons(SCTP_BUF_LEN(op_err));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_USRREQ + SCTP_LOC_4);
+ }
+#if defined(SCTP_PANIC_ON_ABORT)
+ panic("disconnect does an abort");
+#endif
+
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_4;
+ sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED);
+ SCTP_STAT_INCR_COUNTER32(sctps_aborted);
+ if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ SCTP_INP_RUNLOCK(inp);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_5);
+ return (0);
+ } else {
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED);
+ }
+ }
+ soisdisconnecting(so);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_INP_RUNLOCK(inp);
+ return (0);
+ }
+ /* not reached */
+ } else {
+ /* UDP model does not support this */
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
+ return EOPNOTSUPP;
+ }
+}
+
+int
+sctp_flush(struct socket *so, int how)
+{
+ /*
+ * We will just clear out the values and let subsequent close clear
+ * out the data, if any. Note if the user did a shutdown(SHUT_RD)
+ * they will not be able to read the data, the socket will block
+ * that from happening.
+ */
+ struct sctp_inpcb *inp;
+
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return EINVAL;
+ }
+ SCTP_INP_RLOCK(inp);
+ /* For the 1 to many model this does nothing */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) {
+ SCTP_INP_RUNLOCK(inp);
+ return (0);
+ }
+ SCTP_INP_RUNLOCK(inp);
+ if ((how == PRU_FLUSH_RD) || (how == PRU_FLUSH_RDWR)) {
+ /*
+ * First make sure the socket buffer will be consistent; we
+ * don't use these fields except maybe the count.
+ */
+ SCTP_INP_WLOCK(inp);
+ SCTP_INP_READ_LOCK(inp);
+ inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_CANT_READ;
+ SCTP_INP_READ_UNLOCK(inp);
+ SCTP_INP_WUNLOCK(inp);
+ so->so_rcv.sb_cc = 0;
+ so->so_rcv.sb_mbcnt = 0;
+ so->so_rcv.sb_mb = NULL;
+ }
+ if ((how == PRU_FLUSH_WR) || (how == PRU_FLUSH_RDWR)) {
+ /*
+ * First make sure the socket buffer will be consistent; we
+ * don't use these fields except maybe the count.
+ */
+ so->so_snd.sb_cc = 0;
+ so->so_snd.sb_mbcnt = 0;
+ so->so_snd.sb_mb = NULL;
+
+ }
+ return (0);
+}
+
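+/*
+ * shutdown(2) handling: rejected for the UDP model (the receive side is
+ * re-enabled and EOPNOTSUPP returned). For the TCP model a SHUTDOWN is
+ * sent once the queues drain, otherwise SHUTDOWN-PENDING is set; a
+ * partially sent message forces an ABORT instead.
+ */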
+int
+sctp_shutdown(struct socket *so)
+{
+ struct sctp_inpcb *inp;
+
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return EINVAL;
+ }
+ SCTP_INP_RLOCK(inp);
+ /* For the UDP model this is an invalid call */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) {
+ /* Restore the flags that the soshutdown took away. */
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_state &= ~SBS_CANTRCVMORE;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ /* This proc will wakeup for read and do nothing (I hope) */
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
+ return (EOPNOTSUPP);
+ }
+ /*
+ * OK, if we reach here it's the TCP model and it is either a SHUT_WR
+ * or SHUT_RDWR. This means we put the shutdown flag against it.
+ */
+ {
+ struct sctp_tcb *stcb;
+ struct sctp_association *asoc;
+
+ if ((so->so_state &
+ (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
+ SCTP_INP_RUNLOCK(inp);
+ return (ENOTCONN);
+ }
+ socantsendmore(so);
+
+ stcb = LIST_FIRST(&inp->sctp_asoc_list);
+ if (stcb == NULL) {
+ /*
+ * Ok we hit the case that the shutdown call was
+ * made after an abort or something. Nothing to do
+ * now.
+ */
+ SCTP_INP_RUNLOCK(inp);
+ return (0);
+ }
+ SCTP_TCB_LOCK(stcb);
+ asoc = &stcb->asoc;
+ if (TAILQ_EMPTY(&asoc->send_queue) &&
+ TAILQ_EMPTY(&asoc->sent_queue) &&
+ (asoc->stream_queue_cnt == 0)) {
+ if (asoc->locked_on_sending) {
+ goto abort_anyway;
+ }
+ /* there is nothing queued to send, so I'm done... */
+ if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) {
+ /* only send SHUTDOWN the first time through */
+ sctp_stop_timers_for_shutdown(stcb);
+ sctp_send_shutdown(stcb,
+ stcb->asoc.primary_destination);
+ sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_LOCKED);
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT);
+ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN,
+ stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
+ stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+ }
+ } else {
+ /*
+ * we still got (or just got) data to send, so set
+ * SHUTDOWN_PENDING
+ */
+ asoc->state |= SCTP_STATE_SHUTDOWN_PENDING;
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb,
+ asoc->primary_destination);
+
+ if (asoc->locked_on_sending) {
+ /* Locked to send out the data */
+ struct sctp_stream_queue_pending *sp;
+
+ sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead);
+ if (sp == NULL) {
+ SCTP_PRINTF("Error, sp is NULL, locked on sending is non-null strm:%d\n",
+ asoc->locked_on_sending->stream_no);
+ } else {
+ if ((sp->length == 0) && (sp->msg_is_complete == 0)) {
+ asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
+ }
+ }
+ }
+ if (TAILQ_EMPTY(&asoc->send_queue) &&
+ TAILQ_EMPTY(&asoc->sent_queue) &&
+ (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) {
+ struct mbuf *op_err;
+
+ abort_anyway:
+ op_err = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+ 0, M_DONTWAIT, 1, MT_DATA);
+ if (op_err) {
+ /* Fill in the user initiated abort */
+ struct sctp_paramhdr *ph;
+ uint32_t *ippp;
+
+ SCTP_BUF_LEN(op_err) =
+ sizeof(struct sctp_paramhdr) + sizeof(uint32_t);
+ ph = mtod(op_err,
+ struct sctp_paramhdr *);
+ ph->param_type = htons(
+ SCTP_CAUSE_USER_INITIATED_ABT);
+ ph->param_length = htons(SCTP_BUF_LEN(op_err));
+ ippp = (uint32_t *) (ph + 1);
+ *ippp = htonl(SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6);
+ }
+#if defined(SCTP_PANIC_ON_ABORT)
+ panic("shutdown does an abort");
+#endif
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6;
+ sctp_abort_an_association(stcb->sctp_ep, stcb,
+ SCTP_RESPONSE_TO_USER_REQ,
+ op_err, SCTP_SO_LOCKED);
+ goto skip_unlock;
+ } else {
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED);
+ }
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ }
+skip_unlock:
+ SCTP_INP_RUNLOCK(inp);
+ return 0;
+}
+
+/*
+ * copies a "user"-presentable address and removes embedded scope, etc.
+ * returns 0 on success, 1 on error
+ */
+static uint32_t
+sctp_fill_user_address(struct sockaddr_storage *ss, struct sockaddr *sa)
+{
+#ifdef INET6
+ struct sockaddr_in6 lsa6;
+
+ sa = (struct sockaddr *)sctp_recover_scope((struct sockaddr_in6 *)sa,
+ &lsa6);
+#endif
+ memcpy(ss, sa, sa->sa_len);
+ return (0);
+}
+
+
+
+/*
+ * NOTE: assumes addr lock is held
+ */
+static size_t
+sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ size_t limit,
+ struct sockaddr_storage *sas,
+ uint32_t vrf_id)
+{
+ struct sctp_ifn *sctp_ifn;
+ struct sctp_ifa *sctp_ifa;
+ int loopback_scope, ipv4_local_scope, local_scope, site_scope;
+ size_t actual;
+ int ipv4_addr_legal, ipv6_addr_legal;
+ struct sctp_vrf *vrf;
+
+ actual = 0;
+ if (limit <= 0)
+ return (actual);
+
+ if (stcb) {
+ /* Turn on all the appropriate scope */
+ loopback_scope = stcb->asoc.loopback_scope;
+ ipv4_local_scope = stcb->asoc.ipv4_local_scope;
+ local_scope = stcb->asoc.local_scope;
+ site_scope = stcb->asoc.site_scope;
+ } else {
+ /* Turn on ALL scope, since we look at the EP */
+ loopback_scope = ipv4_local_scope = local_scope =
+ site_scope = 1;
+ }
+ ipv4_addr_legal = ipv6_addr_legal = 0;
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ ipv6_addr_legal = 1;
+ if (SCTP_IPV6_V6ONLY(inp) == 0) {
+ ipv4_addr_legal = 1;
+ }
+ } else {
+ ipv4_addr_legal = 1;
+ }
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL) {
+ return (0);
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
+ if ((loopback_scope == 0) &&
+ SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
+ /* Skip loopback if loopback_scope not set */
+ continue;
+ }
+ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+ if (stcb) {
+ /*
+ * For the BOUND-ALL case, the list
+ * associated with a TCB is always
+ * considered a reverse list, i.e.
+ * it lists addresses that are NOT
+ * part of the association. If this
+ * is one of those we must skip it.
+ */
+ if (sctp_is_addr_restricted(stcb,
+ sctp_ifa)) {
+ continue;
+ }
+ }
+ switch (sctp_ifa->address.sa.sa_family) {
+ case AF_INET:
+ if (ipv4_addr_legal) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&sctp_ifa->address.sa;
+ if (sin->sin_addr.s_addr == 0) {
+ /* we skip unspecified addresses */
+ continue;
+ }
+ if ((ipv4_local_scope == 0) &&
+ (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) {
+ continue;
+ }
+#ifdef INET6
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
+ in6_sin_2_v4mapsin6(sin, (struct sockaddr_in6 *)sas);
+ ((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport;
+ sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(struct sockaddr_in6));
+ actual += sizeof(struct sockaddr_in6);
+ } else {
+#endif
+ memcpy(sas, sin, sizeof(*sin));
+ ((struct sockaddr_in *)sas)->sin_port = inp->sctp_lport;
+ sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(*sin));
+ actual += sizeof(*sin);
+#ifdef INET6
+ }
+#endif
+ if (actual >= limit) {
+ return (actual);
+ }
+ } else {
+ continue;
+ }
+ break;
+#ifdef INET6
+ case AF_INET6:
+ if (ipv6_addr_legal) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa;
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+ /* we skip unspecified addresses */
+ continue;
+ }
+ if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+ if (local_scope == 0)
+ continue;
+ if (sin6->sin6_scope_id == 0) {
+ if (sa6_recoverscope(sin6) != 0)
+ /* bad link-local address */
+ continue;
+ }
+ }
+ if ((site_scope == 0) &&
+ (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) {
+ continue;
+ }
+ memcpy(sas, sin6, sizeof(*sin6));
+ ((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport;
+ sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(*sin6));
+ actual += sizeof(*sin6);
+ if (actual >= limit) {
+ return (actual);
+ }
+ } else {
+ continue;
+ }
+ break;
+#endif
+ default:
+ /* TSNH */
+ break;
+ }
+ }
+ }
+ } else {
+ struct sctp_laddr *laddr;
+
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ if (stcb) {
+ if (sctp_is_addr_restricted(stcb, laddr->ifa)) {
+ continue;
+ }
+ }
+ if (sctp_fill_user_address(sas, &laddr->ifa->address.sa))
+ continue;
+
+ ((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport;
+ sas = (struct sockaddr_storage *)((caddr_t)sas +
+ laddr->ifa->address.sa.sa_len);
+ actual += laddr->ifa->address.sa.sa_len;
+ if (actual >= limit) {
+ return (actual);
+ }
+ }
+ }
+ return (actual);
+}
+
+static size_t
+sctp_fill_up_addresses(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ size_t limit,
+ struct sockaddr_storage *sas)
+{
+ size_t size = 0;
+
+ SCTP_IPI_ADDR_RLOCK();
+ /* fill up addresses for the endpoint's default vrf */
+ size = sctp_fill_up_addresses_vrf(inp, stcb, limit, sas,
+ inp->def_vrf_id);
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (size);
+}
+
+/*
+ * NOTE: assumes addr lock is held
+ */
+static int
+sctp_count_max_addresses_vrf(struct sctp_inpcb *inp, uint32_t vrf_id)
+{
+ int cnt = 0;
+ struct sctp_vrf *vrf = NULL;
+
+ /*
+ * In both the subset-bound and bound-all cases we return the MAXIMUM
+ * number of addresses that you COULD get. In reality the subset-bound
+ * endpoint may have an exclusion list for a given TCB, OR in the
+ * bound-all case a TCB may NOT include the loopback or other
+ * addresses as well.
+ */
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL) {
+ return (0);
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ struct sctp_ifn *sctp_ifn;
+ struct sctp_ifa *sctp_ifa;
+
+ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
+ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+ /* Count them if they are the right type */
+ if (sctp_ifa->address.sa.sa_family == AF_INET) {
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4))
+ cnt += sizeof(struct sockaddr_in6);
+ else
+ cnt += sizeof(struct sockaddr_in);
+
+ } else if (sctp_ifa->address.sa.sa_family == AF_INET6)
+ cnt += sizeof(struct sockaddr_in6);
+ }
+ }
+ } else {
+ struct sctp_laddr *laddr;
+
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ if (laddr->ifa->address.sa.sa_family == AF_INET) {
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4))
+ cnt += sizeof(struct sockaddr_in6);
+ else
+ cnt += sizeof(struct sockaddr_in);
+
+ } else if (laddr->ifa->address.sa.sa_family == AF_INET6)
+ cnt += sizeof(struct sockaddr_in6);
+ }
+ }
+ return (cnt);
+}
+
+static int
+sctp_count_max_addresses(struct sctp_inpcb *inp)
+{
+ int cnt = 0;
+
+ SCTP_IPI_ADDR_RLOCK();
+ /* count addresses for the endpoint's default VRF */
+ cnt = sctp_count_max_addresses_vrf(inp, inp->def_vrf_id);
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (cnt);
+}
+
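+/*
+ * Worker for the connectx() call path: parse the packed address list
+ * handed in through optval, create the association, send (or delay) the
+ * INIT, and return the new association id back through optval.
+ */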
+static int
+sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval,
+ size_t optsize, void *p, int delay)
+{
+ int error = 0;
+ int creat_lock_on = 0;
+ struct sctp_tcb *stcb = NULL;
+ struct sockaddr *sa;
+ int num_v6 = 0, num_v4 = 0, *totaddrp, totaddr;
+ int added = 0;
+ uint32_t vrf_id;
+ int bad_addresses = 0;
+ sctp_assoc_t *a_id;
+
+ SCTPDBG(SCTP_DEBUG_PCB1, "Connectx called\n");
+
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
+ (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) {
+ /* We are already connected AND the TCP model */
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE);
+ return (EADDRINUSE);
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) &&
+ (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE))) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
+ SCTP_INP_RLOCK(inp);
+ stcb = LIST_FIRST(&inp->sctp_asoc_list);
+ SCTP_INP_RUNLOCK(inp);
+ }
+ if (stcb) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
+ return (EALREADY);
+ }
+ SCTP_INP_INCR_REF(inp);
+ SCTP_ASOC_CREATE_LOCK(inp);
+ creat_lock_on = 1;
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EFAULT);
+ error = EFAULT;
+ goto out_now;
+ }
+ totaddrp = (int *)optval;
+ totaddr = *totaddrp;
+ sa = (struct sockaddr *)(totaddrp + 1);
+ stcb = sctp_connectx_helper_find(inp, sa, &totaddr, &num_v4, &num_v6, &error, (optsize - sizeof(int)), &bad_addresses);
+ if ((stcb != NULL) || bad_addresses) {
+ /* Already have or am bringing up an association */
+ SCTP_ASOC_CREATE_UNLOCK(inp);
+ creat_lock_on = 0;
+ if (stcb)
+ SCTP_TCB_UNLOCK(stcb);
+ if (bad_addresses == 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
+ error = EALREADY;
+ }
+ goto out_now;
+ }
+#ifdef INET6
+ if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) &&
+ (num_v6 > 0)) {
+ error = EINVAL;
+ goto out_now;
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+ (num_v4 > 0)) {
+ struct in6pcb *inp6;
+
+ inp6 = (struct in6pcb *)inp;
+ if (SCTP_IPV6_V6ONLY(inp6)) {
+ /*
+ * if IPV6_V6ONLY flag, ignore connections destined
+ * to a v4 addr or v4-mapped addr
+ */
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ goto out_now;
+ }
+ }
+#endif /* INET6 */
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) ==
+ SCTP_PCB_FLAGS_UNBOUND) {
+ /* Bind an ephemeral port */
+ error = sctp_inpcb_bind(so, NULL, NULL, p);
+ if (error) {
+ goto out_now;
+ }
+ }
+ /* FIX ME: do we want to pass in a vrf on the connect call? */
+ vrf_id = inp->def_vrf_id;
+
+
+ /* We are GOOD to go */
+ stcb = sctp_aloc_assoc(inp, sa, &error, 0, vrf_id,
+ (struct thread *)p
+ );
+ if (stcb == NULL) {
+ /* Gak! no memory */
+ goto out_now;
+ }
+ SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT);
+ /* move to second address */
+ if (sa->sa_family == AF_INET)
+ sa = (struct sockaddr *)((caddr_t)sa + sizeof(struct sockaddr_in));
+ else
+ sa = (struct sockaddr *)((caddr_t)sa + sizeof(struct sockaddr_in6));
+
+ error = 0;
+ added = sctp_connectx_helper_add(stcb, sa, (totaddr - 1), &error);
+ /* Fill in the return id */
+ if (error) {
+ (void)sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6);
+ goto out_now;
+ }
+ a_id = (sctp_assoc_t *) optval;
+ *a_id = sctp_get_associd(stcb);
+
+ /* initialize authentication parameters for the assoc */
+ sctp_initialize_auth_params(inp, stcb);
+
+ if (delay) {
+ /* doing delayed connection */
+ stcb->asoc.delayed_connection = 1;
+ sctp_timer_start(SCTP_TIMER_TYPE_INIT, inp, stcb, stcb->asoc.primary_destination);
+ } else {
+ (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
+ sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) {
+ stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED;
+ /* Set the connected flag so we can queue data */
+ soisconnecting(so);
+ }
+out_now:
+ if (creat_lock_on) {
+ SCTP_ASOC_CREATE_UNLOCK(inp);
+ }
+ SCTP_INP_DECR_REF(inp);
+ return error;
+}
+
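+/*
+ * Locate and lock the TCB a socket option refers to: TCP-model sockets
+ * use their single association, otherwise the association id is looked
+ * up; an id of zero leaves stcb NULL for endpoint-level options.
+ */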
+#define SCTP_FIND_STCB(inp, stcb, assoc_id) { \
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||\
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { \
+ SCTP_INP_RLOCK(inp); \
+ stcb = LIST_FIRST(&inp->sctp_asoc_list); \
+ if (stcb) { \
+ SCTP_TCB_LOCK(stcb); \
+ } \
+ SCTP_INP_RUNLOCK(inp); \
+ } else if (assoc_id != 0) { \
+ stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1); \
+ if (stcb == NULL) { \
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); \
+ error = ENOENT; \
+ break; \
+ } \
+ } else { \
+ stcb = NULL; \
+ } \
+ }
+
+
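+/*
+ * Check that the supplied option buffer is large enough for the expected
+ * structure and cast it; otherwise flag EINVAL and break out of the
+ * option switch.
+ */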
+#define SCTP_CHECK_AND_CAST(destp, srcp, type, size) {\
+ if (size < sizeof(type)) { \
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); \
+ error = EINVAL; \
+ break; \
+ } else { \
+ destp = (type *)srcp; \
+ } \
+ }
+
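+/*
+ * getsockopt() back end: optval/optsize describe a caller-supplied buffer
+ * that each case below fills in, updating *optsize to the number of
+ * bytes written.
+ */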
+static int
+sctp_getopt(struct socket *so, int optname, void *optval, size_t *optsize,
+ void *p)
+{
+ struct sctp_inpcb *inp = NULL;
+ int error, val = 0;
+ struct sctp_tcb *stcb = NULL;
+
+ if (optval == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return EINVAL;
+ }
+ error = 0;
+
+ switch (optname) {
+ case SCTP_NODELAY:
+ case SCTP_AUTOCLOSE:
+ case SCTP_EXPLICIT_EOR:
+ case SCTP_AUTO_ASCONF:
+ case SCTP_DISABLE_FRAGMENTS:
+ case SCTP_I_WANT_MAPPED_V4_ADDR:
+ case SCTP_USE_EXT_RCVINFO:
+ SCTP_INP_RLOCK(inp);
+ switch (optname) {
+ case SCTP_DISABLE_FRAGMENTS:
+ val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NO_FRAGMENT);
+ break;
+ case SCTP_I_WANT_MAPPED_V4_ADDR:
+ val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4);
+ break;
+ case SCTP_AUTO_ASCONF:
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ /* only valid for bound all sockets */
+ val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTO_ASCONF);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ goto flags_out;
+ }
+ break;
+ case SCTP_EXPLICIT_EOR:
+ val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR);
+ break;
+ case SCTP_NODELAY:
+ val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NODELAY);
+ break;
+ case SCTP_USE_EXT_RCVINFO:
+ val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO);
+ break;
+ case SCTP_AUTOCLOSE:
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE))
+ val = TICKS_TO_SEC(inp->sctp_ep.auto_close_time);
+ else
+ val = 0;
+ break;
+
+ default:
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT);
+ error = ENOPROTOOPT;
+ } /* end switch (sopt->sopt_name) */
+ if (optname != SCTP_AUTOCLOSE) {
+ /* make it an "on/off" value */
+ val = (val != 0);
+ }
+ if (*optsize < sizeof(val)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+flags_out:
+ SCTP_INP_RUNLOCK(inp);
+ if (error == 0) {
+ /* return the option value */
+ *(int *)optval = val;
+ *optsize = sizeof(val);
+ }
+ break;
+ case SCTP_GET_PACKET_LOG:
+ {
+#ifdef SCTP_PACKET_LOGGING
+ uint8_t *target;
+ int ret;
+
+ SCTP_CHECK_AND_CAST(target, optval, uint8_t, *optsize);
+ ret = sctp_copy_out_packet_log(target, (int)*optsize);
+ *optsize = ret;
+#else
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
+ error = EOPNOTSUPP;
+#endif
+ break;
+ }
+ case SCTP_REUSE_PORT:
+ {
+ uint32_t *value;
+
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) {
+ /* Can't do this for a 1-m socket */
+ error = EINVAL;
+ break;
+ }
+ SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
+ *value = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE);
+ *optsize = sizeof(uint32_t);
+ }
+ break;
+ case SCTP_PARTIAL_DELIVERY_POINT:
+ {
+ uint32_t *value;
+
+ SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
+ *value = inp->partial_delivery_point;
+ *optsize = sizeof(uint32_t);
+ }
+ break;
+ case SCTP_FRAGMENT_INTERLEAVE:
+ {
+ uint32_t *value;
+
+ SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) {
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS)) {
+ *value = SCTP_FRAG_LEVEL_2;
+ } else {
+ *value = SCTP_FRAG_LEVEL_1;
+ }
+ } else {
+ *value = SCTP_FRAG_LEVEL_0;
+ }
+ *optsize = sizeof(uint32_t);
+ }
+ break;
+ case SCTP_CMT_ON_OFF:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+ if (stcb) {
+ av->assoc_value = stcb->asoc.sctp_cmt_on_off;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_RLOCK(inp);
+ av->assoc_value = inp->sctp_cmt_on_off;
+ SCTP_INP_RUNLOCK(inp);
+ }
+ *optsize = sizeof(*av);
+ }
+ break;
+ /* JRS - Get socket option for pluggable congestion control */
+ case SCTP_PLUGGABLE_CC:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+ if (stcb) {
+ av->assoc_value = stcb->asoc.congestion_control_module;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ av->assoc_value = inp->sctp_ep.sctp_default_cc_module;
+ }
+ *optsize = sizeof(*av);
+ }
+ break;
+ case SCTP_GET_ADDR_LEN:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ error = EINVAL;
+#ifdef INET
+ if (av->assoc_value == AF_INET) {
+ av->assoc_value = sizeof(struct sockaddr_in);
+ error = 0;
+ }
+#endif
+#ifdef INET6
+ if (av->assoc_value == AF_INET6) {
+ av->assoc_value = sizeof(struct sockaddr_in6);
+ error = 0;
+ }
+#endif
+ if (error) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ }
+ *optsize = sizeof(*av);
+ }
+ break;
+ case SCTP_GET_ASSOC_NUMBER:
+ {
+ uint32_t *value, cnt;
+
+ SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
+ cnt = 0;
+ SCTP_INP_RLOCK(inp);
+ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
+ cnt++;
+ }
+ SCTP_INP_RUNLOCK(inp);
+ *value = cnt;
+ *optsize = sizeof(uint32_t);
+ }
+ break;
+
+ case SCTP_GET_ASSOC_ID_LIST:
+ {
+ struct sctp_assoc_ids *ids;
+ unsigned int at, limit;
+
+ SCTP_CHECK_AND_CAST(ids, optval, struct sctp_assoc_ids, *optsize);
+ at = 0;
+ limit = (*optsize - sizeof(uint32_t)) / sizeof(sctp_assoc_t);
+ SCTP_INP_RLOCK(inp);
+ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
+ if (at < limit) {
+ ids->gaids_assoc_id[at++] = sctp_get_associd(stcb);
+ } else {
+ error = EINVAL;
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
+ }
+ SCTP_INP_RUNLOCK(inp);
+ ids->gaids_number_of_ids = at;
+ *optsize = ((at * sizeof(sctp_assoc_t)) + sizeof(uint32_t));
+ }
+ break;
+ case SCTP_CONTEXT:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ av->assoc_value = stcb->asoc.context;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_RLOCK(inp);
+ av->assoc_value = inp->sctp_context;
+ SCTP_INP_RUNLOCK(inp);
+ }
+ *optsize = sizeof(*av);
+ }
+ break;
+ case SCTP_VRF_ID:
+ {
+ uint32_t *default_vrfid;
+
+ SCTP_CHECK_AND_CAST(default_vrfid, optval, uint32_t, *optsize);
+ *default_vrfid = inp->def_vrf_id;
+ break;
+ }
+ case SCTP_GET_ASOC_VRF:
+ {
+ struct sctp_assoc_value *id;
+
+ SCTP_CHECK_AND_CAST(id, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, id->assoc_id);
+ if (stcb == NULL) {
+ error = EINVAL;
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
+ id->assoc_value = stcb->asoc.vrf_id;
+ break;
+ }
+ case SCTP_GET_VRF_IDS:
+ {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
+ error = EOPNOTSUPP;
+ break;
+ }
+ case SCTP_GET_NONCE_VALUES:
+ {
+ struct sctp_get_nonce_values *gnv;
+
+ SCTP_CHECK_AND_CAST(gnv, optval, struct sctp_get_nonce_values, *optsize);
+ SCTP_FIND_STCB(inp, stcb, gnv->gn_assoc_id);
+
+ if (stcb) {
+ gnv->gn_peers_tag = stcb->asoc.peer_vtag;
+ gnv->gn_local_tag = stcb->asoc.my_vtag;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN);
+ error = ENOTCONN;
+ }
+ *optsize = sizeof(*gnv);
+ }
+ break;
+ case SCTP_DELAYED_SACK:
+ {
+ struct sctp_sack_info *sack;
+
+ SCTP_CHECK_AND_CAST(sack, optval, struct sctp_sack_info, *optsize);
+ SCTP_FIND_STCB(inp, stcb, sack->sack_assoc_id);
+ if (stcb) {
+ sack->sack_delay = stcb->asoc.delayed_ack;
+ sack->sack_freq = stcb->asoc.sack_freq;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_RLOCK(inp);
+ sack->sack_delay = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]);
+ sack->sack_freq = inp->sctp_ep.sctp_sack_freq;
+ SCTP_INP_RUNLOCK(inp);
+ }
+ *optsize = sizeof(*sack);
+ }
+ break;
+
+ case SCTP_GET_SNDBUF_USE:
+ {
+ struct sctp_sockstat *ss;
+
+ SCTP_CHECK_AND_CAST(ss, optval, struct sctp_sockstat, *optsize);
+ SCTP_FIND_STCB(inp, stcb, ss->ss_assoc_id);
+
+ if (stcb) {
+ ss->ss_total_sndbuf = stcb->asoc.total_output_queue_size;
+ ss->ss_total_recv_buf = (stcb->asoc.size_on_reasm_queue +
+ stcb->asoc.size_on_all_streams);
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN);
+ error = ENOTCONN;
+ }
+ *optsize = sizeof(struct sctp_sockstat);
+ }
+ break;
+ case SCTP_MAX_BURST:
+ {
+ uint8_t *value;
+
+ SCTP_CHECK_AND_CAST(value, optval, uint8_t, *optsize);
+
+ SCTP_INP_RLOCK(inp);
+ *value = inp->sctp_ep.max_burst;
+ SCTP_INP_RUNLOCK(inp);
+ *optsize = sizeof(uint8_t);
+ }
+ break;
+ case SCTP_MAXSEG:
+ {
+ struct sctp_assoc_value *av;
+ int ovh;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ av->assoc_value = sctp_get_frag_point(stcb, &stcb->asoc);
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_RLOCK(inp);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ ovh = SCTP_MED_OVERHEAD;
+ } else {
+ ovh = SCTP_MED_V4_OVERHEAD;
+ }
+ if (inp->sctp_frag_point >= SCTP_DEFAULT_MAXSEGMENT)
+ av->assoc_value = 0;
+ else
+ av->assoc_value = inp->sctp_frag_point - ovh;
+ SCTP_INP_RUNLOCK(inp);
+ }
+ *optsize = sizeof(struct sctp_assoc_value);
+ }
+ break;
+ case SCTP_GET_STAT_LOG:
+ error = sctp_fill_stat_log(optval, optsize);
+ break;
+ case SCTP_EVENTS:
+ {
+ struct sctp_event_subscribe *events;
+
+ SCTP_CHECK_AND_CAST(events, optval, struct sctp_event_subscribe, *optsize);
+ memset(events, 0, sizeof(*events));
+ SCTP_INP_RLOCK(inp);
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT))
+ events->sctp_data_io_event = 1;
+
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT))
+ events->sctp_association_event = 1;
+
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVPADDREVNT))
+ events->sctp_address_event = 1;
+
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT))
+ events->sctp_send_failure_event = 1;
+
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVPEERERR))
+ events->sctp_peer_error_event = 1;
+
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT))
+ events->sctp_shutdown_event = 1;
+
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT))
+ events->sctp_partial_delivery_event = 1;
+
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT))
+ events->sctp_adaptation_layer_event = 1;
+
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTHEVNT))
+ events->sctp_authentication_event = 1;
+
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DRYEVNT))
+ events->sctp_sender_dry_event = 1;
+
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT))
+ events->sctp_stream_reset_event = 1;
+ SCTP_INP_RUNLOCK(inp);
+ *optsize = sizeof(struct sctp_event_subscribe);
+ }
+ break;
+
+ case SCTP_ADAPTATION_LAYER:
+ {
+ uint32_t *value;
+
+ SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
+
+ SCTP_INP_RLOCK(inp);
+ *value = inp->sctp_ep.adaptation_layer_indicator;
+ SCTP_INP_RUNLOCK(inp);
+ *optsize = sizeof(uint32_t);
+ }
+ break;
+ case SCTP_SET_INITIAL_DBG_SEQ:
+ {
+ uint32_t *value;
+
+ SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
+ SCTP_INP_RLOCK(inp);
+ *value = inp->sctp_ep.initial_sequence_debug;
+ SCTP_INP_RUNLOCK(inp);
+ *optsize = sizeof(uint32_t);
+ }
+ break;
+ case SCTP_GET_LOCAL_ADDR_SIZE:
+ {
+ uint32_t *value;
+
+ SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
+ SCTP_INP_RLOCK(inp);
+ *value = sctp_count_max_addresses(inp);
+ SCTP_INP_RUNLOCK(inp);
+ *optsize = sizeof(uint32_t);
+ }
+ break;
+ case SCTP_GET_REMOTE_ADDR_SIZE:
+ {
+ uint32_t *value;
+ size_t size;
+ struct sctp_nets *net;
+
+ SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
+ /* FIXME MT: change to sctp_assoc_value? */
+ SCTP_FIND_STCB(inp, stcb, (sctp_assoc_t) * value);
+
+ if (stcb) {
+ size = 0;
+ /* Count the sizes */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) ||
+ (((struct sockaddr *)&net->ro._l_addr)->sa_family == AF_INET6)) {
+ size += sizeof(struct sockaddr_in6);
+ } else if (((struct sockaddr *)&net->ro._l_addr)->sa_family == AF_INET) {
+ size += sizeof(struct sockaddr_in);
+ } else {
+ /* huh */
+ break;
+ }
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ *value = (uint32_t) size;
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN);
+ error = ENOTCONN;
+ }
+ *optsize = sizeof(uint32_t);
+ }
+ break;
+ case SCTP_GET_PEER_ADDRESSES:
+ /*
+ * Get the address information; an array is passed in to
+ * fill up and we pack it.
+ */
+ {
+ size_t cpsz, left;
+ struct sockaddr_storage *sas;
+ struct sctp_nets *net;
+ struct sctp_getaddresses *saddr;
+
+ SCTP_CHECK_AND_CAST(saddr, optval, struct sctp_getaddresses, *optsize);
+ SCTP_FIND_STCB(inp, stcb, saddr->sget_assoc_id);
+
+ if (stcb) {
+ left = (*optsize) - sizeof(struct sctp_getaddresses);
+ *optsize = sizeof(struct sctp_getaddresses);
+ sas = (struct sockaddr_storage *)&saddr->addr[0];
+
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) ||
+ (((struct sockaddr *)&net->ro._l_addr)->sa_family == AF_INET6)) {
+ cpsz = sizeof(struct sockaddr_in6);
+ } else if (((struct sockaddr *)&net->ro._l_addr)->sa_family == AF_INET) {
+ cpsz = sizeof(struct sockaddr_in);
+ } else {
+ /* huh */
+ break;
+ }
+ if (left < cpsz) {
+ /* not enough room. */
+ break;
+ }
+#ifdef INET6
+ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) &&
+ (((struct sockaddr *)&net->ro._l_addr)->sa_family == AF_INET)) {
+ /* Must map the address */
+ in6_sin_2_v4mapsin6((struct sockaddr_in *)&net->ro._l_addr,
+ (struct sockaddr_in6 *)sas);
+ } else {
+#endif
+ memcpy(sas, &net->ro._l_addr, cpsz);
+#ifdef INET6
+ }
+#endif
+ ((struct sockaddr_in *)sas)->sin_port = stcb->rport;
+
+ sas = (struct sockaddr_storage *)((caddr_t)sas + cpsz);
+ left -= cpsz;
+ *optsize += cpsz;
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
+ error = ENOENT;
+ }
+ }
+ break;
+ case SCTP_GET_LOCAL_ADDRESSES:
+ {
+ size_t limit, actual;
+ struct sockaddr_storage *sas;
+ struct sctp_getaddresses *saddr;
+
+ SCTP_CHECK_AND_CAST(saddr, optval, struct sctp_getaddresses, *optsize);
+ SCTP_FIND_STCB(inp, stcb, saddr->sget_assoc_id);
+
+ sas = (struct sockaddr_storage *)&saddr->addr[0];
+ limit = *optsize - sizeof(sctp_assoc_t);
+ actual = sctp_fill_up_addresses(inp, stcb, limit, sas);
+ if (stcb) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ *optsize = sizeof(struct sockaddr_storage) + actual;
+ }
+ break;
+ case SCTP_PEER_ADDR_PARAMS:
+ {
+ struct sctp_paddrparams *paddrp;
+ struct sctp_nets *net;
+
+ SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, *optsize);
+ SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id);
+
+ net = NULL;
+ if (stcb) {
+ net = sctp_findnet(stcb, (struct sockaddr *)&paddrp->spp_address);
+ } else {
+ /*
+ * We increment here since
+ * sctp_findassociation_ep_addr() will do a
+ * decrement if it finds the stcb, as long as
+ * the locked tcb (last argument) is NOT a
+ * TCB, aka NULL.
+ */
+ SCTP_INP_INCR_REF(inp);
+ stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&paddrp->spp_address, &net, NULL, NULL);
+ if (stcb == NULL) {
+ SCTP_INP_DECR_REF(inp);
+ }
+ }
+ if (stcb && (net == NULL)) {
+ struct sockaddr *sa;
+
+ sa = (struct sockaddr *)&paddrp->spp_address;
+ if (sa->sa_family == AF_INET) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)sa;
+ if (sin->sin_addr.s_addr) {
+ error = EINVAL;
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
+ } else if (sa->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)sa;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+ error = EINVAL;
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
+ } else {
+ error = EAFNOSUPPORT;
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
+ }
+ if (stcb) {
+ /* Applies to the specific association */
+ paddrp->spp_flags = 0;
+ if (net) {
+ int ovh;
+
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ ovh = SCTP_MED_OVERHEAD;
+ } else {
+ ovh = SCTP_MED_V4_OVERHEAD;
+ }
+
+
+ paddrp->spp_pathmaxrxt = net->failure_threshold;
+ paddrp->spp_pathmtu = net->mtu - ovh;
+ /* get flags for HB */
+ if (net->dest_state & SCTP_ADDR_NOHB)
+ paddrp->spp_flags |= SPP_HB_DISABLE;
+ else
+ paddrp->spp_flags |= SPP_HB_ENABLE;
+ /* get flags for PMTU */
+ if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
+ paddrp->spp_flags |= SPP_PMTUD_ENABLE;
+ } else {
+ paddrp->spp_flags |= SPP_PMTUD_DISABLE;
+ }
+#ifdef INET
+ if (net->ro._l_addr.sin.sin_family == AF_INET) {
+ paddrp->spp_ipv4_tos = net->tos_flowlabel & 0x000000fc;
+ paddrp->spp_flags |= SPP_IPV4_TOS;
+ }
+#endif
+#ifdef INET6
+ if (net->ro._l_addr.sin6.sin6_family == AF_INET6) {
+ paddrp->spp_ipv6_flowlabel = net->tos_flowlabel;
+ paddrp->spp_flags |= SPP_IPV6_FLOWLABEL;
+ }
+#endif
+ } else {
+ /*
+ * No destination so return default
+ * value
+ */
+ int cnt = 0;
+
+ paddrp->spp_pathmaxrxt = stcb->asoc.def_net_failure;
+ paddrp->spp_pathmtu = sctp_get_frag_point(stcb, &stcb->asoc);
+#ifdef INET
+ paddrp->spp_ipv4_tos = stcb->asoc.default_tos & 0x000000fc;
+ paddrp->spp_flags |= SPP_IPV4_TOS;
+#endif
+#ifdef INET6
+ paddrp->spp_ipv6_flowlabel = stcb->asoc.default_flowlabel;
+ paddrp->spp_flags |= SPP_IPV6_FLOWLABEL;
+#endif
+ /* default settings should be these */
+ if (stcb->asoc.hb_is_disabled == 0) {
+ paddrp->spp_flags |= SPP_HB_ENABLE;
+ } else {
+ paddrp->spp_flags |= SPP_HB_DISABLE;
+ }
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
+ cnt++;
+ }
+ }
+ if (cnt) {
+ paddrp->spp_flags |= SPP_PMTUD_ENABLE;
+ }
+ }
+ paddrp->spp_hbinterval = stcb->asoc.heart_beat_delay;
+ paddrp->spp_assoc_id = sctp_get_associd(stcb);
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ /* Use endpoint defaults */
+ SCTP_INP_RLOCK(inp);
+ paddrp->spp_pathmaxrxt = inp->sctp_ep.def_net_failure;
+ paddrp->spp_hbinterval = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]);
+ paddrp->spp_assoc_id = (sctp_assoc_t) 0;
+ /* get inp's default */
+#ifdef INET
+ paddrp->spp_ipv4_tos = inp->ip_inp.inp.inp_ip_tos;
+ paddrp->spp_flags |= SPP_IPV4_TOS;
+#endif
+#ifdef INET6
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ paddrp->spp_ipv6_flowlabel = ((struct in6pcb *)inp)->in6p_flowinfo;
+ paddrp->spp_flags |= SPP_IPV6_FLOWLABEL;
+ }
+#endif
+ /* can't return this */
+ paddrp->spp_pathmtu = 0;
+
+ /* default behavior, no stcb */
+ paddrp->spp_flags = SPP_PMTUD_ENABLE;
+
+ if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) {
+ paddrp->spp_flags |= SPP_HB_ENABLE;
+ } else {
+ paddrp->spp_flags |= SPP_HB_DISABLE;
+ }
+ SCTP_INP_RUNLOCK(inp);
+ }
+ *optsize = sizeof(struct sctp_paddrparams);
+ }
+ break;
+ case SCTP_GET_PEER_ADDR_INFO:
+ {
+ struct sctp_paddrinfo *paddri;
+ struct sctp_nets *net;
+
+ SCTP_CHECK_AND_CAST(paddri, optval, struct sctp_paddrinfo, *optsize);
+ SCTP_FIND_STCB(inp, stcb, paddri->spinfo_assoc_id);
+
+ net = NULL;
+ if (stcb) {
+ net = sctp_findnet(stcb, (struct sockaddr *)&paddri->spinfo_address);
+ } else {
+ /*
+ * We increment here since
+ * sctp_findassociation_ep_addr() will do a
+ * decrement if it finds the stcb, as long as
+ * the locked tcb (last argument) is NOT a
+ * TCB, aka NULL.
+ */
+ SCTP_INP_INCR_REF(inp);
+ stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&paddri->spinfo_address, &net, NULL, NULL);
+ if (stcb == NULL) {
+ SCTP_INP_DECR_REF(inp);
+ }
+ }
+
+ if ((stcb) && (net)) {
+ paddri->spinfo_state = net->dest_state & (SCTP_REACHABLE_MASK | SCTP_ADDR_NOHB);
+ paddri->spinfo_cwnd = net->cwnd;
+ paddri->spinfo_srtt = ((net->lastsa >> 2) + net->lastsv) >> 1;
+ paddri->spinfo_rto = net->RTO;
+ paddri->spinfo_assoc_id = sctp_get_associd(stcb);
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if (stcb) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
+ error = ENOENT;
+ }
+ *optsize = sizeof(struct sctp_paddrinfo);
+ }
+ break;
+ case SCTP_PCB_STATUS:
+ {
+ struct sctp_pcbinfo *spcb;
+
+ SCTP_CHECK_AND_CAST(spcb, optval, struct sctp_pcbinfo, *optsize);
+ sctp_fill_pcbinfo(spcb);
+ *optsize = sizeof(struct sctp_pcbinfo);
+ }
+ break;
+
+ case SCTP_STATUS:
+ {
+ struct sctp_nets *net;
+ struct sctp_status *sstat;
+
+ SCTP_CHECK_AND_CAST(sstat, optval, struct sctp_status, *optsize);
+ SCTP_FIND_STCB(inp, stcb, sstat->sstat_assoc_id);
+
+ if (stcb == NULL) {
+ error = EINVAL;
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
+ /*
+ * I think passing the state is fine since
+ * sctp_constants.h will be available to
+ * userland.
+ */
+ sstat->sstat_state = stcb->asoc.state;
+ sstat->sstat_assoc_id = sctp_get_associd(stcb);
+ sstat->sstat_rwnd = stcb->asoc.peers_rwnd;
+ sstat->sstat_unackdata = stcb->asoc.sent_queue_cnt;
+ /*
+ * We can't include chunks that have been passed to
+ * the socket layer. Only things in queue.
+ */
+ sstat->sstat_penddata = (stcb->asoc.cnt_on_reasm_queue +
+ stcb->asoc.cnt_on_all_streams);
+
+
+ sstat->sstat_instrms = stcb->asoc.streamincnt;
+ sstat->sstat_outstrms = stcb->asoc.streamoutcnt;
+ sstat->sstat_fragmentation_point = sctp_get_frag_point(stcb, &stcb->asoc);
+ memcpy(&sstat->sstat_primary.spinfo_address,
+ &stcb->asoc.primary_destination->ro._l_addr,
+ ((struct sockaddr *)(&stcb->asoc.primary_destination->ro._l_addr))->sa_len);
+ net = stcb->asoc.primary_destination;
+ ((struct sockaddr_in *)&sstat->sstat_primary.spinfo_address)->sin_port = stcb->rport;
+ /*
+ * Again, the user can consult sctp_constants.h
+ * for the meaning of the network state.
+ */
+ sstat->sstat_primary.spinfo_state = net->dest_state & SCTP_REACHABLE_MASK;
+ sstat->sstat_primary.spinfo_cwnd = net->cwnd;
+ sstat->sstat_primary.spinfo_srtt = net->lastsa;
+ sstat->sstat_primary.spinfo_rto = net->RTO;
+ sstat->sstat_primary.spinfo_mtu = net->mtu;
+ sstat->sstat_primary.spinfo_assoc_id = sctp_get_associd(stcb);
+ SCTP_TCB_UNLOCK(stcb);
+ *optsize = sizeof(*sstat);
+ }
+ break;
+ case SCTP_RTOINFO:
+ {
+ struct sctp_rtoinfo *srto;
+
+ SCTP_CHECK_AND_CAST(srto, optval, struct sctp_rtoinfo, *optsize);
+ SCTP_FIND_STCB(inp, stcb, srto->srto_assoc_id);
+
+ if (stcb) {
+ srto->srto_initial = stcb->asoc.initial_rto;
+ srto->srto_max = stcb->asoc.maxrto;
+ srto->srto_min = stcb->asoc.minrto;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_RLOCK(inp);
+ srto->srto_initial = inp->sctp_ep.initial_rto;
+ srto->srto_max = inp->sctp_ep.sctp_maxrto;
+ srto->srto_min = inp->sctp_ep.sctp_minrto;
+ SCTP_INP_RUNLOCK(inp);
+ }
+ *optsize = sizeof(*srto);
+ }
+ break;
+ case SCTP_TIMEOUTS:
+ {
+ struct sctp_timeouts *stimo;
+
+ SCTP_CHECK_AND_CAST(stimo, optval, struct sctp_timeouts, *optsize);
+ SCTP_FIND_STCB(inp, stcb, stimo->stimo_assoc_id);
+
+ if (stcb) {
+ stimo->stimo_init = stcb->asoc.timoinit;
+ stimo->stimo_data = stcb->asoc.timodata;
+ stimo->stimo_sack = stcb->asoc.timosack;
+ stimo->stimo_shutdown = stcb->asoc.timoshutdown;
+ stimo->stimo_heartbeat = stcb->asoc.timoheartbeat;
+ stimo->stimo_cookie = stcb->asoc.timocookie;
+ stimo->stimo_shutdownack = stcb->asoc.timoshutdownack;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ *optsize = sizeof(*stimo);
+ }
+ break;
+ case SCTP_ASSOCINFO:
+ {
+ struct sctp_assocparams *sasoc;
+ uint32_t oldval;
+
+ SCTP_CHECK_AND_CAST(sasoc, optval, struct sctp_assocparams, *optsize);
+ SCTP_FIND_STCB(inp, stcb, sasoc->sasoc_assoc_id);
+
+ if (stcb) {
+ oldval = sasoc->sasoc_cookie_life;
+ sasoc->sasoc_cookie_life = TICKS_TO_MSEC(stcb->asoc.cookie_life);
+ sasoc->sasoc_asocmaxrxt = stcb->asoc.max_send_times;
+ sasoc->sasoc_number_peer_destinations = stcb->asoc.numnets;
+ sasoc->sasoc_peer_rwnd = stcb->asoc.peers_rwnd;
+ sasoc->sasoc_local_rwnd = stcb->asoc.my_rwnd;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_RLOCK(inp);
+ sasoc->sasoc_cookie_life = TICKS_TO_MSEC(inp->sctp_ep.def_cookie_life);
+ sasoc->sasoc_asocmaxrxt = inp->sctp_ep.max_send_times;
+ sasoc->sasoc_number_peer_destinations = 0;
+ sasoc->sasoc_peer_rwnd = 0;
+ sasoc->sasoc_local_rwnd = sbspace(&inp->sctp_socket->so_rcv);
+ SCTP_INP_RUNLOCK(inp);
+ }
+ *optsize = sizeof(*sasoc);
+ }
+ break;
+ case SCTP_DEFAULT_SEND_PARAM:
+ {
+ struct sctp_sndrcvinfo *s_info;
+
+ SCTP_CHECK_AND_CAST(s_info, optval, struct sctp_sndrcvinfo, *optsize);
+ SCTP_FIND_STCB(inp, stcb, s_info->sinfo_assoc_id);
+
+ if (stcb) {
+ memcpy(s_info, &stcb->asoc.def_send, sizeof(stcb->asoc.def_send));
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_RLOCK(inp);
+ memcpy(s_info, &inp->def_send, sizeof(inp->def_send));
+ SCTP_INP_RUNLOCK(inp);
+ }
+ *optsize = sizeof(*s_info);
+ }
+ break;
+ case SCTP_INITMSG:
+ {
+ struct sctp_initmsg *sinit;
+
+ SCTP_CHECK_AND_CAST(sinit, optval, struct sctp_initmsg, *optsize);
+ SCTP_INP_RLOCK(inp);
+ sinit->sinit_num_ostreams = inp->sctp_ep.pre_open_stream_count;
+ sinit->sinit_max_instreams = inp->sctp_ep.max_open_streams_intome;
+ sinit->sinit_max_attempts = inp->sctp_ep.max_init_times;
+ sinit->sinit_max_init_timeo = inp->sctp_ep.initial_init_rto_max;
+ SCTP_INP_RUNLOCK(inp);
+ *optsize = sizeof(*sinit);
+ }
+ break;
+ case SCTP_PRIMARY_ADDR:
+ /* we allow a "get" operation on this */
+ {
+ struct sctp_setprim *ssp;
+
+ SCTP_CHECK_AND_CAST(ssp, optval, struct sctp_setprim, *optsize);
+ SCTP_FIND_STCB(inp, stcb, ssp->ssp_assoc_id);
+
+ if (stcb) {
+ /* simply copy out the sockaddr_storage... */
+ int len;
+
+ len = *optsize;
+ if (len > stcb->asoc.primary_destination->ro._l_addr.sa.sa_len)
+ len = stcb->asoc.primary_destination->ro._l_addr.sa.sa_len;
+
+ memcpy(&ssp->ssp_addr,
+ &stcb->asoc.primary_destination->ro._l_addr,
+ len);
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ *optsize = sizeof(*ssp);
+ }
+ break;
+
+ case SCTP_HMAC_IDENT:
+ {
+ struct sctp_hmacalgo *shmac;
+ sctp_hmaclist_t *hmaclist;
+ uint32_t size;
+ int i;
+
+ SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, *optsize);
+
+ SCTP_INP_RLOCK(inp);
+ hmaclist = inp->sctp_ep.local_hmacs;
+ if (hmaclist == NULL) {
+ /* no HMACs to return */
+ *optsize = sizeof(*shmac);
+ SCTP_INP_RUNLOCK(inp);
+ break;
+ }
+ /* is there room for all of the hmac ids? */
+ size = sizeof(*shmac) + (hmaclist->num_algo *
+ sizeof(shmac->shmac_idents[0]));
+ if ((size_t)(*optsize) < size) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ SCTP_INP_RUNLOCK(inp);
+ break;
+ }
+ /* copy in the list */
+ shmac->shmac_number_of_idents = hmaclist->num_algo;
+ for (i = 0; i < hmaclist->num_algo; i++) {
+ shmac->shmac_idents[i] = hmaclist->hmac[i];
+ }
+ SCTP_INP_RUNLOCK(inp);
+ *optsize = size;
+ break;
+ }
+ case SCTP_AUTH_ACTIVE_KEY:
+ {
+ struct sctp_authkeyid *scact;
+
+ SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, *optsize);
+ SCTP_FIND_STCB(inp, stcb, scact->scact_assoc_id);
+
+ if (stcb) {
+ /* get the active key on the assoc */
+ scact->scact_keynumber = stcb->asoc.authinfo.active_keyid;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ /* get the endpoint active key */
+ SCTP_INP_RLOCK(inp);
+ scact->scact_keynumber = inp->sctp_ep.default_keyid;
+ SCTP_INP_RUNLOCK(inp);
+ }
+ *optsize = sizeof(*scact);
+ break;
+ }
+ case SCTP_LOCAL_AUTH_CHUNKS:
+ {
+ struct sctp_authchunks *sac;
+ sctp_auth_chklist_t *chklist = NULL;
+ size_t size = 0;
+
+ SCTP_CHECK_AND_CAST(sac, optval, struct sctp_authchunks, *optsize);
+ SCTP_FIND_STCB(inp, stcb, sac->gauth_assoc_id);
+
+ if (stcb) {
+ /* get off the assoc */
+ chklist = stcb->asoc.local_auth_chunks;
+ /* is there enough space? */
+ size = sctp_auth_get_chklist_size(chklist);
+ if (*optsize < (sizeof(struct sctp_authchunks) + size)) {
+ error = EINVAL;
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ } else {
+ /* copy in the chunks */
+ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks);
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ /* get off the endpoint */
+ SCTP_INP_RLOCK(inp);
+ chklist = inp->sctp_ep.local_auth_chunks;
+ /* is there enough space? */
+ size = sctp_auth_get_chklist_size(chklist);
+ if (*optsize < (sizeof(struct sctp_authchunks) + size)) {
+ error = EINVAL;
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ } else {
+ /* copy in the chunks */
+ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks);
+ }
+ SCTP_INP_RUNLOCK(inp);
+ }
+ *optsize = sizeof(struct sctp_authchunks) + size;
+ break;
+ }
+ case SCTP_PEER_AUTH_CHUNKS:
+ {
+ struct sctp_authchunks *sac;
+ sctp_auth_chklist_t *chklist = NULL;
+ size_t size = 0;
+
+ SCTP_CHECK_AND_CAST(sac, optval, struct sctp_authchunks, *optsize);
+ SCTP_FIND_STCB(inp, stcb, sac->gauth_assoc_id);
+
+ if (stcb) {
+ /* get off the assoc */
+ chklist = stcb->asoc.peer_auth_chunks;
+ /* is there enough space? */
+ size = sctp_auth_get_chklist_size(chklist);
+ if (*optsize < (sizeof(struct sctp_authchunks) + size)) {
+ error = EINVAL;
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ } else {
+ /* copy in the chunks */
+ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks);
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
+ error = ENOENT;
+ }
+ *optsize = sizeof(struct sctp_authchunks) + size;
+ break;
+ }
+
+
+ default:
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT);
+ error = ENOPROTOOPT;
+ *optsize = 0;
+ break;
+ } /* end switch (sopt->sopt_name) */
+ return (error);
+}
+
+static int
+sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
+ void *p)
+{
+ int error, set_opt;
+ uint32_t *mopt;
+ struct sctp_tcb *stcb = NULL;
+ struct sctp_inpcb *inp = NULL;
+ uint32_t vrf_id;
+
+ if (optval == NULL) {
+ SCTP_PRINTF("optval is NULL\n");
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == NULL) {
+ SCTP_PRINTF("inp is NULL?\n");
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ vrf_id = inp->def_vrf_id;
+
+ error = 0;
+ switch (optname) {
+ case SCTP_NODELAY:
+ case SCTP_AUTOCLOSE:
+ case SCTP_AUTO_ASCONF:
+ case SCTP_EXPLICIT_EOR:
+ case SCTP_DISABLE_FRAGMENTS:
+ case SCTP_USE_EXT_RCVINFO:
+ case SCTP_I_WANT_MAPPED_V4_ADDR:
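+ /*
+ * These options are simple on/off switches backed by
+ * PCB feature flags: pick the matching flag below and
+ * then turn it on or off according to the value given.
+ */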
+ /* copy in the option value */
+ SCTP_CHECK_AND_CAST(mopt, optval, uint32_t, optsize);
+ set_opt = 0;
+ if (error)
+ break;
+ switch (optname) {
+ case SCTP_DISABLE_FRAGMENTS:
+ set_opt = SCTP_PCB_FLAGS_NO_FRAGMENT;
+ break;
+ case SCTP_AUTO_ASCONF:
+ /*
+ * NOTE: we don't really support this flag
+ */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ /* only valid for bound all sockets */
+ set_opt = SCTP_PCB_FLAGS_AUTO_ASCONF;
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ break;
+ case SCTP_EXPLICIT_EOR:
+ set_opt = SCTP_PCB_FLAGS_EXPLICIT_EOR;
+ break;
+ case SCTP_USE_EXT_RCVINFO:
+ set_opt = SCTP_PCB_FLAGS_EXT_RCVINFO;
+ break;
+ case SCTP_I_WANT_MAPPED_V4_ADDR:
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ set_opt = SCTP_PCB_FLAGS_NEEDS_MAPPED_V4;
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ break;
+ case SCTP_NODELAY:
+ set_opt = SCTP_PCB_FLAGS_NODELAY;
+ break;
+ case SCTP_AUTOCLOSE:
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ set_opt = SCTP_PCB_FLAGS_AUTOCLOSE;
+ /*
+ * The value is given in seconds and converted to
+ * ticks here.  Note this does not affect existing
+ * associations, only new ones.
+ */
+ inp->sctp_ep.auto_close_time = SEC_TO_TICKS(*mopt);
+ break;
+ }
+ SCTP_INP_WLOCK(inp);
+ if (*mopt != 0) {
+ sctp_feature_on(inp, set_opt);
+ } else {
+ sctp_feature_off(inp, set_opt);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ break;
+ case SCTP_REUSE_PORT:
+ {
+ SCTP_CHECK_AND_CAST(mopt, optval, uint32_t, optsize);
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) {
+ /* Can't set it after we are bound */
+ error = EINVAL;
+ break;
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) {
+ /* Can't do this for a 1-m socket */
+ error = EINVAL;
+ break;
+ }
+ if (*mopt != 0)
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE);
+ else
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE);
+ }
+ break;
+ case SCTP_PARTIAL_DELIVERY_POINT:
+ {
+ uint32_t *value;
+
+ SCTP_CHECK_AND_CAST(value, optval, uint32_t, optsize);
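+ /*
+ * The partial delivery point may not exceed the
+ * receive socket buffer limit.
+ */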
+ if (*value > SCTP_SB_LIMIT_RCV(so)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ break;
+ }
+ inp->partial_delivery_point = *value;
+ }
+ break;
+ case SCTP_FRAGMENT_INTERLEAVE:
+ /* not yet until we re-write sctp_recvmsg() */
+ {
+ uint32_t *level;
+
+ SCTP_CHECK_AND_CAST(level, optval, uint32_t, optsize);
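+ /*
+ * Map the requested level onto the two feature flags:
+ * level 2 enables both fragment and stream
+ * interleaving, level 1 only fragment interleaving,
+ * and level 0 disables both.
+ */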
+ if (*level == SCTP_FRAG_LEVEL_2) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
+ } else if (*level == SCTP_FRAG_LEVEL_1) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
+ } else if (*level == SCTP_FRAG_LEVEL_0) {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
+
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ break;
+ case SCTP_CMT_ON_OFF:
+ if (SCTP_BASE_SYSCTL(sctp_cmt_on_off)) {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+ if (stcb) {
+ if (av->assoc_value != 0)
+ stcb->asoc.sctp_cmt_on_off = 1;
+ else
+ stcb->asoc.sctp_cmt_on_off = 0;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_WLOCK(inp);
+ if (av->assoc_value != 0)
+ inp->sctp_cmt_on_off = 1;
+ else
+ inp->sctp_cmt_on_off = 0;
+ SCTP_INP_WUNLOCK(inp);
+ }
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT);
+ error = ENOPROTOOPT;
+ }
+ break;
+ /* JRS - Set socket option for pluggable congestion control */
+ case SCTP_PLUGGABLE_CC:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+ if (stcb) {
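+ /*
+ * With an association, install the selected
+ * module's callbacks into its cc_functions
+ * vector; without one, just record the default
+ * module on the endpoint further below.
+ */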
+ switch (av->assoc_value) {
+ /*
+ * JRS - Standard TCP congestion
+ * control
+ */
+ case SCTP_CC_RFC2581:
+ {
+ stcb->asoc.congestion_control_module = SCTP_CC_RFC2581;
+ stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_set_initial_cc_param;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_cwnd_update_after_sack;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_cwnd_update_after_fr;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_cwnd_update_after_timeout;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_cwnd_update_after_ecn_echo;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_cwnd_update_after_fr_timer;
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
+ /*
+ * JRS - High Speed TCP congestion
+ * control (Floyd)
+ */
+ case SCTP_CC_HSTCP:
+ {
+ stcb->asoc.congestion_control_module = SCTP_CC_HSTCP;
+ stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_set_initial_cc_param;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_hs_cwnd_update_after_sack;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_hs_cwnd_update_after_fr;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_cwnd_update_after_timeout;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_cwnd_update_after_ecn_echo;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_cwnd_update_after_fr_timer;
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
+ /* JRS - HTCP congestion control */
+ case SCTP_CC_HTCP:
+ {
+ stcb->asoc.congestion_control_module = SCTP_CC_HTCP;
+ stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_htcp_set_initial_cc_param;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_htcp_cwnd_update_after_sack;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_htcp_cwnd_update_after_fr;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_htcp_cwnd_update_after_timeout;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_htcp_cwnd_update_after_ecn_echo;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_htcp_cwnd_update_after_fr_timer;
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
+ /*
+ * JRS - All other values are
+ * invalid
+ */
+ default:
+ {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
+ }
+ } else {
+ switch (av->assoc_value) {
+ case SCTP_CC_RFC2581:
+ case SCTP_CC_HSTCP:
+ case SCTP_CC_HTCP:
+ inp->sctp_ep.sctp_default_cc_module = av->assoc_value;
+ break;
+ default:
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ break;
+ }
+ }
+ }
+ break;
+ case SCTP_CLR_STAT_LOG:
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
+ error = EOPNOTSUPP;
+ break;
+ case SCTP_CONTEXT:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ stcb->asoc.context = av->assoc_value;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_WLOCK(inp);
+ inp->sctp_context = av->assoc_value;
+ SCTP_INP_WUNLOCK(inp);
+ }
+ }
+ break;
+ case SCTP_VRF_ID:
+ {
+ uint32_t *default_vrfid;
+
+ SCTP_CHECK_AND_CAST(default_vrfid, optval, uint32_t, optsize);
+ if (*default_vrfid > SCTP_MAX_VRF_ID) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ break;
+ }
+ inp->def_vrf_id = *default_vrfid;
+ break;
+ }
+ case SCTP_DEL_VRF_ID:
+ {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
+ error = EOPNOTSUPP;
+ break;
+ }
+ case SCTP_ADD_VRF_ID:
+ {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
+ error = EOPNOTSUPP;
+ break;
+ }
+ case SCTP_DELAYED_SACK:
+ {
+ struct sctp_sack_info *sack;
+
+ SCTP_CHECK_AND_CAST(sack, optval, struct sctp_sack_info, optsize);
+ SCTP_FIND_STCB(inp, stcb, sack->sack_assoc_id);
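+ /* Clamp the requested SACK delay to the allowed maximum. */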
+ if (sack->sack_delay) {
+ if (sack->sack_delay > SCTP_MAX_SACK_DELAY)
+ sack->sack_delay = SCTP_MAX_SACK_DELAY;
+ }
+ if (stcb) {
+ if (sack->sack_delay) {
+ if (MSEC_TO_TICKS(sack->sack_delay) < 1) {
+ sack->sack_delay = TICKS_TO_MSEC(1);
+ }
+ stcb->asoc.delayed_ack = sack->sack_delay;
+ }
+ if (sack->sack_freq) {
+ stcb->asoc.sack_freq = sack->sack_freq;
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_WLOCK(inp);
+ if (sack->sack_delay) {
+ if (MSEC_TO_TICKS(sack->sack_delay) < 1) {
+ sack->sack_delay = TICKS_TO_MSEC(1);
+ }
+ inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV] = MSEC_TO_TICKS(sack->sack_delay);
+ }
+ if (sack->sack_freq) {
+ inp->sctp_ep.sctp_sack_freq = sack->sack_freq;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ break;
+ }
+ case SCTP_AUTH_CHUNK:
+ {
+ struct sctp_authchunk *sauth;
+
+ SCTP_CHECK_AND_CAST(sauth, optval, struct sctp_authchunk, optsize);
+
+ SCTP_INP_WLOCK(inp);
+ if (sctp_auth_add_chunk(sauth->sauth_chunk, inp->sctp_ep.local_auth_chunks)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ break;
+ }
+ case SCTP_AUTH_KEY:
+ {
+ struct sctp_authkey *sca;
+ struct sctp_keyhead *shared_keys;
+ sctp_sharedkey_t *shared_key;
+ sctp_key_t *key = NULL;
+ size_t size;
+
+ SCTP_CHECK_AND_CAST(sca, optval, struct sctp_authkey, optsize);
+ SCTP_FIND_STCB(inp, stcb, sca->sca_assoc_id);
+ size = optsize - sizeof(*sca);
+
+ if (stcb) {
+ /* set it on the assoc */
+ shared_keys = &stcb->asoc.shared_keys;
+ /* clear the cached keys for this key id */
+ sctp_clear_cachedkeys(stcb, sca->sca_keynumber);
+ /*
+ * create the new shared key and
+ * insert/replace it
+ */
+ if (size > 0) {
+ key = sctp_set_key(sca->sca_key, (uint32_t) size);
+ if (key == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
+ error = ENOMEM;
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
+ }
+ shared_key = sctp_alloc_sharedkey();
+ if (shared_key == NULL) {
+ sctp_free_key(key);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
+ error = ENOMEM;
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
+ shared_key->key = key;
+ shared_key->keyid = sca->sca_keynumber;
+ error = sctp_insert_sharedkey(shared_keys, shared_key);
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ /* set it on the endpoint */
+ SCTP_INP_WLOCK(inp);
+ shared_keys = &inp->sctp_ep.shared_keys;
+ /*
+ * clear the cached keys on all assocs for
+ * this key id
+ */
+ sctp_clear_cachedkeys_ep(inp, sca->sca_keynumber);
+ /*
+ * create the new shared key and
+ * insert/replace it
+ */
+ if (size > 0) {
+ key = sctp_set_key(sca->sca_key, (uint32_t) size);
+ if (key == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
+ error = ENOMEM;
+ SCTP_INP_WUNLOCK(inp);
+ break;
+ }
+ }
+ shared_key = sctp_alloc_sharedkey();
+ if (shared_key == NULL) {
+ sctp_free_key(key);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
+ error = ENOMEM;
+ SCTP_INP_WUNLOCK(inp);
+ break;
+ }
+ shared_key->key = key;
+ shared_key->keyid = sca->sca_keynumber;
+ error = sctp_insert_sharedkey(shared_keys, shared_key);
+ SCTP_INP_WUNLOCK(inp);
+ }
+ break;
+ }
+ case SCTP_HMAC_IDENT:
+ {
+ struct sctp_hmacalgo *shmac;
+ sctp_hmaclist_t *hmaclist;
+ uint16_t hmacid;
+ uint32_t i;
+
+ size_t found;
+
+ SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, optsize);
+ if (optsize < sizeof(struct sctp_hmacalgo) + shmac->shmac_number_of_idents * sizeof(uint16_t)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ break;
+ }
+ hmaclist = sctp_alloc_hmaclist(shmac->shmac_number_of_idents);
+ if (hmaclist == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
+ error = ENOMEM;
+ break;
+ }
+ for (i = 0; i < shmac->shmac_number_of_idents; i++) {
+ hmacid = shmac->shmac_idents[i];
+ if (sctp_auth_add_hmacid(hmaclist, hmacid)) {
+ /* invalid HMACs were found */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ sctp_free_hmaclist(hmaclist);
+ goto sctp_set_hmac_done;
+ }
+ }
+ found = 0;
+ for (i = 0; i < hmaclist->num_algo; i++) {
+ if (hmaclist->hmac[i] == SCTP_AUTH_HMAC_ID_SHA1) {
+ /* already in list */
+ found = 1;
+ }
+ }
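+ /*
+ * SHA-1 is the mandatory-to-support HMAC, so reject
+ * any list that leaves it out.
+ */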
+ if (!found) {
+ sctp_free_hmaclist(hmaclist);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ break;
+ }
+ /* set it on the endpoint */
+ SCTP_INP_WLOCK(inp);
+ if (inp->sctp_ep.local_hmacs)
+ sctp_free_hmaclist(inp->sctp_ep.local_hmacs);
+ inp->sctp_ep.local_hmacs = hmaclist;
+ SCTP_INP_WUNLOCK(inp);
+ sctp_set_hmac_done:
+ break;
+ }
+ case SCTP_AUTH_ACTIVE_KEY:
+ {
+ struct sctp_authkeyid *scact;
+
+ SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid,
+ optsize);
+ SCTP_FIND_STCB(inp, stcb, scact->scact_assoc_id);
+
+ /* set the active key on the right place */
+ if (stcb) {
+ /* set the active key on the assoc */
+ if (sctp_auth_setactivekey(stcb,
+ scact->scact_keynumber)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL,
+ SCTP_FROM_SCTP_USRREQ,
+ EINVAL);
+ error = EINVAL;
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ /* set the active key on the endpoint */
+ SCTP_INP_WLOCK(inp);
+ if (sctp_auth_setactivekey_ep(inp,
+ scact->scact_keynumber)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL,
+ SCTP_FROM_SCTP_USRREQ,
+ EINVAL);
+ error = EINVAL;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ break;
+ }
+ case SCTP_AUTH_DELETE_KEY:
+ {
+ struct sctp_authkeyid *scdel;
+
+ SCTP_CHECK_AND_CAST(scdel, optval, struct sctp_authkeyid,
+ optsize);
+ SCTP_FIND_STCB(inp, stcb, scdel->scact_assoc_id);
+
+ /* delete the key from the right place */
+ if (stcb) {
+ if (sctp_delete_sharedkey(stcb,
+ scdel->scact_keynumber)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL,
+ SCTP_FROM_SCTP_USRREQ,
+ EINVAL);
+ error = EINVAL;
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_WLOCK(inp);
+ if (sctp_delete_sharedkey_ep(inp,
+ scdel->scact_keynumber)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL,
+ SCTP_FROM_SCTP_USRREQ,
+ EINVAL);
+ error = EINVAL;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ break;
+ }
+ case SCTP_AUTH_DEACTIVATE_KEY:
+ {
+ struct sctp_authkeyid *keyid;
+
+ SCTP_CHECK_AND_CAST(keyid, optval, struct sctp_authkeyid,
+ optsize);
+ SCTP_FIND_STCB(inp, stcb, keyid->scact_assoc_id);
+
+ /* deactivate the key from the right place */
+ if (stcb) {
+ if (sctp_deact_sharedkey(stcb,
+ keyid->scact_keynumber)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL,
+ SCTP_FROM_SCTP_USRREQ,
+ EINVAL);
+ error = EINVAL;
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_WLOCK(inp);
+ if (sctp_deact_sharedkey_ep(inp,
+ keyid->scact_keynumber)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL,
+ SCTP_FROM_SCTP_USRREQ,
+ EINVAL);
+ error = EINVAL;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ break;
+ }
+
+ case SCTP_RESET_STREAMS:
+ {
+ struct sctp_stream_reset *strrst;
+ uint8_t send_in = 0, send_tsn = 0, send_out = 0,
+ addstream = 0;
+ uint16_t addstrmcnt = 0;
+ int i;
+
+ SCTP_CHECK_AND_CAST(strrst, optval, struct sctp_stream_reset, optsize);
+ SCTP_FIND_STCB(inp, stcb, strrst->strrst_assoc_id);
+
+ if (stcb == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
+ error = ENOENT;
+ break;
+ }
+ if (stcb->asoc.peer_supports_strreset == 0) {
+ /*
+ * The peer does not support stream resets, so
+ * return "protocol not supported": the limitation
+ * lies with this feature and this peer, not with
+ * the socket request in general.
+ */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EPROTONOSUPPORT);
+ error = EPROTONOSUPPORT;
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
+ if (stcb->asoc.stream_reset_outstanding) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
+ error = EALREADY;
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
+ if (strrst->strrst_flags == SCTP_RESET_LOCAL_RECV) {
+ send_in = 1;
+ } else if (strrst->strrst_flags == SCTP_RESET_LOCAL_SEND) {
+ send_out = 1;
+ } else if (strrst->strrst_flags == SCTP_RESET_BOTH) {
+ send_in = 1;
+ send_out = 1;
+ } else if (strrst->strrst_flags == SCTP_RESET_TSN) {
+ send_tsn = 1;
+ } else if (strrst->strrst_flags == SCTP_RESET_ADD_STREAMS) {
+ if (send_tsn ||
+ send_in ||
+ send_out) {
+ /* We can't do that and add streams */
+ error = EINVAL;
+ goto skip_stuff;
+ }
+ if (stcb->asoc.stream_reset_outstanding) {
+ error = EBUSY;
+ goto skip_stuff;
+ }
+ addstream = 1;
+ /* We allocate here */
+ addstrmcnt = strrst->strrst_num_streams;
+ if ((int)(addstrmcnt + stcb->asoc.streamoutcnt) > 0xffff) {
+ /* You can't have more than 64k streams */
+ error = EINVAL;
+ goto skip_stuff;
+ }
+ if ((stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt) < addstrmcnt) {
+ /* Need to allocate more */
+ struct sctp_stream_out *oldstream;
+ struct sctp_stream_queue_pending *sp;
+ int removed;
+
+ oldstream = stcb->asoc.strmout;
+ /* get some more */
+ SCTP_MALLOC(stcb->asoc.strmout, struct sctp_stream_out *,
+ ((stcb->asoc.streamoutcnt + addstrmcnt) * sizeof(struct sctp_stream_out)),
+ SCTP_M_STRMO);
+ if (stcb->asoc.strmout == NULL) {
+ stcb->asoc.strmout = oldstream;
+ error = ENOMEM;
+ goto skip_stuff;
+ }
+ /*
+ * Now copy over the existing outbound
+ * streams and initialize the newly
+ * added ones.
+ */
+ SCTP_TCB_SEND_LOCK(stcb);
+ for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+ TAILQ_INIT(&stcb->asoc.strmout[i].outqueue);
+ stcb->asoc.strmout[i].next_sequence_sent = oldstream[i].next_sequence_sent;
+ stcb->asoc.strmout[i].last_msg_incomplete = oldstream[i].last_msg_incomplete;
+ stcb->asoc.strmout[i].stream_no = i;
+ if (oldstream[i].next_spoke.tqe_next) {
+ sctp_remove_from_wheel(stcb, &stcb->asoc, &oldstream[i], 1);
+ stcb->asoc.strmout[i].next_spoke.tqe_next = NULL;
+ stcb->asoc.strmout[i].next_spoke.tqe_prev = NULL;
+ removed = 1;
+ } else {
+ /* not on out wheel */
+ stcb->asoc.strmout[i].next_spoke.tqe_next = NULL;
+ stcb->asoc.strmout[i].next_spoke.tqe_prev = NULL;
+ removed = 0;
+ }
+ /*
+ * now anything on those
+ * queues?
+ */
+ while (TAILQ_EMPTY(&oldstream[i].outqueue) == 0) {
+ sp = TAILQ_FIRST(&oldstream[i].outqueue);
+ TAILQ_REMOVE(&oldstream[i].outqueue, sp, next);
+ TAILQ_INSERT_TAIL(&stcb->asoc.strmout[i].outqueue, sp, next);
+ }
+ /* Did we disrupt the wheel? */
+ if (removed) {
+ sctp_insert_on_wheel(stcb,
+ &stcb->asoc,
+ &stcb->asoc.strmout[i],
+ 1);
+ }
+ /*
+ * Now move assoc pointers
+ * too
+ */
+ if (stcb->asoc.last_out_stream == &oldstream[i]) {
+ stcb->asoc.last_out_stream = &stcb->asoc.strmout[i];
+ }
+ if (stcb->asoc.locked_on_sending == &oldstream[i]) {
+ stcb->asoc.locked_on_sending = &stcb->asoc.strmout[i];
+ }
+ }
+ /* now the new streams */
+ for (i = stcb->asoc.streamoutcnt; i < (stcb->asoc.streamoutcnt + addstrmcnt); i++) {
+ stcb->asoc.strmout[i].next_sequence_sent = 0x0;
+ TAILQ_INIT(&stcb->asoc.strmout[i].outqueue);
+ stcb->asoc.strmout[i].stream_no = i;
+ stcb->asoc.strmout[i].last_msg_incomplete = 0;
+ stcb->asoc.strmout[i].next_spoke.tqe_next = NULL;
+ stcb->asoc.strmout[i].next_spoke.tqe_prev = NULL;
+ }
+ stcb->asoc.strm_realoutsize = stcb->asoc.streamoutcnt + addstrmcnt;
+ SCTP_FREE(oldstream, SCTP_M_STRMO);
+ }
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ goto skip_stuff;
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
+ for (i = 0; i < strrst->strrst_num_streams; i++) {
+ if ((send_in) &&
+ (strrst->strrst_list[i] > stcb->asoc.streamincnt)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ goto get_out;
+ }
+ if ((send_out) &&
+ (strrst->strrst_list[i] > stcb->asoc.streamoutcnt)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ goto get_out;
+ }
+ }
+ skip_stuff:
+ if (error) {
+ get_out:
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
+ error = sctp_send_str_reset_req(stcb, strrst->strrst_num_streams,
+ strrst->strrst_list,
+ send_out, (stcb->asoc.str_reset_seq_in - 3),
+ send_in, send_tsn, addstream, addstrmcnt);
+
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED);
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ break;
+
+ case SCTP_CONNECT_X:
+ if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ break;
+ }
+ error = sctp_do_connect_x(so, inp, optval, optsize, p, 0);
+ break;
+
+ case SCTP_CONNECT_X_DELAYED:
+ if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ break;
+ }
+ error = sctp_do_connect_x(so, inp, optval, optsize, p, 1);
+ break;
+
+ case SCTP_CONNECT_X_COMPLETE:
+ {
+ struct sockaddr *sa;
+ struct sctp_nets *net;
+
+ /* FIXME MT: check correct? */
+ SCTP_CHECK_AND_CAST(sa, optval, struct sockaddr, optsize);
+
+ /* find tcb */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
+ SCTP_INP_RLOCK(inp);
+ stcb = LIST_FIRST(&inp->sctp_asoc_list);
+ if (stcb) {
+ SCTP_TCB_LOCK(stcb);
+ net = sctp_findnet(stcb, sa);
+ }
+ SCTP_INP_RUNLOCK(inp);
+ } else {
+ /*
+ * We increment here since
+ * sctp_findassociation_ep_addr() will do a
+ * decrement if it finds the stcb as long as
+ * the locked tcb (last argument) is NOT a
+ * TCB, i.e. is NULL.
+ */
+ SCTP_INP_INCR_REF(inp);
+ stcb = sctp_findassociation_ep_addr(&inp, sa, &net, NULL, NULL);
+ if (stcb == NULL) {
+ SCTP_INP_DECR_REF(inp);
+ }
+ }
+
+ if (stcb == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
+ error = ENOENT;
+ break;
+ }
+ if (stcb->asoc.delayed_connection == 1) {
+ stcb->asoc.delayed_connection = 0;
+ (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
+ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb,
+ stcb->asoc.primary_destination,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_9);
+ sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
+ } else {
+ /*
+ * already expired or did not use delayed
+ * connectx
+ */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
+ error = EALREADY;
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ break;
+ case SCTP_MAX_BURST:
+ {
+ uint8_t *burst;
+
+ SCTP_CHECK_AND_CAST(burst, optval, uint8_t, optsize);
+
+ SCTP_INP_WLOCK(inp);
+ if (*burst) {
+ inp->sctp_ep.max_burst = *burst;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ break;
+ case SCTP_MAXSEG:
+ {
+ struct sctp_assoc_value *av;
+ int ovh;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ ovh = SCTP_MED_OVERHEAD;
+ } else {
+ ovh = SCTP_MED_V4_OVERHEAD;
+ }
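+ /*
+ * Add the per-address-family overhead to the supplied
+ * value before storing it as the fragmentation point.
+ */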
+ if (stcb) {
+ if (av->assoc_value) {
+ stcb->asoc.sctp_frag_point = (av->assoc_value + ovh);
+ } else {
+ stcb->asoc.sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT;
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_WLOCK(inp);
+ /*
+ * FIXME MT: I think this is not in tune
+ * with the API ID
+ */
+ if (av->assoc_value) {
+ inp->sctp_frag_point = (av->assoc_value + ovh);
+ } else {
+ inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ }
+ break;
+ case SCTP_EVENTS:
+ {
+ struct sctp_event_subscribe *events;
+
+ SCTP_CHECK_AND_CAST(events, optval, struct sctp_event_subscribe, optsize);
+
+ SCTP_INP_WLOCK(inp);
+ if (events->sctp_data_io_event) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT);
+ } else {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT);
+ }
+
+ if (events->sctp_association_event) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT);
+ } else {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT);
+ }
+
+ if (events->sctp_address_event) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVPADDREVNT);
+ } else {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVPADDREVNT);
+ }
+
+ if (events->sctp_send_failure_event) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT);
+ } else {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT);
+ }
+
+ if (events->sctp_peer_error_event) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVPEERERR);
+ } else {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVPEERERR);
+ }
+
+ if (events->sctp_shutdown_event) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT);
+ } else {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT);
+ }
+
+ if (events->sctp_partial_delivery_event) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT);
+ } else {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_PDAPIEVNT);
+ }
+
+ if (events->sctp_adaptation_layer_event) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT);
+ } else {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT);
+ }
+
+ if (events->sctp_authentication_event) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_AUTHEVNT);
+ } else {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTHEVNT);
+ }
+
+ if (events->sctp_sender_dry_event) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_DRYEVNT);
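+ /*
+ * For 1-to-1 style sockets that already have
+ * nothing queued, deliver the sender-dry
+ * notification right away.
+ */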
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ stcb = LIST_FIRST(&inp->sctp_asoc_list);
+ if (stcb) {
+ SCTP_TCB_LOCK(stcb);
+ }
+ if (stcb &&
+ TAILQ_EMPTY(&stcb->asoc.send_queue) &&
+ TAILQ_EMPTY(&stcb->asoc.sent_queue) &&
+ (stcb->asoc.stream_queue_cnt == 0)) {
+ sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED);
+ }
+ if (stcb) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ }
+ } else {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_DRYEVNT);
+ }
+
+ if (events->sctp_stream_reset_event) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT);
+ } else {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ break;
+
+ case SCTP_ADAPTATION_LAYER:
+ {
+ struct sctp_setadaptation *adap_bits;
+
+ SCTP_CHECK_AND_CAST(adap_bits, optval, struct sctp_setadaptation, optsize);
+ SCTP_INP_WLOCK(inp);
+ inp->sctp_ep.adaptation_layer_indicator = adap_bits->ssb_adaptation_ind;
+ SCTP_INP_WUNLOCK(inp);
+ }
+ break;
+#ifdef SCTP_DEBUG
+ case SCTP_SET_INITIAL_DBG_SEQ:
+ {
+ uint32_t *vvv;
+
+ SCTP_CHECK_AND_CAST(vvv, optval, uint32_t, optsize);
+ SCTP_INP_WLOCK(inp);
+ inp->sctp_ep.initial_sequence_debug = *vvv;
+ SCTP_INP_WUNLOCK(inp);
+ }
+ break;
+#endif
+ case SCTP_DEFAULT_SEND_PARAM:
+ {
+ struct sctp_sndrcvinfo *s_info;
+
+ SCTP_CHECK_AND_CAST(s_info, optval, struct sctp_sndrcvinfo, optsize);
+ SCTP_FIND_STCB(inp, stcb, s_info->sinfo_assoc_id);
+
+ if (stcb) {
+ if (s_info->sinfo_stream <= stcb->asoc.streamoutcnt) {
+ memcpy(&stcb->asoc.def_send, s_info, min(optsize, sizeof(stcb->asoc.def_send)));
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_WLOCK(inp);
+ memcpy(&inp->def_send, s_info, min(optsize, sizeof(inp->def_send)));
+ SCTP_INP_WUNLOCK(inp);
+ }
+ }
+ break;
+ case SCTP_PEER_ADDR_PARAMS:
+ /* Applies to the specific association */
+ {
+ struct sctp_paddrparams *paddrp;
+ struct sctp_nets *net;
+
+ SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, optsize);
+ SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id);
+ net = NULL;
+ if (stcb) {
+ net = sctp_findnet(stcb, (struct sockaddr *)&paddrp->spp_address);
+ } else {
+ /*
+ * We increment here since
+ * sctp_findassociation_ep_addr() will do a
+ * decrement if it finds the stcb as long as
+ * the locked tcb (last argument) is NOT a
+ * TCB, i.e. is NULL.
+ */
+ SCTP_INP_INCR_REF(inp);
+ stcb = sctp_findassociation_ep_addr(&inp,
+ (struct sockaddr *)&paddrp->spp_address,
+ &net, NULL, NULL);
+ if (stcb == NULL) {
+ SCTP_INP_DECR_REF(inp);
+ }
+ }
+ if (stcb && (net == NULL)) {
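+ /*
+ * An assoc was located but the supplied address is
+ * not one of its destinations; only the wildcard
+ * address is acceptable here, meaning the settings
+ * apply to the association as a whole.
+ */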
+ struct sockaddr *sa;
+
+ sa = (struct sockaddr *)&paddrp->spp_address;
+ if (sa->sa_family == AF_INET) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)sa;
+ if (sin->sin_addr.s_addr) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ SCTP_TCB_UNLOCK(stcb);
+ error = EINVAL;
+ break;
+ }
+ } else if (sa->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)sa;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ SCTP_TCB_UNLOCK(stcb);
+ error = EINVAL;
+ break;
+ }
+ } else {
+ error = EAFNOSUPPORT;
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
+ }
+ /* sanity checks */
+ if ((paddrp->spp_flags & SPP_HB_ENABLE) && (paddrp->spp_flags & SPP_HB_DISABLE)) {
+ if (stcb)
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ if ((paddrp->spp_flags & SPP_PMTUD_ENABLE) && (paddrp->spp_flags & SPP_PMTUD_DISABLE)) {
+ if (stcb)
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ if (stcb) {
+ /************************TCB SPECIFIC SET ******************/
+ /*
+ * Do we change the timer for HB?  We run
+ * only one.
+ */
+ int ovh = 0;
+
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ ovh = SCTP_MED_OVERHEAD;
+ } else {
+ ovh = SCTP_MED_V4_OVERHEAD;
+ }
+
+ if (paddrp->spp_hbinterval)
+ stcb->asoc.heart_beat_delay = paddrp->spp_hbinterval;
+ else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO)
+ stcb->asoc.heart_beat_delay = 0;
+
+ /* net-specific settings? */
+ if (net) {
+ /************************NET SPECIFIC SET ******************/
+ if (paddrp->spp_flags & SPP_HB_DEMAND) {
+ /* on demand HB */
+ if (sctp_send_hb(stcb, 1, net) < 0) {
+ /* asoc destroyed */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ break;
+ }
+ }
+ if (paddrp->spp_flags & SPP_HB_DISABLE) {
+ net->dest_state |= SCTP_ADDR_NOHB;
+ }
+ if (paddrp->spp_flags & SPP_HB_ENABLE) {
+ net->dest_state &= ~SCTP_ADDR_NOHB;
+ }
+ if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && (paddrp->spp_pathmtu >= SCTP_SMALLEST_PMTU)) {
+ if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10);
+ }
+ if (paddrp->spp_pathmtu > SCTP_DEFAULT_MINSEGMENT) {
+ net->mtu = paddrp->spp_pathmtu + ovh;
+ if (net->mtu < stcb->asoc.smallest_mtu) {
+ sctp_pathmtu_adjustment(inp, stcb, net, net->mtu);
+ }
+ }
+ }
+ if (paddrp->spp_flags & SPP_PMTUD_ENABLE) {
+ if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
+ sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
+ }
+ }
+ if (paddrp->spp_pathmaxrxt)
+ net->failure_threshold = paddrp->spp_pathmaxrxt;
+#ifdef INET
+ if (paddrp->spp_flags & SPP_IPV4_TOS) {
+ if (net->ro._l_addr.sin.sin_family == AF_INET) {
+ net->tos_flowlabel = paddrp->spp_ipv4_tos & 0x000000fc;
+ }
+ }
+#endif
+#ifdef INET6
+ if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL) {
+ if (net->ro._l_addr.sin6.sin6_family == AF_INET6) {
+ net->tos_flowlabel = paddrp->spp_ipv6_flowlabel;
+ }
+ }
+#endif
+ } else {
+ /************************ASSOC ONLY -- NO NET SPECIFIC SET ******************/
+ if (paddrp->spp_pathmaxrxt)
+ stcb->asoc.def_net_failure = paddrp->spp_pathmaxrxt;
+
+ if (paddrp->spp_flags & SPP_HB_ENABLE) {
+ /* Turn back on the timer */
+ stcb->asoc.hb_is_disabled = 0;
+ sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net);
+ }
+ if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && (paddrp->spp_pathmtu >= SCTP_SMALLEST_PMTU)) {
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10);
+ }
+ if (paddrp->spp_pathmtu > SCTP_DEFAULT_MINSEGMENT) {
+ net->mtu = paddrp->spp_pathmtu + ovh;
+ if (net->mtu < stcb->asoc.smallest_mtu) {
+ sctp_pathmtu_adjustment(inp, stcb, net, net->mtu);
+ }
+ }
+ }
+ }
+ if (paddrp->spp_flags & SPP_PMTUD_ENABLE) {
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
+ sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
+ }
+ }
+ }
+ if (paddrp->spp_flags & SPP_HB_DISABLE) {
+ int cnt_of_unconf = 0;
+ struct sctp_nets *lnet;
+
+ stcb->asoc.hb_is_disabled = 1;
+ TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) {
+ if (lnet->dest_state & SCTP_ADDR_UNCONFIRMED) {
+ cnt_of_unconf++;
+ }
+ }
+ /*
+ * stop the timer ONLY if we
+ * have no unconfirmed
+ * addresses
+ */
+ if (cnt_of_unconf == 0) {
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_11);
+ }
+ }
+ }
+ if (paddrp->spp_flags & SPP_HB_ENABLE) {
+ /* start up the timer. */
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net);
+ }
+ }
+#ifdef INET
+ if (paddrp->spp_flags & SPP_IPV4_TOS)
+ stcb->asoc.default_tos = paddrp->spp_ipv4_tos & 0x000000fc;
+#endif
+#ifdef INET6
+ if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL)
+ stcb->asoc.default_flowlabel = paddrp->spp_ipv6_flowlabel;
+#endif
+
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ /************************NO TCB, SET TO default stuff ******************/
+ SCTP_INP_WLOCK(inp);
+ /*
+ * The TOS/FLOWLABEL defaults are set with the
+ * IP-level options on the socket.
+ */
+ if (paddrp->spp_pathmaxrxt) {
+ inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt;
+ }
+ if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO)
+ inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0;
+ else if (paddrp->spp_hbinterval) {
+ if (paddrp->spp_hbinterval > SCTP_MAX_HB_INTERVAL)
+ paddrp->spp_hbinterval = SCTP_MAX_HB_INTERVAL;
+ inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval);
+ }
+ if (paddrp->spp_flags & SPP_HB_ENABLE) {
+ sctp_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT);
+
+ } else if (paddrp->spp_flags & SPP_HB_DISABLE) {
+ sctp_feature_on(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ }
+ break;
+ case SCTP_RTOINFO:
+ {
+ struct sctp_rtoinfo *srto;
+ uint32_t new_init, new_min, new_max;
+
+ SCTP_CHECK_AND_CAST(srto, optval, struct sctp_rtoinfo, optsize);
+ SCTP_FIND_STCB(inp, stcb, srto->srto_assoc_id);
+
+ if (stcb) {
+ if (srto->srto_initial)
+ new_init = srto->srto_initial;
+ else
+ new_init = stcb->asoc.initial_rto;
+ if (srto->srto_max)
+ new_max = srto->srto_max;
+ else
+ new_max = stcb->asoc.maxrto;
+ if (srto->srto_min)
+ new_min = srto->srto_min;
+ else
+ new_min = stcb->asoc.minrto;
+ if ((new_min <= new_init) && (new_init <= new_max)) {
+ stcb->asoc.initial_rto = new_init;
+ stcb->asoc.maxrto = new_max;
+ stcb->asoc.minrto = new_min;
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_WLOCK(inp);
+ if (srto->srto_initial)
+ new_init = srto->srto_initial;
+ else
+ new_init = inp->sctp_ep.initial_rto;
+ if (srto->srto_max)
+ new_max = srto->srto_max;
+ else
+ new_max = inp->sctp_ep.sctp_maxrto;
+ if (srto->srto_min)
+ new_min = srto->srto_min;
+ else
+ new_min = inp->sctp_ep.sctp_minrto;
+ if ((new_min <= new_init) && (new_init <= new_max)) {
+ inp->sctp_ep.initial_rto = new_init;
+ inp->sctp_ep.sctp_maxrto = new_max;
+ inp->sctp_ep.sctp_minrto = new_min;
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ }
+ break;
+ case SCTP_ASSOCINFO:
+ {
+ struct sctp_assocparams *sasoc;
+
+ SCTP_CHECK_AND_CAST(sasoc, optval, struct sctp_assocparams, optsize);
+ SCTP_FIND_STCB(inp, stcb, sasoc->sasoc_assoc_id);
+ if (sasoc->sasoc_cookie_life) {
+ /* boundary check the cookie life */
+ if (sasoc->sasoc_cookie_life < 1000)
+ sasoc->sasoc_cookie_life = 1000;
+ if (sasoc->sasoc_cookie_life > SCTP_MAX_COOKIE_LIFE) {
+ sasoc->sasoc_cookie_life = SCTP_MAX_COOKIE_LIFE;
+ }
+ }
+ if (stcb) {
+ if (sasoc->sasoc_asocmaxrxt)
+ stcb->asoc.max_send_times = sasoc->sasoc_asocmaxrxt;
+ sasoc->sasoc_number_peer_destinations = stcb->asoc.numnets;
+ sasoc->sasoc_peer_rwnd = 0;
+ sasoc->sasoc_local_rwnd = 0;
+ if (sasoc->sasoc_cookie_life) {
+ stcb->asoc.cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life);
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_INP_WLOCK(inp);
+ if (sasoc->sasoc_asocmaxrxt)
+ inp->sctp_ep.max_send_times = sasoc->sasoc_asocmaxrxt;
+ sasoc->sasoc_number_peer_destinations = 0;
+ sasoc->sasoc_peer_rwnd = 0;
+ sasoc->sasoc_local_rwnd = 0;
+ if (sasoc->sasoc_cookie_life) {
+ inp->sctp_ep.def_cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ }
+ break;
+ case SCTP_INITMSG:
+ {
+ struct sctp_initmsg *sinit;
+
+ SCTP_CHECK_AND_CAST(sinit, optval, struct sctp_initmsg, optsize);
+ SCTP_INP_WLOCK(inp);
+ if (sinit->sinit_num_ostreams)
+ inp->sctp_ep.pre_open_stream_count = sinit->sinit_num_ostreams;
+
+ if (sinit->sinit_max_instreams)
+ inp->sctp_ep.max_open_streams_intome = sinit->sinit_max_instreams;
+
+ if (sinit->sinit_max_attempts)
+ inp->sctp_ep.max_init_times = sinit->sinit_max_attempts;
+
+ if (sinit->sinit_max_init_timeo)
+ inp->sctp_ep.initial_init_rto_max = sinit->sinit_max_init_timeo;
+ SCTP_INP_WUNLOCK(inp);
+ }
+ break;
+ case SCTP_PRIMARY_ADDR:
+ {
+ struct sctp_setprim *spa;
+ struct sctp_nets *net, *lnet;
+
+ SCTP_CHECK_AND_CAST(spa, optval, struct sctp_setprim, optsize);
+ SCTP_FIND_STCB(inp, stcb, spa->ssp_assoc_id);
+
+ net = NULL;
+ if (stcb) {
+ net = sctp_findnet(stcb, (struct sockaddr *)&spa->ssp_addr);
+ } else {
+ /*
+ * We increment here since
+ * sctp_findassociation_ep_addr() will do a
+ * decrement if it finds the stcb as long as
+ * the locked tcb (last argument) is NOT a
+ * TCB, i.e. is NULL.
+ */
+ SCTP_INP_INCR_REF(inp);
+ stcb = sctp_findassociation_ep_addr(&inp,
+ (struct sockaddr *)&spa->ssp_addr,
+ &net, NULL, NULL);
+ if (stcb == NULL) {
+ SCTP_INP_DECR_REF(inp);
+ }
+ }
+
+ if ((stcb) && (net)) {
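+ /*
+ * Only switch the primary if the requested
+ * destination differs from the current one and has
+ * been confirmed reachable.
+ */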
+ if ((net != stcb->asoc.primary_destination) &&
+ (!(net->dest_state & SCTP_ADDR_UNCONFIRMED))) {
+ /* Ok we need to set it */
+ lnet = stcb->asoc.primary_destination;
+ if (sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net) == 0) {
+ if (net->dest_state & SCTP_ADDR_SWITCH_PRIMARY) {
+ net->dest_state |= SCTP_ADDR_DOUBLE_SWITCH;
+ }
+ net->dest_state |= SCTP_ADDR_SWITCH_PRIMARY;
+ }
+ }
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ if (stcb) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ }
+ break;
+ case SCTP_SET_DYNAMIC_PRIMARY:
+ {
+ union sctp_sockstore *ss;
+
+ error = priv_check(curthread,
+ PRIV_NETINET_RESERVEDPORT);
+ if (error)
+ break;
+
+ SCTP_CHECK_AND_CAST(ss, optval, union sctp_sockstore, optsize);
+ /* SUPER USER CHECK? */
+ error = sctp_dynamic_set_primary(&ss->sa, vrf_id);
+ }
+ break;
+ case SCTP_SET_PEER_PRIMARY_ADDR:
+ {
+ struct sctp_setpeerprim *sspp;
+
+ SCTP_CHECK_AND_CAST(sspp, optval, struct sctp_setpeerprim, optsize);
+ SCTP_FIND_STCB(inp, stcb, sspp->sspp_assoc_id);
+ if (stcb != NULL) {
+ struct sctp_ifa *ifa;
+
+ ifa = sctp_find_ifa_by_addr((struct sockaddr *)&sspp->sspp_addr,
+ stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED);
+ if (ifa == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ goto out_of_it;
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) {
+ /*
+ * Must validate the ifa found is in
+ * our ep
+ */
+ struct sctp_laddr *laddr;
+ int found = 0;
+
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ if (laddr->ifa == NULL) {
+ SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n",
+ __FUNCTION__);
+ continue;
+ }
+ if (laddr->ifa == ifa) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ goto out_of_it;
+ }
+ }
+ if (sctp_set_primary_ip_address_sa(stcb,
+ (struct sockaddr *)&sspp->sspp_addr) != 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ out_of_it:
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+
+ }
+ break;
+ case SCTP_BINDX_ADD_ADDR:
+ {
+ struct sctp_getaddresses *addrs;
+ size_t sz;
+ struct thread *td;
+
+ td = (struct thread *)p;
+ SCTP_CHECK_AND_CAST(addrs, optval, struct sctp_getaddresses,
+ optsize);
+ if (addrs->addr->sa_family == AF_INET) {
+ sz = sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in);
+ if (optsize < sz) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ break;
+ }
+ if (td != NULL && (error = prison_local_ip4(td->td_ucred, &(((struct sockaddr_in *)(addrs->addr))->sin_addr)))) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
+#ifdef INET6
+ } else if (addrs->addr->sa_family == AF_INET6) {
+ sz = sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in6);
+ if (optsize < sz) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ break;
+ }
+ if (td != NULL && (error = prison_local_ip6(td->td_ucred, &(((struct sockaddr_in6 *)(addrs->addr))->sin6_addr),
+ (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
+#endif
+ } else {
+ error = EAFNOSUPPORT;
+ break;
+ }
+ sctp_bindx_add_address(so, inp, addrs->addr,
+ addrs->sget_assoc_id, vrf_id,
+ &error, p);
+ }
+ break;
+ case SCTP_BINDX_REM_ADDR:
+ {
+ struct sctp_getaddresses *addrs;
+ size_t sz;
+ struct thread *td;
+
+ td = (struct thread *)p;
+
+ SCTP_CHECK_AND_CAST(addrs, optval, struct sctp_getaddresses, optsize);
+ if (addrs->addr->sa_family == AF_INET) {
+ sz = sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in);
+ if (optsize < sz) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ break;
+ }
+ if (td != NULL && (error = prison_local_ip4(td->td_ucred, &(((struct sockaddr_in *)(addrs->addr))->sin_addr)))) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
+#ifdef INET6
+ } else if (addrs->addr->sa_family == AF_INET6) {
+ sz = sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in6);
+ if (optsize < sz) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ break;
+ }
+ if (td != NULL && (error = prison_local_ip6(td->td_ucred, &(((struct sockaddr_in6 *)(addrs->addr))->sin6_addr),
+ (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) {
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
+#endif
+ } else {
+ error = EAFNOSUPPORT;
+ break;
+ }
+ sctp_bindx_delete_address(so, inp, addrs->addr,
+ addrs->sget_assoc_id, vrf_id,
+ &error);
+ }
+ break;
+ default:
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT);
+ error = ENOPROTOOPT;
+ break;
+ } /* end switch (opt) */
+ return (error);
+}
+
+int
+sctp_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ void *optval = NULL;
+ size_t optsize = 0;
+ struct sctp_inpcb *inp;
+ void *p;
+ int error = 0;
+
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == NULL) {
+ /* Behave as TCP does when the PCB is not set up. */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (ECONNRESET);
+ }
+ if (sopt->sopt_level != IPPROTO_SCTP) {
+ /* wrong proto level... send back up to IP */
+#ifdef INET6
+ if (INP_CHECK_SOCKAF(so, AF_INET6))
+ error = ip6_ctloutput(so, sopt);
+ else
+#endif /* INET6 */
+ error = ip_ctloutput(so, sopt);
+ return (error);
+ }
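+ /*
+ * Copy the user's option value into a temporary kernel buffer,
+ * dispatch to the set or get handler, and for a get copy the
+ * result back out to the user.
+ */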
+ optsize = sopt->sopt_valsize;
+ if (optsize) {
+ SCTP_MALLOC(optval, void *, optsize, SCTP_M_SOCKOPT);
+ if (optval == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOBUFS);
+ return (ENOBUFS);
+ }
+ error = sooptcopyin(sopt, optval, optsize, optsize);
+ if (error) {
+ SCTP_FREE(optval, SCTP_M_SOCKOPT);
+ goto out;
+ }
+ }
+ p = (void *)sopt->sopt_td;
+ if (sopt->sopt_dir == SOPT_SET) {
+ error = sctp_setopt(so, sopt->sopt_name, optval, optsize, p);
+ } else if (sopt->sopt_dir == SOPT_GET) {
+ error = sctp_getopt(so, sopt->sopt_name, optval, &optsize, p);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ if ((error == 0) && (optval != NULL)) {
+ error = sooptcopyout(sopt, optval, optsize);
+ SCTP_FREE(optval, SCTP_M_SOCKOPT);
+ } else if (optval != NULL) {
+ SCTP_FREE(optval, SCTP_M_SOCKOPT);
+ }
+out:
+ return (error);
+}
+
+
+static int
+sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p)
+{
+ int error = 0;
+ int create_lock_on = 0;
+ uint32_t vrf_id;
+ struct sctp_inpcb *inp;
+ struct sctp_tcb *stcb = NULL;
+
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == NULL) {
+ /* Behave as TCP does when the PCB is not set up. */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (ECONNRESET);
+ }
+ if (addr == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return EINVAL;
+ }
+#ifdef INET6
+ if (addr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6p;
+
+ if (addr->sa_len != sizeof(struct sockaddr_in6)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ sin6p = (struct sockaddr_in6 *)addr;
+ if (p != NULL && (error = prison_remote_ip6(p->td_ucred, &sin6p->sin6_addr)) != 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ return (error);
+ }
+ } else
+#endif
+ if (addr->sa_family == AF_INET) {
+ struct sockaddr_in *sinp;
+
+ if (addr->sa_len != sizeof(struct sockaddr_in)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ sinp = (struct sockaddr_in *)addr;
+ if (p != NULL && (error = prison_remote_ip4(p->td_ucred, &sinp->sin_addr)) != 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ return (error);
+ }
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EAFNOSUPPORT);
+ return (EAFNOSUPPORT);
+ }
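+ /*
+ * Take a reference on the PCB and hold the association-create
+ * lock while the new association is being set up.
+ */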
+ SCTP_INP_INCR_REF(inp);
+ SCTP_ASOC_CREATE_LOCK(inp);
+ create_lock_on = 1;
+
+
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
+ /* Should I really unlock ? */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EFAULT);
+ error = EFAULT;
+ goto out_now;
+ }
+#ifdef INET6
+ if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) &&
+ (addr->sa_family == AF_INET6)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ goto out_now;
+ }
+#endif /* INET6 */
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) ==
+ SCTP_PCB_FLAGS_UNBOUND) {
+ /* Bind an ephemeral port */
+ error = sctp_inpcb_bind(so, NULL, NULL, p);
+ if (error) {
+ goto out_now;
+ }
+ }
+ /* Now do we connect? */
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) &&
+ (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE))) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ goto out_now;
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
+ (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) {
+ /* We are already connected AND the TCP model */
+ SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE);
+ error = EADDRINUSE;
+ goto out_now;
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
+ SCTP_INP_RLOCK(inp);
+ stcb = LIST_FIRST(&inp->sctp_asoc_list);
+ SCTP_INP_RUNLOCK(inp);
+ } else {
+ /*
+ * We increment here since sctp_findassociation_ep_addr()
+ * will do a decrement if it finds the stcb as long as the
+ * locked tcb (last argument) is NOT a TCB.. aka NULL.
+ */
+ SCTP_INP_INCR_REF(inp);
+ stcb = sctp_findassociation_ep_addr(&inp, addr, NULL, NULL, NULL);
+ if (stcb == NULL) {
+ SCTP_INP_DECR_REF(inp);
+ } else {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ }
+ if (stcb != NULL) {
+		/* Already have or am bringing up an association */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
+ error = EALREADY;
+ goto out_now;
+ }
+ vrf_id = inp->def_vrf_id;
+ /* We are GOOD to go */
+ stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, p);
+ if (stcb == NULL) {
+ /* Gak! no memory */
+ goto out_now;
+ }
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) {
+ stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED;
+ /* Set the connected flag so we can queue data */
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_state &= ~SBS_CANTRCVMORE;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_state &= ~SBS_CANTSENDMORE;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_ISDISCONNECTING;
+ SOCK_UNLOCK(so);
+ soisconnecting(so);
+ }
+ SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT);
+ (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
+
+ /* initialize authentication parameters for the assoc */
+ sctp_initialize_auth_params(inp, stcb);
+
+ sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
+ SCTP_TCB_UNLOCK(stcb);
+out_now:
+ if (create_lock_on) {
+ SCTP_ASOC_CREATE_UNLOCK(inp);
+ }
+ SCTP_INP_DECR_REF(inp);
+ return error;
+}
+
+int
+sctp_listen(struct socket *so, int backlog, struct thread *p)
+{
+ /*
+ * Note this module depends on the protocol processing being called
+ * AFTER any socket level flags and backlog are applied to the
+ * socket. The traditional way that the socket flags are applied is
+ * AFTER protocol processing. We have made a change to the
+ * sys/kern/uipc_socket.c module to reverse this but this MUST be in
+ * place if the socket API for SCTP is to work properly.
+ */
+
+ int error = 0;
+ struct sctp_inpcb *inp;
+
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == 0) {
+		/* I made this the same as TCP since we are not set up. */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (ECONNRESET);
+ }
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) {
+ /* See if we have a listener */
+ struct sctp_inpcb *tinp;
+ union sctp_sockstore store, *sp;
+
+ sp = &store;
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) {
+ /* not bound all */
+ struct sctp_laddr *laddr;
+
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ memcpy(&store, &laddr->ifa->address, sizeof(store));
+ sp->sin.sin_port = inp->sctp_lport;
+ tinp = sctp_pcb_findep(&sp->sa, 0, 0, inp->def_vrf_id);
+ if (tinp && (tinp != inp) &&
+ ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0) &&
+ ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) &&
+ (tinp->sctp_socket->so_qlimit)) {
+ /*
+ * we have a listener already and
+					 * it's not this inp.
+ */
+ SCTP_INP_DECR_REF(tinp);
+ return (EADDRINUSE);
+ } else if (tinp) {
+ SCTP_INP_DECR_REF(tinp);
+ }
+ }
+ } else {
+ /* Setup a local addr bound all */
+ memset(&store, 0, sizeof(store));
+ store.sin.sin_port = inp->sctp_lport;
+#ifdef INET6
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ store.sa.sa_family = AF_INET6;
+ store.sa.sa_len = sizeof(struct sockaddr_in6);
+ }
+#endif
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) {
+ store.sa.sa_family = AF_INET;
+ store.sa.sa_len = sizeof(struct sockaddr_in);
+ }
+ tinp = sctp_pcb_findep(&sp->sa, 0, 0, inp->def_vrf_id);
+ if (tinp && (tinp != inp) &&
+ ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0) &&
+ ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) &&
+ (tinp->sctp_socket->so_qlimit)) {
+ /*
+				 * we have a listener already and it's not
+ * this inp.
+ */
+ SCTP_INP_DECR_REF(tinp);
+ return (EADDRINUSE);
+ } else if (tinp) {
+				SCTP_INP_DECR_REF(tinp);
+ }
+ }
+ }
+ SCTP_INP_RLOCK(inp);
+#ifdef SCTP_LOCK_LOGGING
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) {
+ sctp_log_lock(inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_SOCK);
+ }
+#endif
+ SOCK_LOCK(so);
+ error = solisten_proto_check(so);
+ if (error) {
+ SOCK_UNLOCK(so);
+ SCTP_INP_RUNLOCK(inp);
+ return (error);
+ }
+ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) &&
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+		/*
+		 * The unlucky case:
+		 * - We are in the TCP pool with this guy.
+		 * - Someone else is in the main inp slot.
+		 * - We must move this guy (the listener) to the main slot.
+		 * - We must then move the guy that was the listener to the
+		 *   TCP pool.
+		 */
+ if (sctp_swap_inpcb_for_listen(inp)) {
+ goto in_use;
+ }
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
+ (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) {
+ /* We are already connected AND the TCP model */
+in_use:
+ SCTP_INP_RUNLOCK(inp);
+ SOCK_UNLOCK(so);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE);
+ return (EADDRINUSE);
+ }
+ SCTP_INP_RUNLOCK(inp);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) {
+ /* We must do a bind. */
+ SOCK_UNLOCK(so);
+ if ((error = sctp_inpcb_bind(so, NULL, NULL, p))) {
+			/* bind error, probably a permissions problem */
+ return (error);
+ }
+ SOCK_LOCK(so);
+ }
+ /* It appears for 7.0 and on, we must always call this. */
+ solisten_proto(so, backlog);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) {
+ /* remove the ACCEPTCONN flag for one-to-many sockets */
+ so->so_options &= ~SO_ACCEPTCONN;
+ }
+ if (backlog == 0) {
+ /* turning off listen */
+ so->so_options &= ~SO_ACCEPTCONN;
+ }
+ SOCK_UNLOCK(so);
+ return (error);
+}
+
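+/*
+ * Count of receive wakeups that were deferred (SCTP_PCB_FLAGS_DONT_WAKE)
+ * and later delivered from sctp_accept(); only incremented here, apparently
+ * as a debugging statistic.
+ */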
+static int sctp_defered_wakeup_cnt = 0;
+
+int
+sctp_accept(struct socket *so, struct sockaddr **addr)
+{
+ struct sctp_tcb *stcb;
+ struct sctp_inpcb *inp;
+ union sctp_sockstore store;
+
+#ifdef INET6
+ int error;
+
+#endif
+ inp = (struct sctp_inpcb *)so->so_pcb;
+
+ if (inp == 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (ECONNRESET);
+ }
+ SCTP_INP_RLOCK(inp);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) {
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
+ return (EOPNOTSUPP);
+ }
+ if (so->so_state & SS_ISDISCONNECTED) {
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ECONNABORTED);
+ return (ECONNABORTED);
+ }
+ stcb = LIST_FIRST(&inp->sctp_asoc_list);
+ if (stcb == NULL) {
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (ECONNRESET);
+ }
+ SCTP_TCB_LOCK(stcb);
+ SCTP_INP_RUNLOCK(inp);
+ store = stcb->asoc.primary_destination->ro._l_addr;
+ stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE;
+ SCTP_TCB_UNLOCK(stcb);
+ switch (store.sa.sa_family) {
+ case AF_INET:
+ {
+ struct sockaddr_in *sin;
+
+ SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin);
+ if (sin == NULL)
+ return (ENOMEM);
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_port = ((struct sockaddr_in *)&store)->sin_port;
+ sin->sin_addr = ((struct sockaddr_in *)&store)->sin_addr;
+ *addr = (struct sockaddr *)sin;
+ break;
+ }
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sockaddr_in6 *sin6;
+
+ SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof *sin6);
+ if (sin6 == NULL)
+ return (ENOMEM);
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(*sin6);
+ sin6->sin6_port = ((struct sockaddr_in6 *)&store)->sin6_port;
+
+ sin6->sin6_addr = ((struct sockaddr_in6 *)&store)->sin6_addr;
+ if ((error = sa6_recoverscope(sin6)) != 0) {
+ SCTP_FREE_SONAME(sin6);
+ return (error);
+ }
+ *addr = (struct sockaddr *)sin6;
+ break;
+ }
+#endif
+ default:
+ /* TSNH */
+ break;
+ }
+ /* Wake any delayed sleep action */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) {
+ SCTP_INP_WLOCK(inp);
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_DONT_WAKE;
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEOUTPUT) {
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT;
+ SCTP_INP_WUNLOCK(inp);
+ SOCKBUF_LOCK(&inp->sctp_socket->so_snd);
+ if (sowriteable(inp->sctp_socket)) {
+ sowwakeup_locked(inp->sctp_socket);
+ } else {
+ SOCKBUF_UNLOCK(&inp->sctp_socket->so_snd);
+ }
+ SCTP_INP_WLOCK(inp);
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEINPUT) {
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT;
+ SCTP_INP_WUNLOCK(inp);
+ SOCKBUF_LOCK(&inp->sctp_socket->so_rcv);
+ if (soreadable(inp->sctp_socket)) {
+ sctp_defered_wakeup_cnt++;
+ sorwakeup_locked(inp->sctp_socket);
+ } else {
+ SOCKBUF_UNLOCK(&inp->sctp_socket->so_rcv);
+ }
+ SCTP_INP_WLOCK(inp);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ SCTP_TCB_LOCK(stcb);
+ sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7);
+ }
+ return (0);
+}
+
+int
+sctp_ingetaddr(struct socket *so, struct sockaddr **addr)
+{
+ struct sockaddr_in *sin;
+ uint32_t vrf_id;
+ struct sctp_inpcb *inp;
+ struct sctp_ifa *sctp_ifa;
+
+ /*
+ * Do the malloc first in case it blocks.
+ */
+ SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin);
+ if (sin == NULL)
+ return (ENOMEM);
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (!inp) {
+ SCTP_FREE_SONAME(sin);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return ECONNRESET;
+ }
+ SCTP_INP_RLOCK(inp);
+ sin->sin_port = inp->sctp_lport;
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
+ struct sctp_tcb *stcb;
+ struct sockaddr_in *sin_a;
+ struct sctp_nets *net;
+ int fnd;
+
+ stcb = LIST_FIRST(&inp->sctp_asoc_list);
+ if (stcb == NULL) {
+ goto notConn;
+ }
+ fnd = 0;
+ sin_a = NULL;
+ SCTP_TCB_LOCK(stcb);
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ sin_a = (struct sockaddr_in *)&net->ro._l_addr;
+ if (sin_a == NULL)
+ /* this will make coverity happy */
+ continue;
+
+ if (sin_a->sin_family == AF_INET) {
+ fnd = 1;
+ break;
+ }
+ }
+ if ((!fnd) || (sin_a == NULL)) {
+ /* punt */
+ SCTP_TCB_UNLOCK(stcb);
+ goto notConn;
+ }
+ vrf_id = inp->def_vrf_id;
+ sctp_ifa = sctp_source_address_selection(inp,
+ stcb,
+ (sctp_route_t *) & net->ro,
+ net, 0, vrf_id);
+ if (sctp_ifa) {
+ sin->sin_addr = sctp_ifa->address.sin.sin_addr;
+ sctp_free_ifa(sctp_ifa);
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ /* For the bound all case you get back 0 */
+ notConn:
+ sin->sin_addr.s_addr = 0;
+ }
+
+ } else {
+ /* Take the first IPv4 address in the list */
+ struct sctp_laddr *laddr;
+ int fnd = 0;
+
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ if (laddr->ifa->address.sa.sa_family == AF_INET) {
+ struct sockaddr_in *sin_a;
+
+ sin_a = (struct sockaddr_in *)&laddr->ifa->address.sa;
+ sin->sin_addr = sin_a->sin_addr;
+ fnd = 1;
+ break;
+ }
+ }
+ if (!fnd) {
+ SCTP_FREE_SONAME(sin);
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
+ return ENOENT;
+ }
+ }
+ SCTP_INP_RUNLOCK(inp);
+ (*addr) = (struct sockaddr *)sin;
+ return (0);
+}
+
+int
+sctp_peeraddr(struct socket *so, struct sockaddr **addr)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)*addr;
+ int fnd;
+ struct sockaddr_in *sin_a;
+ struct sctp_inpcb *inp;
+ struct sctp_tcb *stcb;
+ struct sctp_nets *net;
+
+ /* Do the malloc first in case it blocks. */
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if ((inp == NULL) ||
+ ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) {
+ /* UDP type and listeners will drop out here */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN);
+ return (ENOTCONN);
+ }
+ SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin);
+ if (sin == NULL)
+ return (ENOMEM);
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+
+	/* We must recapture in case we blocked */
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (!inp) {
+ SCTP_FREE_SONAME(sin);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return ECONNRESET;
+ }
+ SCTP_INP_RLOCK(inp);
+ stcb = LIST_FIRST(&inp->sctp_asoc_list);
+ if (stcb) {
+ SCTP_TCB_LOCK(stcb);
+ }
+ SCTP_INP_RUNLOCK(inp);
+ if (stcb == NULL) {
+ SCTP_FREE_SONAME(sin);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return ECONNRESET;
+ }
+ fnd = 0;
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ sin_a = (struct sockaddr_in *)&net->ro._l_addr;
+ if (sin_a->sin_family == AF_INET) {
+ fnd = 1;
+ sin->sin_port = stcb->rport;
+ sin->sin_addr = sin_a->sin_addr;
+ break;
+ }
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ if (!fnd) {
+ /* No IPv4 address */
+ SCTP_FREE_SONAME(sin);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
+ return ENOENT;
+ }
+ (*addr) = (struct sockaddr *)sin;
+ return (0);
+}
+
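+/*
+ * User-request vector handed to the protocol switch: the socket layer
+ * dispatches operations such as connect(2), listen(2) and accept(2) to the
+ * SCTP-specific handlers defined above through this table.
+ */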
+struct pr_usrreqs sctp_usrreqs = {
+ .pru_abort = sctp_abort,
+ .pru_accept = sctp_accept,
+ .pru_attach = sctp_attach,
+ .pru_bind = sctp_bind,
+ .pru_connect = sctp_connect,
+ .pru_control = in_control,
+ .pru_close = sctp_close,
+ .pru_detach = sctp_close,
+ .pru_sopoll = sopoll_generic,
+ .pru_flush = sctp_flush,
+ .pru_disconnect = sctp_disconnect,
+ .pru_listen = sctp_listen,
+ .pru_peeraddr = sctp_peeraddr,
+ .pru_send = sctp_sendm,
+ .pru_shutdown = sctp_shutdown,
+ .pru_sockaddr = sctp_ingetaddr,
+ .pru_sosend = sctp_sosend,
+ .pru_soreceive = sctp_soreceive
+};
diff --git a/freebsd/sys/netinet/sctp_var.h b/freebsd/sys/netinet/sctp_var.h
new file mode 100644
index 00000000..93b92038
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_var.h
@@ -0,0 +1,336 @@
+/*-
+ * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_var.h,v 1.24 2005/03/06 16:04:19 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef _NETINET_SCTP_VAR_HH_
+#define _NETINET_SCTP_VAR_HH_
+
+#include <freebsd/netinet/sctp_uio.h>
+
+#if defined(_KERNEL) || defined(__Userspace__)
+
+extern struct pr_usrreqs sctp_usrreqs;
+
+
+#define sctp_feature_on(inp, feature) (inp->sctp_features |= feature)
+#define sctp_feature_off(inp, feature) (inp->sctp_features &= ~feature)
+#define sctp_is_feature_on(inp, feature) ((inp->sctp_features & feature) == feature)
+#define sctp_is_feature_off(inp, feature) ((inp->sctp_features & feature) == 0)
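+/*
+ * sctp_is_feature_on() is true when all of the given feature bits are set,
+ * sctp_is_feature_off() when none of them are (see their use with
+ * SCTP_PCB_FLAGS_PORTREUSE in sctp_listen()).
+ */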
+
+
+/* managing mobility_feature in inpcb (by micchie) */
+#define sctp_mobility_feature_on(inp, feature) (inp->sctp_mobility_features |= feature)
+#define sctp_mobility_feature_off(inp, feature) (inp->sctp_mobility_features &= ~feature)
+#define sctp_is_mobility_feature_on(inp, feature) (inp->sctp_mobility_features & feature)
+#define sctp_is_mobility_feature_off(inp, feature) ((inp->sctp_mobility_features & feature) == 0)
+
+#define sctp_maxspace(sb) (max((sb)->sb_hiwat,SCTP_MINIMAL_RWND))
+
+#define sctp_sbspace(asoc, sb) ((long) ((sctp_maxspace(sb) > (asoc)->sb_cc) ? (sctp_maxspace(sb) - (asoc)->sb_cc) : 0))
+
+#define sctp_sbspace_failedmsgs(sb) ((long) ((sctp_maxspace(sb) > (sb)->sb_cc) ? (sctp_maxspace(sb) - (sb)->sb_cc) : 0))
+
+#define sctp_sbspace_sub(a,b) ((a > b) ? (a - b) : 0)
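+
+/*
+ * sctp_sbspace() reports how much room the given association still has in a
+ * socket buffer: the buffer limit (never taken as smaller than
+ * SCTP_MINIMAL_RWND) minus what the association already has queued, clamped
+ * at zero.  The _failedmsgs and _sub variants do the same arithmetic on a
+ * raw sockbuf and on plain numbers.
+ */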
+
+/*
+ * I tried to cache the readq entries at one point. But the reality
+ * is that it did not add any performance since this meant we had to
+ * lock the STCB on read. And at that point once you have to do an
+ * extra lock, it really does not matter if the lock is in the ZONE
+ * stuff or in our code. Note that this same problem would occur with
+ * an mbuf cache as well so it is not really worth doing, at least
+ * right now :-D
+ */
+
+#define sctp_free_a_readq(_stcb, _readq) { \
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), (_readq)); \
+ SCTP_DECR_READQ_COUNT(); \
+}
+
+#define sctp_alloc_a_readq(_stcb, _readq) { \
+ (_readq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_readq), struct sctp_queued_to_read); \
+ if ((_readq)) { \
+ SCTP_INCR_READQ_COUNT(); \
+ } \
+}
+
+#define sctp_free_a_strmoq(_stcb, _strmoq) { \
+ if ((_strmoq)->holds_key_ref) { \
+		sctp_auth_key_release((_stcb), (_strmoq)->auth_keyid); \
+ (_strmoq)->holds_key_ref = 0; \
+ } \
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_strmoq), (_strmoq)); \
+ SCTP_DECR_STRMOQ_COUNT(); \
+}
+
+#define sctp_alloc_a_strmoq(_stcb, _strmoq) { \
+ (_strmoq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_strmoq), struct sctp_stream_queue_pending); \
+ if ((_strmoq)) { \
+ memset(_strmoq, 0, sizeof(struct sctp_stream_queue_pending)); \
+ SCTP_INCR_STRMOQ_COUNT(); \
+ (_strmoq)->holds_key_ref = 0; \
+ } \
+}
+
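+/*
+ * sctp_free_a_chunk()/sctp_alloc_a_chunk() below maintain a small
+ * per-association free list: a released chunk is parked on asoc.free_chunks
+ * unless the per-association (sctp_asoc_free_resc_limit) or system-wide
+ * (sctp_system_free_resc_limit) limit is exceeded, and the allocator reuses
+ * those cached chunks before going back to the zone.
+ */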
+#define sctp_free_a_chunk(_stcb, _chk) { \
+ if ((_chk)->holds_key_ref) {\
+ sctp_auth_key_release((_stcb), (_chk)->auth_keyid); \
+ (_chk)->holds_key_ref = 0; \
+ } \
+ if (_stcb) { \
+ SCTP_TCB_LOCK_ASSERT((_stcb)); \
+ if ((_chk)->whoTo) { \
+ sctp_free_remote_addr((_chk)->whoTo); \
+ (_chk)->whoTo = NULL; \
+ } \
+ if (((_stcb)->asoc.free_chunk_cnt > SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit)) || \
+ (SCTP_BASE_INFO(ipi_free_chunks) > SCTP_BASE_SYSCTL(sctp_system_free_resc_limit))) { \
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \
+ SCTP_DECR_CHK_COUNT(); \
+ } else { \
+ TAILQ_INSERT_TAIL(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \
+ (_stcb)->asoc.free_chunk_cnt++; \
+ atomic_add_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \
+ } \
+ } else { \
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \
+ SCTP_DECR_CHK_COUNT(); \
+ } \
+}
+
+#define sctp_alloc_a_chunk(_stcb, _chk) { \
+ if (TAILQ_EMPTY(&(_stcb)->asoc.free_chunks)) { \
+ (_chk) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_chunk), struct sctp_tmit_chunk); \
+ if ((_chk)) { \
+ SCTP_INCR_CHK_COUNT(); \
+ (_chk)->whoTo = NULL; \
+ (_chk)->holds_key_ref = 0; \
+ } \
+ } else { \
+ (_chk) = TAILQ_FIRST(&(_stcb)->asoc.free_chunks); \
+ TAILQ_REMOVE(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \
+ atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \
+ (_chk)->holds_key_ref = 0; \
+ SCTP_STAT_INCR(sctps_cached_chk); \
+ (_stcb)->asoc.free_chunk_cnt--; \
+ } \
+}
+
+
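+/*
+ * Drop one reference on a remote address (sctp_nets) entry; only when the
+ * last reference goes away are its timers stopped, the cached route and
+ * source address released, and the structure returned to its zone.
+ */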
+#define sctp_free_remote_addr(__net) { \
+ if ((__net)) { \
+ if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&(__net)->ref_count)) { \
+ (void)SCTP_OS_TIMER_STOP(&(__net)->rxt_timer.timer); \
+ (void)SCTP_OS_TIMER_STOP(&(__net)->pmtu_timer.timer); \
+ (void)SCTP_OS_TIMER_STOP(&(__net)->fr_timer.timer); \
+ if ((__net)->ro.ro_rt) { \
+ RTFREE((__net)->ro.ro_rt); \
+ (__net)->ro.ro_rt = NULL; \
+ } \
+ if ((__net)->src_addr_selected) { \
+ sctp_free_ifa((__net)->ro._s_addr); \
+ (__net)->ro._s_addr = NULL; \
+ } \
+ (__net)->src_addr_selected = 0; \
+ (__net)->dest_state = SCTP_ADDR_NOT_REACHABLE; \
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_net), (__net)); \
+ SCTP_DECR_RADDR_COUNT(); \
+ } \
+ } \
+}
+
+#define sctp_sbfree(ctl, stcb, sb, m) { \
+ SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_cc, SCTP_BUF_LEN((m))); \
+ SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_mbcnt, MSIZE); \
+ if (((ctl)->do_not_ref_stcb == 0) && stcb) {\
+ SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.sb_cc, SCTP_BUF_LEN((m))); \
+ SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \
+ } \
+ if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \
+ SCTP_BUF_TYPE(m) != MT_OOBDATA) \
+ atomic_subtract_int(&(sb)->sb_ctl,SCTP_BUF_LEN((m))); \
+}
+
+#define sctp_sballoc(stcb, sb, m) { \
+ atomic_add_int(&(sb)->sb_cc,SCTP_BUF_LEN((m))); \
+ atomic_add_int(&(sb)->sb_mbcnt, MSIZE); \
+ if (stcb) { \
+ atomic_add_int(&(stcb)->asoc.sb_cc,SCTP_BUF_LEN((m))); \
+ atomic_add_int(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \
+ } \
+ if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \
+ SCTP_BUF_TYPE(m) != MT_OOBDATA) \
+ atomic_add_int(&(sb)->sb_ctl,SCTP_BUF_LEN((m))); \
+}
+
+
+#define sctp_ucount_incr(val) { \
+ val++; \
+}
+
+#define sctp_ucount_decr(val) { \
+ if (val > 0) { \
+ val--; \
+ } else { \
+ val = 0; \
+ } \
+}
+
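+/*
+ * Strip leading zero-length mbufs off a chain: each empty mbuf at the head
+ * is unlinked and freed, and (data) is advanced to the first mbuf that
+ * actually carries data (or NULL).
+ */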
+#define sctp_mbuf_crush(data) do { \
+ struct mbuf *_m; \
+ _m = (data); \
+ while(_m && (SCTP_BUF_LEN(_m) == 0)) { \
+ (data) = SCTP_BUF_NEXT(_m); \
+ SCTP_BUF_NEXT(_m) = NULL; \
+ sctp_m_free(_m); \
+ _m = (data); \
+ } \
+} while (0)
+
+#define sctp_flight_size_decrease(tp1) do { \
+ if (tp1->whoTo->flight_size >= tp1->book_size) \
+ tp1->whoTo->flight_size -= tp1->book_size; \
+ else \
+ tp1->whoTo->flight_size = 0; \
+} while (0)
+
+#define sctp_flight_size_increase(tp1) do { \
+ (tp1)->whoTo->flight_size += (tp1)->book_size; \
+} while (0)
+
+#ifdef SCTP_FS_SPEC_LOG
+#define sctp_total_flight_decrease(stcb, tp1) do { \
+ if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \
+ stcb->asoc.fs_index = 0;\
+ stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \
+ stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.TSN_seq; \
+ stcb->asoc.fslog[stcb->asoc.fs_index].book = tp1->book_size; \
+ stcb->asoc.fslog[stcb->asoc.fs_index].sent = tp1->sent; \
+ stcb->asoc.fslog[stcb->asoc.fs_index].incr = 0; \
+ stcb->asoc.fslog[stcb->asoc.fs_index].decr = 1; \
+ stcb->asoc.fs_index++; \
+ tp1->window_probe = 0; \
+ if (stcb->asoc.total_flight >= tp1->book_size) { \
+ stcb->asoc.total_flight -= tp1->book_size; \
+ if (stcb->asoc.total_flight_count > 0) \
+ stcb->asoc.total_flight_count--; \
+ } else { \
+ stcb->asoc.total_flight = 0; \
+ stcb->asoc.total_flight_count = 0; \
+ } \
+} while (0)
+
+#define sctp_total_flight_increase(stcb, tp1) do { \
+ if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \
+ stcb->asoc.fs_index = 0;\
+ stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \
+ stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.TSN_seq; \
+ stcb->asoc.fslog[stcb->asoc.fs_index].book = tp1->book_size; \
+ stcb->asoc.fslog[stcb->asoc.fs_index].sent = tp1->sent; \
+ stcb->asoc.fslog[stcb->asoc.fs_index].incr = 1; \
+ stcb->asoc.fslog[stcb->asoc.fs_index].decr = 0; \
+ stcb->asoc.fs_index++; \
+ (stcb)->asoc.total_flight_count++; \
+ (stcb)->asoc.total_flight += (tp1)->book_size; \
+} while (0)
+
+#else
+
+#define sctp_total_flight_decrease(stcb, tp1) do { \
+ tp1->window_probe = 0; \
+ if (stcb->asoc.total_flight >= tp1->book_size) { \
+ stcb->asoc.total_flight -= tp1->book_size; \
+ if (stcb->asoc.total_flight_count > 0) \
+ stcb->asoc.total_flight_count--; \
+ } else { \
+ stcb->asoc.total_flight = 0; \
+ stcb->asoc.total_flight_count = 0; \
+ } \
+} while (0)
+
+#define sctp_total_flight_increase(stcb, tp1) do { \
+ (stcb)->asoc.total_flight_count++; \
+ (stcb)->asoc.total_flight += (tp1)->book_size; \
+} while (0)
+
+#endif
+
+
+struct sctp_nets;
+struct sctp_inpcb;
+struct sctp_tcb;
+struct sctphdr;
+
+
+void sctp_close(struct socket *so);
+int sctp_disconnect(struct socket *so);
+
+void sctp_ctlinput __P((int, struct sockaddr *, void *));
+int sctp_ctloutput __P((struct socket *, struct sockopt *));
+void sctp_input_with_port __P((struct mbuf *, int, uint16_t));
+void sctp_input __P((struct mbuf *, int));
+void sctp_pathmtu_adjustment __P((struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *, uint16_t));
+void sctp_drain __P((void));
+void sctp_init __P((void));
+
+void sctp_finish(void);
+
+int sctp_flush(struct socket *, int);
+int sctp_shutdown __P((struct socket *));
+void sctp_notify
+__P((struct sctp_inpcb *, struct ip *ip, struct sctphdr *,
+ struct sockaddr *, struct sctp_tcb *,
+ struct sctp_nets *));
+
+ int sctp_bindx(struct socket *, int, struct sockaddr_storage *,
+ int, int, struct proc *);
+
+/* can't use sctp_assoc_t here */
+ int sctp_peeloff(struct socket *, struct socket *, int, caddr_t, int *);
+
+ int sctp_ingetaddr(struct socket *,
+ struct sockaddr **
+);
+
+ int sctp_peeraddr(struct socket *,
+ struct sockaddr **
+);
+
+ int sctp_listen(struct socket *, int, struct thread *);
+
+ int sctp_accept(struct socket *, struct sockaddr **);
+
+#endif /* _KERNEL */
+
+#endif /* !_NETINET_SCTP_VAR_HH_ */
diff --git a/freebsd/sys/netinet/sctputil.c b/freebsd/sys/netinet/sctputil.c
new file mode 100644
index 00000000..7e8ac1ea
--- /dev/null
+++ b/freebsd/sys/netinet/sctputil.c
@@ -0,0 +1,6977 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctputil.c,v 1.37 2005/03/07 23:26:09 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/netinet/sctp_os.h>
+#include <freebsd/netinet/sctp_pcb.h>
+#include <freebsd/netinet/sctputil.h>
+#include <freebsd/netinet/sctp_var.h>
+#include <freebsd/netinet/sctp_sysctl.h>
+#ifdef INET6
+#endif
+#include <freebsd/netinet/sctp_header.h>
+#include <freebsd/netinet/sctp_output.h>
+#include <freebsd/netinet/sctp_uio.h>
+#include <freebsd/netinet/sctp_timer.h>
+#include <freebsd/netinet/sctp_indata.h>/* for sctp_deliver_data() */
+#include <freebsd/netinet/sctp_auth.h>
+#include <freebsd/netinet/sctp_asconf.h>
+#include <freebsd/netinet/sctp_cc_functions.h>
+#include <freebsd/netinet/sctp_bsd_addr.h>
+
+
+#ifndef KTR_SCTP
+#define KTR_SCTP KTR_SUBSYS
+#endif
+
+void
+sctp_sblog(struct sockbuf *sb,
+ struct sctp_tcb *stcb, int from, int incr)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ sctp_clog.x.sb.stcb = stcb;
+ sctp_clog.x.sb.so_sbcc = sb->sb_cc;
+ if (stcb)
+ sctp_clog.x.sb.stcb_sbcc = stcb->asoc.sb_cc;
+ else
+ sctp_clog.x.sb.stcb_sbcc = 0;
+ sctp_clog.x.sb.incr = incr;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_SB,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+}
+
+void
+sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ sctp_clog.x.close.inp = (void *)inp;
+ sctp_clog.x.close.sctp_flags = inp->sctp_flags;
+ if (stcb) {
+ sctp_clog.x.close.stcb = (void *)stcb;
+ sctp_clog.x.close.state = (uint16_t) stcb->asoc.state;
+ } else {
+ sctp_clog.x.close.stcb = 0;
+ sctp_clog.x.close.state = 0;
+ }
+ sctp_clog.x.close.loc = loc;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_CLOSE,
+ 0,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+}
+
+
+void
+rto_logging(struct sctp_nets *net, int from)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ memset(&sctp_clog, 0, sizeof(sctp_clog));
+ sctp_clog.x.rto.net = (void *)net;
+ sctp_clog.x.rto.rtt = net->prev_rtt;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_RTT,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+
+}
+
+void
+sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t tsn, uint16_t sseq, uint16_t stream, int from)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ sctp_clog.x.strlog.stcb = stcb;
+ sctp_clog.x.strlog.n_tsn = tsn;
+ sctp_clog.x.strlog.n_sseq = sseq;
+ sctp_clog.x.strlog.e_tsn = 0;
+ sctp_clog.x.strlog.e_sseq = 0;
+ sctp_clog.x.strlog.strm = stream;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_STRM,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+
+}
+
+void
+sctp_log_nagle_event(struct sctp_tcb *stcb, int action)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ sctp_clog.x.nagle.stcb = (void *)stcb;
+ sctp_clog.x.nagle.total_flight = stcb->asoc.total_flight;
+ sctp_clog.x.nagle.total_in_queue = stcb->asoc.total_output_queue_size;
+ sctp_clog.x.nagle.count_in_queue = stcb->asoc.chunks_on_out_queue;
+ sctp_clog.x.nagle.count_in_flight = stcb->asoc.total_flight_count;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_NAGLE,
+ action,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+}
+
+
+void
+sctp_log_sack(uint32_t old_cumack, uint32_t cumack, uint32_t tsn, uint16_t gaps, uint16_t dups, int from)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ sctp_clog.x.sack.cumack = cumack;
+ sctp_clog.x.sack.oldcumack = old_cumack;
+ sctp_clog.x.sack.tsn = tsn;
+ sctp_clog.x.sack.numGaps = gaps;
+ sctp_clog.x.sack.numDups = dups;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_SACK,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+}
+
+void
+sctp_log_map(uint32_t map, uint32_t cum, uint32_t high, int from)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ memset(&sctp_clog, 0, sizeof(sctp_clog));
+ sctp_clog.x.map.base = map;
+ sctp_clog.x.map.cum = cum;
+ sctp_clog.x.map.high = high;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_MAP,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+}
+
+void
+sctp_log_fr(uint32_t biggest_tsn, uint32_t biggest_new_tsn, uint32_t tsn,
+ int from)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ memset(&sctp_clog, 0, sizeof(sctp_clog));
+ sctp_clog.x.fr.largest_tsn = biggest_tsn;
+ sctp_clog.x.fr.largest_new_tsn = biggest_new_tsn;
+ sctp_clog.x.fr.tsn = tsn;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_FR,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+
+}
+
+
+void
+sctp_log_mb(struct mbuf *m, int from)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ sctp_clog.x.mb.mp = m;
+ sctp_clog.x.mb.mbuf_flags = (uint8_t) (SCTP_BUF_GET_FLAGS(m));
+ sctp_clog.x.mb.size = (uint16_t) (SCTP_BUF_LEN(m));
+ sctp_clog.x.mb.data = SCTP_BUF_AT(m, 0);
+ if (SCTP_BUF_IS_EXTENDED(m)) {
+ sctp_clog.x.mb.ext = SCTP_BUF_EXTEND_BASE(m);
+ sctp_clog.x.mb.refcnt = (uint8_t) (SCTP_BUF_EXTEND_REFCNT(m));
+ } else {
+ sctp_clog.x.mb.ext = 0;
+ sctp_clog.x.mb.refcnt = 0;
+ }
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_MBUF,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+}
+
+
+void
+sctp_log_strm_del(struct sctp_queued_to_read *control, struct sctp_queued_to_read *poschk,
+ int from)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ if (control == NULL) {
+ SCTP_PRINTF("Gak log of NULL?\n");
+ return;
+ }
+ sctp_clog.x.strlog.stcb = control->stcb;
+ sctp_clog.x.strlog.n_tsn = control->sinfo_tsn;
+ sctp_clog.x.strlog.n_sseq = control->sinfo_ssn;
+ sctp_clog.x.strlog.strm = control->sinfo_stream;
+ if (poschk != NULL) {
+ sctp_clog.x.strlog.e_tsn = poschk->sinfo_tsn;
+ sctp_clog.x.strlog.e_sseq = poschk->sinfo_ssn;
+ } else {
+ sctp_clog.x.strlog.e_tsn = 0;
+ sctp_clog.x.strlog.e_sseq = 0;
+ }
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_STRM,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+
+}
+
+void
+sctp_log_cwnd(struct sctp_tcb *stcb, struct sctp_nets *net, int augment, uint8_t from)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ sctp_clog.x.cwnd.net = net;
+ if (stcb->asoc.send_queue_cnt > 255)
+ sctp_clog.x.cwnd.cnt_in_send = 255;
+ else
+ sctp_clog.x.cwnd.cnt_in_send = stcb->asoc.send_queue_cnt;
+ if (stcb->asoc.stream_queue_cnt > 255)
+ sctp_clog.x.cwnd.cnt_in_str = 255;
+ else
+ sctp_clog.x.cwnd.cnt_in_str = stcb->asoc.stream_queue_cnt;
+
+ if (net) {
+ sctp_clog.x.cwnd.cwnd_new_value = net->cwnd;
+ sctp_clog.x.cwnd.inflight = net->flight_size;
+ sctp_clog.x.cwnd.pseudo_cumack = net->pseudo_cumack;
+ sctp_clog.x.cwnd.meets_pseudo_cumack = net->new_pseudo_cumack;
+ sctp_clog.x.cwnd.need_new_pseudo_cumack = net->find_pseudo_cumack;
+ }
+ if (SCTP_CWNDLOG_PRESEND == from) {
+ sctp_clog.x.cwnd.meets_pseudo_cumack = stcb->asoc.peers_rwnd;
+ }
+ sctp_clog.x.cwnd.cwnd_augment = augment;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_CWND,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+
+}
+
+void
+sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ memset(&sctp_clog, 0, sizeof(sctp_clog));
+ if (inp) {
+ sctp_clog.x.lock.sock = (void *)inp->sctp_socket;
+
+ } else {
+ sctp_clog.x.lock.sock = (void *)NULL;
+ }
+ sctp_clog.x.lock.inp = (void *)inp;
+ if (stcb) {
+ sctp_clog.x.lock.tcb_lock = mtx_owned(&stcb->tcb_mtx);
+ } else {
+ sctp_clog.x.lock.tcb_lock = SCTP_LOCK_UNKNOWN;
+ }
+ if (inp) {
+ sctp_clog.x.lock.inp_lock = mtx_owned(&inp->inp_mtx);
+ sctp_clog.x.lock.create_lock = mtx_owned(&inp->inp_create_mtx);
+ } else {
+ sctp_clog.x.lock.inp_lock = SCTP_LOCK_UNKNOWN;
+ sctp_clog.x.lock.create_lock = SCTP_LOCK_UNKNOWN;
+ }
+ sctp_clog.x.lock.info_lock = rw_wowned(&SCTP_BASE_INFO(ipi_ep_mtx));
+ if (inp && (inp->sctp_socket)) {
+ sctp_clog.x.lock.sock_lock = mtx_owned(&(inp->sctp_socket->so_rcv.sb_mtx));
+ sctp_clog.x.lock.sockrcvbuf_lock = mtx_owned(&(inp->sctp_socket->so_rcv.sb_mtx));
+ sctp_clog.x.lock.socksndbuf_lock = mtx_owned(&(inp->sctp_socket->so_snd.sb_mtx));
+ } else {
+ sctp_clog.x.lock.sock_lock = SCTP_LOCK_UNKNOWN;
+ sctp_clog.x.lock.sockrcvbuf_lock = SCTP_LOCK_UNKNOWN;
+ sctp_clog.x.lock.socksndbuf_lock = SCTP_LOCK_UNKNOWN;
+ }
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_LOCK_EVENT,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+
+}
+
+void
+sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *net, int error, int burst, uint8_t from)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ memset(&sctp_clog, 0, sizeof(sctp_clog));
+ sctp_clog.x.cwnd.net = net;
+ sctp_clog.x.cwnd.cwnd_new_value = error;
+ sctp_clog.x.cwnd.inflight = net->flight_size;
+ sctp_clog.x.cwnd.cwnd_augment = burst;
+ if (stcb->asoc.send_queue_cnt > 255)
+ sctp_clog.x.cwnd.cnt_in_send = 255;
+ else
+ sctp_clog.x.cwnd.cnt_in_send = stcb->asoc.send_queue_cnt;
+ if (stcb->asoc.stream_queue_cnt > 255)
+ sctp_clog.x.cwnd.cnt_in_str = 255;
+ else
+ sctp_clog.x.cwnd.cnt_in_str = stcb->asoc.stream_queue_cnt;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_MAXBURST,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+
+}
+
+void
+sctp_log_rwnd(uint8_t from, uint32_t peers_rwnd, uint32_t snd_size, uint32_t overhead)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ sctp_clog.x.rwnd.rwnd = peers_rwnd;
+ sctp_clog.x.rwnd.send_size = snd_size;
+ sctp_clog.x.rwnd.overhead = overhead;
+ sctp_clog.x.rwnd.new_rwnd = 0;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_RWND,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+}
+
+void
+sctp_log_rwnd_set(uint8_t from, uint32_t peers_rwnd, uint32_t flight_size, uint32_t overhead, uint32_t a_rwndval)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ sctp_clog.x.rwnd.rwnd = peers_rwnd;
+ sctp_clog.x.rwnd.send_size = flight_size;
+ sctp_clog.x.rwnd.overhead = overhead;
+ sctp_clog.x.rwnd.new_rwnd = a_rwndval;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_RWND,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+}
+
+void
+sctp_log_mbcnt(uint8_t from, uint32_t total_oq, uint32_t book, uint32_t total_mbcnt_q, uint32_t mbcnt)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ sctp_clog.x.mbcnt.total_queue_size = total_oq;
+ sctp_clog.x.mbcnt.size_change = book;
+ sctp_clog.x.mbcnt.total_queue_mb_size = total_mbcnt_q;
+ sctp_clog.x.mbcnt.mbcnt_change = mbcnt;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_MBCNT,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+
+}
+
+void
+sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
+{
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_MISC_EVENT,
+ from,
+ a, b, c, d);
+}
+
+void
+sctp_wakeup_log(struct sctp_tcb *stcb, uint32_t cumtsn, uint32_t wake_cnt, int from)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ sctp_clog.x.wake.stcb = (void *)stcb;
+ sctp_clog.x.wake.wake_cnt = wake_cnt;
+ sctp_clog.x.wake.flight = stcb->asoc.total_flight_count;
+ sctp_clog.x.wake.send_q = stcb->asoc.send_queue_cnt;
+ sctp_clog.x.wake.sent_q = stcb->asoc.sent_queue_cnt;
+
+ if (stcb->asoc.stream_queue_cnt < 0xff)
+ sctp_clog.x.wake.stream_qcnt = (uint8_t) stcb->asoc.stream_queue_cnt;
+ else
+ sctp_clog.x.wake.stream_qcnt = 0xff;
+
+ if (stcb->asoc.chunks_on_out_queue < 0xff)
+ sctp_clog.x.wake.chunks_on_oque = (uint8_t) stcb->asoc.chunks_on_out_queue;
+ else
+ sctp_clog.x.wake.chunks_on_oque = 0xff;
+
+ sctp_clog.x.wake.sctpflags = 0;
+	/* set in the deferred mode stuff */
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE)
+ sctp_clog.x.wake.sctpflags |= 1;
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_WAKEOUTPUT)
+ sctp_clog.x.wake.sctpflags |= 2;
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_WAKEINPUT)
+ sctp_clog.x.wake.sctpflags |= 4;
+ /* what about the sb */
+ if (stcb->sctp_socket) {
+ struct socket *so = stcb->sctp_socket;
+
+ sctp_clog.x.wake.sbflags = (uint8_t) ((so->so_snd.sb_flags & 0x00ff));
+ } else {
+ sctp_clog.x.wake.sbflags = 0xff;
+ }
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_WAKE,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+
+}
+
+void
+sctp_log_block(uint8_t from, struct socket *so, struct sctp_association *asoc, int sendlen)
+{
+ struct sctp_cwnd_log sctp_clog;
+
+ sctp_clog.x.blk.onsb = asoc->total_output_queue_size;
+ sctp_clog.x.blk.send_sent_qcnt = (uint16_t) (asoc->send_queue_cnt + asoc->sent_queue_cnt);
+ sctp_clog.x.blk.peer_rwnd = asoc->peers_rwnd;
+ sctp_clog.x.blk.stream_qcnt = (uint16_t) asoc->stream_queue_cnt;
+ sctp_clog.x.blk.chunks_on_oque = (uint16_t) asoc->chunks_on_out_queue;
+ sctp_clog.x.blk.flight_size = (uint16_t) (asoc->total_flight / 1024);
+ sctp_clog.x.blk.sndlen = sendlen;
+ SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
+ SCTP_LOG_EVENT_BLOCK,
+ from,
+ sctp_clog.x.misc.log1,
+ sctp_clog.x.misc.log2,
+ sctp_clog.x.misc.log3,
+ sctp_clog.x.misc.log4);
+
+}
+
+int
+sctp_fill_stat_log(void *optval, size_t *optsize)
+{
+ /* May need to fix this if ktrdump does not work */
+ return (0);
+}
+
+#ifdef SCTP_AUDITING_ENABLED
+uint8_t sctp_audit_data[SCTP_AUDIT_SIZE][2];
+static int sctp_audit_indx = 0;
+
+static
+void
+sctp_print_audit_report(void)
+{
+ int i;
+ int cnt;
+
+ cnt = 0;
+ for (i = sctp_audit_indx; i < SCTP_AUDIT_SIZE; i++) {
+ if ((sctp_audit_data[i][0] == 0xe0) &&
+ (sctp_audit_data[i][1] == 0x01)) {
+ cnt = 0;
+ SCTP_PRINTF("\n");
+ } else if (sctp_audit_data[i][0] == 0xf0) {
+ cnt = 0;
+ SCTP_PRINTF("\n");
+ } else if ((sctp_audit_data[i][0] == 0xc0) &&
+ (sctp_audit_data[i][1] == 0x01)) {
+ SCTP_PRINTF("\n");
+ cnt = 0;
+ }
+ SCTP_PRINTF("%2.2x%2.2x ", (uint32_t) sctp_audit_data[i][0],
+ (uint32_t) sctp_audit_data[i][1]);
+ cnt++;
+ if ((cnt % 14) == 0)
+ SCTP_PRINTF("\n");
+ }
+ for (i = 0; i < sctp_audit_indx; i++) {
+ if ((sctp_audit_data[i][0] == 0xe0) &&
+ (sctp_audit_data[i][1] == 0x01)) {
+ cnt = 0;
+ SCTP_PRINTF("\n");
+ } else if (sctp_audit_data[i][0] == 0xf0) {
+ cnt = 0;
+ SCTP_PRINTF("\n");
+ } else if ((sctp_audit_data[i][0] == 0xc0) &&
+ (sctp_audit_data[i][1] == 0x01)) {
+ SCTP_PRINTF("\n");
+ cnt = 0;
+ }
+ SCTP_PRINTF("%2.2x%2.2x ", (uint32_t) sctp_audit_data[i][0],
+ (uint32_t) sctp_audit_data[i][1]);
+ cnt++;
+ if ((cnt % 14) == 0)
+ SCTP_PRINTF("\n");
+ }
+ SCTP_PRINTF("\n");
+}
+
+void
+sctp_auditing(int from, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ int resend_cnt, tot_out, rep, tot_book_cnt;
+ struct sctp_nets *lnet;
+ struct sctp_tmit_chunk *chk;
+
+ sctp_audit_data[sctp_audit_indx][0] = 0xAA;
+ sctp_audit_data[sctp_audit_indx][1] = 0x000000ff & from;
+ sctp_audit_indx++;
+ if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
+ sctp_audit_indx = 0;
+ }
+ if (inp == NULL) {
+ sctp_audit_data[sctp_audit_indx][0] = 0xAF;
+ sctp_audit_data[sctp_audit_indx][1] = 0x01;
+ sctp_audit_indx++;
+ if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
+ sctp_audit_indx = 0;
+ }
+ return;
+ }
+ if (stcb == NULL) {
+ sctp_audit_data[sctp_audit_indx][0] = 0xAF;
+ sctp_audit_data[sctp_audit_indx][1] = 0x02;
+ sctp_audit_indx++;
+ if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
+ sctp_audit_indx = 0;
+ }
+ return;
+ }
+ sctp_audit_data[sctp_audit_indx][0] = 0xA1;
+ sctp_audit_data[sctp_audit_indx][1] =
+ (0x000000ff & stcb->asoc.sent_queue_retran_cnt);
+ sctp_audit_indx++;
+ if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
+ sctp_audit_indx = 0;
+ }
+ rep = 0;
+ tot_book_cnt = 0;
+ resend_cnt = tot_out = 0;
+ TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
+ if (chk->sent == SCTP_DATAGRAM_RESEND) {
+ resend_cnt++;
+ } else if (chk->sent < SCTP_DATAGRAM_RESEND) {
+ tot_out += chk->book_size;
+ tot_book_cnt++;
+ }
+ }
+ if (resend_cnt != stcb->asoc.sent_queue_retran_cnt) {
+ sctp_audit_data[sctp_audit_indx][0] = 0xAF;
+ sctp_audit_data[sctp_audit_indx][1] = 0xA1;
+ sctp_audit_indx++;
+ if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
+ sctp_audit_indx = 0;
+ }
+ SCTP_PRINTF("resend_cnt:%d asoc-tot:%d\n",
+ resend_cnt, stcb->asoc.sent_queue_retran_cnt);
+ rep = 1;
+ stcb->asoc.sent_queue_retran_cnt = resend_cnt;
+ sctp_audit_data[sctp_audit_indx][0] = 0xA2;
+ sctp_audit_data[sctp_audit_indx][1] =
+ (0x000000ff & stcb->asoc.sent_queue_retran_cnt);
+ sctp_audit_indx++;
+ if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
+ sctp_audit_indx = 0;
+ }
+ }
+ if (tot_out != stcb->asoc.total_flight) {
+ sctp_audit_data[sctp_audit_indx][0] = 0xAF;
+ sctp_audit_data[sctp_audit_indx][1] = 0xA2;
+ sctp_audit_indx++;
+ if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
+ sctp_audit_indx = 0;
+ }
+ rep = 1;
+ SCTP_PRINTF("tot_flt:%d asoc_tot:%d\n", tot_out,
+ (int)stcb->asoc.total_flight);
+ stcb->asoc.total_flight = tot_out;
+ }
+ if (tot_book_cnt != stcb->asoc.total_flight_count) {
+ sctp_audit_data[sctp_audit_indx][0] = 0xAF;
+ sctp_audit_data[sctp_audit_indx][1] = 0xA5;
+ sctp_audit_indx++;
+ if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
+ sctp_audit_indx = 0;
+ }
+ rep = 1;
+ SCTP_PRINTF("tot_flt_book:%d\n", tot_book_cnt);
+
+ stcb->asoc.total_flight_count = tot_book_cnt;
+ }
+ tot_out = 0;
+ TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) {
+ tot_out += lnet->flight_size;
+ }
+ if (tot_out != stcb->asoc.total_flight) {
+ sctp_audit_data[sctp_audit_indx][0] = 0xAF;
+ sctp_audit_data[sctp_audit_indx][1] = 0xA3;
+ sctp_audit_indx++;
+ if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
+ sctp_audit_indx = 0;
+ }
+ rep = 1;
+ SCTP_PRINTF("real flight:%d net total was %d\n",
+ stcb->asoc.total_flight, tot_out);
+ /* now corrective action */
+ TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) {
+
+ tot_out = 0;
+ TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
+ if ((chk->whoTo == lnet) &&
+ (chk->sent < SCTP_DATAGRAM_RESEND)) {
+ tot_out += chk->book_size;
+ }
+ }
+ if (lnet->flight_size != tot_out) {
+ SCTP_PRINTF("net:%p flight was %d corrected to %d\n",
+ lnet, lnet->flight_size,
+ tot_out);
+ lnet->flight_size = tot_out;
+ }
+ }
+ }
+ if (rep) {
+ sctp_print_audit_report();
+ }
+}
+
+void
+sctp_audit_log(uint8_t ev, uint8_t fd)
+{
+
+ sctp_audit_data[sctp_audit_indx][0] = ev;
+ sctp_audit_data[sctp_audit_indx][1] = fd;
+ sctp_audit_indx++;
+ if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
+ sctp_audit_indx = 0;
+ }
+}
+
+#endif
+
+/*
+ * sctp_stop_timers_for_shutdown() should be called
+ * when entering the SHUTDOWN_SENT or SHUTDOWN_ACK_SENT
+ * state to make sure that all timers are stopped.
+ */
+void
+sctp_stop_timers_for_shutdown(struct sctp_tcb *stcb)
+{
+ struct sctp_association *asoc;
+ struct sctp_nets *net;
+
+ asoc = &stcb->asoc;
+
+ (void)SCTP_OS_TIMER_STOP(&asoc->hb_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&asoc->dack_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&asoc->strreset_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&asoc->asconf_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&asoc->autoclose_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&asoc->delayed_event_timer.timer);
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ (void)SCTP_OS_TIMER_STOP(&net->fr_timer.timer);
+ (void)SCTP_OS_TIMER_STOP(&net->pmtu_timer.timer);
+ }
+}
+
+/*
+ * A list of sizes based on typical MTUs, used only if the next-hop size is
+ * not returned.
+ */
+static uint32_t sctp_mtu_sizes[] = {
+ 68,
+ 296,
+ 508,
+ 512,
+ 544,
+ 576,
+ 1006,
+ 1492,
+ 1500,
+ 1536,
+ 2002,
+ 2048,
+ 4352,
+ 4464,
+ 8166,
+ 17914,
+ 32000,
+ 65535
+};
+
+/*
+ * Return the largest MTU smaller than val. If there is no
+ * entry, just return val.
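+ * For example, with the table above sctp_get_prev_mtu(1500) returns 1492.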
+ */
+uint32_t
+sctp_get_prev_mtu(uint32_t val)
+{
+ uint32_t i;
+
+ if (val <= sctp_mtu_sizes[0]) {
+ return (val);
+ }
+ for (i = 1; i < (sizeof(sctp_mtu_sizes) / sizeof(uint32_t)); i++) {
+ if (val <= sctp_mtu_sizes[i]) {
+ break;
+ }
+ }
+ return (sctp_mtu_sizes[i - 1]);
+}
+
+/*
+ * Return the smallest MTU larger than val. If there is no
+ * entry, just return val.
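+ * For example, with the table above sctp_get_next_mtu(inp, 1500) returns 1536.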
+ */
+uint32_t
+sctp_get_next_mtu(struct sctp_inpcb *inp, uint32_t val)
+{
+ /* select another MTU that is just bigger than this one */
+ uint32_t i;
+
+ for (i = 0; i < (sizeof(sctp_mtu_sizes) / sizeof(uint32_t)); i++) {
+ if (val < sctp_mtu_sizes[i]) {
+ return (sctp_mtu_sizes[i]);
+ }
+ }
+ return (val);
+}
+
+void
+sctp_fill_random_store(struct sctp_pcb *m)
+{
+ /*
+	 * Here we use MD5/SHA-1 to hash our good random numbers together with
+	 * our counter. The result becomes our new pool of good random numbers
+	 * and we then set up to hand these out. Note that we do no locking to
+	 * protect this. That is ok, since if competing callers get here we
+	 * will just get more gobbledygook in the random store, which is what
+	 * we want anyway. There is a danger that two callers will use the
+	 * same random numbers, but that's ok too since that is random as
+	 * well :->
+ */
+ m->store_at = 0;
+ (void)sctp_hmac(SCTP_HMAC, (uint8_t *) m->random_numbers,
+ sizeof(m->random_numbers), (uint8_t *) & m->random_counter,
+ sizeof(m->random_counter), (uint8_t *) m->random_store);
+ m->random_counter++;
+}
+
+uint32_t
+sctp_select_initial_TSN(struct sctp_pcb *inp)
+{
+ /*
+	 * A true implementation should use a random selection process to get
+	 * the initial stream sequence number, using RFC 1750 as a good
+	 * guideline.
+ */
+ uint32_t x, *xp;
+ uint8_t *p;
+ int store_at, new_store;
+
+ if (inp->initial_sequence_debug != 0) {
+ uint32_t ret;
+
+ ret = inp->initial_sequence_debug;
+ inp->initial_sequence_debug++;
+ return (ret);
+ }
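+	/*
+	 * Carve the next four bytes out of the random store without taking a
+	 * lock: the store_at cursor is advanced with atomic_cmpset and the
+	 * loser of a race simply retries; wrapping back to offset 0 triggers
+	 * a refill of the store.
+	 */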
+retry:
+ store_at = inp->store_at;
+ new_store = store_at + sizeof(uint32_t);
+ if (new_store >= (SCTP_SIGNATURE_SIZE - 3)) {
+ new_store = 0;
+ }
+ if (!atomic_cmpset_int(&inp->store_at, store_at, new_store)) {
+ goto retry;
+ }
+ if (new_store == 0) {
+ /* Refill the random store */
+ sctp_fill_random_store(inp);
+ }
+ p = &inp->random_store[store_at];
+ xp = (uint32_t *) p;
+ x = *xp;
+ return (x);
+}
+
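+/*
+ * Draw 32-bit verification tags from the random store until one is found
+ * that is non-zero and that sctp_is_vtag_good() accepts for this
+ * local/remote port pair.
+ */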
+uint32_t
+sctp_select_a_tag(struct sctp_inpcb *inp, uint16_t lport, uint16_t rport, int save_in_twait)
+{
+ uint32_t x, not_done;
+ struct timeval now;
+
+ (void)SCTP_GETTIME_TIMEVAL(&now);
+ not_done = 1;
+ while (not_done) {
+ x = sctp_select_initial_TSN(&inp->sctp_ep);
+ if (x == 0) {
+ /* we never use 0 */
+ continue;
+ }
+ if (sctp_is_vtag_good(inp, x, lport, rport, &now, save_in_twait)) {
+ not_done = 0;
+ }
+ }
+ return (x);
+}
+
+int
+sctp_init_asoc(struct sctp_inpcb *m, struct sctp_tcb *stcb,
+ uint32_t override_tag, uint32_t vrf_id)
+{
+ struct sctp_association *asoc;
+
+ /*
+ * Anything set to zero is taken care of by the allocation routine's
+ * bzero
+ */
+
+ /*
+	 * Up front, select what scoping to apply to the addresses I tell my
+	 * peer. Not sure what to do with these right now; we will need to
+	 * come up with a way to set them. We may need to pass them through
+	 * from the caller in the sctp_aloc_assoc() function.
+ */
+ int i;
+
+ asoc = &stcb->asoc;
+ /* init all variables to a known value. */
+ SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_INUSE);
+ asoc->max_burst = m->sctp_ep.max_burst;
+ asoc->heart_beat_delay = TICKS_TO_MSEC(m->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]);
+ asoc->cookie_life = m->sctp_ep.def_cookie_life;
+ asoc->sctp_cmt_on_off = m->sctp_cmt_on_off;
+ asoc->sctp_nr_sack_on_off = (uint8_t) SCTP_BASE_SYSCTL(sctp_nr_sack_on_off);
+ asoc->sctp_cmt_pf = (uint8_t) SCTP_BASE_SYSCTL(sctp_cmt_pf);
+ asoc->sctp_frag_point = m->sctp_frag_point;
+#ifdef INET
+ asoc->default_tos = m->ip_inp.inp.inp_ip_tos;
+#else
+ asoc->default_tos = 0;
+#endif
+
+#ifdef INET6
+ asoc->default_flowlabel = ((struct in6pcb *)m)->in6p_flowinfo;
+#else
+ asoc->default_flowlabel = 0;
+#endif
+ asoc->sb_send_resv = 0;
+ if (override_tag) {
+ asoc->my_vtag = override_tag;
+ } else {
+ asoc->my_vtag = sctp_select_a_tag(m, stcb->sctp_ep->sctp_lport, stcb->rport, 1);
+ }
+ /* Get the nonce tags */
+ asoc->my_vtag_nonce = sctp_select_a_tag(m, stcb->sctp_ep->sctp_lport, stcb->rport, 0);
+ asoc->peer_vtag_nonce = sctp_select_a_tag(m, stcb->sctp_ep->sctp_lport, stcb->rport, 0);
+ asoc->vrf_id = vrf_id;
+
+ if (sctp_is_feature_on(m, SCTP_PCB_FLAGS_DONOT_HEARTBEAT))
+ asoc->hb_is_disabled = 1;
+ else
+ asoc->hb_is_disabled = 0;
+
+#ifdef SCTP_ASOCLOG_OF_TSNS
+ asoc->tsn_in_at = 0;
+ asoc->tsn_out_at = 0;
+ asoc->tsn_in_wrapped = 0;
+ asoc->tsn_out_wrapped = 0;
+ asoc->cumack_log_at = 0;
+ asoc->cumack_log_atsnt = 0;
+#endif
+#ifdef SCTP_FS_SPEC_LOG
+ asoc->fs_index = 0;
+#endif
+ asoc->refcnt = 0;
+ asoc->assoc_up_sent = 0;
+ asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number = asoc->sending_seq =
+ sctp_select_initial_TSN(&m->sctp_ep);
+ asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1;
+	/* we are optimistic here */
+ asoc->peer_supports_pktdrop = 1;
+ asoc->peer_supports_nat = 0;
+ asoc->sent_queue_retran_cnt = 0;
+
+ /* for CMT */
+ asoc->last_net_cmt_send_started = NULL;
+
+ /* This will need to be adjusted */
+ asoc->last_cwr_tsn = asoc->init_seq_number - 1;
+ asoc->last_acked_seq = asoc->init_seq_number - 1;
+ asoc->advanced_peer_ack_point = asoc->last_acked_seq;
+ asoc->asconf_seq_in = asoc->last_acked_seq;
+
+ /* here we are different, we hold the next one we expect */
+ asoc->str_reset_seq_in = asoc->last_acked_seq + 1;
+
+ asoc->initial_init_rto_max = m->sctp_ep.initial_init_rto_max;
+ asoc->initial_rto = m->sctp_ep.initial_rto;
+
+ asoc->max_init_times = m->sctp_ep.max_init_times;
+ asoc->max_send_times = m->sctp_ep.max_send_times;
+ asoc->def_net_failure = m->sctp_ep.def_net_failure;
+ asoc->free_chunk_cnt = 0;
+
+ asoc->iam_blocking = 0;
+ /* ECN Nonce initialization */
+ asoc->context = m->sctp_context;
+ asoc->def_send = m->def_send;
+ asoc->ecn_nonce_allowed = 0;
+ asoc->receiver_nonce_sum = 1;
+ asoc->nonce_sum_expect_base = 1;
+ asoc->nonce_sum_check = 1;
+ asoc->nonce_resync_tsn = 0;
+ asoc->nonce_wait_for_ecne = 0;
+ asoc->nonce_wait_tsn = 0;
+ asoc->delayed_ack = TICKS_TO_MSEC(m->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]);
+ asoc->sack_freq = m->sctp_ep.sctp_sack_freq;
+ asoc->pr_sctp_cnt = 0;
+ asoc->total_output_queue_size = 0;
+
+ if (m->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ struct in6pcb *inp6;
+
+		/* It's a V6 socket */
+ inp6 = (struct in6pcb *)m;
+ asoc->ipv6_addr_legal = 1;
+ /* Now look at the binding flag to see if V4 will be legal */
+ if (SCTP_IPV6_V6ONLY(inp6) == 0) {
+ asoc->ipv4_addr_legal = 1;
+ } else {
+ /* V4 addresses are NOT legal on the association */
+ asoc->ipv4_addr_legal = 0;
+ }
+ } else {
+		/* It's a V4 socket, not V6 */
+ asoc->ipv4_addr_legal = 1;
+ asoc->ipv6_addr_legal = 0;
+ }
+
+ asoc->my_rwnd = max(SCTP_SB_LIMIT_RCV(m->sctp_socket), SCTP_MINIMAL_RWND);
+ asoc->peers_rwnd = SCTP_SB_LIMIT_RCV(m->sctp_socket);
+
+ asoc->smallest_mtu = m->sctp_frag_point;
+ asoc->minrto = m->sctp_ep.sctp_minrto;
+ asoc->maxrto = m->sctp_ep.sctp_maxrto;
+
+ asoc->locked_on_sending = NULL;
+ asoc->stream_locked_on = 0;
+ asoc->ecn_echo_cnt_onq = 0;
+ asoc->stream_locked = 0;
+
+ asoc->send_sack = 1;
+
+ LIST_INIT(&asoc->sctp_restricted_addrs);
+
+ TAILQ_INIT(&asoc->nets);
+ TAILQ_INIT(&asoc->pending_reply_queue);
+ TAILQ_INIT(&asoc->asconf_ack_sent);
+ /* Setup to fill the hb random cache at first HB */
+ asoc->hb_random_idx = 4;
+
+ asoc->sctp_autoclose_ticks = m->sctp_ep.auto_close_time;
+
+ /*
+ * JRS - Pick the default congestion control module based on the
+ * sysctl.
+ */
+ switch (m->sctp_ep.sctp_default_cc_module) {
+ /* JRS - Standard TCP congestion control */
+ case SCTP_CC_RFC2581:
+ {
+ stcb->asoc.congestion_control_module = SCTP_CC_RFC2581;
+ stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_set_initial_cc_param;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_cwnd_update_after_sack;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_cwnd_update_after_fr;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_cwnd_update_after_timeout;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_cwnd_update_after_ecn_echo;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_cwnd_update_after_fr_timer;
+ break;
+ }
+ /* JRS - High Speed TCP congestion control (Floyd) */
+ case SCTP_CC_HSTCP:
+ {
+ stcb->asoc.congestion_control_module = SCTP_CC_HSTCP;
+ stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_set_initial_cc_param;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_hs_cwnd_update_after_sack;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_hs_cwnd_update_after_fr;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_cwnd_update_after_timeout;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_cwnd_update_after_ecn_echo;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_cwnd_update_after_fr_timer;
+ break;
+ }
+ /* JRS - HTCP congestion control */
+ case SCTP_CC_HTCP:
+ {
+ stcb->asoc.congestion_control_module = SCTP_CC_HTCP;
+ stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_htcp_set_initial_cc_param;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_htcp_cwnd_update_after_sack;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_htcp_cwnd_update_after_fr;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_htcp_cwnd_update_after_timeout;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_htcp_cwnd_update_after_ecn_echo;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_htcp_cwnd_update_after_fr_timer;
+ break;
+ }
+ /* JRS - By default, use RFC2581 */
+ default:
+ {
+ stcb->asoc.congestion_control_module = SCTP_CC_RFC2581;
+ stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_set_initial_cc_param;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_cwnd_update_after_sack;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_cwnd_update_after_fr;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_cwnd_update_after_timeout;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_cwnd_update_after_ecn_echo;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output;
+ stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_cwnd_update_after_fr_timer;
+ break;
+ }
+ }
+
+ /*
+ * Now the stream parameters, here we allocate space for all streams
+ * that we request by default.
+ */
+ asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams =
+ m->sctp_ep.pre_open_stream_count;
+ SCTP_MALLOC(asoc->strmout, struct sctp_stream_out *,
+ asoc->streamoutcnt * sizeof(struct sctp_stream_out),
+ SCTP_M_STRMO);
+ if (asoc->strmout == NULL) {
+ /* big trouble no memory */
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOMEM);
+ return (ENOMEM);
+ }
+ for (i = 0; i < asoc->streamoutcnt; i++) {
+ /*
+		 * The inbound side must be set to 0xffff. Also NOTE that when
+		 * we get the INIT-ACK back (for the INIT sender) we MUST
+		 * reduce the count (streamoutcnt), but first check whether we
+		 * sent to any of the upper streams that were dropped (if some
+		 * were). Those that were dropped must be notified to the
+		 * upper layer as failed to send.
+ */
+ asoc->strmout[i].next_sequence_sent = 0x0;
+ TAILQ_INIT(&asoc->strmout[i].outqueue);
+ asoc->strmout[i].stream_no = i;
+ asoc->strmout[i].last_msg_incomplete = 0;
+ asoc->strmout[i].next_spoke.tqe_next = 0;
+ asoc->strmout[i].next_spoke.tqe_prev = 0;
+ }
+ /* Now the mapping array */
+ asoc->mapping_array_size = SCTP_INITIAL_MAPPING_ARRAY;
+ SCTP_MALLOC(asoc->mapping_array, uint8_t *, asoc->mapping_array_size,
+ SCTP_M_MAP);
+ if (asoc->mapping_array == NULL) {
+ SCTP_FREE(asoc->strmout, SCTP_M_STRMO);
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOMEM);
+ return (ENOMEM);
+ }
+ memset(asoc->mapping_array, 0, asoc->mapping_array_size);
+ SCTP_MALLOC(asoc->nr_mapping_array, uint8_t *, asoc->mapping_array_size,
+ SCTP_M_MAP);
+ if (asoc->nr_mapping_array == NULL) {
+ SCTP_FREE(asoc->strmout, SCTP_M_STRMO);
+ SCTP_FREE(asoc->mapping_array, SCTP_M_MAP);
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOMEM);
+ return (ENOMEM);
+ }
+ memset(asoc->nr_mapping_array, 0, asoc->mapping_array_size);
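+	/*
+	 * Two parallel bitmaps are kept from here on: mapping_array tracks
+	 * renegable TSNs and nr_mapping_array tracks non-renegable ones
+	 * (see sctp_print_mapping_array below). Both are allocated with the
+	 * same size and are always grown together.
+	 */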
+
+ /* Now the init of the other outqueues */
+ TAILQ_INIT(&asoc->free_chunks);
+ TAILQ_INIT(&asoc->out_wheel);
+ TAILQ_INIT(&asoc->control_send_queue);
+ TAILQ_INIT(&asoc->asconf_send_queue);
+ TAILQ_INIT(&asoc->send_queue);
+ TAILQ_INIT(&asoc->sent_queue);
+ TAILQ_INIT(&asoc->reasmqueue);
+ TAILQ_INIT(&asoc->resetHead);
+ asoc->max_inbound_streams = m->sctp_ep.max_open_streams_intome;
+ TAILQ_INIT(&asoc->asconf_queue);
+ /* authentication fields */
+ asoc->authinfo.random = NULL;
+ asoc->authinfo.active_keyid = 0;
+ asoc->authinfo.assoc_key = NULL;
+ asoc->authinfo.assoc_keyid = 0;
+ asoc->authinfo.recv_key = NULL;
+ asoc->authinfo.recv_keyid = 0;
+ LIST_INIT(&asoc->shared_keys);
+ asoc->marked_retrans = 0;
+ asoc->timoinit = 0;
+ asoc->timodata = 0;
+ asoc->timosack = 0;
+ asoc->timoshutdown = 0;
+ asoc->timoheartbeat = 0;
+ asoc->timocookie = 0;
+ asoc->timoshutdownack = 0;
+ (void)SCTP_GETTIME_TIMEVAL(&asoc->start_time);
+ asoc->discontinuity_time = asoc->start_time;
+ /*
+ * sa_ignore MEMLEAK {memory is put in the assoc mapping array and
+ * freed later when the association is freed.
+ */
+ return (0);
+}
+
+void
+sctp_print_mapping_array(struct sctp_association *asoc)
+{
+ unsigned int i, limit;
+
+ printf("Mapping array size: %d, baseTSN: %8.8x, cumAck: %8.8x, highestTSN: (%8.8x, %8.8x).\n",
+ asoc->mapping_array_size,
+ asoc->mapping_array_base_tsn,
+ asoc->cumulative_tsn,
+ asoc->highest_tsn_inside_map,
+ asoc->highest_tsn_inside_nr_map);
+ for (limit = asoc->mapping_array_size; limit > 1; limit--) {
+ if (asoc->mapping_array[limit - 1]) {
+ break;
+ }
+ }
+ printf("Renegable mapping array (last %d entries are zero):\n", asoc->mapping_array_size - limit);
+ for (i = 0; i < limit; i++) {
+ printf("%2.2x%c", asoc->mapping_array[i], ((i + 1) % 16) ? ' ' : '\n');
+ }
+ if (limit % 16)
+ printf("\n");
+ for (limit = asoc->mapping_array_size; limit > 1; limit--) {
+ if (asoc->nr_mapping_array[limit - 1]) {
+ break;
+ }
+ }
+ printf("Non renegable mapping array (last %d entries are zero):\n", asoc->mapping_array_size - limit);
+ for (i = 0; i < limit; i++) {
+ printf("%2.2x%c", asoc->nr_mapping_array[i], ((i + 1) % 16) ? ' ' : '\n');
+ }
+ if (limit % 16)
+ printf("\n");
+}
+
+int
+sctp_expand_mapping_array(struct sctp_association *asoc, uint32_t needed)
+{
+ /* mapping array needs to grow */
+ uint8_t *new_array1, *new_array2;
+ uint32_t new_size;
+
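+	/*
+	 * The mapping array holds one bit per TSN, so round the extra TSNs
+	 * needed up to whole bytes and add the configured increment.
+	 */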
+ new_size = asoc->mapping_array_size + ((needed + 7) / 8 + SCTP_MAPPING_ARRAY_INCR);
+ SCTP_MALLOC(new_array1, uint8_t *, new_size, SCTP_M_MAP);
+ SCTP_MALLOC(new_array2, uint8_t *, new_size, SCTP_M_MAP);
+ if ((new_array1 == NULL) || (new_array2 == NULL)) {
+ /* can't get more, forget it */
+ SCTP_PRINTF("No memory for expansion of SCTP mapping array %d\n", new_size);
+ if (new_array1) {
+ SCTP_FREE(new_array1, SCTP_M_MAP);
+ }
+ if (new_array2) {
+ SCTP_FREE(new_array2, SCTP_M_MAP);
+ }
+ return (-1);
+ }
+ memset(new_array1, 0, new_size);
+ memset(new_array2, 0, new_size);
+ memcpy(new_array1, asoc->mapping_array, asoc->mapping_array_size);
+ memcpy(new_array2, asoc->nr_mapping_array, asoc->mapping_array_size);
+ SCTP_FREE(asoc->mapping_array, SCTP_M_MAP);
+ SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP);
+ asoc->mapping_array = new_array1;
+ asoc->nr_mapping_array = new_array2;
+ asoc->mapping_array_size = new_size;
+ return (0);
+}
+
+
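+/*
+ * Walk every endpoint (or a single one, with SCTP_ITERATOR_DO_SINGLE_INP)
+ * whose flags/features match, applying the caller-supplied function_inp,
+ * function_assoc and function_inp_end callbacks. After
+ * SCTP_ITERATOR_MAX_AT_ONCE associations the INFO and ITERATOR locks are
+ * dropped and re-taken so other threads can make progress.
+ */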
+static void
+sctp_iterator_work(struct sctp_iterator *it)
+{
+ int iteration_count = 0;
+ int inp_skip = 0;
+ int first_in = 1;
+ struct sctp_inpcb *tinp;
+
+ SCTP_INP_INFO_RLOCK();
+ SCTP_ITERATOR_LOCK();
+ if (it->inp) {
+ SCTP_INP_RLOCK(it->inp);
+ SCTP_INP_DECR_REF(it->inp);
+ }
+ if (it->inp == NULL) {
+ /* iterator is complete */
+done_with_iterator:
+ SCTP_ITERATOR_UNLOCK();
+ SCTP_INP_INFO_RUNLOCK();
+ if (it->function_atend != NULL) {
+ (*it->function_atend) (it->pointer, it->val);
+ }
+ SCTP_FREE(it, SCTP_M_ITER);
+ return;
+ }
+select_a_new_ep:
+ if (first_in) {
+ first_in = 0;
+ } else {
+ SCTP_INP_RLOCK(it->inp);
+ }
+ while (((it->pcb_flags) &&
+ ((it->inp->sctp_flags & it->pcb_flags) != it->pcb_flags)) ||
+ ((it->pcb_features) &&
+ ((it->inp->sctp_features & it->pcb_features) != it->pcb_features))) {
+ /* endpoint flags or features don't match, so keep looking */
+ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) {
+ SCTP_INP_RUNLOCK(it->inp);
+ goto done_with_iterator;
+ }
+ tinp = it->inp;
+ it->inp = LIST_NEXT(it->inp, sctp_list);
+ SCTP_INP_RUNLOCK(tinp);
+ if (it->inp == NULL) {
+ goto done_with_iterator;
+ }
+ SCTP_INP_RLOCK(it->inp);
+ }
+ /* now go through each assoc which is in the desired state */
+ if (it->done_current_ep == 0) {
+ if (it->function_inp != NULL)
+ inp_skip = (*it->function_inp) (it->inp, it->pointer, it->val);
+ it->done_current_ep = 1;
+ }
+ if (it->stcb == NULL) {
+ /* run the per instance function */
+ it->stcb = LIST_FIRST(&it->inp->sctp_asoc_list);
+ }
+ if ((inp_skip) || it->stcb == NULL) {
+ if (it->function_inp_end != NULL) {
+ inp_skip = (*it->function_inp_end) (it->inp,
+ it->pointer,
+ it->val);
+ }
+ SCTP_INP_RUNLOCK(it->inp);
+ goto no_stcb;
+ }
+ while (it->stcb) {
+ SCTP_TCB_LOCK(it->stcb);
+ if (it->asoc_state && ((it->stcb->asoc.state & it->asoc_state) != it->asoc_state)) {
+ /* not in the right state... keep looking */
+ SCTP_TCB_UNLOCK(it->stcb);
+ goto next_assoc;
+ }
+ /* see if we have limited out the iterator loop */
+ iteration_count++;
+ if (iteration_count > SCTP_ITERATOR_MAX_AT_ONCE) {
+ /* Pause to let others grab the lock */
+ atomic_add_int(&it->stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(it->stcb);
+ SCTP_INP_INCR_REF(it->inp);
+ SCTP_INP_RUNLOCK(it->inp);
+ SCTP_ITERATOR_UNLOCK();
+ SCTP_INP_INFO_RUNLOCK();
+ SCTP_INP_INFO_RLOCK();
+ SCTP_ITERATOR_LOCK();
+ if (sctp_it_ctl.iterator_flags) {
+ /* We won't be staying here */
+ SCTP_INP_DECR_REF(it->inp);
+ atomic_add_int(&it->stcb->asoc.refcnt, -1);
+ if (sctp_it_ctl.iterator_flags &
+ SCTP_ITERATOR_MUST_EXIT) {
+ goto done_with_iterator;
+ }
+ if (sctp_it_ctl.iterator_flags &
+ SCTP_ITERATOR_STOP_CUR_IT) {
+ sctp_it_ctl.iterator_flags &= ~SCTP_ITERATOR_STOP_CUR_IT;
+ goto done_with_iterator;
+ }
+ if (sctp_it_ctl.iterator_flags &
+ SCTP_ITERATOR_STOP_CUR_INP) {
+ sctp_it_ctl.iterator_flags &= ~SCTP_ITERATOR_STOP_CUR_INP;
+ goto no_stcb;
+ }
+				/* Should not happen: unknown iterator control flag */
+ printf("Unknown it ctl flag %x\n",
+ sctp_it_ctl.iterator_flags);
+ sctp_it_ctl.iterator_flags = 0;
+ }
+ SCTP_INP_RLOCK(it->inp);
+ SCTP_INP_DECR_REF(it->inp);
+ SCTP_TCB_LOCK(it->stcb);
+ atomic_add_int(&it->stcb->asoc.refcnt, -1);
+ iteration_count = 0;
+ }
+ /* run function on this one */
+ (*it->function_assoc) (it->inp, it->stcb, it->pointer, it->val);
+
+ /*
+		 * We lie here; it really needs to have its own type, but
+		 * first we must verify that this won't affect things :-0
+ */
+ if (it->no_chunk_output == 0)
+ sctp_chunk_output(it->inp, it->stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED);
+
+ SCTP_TCB_UNLOCK(it->stcb);
+next_assoc:
+ it->stcb = LIST_NEXT(it->stcb, sctp_tcblist);
+ if (it->stcb == NULL) {
+ /* Run last function */
+ if (it->function_inp_end != NULL) {
+ inp_skip = (*it->function_inp_end) (it->inp,
+ it->pointer,
+ it->val);
+ }
+ }
+ }
+ SCTP_INP_RUNLOCK(it->inp);
+no_stcb:
+ /* done with all assocs on this endpoint, move on to next endpoint */
+ it->done_current_ep = 0;
+ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) {
+ it->inp = NULL;
+ } else {
+ it->inp = LIST_NEXT(it->inp, sctp_list);
+ }
+ if (it->inp == NULL) {
+ goto done_with_iterator;
+ }
+ goto select_a_new_ep;
+}
+
+void
+sctp_iterator_worker(void)
+{
+ struct sctp_iterator *it = NULL;
+
+ /* This function is called with the WQ lock in place */
+
+ sctp_it_ctl.iterator_running = 1;
+ sctp_it_ctl.cur_it = it = TAILQ_FIRST(&sctp_it_ctl.iteratorhead);
+ while (it) {
+ /* now lets work on this one */
+ TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr);
+ SCTP_IPI_ITERATOR_WQ_UNLOCK();
+ CURVNET_SET(it->vn);
+ sctp_iterator_work(it);
+
+ CURVNET_RESTORE();
+ SCTP_IPI_ITERATOR_WQ_LOCK();
+ if (sctp_it_ctl.iterator_flags & SCTP_ITERATOR_MUST_EXIT) {
+ sctp_it_ctl.cur_it = NULL;
+ break;
+ }
+ /* sa_ignore FREED_MEMORY */
+ sctp_it_ctl.cur_it = it = TAILQ_FIRST(&sctp_it_ctl.iteratorhead);
+ }
+ sctp_it_ctl.iterator_running = 0;
+ return;
+}
+
+
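+/*
+ * Drain the global address work queue (filled from rtsock address events)
+ * into an sctp_asconf_iterator work list and kick off an iterator over all
+ * bound-all endpoints; if the allocation fails we simply re-arm the
+ * ADDR_WQ timer and try again later.
+ */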
+static void
+sctp_handle_addr_wq(void)
+{
+ /* deal with the ADDR wq from the rtsock calls */
+ struct sctp_laddr *wi;
+ struct sctp_asconf_iterator *asc;
+
+ SCTP_MALLOC(asc, struct sctp_asconf_iterator *,
+ sizeof(struct sctp_asconf_iterator), SCTP_M_ASC_IT);
+ if (asc == NULL) {
+ /* Try later, no memory */
+ sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ,
+ (struct sctp_inpcb *)NULL,
+ (struct sctp_tcb *)NULL,
+ (struct sctp_nets *)NULL);
+ return;
+ }
+ LIST_INIT(&asc->list_of_work);
+ asc->cnt = 0;
+
+ SCTP_WQ_ADDR_LOCK();
+ wi = LIST_FIRST(&SCTP_BASE_INFO(addr_wq));
+ while (wi != NULL) {
+ LIST_REMOVE(wi, sctp_nxt_addr);
+ LIST_INSERT_HEAD(&asc->list_of_work, wi, sctp_nxt_addr);
+ asc->cnt++;
+ wi = LIST_FIRST(&SCTP_BASE_INFO(addr_wq));
+ }
+ SCTP_WQ_ADDR_UNLOCK();
+
+ if (asc->cnt == 0) {
+ SCTP_FREE(asc, SCTP_M_ASC_IT);
+ } else {
+ (void)sctp_initiate_iterator(sctp_asconf_iterator_ep,
+ sctp_asconf_iterator_stcb,
+ NULL, /* No ep end for boundall */
+ SCTP_PCB_FLAGS_BOUNDALL,
+ SCTP_PCB_ANY_FEATURES,
+ SCTP_ASOC_ANY_STATE,
+ (void *)asc, 0,
+ sctp_asconf_iterator_end, NULL, 0);
+ }
+}
+
+int retcode = 0;
+int cur_oerr = 0;
+
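+/*
+ * Generic callout handler: validate the timer (self pointer, type, still
+ * active), take references on the inp/stcb, then dispatch on tmr->type.
+ * The get_out/out_decr/out_no_decr labels below undo the locking and
+ * reference counts on the various exit paths.
+ */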
+void
+sctp_timeout_handler(void *t)
+{
+ struct sctp_inpcb *inp;
+ struct sctp_tcb *stcb;
+ struct sctp_nets *net;
+ struct sctp_timer *tmr;
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+ int did_output, type;
+
+ tmr = (struct sctp_timer *)t;
+ inp = (struct sctp_inpcb *)tmr->ep;
+ stcb = (struct sctp_tcb *)tmr->tcb;
+ net = (struct sctp_nets *)tmr->net;
+ CURVNET_SET((struct vnet *)tmr->vnet);
+ did_output = 1;
+
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xF0, (uint8_t) tmr->type);
+ sctp_auditing(3, inp, stcb, net);
+#endif
+
+ /* sanity checks... */
+ if (tmr->self != (void *)tmr) {
+ /*
+ * SCTP_PRINTF("Stale SCTP timer fired (%p), ignoring...\n",
+ * tmr);
+ */
+ CURVNET_RESTORE();
+ return;
+ }
+ tmr->stopped_from = 0xa001;
+ if (!SCTP_IS_TIMER_TYPE_VALID(tmr->type)) {
+ /*
+ * SCTP_PRINTF("SCTP timer fired with invalid type: 0x%x\n",
+ * tmr->type);
+ */
+ CURVNET_RESTORE();
+ return;
+ }
+ tmr->stopped_from = 0xa002;
+ if ((tmr->type != SCTP_TIMER_TYPE_ADDR_WQ) && (inp == NULL)) {
+ CURVNET_RESTORE();
+ return;
+ }
+ /* if this is an iterator timeout, get the struct and clear inp */
+ tmr->stopped_from = 0xa003;
+ type = tmr->type;
+ if (inp) {
+ SCTP_INP_INCR_REF(inp);
+ if ((inp->sctp_socket == 0) &&
+ ((tmr->type != SCTP_TIMER_TYPE_INPKILL) &&
+ (tmr->type != SCTP_TIMER_TYPE_INIT) &&
+ (tmr->type != SCTP_TIMER_TYPE_SEND) &&
+ (tmr->type != SCTP_TIMER_TYPE_RECV) &&
+ (tmr->type != SCTP_TIMER_TYPE_HEARTBEAT) &&
+ (tmr->type != SCTP_TIMER_TYPE_SHUTDOWN) &&
+ (tmr->type != SCTP_TIMER_TYPE_SHUTDOWNACK) &&
+ (tmr->type != SCTP_TIMER_TYPE_SHUTDOWNGUARD) &&
+ (tmr->type != SCTP_TIMER_TYPE_ASOCKILL))
+ ) {
+ SCTP_INP_DECR_REF(inp);
+ CURVNET_RESTORE();
+ return;
+ }
+ }
+ tmr->stopped_from = 0xa004;
+ if (stcb) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ if (stcb->asoc.state == 0) {
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ if (inp) {
+ SCTP_INP_DECR_REF(inp);
+ }
+ CURVNET_RESTORE();
+ return;
+ }
+ }
+ tmr->stopped_from = 0xa005;
+ SCTPDBG(SCTP_DEBUG_TIMER1, "Timer type %d goes off\n", tmr->type);
+ if (!SCTP_OS_TIMER_ACTIVE(&tmr->timer)) {
+ if (inp) {
+ SCTP_INP_DECR_REF(inp);
+ }
+ if (stcb) {
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ }
+ CURVNET_RESTORE();
+ return;
+ }
+ tmr->stopped_from = 0xa006;
+
+ if (stcb) {
+ SCTP_TCB_LOCK(stcb);
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ if ((tmr->type != SCTP_TIMER_TYPE_ASOCKILL) &&
+ ((stcb->asoc.state == 0) ||
+ (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED))) {
+ SCTP_TCB_UNLOCK(stcb);
+ if (inp) {
+ SCTP_INP_DECR_REF(inp);
+ }
+ CURVNET_RESTORE();
+ return;
+ }
+ }
+	/* record in stopped_from which timeout occurred */
+ tmr->stopped_from = tmr->type;
+
+ /* mark as being serviced now */
+ if (SCTP_OS_TIMER_PENDING(&tmr->timer)) {
+ /*
+ * Callout has been rescheduled.
+ */
+ goto get_out;
+ }
+ if (!SCTP_OS_TIMER_ACTIVE(&tmr->timer)) {
+ /*
+ * Not active, so no action.
+ */
+ goto get_out;
+ }
+ SCTP_OS_TIMER_DEACTIVATE(&tmr->timer);
+
+ /* call the handler for the appropriate timer type */
+ switch (tmr->type) {
+ case SCTP_TIMER_TYPE_ZERO_COPY:
+ if (inp == NULL) {
+ break;
+ }
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) {
+ SCTP_ZERO_COPY_EVENT(inp, inp->sctp_socket);
+ }
+ break;
+ case SCTP_TIMER_TYPE_ZCOPY_SENDQ:
+ if (inp == NULL) {
+ break;
+ }
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) {
+ SCTP_ZERO_COPY_SENDQ_EVENT(inp, inp->sctp_socket);
+ }
+ break;
+ case SCTP_TIMER_TYPE_ADDR_WQ:
+ sctp_handle_addr_wq();
+ break;
+ case SCTP_TIMER_TYPE_SEND:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ SCTP_STAT_INCR(sctps_timodata);
+ stcb->asoc.timodata++;
+ stcb->asoc.num_send_timers_up--;
+ if (stcb->asoc.num_send_timers_up < 0) {
+ stcb->asoc.num_send_timers_up = 0;
+ }
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ cur_oerr = stcb->asoc.overall_error_count;
+ retcode = sctp_t3rxt_timer(inp, stcb, net);
+ if (retcode) {
+			/* no need to unlock the tcb, it's gone */
+
+ goto out_decr;
+ }
+ SCTP_TCB_LOCK_ASSERT(stcb);
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(4, inp, stcb, net);
+#endif
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED);
+ if ((stcb->asoc.num_send_timers_up == 0) &&
+ (stcb->asoc.sent_queue_cnt > 0)
+ ) {
+ struct sctp_tmit_chunk *chk;
+
+ /*
+			 * Safeguard: if there are chunks on the sent queue
+			 * but no timers running, something is wrong, so we
+			 * start a timer on the first chunk of the sent queue
+			 * on whatever net it was sent to.
+ */
+ chk = TAILQ_FIRST(&stcb->asoc.sent_queue);
+ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb,
+ chk->whoTo);
+ }
+ break;
+ case SCTP_TIMER_TYPE_INIT:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ SCTP_STAT_INCR(sctps_timoinit);
+ stcb->asoc.timoinit++;
+ if (sctp_t1init_timer(inp, stcb, net)) {
+			/* no need to unlock the tcb, it's gone */
+ goto out_decr;
+ }
+ /* We do output but not here */
+ did_output = 0;
+ break;
+ case SCTP_TIMER_TYPE_RECV:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ } {
+ SCTP_STAT_INCR(sctps_timosack);
+ stcb->asoc.timosack++;
+ sctp_send_sack(stcb);
+ }
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(4, inp, stcb, net);
+#endif
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SACK_TMR, SCTP_SO_NOT_LOCKED);
+ break;
+ case SCTP_TIMER_TYPE_SHUTDOWN:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ if (sctp_shutdown_timer(inp, stcb, net)) {
+			/* no need to unlock the tcb, it's gone */
+ goto out_decr;
+ }
+ SCTP_STAT_INCR(sctps_timoshutdown);
+ stcb->asoc.timoshutdown++;
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(4, inp, stcb, net);
+#endif
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SHUT_TMR, SCTP_SO_NOT_LOCKED);
+ break;
+ case SCTP_TIMER_TYPE_HEARTBEAT:
+ {
+ struct sctp_nets *lnet;
+ int cnt_of_unconf = 0;
+
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ SCTP_STAT_INCR(sctps_timoheartbeat);
+ stcb->asoc.timoheartbeat++;
+ TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) {
+ if ((lnet->dest_state & SCTP_ADDR_UNCONFIRMED) &&
+ (lnet->dest_state & SCTP_ADDR_REACHABLE)) {
+ cnt_of_unconf++;
+ }
+ }
+ if (cnt_of_unconf == 0) {
+ if (sctp_heartbeat_timer(inp, stcb, lnet,
+ cnt_of_unconf)) {
+ /* no need to unlock on tcb its gone */
+ goto out_decr;
+ }
+ }
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(4, inp, stcb, lnet);
+#endif
+ sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT,
+ stcb->sctp_ep, stcb, lnet);
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_HB_TMR, SCTP_SO_NOT_LOCKED);
+ }
+ break;
+ case SCTP_TIMER_TYPE_COOKIE:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ if (sctp_cookie_timer(inp, stcb, net)) {
+			/* no need to unlock the tcb, it's gone */
+ goto out_decr;
+ }
+ SCTP_STAT_INCR(sctps_timocookie);
+ stcb->asoc.timocookie++;
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(4, inp, stcb, net);
+#endif
+ /*
+		 * We consider the T3 and Cookie timers pretty much the same
+		 * with respect to the 'from' argument passed to chunk_output.
+ */
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED);
+ break;
+ case SCTP_TIMER_TYPE_NEWCOOKIE:
+ {
+ struct timeval tv;
+ int i, secret;
+
+ if (inp == NULL) {
+ break;
+ }
+ SCTP_STAT_INCR(sctps_timosecret);
+ (void)SCTP_GETTIME_TIMEVAL(&tv);
+ SCTP_INP_WLOCK(inp);
+ inp->sctp_ep.time_of_secret_change = tv.tv_sec;
+ inp->sctp_ep.last_secret_number =
+ inp->sctp_ep.current_secret_number;
+ inp->sctp_ep.current_secret_number++;
+ if (inp->sctp_ep.current_secret_number >=
+ SCTP_HOW_MANY_SECRETS) {
+ inp->sctp_ep.current_secret_number = 0;
+ }
+ secret = (int)inp->sctp_ep.current_secret_number;
+ for (i = 0; i < SCTP_NUMBER_OF_SECRETS; i++) {
+ inp->sctp_ep.secret_key[secret][i] =
+ sctp_select_initial_TSN(&inp->sctp_ep);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ sctp_timer_start(SCTP_TIMER_TYPE_NEWCOOKIE, inp, stcb, net);
+ }
+ did_output = 0;
+ break;
+ case SCTP_TIMER_TYPE_PATHMTURAISE:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ SCTP_STAT_INCR(sctps_timopathmtu);
+ sctp_pathmtu_timer(inp, stcb, net);
+ did_output = 0;
+ break;
+ case SCTP_TIMER_TYPE_SHUTDOWNACK:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ if (sctp_shutdownack_timer(inp, stcb, net)) {
+			/* no need to unlock the tcb, it's gone */
+ goto out_decr;
+ }
+ SCTP_STAT_INCR(sctps_timoshutdownack);
+ stcb->asoc.timoshutdownack++;
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(4, inp, stcb, net);
+#endif
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SHUT_ACK_TMR, SCTP_SO_NOT_LOCKED);
+ break;
+ case SCTP_TIMER_TYPE_SHUTDOWNGUARD:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ SCTP_STAT_INCR(sctps_timoshutdownguard);
+ sctp_abort_an_association(inp, stcb,
+ SCTP_SHUTDOWN_GUARD_EXPIRES, NULL, SCTP_SO_NOT_LOCKED);
+		/* no need to unlock the tcb, it's gone */
+ goto out_decr;
+
+ case SCTP_TIMER_TYPE_STRRESET:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ if (sctp_strreset_timer(inp, stcb, net)) {
+			/* no need to unlock the tcb, it's gone */
+ goto out_decr;
+ }
+ SCTP_STAT_INCR(sctps_timostrmrst);
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_TMR, SCTP_SO_NOT_LOCKED);
+ break;
+ case SCTP_TIMER_TYPE_EARLYFR:
+ /* Need to do FR of things for net */
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ SCTP_STAT_INCR(sctps_timoearlyfr);
+ sctp_early_fr_timer(inp, stcb, net);
+ break;
+ case SCTP_TIMER_TYPE_ASCONF:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ if (sctp_asconf_timer(inp, stcb, net)) {
+			/* no need to unlock the tcb, it's gone */
+ goto out_decr;
+ }
+ SCTP_STAT_INCR(sctps_timoasconf);
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_auditing(4, inp, stcb, net);
+#endif
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_ASCONF_TMR, SCTP_SO_NOT_LOCKED);
+ break;
+ case SCTP_TIMER_TYPE_PRIM_DELETED:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ sctp_delete_prim_timer(inp, stcb, net);
+ SCTP_STAT_INCR(sctps_timodelprim);
+ break;
+
+ case SCTP_TIMER_TYPE_AUTOCLOSE:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ SCTP_STAT_INCR(sctps_timoautoclose);
+ sctp_autoclose_timer(inp, stcb, net);
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_AUTOCLOSE_TMR, SCTP_SO_NOT_LOCKED);
+ did_output = 0;
+ break;
+ case SCTP_TIMER_TYPE_ASOCKILL:
+ if ((stcb == NULL) || (inp == NULL)) {
+ break;
+ }
+ SCTP_STAT_INCR(sctps_timoassockill);
+ /* Can we free it yet? */
+ SCTP_INP_DECR_REF(inp);
+ sctp_timer_stop(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_1);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(inp);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+#endif
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_2);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ /*
+		 * free asoc always unlocks (or destroys) the lock, so prevent
+		 * a duplicate unlock or an unlock of a freed mutex :-0
+ */
+ stcb = NULL;
+ goto out_no_decr;
+ case SCTP_TIMER_TYPE_INPKILL:
+ SCTP_STAT_INCR(sctps_timoinpkill);
+ if (inp == NULL) {
+ break;
+ }
+ /*
+ * special case, take away our increment since WE are the
+ * killer
+ */
+ SCTP_INP_DECR_REF(inp);
+ sctp_timer_stop(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_3);
+ sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
+ SCTP_CALLED_FROM_INPKILL_TIMER);
+ inp = NULL;
+ goto out_no_decr;
+ default:
+ SCTPDBG(SCTP_DEBUG_TIMER1, "sctp_timeout_handler:unknown timer %d\n",
+ tmr->type);
+ break;
+ };
+#ifdef SCTP_AUDITING_ENABLED
+ sctp_audit_log(0xF1, (uint8_t) tmr->type);
+ if (inp)
+ sctp_auditing(5, inp, stcb, net);
+#endif
+ if ((did_output) && stcb) {
+ /*
+ * Now we need to clean up the control chunk chain if an
+ * ECNE is on it. It must be marked as UNSENT again so next
+ * call will continue to send it until such time that we get
+ * a CWR, to remove it. It is, however, less likely that we
+ * will find a ecn echo on the chain though.
+ */
+ sctp_fix_ecn_echo(&stcb->asoc);
+ }
+get_out:
+ if (stcb) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+out_decr:
+ if (inp) {
+ SCTP_INP_DECR_REF(inp);
+ }
+out_no_decr:
+ SCTPDBG(SCTP_DEBUG_TIMER1, "Timer now complete (type %d)\n",
+ type);
+ CURVNET_RESTORE();
+}
+
+void
+sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+ int to_ticks;
+ struct sctp_timer *tmr;
+
+ if ((t_type != SCTP_TIMER_TYPE_ADDR_WQ) && (inp == NULL))
+ return;
+
+ to_ticks = 0;
+
+ tmr = NULL;
+ if (stcb) {
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ }
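+	/*
+	 * Each case picks the sctp_timer to use (tmr) and the timeout in
+	 * ticks (to_ticks); the common code after the switch refuses to
+	 * restart a timer that is already pending and then arms the callout
+	 * with sctp_timeout_handler. Note that the per-destination SEND,
+	 * INIT, SHUTDOWN, COOKIE and SHUTDOWN-ACK timers all share
+	 * net->rxt_timer.
+	 */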
+ switch (t_type) {
+ case SCTP_TIMER_TYPE_ZERO_COPY:
+ tmr = &inp->sctp_ep.zero_copy_timer;
+ to_ticks = SCTP_ZERO_COPY_TICK_DELAY;
+ break;
+ case SCTP_TIMER_TYPE_ZCOPY_SENDQ:
+ tmr = &inp->sctp_ep.zero_copy_sendq_timer;
+ to_ticks = SCTP_ZERO_COPY_SENDQ_TICK_DELAY;
+ break;
+ case SCTP_TIMER_TYPE_ADDR_WQ:
+ /* Only 1 tick away :-) */
+ tmr = &SCTP_BASE_INFO(addr_wq_timer);
+ to_ticks = SCTP_ADDRESS_TICK_DELAY;
+ break;
+ case SCTP_TIMER_TYPE_SEND:
+ /* Here we use the RTO timer */
+ {
+ int rto_val;
+
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ tmr = &net->rxt_timer;
+ if (net->RTO == 0) {
+ rto_val = stcb->asoc.initial_rto;
+ } else {
+ rto_val = net->RTO;
+ }
+ to_ticks = MSEC_TO_TICKS(rto_val);
+ }
+ break;
+ case SCTP_TIMER_TYPE_INIT:
+ /*
+		 * Here we use the INIT timer default, usually about 1
+		 * minute.
+ */
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ tmr = &net->rxt_timer;
+ if (net->RTO == 0) {
+ to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto);
+ } else {
+ to_ticks = MSEC_TO_TICKS(net->RTO);
+ }
+ break;
+ case SCTP_TIMER_TYPE_RECV:
+ /*
+		 * Here we use the Delayed-Ack timer value from the inp,
+		 * usually about 200 ms.
+ */
+ if (stcb == NULL) {
+ return;
+ }
+ tmr = &stcb->asoc.dack_timer;
+ to_ticks = MSEC_TO_TICKS(stcb->asoc.delayed_ack);
+ break;
+ case SCTP_TIMER_TYPE_SHUTDOWN:
+ /* Here we use the RTO of the destination. */
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ if (net->RTO == 0) {
+ to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto);
+ } else {
+ to_ticks = MSEC_TO_TICKS(net->RTO);
+ }
+ tmr = &net->rxt_timer;
+ break;
+ case SCTP_TIMER_TYPE_HEARTBEAT:
+ /*
+		 * The net is used here so that we can add in the RTO, even
+		 * though we use a different timer. We also add the HB timer
+		 * delay PLUS a random jitter.
+ */
+ if ((inp == NULL) || (stcb == NULL)) {
+ return;
+ } else {
+ uint32_t rndval;
+ uint8_t this_random;
+ int cnt_of_unconf = 0;
+ struct sctp_nets *lnet;
+
+ TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) {
+ if ((lnet->dest_state & SCTP_ADDR_UNCONFIRMED) &&
+ (lnet->dest_state & SCTP_ADDR_REACHABLE)) {
+ cnt_of_unconf++;
+ }
+ }
+ if (cnt_of_unconf) {
+ net = lnet = NULL;
+ (void)sctp_heartbeat_timer(inp, stcb, lnet, cnt_of_unconf);
+ }
+ if (stcb->asoc.hb_random_idx > 3) {
+ rndval = sctp_select_initial_TSN(&inp->sctp_ep);
+ memcpy(stcb->asoc.hb_random_values, &rndval,
+ sizeof(stcb->asoc.hb_random_values));
+ stcb->asoc.hb_random_idx = 0;
+ }
+ this_random = stcb->asoc.hb_random_values[stcb->asoc.hb_random_idx];
+ stcb->asoc.hb_random_idx++;
+ stcb->asoc.hb_ect_randombit = 0;
+ /*
+			 * this_random will be 0 - 255 ms; the RTO is in ms.
+ */
+ if ((stcb->asoc.hb_is_disabled) &&
+ (cnt_of_unconf == 0)) {
+ return;
+ }
+ if (net) {
+ int delay;
+
+ delay = stcb->asoc.heart_beat_delay;
+ TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) {
+ if ((lnet->dest_state & SCTP_ADDR_UNCONFIRMED) &&
+ ((lnet->dest_state & SCTP_ADDR_OUT_OF_SCOPE) == 0) &&
+ (lnet->dest_state & SCTP_ADDR_REACHABLE)) {
+ delay = 0;
+ }
+ }
+ if (net->RTO == 0) {
+ /* Never been checked */
+ to_ticks = this_random + stcb->asoc.initial_rto + delay;
+ } else {
+ /* set rto_val to the ms */
+ to_ticks = delay + net->RTO + this_random;
+ }
+ } else {
+ if (cnt_of_unconf) {
+ to_ticks = this_random + stcb->asoc.initial_rto;
+ } else {
+ to_ticks = stcb->asoc.heart_beat_delay + this_random + stcb->asoc.initial_rto;
+ }
+ }
+ /*
+			 * Now convert to_ticks, currently in ms, to ticks.
+ */
+ to_ticks = MSEC_TO_TICKS(to_ticks);
+ tmr = &stcb->asoc.hb_timer;
+ }
+ break;
+ case SCTP_TIMER_TYPE_COOKIE:
+ /*
+		 * Here we can use the RTO timer from the network since one
+		 * RTT was complete. If a retransmission happened then we will
+		 * be using the initial RTO value.
+ */
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ if (net->RTO == 0) {
+ to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto);
+ } else {
+ to_ticks = MSEC_TO_TICKS(net->RTO);
+ }
+ tmr = &net->rxt_timer;
+ break;
+ case SCTP_TIMER_TYPE_NEWCOOKIE:
+ /*
+		 * Nothing needed but the endpoint here; usually about 60
+		 * minutes.
+ */
+ if (inp == NULL) {
+ return;
+ }
+ tmr = &inp->sctp_ep.signature_change;
+ to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_SIGNATURE];
+ break;
+ case SCTP_TIMER_TYPE_ASOCKILL:
+ if (stcb == NULL) {
+ return;
+ }
+ tmr = &stcb->asoc.strreset_timer;
+ to_ticks = MSEC_TO_TICKS(SCTP_ASOC_KILL_TIMEOUT);
+ break;
+ case SCTP_TIMER_TYPE_INPKILL:
+ /*
+		 * The inp is set up to die. We re-use the signature_change
+		 * timer since that has stopped and we are in the GONE
+		 * state.
+ */
+ if (inp == NULL) {
+ return;
+ }
+ tmr = &inp->sctp_ep.signature_change;
+ to_ticks = MSEC_TO_TICKS(SCTP_INP_KILL_TIMEOUT);
+ break;
+ case SCTP_TIMER_TYPE_PATHMTURAISE:
+ /*
+		 * Here we use the value found in the EP for PMTU, usually
+		 * about 10 minutes.
+ */
+ if ((stcb == NULL) || (inp == NULL)) {
+ return;
+ }
+ if (net == NULL) {
+ return;
+ }
+ to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_PMTU];
+ tmr = &net->pmtu_timer;
+ break;
+ case SCTP_TIMER_TYPE_SHUTDOWNACK:
+ /* Here we use the RTO of the destination */
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ if (net->RTO == 0) {
+ to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto);
+ } else {
+ to_ticks = MSEC_TO_TICKS(net->RTO);
+ }
+ tmr = &net->rxt_timer;
+ break;
+ case SCTP_TIMER_TYPE_SHUTDOWNGUARD:
+ /*
+		 * Here we use the endpoint's shutdown guard timer, usually
+		 * about 3 minutes.
+ */
+ if ((inp == NULL) || (stcb == NULL)) {
+ return;
+ }
+ to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN];
+ tmr = &stcb->asoc.shut_guard_timer;
+ break;
+ case SCTP_TIMER_TYPE_STRRESET:
+ /*
+ * Here the timer comes from the stcb but its value is from
+ * the net's RTO.
+ */
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ if (net->RTO == 0) {
+ to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto);
+ } else {
+ to_ticks = MSEC_TO_TICKS(net->RTO);
+ }
+ tmr = &stcb->asoc.strreset_timer;
+ break;
+
+ case SCTP_TIMER_TYPE_EARLYFR:
+ {
+ unsigned int msec;
+
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ if (net->flight_size > net->cwnd) {
+ /* no need to start */
+ return;
+ }
+ SCTP_STAT_INCR(sctps_earlyfrstart);
+ if (net->lastsa == 0) {
+ /* Hmm no rtt estimate yet? */
+ msec = stcb->asoc.initial_rto >> 2;
+ } else {
+ msec = ((net->lastsa >> 2) + net->lastsv) >> 1;
+ }
+ if (msec < SCTP_BASE_SYSCTL(sctp_early_fr_msec)) {
+ msec = SCTP_BASE_SYSCTL(sctp_early_fr_msec);
+ if (msec < SCTP_MINFR_MSEC_FLOOR) {
+ msec = SCTP_MINFR_MSEC_FLOOR;
+ }
+ }
+ to_ticks = MSEC_TO_TICKS(msec);
+ tmr = &net->fr_timer;
+ }
+ break;
+ case SCTP_TIMER_TYPE_ASCONF:
+ /*
+ * Here the timer comes from the stcb but its value is from
+ * the net's RTO.
+ */
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ if (net->RTO == 0) {
+ to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto);
+ } else {
+ to_ticks = MSEC_TO_TICKS(net->RTO);
+ }
+ tmr = &stcb->asoc.asconf_timer;
+ break;
+ case SCTP_TIMER_TYPE_PRIM_DELETED:
+ if ((stcb == NULL) || (net != NULL)) {
+ return;
+ }
+ to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto);
+ tmr = &stcb->asoc.delete_prim_timer;
+ break;
+ case SCTP_TIMER_TYPE_AUTOCLOSE:
+ if (stcb == NULL) {
+ return;
+ }
+ if (stcb->asoc.sctp_autoclose_ticks == 0) {
+ /*
+ * Really an error since stcb is NOT set to
+ * autoclose
+ */
+ return;
+ }
+ to_ticks = stcb->asoc.sctp_autoclose_ticks;
+ tmr = &stcb->asoc.autoclose_timer;
+ break;
+ default:
+ SCTPDBG(SCTP_DEBUG_TIMER1, "%s: Unknown timer type %d\n",
+ __FUNCTION__, t_type);
+ return;
+ break;
+ };
+ if ((to_ticks <= 0) || (tmr == NULL)) {
+ SCTPDBG(SCTP_DEBUG_TIMER1, "%s: %d:software error to_ticks:%d tmr:%p not set ??\n",
+ __FUNCTION__, t_type, to_ticks, tmr);
+ return;
+ }
+ if (SCTP_OS_TIMER_PENDING(&tmr->timer)) {
+ /*
+		 * We do NOT allow the timer to already be running; if it is,
+		 * we leave the current one up unchanged.
+ */
+ return;
+ }
+ /* At this point we can proceed */
+ if (t_type == SCTP_TIMER_TYPE_SEND) {
+ stcb->asoc.num_send_timers_up++;
+ }
+ tmr->stopped_from = 0;
+ tmr->type = t_type;
+ tmr->ep = (void *)inp;
+ tmr->tcb = (void *)stcb;
+ tmr->net = (void *)net;
+ tmr->self = (void *)tmr;
+ tmr->vnet = (void *)curvnet;
+ tmr->ticks = sctp_get_tick_count();
+ (void)SCTP_OS_TIMER_START(&tmr->timer, to_ticks, sctp_timeout_handler, tmr);
+ return;
+}
+
+void
+sctp_timer_stop(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net, uint32_t from)
+{
+ struct sctp_timer *tmr;
+
+ if ((t_type != SCTP_TIMER_TYPE_ADDR_WQ) &&
+ (inp == NULL))
+ return;
+
+ tmr = NULL;
+ if (stcb) {
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ }
+ switch (t_type) {
+ case SCTP_TIMER_TYPE_ZERO_COPY:
+ tmr = &inp->sctp_ep.zero_copy_timer;
+ break;
+ case SCTP_TIMER_TYPE_ZCOPY_SENDQ:
+ tmr = &inp->sctp_ep.zero_copy_sendq_timer;
+ break;
+ case SCTP_TIMER_TYPE_ADDR_WQ:
+ tmr = &SCTP_BASE_INFO(addr_wq_timer);
+ break;
+ case SCTP_TIMER_TYPE_EARLYFR:
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ tmr = &net->fr_timer;
+ SCTP_STAT_INCR(sctps_earlyfrstop);
+ break;
+ case SCTP_TIMER_TYPE_SEND:
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ tmr = &net->rxt_timer;
+ break;
+ case SCTP_TIMER_TYPE_INIT:
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ tmr = &net->rxt_timer;
+ break;
+ case SCTP_TIMER_TYPE_RECV:
+ if (stcb == NULL) {
+ return;
+ }
+ tmr = &stcb->asoc.dack_timer;
+ break;
+ case SCTP_TIMER_TYPE_SHUTDOWN:
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ tmr = &net->rxt_timer;
+ break;
+ case SCTP_TIMER_TYPE_HEARTBEAT:
+ if (stcb == NULL) {
+ return;
+ }
+ tmr = &stcb->asoc.hb_timer;
+ break;
+ case SCTP_TIMER_TYPE_COOKIE:
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ tmr = &net->rxt_timer;
+ break;
+ case SCTP_TIMER_TYPE_NEWCOOKIE:
+ /* nothing needed but the endpoint here */
+ tmr = &inp->sctp_ep.signature_change;
+ /*
+		 * We re-use the newcookie timer for the INP kill timer. We
+		 * must ensure that we do not kill it by accident.
+ */
+ break;
+ case SCTP_TIMER_TYPE_ASOCKILL:
+ /*
+ * Stop the asoc kill timer.
+ */
+ if (stcb == NULL) {
+ return;
+ }
+ tmr = &stcb->asoc.strreset_timer;
+ break;
+
+ case SCTP_TIMER_TYPE_INPKILL:
+ /*
+		 * The inp is set up to die. We re-use the signature_change
+		 * timer since that has stopped and we are in the GONE
+		 * state.
+ */
+ tmr = &inp->sctp_ep.signature_change;
+ break;
+ case SCTP_TIMER_TYPE_PATHMTURAISE:
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ tmr = &net->pmtu_timer;
+ break;
+ case SCTP_TIMER_TYPE_SHUTDOWNACK:
+ if ((stcb == NULL) || (net == NULL)) {
+ return;
+ }
+ tmr = &net->rxt_timer;
+ break;
+ case SCTP_TIMER_TYPE_SHUTDOWNGUARD:
+ if (stcb == NULL) {
+ return;
+ }
+ tmr = &stcb->asoc.shut_guard_timer;
+ break;
+ case SCTP_TIMER_TYPE_STRRESET:
+ if (stcb == NULL) {
+ return;
+ }
+ tmr = &stcb->asoc.strreset_timer;
+ break;
+ case SCTP_TIMER_TYPE_ASCONF:
+ if (stcb == NULL) {
+ return;
+ }
+ tmr = &stcb->asoc.asconf_timer;
+ break;
+ case SCTP_TIMER_TYPE_PRIM_DELETED:
+ if (stcb == NULL) {
+ return;
+ }
+ tmr = &stcb->asoc.delete_prim_timer;
+ break;
+ case SCTP_TIMER_TYPE_AUTOCLOSE:
+ if (stcb == NULL) {
+ return;
+ }
+ tmr = &stcb->asoc.autoclose_timer;
+ break;
+ default:
+ SCTPDBG(SCTP_DEBUG_TIMER1, "%s: Unknown timer type %d\n",
+ __FUNCTION__, t_type);
+ break;
+ };
+ if (tmr == NULL) {
+ return;
+ }
+ if ((tmr->type != t_type) && tmr->type) {
+ /*
+		 * Ok, we have a timer that is under joint use, perhaps the
+		 * Cookie timer sharing with the SEND timer. We therefore are
+		 * NOT running the timer that the caller wants stopped, so
+		 * just return.
+ */
+ return;
+ }
+ if ((t_type == SCTP_TIMER_TYPE_SEND) && (stcb != NULL)) {
+ stcb->asoc.num_send_timers_up--;
+ if (stcb->asoc.num_send_timers_up < 0) {
+ stcb->asoc.num_send_timers_up = 0;
+ }
+ }
+ tmr->self = NULL;
+ tmr->stopped_from = from;
+ (void)SCTP_OS_TIMER_STOP(&tmr->timer);
+ return;
+}
+
+uint32_t
+sctp_calculate_len(struct mbuf *m)
+{
+ uint32_t tlen = 0;
+ struct mbuf *at;
+
+ at = m;
+ while (at) {
+ tlen += SCTP_BUF_LEN(at);
+ at = SCTP_BUF_NEXT(at);
+ }
+ return (tlen);
+}
+
+void
+sctp_mtu_size_reset(struct sctp_inpcb *inp,
+ struct sctp_association *asoc, uint32_t mtu)
+{
+ /*
+	 * Reset the P-MTU size on this association. This involves changing
+	 * the asoc MTU and going through ANY chunk whose size plus overhead
+	 * is larger than mtu, to allow the DF flag to be cleared.
+ */
+ struct sctp_tmit_chunk *chk;
+ unsigned int eff_mtu, ovh;
+
+ asoc->smallest_mtu = mtu;
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ ovh = SCTP_MIN_OVERHEAD;
+ } else {
+ ovh = SCTP_MIN_V4_OVERHEAD;
+ }
+ eff_mtu = mtu - ovh;
+ TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) {
+ if (chk->send_size > eff_mtu) {
+ chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
+ }
+ }
+ TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
+ if (chk->send_size > eff_mtu) {
+ chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
+ }
+ }
+}
+
+
+/*
+ * Given an association and the starting time of the current RTT period,
+ * return the RTO in msecs. net should point to the current network.
+ */
+uint32_t
+sctp_calculate_rto(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ struct sctp_nets *net,
+ struct timeval *told,
+ int safe)
+{
+ /*-
+ * given an association and the starting time of the current RTT
+ * period (in value1/value2) return RTO in number of msecs.
+ */
+ int calc_time = 0;
+ int o_calctime;
+ uint32_t new_rto = 0;
+ int first_measure = 0;
+ struct timeval now, then, *old;
+
+ /* Copy it out for sparc64 */
+ if (safe == sctp_align_unsafe_makecopy) {
+ old = &then;
+ memcpy(&then, told, sizeof(struct timeval));
+ } else if (safe == sctp_align_safe_nocopy) {
+ old = told;
+ } else {
+ /* error */
+ SCTP_PRINTF("Huh, bad rto calc call\n");
+ return (0);
+ }
+ /************************/
+ /* 1. calculate new RTT */
+ /************************/
+ /* get the current time */
+ (void)SCTP_GETTIME_TIMEVAL(&now);
+ /* compute the RTT value */
+ if ((u_long)now.tv_sec > (u_long)old->tv_sec) {
+ calc_time = ((u_long)now.tv_sec - (u_long)old->tv_sec) * 1000;
+ if ((u_long)now.tv_usec > (u_long)old->tv_usec) {
+ calc_time += (((u_long)now.tv_usec -
+ (u_long)old->tv_usec) / 1000);
+ } else if ((u_long)now.tv_usec < (u_long)old->tv_usec) {
+ /* Borrow 1,000ms from current calculation */
+ calc_time -= 1000;
+ /* Add in the slop over */
+ calc_time += ((int)now.tv_usec / 1000);
+ /* Add in the pre-second ms's */
+ calc_time += (((int)1000000 - (int)old->tv_usec) / 1000);
+ }
+ } else if ((u_long)now.tv_sec == (u_long)old->tv_sec) {
+ if ((u_long)now.tv_usec > (u_long)old->tv_usec) {
+ calc_time = ((u_long)now.tv_usec -
+ (u_long)old->tv_usec) / 1000;
+ } else if ((u_long)now.tv_usec < (u_long)old->tv_usec) {
+ /* impossible .. garbage in nothing out */
+ goto calc_rto;
+ } else if ((u_long)now.tv_usec == (u_long)old->tv_usec) {
+ /*
+ * We have to have 1 usec :-D this must be the
+ * loopback.
+ */
+ calc_time = 1;
+ } else {
+ /* impossible .. garbage in nothing out */
+ goto calc_rto;
+ }
+ } else {
+ /* Clock wrapped? */
+ goto calc_rto;
+ }
+ /***************************/
+ /* 2. update RTTVAR & SRTT */
+ /***************************/
+ net->rtt = o_calctime = calc_time;
+ /* this is Van Jacobson's integer version */
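+	/*
+	 * lastsa holds SRTT scaled by 8 (shift 3) and lastsv holds RTTVAR
+	 * scaled by 4 (shift 2), so the value computed at calc_rto below,
+	 * (lastsa >> 3) + lastsv, is effectively SRTT + 4 * RTTVAR.
+	 */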
+ if (net->RTO_measured) {
+ calc_time -= (net->lastsa >> SCTP_RTT_SHIFT); /* take away 1/8th when
+ * shift=3 */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RTTVAR_LOGGING_ENABLE) {
+ rto_logging(net, SCTP_LOG_RTTVAR);
+ }
+ net->prev_rtt = o_calctime;
+ net->lastsa += calc_time; /* add 7/8th into sa when
+ * shift=3 */
+ if (calc_time < 0) {
+ calc_time = -calc_time;
+ }
+ calc_time -= (net->lastsv >> SCTP_RTT_VAR_SHIFT); /* take away 1/4 when
+ * VAR shift=2 */
+ net->lastsv += calc_time;
+ if (net->lastsv == 0) {
+ net->lastsv = SCTP_CLOCK_GRANULARITY;
+ }
+ } else {
+		/* First RTO measurement */
+ net->RTO_measured = 1;
+ net->lastsa = calc_time << SCTP_RTT_SHIFT; /* Multiply by 8 when
+ * shift=3 */
+ net->lastsv = calc_time;
+ if (net->lastsv == 0) {
+ net->lastsv = SCTP_CLOCK_GRANULARITY;
+ }
+ first_measure = 1;
+ net->prev_rtt = o_calctime;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RTTVAR_LOGGING_ENABLE) {
+ rto_logging(net, SCTP_LOG_INITIAL_RTT);
+ }
+ }
+calc_rto:
+ new_rto = (net->lastsa >> SCTP_RTT_SHIFT) + net->lastsv;
+ if ((new_rto > SCTP_SAT_NETWORK_MIN) &&
+ (stcb->asoc.sat_network_lockout == 0)) {
+ stcb->asoc.sat_network = 1;
+ } else if ((!first_measure) && stcb->asoc.sat_network) {
+ stcb->asoc.sat_network = 0;
+ stcb->asoc.sat_network_lockout = 1;
+ }
+ /* bound it, per C6/C7 in Section 5.3.1 */
+ if (new_rto < stcb->asoc.minrto) {
+ new_rto = stcb->asoc.minrto;
+ }
+ if (new_rto > stcb->asoc.maxrto) {
+ new_rto = stcb->asoc.maxrto;
+ }
+ /* we are now returning the RTO */
+ return (new_rto);
+}
+
+/*
+ * Return a pointer to a contiguous piece of data from the given mbuf chain
+ * starting at 'off' for 'len' bytes. If the desired piece spans more than
+ * one mbuf, a copy is made at 'ptr'. The caller must ensure that the buffer
+ * size is >= 'len'. Returns NULL if there aren't 'len' bytes in the chain.
+ */
+caddr_t
+sctp_m_getptr(struct mbuf *m, int off, int len, uint8_t * in_ptr)
+{
+ uint32_t count;
+ uint8_t *ptr;
+
+ ptr = in_ptr;
+ if ((off < 0) || (len <= 0))
+ return (NULL);
+
+ /* find the desired start location */
+ while ((m != NULL) && (off > 0)) {
+ if (off < SCTP_BUF_LEN(m))
+ break;
+ off -= SCTP_BUF_LEN(m);
+ m = SCTP_BUF_NEXT(m);
+ }
+ if (m == NULL)
+ return (NULL);
+
+ /* is the current mbuf large enough (eg. contiguous)? */
+ if ((SCTP_BUF_LEN(m) - off) >= len) {
+ return (mtod(m, caddr_t)+off);
+ } else {
+ /* else, it spans more than one mbuf, so save a temp copy... */
+ while ((m != NULL) && (len > 0)) {
+ count = min(SCTP_BUF_LEN(m) - off, len);
+ bcopy(mtod(m, caddr_t)+off, ptr, count);
+ len -= count;
+ ptr += count;
+ off = 0;
+ m = SCTP_BUF_NEXT(m);
+ }
+ if ((m == NULL) && (len > 0))
+ return (NULL);
+ else
+ return ((caddr_t)in_ptr);
+ }
+}
+
+
+
+struct sctp_paramhdr *
+sctp_get_next_param(struct mbuf *m,
+ int offset,
+ struct sctp_paramhdr *pull,
+ int pull_limit)
+{
+ /* This just provides a typed signature to Peter's Pull routine */
+ return ((struct sctp_paramhdr *)sctp_m_getptr(m, offset, pull_limit,
+ (uint8_t *) pull));
+}
+
+
+int
+sctp_add_pad_tombuf(struct mbuf *m, int padlen)
+{
+ /*
+	 * Add padlen bytes of zero-filled padding to the end of the mbuf.
+	 * If padlen is > 3 this routine will fail.
+ */
+ uint8_t *dp;
+ int i;
+
+ if (padlen > 3) {
+ SCTP_LTRACE_ERR_RET_PKT(m, NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS);
+ return (ENOBUFS);
+ }
+ if (padlen <= M_TRAILINGSPACE(m)) {
+ /*
+ * The easy way. We hope the majority of the time we hit
+ * here :)
+ */
+ dp = (uint8_t *) (mtod(m, caddr_t)+SCTP_BUF_LEN(m));
+ SCTP_BUF_LEN(m) += padlen;
+ } else {
+ /* Hard way we must grow the mbuf */
+ struct mbuf *tmp;
+
+ tmp = sctp_get_mbuf_for_msg(padlen, 0, M_DONTWAIT, 1, MT_DATA);
+ if (tmp == NULL) {
+ /* Out of space GAK! we are in big trouble. */
+ SCTP_LTRACE_ERR_RET_PKT(m, NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ return (ENOSPC);
+ }
+ /* setup and insert in middle */
+ SCTP_BUF_LEN(tmp) = padlen;
+ SCTP_BUF_NEXT(tmp) = NULL;
+ SCTP_BUF_NEXT(m) = tmp;
+ dp = mtod(tmp, uint8_t *);
+ }
+ /* zero out the pad */
+ for (i = 0; i < padlen; i++) {
+ *dp = 0;
+ dp++;
+ }
+ return (0);
+}
+
+int
+sctp_pad_lastmbuf(struct mbuf *m, int padval, struct mbuf *last_mbuf)
+{
+ /* find the last mbuf in chain and pad it */
+ struct mbuf *m_at;
+
+ m_at = m;
+ if (last_mbuf) {
+ return (sctp_add_pad_tombuf(last_mbuf, padval));
+ } else {
+ while (m_at) {
+ if (SCTP_BUF_NEXT(m_at) == NULL) {
+ return (sctp_add_pad_tombuf(m_at, padval));
+ }
+ m_at = SCTP_BUF_NEXT(m_at);
+ }
+ }
+ SCTP_LTRACE_ERR_RET_PKT(m, NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EFAULT);
+ return (EFAULT);
+}
+
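+/*
+ * Most of the sctp_notify_*() routines below follow the same pattern:
+ * build the event structure in an mbuf, wrap it in a sctp_queued_to_read
+ * control marked M_NOTIFICATION and append it to the socket receive queue
+ * via sctp_add_to_readq() so the application reads it as a notification.
+ */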
+static void
+sctp_notify_assoc_change(uint32_t event, struct sctp_tcb *stcb,
+ uint32_t error, void *data, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ struct mbuf *m_notify;
+ struct sctp_assoc_change *sac;
+ struct sctp_queued_to_read *control;
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+
+ /*
+ * For TCP model AND UDP connected sockets we will send an error up
+ * when an ABORT comes in.
+ */
+ if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) &&
+ ((event == SCTP_COMM_LOST) || (event == SCTP_CANT_STR_ASSOC))) {
+ if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_WAIT) {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNREFUSED);
+ stcb->sctp_socket->so_error = ECONNREFUSED;
+ } else {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNRESET);
+ stcb->sctp_socket->so_error = ECONNRESET;
+ }
+ /* Wake ANY sleepers */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ if (!so_locked) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return;
+ }
+ }
+#endif
+ socantrcvmore(stcb->sctp_socket);
+ sorwakeup(stcb->sctp_socket);
+ sowwakeup(stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ if (!so_locked) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ }
+#endif
+ }
+ if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVASSOCEVNT)) {
+ /* event not enabled */
+ return;
+ }
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_assoc_change), 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_notify == NULL)
+ /* no space left */
+ return;
+ SCTP_BUF_LEN(m_notify) = 0;
+
+ sac = mtod(m_notify, struct sctp_assoc_change *);
+ sac->sac_type = SCTP_ASSOC_CHANGE;
+ sac->sac_flags = 0;
+ sac->sac_length = sizeof(struct sctp_assoc_change);
+ sac->sac_state = event;
+ sac->sac_error = error;
+ /* XXX verify these stream counts */
+ sac->sac_outbound_streams = stcb->asoc.streamoutcnt;
+ sac->sac_inbound_streams = stcb->asoc.streamincnt;
+ sac->sac_assoc_id = sctp_get_associd(stcb);
+ SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_assoc_change);
+ SCTP_BUF_NEXT(m_notify) = NULL;
+ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
+ 0, 0, 0, 0, 0, 0,
+ m_notify);
+ if (control == NULL) {
+ /* no memory */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ control->length = SCTP_BUF_LEN(m_notify);
+ /* not that we need this */
+ control->tail_mbuf = m_notify;
+ control->spec_flags = M_NOTIFICATION;
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD,
+ so_locked);
+ if (event == SCTP_COMM_LOST) {
+ /* Wake up any sleeper */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ if (!so_locked) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return;
+ }
+ }
+#endif
+ sctp_sowwakeup(stcb->sctp_ep, stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ if (!so_locked) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ }
+#endif
+ }
+}
+
+static void
+sctp_notify_peer_addr_change(struct sctp_tcb *stcb, uint32_t state,
+ struct sockaddr *sa, uint32_t error)
+{
+ struct mbuf *m_notify;
+ struct sctp_paddr_change *spc;
+ struct sctp_queued_to_read *control;
+
+ if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVPADDREVNT)) {
+ /* event not enabled */
+ return;
+ }
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_paddr_change), 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_notify == NULL)
+ return;
+ SCTP_BUF_LEN(m_notify) = 0;
+ spc = mtod(m_notify, struct sctp_paddr_change *);
+ spc->spc_type = SCTP_PEER_ADDR_CHANGE;
+ spc->spc_flags = 0;
+ spc->spc_length = sizeof(struct sctp_paddr_change);
+ switch (sa->sa_family) {
+ case AF_INET:
+ memcpy(&spc->spc_aaddr, sa, sizeof(struct sockaddr_in));
+ break;
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sockaddr_in6 *sin6;
+
+ memcpy(&spc->spc_aaddr, sa, sizeof(struct sockaddr_in6));
+
+ sin6 = (struct sockaddr_in6 *)&spc->spc_aaddr;
+ if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) {
+ if (sin6->sin6_scope_id == 0) {
+ /* recover scope_id for user */
+ (void)sa6_recoverscope(sin6);
+ } else {
+ /* clear embedded scope_id for user */
+ in6_clearscope(&sin6->sin6_addr);
+ }
+ }
+ break;
+ }
+#endif
+ default:
+ /* TSNH */
+ break;
+ }
+ spc->spc_state = state;
+ spc->spc_error = error;
+ spc->spc_assoc_id = sctp_get_associd(stcb);
+
+ SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_paddr_change);
+ SCTP_BUF_NEXT(m_notify) = NULL;
+
+ /* append to socket */
+ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
+ 0, 0, 0, 0, 0, 0,
+ m_notify);
+ if (control == NULL) {
+ /* no memory */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ control->length = SCTP_BUF_LEN(m_notify);
+ control->spec_flags = M_NOTIFICATION;
+ /* not that we need this */
+ control->tail_mbuf = m_notify;
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, 1,
+ SCTP_READ_LOCK_NOT_HELD,
+ SCTP_SO_NOT_LOCKED);
+}
+
+
+static void
+sctp_notify_send_failed(struct sctp_tcb *stcb, uint32_t error,
+ struct sctp_tmit_chunk *chk, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ struct mbuf *m_notify;
+ struct sctp_send_failed *ssf;
+ struct sctp_queued_to_read *control;
+ int length;
+
+ if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) {
+ /* event not enabled */
+ return;
+ }
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_send_failed), 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_notify == NULL)
+ /* no space left */
+ return;
+ length = sizeof(struct sctp_send_failed) + chk->send_size;
+ length -= sizeof(struct sctp_data_chunk);
+ SCTP_BUF_LEN(m_notify) = 0;
+ ssf = mtod(m_notify, struct sctp_send_failed *);
+ ssf->ssf_type = SCTP_SEND_FAILED;
+ if (error == SCTP_NOTIFY_DATAGRAM_UNSENT)
+ ssf->ssf_flags = SCTP_DATA_UNSENT;
+ else
+ ssf->ssf_flags = SCTP_DATA_SENT;
+ ssf->ssf_length = length;
+ ssf->ssf_error = error;
+ /* not exactly what the user sent in, but should be close :) */
+ bzero(&ssf->ssf_info, sizeof(ssf->ssf_info));
+ ssf->ssf_info.sinfo_stream = chk->rec.data.stream_number;
+ ssf->ssf_info.sinfo_ssn = chk->rec.data.stream_seq;
+ ssf->ssf_info.sinfo_flags = chk->rec.data.rcv_flags;
+ ssf->ssf_info.sinfo_ppid = chk->rec.data.payloadtype;
+ ssf->ssf_info.sinfo_context = chk->rec.data.context;
+ ssf->ssf_info.sinfo_assoc_id = sctp_get_associd(stcb);
+ ssf->ssf_assoc_id = sctp_get_associd(stcb);
+
+ if (chk->data) {
+ /*
+		 * trim off the sctp chunk header (it should be there)
+ */
+ if (chk->send_size >= sizeof(struct sctp_data_chunk)) {
+ m_adj(chk->data, sizeof(struct sctp_data_chunk));
+ sctp_mbuf_crush(chk->data);
+ chk->send_size -= sizeof(struct sctp_data_chunk);
+ }
+ }
+ SCTP_BUF_NEXT(m_notify) = chk->data;
+ SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_send_failed);
+ /* Steal off the mbuf */
+ chk->data = NULL;
+ /*
+	 * For this case we check the actual socket buffer, since the assoc
+	 * is going away and we don't want to overfill the socket buffer for
+	 * a non-reader.
+ */
+ if (sctp_sbspace_failedmsgs(&stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) {
+ sctp_m_freem(m_notify);
+ return;
+ }
+ /* append to socket */
+ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
+ 0, 0, 0, 0, 0, 0,
+ m_notify);
+ if (control == NULL) {
+ /* no memory */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ control->spec_flags = M_NOTIFICATION;
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, 1,
+ SCTP_READ_LOCK_NOT_HELD,
+ so_locked);
+}
+
+
+static void
+sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error,
+ struct sctp_stream_queue_pending *sp, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ struct mbuf *m_notify;
+ struct sctp_send_failed *ssf;
+ struct sctp_queued_to_read *control;
+ int length;
+
+ if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) {
+ /* event not enabled */
+ return;
+ }
+ length = sizeof(struct sctp_send_failed) + sp->length;
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_send_failed), 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_notify == NULL)
+ /* no space left */
+ return;
+ SCTP_BUF_LEN(m_notify) = 0;
+ ssf = mtod(m_notify, struct sctp_send_failed *);
+ ssf->ssf_type = SCTP_SEND_FAILED;
+ if (error == SCTP_NOTIFY_DATAGRAM_UNSENT)
+ ssf->ssf_flags = SCTP_DATA_UNSENT;
+ else
+ ssf->ssf_flags = SCTP_DATA_SENT;
+ ssf->ssf_length = length;
+ ssf->ssf_error = error;
+ /* not exactly what the user sent in, but should be close :) */
+ bzero(&ssf->ssf_info, sizeof(ssf->ssf_info));
+ ssf->ssf_info.sinfo_stream = sp->stream;
+ ssf->ssf_info.sinfo_ssn = sp->strseq;
+ if (sp->some_taken) {
+ ssf->ssf_info.sinfo_flags = SCTP_DATA_LAST_FRAG;
+ } else {
+ ssf->ssf_info.sinfo_flags = SCTP_DATA_NOT_FRAG;
+ }
+ ssf->ssf_info.sinfo_ppid = sp->ppid;
+ ssf->ssf_info.sinfo_context = sp->context;
+ ssf->ssf_info.sinfo_assoc_id = sctp_get_associd(stcb);
+ ssf->ssf_assoc_id = sctp_get_associd(stcb);
+ SCTP_BUF_NEXT(m_notify) = sp->data;
+ SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_send_failed);
+
+ /* Steal off the mbuf */
+ sp->data = NULL;
+ /*
+	 * For this case we check the actual socket buffer, since the assoc
+	 * is going away and we don't want to overfill the socket buffer for
+	 * a non-reader.
+ */
+ if (sctp_sbspace_failedmsgs(&stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) {
+ sctp_m_freem(m_notify);
+ return;
+ }
+ /* append to socket */
+ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
+ 0, 0, 0, 0, 0, 0,
+ m_notify);
+ if (control == NULL) {
+ /* no memory */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ control->spec_flags = M_NOTIFICATION;
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked);
+}
+
+
+
+static void
+sctp_notify_adaptation_layer(struct sctp_tcb *stcb,
+ uint32_t error)
+{
+ struct mbuf *m_notify;
+ struct sctp_adaptation_event *sai;
+ struct sctp_queued_to_read *control;
+
+ if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_ADAPTATIONEVNT)) {
+ /* event not enabled */
+ return;
+ }
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_adaption_event), 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_notify == NULL)
+ /* no space left */
+ return;
+ SCTP_BUF_LEN(m_notify) = 0;
+ sai = mtod(m_notify, struct sctp_adaptation_event *);
+ sai->sai_type = SCTP_ADAPTATION_INDICATION;
+ sai->sai_flags = 0;
+ sai->sai_length = sizeof(struct sctp_adaptation_event);
+ sai->sai_adaptation_ind = stcb->asoc.peers_adaptation;
+ sai->sai_assoc_id = sctp_get_associd(stcb);
+
+ SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_adaptation_event);
+ SCTP_BUF_NEXT(m_notify) = NULL;
+
+ /* append to socket */
+ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
+ 0, 0, 0, 0, 0, 0,
+ m_notify);
+ if (control == NULL) {
+ /* no memory */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ control->length = SCTP_BUF_LEN(m_notify);
+ control->spec_flags = M_NOTIFICATION;
+ /* not that we need this */
+ control->tail_mbuf = m_notify;
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
+}
+
+/* This always must be called with the read-queue LOCKED in the INP */
+static void
+sctp_notify_partial_delivery_indication(struct sctp_tcb *stcb, uint32_t error,
+ uint32_t val, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ struct mbuf *m_notify;
+ struct sctp_pdapi_event *pdapi;
+ struct sctp_queued_to_read *control;
+ struct sockbuf *sb;
+
+ if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_PDAPIEVNT)) {
+ /* event not enabled */
+ return;
+ }
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ) {
+ return;
+ }
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_pdapi_event), 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_notify == NULL)
+ /* no space left */
+ return;
+ SCTP_BUF_LEN(m_notify) = 0;
+ pdapi = mtod(m_notify, struct sctp_pdapi_event *);
+ pdapi->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT;
+ pdapi->pdapi_flags = 0;
+ pdapi->pdapi_length = sizeof(struct sctp_pdapi_event);
+ pdapi->pdapi_indication = error;
+ pdapi->pdapi_stream = (val >> 16);
+ pdapi->pdapi_seq = (val & 0x0000ffff);
+ pdapi->pdapi_assoc_id = sctp_get_associd(stcb);
+
+ SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_pdapi_event);
+ SCTP_BUF_NEXT(m_notify) = NULL;
+ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
+ 0, 0, 0, 0, 0, 0,
+ m_notify);
+ if (control == NULL) {
+ /* no memory */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ control->spec_flags = M_NOTIFICATION;
+ control->length = SCTP_BUF_LEN(m_notify);
+ /* not that we need this */
+ control->tail_mbuf = m_notify;
+ control->held_length = 0;
+ control->length = 0;
+ sb = &stcb->sctp_socket->so_rcv;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(m_notify));
+ }
+ sctp_sballoc(stcb, sb, m_notify);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
+ }
+ atomic_add_int(&control->length, SCTP_BUF_LEN(m_notify));
+ control->end_added = 1;
+ if (stcb->asoc.control_pdapi)
+ TAILQ_INSERT_AFTER(&stcb->sctp_ep->read_queue, stcb->asoc.control_pdapi, control, next);
+ else {
+ /* we really should not see this case */
+ TAILQ_INSERT_TAIL(&stcb->sctp_ep->read_queue, control, next);
+ }
+ if (stcb->sctp_ep && stcb->sctp_socket) {
+ /* This should always be the case */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ if (!so_locked) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return;
+ }
+ }
+#endif
+ sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ if (!so_locked) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ }
+#endif
+ }
+}
+
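+/*
+ * For TCP-model sockets mark the socket as unable to send any more data,
+ * then queue an SCTP_SHUTDOWN_EVENT notification if the event is enabled.
+ */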
+static void
+sctp_notify_shutdown_event(struct sctp_tcb *stcb)
+{
+ struct mbuf *m_notify;
+ struct sctp_shutdown_event *sse;
+ struct sctp_queued_to_read *control;
+
+ /*
+ * For TCP model AND UDP connected sockets we will send an error up
+	 * when a SHUTDOWN completes
+ */
+ if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ /* mark socket closed for read/write and wakeup! */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return;
+ }
+#endif
+ socantsendmore(stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ }
+ if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT)) {
+ /* event not enabled */
+ return;
+ }
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_event), 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_notify == NULL)
+ /* no space left */
+ return;
+ sse = mtod(m_notify, struct sctp_shutdown_event *);
+ sse->sse_type = SCTP_SHUTDOWN_EVENT;
+ sse->sse_flags = 0;
+ sse->sse_length = sizeof(struct sctp_shutdown_event);
+ sse->sse_assoc_id = sctp_get_associd(stcb);
+
+ SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_shutdown_event);
+ SCTP_BUF_NEXT(m_notify) = NULL;
+
+ /* append to socket */
+ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
+ 0, 0, 0, 0, 0, 0,
+ m_notify);
+ if (control == NULL) {
+ /* no memory */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ control->spec_flags = M_NOTIFICATION;
+ control->length = SCTP_BUF_LEN(m_notify);
+ /* not that we need this */
+ control->tail_mbuf = m_notify;
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
+}
+
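+/*
+ * Queue an SCTP_SENDER_DRY_EVENT notification (no user data left queued
+ * for transmission or retransmission), if the application enabled it.
+ */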
+static void
+sctp_notify_sender_dry_event(struct sctp_tcb *stcb,
+ int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ struct mbuf *m_notify;
+ struct sctp_sender_dry_event *event;
+ struct sctp_queued_to_read *control;
+
+ if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_DRYEVNT)) {
+ /* event not enabled */
+ return;
+ }
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_sender_dry_event), 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_notify == NULL) {
+ /* no space left */
+ return;
+ }
+ SCTP_BUF_LEN(m_notify) = 0;
+ event = mtod(m_notify, struct sctp_sender_dry_event *);
+ event->sender_dry_type = SCTP_SENDER_DRY_EVENT;
+ event->sender_dry_flags = 0;
+ event->sender_dry_length = sizeof(struct sctp_sender_dry_event);
+ event->sender_dry_assoc_id = sctp_get_associd(stcb);
+
+ SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_sender_dry_event);
+ SCTP_BUF_NEXT(m_notify) = NULL;
+
+ /* append to socket */
+ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
+ 0, 0, 0, 0, 0, 0, m_notify);
+ if (control == NULL) {
+ /* no memory */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ control->length = SCTP_BUF_LEN(m_notify);
+ control->spec_flags = M_NOTIFICATION;
+ /* not that we need this */
+ control->tail_mbuf = m_notify;
+ sctp_add_to_readq(stcb->sctp_ep, stcb, control,
+ &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked);
+}
+
+
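+/*
+ * Queue an SCTP_STREAM_RESET_EVENT notification reporting how many
+ * streams were added, with the direction encoded in the flag.
+ */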
+static void
+sctp_notify_stream_reset_add(struct sctp_tcb *stcb, int number_entries, int flag)
+{
+ struct mbuf *m_notify;
+ struct sctp_queued_to_read *control;
+ struct sctp_stream_reset_event *strreset;
+ int len;
+
+ if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_STREAM_RESETEVNT)) {
+ /* event not enabled */
+ return;
+ }
+ m_notify = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_notify == NULL)
+ /* no space left */
+ return;
+ SCTP_BUF_LEN(m_notify) = 0;
+ len = sizeof(struct sctp_stream_reset_event) + (number_entries * sizeof(uint16_t));
+ if (len > M_TRAILINGSPACE(m_notify)) {
+ /* never enough room */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ strreset = mtod(m_notify, struct sctp_stream_reset_event *);
+ strreset->strreset_type = SCTP_STREAM_RESET_EVENT;
+ strreset->strreset_flags = SCTP_STRRESET_ADD_STREAM | flag;
+ strreset->strreset_length = len;
+ strreset->strreset_assoc_id = sctp_get_associd(stcb);
+ strreset->strreset_list[0] = number_entries;
+
+ SCTP_BUF_LEN(m_notify) = len;
+ SCTP_BUF_NEXT(m_notify) = NULL;
+ if (sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) {
+ /* no space */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ /* append to socket */
+ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
+ 0, 0, 0, 0, 0, 0,
+ m_notify);
+ if (control == NULL) {
+ /* no memory */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ control->spec_flags = M_NOTIFICATION;
+ control->length = SCTP_BUF_LEN(m_notify);
+ /* not that we need this */
+ control->tail_mbuf = m_notify;
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
+}
+
+
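+/*
+ * Queue an SCTP_STREAM_RESET_EVENT notification listing the streams that
+ * were reset, or flagging all streams when no list is supplied.
+ */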
+static void
+sctp_notify_stream_reset(struct sctp_tcb *stcb,
+ int number_entries, uint16_t * list, int flag)
+{
+ struct mbuf *m_notify;
+ struct sctp_queued_to_read *control;
+ struct sctp_stream_reset_event *strreset;
+ int len;
+
+ if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_STREAM_RESETEVNT)) {
+ /* event not enabled */
+ return;
+ }
+ m_notify = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ if (m_notify == NULL)
+ /* no space left */
+ return;
+ SCTP_BUF_LEN(m_notify) = 0;
+ len = sizeof(struct sctp_stream_reset_event) + (number_entries * sizeof(uint16_t));
+ if (len > M_TRAILINGSPACE(m_notify)) {
+ /* never enough room */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ strreset = mtod(m_notify, struct sctp_stream_reset_event *);
+ strreset->strreset_type = SCTP_STREAM_RESET_EVENT;
+ if (number_entries == 0) {
+ strreset->strreset_flags = flag | SCTP_STRRESET_ALL_STREAMS;
+ } else {
+ strreset->strreset_flags = flag | SCTP_STRRESET_STREAM_LIST;
+ }
+ strreset->strreset_length = len;
+ strreset->strreset_assoc_id = sctp_get_associd(stcb);
+ if (number_entries) {
+ int i;
+
+ for (i = 0; i < number_entries; i++) {
+ strreset->strreset_list[i] = ntohs(list[i]);
+ }
+ }
+ SCTP_BUF_LEN(m_notify) = len;
+ SCTP_BUF_NEXT(m_notify) = NULL;
+ if (sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) {
+ /* no space */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ /* append to socket */
+ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
+ 0, 0, 0, 0, 0, 0,
+ m_notify);
+ if (control == NULL) {
+ /* no memory */
+ sctp_m_freem(m_notify);
+ return;
+ }
+ control->spec_flags = M_NOTIFICATION;
+ control->length = SCTP_BUF_LEN(m_notify);
+ /* not that we need this */
+ control->tail_mbuf = m_notify;
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
+}
+
+
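+/*
+ * Central notification dispatcher: map an internal notification code to
+ * the corresponding socket-level event and hand it to the right helper.
+ */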
+void
+sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb,
+ uint32_t error, void *data, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ if ((stcb == NULL) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
+ (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) {
+ /* If the socket is gone we are out of here */
+ return;
+ }
+ if (stcb->sctp_socket->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ return;
+ }
+ if (stcb && ((stcb->asoc.state & SCTP_STATE_COOKIE_WAIT) ||
+ (stcb->asoc.state & SCTP_STATE_COOKIE_ECHOED))) {
+ if ((notification == SCTP_NOTIFY_INTERFACE_DOWN) ||
+ (notification == SCTP_NOTIFY_INTERFACE_UP) ||
+ (notification == SCTP_NOTIFY_INTERFACE_CONFIRMED)) {
+ /* Don't report these in front states */
+ return;
+ }
+ }
+ switch (notification) {
+ case SCTP_NOTIFY_ASSOC_UP:
+ if (stcb->asoc.assoc_up_sent == 0) {
+ sctp_notify_assoc_change(SCTP_COMM_UP, stcb, error, NULL, so_locked);
+ stcb->asoc.assoc_up_sent = 1;
+ }
+ if (stcb->asoc.adaptation_needed && (stcb->asoc.adaptation_sent == 0)) {
+ sctp_notify_adaptation_layer(stcb, error);
+ }
+ if (stcb->asoc.peer_supports_auth == 0) {
+ sctp_ulp_notify(SCTP_NOTIFY_NO_PEER_AUTH, stcb, 0,
+ NULL, so_locked);
+ }
+ break;
+ case SCTP_NOTIFY_ASSOC_DOWN:
+ sctp_notify_assoc_change(SCTP_SHUTDOWN_COMP, stcb, error, NULL, so_locked);
+ break;
+ case SCTP_NOTIFY_INTERFACE_DOWN:
+ {
+ struct sctp_nets *net;
+
+ net = (struct sctp_nets *)data;
+ sctp_notify_peer_addr_change(stcb, SCTP_ADDR_UNREACHABLE,
+ (struct sockaddr *)&net->ro._l_addr, error);
+ break;
+ }
+ case SCTP_NOTIFY_INTERFACE_UP:
+ {
+ struct sctp_nets *net;
+
+ net = (struct sctp_nets *)data;
+ sctp_notify_peer_addr_change(stcb, SCTP_ADDR_AVAILABLE,
+ (struct sockaddr *)&net->ro._l_addr, error);
+ break;
+ }
+ case SCTP_NOTIFY_INTERFACE_CONFIRMED:
+ {
+ struct sctp_nets *net;
+
+ net = (struct sctp_nets *)data;
+ sctp_notify_peer_addr_change(stcb, SCTP_ADDR_CONFIRMED,
+ (struct sockaddr *)&net->ro._l_addr, error);
+ break;
+ }
+ case SCTP_NOTIFY_SPECIAL_SP_FAIL:
+ sctp_notify_send_failed2(stcb, error,
+ (struct sctp_stream_queue_pending *)data, so_locked);
+ break;
+ case SCTP_NOTIFY_DG_FAIL:
+ sctp_notify_send_failed(stcb, error,
+ (struct sctp_tmit_chunk *)data, so_locked);
+ break;
+ case SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION:
+ {
+ uint32_t val;
+
+ val = *((uint32_t *) data);
+
+ sctp_notify_partial_delivery_indication(stcb, error, val, so_locked);
+ break;
+ }
+ case SCTP_NOTIFY_STRDATA_ERR:
+ break;
+ case SCTP_NOTIFY_ASSOC_ABORTED:
+ if ((stcb) && (((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_WAIT) ||
+ ((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_ECHOED))) {
+ sctp_notify_assoc_change(SCTP_CANT_STR_ASSOC, stcb, error, NULL, so_locked);
+ } else {
+ sctp_notify_assoc_change(SCTP_COMM_LOST, stcb, error, NULL, so_locked);
+ }
+ break;
+ case SCTP_NOTIFY_PEER_OPENED_STREAM:
+ break;
+ case SCTP_NOTIFY_STREAM_OPENED_OK:
+ break;
+ case SCTP_NOTIFY_ASSOC_RESTART:
+ sctp_notify_assoc_change(SCTP_RESTART, stcb, error, data, so_locked);
+ if (stcb->asoc.peer_supports_auth == 0) {
+ sctp_ulp_notify(SCTP_NOTIFY_NO_PEER_AUTH, stcb, 0,
+ NULL, so_locked);
+ }
+ break;
+ case SCTP_NOTIFY_HB_RESP:
+ break;
+ case SCTP_NOTIFY_STR_RESET_INSTREAM_ADD_OK:
+ sctp_notify_stream_reset_add(stcb, error, SCTP_STRRESET_INBOUND_STR);
+ break;
+ case SCTP_NOTIFY_STR_RESET_ADD_OK:
+ sctp_notify_stream_reset_add(stcb, error, SCTP_STRRESET_OUTBOUND_STR);
+ break;
+ case SCTP_NOTIFY_STR_RESET_ADD_FAIL:
+ sctp_notify_stream_reset_add(stcb, error, (SCTP_STRRESET_FAILED | SCTP_STRRESET_OUTBOUND_STR));
+ break;
+
+ case SCTP_NOTIFY_STR_RESET_SEND:
+ sctp_notify_stream_reset(stcb, error, ((uint16_t *) data), SCTP_STRRESET_OUTBOUND_STR);
+ break;
+ case SCTP_NOTIFY_STR_RESET_RECV:
+ sctp_notify_stream_reset(stcb, error, ((uint16_t *) data), SCTP_STRRESET_INBOUND_STR);
+ break;
+ case SCTP_NOTIFY_STR_RESET_FAILED_OUT:
+ sctp_notify_stream_reset(stcb, error, ((uint16_t *) data), (SCTP_STRRESET_OUTBOUND_STR | SCTP_STRRESET_FAILED));
+ break;
+ case SCTP_NOTIFY_STR_RESET_FAILED_IN:
+ sctp_notify_stream_reset(stcb, error, ((uint16_t *) data), (SCTP_STRRESET_INBOUND_STR | SCTP_STRRESET_FAILED));
+ break;
+ case SCTP_NOTIFY_ASCONF_ADD_IP:
+ sctp_notify_peer_addr_change(stcb, SCTP_ADDR_ADDED, data,
+ error);
+ break;
+ case SCTP_NOTIFY_ASCONF_DELETE_IP:
+ sctp_notify_peer_addr_change(stcb, SCTP_ADDR_REMOVED, data,
+ error);
+ break;
+ case SCTP_NOTIFY_ASCONF_SET_PRIMARY:
+ sctp_notify_peer_addr_change(stcb, SCTP_ADDR_MADE_PRIM, data,
+ error);
+ break;
+ case SCTP_NOTIFY_ASCONF_SUCCESS:
+ break;
+ case SCTP_NOTIFY_ASCONF_FAILED:
+ break;
+ case SCTP_NOTIFY_PEER_SHUTDOWN:
+ sctp_notify_shutdown_event(stcb);
+ break;
+ case SCTP_NOTIFY_AUTH_NEW_KEY:
+ sctp_notify_authentication(stcb, SCTP_AUTH_NEWKEY, error,
+ (uint16_t) (uintptr_t) data,
+ so_locked);
+ break;
+ case SCTP_NOTIFY_AUTH_FREE_KEY:
+ sctp_notify_authentication(stcb, SCTP_AUTH_FREE_KEY, error,
+ (uint16_t) (uintptr_t) data,
+ so_locked);
+ break;
+ case SCTP_NOTIFY_NO_PEER_AUTH:
+ sctp_notify_authentication(stcb, SCTP_AUTH_NO_AUTH, error,
+ (uint16_t) (uintptr_t) data,
+ so_locked);
+ break;
+ case SCTP_NOTIFY_SENDER_DRY:
+ sctp_notify_sender_dry_event(stcb, so_locked);
+ break;
+ default:
+ SCTPDBG(SCTP_DEBUG_UTIL1, "%s: unknown notification %xh (%u)\n",
+ __FUNCTION__, notification, notification);
+ break;
+ } /* end switch */
+}
+
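+/*
+ * Flush everything still queued for sending on this association (sent
+ * queue, send queue and the per-stream queues), notifying the ULP of
+ * each failed datagram as it goes.
+ */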
+void
+sctp_report_all_outbound(struct sctp_tcb *stcb, int holds_lock, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ struct sctp_association *asoc;
+ struct sctp_stream_out *outs;
+ struct sctp_tmit_chunk *chk;
+ struct sctp_stream_queue_pending *sp;
+ int i;
+
+	/* must have a valid TCB before touching the association */
+	if (stcb == NULL) {
+		return;
+	}
+	asoc = &stcb->asoc;
+
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ /* already being freed */
+ return;
+ }
+ if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
+ (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) {
+ return;
+ }
+	/* now go through all the gunk, freeing chunks */
+ if (holds_lock == 0) {
+ SCTP_TCB_SEND_LOCK(stcb);
+ }
+ /* sent queue SHOULD be empty */
+ if (!TAILQ_EMPTY(&asoc->sent_queue)) {
+ chk = TAILQ_FIRST(&asoc->sent_queue);
+ while (chk) {
+ TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next);
+ asoc->sent_queue_cnt--;
+ if (chk->data != NULL) {
+ sctp_free_bufspace(stcb, asoc, chk, 1);
+ sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb,
+ SCTP_NOTIFY_DATAGRAM_SENT, chk, so_locked);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ }
+ sctp_free_a_chunk(stcb, chk);
+ /* sa_ignore FREED_MEMORY */
+ chk = TAILQ_FIRST(&asoc->sent_queue);
+ }
+ }
+ /* pending send queue SHOULD be empty */
+ if (!TAILQ_EMPTY(&asoc->send_queue)) {
+ chk = TAILQ_FIRST(&asoc->send_queue);
+ while (chk) {
+ TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next);
+ asoc->send_queue_cnt--;
+ if (chk->data != NULL) {
+ sctp_free_bufspace(stcb, asoc, chk, 1);
+ sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb,
+ SCTP_NOTIFY_DATAGRAM_UNSENT, chk, so_locked);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ }
+ sctp_free_a_chunk(stcb, chk);
+ /* sa_ignore FREED_MEMORY */
+ chk = TAILQ_FIRST(&asoc->send_queue);
+ }
+ }
+ for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+ /* For each stream */
+ outs = &stcb->asoc.strmout[i];
+ /* clean up any sends there */
+ stcb->asoc.locked_on_sending = NULL;
+ sp = TAILQ_FIRST(&outs->outqueue);
+ while (sp) {
+ stcb->asoc.stream_queue_cnt--;
+ TAILQ_REMOVE(&outs->outqueue, sp, next);
+ sctp_free_spbufspace(stcb, asoc, sp);
+ if (sp->data) {
+ sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb,
+ SCTP_NOTIFY_DATAGRAM_UNSENT, (void *)sp, so_locked);
+ if (sp->data) {
+ sctp_m_freem(sp->data);
+ sp->data = NULL;
+ }
+ }
+ if (sp->net) {
+ sctp_free_remote_addr(sp->net);
+ sp->net = NULL;
+ }
+ /* Free the chunk */
+ sctp_free_a_strmoq(stcb, sp);
+ /* sa_ignore FREED_MEMORY */
+ sp = TAILQ_FIRST(&outs->outqueue);
+ }
+ }
+
+ if (holds_lock == 0) {
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ }
+}
+
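+/*
+ * Tell the ULP the association was aborted: report all outbound data as
+ * failed, mark TCP-model endpoints as aborted and deliver the
+ * SCTP_NOTIFY_ASSOC_ABORTED notification.
+ */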
+void
+sctp_abort_notification(struct sctp_tcb *stcb, int error, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+
+ if (stcb == NULL) {
+ return;
+ }
+ if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
+ (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) {
+ return;
+ }
+ /* Tell them we lost the asoc */
+ sctp_report_all_outbound(stcb, 1, so_locked);
+ if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_CONNECTED))) {
+ stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_WAS_ABORTED;
+ }
+ sctp_ulp_notify(SCTP_NOTIFY_ASSOC_ABORTED, stcb, error, NULL, so_locked);
+}
+
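+/*
+ * Abort in response to an inbound packet: notify the ULP if we have a
+ * TCB, send an ABORT back to the peer and then free the association.
+ */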
+void
+sctp_abort_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct mbuf *m, int iphlen, struct sctphdr *sh, struct mbuf *op_err,
+ uint32_t vrf_id, uint16_t port)
+{
+ uint32_t vtag;
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+
+ vtag = 0;
+ if (stcb != NULL) {
+ /* We have a TCB to abort, send notification too */
+ vtag = stcb->asoc.peer_vtag;
+ sctp_abort_notification(stcb, 0, SCTP_SO_NOT_LOCKED);
+ /* get the assoc vrf id and table id */
+ vrf_id = stcb->asoc.vrf_id;
+ stcb->asoc.state |= SCTP_STATE_WAS_ABORTED;
+ }
+ sctp_send_abort(m, iphlen, sh, vtag, op_err, vrf_id, port);
+ if (stcb != NULL) {
+ /* Ok, now lets free it */
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(inp);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+#endif
+ SCTP_STAT_INCR_COUNTER32(sctps_aborted);
+ if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_4);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ }
+}
+
+#ifdef SCTP_ASOCLOG_OF_TSNS
+void
+sctp_print_out_track_log(struct sctp_tcb *stcb)
+{
+#ifdef NOSIY_PRINTS
+ int i;
+
+ SCTP_PRINTF("Last ep reason:%x\n", stcb->sctp_ep->last_abort_code);
+ SCTP_PRINTF("IN bound TSN log-aaa\n");
+ if ((stcb->asoc.tsn_in_at == 0) && (stcb->asoc.tsn_in_wrapped == 0)) {
+ SCTP_PRINTF("None rcvd\n");
+ goto none_in;
+ }
+ if (stcb->asoc.tsn_in_wrapped) {
+ for (i = stcb->asoc.tsn_in_at; i < SCTP_TSN_LOG_SIZE; i++) {
+ SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n",
+ stcb->asoc.in_tsnlog[i].tsn,
+ stcb->asoc.in_tsnlog[i].strm,
+ stcb->asoc.in_tsnlog[i].seq,
+ stcb->asoc.in_tsnlog[i].flgs,
+ stcb->asoc.in_tsnlog[i].sz);
+ }
+ }
+ if (stcb->asoc.tsn_in_at) {
+ for (i = 0; i < stcb->asoc.tsn_in_at; i++) {
+ SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n",
+ stcb->asoc.in_tsnlog[i].tsn,
+ stcb->asoc.in_tsnlog[i].strm,
+ stcb->asoc.in_tsnlog[i].seq,
+ stcb->asoc.in_tsnlog[i].flgs,
+ stcb->asoc.in_tsnlog[i].sz);
+ }
+ }
+none_in:
+ SCTP_PRINTF("OUT bound TSN log-aaa\n");
+ if ((stcb->asoc.tsn_out_at == 0) &&
+ (stcb->asoc.tsn_out_wrapped == 0)) {
+ SCTP_PRINTF("None sent\n");
+ }
+ if (stcb->asoc.tsn_out_wrapped) {
+ for (i = stcb->asoc.tsn_out_at; i < SCTP_TSN_LOG_SIZE; i++) {
+ SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n",
+ stcb->asoc.out_tsnlog[i].tsn,
+ stcb->asoc.out_tsnlog[i].strm,
+ stcb->asoc.out_tsnlog[i].seq,
+ stcb->asoc.out_tsnlog[i].flgs,
+ stcb->asoc.out_tsnlog[i].sz);
+ }
+ }
+ if (stcb->asoc.tsn_out_at) {
+ for (i = 0; i < stcb->asoc.tsn_out_at; i++) {
+ SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n",
+ stcb->asoc.out_tsnlog[i].tsn,
+ stcb->asoc.out_tsnlog[i].strm,
+ stcb->asoc.out_tsnlog[i].seq,
+ stcb->asoc.out_tsnlog[i].flgs,
+ stcb->asoc.out_tsnlog[i].sz);
+ }
+ }
+#endif
+}
+
+#endif
+
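+/*
+ * Locally abort an association: notify the ULP, send an ABORT chunk to
+ * the peer and free the association state.
+ */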
+void
+sctp_abort_an_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ int error, struct mbuf *op_err,
+ int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ uint32_t vtag;
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(inp);
+#endif
+ if (stcb == NULL) {
+ /* Got to have a TCB */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ if (LIST_FIRST(&inp->sctp_asoc_list) == NULL) {
+ sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
+ SCTP_CALLED_DIRECTLY_NOCMPSET);
+ }
+ }
+ return;
+ } else {
+ stcb->asoc.state |= SCTP_STATE_WAS_ABORTED;
+ }
+ vtag = stcb->asoc.peer_vtag;
+ /* notify the ulp */
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0)
+ sctp_abort_notification(stcb, error, so_locked);
+ /* notify the peer */
+#if defined(SCTP_PANIC_ON_ABORT)
+ panic("aborting an association");
+#endif
+ sctp_send_abort_tcb(stcb, op_err, so_locked);
+ SCTP_STAT_INCR_COUNTER32(sctps_aborted);
+ if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ }
+ /* now free the asoc */
+#ifdef SCTP_ASOCLOG_OF_TSNS
+ sctp_print_out_track_log(stcb);
+#endif
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ if (!so_locked) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ }
+#endif
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_5);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ if (!so_locked) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ }
+#endif
+}
+
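+/*
+ * Handle an out-of-the-blue packet: stay silent for chunk types that
+ * must not be answered (ABORT, SHUTDOWN COMPLETE, ...), reply to a
+ * SHUTDOWN ACK with a SHUTDOWN COMPLETE, and send an ABORT otherwise.
+ */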
+void
+sctp_handle_ootb(struct mbuf *m, int iphlen, int offset, struct sctphdr *sh,
+ struct sctp_inpcb *inp, struct mbuf *op_err, uint32_t vrf_id, uint16_t port)
+{
+ struct sctp_chunkhdr *ch, chunk_buf;
+ unsigned int chk_length;
+
+ SCTP_STAT_INCR_COUNTER32(sctps_outoftheblue);
+ /* Generate a TO address for future reference */
+ if (inp && (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
+ if (LIST_FIRST(&inp->sctp_asoc_list) == NULL) {
+ sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
+ SCTP_CALLED_DIRECTLY_NOCMPSET);
+ }
+ }
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset,
+ sizeof(*ch), (uint8_t *) & chunk_buf);
+ while (ch != NULL) {
+ chk_length = ntohs(ch->chunk_length);
+ if (chk_length < sizeof(*ch)) {
+ /* break to abort land */
+ break;
+ }
+ switch (ch->chunk_type) {
+ case SCTP_COOKIE_ECHO:
+ /* We hit here only if the assoc is being freed */
+ return;
+ case SCTP_PACKET_DROPPED:
+ /* we don't respond to pkt-dropped */
+ return;
+ case SCTP_ABORT_ASSOCIATION:
+ /* we don't respond with an ABORT to an ABORT */
+ return;
+ case SCTP_SHUTDOWN_COMPLETE:
+ /*
+ * we ignore it since we are not waiting for it and
+ * peer is gone
+ */
+ return;
+ case SCTP_SHUTDOWN_ACK:
+ sctp_send_shutdown_complete2(m, iphlen, sh, vrf_id, port);
+ return;
+ default:
+ break;
+ }
+ offset += SCTP_SIZE32(chk_length);
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset,
+ sizeof(*ch), (uint8_t *) & chunk_buf);
+ }
+ sctp_send_abort(m, iphlen, sh, 0, op_err, vrf_id, port);
+}
+
+/*
+ * check the inbound datagram to make sure there is not an abort inside it,
+ * if there is return 1, else return 0.
+ */
+int
+sctp_is_there_an_abort_here(struct mbuf *m, int iphlen, uint32_t * vtagfill)
+{
+ struct sctp_chunkhdr *ch;
+ struct sctp_init_chunk *init_chk, chunk_buf;
+ int offset;
+ unsigned int chk_length;
+
+ offset = iphlen + sizeof(struct sctphdr);
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, sizeof(*ch),
+ (uint8_t *) & chunk_buf);
+ while (ch != NULL) {
+ chk_length = ntohs(ch->chunk_length);
+ if (chk_length < sizeof(*ch)) {
+ /* packet is probably corrupt */
+ break;
+ }
+ /* we seem to be ok, is it an abort? */
+ if (ch->chunk_type == SCTP_ABORT_ASSOCIATION) {
+ /* yep, tell them */
+ return (1);
+ }
+ if (ch->chunk_type == SCTP_INITIATION) {
+ /* need to update the Vtag */
+ init_chk = (struct sctp_init_chunk *)sctp_m_getptr(m,
+ offset, sizeof(*init_chk), (uint8_t *) & chunk_buf);
+ if (init_chk != NULL) {
+ *vtagfill = ntohl(init_chk->init.initiate_tag);
+ }
+ }
+ /* Nope, move to the next chunk */
+ offset += SCTP_SIZE32(chk_length);
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset,
+ sizeof(*ch), (uint8_t *) & chunk_buf);
+ }
+ return (0);
+}
+
+/*
+ * currently (2/02), ifa_addr embeds scope_ids and doesn't have sin6_scope_id
+ * set (i.e. it's 0), so create this function to compare link-local scopes
+ */
+#ifdef INET6
+uint32_t
+sctp_is_same_scope(struct sockaddr_in6 *addr1, struct sockaddr_in6 *addr2)
+{
+ struct sockaddr_in6 a, b;
+
+ /* save copies */
+ a = *addr1;
+ b = *addr2;
+
+ if (a.sin6_scope_id == 0)
+ if (sa6_recoverscope(&a)) {
+ /* can't get scope, so can't match */
+ return (0);
+ }
+ if (b.sin6_scope_id == 0)
+ if (sa6_recoverscope(&b)) {
+ /* can't get scope, so can't match */
+ return (0);
+ }
+ if (a.sin6_scope_id != b.sin6_scope_id)
+ return (0);
+
+ return (1);
+}
+
+/*
+ * returns a sockaddr_in6 with embedded scope recovered and removed
+ */
+struct sockaddr_in6 *
+sctp_recover_scope(struct sockaddr_in6 *addr, struct sockaddr_in6 *store)
+{
+ /* check and strip embedded scope junk */
+ if (addr->sin6_family == AF_INET6) {
+ if (IN6_IS_SCOPE_LINKLOCAL(&addr->sin6_addr)) {
+ if (addr->sin6_scope_id == 0) {
+ *store = *addr;
+ if (!sa6_recoverscope(store)) {
+ /* use the recovered scope */
+ addr = store;
+ }
+ } else {
+ /* else, return the original "to" addr */
+ in6_clearscope(&addr->sin6_addr);
+ }
+ }
+ }
+ return (addr);
+}
+
+#endif
+
+/*
+ * are the two addresses the same? currently a "scopeless" check returns: 1
+ * if same, 0 if not
+ */
+int
+sctp_cmpaddr(struct sockaddr *sa1, struct sockaddr *sa2)
+{
+
+ /* must be valid */
+ if (sa1 == NULL || sa2 == NULL)
+ return (0);
+
+ /* must be the same family */
+ if (sa1->sa_family != sa2->sa_family)
+ return (0);
+
+ switch (sa1->sa_family) {
+#ifdef INET6
+ case AF_INET6:
+ {
+ /* IPv6 addresses */
+ struct sockaddr_in6 *sin6_1, *sin6_2;
+
+ sin6_1 = (struct sockaddr_in6 *)sa1;
+ sin6_2 = (struct sockaddr_in6 *)sa2;
+ return (SCTP6_ARE_ADDR_EQUAL(sin6_1,
+ sin6_2));
+ }
+#endif
+ case AF_INET:
+ {
+ /* IPv4 addresses */
+ struct sockaddr_in *sin_1, *sin_2;
+
+ sin_1 = (struct sockaddr_in *)sa1;
+ sin_2 = (struct sockaddr_in *)sa2;
+ return (sin_1->sin_addr.s_addr == sin_2->sin_addr.s_addr);
+ }
+ default:
+ /* we don't do these... */
+ return (0);
+ }
+}
+
+void
+sctp_print_address(struct sockaddr *sa)
+{
+#ifdef INET6
+ char ip6buf[INET6_ADDRSTRLEN];
+
+ ip6buf[0] = 0;
+#endif
+
+ switch (sa->sa_family) {
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)sa;
+ SCTP_PRINTF("IPv6 address: %s:port:%d scope:%u\n",
+ ip6_sprintf(ip6buf, &sin6->sin6_addr),
+ ntohs(sin6->sin6_port),
+ sin6->sin6_scope_id);
+ break;
+ }
+#endif
+ case AF_INET:
+ {
+ struct sockaddr_in *sin;
+ unsigned char *p;
+
+ sin = (struct sockaddr_in *)sa;
+ p = (unsigned char *)&sin->sin_addr;
+ SCTP_PRINTF("IPv4 address: %u.%u.%u.%u:%d\n",
+ p[0], p[1], p[2], p[3], ntohs(sin->sin_port));
+ break;
+ }
+ default:
+ SCTP_PRINTF("?\n");
+ break;
+ }
+}
+
+void
+sctp_print_address_pkt(struct ip *iph, struct sctphdr *sh)
+{
+ switch (iph->ip_v) {
+ case IPVERSION:
+ {
+ struct sockaddr_in lsa, fsa;
+
+ bzero(&lsa, sizeof(lsa));
+ lsa.sin_len = sizeof(lsa);
+ lsa.sin_family = AF_INET;
+ lsa.sin_addr = iph->ip_src;
+ lsa.sin_port = sh->src_port;
+ bzero(&fsa, sizeof(fsa));
+ fsa.sin_len = sizeof(fsa);
+ fsa.sin_family = AF_INET;
+ fsa.sin_addr = iph->ip_dst;
+ fsa.sin_port = sh->dest_port;
+ SCTP_PRINTF("src: ");
+ sctp_print_address((struct sockaddr *)&lsa);
+ SCTP_PRINTF("dest: ");
+ sctp_print_address((struct sockaddr *)&fsa);
+ break;
+ }
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ {
+ struct ip6_hdr *ip6;
+ struct sockaddr_in6 lsa6, fsa6;
+
+ ip6 = (struct ip6_hdr *)iph;
+ bzero(&lsa6, sizeof(lsa6));
+ lsa6.sin6_len = sizeof(lsa6);
+ lsa6.sin6_family = AF_INET6;
+ lsa6.sin6_addr = ip6->ip6_src;
+ lsa6.sin6_port = sh->src_port;
+ bzero(&fsa6, sizeof(fsa6));
+ fsa6.sin6_len = sizeof(fsa6);
+ fsa6.sin6_family = AF_INET6;
+ fsa6.sin6_addr = ip6->ip6_dst;
+ fsa6.sin6_port = sh->dest_port;
+ SCTP_PRINTF("src: ");
+ sctp_print_address((struct sockaddr *)&lsa6);
+ SCTP_PRINTF("dest: ");
+ sctp_print_address((struct sockaddr *)&fsa6);
+ break;
+ }
+#endif
+ default:
+ /* TSNH */
+ break;
+ }
+}
+
+void
+sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp,
+ struct sctp_inpcb *new_inp,
+ struct sctp_tcb *stcb,
+ int waitflags)
+{
+ /*
+ * go through our old INP and pull off any control structures that
+	 * belong to stcb and move them to the new inp.
+ */
+ struct socket *old_so, *new_so;
+ struct sctp_queued_to_read *control, *nctl;
+ struct sctp_readhead tmp_queue;
+ struct mbuf *m;
+ int error = 0;
+
+ old_so = old_inp->sctp_socket;
+ new_so = new_inp->sctp_socket;
+ TAILQ_INIT(&tmp_queue);
+ error = sblock(&old_so->so_rcv, waitflags);
+ if (error) {
+ /*
+		 * Gak, can't get sblock, we have a problem. Data will be
+		 * left stranded, and we don't dare look at it since the
+		 * other thread may be reading something. Oh well, it's a
+		 * screwed up app that does a peeloff OR an accept while
+		 * reading from the main socket... actually it's only the
+		 * peeloff() case, since I think read will fail on a
+		 * listening socket.
+ */
+ return;
+ }
+ /* lock the socket buffers */
+ SCTP_INP_READ_LOCK(old_inp);
+ control = TAILQ_FIRST(&old_inp->read_queue);
+	/* Pull off all for our target stcb */
+ while (control) {
+ nctl = TAILQ_NEXT(control, next);
+ if (control->stcb == stcb) {
+ /* remove it we want it */
+ TAILQ_REMOVE(&old_inp->read_queue, control, next);
+ TAILQ_INSERT_TAIL(&tmp_queue, control, next);
+ m = control->data;
+ while (m) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(&old_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m));
+ }
+ sctp_sbfree(control, stcb, &old_so->so_rcv, m);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(&old_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
+ }
+ m = SCTP_BUF_NEXT(m);
+ }
+ }
+ control = nctl;
+ }
+ SCTP_INP_READ_UNLOCK(old_inp);
+ /* Remove the sb-lock on the old socket */
+
+ sbunlock(&old_so->so_rcv);
+ /* Now we move them over to the new socket buffer */
+ control = TAILQ_FIRST(&tmp_queue);
+ SCTP_INP_READ_LOCK(new_inp);
+ while (control) {
+ nctl = TAILQ_NEXT(control, next);
+ TAILQ_INSERT_TAIL(&new_inp->read_queue, control, next);
+ m = control->data;
+ while (m) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(&new_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(m));
+ }
+ sctp_sballoc(stcb, &new_so->so_rcv, m);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(&new_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
+ }
+ m = SCTP_BUF_NEXT(m);
+ }
+ control = nctl;
+ }
+ SCTP_INP_READ_UNLOCK(new_inp);
+}
+
+void
+sctp_add_to_readq(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_queued_to_read *control,
+ struct sockbuf *sb,
+ int end,
+ int inp_read_lock_held,
+ int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ /*
+ * Here we must place the control on the end of the socket read
+ * queue AND increment sb_cc so that select will work properly on
+ * read.
+ */
+ struct mbuf *m, *prev = NULL;
+
+ if (inp == NULL) {
+ /* Gak, TSNH!! */
+#ifdef INVARIANTS
+ panic("Gak, inp NULL on add_to_readq");
+#endif
+ return;
+ }
+ if (inp_read_lock_held == 0)
+ SCTP_INP_READ_LOCK(inp);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ) {
+ sctp_free_remote_addr(control->whoFrom);
+ if (control->data) {
+ sctp_m_freem(control->data);
+ control->data = NULL;
+ }
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), control);
+ if (inp_read_lock_held == 0)
+ SCTP_INP_READ_UNLOCK(inp);
+ return;
+ }
+ if (!(control->spec_flags & M_NOTIFICATION)) {
+ atomic_add_int(&inp->total_recvs, 1);
+ if (!control->do_not_ref_stcb) {
+ atomic_add_int(&stcb->total_recvs, 1);
+ }
+ }
+ m = control->data;
+ control->held_length = 0;
+ control->length = 0;
+ while (m) {
+ if (SCTP_BUF_LEN(m) == 0) {
+ /* Skip mbufs with NO length */
+ if (prev == NULL) {
+ /* First one */
+ control->data = sctp_m_free(m);
+ m = control->data;
+ } else {
+ SCTP_BUF_NEXT(prev) = sctp_m_free(m);
+ m = SCTP_BUF_NEXT(prev);
+ }
+ if (m == NULL) {
+ control->tail_mbuf = prev;
+ }
+ continue;
+ }
+ prev = m;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(m));
+ }
+ sctp_sballoc(stcb, sb, m);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
+ }
+ atomic_add_int(&control->length, SCTP_BUF_LEN(m));
+ m = SCTP_BUF_NEXT(m);
+ }
+ if (prev != NULL) {
+ control->tail_mbuf = prev;
+ } else {
+ /* Everything got collapsed out?? */
+ sctp_free_remote_addr(control->whoFrom);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), control);
+ if (inp_read_lock_held == 0)
+ SCTP_INP_READ_UNLOCK(inp);
+ return;
+ }
+ if (end) {
+ control->end_added = 1;
+ }
+ TAILQ_INSERT_TAIL(&inp->read_queue, control, next);
+ if (inp_read_lock_held == 0)
+ SCTP_INP_READ_UNLOCK(inp);
+ if (inp && inp->sctp_socket) {
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) {
+ SCTP_ZERO_COPY_EVENT(inp, inp->sctp_socket);
+ } else {
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+ so = SCTP_INP_SO(inp);
+ if (!so_locked) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return;
+ }
+ }
+#endif
+ sctp_sorwakeup(inp, inp->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ if (!so_locked) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ }
+#endif
+ }
+ }
+}
+
+
+int
+sctp_append_to_readq(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_queued_to_read *control,
+ struct mbuf *m,
+ int end,
+ int ctls_cumack,
+ struct sockbuf *sb)
+{
+ /*
+ * A partial delivery API event is underway. OR we are appending on
+ * the reassembly queue.
+ *
+ * If PDAPI this means we need to add m to the end of the data.
+ * Increase the length in the control AND increment the sb_cc.
+ * Otherwise sb is NULL and all we need to do is put it at the end
+ * of the mbuf chain.
+ */
+ int len = 0;
+ struct mbuf *mm, *tail = NULL, *prev = NULL;
+
+ if (inp) {
+ SCTP_INP_READ_LOCK(inp);
+ }
+ if (control == NULL) {
+get_out:
+ if (inp) {
+ SCTP_INP_READ_UNLOCK(inp);
+ }
+ return (-1);
+ }
+ if (inp && (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ)) {
+ SCTP_INP_READ_UNLOCK(inp);
+ return 0;
+ }
+ if (control->end_added) {
+ /* huh this one is complete? */
+ goto get_out;
+ }
+ mm = m;
+ if (mm == NULL) {
+ goto get_out;
+ }
+ while (mm) {
+ if (SCTP_BUF_LEN(mm) == 0) {
+			/* Skip mbufs with NO length */
+ if (prev == NULL) {
+ /* First one */
+ m = sctp_m_free(mm);
+ mm = m;
+ } else {
+ SCTP_BUF_NEXT(prev) = sctp_m_free(mm);
+ mm = SCTP_BUF_NEXT(prev);
+ }
+ continue;
+ }
+ prev = mm;
+ len += SCTP_BUF_LEN(mm);
+ if (sb) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(mm));
+ }
+ sctp_sballoc(stcb, sb, mm);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
+ }
+ }
+ mm = SCTP_BUF_NEXT(mm);
+ }
+ if (prev) {
+ tail = prev;
+ } else {
+ /* Really there should always be a prev */
+ if (m == NULL) {
+ /* Huh nothing left? */
+#ifdef INVARIANTS
+ panic("Nothing left to add?");
+#else
+ goto get_out;
+#endif
+ }
+ tail = m;
+ }
+ if (control->tail_mbuf) {
+ /* append */
+ SCTP_BUF_NEXT(control->tail_mbuf) = m;
+ control->tail_mbuf = tail;
+ } else {
+ /* nothing there */
+#ifdef INVARIANTS
+ if (control->data != NULL) {
+ panic("This should NOT happen");
+ }
+#endif
+ control->data = m;
+ control->tail_mbuf = tail;
+ }
+ atomic_add_int(&control->length, len);
+ if (end) {
+ /* message is complete */
+ if (stcb && (control == stcb->asoc.control_pdapi)) {
+ stcb->asoc.control_pdapi = NULL;
+ }
+ control->held_length = 0;
+ control->end_added = 1;
+ }
+ if (stcb == NULL) {
+ control->do_not_ref_stcb = 1;
+ }
+ /*
+ * When we are appending in partial delivery, the cum-ack is used
+ * for the actual pd-api highest tsn on this mbuf. The true cum-ack
+ * is populated in the outbound sinfo structure from the true cumack
+ * if the association exists...
+ */
+ control->sinfo_tsn = control->sinfo_cumtsn = ctls_cumack;
+ if (inp) {
+ SCTP_INP_READ_UNLOCK(inp);
+ }
+ if (inp && inp->sctp_socket) {
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) {
+ SCTP_ZERO_COPY_EVENT(inp, inp->sctp_socket);
+ } else {
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+ so = SCTP_INP_SO(inp);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return (0);
+ }
+#endif
+ sctp_sorwakeup(inp, inp->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ }
+ }
+ return (0);
+}
+
+
+
+/*************HOLD THIS COMMENT FOR PATCH FILE OF
+ *************ALTERNATE ROUTING CODE
+ */
+
+/*************HOLD THIS COMMENT FOR END OF PATCH FILE OF
+ *************ALTERNATE ROUTING CODE
+ */
+
+struct mbuf *
+sctp_generate_invmanparam(int err)
+{
+	/* Return an mbuf with an invalid mandatory parameter */
+ struct mbuf *m;
+
+ m = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_DONTWAIT, 1, MT_DATA);
+ if (m) {
+ struct sctp_paramhdr *ph;
+
+ SCTP_BUF_LEN(m) = sizeof(struct sctp_paramhdr);
+ ph = mtod(m, struct sctp_paramhdr *);
+ ph->param_length = htons(sizeof(struct sctp_paramhdr));
+ ph->param_type = htons(err);
+ }
+ return (m);
+}
+
+#ifdef SCTP_MBCNT_LOGGING
+void
+sctp_free_bufspace(struct sctp_tcb *stcb, struct sctp_association *asoc,
+ struct sctp_tmit_chunk *tp1, int chk_cnt)
+{
+ if (tp1->data == NULL) {
+ return;
+ }
+ asoc->chunks_on_out_queue -= chk_cnt;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBCNT_LOGGING_ENABLE) {
+ sctp_log_mbcnt(SCTP_LOG_MBCNT_DECREASE,
+ asoc->total_output_queue_size,
+ tp1->book_size,
+ 0,
+ tp1->mbcnt);
+ }
+ if (asoc->total_output_queue_size >= tp1->book_size) {
+ atomic_add_int(&asoc->total_output_queue_size, -tp1->book_size);
+ } else {
+ asoc->total_output_queue_size = 0;
+ }
+
+ if (stcb->sctp_socket && (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) ||
+ ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE)))) {
+ if (stcb->sctp_socket->so_snd.sb_cc >= tp1->book_size) {
+ stcb->sctp_socket->so_snd.sb_cc -= tp1->book_size;
+ } else {
+ stcb->sctp_socket->so_snd.sb_cc = 0;
+
+ }
+ }
+}
+
+#endif
+
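+/*
+ * Release a PR-SCTP message: free every fragment of it across the sent,
+ * send and stream queues, mark the chunks to be skipped via FORWARD-TSN,
+ * notify the ULP and return the number of bytes released.
+ */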
+int
+sctp_release_pr_sctp_chunk(struct sctp_tcb *stcb, struct sctp_tmit_chunk *tp1,
+ int reason, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ struct sctp_stream_out *strq;
+ struct sctp_tmit_chunk *chk = NULL;
+ struct sctp_stream_queue_pending *sp;
+ uint16_t stream = 0, seq = 0;
+ uint8_t foundeom = 0;
+ int ret_sz = 0;
+ int notdone;
+ int do_wakeup_routine = 0;
+
+ stream = tp1->rec.data.stream_number;
+ seq = tp1->rec.data.stream_seq;
+ do {
+ ret_sz += tp1->book_size;
+ if (tp1->data != NULL) {
+ if (tp1->sent < SCTP_DATAGRAM_RESEND) {
+ sctp_flight_size_decrease(tp1);
+ sctp_total_flight_decrease(stcb, tp1);
+ }
+ sctp_free_bufspace(stcb, &stcb->asoc, tp1, 1);
+ stcb->asoc.peers_rwnd += tp1->send_size;
+ stcb->asoc.peers_rwnd += SCTP_BASE_SYSCTL(sctp_peer_chunk_oh);
+ sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb, reason, tp1, so_locked);
+ if (tp1->data) {
+ sctp_m_freem(tp1->data);
+ tp1->data = NULL;
+ }
+ do_wakeup_routine = 1;
+ if (PR_SCTP_BUF_ENABLED(tp1->flags)) {
+ stcb->asoc.sent_queue_cnt_removeable--;
+ }
+ }
+ tp1->sent = SCTP_FORWARD_TSN_SKIP;
+ if ((tp1->rec.data.rcv_flags & SCTP_DATA_NOT_FRAG) ==
+ SCTP_DATA_NOT_FRAG) {
+			/* not frag'ed, we are done */
+ notdone = 0;
+ foundeom = 1;
+ } else if (tp1->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
+ /* end of frag, we are done */
+ notdone = 0;
+ foundeom = 1;
+ } else {
+ /*
+ * Its a begin or middle piece, we must mark all of
+ * it
+ */
+ notdone = 1;
+ tp1 = TAILQ_NEXT(tp1, sctp_next);
+ }
+ } while (tp1 && notdone);
+ if (foundeom == 0) {
+ /*
+ * The multi-part message was scattered across the send and
+ * sent queue.
+ */
+next_on_sent:
+ tp1 = TAILQ_FIRST(&stcb->asoc.send_queue);
+ /*
+	 * recurse through the send_queue too, starting at the
+ * beginning.
+ */
+ if ((tp1) &&
+ (tp1->rec.data.stream_number == stream) &&
+ (tp1->rec.data.stream_seq == seq)) {
+ /*
+ * save to chk in case we have some on stream out
+ * queue. If so and we have an un-transmitted one we
+ * don't have to fudge the TSN.
+ */
+ chk = tp1;
+ ret_sz += tp1->book_size;
+ sctp_free_bufspace(stcb, &stcb->asoc, tp1, 1);
+ sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb, reason, tp1, so_locked);
+ if (tp1->data) {
+ sctp_m_freem(tp1->data);
+ tp1->data = NULL;
+ }
+ /* No flight involved here book the size to 0 */
+ tp1->book_size = 0;
+ if (tp1->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
+ foundeom = 1;
+ }
+ do_wakeup_routine = 1;
+ tp1->sent = SCTP_FORWARD_TSN_SKIP;
+ TAILQ_REMOVE(&stcb->asoc.send_queue, tp1, sctp_next);
+ /*
+ * on to the sent queue so we can wait for it to be
+ * passed by.
+ */
+ TAILQ_INSERT_TAIL(&stcb->asoc.sent_queue, tp1,
+ sctp_next);
+ stcb->asoc.send_queue_cnt--;
+ stcb->asoc.sent_queue_cnt++;
+ goto next_on_sent;
+ }
+ }
+ if (foundeom == 0) {
+ /*
+ * Still no eom found. That means there is stuff left on the
+ * stream out queue.. yuck.
+ */
+ strq = &stcb->asoc.strmout[stream];
+ SCTP_TCB_SEND_LOCK(stcb);
+ sp = TAILQ_FIRST(&strq->outqueue);
+ while (sp->strseq <= seq) {
+ /* Check if its our SEQ */
+ if (sp->strseq == seq) {
+ sp->discard_rest = 1;
+ /*
+ * We may need to put a chunk on the queue
+ * that holds the TSN that would have been
+ * sent with the LAST bit.
+ */
+ if (chk == NULL) {
+ /* Yep, we have to */
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ /*
+ * we are hosed. All we can
+ * do is nothing.. which
+ * will cause an abort if
+ * the peer is paying
+ * attention.
+ */
+ goto oh_well;
+ }
+ memset(chk, 0, sizeof(*chk));
+ chk->rec.data.rcv_flags = SCTP_DATA_LAST_FRAG;
+ chk->sent = SCTP_FORWARD_TSN_SKIP;
+ chk->asoc = &stcb->asoc;
+ chk->rec.data.stream_seq = sp->strseq;
+ chk->rec.data.stream_number = sp->stream;
+ chk->rec.data.payloadtype = sp->ppid;
+ chk->rec.data.context = sp->context;
+ chk->flags = sp->act_flags;
+ if (sp->net)
+ chk->whoTo = sp->net;
+ else
+ chk->whoTo = stcb->asoc.primary_destination;
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ chk->rec.data.TSN_seq = atomic_fetchadd_int(&stcb->asoc.sending_seq, 1);
+ stcb->asoc.pr_sctp_cnt++;
+ chk->pr_sctp_on = 1;
+ TAILQ_INSERT_TAIL(&stcb->asoc.sent_queue, chk, sctp_next);
+ stcb->asoc.sent_queue_cnt++;
+ stcb->asoc.pr_sctp_cnt++;
+ } else {
+ chk->rec.data.rcv_flags |= SCTP_DATA_LAST_FRAG;
+ }
+ oh_well:
+ if (sp->data) {
+ /*
+ * Pull any data to free up the SB
+				 * and allow the sender to "add more"
+				 * while we throw this one away :-)
+ */
+ sctp_free_spbufspace(stcb, &stcb->asoc,
+ sp);
+ ret_sz += sp->length;
+ do_wakeup_routine = 1;
+ sp->some_taken = 1;
+ sctp_m_freem(sp->data);
+ sp->length = 0;
+ sp->data = NULL;
+ sp->tail_mbuf = NULL;
+ }
+ break;
+ } else {
+ /* Next one please */
+ sp = TAILQ_NEXT(sp, next);
+ }
+ } /* End while */
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ }
+ if (do_wakeup_routine) {
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ if (!so_locked) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
+ /* assoc was freed while we were unlocked */
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return (ret_sz);
+ }
+ }
+#endif
+ sctp_sowwakeup(stcb->sctp_ep, stcb->sctp_socket);
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ if (!so_locked) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ }
+#endif
+ }
+ return (ret_sz);
+}
+
+/*
+ * Checks to see if the given address, sa, is one that is currently known
+ * by the kernel.
+ * Note: can't distinguish the same address on multiple interfaces and
+ * doesn't handle multiple addresses with different zone/scope ids.
+ * Note: ifa_ifwithaddr() compares the entire sockaddr struct.
+ */
+struct sctp_ifa *
+sctp_find_ifa_in_ep(struct sctp_inpcb *inp, struct sockaddr *addr,
+ int holds_lock)
+{
+ struct sctp_laddr *laddr;
+
+ if (holds_lock == 0) {
+ SCTP_INP_RLOCK(inp);
+ }
+ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+ if (laddr->ifa == NULL)
+ continue;
+ if (addr->sa_family != laddr->ifa->address.sa.sa_family)
+ continue;
+ if (addr->sa_family == AF_INET) {
+ if (((struct sockaddr_in *)addr)->sin_addr.s_addr ==
+ laddr->ifa->address.sin.sin_addr.s_addr) {
+ /* found him. */
+ if (holds_lock == 0) {
+ SCTP_INP_RUNLOCK(inp);
+ }
+ return (laddr->ifa);
+ break;
+ }
+ }
+#ifdef INET6
+ if (addr->sa_family == AF_INET6) {
+ if (SCTP6_ARE_ADDR_EQUAL((struct sockaddr_in6 *)addr,
+ &laddr->ifa->address.sin6)) {
+ /* found him. */
+ if (holds_lock == 0) {
+ SCTP_INP_RUNLOCK(inp);
+ }
+ return (laddr->ifa);
+ break;
+ }
+ }
+#endif
+ }
+ if (holds_lock == 0) {
+ SCTP_INP_RUNLOCK(inp);
+ }
+ return (NULL);
+}
+
+uint32_t
+sctp_get_ifa_hash_val(struct sockaddr *addr)
+{
+ if (addr->sa_family == AF_INET) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)addr;
+ return (sin->sin_addr.s_addr ^ (sin->sin_addr.s_addr >> 16));
+ } else if (addr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+ uint32_t hash_of_addr;
+
+ sin6 = (struct sockaddr_in6 *)addr;
+ hash_of_addr = (sin6->sin6_addr.s6_addr32[0] +
+ sin6->sin6_addr.s6_addr32[1] +
+ sin6->sin6_addr.s6_addr32[2] +
+ sin6->sin6_addr.s6_addr32[3]);
+ hash_of_addr = (hash_of_addr ^ (hash_of_addr >> 16));
+ return (hash_of_addr);
+ }
+ return (0);
+}
+
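+/*
+ * Look up an address in the per-VRF address hash table and return the
+ * matching sctp_ifa, or NULL if the address is not known.
+ */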
+struct sctp_ifa *
+sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock)
+{
+ struct sctp_ifa *sctp_ifap;
+ struct sctp_vrf *vrf;
+ struct sctp_ifalist *hash_head;
+ uint32_t hash_of_addr;
+
+ if (holds_lock == 0)
+ SCTP_IPI_ADDR_RLOCK();
+
+ vrf = sctp_find_vrf(vrf_id);
+ if (vrf == NULL) {
+stage_right:
+ if (holds_lock == 0)
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (NULL);
+ }
+ hash_of_addr = sctp_get_ifa_hash_val(addr);
+
+ hash_head = &vrf->vrf_addr_hash[(hash_of_addr & vrf->vrf_addr_hashmark)];
+ if (hash_head == NULL) {
+ SCTP_PRINTF("hash_of_addr:%x mask:%x table:%x - ",
+ hash_of_addr, (uint32_t) vrf->vrf_addr_hashmark,
+ (uint32_t) (hash_of_addr & vrf->vrf_addr_hashmark));
+ sctp_print_address(addr);
+ SCTP_PRINTF("No such bucket for address\n");
+ if (holds_lock == 0)
+ SCTP_IPI_ADDR_RUNLOCK();
+
+ return (NULL);
+ }
+ LIST_FOREACH(sctp_ifap, hash_head, next_bucket) {
+ if (sctp_ifap == NULL) {
+#ifdef INVARIANTS
+ panic("Huh LIST_FOREACH corrupt");
+ goto stage_right;
+#else
+ SCTP_PRINTF("LIST corrupt of sctp_ifap's?\n");
+ goto stage_right;
+#endif
+ }
+ if (addr->sa_family != sctp_ifap->address.sa.sa_family)
+ continue;
+ if (addr->sa_family == AF_INET) {
+ if (((struct sockaddr_in *)addr)->sin_addr.s_addr ==
+ sctp_ifap->address.sin.sin_addr.s_addr) {
+ /* found him. */
+ if (holds_lock == 0)
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (sctp_ifap);
+ break;
+ }
+ }
+#ifdef INET6
+ if (addr->sa_family == AF_INET6) {
+ if (SCTP6_ARE_ADDR_EQUAL((struct sockaddr_in6 *)addr,
+ &sctp_ifap->address.sin6)) {
+ /* found him. */
+ if (holds_lock == 0)
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (sctp_ifap);
+ break;
+ }
+ }
+#endif
+ }
+ if (holds_lock == 0)
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (NULL);
+}
+
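+/*
+ * Called as the application consumes data: once enough receive window
+ * has opened up since the last report, send a window-update SACK and
+ * kick the output path.
+ */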
+static void
+sctp_user_rcvd(struct sctp_tcb *stcb, uint32_t * freed_so_far, int hold_rlock,
+ uint32_t rwnd_req)
+{
+ /* User pulled some data, do we need a rwnd update? */
+ int r_unlocked = 0;
+ uint32_t dif, rwnd;
+ struct socket *so = NULL;
+
+ if (stcb == NULL)
+ return;
+
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+
+ if (stcb->asoc.state & (SCTP_STATE_ABOUT_TO_BE_FREED |
+ SCTP_STATE_SHUTDOWN_RECEIVED |
+ SCTP_STATE_SHUTDOWN_ACK_SENT)) {
+		/* Pre-check: if we are freeing, no update */
+ goto no_lock;
+ }
+ SCTP_INP_INCR_REF(stcb->sctp_ep);
+ if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) {
+ goto out;
+ }
+ so = stcb->sctp_socket;
+ if (so == NULL) {
+ goto out;
+ }
+ atomic_add_int(&stcb->freed_by_sorcv_sincelast, *freed_so_far);
+	/* Have you freed enough to look? */
+ *freed_so_far = 0;
+	/* Yep, it's worth a look and the lock overhead */
+
+ /* Figure out what the rwnd would be */
+ rwnd = sctp_calc_rwnd(stcb, &stcb->asoc);
+ if (rwnd >= stcb->asoc.my_last_reported_rwnd) {
+ dif = rwnd - stcb->asoc.my_last_reported_rwnd;
+ } else {
+ dif = 0;
+ }
+ if (dif >= rwnd_req) {
+ if (hold_rlock) {
+ SCTP_INP_READ_UNLOCK(stcb->sctp_ep);
+ r_unlocked = 1;
+ }
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ /*
+			 * One last check before we allow the guy possibly
+			 * to get in. There is a race where the guy has not
+			 * reached the gate; in that case we just bail out.
+ */
+ goto out;
+ }
+ SCTP_TCB_LOCK(stcb);
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ /* No reports here */
+ SCTP_TCB_UNLOCK(stcb);
+ goto out;
+ }
+ SCTP_STAT_INCR(sctps_wu_sacks_sent);
+ sctp_send_sack(stcb);
+
+ sctp_chunk_output(stcb->sctp_ep, stcb,
+ SCTP_OUTPUT_FROM_USR_RCVD, SCTP_SO_LOCKED);
+ /* make sure no timer is running */
+ sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_6);
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ /* Update how much we have pending */
+ stcb->freed_by_sorcv_sincelast = dif;
+ }
+out:
+ if (so && r_unlocked && hold_rlock) {
+ SCTP_INP_READ_LOCK(stcb->sctp_ep);
+ }
+ SCTP_INP_DECR_REF(stcb->sctp_ep);
+no_lock:
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ return;
+}
+
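+/*
+ * Core receive path: pull the next message (complete or partially
+ * delivered) off the endpoint read queue and copy it out to the user,
+ * handling blocking, MSG_PEEK and notification messages.
+ */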
+int
+sctp_sorecvmsg(struct socket *so,
+ struct uio *uio,
+ struct mbuf **mp,
+ struct sockaddr *from,
+ int fromlen,
+ int *msg_flags,
+ struct sctp_sndrcvinfo *sinfo,
+ int filling_sinfo)
+{
+ /*
+	 * MSG flags we will look at:
+	 * MSG_DONTWAIT - non-blocking IO.
+	 * MSG_PEEK - look, don't touch :-D (only valid with OUT mbuf copy
+	 * mp=NULL, thus uio is the copy method to userland).
+	 * MSG_WAITALL - ??
+	 * On the way out we may send out any combination of:
+	 * MSG_NOTIFICATION MSG_EOR
+ *
+ */
+ struct sctp_inpcb *inp = NULL;
+ int my_len = 0;
+ int cp_len = 0, error = 0;
+ struct sctp_queued_to_read *control = NULL, *ctl = NULL, *nxt = NULL;
+ struct mbuf *m = NULL;
+ struct sctp_tcb *stcb = NULL;
+ int wakeup_read_socket = 0;
+ int freecnt_applied = 0;
+ int out_flags = 0, in_flags = 0;
+ int block_allowed = 1;
+ uint32_t freed_so_far = 0;
+ uint32_t copied_so_far = 0;
+ int in_eeor_mode = 0;
+ int no_rcv_needed = 0;
+ uint32_t rwnd_req = 0;
+ int hold_sblock = 0;
+ int hold_rlock = 0;
+ int slen = 0;
+ uint32_t held_length = 0;
+ int sockbuf_lock = 0;
+
+ if (uio == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ return (EINVAL);
+ }
+ if (msg_flags) {
+ in_flags = *msg_flags;
+ if (in_flags & MSG_PEEK)
+ SCTP_STAT_INCR(sctps_read_peeks);
+ } else {
+ in_flags = 0;
+ }
+ slen = uio->uio_resid;
+
+ /* Pull in and set up our int flags */
+ if (in_flags & MSG_OOB) {
+ /* Out of band's NOT supported */
+ return (EOPNOTSUPP);
+ }
+ if ((in_flags & MSG_PEEK) && (mp != NULL)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ return (EINVAL);
+ }
+ if ((in_flags & (MSG_DONTWAIT
+ | MSG_NBIO
+ )) ||
+ SCTP_SO_IS_NBIO(so)) {
+ block_allowed = 0;
+ }
+ /* setup the endpoint */
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EFAULT);
+ return (EFAULT);
+ }
+ rwnd_req = (SCTP_SB_LIMIT_RCV(so) >> SCTP_RWND_HIWAT_SHIFT);
+ /* Must be at least a MTU's worth */
+ if (rwnd_req < SCTP_MIN_RWND)
+ rwnd_req = SCTP_MIN_RWND;
+ in_eeor_mode = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_SORECV_ENTER,
+ rwnd_req, in_eeor_mode, so->so_rcv.sb_cc, uio->uio_resid);
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_SORECV_ENTERPL,
+ rwnd_req, block_allowed, so->so_rcv.sb_cc, uio->uio_resid);
+ }
+ error = sblock(&so->so_rcv, (block_allowed ? SBL_WAIT : 0));
+ sockbuf_lock = 1;
+ if (error) {
+ goto release_unlocked;
+ }
+restart:
+
+
+restart_nosblocks:
+ if (hold_sblock == 0) {
+ SOCKBUF_LOCK(&so->so_rcv);
+ hold_sblock = 1;
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) {
+ goto out;
+ }
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ if (so->so_error) {
+ error = so->so_error;
+ if ((in_flags & MSG_PEEK) == 0)
+ so->so_error = 0;
+ goto out;
+ } else {
+ if (so->so_rcv.sb_cc == 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOTCONN);
+ /* indicate EOF */
+ error = 0;
+ goto out;
+ }
+ }
+ }
+ if ((so->so_rcv.sb_cc <= held_length) && block_allowed) {
+ /* we need to wait for data */
+ if ((so->so_rcv.sb_cc == 0) &&
+ ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0) {
+ /*
+ * For the active open side, clear the flags for
+ * re-use; the passive open side is blocked by
+ * connect.
+ */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED) {
+ /*
+ * You were aborted, passive side
+ * always hits here
+ */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ECONNRESET);
+ error = ECONNRESET;
+ /*
+ * You get this once if you are
+ * active open side
+ */
+ if (!(inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ /*
+ * Remove flag if on the
+ * active open side
+ */
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAS_ABORTED;
+ }
+ }
+ so->so_state &= ~(SS_ISCONNECTING |
+ SS_ISDISCONNECTING |
+ SS_ISCONFIRMING |
+ SS_ISCONNECTED);
+ if (error == 0) {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) == 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOTCONN);
+ error = ENOTCONN;
+ } else {
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAS_CONNECTED;
+ }
+ }
+ goto out;
+ }
+ }
+ error = sbwait(&so->so_rcv);
+ if (error) {
+ goto out;
+ }
+ held_length = 0;
+ goto restart_nosblocks;
+ } else if (so->so_rcv.sb_cc == 0) {
+ if (so->so_error) {
+ error = so->so_error;
+ if ((in_flags & MSG_PEEK) == 0)
+ so->so_error = 0;
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0) {
+ /*
+ * For the active open side, clear the
+ * flags for re-use; the passive open side
+ * is blocked by connect.
+ */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED) {
+ /*
+ * You were aborted, passive
+ * side always hits here
+ */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ECONNRESET);
+ error = ECONNRESET;
+ /*
+ * You get this once if you
+ * are active open side
+ */
+ if (!(inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ /*
+ * Remove flag if on
+ * the active open
+ * side
+ */
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAS_ABORTED;
+ }
+ }
+ so->so_state &= ~(SS_ISCONNECTING |
+ SS_ISDISCONNECTING |
+ SS_ISCONFIRMING |
+ SS_ISCONNECTED);
+ if (error == 0) {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) == 0) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOTCONN);
+ error = ENOTCONN;
+ } else {
+ inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAS_CONNECTED;
+ }
+ }
+ goto out;
+ }
+ }
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EWOULDBLOCK);
+ error = EWOULDBLOCK;
+ }
+ goto out;
+ }
+ if (hold_sblock == 1) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ hold_sblock = 0;
+ }
+ /* we possibly have data we can read */
+ /* sa_ignore FREED_MEMORY */
+ control = TAILQ_FIRST(&inp->read_queue);
+ if (control == NULL) {
+ /*
+ * This could be happening since the appender did the
+ * increment but has not yet done the tailq insert onto the
+ * read_queue.
+ */
+ if (hold_rlock == 0) {
+ SCTP_INP_READ_LOCK(inp);
+ hold_rlock = 1;
+ }
+ control = TAILQ_FIRST(&inp->read_queue);
+ if ((control == NULL) && (so->so_rcv.sb_cc != 0)) {
+#ifdef INVARIANTS
+ panic("Huh, its non zero and nothing on control?");
+#endif
+ so->so_rcv.sb_cc = 0;
+ }
+ SCTP_INP_READ_UNLOCK(inp);
+ hold_rlock = 0;
+ goto restart;
+ }
+ if ((control->length == 0) &&
+ (control->do_not_ref_stcb)) {
+ /*
+ * Clean up code for freeing an assoc that left behind a
+ * pdapi... maybe a peer in EEOR mode that just closed after
+ * sending and never indicated an EOR.
+ */
+ if (hold_rlock == 0) {
+ hold_rlock = 1;
+ SCTP_INP_READ_LOCK(inp);
+ }
+ control->held_length = 0;
+ if (control->data) {
+ /* Hmm there is data here .. fix */
+ struct mbuf *m_tmp;
+ int cnt = 0;
+
+ m_tmp = control->data;
+ while (m_tmp) {
+ cnt += SCTP_BUF_LEN(m_tmp);
+ if (SCTP_BUF_NEXT(m_tmp) == NULL) {
+ control->tail_mbuf = m_tmp;
+ control->end_added = 1;
+ }
+ m_tmp = SCTP_BUF_NEXT(m_tmp);
+ }
+ control->length = cnt;
+ } else {
+ /* remove it */
+ TAILQ_REMOVE(&inp->read_queue, control, next);
+ /* Add back any hidden data */
+ sctp_free_remote_addr(control->whoFrom);
+ sctp_free_a_readq(stcb, control);
+ }
+ if (hold_rlock) {
+ hold_rlock = 0;
+ SCTP_INP_READ_UNLOCK(inp);
+ }
+ goto restart;
+ }
+ if ((control->length == 0) &&
+ (control->end_added == 1)) {
+ /*
+ * Do we also need to check for (control->pdapi_aborted ==
+ * 1)?
+ */
+ if (hold_rlock == 0) {
+ hold_rlock = 1;
+ SCTP_INP_READ_LOCK(inp);
+ }
+ TAILQ_REMOVE(&inp->read_queue, control, next);
+ if (control->data) {
+#ifdef INVARIANTS
+ panic("control->data not null but control->length == 0");
+#else
+ SCTP_PRINTF("Strange, data left in the control buffer. Cleaning up.\n");
+ sctp_m_freem(control->data);
+ control->data = NULL;
+#endif
+ }
+ if (control->aux_data) {
+ sctp_m_free(control->aux_data);
+ control->aux_data = NULL;
+ }
+ sctp_free_remote_addr(control->whoFrom);
+ sctp_free_a_readq(stcb, control);
+ if (hold_rlock) {
+ hold_rlock = 0;
+ SCTP_INP_READ_UNLOCK(inp);
+ }
+ goto restart;
+ }
+ if (control->length == 0) {
+ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) &&
+ (filling_sinfo)) {
+ /* find a more suitable one than this */
+ ctl = TAILQ_NEXT(control, next);
+ while (ctl) {
+ if ((ctl->stcb != control->stcb) && (ctl->length) &&
+ (ctl->some_taken ||
+ (ctl->spec_flags & M_NOTIFICATION) ||
+ ((ctl->do_not_ref_stcb == 0) &&
+ (ctl->stcb->asoc.strmin[ctl->sinfo_stream].delivery_started == 0)))
+ ) {
+ /*-
+ * If we have a different TCB next, and there is data
+ * present: if we have already taken some (pdapi), OR we can
+ * ref the tcb and no delivery has started on this stream, we
+ * take it. Note we allow a notification on a different
+ * assoc to be delivered.
+ */
+ control = ctl;
+ goto found_one;
+ } else if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS)) &&
+ (ctl->length) &&
+ ((ctl->some_taken) ||
+ ((ctl->do_not_ref_stcb == 0) &&
+ ((ctl->spec_flags & M_NOTIFICATION) == 0) &&
+ (ctl->stcb->asoc.strmin[ctl->sinfo_stream].delivery_started == 0)))) {
+ /*-
+ * If we have the same tcb, and there is data present, and we
+ * have the strm interleave feature present: then if we have
+ * taken some (pdapi) or we can refer to that tcb AND we have
+ * not started a delivery for this stream, we can take it.
+ * Note we do NOT allow a notification on the same assoc to
+ * be delivered.
+ */
+ control = ctl;
+ goto found_one;
+ }
+ ctl = TAILQ_NEXT(ctl, next);
+ }
+ }
+ /*
+ * if we reach here, no suitable replacement is available
+ * <or> fragment interleave is NOT on. So stuff the sb_cc
+ * into our held count, and it's time to sleep again.
+ */
+ held_length = so->so_rcv.sb_cc;
+ control->held_length = so->so_rcv.sb_cc;
+ goto restart;
+ }
+ /* Clear the held length since there is something to read */
+ control->held_length = 0;
+ if (hold_rlock) {
+ SCTP_INP_READ_UNLOCK(inp);
+ hold_rlock = 0;
+ }
+found_one:
+ /*
+ * If we reach here, control has some data for us to read off.
+ * Note that stcb COULD be NULL.
+ */
+ control->some_taken++;
+ if (hold_sblock) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ hold_sblock = 0;
+ }
+ stcb = control->stcb;
+ if (stcb) {
+ if ((control->do_not_ref_stcb == 0) &&
+ (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED)) {
+ if (freecnt_applied == 0)
+ stcb = NULL;
+ } else if (control->do_not_ref_stcb == 0) {
+ /* you can't free it on me please */
+ /*
+ * The lock on the socket buffer protects us so the
+ * free code will stop. But since we used the
+ * socketbuf lock and the sender uses the tcb_lock
+ * to increment, we need to use the atomic add to
+ * the refcnt
+ */
+ if (freecnt_applied) {
+#ifdef INVARIANTS
+ panic("refcnt already incremented");
+#else
+ printf("refcnt already incremented?\n");
+#endif
+ } else {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ freecnt_applied = 1;
+ }
+ /*
+ * Set up to remember how much we have not yet told
+ * the peer our rwnd has opened up. Note we grab the
+ * value from the tcb from last time. Note too that
+ * SACK sending clears this when a SACK is sent,
+ * which is fine. Once we hit rwnd_req, we then
+ * will go to sctp_user_rcvd(), which will not
+ * lock until it knows it MUST send a WUP-SACK
+ * (window update SACK).
+ */
+ freed_so_far = stcb->freed_by_sorcv_sincelast;
+ stcb->freed_by_sorcv_sincelast = 0;
+ }
+ }
+ if (stcb &&
+ ((control->spec_flags & M_NOTIFICATION) == 0) &&
+ control->do_not_ref_stcb == 0) {
+ stcb->asoc.strmin[control->sinfo_stream].delivery_started = 1;
+ }
+ /* First lets get off the sinfo and sockaddr info */
+ if ((sinfo) && filling_sinfo) {
+ memcpy(sinfo, control, sizeof(struct sctp_nonpad_sndrcvinfo));
+ nxt = TAILQ_NEXT(control, next);
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) {
+ struct sctp_extrcvinfo *s_extra;
+
+ s_extra = (struct sctp_extrcvinfo *)sinfo;
+ if ((nxt) &&
+ (nxt->length)) {
+ s_extra->sreinfo_next_flags = SCTP_NEXT_MSG_AVAIL;
+ if (nxt->sinfo_flags & SCTP_UNORDERED) {
+ s_extra->sreinfo_next_flags |= SCTP_NEXT_MSG_IS_UNORDERED;
+ }
+ if (nxt->spec_flags & M_NOTIFICATION) {
+ s_extra->sreinfo_next_flags |= SCTP_NEXT_MSG_IS_NOTIFICATION;
+ }
+ s_extra->sreinfo_next_aid = nxt->sinfo_assoc_id;
+ s_extra->sreinfo_next_length = nxt->length;
+ s_extra->sreinfo_next_ppid = nxt->sinfo_ppid;
+ s_extra->sreinfo_next_stream = nxt->sinfo_stream;
+ if (nxt->tail_mbuf != NULL) {
+ if (nxt->end_added) {
+ s_extra->sreinfo_next_flags |= SCTP_NEXT_MSG_ISCOMPLETE;
+ }
+ }
+ } else {
+ /*
+ * we explicitly zero these, since the memcpy
+ * picked up other things beyond the older
+ * sinfo_ fields that are in the control
+ * structure
+ */
+ nxt = NULL;
+ s_extra->sreinfo_next_flags = SCTP_NO_NEXT_MSG;
+ s_extra->sreinfo_next_aid = 0;
+ s_extra->sreinfo_next_length = 0;
+ s_extra->sreinfo_next_ppid = 0;
+ s_extra->sreinfo_next_stream = 0;
+ }
+ }
+ /*
+ * update off the real current cum-ack, if we have an stcb.
+ */
+ if ((control->do_not_ref_stcb == 0) && stcb)
+ sinfo->sinfo_cumtsn = stcb->asoc.cumulative_tsn;
+ /*
+ * mask off the high bits, we keep the actual chunk bits in
+ * there.
+ */
+ sinfo->sinfo_flags &= 0x00ff;
+ if ((control->sinfo_flags >> 8) & SCTP_DATA_UNORDERED) {
+ sinfo->sinfo_flags |= SCTP_UNORDERED;
+ }
+ }
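+ /*
+ * The block below (compiled when SCTP_ASOCLOG_OF_TSNS is defined)
+ * reserves a slot in the per-endpoint read log without taking a lock:
+ * the compare-and-swap loop claims the current readlog_index and
+ * advances it, wrapping at SCTP_READ_LOG_SIZE, and the claimed entry
+ * is then filled in from the control block being delivered.
+ */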
+#ifdef SCTP_ASOCLOG_OF_TSNS
+ {
+ int index, newindex;
+ struct sctp_pcbtsn_rlog *entry;
+
+ do {
+ index = inp->readlog_index;
+ newindex = index + 1;
+ if (newindex >= SCTP_READ_LOG_SIZE) {
+ newindex = 0;
+ }
+ } while (atomic_cmpset_int(&inp->readlog_index, index, newindex) == 0);
+ entry = &inp->readlog[index];
+ entry->vtag = control->sinfo_assoc_id;
+ entry->strm = control->sinfo_stream;
+ entry->seq = control->sinfo_ssn;
+ entry->sz = control->length;
+ entry->flgs = control->sinfo_flags;
+ }
+#endif
+ if (fromlen && from) {
+ struct sockaddr *to;
+
+#ifdef INET
+ cp_len = min((size_t)fromlen, (size_t)control->whoFrom->ro._l_addr.sin.sin_len);
+ memcpy(from, &control->whoFrom->ro._l_addr, cp_len);
+ ((struct sockaddr_in *)from)->sin_port = control->port_from;
+#else
+ /* No AF_INET use AF_INET6 */
+ cp_len = min((size_t)fromlen, (size_t)control->whoFrom->ro._l_addr.sin6.sin6_len);
+ memcpy(from, &control->whoFrom->ro._l_addr, cp_len);
+ ((struct sockaddr_in6 *)from)->sin6_port = control->port_from;
+#endif
+
+ to = from;
+#if defined(INET) && defined(INET6)
+ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) &&
+ (to->sa_family == AF_INET) &&
+ ((size_t)fromlen >= sizeof(struct sockaddr_in6))) {
+ struct sockaddr_in *sin;
+ struct sockaddr_in6 sin6;
+
+ sin = (struct sockaddr_in *)to;
+ bzero(&sin6, sizeof(sin6));
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_len = sizeof(struct sockaddr_in6);
+ sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
+ bcopy(&sin->sin_addr,
+ &sin6.sin6_addr.s6_addr32[3],
+ sizeof(sin6.sin6_addr.s6_addr32[3]));
+ sin6.sin6_port = sin->sin_port;
+ memcpy(from, (caddr_t)&sin6, sizeof(sin6));
+ }
+#endif
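+ /*
+ * Note: when SCTP_PCB_FLAGS_NEEDS_MAPPED_V4 is enabled, the AF_INET
+ * source address filled in above is rewritten as an IPv4-mapped IPv6
+ * address of the form ::ffff:a.b.c.d, provided the caller's buffer is
+ * large enough for a sockaddr_in6.
+ */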
+#if defined(INET6)
+ {
+ struct sockaddr_in6 lsa6, *to6;
+
+ to6 = (struct sockaddr_in6 *)to;
+ sctp_recover_scope_mac(to6, (&lsa6));
+ }
+#endif
+ }
+ /* now copy out what data we can */
+ if (mp == NULL) {
+ /* copy out each mbuf in the chain up to length */
+get_more_data:
+ m = control->data;
+ while (m) {
+ /* Move out all we can */
+ cp_len = (int)uio->uio_resid;
+ my_len = (int)SCTP_BUF_LEN(m);
+ if (cp_len > my_len) {
+ /* not enough in this buf */
+ cp_len = my_len;
+ }
+ if (hold_rlock) {
+ SCTP_INP_READ_UNLOCK(inp);
+ hold_rlock = 0;
+ }
+ if (cp_len > 0)
+ error = uiomove(mtod(m, char *), cp_len, uio);
+ /* re-read */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ goto release;
+ }
+ if ((control->do_not_ref_stcb == 0) && stcb &&
+ stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ no_rcv_needed = 1;
+ }
+ if (error) {
+ /* error we are out of here */
+ goto release;
+ }
+ if ((SCTP_BUF_NEXT(m) == NULL) &&
+ (cp_len >= SCTP_BUF_LEN(m)) &&
+ ((control->end_added == 0) ||
+ (control->end_added &&
+ (TAILQ_NEXT(control, next) == NULL)))
+ ) {
+ SCTP_INP_READ_LOCK(inp);
+ hold_rlock = 1;
+ }
+ if (cp_len == SCTP_BUF_LEN(m)) {
+ if ((SCTP_BUF_NEXT(m) == NULL) &&
+ (control->end_added)) {
+ out_flags |= MSG_EOR;
+ if ((control->do_not_ref_stcb == 0) &&
+ (control->stcb != NULL) &&
+ ((control->spec_flags & M_NOTIFICATION) == 0))
+ control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0;
+ }
+ if (control->spec_flags & M_NOTIFICATION) {
+ out_flags |= MSG_NOTIFICATION;
+ }
+ /* we ate up the mbuf */
+ if (in_flags & MSG_PEEK) {
+ /* just looking */
+ m = SCTP_BUF_NEXT(m);
+ copied_so_far += cp_len;
+ } else {
+ /* dispose of the mbuf */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(&so->so_rcv,
+ control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m));
+ }
+ sctp_sbfree(control, stcb, &so->so_rcv, m);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(&so->so_rcv,
+ control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
+ }
+ copied_so_far += cp_len;
+ freed_so_far += cp_len;
+ freed_so_far += MSIZE;
+ atomic_subtract_int(&control->length, cp_len);
+ control->data = sctp_m_free(m);
+ m = control->data;
+ /*
+ * we have been through it all; we must
+ * hold the sb lock, so it is OK to null
+ * the tail
+ */
+ if (control->data == NULL) {
+#ifdef INVARIANTS
+ if ((control->end_added == 0) ||
+ (TAILQ_NEXT(control, next) == NULL)) {
+ /*
+ * If the end is not
+ * added, OR the
+ * next is NOT null
+ * we MUST have the
+ * lock.
+ */
+ if (mtx_owned(&inp->inp_rdata_mtx) == 0) {
+ panic("Hmm we don't own the lock?");
+ }
+ }
+#endif
+ control->tail_mbuf = NULL;
+#ifdef INVARIANTS
+ if ((control->end_added) && ((out_flags & MSG_EOR) == 0)) {
+ panic("end_added, nothing left and no MSG_EOR");
+ }
+#endif
+ }
+ }
+ } else {
+ /* Do we need to trim the mbuf? */
+ if (control->spec_flags & M_NOTIFICATION) {
+ out_flags |= MSG_NOTIFICATION;
+ }
+ if ((in_flags & MSG_PEEK) == 0) {
+ SCTP_BUF_RESV_UF(m, cp_len);
+ SCTP_BUF_LEN(m) -= cp_len;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, cp_len);
+ }
+ atomic_subtract_int(&so->so_rcv.sb_cc, cp_len);
+ if ((control->do_not_ref_stcb == 0) &&
+ stcb) {
+ atomic_subtract_int(&stcb->asoc.sb_cc, cp_len);
+ }
+ copied_so_far += cp_len;
+ freed_so_far += cp_len;
+ freed_so_far += MSIZE;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb,
+ SCTP_LOG_SBRESULT, 0);
+ }
+ atomic_subtract_int(&control->length, cp_len);
+ } else {
+ copied_so_far += cp_len;
+ }
+ }
+ if ((out_flags & MSG_EOR) || (uio->uio_resid == 0)) {
+ break;
+ }
+ if (((stcb) && (in_flags & MSG_PEEK) == 0) &&
+ (control->do_not_ref_stcb == 0) &&
+ (freed_so_far >= rwnd_req)) {
+ sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req);
+ }
+ } /* end while(m) */
+ /*
+ * At this point we have looked at it all and we either have
+ * a MSG_EOR, or have read all the user wants... <OR>
+ * control->length == 0.
+ */
+ if ((out_flags & MSG_EOR) && ((in_flags & MSG_PEEK) == 0)) {
+ /* we are done with this control */
+ if (control->length == 0) {
+ if (control->data) {
+#ifdef INVARIANTS
+ panic("control->data not null at read eor?");
+#else
+ SCTP_PRINTF("Strange, data left in the control buffer .. invarients would panic?\n");
+ sctp_m_freem(control->data);
+ control->data = NULL;
+#endif
+ }
+ done_with_control:
+ if (TAILQ_NEXT(control, next) == NULL) {
+ /*
+ * If we don't have a next we need a
+ * lock; if there is a next, the
+ * interrupt is filling ahead of us
+ * and we don't need a lock to
+ * remove this one (which is the
+ * head of the queue).
+ */
+ if (hold_rlock == 0) {
+ SCTP_INP_READ_LOCK(inp);
+ hold_rlock = 1;
+ }
+ }
+ TAILQ_REMOVE(&inp->read_queue, control, next);
+ /* Add back any hidden data */
+ if (control->held_length) {
+ held_length = 0;
+ control->held_length = 0;
+ wakeup_read_socket = 1;
+ }
+ if (control->aux_data) {
+ sctp_m_free(control->aux_data);
+ control->aux_data = NULL;
+ }
+ no_rcv_needed = control->do_not_ref_stcb;
+ sctp_free_remote_addr(control->whoFrom);
+ control->data = NULL;
+ sctp_free_a_readq(stcb, control);
+ control = NULL;
+ if ((freed_so_far >= rwnd_req) &&
+ (no_rcv_needed == 0))
+ sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req);
+
+ } else {
+ /*
+ * The user did not read all of this
+ * message, turn off the returned MSG_EOR
+ * since we are leaving more behind on the
+ * control to read.
+ */
+#ifdef INVARIANTS
+ if (control->end_added &&
+ (control->data == NULL) &&
+ (control->tail_mbuf == NULL)) {
+ panic("Gak, control->length is corrupt?");
+ }
+#endif
+ no_rcv_needed = control->do_not_ref_stcb;
+ out_flags &= ~MSG_EOR;
+ }
+ }
+ if (out_flags & MSG_EOR) {
+ goto release;
+ }
+ if ((uio->uio_resid == 0) ||
+ ((in_eeor_mode) && (copied_so_far >= max(so->so_rcv.sb_lowat, 1)))
+ ) {
+ goto release;
+ }
+ /*
+ * If I hit here the receiver wants more and this message is
+ * NOT done (pd-api). So two questions. Can we block? if not
+ * we are done. Did the user NOT set MSG_WAITALL?
+ */
+ if (block_allowed == 0) {
+ goto release;
+ }
+ /*
+ * We need to wait for more data; a few things: - We don't
+ * sbunlock(), so we don't get someone else reading. - We
+ * must be sure to account for the case where what is added
+ * is NOT for our control when we wake up.
+ */
+
+ /*
+ * Do we need to tell the transport a rwnd update might be
+ * needed before we go to sleep?
+ */
+ if (((stcb) && (in_flags & MSG_PEEK) == 0) &&
+ ((freed_so_far >= rwnd_req) &&
+ (control->do_not_ref_stcb == 0) &&
+ (no_rcv_needed == 0))) {
+ sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req);
+ }
+wait_some_more:
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ goto release;
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)
+ goto release;
+
+ if (hold_rlock == 1) {
+ SCTP_INP_READ_UNLOCK(inp);
+ hold_rlock = 0;
+ }
+ if (hold_sblock == 0) {
+ SOCKBUF_LOCK(&so->so_rcv);
+ hold_sblock = 1;
+ }
+ if ((copied_so_far) && (control->length == 0) &&
+ (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE))) {
+ goto release;
+ }
+ if (so->so_rcv.sb_cc <= control->held_length) {
+ error = sbwait(&so->so_rcv);
+ if (error) {
+ goto release;
+ }
+ control->held_length = 0;
+ }
+ if (hold_sblock) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ hold_sblock = 0;
+ }
+ if (control->length == 0) {
+ /* still nothing here */
+ if (control->end_added == 1) {
+ /* he aborted, or is done, i.e. did a shutdown */
+ out_flags |= MSG_EOR;
+ if (control->pdapi_aborted) {
+ if ((control->do_not_ref_stcb == 0) && ((control->spec_flags & M_NOTIFICATION) == 0))
+ control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0;
+
+ out_flags |= MSG_TRUNC;
+ } else {
+ if ((control->do_not_ref_stcb == 0) && ((control->spec_flags & M_NOTIFICATION) == 0))
+ control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0;
+ }
+ goto done_with_control;
+ }
+ if (so->so_rcv.sb_cc > held_length) {
+ control->held_length = so->so_rcv.sb_cc;
+ held_length = 0;
+ }
+ goto wait_some_more;
+ } else if (control->data == NULL) {
+ /*
+ * we must re-sync since data is probably being
+ * added
+ */
+ SCTP_INP_READ_LOCK(inp);
+ if ((control->length > 0) && (control->data == NULL)) {
+ /*
+ * big trouble.. we have the lock and it's
+ * corrupt?
+ */
+#ifdef INVARIANTS
+ panic("Impossible data==NULL length !=0");
+#endif
+ out_flags |= MSG_EOR;
+ out_flags |= MSG_TRUNC;
+ control->length = 0;
+ SCTP_INP_READ_UNLOCK(inp);
+ goto done_with_control;
+ }
+ SCTP_INP_READ_UNLOCK(inp);
+ /* We will fall around to get more data */
+ }
+ goto get_more_data;
+ } else {
+ /*-
+ * Give caller back the mbuf chain,
+ * store in uio_resid the length
+ */
+ wakeup_read_socket = 0;
+ if ((control->end_added == 0) ||
+ (TAILQ_NEXT(control, next) == NULL)) {
+ /* Need to get rlock */
+ if (hold_rlock == 0) {
+ SCTP_INP_READ_LOCK(inp);
+ hold_rlock = 1;
+ }
+ }
+ if (control->end_added) {
+ out_flags |= MSG_EOR;
+ if ((control->do_not_ref_stcb == 0) && ((control->spec_flags & M_NOTIFICATION) == 0))
+ control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0;
+ }
+ if (control->spec_flags & M_NOTIFICATION) {
+ out_flags |= MSG_NOTIFICATION;
+ }
+ uio->uio_resid = control->length;
+ *mp = control->data;
+ m = control->data;
+ while (m) {
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(&so->so_rcv,
+ control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m));
+ }
+ sctp_sbfree(control, stcb, &so->so_rcv, m);
+ freed_so_far += SCTP_BUF_LEN(m);
+ freed_so_far += MSIZE;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
+ sctp_sblog(&so->so_rcv,
+ control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
+ }
+ m = SCTP_BUF_NEXT(m);
+ }
+ control->data = control->tail_mbuf = NULL;
+ control->length = 0;
+ if (out_flags & MSG_EOR) {
+ /* Done with this control */
+ goto done_with_control;
+ }
+ }
+release:
+ if (hold_rlock == 1) {
+ SCTP_INP_READ_UNLOCK(inp);
+ hold_rlock = 0;
+ }
+ if (hold_sblock == 1) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ hold_sblock = 0;
+ }
+ sbunlock(&so->so_rcv);
+ sockbuf_lock = 0;
+
+release_unlocked:
+ if (hold_sblock) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ hold_sblock = 0;
+ }
+ if ((stcb) && (in_flags & MSG_PEEK) == 0) {
+ if ((freed_so_far >= rwnd_req) &&
+ (control && (control->do_not_ref_stcb == 0)) &&
+ (no_rcv_needed == 0))
+ sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req);
+ }
+out:
+ if (msg_flags) {
+ *msg_flags = out_flags;
+ }
+ if (((out_flags & MSG_EOR) == 0) &&
+ ((in_flags & MSG_PEEK) == 0) &&
+ (sinfo) &&
+ (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO))) {
+ struct sctp_extrcvinfo *s_extra;
+
+ s_extra = (struct sctp_extrcvinfo *)sinfo;
+ s_extra->sreinfo_next_flags = SCTP_NO_NEXT_MSG;
+ }
+ if (hold_rlock == 1) {
+ SCTP_INP_READ_UNLOCK(inp);
+ hold_rlock = 0;
+ }
+ if (hold_sblock) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ hold_sblock = 0;
+ }
+ if (sockbuf_lock) {
+ sbunlock(&so->so_rcv);
+ }
+ if (freecnt_applied) {
+ /*
+ * The lock on the socket buffer protects us so the free
+ * code will stop. But since we used the socketbuf lock and
+ * the sender uses the tcb_lock to increment, we need to use
+ * the atomic add to the refcnt.
+ */
+ if (stcb == NULL) {
+#ifdef INVARIANTS
+ panic("stcb for refcnt has gone NULL?");
+ goto stage_left;
+#else
+ goto stage_left;
+#endif
+ }
+ atomic_add_int(&stcb->asoc.refcnt, -1);
+ freecnt_applied = 0;
+ /* Save the value back for next time */
+ stcb->freed_by_sorcv_sincelast = freed_so_far;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) {
+ if (stcb) {
+ sctp_misc_ints(SCTP_SORECV_DONE,
+ freed_so_far,
+ ((uio) ? (slen - uio->uio_resid) : slen),
+ stcb->asoc.my_rwnd,
+ so->so_rcv.sb_cc);
+ } else {
+ sctp_misc_ints(SCTP_SORECV_DONE,
+ freed_so_far,
+ ((uio) ? (slen - uio->uio_resid) : slen),
+ 0,
+ so->so_rcv.sb_cc);
+ }
+ }
+stage_left:
+ if (wakeup_read_socket) {
+ sctp_sorwakeup(inp, so);
+ }
+ return (error);
+}
+
+
+#ifdef SCTP_MBUF_LOGGING
+struct mbuf *
+sctp_m_free(struct mbuf *m)
+{
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ if (SCTP_BUF_IS_EXTENDED(m)) {
+ sctp_log_mb(m, SCTP_MBUF_IFREE);
+ }
+ }
+ return (m_free(m));
+}
+
+void
+sctp_m_freem(struct mbuf *mb)
+{
+ while (mb != NULL)
+ mb = sctp_m_free(mb);
+}
+
+#endif
+
+int
+sctp_dynamic_set_primary(struct sockaddr *sa, uint32_t vrf_id)
+{
+ /*
+ * Given a local address, for all associations that hold the
+ * address, request a peer-set-primary.
+ */
+ struct sctp_ifa *ifa;
+ struct sctp_laddr *wi;
+
+ ifa = sctp_find_ifa_by_addr(sa, vrf_id, 0);
+ if (ifa == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EADDRNOTAVAIL);
+ return (EADDRNOTAVAIL);
+ }
+ /*
+ * Now that we have the ifa we must awaken the iterator with this
+ * message.
+ */
+ wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
+ if (wi == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOMEM);
+ return (ENOMEM);
+ }
+ /* Now incr the count and init the wi structure */
+ SCTP_INCR_LADDR_COUNT();
+ bzero(wi, sizeof(*wi));
+ (void)SCTP_GETTIME_TIMEVAL(&wi->start_time);
+ wi->ifa = ifa;
+ wi->action = SCTP_SET_PRIM_ADDR;
+ atomic_add_int(&ifa->refcount, 1);
+
+ /* Now add it to the work queue */
+ SCTP_WQ_ADDR_LOCK();
+ /*
+ * Should this really be a tailq? As it is we will process the
+ * newest first :-0
+ */
+ LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr);
+ SCTP_WQ_ADDR_UNLOCK();
+ sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ,
+ (struct sctp_inpcb *)NULL,
+ (struct sctp_tcb *)NULL,
+ (struct sctp_nets *)NULL);
+ return (0);
+}
+
+
+int
+sctp_soreceive(struct socket *so,
+ struct sockaddr **psa,
+ struct uio *uio,
+ struct mbuf **mp0,
+ struct mbuf **controlp,
+ int *flagsp)
+{
+ int error, fromlen;
+ uint8_t sockbuf[256];
+ struct sockaddr *from;
+ struct sctp_extrcvinfo sinfo;
+ int filling_sinfo = 1;
+ struct sctp_inpcb *inp;
+
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ /* pick up the assoc we are reading from */
+ if (inp == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ return (EINVAL);
+ }
+ if ((sctp_is_feature_off(inp,
+ SCTP_PCB_FLAGS_RECVDATAIOEVNT)) ||
+ (controlp == NULL)) {
+ /* user does not want the sndrcv ctl */
+ filling_sinfo = 0;
+ }
+ if (psa) {
+ from = (struct sockaddr *)sockbuf;
+ fromlen = sizeof(sockbuf);
+ from->sa_len = 0;
+ } else {
+ from = NULL;
+ fromlen = 0;
+ }
+
+ error = sctp_sorecvmsg(so, uio, mp0, from, fromlen, flagsp,
+ (struct sctp_sndrcvinfo *)&sinfo, filling_sinfo);
+ if ((controlp) && (filling_sinfo)) {
+ /* copy back the sinfo in a CMSG format */
+ if (filling_sinfo)
+ *controlp = sctp_build_ctl_nchunk(inp,
+ (struct sctp_sndrcvinfo *)&sinfo);
+ else
+ *controlp = NULL;
+ }
+ if (psa) {
+ /* copy back the address info */
+ if (from && from->sa_len) {
+ *psa = sodupsockaddr(from, M_NOWAIT);
+ } else {
+ *psa = NULL;
+ }
+ }
+ return (error);
+}
+
+
+int
+sctp_l_soreceive(struct socket *so,
+ struct sockaddr **name,
+ struct uio *uio,
+ char **controlp,
+ int *controllen,
+ int *flag)
+{
+ int error, fromlen;
+ uint8_t sockbuf[256];
+ struct sockaddr *from;
+ struct sctp_extrcvinfo sinfo;
+ int filling_sinfo = 1;
+ struct sctp_inpcb *inp;
+
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ /* pick up the assoc we are reading from */
+ if (inp == NULL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ return (EINVAL);
+ }
+ if ((sctp_is_feature_off(inp,
+ SCTP_PCB_FLAGS_RECVDATAIOEVNT)) ||
+ (controlp == NULL)) {
+ /* user does not want the sndrcv ctl */
+ filling_sinfo = 0;
+ }
+ if (name) {
+ from = (struct sockaddr *)sockbuf;
+ fromlen = sizeof(sockbuf);
+ from->sa_len = 0;
+ } else {
+ from = NULL;
+ fromlen = 0;
+ }
+
+ error = sctp_sorecvmsg(so, uio,
+ (struct mbuf **)NULL,
+ from, fromlen, flag,
+ (struct sctp_sndrcvinfo *)&sinfo,
+ filling_sinfo);
+ if ((controlp) && (filling_sinfo)) {
+ /*
+ * copy back the sinfo in CMSG format; note that the caller
+ * has responsibility for freeing the memory.
+ */
+ if (filling_sinfo)
+ *controlp = sctp_build_ctl_cchunk(inp,
+ controllen,
+ (struct sctp_sndrcvinfo *)&sinfo);
+ }
+ if (name) {
+ /* copy back the address info */
+ if (from && from->sa_len) {
+ *name = sodupsockaddr(from, M_WAIT);
+ } else {
+ *name = NULL;
+ }
+ }
+ return (error);
+}
+
+
+
+
+
+
+
+int
+sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr,
+ int totaddr, int *error)
+{
+ int added = 0;
+ int i;
+ struct sctp_inpcb *inp;
+ struct sockaddr *sa;
+ size_t incr = 0;
+
+ sa = addr;
+ inp = stcb->sctp_ep;
+ *error = 0;
+ for (i = 0; i < totaddr; i++) {
+ if (sa->sa_family == AF_INET) {
+ incr = sizeof(struct sockaddr_in);
+ if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) {
+ /* assoc gone no un-lock */
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7);
+ *error = ENOBUFS;
+ goto out_now;
+ }
+ added++;
+ } else if (sa->sa_family == AF_INET6) {
+ incr = sizeof(struct sockaddr_in6);
+ if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) {
+ /* assoc gone no un-lock */
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_8);
+ *error = ENOBUFS;
+ goto out_now;
+ }
+ added++;
+ }
+ sa = (struct sockaddr *)((caddr_t)sa + incr);
+ }
+out_now:
+ return (added);
+}
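+/*
+ * Illustrative sketch (assumption, not part of the upstream source):
+ * the addr argument of the connectx helpers is a packed array of
+ * sockaddrs laid out back to back, e.g.
+ *
+ *	struct sockaddr_in v4;
+ *	struct sockaddr_in6 v6;
+ *	char packed[sizeof(v4) + sizeof(v6)];
+ *
+ *	memcpy(packed, &v4, sizeof(v4));
+ *	memcpy(packed + sizeof(v4), &v6, sizeof(v6));
+ *
+ * with v4.sin_family = AF_INET and v6.sin6_family = AF_INET6 filled in
+ * first; that is why the walk advances sa by sizeof(struct sockaddr_in)
+ * or sizeof(struct sockaddr_in6) depending on sa_family.
+ */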
+
+struct sctp_tcb *
+sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr,
+ int *totaddr, int *num_v4, int *num_v6, int *error,
+ int limit, int *bad_addr)
+{
+ struct sockaddr *sa;
+ struct sctp_tcb *stcb = NULL;
+ size_t incr, at, i;
+
+ at = incr = 0;
+ sa = addr;
+ *error = *num_v6 = *num_v4 = 0;
+ /* account and validate addresses */
+ for (i = 0; i < (size_t)*totaddr; i++) {
+ if (sa->sa_family == AF_INET) {
+ (*num_v4) += 1;
+ incr = sizeof(struct sockaddr_in);
+ if (sa->sa_len != incr) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ *bad_addr = 1;
+ return (NULL);
+ }
+ } else if (sa->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)sa;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ /* Must be non-mapped for connectx */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ *bad_addr = 1;
+ return (NULL);
+ }
+ (*num_v6) += 1;
+ incr = sizeof(struct sockaddr_in6);
+ if (sa->sa_len != incr) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ *bad_addr = 1;
+ return (NULL);
+ }
+ } else {
+ *totaddr = i;
+ /* we are done */
+ break;
+ }
+ SCTP_INP_INCR_REF(inp);
+ stcb = sctp_findassociation_ep_addr(&inp, sa, NULL, NULL, NULL);
+ if (stcb != NULL) {
+ /* Already have or am bringing up an association */
+ return (stcb);
+ } else {
+ SCTP_INP_DECR_REF(inp);
+ }
+ if ((at + incr) > (size_t)limit) {
+ *totaddr = i;
+ break;
+ }
+ sa = (struct sockaddr *)((caddr_t)sa + incr);
+ }
+ return ((struct sctp_tcb *)NULL);
+}
+
+/*
+ * sctp_bindx(ADD) for one address.
+ * assumes all arguments are valid/checked by caller.
+ */
+void
+sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp,
+ struct sockaddr *sa, sctp_assoc_t assoc_id,
+ uint32_t vrf_id, int *error, void *p)
+{
+ struct sockaddr *addr_touse;
+
+#ifdef INET6
+ struct sockaddr_in sin;
+
+#endif
+
+ /* see if we're bound all already! */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ addr_touse = sa;
+#if defined(INET6) && !defined(__Userspace__) /* TODO port in6_sin6_2_sin */
+ if (sa->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ if (sa->sa_len != sizeof(struct sockaddr_in6)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) {
+ /* can only bind v6 on PF_INET6 sockets */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ sin6 = (struct sockaddr_in6 *)addr_touse;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+ SCTP_IPV6_V6ONLY(inp)) {
+ /* can't bind v4-mapped on PF_INET sockets */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ in6_sin6_2_sin(&sin, sin6);
+ addr_touse = (struct sockaddr *)&sin;
+ }
+ }
+#endif
+ if (sa->sa_family == AF_INET) {
+ if (sa->sa_len != sizeof(struct sockaddr_in)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+ SCTP_IPV6_V6ONLY(inp)) {
+ /* can't bind v4 on PF_INET sockets */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) {
+ if (p == NULL) {
+ /* Can't get proc for Net/Open BSD */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ *error = sctp_inpcb_bind(so, addr_touse, NULL, p);
+ return;
+ }
+ /*
+ * No locks required here since bind and mgmt_ep_sa all do their own
+ * locking. If we do something for the FIX: below we may need to
+ * lock in that case.
+ */
+ if (assoc_id == 0) {
+ /* add the address */
+ struct sctp_inpcb *lep;
+ struct sockaddr_in *lsin = (struct sockaddr_in *)addr_touse;
+
+ /* validate the incoming port */
+ if ((lsin->sin_port != 0) &&
+ (lsin->sin_port != inp->sctp_lport)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ } else {
+ /* user specified 0 port, set it to existing port */
+ lsin->sin_port = inp->sctp_lport;
+ }
+
+ lep = sctp_pcb_findep(addr_touse, 1, 0, vrf_id);
+ if (lep != NULL) {
+ /*
+ * We must decrement the refcount since we have the
+ * ep already and are binding. No remove going on
+ * here.
+ */
+ SCTP_INP_DECR_REF(lep);
+ }
+ if (lep == inp) {
+ /* already bound to it.. ok */
+ return;
+ } else if (lep == NULL) {
+ ((struct sockaddr_in *)addr_touse)->sin_port = 0;
+ *error = sctp_addr_mgmt_ep_sa(inp, addr_touse,
+ SCTP_ADD_IP_ADDRESS,
+ vrf_id, NULL);
+ } else {
+ *error = EADDRINUSE;
+ }
+ if (*error)
+ return;
+ } else {
+ /*
+ * FIX: decide whether we allow assoc based bindx
+ */
+ }
+}
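+/*
+ * Illustrative usage (sketch under assumed values, not part of the
+ * upstream source): a userland sctp_bindx(3) ADD request ends up here,
+ * e.g.
+ *
+ *	struct sockaddr_in addr;
+ *
+ *	memset(&addr, 0, sizeof(addr));
+ *	addr.sin_family = AF_INET;
+ *	addr.sin_len = sizeof(addr);
+ *	addr.sin_port = htons(5001);
+ *	inet_pton(AF_INET, "192.0.2.1", &addr.sin_addr);
+ *	(void)sctp_bindx(sd, (struct sockaddr *)&addr, 1,
+ *	    SCTP_BINDX_ADD_ADDR);
+ *
+ * with assoc_id == 0 meaning "apply to the endpoint", as handled above
+ * (the address and port are made up for the example).
+ */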
+
+/*
+ * sctp_bindx(DELETE) for one address.
+ * assumes all arguments are valid/checked by caller.
+ */
+void
+sctp_bindx_delete_address(struct socket *so, struct sctp_inpcb *inp,
+ struct sockaddr *sa, sctp_assoc_t assoc_id,
+ uint32_t vrf_id, int *error)
+{
+ struct sockaddr *addr_touse;
+
+#ifdef INET6
+ struct sockaddr_in sin;
+
+#endif
+
+ /* see if we're bound all already! */
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ addr_touse = sa;
+#if defined(INET6) && !defined(__Userspace__) /* TODO port in6_sin6_2_sin */
+ if (sa->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ if (sa->sa_len != sizeof(struct sockaddr_in6)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) {
+ /* can only bind v6 on PF_INET6 sockets */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ sin6 = (struct sockaddr_in6 *)addr_touse;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+ SCTP_IPV6_V6ONLY(inp)) {
+ /* can't bind mapped-v4 on PF_INET sockets */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ in6_sin6_2_sin(&sin, sin6);
+ addr_touse = (struct sockaddr *)&sin;
+ }
+ }
+#endif
+ if (sa->sa_family == AF_INET) {
+ if (sa->sa_len != sizeof(struct sockaddr_in)) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+ SCTP_IPV6_V6ONLY(inp)) {
+ /* can't bind v4 on PF_INET sockets */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+ *error = EINVAL;
+ return;
+ }
+ }
+ /*
+ * No lock required mgmt_ep_sa does its own locking. If the FIX:
+ * below is ever changed we may need to lock before calling
+ * association level binding.
+ */
+ if (assoc_id == 0) {
+ /* delete the address */
+ *error = sctp_addr_mgmt_ep_sa(inp, addr_touse,
+ SCTP_DEL_IP_ADDRESS,
+ vrf_id, NULL);
+ } else {
+ /*
+ * FIX: decide whether we allow assoc based bindx
+ */
+ }
+}
+
+/*
+ * returns the valid local address count for an assoc, taking into account
+ * all scoping rules
+ */
+int
+sctp_local_addr_count(struct sctp_tcb *stcb)
+{
+ int loopback_scope, ipv4_local_scope, local_scope, site_scope;
+ int ipv4_addr_legal, ipv6_addr_legal;
+ struct sctp_vrf *vrf;
+ struct sctp_ifn *sctp_ifn;
+ struct sctp_ifa *sctp_ifa;
+ int count = 0;
+
+ /* Turn on all the appropriate scopes */
+ loopback_scope = stcb->asoc.loopback_scope;
+ ipv4_local_scope = stcb->asoc.ipv4_local_scope;
+ local_scope = stcb->asoc.local_scope;
+ site_scope = stcb->asoc.site_scope;
+ ipv4_addr_legal = ipv6_addr_legal = 0;
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ ipv6_addr_legal = 1;
+ if (SCTP_IPV6_V6ONLY(stcb->sctp_ep) == 0) {
+ ipv4_addr_legal = 1;
+ }
+ } else {
+ ipv4_addr_legal = 1;
+ }
+
+ SCTP_IPI_ADDR_RLOCK();
+ vrf = sctp_find_vrf(stcb->asoc.vrf_id);
+ if (vrf == NULL) {
+ /* no vrf, no addresses */
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (0);
+ }
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ /*
+ * bound all case: go through all ifns on the vrf
+ */
+ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
+ if ((loopback_scope == 0) &&
+ SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
+ continue;
+ }
+ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+ if (sctp_is_addr_restricted(stcb, sctp_ifa))
+ continue;
+ switch (sctp_ifa->address.sa.sa_family) {
+ case AF_INET:
+ if (ipv4_addr_legal) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&sctp_ifa->address.sa;
+ if (sin->sin_addr.s_addr == 0) {
+ /*
+ * skip unspecified
+ * addrs
+ */
+ continue;
+ }
+ if ((ipv4_local_scope == 0) &&
+ (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) {
+ continue;
+ }
+ /* count this one */
+ count++;
+ } else {
+ continue;
+ }
+ break;
+#ifdef INET6
+ case AF_INET6:
+ if (ipv6_addr_legal) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa;
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+ continue;
+ }
+ if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+ if (local_scope == 0)
+ continue;
+ if (sin6->sin6_scope_id == 0) {
+ if (sa6_recoverscope(sin6) != 0)
+ /* bad link local address */
+ continue;
+ }
+ }
+ if ((site_scope == 0) &&
+ (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) {
+ continue;
+ }
+ /* count this one */
+ count++;
+ }
+ break;
+#endif
+ default:
+ /* TSNH */
+ break;
+ }
+ }
+ }
+ } else {
+ /*
+ * subset bound case
+ */
+ struct sctp_laddr *laddr;
+
+ LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list,
+ sctp_nxt_addr) {
+ if (sctp_is_addr_restricted(stcb, laddr->ifa)) {
+ continue;
+ }
+ /* count this one */
+ count++;
+ }
+ }
+ SCTP_IPI_ADDR_RUNLOCK();
+ return (count);
+}
+
+#if defined(SCTP_LOCAL_TRACE_BUF)
+
+void
+sctp_log_trace(uint32_t subsys, const char *str SCTP_UNUSED, uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t f)
+{
+ uint32_t saveindex, newindex;
+
+ do {
+ saveindex = SCTP_BASE_SYSCTL(sctp_log).index;
+ if (saveindex >= SCTP_MAX_LOGGING_SIZE) {
+ newindex = 1;
+ } else {
+ newindex = saveindex + 1;
+ }
+ } while (atomic_cmpset_int(&SCTP_BASE_SYSCTL(sctp_log).index, saveindex, newindex) == 0);
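+ /*
+ * The compare-and-swap loop above reserves a trace slot without a
+ * lock: each caller claims the current index and advances it,
+ * restarting at 1 once SCTP_MAX_LOGGING_SIZE is reached; a wrapped
+ * claim is redirected to slot 0 by the adjustment below.
+ */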
+ if (saveindex >= SCTP_MAX_LOGGING_SIZE) {
+ saveindex = 0;
+ }
+ SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].timestamp = SCTP_GET_CYCLECOUNT;
+ SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].subsys = subsys;
+ SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[0] = a;
+ SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[1] = b;
+ SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[2] = c;
+ SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[3] = d;
+ SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[4] = e;
+ SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[5] = f;
+}
+
+#endif
+/* We will need to add support
+ * to bind the ports and such here
+ * so we can do UDP tunneling. In
+ * the meantime, we return an error.
+ */
+#include <freebsd/netinet/udp.h>
+#include <freebsd/netinet/udp_var.h>
+#include <freebsd/sys/proc.h>
+#ifdef INET6
+#include <freebsd/netinet6/sctp6_var.h>
+#endif
+
+static void
+sctp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *ignored)
+{
+ struct ip *iph;
+ struct mbuf *sp, *last;
+ struct udphdr *uhdr;
+ uint16_t port = 0, len;
+ int header_size = sizeof(struct udphdr) + sizeof(struct sctphdr);
+
+ /*
+ * Split out the mbuf chain. Leave the IP header in m, place the
+ * rest in sp.
+ */
+ if ((m->m_flags & M_PKTHDR) == 0) {
+ /* Can't handle one that is not a pkt hdr */
+ goto out;
+ }
+ /* pull the src port */
+ iph = mtod(m, struct ip *);
+ uhdr = (struct udphdr *)((caddr_t)iph + off);
+
+ port = uhdr->uh_sport;
+ sp = m_split(m, off, M_DONTWAIT);
+ if (sp == NULL) {
+ /* Gak, drop packet, we can't do a split */
+ goto out;
+ }
+ if (sp->m_pkthdr.len < header_size) {
+ /* Gak, packet can't have an SCTP header in it - too small */
+ m_freem(sp);
+ goto out;
+ }
+ /* ok now pull up the UDP header and SCTP header together */
+ sp = m_pullup(sp, header_size);
+ if (sp == NULL) {
+ /* Gak pullup failed */
+ goto out;
+ }
+ /* trim out the UDP header */
+ m_adj(sp, sizeof(struct udphdr));
+
+ /* Now reconstruct the mbuf chain */
+ /* 1) find last one */
+ last = m;
+ while (last->m_next != NULL) {
+ last = last->m_next;
+ }
+ last->m_next = sp;
+ m->m_pkthdr.len += sp->m_pkthdr.len;
+ last = m;
+ while (last != NULL) {
+ last = last->m_next;
+ }
+ /* Now it's ready for sctp_input or sctp6_input */
+ iph = mtod(m, struct ip *);
+ switch (iph->ip_v) {
+ case IPVERSION:
+ {
+ /* it's IPv4 */
+ len = SCTP_GET_IPV4_LENGTH(iph);
+ len -= sizeof(struct udphdr);
+ SCTP_GET_IPV4_LENGTH(iph) = len;
+ sctp_input_with_port(m, off, port);
+ break;
+ }
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ {
+ /* it's IPv6 - NOT supported */
+ goto out;
+ break;
+
+ }
+#endif
+ default:
+ {
+ m_freem(m);
+ break;
+ }
+ }
+ return;
+out:
+ m_freem(m);
+}
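+/*
+ * The handler above undoes the UDP encapsulation of a tunneled SCTP
+ * packet laid out as [IP][UDP][SCTP common header][chunks]: it splits
+ * the chain after the IP header, strips the UDP header, glues the
+ * remainder back onto the IP-header mbuf, shortens the recorded IPv4
+ * length by the size of the UDP header, and hands the result to
+ * sctp_input_with_port() along with the UDP source port.
+ */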
+
+void
+sctp_over_udp_stop(void)
+{
+ struct socket *sop;
+
+ /*
+ * This function assumes sysctl caller holds sctp_sysctl_info_lock()
+ * for writing!
+ */
+ if (SCTP_BASE_INFO(udp_tun_socket) == NULL) {
+ /* Nothing to do */
+ return;
+ }
+ sop = SCTP_BASE_INFO(udp_tun_socket);
+ soclose(sop);
+ SCTP_BASE_INFO(udp_tun_socket) = NULL;
+}
+
+int
+sctp_over_udp_start(void)
+{
+ uint16_t port;
+ int ret;
+ struct sockaddr_in sin;
+ struct socket *sop = NULL;
+ struct thread *th;
+ struct ucred *cred;
+
+ /*
+ * This function assumes sysctl caller holds sctp_sysctl_info_lock()
+ * for writing!
+ */
+ port = SCTP_BASE_SYSCTL(sctp_udp_tunneling_port);
+ if (port == 0) {
+ /* Must have a port set */
+ return (EINVAL);
+ }
+ if (SCTP_BASE_INFO(udp_tun_socket) != NULL) {
+ /* Already running -- must stop first */
+ return (EALREADY);
+ }
+ th = curthread;
+ cred = th->td_ucred;
+ if ((ret = socreate(PF_INET, &sop,
+ SOCK_DGRAM, IPPROTO_UDP, cred, th))) {
+ return (ret);
+ }
+ SCTP_BASE_INFO(udp_tun_socket) = sop;
+ /* call the special UDP hook */
+ ret = udp_set_kernel_tunneling(sop, sctp_recv_udp_tunneled_packet);
+ if (ret) {
+ goto exit_stage_left;
+ }
+ /* Ok we have a socket, bind it to the port */
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_port = htons(port);
+ ret = sobind(sop, (struct sockaddr *)&sin, th);
+ if (ret) {
+ /* Close up, we can't get the port */
+exit_stage_left:
+ sctp_over_udp_stop();
+ return (ret);
+ }
+ /*
+ * Ok we should now get UDP packets directly to our input routine
+ * sctp_recv_udp_tunneled_packet().
+ */
+ return (0);
+}
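+/*
+ * Illustrative note (assumption, not spelled out in this file): the
+ * start/stop pair above is intended to be driven by the sysctl handler
+ * for sctp_udp_tunneling_port, roughly
+ *
+ *	sysctl net.inet.sctp.udp_tunneling_port=9899
+ *
+ * (sysctl name and port chosen for illustration), which would stop any
+ * existing tunneling socket and start a new one bound to the given UDP
+ * port.
+ */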
diff --git a/freebsd/sys/netinet/sctputil.h b/freebsd/sys/netinet/sctputil.h
new file mode 100644
index 00000000..b1bee3a4
--- /dev/null
+++ b/freebsd/sys/netinet/sctputil.h
@@ -0,0 +1,392 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+/* $KAME: sctputil.h,v 1.15 2005/03/06 16:04:19 itojun Exp $ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#ifndef __sctputil_h__
+#define __sctputil_h__
+
+
+#if defined(_KERNEL) || defined(__Userspace__)
+
+#define SCTP_READ_LOCK_HELD 1
+#define SCTP_READ_LOCK_NOT_HELD 0
+
+#ifdef SCTP_ASOCLOG_OF_TSNS
+void sctp_print_out_track_log(struct sctp_tcb *stcb);
+
+#endif
+
+#ifdef SCTP_MBUF_LOGGING
+struct mbuf *sctp_m_free(struct mbuf *m);
+void sctp_m_freem(struct mbuf *m);
+
+#else
+#define sctp_m_free m_free
+#define sctp_m_freem m_freem
+#endif
+
+#if defined(SCTP_LOCAL_TRACE_BUF) || defined(__APPLE__)
+void
+ sctp_log_trace(uint32_t fr, const char *str SCTP_UNUSED, uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t f);
+
+#endif
+
+#define sctp_get_associd(stcb) ((sctp_assoc_t)stcb->asoc.assoc_id)
+
+
+/*
+ * Function prototypes
+ */
+uint32_t
+sctp_get_ifa_hash_val(struct sockaddr *addr);
+
+struct sctp_ifa *
+ sctp_find_ifa_in_ep(struct sctp_inpcb *inp, struct sockaddr *addr, int hold_lock);
+
+struct sctp_ifa *
+ sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock);
+
+uint32_t sctp_select_initial_TSN(struct sctp_pcb *);
+
+uint32_t sctp_select_a_tag(struct sctp_inpcb *, uint16_t lport, uint16_t rport, int);
+
+int sctp_init_asoc(struct sctp_inpcb *, struct sctp_tcb *, uint32_t, uint32_t);
+
+void sctp_fill_random_store(struct sctp_pcb *);
+
+void
+sctp_timer_start(int, struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *);
+
+void
+sctp_timer_stop(int, struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *, uint32_t);
+
+int
+ sctp_dynamic_set_primary(struct sockaddr *sa, uint32_t vrf_id);
+
+void
+ sctp_mtu_size_reset(struct sctp_inpcb *, struct sctp_association *, uint32_t);
+
+void
+sctp_add_to_readq(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_queued_to_read *control,
+ struct sockbuf *sb,
+ int end,
+ int inpread_locked,
+ int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+);
+
+int
+sctp_append_to_readq(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ struct sctp_queued_to_read *control,
+ struct mbuf *m,
+ int end,
+ int new_cumack,
+ struct sockbuf *sb);
+
+
+void sctp_iterator_worker(void);
+
+uint32_t sctp_get_prev_mtu(uint32_t);
+uint32_t sctp_get_next_mtu(struct sctp_inpcb *, uint32_t);
+
+void
+ sctp_timeout_handler(void *);
+
+uint32_t
+sctp_calculate_rto(struct sctp_tcb *, struct sctp_association *,
+ struct sctp_nets *, struct timeval *, int);
+
+uint32_t sctp_calculate_len(struct mbuf *);
+
+caddr_t sctp_m_getptr(struct mbuf *, int, int, uint8_t *);
+
+struct sctp_paramhdr *
+sctp_get_next_param(struct mbuf *, int,
+ struct sctp_paramhdr *, int);
+
+int sctp_add_pad_tombuf(struct mbuf *, int);
+
+int sctp_pad_lastmbuf(struct mbuf *, int, struct mbuf *);
+
+void
+sctp_ulp_notify(uint32_t, struct sctp_tcb *, uint32_t, void *, int
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+);
+
+void
+sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp,
+ struct sctp_inpcb *new_inp,
+ struct sctp_tcb *stcb, int waitflags);
+
+
+void sctp_stop_timers_for_shutdown(struct sctp_tcb *);
+
+void
+sctp_report_all_outbound(struct sctp_tcb *, int, int
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+);
+
+int sctp_expand_mapping_array(struct sctp_association *, uint32_t);
+
+void
+sctp_abort_notification(struct sctp_tcb *, int, int
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+);
+
+/* We abort responding to an IP packet for some reason */
+void
+sctp_abort_association(struct sctp_inpcb *, struct sctp_tcb *,
+ struct mbuf *, int, struct sctphdr *, struct mbuf *, uint32_t, uint16_t);
+
+
+/* We choose to abort via user input */
+void
+sctp_abort_an_association(struct sctp_inpcb *, struct sctp_tcb *, int,
+ struct mbuf *, int
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+);
+
+void
+sctp_handle_ootb(struct mbuf *, int, int, struct sctphdr *,
+ struct sctp_inpcb *, struct mbuf *, uint32_t, uint16_t);
+
+int
+sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr,
+ int totaddr, int *error);
+
+struct sctp_tcb *
+sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr,
+ int *totaddr, int *num_v4, int *num_v6, int *error, int limit, int *bad_addr);
+
+int sctp_is_there_an_abort_here(struct mbuf *, int, uint32_t *);
+
+#ifdef INET6
+uint32_t sctp_is_same_scope(struct sockaddr_in6 *, struct sockaddr_in6 *);
+
+struct sockaddr_in6 *
+ sctp_recover_scope(struct sockaddr_in6 *, struct sockaddr_in6 *);
+
+#define sctp_recover_scope_mac(addr, store) do { \
+ if ((addr->sin6_family == AF_INET6) && \
+ (IN6_IS_SCOPE_LINKLOCAL(&addr->sin6_addr))) { \
+ *store = *addr; \
+ if (addr->sin6_scope_id == 0) { \
+ if (!sa6_recoverscope(store)) { \
+ addr = store; \
+ } \
+ } else { \
+ in6_clearscope(&addr->sin6_addr); \
+ addr = store; \
+ } \
+ } \
+} while (0)
+#endif
+
+int sctp_cmpaddr(struct sockaddr *, struct sockaddr *);
+
+void sctp_print_address(struct sockaddr *);
+void sctp_print_address_pkt(struct ip *, struct sctphdr *);
+
+int
+sctp_release_pr_sctp_chunk(struct sctp_tcb *, struct sctp_tmit_chunk *,
+ int, int
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+);
+
+struct mbuf *sctp_generate_invmanparam(int);
+
+void
+sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp,
+ struct sockaddr *sa, sctp_assoc_t assoc_id,
+ uint32_t vrf_id, int *error, void *p);
+void
+sctp_bindx_delete_address(struct socket *so, struct sctp_inpcb *inp,
+ struct sockaddr *sa, sctp_assoc_t assoc_id,
+ uint32_t vrf_id, int *error);
+
+int sctp_local_addr_count(struct sctp_tcb *stcb);
+
+#ifdef SCTP_MBCNT_LOGGING
+void
+sctp_free_bufspace(struct sctp_tcb *, struct sctp_association *,
+ struct sctp_tmit_chunk *, int);
+
+#else
+#define sctp_free_bufspace(stcb, asoc, tp1, chk_cnt) \
+do { \
+ if (tp1->data != NULL) { \
+ atomic_subtract_int(&((asoc)->chunks_on_out_queue), chk_cnt); \
+ if ((asoc)->total_output_queue_size >= tp1->book_size) { \
+ atomic_subtract_int(&((asoc)->total_output_queue_size), tp1->book_size); \
+ } else { \
+ (asoc)->total_output_queue_size = 0; \
+ } \
+ if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
+ if (stcb->sctp_socket->so_snd.sb_cc >= tp1->book_size) { \
+ atomic_subtract_int(&((stcb)->sctp_socket->so_snd.sb_cc), tp1->book_size); \
+ } else { \
+ stcb->sctp_socket->so_snd.sb_cc = 0; \
+ } \
+ } \
+ } \
+} while (0)
+
+#endif
+
+#define sctp_free_spbufspace(stcb, asoc, sp) \
+do { \
+ if (sp->data != NULL) { \
+ if ((asoc)->total_output_queue_size >= sp->length) { \
+ atomic_subtract_int(&(asoc)->total_output_queue_size, sp->length); \
+ } else { \
+ (asoc)->total_output_queue_size = 0; \
+ } \
+ if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
+ if (stcb->sctp_socket->so_snd.sb_cc >= sp->length) { \
+ atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc,sp->length); \
+ } else { \
+ stcb->sctp_socket->so_snd.sb_cc = 0; \
+ } \
+ } \
+ } \
+} while (0)
+
+#define sctp_snd_sb_alloc(stcb, sz) \
+do { \
+ atomic_add_int(&stcb->asoc.total_output_queue_size,sz); \
+ if ((stcb->sctp_socket != NULL) && \
+ ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
+ atomic_add_int(&stcb->sctp_socket->so_snd.sb_cc,sz); \
+ } \
+} while (0)
+
+/* new functions to start/stop udp tunneling */
+void sctp_over_udp_stop(void);
+int sctp_over_udp_start(void);
+
+int
+sctp_soreceive(struct socket *so, struct sockaddr **psa,
+ struct uio *uio,
+ struct mbuf **mp0,
+ struct mbuf **controlp,
+ int *flagsp);
+
+
+/* For those not passing mbufs, this does the
+ * translations for you. Caller owns memory
+ * of size controllen returned in controlp.
+ */
+int
+sctp_l_soreceive(struct socket *so,
+ struct sockaddr **name,
+ struct uio *uio,
+ char **controlp,
+ int *controllen,
+ int *flag);
+
+
+void
+ sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d);
+
+void
+sctp_wakeup_log(struct sctp_tcb *stcb,
+ uint32_t cumtsn,
+ uint32_t wake_cnt, int from);
+
+void sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t, uint16_t, uint16_t, int);
+
+void sctp_log_nagle_event(struct sctp_tcb *stcb, int action);
+
+
+void
+ sctp_log_mb(struct mbuf *m, int from);
+
+void
+sctp_sblog(struct sockbuf *sb,
+ struct sctp_tcb *stcb, int from, int incr);
+
+void
+sctp_log_strm_del(struct sctp_queued_to_read *control,
+ struct sctp_queued_to_read *poschk,
+ int from);
+void sctp_log_cwnd(struct sctp_tcb *stcb, struct sctp_nets *, int, uint8_t);
+void rto_logging(struct sctp_nets *net, int from);
+
+void sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc);
+
+void sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from);
+void sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *, int, int, uint8_t);
+void sctp_log_block(uint8_t, struct socket *, struct sctp_association *, int);
+void sctp_log_rwnd(uint8_t, uint32_t, uint32_t, uint32_t);
+void sctp_log_mbcnt(uint8_t, uint32_t, uint32_t, uint32_t, uint32_t);
+void sctp_log_rwnd_set(uint8_t, uint32_t, uint32_t, uint32_t, uint32_t);
+int sctp_fill_stat_log(void *, size_t *);
+void sctp_log_fr(uint32_t, uint32_t, uint32_t, int);
+void sctp_log_sack(uint32_t, uint32_t, uint32_t, uint16_t, uint16_t, int);
+void sctp_log_map(uint32_t, uint32_t, uint32_t, int);
+void sctp_print_mapping_array(struct sctp_association *asoc);
+void sctp_clr_stat_log(void);
+
+
+#ifdef SCTP_AUDITING_ENABLED
+void
+sctp_auditing(int, struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *);
+void sctp_audit_log(uint8_t, uint8_t);
+
+#endif
+
+
+#endif /* _KERNEL */
+#endif
diff --git a/freebsd/sys/netinet/tcp.h b/freebsd/sys/netinet/tcp.h
new file mode 100644
index 00000000..19b1c57f
--- /dev/null
+++ b/freebsd/sys/netinet/tcp.h
@@ -0,0 +1,2 @@
+#include <freebsd/bsd.h>
+#include <freebsd/netinet/tcp.h>
diff --git a/freebsd/sys/netinet/tcp_debug.c b/freebsd/sys/netinet/tcp_debug.c
new file mode 100644
index 00000000..52a82193
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_debug.c
@@ -0,0 +1,226 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_debug.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_tcpdebug.h>
+
+#ifdef TCPDEBUG
+/* load symbolic names */
+#define PRUREQUESTS
+#define TCPSTATES
+#define TCPTIMERS
+#define TANAMES
+#endif
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/mutex.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#ifdef INET6
+#include <freebsd/netinet/ip6.h>
+#endif
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_fsm.h>
+#include <freebsd/netinet/tcp_timer.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/tcpip.h>
+#include <freebsd/netinet/tcp_debug.h>
+
+#ifdef TCPDEBUG
+static int tcpconsdebug = 0;
+#endif
+
+/*
+ * Global ring buffer of TCP debugging state. Each entry captures a snapshot
+ * of TCP connection state at a given moment. tcp_debx points at the
+ * next available slot. There is no explicit export of this data structure;
+ * it will be read via /dev/kmem by debugging tools.
+ */
+static struct tcp_debug tcp_debug[TCP_NDEBUG];
+static int tcp_debx;
+
+/*
+ * All global state is protected by tcp_debug_mtx; tcp_trace() is split into
+ * two parts, one of which saves connection and other state into the global
+ * array (locked by tcp_debug_mtx).
+ */
+struct mtx tcp_debug_mtx;
+MTX_SYSINIT(tcp_debug_mtx, &tcp_debug_mtx, "tcp_debug_mtx", MTX_DEF);
+
+/*
+ * Save TCP state at a given moment; optionally, both tcpcb and TCP packet
+ * header state will be saved.
+ */
+void
+tcp_trace(short act, short ostate, struct tcpcb *tp, void *ipgen,
+ struct tcphdr *th, int req)
+{
+#ifdef INET6
+ int isipv6;
+#endif /* INET6 */
+ tcp_seq seq, ack;
+ int len, flags;
+ struct tcp_debug *td;
+
+ mtx_lock(&tcp_debug_mtx);
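+ /* Claim the next ring buffer slot, wrapping the index at TCP_NDEBUG. */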
+ td = &tcp_debug[tcp_debx++];
+ if (tcp_debx == TCP_NDEBUG)
+ tcp_debx = 0;
+ bzero(td, sizeof(*td));
+#ifdef INET6
+ isipv6 = (ipgen != NULL && ((struct ip *)ipgen)->ip_v == 6) ? 1 : 0;
+#endif /* INET6 */
+ td->td_family =
+#ifdef INET6
+ (isipv6 != 0) ? AF_INET6 :
+#endif
+ AF_INET;
+#ifdef INET
+ td->td_time = iptime();
+#endif
+ td->td_act = act;
+ td->td_ostate = ostate;
+ td->td_tcb = (caddr_t)tp;
+ if (tp != NULL)
+ td->td_cb = *tp;
+ if (ipgen != NULL) {
+ switch (td->td_family) {
+#ifdef INET
+ case AF_INET:
+ bcopy(ipgen, &td->td_ti.ti_i, sizeof(td->td_ti.ti_i));
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ bcopy(ipgen, td->td_ip6buf, sizeof(td->td_ip6buf));
+ break;
+#endif
+ }
+ }
+ if (th != NULL) {
+ switch (td->td_family) {
+#ifdef INET
+ case AF_INET:
+ td->td_ti.ti_t = *th;
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ td->td_ti6.th = *th;
+ break;
+#endif
+ }
+ }
+ td->td_req = req;
+ mtx_unlock(&tcp_debug_mtx);
+#ifdef TCPDEBUG
+ if (tcpconsdebug == 0)
+ return;
+ if (tp != NULL)
+ printf("%p %s:", tp, tcpstates[ostate]);
+ else
+ printf("???????? ");
+ printf("%s ", tanames[act]);
+ switch (act) {
+ case TA_INPUT:
+ case TA_OUTPUT:
+ case TA_DROP:
+ if (ipgen == NULL || th == NULL)
+ break;
+ seq = th->th_seq;
+ ack = th->th_ack;
+ len =
+#ifdef INET6
+ isipv6 ? ntohs(((struct ip6_hdr *)ipgen)->ip6_plen) :
+#endif
+ ((struct ip *)ipgen)->ip_len;
+ if (act == TA_OUTPUT) {
+ seq = ntohl(seq);
+ ack = ntohl(ack);
+ len = ntohs((u_short)len);
+ }
+ if (act == TA_OUTPUT)
+ len -= sizeof (struct tcphdr);
+ if (len)
+ printf("[%x..%x)", seq, seq+len);
+ else
+ printf("%x", seq);
+ printf("@%x, urp=%x", ack, th->th_urp);
+ flags = th->th_flags;
+ if (flags) {
+ char *cp = "<";
+#define pf(f) { \
+ if (th->th_flags & TH_##f) { \
+ printf("%s%s", cp, #f); \
+ cp = ","; \
+ } \
+}
+ pf(SYN); pf(ACK); pf(FIN); pf(RST); pf(PUSH); pf(URG);
+ printf(">");
+ }
+ break;
+
+ case TA_USER:
+ printf("%s", prurequests[req&0xff]);
+ if ((req & 0xff) == PRU_SLOWTIMO)
+ printf("<%s>", tcptimers[req>>8]);
+ break;
+ }
+ if (tp != NULL)
+ printf(" -> %s", tcpstates[tp->t_state]);
+ /* print out internal state of tp !?! */
+ printf("\n");
+ if (tp == NULL)
+ return;
+ printf(
+ "\trcv_(nxt,wnd,up) (%lx,%lx,%lx) snd_(una,nxt,max) (%lx,%lx,%lx)\n",
+ (u_long)tp->rcv_nxt, tp->rcv_wnd, (u_long)tp->rcv_up,
+ (u_long)tp->snd_una, (u_long)tp->snd_nxt, (u_long)tp->snd_max);
+ printf("\tsnd_(wl1,wl2,wnd) (%lx,%lx,%lx)\n",
+ (u_long)tp->snd_wl1, (u_long)tp->snd_wl2, tp->snd_wnd);
+#endif /* TCPDEBUG */
+}
diff --git a/freebsd/sys/netinet/tcp_debug.h b/freebsd/sys/netinet/tcp_debug.h
new file mode 100644
index 00000000..0c103958
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_debug.h
@@ -0,0 +1,80 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_debug.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_DEBUG_HH_
+#define _NETINET_TCP_DEBUG_HH_
+
+struct tcp_debug {
+ uint32_t td_time; /* network format */
+ short td_act;
+ short td_ostate;
+ caddr_t td_tcb;
+ int td_family;
+ /*
+ * Co-existence of td_ti and td_ti6 below is ugly, but it is necessary
+ * to achieve backward compatibility to some extent.
+ */
+ struct tcpiphdr td_ti;
+ struct {
+#define IP6_HDR_LEN 40 /* sizeof(struct ip6_hdr) */
+#if !defined(_KERNEL) && defined(INET6)
+ struct ip6_hdr ip6;
+#else
+ u_char ip6buf[IP6_HDR_LEN];
+#endif
+ struct tcphdr th;
+ } td_ti6;
+#define td_ip6buf td_ti6.ip6buf
+ short td_req;
+ struct tcpcb td_cb;
+};
+
+#define TA_INPUT 0
+#define TA_OUTPUT 1
+#define TA_USER 2
+#define TA_RESPOND 3
+#define TA_DROP 4
+
+#ifdef TANAMES
+static const char *tanames[] =
+ { "input", "output", "user", "respond", "drop" };
+#endif
+
+#define TCP_NDEBUG 100
+
+#ifndef _KERNEL
+/* XXX common variables for broken applications. */
+struct tcp_debug tcp_debug[TCP_NDEBUG];
+int tcp_debx;
+#endif
+
+#endif /* !_NETINET_TCP_DEBUG_HH_ */
diff --git a/freebsd/sys/netinet/tcp_fsm.h b/freebsd/sys/netinet/tcp_fsm.h
new file mode 100644
index 00000000..253e53d4
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_fsm.h
@@ -0,0 +1,112 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_fsm.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_FSM_HH_
+#define _NETINET_TCP_FSM_HH_
+
+/*
+ * TCP FSM state definitions.
+ *
+ * Per RFC793, September, 1981.
+ */
+
+#define TCP_NSTATES 11
+
+#define TCPS_CLOSED 0 /* closed */
+#define TCPS_LISTEN 1 /* listening for connection */
+#define TCPS_SYN_SENT 2 /* active, have sent syn */
+#define TCPS_SYN_RECEIVED 3 /* have sent and received syn */
+/* states < TCPS_ESTABLISHED are those where connections are not established */
+#define TCPS_ESTABLISHED 4 /* established */
+#define TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */
+/* states > TCPS_CLOSE_WAIT are those where user has closed */
+#define TCPS_FIN_WAIT_1 6 /* have closed, sent fin */
+#define TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */
+#define TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */
+/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */
+#define TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */
+#define TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */
+
+/* for KAME src sync over BSD*'s */
+#define TCP6_NSTATES TCP_NSTATES
+#define TCP6S_CLOSED TCPS_CLOSED
+#define TCP6S_LISTEN TCPS_LISTEN
+#define TCP6S_SYN_SENT TCPS_SYN_SENT
+#define TCP6S_SYN_RECEIVED TCPS_SYN_RECEIVED
+#define TCP6S_ESTABLISHED TCPS_ESTABLISHED
+#define TCP6S_CLOSE_WAIT TCPS_CLOSE_WAIT
+#define TCP6S_FIN_WAIT_1 TCPS_FIN_WAIT_1
+#define TCP6S_CLOSING TCPS_CLOSING
+#define TCP6S_LAST_ACK TCPS_LAST_ACK
+#define TCP6S_FIN_WAIT_2 TCPS_FIN_WAIT_2
+#define TCP6S_TIME_WAIT TCPS_TIME_WAIT
+
+#define TCPS_HAVERCVDSYN(s) ((s) >= TCPS_SYN_RECEIVED)
+#define TCPS_HAVEESTABLISHED(s) ((s) >= TCPS_ESTABLISHED)
+#define TCPS_HAVERCVDFIN(s) ((s) >= TCPS_TIME_WAIT)
+
+#ifdef TCPOUTFLAGS
+/*
+ * Flags used when sending segments in tcp_output. Basic flags (TH_RST,
+ * TH_ACK,TH_SYN,TH_FIN) are totally determined by state, with the proviso
+ * that TH_FIN is sent only if all data queued for output is included in the
+ * segment.
+ */
+static u_char tcp_outflags[TCP_NSTATES] = {
+ TH_RST|TH_ACK, /* 0, CLOSED */
+ 0, /* 1, LISTEN */
+ TH_SYN, /* 2, SYN_SENT */
+ TH_SYN|TH_ACK, /* 3, SYN_RECEIVED */
+ TH_ACK, /* 4, ESTABLISHED */
+ TH_ACK, /* 5, CLOSE_WAIT */
+ TH_FIN|TH_ACK, /* 6, FIN_WAIT_1 */
+ TH_FIN|TH_ACK, /* 7, CLOSING */
+ TH_FIN|TH_ACK, /* 8, LAST_ACK */
+ TH_ACK, /* 9, FIN_WAIT_2 */
+ TH_ACK, /* 10, TIME_WAIT */
+};
+#endif
+
+#ifdef KPROF
+int tcp_acounts[TCP_NSTATES][PRU_NREQ];
+#endif
+
+#ifdef TCPSTATES
+static char const * const tcpstates[] = {
+ "CLOSED", "LISTEN", "SYN_SENT", "SYN_RCVD",
+ "ESTABLISHED", "CLOSE_WAIT", "FIN_WAIT_1", "CLOSING",
+ "LAST_ACK", "FIN_WAIT_2", "TIME_WAIT",
+};
+#endif
+
+#endif
diff --git a/freebsd/sys/netinet/tcp_hostcache.c b/freebsd/sys/netinet/tcp_hostcache.c
new file mode 100644
index 00000000..07b78cfe
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_hostcache.c
@@ -0,0 +1,693 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The tcp_hostcache moves the tcp-specific cached metrics from the routing
+ * table to a dedicated structure indexed by the remote IP address. It keeps
+ * information on the measured TCP parameters of past TCP sessions to allow
+ * better initial start values to be used with later connections to/from the
+ * same source. Depending on the network parameters (delay, bandwidth, max
+ * MTU, congestion window) between local and remote sites, this can lead to
+ * significant speed-ups for new TCP connections after the first one.
+ *
+ * Due to the tcp_hostcache, all TCP-specific metrics information in the
+ * routing table has been removed. The inpcb no longer keeps a pointer to
+ * the routing entry, and protocol-initiated route cloning has been removed
+ * as well. With these changes, the routing table has gone back to being
+ * more lightweight and only carries information related to packet forwarding.
+ *
+ * tcp_hostcache is designed for multiple concurrent access in SMP
+ * environments and high contention. All bucket rows have their own lock and
+ * thus multiple lookups and modifies can be done at the same time as long as
+ * they are in different bucket rows. If a request for insertion of a new
+ * record can't be satisfied, it simply returns an empty structure. Nobody
+ * and nothing outside of tcp_hostcache.c will ever point directly to any
+ * entry in the tcp_hostcache. All communication is done in an
+ * object-oriented way and only functions of tcp_hostcache will manipulate
+ * hostcache entries. Otherwise, we are unable to achieve good behaviour in
+ * concurrent access situations. Since tcp_hostcache is only caching
+ * information, there are no fatal consequences if we either can't satisfy
+ * any particular request or have to drop/overwrite an existing entry because
+ * of bucket limit memory constraints.
+ */
+
+/*
+ * Many thanks to jlemon for basic structure of tcp_syncache which is being
+ * followed here.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet6.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/mutex.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sysctl.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/ip_var.h>
+#ifdef INET6
+#include <freebsd/netinet/ip6.h>
+#include <freebsd/netinet6/ip6_var.h>
+#endif
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/tcp_hostcache.h>
+#ifdef INET6
+#include <freebsd/netinet6/tcp6_var.h>
+#endif
+
+#include <freebsd/vm/uma.h>
+
+/* Arbitrary values */
+#define TCP_HOSTCACHE_HASHSIZE 512
+#define TCP_HOSTCACHE_BUCKETLIMIT 30
+#define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */
+#define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */
+
+static VNET_DEFINE(struct tcp_hostcache, tcp_hostcache);
+#define V_tcp_hostcache VNET(tcp_hostcache)
+
+static VNET_DEFINE(struct callout, tcp_hc_callout);
+#define V_tcp_hc_callout VNET(tcp_hc_callout)
+
+static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *);
+static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
+static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
+static void tcp_hc_purge_internal(int);
+static void tcp_hc_purge(void *);
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0,
+ "TCP Host cache");
+
+SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
+ &VNET_NAME(tcp_hostcache.cache_limit), 0,
+ "Overall entry limit for hostcache");
+
+SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
+ &VNET_NAME(tcp_hostcache.hashsize), 0,
+ "Size of TCP hostcache hashtable");
+
+SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit,
+ CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0,
+ "Per-bucket hash limit for hostcache");
+
+SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD,
+ &VNET_NAME(tcp_hostcache.cache_count), 0,
+ "Current number of entries in hostcache");
+
+SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW,
+ &VNET_NAME(tcp_hostcache.expire), 0,
+ "Expire time of TCP hostcache entries");
+
+SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_RW,
+ &VNET_NAME(tcp_hostcache.prune), 0,
+ "Time between purge runs");
+
+SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW,
+ &VNET_NAME(tcp_hostcache.purgeall), 0,
+ "Expire all entires on next purge run");
+
+SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0,
+ sysctl_tcp_hc_list, "A", "List of all hostcache entries");
+
+
+static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache");
+
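+/*
+ * Hash the foreign IPv4 address by folding in right-shifted copies of it
+ * and masking the result to the size of the hash table.
+ */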
+#define HOSTCACHE_HASH(ip) \
+ (((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \
+ V_tcp_hostcache.hashmask)
+
+/* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */
+#define HOSTCACHE_HASH6(ip6) \
+ (((ip6)->s6_addr32[0] ^ \
+ (ip6)->s6_addr32[1] ^ \
+ (ip6)->s6_addr32[2] ^ \
+ (ip6)->s6_addr32[3]) & \
+ V_tcp_hostcache.hashmask)
+
+#define THC_LOCK(lp) mtx_lock(lp)
+#define THC_UNLOCK(lp) mtx_unlock(lp)
+
+void
+tcp_hc_init(void)
+{
+ int i;
+
+ /*
+ * Initialize hostcache structures.
+ */
+ V_tcp_hostcache.cache_count = 0;
+ V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE;
+ V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT;
+ V_tcp_hostcache.cache_limit =
+ V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit;
+ V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE;
+ V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE;
+
+ TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize",
+ &V_tcp_hostcache.hashsize);
+ TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit",
+ &V_tcp_hostcache.cache_limit);
+ TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit",
+ &V_tcp_hostcache.bucket_limit);
+ if (!powerof2(V_tcp_hostcache.hashsize)) {
+ printf("WARNING: hostcache hash size is not a power of 2.\n");
+ V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */
+ }
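+ /* hashsize is a power of two, so hashsize - 1 yields the bucket index mask. */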
+ V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1;
+
+ /*
+ * Allocate the hash table.
+ */
+ V_tcp_hostcache.hashbase = (struct hc_head *)
+ malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head),
+ M_HOSTCACHE, M_WAITOK | M_ZERO);
+
+ /*
+ * Initialize the hash buckets.
+ */
+ for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
+ TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket);
+ V_tcp_hostcache.hashbase[i].hch_length = 0;
+ mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry",
+ NULL, MTX_DEF);
+ }
+
+ /*
+ * Allocate the hostcache entries.
+ */
+ V_tcp_hostcache.zone =
+ uma_zcreate("hostcache", sizeof(struct hc_metrics),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit);
+
+ /*
+ * Set up periodic cache cleanup.
+ */
+ callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE);
+ callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
+ tcp_hc_purge, curvnet);
+}
+
+#ifdef VIMAGE
+void
+tcp_hc_destroy(void)
+{
+ int i;
+
+ callout_drain(&V_tcp_hc_callout);
+
+ /* Purge all hc entries. */
+ tcp_hc_purge_internal(1);
+
+ /* Free the uma zone and the allocated hash table. */
+ uma_zdestroy(V_tcp_hostcache.zone);
+
+ for (i = 0; i < V_tcp_hostcache.hashsize; i++)
+ mtx_destroy(&V_tcp_hostcache.hashbase[i].hch_mtx);
+ free(V_tcp_hostcache.hashbase, M_HOSTCACHE);
+}
+#endif
+
+/*
+ * Internal function: look up an entry in the hostcache or return NULL.
+ *
+ * If an entry has been returned, the caller becomes responsible for
+ * unlocking the bucket row after he is done reading/modifying the entry.
+ */
+static struct hc_metrics *
+tcp_hc_lookup(struct in_conninfo *inc)
+{
+ int hash;
+ struct hc_head *hc_head;
+ struct hc_metrics *hc_entry;
+
+ KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer"));
+
+ /*
+ * Hash the foreign ip address.
+ */
+ if (inc->inc_flags & INC_ISIPV6)
+ hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
+ else
+ hash = HOSTCACHE_HASH(&inc->inc_faddr);
+
+ hc_head = &V_tcp_hostcache.hashbase[hash];
+
+ /*
+ * Acquire lock for this bucket row; we release the lock if we don't
+ * find an entry, otherwise the caller has to unlock after he is
+ * done.
+ */
+ THC_LOCK(&hc_head->hch_mtx);
+
+ /*
+ * Iterate through entries in bucket row looking for a match.
+ */
+ TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {
+ if (inc->inc_flags & INC_ISIPV6) {
+ if (memcmp(&inc->inc6_faddr, &hc_entry->ip6,
+ sizeof(inc->inc6_faddr)) == 0)
+ return hc_entry;
+ } else {
+ if (memcmp(&inc->inc_faddr, &hc_entry->ip4,
+ sizeof(inc->inc_faddr)) == 0)
+ return hc_entry;
+ }
+ }
+
+ /*
+ * We were unsuccessful and didn't find anything.
+ */
+ THC_UNLOCK(&hc_head->hch_mtx);
+ return NULL;
+}
+
+/*
+ * Internal function: insert an entry into the hostcache or return NULL if
+ * unable to allocate a new one.
+ *
+ * If an entry has been returned, the caller becomes responsible for
+ * unlocking the bucket row after he is done reading/modifying the entry.
+ */
+static struct hc_metrics *
+tcp_hc_insert(struct in_conninfo *inc)
+{
+ int hash;
+ struct hc_head *hc_head;
+ struct hc_metrics *hc_entry;
+
+ KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer"));
+
+ /*
+ * Hash the foreign ip address.
+ */
+ if (inc->inc_flags & INC_ISIPV6)
+ hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
+ else
+ hash = HOSTCACHE_HASH(&inc->inc_faddr);
+
+ hc_head = &V_tcp_hostcache.hashbase[hash];
+
+ /*
+ * Acquire lock for this bucket row; we release the lock if we don't
+ * find an entry, otherwise the caller has to unlock after he is
+ * done.
+ */
+ THC_LOCK(&hc_head->hch_mtx);
+
+ /*
+ * If the bucket limit or the overall cache limit is reached,
+ * reuse the least-used element in this bucket row.
+ */
+ if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit ||
+ V_tcp_hostcache.cache_count >= V_tcp_hostcache.cache_limit) {
+ hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead);
+ /*
+ * At first we were dropping the last element, just to
+ * reacquire it in the next two lines again, which isn't very
+ * efficient. Instead just reuse the least used element.
+ * We may drop something that is still "in-use" but we can be
+ * "lossy".
+ * Just give up if this bucket row is empty and we don't have
+ * anything to replace.
+ */
+ if (hc_entry == NULL) {
+ THC_UNLOCK(&hc_head->hch_mtx);
+ return NULL;
+ }
+ TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q);
+ V_tcp_hostcache.hashbase[hash].hch_length--;
+ V_tcp_hostcache.cache_count--;
+ TCPSTAT_INC(tcps_hc_bucketoverflow);
+#if 0
+ uma_zfree(V_tcp_hostcache.zone, hc_entry);
+#endif
+ } else {
+ /*
+ * Allocate a new entry, or balk if not possible.
+ */
+ hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT);
+ if (hc_entry == NULL) {
+ THC_UNLOCK(&hc_head->hch_mtx);
+ return NULL;
+ }
+ }
+
+ /*
+ * Initialize basic information of hostcache entry.
+ */
+ bzero(hc_entry, sizeof(*hc_entry));
+ if (inc->inc_flags & INC_ISIPV6)
+ bcopy(&inc->inc6_faddr, &hc_entry->ip6, sizeof(hc_entry->ip6));
+ else
+ hc_entry->ip4 = inc->inc_faddr;
+ hc_entry->rmx_head = hc_head;
+ hc_entry->rmx_expire = V_tcp_hostcache.expire;
+
+ /*
+ * Put it upfront.
+ */
+ TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
+ V_tcp_hostcache.hashbase[hash].hch_length++;
+ V_tcp_hostcache.cache_count++;
+ TCPSTAT_INC(tcps_hc_added);
+
+ return hc_entry;
+}
+
+/*
+ * External function: look up an entry in the hostcache and fill out the
+ * supplied TCP metrics structure. Fills in zeros when no entry was found or
+ * a value is not set.
+ */
+void
+tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
+{
+ struct hc_metrics *hc_entry;
+
+ /*
+ * Find the right bucket.
+ */
+ hc_entry = tcp_hc_lookup(inc);
+
+ /*
+ * If we don't have an existing object.
+ */
+ if (hc_entry == NULL) {
+ bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
+ return;
+ }
+ hc_entry->rmx_hits++;
+ hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
+
+ hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu;
+ hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh;
+ hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt;
+ hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar;
+ hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth;
+ hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd;
+ hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe;
+ hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe;
+
+ /*
+ * Unlock bucket row.
+ */
+ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * External function: look up an entry in the hostcache and return the
+ * discovered path MTU. Returns 0 if no entry is found or the value is not
+ * set.
+ */
+u_long
+tcp_hc_getmtu(struct in_conninfo *inc)
+{
+ struct hc_metrics *hc_entry;
+ u_long mtu;
+
+ hc_entry = tcp_hc_lookup(inc);
+ if (hc_entry == NULL) {
+ return 0;
+ }
+ hc_entry->rmx_hits++;
+ hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
+
+ mtu = hc_entry->rmx_mtu;
+ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+ return mtu;
+}
+
+/*
+ * External function: update the MTU value of an entry in the hostcache.
+ * Creates a new entry if none was found.
+ */
+void
+tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu)
+{
+ struct hc_metrics *hc_entry;
+
+ /*
+ * Find the right bucket.
+ */
+ hc_entry = tcp_hc_lookup(inc);
+
+ /*
+ * If we don't have an existing object, try to insert a new one.
+ */
+ if (hc_entry == NULL) {
+ hc_entry = tcp_hc_insert(inc);
+ if (hc_entry == NULL)
+ return;
+ }
+ hc_entry->rmx_updates++;
+ hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
+
+ hc_entry->rmx_mtu = mtu;
+
+ /*
+ * Put it upfront so we find it faster next time.
+ */
+ TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+ TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+
+ /*
+ * Unlock bucket row.
+ */
+ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * External function: update the TCP metrics of an entry in the hostcache.
+ * Creates a new entry if none was found.
+ */
+void
+tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
+{
+ struct hc_metrics *hc_entry;
+
+ hc_entry = tcp_hc_lookup(inc);
+ if (hc_entry == NULL) {
+ hc_entry = tcp_hc_insert(inc);
+ if (hc_entry == NULL)
+ return;
+ }
+ hc_entry->rmx_updates++;
+ hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
+
+ if (hcml->rmx_rtt != 0) {
+ if (hc_entry->rmx_rtt == 0)
+ hc_entry->rmx_rtt = hcml->rmx_rtt;
+ else
+ hc_entry->rmx_rtt =
+ (hc_entry->rmx_rtt + hcml->rmx_rtt) / 2;
+ TCPSTAT_INC(tcps_cachedrtt);
+ }
+ if (hcml->rmx_rttvar != 0) {
+ if (hc_entry->rmx_rttvar == 0)
+ hc_entry->rmx_rttvar = hcml->rmx_rttvar;
+ else
+ hc_entry->rmx_rttvar =
+ (hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2;
+ TCPSTAT_INC(tcps_cachedrttvar);
+ }
+ if (hcml->rmx_ssthresh != 0) {
+ if (hc_entry->rmx_ssthresh == 0)
+ hc_entry->rmx_ssthresh = hcml->rmx_ssthresh;
+ else
+ hc_entry->rmx_ssthresh =
+ (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
+ TCPSTAT_INC(tcps_cachedssthresh);
+ }
+ if (hcml->rmx_bandwidth != 0) {
+ if (hc_entry->rmx_bandwidth == 0)
+ hc_entry->rmx_bandwidth = hcml->rmx_bandwidth;
+ else
+ hc_entry->rmx_bandwidth =
+ (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2;
+ /* TCPSTAT_INC(tcps_cachedbandwidth); */
+ }
+ if (hcml->rmx_cwnd != 0) {
+ if (hc_entry->rmx_cwnd == 0)
+ hc_entry->rmx_cwnd = hcml->rmx_cwnd;
+ else
+ hc_entry->rmx_cwnd =
+ (hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2;
+ /* TCPSTAT_INC(tcps_cachedcwnd); */
+ }
+ if (hcml->rmx_sendpipe != 0) {
+ if (hc_entry->rmx_sendpipe == 0)
+ hc_entry->rmx_sendpipe = hcml->rmx_sendpipe;
+ else
+ hc_entry->rmx_sendpipe =
+ (hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) /2;
+ /* TCPSTAT_INC(tcps_cachedsendpipe); */
+ }
+ if (hcml->rmx_recvpipe != 0) {
+ if (hc_entry->rmx_recvpipe == 0)
+ hc_entry->rmx_recvpipe = hcml->rmx_recvpipe;
+ else
+ hc_entry->rmx_recvpipe =
+ (hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) /2;
+ /* TCPSTAT_INC(tcps_cachedrecvpipe); */
+ }
+
+ TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+ TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * Sysctl function: prints the list and values of all hostcache entries in
+ * unsorted order.
+ */
+static int
+sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
+{
+ int bufsize;
+ int linesize = 128;
+ char *p, *buf;
+ int len, i, error;
+ struct hc_metrics *hc_entry;
+#ifdef INET6
+ char ip6buf[INET6_ADDRSTRLEN];
+#endif
+
+ bufsize = linesize * (V_tcp_hostcache.cache_count + 1);
+
+ p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
+
+ len = snprintf(p, linesize,
+ "\nIP address MTU SSTRESH RTT RTTVAR BANDWIDTH "
+ " CWND SENDPIPE RECVPIPE HITS UPD EXP\n");
+ p += len;
+
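+/* Convert microseconds to milliseconds, rounding to the nearest value. */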
+#define msec(u) (((u) + 500) / 1000)
+ for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
+ THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
+ TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket,
+ rmx_q) {
+ len = snprintf(p, linesize,
+ "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu "
+ "%4lu %4lu %4i\n",
+ hc_entry->ip4.s_addr ? inet_ntoa(hc_entry->ip4) :
+#ifdef INET6
+ ip6_sprintf(ip6buf, &hc_entry->ip6),
+#else
+ "IPv6?",
+#endif
+ hc_entry->rmx_mtu,
+ hc_entry->rmx_ssthresh,
+ msec(hc_entry->rmx_rtt *
+ (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
+ msec(hc_entry->rmx_rttvar *
+ (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
+ hc_entry->rmx_bandwidth * 8,
+ hc_entry->rmx_cwnd,
+ hc_entry->rmx_sendpipe,
+ hc_entry->rmx_recvpipe,
+ hc_entry->rmx_hits,
+ hc_entry->rmx_updates,
+ hc_entry->rmx_expire);
+ p += len;
+ }
+ THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
+ }
+#undef msec
+ error = SYSCTL_OUT(req, buf, p - buf);
+ free(buf, M_TEMP);
+ return(error);
+}
+
+/*
+ * Caller has to make sure the curvnet is set properly.
+ */
+static void
+tcp_hc_purge_internal(int all)
+{
+ struct hc_metrics *hc_entry, *hc_next;
+ int i;
+
+ for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
+ THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
+ TAILQ_FOREACH_SAFE(hc_entry,
+ &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q, hc_next) {
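+ /*
+ * Free the entry if it has expired (or if all entries are
+ * being purged); otherwise age it by one prune interval.
+ */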
+ if (all || hc_entry->rmx_expire <= 0) {
+ TAILQ_REMOVE(&V_tcp_hostcache.hashbase[i].hch_bucket,
+ hc_entry, rmx_q);
+ uma_zfree(V_tcp_hostcache.zone, hc_entry);
+ V_tcp_hostcache.hashbase[i].hch_length--;
+ V_tcp_hostcache.cache_count--;
+ } else
+ hc_entry->rmx_expire -= V_tcp_hostcache.prune;
+ }
+ THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
+ }
+}
+
+/*
+ * Expire and purge (old|all) entries in the tcp_hostcache. Runs
+ * periodically from the callout.
+ */
+static void
+tcp_hc_purge(void *arg)
+{
+ CURVNET_SET((struct vnet *) arg);
+ int all = 0;
+
+ if (V_tcp_hostcache.purgeall) {
+ all = 1;
+ V_tcp_hostcache.purgeall = 0;
+ }
+
+ tcp_hc_purge_internal(all);
+
+ callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
+ tcp_hc_purge, arg);
+ CURVNET_RESTORE();
+}
diff --git a/freebsd/sys/netinet/tcp_hostcache.h b/freebsd/sys/netinet/tcp_hostcache.h
new file mode 100644
index 00000000..a494ed03
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_hostcache.h
@@ -0,0 +1,82 @@
+/*-
+ * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Many thanks to jlemon for basic structure of tcp_syncache which is being
+ * followed here.
+ */
+
+#ifndef _NETINET_TCP_HOSTCACHE_HH_
+#define _NETINET_TCP_HOSTCACHE_HH_
+
+TAILQ_HEAD(hc_qhead, hc_metrics);
+
+struct hc_head {
+ struct hc_qhead hch_bucket;
+ u_int hch_length;
+ struct mtx hch_mtx;
+};
+
+struct hc_metrics {
+ /* housekeeping */
+ TAILQ_ENTRY(hc_metrics) rmx_q;
+ struct hc_head *rmx_head; /* head of bucket tail queue */
+ struct in_addr ip4; /* IP address */
+ struct in6_addr ip6; /* IP6 address */
+ /* endpoint specific values for tcp */
+ u_long rmx_mtu; /* MTU for this path */
+ u_long rmx_ssthresh; /* outbound gateway buffer limit */
+ u_long rmx_rtt; /* estimated round trip time */
+ u_long rmx_rttvar; /* estimated rtt variance */
+ u_long rmx_bandwidth; /* estimated bandwidth */
+ u_long rmx_cwnd; /* congestion window */
+ u_long rmx_sendpipe; /* outbound delay-bandwidth product */
+ u_long rmx_recvpipe; /* inbound delay-bandwidth product */
+ /* TCP hostcache internal data */
+ int rmx_expire; /* lifetime for object */
+ u_long rmx_hits; /* number of hits */
+ u_long rmx_updates; /* number of updates */
+};
+
+struct tcp_hostcache {
+ struct hc_head *hashbase;
+ uma_zone_t zone;
+ u_int hashsize;
+ u_int hashmask;
+ u_int bucket_limit;
+ u_int cache_count;
+ u_int cache_limit;
+ int expire;
+ int prune;
+ int purgeall;
+};
+
+#endif /* !_NETINET_TCP_HOSTCACHE_HH_*/
diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c
new file mode 100644
index 00000000..85daf203
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_input.c
@@ -0,0 +1,3453 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_ipfw.h> /* for ipfw_fwd */
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_ipsec.h>
+#include <freebsd/local/opt_tcpdebug.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/proc.h> /* for proc0 declaration */
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/signalvar.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/sys/systm.h>
+
+#include <freebsd/machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
+
+#include <freebsd/vm/uma.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#define TCPSTATES /* for logging */
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <freebsd/netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_options.h>
+#include <freebsd/netinet/ip6.h>
+#include <freebsd/netinet/icmp6.h>
+#include <freebsd/netinet6/in6_pcb.h>
+#include <freebsd/netinet6/ip6_var.h>
+#include <freebsd/netinet6/nd6.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_fsm.h>
+#include <freebsd/netinet/tcp_seq.h>
+#include <freebsd/netinet/tcp_timer.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet6/tcp6_var.h>
+#include <freebsd/netinet/tcpip.h>
+#include <freebsd/netinet/tcp_syncache.h>
+#ifdef TCPDEBUG
+#include <freebsd/netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+
+#ifdef IPSEC
+#include <freebsd/netipsec/ipsec.h>
+#include <freebsd/netipsec/ipsec6.h>
+#endif /*IPSEC*/
+
+#include <freebsd/machine/in_cksum.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+static const int tcprexmtthresh = 3;
+
+VNET_DEFINE(struct tcpstat, tcpstat);
+SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
+ &VNET_NAME(tcpstat), tcpstat,
+ "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
+
+int tcp_log_in_vain = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
+ &tcp_log_in_vain, 0,
+ "Log all incoming TCP segments to closed ports");
+
+VNET_DEFINE(int, blackhole) = 0;
+#define V_blackhole VNET(blackhole)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
+ &VNET_NAME(blackhole), 0,
+ "Do not send RST on segments to closed ports");
+
+VNET_DEFINE(int, tcp_delack_enabled) = 1;
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
+ &VNET_NAME(tcp_delack_enabled), 0,
+ "Delay ACK to try and piggyback it onto a data packet");
+
+VNET_DEFINE(int, drop_synfin) = 0;
+#define V_drop_synfin VNET(drop_synfin)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
+ &VNET_NAME(drop_synfin), 0,
+ "Drop TCP packets with SYN+FIN set");
+
+VNET_DEFINE(int, tcp_do_rfc3042) = 1;
+#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_rfc3042), 0,
+ "Enable RFC 3042 (Limited Transmit)");
+
+VNET_DEFINE(int, tcp_do_rfc3390) = 1;
+#define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_rfc3390), 0,
+ "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
+
+VNET_DEFINE(int, tcp_do_rfc3465) = 1;
+#define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_rfc3465), 0,
+ "Enable RFC 3465 (Appropriate Byte Counting)");
+
+VNET_DEFINE(int, tcp_abc_l_var) = 2;
+#define V_tcp_abc_l_var VNET(tcp_abc_l_var)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
+ &VNET_NAME(tcp_abc_l_var), 2,
+ "Cap the max cwnd increment during slow-start to this number of segments");
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
+
+VNET_DEFINE(int, tcp_do_ecn) = 0;
+SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_ecn), 0,
+ "TCP ECN support");
+
+VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
+SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
+ &VNET_NAME(tcp_ecn_maxretries), 0,
+ "Max retries before giving up on ECN");
+
+VNET_DEFINE(int, tcp_insecure_rst) = 0;
+#define V_tcp_insecure_rst VNET(tcp_insecure_rst)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
+ &VNET_NAME(tcp_insecure_rst), 0,
+ "Follow the old (insecure) criteria for accepting RST packets");
+
+VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
+#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_autorcvbuf), 0,
+ "Enable automatic receive buffer sizing");
+
+VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024;
+#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+ &VNET_NAME(tcp_autorcvbuf_inc), 0,
+ "Incrementor step size of automatic receive buffer");
+
+VNET_DEFINE(int, tcp_autorcvbuf_max) = 256*1024;
+#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+ &VNET_NAME(tcp_autorcvbuf_max), 0,
+ "Max size of automatic receive buffer");
+
+int tcp_read_locking = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW,
+ &tcp_read_locking, 0, "Enable read locking strategy");
+
+VNET_DEFINE(struct inpcbhead, tcb);
+#define tcb6 tcb /* for KAME src sync over BSD*'s */
+VNET_DEFINE(struct inpcbinfo, tcbinfo);
+
+static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+static void tcp_do_segment(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *, int, int, uint8_t,
+ int);
+static void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
+ struct tcpcb *, int, int);
+static void tcp_pulloutofband(struct socket *,
+ struct tcphdr *, struct mbuf *, int);
+static void tcp_xmit_timer(struct tcpcb *, int);
+static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+static void inline
+ tcp_congestion_exp(struct tcpcb *);
+
+/*
+ * Kernel module interface for updating tcpstat. The argument is an index
+ * into tcpstat treated as an array of u_long. While this encodes the
+ * general layout of tcpstat into the caller, it doesn't encode its location,
+ * so that future changes to add, for example, per-CPU stats support won't
+ * cause binary compatibility problems for kernel modules.
+ */
+void
+kmod_tcpstat_inc(int statnum)
+{
+
+ (*((u_long *)&V_tcpstat + statnum))++;
+}
+
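+/*
+ * Enter congestion recovery: set the slow start threshold to half of the
+ * smaller of the send and congestion windows (at least two segments),
+ * record the recovery point and, if ECN is in use on this connection,
+ * arrange for CWR to be signalled to the peer.
+ */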
+static void inline
+tcp_congestion_exp(struct tcpcb *tp)
+{
+ u_int win;
+
+ win = min(tp->snd_wnd, tp->snd_cwnd) /
+ 2 / tp->t_maxseg;
+ if (win < 2)
+ win = 2;
+ tp->snd_ssthresh = win * tp->t_maxseg;
+ ENTER_FASTRECOVERY(tp);
+ tp->snd_recover = tp->snd_max;
+ if (tp->t_flags & TF_ECN_PERMIT)
+ tp->t_flags |= TF_ECN_SND_CWR;
+}
+
+/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
+#ifdef INET6
+#define ND6_HINT(tp) \
+do { \
+ if ((tp) && (tp)->t_inpcb && \
+ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
+ nd6_nud_hint(NULL, NULL, 0); \
+} while (0)
+#else
+#define ND6_HINT(tp)
+#endif
+
+/*
+ * Indicate whether this ack should be delayed. We can delay the ack if
+ * - there is no delayed ack timer in progress and
+ * - our last ack wasn't a 0-sized window. We never want to delay
+ * the ack that opens up a 0-sized window and
+ * - delayed acks are enabled or
+ * - this is a half-synchronized T/TCP connection.
+ */
+#define DELAY_ACK(tp) \
+ ((!tcp_timer_active(tp, TT_DELACK) && \
+ (tp->t_flags & TF_RXWIN0SENT) == 0) && \
+ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
+
+/*
+ * TCP input handling is split into multiple parts:
+ * tcp6_input is a thin wrapper around tcp_input for the extended
+ * ip6_protox[] call format in ip6_input
+ * tcp_input handles primary segment validation, inpcb lookup and
+ * SYN processing on listen sockets
+ * tcp_do_segment processes the ACK and text of the segment for
+ * establishing, established and closing connections
+ */
+#ifdef INET6
+int
+tcp6_input(struct mbuf **mp, int *offp, int proto)
+{
+ struct mbuf *m = *mp;
+ struct in6_ifaddr *ia6;
+
+ IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
+
+ /*
+ * draft-itojun-ipv6-tcp-to-anycast
+ * better place to put this in?
+ */
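+ /*
+ * Reject TCP segments addressed to an IPv6 anycast destination and
+ * answer with an ICMPv6 destination unreachable error.
+ */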
+ ia6 = ip6_getdstifaddr(m);
+ if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
+ struct ip6_hdr *ip6;
+
+ ifa_free(&ia6->ia_ifa);
+ ip6 = mtod(m, struct ip6_hdr *);
+ icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
+ (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
+ return IPPROTO_DONE;
+ }
+
+ tcp_input(m, *offp);
+ return IPPROTO_DONE;
+}
+#endif
+
+void
+tcp_input(struct mbuf *m, int off0)
+{
+ struct tcphdr *th;
+ struct ip *ip = NULL;
+ struct ipovly *ipov;
+ struct inpcb *inp = NULL;
+ struct tcpcb *tp = NULL;
+ struct socket *so = NULL;
+ u_char *optp = NULL;
+ int optlen = 0;
+ int len, tlen, off;
+ int drop_hdrlen;
+ int thflags;
+ int rstreason = 0; /* For badport_bandlim accounting purposes */
+ uint8_t iptos;
+#ifdef IPFIREWALL_FORWARD
+ struct m_tag *fwd_tag;
+#endif
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+ int isipv6;
+#else
+ const void *ip6 = NULL;
+ const int isipv6 = 0;
+#endif
+ struct tcpopt to; /* options in this segment */
+ char *s = NULL; /* address and port logging */
+ int ti_locked;
+#define TI_UNLOCKED 1
+#define TI_RLOCKED 2
+#define TI_WLOCKED 3
+
+#ifdef TCPDEBUG
+ /*
+ * The size of tcp_saveipgen must be the size of the largest IP header,
+ * which is currently the IPv6 header.
+ */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+#endif
+
+#ifdef INET6
+ isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
+#endif
+
+ to.to_flags = 0;
+ TCPSTAT_INC(tcps_rcvtotal);
+
+ if (isipv6) {
+#ifdef INET6
+ /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */
+ ip6 = mtod(m, struct ip6_hdr *);
+ tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
+ if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
+ TCPSTAT_INC(tcps_rcvbadsum);
+ goto drop;
+ }
+ th = (struct tcphdr *)((caddr_t)ip6 + off0);
+
+ /*
+ * Be proactive about an unspecified IPv6 address in the source.
+ * As we use all-zero to indicate an unbound/unconnected pcb,
+ * an unspecified IPv6 address can be used to confuse us.
+ *
+ * Note that packets with an unspecified IPv6 destination are
+ * already dropped in ip6_input.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
+ /* XXX stat */
+ goto drop;
+ }
+#else
+ th = NULL; /* XXX: Avoid compiler warning. */
+#endif
+ } else {
+ /*
+ * Get IP and TCP header together in first mbuf.
+ * Note: IP leaves IP header in first mbuf.
+ */
+ if (off0 > sizeof (struct ip)) {
+ ip_stripoptions(m, (struct mbuf *)0);
+ off0 = sizeof(struct ip);
+ }
+ if (m->m_len < sizeof (struct tcpiphdr)) {
+ if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
+ == NULL) {
+ TCPSTAT_INC(tcps_rcvshort);
+ return;
+ }
+ }
+ ip = mtod(m, struct ip *);
+ ipov = (struct ipovly *)ip;
+ th = (struct tcphdr *)((caddr_t)ip + off0);
+ tlen = ip->ip_len;
+
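+ /*
+ * Verify the TCP checksum, reusing any work already done by the
+ * hardware; in every branch a valid segment leaves th_sum equal
+ * to zero for the check below.
+ */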
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr,
+ htonl(m->m_pkthdr.csum_data +
+ ip->ip_len +
+ IPPROTO_TCP));
+ th->th_sum ^= 0xffff;
+#ifdef TCPDEBUG
+ ipov->ih_len = (u_short)tlen;
+ ipov->ih_len = htons(ipov->ih_len);
+#endif
+ } else {
+ /*
+ * Checksum extended TCP header and data.
+ */
+ len = sizeof (struct ip) + tlen;
+ bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
+ ipov->ih_len = (u_short)tlen;
+ ipov->ih_len = htons(ipov->ih_len);
+ th->th_sum = in_cksum(m, len);
+ }
+ if (th->th_sum) {
+ TCPSTAT_INC(tcps_rcvbadsum);
+ goto drop;
+ }
+ /* Re-initialization for later version check */
+ ip->ip_v = IPVERSION;
+ }
+
+#ifdef INET6
+ if (isipv6)
+ iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ else
+#endif
+ iptos = ip->ip_tos;
+
+ /*
+ * Check that TCP offset makes sense,
+ * pull out TCP options and adjust length. XXX
+ */
+ off = th->th_off << 2;
+ if (off < sizeof (struct tcphdr) || off > tlen) {
+ TCPSTAT_INC(tcps_rcvbadoff);
+ goto drop;
+ }
+ tlen -= off; /* tlen is used instead of ti->ti_len */
+ if (off > sizeof (struct tcphdr)) {
+ if (isipv6) {
+#ifdef INET6
+ IP6_EXTHDR_CHECK(m, off0, off, );
+ ip6 = mtod(m, struct ip6_hdr *);
+ th = (struct tcphdr *)((caddr_t)ip6 + off0);
+#endif
+ } else {
+ if (m->m_len < sizeof(struct ip) + off) {
+ if ((m = m_pullup(m, sizeof (struct ip) + off))
+ == NULL) {
+ TCPSTAT_INC(tcps_rcvshort);
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ ipov = (struct ipovly *)ip;
+ th = (struct tcphdr *)((caddr_t)ip + off0);
+ }
+ }
+ optlen = off - sizeof (struct tcphdr);
+ optp = (u_char *)(th + 1);
+ }
+ thflags = th->th_flags;
+
+ /*
+ * Convert TCP protocol specific fields to host format.
+ */
+ th->th_seq = ntohl(th->th_seq);
+ th->th_ack = ntohl(th->th_ack);
+ th->th_win = ntohs(th->th_win);
+ th->th_urp = ntohs(th->th_urp);
+
+ /*
+ * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
+ */
+ drop_hdrlen = off0 + off;
+
+ /*
+ * Locate pcb for segment, which requires a lock on tcbinfo.
+ * Optimistically acquire a global read lock rather than a write lock
+ * unless header flags necessarily imply a state change. There are
+ * two cases where we might discover later we need a write lock
+ * despite the flags: ACKs moving a connection out of the syncache,
+ * and ACKs for a connection in TIMEWAIT.
+ */
+ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+ tcp_read_locking == 0) {
+ INP_INFO_WLOCK(&V_tcbinfo);
+ ti_locked = TI_WLOCKED;
+ } else {
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
+ }
+
+findpcb:
+#ifdef INVARIANTS
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ else if (ti_locked == TI_WLOCKED)
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ else
+ panic("%s: findpcb ti_locked %d\n", __func__, ti_locked);
+#endif
+
+#ifdef IPFIREWALL_FORWARD
+ /*
+ * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
+ */
+ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
+
+ if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */
+ struct sockaddr_in *next_hop;
+
+ next_hop = (struct sockaddr_in *)(fwd_tag+1);
+ /*
+ * Transparently forwarded. Pretend to be the destination.
+ * Have we already got one like this?
+ */
+ inp = in_pcblookup_hash(&V_tcbinfo,
+ ip->ip_src, th->th_sport,
+ ip->ip_dst, th->th_dport,
+ 0, m->m_pkthdr.rcvif);
+ if (!inp) {
+ /* It's new. Try to find the ambushing socket. */
+ inp = in_pcblookup_hash(&V_tcbinfo,
+ ip->ip_src, th->th_sport,
+ next_hop->sin_addr,
+ next_hop->sin_port ?
+ ntohs(next_hop->sin_port) :
+ th->th_dport,
+ INPLOOKUP_WILDCARD,
+ m->m_pkthdr.rcvif);
+ }
+ /* Remove the tag from the packet. We don't need it anymore. */
+ m_tag_delete(m, fwd_tag);
+ } else
+#endif /* IPFIREWALL_FORWARD */
+ {
+ if (isipv6) {
+#ifdef INET6
+ inp = in6_pcblookup_hash(&V_tcbinfo,
+ &ip6->ip6_src, th->th_sport,
+ &ip6->ip6_dst, th->th_dport,
+ INPLOOKUP_WILDCARD,
+ m->m_pkthdr.rcvif);
+#endif
+ } else
+ inp = in_pcblookup_hash(&V_tcbinfo,
+ ip->ip_src, th->th_sport,
+ ip->ip_dst, th->th_dport,
+ INPLOOKUP_WILDCARD,
+ m->m_pkthdr.rcvif);
+ }
+
+ /*
+ * If the INPCB does not exist then all data in the incoming
+ * segment is discarded and an appropriate RST is sent back.
+ * XXX MRT Send RST using which routing table?
+ */
+ if (inp == NULL) {
+ /*
+ * Log communication attempts to ports that are not
+ * in use.
+ */
+ if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
+ tcp_log_in_vain == 2) {
+ if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6)))
+ log(LOG_INFO, "%s; %s: Connection attempt "
+ "to closed port\n", s, __func__);
+ }
+ /*
+ * When blackholing do not respond with a RST but
+ * completely ignore the segment and drop it.
+ */
+ if ((V_blackhole == 1 && (thflags & TH_SYN)) ||
+ V_blackhole == 2)
+ goto dropunlock;
+
+ rstreason = BANDLIM_RST_CLOSEDPORT;
+ goto dropwithreset;
+ }
+ INP_WLOCK(inp);
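+ /*
+ * Remember the hardware-assigned flow ID from the received packet
+ * on this connection, unless the socket is a listening socket.
+ */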
+ if (!(inp->inp_flags & INP_HW_FLOWID)
+ && (m->m_flags & M_FLOWID)
+ && ((inp->inp_socket == NULL)
+ || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) {
+ inp->inp_flags |= INP_HW_FLOWID;
+ inp->inp_flags &= ~INP_SW_FLOWID;
+ inp->inp_flowid = m->m_pkthdr.flowid;
+ }
+#ifdef IPSEC
+#ifdef INET6
+ if (isipv6 && ipsec6_in_reject(m, inp)) {
+ V_ipsec6stat.in_polvio++;
+ goto dropunlock;
+ } else
+#endif /* INET6 */
+ if (ipsec4_in_reject(m, inp) != 0) {
+ V_ipsec4stat.in_polvio++;
+ goto dropunlock;
+ }
+#endif /* IPSEC */
+
+ /*
+ * Check the minimum TTL for socket.
+ */
+ if (inp->inp_ip_minttl != 0) {
+#ifdef INET6
+ if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim)
+ goto dropunlock;
+ else
+#endif
+ if (inp->inp_ip_minttl > ip->ip_ttl)
+ goto dropunlock;
+ }
+
+ /*
+ * A previous connection in TIMEWAIT state is supposed to catch stray
+ * or duplicate segments arriving late. If this segment was a
+ * legitimate new connection attempt the old INPCB gets removed and
+ * we can try again to find a listening socket.
+ *
+ * At this point, due to earlier optimism, we may hold a read lock on
+ * the inpcbinfo, rather than a write lock. If so, we need to
+ * upgrade, or if that fails, acquire a reference on the inpcb, drop
+ * all locks, acquire a global write lock, and then re-acquire the
+ * inpcb lock. We may at that point discover that another thread has
+ * tried to free the inpcb, in which case we need to loop back and
+ * try to find a new inpcb to deliver to.
+ */
+relocked:
+ if (inp->inp_flags & INP_TIMEWAIT) {
+ KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
+ ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked));
+
+ if (ti_locked == TI_RLOCKED) {
+ if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) {
+ in_pcbref(inp);
+ INP_WUNLOCK(inp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
+ ti_locked = TI_WLOCKED;
+ INP_WLOCK(inp);
+ if (in_pcbrele(inp)) {
+ inp = NULL;
+ goto findpcb;
+ }
+ } else
+ ti_locked = TI_WLOCKED;
+ }
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+ if (thflags & TH_SYN)
+ tcp_dooptions(&to, optp, optlen, TO_SYN);
+ /*
+ * NB: tcp_twcheck unlocks the INP and frees the mbuf.
+ */
+ if (tcp_twcheck(inp, &to, th, m, tlen))
+ goto findpcb;
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return;
+ }
+ /*
+ * The TCPCB may no longer exist if the connection is winding
+ * down or it is in the CLOSED state. Either way we drop the
+ * segment and send an appropriate response.
+ */
+ tp = intotcpcb(inp);
+ if (tp == NULL || tp->t_state == TCPS_CLOSED) {
+ rstreason = BANDLIM_RST_CLOSEDPORT;
+ goto dropwithreset;
+ }
+
+ /*
+ * We've identified a valid inpcb, but it could be that we need an
+ * inpcbinfo write lock and have only a read lock. In this case,
+ * attempt to upgrade/relock using the same strategy as the TIMEWAIT
+ * case above. If we relock, we have to jump back to 'relocked' as
+ * the connection might now be in TIMEWAIT.
+ */
+ if (tp->t_state != TCPS_ESTABLISHED ||
+ (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+ tcp_read_locking == 0) {
+ KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
+ ("%s: upgrade check ti_locked %d", __func__, ti_locked));
+
+ if (ti_locked == TI_RLOCKED) {
+ if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) {
+ in_pcbref(inp);
+ INP_WUNLOCK(inp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
+ ti_locked = TI_WLOCKED;
+ INP_WLOCK(inp);
+ if (in_pcbrele(inp)) {
+ inp = NULL;
+ goto findpcb;
+ }
+ goto relocked;
+ } else
+ ti_locked = TI_WLOCKED;
+ }
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ }
+
+#ifdef MAC
+ INP_WLOCK_ASSERT(inp);
+ if (mac_inpcb_check_deliver(inp, m))
+ goto dropunlock;
+#endif
+ so = inp->inp_socket;
+ KASSERT(so != NULL, ("%s: so == NULL", __func__));
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG) {
+ ostate = tp->t_state;
+ if (isipv6) {
+#ifdef INET6
+ bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6));
+#endif
+ } else
+ bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
+ tcp_savetcp = *th;
+ }
+#endif
+ /*
+ * When the socket is accepting connections (the INPCB is in LISTEN
+ * state) we look into the SYN cache if this is a new connection
+ * attempt or the completion of a previous one.
+ */
+ if (so->so_options & SO_ACCEPTCONN) {
+ struct in_conninfo inc;
+
+ KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but "
+ "tp not listening", __func__));
+
+ bzero(&inc, sizeof(inc));
+#ifdef INET6
+ if (isipv6) {
+ inc.inc_flags |= INC_ISIPV6;
+ inc.inc6_faddr = ip6->ip6_src;
+ inc.inc6_laddr = ip6->ip6_dst;
+ } else
+#endif
+ {
+ inc.inc_faddr = ip->ip_src;
+ inc.inc_laddr = ip->ip_dst;
+ }
+ inc.inc_fport = th->th_sport;
+ inc.inc_lport = th->th_dport;
+ inc.inc_fibnum = so->so_fibnum;
+
+ /*
+ * Check for an existing connection attempt in syncache if
+ * the flag is only ACK. A successful lookup creates a new
+ * socket appended to the listen queue in SYN_RECEIVED state.
+ */
+ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
+ /*
+ * Parse the TCP options here because
+ * syncookies need access to the reflected
+ * timestamp.
+ */
+ tcp_dooptions(&to, optp, optlen, 0);
+ /*
+ * NB: syncache_expand() doesn't unlock
+ * inp and tcpinfo locks.
+ */
+ if (!syncache_expand(&inc, &to, th, &so, m)) {
+ /*
+ * No syncache entry or ACK was not
+ * for our SYN/ACK. Send a RST.
+ * NB: syncache did its own logging
+ * of the failure cause.
+ */
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ if (so == NULL) {
+ /*
+ * We completed the 3-way handshake
+ * but could not allocate a socket
+ * either due to memory shortage,
+ * listen queue length limits or
+ * global socket limits. Send RST
+ * or wait and have the remote end
+ * retransmit the ACK for another
+ * try.
+ */
+ if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Listen socket: "
+ "Socket allocation failed due to "
+ "limits or memory shortage, %s\n",
+ s, __func__,
+ V_tcp_sc_rst_sock_fail ?
+ "sending RST" : "try again");
+ if (V_tcp_sc_rst_sock_fail) {
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ } else
+ goto dropunlock;
+ }
+ /*
+ * Socket is created in state SYN_RECEIVED.
+ * Unlock the listen socket, lock the newly
+ * created socket and update the tp variable.
+ */
+ INP_WUNLOCK(inp); /* listen socket */
+ inp = sotoinpcb(so);
+ INP_WLOCK(inp); /* new connection */
+ tp = intotcpcb(inp);
+ KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
+			    ("%s: new socket not in SYN_RECEIVED", __func__));
+ /*
+ * Process the segment and the data it
+ * contains. tcp_do_segment() consumes
+ * the mbuf chain and unlocks the inpcb.
+ */
+ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
+ iptos, ti_locked);
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ return;
+ }
+ /*
+ * Segment flag validation for new connection attempts:
+ *
+ * Our (SYN|ACK) response was rejected.
+ * Check with syncache and remove entry to prevent
+ * retransmits.
+ *
+ * NB: syncache_chkrst does its own logging of failure
+ * causes.
+ */
+ if (thflags & TH_RST) {
+ syncache_chkrst(&inc, th);
+ goto dropunlock;
+ }
+ /*
+ * We can't do anything without SYN.
+ */
+ if ((thflags & TH_SYN) == 0) {
+ if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Listen socket: "
+ "SYN is missing, segment ignored\n",
+ s, __func__);
+ TCPSTAT_INC(tcps_badsyn);
+ goto dropunlock;
+ }
+ /*
+ * (SYN|ACK) is bogus on a listen socket.
+ */
+ if (thflags & TH_ACK) {
+ if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Listen socket: "
+ "SYN|ACK invalid, segment rejected\n",
+ s, __func__);
+ syncache_badack(&inc); /* XXX: Not needed! */
+ TCPSTAT_INC(tcps_badsyn);
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ /*
+ * If the drop_synfin option is enabled, drop all
+ * segments with both the SYN and FIN bits set.
+ * This prevents e.g. nmap from identifying the
+ * TCP/IP stack.
+ * XXX: Poor reasoning. nmap has other methods
+ * and is constantly refining its stack detection
+ * strategies.
+ * XXX: This is a violation of the TCP specification
+ * and was used by RFC1644.
+ */
+ if ((thflags & TH_FIN) && V_drop_synfin) {
+ if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Listen socket: "
+ "SYN|FIN segment ignored (based on "
+ "sysctl setting)\n", s, __func__);
+ TCPSTAT_INC(tcps_badsyn);
+ goto dropunlock;
+ }
+ /*
+ * Segment's flags are (SYN) or (SYN|FIN).
+ *
+ * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
+ * as they do not affect the state of the TCP FSM.
+ * The data pointed to by TH_URG and th_urp is ignored.
+ */
+ KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
+ ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
+ KASSERT(thflags & (TH_SYN),
+ ("%s: Listen socket: TH_SYN not set", __func__));
+#ifdef INET6
+ /*
+ * If deprecated address is forbidden,
+ * we do not accept SYN to deprecated interface
+ * address to prevent any new inbound connection from
+ * getting established.
+		 * When we do not accept the SYN, we send a TCP RST
+		 * from the deprecated source address instead of silently
+		 * dropping the segment.  This is a compromise: it is much
+		 * better for the peer to receive a RST, as the RST will
+		 * be the final packet of the exchange.
+ *
+ * If we do not forbid deprecated addresses, we accept
+ * the SYN packet. RFC2462 does not suggest dropping
+ * SYN in this case.
+		 * Reading RFC 2462 section 5.5.4, it says roughly this:
+ * 1. use of deprecated addr with existing
+ * communication is okay - "SHOULD continue to be
+ * used"
+ * 2. use of it with new communication:
+ * (2a) "SHOULD NOT be used if alternate address
+ * with sufficient scope is available"
+ * (2b) nothing mentioned otherwise.
+ * Here we fall into (2b) case as we have no choice in
+ * our source address selection - we must obey the peer.
+ *
+		 * The wording in RFC 2462 is confusing, and there are
+		 * multiple descriptions of deprecated address handling -
+		 * worse, they do not say exactly the same thing.
+		 * Section 5.5.4 appears to be the clearest, so we follow it.
+ */
+ if (isipv6 && !V_ip6_use_deprecated) {
+ struct in6_ifaddr *ia6;
+
+ ia6 = ip6_getdstifaddr(m);
+ if (ia6 != NULL &&
+ (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
+ ifa_free(&ia6->ia_ifa);
+ if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Listen socket: "
+ "Connection attempt to deprecated "
+ "IPv6 address rejected\n",
+ s, __func__);
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ ifa_free(&ia6->ia_ifa);
+ }
+#endif
+ /*
+ * Basic sanity checks on incoming SYN requests:
+ * Don't respond if the destination is a link layer
+ * broadcast according to RFC1122 4.2.3.10, p. 104.
+ * If it is from this socket it must be forged.
+ * Don't respond if the source or destination is a
+ * global or subnet broad- or multicast address.
+ * Note that it is quite possible to receive unicast
+ * link-layer packets with a broadcast IP address. Use
+ * in_broadcast() to find them.
+ */
+ if (m->m_flags & (M_BCAST|M_MCAST)) {
+ if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Listen socket: "
+ "Connection attempt from broad- or multicast "
+ "link layer address ignored\n", s, __func__);
+ goto dropunlock;
+ }
+ if (isipv6) {
+#ifdef INET6
+ if (th->th_dport == th->th_sport &&
+ IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
+ if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Listen socket: "
+ "Connection attempt to/from self "
+ "ignored\n", s, __func__);
+ goto dropunlock;
+ }
+ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
+ IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
+ if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Listen socket: "
+ "Connection attempt from/to multicast "
+ "address ignored\n", s, __func__);
+ goto dropunlock;
+ }
+#endif
+ } else {
+ if (th->th_dport == th->th_sport &&
+ ip->ip_dst.s_addr == ip->ip_src.s_addr) {
+ if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Listen socket: "
+ "Connection attempt from/to self "
+ "ignored\n", s, __func__);
+ goto dropunlock;
+ }
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
+ IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
+ ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
+ in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
+ if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Listen socket: "
+ "Connection attempt from/to broad- "
+ "or multicast address ignored\n",
+ s, __func__);
+ goto dropunlock;
+ }
+ }
+ /*
+ * SYN appears to be valid. Create compressed TCP state
+ * for syncache.
+ */
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen, &tcp_savetcp, 0);
+#endif
+ tcp_dooptions(&to, optp, optlen, TO_SYN);
+ syncache_add(&inc, &to, th, inp, &so, m);
+ /*
+ * Entry added to syncache and mbuf consumed.
+ * Everything already unlocked by syncache_add().
+ */
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ return;
+ }
+
+ /*
+ * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
+ * state. tcp_do_segment() always consumes the mbuf chain, unlocks
+ * the inpcb, and unlocks pcbinfo.
+ */
+ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ return;
+
+dropwithreset:
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ else if (ti_locked == TI_WLOCKED)
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ else
+ panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
+ ti_locked = TI_UNLOCKED;
+
+ if (inp != NULL) {
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(inp);
+ } else
+ tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+ m = NULL; /* mbuf chain got consumed. */
+ goto drop;
+
+dropunlock:
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ else if (ti_locked == TI_WLOCKED)
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ else
+ panic("%s: dropunlock ti_locked %d", __func__, ti_locked);
+ ti_locked = TI_UNLOCKED;
+
+ if (inp != NULL)
+ INP_WUNLOCK(inp);
+
+drop:
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ if (s != NULL)
+ free(s, M_TCPLOG);
+ if (m != NULL)
+ m_freem(m);
+}
+
+static void
+tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
+ int ti_locked)
+{
+ int thflags, acked, ourfinisacked, needoutput = 0;
+ int rstreason, todrop, win;
+ u_long tiwin;
+ struct tcpopt to;
+
+#ifdef TCPDEBUG
+ /*
+ * The size of tcp_saveipgen must be the size of the max ip header,
+ * now IPv6.
+ */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+#endif
+ thflags = th->th_flags;
+
+	/*
+	 * Require a global write lock on tcbinfo for state-changing packets
+	 * (SYN/FIN/RST) and for connections that are not yet established;
+	 * otherwise accept either a read or a write lock, as we may have
+	 * conservatively acquired a write lock in certain cases in
+	 * tcp_input() (is this still true?).  Currently we will never enter
+	 * with no lock, so we try to drop it quickly in the common pure
+	 * ack/pure data cases.
+	 */
+ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+ tp->t_state != TCPS_ESTABLISHED) {
+ KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
+ "SYN/FIN/RST/!EST", __func__, ti_locked));
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ } else {
+#ifdef INVARIANTS
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ else if (ti_locked == TI_WLOCKED)
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ else
+ panic("%s: ti_locked %d for EST", __func__,
+ ti_locked);
+#endif
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
+ __func__));
+ KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
+ __func__));
+
+ /*
+ * Segment received on connection.
+ * Reset idle time and keep-alive timer.
+ * XXX: This should be done after segment
+ * validation to ignore broken/spoofed segs.
+ */
+ tp->t_rcvtime = ticks;
+ if (TCPS_HAVEESTABLISHED(tp->t_state))
+ tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
+
+ /*
+ * Unscale the window into a 32-bit value.
+ * For the SYN_SENT state the scale is zero.
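+	 * For example, with snd_scale 7 a raw 16-bit window of 512
+	 * advertises 512 << 7 = 65536 bytes.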
+ */
+ tiwin = th->th_win << tp->snd_scale;
+
+ /*
+ * TCP ECN processing.
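+	 * A CE mark in the IP TOS byte makes us echo ECE on subsequent
+	 * segments until the peer acknowledges it with CWR, so an incoming
+	 * CWR clears our pending ECE.  An incoming ECE is the peer's
+	 * congestion signal and may trigger a cwnd reduction via
+	 * tcp_congestion_exp().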
+ */
+ if (tp->t_flags & TF_ECN_PERMIT) {
+ if (thflags & TH_CWR)
+ tp->t_flags &= ~TF_ECN_SND_ECE;
+ switch (iptos & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->t_flags |= TF_ECN_SND_ECE;
+ TCPSTAT_INC(tcps_ecn_ce);
+ break;
+ case IPTOS_ECN_ECT0:
+ TCPSTAT_INC(tcps_ecn_ect0);
+ break;
+ case IPTOS_ECN_ECT1:
+ TCPSTAT_INC(tcps_ecn_ect1);
+ break;
+ }
+ /*
+ * Congestion experienced.
+ * Ignore if we are already trying to recover.
+ */
+ if ((thflags & TH_ECE) &&
+ SEQ_LEQ(th->th_ack, tp->snd_recover)) {
+ TCPSTAT_INC(tcps_ecn_rcwnd);
+ tcp_congestion_exp(tp);
+ }
+ }
+
+ /*
+ * Parse options on any incoming segment.
+ */
+ tcp_dooptions(&to, (u_char *)(th + 1),
+ (th->th_off << 2) - sizeof(struct tcphdr),
+ (thflags & TH_SYN) ? TO_SYN : 0);
+
+ /*
+ * If echoed timestamp is later than the current time,
+ * fall back to non RFC1323 RTT calculation. Normalize
+ * timestamp if syncookies were used when this connection
+ * was established.
+ */
+ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
+ to.to_tsecr -= tp->ts_offset;
+ if (TSTMP_GT(to.to_tsecr, ticks))
+ to.to_tsecr = 0;
+ }
+
+ /*
+ * Process options only when we get SYN/ACK back. The SYN case
+ * for incoming connections is handled in tcp_syncache.
+ * According to RFC1323 the window field in a SYN (i.e., a <SYN>
+ * or <SYN,ACK>) segment itself is never scaled.
+ * XXX this is traditional behavior, may need to be cleaned up.
+ */
+ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
+ if ((to.to_flags & TOF_SCALE) &&
+ (tp->t_flags & TF_REQ_SCALE)) {
+ tp->t_flags |= TF_RCVD_SCALE;
+ tp->snd_scale = to.to_wscale;
+ }
+ /*
+ * Initial send window. It will be updated with
+ * the next incoming segment to the scaled value.
+ */
+ tp->snd_wnd = th->th_win;
+ if (to.to_flags & TOF_TS) {
+ tp->t_flags |= TF_RCVD_TSTMP;
+ tp->ts_recent = to.to_tsval;
+ tp->ts_recent_age = ticks;
+ }
+ if (to.to_flags & TOF_MSS)
+ tcp_mss(tp, to.to_mss);
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ (to.to_flags & TOF_SACKPERM) == 0)
+ tp->t_flags &= ~TF_SACK_PERMIT;
+ }
+
+ /*
+ * Header prediction: check for the two common cases
+ * of a uni-directional data xfer. If the packet has
+ * no control flags, is in-sequence, the window didn't
+ * change and we're not retransmitting, it's a
+ * candidate. If the length is zero and the ack moved
+ * forward, we're the sender side of the xfer. Just
+ * free the data acked & wake any higher level process
+ * that was blocked waiting for space. If the length
+ * is non-zero and the ack didn't move, we're the
+ * receiver side. If we're getting packets in-order
+ * (the reassembly queue is empty), add the data to
+ * the socket buffer and note that we need a delayed ack.
+ * Make sure that the hidden state-flags are also off.
+ * Since we check for TCPS_ESTABLISHED first, it can only
+ * be TH_NEEDSYN.
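+	 * Also note that the timestamp test (TSTMP_GEQ against ts_recent)
+	 * keeps segments that would fail PAWS out of the fast path, so
+	 * they receive the full PAWS treatment below.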
+ */
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ th->th_seq == tp->rcv_nxt &&
+ (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
+ tp->snd_nxt == tp->snd_max &&
+ tiwin && tiwin == tp->snd_wnd &&
+ ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
+ LIST_EMPTY(&tp->t_segq) &&
+ ((to.to_flags & TOF_TS) == 0 ||
+ TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
+
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record the timestamp.
+ * NOTE that the test is modified according to the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to.to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = ticks;
+ tp->ts_recent = to.to_tsval;
+ }
+
+ if (tlen == 0) {
+ if (SEQ_GT(th->th_ack, tp->snd_una) &&
+ SEQ_LEQ(th->th_ack, tp->snd_max) &&
+ tp->snd_cwnd >= tp->snd_wnd &&
+ ((!V_tcp_do_newreno &&
+ !(tp->t_flags & TF_SACK_PERMIT) &&
+ tp->t_dupacks < tcprexmtthresh) ||
+ ((V_tcp_do_newreno ||
+ (tp->t_flags & TF_SACK_PERMIT)) &&
+ !IN_FASTRECOVERY(tp) &&
+ (to.to_flags & TOF_SACK) == 0 &&
+ TAILQ_EMPTY(&tp->snd_holes)))) {
+ /*
+ * This is a pure ack for outstanding data.
+ */
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ else if (ti_locked == TI_WLOCKED)
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ else
+ panic("%s: ti_locked %d on pure ACK",
+ __func__, ti_locked);
+ ti_locked = TI_UNLOCKED;
+
+ TCPSTAT_INC(tcps_predack);
+
+ /*
+ * "bad retransmit" recovery.
+ */
+ if (tp->t_rxtshift == 1 &&
+ (int)(ticks - tp->t_badrxtwin) < 0) {
+ TCPSTAT_INC(tcps_sndrexmitbad);
+ tp->snd_cwnd = tp->snd_cwnd_prev;
+ tp->snd_ssthresh =
+ tp->snd_ssthresh_prev;
+ tp->snd_recover = tp->snd_recover_prev;
+ if (tp->t_flags & TF_WASFRECOVERY)
+ ENTER_FASTRECOVERY(tp);
+ tp->snd_nxt = tp->snd_max;
+ tp->t_badrxtwin = 0;
+ }
+
+ /*
+ * Recalculate the transmit timer / rtt.
+ *
+ * Some boxes send broken timestamp replies
+ * during the SYN+ACK phase, ignore
+ * timestamps of 0 or we could calculate a
+ * huge RTT and blow up the retransmit timer.
+ */
+ if ((to.to_flags & TOF_TS) != 0 &&
+ to.to_tsecr) {
+ if (!tp->t_rttlow ||
+ tp->t_rttlow > ticks - to.to_tsecr)
+ tp->t_rttlow = ticks - to.to_tsecr;
+ tcp_xmit_timer(tp,
+ ticks - to.to_tsecr + 1);
+ } else if (tp->t_rtttime &&
+ SEQ_GT(th->th_ack, tp->t_rtseq)) {
+ if (!tp->t_rttlow ||
+ tp->t_rttlow > ticks - tp->t_rtttime)
+ tp->t_rttlow = ticks - tp->t_rtttime;
+ tcp_xmit_timer(tp,
+ ticks - tp->t_rtttime);
+ }
+ tcp_xmit_bandwidth_limit(tp, th->th_ack);
+ acked = th->th_ack - tp->snd_una;
+ TCPSTAT_INC(tcps_rcvackpack);
+ TCPSTAT_ADD(tcps_rcvackbyte, acked);
+ sbdrop(&so->so_snd, acked);
+ if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
+ SEQ_LEQ(th->th_ack, tp->snd_recover))
+ tp->snd_recover = th->th_ack - 1;
+ tp->snd_una = th->th_ack;
+ /*
+ * Pull snd_wl2 up to prevent seq wrap relative
+ * to th_ack.
+ */
+ tp->snd_wl2 = th->th_ack;
+ tp->t_dupacks = 0;
+ m_freem(m);
+ ND6_HINT(tp); /* Some progress has been made. */
+
+ /*
+ * If all outstanding data are acked, stop
+ * retransmit timer, otherwise restart timer
+ * using current (possibly backed-off) value.
+ * If process is waiting for space,
+ * wakeup/selwakeup/signal. If data
+ * are ready to send, let tcp_output
+ * decide between more output or persist.
+ */
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ if (tp->snd_una == tp->snd_max)
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ else if (!tcp_timer_active(tp, TT_PERSIST))
+ tcp_timer_activate(tp, TT_REXMT,
+ tp->t_rxtcur);
+ sowwakeup(so);
+ if (so->so_snd.sb_cc)
+ (void) tcp_output(tp);
+ goto check_delack;
+ }
+ } else if (th->th_ack == tp->snd_una &&
+ tlen <= sbspace(&so->so_rcv)) {
+ int newsize = 0; /* automatic sockbuf scaling */
+
+ /*
+ * This is a pure, in-sequence data packet with
+ * nothing on the reassembly queue and we have enough
+ * buffer space to take it.
+ */
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ else if (ti_locked == TI_WLOCKED)
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ else
+ panic("%s: ti_locked %d on pure data "
+ "segment", __func__, ti_locked);
+ ti_locked = TI_UNLOCKED;
+
+ /* Clean receiver SACK report if present */
+ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
+ tcp_clean_sackreport(tp);
+ TCPSTAT_INC(tcps_preddat);
+ tp->rcv_nxt += tlen;
+ /*
+ * Pull snd_wl1 up to prevent seq wrap relative to
+ * th_seq.
+ */
+ tp->snd_wl1 = th->th_seq;
+ /*
+ * Pull rcv_up up to prevent seq wrap relative to
+ * rcv_nxt.
+ */
+ tp->rcv_up = tp->rcv_nxt;
+ TCPSTAT_INC(tcps_rcvpack);
+ TCPSTAT_ADD(tcps_rcvbyte, tlen);
+ ND6_HINT(tp); /* Some progress has been made */
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen, &tcp_savetcp, 0);
+#endif
+ /*
+ * Automatic sizing of receive socket buffer. Often the send
+ * buffer size is not optimally adjusted to the actual network
+ * conditions at hand (delay bandwidth product). Setting the
+ * buffer size too small limits throughput on links with high
+			 * bandwidth and high delay (e.g. trans-continental/oceanic links).
+ *
+ * On the receive side the socket buffer memory is only rarely
+ * used to any significant extent. This allows us to be much
+ * more aggressive in scaling the receive socket buffer. For
+ * the case that the buffer space is actually used to a large
+ * extent and we run out of kernel memory we can simply drop
+			 * the new segments; TCP on the sender will just retransmit them
+ * later. Setting the buffer size too big may only consume too
+ * much kernel memory if the application doesn't read() from
+ * the socket or packet loss or reordering makes use of the
+ * reassembly queue.
+ *
+ * The criteria to step up the receive buffer one notch are:
+ * 1. the number of bytes received during the time it takes
+ * one timestamp to be reflected back to us (the RTT);
+			 * 2. received bytes per RTT are within seven eighths of the
+ * current socket buffer size;
+ * 3. receive buffer size has not hit maximal automatic size;
+ *
+ * This algorithm does one step per RTT at most and only if
+ * we receive a bulk stream w/o packet losses or reorderings.
+ * Shrinking the buffer during idle times is not necessary as
+ * it doesn't consume any memory when idle.
+ *
+ * TODO: Only step up if the application is actually serving
+ * the buffer to better manage the socket buffer resources.
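+			 * As a worked example: with sb_hiwat at 64 kB, more
+			 * than 56 kB (7/8 of 64 kB) received within one RTT
+			 * steps the buffer up by V_tcp_autorcvbuf_inc, capped
+			 * at V_tcp_autorcvbuf_max.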
+ */
+ if (V_tcp_do_autorcvbuf &&
+ to.to_tsecr &&
+ (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+ if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
+ to.to_tsecr - tp->rfbuf_ts < hz) {
+ if (tp->rfbuf_cnt >
+ (so->so_rcv.sb_hiwat / 8 * 7) &&
+ so->so_rcv.sb_hiwat <
+ V_tcp_autorcvbuf_max) {
+ newsize =
+ min(so->so_rcv.sb_hiwat +
+ V_tcp_autorcvbuf_inc,
+ V_tcp_autorcvbuf_max);
+ }
+ /* Start over with next RTT. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+ } else
+ tp->rfbuf_cnt += tlen; /* add up */
+ }
+
+ /* Add data to socket buffer. */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ m_freem(m);
+ } else {
+ /*
+ * Set new socket buffer size.
+ * Give up when limit is reached.
+ */
+ if (newsize)
+ if (!sbreserve_locked(&so->so_rcv,
+ newsize, so, NULL))
+ so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ sbappendstream_locked(&so->so_rcv, m);
+ }
+ /* NB: sorwakeup_locked() does an implicit unlock. */
+ sorwakeup_locked(so);
+ if (DELAY_ACK(tp)) {
+ tp->t_flags |= TF_DELACK;
+ } else {
+ tp->t_flags |= TF_ACKNOW;
+ tcp_output(tp);
+ }
+ goto check_delack;
+ }
+ }
+
+ /*
+ * Calculate amount of space in receive window,
+ * and then do TCP input processing.
+ * Receive window is amount of space in rcv queue,
+ * but not less than advertised window.
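+	 * Taking the maximum with (rcv_adv - rcv_nxt) means we never offer
+	 * less than we have already advertised, i.e. the right edge of the
+	 * receive window never moves to the left, which RFC 793 strongly
+	 * discourages.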
+ */
+ win = sbspace(&so->so_rcv);
+ if (win < 0)
+ win = 0;
+ tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+
+ /* Reset receive buffer auto scaling when not in bulk receive mode. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+
+ switch (tp->t_state) {
+
+ /*
+ * If the state is SYN_RECEIVED:
+ * if seg contains an ACK, but not for our SYN/ACK, send a RST.
+ */
+ case TCPS_SYN_RECEIVED:
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->snd_una) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ break;
+
+ /*
+ * If the state is SYN_SENT:
+ * if seg contains an ACK, but not for our SYN, drop the input.
+ * if seg contains a RST, then drop the connection.
+ * if seg does not contain SYN, then drop it.
+ * Otherwise this is an acceptable SYN segment
+ * initialize tp->rcv_nxt and tp->irs
+ * if seg contains ack then advance tp->snd_una
+ * if seg contains an ECE and ECN support is enabled, the stream
+ * is ECN capable.
+ * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
+ * arrange for segment to be acked (eventually)
+ * continue processing rest of data/controls, beginning with URG
+ */
+ case TCPS_SYN_SENT:
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->iss) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+ if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST))
+ tp = tcp_drop(tp, ECONNREFUSED);
+ if (thflags & TH_RST)
+ goto drop;
+ if (!(thflags & TH_SYN))
+ goto drop;
+
+ tp->irs = th->th_seq;
+ tcp_rcvseqinit(tp);
+ if (thflags & TH_ACK) {
+ TCPSTAT_INC(tcps_connects);
+ soisconnected(so);
+#ifdef MAC
+ mac_socketpeer_set_from_mbuf(m, so);
+#endif
+ /* Do window scaling on this connection? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ }
+ tp->rcv_adv += tp->rcv_wnd;
+ tp->snd_una++; /* SYN is acked */
+ /*
+ * If there's data, delay ACK; if there's also a FIN
+ * ACKNOW will be turned on later.
+ */
+ if (DELAY_ACK(tp) && tlen != 0)
+ tcp_timer_activate(tp, TT_DELACK,
+ tcp_delacktime);
+ else
+ tp->t_flags |= TF_ACKNOW;
+
+ if ((thflags & TH_ECE) && V_tcp_do_ecn) {
+ tp->t_flags |= TF_ECN_PERMIT;
+ TCPSTAT_INC(tcps_ecn_shs);
+ }
+
+ /*
+ * Received <SYN,ACK> in SYN_SENT[*] state.
+ * Transitions:
+ * SYN_SENT --> ESTABLISHED
+ * SYN_SENT* --> FIN_WAIT_1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tp->t_state = TCPS_FIN_WAIT_1;
+ tp->t_flags &= ~TF_NEEDFIN;
+ thflags &= ~TH_SYN;
+ } else {
+ tp->t_state = TCPS_ESTABLISHED;
+ tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
+ }
+ } else {
+ /*
+ * Received initial SYN in SYN-SENT[*] state =>
+ * simultaneous open. If segment contains CC option
+ * and there is a cached CC, apply TAO test.
+			 * If it succeeds, the connection is half-synchronized.
+ * Otherwise, do 3-way handshake:
+ * SYN-SENT -> SYN-RECEIVED
+ * SYN-SENT* -> SYN-RECEIVED*
+ * If there was no CC option, clear cached CC value.
+ */
+ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_state = TCPS_SYN_RECEIVED;
+ }
+
+ KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
+ "ti_locked %d", __func__, ti_locked));
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * Advance th->th_seq to correspond to first data byte.
+ * If data, trim to stay within window,
+ * dropping FIN if necessary.
+ */
+ th->th_seq++;
+ if (tlen > tp->rcv_wnd) {
+ todrop = tlen - tp->rcv_wnd;
+ m_adj(m, -todrop);
+ tlen = tp->rcv_wnd;
+ thflags &= ~TH_FIN;
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ }
+ tp->snd_wl1 = th->th_seq - 1;
+ tp->rcv_up = th->th_seq;
+ /*
+ * Client side of transaction: already sent SYN and data.
+ * If the remote host used T/TCP to validate the SYN,
+ * our data will be ACK'd; if so, enter normal data segment
+ * processing in the middle of step 5, ack processing.
+ * Otherwise, goto step 6.
+ */
+ if (thflags & TH_ACK)
+ goto process_ACK;
+
+ goto step6;
+
+ /*
+ * If the state is LAST_ACK or CLOSING or TIME_WAIT:
+ * do normal processing.
+ *
+ * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
+ */
+ case TCPS_LAST_ACK:
+ case TCPS_CLOSING:
+ break; /* continue normal processing */
+ }
+
+ /*
+ * States other than LISTEN or SYN_SENT.
+ * First check the RST flag and sequence number since reset segments
+ * are exempt from the timestamp and connection count tests. This
+ * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
+ * below which allowed reset segments in half the sequence space
+	 * to fall through and be processed (which gives forged reset
+ * segments with a random sequence number a 50 percent chance of
+ * killing a connection).
+ * Then check timestamp, if present.
+ * Then check the connection count, if present.
+ * Then check that at least some bytes of segment are within
+ * receive window. If segment begins before rcv_nxt,
+ * drop leading data (and SYN); if nothing left, just ack.
+ *
+ *
+ * If the RST bit is set, check the sequence number to see
+ * if this is a valid reset segment.
+ * RFC 793 page 37:
+ * In all states except SYN-SENT, all reset (RST) segments
+ * are validated by checking their SEQ-fields. A reset is
+ * valid if its sequence number is in the window.
+ * Note: this does not take into account delayed ACKs, so
+ * we should test against last_ack_sent instead of rcv_nxt.
+ * The sequence number in the reset segment is normally an
+	 * echo of our outgoing acknowledgement numbers, but some hosts
+ * send a reset with the sequence number at the rightmost edge
+ * of our receive window, and we have to handle this case.
+ * Note 2: Paul Watson's paper "Slipping in the Window" has shown
+ * that brute force RST attacks are possible. To combat this,
+ * we use a much stricter check while in the ESTABLISHED state,
+ * only accepting RSTs where the sequence number is equal to
+ * last_ack_sent. In all other states (the states in which a
+ * RST is more likely), the more permissive check is used.
+ * If we have multiple segments in flight, the initial reset
+ * segment sequence numbers will be to the left of last_ack_sent,
+ * but they will eventually catch up.
+ * In any case, it never made sense to trim reset segments to
+ * fit the receive window since RFC 1122 says:
+ * 4.2.2.12 RST Segment: RFC-793 Section 3.4
+ *
+ * A TCP SHOULD allow a received RST segment to include data.
+ *
+ * DISCUSSION
+ * It has been suggested that a RST segment could contain
+ * ASCII text that encoded and explained the cause of the
+ * RST. No standard has yet been established for such
+ * data.
+ *
+ * If the reset segment passes the sequence number test examine
+ * the state:
+ * SYN_RECEIVED STATE:
+ * If passive open, return to LISTEN state.
+ * If active open, inform user that connection was refused.
+ * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
+ * Inform user that connection was reset, and close tcb.
+ * CLOSING, LAST_ACK STATES:
+ * Close the tcb.
+ * TIME_WAIT STATE:
+ * Drop the segment - see Stevens, vol. 2, p. 964 and
+ * RFC 1337.
+ */
+ if (thflags & TH_RST) {
+ if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ switch (tp->t_state) {
+
+ case TCPS_SYN_RECEIVED:
+ so->so_error = ECONNREFUSED;
+ goto close;
+
+ case TCPS_ESTABLISHED:
+ if (V_tcp_insecure_rst == 0 &&
+ !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) &&
+ SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) &&
+ !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) {
+ TCPSTAT_INC(tcps_badrst);
+ goto drop;
+ }
+ /* FALLTHROUGH */
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ so->so_error = ECONNRESET;
+ close:
+ KASSERT(ti_locked == TI_WLOCKED,
+ ("tcp_do_segment: TH_RST 1 ti_locked %d",
+ ti_locked));
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+ tp->t_state = TCPS_CLOSED;
+ TCPSTAT_INC(tcps_drops);
+ tp = tcp_close(tp);
+ break;
+
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ KASSERT(ti_locked == TI_WLOCKED,
+ ("tcp_do_segment: TH_RST 2 ti_locked %d",
+ ti_locked));
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+ tp = tcp_close(tp);
+ break;
+ }
+ }
+ goto drop;
+ }
+
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment
+ * and it's less than ts_recent, drop it.
+ */
+ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to.to_tsval, tp->ts_recent)) {
+
+ /* Check to see if ts_recent is over 24 days old. */
+ if (ticks - tp->ts_recent_age > TCP_PAWS_IDLE) {
+ /*
+ * Invalidate ts_recent. If this segment updates
+ * ts_recent, the age will be reset later and ts_recent
+ * will get a valid value. If it does not, setting
+ * ts_recent to zero will at least satisfy the
+ * requirement that zero be placed in the timestamp
+ * echo reply when ts_recent isn't valid. The
+ * age isn't reset until we get a valid ts_recent
+ * because we don't want out-of-order segments to be
+ * dropped when ts_recent is old.
+ */
+ tp->ts_recent = 0;
+ } else {
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
+ TCPSTAT_INC(tcps_pawsdrop);
+ if (tlen)
+ goto dropafterack;
+ goto drop;
+ }
+ }
+
+ /*
+ * In the SYN-RECEIVED state, validate that the packet belongs to
+ * this connection before trimming the data to fit the receive
+ * window. Check the sequence number versus IRS since we know
+ * the sequence numbers haven't wrapped. This is a partial fix
+ * for the "LAND" DoS attack.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+
+ todrop = tp->rcv_nxt - th->th_seq;
+ if (todrop > 0) {
+ /*
+ * If this is a duplicate SYN for our current connection,
+		 * advance over it and pretend it's not a SYN.
+ */
+ if (thflags & TH_SYN && th->th_seq == tp->irs) {
+ thflags &= ~TH_SYN;
+ th->th_seq++;
+ if (th->th_urp > 1)
+ th->th_urp--;
+ else
+ thflags &= ~TH_URG;
+ todrop--;
+ }
+ /*
+ * Following if statement from Stevens, vol. 2, p. 960.
+ */
+ if (todrop > tlen
+ || (todrop == tlen && (thflags & TH_FIN) == 0)) {
+ /*
+ * Any valid FIN must be to the left of the window.
+ * At this point the FIN must be a duplicate or out
+ * of sequence; drop it.
+ */
+ thflags &= ~TH_FIN;
+
+ /*
+ * Send an ACK to resynchronize and drop any data.
+ * But keep on processing for RST or ACK.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ todrop = tlen;
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
+ } else {
+ TCPSTAT_INC(tcps_rcvpartduppack);
+ TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
+ }
+ drop_hdrlen += todrop; /* drop from the top afterwards */
+ th->th_seq += todrop;
+ tlen -= todrop;
+ if (th->th_urp > todrop)
+ th->th_urp -= todrop;
+ else {
+ thflags &= ~TH_URG;
+ th->th_urp = 0;
+ }
+ }
+
+ /*
+ * If new data are received on a connection after the
+ * user processes are gone, then RST the other end.
+ */
+ if ((so->so_state & SS_NOFDREF) &&
+ tp->t_state > TCPS_CLOSE_WAIT && tlen) {
+ char *s;
+
+		KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDREF && "
+ "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+ if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket "
+ "was closed, sending RST and removing tcpcb\n",
+ s, __func__, tcpstates[tp->t_state], tlen);
+ free(s, M_TCPLOG);
+ }
+ tp = tcp_close(tp);
+ TCPSTAT_INC(tcps_rcvafterclose);
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+
+ /*
+ * If segment ends after window, drop trailing data
+ * (and PUSH and FIN); if nothing left, just ACK.
+ */
+ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
+ if (todrop > 0) {
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ if (todrop >= tlen) {
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
+ /*
+ * If window is closed can only take segments at
+ * window edge, and have to drop data and PUSH from
+ * incoming segments. Continue processing, but
+ * remember to ack. Otherwise, drop segment
+ * and ack.
+ */
+ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
+ tp->t_flags |= TF_ACKNOW;
+ TCPSTAT_INC(tcps_rcvwinprobe);
+ } else
+ goto dropafterack;
+ } else
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ m_adj(m, -todrop);
+ tlen -= todrop;
+ thflags &= ~(TH_PUSH|TH_FIN);
+ }
+
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record its timestamp.
+ * NOTE:
+ * 1) That the test incorporates suggestions from the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ * 2) That updating only on newer timestamps interferes with
+ * our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment.
+ * 3) That we modify the segment boundary check to be
+ * Last.ACK.Sent <= SEG.SEQ + SEG.Len
+ * instead of RFC1323's
+ * Last.ACK.Sent < SEG.SEQ + SEG.Len,
+ * This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated
+ * Vol. 2 p.869. In such cases, we can still calculate the
+ * RTT correctly when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to.to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN|TH_FIN)) != 0))) {
+ tp->ts_recent_age = ticks;
+ tp->ts_recent = to.to_tsval;
+ }
+
+ /*
+ * If a SYN is in the window, then this is an
+ * error and we send an RST and drop the connection.
+ */
+ if (thflags & TH_SYN) {
+ KASSERT(ti_locked == TI_WLOCKED,
+ ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+ tp = tcp_drop(tp, ECONNRESET);
+ rstreason = BANDLIM_UNLIMITED;
+ goto drop;
+ }
+
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
+ * flag is on (half-synchronized state), then queue data for
+ * later processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_state == TCPS_SYN_RECEIVED ||
+ (tp->t_flags & TF_NEEDSYN))
+ goto step6;
+ else if (tp->t_flags & TF_ACKNOW)
+ goto dropafterack;
+ else
+ goto drop;
+ }
+
+ /*
+ * Ack processing.
+ */
+ switch (tp->t_state) {
+
+ /*
+ * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
+ * ESTABLISHED state and continue processing.
+ * The ACK was checked above.
+ */
+ case TCPS_SYN_RECEIVED:
+
+ TCPSTAT_INC(tcps_connects);
+ soisconnected(so);
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ tp->snd_wnd = tiwin;
+ }
+ /*
+ * Make transitions:
+ * SYN-RECEIVED -> ESTABLISHED
+ * SYN-RECEIVED* -> FIN-WAIT-1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tp->t_state = TCPS_FIN_WAIT_1;
+ tp->t_flags &= ~TF_NEEDFIN;
+ } else {
+ tp->t_state = TCPS_ESTABLISHED;
+ tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
+ }
+ /*
+ * If segment contains data or ACK, will call tcp_reass()
+ * later; if not, do so now to pass queued data to user.
+ */
+ if (tlen == 0 && (thflags & TH_FIN) == 0)
+ (void) tcp_reass(tp, (struct tcphdr *)0, 0,
+ (struct mbuf *)0);
+ tp->snd_wl1 = th->th_seq - 1;
+ /* FALLTHROUGH */
+
+ /*
+ * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
+ * ACKs. If the ack is in the range
+ * tp->snd_una < th->th_ack <= tp->snd_max
+ * then advance tp->snd_una to th->th_ack and drop
+ * data from the retransmission queue. If this ACK reflects
+ * more up to date window information we update our window information.
+ */
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ if (SEQ_GT(th->th_ack, tp->snd_max)) {
+ TCPSTAT_INC(tcps_rcvacktoomuch);
+ goto dropafterack;
+ }
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ ((to.to_flags & TOF_SACK) ||
+ !TAILQ_EMPTY(&tp->snd_holes)))
+ tcp_sack_doack(tp, &to, th->th_ack);
+ if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
+ if (tlen == 0 && tiwin == tp->snd_wnd) {
+ TCPSTAT_INC(tcps_rcvdupack);
+ /*
+ * If we have outstanding data (other than
+ * a window probe), this is a completely
+ * duplicate ack (ie, window info didn't
+ * change), the ack is the biggest we've
+ * seen and we've seen exactly our rexmt
+				 * threshold of them, assume a packet
+ * has been dropped and retransmit it.
+ * Kludge snd_nxt & the congestion
+ * window so we send only this one
+ * packet.
+ *
+ * We know we're losing at the current
+ * window size so do congestion avoidance
+ * (set ssthresh to half the current window
+ * and pull our congestion window back to
+ * the new ssthresh).
+ *
+ * Dup acks mean that packets have left the
+ * network (they're now cached at the receiver)
+ * so bump cwnd by the amount in the receiver
+ * to keep a constant cwnd packets in the
+ * network.
+ *
+ * When using TCP ECN, notify the peer that
+ * we reduced the cwnd.
+ */
+ if (!tcp_timer_active(tp, TT_REXMT) ||
+ th->th_ack != tp->snd_una)
+ tp->t_dupacks = 0;
+ else if (++tp->t_dupacks > tcprexmtthresh ||
+ ((V_tcp_do_newreno ||
+ (tp->t_flags & TF_SACK_PERMIT)) &&
+ IN_FASTRECOVERY(tp))) {
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ IN_FASTRECOVERY(tp)) {
+ int awnd;
+
+ /*
+ * Compute the amount of data in flight first.
+ * We can inject new data into the pipe iff
+ * we have less than 1/2 the original window's
+ * worth of data in flight.
+ */
+ awnd = (tp->snd_nxt - tp->snd_fack) +
+ tp->sackhint.sack_bytes_rexmit;
+ if (awnd < tp->snd_ssthresh) {
+ tp->snd_cwnd += tp->t_maxseg;
+ if (tp->snd_cwnd > tp->snd_ssthresh)
+ tp->snd_cwnd = tp->snd_ssthresh;
+ }
+ } else
+ tp->snd_cwnd += tp->t_maxseg;
+ (void) tcp_output(tp);
+ goto drop;
+ } else if (tp->t_dupacks == tcprexmtthresh) {
+ tcp_seq onxt = tp->snd_nxt;
+
+ /*
+ * If we're doing sack, check to
+ * see if we're already in sack
+ * recovery. If we're not doing sack,
+ * check to see if we're in newreno
+ * recovery.
+ */
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ if (IN_FASTRECOVERY(tp)) {
+ tp->t_dupacks = 0;
+ break;
+ }
+ } else if (V_tcp_do_newreno ||
+ V_tcp_do_ecn) {
+ if (SEQ_LEQ(th->th_ack,
+ tp->snd_recover)) {
+ tp->t_dupacks = 0;
+ break;
+ }
+ }
+ tcp_congestion_exp(tp);
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_rtttime = 0;
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ TCPSTAT_INC(
+ tcps_sack_recovery_episode);
+ tp->sack_newdata = tp->snd_nxt;
+ tp->snd_cwnd = tp->t_maxseg;
+ (void) tcp_output(tp);
+ goto drop;
+ }
+ tp->snd_nxt = th->th_ack;
+ tp->snd_cwnd = tp->t_maxseg;
+ (void) tcp_output(tp);
+ KASSERT(tp->snd_limited <= 2,
+ ("%s: tp->snd_limited too big",
+ __func__));
+ tp->snd_cwnd = tp->snd_ssthresh +
+ tp->t_maxseg *
+ (tp->t_dupacks - tp->snd_limited);
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+ goto drop;
+ } else if (V_tcp_do_rfc3042) {
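+					/*
+					 * RFC 3042 limited transmit: on the
+					 * first and second duplicate ACKs,
+					 * temporarily open cwnd just enough
+					 * to send one additional previously
+					 * unsent segment, then restore the
+					 * old cwnd; snd_limited tracks how
+					 * many such segments have been sent.
+					 */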
+ u_long oldcwnd = tp->snd_cwnd;
+ tcp_seq oldsndmax = tp->snd_max;
+ u_int sent;
+
+ KASSERT(tp->t_dupacks == 1 ||
+ tp->t_dupacks == 2,
+ ("%s: dupacks not 1 or 2",
+ __func__));
+ if (tp->t_dupacks == 1)
+ tp->snd_limited = 0;
+ tp->snd_cwnd =
+ (tp->snd_nxt - tp->snd_una) +
+ (tp->t_dupacks - tp->snd_limited) *
+ tp->t_maxseg;
+ (void) tcp_output(tp);
+ sent = tp->snd_max - oldsndmax;
+ if (sent > tp->t_maxseg) {
+ KASSERT((tp->t_dupacks == 2 &&
+ tp->snd_limited == 0) ||
+ (sent == tp->t_maxseg + 1 &&
+ tp->t_flags & TF_SENTFIN),
+ ("%s: sent too much",
+ __func__));
+ tp->snd_limited = 2;
+ } else if (sent > 0)
+ ++tp->snd_limited;
+ tp->snd_cwnd = oldcwnd;
+ goto drop;
+ }
+ } else
+ tp->t_dupacks = 0;
+ break;
+ }
+
+ KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
+ ("%s: th_ack <= snd_una", __func__));
+
+ /*
+ * If the congestion window was inflated to account
+ * for the other side's cached packets, retract it.
+ */
+ if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) {
+ if (IN_FASTRECOVERY(tp)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ if (tp->t_flags & TF_SACK_PERMIT)
+ tcp_sack_partialack(tp, th);
+ else
+ tcp_newreno_partial_ack(tp, th);
+ } else {
+ /*
+ * Out of fast recovery.
+ * Window inflation should have left us
+ * with approximately snd_ssthresh
+ * outstanding data.
+ * But in case we would be inclined to
+ * send a burst, better to do it via
+ * the slow start mechanism.
+ */
+ if (SEQ_GT(th->th_ack +
+ tp->snd_ssthresh,
+ tp->snd_max))
+ tp->snd_cwnd = tp->snd_max -
+ th->th_ack +
+ tp->t_maxseg;
+ else
+ tp->snd_cwnd = tp->snd_ssthresh;
+ }
+ }
+ } else {
+ if (tp->t_dupacks >= tcprexmtthresh &&
+ tp->snd_cwnd > tp->snd_ssthresh)
+ tp->snd_cwnd = tp->snd_ssthresh;
+ }
+ tp->t_dupacks = 0;
+ /*
+ * If we reach this point, ACK is not a duplicate,
+ * i.e., it ACKs something we sent.
+ */
+ if (tp->t_flags & TF_NEEDSYN) {
+ /*
+ * T/TCP: Connection was half-synchronized, and our
+ * SYN has been ACK'd (so connection is now fully
+ * synchronized). Go to non-starred state,
+ * increment snd_una for ACK of SYN, and check if
+ * we can do window scaling.
+ */
+ tp->t_flags &= ~TF_NEEDSYN;
+ tp->snd_una++;
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ /* Send window already scaled. */
+ }
+ }
+
+process_ACK:
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
+ ("tcp_input: process_ACK ti_locked %d", ti_locked));
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ acked = th->th_ack - tp->snd_una;
+ TCPSTAT_INC(tcps_rcvackpack);
+ TCPSTAT_ADD(tcps_rcvackbyte, acked);
+
+ /*
+ * If we just performed our first retransmit, and the ACK
+ * arrives within our recovery window, then it was a mistake
+ * to do the retransmit in the first place. Recover our
+ * original cwnd and ssthresh, and proceed to transmit where
+ * we left off.
+ */
+ if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) {
+ TCPSTAT_INC(tcps_sndrexmitbad);
+ tp->snd_cwnd = tp->snd_cwnd_prev;
+ tp->snd_ssthresh = tp->snd_ssthresh_prev;
+ tp->snd_recover = tp->snd_recover_prev;
+ if (tp->t_flags & TF_WASFRECOVERY)
+ ENTER_FASTRECOVERY(tp);
+ tp->snd_nxt = tp->snd_max;
+ tp->t_badrxtwin = 0; /* XXX probably not required */
+ }
+
+ /*
+ * If we have a timestamp reply, update smoothed
+ * round trip time. If no timestamp is present but
+ * transmit timer is running and timed sequence
+ * number was acked, update smoothed round trip time.
+ * Since we now have an rtt measurement, cancel the
+ * timer backoff (cf., Phil Karn's retransmit alg.).
+ * Recompute the initial retransmit timer.
+ *
+ * Some boxes send broken timestamp replies
+ * during the SYN+ACK phase, ignore
+ * timestamps of 0 or we could calculate a
+ * huge RTT and blow up the retransmit timer.
+ */
+ if ((to.to_flags & TOF_TS) != 0 &&
+ to.to_tsecr) {
+ if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr)
+ tp->t_rttlow = ticks - to.to_tsecr;
+ tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
+ } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
+ if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
+ tp->t_rttlow = ticks - tp->t_rtttime;
+ tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+ }
+ tcp_xmit_bandwidth_limit(tp, th->th_ack);
+
+ /*
+ * If all outstanding data is acked, stop retransmit
+ * timer and remember to restart (more output or persist).
+ * If there is more data to be acked, restart retransmit
+ * timer, using current (possibly backed-off) value.
+ */
+ if (th->th_ack == tp->snd_max) {
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ needoutput = 1;
+ } else if (!tcp_timer_active(tp, TT_PERSIST))
+ tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+
+ /*
+ * If no data (only SYN) was ACK'd,
+ * skip rest of ACK processing.
+ */
+ if (acked == 0)
+ goto step6;
+
+ /*
+ * When new data is acked, open the congestion window.
+ * Method depends on which congestion control state we're
+ * in (slow start or cong avoid) and if ABC (RFC 3465) is
+ * enabled.
+ *
+ * slow start: cwnd <= ssthresh
+ * cong avoid: cwnd > ssthresh
+ *
+ * slow start and ABC (RFC 3465):
+ * Grow cwnd exponentially by the amount of data
+ * ACKed capping the max increment per ACK to
+ * (abc_l_var * maxseg) bytes.
+ *
+ * slow start without ABC (RFC 2581):
+ * Grow cwnd exponentially by maxseg per ACK.
+ *
+ * cong avoid and ABC (RFC 3465):
+ * Grow cwnd linearly by maxseg per RTT for each
+ * cwnd worth of ACKed data.
+ *
+ * cong avoid without ABC (RFC 2581):
+ * Grow cwnd linearly by approximately maxseg per RTT using
+ * maxseg^2 / cwnd per ACK as the increment.
+ * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
+ * avoid capping cwnd.
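+		 * Example (cong avoid, no ABC): with maxseg 1460 and cwnd
+		 * 14600 the increment is 1460*1460/14600 ~= 146 bytes per
+		 * ACK; the roughly ten ACKs per window (one per segment)
+		 * then grow cwnd by about one maxseg per RTT.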
+ */
+ if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) ||
+ !IN_FASTRECOVERY(tp)) {
+ u_int cw = tp->snd_cwnd;
+ u_int incr = tp->t_maxseg;
+ /* In congestion avoidance? */
+ if (cw > tp->snd_ssthresh) {
+ if (V_tcp_do_rfc3465) {
+ tp->t_bytes_acked += acked;
+ if (tp->t_bytes_acked >= tp->snd_cwnd)
+ tp->t_bytes_acked -= cw;
+ else
+ incr = 0;
+ }
+ else
+ incr = max((incr * incr / cw), 1);
+ /*
+ * In slow-start with ABC enabled and no RTO in sight?
+ * (Must not use abc_l_var > 1 if slow starting after an
+ * RTO. On RTO, snd_nxt = snd_una, so the snd_nxt ==
+ * snd_max check is sufficient to handle this).
+ */
+ } else if (V_tcp_do_rfc3465 &&
+ tp->snd_nxt == tp->snd_max)
+ incr = min(acked,
+ V_tcp_abc_l_var * tp->t_maxseg);
+ /* ABC is on by default, so (incr == 0) frequently. */
+ if (incr > 0)
+ tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
+ }
+ SOCKBUF_LOCK(&so->so_snd);
+ if (acked > so->so_snd.sb_cc) {
+ tp->snd_wnd -= so->so_snd.sb_cc;
+ sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc);
+ ourfinisacked = 1;
+ } else {
+ sbdrop_locked(&so->so_snd, acked);
+ tp->snd_wnd -= acked;
+ ourfinisacked = 0;
+ }
+ /* NB: sowwakeup_locked() does an implicit unlock. */
+ sowwakeup_locked(so);
+ /* Detect una wraparound. */
+ if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
+ !IN_FASTRECOVERY(tp) &&
+ SEQ_GT(tp->snd_una, tp->snd_recover) &&
+ SEQ_LEQ(th->th_ack, tp->snd_recover))
+ tp->snd_recover = th->th_ack - 1;
+ if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
+ IN_FASTRECOVERY(tp) &&
+ SEQ_GEQ(th->th_ack, tp->snd_recover)) {
+ EXIT_FASTRECOVERY(tp);
+ tp->t_bytes_acked = 0;
+ }
+ tp->snd_una = th->th_ack;
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ if (SEQ_GT(tp->snd_una, tp->snd_recover))
+ tp->snd_recover = tp->snd_una;
+ }
+ if (SEQ_LT(tp->snd_nxt, tp->snd_una))
+ tp->snd_nxt = tp->snd_una;
+
+ switch (tp->t_state) {
+
+ /*
+ * In FIN_WAIT_1 STATE in addition to the processing
+ * for the ESTABLISHED state if our FIN is now acknowledged
+ * then enter FIN_WAIT_2.
+ */
+ case TCPS_FIN_WAIT_1:
+ if (ourfinisacked) {
+ /*
+ * If we can't receive any more
+ * data, then closing user can proceed.
+ * Starting the timer is contrary to the
+ * specification, but if we don't get a FIN
+ * we'll hang forever.
+ *
+ * XXXjl:
+ * we should release the tp also, and use a
+ * compressed state.
+ */
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ int timeout;
+
+ soisdisconnected(so);
+ timeout = (tcp_fast_finwait2_recycle) ?
+ tcp_finwait2_timeout : tcp_maxidle;
+ tcp_timer_activate(tp, TT_2MSL, timeout);
+ }
+ tp->t_state = TCPS_FIN_WAIT_2;
+ }
+ break;
+
+ /*
+ * In CLOSING STATE in addition to the processing for
+ * the ESTABLISHED state if the ACK acknowledges our FIN
+ * then enter the TIME-WAIT state, otherwise ignore
+ * the segment.
+ */
+ case TCPS_CLOSING:
+ if (ourfinisacked) {
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ tcp_twstart(tp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ m_freem(m);
+ return;
+ }
+ break;
+
+ /*
+ * In LAST_ACK, we may still be waiting for data to drain
+ * and/or to be acked, as well as for the ack of our FIN.
+ * If our FIN is now acknowledged, delete the TCB,
+ * enter the closed state and return.
+ */
+ case TCPS_LAST_ACK:
+ if (ourfinisacked) {
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ tp = tcp_close(tp);
+ goto drop;
+ }
+ break;
+ }
+ }
+
+step6:
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
+ ("tcp_do_segment: step6 ti_locked %d", ti_locked));
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * Update window information.
+	 * Don't look at window if no ACK: TACs send garbage on first SYN.
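+	 * snd_wl1/snd_wl2 remember the seq/ack of the segment that last
+	 * updated the window; per the RFC 793 SND.WL1/SND.WL2 test we only
+	 * take a window update from a segment that is at least as recent.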
+ */
+ if ((thflags & TH_ACK) &&
+ (SEQ_LT(tp->snd_wl1, th->th_seq) ||
+ (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+ (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+ /* keep track of pure window updates */
+ if (tlen == 0 &&
+ tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+ TCPSTAT_INC(tcps_rcvwinupd);
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ tp->snd_wl2 = th->th_ack;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ needoutput = 1;
+ }
+
+ /*
+ * Process segments with URG.
+ */
+ if ((thflags & TH_URG) && th->th_urp &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ /*
+ * This is a kludge, but if we receive and accept
+ * random urgent pointers, we'll crash in
+ * soreceive. It's hard to imagine someone
+ * actually wanting to send this much urgent data.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
+ th->th_urp = 0; /* XXX */
+ thflags &= ~TH_URG; /* XXX */
+ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
+ goto dodata; /* XXX */
+ }
+ /*
+ * If this segment advances the known urgent pointer,
+ * then mark the data stream. This should not happen
+ * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
+ * a FIN has been received from the remote side.
+ * In these states we ignore the URG.
+ *
+ * According to RFC961 (Assigned Protocols),
+ * the urgent pointer points to the last octet
+ * of urgent data. We continue, however,
+ * to consider it to indicate the first octet
+ * of data past the urgent section as the original
+ * spec states (in one of two places).
+ */
+ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
+ tp->rcv_up = th->th_seq + th->th_urp;
+ so->so_oobmark = so->so_rcv.sb_cc +
+ (tp->rcv_up - tp->rcv_nxt) - 1;
+ if (so->so_oobmark == 0)
+ so->so_rcv.sb_state |= SBS_RCVATMARK;
+ sohasoutofband(so);
+ tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ /*
+ * Remove out of band data so doesn't get presented to user.
+ * This can happen independent of advancing the URG pointer,
+ * but if two URG's are pending at once, some out-of-band
+ * data may creep in... ick.
+ */
+ if (th->th_urp <= (u_long)tlen &&
+ !(so->so_options & SO_OOBINLINE)) {
+ /* hdr drop is delayed */
+ tcp_pulloutofband(so, th, m, drop_hdrlen);
+ }
+ } else {
+ /*
+ * If no out of band data is expected,
+ * pull receive urgent pointer along
+ * with the receive window.
+ */
+ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
+ tp->rcv_up = tp->rcv_nxt;
+ }
+dodata: /* XXX */
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
+ ("tcp_do_segment: dodata ti_locked %d", ti_locked));
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * Process the segment text, merging it into the TCP sequencing queue,
+ * and arranging for acknowledgment of receipt if necessary.
+ * This process logically involves adjusting tp->rcv_wnd as data
+ * is presented to the user (this happens in tcp_usrreq.c,
+ * case PRU_RCVD). If a FIN has already been received on this
+ * connection then we just ignore the text.
+ */
+ if ((tlen || (thflags & TH_FIN)) &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ tcp_seq save_start = th->th_seq;
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ /*
+ * Insert segment which includes th into TCP reassembly queue
+ * with control block tp. Set thflags to whether reassembly now
+ * includes a segment with FIN. This handles the common case
+ * inline (segment is the next to be received on an established
+ * connection, and the queue is empty), avoiding linkage into
+ * and removal from the queue and repetition of various
+ * conversions.
+ * Set DELACK for segments received in order, but ack
+ * immediately when segments are out of order (so
+ * fast retransmit can work).
+ */
+ if (th->th_seq == tp->rcv_nxt &&
+ LIST_EMPTY(&tp->t_segq) &&
+ TCPS_HAVEESTABLISHED(tp->t_state)) {
+ if (DELAY_ACK(tp))
+ tp->t_flags |= TF_DELACK;
+ else
+ tp->t_flags |= TF_ACKNOW;
+ tp->rcv_nxt += tlen;
+ thflags = th->th_flags & TH_FIN;
+ TCPSTAT_INC(tcps_rcvpack);
+ TCPSTAT_ADD(tcps_rcvbyte, tlen);
+ ND6_HINT(tp);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
+ m_freem(m);
+ else
+ sbappendstream_locked(&so->so_rcv, m);
+ /* NB: sorwakeup_locked() does an implicit unlock. */
+ sorwakeup_locked(so);
+ } else {
+ /*
+			 * XXX: Due to the header drop above, "th" is
+			 * theoretically invalid by now.  Fortunately
+			 * m_adj() doesn't actually free any mbufs
+ * when trimming from the head.
+ */
+ thflags = tcp_reass(tp, th, &tlen, m);
+ tp->t_flags |= TF_ACKNOW;
+ }
+ if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
+ tcp_update_sack_list(tp, save_start, save_start + tlen);
+#if 0
+ /*
+ * Note the amount of data that peer has sent into
+ * our window, in order to estimate the sender's
+ * buffer size.
+ * XXX: Unused.
+ */
+ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
+#endif
+ } else {
+ m_freem(m);
+ thflags &= ~TH_FIN;
+ }
+
+ /*
+ * If FIN is received ACK the FIN and let the user know
+ * that the connection is closing.
+ */
+ if (thflags & TH_FIN) {
+ if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ socantrcvmore(so);
+ /*
+ * If connection is half-synchronized
+ * (ie NEEDSYN flag on) then delay ACK,
+ * so it may be piggybacked when SYN is sent.
+ * Otherwise, since we received a FIN then no
+ * more input can be expected, send ACK now.
+ */
+ if (tp->t_flags & TF_NEEDSYN)
+ tp->t_flags |= TF_DELACK;
+ else
+ tp->t_flags |= TF_ACKNOW;
+ tp->rcv_nxt++;
+ }
+ switch (tp->t_state) {
+
+ /*
+ * In SYN_RECEIVED and ESTABLISHED STATES
+ * enter the CLOSE_WAIT state.
+ */
+ case TCPS_SYN_RECEIVED:
+ tp->t_starttime = ticks;
+ /* FALLTHROUGH */
+ case TCPS_ESTABLISHED:
+ tp->t_state = TCPS_CLOSE_WAIT;
+ break;
+
+ /*
+		 * If still in FIN_WAIT_1 STATE, our FIN has not been acked, so
+ * enter the CLOSING state.
+ */
+ case TCPS_FIN_WAIT_1:
+ tp->t_state = TCPS_CLOSING;
+ break;
+
+ /*
+ * In FIN_WAIT_2 state enter the TIME_WAIT state,
+ * starting the time-wait timer, turning off the other
+ * standard timers.
+ */
+ case TCPS_FIN_WAIT_2:
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata "
+ "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
+ ti_locked));
+
+ tcp_twstart(tp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return;
+ }
+ }
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ else if (ti_locked == TI_WLOCKED)
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ else
+ panic("%s: dodata epilogue ti_locked %d", __func__,
+ ti_locked);
+ ti_locked = TI_UNLOCKED;
+
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+
+ /*
+ * Return any desired output.
+ */
+ if (needoutput || (tp->t_flags & TF_ACKNOW))
+ (void) tcp_output(tp);
+
+check_delack:
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+ __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (tp->t_flags & TF_DELACK) {
+ tp->t_flags &= ~TF_DELACK;
+ tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ }
+ INP_WUNLOCK(tp->t_inpcb);
+ return;
+
+dropafterack:
+ KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
+ ("tcp_do_segment: dropafterack ti_locked %d", ti_locked));
+
+ /*
+	 * Generate an ACK, dropping the incoming segment if it occupies
+	 * sequence space; the ACK reflects our state.
+ *
+ * We can now skip the test for the RST flag since all
+ * paths to this code happen after packets containing
+ * RST have been dropped.
+ *
+ * In the SYN-RECEIVED state, don't send an ACK unless the
+ * segment we received passes the SYN-RECEIVED ACK test.
+ * If it fails send a RST. This breaks the loop in the
+ * "LAND" DoS attack, and also prevents an ACK storm
+ * between two listening ports that have been sent forged
+ * SYN segments, each with the source address of the other.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
+ (SEQ_GT(tp->snd_una, th->th_ack) ||
+ SEQ_GT(th->th_ack, tp->snd_max)) ) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ else if (ti_locked == TI_WLOCKED)
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ else
+ panic("%s: dropafterack epilogue ti_locked %d", __func__,
+ ti_locked);
+ ti_locked = TI_UNLOCKED;
+
+ tp->t_flags |= TF_ACKNOW;
+ (void) tcp_output(tp);
+ INP_WUNLOCK(tp->t_inpcb);
+ m_freem(m);
+ return;
+
+dropwithreset:
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ else if (ti_locked == TI_WLOCKED)
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ else
+ panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
+ ti_locked = TI_UNLOCKED;
+
+ if (tp != NULL) {
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(tp->t_inpcb);
+ } else
+ tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+ return;
+
+drop:
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ else if (ti_locked == TI_WLOCKED)
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+#ifdef INVARIANTS
+ else
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+#endif
+ ti_locked = TI_UNLOCKED;
+
+ /*
+ * Drop space held by incoming segment and return.
+ */
+#ifdef TCPDEBUG
+ if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ if (tp != NULL)
+ INP_WUNLOCK(tp->t_inpcb);
+ m_freem(m);
+}
+
+/*
+ * Issue RST and make ACK acceptable to originator of segment.
+ * The mbuf must still include the original packet header.
+ * tp may be NULL.
+ */
+static void
+tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
+ int tlen, int rstreason)
+{
+ struct ip *ip;
+#ifdef INET6
+ struct ip6_hdr *ip6;
+#endif
+
+ if (tp != NULL) {
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ }
+
+ /* Don't bother if destination was broadcast/multicast. */
+ if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
+ goto drop;
+#ifdef INET6
+ if (mtod(m, struct ip *)->ip_v == 6) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
+ IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
+ goto drop;
+ /* IPv6 anycast check is done at tcp6_input() */
+ } else
+#endif
+ {
+ ip = mtod(m, struct ip *);
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
+ IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
+ ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
+ in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
+ goto drop;
+ }
+
+ /* Perform bandwidth limiting. */
+ if (badport_bandlim(rstreason) < 0)
+ goto drop;
+
+ /* tcp_respond consumes the mbuf chain. */
+ if (th->th_flags & TH_ACK) {
+ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
+ th->th_ack, TH_RST);
+ } else {
+ if (th->th_flags & TH_SYN)
+ tlen++;
+ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
+ (tcp_seq)0, TH_RST|TH_ACK);
+ }
+ return;
+drop:
+ m_freem(m);
+}
+
+/*
+ * Parse TCP options and place in tcpopt.
+ */
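+/*
+ * Each option other than EOL and NOP is encoded as <kind, length, data>;
+ * for example, an MSS of 1460 appears on the wire as the four bytes
+ * 0x02 0x04 0x05 0xb4.
+ */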
+static void
+tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
+{
+ int opt, optlen;
+
+ to->to_flags = 0;
+ for (; cnt > 0; cnt -= optlen, cp += optlen) {
+ opt = cp[0];
+ if (opt == TCPOPT_EOL)
+ break;
+ if (opt == TCPOPT_NOP)
+ optlen = 1;
+ else {
+ if (cnt < 2)
+ break;
+ optlen = cp[1];
+ if (optlen < 2 || optlen > cnt)
+ break;
+ }
+ switch (opt) {
+ case TCPOPT_MAXSEG:
+ if (optlen != TCPOLEN_MAXSEG)
+ continue;
+ if (!(flags & TO_SYN))
+ continue;
+ to->to_flags |= TOF_MSS;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_mss, sizeof(to->to_mss));
+ to->to_mss = ntohs(to->to_mss);
+ break;
+ case TCPOPT_WINDOW:
+ if (optlen != TCPOLEN_WINDOW)
+ continue;
+ if (!(flags & TO_SYN))
+ continue;
+ to->to_flags |= TOF_SCALE;
+ to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (optlen != TCPOLEN_TIMESTAMP)
+ continue;
+ to->to_flags |= TOF_TS;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_tsval, sizeof(to->to_tsval));
+ to->to_tsval = ntohl(to->to_tsval);
+ bcopy((char *)cp + 6,
+ (char *)&to->to_tsecr, sizeof(to->to_tsecr));
+ to->to_tsecr = ntohl(to->to_tsecr);
+ break;
+#ifdef TCP_SIGNATURE
+ /*
+ * XXX In order to reply to a host which has set the
+ * TCP_SIGNATURE option in its initial SYN, we have to
+ * record the fact that the option was observed here
+ * for the syncache code to perform the correct response.
+ */
+ case TCPOPT_SIGNATURE:
+ if (optlen != TCPOLEN_SIGNATURE)
+ continue;
+ to->to_flags |= TOF_SIGNATURE;
+ to->to_signature = cp + 2;
+ break;
+#endif
+ case TCPOPT_SACK_PERMITTED:
+ if (optlen != TCPOLEN_SACK_PERMITTED)
+ continue;
+ if (!(flags & TO_SYN))
+ continue;
+ if (!V_tcp_do_sack)
+ continue;
+ to->to_flags |= TOF_SACKPERM;
+ break;
+ case TCPOPT_SACK:
+ if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
+ continue;
+ if (flags & TO_SYN)
+ continue;
+ to->to_flags |= TOF_SACK;
+ to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
+ to->to_sacks = cp + 2;
+ TCPSTAT_INC(tcps_sack_rcv_blocks);
+ break;
+ default:
+ continue;
+ }
+ }
+}
+
+/*
+ * Pull out of band byte out of a segment so
+ * it doesn't appear in the user's data queue.
+ * It is still reflected in the segment length for
+ * sequencing purposes.
+ */
+static void
+tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
+ int off)
+{
+ int cnt = off + th->th_urp - 1;
+
+ while (cnt >= 0) {
+ if (m->m_len > cnt) {
+ char *cp = mtod(m, caddr_t) + cnt;
+ struct tcpcb *tp = sototcpcb(so);
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ tp->t_iobc = *cp;
+ tp->t_oobflags |= TCPOOB_HAVEDATA;
+ bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
+ m->m_len--;
+ if (m->m_flags & M_PKTHDR)
+ m->m_pkthdr.len--;
+ return;
+ }
+ cnt -= m->m_len;
+ m = m->m_next;
+ if (m == NULL)
+ break;
+ }
+ panic("tcp_pulloutofband");
+}
+
+/*
+ * Collect new round-trip time estimate
+ * and update averages and current timeout.
+ */
+static void
+tcp_xmit_timer(struct tcpcb *tp, int rtt)
+{
+ int delta;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ TCPSTAT_INC(tcps_rttupdated);
+ tp->t_rttupdated++;
+ if (tp->t_srtt != 0) {
+ /*
+ * srtt is stored as fixed point with 5 bits after the
+		 * binary point (i.e., scaled by 32). The following magic
+ * is equivalent to the smoothing algorithm in rfc793 with
+ * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
+ * point). Adjust rtt to origin 0.
+ */
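+		/*
+		 * For example, with TCP_RTT_SHIFT == 5 and TCP_DELTA_SHIFT
+		 * == 2: if t_srtt is 256 (8 ticks scaled by 32) and a
+		 * 16-tick rtt sample arrives, delta = (15 << 2) - (256 >> 3)
+		 * = 28 and t_srtt becomes 284, i.e. the estimate moves 1/8
+		 * of the way toward the new sample.
+		 */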
+ delta = ((rtt - 1) << TCP_DELTA_SHIFT)
+ - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
+
+ if ((tp->t_srtt += delta) <= 0)
+ tp->t_srtt = 1;
+
+ /*
+ * We accumulate a smoothed rtt variance (actually, a
+ * smoothed mean difference), then set the retransmit
+ * timer to smoothed rtt + 4 times the smoothed variance.
+ * rttvar is stored as fixed point with 4 bits after the
+ * binary point (scaled by 16). The following is
+ * equivalent to rfc793 smoothing with an alpha of .75
+ * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
+ * rfc793's wired-in beta.
+ */
+ if (delta < 0)
+ delta = -delta;
+ delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
+ if ((tp->t_rttvar += delta) <= 0)
+ tp->t_rttvar = 1;
+ if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+ tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
+ } else {
+ /*
+ * No rtt measurement yet - use the unsmoothed rtt.
+ * Set the variance to half the rtt (so our first
+ * retransmit happens at 3*rtt).
+ */
+ tp->t_srtt = rtt << TCP_RTT_SHIFT;
+ tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+ tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
+ }
+ tp->t_rtttime = 0;
+ tp->t_rxtshift = 0;
+
+ /*
+ * the retransmit should happen at rtt + 4 * rttvar.
+ * Because of the way we do the smoothing, srtt and rttvar
+ * will each average +1/2 tick of bias. When we compute
+ * the retransmit timer, we want 1/2 tick of rounding and
+ * 1 extra tick because of +-1/2 tick uncertainty in the
+ * firing of the timer. The bias will give us exactly the
+ * 1.5 tick we need. But, because the bias is
+ * statistical, we have to test that we don't drop below
+ * the minimum feasible timer (which is 2 ticks).
+ */
+ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
+ max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
+
+ /*
+ * We received an ack for a packet that wasn't retransmitted;
+ * it is probably safe to discard any error indications we've
+ * received recently. This isn't quite right, but close enough
+ * for now (a route might have failed after we sent a segment,
+ * and the return path might not be symmetrical).
+ */
+ tp->t_softerror = 0;
+}
+
+/*
+ * Determine a reasonable value for maxseg size.
+ * If the route is known, check route for mtu.
+ * If none, use an mss that can be handled on the outgoing
+ * interface without forcing IP to fragment; if bigger than
+ * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
+ * to utilize large mbufs. If no route is found, route has no mtu,
+ * or the destination isn't local, use a default, hopefully conservative
+ * size (usually 512 or the default IP max size, but no more than the mtu
+ * of the interface), as we can't discover anything about intervening
+ * gateways or networks. We also initialize the congestion/slow start
+ * window to be a single segment if the destination isn't local.
+ * While looking at the routing entry, we also initialize other path-dependent
+ * parameters from pre-set or cached values in the routing entry.
+ *
+ * Also take into account the space needed for options that we
+ * send regularly. Make maxseg shorter by that amount to assure
+ * that we can send maxseg amount of data even when the options
+ * are present. Store the upper limit of the length of options plus
+ * data in maxopd.
+ *
+ * In case of T/TCP, we call this routine during implicit connection
+ * setup as well (offer = -1), to initialize maxseg from the cached
+ * MSS of our peer.
+ *
+ * NOTE that this routine is only called when we process an incoming
+ * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
+ */
+void
+tcp_mss_update(struct tcpcb *tp, int offer,
+ struct hc_metrics_lite *metricptr, int *mtuflags)
+{
+ int mss;
+ u_long maxmtu;
+ struct inpcb *inp = tp->t_inpcb;
+ struct hc_metrics_lite metrics;
+ int origoffer = offer;
+#ifdef INET6
+ int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+ size_t min_protoh = isipv6 ?
+ sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
+ sizeof (struct tcpiphdr);
+#else
+ const size_t min_protoh = sizeof(struct tcpiphdr);
+#endif
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /* Initialize. */
+#ifdef INET6
+ if (isipv6) {
+ maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags);
+ tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
+ } else
+#endif
+ {
+ maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags);
+ tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
+ }
+
+ /*
+ * No route to sender, stay with default mss and return.
+ */
+ if (maxmtu == 0) {
+ /*
+ * In case we return early we need to initialize metrics
+ * to a defined state as tcp_hc_get() would do for us
+ * if there was no cache hit.
+ */
+ if (metricptr != NULL)
+ bzero(metricptr, sizeof(struct hc_metrics_lite));
+ return;
+ }
+
+ /* What have we got? */
+ switch (offer) {
+ case 0:
+ /*
+ * Offer == 0 means that there was no MSS on the SYN
+ * segment, in this case we use tcp_mssdflt as
+ * already assigned to t_maxopd above.
+ */
+ offer = tp->t_maxopd;
+ break;
+
+ case -1:
+ /*
+ * Offer == -1 means that we didn't receive SYN yet.
+ */
+ /* FALLTHROUGH */
+
+ default:
+ /*
+ * Prevent DoS attack with too small MSS. Round up
+ * to at least minmss.
+ */
+ offer = max(offer, V_tcp_minmss);
+ }
+
+ /*
+ * rmx information is now retrieved from tcp_hostcache.
+ */
+ tcp_hc_get(&inp->inp_inc, &metrics);
+ if (metricptr != NULL)
+ bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
+
+ /*
+	 * If there's a discovered MTU in the TCP hostcache, use it;
+	 * otherwise, use the link MTU.
+ */
+ if (metrics.rmx_mtu)
+ mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
+ else {
+#ifdef INET6
+ if (isipv6) {
+ mss = maxmtu - min_protoh;
+ if (!V_path_mtu_discovery &&
+ !in6_localaddr(&inp->in6p_faddr))
+ mss = min(mss, V_tcp_v6mssdflt);
+ } else
+#endif
+ {
+ mss = maxmtu - min_protoh;
+ if (!V_path_mtu_discovery &&
+ !in_localaddr(inp->inp_faddr))
+ mss = min(mss, V_tcp_mssdflt);
+ }
+ /*
+ * XXX - The above conditional (mss = maxmtu - min_protoh)
+ * probably violates the TCP spec.
+ * The problem is that, since we don't know the
+ * other end's MSS, we are supposed to use a conservative
+ * default. But, if we do that, then MTU discovery will
+ * never actually take place, because the conservative
+ * default is much less than the MTUs typically seen
+ * on the Internet today. For the moment, we'll sweep
+ * this under the carpet.
+ *
+ * The conservative default might not actually be a problem
+ * if the only case this occurs is when sending an initial
+ * SYN with options and data to a host we've never talked
+ * to before. Then, they will reply with an MSS value which
+ * will get recorded and the new parameters should get
+ * recomputed. For Further Study.
+ */
+ }
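+	/*
+	 * For example, with a 1500-byte Ethernet MTU and no hostcache MTU,
+	 * mss starts at 1500 - 40 = 1460 for IPv4 (1500 - 60 = 1440 for
+	 * IPv6) and is then limited to whatever the peer offered.
+	 */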
+ mss = min(mss, offer);
+
+ /*
+ * Sanity check: make sure that maxopd will be large
+	 * enough to allow some data on segments even if
+	 * all the option space is used (40 bytes).  Otherwise
+ * funny things may happen in tcp_output.
+ */
+ mss = max(mss, 64);
+
+ /*
+ * maxopd stores the maximum length of data AND options
+ * in a segment; maxseg is the amount of data in a normal
+ * segment. We need to store this value (maxopd) apart
+ * from maxseg, because now every segment carries options
+ * and thus we normally have somewhat less data in segments.
+ */
+ tp->t_maxopd = mss;
+
+ /*
+ * origoffer==-1 indicates that no segments were received yet.
+ * In this case we just guess.
+ */
+ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
+ (origoffer == -1 ||
+ (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
+ mss -= TCPOLEN_TSTAMP_APPA;
+
+#if (MCLBYTES & (MCLBYTES - 1)) == 0
+ if (mss > MCLBYTES)
+ mss &= ~(MCLBYTES-1);
+#else
+ if (mss > MCLBYTES)
+ mss = mss / MCLBYTES * MCLBYTES;
+#endif
+ tp->t_maxseg = mss;
+}
+
+void
+tcp_mss(struct tcpcb *tp, int offer)
+{
+ int rtt, mss;
+ u_long bufsize;
+ struct inpcb *inp;
+ struct socket *so;
+ struct hc_metrics_lite metrics;
+ int mtuflags = 0;
+#ifdef INET6
+ int isipv6;
+#endif
+ KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
+
+ tcp_mss_update(tp, offer, &metrics, &mtuflags);
+
+ mss = tp->t_maxseg;
+ inp = tp->t_inpcb;
+#ifdef INET6
+ isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+#endif
+
+ /*
+	 * If there's a pipesize, change the socket buffer to that size;
+	 * don't change it if sb_hiwat differs from the default (then it
+	 * has been changed on purpose with setsockopt()).
+ * Make the socket buffers an integral number of mss units;
+ * if the mss is larger than the socket buffer, decrease the mss.
+ */
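+	/*
+	 * For example, a 32768-byte send buffer with an mss of 1460 is
+	 * rounded up to 23 * 1460 = 33580 bytes (subject to sb_max).
+	 */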
+ so = inp->inp_socket;
+ SOCKBUF_LOCK(&so->so_snd);
+ if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
+ bufsize = metrics.rmx_sendpipe;
+ else
+ bufsize = so->so_snd.sb_hiwat;
+ if (bufsize < mss)
+ mss = bufsize;
+ else {
+ bufsize = roundup(bufsize, mss);
+ if (bufsize > sb_max)
+ bufsize = sb_max;
+ if (bufsize > so->so_snd.sb_hiwat)
+ (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL);
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+ tp->t_maxseg = mss;
+
+ SOCKBUF_LOCK(&so->so_rcv);
+ if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
+ bufsize = metrics.rmx_recvpipe;
+ else
+ bufsize = so->so_rcv.sb_hiwat;
+ if (bufsize > mss) {
+ bufsize = roundup(bufsize, mss);
+ if (bufsize > sb_max)
+ bufsize = sb_max;
+ if (bufsize > so->so_rcv.sb_hiwat)
+ (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ /*
+ * While we're here, check the others too.
+ */
+ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
+ tp->t_srtt = rtt;
+ tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
+ TCPSTAT_INC(tcps_usedrtt);
+ if (metrics.rmx_rttvar) {
+ tp->t_rttvar = metrics.rmx_rttvar;
+ TCPSTAT_INC(tcps_usedrttvar);
+ } else {
+ /* default variation is +- 1 rtt */
+ tp->t_rttvar =
+ tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+ }
+ TCPT_RANGESET(tp->t_rxtcur,
+ ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+ tp->t_rttmin, TCPTV_REXMTMAX);
+ }
+ if (metrics.rmx_ssthresh) {
+ /*
+ * There's some sort of gateway or interface
+ * buffer limit on the path. Use this to set
+		 * the slow start threshold, but set the
+ * threshold to no less than 2*mss.
+ */
+ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
+ TCPSTAT_INC(tcps_usedssthresh);
+ }
+ if (metrics.rmx_bandwidth)
+ tp->snd_bandwidth = metrics.rmx_bandwidth;
+
+ /*
+ * Set the slow-start flight size depending on whether this
+ * is a local network or not.
+ *
+ * Extend this so we cache the cwnd too and retrieve it here.
+ * Make cwnd even bigger than RFC3390 suggests but only if we
+ * have previous experience with the remote host. Be careful
+	 * not to make cwnd bigger than the remote receive window or our
+	 * own send socket buffer.  Maybe put some additional upper bound
+	 * on the retrieved cwnd.  Should do incremental updates to
+	 * hostcache when cwnd collapses so the next connection doesn't
+	 * overload the path again.
+ *
+ * XXXAO: Initializing the CWND from the hostcache is broken
+	 * and in its current form is not RFC conformant.  It is disabled
+ * until fixed or removed entirely.
+ *
+	 * RFC3390 says only do this if the SYN or SYN/ACK didn't get lost.
+ * We currently check only in syncache_socket for that.
+ */
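+	/*
+	 * The RFC 3390 initial window below is min(4*mss, max(2*mss, 4380));
+	 * e.g. a 1460-byte mss gives 4380 bytes (three segments), while a
+	 * 500-byte mss gives 2000 bytes (four segments).
+	 */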
+/* #define TCP_METRICS_CWND */
+#ifdef TCP_METRICS_CWND
+ if (metrics.rmx_cwnd)
+ tp->snd_cwnd = max(mss,
+ min(metrics.rmx_cwnd / 2,
+ min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+ else
+#endif
+ if (V_tcp_do_rfc3390)
+ tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
+#ifdef INET6
+ else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
+ (!isipv6 && in_localaddr(inp->inp_faddr)))
+#else
+ else if (in_localaddr(inp->inp_faddr))
+#endif
+ tp->snd_cwnd = mss * V_ss_fltsz_local;
+ else
+ tp->snd_cwnd = mss * V_ss_fltsz;
+
+ /* Check the interface for TSO capabilities. */
+ if (mtuflags & CSUM_TSO)
+ tp->t_flags |= TF_TSO;
+}
+
+/*
+ * Determine the MSS option to send on an outgoing SYN.
+ */
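+/*
+ * For example, with a 1500-byte interface MTU and a 1400-byte hostcache
+ * MTU, the IPv4 result is min(1500, 1400) - 40 = 1360.
+ */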
+int
+tcp_mssopt(struct in_conninfo *inc)
+{
+ int mss = 0;
+ u_long maxmtu = 0;
+ u_long thcmtu = 0;
+ size_t min_protoh;
+
+ KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
+
+#ifdef INET6
+ if (inc->inc_flags & INC_ISIPV6) {
+ mss = V_tcp_v6mssdflt;
+ maxmtu = tcp_maxmtu6(inc, NULL);
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ } else
+#endif
+ {
+ mss = V_tcp_mssdflt;
+ maxmtu = tcp_maxmtu(inc, NULL);
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+ min_protoh = sizeof(struct tcpiphdr);
+ }
+ if (maxmtu && thcmtu)
+ mss = min(maxmtu, thcmtu) - min_protoh;
+ else if (maxmtu || thcmtu)
+ mss = max(maxmtu, thcmtu) - min_protoh;
+
+ return (mss);
+}
+
+
+/*
+ * When a partial ack arrives, force the retransmission of the
+ * next unacknowledged segment.  Do not clear tp->t_dupacks.
+ * By setting snd_nxt to th_ack, this forces the retransmission timer to
+ * be started again.
+ */
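+/*
+ * Illustrative numbers: with t_maxseg 500, snd_una 1000 and th_ack 1500,
+ * snd_cwnd is temporarily set to 1000 so that tcp_output() emits exactly
+ * one segment starting at 1500; the saved cwnd is then deflated by the
+ * 500 newly acked bytes and grown by one maxseg.
+ */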
+static void
+tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
+{
+ tcp_seq onxt = tp->snd_nxt;
+ u_long ocwnd = tp->snd_cwnd;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_rtttime = 0;
+ tp->snd_nxt = th->th_ack;
+ /*
+ * Set snd_cwnd to one segment beyond acknowledged offset.
+ * (tp->snd_una has not yet been updated when this function is called.)
+ */
+ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
+ tp->t_flags |= TF_ACKNOW;
+ (void) tcp_output(tp);
+ tp->snd_cwnd = ocwnd;
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+ /*
+ * Partial window deflation. Relies on fact that tp->snd_una
+ * not updated yet.
+ */
+ if (tp->snd_cwnd > th->th_ack - tp->snd_una)
+ tp->snd_cwnd -= th->th_ack - tp->snd_una;
+ else
+ tp->snd_cwnd = 0;
+ tp->snd_cwnd += tp->t_maxseg;
+}
diff --git a/freebsd/sys/netinet/tcp_lro.c b/freebsd/sys/netinet/tcp_lro.c
new file mode 100644
index 00000000..6aaff4a5
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_lro.c
@@ -0,0 +1,389 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/******************************************************************************
+
+Copyright (c) 2007, Myricom Inc.
+Copyright (c) 2008, Intel Corporation.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Myricom Inc, nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ 3. Neither the name of the Intel Corporation, nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+$FreeBSD$
+***************************************************************************/
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/endian.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/socket.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/ethernet.h>
+#include <freebsd/net/if_media.h>
+
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_lro.h>
+
+#include <freebsd/machine/bus.h>
+#include <freebsd/machine/in_cksum.h>
+
+
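+/*
+ * Fold a 16-bit ones'-complement sum of the buffer; the result is not
+ * complemented here, callers xor it with 0xffff where a final Internet
+ * checksum is needed.  The loop consumes four bytes per iteration, so
+ * callers pass lengths that are multiples of four (the 20-byte IP header
+ * and the 4-byte-aligned TCP header).
+ */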
+static uint16_t do_csum_data(uint16_t *raw, int len)
+{
+ uint32_t csum;
+ csum = 0;
+ while (len > 0) {
+ csum += *raw;
+ raw++;
+ csum += *raw;
+ raw++;
+ len -= 4;
+ }
+ csum = (csum >> 16) + (csum & 0xffff);
+ csum = (csum >> 16) + (csum & 0xffff);
+ return (uint16_t)csum;
+}
+
+/*
+ * Allocate and init the LRO data structures
+ */
+int
+tcp_lro_init(struct lro_ctrl *cntl)
+{
+ struct lro_entry *lro;
+ int i, error = 0;
+
+ SLIST_INIT(&cntl->lro_free);
+ SLIST_INIT(&cntl->lro_active);
+
+ cntl->lro_bad_csum = 0;
+ cntl->lro_queued = 0;
+ cntl->lro_flushed = 0;
+
+ for (i = 0; i < LRO_ENTRIES; i++) {
+ lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (lro == NULL) {
+ if (i == 0)
+ error = ENOMEM;
+ break;
+ }
+ cntl->lro_cnt = i;
+ SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
+ }
+
+ return (error);
+}
+
+void
+tcp_lro_free(struct lro_ctrl *cntl)
+{
+ struct lro_entry *entry;
+
+ while (!SLIST_EMPTY(&cntl->lro_free)) {
+ entry = SLIST_FIRST(&cntl->lro_free);
+ SLIST_REMOVE_HEAD(&cntl->lro_free, next);
+ free(entry, M_DEVBUF);
+ }
+}
+
+void
+tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
+{
+ struct ifnet *ifp;
+ struct ip *ip;
+ struct tcphdr *tcp;
+ uint32_t *ts_ptr;
+ uint32_t tcplen, tcp_csum;
+
+
+ if (lro->append_cnt) {
+ /* incorporate the new len into the ip header and
+ * re-calculate the checksum */
+ ip = lro->ip;
+ ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
+ ip->ip_sum = 0;
+ ip->ip_sum = 0xffff ^
+ do_csum_data((uint16_t*)ip,
+ sizeof (*ip));
+
+ lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
+ CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ lro->m_head->m_pkthdr.csum_data = 0xffff;
+ lro->m_head->m_pkthdr.len = lro->len;
+
+ /* incorporate the latest ack into the tcp header */
+ tcp = (struct tcphdr *) (ip + 1);
+ tcp->th_ack = lro->ack_seq;
+ tcp->th_win = lro->window;
+ /* incorporate latest timestamp into the tcp header */
+ if (lro->timestamp) {
+ ts_ptr = (uint32_t *)(tcp + 1);
+ ts_ptr[1] = htonl(lro->tsval);
+ ts_ptr[2] = lro->tsecr;
+ }
+ /*
+ * update checksum in tcp header by re-calculating the
+ * tcp pseudoheader checksum, and adding it to the checksum
+ * of the tcp payload data
+ */
+ tcp->th_sum = 0;
+ tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
+ tcp_csum = lro->data_csum;
+ tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(tcplen + IPPROTO_TCP));
+ tcp_csum += do_csum_data((uint16_t*)tcp,
+ tcp->th_off << 2);
+ tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
+ tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
+ tcp->th_sum = 0xffff ^ tcp_csum;
+ }
+ ifp = cntl->ifp;
+ (*ifp->if_input)(cntl->ifp, lro->m_head);
+ cntl->lro_queued += lro->append_cnt + 1;
+ cntl->lro_flushed++;
+ lro->m_head = NULL;
+ lro->timestamp = 0;
+ lro->append_cnt = 0;
+ SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
+}
+
+int
+tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
+{
+ struct ether_header *eh;
+ struct ip *ip;
+ struct tcphdr *tcp;
+ uint32_t *ts_ptr;
+ struct mbuf *m_nxt, *m_tail;
+ struct lro_entry *lro;
+ int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
+ int opt_bytes, trim, csum_flags;
+ uint32_t seq, tmp_csum, device_mtu;
+
+
+ eh = mtod(m_head, struct ether_header *);
+ if (eh->ether_type != htons(ETHERTYPE_IP))
+ return 1;
+ ip = (struct ip *) (eh + 1);
+ if (ip->ip_p != IPPROTO_TCP)
+ return 1;
+
+ /* ensure there are no options */
+ if ((ip->ip_hl << 2) != sizeof (*ip))
+ return -1;
+
+ /* .. and the packet is not fragmented */
+ if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
+ return -1;
+
+ /* verify that the IP header checksum is correct */
+ csum_flags = m_head->m_pkthdr.csum_flags;
+ if (csum_flags & CSUM_IP_CHECKED) {
+ if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
+ cntl->lro_bad_csum++;
+ return -1;
+ }
+ } else {
+ tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
+ if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
+ cntl->lro_bad_csum++;
+ return -1;
+ }
+ }
+
+ /* find the TCP header */
+ tcp = (struct tcphdr *) (ip + 1);
+
+	/* Get the TCP checksum if we don't have it */
+ if (!csum)
+ csum = tcp->th_sum;
+
+ /* ensure no bits set besides ack or psh */
+ if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
+ return -1;
+
+	/* Check for timestamps.  Since the only option we handle is
+	   timestamps, we only have to handle the simple case of
+	   aligned timestamps. */
+
+ opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
+ tcp_hdr_len = sizeof (*tcp) + opt_bytes;
+ ts_ptr = (uint32_t *)(tcp + 1);
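+	/*
+	 * The only layout accepted below is NOP, NOP, kind 8 (timestamp),
+	 * length 10, i.e. the leading option bytes 0x01 0x01 0x08 0x0a.
+	 */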
+ if (opt_bytes != 0) {
+ if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
+ (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
+ TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
+ return -1;
+ }
+
+ ip_len = ntohs(ip->ip_len);
+ tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
+
+
+ /*
+ * If frame is padded beyond the end of the IP packet,
+ * then we must trim the extra bytes off the end.
+ */
+ tot_len = m_head->m_pkthdr.len;
+ trim = tot_len - (ip_len + ETHER_HDR_LEN);
+ if (trim != 0) {
+ if (trim < 0) {
+ /* truncated packet */
+ return -1;
+ }
+ m_adj(m_head, -trim);
+ tot_len = m_head->m_pkthdr.len;
+ }
+
+ m_nxt = m_head;
+ m_tail = NULL; /* -Wuninitialized */
+ while (m_nxt != NULL) {
+ m_tail = m_nxt;
+ m_nxt = m_tail->m_next;
+ }
+
+ hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
+ seq = ntohl(tcp->th_seq);
+
+ SLIST_FOREACH(lro, &cntl->lro_active, next) {
+ if (lro->source_port == tcp->th_sport &&
+ lro->dest_port == tcp->th_dport &&
+ lro->source_ip == ip->ip_src.s_addr &&
+ lro->dest_ip == ip->ip_dst.s_addr) {
+ /* Try to append it */
+
+ if (__predict_false(seq != lro->next_seq)) {
+ /* out of order packet */
+ SLIST_REMOVE(&cntl->lro_active, lro,
+ lro_entry, next);
+ tcp_lro_flush(cntl, lro);
+ return -1;
+ }
+
+ if (opt_bytes) {
+ uint32_t tsval = ntohl(*(ts_ptr + 1));
+ /* make sure timestamp values are increasing */
+ if (__predict_false(lro->tsval > tsval ||
+ *(ts_ptr + 2) == 0)) {
+ return -1;
+ }
+ lro->tsval = tsval;
+ lro->tsecr = *(ts_ptr + 2);
+ }
+
+ lro->next_seq += tcp_data_len;
+ lro->ack_seq = tcp->th_ack;
+ lro->window = tcp->th_win;
+ lro->append_cnt++;
+ if (tcp_data_len == 0) {
+ m_freem(m_head);
+ return 0;
+ }
+ /* subtract off the checksum of the tcp header
+ * from the hardware checksum, and add it to the
+ * stored tcp data checksum. Byteswap the checksum
+ * if the total length so far is odd
+ */
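+			/*
+			 * Adding (x ^ 0xffff) is ones'-complement
+			 * subtraction of x, as used for incremental
+			 * checksum updates (cf. RFC 1071).
+			 */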
+ tmp_csum = do_csum_data((uint16_t*)tcp,
+ tcp_hdr_len);
+ csum = csum + (tmp_csum ^ 0xffff);
+ csum = (csum & 0xffff) + (csum >> 16);
+ csum = (csum & 0xffff) + (csum >> 16);
+ if (lro->len & 0x1) {
+ /* Odd number of bytes so far, flip bytes */
+ csum = ((csum << 8) | (csum >> 8)) & 0xffff;
+ }
+ csum = csum + lro->data_csum;
+ csum = (csum & 0xffff) + (csum >> 16);
+ csum = (csum & 0xffff) + (csum >> 16);
+ lro->data_csum = csum;
+
+ lro->len += tcp_data_len;
+
+ /* adjust mbuf so that m->m_data points to
+ the first byte of the payload */
+ m_adj(m_head, hlen);
+ /* append mbuf chain */
+ lro->m_tail->m_next = m_head;
+ /* advance the last pointer */
+ lro->m_tail = m_tail;
+ /* flush packet if required */
+ device_mtu = cntl->ifp->if_mtu;
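+			/*
+			 * lro->len is later stored in the 16-bit ip_len
+			 * field, so flush while there is still room for
+			 * one more full-MTU append.
+			 */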
+ if (lro->len > (65535 - device_mtu)) {
+ SLIST_REMOVE(&cntl->lro_active, lro,
+ lro_entry, next);
+ tcp_lro_flush(cntl, lro);
+ }
+ return 0;
+ }
+ }
+
+ if (SLIST_EMPTY(&cntl->lro_free))
+ return -1;
+
+ /* start a new chain */
+ lro = SLIST_FIRST(&cntl->lro_free);
+ SLIST_REMOVE_HEAD(&cntl->lro_free, next);
+ SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
+ lro->source_port = tcp->th_sport;
+ lro->dest_port = tcp->th_dport;
+ lro->source_ip = ip->ip_src.s_addr;
+ lro->dest_ip = ip->ip_dst.s_addr;
+ lro->next_seq = seq + tcp_data_len;
+ lro->mss = tcp_data_len;
+ lro->ack_seq = tcp->th_ack;
+ lro->window = tcp->th_win;
+
+ /* save the checksum of just the TCP payload by
+ * subtracting off the checksum of the TCP header from
+	 * the entire hardware checksum.
+	 * Since the IP header checksum is correct, the checksum over
+	 * the IP header is -0.  Subtracting -0 is unnecessary.
+ */
+ tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
+ csum = csum + (tmp_csum ^ 0xffff);
+ csum = (csum & 0xffff) + (csum >> 16);
+ csum = (csum & 0xffff) + (csum >> 16);
+ lro->data_csum = csum;
+
+ lro->ip = ip;
+ /* record timestamp if it is present */
+ if (opt_bytes) {
+ lro->timestamp = 1;
+ lro->tsval = ntohl(*(ts_ptr + 1));
+ lro->tsecr = *(ts_ptr + 2);
+ }
+ lro->len = tot_len;
+ lro->m_head = m_head;
+ lro->m_tail = m_tail;
+ return 0;
+}
diff --git a/freebsd/sys/netinet/tcp_lro.h b/freebsd/sys/netinet/tcp_lro.h
new file mode 100644
index 00000000..20cfb7cf
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_lro.h
@@ -0,0 +1,85 @@
+/*******************************************************************************
+
+Copyright (c) 2006, Myricom Inc.
+Copyright (c) 2008, Intel Corporation.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Myricom Inc, nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ 2. Neither the name of the Intel Corporation, nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+$FreeBSD$
+
+***************************************************************************/
+#ifndef _TCP_LRO_HH_
+#define _TCP_LRO_HH_
+
+struct lro_entry;
+struct lro_entry
+{
+ SLIST_ENTRY(lro_entry) next;
+ struct mbuf *m_head;
+ struct mbuf *m_tail;
+ int timestamp;
+ struct ip *ip;
+ uint32_t tsval;
+ uint32_t tsecr;
+ uint32_t source_ip;
+ uint32_t dest_ip;
+ uint32_t next_seq;
+ uint32_t ack_seq;
+ uint32_t len;
+ uint32_t data_csum;
+ uint16_t window;
+ uint16_t source_port;
+ uint16_t dest_port;
+ uint16_t append_cnt;
+ uint16_t mss;
+
+};
+SLIST_HEAD(lro_head, lro_entry);
+
+struct lro_ctrl {
+ struct ifnet *ifp;
+ int lro_queued;
+ int lro_flushed;
+ int lro_bad_csum;
+ int lro_cnt;
+
+ struct lro_head lro_active;
+ struct lro_head lro_free;
+};
+
+
+int tcp_lro_init(struct lro_ctrl *);
+void tcp_lro_free(struct lro_ctrl *);
+void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *);
+int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t);
+
+/* Number of LRO entries - these are per rx queue */
+#define LRO_ENTRIES 8
+
+#endif /* _TCP_LRO_HH_ */
diff --git a/freebsd/sys/netinet/tcp_offload.c b/freebsd/sys/netinet/tcp_offload.c
new file mode 100644
index 00000000..9c73992b
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_offload.c
@@ -0,0 +1,147 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2007, Chelsio Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of the Chelsio Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/types.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/if_types.h>
+#include <freebsd/net/if_var.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/tcp_offload.h>
+#include <freebsd/netinet/toedev.h>
+
+uint32_t toedev_registration_count;
+
+int
+tcp_offload_connect(struct socket *so, struct sockaddr *nam)
+{
+ struct ifnet *ifp;
+ struct toedev *tdev;
+ struct rtentry *rt;
+ int error;
+
+ if (toedev_registration_count == 0)
+ return (EINVAL);
+
+ /*
+ * Look up the route used for the connection to
+ * determine if it uses an interface capable of
+ * offloading the connection.
+ */
+ rt = rtalloc1(nam, 0 /*report*/, 0 /*ignflags*/);
+ if (rt)
+ RT_UNLOCK(rt);
+ else
+ return (EHOSTUNREACH);
+
+ ifp = rt->rt_ifp;
+ if ((ifp->if_capenable & IFCAP_TOE) == 0) {
+ error = EINVAL;
+ goto fail;
+ }
+
+ tdev = TOEDEV(ifp);
+ if (tdev == NULL) {
+ error = EPERM;
+ goto fail;
+ }
+
+ if (tdev->tod_can_offload(tdev, so) == 0) {
+ error = EPERM;
+ goto fail;
+ }
+
+ return (tdev->tod_connect(tdev, so, rt, nam));
+fail:
+ RTFREE(rt);
+ return (error);
+}
+
+
+/*
+ * This file contains code as a short-term staging area before it is moved in
+ * to sys/netinet/tcp_offload.c
+ */
+
+void
+tcp_offload_twstart(struct tcpcb *tp)
+{
+
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(tp->t_inpcb);
+ tcp_twstart(tp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+}
+
+struct tcpcb *
+tcp_offload_close(struct tcpcb *tp)
+{
+
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(tp->t_inpcb);
+ tp = tcp_close(tp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (tp)
+ INP_WUNLOCK(tp->t_inpcb);
+
+ return (tp);
+}
+
+struct tcpcb *
+tcp_offload_drop(struct tcpcb *tp, int error)
+{
+
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(tp->t_inpcb);
+ tp = tcp_drop(tp, error);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (tp)
+ INP_WUNLOCK(tp->t_inpcb);
+
+ return (tp);
+}
+
diff --git a/freebsd/sys/netinet/tcp_offload.h b/freebsd/sys/netinet/tcp_offload.h
new file mode 100644
index 00000000..f2a35a58
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_offload.h
@@ -0,0 +1,354 @@
+/*-
+ * Copyright (c) 2007, Chelsio Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of the Chelsio Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_OFFLOAD_HH_
+#define _NETINET_TCP_OFFLOAD_HH_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+/*
+ * A driver publishes that it provides offload services
+ * by setting IFCAP_TOE in the ifnet. The offload connect
+ * will bypass any further work if the interface that a
+ * connection would use does not support TCP offload.
+ *
+ * The TOE API assumes that the TCP offload engine can offload the
+ * entire connection from setup to teardown, with some provision
+ * being made to allow the software stack to handle time wait.  If
+ * the device does not meet these criteria, it is the driver's responsibility
+ * to overload the functions that it needs to in tcp_usrreqs and make
+ * its own calls to tcp_output if it needs to do so.
+ *
+ * There is currently no provision for the device advertising the congestion
+ * control algorithms it supports as there is currently no API for querying
+ * an operating system for the protocols that it has loaded. This is a desirable
+ * future extension.
+ *
+ *
+ *
+ * It is assumed that individuals deploying TOE will want connections
+ * to be offloaded without software changes so all connections on an
+ * interface providing TOE are offloaded unless the SO_NO_OFFLOAD
+ * flag is set on the socket.
+ *
+ *
+ * The toe_usrreqs structure constitutes the TOE driver's
+ * interface to the TCP stack for functionality that doesn't
+ * interact directly with userspace. If one wants to provide
+ * (optional) functionality to do zero-copy to/from
+ * userspace one still needs to override soreceive/sosend
+ * with functions that fault in and pin the user buffers.
+ *
+ * + tu_send
+ * - tells the driver that new data may have been added to the
+ * socket's send buffer - the driver should not fail if the
+ * buffer is in fact unchanged
+ * - the driver is responsible for providing credits (bytes in the send window)
+ * back to the socket by calling sbdrop() as segments are acknowledged.
+ * - The driver expects the inpcb lock to be held - the driver is expected
+ * not to drop the lock. Hence the driver is not allowed to acquire the
+ * pcbinfo lock during this call.
+ *
+ * + tu_rcvd
+ * - returns credits to the driver and triggers window updates
+ * to the peer (a credit as used here is a byte in the peer's receive window)
+ * - the driver is expected to determine how many bytes have been
+ * consumed and credit that back to the card so that it can grow
+ * the window again by maintaining its own state between invocations.
+ * - In principle this could be used to shrink the window as well as
+ * grow the window, although it is not used for that now.
+ * - this function needs to correctly handle being called any number of
+ * times without any bytes being consumed from the receive buffer.
+ * - The driver expects the inpcb lock to be held - the driver is expected
+ * not to drop the lock. Hence the driver is not allowed to acquire the
+ * pcbinfo lock during this call.
+ *
+ * + tu_disconnect
+ * - tells the driver to send FIN to peer
+ * - driver is expected to send the remaining data and then do a clean half close
+ * - disconnect implies at least half-close so only send, reset, and detach
+ * are legal
+ * - the driver is expected to handle transition through the shutdown
+ * state machine and allow the stack to support SO_LINGER.
+ * - The driver expects the inpcb lock to be held - the driver is expected
+ * not to drop the lock. Hence the driver is not allowed to acquire the
+ * pcbinfo lock during this call.
+ *
+ * + tu_reset
+ * - closes the connection and sends a RST to peer
+ *   - driver is expected to trigger an RST and detach the toepcb
+ * - no further calls are legal after reset
+ * - The driver expects the inpcb lock to be held - the driver is expected
+ * not to drop the lock. Hence the driver is not allowed to acquire the
+ * pcbinfo lock during this call.
+ *
+ * The following fields in the tcpcb are expected to be referenced by the driver:
+ * + iss
+ * + rcv_nxt
+ * + rcv_wnd
+ * + snd_isn
+ * + snd_max
+ * + snd_nxt
+ * + snd_una
+ * + t_flags
+ * + t_inpcb
+ * + t_maxseg
+ * + t_toe
+ *
+ * The following fields in the inpcb are expected to be referenced by the driver:
+ * + inp_lport
+ * + inp_fport
+ * + inp_laddr
+ * + inp_fport
+ * + inp_socket
+ * + inp_ip_tos
+ *
+ * The following fields in the socket are expected to be referenced by the
+ * driver:
+ * + so_comp
+ * + so_error
+ * + so_linger
+ * + so_options
+ * + so_rcv
+ * + so_snd
+ * + so_state
+ * + so_timeo
+ *
+ * These functions all return 0 on success and can return the following errors
+ * as appropriate:
+ * + EPERM:
+ * + ENOBUFS: memory allocation failed
+ * + EMSGSIZE: MTU changed during the call
+ * + EHOSTDOWN:
+ * + EHOSTUNREACH:
+ * + ENETDOWN:
+ * * ENETUNREACH: the peer is no longer reachable
+ *
+ * + tu_detach
+ * - tells driver that the socket is going away so disconnect
+ * the toepcb and free appropriate resources
+ * - allows the driver to cleanly handle the case of connection state
+ * outliving the socket
+ * - no further calls are legal after detach
+ * - the driver is expected to provide its own synchronization between
+ * detach and receiving new data.
+ *
+ * + tu_syncache_event
+ * - even if it is not actually needed, the driver is expected to
+ * call syncache_add for the initial SYN and then syncache_expand
+ * for the SYN,ACK
+ * - tells driver that a connection either has not been added or has
+ * been dropped from the syncache
+ * - the driver is expected to maintain state that lives outside the
+ * software stack so the syncache needs to be able to notify the
+ * toe driver that the software stack is not going to create a connection
+ * for a received SYN
+ * - The driver is responsible for any synchronization required between
+ * the syncache dropping an entry and the driver processing the SYN,ACK.
+ *
+ */
+struct toe_usrreqs {
+ int (*tu_send)(struct tcpcb *tp);
+ int (*tu_rcvd)(struct tcpcb *tp);
+ int (*tu_disconnect)(struct tcpcb *tp);
+ int (*tu_reset)(struct tcpcb *tp);
+ void (*tu_detach)(struct tcpcb *tp);
+ void (*tu_syncache_event)(int event, void *toep);
+};
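+
+/*
+ * A minimal sketch (illustrative only, with hypothetical xyz_* driver
+ * functions) of how an offload driver might publish these hooks:
+ *
+ *	static struct toe_usrreqs xyz_usrreqs = {
+ *		.tu_send		= xyz_send,
+ *		.tu_rcvd		= xyz_rcvd,
+ *		.tu_disconnect		= xyz_disconnect,
+ *		.tu_reset		= xyz_reset,
+ *		.tu_detach		= xyz_detach,
+ *		.tu_syncache_event	= xyz_syncache_event,
+ *	};
+ */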
+
+/*
+ * Proxy for struct tcpopt between TOE drivers and TCP functions.
+ */
+struct toeopt {
+ u_int64_t to_flags; /* see tcpopt in tcp_var.h */
+ u_int16_t to_mss; /* maximum segment size */
+ u_int8_t to_wscale; /* window scaling */
+
+ u_int8_t _pad1; /* explicit pad for 64bit alignment */
+ u_int32_t _pad2; /* explicit pad for 64bit alignment */
+ u_int64_t _pad3[4]; /* TBD */
+};
+
+#define TOE_SC_ENTRY_PRESENT 1 /* 4-tuple already present */
+#define TOE_SC_DROP 2 /* connection was timed out */
+
+/*
+ * Because listen is a one-to-many relationship (a socket can be listening
+ * on all interfaces on a machine, some of which may be using different TCP
+ * offload devices), listen uses a publish/subscribe mechanism. The TCP
+ * offload driver registers a listen notification function with the stack.
+ * When a listen socket is created all TCP offload devices are notified
+ * so that they can do the appropriate set up to offload connections on the
+ * port to which the socket is bound. When the listen socket is closed,
+ * the offload devices are notified so that they will stop listening on that
+ * port and free any associated resources as well as sending RSTs on any
+ * connections in the SYN_RCVD state.
+ *
+ */
+
+typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
+typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
+
+EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
+EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
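+
+/*
+ * A TOE driver typically subscribes with something like (hypothetical
+ * xyz_listen_start handler and softc argument):
+ *
+ *	EVENTHANDLER_REGISTER(tcp_offload_listen_start,
+ *	    xyz_listen_start, sc, EVENTHANDLER_PRI_ANY);
+ */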
+
+/*
+ * Check if the socket can be offloaded by the following steps:
+ * - determine the egress interface
+ * - check the interface for TOE capability and TOE is enabled
+ * - check if the device has resources to offload the connection
+ */
+int tcp_offload_connect(struct socket *so, struct sockaddr *nam);
+
+/*
+ * The tcp_output_* routines are wrappers around the toe_usrreqs calls
+ * which trigger packet transmission. In the non-offloaded case they
+ * translate to tcp_output. The tcp_offload_* routines notify TOE
+ * of specific events.  In the non-offloaded case they are no-ops.
+ *
+ * Listen is a special case because it is a 1 to many relationship
+ * and there can be more than one offload driver in the system.
+ */
+
+/*
+ * Connection is offloaded
+ */
+#define tp_offload(tp) ((tp)->t_flags & TF_TOE)
+
+/*
+ * hackish way of allowing this file to also be included by TOE
+ * which needs to be kept ignorant of socket implementation details
+ */
+#ifdef _SYS_SOCKETVAR_HH_
+/*
+ * The socket has not been marked as "do not offload"
+ */
+#define SO_OFFLOADABLE(so) ((so->so_options & SO_NO_OFFLOAD) == 0)
+
+static __inline int
+tcp_output_connect(struct socket *so, struct sockaddr *nam)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ int error;
+
+ /*
+ * If offload has been disabled for this socket or the
+ * connection cannot be offloaded just call tcp_output
+ * to start the TCP state machine.
+ */
+#ifndef TCP_OFFLOAD_DISABLE
+ if (!SO_OFFLOADABLE(so) || (error = tcp_offload_connect(so, nam)) != 0)
+#endif
+ error = tcp_output(tp);
+ return (error);
+}
+
+static __inline int
+tcp_output_send(struct tcpcb *tp)
+{
+
+#ifndef TCP_OFFLOAD_DISABLE
+ if (tp_offload(tp))
+ return (tp->t_tu->tu_send(tp));
+#endif
+ return (tcp_output(tp));
+}
+
+static __inline int
+tcp_output_rcvd(struct tcpcb *tp)
+{
+
+#ifndef TCP_OFFLOAD_DISABLE
+ if (tp_offload(tp))
+ return (tp->t_tu->tu_rcvd(tp));
+#endif
+ return (tcp_output(tp));
+}
+
+static __inline int
+tcp_output_disconnect(struct tcpcb *tp)
+{
+
+#ifndef TCP_OFFLOAD_DISABLE
+ if (tp_offload(tp))
+ return (tp->t_tu->tu_disconnect(tp));
+#endif
+ return (tcp_output(tp));
+}
+
+static __inline int
+tcp_output_reset(struct tcpcb *tp)
+{
+
+#ifndef TCP_OFFLOAD_DISABLE
+ if (tp_offload(tp))
+ return (tp->t_tu->tu_reset(tp));
+#endif
+ return (tcp_output(tp));
+}
+
+static __inline void
+tcp_offload_detach(struct tcpcb *tp)
+{
+
+#ifndef TCP_OFFLOAD_DISABLE
+ if (tp_offload(tp))
+ tp->t_tu->tu_detach(tp);
+#endif
+}
+
+static __inline void
+tcp_offload_listen_open(struct tcpcb *tp)
+{
+
+#ifndef TCP_OFFLOAD_DISABLE
+ if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket))
+ EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
+#endif
+}
+
+static __inline void
+tcp_offload_listen_close(struct tcpcb *tp)
+{
+
+#ifndef TCP_OFFLOAD_DISABLE
+ EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
+#endif
+}
+#undef SO_OFFLOADABLE
+#endif /* _SYS_SOCKETVAR_HH_ */
+#undef tp_offload
+
+void tcp_offload_twstart(struct tcpcb *tp);
+struct tcpcb *tcp_offload_close(struct tcpcb *tp);
+struct tcpcb *tcp_offload_drop(struct tcpcb *tp, int error);
+
+#endif /* _NETINET_TCP_OFFLOAD_HH_ */
diff --git a/freebsd/sys/netinet/tcp_output.c b/freebsd/sys/netinet/tcp_output.c
new file mode 100644
index 00000000..bebab1f1
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_output.c
@@ -0,0 +1,1485 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_ipsec.h>
+#include <freebsd/local/opt_tcpdebug.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/domain.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/mutex.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sysctl.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_options.h>
+#ifdef INET6
+#include <freebsd/netinet6/in6_pcb.h>
+#include <freebsd/netinet/ip6.h>
+#include <freebsd/netinet6/ip6_var.h>
+#endif
+#include <freebsd/netinet/tcp.h>
+#define TCPOUTFLAGS
+#include <freebsd/netinet/tcp_fsm.h>
+#include <freebsd/netinet/tcp_seq.h>
+#include <freebsd/netinet/tcp_timer.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <freebsd/netinet/tcp_debug.h>
+#endif
+
+#ifdef IPSEC
+#include <freebsd/netipsec/ipsec.h>
+#endif /*IPSEC*/
+
+#include <freebsd/machine/in_cksum.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+#ifdef notyet
+extern struct mbuf *m_copypack();
+#endif
+
+VNET_DEFINE(int, path_mtu_discovery) = 1;
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
+ &VNET_NAME(path_mtu_discovery), 1,
+ "Enable Path MTU Discovery");
+
+VNET_DEFINE(int, ss_fltsz) = 1;
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
+ &VNET_NAME(ss_fltsz), 1,
+ "Slow start flight size");
+
+VNET_DEFINE(int, ss_fltsz_local) = 4;
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize,
+ CTLFLAG_RW, &VNET_NAME(ss_fltsz_local), 1,
+ "Slow start flight size for local networks");
+
+VNET_DEFINE(int, tcp_do_newreno) = 1;
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_newreno), 0,
+ "Enable NewReno Algorithms");
+
+VNET_DEFINE(int, tcp_do_tso) = 1;
+#define V_tcp_do_tso VNET(tcp_do_tso)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_tso), 0,
+ "Enable TCP Segmentation Offload");
+
+VNET_DEFINE(int, tcp_do_autosndbuf) = 1;
+#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_autosndbuf), 0,
+ "Enable automatic send buffer sizing");
+
+VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
+#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
+ &VNET_NAME(tcp_autosndbuf_inc), 0,
+ "Incrementor step size of automatic send buffer");
+
+VNET_DEFINE(int, tcp_autosndbuf_max) = 256*1024;
+#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
+ &VNET_NAME(tcp_autosndbuf_max), 0,
+ "Max size of automatic send buffer");
+
+
+/*
+ * TCP output routine: figure out what should be sent and send it.
+ */
+int
+tcp_output(struct tcpcb *tp)
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+ long len, recwin, sendwin;
+ int off, flags, error, rw;
+ struct mbuf *m;
+ struct ip *ip = NULL;
+ struct ipovly *ipov = NULL;
+ struct tcphdr *th;
+ u_char opt[TCP_MAXOLEN];
+ unsigned ipoptlen, optlen, hdrlen;
+#ifdef IPSEC
+ unsigned ipsec_optlen = 0;
+#endif
+ int idle, sendalot;
+ int sack_rxmit, sack_bytes_rxmt;
+ struct sackhole *p;
+ int tso;
+ struct tcpopt to;
+#if 0
+ int maxburst = TCP_MAXBURST;
+#endif
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+ int isipv6;
+
+ isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
+#endif
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * Determine length of data that should be transmitted,
+ * and flags that will be used.
+ * If there is some data or critical controls (SYN, RST)
+ * to send, then transmit; otherwise, investigate further.
+ */
+ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
+ if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) {
+ /*
+ * If we've been idle for more than one retransmit
+ * timeout the old congestion window is no longer
+ * current and we have to reduce it to the restart
+ * window before we can transmit again.
+ *
+ * The restart window is the initial window or the last
+ * CWND, whichever is smaller.
+ *
+ * This is done to prevent us from flooding the path with
+ * a full CWND at wirespeed, overloading router and switch
+ * buffers along the way.
+ *
+ * See RFC5681 Section 4.1. "Restarting Idle Connections".
+ */
+ if (V_tcp_do_rfc3390)
+ rw = min(4 * tp->t_maxseg,
+ max(2 * tp->t_maxseg, 4380));
+#ifdef INET6
+ else if ((isipv6 ? in6_localaddr(&tp->t_inpcb->in6p_faddr) :
+ in_localaddr(tp->t_inpcb->inp_faddr)))
+#else
+ else if (in_localaddr(tp->t_inpcb->inp_faddr))
+#endif
+ rw = V_ss_fltsz_local * tp->t_maxseg;
+ else
+ rw = V_ss_fltsz * tp->t_maxseg;
+
+ tp->snd_cwnd = min(rw, tp->snd_cwnd);
+ }
+ tp->t_flags &= ~TF_LASTIDLE;
+ if (idle) {
+ if (tp->t_flags & TF_MORETOCOME) {
+ tp->t_flags |= TF_LASTIDLE;
+ idle = 0;
+ }
+ }
+again:
+ /*
+ * If we've recently taken a timeout, snd_max will be greater than
+ * snd_nxt. There may be SACK information that allows us to avoid
+ * resending already delivered data. Adjust snd_nxt accordingly.
+ */
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ SEQ_LT(tp->snd_nxt, tp->snd_max))
+ tcp_sack_adjust(tp);
+ sendalot = 0;
+ tso = 0;
+ off = tp->snd_nxt - tp->snd_una;
+ sendwin = min(tp->snd_wnd, tp->snd_cwnd);
+ sendwin = min(sendwin, tp->snd_bwnd);
+
+ flags = tcp_outflags[tp->t_state];
+ /*
+ * Send any SACK-generated retransmissions. If we're explicitly trying
+ * to send out new data (when sendalot is 1), bypass this function.
+ * If we retransmit in fast recovery mode, decrement snd_cwnd, since
+ * we're replacing a (future) new transmission with a retransmission
+ * now, and we previously incremented snd_cwnd in tcp_input().
+ */
+ /*
+ * Still in SACK recovery, reset the rxmit flag to zero.
+ */
+ sack_rxmit = 0;
+ sack_bytes_rxmt = 0;
+ len = 0;
+ p = NULL;
+ if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) &&
+ (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
+ long cwin;
+
+ cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
+ if (cwin < 0)
+ cwin = 0;
+ /* Do not retransmit SACK segments beyond snd_recover */
+ if (SEQ_GT(p->end, tp->snd_recover)) {
+ /*
+ * (At least) part of sack hole extends beyond
+ * snd_recover. Check to see if we can rexmit data
+ * for this hole.
+ */
+ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
+ /*
+ * Can't rexmit any more data for this hole.
+ * That data will be rexmitted in the next
+ * sack recovery episode, when snd_recover
+ * moves past p->rxmit.
+ */
+ p = NULL;
+ goto after_sack_rexmit;
+ } else
+ /* Can rexmit part of the current hole */
+ len = ((long)ulmin(cwin,
+ tp->snd_recover - p->rxmit));
+ } else
+ len = ((long)ulmin(cwin, p->end - p->rxmit));
+ off = p->rxmit - tp->snd_una;
+ KASSERT(off >= 0,("%s: sack block to the left of una : %d",
+ __func__, off));
+ if (len > 0) {
+ sack_rxmit = 1;
+ sendalot = 1;
+ TCPSTAT_INC(tcps_sack_rexmits);
+ TCPSTAT_ADD(tcps_sack_rexmit_bytes,
+ min(len, tp->t_maxseg));
+ }
+ }
+after_sack_rexmit:
+ /*
+ * Get standard flags, and add SYN or FIN if requested by 'hidden'
+ * state flags.
+ */
+ if (tp->t_flags & TF_NEEDFIN)
+ flags |= TH_FIN;
+ if (tp->t_flags & TF_NEEDSYN)
+ flags |= TH_SYN;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ /*
+ * If in persist timeout with window of 0, send 1 byte.
+ * Otherwise, if window is small but nonzero
+ * and timer expired, we will send what we can
+ * and go to transmit state.
+ */
+ if (tp->t_flags & TF_FORCEDATA) {
+ if (sendwin == 0) {
+ /*
+ * If we still have some data to send, then
+ * clear the FIN bit. Usually this would
+ * happen below when it realizes that we
+ * aren't sending all the data. However,
+ * if we have exactly 1 byte of unsent data,
+ * then it won't clear the FIN bit below,
+ * and if we are in persist state, we wind
+ * up sending the packet without recording
+ * that we sent the FIN bit.
+ *
+ * We can't just blindly clear the FIN bit,
+ * because if we don't have any more data
+ * to send then the probe will be the FIN
+ * itself.
+ */
+ if (off < so->so_snd.sb_cc)
+ flags &= ~TH_FIN;
+ sendwin = 1;
+ } else {
+ tcp_timer_activate(tp, TT_PERSIST, 0);
+ tp->t_rxtshift = 0;
+ }
+ }
+
+ /*
+ * If snd_nxt == snd_max and we have transmitted a FIN, the
+ * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
+ * a negative length. This can also occur when TCP opens up
+ * its congestion window while receiving additional duplicate
+ * acks after fast-retransmit because TCP will reset snd_nxt
+ * to snd_max after the fast-retransmit.
+ *
+ * In the normal retransmit-FIN-only case, however, snd_nxt will
+ * be set to snd_una, the offset will be 0, and the length may
+ * wind up 0.
+ *
+ * If sack_rxmit is true we are retransmitting from the scoreboard
+ * in which case len is already set.
+ */
+ if (sack_rxmit == 0) {
+ if (sack_bytes_rxmt == 0)
+ len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
+ else {
+ long cwin;
+
+ /*
+ * We are inside of a SACK recovery episode and are
+ * sending new data, having retransmitted all the
+ * data possible in the scoreboard.
+ */
+ len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd)
+ - off);
+ /*
+ * Don't remove this (len > 0) check !
+ * We explicitly check for len > 0 here (although it
+ * isn't really necessary), to work around a gcc
+ * optimization issue - to force gcc to compute
+ * len above. Without this check, the computation
+ * of len is bungled by the optimizer.
+ */
+ if (len > 0) {
+ cwin = tp->snd_cwnd -
+ (tp->snd_nxt - tp->sack_newdata) -
+ sack_bytes_rxmt;
+ if (cwin < 0)
+ cwin = 0;
+ len = lmin(len, cwin);
+ }
+ }
+ }
+
+ /*
+ * Lop off SYN bit if it has already been sent. However, if this
+ * is SYN-SENT state and if segment contains data and if we don't
+ * know that foreign host supports TAO, suppress sending segment.
+ */
+ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
+ if (tp->t_state != TCPS_SYN_RECEIVED)
+ flags &= ~TH_SYN;
+ off--, len++;
+ }
+
+ /*
+ * Be careful not to send data and/or FIN on SYN segments.
+ * This measure is needed to prevent interoperability problems
+ * with not fully conformant TCP implementations.
+ */
+ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
+ len = 0;
+ flags &= ~TH_FIN;
+ }
+
+ if (len < 0) {
+ /*
+ * If FIN has been sent but not acked,
+ * but we haven't been called to retransmit,
+ * len will be < 0. Otherwise, window shrank
+ * after we sent into it. If window shrank to 0,
+ * cancel pending retransmit, pull snd_nxt back
+ * to (closed) window, and set the persist timer
+ * if it isn't already going. If the window didn't
+ * close completely, just wait for an ACK.
+ */
+ len = 0;
+ if (sendwin == 0) {
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_rxtshift = 0;
+ tp->snd_nxt = tp->snd_una;
+ if (!tcp_timer_active(tp, TT_PERSIST))
+ tcp_setpersist(tp);
+ }
+ }
+
+ /* len will be >= 0 after this point. */
+ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
+
+ /*
+ * Automatic sizing of send socket buffer. Often the send buffer
+ * size is not optimally adjusted to the actual network conditions
+ * at hand (delay bandwidth product). Setting the buffer size too
+ * small limits throughput on links with high bandwidth and high
+ * delay (eg. trans-continental/oceanic links). Setting the
+ * buffer size too big consumes too much real kernel memory,
+ * especially with many connections on busy servers.
+ *
+ * The criteria to step up the send buffer one notch are:
+ * 1. receive window of remote host is larger than send buffer
+ * (with a fudge factor of 5/4th);
+ * 2. send buffer is filled to 7/8th with data (so we actually
+ * have data to make use of it);
+ * 3. send buffer fill has not hit maximal automatic size;
+ * 4. our send window (slow start and congestion controlled) is
+ * larger than sent but unacknowledged data in send buffer.
+ *
+ * The remote host receive window scaling factor may limit the
+ * growing of the send buffer before it reaches its allowed
+ * maximum.
+ *
+ * It scales directly with slow start or congestion window
+ * and does at most one step per received ACK. This fast
+ * scaling has the drawback of growing the send buffer beyond
+ * what is strictly necessary to make full use of a given
+ * delay*bandwidth product. However, testing has shown this not
+ * to be much of a problem. At worst we are trading wasting
+ * of available bandwidth (the non-use of it) for wasting some
+ * socket buffer memory.
+ *
+ * TODO: Shrink send buffer during idle periods together
+ * with congestion window. Requires another timer. Has to
+ * wait for upcoming tcp timer rewrite.
+ */
+ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
+ if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
+ so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
+ so->so_snd.sb_cc < V_tcp_autosndbuf_max &&
+ sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
+ if (!sbreserve_locked(&so->so_snd,
+ min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
+ V_tcp_autosndbuf_max), so, curthread))
+ so->so_snd.sb_flags &= ~SB_AUTOSIZE;
+ }
+ }
+
+ /*
+ * Truncate to the maximum segment length or enable TCP Segmentation
+ * Offloading (if supported by hardware) and ensure that FIN is removed
+ * if the length no longer contains the last data byte.
+ *
+ * TSO may only be used if we are in a pure bulk sending state. The
+ * presence of TCP-MD5, SACK retransmits, SACK advertisements and
+ * IP options prevent using TSO. With TSO the TCP header is the same
+ * (except for the sequence number) for all generated packets. This
+ * makes it impossible to transmit any options which vary per generated
+ * segment or packet.
+ *
+ * The length of TSO bursts is limited to TCP_MAXWIN. That limit and
+ * removal of FIN (if not already caught here) are handled later after
+ * the exact length of the TCP options are known.
+ */
+#ifdef IPSEC
+ /*
+ * Pre-calculate here as we save another lookup into the darknesses
+ * of IPsec that way and can actually decide if TSO is ok.
+ */
+ ipsec_optlen = ipsec_hdrsiz_tcp(tp);
+#endif
+ if (len > tp->t_maxseg) {
+ if ((tp->t_flags & TF_TSO) && V_tcp_do_tso &&
+ ((tp->t_flags & TF_SIGNATURE) == 0) &&
+ tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
+ tp->t_inpcb->inp_options == NULL &&
+ tp->t_inpcb->in6p_options == NULL
+#ifdef IPSEC
+ && ipsec_optlen == 0
+#endif
+ ) {
+ tso = 1;
+ } else {
+ len = tp->t_maxseg;
+ sendalot = 1;
+ }
+ }
+
+ if (sack_rxmit) {
+ if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
+ flags &= ~TH_FIN;
+ } else {
+ if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
+ flags &= ~TH_FIN;
+ }
+
+ recwin = sbspace(&so->so_rcv);
+
+ /*
+ * Sender silly window avoidance. We transmit under the following
+ * conditions when len is non-zero:
+ *
+ * - We have a full segment (or more with TSO)
+ * - This is the last buffer in a write()/send() and we are
+ * either idle or running NODELAY
+ * - we've timed out (e.g. persist timer)
+ * - we have more than 1/2 the maximum send window's worth of
+ * data (the receiver may have limited the window size)
+ * - we need to retransmit
+ */
+ if (len) {
+ if (len >= tp->t_maxseg)
+ goto send;
+ /*
+ * NOTE! on localhost connections an 'ack' from the remote
+ * end may occur synchronously with the output and cause
+ * us to flush a buffer queued with moretocome. XXX
+ *
+ * note: the len + off check is almost certainly unnecessary.
+ */
+ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
+ (idle || (tp->t_flags & TF_NODELAY)) &&
+ len + off >= so->so_snd.sb_cc &&
+ (tp->t_flags & TF_NOPUSH) == 0) {
+ goto send;
+ }
+ if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */
+ goto send;
+ if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
+ goto send;
+ if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */
+ goto send;
+ if (sack_rxmit)
+ goto send;
+ }
+
+ /*
+ * Compare available window to amount of window
+ * known to peer (as advertised window less
+ * next expected input). If the difference is at least two
+ * max size segments, or at least 50% of the maximum possible
+ * window, then we want to send a window update to the peer.
+ * Skip this if the connection is in T/TCP half-open state.
+ * Don't send pure window updates when the peer has closed
+ * the connection and won't ever send more data.
+ */
+ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
+ !TCPS_HAVERCVDFIN(tp->t_state)) {
+ /*
+ * "adv" is the amount we can increase the window,
+ * taking into account that we are limited by
+ * TCP_MAXWIN << tp->rcv_scale.
+ */
+ long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) -
+ (tp->rcv_adv - tp->rcv_nxt);
+
+ if (adv >= (long) (2 * tp->t_maxseg))
+ goto send;
+ if (2 * adv >= (long) so->so_rcv.sb_hiwat)
+ goto send;
+ }
+
+ /*
+ * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
+ * is also a catch-all for the retransmit timer timeout case.
+ */
+ if (tp->t_flags & TF_ACKNOW)
+ goto send;
+ if ((flags & TH_RST) ||
+ ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
+ goto send;
+ if (SEQ_GT(tp->snd_up, tp->snd_una))
+ goto send;
+ /*
+ * If our state indicates that FIN should be sent
+ * and we have not yet done so, then we need to send.
+ */
+ if (flags & TH_FIN &&
+ ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
+ goto send;
+ /*
+ * In SACK, it is possible for tcp_output to fail to send a segment
+ * after the retransmission timer has been turned off. Make sure
+ * that the retransmission timer is set.
+ */
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ SEQ_GT(tp->snd_max, tp->snd_una) &&
+ !tcp_timer_active(tp, TT_REXMT) &&
+ !tcp_timer_active(tp, TT_PERSIST)) {
+ tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ goto just_return;
+ }
+ /*
+ * TCP window updates are not reliable, rather a polling protocol
+ * using ``persist'' packets is used to ensure receipt of window
+ * updates. The three ``states'' for the output side are:
+ * idle not doing retransmits or persists
+ * persisting to move a small or zero window
+ * (re)transmitting and thereby not persisting
+ *
+ * tcp_timer_active(tp, TT_PERSIST)
+ * is true when we are in persist state.
+ * (tp->t_flags & TF_FORCEDATA)
+ * is set when we are called to send a persist packet.
+ * tcp_timer_active(tp, TT_REXMT)
+ * is set when we are retransmitting
+ * The output side is idle when both timers are zero.
+ *
+ * If send window is too small, there is data to transmit, and no
+ * retransmit or persist is pending, then go to persist state.
+ * If nothing happens soon, send when timer expires:
+ * if window is nonzero, transmit what we can,
+ * otherwise force out a byte.
+ */
+ if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) &&
+ !tcp_timer_active(tp, TT_PERSIST)) {
+ tp->t_rxtshift = 0;
+ tcp_setpersist(tp);
+ }
+
+ /*
+ * No reason to send a segment, just return.
+ */
+just_return:
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (0);
+
+send:
+ SOCKBUF_LOCK_ASSERT(&so->so_snd);
+ /*
+ * Before ESTABLISHED, force sending of initial options
+ * unless TCP set not to do any options.
+ * NOTE: we assume that the IP/TCP header plus TCP options
+ * always fit in a single mbuf, leaving room for a maximum
+ * link header, i.e.
+ * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
+ */
+ optlen = 0;
+#ifdef INET6
+ if (isipv6)
+ hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
+ else
+#endif
+ hdrlen = sizeof (struct tcpiphdr);
+
+ /*
+ * Compute options for segment.
+ * We only have to care about SYN and established connection
+ * segments. Options for SYN-ACK segments are handled in TCP
+ * syncache.
+ */
+ if ((tp->t_flags & TF_NOOPT) == 0) {
+ to.to_flags = 0;
+ /* Maximum segment size. */
+ if (flags & TH_SYN) {
+ tp->snd_nxt = tp->iss;
+ to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
+ to.to_flags |= TOF_MSS;
+ }
+ /* Window scaling. */
+ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
+ to.to_wscale = tp->request_r_scale;
+ to.to_flags |= TOF_SCALE;
+ }
+ /* Timestamps. */
+ if ((tp->t_flags & TF_RCVD_TSTMP) ||
+ ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
+ to.to_tsval = ticks + tp->ts_offset;
+ to.to_tsecr = tp->ts_recent;
+ to.to_flags |= TOF_TS;
+ /* Set receive buffer autosizing timestamp. */
+ if (tp->rfbuf_ts == 0 &&
+ (so->so_rcv.sb_flags & SB_AUTOSIZE))
+ tp->rfbuf_ts = ticks;
+ }
+ /* Selective ACK's. */
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ if (flags & TH_SYN)
+ to.to_flags |= TOF_SACKPERM;
+ else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->t_flags & TF_SACK_PERMIT) &&
+ tp->rcv_numsacks > 0) {
+ to.to_flags |= TOF_SACK;
+ to.to_nsacks = tp->rcv_numsacks;
+ to.to_sacks = (u_char *)tp->sackblks;
+ }
+ }
+#ifdef TCP_SIGNATURE
+ /* TCP-MD5 (RFC2385). */
+ if (tp->t_flags & TF_SIGNATURE)
+ to.to_flags |= TOF_SIGNATURE;
+#endif /* TCP_SIGNATURE */
+
+ /* Processing the options. */
+ hdrlen += optlen = tcp_addoptions(&to, opt);
+ }
+
+#ifdef INET6
+ if (isipv6)
+ ipoptlen = ip6_optlen(tp->t_inpcb);
+ else
+#endif
+ if (tp->t_inpcb->inp_options)
+ ipoptlen = tp->t_inpcb->inp_options->m_len -
+ offsetof(struct ipoption, ipopt_list);
+ else
+ ipoptlen = 0;
+#ifdef IPSEC
+ ipoptlen += ipsec_optlen;
+#endif
+
+ /*
+ * Adjust data length if insertion of options will
+ * bump the packet length beyond the t_maxopd length.
+ * Clear the FIN bit because we cut off the tail of
+ * the segment.
+ *
+ * When doing TSO limit a burst to TCP_MAXWIN minus the
+ * IP, TCP and Options length to keep ip->ip_len from
+ * overflowing. Prevent the last segment from being
+ * fractional, thus making them all equal sized, and set
+ * the flag to continue sending. TSO is disabled when
+ * IP options or IPSEC are present.
+ */
+ if (len + optlen + ipoptlen > tp->t_maxopd) {
+ flags &= ~TH_FIN;
+ if (tso) {
+ if (len > TCP_MAXWIN - hdrlen - optlen) {
+ len = TCP_MAXWIN - hdrlen - optlen;
+ len = len - (len % (tp->t_maxopd - optlen));
+ sendalot = 1;
+ } else if (tp->t_flags & TF_NEEDFIN)
+ sendalot = 1;
+ } else {
+ len = tp->t_maxopd - optlen - ipoptlen;
+ sendalot = 1;
+ }
+ }
+
+/*#ifdef DIAGNOSTIC*/
+#ifdef INET6
+ if (max_linkhdr + hdrlen > MCLBYTES)
+#else
+ if (max_linkhdr + hdrlen > MHLEN)
+#endif
+ panic("tcphdr too big");
+/*#endif*/
+
+ /*
+ * This KASSERT is here to catch edge cases at a well defined place.
+ * Before, those had triggered (random) panic conditions further down.
+ */
+ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
+
+ /*
+ * Grab a header mbuf, attaching a copy of data to
+ * be transmitted, and initialize the header from
+ * the template for sends on this connection.
+ */
+ if (len) {
+ struct mbuf *mb;
+ u_int moff;
+
+ if ((tp->t_flags & TF_FORCEDATA) && len == 1)
+ TCPSTAT_INC(tcps_sndprobe);
+ else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
+ TCPSTAT_INC(tcps_sndrexmitpack);
+ TCPSTAT_ADD(tcps_sndrexmitbyte, len);
+ } else {
+ TCPSTAT_INC(tcps_sndpack);
+ TCPSTAT_ADD(tcps_sndbyte, len);
+ }
+#ifdef notyet
+ if ((m = m_copypack(so->so_snd.sb_mb, off,
+ (int)len, max_linkhdr + hdrlen)) == 0) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = ENOBUFS;
+ goto out;
+ }
+ /*
+ * m_copypack left space for our hdr; use it.
+ */
+ m->m_len += hdrlen;
+ m->m_data -= hdrlen;
+#else
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (m == NULL) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = ENOBUFS;
+ goto out;
+ }
+#ifdef INET6
+ if (MHLEN < hdrlen + max_linkhdr) {
+ MCLGET(m, M_DONTWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ m_freem(m);
+ error = ENOBUFS;
+ goto out;
+ }
+ }
+#endif
+ m->m_data += max_linkhdr;
+ m->m_len = hdrlen;
+
+ /*
+ * Start the m_copy functions from the closest mbuf
+ * to the offset in the socket buffer chain.
+ */
+ mb = sbsndptr(&so->so_snd, off, len, &moff);
+
+ if (len <= MHLEN - hdrlen - max_linkhdr) {
+ m_copydata(mb, moff, (int)len,
+ mtod(m, caddr_t) + hdrlen);
+ m->m_len += len;
+ } else {
+ m->m_next = m_copy(mb, moff, (int)len);
+ if (m->m_next == NULL) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ (void) m_free(m);
+ error = ENOBUFS;
+ goto out;
+ }
+ }
+#endif
+ /*
+ * If we're sending everything we've got, set PUSH.
+ * (This will keep happy those implementations which only
+ * give data to the user when a buffer fills or
+ * a PUSH comes in.)
+ */
+ if (off + len == so->so_snd.sb_cc)
+ flags |= TH_PUSH;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ } else {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (tp->t_flags & TF_ACKNOW)
+ TCPSTAT_INC(tcps_sndacks);
+ else if (flags & (TH_SYN|TH_FIN|TH_RST))
+ TCPSTAT_INC(tcps_sndctrl);
+ else if (SEQ_GT(tp->snd_up, tp->snd_una))
+ TCPSTAT_INC(tcps_sndurg);
+ else
+ TCPSTAT_INC(tcps_sndwinup);
+
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+#ifdef INET6
+ if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
+ MHLEN >= hdrlen) {
+ MH_ALIGN(m, hdrlen);
+ } else
+#endif
+ m->m_data += max_linkhdr;
+ m->m_len = hdrlen;
+ }
+ SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+#ifdef MAC
+ mac_inpcb_create_mbuf(tp->t_inpcb, m);
+#endif
+#ifdef INET6
+ if (isipv6) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ th = (struct tcphdr *)(ip6 + 1);
+ tcpip_fillheaders(tp->t_inpcb, ip6, th);
+ } else
+#endif /* INET6 */
+ {
+ ip = mtod(m, struct ip *);
+ ipov = (struct ipovly *)ip;
+ th = (struct tcphdr *)(ip + 1);
+ tcpip_fillheaders(tp->t_inpcb, ip, th);
+ }
+
+ /*
+ * Fill in fields, remembering maximum advertised
+ * window for use in delaying messages about window sizes.
+ * If resending a FIN, be sure not to use a new sequence number.
+ */
+ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
+ tp->snd_nxt == tp->snd_max)
+ tp->snd_nxt--;
+ /*
+ * If we are starting a connection, send ECN setup
+ * SYN packet. If we are on a retransmit, we may
+ * resend those bits a number of times as per
+ * RFC 3168.
+ */
+ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
+ if (tp->t_rxtshift >= 1) {
+ if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
+ flags |= TH_ECE|TH_CWR;
+ } else
+ flags |= TH_ECE|TH_CWR;
+ }
+
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (tp->t_flags & TF_ECN_PERMIT)) {
+ /*
+ * If the peer has ECN, mark data packets with
+ * ECN capable transmission (ECT).
+ * Ignore pure ack packets, retransmissions and window probes.
+ */
+ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
+ !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
+#ifdef INET6
+ if (isipv6)
+ ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
+ else
+#endif
+ ip->ip_tos |= IPTOS_ECN_ECT0;
+ TCPSTAT_INC(tcps_ecn_ect0);
+ }
+
+ /*
+ * Reply with proper ECN notifications.
+ */
+ if (tp->t_flags & TF_ECN_SND_CWR) {
+ flags |= TH_CWR;
+ tp->t_flags &= ~TF_ECN_SND_CWR;
+ }
+ if (tp->t_flags & TF_ECN_SND_ECE)
+ flags |= TH_ECE;
+ }
+
+ /*
+ * If we are doing retransmissions, then snd_nxt will
+ * not reflect the first unsent octet. For ACK only
+ * packets, we do not want the sequence number of the
+ * retransmitted packet, we want the sequence number
+ * of the next unsent octet. So, if there is no data
+ * (and no SYN or FIN), use snd_max instead of snd_nxt
+ * when filling in ti_seq. But if we are in persist
+ * state, snd_max might reflect one byte beyond the
+ * right edge of the window, so use snd_nxt in that
+ * case, since we know we aren't doing a retransmission.
+ * (retransmit and persist are mutually exclusive...)
+ */
+ if (sack_rxmit == 0) {
+ if (len || (flags & (TH_SYN|TH_FIN)) ||
+ tcp_timer_active(tp, TT_PERSIST))
+ th->th_seq = htonl(tp->snd_nxt);
+ else
+ th->th_seq = htonl(tp->snd_max);
+ } else {
+ th->th_seq = htonl(p->rxmit);
+ p->rxmit += len;
+ tp->sackhint.sack_bytes_rexmit += len;
+ }
+ th->th_ack = htonl(tp->rcv_nxt);
+ if (optlen) {
+ bcopy(opt, th + 1, optlen);
+ th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
+ }
+ th->th_flags = flags;
+ /*
+ * Calculate receive window. Don't shrink window,
+ * but avoid silly window syndrome.
+ */
+ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
+ recwin < (long)tp->t_maxseg)
+ recwin = 0;
+ if (recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
+ recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
+ if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
+ recwin = (long)TCP_MAXWIN << tp->rcv_scale;
+
+ /*
+ * According to RFC1323 the window field in a SYN (i.e., a <SYN>
+ * or <SYN,ACK>) segment itself is never scaled. The <SYN,ACK>
+ * case is handled in syncache.
+ */
+ if (flags & TH_SYN)
+ th->th_win = htons((u_short)
+ (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
+ else
+ th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
+
+ /*
+ * Adjust the RXWIN0SENT flag - indicate that we have advertised
+ * a 0 window. This may cause the remote transmitter to stall. This
+ * flag tells soreceive() to disable delayed acknowledgements when
+ * draining the buffer. This can occur if the receiver is attempting
+ * to read more data than can be buffered prior to transmitting on
+ * the connection.
+ */
+ if (th->th_win == 0)
+ tp->t_flags |= TF_RXWIN0SENT;
+ else
+ tp->t_flags &= ~TF_RXWIN0SENT;
+ if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
+ th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
+ th->th_flags |= TH_URG;
+ } else
+ /*
+ * If no urgent pointer to send, then we pull
+ * the urgent pointer to the left edge of the send window
+ * so that it doesn't drift into the send window on sequence
+ * number wraparound.
+ */
+ tp->snd_up = tp->snd_una; /* drag it along */
+
+#ifdef TCP_SIGNATURE
+ if (tp->t_flags & TF_SIGNATURE) {
+ int sigoff = to.to_signature - opt;
+ tcp_signature_compute(m, 0, len, optlen,
+ (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
+ }
+#endif
+
+ /*
+ * Put TCP length in extended header, and then
+ * checksum extended header and data.
+ */
+ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */
+#ifdef INET6
+ if (isipv6)
+ /*
+ * ip6_plen need not be filled in now; it will be filled
+ * in by ip6_output.
+ */
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
+ sizeof(struct tcphdr) + optlen + len);
+ else
+#endif /* INET6 */
+ {
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
+
+ /* IP version must be set here for ipv4/ipv6 checking later */
+ KASSERT(ip->ip_v == IPVERSION,
+ ("%s: IP version incorrect: %d", __func__, ip->ip_v));
+ }
+
+ /*
+ * Enable TSO and specify the size of the segments.
+ * The TCP pseudo header checksum is always provided.
+ * XXX: Fixme: This is currently not the case for IPv6.
+ */
+ if (tso) {
+ KASSERT(len > tp->t_maxopd - optlen,
+ ("%s: len <= tso_segsz", __func__));
+ m->m_pkthdr.csum_flags |= CSUM_TSO;
+ m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
+ }
+
+ /*
+ * In transmit state, time the transmission and arrange for
+ * the retransmit. In persist state, just set snd_max.
+ */
+ if ((tp->t_flags & TF_FORCEDATA) == 0 ||
+ !tcp_timer_active(tp, TT_PERSIST)) {
+ tcp_seq startseq = tp->snd_nxt;
+
+ /*
+ * Advance snd_nxt over sequence space of this segment.
+ */
+ if (flags & (TH_SYN|TH_FIN)) {
+ if (flags & TH_SYN)
+ tp->snd_nxt++;
+ if (flags & TH_FIN) {
+ tp->snd_nxt++;
+ tp->t_flags |= TF_SENTFIN;
+ }
+ }
+ if (sack_rxmit)
+ goto timer;
+ tp->snd_nxt += len;
+ if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
+ tp->snd_max = tp->snd_nxt;
+ /*
+ * Time this transmission if not a retransmission and
+ * not currently timing anything.
+ */
+ if (tp->t_rtttime == 0) {
+ tp->t_rtttime = ticks;
+ tp->t_rtseq = startseq;
+ TCPSTAT_INC(tcps_segstimed);
+ }
+ }
+
+ /*
+ * Set retransmit timer if not currently set,
+ * and not doing a pure ack or a keep-alive probe.
+ * Initial value for retransmit timer is smoothed
+ * round-trip time + 2 * round-trip time variance.
+ * Initialize shift counter which is used for backoff
+ * of retransmit time.
+ */
+timer:
+ if (!tcp_timer_active(tp, TT_REXMT) &&
+ ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
+ (tp->snd_nxt != tp->snd_una))) {
+ if (tcp_timer_active(tp, TT_PERSIST)) {
+ tcp_timer_activate(tp, TT_PERSIST, 0);
+ tp->t_rxtshift = 0;
+ }
+ tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ }
+ } else {
+ /*
+ * Persist case, update snd_max but since we are in
+ * persist mode (no window) we do not update snd_nxt.
+ */
+ int xlen = len;
+ if (flags & TH_SYN)
+ ++xlen;
+ if (flags & TH_FIN) {
+ ++xlen;
+ tp->t_flags |= TF_SENTFIN;
+ }
+ if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
+ tp->snd_max = tp->snd_nxt + len;
+ }
+
+#ifdef TCPDEBUG
+ /*
+ * Trace.
+ */
+ if (so->so_options & SO_DEBUG) {
+ u_short save = 0;
+#ifdef INET6
+ if (!isipv6)
+#endif
+ {
+ save = ipov->ih_len;
+ ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
+ }
+ tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
+#ifdef INET6
+ if (!isipv6)
+#endif
+ ipov->ih_len = save;
+ }
+#endif
+
+ /*
+ * Fill in IP length and desired time to live and
+ * send to IP level. There should be a better way
+ * to handle ttl and tos; we could keep them in
+ * the template, but need a way to checksum without them.
+ */
+ /*
+ * m->m_pkthdr.len should have been set before cksum calculation,
+ * because in6_cksum() needs it.
+ */
+#ifdef INET6
+ if (isipv6) {
+ /*
+ * we separately set hoplimit for every segment, since the
+ * user might want to change the value via setsockopt.
+ * Also, desired default hop limit might be changed via
+ * Neighbor Discovery.
+ */
+ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
+
+ /* TODO: IPv6 IP6TOS_ECT bit on */
+ error = ip6_output(m,
+ tp->t_inpcb->in6p_outputopts, NULL,
+ ((so->so_options & SO_DONTROUTE) ?
+ IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb);
+ } else
+#endif /* INET6 */
+ {
+ ip->ip_len = m->m_pkthdr.len;
+#ifdef INET6
+ if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
+ ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
+#endif /* INET6 */
+ /*
+ * If we do path MTU discovery, then we set DF on every packet.
+ * This might not be the best thing to do according to RFC3390
+ * Section 2. However, the tcp hostcache mitigates the problem
+ * so it affects only the first tcp connection with a host.
+ *
+ * NB: Don't set DF on small MTU/MSS to have a safe fallback.
+ */
+ if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss)
+ ip->ip_off |= IP_DF;
+
+ error = ip_output(m, tp->t_inpcb->inp_options, NULL,
+ ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
+ tp->t_inpcb);
+ }
+ if (error) {
+
+ /*
+ * We know that the packet was lost, so back out the
+ * sequence number advance, if any.
+ *
+ * If the error is EPERM the packet got blocked by the
+ * local firewall. Normally we should terminate the
+ * connection but the blocking may have been spurious
+ * due to a firewall reconfiguration cycle. So we treat
+ * it like a packet loss and let the retransmit timer and
+ * timeouts do their work over time.
+ * XXX: It is a POLA question whether calling tcp_drop right
+ * away would be the really correct behavior instead.
+ */
+ if (((tp->t_flags & TF_FORCEDATA) == 0 ||
+ !tcp_timer_active(tp, TT_PERSIST)) &&
+ ((flags & TH_SYN) == 0) &&
+ (error != EPERM)) {
+ if (sack_rxmit) {
+ p->rxmit -= len;
+ tp->sackhint.sack_bytes_rexmit -= len;
+ KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
+ ("sackhint bytes rtx >= 0"));
+ } else
+ tp->snd_nxt -= len;
+ }
+out:
+ SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */
+ switch (error) {
+ case EPERM:
+ tp->t_softerror = error;
+ return (error);
+ case ENOBUFS:
+ if (!tcp_timer_active(tp, TT_REXMT) &&
+ !tcp_timer_active(tp, TT_PERSIST))
+ tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ tp->snd_cwnd = tp->t_maxseg;
+ return (0);
+ case EMSGSIZE:
+ /*
+ * For some reason the interface we used initially
+ * to send segments changed to another or lowered
+ * its MTU.
+ *
+ * tcp_mtudisc() will find out the new MTU and as
+ * its last action, initiate retransmission, so it
+ * is important to not do so here.
+ *
+ * If TSO was active we either got an interface
+ * without TSO capabilities or TSO was turned off.
+ * Disable it for this connection too and
+ * immediately retry with MSS sized segments generated
+ * by this function.
+ */
+ if (tso)
+ tp->t_flags &= ~TF_TSO;
+ tcp_mtudisc(tp->t_inpcb, 0);
+ return (0);
+ case EHOSTDOWN:
+ case EHOSTUNREACH:
+ case ENETDOWN:
+ case ENETUNREACH:
+ if (TCPS_HAVERCVDSYN(tp->t_state)) {
+ tp->t_softerror = error;
+ return (0);
+ }
+ /* FALLTHROUGH */
+ default:
+ return (error);
+ }
+ }
+ TCPSTAT_INC(tcps_sndtotal);
+
+ /*
+ * Data sent (as far as we can tell).
+ * If this advertises a larger window than any other segment,
+ * then remember the size of the advertised window.
+ * Any pending ACK has now been sent.
+ */
+ if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
+ tp->rcv_adv = tp->rcv_nxt + recwin;
+ tp->last_ack_sent = tp->rcv_nxt;
+ tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
+ if (tcp_timer_active(tp, TT_DELACK))
+ tcp_timer_activate(tp, TT_DELACK, 0);
+#if 0
+ /*
+ * This completely breaks TCP if newreno is turned on. What happens
+ * is that if delayed-acks are turned on on the receiver, this code
+ * on the transmitter effectively destroys the TCP window, forcing
+ * it to four packets (1.5Kx4 = 6K window).
+ */
+ if (sendalot && (!V_tcp_do_newreno || --maxburst))
+ goto again;
+#endif
+ if (sendalot)
+ goto again;
+ return (0);
+}
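Editorial note on the idle-restart logic near the top of tcp_output() above: after more than one retransmit timeout of idleness, the congestion window is clamped to a restart window per RFC 5681 Section 4.1. The standalone sketch below is not part of the FreeBSD sources and reproduces only that arithmetic; the constant 4380 and the flight sizes of 1 and 4 segments match the sysctl defaults declared earlier in this file, everything else is illustrative.

#include <stdio.h>

static long lmin(long a, long b) { return (a < b ? a : b); }
static long lmax(long a, long b) { return (a > b ? a : b); }

/* Restart window: RFC 3390 style if enabled, otherwise flight-size based. */
static long restart_window(long maxseg, int do_rfc3390, int is_local)
{
	if (do_rfc3390)
		return lmin(4 * maxseg, lmax(2 * maxseg, 4380));
	return (is_local ? 4 : 1) * maxseg;  /* ss_fltsz_local / ss_fltsz defaults */
}

int main(void)
{
	long maxseg = 1460, cwnd = 20 * maxseg;
	long rw = restart_window(maxseg, 1, 0);

	/* After an idle period, sending resumes with cwnd clamped to rw. */
	printf("restart window %ld, clamped cwnd %ld\n", rw, lmin(rw, cwnd));
	return 0;
}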
+
+void
+tcp_setpersist(struct tcpcb *tp)
+{
+ int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
+ int tt;
+
+ if (tcp_timer_active(tp, TT_REXMT))
+ panic("tcp_setpersist: retransmit pending");
+ /*
+ * Start/restart persistence timer.
+ */
+ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
+ TCPTV_PERSMIN, TCPTV_PERSMAX);
+ tcp_timer_activate(tp, TT_PERSIST, tt);
+ if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
+ tp->t_rxtshift++;
+}
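Editorial note: tcp_setpersist() derives its base interval from the smoothed RTT estimate and then applies an exponential backoff clamped to a fixed range. The standalone sketch below is not part of the FreeBSD sources; the backoff table and the 5/60 tick bounds are illustrative stand-ins for tcp_backoff[] and the TCPTV_PERSMIN/TCPTV_PERSMAX limits, whose real values live in tcp_timer.h.

#include <stdio.h>

static const int backoff[] = { 1, 2, 4, 8, 16, 32, 64 };
#define PERSMIN		 5	/* illustrative lower bound, in ticks */
#define PERSMAX		60	/* illustrative upper bound, in ticks */

static int persist_timeout(int srtt, int rttvar, int shift)
{
	int t = ((srtt >> 2) + rttvar) >> 1;	/* same base as above */
	int tt = t * backoff[shift];

	if (tt < PERSMIN)
		tt = PERSMIN;
	else if (tt > PERSMAX)
		tt = PERSMAX;
	return (tt);
}

int main(void)
{
	int shift;

	for (shift = 0; shift < 7; shift++)
		printf("shift %d -> persist timeout %d\n",
		    shift, persist_timeout(8, 4, shift));
	return 0;
}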
+
+/*
+ * Insert TCP options according to the supplied parameters at the location
+ * optp in a consistent way. Can handle unaligned destinations.
+ *
+ * The order of the option processing is crucial for optimal packing and
+ * alignment for the scarce option space.
+ *
+ * The optimal order for a SYN/SYN-ACK segment is:
+ * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
+ * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
+ *
+ * The SACK options should be last. SACK blocks consume 8*n+2 bytes.
+ * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
+ * At minimum we need 10 bytes (to generate 1 SACK block). If both
+ * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
+ * we only have 10 bytes for SACK options (40 - (12 + 18)).
+ */
+int
+tcp_addoptions(struct tcpopt *to, u_char *optp)
+{
+ u_int mask, optlen = 0;
+
+ for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
+ if ((to->to_flags & mask) != mask)
+ continue;
+ if (optlen == TCP_MAXOLEN)
+ break;
+ switch (to->to_flags & mask) {
+ case TOF_MSS:
+ while (optlen % 4) {
+ optlen += TCPOLEN_NOP;
+ *optp++ = TCPOPT_NOP;
+ }
+ if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
+ continue;
+ optlen += TCPOLEN_MAXSEG;
+ *optp++ = TCPOPT_MAXSEG;
+ *optp++ = TCPOLEN_MAXSEG;
+ to->to_mss = htons(to->to_mss);
+ bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
+ optp += sizeof(to->to_mss);
+ break;
+ case TOF_SCALE:
+ while (!optlen || optlen % 2 != 1) {
+ optlen += TCPOLEN_NOP;
+ *optp++ = TCPOPT_NOP;
+ }
+ if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
+ continue;
+ optlen += TCPOLEN_WINDOW;
+ *optp++ = TCPOPT_WINDOW;
+ *optp++ = TCPOLEN_WINDOW;
+ *optp++ = to->to_wscale;
+ break;
+ case TOF_SACKPERM:
+ while (optlen % 2) {
+ optlen += TCPOLEN_NOP;
+ *optp++ = TCPOPT_NOP;
+ }
+ if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
+ continue;
+ optlen += TCPOLEN_SACK_PERMITTED;
+ *optp++ = TCPOPT_SACK_PERMITTED;
+ *optp++ = TCPOLEN_SACK_PERMITTED;
+ break;
+ case TOF_TS:
+ while (!optlen || optlen % 4 != 2) {
+ optlen += TCPOLEN_NOP;
+ *optp++ = TCPOPT_NOP;
+ }
+ if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
+ continue;
+ optlen += TCPOLEN_TIMESTAMP;
+ *optp++ = TCPOPT_TIMESTAMP;
+ *optp++ = TCPOLEN_TIMESTAMP;
+ to->to_tsval = htonl(to->to_tsval);
+ to->to_tsecr = htonl(to->to_tsecr);
+ bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
+ optp += sizeof(to->to_tsval);
+ bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
+ optp += sizeof(to->to_tsecr);
+ break;
+ case TOF_SIGNATURE:
+ {
+ int siglen = TCPOLEN_SIGNATURE - 2;
+
+ while (!optlen || optlen % 4 != 2) {
+ optlen += TCPOLEN_NOP;
+ *optp++ = TCPOPT_NOP;
+ }
+ if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE)
+ continue;
+ optlen += TCPOLEN_SIGNATURE;
+ *optp++ = TCPOPT_SIGNATURE;
+ *optp++ = TCPOLEN_SIGNATURE;
+ to->to_signature = optp;
+ while (siglen--)
+ *optp++ = 0;
+ break;
+ }
+ case TOF_SACK:
+ {
+ int sackblks = 0;
+ struct sackblk *sack = (struct sackblk *)to->to_sacks;
+ tcp_seq sack_seq;
+
+ while (!optlen || optlen % 4 != 2) {
+ optlen += TCPOLEN_NOP;
+ *optp++ = TCPOPT_NOP;
+ }
+ if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
+ continue;
+ optlen += TCPOLEN_SACKHDR;
+ *optp++ = TCPOPT_SACK;
+ sackblks = min(to->to_nsacks,
+ (TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
+ *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
+ while (sackblks--) {
+ sack_seq = htonl(sack->start);
+ bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
+ optp += sizeof(sack_seq);
+ sack_seq = htonl(sack->end);
+ bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
+ optp += sizeof(sack_seq);
+ optlen += TCPOLEN_SACK;
+ sack++;
+ }
+ TCPSTAT_INC(tcps_sack_send_blocks);
+ break;
+ }
+ default:
+ panic("%s: unknown TCP option type", __func__);
+ break;
+ }
+ }
+
+ /* Terminate and pad TCP options to a 4 byte boundary. */
+ if (optlen % 4) {
+ optlen += TCPOLEN_EOL;
+ *optp++ = TCPOPT_EOL;
+ }
+ /*
+ * According to RFC 793 (STD0007):
+ * "The content of the header beyond the End-of-Option option
+ * must be header padding (i.e., zero)."
+ * and later: "The padding is composed of zeros."
+ */
+ while (optlen % 4) {
+ optlen += TCPOLEN_PAD;
+ *optp++ = TCPOPT_PAD;
+ }
+
+ KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__));
+ return (optlen);
+}
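Editorial note: the packing rules documented above bound how many SACK blocks fit into the 40-byte TCP option space. A SACK option costs 2 + 8*n bytes, and with padded timestamps (12 bytes) plus an MD5 signature (18 bytes) only 10 bytes remain, i.e. exactly one block. The standalone sketch below is not part of the FreeBSD sources and simply restates that arithmetic.

#include <stdio.h>

#define MAX_OPT_SPACE	40	/* TCP_MAXOLEN */
#define TS_PADDED	12	/* 10-byte timestamp option rounded up to 4 bytes */
#define MD5_SIGNATURE	18
#define SACK_HDR	 2
#define SACK_BLOCK	 8

int main(void)
{
	int room = MAX_OPT_SPACE - TS_PADDED - MD5_SIGNATURE;
	int blocks = (room - SACK_HDR) / SACK_BLOCK;

	printf("%d bytes left for SACK -> %d block(s)\n", room, blocks);
	return 0;
}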
diff --git a/freebsd/sys/netinet/tcp_reass.c b/freebsd/sys/netinet/tcp_reass.c
new file mode 100644
index 00000000..aea58740
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_reass.c
@@ -0,0 +1,335 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_tcpdebug.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/sys/systm.h>
+
+#include <freebsd/vm/uma.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_options.h>
+#include <freebsd/netinet/ip6.h>
+#include <freebsd/netinet6/in6_pcb.h>
+#include <freebsd/netinet6/ip6_var.h>
+#include <freebsd/netinet6/nd6.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_fsm.h>
+#include <freebsd/netinet/tcp_seq.h>
+#include <freebsd/netinet/tcp_timer.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet6/tcp6_var.h>
+#include <freebsd/netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <freebsd/netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+
+static int tcp_reass_sysctl_maxseg(SYSCTL_HANDLER_ARGS);
+static int tcp_reass_sysctl_qsize(SYSCTL_HANDLER_ARGS);
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
+ "TCP Segment Reassembly Queue");
+
+static VNET_DEFINE(int, tcp_reass_maxseg) = 0;
+#define V_tcp_reass_maxseg VNET(tcp_reass_maxseg)
+SYSCTL_VNET_PROC(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
+ &VNET_NAME(tcp_reass_maxseg), 0, &tcp_reass_sysctl_maxseg, "I",
+ "Global maximum number of TCP Segments in Reassembly Queue");
+
+static VNET_DEFINE(int, tcp_reass_qsize) = 0;
+#define V_tcp_reass_qsize VNET(tcp_reass_qsize)
+SYSCTL_VNET_PROC(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD,
+ &VNET_NAME(tcp_reass_qsize), 0, &tcp_reass_sysctl_qsize, "I",
+ "Global number of TCP Segments currently in Reassembly Queue");
+
+static VNET_DEFINE(int, tcp_reass_overflows) = 0;
+#define V_tcp_reass_overflows VNET(tcp_reass_overflows)
+SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
+ &VNET_NAME(tcp_reass_overflows), 0,
+ "Global number of TCP Segment Reassembly Queue Overflows");
+
+static VNET_DEFINE(uma_zone_t, tcp_reass_zone);
+#define V_tcp_reass_zone VNET(tcp_reass_zone)
+
+/* Initialize TCP reassembly queue */
+static void
+tcp_reass_zone_change(void *tag)
+{
+
+ V_tcp_reass_maxseg = nmbclusters / 16;
+ uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxseg);
+}
+
+void
+tcp_reass_init(void)
+{
+
+ V_tcp_reass_maxseg = nmbclusters / 16;
+ TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
+ &V_tcp_reass_maxseg);
+ V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxseg);
+ EVENTHANDLER_REGISTER(nmbclusters_change,
+ tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
+}
+
+#ifdef VIMAGE
+void
+tcp_reass_destroy(void)
+{
+
+ uma_zdestroy(V_tcp_reass_zone);
+}
+#endif
+
+void
+tcp_reass_flush(struct tcpcb *tp)
+{
+ struct tseg_qent *qe;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) {
+ LIST_REMOVE(qe, tqe_q);
+ m_freem(qe->tqe_m);
+ uma_zfree(V_tcp_reass_zone, qe);
+ tp->t_segqlen--;
+ }
+
+ KASSERT((tp->t_segqlen == 0),
+ ("TCP reass queue %p segment count is %d instead of 0 after flush.",
+ tp, tp->t_segqlen));
+}
+
+static int
+tcp_reass_sysctl_maxseg(SYSCTL_HANDLER_ARGS)
+{
+ V_tcp_reass_maxseg = uma_zone_get_max(V_tcp_reass_zone);
+ return (sysctl_handle_int(oidp, arg1, arg2, req));
+}
+
+static int
+tcp_reass_sysctl_qsize(SYSCTL_HANDLER_ARGS)
+{
+ V_tcp_reass_qsize = uma_zone_get_cur(V_tcp_reass_zone);
+ return (sysctl_handle_int(oidp, arg1, arg2, req));
+}
+
+int
+tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
+{
+ struct tseg_qent *q;
+ struct tseg_qent *p = NULL;
+ struct tseg_qent *nq;
+ struct tseg_qent *te = NULL;
+ struct socket *so = tp->t_inpcb->inp_socket;
+ int flags;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * XXX: tcp_reass() is rather inefficient with its data structures
+ * and should be rewritten (see NetBSD for optimizations).
+ */
+
+ /*
+ * Call with th==NULL after becoming established to
+ * force pre-ESTABLISHED data up to user socket.
+ */
+ if (th == NULL)
+ goto present;
+
+ /*
+ * Limit the number of segments that can be queued to reduce the
+ * potential for mbuf exhaustion. For best performance, we want to be
+ * able to queue a full window's worth of segments. The size of the
+ * socket receive buffer determines our advertised window and grows
+ * automatically when socket buffer autotuning is enabled. Use it as the
+ * basis for our queue limit.
+ * Always let through the missing segment that caused this queueing.
+ * NB: Access to the socket buffer is left intentionally unlocked as we
+ * can tolerate stale information here.
+ *
+ * XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat
+ * should work but causes packets to be dropped when they shouldn't.
+ * Investigate why and re-evaluate the below limit after the behaviour
+ * is understood.
+ */
+ if (th->th_seq != tp->rcv_nxt &&
+ tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) {
+ V_tcp_reass_overflows++;
+ TCPSTAT_INC(tcps_rcvmemdrop);
+ m_freem(m);
+ *tlenp = 0;
+ return (0);
+ }
+
+ /*
+ * Allocate a new queue entry. If we can't, or hit the zone limit,
+ * just drop the pkt.
+ */
+ te = uma_zalloc(V_tcp_reass_zone, M_NOWAIT);
+ if (te == NULL) {
+ TCPSTAT_INC(tcps_rcvmemdrop);
+ m_freem(m);
+ *tlenp = 0;
+ return (0);
+ }
+ tp->t_segqlen++;
+
+ /*
+ * Find a segment which begins after this one does.
+ */
+ LIST_FOREACH(q, &tp->t_segq, tqe_q) {
+ if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
+ break;
+ p = q;
+ }
+
+ /*
+ * If there is a preceding segment, it may provide some of
+ * our data already. If so, drop the data from the incoming
+ * segment. If it provides all of our data, drop us.
+ */
+ if (p != NULL) {
+ int i;
+ /* conversion to int (in i) handles seq wraparound */
+ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
+ if (i > 0) {
+ if (i >= *tlenp) {
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
+ m_freem(m);
+ uma_zfree(V_tcp_reass_zone, te);
+ tp->t_segqlen--;
+ /*
+ * Try to present any queued data
+ * at the left window edge to the user.
+ * This is needed after the 3-WHS
+ * completes.
+ */
+ goto present; /* ??? */
+ }
+ m_adj(m, i);
+ *tlenp -= i;
+ th->th_seq += i;
+ }
+ }
+ TCPSTAT_INC(tcps_rcvoopack);
+ TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
+
+ /*
+ * While we overlap succeeding segments trim them or,
+ * if they are completely covered, dequeue them.
+ */
+ while (q) {
+ int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
+ if (i <= 0)
+ break;
+ if (i < q->tqe_len) {
+ q->tqe_th->th_seq += i;
+ q->tqe_len -= i;
+ m_adj(q->tqe_m, i);
+ break;
+ }
+
+ nq = LIST_NEXT(q, tqe_q);
+ LIST_REMOVE(q, tqe_q);
+ m_freem(q->tqe_m);
+ uma_zfree(V_tcp_reass_zone, q);
+ tp->t_segqlen--;
+ q = nq;
+ }
+
+ /* Insert the new segment queue entry into place. */
+ te->tqe_m = m;
+ te->tqe_th = th;
+ te->tqe_len = *tlenp;
+
+ if (p == NULL) {
+ LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
+ } else {
+ LIST_INSERT_AFTER(p, te, tqe_q);
+ }
+
+present:
+ /*
+ * Present data to user, advancing rcv_nxt through
+ * completed sequence space.
+ */
+ if (!TCPS_HAVEESTABLISHED(tp->t_state))
+ return (0);
+ q = LIST_FIRST(&tp->t_segq);
+ if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
+ return (0);
+ SOCKBUF_LOCK(&so->so_rcv);
+ do {
+ tp->rcv_nxt += q->tqe_len;
+ flags = q->tqe_th->th_flags & TH_FIN;
+ nq = LIST_NEXT(q, tqe_q);
+ LIST_REMOVE(q, tqe_q);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
+ m_freem(q->tqe_m);
+ else
+ sbappendstream_locked(&so->so_rcv, q->tqe_m);
+ uma_zfree(V_tcp_reass_zone, q);
+ tp->t_segqlen--;
+ q = nq;
+ } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
+ ND6_HINT(tp);
+ sorwakeup_locked(so);
+ return (flags);
+}
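Editorial note: the heart of the reassembly loop above is the signed overlap computation between an already queued segment and the incoming one; a positive overlap means the front of the incoming segment duplicates queued data and is trimmed off. The standalone sketch below is not part of the FreeBSD sources and shows that trim step on plain integers, ignoring sequence-number wraparound for simplicity.

#include <stdio.h>

struct seg { unsigned int seq; int len; };

/* Trim the front of "in" where it overlaps the queued segment "prev". */
static void trim_overlap(const struct seg *prev, struct seg *in)
{
	int i = (int)(prev->seq + prev->len - in->seq);

	if (i > 0 && i < in->len) {
		in->seq += i;		/* drop the duplicated prefix */
		in->len -= i;
	}
	/* i >= in->len would mean the incoming segment is a full duplicate. */
}

int main(void)
{
	struct seg prev = { 1000, 500 };	/* covers [1000, 1500) */
	struct seg in   = { 1400, 300 };	/* covers [1400, 1700) */

	trim_overlap(&prev, &in);
	printf("incoming segment now starts at %u, length %d\n", in.seq, in.len);
	return 0;
}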
diff --git a/freebsd/sys/netinet/tcp_sack.c b/freebsd/sys/netinet/tcp_sack.c
new file mode 100644
index 00000000..94bae57b
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_sack.c
@@ -0,0 +1,687 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
+ */
+
+/*-
+ * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995
+ *
+ * NRL grants permission for redistribution and use in source and binary
+ * forms, with or without modification, of the software and documentation
+ * created at NRL provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgements:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * This product includes software developed at the Information
+ * Technology Division, US Naval Research Laboratory.
+ * 4. Neither the name of the NRL nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
+ * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The views and conclusions contained in the software and documentation
+ * are those of the authors and should not be interpreted as representing
+ * official policies, either expressed or implied, of the US Naval
+ * Research Laboratory (NRL).
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_tcpdebug.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/proc.h> /* for proc0 declaration */
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/sys/systm.h>
+
+#include <freebsd/machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
+
+#include <freebsd/vm/uma.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip6.h>
+#include <freebsd/netinet/icmp6.h>
+#include <freebsd/netinet6/nd6.h>
+#include <freebsd/netinet6/ip6_var.h>
+#include <freebsd/netinet6/in6_pcb.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_fsm.h>
+#include <freebsd/netinet/tcp_seq.h>
+#include <freebsd/netinet/tcp_timer.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet6/tcp6_var.h>
+#include <freebsd/netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <freebsd/netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+
+#include <freebsd/machine/in_cksum.h>
+
+VNET_DECLARE(struct uma_zone *, sack_hole_zone);
+#define V_sack_hole_zone VNET(sack_hole_zone)
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK");
+VNET_DEFINE(int, tcp_do_sack) = 1;
+#define V_tcp_do_sack VNET(tcp_do_sack)
+SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_sack), 0, "Enable/Disable TCP SACK support");
+
+VNET_DEFINE(int, tcp_sack_maxholes) = 128;
+#define V_tcp_sack_maxholes VNET(tcp_sack_maxholes)
+SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW,
+ &VNET_NAME(tcp_sack_maxholes), 0,
+ "Maximum number of TCP SACK holes allowed per connection");
+
+VNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536;
+#define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes)
+SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW,
+ &VNET_NAME(tcp_sack_globalmaxholes), 0,
+ "Global maximum number of TCP SACK holes");
+
+VNET_DEFINE(int, tcp_sack_globalholes) = 0;
+#define V_tcp_sack_globalholes VNET(tcp_sack_globalholes)
+SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD,
+ &VNET_NAME(tcp_sack_globalholes), 0,
+ "Global number of TCP SACK holes currently allocated");
+
+/*
+ * This function is called upon receipt of new valid data (while not in
+ * header prediction mode), and it updates the ordered list of sacks.
+ */
+void
+tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
+{
+ /*
+ * First reported block MUST be the most recent one. Subsequent
+ * blocks SHOULD be in the order in which they arrived at the
+ * receiver. These two conditions make the implementation fully
+ * compliant with RFC 2018.
+ */
+ struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
+ int num_head, num_saved, i;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /* Check arguments. */
+ KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));
+
+ /* SACK block for the received segment. */
+ head_blk.start = rcv_start;
+ head_blk.end = rcv_end;
+
+ /*
+ * Merge updated SACK blocks into head_blk, and save unchanged SACK
+ * blocks into saved_blks[]. num_saved will have the number of the
+ * saved SACK blocks.
+ */
+ num_saved = 0;
+ for (i = 0; i < tp->rcv_numsacks; i++) {
+ tcp_seq start = tp->sackblks[i].start;
+ tcp_seq end = tp->sackblks[i].end;
+ if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
+ /*
+ * Discard this SACK block.
+ */
+ } else if (SEQ_LEQ(head_blk.start, end) &&
+ SEQ_GEQ(head_blk.end, start)) {
+ /*
+ * Merge this SACK block into head_blk. This SACK
+ * block itself will be discarded.
+ */
+ if (SEQ_GT(head_blk.start, start))
+ head_blk.start = start;
+ if (SEQ_LT(head_blk.end, end))
+ head_blk.end = end;
+ } else {
+ /*
+ * Save this SACK block.
+ */
+ saved_blks[num_saved].start = start;
+ saved_blks[num_saved].end = end;
+ num_saved++;
+ }
+ }
+
+ /*
+ * Update SACK list in tp->sackblks[].
+ */
+ num_head = 0;
+ if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
+ /*
+ * The received data segment is an out-of-order segment. Put
+ * head_blk at the top of SACK list.
+ */
+ tp->sackblks[0] = head_blk;
+ num_head = 1;
+ /*
+ * If the number of saved SACK blocks exceeds its limit,
+ * discard the last SACK block.
+ */
+ if (num_saved >= MAX_SACK_BLKS)
+ num_saved--;
+ }
+ if (num_saved > 0) {
+ /*
+ * Copy the saved SACK blocks back.
+ */
+ bcopy(saved_blks, &tp->sackblks[num_head],
+ sizeof(struct sackblk) * num_saved);
+ }
+
+ /* Save the number of SACK blocks. */
+ tp->rcv_numsacks = num_head + num_saved;
+}
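+
+/*
+ * Illustrative example of the update above (hypothetical sequence numbers,
+ * not part of the original source): with rcv_nxt = 100 and an existing
+ * block [200,300), an out-of-order segment [300,400) makes head_blk start
+ * as [300,400), merge with the adjacent [200,300) into [200,400), and end
+ * up as sackblks[0] with rcv_numsacks = 1, since it starts above rcv_nxt.
+ */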
+
+/*
+ * Delete all receiver-side SACK information.
+ */
+void
+tcp_clean_sackreport(struct tcpcb *tp)
+{
+ int i;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ tp->rcv_numsacks = 0;
+ for (i = 0; i < MAX_SACK_BLKS; i++)
+		tp->sackblks[i].start = tp->sackblks[i].end = 0;
+}
+
+/*
+ * Allocate struct sackhole.
+ */
+static struct sackhole *
+tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end)
+{
+ struct sackhole *hole;
+
+ if (tp->snd_numholes >= V_tcp_sack_maxholes ||
+ V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) {
+ TCPSTAT_INC(tcps_sack_sboverflow);
+ return NULL;
+ }
+
+ hole = (struct sackhole *)uma_zalloc(V_sack_hole_zone, M_NOWAIT);
+ if (hole == NULL)
+ return NULL;
+
+ hole->start = start;
+ hole->end = end;
+ hole->rxmit = start;
+
+ tp->snd_numholes++;
+ atomic_add_int(&V_tcp_sack_globalholes, 1);
+
+ return hole;
+}
+
+/*
+ * Free struct sackhole.
+ */
+static void
+tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole)
+{
+
+ uma_zfree(V_sack_hole_zone, hole);
+
+ tp->snd_numholes--;
+ atomic_subtract_int(&V_tcp_sack_globalholes, 1);
+
+ KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0"));
+ KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0"));
+}
+
+/*
+ * Insert new SACK hole into scoreboard.
+ */
+static struct sackhole *
+tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end,
+ struct sackhole *after)
+{
+ struct sackhole *hole;
+
+ /* Allocate a new SACK hole. */
+ hole = tcp_sackhole_alloc(tp, start, end);
+ if (hole == NULL)
+ return NULL;
+
+ /* Insert the new SACK hole into scoreboard. */
+ if (after != NULL)
+ TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink);
+ else
+ TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink);
+
+ /* Update SACK hint. */
+ if (tp->sackhint.nexthole == NULL)
+ tp->sackhint.nexthole = hole;
+
+ return hole;
+}
+
+/*
+ * Remove SACK hole from scoreboard.
+ */
+static void
+tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole)
+{
+
+ /* Update SACK hint. */
+ if (tp->sackhint.nexthole == hole)
+ tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink);
+
+ /* Remove this SACK hole. */
+ TAILQ_REMOVE(&tp->snd_holes, hole, scblink);
+
+ /* Free this SACK hole. */
+ tcp_sackhole_free(tp, hole);
+}
+
+/*
+ * Process cumulative ACK and the TCP SACK option to update the scoreboard.
+ * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
+ * the sequence space).
+ */
+void
+tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
+{
+ struct sackhole *cur, *temp;
+ struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
+ int i, j, num_sack_blks;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ num_sack_blks = 0;
+ /*
+ * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
+ * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
+ */
+ if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
+ sack_blocks[num_sack_blks].start = tp->snd_una;
+ sack_blocks[num_sack_blks++].end = th_ack;
+ }
+ /*
+ * Append received valid SACK blocks to sack_blocks[], but only if we
+ * received new blocks from the other side.
+ */
+ if (to->to_flags & TOF_SACK) {
+ for (i = 0; i < to->to_nsacks; i++) {
+ bcopy((to->to_sacks + i * TCPOLEN_SACK),
+ &sack, sizeof(sack));
+ sack.start = ntohl(sack.start);
+ sack.end = ntohl(sack.end);
+ if (SEQ_GT(sack.end, sack.start) &&
+ SEQ_GT(sack.start, tp->snd_una) &&
+ SEQ_GT(sack.start, th_ack) &&
+ SEQ_LT(sack.start, tp->snd_max) &&
+ SEQ_GT(sack.end, tp->snd_una) &&
+ SEQ_LEQ(sack.end, tp->snd_max))
+ sack_blocks[num_sack_blks++] = sack;
+ }
+ }
+ /*
+ * Return if SND.UNA is not advanced and no valid SACK block is
+ * received.
+ */
+ if (num_sack_blks == 0)
+ return;
+
+ /*
+ * Sort the SACK blocks so we can update the scoreboard with just one
+	 * pass. The overhead of sorting up to 4+1 elements is less than
+	 * making up to 4+1 passes over the scoreboard.
+ */
+ for (i = 0; i < num_sack_blks; i++) {
+ for (j = i + 1; j < num_sack_blks; j++) {
+ if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
+ sack = sack_blocks[i];
+ sack_blocks[i] = sack_blocks[j];
+ sack_blocks[j] = sack;
+ }
+ }
+ }
+ if (TAILQ_EMPTY(&tp->snd_holes))
+ /*
+ * Empty scoreboard. Need to initialize snd_fack (it may be
+ * uninitialized or have a bogus value). Scoreboard holes
+ * (from the sack blocks received) are created later below
+ * (in the logic that adds holes to the tail of the
+ * scoreboard).
+ */
+ tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
+ /*
+ * In the while-loop below, incoming SACK blocks (sack_blocks[]) and
+ * SACK holes (snd_holes) are traversed from their tails with just
+ * one pass in order to reduce the number of compares especially when
+ * the bandwidth-delay product is large.
+ *
+ * Note: Typically, in the first RTT of SACK recovery, the highest
+ * three or four SACK blocks with the same ack number are received.
+ * In the second RTT, if retransmitted data segments are not lost,
+ * the highest three or four SACK blocks with ack number advancing
+ * are received.
+ */
+ sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */
+ if (SEQ_LT(tp->snd_fack, sblkp->start)) {
+ /*
+ * The highest SACK block is beyond fack. Append new SACK
+ * hole at the tail. If the second or later highest SACK
+ * blocks are also beyond the current fack, they will be
+ * inserted by way of hole splitting in the while-loop below.
+ */
+ temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL);
+ if (temp != NULL) {
+ tp->snd_fack = sblkp->end;
+ /* Go to the previous sack block. */
+ sblkp--;
+ } else {
+ /*
+ * We failed to add a new hole based on the current
+ * sack block. Skip over all the sack blocks that
+ * fall completely to the right of snd_fack and
+ * proceed to trim the scoreboard based on the
+ * remaining sack blocks. This also trims the
+ * scoreboard for th_ack (which is sack_blocks[0]).
+ */
+ while (sblkp >= sack_blocks &&
+ SEQ_LT(tp->snd_fack, sblkp->start))
+ sblkp--;
+ if (sblkp >= sack_blocks &&
+ SEQ_LT(tp->snd_fack, sblkp->end))
+ tp->snd_fack = sblkp->end;
+ }
+ } else if (SEQ_LT(tp->snd_fack, sblkp->end))
+ /* fack is advanced. */
+ tp->snd_fack = sblkp->end;
+ /* We must have at least one SACK hole in scoreboard. */
+ KASSERT(!TAILQ_EMPTY(&tp->snd_holes),
+ ("SACK scoreboard must not be empty"));
+ cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */
+ /*
+ * Since the incoming sack blocks are sorted, we can process them
+ * making one sweep of the scoreboard.
+ */
+ while (sblkp >= sack_blocks && cur != NULL) {
+ if (SEQ_GEQ(sblkp->start, cur->end)) {
+ /*
+ * SACKs data beyond the current hole. Go to the
+ * previous sack block.
+ */
+ sblkp--;
+ continue;
+ }
+ if (SEQ_LEQ(sblkp->end, cur->start)) {
+ /*
+ * SACKs data before the current hole. Go to the
+ * previous hole.
+ */
+ cur = TAILQ_PREV(cur, sackhole_head, scblink);
+ continue;
+ }
+ tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start);
+ KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
+ ("sackhint bytes rtx >= 0"));
+ if (SEQ_LEQ(sblkp->start, cur->start)) {
+ /* Data acks at least the beginning of hole. */
+ if (SEQ_GEQ(sblkp->end, cur->end)) {
+ /* Acks entire hole, so delete hole. */
+ temp = cur;
+ cur = TAILQ_PREV(cur, sackhole_head, scblink);
+ tcp_sackhole_remove(tp, temp);
+ /*
+ * The sack block may ack all or part of the
+ * next hole too, so continue onto the next
+ * hole.
+ */
+ continue;
+ } else {
+ /* Move start of hole forward. */
+ cur->start = sblkp->end;
+ cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
+ }
+ } else {
+ /* Data acks at least the end of hole. */
+ if (SEQ_GEQ(sblkp->end, cur->end)) {
+ /* Move end of hole backward. */
+ cur->end = sblkp->start;
+ cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
+ } else {
+ /*
+ * ACKs some data in middle of a hole; need
+ * to split current hole
+ */
+ temp = tcp_sackhole_insert(tp, sblkp->end,
+ cur->end, cur);
+ if (temp != NULL) {
+ if (SEQ_GT(cur->rxmit, temp->rxmit)) {
+ temp->rxmit = cur->rxmit;
+ tp->sackhint.sack_bytes_rexmit
+ += (temp->rxmit
+ - temp->start);
+ }
+ cur->end = sblkp->start;
+ cur->rxmit = SEQ_MIN(cur->rxmit,
+ cur->end);
+ }
+ }
+ }
+ tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start);
+ /*
+ * Testing sblkp->start against cur->start tells us whether
+ * we're done with the sack block or the sack hole.
+ * Accordingly, we advance one or the other.
+ */
+ if (SEQ_LEQ(sblkp->start, cur->start))
+ cur = TAILQ_PREV(cur, sackhole_head, scblink);
+ else
+ sblkp--;
+ }
+}
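+
+/*
+ * Illustrative walk-through (hypothetical sequence numbers): with
+ * snd_una = 1000, snd_max = 5000 and an empty scoreboard, an ACK for 1000
+ * carrying the SACK block [3000,4000) appends the hole [1000,3000) at the
+ * tail and advances snd_fack to 4000.  A later ACK carrying [2000,2500)
+ * touches neither end of that hole, so it is split into [1000,2000) and
+ * [2500,3000) by the hole-splitting branch above.
+ */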
+
+/*
+ * Free all SACK holes to clear the scoreboard.
+ */
+void
+tcp_free_sackholes(struct tcpcb *tp)
+{
+ struct sackhole *q;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL)
+ tcp_sackhole_remove(tp, q);
+ tp->sackhint.sack_bytes_rexmit = 0;
+
+ KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0"));
+ KASSERT(tp->sackhint.nexthole == NULL,
+ ("tp->sackhint.nexthole == NULL"));
+}
+
+/*
+ * Partial ack handling within a sack recovery episode. Keeping this very
+ * simple for now. When a partial ack is received, force snd_cwnd to a value
+ * that will allow the sender to transmit no more than 2 segments. If
+ * necessary, a better scheme can be adopted at a later point, but for now,
+ * the goal is to prevent the sender from bursting a large amount of data in
+ * the midst of sack recovery.
+ */
+void
+tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
+{
+ int num_segs = 1;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_rtttime = 0;
+ /* Send one or 2 segments based on how much new data was acked. */
+ if (((th->th_ack - tp->snd_una) / tp->t_maxseg) > 2)
+ num_segs = 2;
+ tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
+ (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg);
+ if (tp->snd_cwnd > tp->snd_ssthresh)
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->t_flags |= TF_ACKNOW;
+ (void) tcp_output(tp);
+}
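+
+/*
+ * Worked example (assumed values): with t_maxseg = 1460 and a partial ack
+ * that advances snd_una by less than three segments, num_segs stays 1, so
+ * snd_cwnd becomes the bytes already retransmitted from the scoreboard,
+ * plus what was sent beyond sack_newdata, plus one MSS; the sender can
+ * therefore emit roughly one additional segment before the window closes.
+ */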
+
+#if 0
+/*
+ * Debug version of tcp_sack_output() that walks the scoreboard. Used for
+ * now to sanity check the hint.
+ */
+static struct sackhole *
+tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt)
+{
+ struct sackhole *p;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ *sack_bytes_rexmt = 0;
+ TAILQ_FOREACH(p, &tp->snd_holes, scblink) {
+ if (SEQ_LT(p->rxmit, p->end)) {
+ if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
+ continue;
+ }
+ *sack_bytes_rexmt += (p->rxmit - p->start);
+ break;
+ }
+ *sack_bytes_rexmt += (p->rxmit - p->start);
+ }
+ return (p);
+}
+#endif
+
+/*
+ * Returns the next hole to retransmit and the number of retransmitted bytes
+ * from the scoreboard. We store both the next hole and the number of
+ * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK
+ * reception). This avoids scoreboard traversals completely.
+ *
+ * The loop here will traverse *at most* one link. Here's the argument. For
+ * the loop to traverse more than 1 link before finding the next hole to
+ * retransmit, we would need to have at least 1 node following the current
+ * hint with (rxmit == end). But, for all holes following the current hint,
+ * (start == rxmit), since we have not yet retransmitted from them.
+ * Therefore, in order to traverse more than 1 link in the loop below, we need to
+ * have at least one node following the current hint with (start == rxmit ==
+ * end). But that can't happen, because (start == end) means that all the data in
+ * that hole has been sacked, in which case, the hole would have been removed
+ * from the scoreboard.
+ */
+struct sackhole *
+tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
+{
+ struct sackhole *hole = NULL;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit;
+ hole = tp->sackhint.nexthole;
+ if (hole == NULL || SEQ_LT(hole->rxmit, hole->end))
+ goto out;
+ while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) {
+ if (SEQ_LT(hole->rxmit, hole->end)) {
+ tp->sackhint.nexthole = hole;
+ break;
+ }
+ }
+out:
+ return (hole);
+}
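+
+/*
+ * Illustrative case (hypothetical holes): if the hint points at a hole
+ * [100,200) whose rxmit has already reached 200, the loop above advances
+ * exactly one link to the next hole, say [300,400) with rxmit = 300,
+ * records it as the new hint and returns it; as argued above it can never
+ * need to advance further than that.
+ */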
+
+/*
+ * After a timeout, the SACK list may be rebuilt. This SACK information
+ * should be used to avoid retransmitting SACKed data. This function
+ * traverses the SACK list to see if snd_nxt should be moved forward.
+ */
+void
+tcp_sack_adjust(struct tcpcb *tp)
+{
+ struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes);
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (cur == NULL)
+ return; /* No holes */
+ if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack))
+ return; /* We're already beyond any SACKed blocks */
+ /*-
+ * Two cases for which we want to advance snd_nxt:
+ * i) snd_nxt lies between end of one hole and beginning of another
+ * ii) snd_nxt lies between end of last hole and snd_fack
+ */
+ while ((p = TAILQ_NEXT(cur, scblink)) != NULL) {
+ if (SEQ_LT(tp->snd_nxt, cur->end))
+ return;
+ if (SEQ_GEQ(tp->snd_nxt, p->start))
+ cur = p;
+ else {
+ tp->snd_nxt = p->start;
+ return;
+ }
+ }
+ if (SEQ_LT(tp->snd_nxt, cur->end))
+ return;
+ tp->snd_nxt = tp->snd_fack;
+}
diff --git a/freebsd/sys/netinet/tcp_seq.h b/freebsd/sys/netinet/tcp_seq.h
new file mode 100644
index 00000000..8af7b0ab
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_seq.h
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_seq.h 8.3 (Berkeley) 6/21/95
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_SEQ_HH_
+#define _NETINET_TCP_SEQ_HH_
+/*
+ * TCP sequence numbers are 32 bit integers operated
+ * on with modular arithmetic. These macros can be
+ * used to compare such integers.
+ */
+#define SEQ_LT(a,b) ((int)((a)-(b)) < 0)
+#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0)
+#define SEQ_GT(a,b) ((int)((a)-(b)) > 0)
+#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0)
+
+#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b))
+#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b))
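+
+/*
+ * Worked example (illustrative): with 32-bit wraparound, SEQ_LT(0xfffffff0,
+ * 0x00000010) evaluates (int)(0xfffffff0 - 0x00000010) = (int)0xffffffe0,
+ * which is -32, so a sequence number just below the wrap point still
+ * compares as less than one just above it.
+ */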
+
+/* for modulo comparisons of timestamps */
+#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)
+#define TSTMP_GT(a,b) ((int)((a)-(b)) > 0)
+#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)
+
+/*
+ * Macros to initialize tcp sequence numbers for
+ * send and receive from initial send and receive
+ * sequence numbers.
+ */
+#define tcp_rcvseqinit(tp) \
+ (tp)->rcv_adv = (tp)->rcv_nxt = (tp)->irs + 1
+
+#define tcp_sendseqinit(tp) \
+ (tp)->snd_una = (tp)->snd_nxt = (tp)->snd_max = (tp)->snd_up = \
+ (tp)->snd_recover = (tp)->iss
+
+#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * hz)
+ /* timestamp wrap-around time */
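+
+/*
+ * For reference (illustrative arithmetic): 24 * 24 * 60 * 60 seconds is
+ * 2,073,600 seconds, i.e. 24 days, so TCP_PAWS_IDLE expresses 24 idle days
+ * once scaled to ticks by hz.
+ */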
+
+#endif /* _NETINET_TCP_SEQ_HH_ */
diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c
new file mode 100644
index 00000000..83777450
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_subr.c
@@ -0,0 +1,2315 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_compat.h>
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_ipsec.h>
+#include <freebsd/local/opt_tcpdebug.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/callout.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/jail.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#ifdef INET6
+#include <freebsd/sys/domain.h>
+#endif
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/random.h>
+
+#include <freebsd/vm/uma.h>
+
+#include <freebsd/net/route.h>
+#include <freebsd/net/if.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#ifdef INET6
+#include <freebsd/netinet/ip6.h>
+#endif
+#include <freebsd/netinet/in_pcb.h>
+#ifdef INET6
+#include <freebsd/netinet6/in6_pcb.h>
+#endif
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip_var.h>
+#ifdef INET6
+#include <freebsd/netinet6/ip6_var.h>
+#include <freebsd/netinet6/scope6_var.h>
+#include <freebsd/netinet6/nd6.h>
+#endif
+#include <freebsd/netinet/ip_icmp.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_fsm.h>
+#include <freebsd/netinet/tcp_seq.h>
+#include <freebsd/netinet/tcp_timer.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/tcp_syncache.h>
+#include <freebsd/netinet/tcp_offload.h>
+#ifdef INET6
+#include <freebsd/netinet6/tcp6_var.h>
+#endif
+#include <freebsd/netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <freebsd/netinet/tcp_debug.h>
+#endif
+#include <freebsd/netinet6/ip6protosw.h>
+
+#ifdef IPSEC
+#include <freebsd/netipsec/ipsec.h>
+#include <freebsd/netipsec/xform.h>
+#ifdef INET6
+#include <freebsd/netipsec/ipsec6.h>
+#endif
+#include <freebsd/netipsec/key.h>
+#include <freebsd/sys/syslog.h>
+#endif /*IPSEC*/
+
+#include <freebsd/machine/in_cksum.h>
+#include <freebsd/sys/md5.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS;
+#ifdef INET6
+VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
+#endif
+
+static int
+sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
+{
+ int error, new;
+
+ new = V_tcp_mssdflt;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error == 0 && req->newptr) {
+ if (new < TCP_MINMSS)
+ error = EINVAL;
+ else
+ V_tcp_mssdflt = new;
+ }
+ return (error);
+}
+
+SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
+ CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0,
+ &sysctl_net_inet_tcp_mss_check, "I",
+ "Default TCP Maximum Segment Size");
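+
+/*
+ * From userland this surfaces as the net.inet.tcp.mssdflt sysctl (for
+ * example "sysctl net.inet.tcp.mssdflt=1460"); values below TCP_MINMSS are
+ * rejected with EINVAL by the handler above.
+ */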
+
+#ifdef INET6
+static int
+sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
+{
+ int error, new;
+
+ new = V_tcp_v6mssdflt;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error == 0 && req->newptr) {
+ if (new < TCP_MINMSS)
+ error = EINVAL;
+ else
+ V_tcp_v6mssdflt = new;
+ }
+ return (error);
+}
+
+SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
+ CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0,
+ &sysctl_net_inet_tcp_mss_v6_check, "I",
+ "Default TCP Maximum Segment Size for IPv6");
+#endif
+
+static int
+vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
+{
+
+ VNET_SYSCTL_ARG(req, arg1);
+ return (sysctl_msec_to_ticks(oidp, arg1, arg2, req));
+}
+
+/*
+ * Minimum MSS we accept and use. This prevents DoS attacks where
+ * we are forced to a ridiculously low MSS like 20 and send hundreds
+ * of packets instead of one. The effect scales with the available
+ * bandwidth and quickly saturates the CPU and network interface
+ * with packet generation and sending. Set to zero to disable MINMSS
+ * checking. This setting prevents us from sending too small packets.
+ */
+VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS;
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
+ &VNET_NAME(tcp_minmss), 0,
+    "Minimum TCP Maximum Segment Size");
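+
+/*
+ * Rough illustration of the effect described above (assumed numbers): at
+ * 1 Mbit/s, roughly 125,000 bytes/s, a forced MSS of 20 bytes means about
+ * 6,250 packets per second, while an MSS of 1460 needs only about 86, so a
+ * tiny MSS inflates per-packet overhead by nearly two orders of magnitude.
+ */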
+
+VNET_DEFINE(int, tcp_do_rfc1323) = 1;
+SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_rfc1323), 0,
+ "Enable rfc1323 (high performance TCP) extensions");
+
+static int tcp_log_debug = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
+ &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
+
+static int tcp_tcbhashsize = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
+ &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
+
+static int do_tcpdrain = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
+ "Enable tcp_drain routine for extra help when low on mbufs");
+
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
+ &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs");
+
+static VNET_DEFINE(int, icmp_may_rst) = 1;
+#define V_icmp_may_rst VNET(icmp_may_rst)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW,
+ &VNET_NAME(icmp_may_rst), 0,
+ "Certain ICMP unreachable messages may abort connections in SYN_SENT");
+
+static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0;
+#define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
+ &VNET_NAME(tcp_isn_reseed_interval), 0,
+ "Seconds between reseeding of ISN secret");
+
+/*
+ * TCP bandwidth limiting sysctls. Note that the default lower bound of
+ * 1024 exists only for debugging. A good production default would be
+ * something like 6100.
+ */
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
+ "TCP inflight data limiting");
+
+static VNET_DEFINE(int, tcp_inflight_enable) = 0;
+#define V_tcp_inflight_enable VNET(tcp_inflight_enable)
+SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
+ &VNET_NAME(tcp_inflight_enable), 0,
+ "Enable automatic TCP inflight data limiting");
+
+static int tcp_inflight_debug = 0;
+SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
+ &tcp_inflight_debug, 0,
+ "Debug TCP inflight calculations");
+
+static VNET_DEFINE(int, tcp_inflight_rttthresh);
+#define V_tcp_inflight_rttthresh VNET(tcp_inflight_rttthresh)
+SYSCTL_VNET_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh,
+ CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_inflight_rttthresh), 0,
+ vnet_sysctl_msec_to_ticks, "I",
+ "RTT threshold below which inflight will deactivate itself");
+
+static VNET_DEFINE(int, tcp_inflight_min) = 6144;
+#define V_tcp_inflight_min VNET(tcp_inflight_min)
+SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
+ &VNET_NAME(tcp_inflight_min), 0,
+ "Lower-bound for TCP inflight window");
+
+static VNET_DEFINE(int, tcp_inflight_max) = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+#define V_tcp_inflight_max VNET(tcp_inflight_max)
+SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
+ &VNET_NAME(tcp_inflight_max), 0,
+ "Upper-bound for TCP inflight window");
+
+static VNET_DEFINE(int, tcp_inflight_stab) = 20;
+#define V_tcp_inflight_stab VNET(tcp_inflight_stab)
+SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
+ &VNET_NAME(tcp_inflight_stab), 0,
+ "Inflight Algorithm Stabilization 20 = 2 packets");
+
+VNET_DEFINE(uma_zone_t, sack_hole_zone);
+#define V_sack_hole_zone VNET(sack_hole_zone)
+
+static struct inpcb *tcp_notify(struct inpcb *, int);
+static void tcp_isn_tick(void *);
+static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
+ void *ip4hdr, const void *ip6hdr);
+
+/*
+ * Target size of TCP PCB hash tables. Must be a power of two.
+ *
+ * Note that this can be overridden by the kernel environment
+ * variable net.inet.tcp.tcbhashsize
+ */
+#ifndef TCBHASHSIZE
+#define TCBHASHSIZE 512
+#endif
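+
+/*
+ * Illustrative override (assumed usage): setting the loader tunable, e.g.
+ * net.inet.tcp.tcbhashsize="1024" in loader.conf, replaces this default;
+ * tcp_init() below falls back to 512 if the value is not a power of two.
+ */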
+
+/*
+ * XXX
+ * Callouts should be moved into struct tcp directly. They are currently
+ * separate because the tcpcb structure is exported to userland for sysctl
+ * parsing purposes, which do not know about callouts.
+ */
+struct tcpcb_mem {
+ struct tcpcb tcb;
+ struct tcp_timer tt;
+};
+
+static VNET_DEFINE(uma_zone_t, tcpcb_zone);
+#define V_tcpcb_zone VNET(tcpcb_zone)
+
+MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
+struct callout isn_callout;
+static struct mtx isn_mtx;
+
+#define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
+#define ISN_LOCK() mtx_lock(&isn_mtx)
+#define ISN_UNLOCK() mtx_unlock(&isn_mtx)
+
+/*
+ * TCP initialization.
+ */
+static void
+tcp_zone_change(void *tag)
+{
+
+ uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
+ uma_zone_set_max(V_tcpcb_zone, maxsockets);
+ tcp_tw_zone_change();
+}
+
+static int
+tcp_inpcb_init(void *mem, int size, int flags)
+{
+ struct inpcb *inp = mem;
+
+ INP_LOCK_INIT(inp, "inp", "tcpinp");
+ return (0);
+}
+
+void
+tcp_init(void)
+{
+ int hashsize;
+
+ INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp");
+ LIST_INIT(&V_tcb);
+#ifdef VIMAGE
+ V_tcbinfo.ipi_vnet = curvnet;
+#endif
+ V_tcbinfo.ipi_listhead = &V_tcb;
+ hashsize = TCBHASHSIZE;
+ TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
+ if (!powerof2(hashsize)) {
+ printf("WARNING: TCB hash size not a power of 2\n");
+ hashsize = 512; /* safe default */
+ }
+ V_tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB,
+ &V_tcbinfo.ipi_hashmask);
+ V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB,
+ &V_tcbinfo.ipi_porthashmask);
+ V_tcbinfo.ipi_zone = uma_zcreate("tcp_inpcb", sizeof(struct inpcb),
+ NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
+ V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
+
+ /*
+ * These have to be type stable for the benefit of the timers.
+ */
+ V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(V_tcpcb_zone, maxsockets);
+
+ tcp_tw_init();
+ syncache_init();
+ tcp_hc_init();
+ tcp_reass_init();
+
+ TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
+ V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+
+ /* Skip initialization of globals for non-default instances. */
+ if (!IS_DEFAULT_VNET(curvnet))
+ return;
+
+	/* XXX virtualize those below? */
+ tcp_delacktime = TCPTV_DELACK;
+ tcp_keepinit = TCPTV_KEEP_INIT;
+ tcp_keepidle = TCPTV_KEEP_IDLE;
+ tcp_keepintvl = TCPTV_KEEPINTVL;
+ tcp_maxpersistidle = TCPTV_KEEP_IDLE;
+ tcp_msl = TCPTV_MSL;
+ tcp_rexmit_min = TCPTV_MIN;
+ if (tcp_rexmit_min < 1)
+ tcp_rexmit_min = 1;
+ tcp_rexmit_slop = TCPTV_CPU_VAR;
+ tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
+ tcp_tcbhashsize = hashsize;
+
+#ifdef INET6
+#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
+#else /* INET6 */
+#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
+#endif /* INET6 */
+ if (max_protohdr < TCP_MINPROTOHDR)
+ max_protohdr = TCP_MINPROTOHDR;
+ if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
+ panic("tcp_init");
+#undef TCP_MINPROTOHDR
+
+ ISN_LOCK_INIT();
+ callout_init(&isn_callout, CALLOUT_MPSAFE);
+ callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL);
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
+ SHUTDOWN_PRI_DEFAULT);
+ EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
+ EVENTHANDLER_PRI_ANY);
+}
+
+#ifdef VIMAGE
+void
+tcp_destroy(void)
+{
+
+ tcp_reass_destroy();
+ tcp_hc_destroy();
+ syncache_destroy();
+ tcp_tw_destroy();
+
+ /* XXX check that hashes are empty! */
+ hashdestroy(V_tcbinfo.ipi_hashbase, M_PCB,
+ V_tcbinfo.ipi_hashmask);
+ hashdestroy(V_tcbinfo.ipi_porthashbase, M_PCB,
+ V_tcbinfo.ipi_porthashmask);
+
+ uma_zdestroy(V_sack_hole_zone);
+ uma_zdestroy(V_tcpcb_zone);
+ uma_zdestroy(V_tcbinfo.ipi_zone);
+
+ INP_INFO_LOCK_DESTROY(&V_tcbinfo);
+}
+#endif
+
+void
+tcp_fini(void *xtp)
+{
+
+ callout_stop(&isn_callout);
+}
+
+/*
+ * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
+ * tcp_template used to store this data in mbufs, but we now recopy it out
+ * of the tcpcb each time to conserve mbufs.
+ */
+void
+tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
+{
+ struct tcphdr *th = (struct tcphdr *)tcp_ptr;
+
+ INP_WLOCK_ASSERT(inp);
+
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV6) != 0) {
+ struct ip6_hdr *ip6;
+
+ ip6 = (struct ip6_hdr *)ip_ptr;
+ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
+ (inp->inp_flow & IPV6_FLOWINFO_MASK);
+ ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
+ (IPV6_VERSION & IPV6_VERSION_MASK);
+ ip6->ip6_nxt = IPPROTO_TCP;
+ ip6->ip6_plen = htons(sizeof(struct tcphdr));
+ ip6->ip6_src = inp->in6p_laddr;
+ ip6->ip6_dst = inp->in6p_faddr;
+ } else
+#endif
+ {
+ struct ip *ip;
+
+ ip = (struct ip *)ip_ptr;
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = 5;
+ ip->ip_tos = inp->inp_ip_tos;
+ ip->ip_len = 0;
+ ip->ip_id = 0;
+ ip->ip_off = 0;
+ ip->ip_ttl = inp->inp_ip_ttl;
+ ip->ip_sum = 0;
+ ip->ip_p = IPPROTO_TCP;
+ ip->ip_src = inp->inp_laddr;
+ ip->ip_dst = inp->inp_faddr;
+ }
+ th->th_sport = inp->inp_lport;
+ th->th_dport = inp->inp_fport;
+ th->th_seq = 0;
+ th->th_ack = 0;
+ th->th_x2 = 0;
+ th->th_off = 5;
+ th->th_flags = 0;
+ th->th_win = 0;
+ th->th_urp = 0;
+ th->th_sum = 0; /* in_pseudo() is called later for ipv4 */
+}
+
+/*
+ * Create template to be used to send tcp packets on a connection.
+ * Allocates a tcptemp buffer and fills in a skeletal tcp/ip header. The only
+ * use for this function is in keepalives, which use tcp_respond.
+ */
+struct tcptemp *
+tcpip_maketemplate(struct inpcb *inp)
+{
+ struct tcptemp *t;
+
+ t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
+ if (t == NULL)
+ return (NULL);
+ tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t);
+ return (t);
+}
+
+/*
+ * Send a single message to the TCP at address specified by
+ * the given TCP/IP header. If m == NULL, then we make a copy
+ * of the tcpiphdr at ti and send directly to the addressed host.
+ * This is used to force keep alive messages out using the TCP
+ * template for a connection. If flags are given then we send
+ * a message back to the TCP which originated the segment ti,
+ * and discard the mbuf containing it and any other attached mbufs.
+ *
+ * In any case the ack and sequence number of the transmitted
+ * segment are as specified by the parameters.
+ *
+ * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
+ */
+void
+tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
+ tcp_seq ack, tcp_seq seq, int flags)
+{
+ int tlen;
+ int win = 0;
+ struct ip *ip;
+ struct tcphdr *nth;
+#ifdef INET6
+ struct ip6_hdr *ip6;
+ int isipv6;
+#endif /* INET6 */
+ int ipflags = 0;
+ struct inpcb *inp;
+
+ KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
+
+#ifdef INET6
+ isipv6 = ((struct ip *)ipgen)->ip_v == 6;
+ ip6 = ipgen;
+#endif /* INET6 */
+ ip = ipgen;
+
+ if (tp != NULL) {
+ inp = tp->t_inpcb;
+ KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
+ INP_WLOCK_ASSERT(inp);
+ } else
+ inp = NULL;
+
+ if (tp != NULL) {
+ if (!(flags & TH_RST)) {
+ win = sbspace(&inp->inp_socket->so_rcv);
+ if (win > (long)TCP_MAXWIN << tp->rcv_scale)
+ win = (long)TCP_MAXWIN << tp->rcv_scale;
+ }
+ }
+ if (m == NULL) {
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
+ if (m == NULL)
+ return;
+ tlen = 0;
+ m->m_data += max_linkhdr;
+#ifdef INET6
+ if (isipv6) {
+ bcopy((caddr_t)ip6, mtod(m, caddr_t),
+ sizeof(struct ip6_hdr));
+ ip6 = mtod(m, struct ip6_hdr *);
+ nth = (struct tcphdr *)(ip6 + 1);
+ } else
+#endif /* INET6 */
+ {
+ bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
+ ip = mtod(m, struct ip *);
+ nth = (struct tcphdr *)(ip + 1);
+ }
+ bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
+ flags = TH_ACK;
+ } else {
+ /*
+ * reuse the mbuf.
+		 * XXX MRT We inherit the FIB, which is lucky.
+ */
+ m_freem(m->m_next);
+ m->m_next = NULL;
+ m->m_data = (caddr_t)ipgen;
+ /* m_len is set later */
+ tlen = 0;
+#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
+#ifdef INET6
+ if (isipv6) {
+ xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
+ nth = (struct tcphdr *)(ip6 + 1);
+ } else
+#endif /* INET6 */
+ {
+ xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
+ nth = (struct tcphdr *)(ip + 1);
+ }
+ if (th != nth) {
+ /*
+ * this is usually a case when an extension header
+ * exists between the IPv6 header and the
+ * TCP header.
+ */
+ nth->th_sport = th->th_sport;
+ nth->th_dport = th->th_dport;
+ }
+ xchg(nth->th_dport, nth->th_sport, uint16_t);
+#undef xchg
+ }
+#ifdef INET6
+ if (isipv6) {
+ ip6->ip6_flow = 0;
+ ip6->ip6_vfc = IPV6_VERSION;
+ ip6->ip6_nxt = IPPROTO_TCP;
+ ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
+ tlen));
+ tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
+ } else
+#endif
+ {
+ tlen += sizeof (struct tcpiphdr);
+ ip->ip_len = tlen;
+ ip->ip_ttl = V_ip_defttl;
+ if (V_path_mtu_discovery)
+ ip->ip_off |= IP_DF;
+ }
+ m->m_len = tlen;
+ m->m_pkthdr.len = tlen;
+ m->m_pkthdr.rcvif = NULL;
+#ifdef MAC
+ if (inp != NULL) {
+ /*
+ * Packet is associated with a socket, so allow the
+ * label of the response to reflect the socket label.
+ */
+ INP_WLOCK_ASSERT(inp);
+ mac_inpcb_create_mbuf(inp, m);
+ } else {
+ /*
+ * Packet is not associated with a socket, so possibly
+ * update the label in place.
+ */
+ mac_netinet_tcp_reply(m);
+ }
+#endif
+ nth->th_seq = htonl(seq);
+ nth->th_ack = htonl(ack);
+ nth->th_x2 = 0;
+ nth->th_off = sizeof (struct tcphdr) >> 2;
+ nth->th_flags = flags;
+ if (tp != NULL)
+ nth->th_win = htons((u_short) (win >> tp->rcv_scale));
+ else
+ nth->th_win = htons((u_short)win);
+ nth->th_urp = 0;
+#ifdef INET6
+ if (isipv6) {
+ nth->th_sum = 0;
+ nth->th_sum = in6_cksum(m, IPPROTO_TCP,
+ sizeof(struct ip6_hdr),
+ tlen - sizeof(struct ip6_hdr));
+ ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
+ NULL, NULL);
+ } else
+#endif /* INET6 */
+ {
+ nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ }
+#ifdef TCPDEBUG
+ if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
+#endif
+#ifdef INET6
+ if (isipv6)
+ (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
+ else
+#endif /* INET6 */
+ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
+}
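+
+/*
+ * Sketch of a typical caller (illustrative only; locking and error handling
+ * omitted): a keepalive probe builds a throwaway template with
+ * tcpip_maketemplate() and hands it to tcp_respond() with an old sequence
+ * number to provoke an ACK from the peer:
+ *
+ *	struct tcptemp *t = tcpip_maketemplate(inp);
+ *	if (t != NULL) {
+ *		tcp_respond(tp, t->tt_ipgen, &t->tt_t, NULL,
+ *		    tp->rcv_nxt, tp->snd_una - 1, 0);
+ *		free(t, M_TEMP);
+ *	}
+ */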
+
+/*
+ * Create a new TCP control block, making an
+ * empty reassembly queue and hooking it to the argument
+ * protocol control block. The `inp' parameter must have
+ * come from the zone allocator set up in tcp_init().
+ */
+struct tcpcb *
+tcp_newtcpcb(struct inpcb *inp)
+{
+ struct tcpcb_mem *tm;
+ struct tcpcb *tp;
+#ifdef INET6
+ int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
+#endif /* INET6 */
+
+ tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO);
+ if (tm == NULL)
+ return (NULL);
+ tp = &tm->tcb;
+#ifdef VIMAGE
+ tp->t_vnet = inp->inp_vnet;
+#endif
+ tp->t_timers = &tm->tt;
+ /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */
+ tp->t_maxseg = tp->t_maxopd =
+#ifdef INET6
+ isipv6 ? V_tcp_v6mssdflt :
+#endif /* INET6 */
+ V_tcp_mssdflt;
+
+ /* Set up our timeouts. */
+ callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE);
+ callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE);
+ callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE);
+ callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE);
+ callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE);
+
+ if (V_tcp_do_rfc1323)
+ tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
+ if (V_tcp_do_sack)
+ tp->t_flags |= TF_SACK_PERMIT;
+ TAILQ_INIT(&tp->snd_holes);
+ tp->t_inpcb = inp; /* XXX */
+ /*
+ * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
+ * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
+ * reasonable initial retransmit time.
+ */
+ tp->t_srtt = TCPTV_SRTTBASE;
+ tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
+ tp->t_rttmin = tcp_rexmit_min;
+ tp->t_rxtcur = TCPTV_RTOBASE;
+ tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->t_rcvtime = ticks;
+ tp->t_bw_rtttime = ticks;
+ /*
+ * IPv4 TTL initialization is necessary for an IPv6 socket as well,
+ * because the socket may be bound to an IPv6 wildcard address,
+ * which may match an IPv4-mapped IPv6 address.
+ */
+ inp->inp_ip_ttl = V_ip_defttl;
+ inp->inp_ppcb = tp;
+ return (tp); /* XXX */
+}
+
+/*
+ * Drop a TCP connection, reporting
+ * the specified error. If connection is synchronized,
+ * then send a RST to peer.
+ */
+struct tcpcb *
+tcp_drop(struct tcpcb *tp, int errno)
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (TCPS_HAVERCVDSYN(tp->t_state)) {
+ tp->t_state = TCPS_CLOSED;
+ (void) tcp_output_reset(tp);
+ TCPSTAT_INC(tcps_drops);
+ } else
+ TCPSTAT_INC(tcps_conndrops);
+ if (errno == ETIMEDOUT && tp->t_softerror)
+ errno = tp->t_softerror;
+ so->so_error = errno;
+ return (tcp_close(tp));
+}
+
+void
+tcp_discardcb(struct tcpcb *tp)
+{
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
+#ifdef INET6
+ int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
+#endif /* INET6 */
+
+ INP_WLOCK_ASSERT(inp);
+
+ /*
+ * Make sure that all of our timers are stopped before we
+ * delete the PCB.
+ */
+ callout_stop(&tp->t_timers->tt_rexmt);
+ callout_stop(&tp->t_timers->tt_persist);
+ callout_stop(&tp->t_timers->tt_keep);
+ callout_stop(&tp->t_timers->tt_2msl);
+ callout_stop(&tp->t_timers->tt_delack);
+
+ /*
+ * If we got enough samples through the srtt filter,
+ * save the rtt and rttvar in the routing entry.
+ * 'Enough' is arbitrarily defined as 4 rtt samples.
+ * 4 samples is enough for the srtt filter to converge
+ * to within enough % of the correct value; fewer samples
+ * and we could save a bogus rtt. The danger is not high
+ * as tcp quickly recovers from everything.
+ * XXX: Works very well but needs some more statistics!
+ */
+ if (tp->t_rttupdated >= 4) {
+ struct hc_metrics_lite metrics;
+ u_long ssthresh;
+
+ bzero(&metrics, sizeof(metrics));
+ /*
+ * Update the ssthresh always when the conditions below
+ * are satisfied. This gives us better new start value
+ * for the congestion avoidance for new connections.
+		 * ssthresh is only set if packet loss occurred on a session.
+ *
+ * XXXRW: 'so' may be NULL here, and/or socket buffer may be
+ * being torn down. Ideally this code would not use 'so'.
+ */
+ ssthresh = tp->snd_ssthresh;
+ if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
+ /*
+ * convert the limit from user data bytes to
+ * packets then to packet data bytes.
+ */
+ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
+ if (ssthresh < 2)
+ ssthresh = 2;
+ ssthresh *= (u_long)(tp->t_maxseg +
+#ifdef INET6
+ (isipv6 ? sizeof (struct ip6_hdr) +
+ sizeof (struct tcphdr) :
+#endif
+ sizeof (struct tcpiphdr)
+#ifdef INET6
+ )
+#endif
+ );
+ } else
+ ssthresh = 0;
+ metrics.rmx_ssthresh = ssthresh;
+
+ metrics.rmx_rtt = tp->t_srtt;
+ metrics.rmx_rttvar = tp->t_rttvar;
+ /* XXX: This wraps if the pipe is more than 4 Gbit per second */
+ metrics.rmx_bandwidth = tp->snd_bandwidth;
+ metrics.rmx_cwnd = tp->snd_cwnd;
+ metrics.rmx_sendpipe = 0;
+ metrics.rmx_recvpipe = 0;
+
+ tcp_hc_update(&inp->inp_inc, &metrics);
+ }
+
+ /* free the reassembly queue, if any */
+ tcp_reass_flush(tp);
+ /* Disconnect offload device, if any. */
+ tcp_offload_detach(tp);
+
+ tcp_free_sackholes(tp);
+ inp->inp_ppcb = NULL;
+ tp->t_inpcb = NULL;
+ uma_zfree(V_tcpcb_zone, tp);
+}
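+
+/*
+ * Worked example for the ssthresh conversion above (hypothetical values):
+ * with ssthresh = 32768 bytes and t_maxseg = 1460, (32768 + 730) / 1460
+ * yields 22 packets, and 22 * (1460 + 40) gives 33000 bytes of packet data
+ * recorded in the host cache for an IPv4 connection.
+ */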
+
+/*
+ * Attempt to close a TCP control block, marking it as dropped, and freeing
+ * the socket if we hold the only reference.
+ */
+struct tcpcb *
+tcp_close(struct tcpcb *tp)
+{
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ /* Notify any offload devices of listener close */
+ if (tp->t_state == TCPS_LISTEN)
+ tcp_offload_listen_close(tp);
+ in_pcbdrop(inp);
+ TCPSTAT_INC(tcps_closed);
+ KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
+ so = inp->inp_socket;
+ soisdisconnected(so);
+ if (inp->inp_flags & INP_SOCKREF) {
+ KASSERT(so->so_state & SS_PROTOREF,
+ ("tcp_close: !SS_PROTOREF"));
+ inp->inp_flags &= ~INP_SOCKREF;
+ INP_WUNLOCK(inp);
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_PROTOREF;
+ sofree(so);
+ return (NULL);
+ }
+ return (tp);
+}
+
+void
+tcp_drain(void)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ if (!do_tcpdrain)
+ return;
+
+ VNET_LIST_RLOCK_NOSLEEP();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ struct inpcb *inpb;
+ struct tcpcb *tcpb;
+
+ /*
+ * Walk the tcpbs, if existing, and flush the reassembly queue,
+ * if there is one...
+ * XXX: The "Net/3" implementation doesn't imply that the TCP
+ * reassembly queue should be flushed, but in a situation
+ * where we're really low on mbufs, this is potentially
+		 * useful.
+ */
+ INP_INFO_RLOCK(&V_tcbinfo);
+ LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
+ if (inpb->inp_flags & INP_TIMEWAIT)
+ continue;
+ INP_WLOCK(inpb);
+ if ((tcpb = intotcpcb(inpb)) != NULL) {
+ tcp_reass_flush(tcpb);
+ tcp_clean_sackreport(tcpb);
+ }
+ INP_WUNLOCK(inpb);
+ }
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK_NOSLEEP();
+}
+
+/*
+ * Notify a tcp user of an asynchronous error;
+ * store error as soft error, but wake up user
+ * (for now, won't do anything until can select for soft error).
+ *
+ * Do not wake up user since there currently is no mechanism for
+ * reporting soft errors (yet - a kqueue filter may be added).
+ */
+static struct inpcb *
+tcp_notify(struct inpcb *inp, int error)
+{
+ struct tcpcb *tp;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ if ((inp->inp_flags & INP_TIMEWAIT) ||
+ (inp->inp_flags & INP_DROPPED))
+ return (inp);
+
+ tp = intotcpcb(inp);
+ KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
+
+ /*
+ * Ignore some errors if we are hooked up.
+ * If connection hasn't completed, has retransmitted several times,
+ * and receives a second error, give up now. This is better
+ * than waiting a long time to establish a connection that
+ * can never complete.
+ */
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (error == EHOSTUNREACH || error == ENETUNREACH ||
+ error == EHOSTDOWN)) {
+ return (inp);
+ } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
+ tp->t_softerror) {
+ tp = tcp_drop(tp, error);
+ if (tp != NULL)
+ return (inp);
+ else
+ return (NULL);
+ } else {
+ tp->t_softerror = error;
+ return (inp);
+ }
+#if 0
+ wakeup( &so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+#endif
+}
+
+static int
+tcp_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, i, m, n, pcb_count;
+ struct inpcb *inp, **inp_list;
+ inp_gen_t gencnt;
+ struct xinpgen xig;
+
+ /*
+ * The process of preparing the TCB list is too time-consuming and
+ * resource-intensive to repeat twice on every request.
+ */
+ if (req->oldptr == NULL) {
+ n = V_tcbinfo.ipi_count + syncache_pcbcount();
+ n += imax(n / 8, 10);
+ req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
+ return (0);
+ }
+
+ if (req->newptr != NULL)
+ return (EPERM);
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ INP_INFO_RLOCK(&V_tcbinfo);
+ gencnt = V_tcbinfo.ipi_gencnt;
+ n = V_tcbinfo.ipi_count;
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+
+ m = syncache_pcbcount();
+
+ error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
+ + (n + m) * sizeof(struct xtcpcb));
+ if (error != 0)
+ return (error);
+
+ xig.xig_len = sizeof xig;
+ xig.xig_count = n + m;
+ xig.xig_gen = gencnt;
+ xig.xig_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ if (error)
+ return (error);
+
+ error = syncache_pcblist(req, m, &pcb_count);
+ if (error)
+ return (error);
+
+ inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
+ if (inp_list == NULL)
+ return (ENOMEM);
+
+ INP_INFO_RLOCK(&V_tcbinfo);
+ for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0;
+ inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) {
+ INP_WLOCK(inp);
+ if (inp->inp_gencnt <= gencnt) {
+ /*
+ * XXX: This use of cr_cansee(), introduced with
+ * TCP state changes, is not quite right, but for
+ * now, better than nothing.
+ */
+ if (inp->inp_flags & INP_TIMEWAIT) {
+ if (intotw(inp) != NULL)
+ error = cr_cansee(req->td->td_ucred,
+ intotw(inp)->tw_cred);
+ else
+ error = EINVAL; /* Skip this inp. */
+ } else
+ error = cr_canseeinpcb(req->td->td_ucred, inp);
+ if (error == 0) {
+ in_pcbref(inp);
+ inp_list[i++] = inp;
+ }
+ }
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ n = i;
+
+ error = 0;
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ INP_RLOCK(inp);
+ if (inp->inp_gencnt <= gencnt) {
+ struct xtcpcb xt;
+ void *inp_ppcb;
+
+ bzero(&xt, sizeof(xt));
+ xt.xt_len = sizeof xt;
+ /* XXX should avoid extra copy */
+ bcopy(inp, &xt.xt_inp, sizeof *inp);
+ inp_ppcb = inp->inp_ppcb;
+ if (inp_ppcb == NULL)
+ bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
+ else if (inp->inp_flags & INP_TIMEWAIT) {
+ bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
+ xt.xt_tp.t_state = TCPS_TIME_WAIT;
+ } else
+ bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
+ if (inp->inp_socket != NULL)
+ sotoxsocket(inp->inp_socket, &xt.xt_socket);
+ else {
+ bzero(&xt.xt_socket, sizeof xt.xt_socket);
+ xt.xt_socket.xso_protocol = IPPROTO_TCP;
+ }
+ xt.xt_inp.inp_gencnt = inp->inp_gencnt;
+ INP_RUNLOCK(inp);
+ error = SYSCTL_OUT(req, &xt, sizeof xt);
+ } else
+ INP_RUNLOCK(inp);
+ }
+ INP_INFO_WLOCK(&V_tcbinfo);
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ INP_WLOCK(inp);
+ if (!in_pcbrele(inp))
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state.
+ * If the generation differs from what we told
+ * her before, she knows that something happened
+ * while we were processing this request, and it
+ * might be necessary to retry.
+ */
+ INP_INFO_RLOCK(&V_tcbinfo);
+ xig.xig_gen = V_tcbinfo.ipi_gencnt;
+ xig.xig_sogen = so_gencnt;
+ xig.xig_count = V_tcbinfo.ipi_count + pcb_count;
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ }
+ free(inp_list, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
+ tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
+
+static int
+tcp_getcred(SYSCTL_HANDLER_ARGS)
+{
+ struct xucred xuc;
+ struct sockaddr_in addrs[2];
+ struct inpcb *inp;
+ int error;
+
+ error = priv_check(req->td, PRIV_NETINET_GETCRED);
+ if (error)
+ return (error);
+ error = SYSCTL_IN(req, addrs, sizeof(addrs));
+ if (error)
+ return (error);
+ INP_INFO_RLOCK(&V_tcbinfo);
+ inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr,
+ addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
+ if (inp != NULL) {
+ INP_RLOCK(inp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ if (inp->inp_socket == NULL)
+ error = ENOENT;
+ if (error == 0)
+ error = cr_canseeinpcb(req->td->td_ucred, inp);
+ if (error == 0)
+ cru2x(inp->inp_cred, &xuc);
+ INP_RUNLOCK(inp);
+ } else {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ error = ENOENT;
+ }
+ if (error == 0)
+ error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
+ CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
+ tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
+
+#ifdef INET6
+static int
+tcp6_getcred(SYSCTL_HANDLER_ARGS)
+{
+ struct xucred xuc;
+ struct sockaddr_in6 addrs[2];
+ struct inpcb *inp;
+ int error, mapped = 0;
+
+ error = priv_check(req->td, PRIV_NETINET_GETCRED);
+ if (error)
+ return (error);
+ error = SYSCTL_IN(req, addrs, sizeof(addrs));
+ if (error)
+ return (error);
+ if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
+ (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
+ return (error);
+ }
+ if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
+ if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
+ mapped = 1;
+ else
+ return (EINVAL);
+ }
+
+ INP_INFO_RLOCK(&V_tcbinfo);
+ if (mapped == 1)
+ inp = in_pcblookup_hash(&V_tcbinfo,
+ *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
+ addrs[1].sin6_port,
+ *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
+ addrs[0].sin6_port,
+ 0, NULL);
+ else
+ inp = in6_pcblookup_hash(&V_tcbinfo,
+ &addrs[1].sin6_addr, addrs[1].sin6_port,
+ &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL);
+ if (inp != NULL) {
+ INP_RLOCK(inp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ if (inp->inp_socket == NULL)
+ error = ENOENT;
+ if (error == 0)
+ error = cr_canseeinpcb(req->td->td_ucred, inp);
+ if (error == 0)
+ cru2x(inp->inp_cred, &xuc);
+ INP_RUNLOCK(inp);
+ } else {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ error = ENOENT;
+ }
+ if (error == 0)
+ error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
+ CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
+ tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
+#endif
+
+
+void
+tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
+{
+ struct ip *ip = vip;
+ struct tcphdr *th;
+ struct in_addr faddr;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
+ struct icmp *icp;
+ struct in_conninfo inc;
+ tcp_seq icmp_tcp_seq;
+ int mtu;
+
+ faddr = ((struct sockaddr_in *)sa)->sin_addr;
+ if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
+ return;
+
+ if (cmd == PRC_MSGSIZE)
+ notify = tcp_mtudisc;
+ else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
+ cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
+ notify = tcp_drop_syn_sent;
+ /*
+ * Redirects don't need to be handled up here.
+ */
+ else if (PRC_IS_REDIRECT(cmd))
+ return;
+ /*
+	 * Source quench is deprecated.
+ */
+ else if (cmd == PRC_QUENCH)
+ return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it makes an
+ * excellent DoS attack on machines with many connections.
+ */
+ else if (cmd == PRC_HOSTDEAD)
+ ip = NULL;
+ else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
+ return;
+ if (ip != NULL) {
+ icp = (struct icmp *)((caddr_t)ip
+ - offsetof(struct icmp, icmp_ip));
+ th = (struct tcphdr *)((caddr_t)ip
+ + (ip->ip_hl << 2));
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport,
+ ip->ip_src, th->th_sport, 0, NULL);
+ if (inp != NULL) {
+ INP_WLOCK(inp);
+ if (!(inp->inp_flags & INP_TIMEWAIT) &&
+ !(inp->inp_flags & INP_DROPPED) &&
+ !(inp->inp_socket == NULL)) {
+ icmp_tcp_seq = htonl(th->th_seq);
+ tp = intotcpcb(inp);
+ if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
+ SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
+ if (cmd == PRC_MSGSIZE) {
+ /*
+ * MTU discovery:
+ * If we got a needfrag set the MTU
+ * in the route to the suggested new
+ * value (if given) and then notify.
+ */
+ bzero(&inc, sizeof(inc));
+ inc.inc_faddr = faddr;
+ inc.inc_fibnum =
+ inp->inp_inc.inc_fibnum;
+
+ mtu = ntohs(icp->icmp_nextmtu);
+ /*
+ * If no alternative MTU was
+ * proposed, try the next smaller
+ * one. ip->ip_len has already
+ * been swapped in icmp_input().
+ */
+ if (!mtu)
+ mtu = ip_next_mtu(ip->ip_len,
+ 1);
+ if (mtu < V_tcp_minmss
+ + sizeof(struct tcpiphdr))
+ mtu = V_tcp_minmss
+ + sizeof(struct tcpiphdr);
+ /*
+					 * Only cache the MTU if it
+					 * is smaller than the interface
+					 * or route MTU. tcp_mtudisc()
+					 * will do the right thing by itself.
+ */
+ if (mtu <= tcp_maxmtu(&inc, NULL))
+ tcp_hc_updatemtu(&inc, mtu);
+ }
+
+ inp = (*notify)(inp, inetctlerrmap[cmd]);
+ }
+ }
+ if (inp != NULL)
+ INP_WUNLOCK(inp);
+ } else {
+ bzero(&inc, sizeof(inc));
+ inc.inc_fport = th->th_dport;
+ inc.inc_lport = th->th_sport;
+ inc.inc_faddr = faddr;
+ inc.inc_laddr = ip->ip_src;
+ syncache_unreach(&inc, th);
+ }
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ } else
+ in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
+}
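
A minimal userland sketch of the MTU selection policy described in the comments of
tcp_ctlinput() above: prefer the ICMP-suggested next-hop MTU, otherwise step down to
the next smaller plateau, clamp to a floor, and only remember values that actually
shrink the path. The plateau table and helper names here are illustrative stand-ins,
not the kernel's ip_next_mtu() or hostcache code.

    #include <stddef.h>

    /* Illustrative only: a few common MTU plateaus in RFC 1191 style. */
    static const int mtu_plateaus[] =
        { 65535, 32000, 17914, 8166, 4352, 1492, 1006, 508, 296, 68 };

    /* Pick the next plateau strictly below the packet size that triggered the ICMP. */
    static int
    next_smaller_mtu(int pktlen)
    {
    	size_t i;

    	for (i = 0; i < sizeof(mtu_plateaus) / sizeof(mtu_plateaus[0]); i++)
    		if (mtu_plateaus[i] < pktlen)
    			return (mtu_plateaus[i]);
    	return (68);	/* IPv4 minimum */
    }

    /*
     * Mirror of the policy above: use the advertised MTU if one was given,
     * otherwise step down one plateau; never go below the floor (minimum MSS
     * plus headers); only cache values smaller than what the path already allows.
     */
    static int
    choose_mtu(int icmp_nextmtu, int pktlen, int floor_mtu, int path_mtu, int *cache_it)
    {
    	int mtu = icmp_nextmtu ? icmp_nextmtu : next_smaller_mtu(pktlen);

    	if (mtu < floor_mtu)
    		mtu = floor_mtu;
    	*cache_it = (mtu <= path_mtu);
    	return (mtu);
    }
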
+
+#ifdef INET6
+void
+tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
+{
+ struct tcphdr th;
+ struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
+ struct ip6_hdr *ip6;
+ struct mbuf *m;
+ struct ip6ctlparam *ip6cp = NULL;
+ const struct sockaddr_in6 *sa6_src = NULL;
+ int off;
+ struct tcp_portonly {
+ u_int16_t th_sport;
+ u_int16_t th_dport;
+ } *thp;
+
+ if (sa->sa_family != AF_INET6 ||
+ sa->sa_len != sizeof(struct sockaddr_in6))
+ return;
+
+ if (cmd == PRC_MSGSIZE)
+ notify = tcp_mtudisc;
+ else if (!PRC_IS_REDIRECT(cmd) &&
+ ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
+ return;
+	/* Source quench is deprecated. */
+ else if (cmd == PRC_QUENCH)
+ return;
+
+ /* if the parameter is from icmp6, decode it. */
+ if (d != NULL) {
+ ip6cp = (struct ip6ctlparam *)d;
+ m = ip6cp->ip6c_m;
+ ip6 = ip6cp->ip6c_ip6;
+ off = ip6cp->ip6c_off;
+ sa6_src = ip6cp->ip6c_src;
+ } else {
+ m = NULL;
+ ip6 = NULL;
+ off = 0; /* fool gcc */
+ sa6_src = &sa6_any;
+ }
+
+ if (ip6 != NULL) {
+ struct in_conninfo inc;
+ /*
+		 * XXX: We assume that when IPV6 is non-NULL,
+ * M and OFF are valid.
+ */
+
+ /* check if we can safely examine src and dst ports */
+ if (m->m_pkthdr.len < off + sizeof(*thp))
+ return;
+
+ bzero(&th, sizeof(th));
+ m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
+
+ in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
+ (struct sockaddr *)ip6cp->ip6c_src,
+ th.th_sport, cmd, NULL, notify);
+
+ bzero(&inc, sizeof(inc));
+ inc.inc_fport = th.th_dport;
+ inc.inc_lport = th.th_sport;
+ inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
+ inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
+ inc.inc_flags |= INC_ISIPV6;
+ INP_INFO_WLOCK(&V_tcbinfo);
+ syncache_unreach(&inc, &th);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ } else
+ in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
+ 0, cmd, NULL, notify);
+}
+#endif /* INET6 */
+
+
+/*
+ * Following is where TCP initial sequence number generation occurs.
+ *
+ * There are two places where we must use initial sequence numbers:
+ * 1. In SYN-ACK packets.
+ * 2. In SYN packets.
+ *
+ * All ISNs for SYN-ACK packets are generated by the syncache. See
+ * tcp_syncache.c for details.
+ *
+ * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
+ * depends on this property. In addition, these ISNs should be
+ * unguessable so as to prevent connection hijacking. To satisfy
+ * the requirements of this situation, the algorithm outlined in
+ * RFC 1948 is used, with only small modifications.
+ *
+ * Implementation details:
+ *
+ * Time is based off the system timer, and is corrected so that it
+ * increases by one megabyte per second. This allows for proper
+ * recycling on high speed LANs while still leaving over an hour
+ * before rollover.
+ *
+ * As reading the *exact* system time is too expensive to be done
+ * whenever setting up a TCP connection, we increment the time
+ * offset in two ways. First, a small random positive increment
+ * is added to isn_offset for each connection that is set up.
+ * Second, the function tcp_isn_tick fires once per clock tick
+ * and increments isn_offset as necessary so that sequence numbers
+ * are incremented at approximately ISN_BYTES_PER_SECOND. The
+ * random positive increments serve only to ensure that the same
+ * exact sequence number is never sent out twice (as could otherwise
+ * happen when a port is recycled in less than the system tick
+ * interval.)
+ *
+ * net.inet.tcp.isn_reseed_interval controls the number of seconds
+ * between seeding of isn_secret. This is normally set to zero,
+ * as reseeding should not be necessary.
+ *
+ * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
+ * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In
+ * general, this means holding an exclusive (write) lock.
+ */
+
+#define ISN_BYTES_PER_SECOND 1048576
+#define ISN_STATIC_INCREMENT 4096
+#define ISN_RANDOM_INCREMENT (4096 - 1)
+
+static VNET_DEFINE(u_char, isn_secret[32]);
+static VNET_DEFINE(int, isn_last_reseed);
+static VNET_DEFINE(u_int32_t, isn_offset);
+static VNET_DEFINE(u_int32_t, isn_offset_old);
+
+#define V_isn_secret VNET(isn_secret)
+#define V_isn_last_reseed VNET(isn_last_reseed)
+#define V_isn_offset VNET(isn_offset)
+#define V_isn_offset_old VNET(isn_offset_old)
+
+tcp_seq
+tcp_new_isn(struct tcpcb *tp)
+{
+ MD5_CTX isn_ctx;
+ u_int32_t md5_buffer[4];
+ tcp_seq new_isn;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ ISN_LOCK();
+ /* Seed if this is the first use, reseed if requested. */
+ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) &&
+ (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz)
+ < (u_int)ticks))) {
+ read_random(&V_isn_secret, sizeof(V_isn_secret));
+ V_isn_last_reseed = ticks;
+ }
+
+ /* Compute the md5 hash and return the ISN. */
+ MD5Init(&isn_ctx);
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
+#ifdef INET6
+ if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
+ sizeof(struct in6_addr));
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
+ sizeof(struct in6_addr));
+ } else
+#endif
+ {
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
+ sizeof(struct in_addr));
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
+ sizeof(struct in_addr));
+ }
+ MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret));
+ MD5Final((u_char *) &md5_buffer, &isn_ctx);
+ new_isn = (tcp_seq) md5_buffer[0];
+ V_isn_offset += ISN_STATIC_INCREMENT +
+ (arc4random() & ISN_RANDOM_INCREMENT);
+ new_isn += V_isn_offset;
+ ISN_UNLOCK();
+ return (new_isn);
+}
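
The RFC 1948 scheme used by tcp_new_isn() above boils down to hashing the connection
four-tuple together with a periodically reseeded secret, then adding a monotonically
advancing offset. The following standalone sketch shows that structure; the md5()
helper is a hypothetical stand-in for the kernel's MD5Init/MD5Update/MD5Final, and the
field packing is illustrative only.

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical helper assumed for this sketch: MD5 over a flat buffer. */
    extern void md5(const void *data, size_t len, uint8_t digest[16]);

    struct isn_state {
    	uint8_t  secret[32];	/* periodically reseeded */
    	uint32_t offset;	/* advanced ~1 MB/sec plus small random steps */
    };

    /* ISN = hash(local addr/port, foreign addr/port, secret) + offset. */
    static uint32_t
    new_isn(struct isn_state *st, uint32_t laddr, uint16_t lport,
        uint32_t faddr, uint16_t fport, uint32_t random_step)
    {
    	uint8_t buf[4 + 2 + 4 + 2 + sizeof(st->secret)], digest[16];
    	uint32_t isn;

    	memcpy(buf, &laddr, 4);
    	memcpy(buf + 4, &lport, 2);
    	memcpy(buf + 6, &faddr, 4);
    	memcpy(buf + 10, &fport, 2);
    	memcpy(buf + 12, st->secret, sizeof(st->secret));
    	md5(buf, sizeof(buf), digest);
    	memcpy(&isn, digest, 4);

    	/* A static plus bounded-random increment keeps the sequence monotonic. */
    	st->offset += 4096 + (random_step & 4095);
    	return (isn + st->offset);
    }
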
+
+/*
+ * Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary
+ * to keep time flowing at a relatively constant rate. If the random
+ * increments have already pushed us past the projected offset, do nothing.
+ */
+static void
+tcp_isn_tick(void *xtp)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+ u_int32_t projected_offset;
+
+ VNET_LIST_RLOCK_NOSLEEP();
+ ISN_LOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */
+ projected_offset =
+ V_isn_offset_old + ISN_BYTES_PER_SECOND / 100;
+
+ if (SEQ_GT(projected_offset, V_isn_offset))
+ V_isn_offset = projected_offset;
+
+ V_isn_offset_old = V_isn_offset;
+ CURVNET_RESTORE();
+ }
+ ISN_UNLOCK();
+ VNET_LIST_RUNLOCK_NOSLEEP();
+ callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL);
+}
+
+/*
+ * When a specific ICMP unreachable message is received and the
+ * connection state is SYN-SENT, drop the connection. This behavior
+ * is controlled by the icmp_may_rst sysctl.
+ */
+struct inpcb *
+tcp_drop_syn_sent(struct inpcb *inp, int errno)
+{
+ struct tcpcb *tp;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ if ((inp->inp_flags & INP_TIMEWAIT) ||
+ (inp->inp_flags & INP_DROPPED))
+ return (inp);
+
+ tp = intotcpcb(inp);
+ if (tp->t_state != TCPS_SYN_SENT)
+ return (inp);
+
+ tp = tcp_drop(tp, errno);
+ if (tp != NULL)
+ return (inp);
+ else
+ return (NULL);
+}
+
+/*
+ * When `need fragmentation' ICMP is received, update our idea of the MSS
+ * based on the new value in the route. Also nudge TCP to send something,
+ * since we know the packet we just sent was dropped.
+ * This duplicates some code in the tcp_mss() function in tcp_input.c.
+ */
+struct inpcb *
+tcp_mtudisc(struct inpcb *inp, int errno)
+{
+ struct tcpcb *tp;
+ struct socket *so;
+
+ INP_WLOCK_ASSERT(inp);
+ if ((inp->inp_flags & INP_TIMEWAIT) ||
+ (inp->inp_flags & INP_DROPPED))
+ return (inp);
+
+ tp = intotcpcb(inp);
+ KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
+
+ tcp_mss_update(tp, -1, NULL, NULL);
+
+ so = inp->inp_socket;
+ SOCKBUF_LOCK(&so->so_snd);
+ /* If the mss is larger than the socket buffer, decrease the mss. */
+ if (so->so_snd.sb_hiwat < tp->t_maxseg)
+ tp->t_maxseg = so->so_snd.sb_hiwat;
+ SOCKBUF_UNLOCK(&so->so_snd);
+
+ TCPSTAT_INC(tcps_mturesent);
+ tp->t_rtttime = 0;
+ tp->snd_nxt = tp->snd_una;
+ tcp_free_sackholes(tp);
+ tp->snd_recover = tp->snd_max;
+ if (tp->t_flags & TF_SACK_PERMIT)
+ EXIT_FASTRECOVERY(tp);
+ tcp_output_send(tp);
+ return (inp);
+}
+
+/*
+ * Look-up the routing entry to the peer of this inpcb. If no route
+ * is found and it cannot be allocated, then return 0. This routine
+ * is called by TCP routines that access the rmx structure and by
+ * tcp_mss_update to get the peer/interface MTU.
+ */
+u_long
+tcp_maxmtu(struct in_conninfo *inc, int *flags)
+{
+ struct route sro;
+ struct sockaddr_in *dst;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
+
+ bzero(&sro, sizeof(sro));
+ if (inc->inc_faddr.s_addr != INADDR_ANY) {
+ dst = (struct sockaddr_in *)&sro.ro_dst;
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = inc->inc_faddr;
+ in_rtalloc_ign(&sro, 0, inc->inc_fibnum);
+ }
+ if (sro.ro_rt != NULL) {
+ ifp = sro.ro_rt->rt_ifp;
+ if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = ifp->if_mtu;
+ else
+ maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
+
+ /* Report additional interface capabilities. */
+ if (flags != NULL) {
+ if (ifp->if_capenable & IFCAP_TSO4 &&
+ ifp->if_hwassist & CSUM_TSO)
+ *flags |= CSUM_TSO;
+ }
+ RTFREE(sro.ro_rt);
+ }
+ return (maxmtu);
+}
+
+#ifdef INET6
+u_long
+tcp_maxmtu6(struct in_conninfo *inc, int *flags)
+{
+ struct route_in6 sro6;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
+
+ bzero(&sro6, sizeof(sro6));
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
+ sro6.ro_dst.sin6_family = AF_INET6;
+ sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
+ sro6.ro_dst.sin6_addr = inc->inc6_faddr;
+ rtalloc_ign((struct route *)&sro6, 0);
+ }
+ if (sro6.ro_rt != NULL) {
+ ifp = sro6.ro_rt->rt_ifp;
+ if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
+ else
+ maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
+ IN6_LINKMTU(sro6.ro_rt->rt_ifp));
+
+ /* Report additional interface capabilities. */
+ if (flags != NULL) {
+ if (ifp->if_capenable & IFCAP_TSO6 &&
+ ifp->if_hwassist & CSUM_TSO)
+ *flags |= CSUM_TSO;
+ }
+ RTFREE(sro6.ro_rt);
+ }
+
+ return (maxmtu);
+}
+#endif /* INET6 */
+
+#ifdef IPSEC
+/* compute ESP/AH header size for TCP, including outer IP header. */
+size_t
+ipsec_hdrsiz_tcp(struct tcpcb *tp)
+{
+ struct inpcb *inp;
+ struct mbuf *m;
+ size_t hdrsiz;
+ struct ip *ip;
+#ifdef INET6
+ struct ip6_hdr *ip6;
+#endif
+ struct tcphdr *th;
+
+ if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
+ return (0);
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (!m)
+ return (0);
+
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV6) != 0) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ th = (struct tcphdr *)(ip6 + 1);
+ m->m_pkthdr.len = m->m_len =
+ sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ tcpip_fillheaders(inp, ip6, th);
+ hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
+ } else
+#endif /* INET6 */
+ {
+ ip = mtod(m, struct ip *);
+ th = (struct tcphdr *)(ip + 1);
+ m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
+ tcpip_fillheaders(inp, ip, th);
+ hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
+ }
+
+ m_free(m);
+ return (hdrsiz);
+}
+#endif /* IPSEC */
+
+/*
+ * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
+ *
+ * This code attempts to calculate the bandwidth-delay product as a
+ * means of determining the optimal window size to maximize bandwidth,
+ * minimize RTT, and avoid the over-allocation of buffers on interfaces and
+ * routers. This code also does a fairly good job keeping RTTs in check
+ * across slow links like modems. We implement an algorithm which is very
+ * similar to (but not meant to be) TCP/Vegas. The code operates on the
+ * transmitter side of a TCP connection and so only affects the transmit
+ * side of the connection.
+ *
+ * BACKGROUND: TCP makes no provision for the management of buffer space
+ * at the end points or at the intermediate routers and switches. A TCP
+ * stream, whether using NewReno or not, will eventually buffer as
+ * many packets as it is able and the only reason this typically works is
+ * due to the fairly small default buffers made available for a connection
+ * (typically 16K or 32K). As machines use larger windows and/or window
+ * scaling it is now fairly easy for even a single TCP connection to blow-out
+ * all available buffer space not only on the local interface, but on
+ * intermediate routers and switches as well. NewReno makes a misguided
+ * attempt to 'solve' this problem by waiting for an actual failure to occur,
+ * then backing off, then steadily increasing the window again until another
+ * failure occurs, ad-infinitum. This results in terrible oscillation that
+ * is only made worse as network loads increase and the idea of intentionally
+ * blowing out network buffers is, frankly, a terrible way to manage network
+ * resources.
+ *
+ * It is far better to limit the transmit window prior to the failure
+ * condition being achieved. There are two general ways to do this: First
+ * you can 'scan' through different transmit window sizes and locate the
+ * point where the RTT stops increasing, indicating that you have filled the
+ * pipe, then scan backwards until you note that RTT stops decreasing, then
+ * repeat ad-infinitum. This method works in principle but has severe
+ * implementation issues due to RTT variances, timer granularity, and
+ * instability in the algorithm which can lead to many false positives and
+ * create oscillations as well as interact badly with other TCP streams
+ * implementing the same algorithm.
+ *
+ * The second method is to limit the window to the bandwidth delay product
+ * of the link. This is the method we implement. RTT variances and our
+ * own manipulation of the congestion window, bwnd, can potentially
+ * destabilize the algorithm. For this reason we have to stabilize the
+ * elements used to calculate the window. We do this by using the minimum
+ * observed RTT, the long term average of the observed bandwidth, and
+ * by adding two segments worth of slop. It isn't perfect but it is able
+ * to react to changing conditions and gives us a very stable basis on
+ * which to extend the algorithm.
+ */
+void
+tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
+{
+ u_long bw;
+ u_long bwnd;
+ int save_ticks;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * If inflight_enable is disabled in the middle of a tcp connection,
+ * make sure snd_bwnd is effectively disabled.
+ */
+ if (V_tcp_inflight_enable == 0 ||
+ tp->t_rttlow < V_tcp_inflight_rttthresh) {
+ tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->snd_bandwidth = 0;
+ return;
+ }
+
+ /*
+ * Figure out the bandwidth. Due to the tick granularity this
+ * is a very rough number and it MUST be averaged over a fairly
+ * long period of time. XXX we need to take into account a link
+ * that is not using all available bandwidth, but for now our
+ * slop will ramp us up if this case occurs and the bandwidth later
+ * increases.
+ *
+	 * Note: if ticks rolls over, 'bw' may wind up negative. We must
+ * effectively reset t_bw_rtttime for this case.
+ */
+ save_ticks = ticks;
+ if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
+ return;
+
+ bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
+ (save_ticks - tp->t_bw_rtttime);
+ tp->t_bw_rtttime = save_ticks;
+ tp->t_bw_rtseq = ack_seq;
+ if (tp->t_bw_rtttime == 0 || (int)bw < 0)
+ return;
+ bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
+
+ tp->snd_bandwidth = bw;
+
+ /*
+ * Calculate the semi-static bandwidth delay product, plus two maximal
+ * segments. The additional slop puts us squarely in the sweet
+ * spot and also handles the bandwidth run-up case and stabilization.
+ * Without the slop we could be locking ourselves into a lower
+ * bandwidth.
+ *
+ * Situations Handled:
+ * (1) Prevents over-queueing of packets on LANs, especially on
+ * high speed LANs, allowing larger TCP buffers to be
+ * specified, and also does a good job preventing
+ * over-queueing of packets over choke points like modems
+ * (at least for the transmit side).
+ *
+ * (2) Is able to handle changing network loads (bandwidth
+ * drops so bwnd drops, bandwidth increases so bwnd
+ * increases).
+ *
+ * (3) Theoretically should stabilize in the face of multiple
+ * connections implementing the same algorithm (this may need
+ * a little work).
+ *
+ * (4) Stability value (defaults to 20 = 2 maximal packets) can
+	 *	be adjusted with a sysctl but typically only needs to be
+	 *	changed on very slow connections. A value no smaller than 5
+ * should be used, but only reduce this default if you have
+ * no other choice.
+ */
+#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2)
+ bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10;
+#undef USERTT
+
+ if (tcp_inflight_debug > 0) {
+ static int ltime;
+ if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
+ ltime = ticks;
+ printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
+ tp,
+ bw,
+ tp->t_rttbest,
+ tp->t_srtt,
+ bwnd
+ );
+ }
+ }
+ if ((long)bwnd < V_tcp_inflight_min)
+ bwnd = V_tcp_inflight_min;
+ if (bwnd > V_tcp_inflight_max)
+ bwnd = V_tcp_inflight_max;
+ if ((long)bwnd < tp->t_maxseg * 2)
+ bwnd = tp->t_maxseg * 2;
+ tp->snd_bwnd = bwnd;
+}
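
The arithmetic behind the inflight limiter above is compact: estimate bandwidth from
bytes acknowledged per elapsed tick, smooth it with a 15/16 exponentially weighted
average, and set the window to bandwidth times RTT plus a couple of segments of slop.
A simplified sketch follows; all names are illustrative, and srtt is taken directly in
ticks rather than in the kernel's ticks << TCP_RTT_SHIFT units.

    #include <stdint.h>

    /* Returns the bandwidth-delay-product window in bytes, or 0 if it is too
     * soon to form an estimate. stab_tenths defaults to 20 (= 2 segments). */
    static unsigned long
    inflight_window(uint64_t *avg_bw, uint32_t acked_bytes, int elapsed_ticks,
        int hz, int srtt_ticks, unsigned int maxseg, unsigned int stab_tenths)
    {
    	uint64_t bw, bwnd;

    	if (elapsed_ticks < 1)
    		return (0);

    	bw = (uint64_t)acked_bytes * hz / elapsed_ticks;	/* bytes/sec */
    	*avg_bw = (*avg_bw * 15 + bw) >> 4;			/* long-term average */

    	/* Bandwidth-delay product plus slop, floored at two segments. */
    	bwnd = *avg_bw * srtt_ticks / hz + (uint64_t)stab_tenths * maxseg / 10;
    	if (bwnd < 2 * maxseg)
    		bwnd = 2 * maxseg;
    	return ((unsigned long)bwnd);
    }
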
+
+#ifdef TCP_SIGNATURE
+/*
+ * Callback function invoked by m_apply() to digest TCP segment data
+ * contained within an mbuf chain.
+ */
+static int
+tcp_signature_apply(void *fstate, void *data, u_int len)
+{
+
+ MD5Update(fstate, (u_char *)data, len);
+ return (0);
+}
+
+/*
+ * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
+ *
+ * Parameters:
+ * m pointer to head of mbuf chain
+ * _unused
+ * len length of TCP segment data, excluding options
+ * optlen length of TCP segment options
+ * buf pointer to storage for computed MD5 digest
+ * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
+ *
+ * We do this over ip, tcphdr, segment data, and the key in the SADB.
+ * When called from tcp_input(), we can be sure that th_sum has been
+ * zeroed out and verified already.
+ *
+ * Return 0 if successful, otherwise return -1.
+ *
+ * XXX The key is retrieved from the system's PF_KEY SADB, by keying a
+ * search with the destination IP address, and a 'magic SPI' to be
+ * determined by the application. This is hardcoded elsewhere to 1179
+ * right now. Another branch of this code exists which uses the SPD to
+ * specify per-application flows but it is unstable.
+ */
+int
+tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
+ u_char *buf, u_int direction)
+{
+ union sockaddr_union dst;
+ struct ippseudo ippseudo;
+ MD5_CTX ctx;
+ int doff;
+ struct ip *ip;
+ struct ipovly *ipovly;
+ struct secasvar *sav;
+ struct tcphdr *th;
+#ifdef INET6
+ struct ip6_hdr *ip6;
+ struct in6_addr in6;
+ char ip6buf[INET6_ADDRSTRLEN];
+ uint32_t plen;
+ uint16_t nhdr;
+#endif
+ u_short savecsum;
+
+ KASSERT(m != NULL, ("NULL mbuf chain"));
+ KASSERT(buf != NULL, ("NULL signature pointer"));
+
+ /* Extract the destination from the IP header in the mbuf. */
+ bzero(&dst, sizeof(union sockaddr_union));
+ ip = mtod(m, struct ip *);
+#ifdef INET6
+ ip6 = NULL; /* Make the compiler happy. */
+#endif
+ switch (ip->ip_v) {
+ case IPVERSION:
+ dst.sa.sa_len = sizeof(struct sockaddr_in);
+ dst.sa.sa_family = AF_INET;
+ dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ?
+ ip->ip_src : ip->ip_dst;
+ break;
+#ifdef INET6
+ case (IPV6_VERSION >> 4):
+ ip6 = mtod(m, struct ip6_hdr *);
+ dst.sa.sa_len = sizeof(struct sockaddr_in6);
+ dst.sa.sa_family = AF_INET6;
+ dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ?
+ ip6->ip6_src : ip6->ip6_dst;
+ break;
+#endif
+ default:
+ return (EINVAL);
+ /* NOTREACHED */
+ break;
+ }
+
+ /* Look up an SADB entry which matches the address of the peer. */
+ sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
+ if (sav == NULL) {
+ ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__,
+ (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) :
+#ifdef INET6
+ (ip->ip_v == (IPV6_VERSION >> 4)) ?
+ ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) :
+#endif
+ "(unsupported)"));
+ return (EINVAL);
+ }
+
+ MD5Init(&ctx);
+ /*
+ * Step 1: Update MD5 hash with IP(v6) pseudo-header.
+ *
+ * XXX The ippseudo header MUST be digested in network byte order,
+ * or else we'll fail the regression test. Assume all fields we've
+ * been doing arithmetic on have been in host byte order.
+ * XXX One cannot depend on ipovly->ih_len here. When called from
+ * tcp_output(), the underlying ip_len member has not yet been set.
+ */
+ switch (ip->ip_v) {
+ case IPVERSION:
+ ipovly = (struct ipovly *)ip;
+ ippseudo.ippseudo_src = ipovly->ih_src;
+ ippseudo.ippseudo_dst = ipovly->ih_dst;
+ ippseudo.ippseudo_pad = 0;
+ ippseudo.ippseudo_p = IPPROTO_TCP;
+ ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) +
+ optlen);
+ MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo));
+
+ th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip));
+ doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen;
+ break;
+#ifdef INET6
+ /*
+ * RFC 2385, 2.0 Proposal
+ * For IPv6, the pseudo-header is as described in RFC 2460, namely the
+ * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero-
+ * extended next header value (to form 32 bits), and 32-bit segment
+ * length.
+ * Note: Upper-Layer Packet Length comes before Next Header.
+ */
+ case (IPV6_VERSION >> 4):
+ in6 = ip6->ip6_src;
+ in6_clearscope(&in6);
+ MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
+ in6 = ip6->ip6_dst;
+ in6_clearscope(&in6);
+ MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
+ plen = htonl(len + sizeof(struct tcphdr) + optlen);
+ MD5Update(&ctx, (char *)&plen, sizeof(uint32_t));
+ nhdr = 0;
+ MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
+ MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
+ MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
+ nhdr = IPPROTO_TCP;
+ MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
+
+ th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr));
+ doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen;
+ break;
+#endif
+ default:
+ return (EINVAL);
+ /* NOTREACHED */
+ break;
+ }
+
+
+ /*
+ * Step 2: Update MD5 hash with TCP header, excluding options.
+ * The TCP checksum must be set to zero.
+ */
+ savecsum = th->th_sum;
+ th->th_sum = 0;
+ MD5Update(&ctx, (char *)th, sizeof(struct tcphdr));
+ th->th_sum = savecsum;
+
+ /*
+ * Step 3: Update MD5 hash with TCP segment data.
+ * Use m_apply() to avoid an early m_pullup().
+ */
+ if (len > 0)
+ m_apply(m, doff, len, tcp_signature_apply, &ctx);
+
+ /*
+ * Step 4: Update MD5 hash with shared secret.
+ */
+ MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth));
+ MD5Final(buf, &ctx);
+
+ key_sa_recordxfer(sav, m);
+ KEY_FREESAV(&sav);
+ return (0);
+}
+#endif /* TCP_SIGNATURE */
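
The RFC 2385 digest computed above covers, in order: the IP pseudo-header, the fixed
TCP header with its checksum zeroed (options are excluded), the segment payload, and
finally the shared key from the SADB. The sketch below lays out that order for the
IPv4 case; struct md5_stream and md5_stream_update() are hypothetical stand-ins for
the kernel's streaming MD5 interface.

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical streaming hash API standing in for the kernel MD5 routines. */
    struct md5_stream;
    extern void md5_stream_update(struct md5_stream *, const void *, size_t);

    /* IPv4 pseudo-header as digested by RFC 2385, network byte order throughout. */
    struct tcpmd5_pseudo {
    	uint32_t src;
    	uint32_t dst;
    	uint8_t  pad;		/* always 0 */
    	uint8_t  proto;		/* IPPROTO_TCP == 6 */
    	uint16_t tcplen;	/* TCP header + options + payload, htons() */
    };

    static void
    tcpmd5_digest(struct md5_stream *md, const struct tcpmd5_pseudo *ph,
        const uint8_t tcphdr[20], const uint8_t *payload, size_t plen,
        const uint8_t *key, size_t keylen)
    {
    	uint8_t th[20];

    	memcpy(th, tcphdr, 20);	/* fixed header only; options are not digested */
    	th[16] = th[17] = 0;	/* checksum field must be zero while hashing */

    	md5_stream_update(md, ph, sizeof(*ph));
    	md5_stream_update(md, th, sizeof(th));
    	md5_stream_update(md, payload, plen);
    	md5_stream_update(md, key, keylen);
    }
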
+
+static int
+sysctl_drop(SYSCTL_HANDLER_ARGS)
+{
+ /* addrs[0] is a foreign socket, addrs[1] is a local one. */
+ struct sockaddr_storage addrs[2];
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct tcptw *tw;
+ struct sockaddr_in *fin, *lin;
+#ifdef INET6
+ struct sockaddr_in6 *fin6, *lin6;
+#endif
+ int error;
+
+ inp = NULL;
+ fin = lin = NULL;
+#ifdef INET6
+ fin6 = lin6 = NULL;
+#endif
+ error = 0;
+
+ if (req->oldptr != NULL || req->oldlen != 0)
+ return (EINVAL);
+ if (req->newptr == NULL)
+ return (EPERM);
+ if (req->newlen < sizeof(addrs))
+ return (ENOMEM);
+ error = SYSCTL_IN(req, &addrs, sizeof(addrs));
+ if (error)
+ return (error);
+
+ switch (addrs[0].ss_family) {
+#ifdef INET6
+ case AF_INET6:
+ fin6 = (struct sockaddr_in6 *)&addrs[0];
+ lin6 = (struct sockaddr_in6 *)&addrs[1];
+ if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
+ lin6->sin6_len != sizeof(struct sockaddr_in6))
+ return (EINVAL);
+ if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
+ if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
+ return (EINVAL);
+ in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
+ in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
+ fin = (struct sockaddr_in *)&addrs[0];
+ lin = (struct sockaddr_in *)&addrs[1];
+ break;
+ }
+ error = sa6_embedscope(fin6, V_ip6_use_defzone);
+ if (error)
+ return (error);
+ error = sa6_embedscope(lin6, V_ip6_use_defzone);
+ if (error)
+ return (error);
+ break;
+#endif
+ case AF_INET:
+ fin = (struct sockaddr_in *)&addrs[0];
+ lin = (struct sockaddr_in *)&addrs[1];
+ if (fin->sin_len != sizeof(struct sockaddr_in) ||
+ lin->sin_len != sizeof(struct sockaddr_in))
+ return (EINVAL);
+ break;
+ default:
+ return (EINVAL);
+ }
+ INP_INFO_WLOCK(&V_tcbinfo);
+ switch (addrs[0].ss_family) {
+#ifdef INET6
+ case AF_INET6:
+ inp = in6_pcblookup_hash(&V_tcbinfo, &fin6->sin6_addr,
+ fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 0,
+ NULL);
+ break;
+#endif
+ case AF_INET:
+ inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr,
+ fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL);
+ break;
+ }
+ if (inp != NULL) {
+ INP_WLOCK(inp);
+ if (inp->inp_flags & INP_TIMEWAIT) {
+ /*
+ * XXXRW: There currently exists a state where an
+ * inpcb is present, but its timewait state has been
+ * discarded. For now, don't allow dropping of this
+ * type of inpcb.
+ */
+ tw = intotw(inp);
+ if (tw != NULL)
+ tcp_twclose(tw, 0);
+ else
+ INP_WUNLOCK(inp);
+ } else if (!(inp->inp_flags & INP_DROPPED) &&
+ !(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
+ tp = intotcpcb(inp);
+ tp = tcp_drop(tp, ECONNABORTED);
+ if (tp != NULL)
+ INP_WUNLOCK(inp);
+ } else
+ INP_WUNLOCK(inp);
+ } else
+ error = ESRCH;
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
+ CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
+ 0, sysctl_drop, "", "Drop TCP connection");
+
+/*
+ * Generate a standardized TCP log line for use throughout the
+ * tcp subsystem. Memory allocation is done with M_NOWAIT to
+ * allow use in the interrupt context.
+ *
+ * NB: The caller MUST free(s, M_TCPLOG) the returned string.
+ * NB: The function may return NULL if memory allocation failed.
+ *
+ * Due to header inclusion and ordering limitations the struct ip
+ * and ip6_hdr pointers have to be passed as void pointers.
+ */
+char *
+tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
+ const void *ip6hdr)
+{
+
+ /* Is logging enabled? */
+ if (tcp_log_in_vain == 0)
+ return (NULL);
+
+ return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
+}
+
+char *
+tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
+ const void *ip6hdr)
+{
+
+ /* Is logging enabled? */
+ if (tcp_log_debug == 0)
+ return (NULL);
+
+ return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
+}
+
+static char *
+tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
+ const void *ip6hdr)
+{
+ char *s, *sp;
+ size_t size;
+ struct ip *ip;
+#ifdef INET6
+ const struct ip6_hdr *ip6;
+
+ ip6 = (const struct ip6_hdr *)ip6hdr;
+#endif /* INET6 */
+ ip = (struct ip *)ip4hdr;
+
+ /*
+ * The log line looks like this:
+ * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>"
+ */
+ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") +
+ sizeof(PRINT_TH_FLAGS) + 1 +
+#ifdef INET6
+ 2 * INET6_ADDRSTRLEN;
+#else
+ 2 * INET_ADDRSTRLEN;
+#endif /* INET6 */
+
+ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT);
+ if (s == NULL)
+ return (NULL);
+
+ strcat(s, "TCP: [");
+ sp = s + strlen(s);
+
+ if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) {
+ inet_ntoa_r(inc->inc_faddr, sp);
+ sp = s + strlen(s);
+ sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
+ sp = s + strlen(s);
+ inet_ntoa_r(inc->inc_laddr, sp);
+ sp = s + strlen(s);
+ sprintf(sp, "]:%i", ntohs(inc->inc_lport));
+#ifdef INET6
+ } else if (inc) {
+ ip6_sprintf(sp, &inc->inc6_faddr);
+ sp = s + strlen(s);
+ sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
+ sp = s + strlen(s);
+ ip6_sprintf(sp, &inc->inc6_laddr);
+ sp = s + strlen(s);
+ sprintf(sp, "]:%i", ntohs(inc->inc_lport));
+ } else if (ip6 && th) {
+ ip6_sprintf(sp, &ip6->ip6_src);
+ sp = s + strlen(s);
+ sprintf(sp, "]:%i to [", ntohs(th->th_sport));
+ sp = s + strlen(s);
+ ip6_sprintf(sp, &ip6->ip6_dst);
+ sp = s + strlen(s);
+ sprintf(sp, "]:%i", ntohs(th->th_dport));
+#endif /* INET6 */
+ } else if (ip && th) {
+ inet_ntoa_r(ip->ip_src, sp);
+ sp = s + strlen(s);
+ sprintf(sp, "]:%i to [", ntohs(th->th_sport));
+ sp = s + strlen(s);
+ inet_ntoa_r(ip->ip_dst, sp);
+ sp = s + strlen(s);
+ sprintf(sp, "]:%i", ntohs(th->th_dport));
+ } else {
+ free(s, M_TCPLOG);
+ return (NULL);
+ }
+ sp = s + strlen(s);
+ if (th)
+ sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS);
+ if (*(s + size - 1) != '\0')
+ panic("%s: string too long", __func__);
+ return (s);
+}
diff --git a/freebsd/sys/netinet/tcp_syncache.c b/freebsd/sys/netinet/tcp_syncache.c
new file mode 100644
index 00000000..78790cc8
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_syncache.c
@@ -0,0 +1,1823 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2001 McAfee, Inc.
+ * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Jonathan Lemon
+ * and McAfee Research, the Security Research Division of McAfee, Inc. under
+ * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
+ * DARPA CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_ipsec.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/limits.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/mutex.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/md5.h>
+#include <freebsd/sys/proc.h> /* for proc0 declaration */
+#include <freebsd/sys/random.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/sys/ucred.h>
+
+#include <freebsd/vm/uma.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_options.h>
+#ifdef INET6
+#include <freebsd/netinet/ip6.h>
+#include <freebsd/netinet/icmp6.h>
+#include <freebsd/netinet6/nd6.h>
+#include <freebsd/netinet6/ip6_var.h>
+#include <freebsd/netinet6/in6_pcb.h>
+#endif
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_fsm.h>
+#include <freebsd/netinet/tcp_seq.h>
+#include <freebsd/netinet/tcp_timer.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/tcp_syncache.h>
+#include <freebsd/netinet/tcp_offload.h>
+#ifdef INET6
+#include <freebsd/netinet6/tcp6_var.h>
+#endif
+
+#ifdef IPSEC
+#include <freebsd/netipsec/ipsec.h>
+#ifdef INET6
+#include <freebsd/netipsec/ipsec6.h>
+#endif
+#include <freebsd/netipsec/key.h>
+#endif /*IPSEC*/
+
+#include <freebsd/machine/in_cksum.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+static VNET_DEFINE(int, tcp_syncookies) = 1;
+#define V_tcp_syncookies VNET(tcp_syncookies)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW,
+ &VNET_NAME(tcp_syncookies), 0,
+ "Use TCP SYN cookies if the syncache overflows");
+
+static VNET_DEFINE(int, tcp_syncookiesonly) = 0;
+#define V_tcp_syncookiesonly VNET(tcp_syncookiesonly)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW,
+ &VNET_NAME(tcp_syncookiesonly), 0,
+ "Use only TCP SYN cookies");
+
+#ifdef TCP_OFFLOAD_DISABLE
+#define TOEPCB_ISSET(sc) (0)
+#else
+#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL)
+#endif
+
+static void syncache_drop(struct syncache *, struct syncache_head *);
+static void syncache_free(struct syncache *);
+static void syncache_insert(struct syncache *, struct syncache_head *);
+struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
+static int syncache_respond(struct syncache *);
+static struct socket *syncache_socket(struct syncache *, struct socket *,
+ struct mbuf *m);
+static void syncache_timeout(struct syncache *sc, struct syncache_head *sch,
+ int docallout);
+static void syncache_timer(void *);
+static void syncookie_generate(struct syncache_head *, struct syncache *,
+ u_int32_t *);
+static struct syncache
+ *syncookie_lookup(struct in_conninfo *, struct syncache_head *,
+ struct syncache *, struct tcpopt *, struct tcphdr *,
+ struct socket *);
+
+/*
+ * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
+ * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds,
+ * the odds are that the user has given up attempting to connect by then.
+ */
+#define SYNCACHE_MAXREXMTS 3
+
+/* Arbitrary values */
+#define TCP_SYNCACHE_HASHSIZE 512
+#define TCP_SYNCACHE_BUCKETLIMIT 30
+
+static VNET_DEFINE(struct tcp_syncache, tcp_syncache);
+#define V_tcp_syncache VNET(tcp_syncache)
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache");
+
+SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
+ &VNET_NAME(tcp_syncache.bucket_limit), 0,
+ "Per-bucket hash limit for syncache");
+
+SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
+ &VNET_NAME(tcp_syncache.cache_limit), 0,
+ "Overall entry limit for syncache");
+
+SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD,
+ &VNET_NAME(tcp_syncache.cache_count), 0,
+ "Current number of entries in syncache");
+
+SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
+ &VNET_NAME(tcp_syncache.hashsize), 0,
+ "Size of TCP syncache hashtable");
+
+SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW,
+ &VNET_NAME(tcp_syncache.rexmt_limit), 0,
+ "Limit on SYN/ACK retransmissions");
+
+VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1;
+SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail,
+ CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0,
+ "Send reset on socket allocation failure");
+
+static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
+
+#define SYNCACHE_HASH(inc, mask) \
+ ((V_tcp_syncache.hash_secret ^ \
+ (inc)->inc_faddr.s_addr ^ \
+ ((inc)->inc_faddr.s_addr >> 16) ^ \
+ (inc)->inc_fport ^ (inc)->inc_lport) & mask)
+
+#define SYNCACHE_HASH6(inc, mask) \
+ ((V_tcp_syncache.hash_secret ^ \
+ (inc)->inc6_faddr.s6_addr32[0] ^ \
+ (inc)->inc6_faddr.s6_addr32[3] ^ \
+ (inc)->inc_fport ^ (inc)->inc_lport) & mask)
+
+#define ENDPTS_EQ(a, b) ( \
+ (a)->ie_fport == (b)->ie_fport && \
+ (a)->ie_lport == (b)->ie_lport && \
+ (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \
+ (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \
+)
+
+#define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0)
+
+#define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx)
+#define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx)
+#define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED)
+
+/*
+ * Requires the syncache entry to be already removed from the bucket list.
+ */
+static void
+syncache_free(struct syncache *sc)
+{
+
+ if (sc->sc_ipopts)
+ (void) m_free(sc->sc_ipopts);
+ if (sc->sc_cred)
+ crfree(sc->sc_cred);
+#ifdef MAC
+ mac_syncache_destroy(&sc->sc_label);
+#endif
+
+ uma_zfree(V_tcp_syncache.zone, sc);
+}
+
+void
+syncache_init(void)
+{
+ int i;
+
+ V_tcp_syncache.cache_count = 0;
+ V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
+ V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
+ V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
+ V_tcp_syncache.hash_secret = arc4random();
+
+ TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
+ &V_tcp_syncache.hashsize);
+ TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
+ &V_tcp_syncache.bucket_limit);
+ if (!powerof2(V_tcp_syncache.hashsize) ||
+ V_tcp_syncache.hashsize == 0) {
+ printf("WARNING: syncache hash size is not a power of 2.\n");
+ V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
+ }
+ V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1;
+
+ /* Set limits. */
+ V_tcp_syncache.cache_limit =
+ V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit;
+ TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
+ &V_tcp_syncache.cache_limit);
+
+ /* Allocate the hash table. */
+ V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize *
+ sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO);
+
+ /* Initialize the hash buckets. */
+ for (i = 0; i < V_tcp_syncache.hashsize; i++) {
+#ifdef VIMAGE
+ V_tcp_syncache.hashbase[i].sch_vnet = curvnet;
+#endif
+ TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket);
+ mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head",
+ NULL, MTX_DEF);
+ callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer,
+ &V_tcp_syncache.hashbase[i].sch_mtx, 0);
+ V_tcp_syncache.hashbase[i].sch_length = 0;
+ }
+
+ /* Create the syncache entry zone. */
+ V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit);
+}
+
+#ifdef VIMAGE
+void
+syncache_destroy(void)
+{
+ struct syncache_head *sch;
+ struct syncache *sc, *nsc;
+ int i;
+
+ /* Cleanup hash buckets: stop timers, free entries, destroy locks. */
+ for (i = 0; i < V_tcp_syncache.hashsize; i++) {
+
+ sch = &V_tcp_syncache.hashbase[i];
+ callout_drain(&sch->sch_timer);
+
+ SCH_LOCK(sch);
+ TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc)
+ syncache_drop(sc, sch);
+ SCH_UNLOCK(sch);
+ KASSERT(TAILQ_EMPTY(&sch->sch_bucket),
+ ("%s: sch->sch_bucket not empty", __func__));
+ KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0",
+ __func__, sch->sch_length));
+ mtx_destroy(&sch->sch_mtx);
+ }
+
+ KASSERT(V_tcp_syncache.cache_count == 0, ("%s: cache_count %d not 0",
+ __func__, V_tcp_syncache.cache_count));
+
+ /* Free the allocated global resources. */
+ uma_zdestroy(V_tcp_syncache.zone);
+ free(V_tcp_syncache.hashbase, M_SYNCACHE);
+}
+#endif
+
+/*
+ * Inserts a syncache entry into the specified bucket row.
+ * Locks and unlocks the syncache_head autonomously.
+ */
+static void
+syncache_insert(struct syncache *sc, struct syncache_head *sch)
+{
+ struct syncache *sc2;
+
+ SCH_LOCK(sch);
+
+ /*
+ * Make sure that we don't overflow the per-bucket limit.
+ * If the bucket is full, toss the oldest element.
+ */
+ if (sch->sch_length >= V_tcp_syncache.bucket_limit) {
+ KASSERT(!TAILQ_EMPTY(&sch->sch_bucket),
+ ("sch->sch_length incorrect"));
+ sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head);
+ syncache_drop(sc2, sch);
+ TCPSTAT_INC(tcps_sc_bucketoverflow);
+ }
+
+ /* Put it into the bucket. */
+ TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash);
+ sch->sch_length++;
+
+ /* Reinitialize the bucket row's timer. */
+ if (sch->sch_length == 1)
+ sch->sch_nextc = ticks + INT_MAX;
+ syncache_timeout(sc, sch, 1);
+
+ SCH_UNLOCK(sch);
+
+ V_tcp_syncache.cache_count++;
+ TCPSTAT_INC(tcps_sc_added);
+}
+
+/*
+ * Remove and free entry from syncache bucket row.
+ * Expects locked syncache head.
+ */
+static void
+syncache_drop(struct syncache *sc, struct syncache_head *sch)
+{
+
+ SCH_LOCK_ASSERT(sch);
+
+ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
+ sch->sch_length--;
+
+#ifndef TCP_OFFLOAD_DISABLE
+ if (sc->sc_tu)
+ sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb);
+#endif
+ syncache_free(sc);
+ V_tcp_syncache.cache_count--;
+}
+
+/*
+ * Engage/reengage time on bucket row.
+ */
+static void
+syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout)
+{
+ sc->sc_rxttime = ticks +
+ TCPTV_RTOBASE * (tcp_backoff[sc->sc_rxmits]);
+ sc->sc_rxmits++;
+ if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) {
+ sch->sch_nextc = sc->sc_rxttime;
+ if (docallout)
+ callout_reset(&sch->sch_timer, sch->sch_nextc - ticks,
+ syncache_timer, (void *)sch);
+ }
+}
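
syncache_timeout() above schedules each retransmit at TCPTV_RTOBASE times an entry
from the doubling backoff table. A small worked example, assuming the customary
3-second RTO base, shows how SYNCACHE_MAXREXMTS == 3 yields the 45-second give-up
figure quoted earlier in this file:

    #include <stdio.h>

    int
    main(void)
    {
    	const int rto_base_sec = 3;	/* TCPTV_RTOBASE in seconds (assumed) */
    	const int backoff[] = { 1, 2, 4, 8 };
    	int i, total = 0;

    	for (i = 0; i < 4; i++) {
    		total += rto_base_sec * backoff[i];
    		printf("transmit %d: wait %d s (cumulative %d s)\n",
    		    i, rto_base_sec * backoff[i], total);
    	}
    	/* cumulative total is 3 * (1 + 2 + 4 + 8) == 45 seconds */
    	return (0);
    }
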
+
+/*
+ * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
+ * If we have retransmitted an entry the maximum number of times, expire it.
+ * One separate timer for each bucket row.
+ */
+static void
+syncache_timer(void *xsch)
+{
+ struct syncache_head *sch = (struct syncache_head *)xsch;
+ struct syncache *sc, *nsc;
+ int tick = ticks;
+ char *s;
+
+ CURVNET_SET(sch->sch_vnet);
+
+ /* NB: syncache_head has already been locked by the callout. */
+ SCH_LOCK_ASSERT(sch);
+
+ /*
+ * In the following cycle we may remove some entries and/or
+ * advance some timeouts, so re-initialize the bucket timer.
+ */
+ sch->sch_nextc = tick + INT_MAX;
+
+ TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) {
+ /*
+ * We do not check if the listen socket still exists
+ * and accept the case where the listen socket may be
+ * gone by the time we resend the SYN/ACK. We do
+		 * not expect this to happen often. If it does,
+ * then the RST will be sent by the time the remote
+ * host does the SYN/ACK->ACK.
+ */
+ if (TSTMP_GT(sc->sc_rxttime, tick)) {
+ if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc))
+ sch->sch_nextc = sc->sc_rxttime;
+ continue;
+ }
+ if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) {
+ if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Retransmits exhausted, "
+ "giving up and removing syncache entry\n",
+ s, __func__);
+ free(s, M_TCPLOG);
+ }
+ syncache_drop(sc, sch);
+ TCPSTAT_INC(tcps_sc_stale);
+ continue;
+ }
+ if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Response timeout, "
+ "retransmitting (%u) SYN|ACK\n",
+ s, __func__, sc->sc_rxmits);
+ free(s, M_TCPLOG);
+ }
+
+ (void) syncache_respond(sc);
+ TCPSTAT_INC(tcps_sc_retransmitted);
+ syncache_timeout(sc, sch, 0);
+ }
+ if (!TAILQ_EMPTY(&(sch)->sch_bucket))
+ callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick,
+ syncache_timer, (void *)(sch));
+ CURVNET_RESTORE();
+}
+
+/*
+ * Find an entry in the syncache.
+ * Always returns with a locked syncache_head plus a matching entry or NULL.
+ */
+struct syncache *
+syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp)
+{
+ struct syncache *sc;
+ struct syncache_head *sch;
+
+#ifdef INET6
+ if (inc->inc_flags & INC_ISIPV6) {
+ sch = &V_tcp_syncache.hashbase[
+ SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)];
+ *schp = sch;
+
+ SCH_LOCK(sch);
+
+ /* Circle through bucket row to find matching entry. */
+ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
+ if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
+ return (sc);
+ }
+ } else
+#endif
+ {
+ sch = &V_tcp_syncache.hashbase[
+ SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)];
+ *schp = sch;
+
+ SCH_LOCK(sch);
+
+ /* Circle through bucket row to find matching entry. */
+ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
+#ifdef INET6
+ if (sc->sc_inc.inc_flags & INC_ISIPV6)
+ continue;
+#endif
+ if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
+ return (sc);
+ }
+ }
+ SCH_LOCK_ASSERT(*schp);
+ return (NULL); /* always returns with locked sch */
+}
+
+/*
+ * This function is called when we get a RST for a
+ * non-existent connection, so that we can see if the
+ * connection is in the syn cache. If it is, zap it.
+ */
+void
+syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th)
+{
+ struct syncache *sc;
+ struct syncache_head *sch;
+ char *s = NULL;
+
+ sc = syncache_lookup(inc, &sch); /* returns locked sch */
+ SCH_LOCK_ASSERT(sch);
+
+ /*
+ * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags.
+ * See RFC 793 page 65, section SEGMENT ARRIVES.
+ */
+ if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or "
+ "FIN flag set, segment ignored\n", s, __func__);
+ TCPSTAT_INC(tcps_badrst);
+ goto done;
+ }
+
+ /*
+ * No corresponding connection was found in syncache.
+ * If syncookies are enabled and possibly exclusively
+ * used, or we are under memory pressure, a valid RST
+ * may not find a syncache entry. In that case we're
+ * done and no SYN|ACK retransmissions will happen.
+	 * Otherwise the RST was misdirected or spoofed.
+ */
+ if (sc == NULL) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Spurious RST without matching "
+ "syncache entry (possibly syncookie only), "
+ "segment ignored\n", s, __func__);
+ TCPSTAT_INC(tcps_badrst);
+ goto done;
+ }
+
+ /*
+ * If the RST bit is set, check the sequence number to see
+ * if this is a valid reset segment.
+ * RFC 793 page 37:
+ * In all states except SYN-SENT, all reset (RST) segments
+ * are validated by checking their SEQ-fields. A reset is
+ * valid if its sequence number is in the window.
+ *
+ * The sequence number in the reset segment is normally an
+	 * echo of our outgoing acknowledgement numbers, but some hosts
+ * send a reset with the sequence number at the rightmost edge
+ * of our receive window, and we have to handle this case.
+ */
+ if (SEQ_GEQ(th->th_seq, sc->sc_irs) &&
+ SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
+ syncache_drop(sc, sch);
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, "
+ "connection attempt aborted by remote endpoint\n",
+ s, __func__);
+ TCPSTAT_INC(tcps_sc_reset);
+ } else {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != "
+ "IRS %u (+WND %u), segment ignored\n",
+ s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd);
+ TCPSTAT_INC(tcps_badrst);
+ }
+
+done:
+ if (s != NULL)
+ free(s, M_TCPLOG);
+ SCH_UNLOCK(sch);
+}
+
+void
+syncache_badack(struct in_conninfo *inc)
+{
+ struct syncache *sc;
+ struct syncache_head *sch;
+
+ sc = syncache_lookup(inc, &sch); /* returns locked sch */
+ SCH_LOCK_ASSERT(sch);
+ if (sc != NULL) {
+ syncache_drop(sc, sch);
+ TCPSTAT_INC(tcps_sc_badack);
+ }
+ SCH_UNLOCK(sch);
+}
+
+void
+syncache_unreach(struct in_conninfo *inc, struct tcphdr *th)
+{
+ struct syncache *sc;
+ struct syncache_head *sch;
+
+ sc = syncache_lookup(inc, &sch); /* returns locked sch */
+ SCH_LOCK_ASSERT(sch);
+ if (sc == NULL)
+ goto done;
+
+ /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
+ if (ntohl(th->th_seq) != sc->sc_iss)
+ goto done;
+
+ /*
+	 * If we've retransmitted 3 times and this is our second error,
+ * we remove the entry. Otherwise, we allow it to continue on.
+ * This prevents us from incorrectly nuking an entry during a
+ * spurious network outage.
+ *
+ * See tcp_notify().
+ */
+ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) {
+ sc->sc_flags |= SCF_UNREACH;
+ goto done;
+ }
+ syncache_drop(sc, sch);
+ TCPSTAT_INC(tcps_sc_unreach);
+done:
+ SCH_UNLOCK(sch);
+}
+
+/*
+ * Build a new TCP socket structure from a syncache entry.
+ */
+static struct socket *
+syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
+{
+ struct inpcb *inp = NULL;
+ struct socket *so;
+ struct tcpcb *tp;
+ int error = 0;
+ char *s;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+ /*
+ * Ok, create the full blown connection, and set things up
+ * as they would have been set up if we had created the
+ * connection when the SYN arrived. If we can't create
+ * the connection, abort it.
+ */
+ so = sonewconn(lso, SS_ISCONNECTED);
+ if (so == NULL) {
+ /*
+ * Drop the connection; we will either send a RST or
+ * have the peer retransmit its SYN again after its
+ * RTO and try again.
+ */
+ TCPSTAT_INC(tcps_listendrop);
+ if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Socket create failed "
+ "due to limits or memory shortage\n",
+ s, __func__);
+ free(s, M_TCPLOG);
+ }
+ goto abort2;
+ }
+#ifdef MAC
+ mac_socketpeer_set_from_mbuf(m, so);
+#endif
+
+ inp = sotoinpcb(so);
+ inp->inp_inc.inc_fibnum = so->so_fibnum;
+ INP_WLOCK(inp);
+
+ /* Insert new socket into PCB hash list. */
+ inp->inp_inc.inc_flags = sc->sc_inc.inc_flags;
+#ifdef INET6
+ if (sc->sc_inc.inc_flags & INC_ISIPV6) {
+ inp->in6p_laddr = sc->sc_inc.inc6_laddr;
+ } else {
+ inp->inp_vflag &= ~INP_IPV6;
+ inp->inp_vflag |= INP_IPV4;
+#endif
+ inp->inp_laddr = sc->sc_inc.inc_laddr;
+#ifdef INET6
+ }
+#endif
+ inp->inp_lport = sc->sc_inc.inc_lport;
+ if ((error = in_pcbinshash(inp)) != 0) {
+ /*
+ * Undo the assignments above if we failed to
+ * put the PCB on the hash lists.
+ */
+#ifdef INET6
+ if (sc->sc_inc.inc_flags & INC_ISIPV6)
+ inp->in6p_laddr = in6addr_any;
+ else
+#endif
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ inp->inp_lport = 0;
+ if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: in_pcbinshash failed "
+ "with error %i\n",
+ s, __func__, error);
+ free(s, M_TCPLOG);
+ }
+ goto abort;
+ }
+#ifdef IPSEC
+ /* Copy old policy into new socket's. */
+ if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
+ printf("syncache_socket: could not copy policy\n");
+#endif
+#ifdef INET6
+ if (sc->sc_inc.inc_flags & INC_ISIPV6) {
+ struct inpcb *oinp = sotoinpcb(lso);
+ struct in6_addr laddr6;
+ struct sockaddr_in6 sin6;
+ /*
+ * Inherit socket options from the listening socket.
+ * Note that in6p_inputopts are not (and should not be)
+ * copied, since it stores previously received options and is
+ * used to detect if each new option is different than the
+ * previous one and hence should be passed to a user.
+ * If we copied in6p_inputopts, a user would not be able to
+ * receive options just after calling the accept system call.
+ */
+ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
+ if (oinp->in6p_outputopts)
+ inp->in6p_outputopts =
+ ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
+
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_len = sizeof(sin6);
+ sin6.sin6_addr = sc->sc_inc.inc6_faddr;
+ sin6.sin6_port = sc->sc_inc.inc_fport;
+ sin6.sin6_flowinfo = sin6.sin6_scope_id = 0;
+ laddr6 = inp->in6p_laddr;
+ if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
+ inp->in6p_laddr = sc->sc_inc.inc6_laddr;
+#ifndef __rtems__
+ if ((error = in6_pcbconnect(inp, (struct sockaddr *)&sin6,
+ thread0.td_ucred)) != 0) {
+#else /* __rtems__ */
+ if ((error = in6_pcbconnect(inp, (struct sockaddr *)&sin6,
+ rtems_bsd_thread0_ucred)) != 0) {
+#endif /* __rtems__ */
+ inp->in6p_laddr = laddr6;
+ if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed "
+ "with error %i\n",
+ s, __func__, error);
+ free(s, M_TCPLOG);
+ }
+ goto abort;
+ }
+ /* Override flowlabel from in6_pcbconnect. */
+ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
+ inp->inp_flow |= sc->sc_flowlabel;
+ } else
+#endif
+ {
+ struct in_addr laddr;
+ struct sockaddr_in sin;
+
+ inp->inp_options = (m) ? ip_srcroute(m) : NULL;
+
+ if (inp->inp_options == NULL) {
+ inp->inp_options = sc->sc_ipopts;
+ sc->sc_ipopts = NULL;
+ }
+
+ sin.sin_family = AF_INET;
+ sin.sin_len = sizeof(sin);
+ sin.sin_addr = sc->sc_inc.inc_faddr;
+ sin.sin_port = sc->sc_inc.inc_fport;
+ bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero));
+ laddr = inp->inp_laddr;
+ if (inp->inp_laddr.s_addr == INADDR_ANY)
+ inp->inp_laddr = sc->sc_inc.inc_laddr;
+#ifndef __rtems__
+ if ((error = in_pcbconnect(inp, (struct sockaddr *)&sin,
+ thread0.td_ucred)) != 0) {
+#else /* __rtems__ */
+ if ((error = in_pcbconnect(inp, (struct sockaddr *)&sin,
+ rtems_bsd_thread0_ucred)) != 0) {
+#endif /* __rtems__ */
+
+ inp->inp_laddr = laddr;
+ if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: in_pcbconnect failed "
+ "with error %i\n",
+ s, __func__, error);
+ free(s, M_TCPLOG);
+ }
+ goto abort;
+ }
+ }
+ tp = intotcpcb(inp);
+ tp->t_state = TCPS_SYN_RECEIVED;
+ tp->iss = sc->sc_iss;
+ tp->irs = sc->sc_irs;
+ tcp_rcvseqinit(tp);
+ tcp_sendseqinit(tp);
+ tp->snd_wl1 = sc->sc_irs;
+ tp->snd_max = tp->iss + 1;
+ tp->snd_nxt = tp->iss + 1;
+ tp->rcv_up = sc->sc_irs + 1;
+ tp->rcv_wnd = sc->sc_wnd;
+ tp->rcv_adv += tp->rcv_wnd;
+ tp->last_ack_sent = tp->rcv_nxt;
+
+ tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
+ if (sc->sc_flags & SCF_NOOPT)
+ tp->t_flags |= TF_NOOPT;
+ else {
+ if (sc->sc_flags & SCF_WINSCALE) {
+ tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
+ tp->snd_scale = sc->sc_requested_s_scale;
+ tp->request_r_scale = sc->sc_requested_r_scale;
+ }
+ if (sc->sc_flags & SCF_TIMESTAMP) {
+ tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
+ tp->ts_recent = sc->sc_tsreflect;
+ tp->ts_recent_age = ticks;
+ tp->ts_offset = sc->sc_tsoff;
+ }
+#ifdef TCP_SIGNATURE
+ if (sc->sc_flags & SCF_SIGNATURE)
+ tp->t_flags |= TF_SIGNATURE;
+#endif
+ if (sc->sc_flags & SCF_SACK)
+ tp->t_flags |= TF_SACK_PERMIT;
+ }
+
+ if (sc->sc_flags & SCF_ECN)
+ tp->t_flags |= TF_ECN_PERMIT;
+
+ /*
+ * Set up MSS and get cached values from tcp_hostcache.
+ * This might overwrite some of the defaults we just set.
+ */
+ tcp_mss(tp, sc->sc_peer_mss);
+
+ /*
+ * If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
+ * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits.
+ */
+ if (sc->sc_rxmits > 1)
+ tp->snd_cwnd = tp->t_maxseg;
+ tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
+
+ INP_WUNLOCK(inp);
+
+ TCPSTAT_INC(tcps_accepts);
+ return (so);
+
+abort:
+ INP_WUNLOCK(inp);
+abort2:
+ if (so != NULL)
+ soabort(so);
+ return (NULL);
+}
+
+/*
+ * This function gets called when we receive an ACK for a
+ * socket in the LISTEN state. We look up the connection
+ * in the syncache, and if it's there, we pull it out of
+ * the cache and turn it into a full-blown connection in
+ * the SYN-RECEIVED state.
+ */
+int
+syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
+ struct socket **lsop, struct mbuf *m)
+{
+ struct syncache *sc;
+ struct syncache_head *sch;
+ struct syncache scs;
+ char *s;
+
+ /*
+ * Global TCP locks are held because we manipulate the PCB lists
+ * and create a new socket.
+ */
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK,
+ ("%s: can handle only ACK", __func__));
+
+ sc = syncache_lookup(inc, &sch); /* returns locked sch */
+ SCH_LOCK_ASSERT(sch);
+ if (sc == NULL) {
+ /*
+ * There is no syncache entry, so see if this ACK is
+ * a returning syncookie. To do this, first:
+ * A. See if this socket has had a syncache entry dropped in
+ * the past. We don't want to accept a bogus syncookie
+ * if we've never received a SYN.
+ * B. Check that the syncookie is valid. If it is, then
+ * cobble up a fake syncache entry, and return.
+ */
+ if (!V_tcp_syncookies) {
+ SCH_UNLOCK(sch);
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Spurious ACK, "
+ "segment rejected (syncookies disabled)\n",
+ s, __func__);
+ goto failed;
+ }
+ bzero(&scs, sizeof(scs));
+ sc = syncookie_lookup(inc, sch, &scs, to, th, *lsop);
+ SCH_UNLOCK(sch);
+ if (sc == NULL) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Segment failed "
+ "SYNCOOKIE authentication, segment rejected "
+ "(probably spoofed)\n", s, __func__);
+ goto failed;
+ }
+ } else {
+ /* Pull out the entry to unlock the bucket row. */
+ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
+ sch->sch_length--;
+ V_tcp_syncache.cache_count--;
+ SCH_UNLOCK(sch);
+ }
+
+ /*
+ * Segment validation:
+ * ACK must match our initial sequence number + 1 (the SYN|ACK).
+ */
+ if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment "
+ "rejected\n", s, __func__, th->th_ack, sc->sc_iss);
+ goto failed;
+ }
+
+ /*
+ * The SEQ must fall in the window starting at the received
+ * initial receive sequence number + 1 (the SYN).
+ */
+ if ((SEQ_LEQ(th->th_seq, sc->sc_irs) ||
+ SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) &&
+ !TOEPCB_ISSET(sc)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment "
+ "rejected\n", s, __func__, th->th_seq, sc->sc_irs);
+ goto failed;
+ }
+
+ if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
+ "segment rejected\n", s, __func__);
+ goto failed;
+ }
+ /*
+ * If timestamps were negotiated the reflected timestamp
+ * must be equal to what we actually sent in the SYN|ACK.
+ */
+ if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts &&
+ !TOEPCB_ISSET(sc)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, "
+ "segment rejected\n",
+ s, __func__, to->to_tsecr, sc->sc_ts);
+ goto failed;
+ }
+
+ *lsop = syncache_socket(sc, *lsop, m);
+
+ if (*lsop == NULL)
+ TCPSTAT_INC(tcps_sc_aborted);
+ else
+ TCPSTAT_INC(tcps_sc_completed);
+
+/* how do we find the inp for the new socket? */
+ if (sc != &scs)
+ syncache_free(sc);
+ return (1);
+failed:
+ if (sc != NULL && sc != &scs)
+ syncache_free(sc);
+ if (s != NULL)
+ free(s, M_TCPLOG);
+ *lsop = NULL;
+ return (0);
+}
+
+int
+tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo,
+ struct tcphdr *th, struct socket **lsop, struct mbuf *m)
+{
+ struct tcpopt to;
+ int rc;
+
+ bzero(&to, sizeof(struct tcpopt));
+ to.to_mss = toeo->to_mss;
+ to.to_wscale = toeo->to_wscale;
+ to.to_flags = toeo->to_flags;
+
+ INP_INFO_WLOCK(&V_tcbinfo);
+ rc = syncache_expand(inc, &to, th, lsop, m);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+
+ return (rc);
+}
+
+/*
+ * Given a LISTEN socket and an inbound SYN request, add
+ * this to the syn cache, and send back a segment:
+ * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
+ * to the source.
+ *
+ * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
+ * Doing so would require that we hold onto the data and deliver it
+ * to the application. However, if we are the target of a SYN-flood
+ * DoS attack, an attacker could send data which would eventually
+ * consume all available buffer space if it were ACKed. By not ACKing
+ * the data, we avoid this DoS scenario.
+ */
+static void
+_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
+ struct inpcb *inp, struct socket **lsop, struct mbuf *m,
+ struct toe_usrreqs *tu, void *toepcb)
+{
+ struct tcpcb *tp;
+ struct socket *so;
+ struct syncache *sc = NULL;
+ struct syncache_head *sch;
+ struct mbuf *ipopts = NULL;
+ u_int32_t flowtmp;
+ int win, sb_hiwat, ip_ttl, ip_tos, noopt;
+ char *s;
+#ifdef INET6
+ int autoflowlabel = 0;
+#endif
+#ifdef MAC
+ struct label *maclabel;
+#endif
+ struct syncache scs;
+ struct ucred *cred;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(inp); /* listen socket */
+ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN,
+ ("%s: unexpected tcp flags", __func__));
+
+ /*
+ * Combine all so/tp operations very early to drop the INP lock as
+ * soon as possible.
+ */
+ so = *lsop;
+ tp = sototcpcb(so);
+ cred = crhold(so->so_cred);
+
+#ifdef INET6
+ if ((inc->inc_flags & INC_ISIPV6) &&
+ (inp->inp_flags & IN6P_AUTOFLOWLABEL))
+ autoflowlabel = 1;
+#endif
+ ip_ttl = inp->inp_ip_ttl;
+ ip_tos = inp->inp_ip_tos;
+ win = sbspace(&so->so_rcv);
+ sb_hiwat = so->so_rcv.sb_hiwat;
+ noopt = (tp->t_flags & TF_NOOPT);
+
+ /* By the time we drop the lock these should no longer be used. */
+ so = NULL;
+ tp = NULL;
+
+#ifdef MAC
+ if (mac_syncache_init(&maclabel) != 0) {
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ goto done;
+ } else
+ mac_syncache_create(maclabel, inp);
+#endif
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+
+ /*
+ * Remember the IP options, if any.
+ */
+#ifdef INET6
+ if (!(inc->inc_flags & INC_ISIPV6))
+#endif
+ ipopts = (m) ? ip_srcroute(m) : NULL;
+
+ /*
+ * See if we already have an entry for this connection.
+ * If we do, resend the SYN,ACK, and reset the retransmit timer.
+ *
+ * XXX: should the syncache be re-initialized with the contents
+ * of the new SYN here (which may have different options?)
+ *
+ * XXX: We do not check the sequence number to see if this is a
+ * real retransmit or a new connection attempt. The question is
+ * how to handle such a case; either ignore it as spoofed, or
+ * drop the current entry and create a new one?
+ */
+ sc = syncache_lookup(inc, &sch); /* returns locked entry */
+ SCH_LOCK_ASSERT(sch);
+ if (sc != NULL) {
+#ifndef TCP_OFFLOAD_DISABLE
+ if (sc->sc_tu)
+ sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT,
+ sc->sc_toepcb);
+#endif
+ TCPSTAT_INC(tcps_sc_dupsyn);
+ if (ipopts) {
+ /*
+ * If we were remembering a previous source route,
+ * forget it and use the new one we've been given.
+ */
+ if (sc->sc_ipopts)
+ (void) m_free(sc->sc_ipopts);
+ sc->sc_ipopts = ipopts;
+ }
+ /*
+ * Update timestamp if present.
+ */
+ if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS))
+ sc->sc_tsreflect = to->to_tsval;
+ else
+ sc->sc_flags &= ~SCF_TIMESTAMP;
+#ifdef MAC
+ /*
+ * Since we have already unconditionally allocated label
+ * storage, free it up. The syncache entry will already
+ * have an initialized label we can use.
+ */
+ mac_syncache_destroy(&maclabel);
+#endif
+ /* Retransmit SYN|ACK and reset retransmit count. */
+ if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Received duplicate SYN, "
+ "resetting timer and retransmitting SYN|ACK\n",
+ s, __func__);
+ free(s, M_TCPLOG);
+ }
+ if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) {
+ sc->sc_rxmits = 0;
+ syncache_timeout(sc, sch, 1);
+ TCPSTAT_INC(tcps_sndacks);
+ TCPSTAT_INC(tcps_sndtotal);
+ }
+ SCH_UNLOCK(sch);
+ goto done;
+ }
+
+ sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
+ if (sc == NULL) {
+ /*
+ * The zone allocator couldn't provide more entries.
+ * Treat this as if the cache was full; drop the oldest
+ * entry and insert the new one.
+ */
+ TCPSTAT_INC(tcps_sc_zonefail);
+ if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL)
+ syncache_drop(sc, sch);
+ sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
+ if (sc == NULL) {
+ if (V_tcp_syncookies) {
+ bzero(&scs, sizeof(scs));
+ sc = &scs;
+ } else {
+ SCH_UNLOCK(sch);
+ if (ipopts)
+ (void) m_free(ipopts);
+ goto done;
+ }
+ }
+ }
+
+ /*
+ * Fill in the syncache values.
+ */
+#ifdef MAC
+ sc->sc_label = maclabel;
+#endif
+ sc->sc_cred = cred;
+ cred = NULL;
+ sc->sc_ipopts = ipopts;
+ bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
+#ifdef INET6
+ if (!(inc->inc_flags & INC_ISIPV6))
+#endif
+ {
+ sc->sc_ip_tos = ip_tos;
+ sc->sc_ip_ttl = ip_ttl;
+ }
+#ifndef TCP_OFFLOAD_DISABLE
+ sc->sc_tu = tu;
+ sc->sc_toepcb = toepcb;
+#endif
+ sc->sc_irs = th->th_seq;
+ sc->sc_iss = arc4random();
+ sc->sc_flags = 0;
+ sc->sc_flowlabel = 0;
+
+ /*
+ * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN].
+ * win was derived from socket earlier in the function.
+ */
+ win = imax(win, 0);
+ win = imin(win, TCP_MAXWIN);
+ sc->sc_wnd = win;
+
+ if (V_tcp_do_rfc1323) {
+ /*
+ * A timestamp received in a SYN makes
+ * it ok to send timestamp requests and replies.
+ */
+ if (to->to_flags & TOF_TS) {
+ sc->sc_tsreflect = to->to_tsval;
+ sc->sc_ts = ticks;
+ sc->sc_flags |= SCF_TIMESTAMP;
+ }
+ if (to->to_flags & TOF_SCALE) {
+ int wscale = 0;
+
+ /*
+ * Pick the smallest possible scaling factor that
+ * will still allow us to scale up to sb_max, aka
+ * kern.ipc.maxsockbuf.
+ *
+ * We do this because there are broken firewalls that
+ * will corrupt the window scale option, leading to
+ * the other endpoint believing that our advertised
+ * window is unscaled. At scale factors larger than
+ * 5 the unscaled window will drop below 1500 bytes,
+ * leading to serious problems when traversing these
+ * broken firewalls.
+ *
+ * With the default maxsockbuf of 256K, a scale factor
+ * of 3 will be chosen by this algorithm. Those who
+ * choose a larger maxsockbuf should watch out
+ * for the compatibility problems mentioned above.
+ *
+ * RFC1323: The Window field in a SYN (i.e., a <SYN>
+ * or <SYN,ACK>) segment itself is never scaled.
+ */
+ while (wscale < TCP_MAX_WINSHIFT &&
+ (TCP_MAXWIN << wscale) < sb_max)
+ wscale++;
+ sc->sc_requested_r_scale = wscale;
+ sc->sc_requested_s_scale = to->to_wscale;
+ sc->sc_flags |= SCF_WINSCALE;
+ }
+ }
+#ifdef TCP_SIGNATURE
+ /*
+ * If listening socket requested TCP digests, and received SYN
+ * contains the option, flag this in the syncache so that
+ * syncache_respond() will do the right thing with the SYN+ACK.
+ * XXX: Currently we always record the option by default and will
+ * attempt to use it in syncache_respond().
+ */
+ if (to->to_flags & TOF_SIGNATURE)
+ sc->sc_flags |= SCF_SIGNATURE;
+#endif
+ if (to->to_flags & TOF_SACKPERM)
+ sc->sc_flags |= SCF_SACK;
+ if (to->to_flags & TOF_MSS)
+ sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */
+ if (noopt)
+ sc->sc_flags |= SCF_NOOPT;
+ if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn)
+ sc->sc_flags |= SCF_ECN;
+
+ if (V_tcp_syncookies) {
+ syncookie_generate(sch, sc, &flowtmp);
+#ifdef INET6
+ if (autoflowlabel)
+ sc->sc_flowlabel = flowtmp;
+#endif
+ } else {
+#ifdef INET6
+ if (autoflowlabel)
+ sc->sc_flowlabel =
+ (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
+#endif
+ }
+ SCH_UNLOCK(sch);
+
+ /*
+ * Do a standard 3-way handshake.
+ */
+ if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) {
+ if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs)
+ syncache_free(sc);
+ else if (sc != &scs)
+ syncache_insert(sc, sch); /* locks and unlocks sch */
+ TCPSTAT_INC(tcps_sndacks);
+ TCPSTAT_INC(tcps_sndtotal);
+ } else {
+ if (sc != &scs)
+ syncache_free(sc);
+ TCPSTAT_INC(tcps_sc_dropped);
+ }
+
+done:
+ if (cred != NULL)
+ crfree(cred);
+#ifdef MAC
+ if (sc == &scs)
+ mac_syncache_destroy(&maclabel);
+#endif
+ if (m) {
+
+ *lsop = NULL;
+ m_freem(m);
+ }
+}
+
+static int
+syncache_respond(struct syncache *sc)
+{
+ struct ip *ip = NULL;
+ struct mbuf *m;
+ struct tcphdr *th;
+ int optlen, error;
+ u_int16_t hlen, tlen, mssopt;
+ struct tcpopt to;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+#endif
+
+ hlen =
+#ifdef INET6
+ (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) :
+#endif
+ sizeof(struct ip);
+ tlen = hlen + sizeof(struct tcphdr);
+
+ /* Determine MSS we advertise to the other end of the connection. */
+ mssopt = tcp_mssopt(&sc->sc_inc);
+ if (sc->sc_peer_mss)
+ mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss);
+
+ /* XXX: Assume that the entire packet will fit in a header mbuf. */
+ KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN,
+ ("syncache: mbuf too small"));
+
+ /* Create the IP+TCP header from scratch. */
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
+ if (m == NULL)
+ return (ENOBUFS);
+#ifdef MAC
+ mac_syncache_create_mbuf(sc->sc_label, m);
+#endif
+ m->m_data += max_linkhdr;
+ m->m_len = tlen;
+ m->m_pkthdr.len = tlen;
+ m->m_pkthdr.rcvif = NULL;
+
+#ifdef INET6
+ if (sc->sc_inc.inc_flags & INC_ISIPV6) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ ip6->ip6_vfc = IPV6_VERSION;
+ ip6->ip6_nxt = IPPROTO_TCP;
+ ip6->ip6_src = sc->sc_inc.inc6_laddr;
+ ip6->ip6_dst = sc->sc_inc.inc6_faddr;
+ ip6->ip6_plen = htons(tlen - hlen);
+ /* ip6_hlim is set after checksum */
+ ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
+ ip6->ip6_flow |= sc->sc_flowlabel;
+
+ th = (struct tcphdr *)(ip6 + 1);
+ } else
+#endif
+ {
+ ip = mtod(m, struct ip *);
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = sizeof(struct ip) >> 2;
+ ip->ip_len = tlen;
+ ip->ip_id = 0;
+ ip->ip_off = 0;
+ ip->ip_sum = 0;
+ ip->ip_p = IPPROTO_TCP;
+ ip->ip_src = sc->sc_inc.inc_laddr;
+ ip->ip_dst = sc->sc_inc.inc_faddr;
+ ip->ip_ttl = sc->sc_ip_ttl;
+ ip->ip_tos = sc->sc_ip_tos;
+
+ /*
+ * See if we should do MTU discovery. Route lookups are
+ * expensive, so we will only unset the DF bit if:
+ *
+ * 1) path_mtu_discovery is disabled
+ * 2) the SCF_UNREACH flag has been set
+ */
+ if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
+ ip->ip_off |= IP_DF;
+
+ th = (struct tcphdr *)(ip + 1);
+ }
+ th->th_sport = sc->sc_inc.inc_lport;
+ th->th_dport = sc->sc_inc.inc_fport;
+
+ th->th_seq = htonl(sc->sc_iss);
+ th->th_ack = htonl(sc->sc_irs + 1);
+ th->th_off = sizeof(struct tcphdr) >> 2;
+ th->th_x2 = 0;
+ th->th_flags = TH_SYN|TH_ACK;
+ th->th_win = htons(sc->sc_wnd);
+ th->th_urp = 0;
+
+ if (sc->sc_flags & SCF_ECN) {
+ th->th_flags |= TH_ECE;
+ TCPSTAT_INC(tcps_ecn_shs);
+ }
+
+ /* Tack on the TCP options. */
+ if ((sc->sc_flags & SCF_NOOPT) == 0) {
+ to.to_flags = 0;
+
+ to.to_mss = mssopt;
+ to.to_flags = TOF_MSS;
+ if (sc->sc_flags & SCF_WINSCALE) {
+ to.to_wscale = sc->sc_requested_r_scale;
+ to.to_flags |= TOF_SCALE;
+ }
+ if (sc->sc_flags & SCF_TIMESTAMP) {
+ /* Virgin timestamp or TCP cookie enhanced one. */
+ to.to_tsval = sc->sc_ts;
+ to.to_tsecr = sc->sc_tsreflect;
+ to.to_flags |= TOF_TS;
+ }
+ if (sc->sc_flags & SCF_SACK)
+ to.to_flags |= TOF_SACKPERM;
+#ifdef TCP_SIGNATURE
+ if (sc->sc_flags & SCF_SIGNATURE)
+ to.to_flags |= TOF_SIGNATURE;
+#endif
+ optlen = tcp_addoptions(&to, (u_char *)(th + 1));
+
+ /* Adjust headers by option size. */
+ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
+ m->m_len += optlen;
+ m->m_pkthdr.len += optlen;
+
+#ifdef TCP_SIGNATURE
+ if (sc->sc_flags & SCF_SIGNATURE)
+ tcp_signature_compute(m, 0, 0, optlen,
+ to.to_signature, IPSEC_DIR_OUTBOUND);
+#endif
+#ifdef INET6
+ if (sc->sc_inc.inc_flags & INC_ISIPV6)
+ ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen);
+ else
+#endif
+ ip->ip_len += optlen;
+ } else
+ optlen = 0;
+
+ M_SETFIB(m, sc->sc_inc.inc_fibnum);
+#ifdef INET6
+ if (sc->sc_inc.inc_flags & INC_ISIPV6) {
+ th->th_sum = 0;
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen,
+ tlen + optlen - hlen);
+ ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
+ error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
+ } else
+#endif
+ {
+ th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(tlen + optlen - hlen + IPPROTO_TCP));
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL);
+ }
+ return (error);
+}
+
+void
+syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
+ struct inpcb *inp, struct socket **lsop, struct mbuf *m)
+{
+ _syncache_add(inc, to, th, inp, lsop, m, NULL, NULL);
+}
+
+void
+tcp_offload_syncache_add(struct in_conninfo *inc, struct toeopt *toeo,
+ struct tcphdr *th, struct inpcb *inp, struct socket **lsop,
+ struct toe_usrreqs *tu, void *toepcb)
+{
+ struct tcpopt to;
+
+ bzero(&to, sizeof(struct tcpopt));
+ to.to_mss = toeo->to_mss;
+ to.to_wscale = toeo->to_wscale;
+ to.to_flags = toeo->to_flags;
+
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+
+ _syncache_add(inc, &to, th, inp, lsop, NULL, tu, toepcb);
+}
+
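+/*
+ * Call-flow sketch (illustrative only; the actual call sites live in the
+ * TCP input path, not in this file): the two public entry points above
+ * are expected to be used roughly as
+ *
+ *	SYN arrives on a LISTEN socket:
+ *		syncache_add(inc, &to, th, inp, &lso, m);
+ *	ACK completing the handshake arrives:
+ *		if (syncache_expand(inc, &to, th, &lso, m))
+ *			lso now points at the new SYN_RECEIVED socket
+ */
+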
+/*
+ * The purpose of SYN cookies is to avoid keeping track of all SYN's we
+ * receive and to be able to handle SYN floods from bogus source addresses
+ * (where we will never receive any reply). SYN floods try to exhaust all
+ * our memory and available slots in the SYN cache table to cause a denial
+ * of service to legitimate users of the local host.
+ *
+ * The idea of SYN cookies is to encode and include all necessary information
+ * about the connection setup state within the SYN-ACK we send back and thus
+ * to get along without keeping any local state until the ACK to the SYN-ACK
+ * arrives (if ever). Everything we need to know should be available from
+ * the information we encoded in the SYN-ACK.
+ *
+ * More information about the theory behind SYN cookies and its first
+ * discussion and specification can be found at:
+ * http://cr.yp.to/syncookies.html (overview)
+ * http://cr.yp.to/syncookies/archive (gory details)
+ *
+ * This implementation extends the original idea and the first FreeBSD
+ * implementation by using not only the initial sequence number field to store
+ * information but also the timestamp field if present. This way we can
+ * keep track of the entire state we need to know to recreate the session in
+ * its original form. Almost all TCP speakers implement RFC1323 timestamps
+ * these days. For those that do not, we still have to live with the known
+ * shortcomings of ISN-only SYN cookies.
+ *
+ * Cookie layers:
+ *
+ * Initial sequence number we send:
+ * 31|................................|0
+ * DDDDDDDDDDDDDDDDDDDDDDDDDMMMRRRP
+ * D = MD5 Digest (first dword)
+ * M = MSS index
+ * R = Rotation of secret
+ * P = Odd or Even secret
+ *
+ * The MD5 digest is computed over the following parameters:
+ * a) randomly rotated secret
+ * b) struct in_conninfo containing the remote/local ip/port (IPv4&IPv6)
+ * c) the received initial sequence number from remote host
+ * d) the rotation offset and odd/even bit
+ *
+ * Timestamp we send:
+ * 31|................................|0
+ * DDDDDDDDDDDDDDDDDDDDDDSSSSRRRRA5
+ * D = MD5 Digest (third dword) (only as filler)
+ * S = Requested send window scale
+ * R = Requested receive window scale
+ * A = SACK allowed
+ * 5 = TCP-MD5 enabled (not implemented yet)
+ * XORed with MD5 digest (fourth dword)
+ *
+ * The timestamp isn't cryptographically secure and doesn't need to be.
+ * The double use of the MD5 digest dwords ties it to a specific remote/
+ * local host/port, remote initial sequence number and our local time
+ * limited secret. A received timestamp is reverted (XORed) and then
+ * the contained MD5 dword is compared to the computed one to ensure the
+ * timestamp belongs to the SYN-ACK we sent. The other parameters may
+ * have been tampered with but this isn't different from supplying bogus
+ * values in the SYN in the first place.
+ *
+ * Some problems with SYN cookies remain however:
+ * Consider the problem of a recreated (and retransmitted) cookie. If the
+ * original SYN was accepted, the connection is established. The second
+ * SYN is inflight, and if it arrives with an ISN that falls within the
+ * receive window, the connection is killed.
+ *
+ * Notes:
+ * A heuristic to determine when to accept syn cookies is not necessary.
+ * An ACK flood would cause the syncookie verification to be attempted,
+ * but a SYN flood causes syncookies to be generated. Both are of equal
+ * cost, so there's no point in trying to optimize the ACK flood case.
+ * Also, if you don't process certain ACKs for some reason, then all someone
+ * would have to do is launch a SYN and ACK flood at the same time, which
+ * would stop cookie verification and defeat the entire purpose of syncookies.
+ */
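+/*
+ * Illustrative decode sketch (not part of the upstream code): a returning
+ * cookie carries the ISS described above in th_ack - 1, and
+ * syncookie_lookup() below recovers the encoded fields roughly as
+ *
+ *	ack = th->th_ack - 1;		the ISS we sent in the SYN|ACK
+ *	odd = ack & 0x1;		which of the two secrets was used
+ *	off = (ack >> 1) & 0x7;		rotation offset into that secret
+ *	mss = (ack >> 4) & 0x7;		index into tcp_sc_msstab[] below
+ *	dgst = ack & ~0x7f;		must equal md5_buffer[0] << 7
+ */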
+static int tcp_sc_msstab[] = { 0, 256, 468, 536, 996, 1452, 1460, 8960 };
+
+static void
+syncookie_generate(struct syncache_head *sch, struct syncache *sc,
+ u_int32_t *flowlabel)
+{
+ MD5_CTX ctx;
+ u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)];
+ u_int32_t data;
+ u_int32_t *secbits;
+ u_int off, pmss, mss;
+ int i;
+
+ SCH_LOCK_ASSERT(sch);
+
+ /* Which of the two secrets to use. */
+ secbits = sch->sch_oddeven ?
+ sch->sch_secbits_odd : sch->sch_secbits_even;
+
+ /* Reseed secret if too old. */
+ if (sch->sch_reseed < time_uptime) {
+ sch->sch_oddeven = sch->sch_oddeven ? 0 : 1; /* toggle */
+ secbits = sch->sch_oddeven ?
+ sch->sch_secbits_odd : sch->sch_secbits_even;
+ for (i = 0; i < SYNCOOKIE_SECRET_SIZE; i++)
+ secbits[i] = arc4random();
+ sch->sch_reseed = time_uptime + SYNCOOKIE_LIFETIME;
+ }
+
+ /* Secret rotation offset. */
+ off = sc->sc_iss & 0x7; /* iss was randomized before */
+
+ /* Maximum segment size calculation. */
+ pmss =
+ max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)), V_tcp_minmss);
+ for (mss = sizeof(tcp_sc_msstab) / sizeof(int) - 1; mss > 0; mss--)
+ if (tcp_sc_msstab[mss] <= pmss)
+ break;
+
+ /* Fold parameters and MD5 digest into the ISN we will send. */
+ data = sch->sch_oddeven;/* odd or even secret, 1 bit */
+ data |= off << 1; /* secret offset, derived from iss, 3 bits */
+ data |= mss << 4; /* mss, 3 bits */
+
+ MD5Init(&ctx);
+ MD5Update(&ctx, ((u_int8_t *)secbits) + off,
+ SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off);
+ MD5Update(&ctx, secbits, off);
+ MD5Update(&ctx, &sc->sc_inc, sizeof(sc->sc_inc));
+ MD5Update(&ctx, &sc->sc_irs, sizeof(sc->sc_irs));
+ MD5Update(&ctx, &data, sizeof(data));
+ MD5Final((u_int8_t *)&md5_buffer, &ctx);
+
+ data |= (md5_buffer[0] << 7);
+ sc->sc_iss = data;
+
+#ifdef INET6
+ *flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
+#endif
+
+ /* Additional parameters are stored in the timestamp if present. */
+ if (sc->sc_flags & SCF_TIMESTAMP) {
+ data = ((sc->sc_flags & SCF_SIGNATURE) ? 1 : 0); /* TCP-MD5, 1 bit */
+ data |= ((sc->sc_flags & SCF_SACK) ? 1 : 0) << 1; /* SACK, 1 bit */
+ data |= sc->sc_requested_s_scale << 2; /* SWIN scale, 4 bits */
+ data |= sc->sc_requested_r_scale << 6; /* RWIN scale, 4 bits */
+ data |= md5_buffer[2] << 10; /* more digest bits */
+ data ^= md5_buffer[3];
+ sc->sc_ts = data;
+ sc->sc_tsoff = data - ticks; /* after XOR */
+ }
+
+ TCPSTAT_INC(tcps_sc_sendcookie);
+}
+
+static struct syncache *
+syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
+ struct syncache *sc, struct tcpopt *to, struct tcphdr *th,
+ struct socket *so)
+{
+ MD5_CTX ctx;
+ u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)];
+ u_int32_t data = 0;
+ u_int32_t *secbits;
+ tcp_seq ack, seq;
+ int off, mss, wnd, flags;
+
+ SCH_LOCK_ASSERT(sch);
+
+ /*
+ * Pull information out of SYN-ACK/ACK and
+ * revert sequence number advances.
+ */
+ ack = th->th_ack - 1;
+ seq = th->th_seq - 1;
+ off = (ack >> 1) & 0x7;
+ mss = (ack >> 4) & 0x7;
+ flags = ack & 0x7f;
+
+ /* Which of the two secrets to use. */
+ secbits = (flags & 0x1) ? sch->sch_secbits_odd : sch->sch_secbits_even;
+
+ /*
+ * The secret wasn't updated for the lifetime of a syncookie,
+ * so this SYN-ACK/ACK is either too old (replay) or totally bogus.
+ */
+ if (sch->sch_reseed + SYNCOOKIE_LIFETIME < time_uptime) {
+ return (NULL);
+ }
+
+ /* Recompute the digest so we can compare it. */
+ MD5Init(&ctx);
+ MD5Update(&ctx, ((u_int8_t *)secbits) + off,
+ SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off);
+ MD5Update(&ctx, secbits, off);
+ MD5Update(&ctx, inc, sizeof(*inc));
+ MD5Update(&ctx, &seq, sizeof(seq));
+ MD5Update(&ctx, &flags, sizeof(flags));
+ MD5Final((u_int8_t *)&md5_buffer, &ctx);
+
+ /* Does the digest part of our ACK'ed ISS match? */
+ if ((ack & (~0x7f)) != (md5_buffer[0] << 7))
+ return (NULL);
+
+ /* Does the digest part of our reflected timestamp match? */
+ if (to->to_flags & TOF_TS) {
+ data = md5_buffer[3] ^ to->to_tsecr;
+ if ((data & (~0x3ff)) != (md5_buffer[2] << 10))
+ return (NULL);
+ }
+
+ /* Fill in the syncache values. */
+ bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
+ sc->sc_ipopts = NULL;
+
+ sc->sc_irs = seq;
+ sc->sc_iss = ack;
+
+#ifdef INET6
+ if (inc->inc_flags & INC_ISIPV6) {
+ if (sotoinpcb(so)->inp_flags & IN6P_AUTOFLOWLABEL)
+ sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
+ } else
+#endif
+ {
+ sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl;
+ sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos;
+ }
+
+ /* Additional parameters that were encoded in the timestamp. */
+ if (data) {
+ sc->sc_flags |= SCF_TIMESTAMP;
+ sc->sc_tsreflect = to->to_tsval;
+ sc->sc_ts = to->to_tsecr;
+ sc->sc_tsoff = to->to_tsecr - ticks;
+ sc->sc_flags |= (data & 0x1) ? SCF_SIGNATURE : 0;
+ sc->sc_flags |= ((data >> 1) & 0x1) ? SCF_SACK : 0;
+ sc->sc_requested_s_scale = min((data >> 2) & 0xf,
+ TCP_MAX_WINSHIFT);
+ sc->sc_requested_r_scale = min((data >> 6) & 0xf,
+ TCP_MAX_WINSHIFT);
+ if (sc->sc_requested_s_scale || sc->sc_requested_r_scale)
+ sc->sc_flags |= SCF_WINSCALE;
+ } else
+ sc->sc_flags |= SCF_NOOPT;
+
+ wnd = sbspace(&so->so_rcv);
+ wnd = imax(wnd, 0);
+ wnd = imin(wnd, TCP_MAXWIN);
+ sc->sc_wnd = wnd;
+
+ sc->sc_rxmits = 0;
+ sc->sc_peer_mss = tcp_sc_msstab[mss];
+
+ TCPSTAT_INC(tcps_sc_recvcookie);
+ return (sc);
+}
+
+/*
+ * Returns the current number of syncache entries. This number
+ * will probably change before you get around to calling
+ * syncache_pcblist.
+ */
+
+int
+syncache_pcbcount(void)
+{
+ struct syncache_head *sch;
+ int count, i;
+
+ for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) {
+ /* No need to lock for a read. */
+ sch = &V_tcp_syncache.hashbase[i];
+ count += sch->sch_length;
+ }
+ return count;
+}
+
+/*
+ * Exports the syncache entries to userland so that netstat can display
+ * them alongside the other sockets. This function is intended to be
+ * called only from tcp_pcblist.
+ *
+ * Due to concurrency on an active system, the number of pcbs exported
+ * may have no relation to max_pcbs. max_pcbs merely indicates the
+ * amount of space the caller allocated for this function to use.
+ */
+int
+syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported)
+{
+ struct xtcpcb xt;
+ struct syncache *sc;
+ struct syncache_head *sch;
+ int count, error, i;
+
+ for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) {
+ sch = &V_tcp_syncache.hashbase[i];
+ SCH_LOCK(sch);
+ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
+ if (count >= max_pcbs) {
+ SCH_UNLOCK(sch);
+ goto exit;
+ }
+ if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0)
+ continue;
+ bzero(&xt, sizeof(xt));
+ xt.xt_len = sizeof(xt);
+ if (sc->sc_inc.inc_flags & INC_ISIPV6)
+ xt.xt_inp.inp_vflag = INP_IPV6;
+ else
+ xt.xt_inp.inp_vflag = INP_IPV4;
+ bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo));
+ xt.xt_tp.t_inpcb = &xt.xt_inp;
+ xt.xt_tp.t_state = TCPS_SYN_RECEIVED;
+ xt.xt_socket.xso_protocol = IPPROTO_TCP;
+ xt.xt_socket.xso_len = sizeof (struct xsocket);
+ xt.xt_socket.so_type = SOCK_STREAM;
+ xt.xt_socket.so_state = SS_ISCONNECTING;
+ error = SYSCTL_OUT(req, &xt, sizeof xt);
+ if (error) {
+ SCH_UNLOCK(sch);
+ goto exit;
+ }
+ count++;
+ }
+ SCH_UNLOCK(sch);
+ }
+exit:
+ *pcbs_exported = count;
+ return error;
+}
diff --git a/freebsd/sys/netinet/tcp_syncache.h b/freebsd/sys/netinet/tcp_syncache.h
new file mode 100644
index 00000000..96ba1535
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_syncache.h
@@ -0,0 +1,127 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_SYNCACHE_HH_
+#define _NETINET_TCP_SYNCACHE_HH_
+#ifdef _KERNEL
+
+struct toeopt;
+
+void syncache_init(void);
+#ifdef VIMAGE
+void syncache_destroy(void);
+#endif
+void syncache_unreach(struct in_conninfo *, struct tcphdr *);
+int syncache_expand(struct in_conninfo *, struct tcpopt *,
+ struct tcphdr *, struct socket **, struct mbuf *);
+int tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo,
+ struct tcphdr *th, struct socket **lsop, struct mbuf *m);
+void syncache_add(struct in_conninfo *, struct tcpopt *,
+ struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *);
+void tcp_offload_syncache_add(struct in_conninfo *, struct toeopt *,
+ struct tcphdr *, struct inpcb *, struct socket **,
+ struct toe_usrreqs *tu, void *toepcb);
+
+void syncache_chkrst(struct in_conninfo *, struct tcphdr *);
+void syncache_badack(struct in_conninfo *);
+int syncache_pcbcount(void);
+int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported);
+
+struct syncache {
+ TAILQ_ENTRY(syncache) sc_hash;
+ struct in_conninfo sc_inc; /* addresses */
+ int sc_rxttime; /* retransmit time */
+ u_int16_t sc_rxmits; /* retransmit counter */
+ u_int32_t sc_tsreflect; /* timestamp to reflect */
+ u_int32_t sc_ts; /* our timestamp to send */
+ u_int32_t sc_tsoff; /* ts offset w/ syncookies */
+ u_int32_t sc_flowlabel; /* IPv6 flowlabel */
+ tcp_seq sc_irs; /* seq from peer */
+ tcp_seq sc_iss; /* our ISS */
+ struct mbuf *sc_ipopts; /* source route */
+ u_int16_t sc_peer_mss; /* peer's MSS */
+ u_int16_t sc_wnd; /* advertised window */
+ u_int8_t sc_ip_ttl; /* IPv4 TTL */
+ u_int8_t sc_ip_tos; /* IPv4 TOS */
+ u_int8_t sc_requested_s_scale:4,
+ sc_requested_r_scale:4;
+ u_int16_t sc_flags;
+#ifndef TCP_OFFLOAD_DISABLE
+ struct toe_usrreqs *sc_tu; /* TOE operations */
+ void *sc_toepcb; /* TOE protocol block */
+#endif
+ struct label *sc_label; /* MAC label reference */
+ struct ucred *sc_cred; /* cred cache for jail checks */
+};
+
+/*
+ * Flags for the sc_flags field.
+ */
+#define SCF_NOOPT 0x01 /* no TCP options */
+#define SCF_WINSCALE 0x02 /* negotiated window scaling */
+#define SCF_TIMESTAMP 0x04 /* negotiated timestamps */
+ /* MSS is implicit */
+#define SCF_UNREACH 0x10 /* icmp unreachable received */
+#define SCF_SIGNATURE 0x20 /* send MD5 digests */
+#define SCF_SACK 0x80 /* send SACK option */
+#define SCF_ECN 0x100 /* send ECN setup packet */
+
+#define SYNCOOKIE_SECRET_SIZE 8 /* dwords */
+#define SYNCOOKIE_LIFETIME 16 /* seconds */
+
+struct syncache_head {
+ struct vnet *sch_vnet;
+ struct mtx sch_mtx;
+ TAILQ_HEAD(sch_head, syncache) sch_bucket;
+ struct callout sch_timer;
+ int sch_nextc;
+ u_int sch_length;
+ u_int sch_oddeven;
+ u_int32_t sch_secbits_odd[SYNCOOKIE_SECRET_SIZE];
+ u_int32_t sch_secbits_even[SYNCOOKIE_SECRET_SIZE];
+ u_int sch_reseed; /* time_uptime, seconds */
+};
+
+struct tcp_syncache {
+ struct syncache_head *hashbase;
+ uma_zone_t zone;
+ u_int hashsize;
+ u_int hashmask;
+ u_int bucket_limit;
+ u_int cache_count; /* XXX: unprotected */
+ u_int cache_limit;
+ u_int rexmt_limit;
+ u_int hash_secret;
+};
+
+#endif /* _KERNEL */
+#endif /* !_NETINET_TCP_SYNCACHE_HH_ */
diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c
new file mode 100644
index 00000000..36e2bec2
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_timer.c
@@ -0,0 +1,660 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_tcpdebug.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/mutex.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/systm.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/in_systm.h>
+#ifdef INET6
+#include <freebsd/netinet6/in6_pcb.h>
+#endif
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_fsm.h>
+#include <freebsd/netinet/tcp_timer.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <freebsd/netinet/tcp_debug.h>
+#endif
+
+int tcp_keepinit;
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
+
+int tcp_keepidle;
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");
+
+int tcp_keepintvl;
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");
+
+int tcp_delacktime;
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
+ "Time before a delayed ACK is sent");
+
+int tcp_msl;
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
+
+int tcp_rexmit_min;
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
+ "Minimum Retransmission Timeout");
+
+int tcp_rexmit_slop;
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
+ "Retransmission Timer Slop");
+
+static int always_keepalive = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
+ &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
+
+int tcp_fast_finwait2_recycle = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
+ &tcp_fast_finwait2_recycle, 0,
+ "Recycle closed FIN_WAIT_2 connections faster");
+
+int tcp_finwait2_timeout;
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
+
+
+static int tcp_keepcnt = TCPTV_KEEPCNT;
+ /* max idle probes */
+int tcp_maxpersistidle;
+ /* max idle time in persist */
+int tcp_maxidle;
+
+/*
+ * TCP protocol timeout routine called every 500 ms.
+ * Updates the timestamps used by TCP and causes finite state machine
+ * actions if timers expire.
+ */
+void
+tcp_slowtimo(void)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ VNET_LIST_RLOCK_NOSLEEP();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
+ INP_INFO_WLOCK(&V_tcbinfo);
+ (void) tcp_tw_2msl_scan(0);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK_NOSLEEP();
+}
+
+int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
+ { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
+
+int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
+ { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
+
+static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
+
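+/*
+ * Worked example (illustrative): with a smoothed RTO of 1 second,
+ * tcp_backoff[] yields retransmit intervals of roughly 1, 2, 4, 8, 16,
+ * 32, 64, ... seconds; TCPT_RANGESET() clamps each value to at most
+ * TCPTV_REXMTMAX (64 seconds), so later intervals stay capped there until
+ * TCP_MAXRXTSHIFT (12) retransmits have failed and the connection is
+ * dropped in tcp_timer_rexmt() below.
+ */
+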
+static int tcp_timer_race;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race,
+ 0, "Count of t_inpcb races on tcp_discardcb");
+
+/*
+ * TCP timer processing.
+ */
+
+void
+tcp_timer_delack(void *xtp)
+{
+ struct tcpcb *tp = xtp;
+ struct inpcb *inp;
+ CURVNET_SET(tp->t_vnet);
+
+ inp = tp->t_inpcb;
+ /*
+ * XXXRW: While this assert is in fact correct, bugs in the tcpcb
+ * tear-down mean we need it as a work-around for races between
+ * timers and tcp_discardcb().
+ *
+ * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL"));
+ */
+ if (inp == NULL) {
+ tcp_timer_race++;
+ CURVNET_RESTORE();
+ return;
+ }
+ INP_WLOCK(inp);
+ if ((inp->inp_flags & INP_DROPPED) || callout_pending(&tp->t_timers->tt_delack)
+ || !callout_active(&tp->t_timers->tt_delack)) {
+ INP_WUNLOCK(inp);
+ CURVNET_RESTORE();
+ return;
+ }
+ callout_deactivate(&tp->t_timers->tt_delack);
+
+ tp->t_flags |= TF_ACKNOW;
+ TCPSTAT_INC(tcps_delack);
+ (void) tcp_output(tp);
+ INP_WUNLOCK(inp);
+ CURVNET_RESTORE();
+}
+
+void
+tcp_timer_2msl(void *xtp)
+{
+ struct tcpcb *tp = xtp;
+ struct inpcb *inp;
+ CURVNET_SET(tp->t_vnet);
+#ifdef TCPDEBUG
+ int ostate;
+
+ ostate = tp->t_state;
+#endif
+ /*
+ * XXXRW: Does this actually happen?
+ */
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = tp->t_inpcb;
+ /*
+ * XXXRW: While this assert is in fact correct, bugs in the tcpcb
+ * tear-down mean we need it as a work-around for races between
+ * timers and tcp_discardcb().
+ *
+ * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL"));
+ */
+ if (inp == NULL) {
+ tcp_timer_race++;
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
+ INP_WLOCK(inp);
+ tcp_free_sackholes(tp);
+ if ((inp->inp_flags & INP_DROPPED) || callout_pending(&tp->t_timers->tt_2msl) ||
+ !callout_active(&tp->t_timers->tt_2msl)) {
+ INP_WUNLOCK(tp->t_inpcb);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
+ callout_deactivate(&tp->t_timers->tt_2msl);
+ /*
+ * 2 MSL timeout in shutdown went off. If we're closed but
+ * still waiting for peer to close and connection has been idle
+ * too long, or if 2MSL time is up from TIME_WAIT, delete connection
+ * control block. Otherwise, check again in a bit.
+ *
+ * If fast recycling of FIN_WAIT_2 connections is enabled, we are in
+ * FIN_WAIT_2, and the receiver has closed, there's no point in hanging
+ * onto the FIN_WAIT_2 socket. Just close it, and ignore the fact that
+ * there were recent incoming segments.
+ */
+ if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
+ tp->t_inpcb && tp->t_inpcb->inp_socket &&
+ (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
+ TCPSTAT_INC(tcps_finwait2_drops);
+ tp = tcp_close(tp);
+ } else {
+ if (tp->t_state != TCPS_TIME_WAIT &&
+ ticks - tp->t_rcvtime <= tcp_maxidle)
+ callout_reset(&tp->t_timers->tt_2msl, tcp_keepintvl,
+ tcp_timer_2msl, tp);
+ else
+ tp = tcp_close(tp);
+ }
+
+#ifdef TCPDEBUG
+ if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
+ PRU_SLOWTIMO);
+#endif
+ if (tp != NULL)
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+}
+
+void
+tcp_timer_keep(void *xtp)
+{
+ struct tcpcb *tp = xtp;
+ struct tcptemp *t_template;
+ struct inpcb *inp;
+ CURVNET_SET(tp->t_vnet);
+#ifdef TCPDEBUG
+ int ostate;
+
+ ostate = tp->t_state;
+#endif
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = tp->t_inpcb;
+ /*
+ * XXXRW: While this assert is in fact correct, bugs in the tcpcb
+ * tear-down mean we need it as a work-around for races between
+ * timers and tcp_discardcb().
+ *
+ * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL"));
+ */
+ if (inp == NULL) {
+ tcp_timer_race++;
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
+ INP_WLOCK(inp);
+ if ((inp->inp_flags & INP_DROPPED) || callout_pending(&tp->t_timers->tt_keep)
+ || !callout_active(&tp->t_timers->tt_keep)) {
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
+ callout_deactivate(&tp->t_timers->tt_keep);
+ /*
+ * Keep-alive timer went off; send something
+ * or drop connection if idle for too long.
+ */
+ TCPSTAT_INC(tcps_keeptimeo);
+ if (tp->t_state < TCPS_ESTABLISHED)
+ goto dropit;
+ if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
+ tp->t_state <= TCPS_CLOSING) {
+ if (ticks - tp->t_rcvtime >= tcp_keepidle + tcp_maxidle)
+ goto dropit;
+ /*
+ * Send a packet designed to force a response
+ * if the peer is up and reachable:
+ * either an ACK if the connection is still alive,
+ * or an RST if the peer has closed the connection
+ * due to timeout or reboot.
+ * Using sequence number tp->snd_una-1
+ * causes the transmitted zero-length segment
+ * to lie outside the receive window;
+ * by the protocol spec, this requires the
+ * correspondent TCP to respond.
+ */
+ TCPSTAT_INC(tcps_keepprobe);
+ t_template = tcpip_maketemplate(inp);
+ if (t_template) {
+ tcp_respond(tp, t_template->tt_ipgen,
+ &t_template->tt_t, (struct mbuf *)NULL,
+ tp->rcv_nxt, tp->snd_una - 1, 0);
+ free(t_template, M_TEMP);
+ }
+ callout_reset(&tp->t_timers->tt_keep, tcp_keepintvl, tcp_timer_keep, tp);
+ } else
+ callout_reset(&tp->t_timers->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
+
+#ifdef TCPDEBUG
+ if (inp->inp_socket->so_options & SO_DEBUG)
+ tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
+ PRU_SLOWTIMO);
+#endif
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+
+dropit:
+ TCPSTAT_INC(tcps_keepdrops);
+ tp = tcp_drop(tp, ETIMEDOUT);
+
+#ifdef TCPDEBUG
+ if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
+ PRU_SLOWTIMO);
+#endif
+ if (tp != NULL)
+ INP_WUNLOCK(tp->t_inpcb);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+}
+
+void
+tcp_timer_persist(void *xtp)
+{
+ struct tcpcb *tp = xtp;
+ struct inpcb *inp;
+ CURVNET_SET(tp->t_vnet);
+#ifdef TCPDEBUG
+ int ostate;
+
+ ostate = tp->t_state;
+#endif
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = tp->t_inpcb;
+ /*
+ * XXXRW: While this assert is in fact correct, bugs in the tcpcb
+ * tear-down mean we need it as a work-around for races between
+ * timers and tcp_discardcb().
+ *
+ * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
+ */
+ if (inp == NULL) {
+ tcp_timer_race++;
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
+ INP_WLOCK(inp);
+ if ((inp->inp_flags & INP_DROPPED) || callout_pending(&tp->t_timers->tt_persist)
+ || !callout_active(&tp->t_timers->tt_persist)) {
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
+ callout_deactivate(&tp->t_timers->tt_persist);
+ /*
+ * Persistence timer into zero window.
+ * Force a byte to be output, if possible.
+ */
+ TCPSTAT_INC(tcps_persisttimeo);
+ /*
+ * Hack: if the peer is dead/unreachable, we do not
+ * time out if the window is closed. After a full
+ * backoff, drop the connection if the idle time
+ * (no responses to probes) reaches the maximum
+ * backoff that we would use if retransmitting.
+ */
+ if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
+ (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
+ ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
+ TCPSTAT_INC(tcps_persistdrop);
+ tp = tcp_drop(tp, ETIMEDOUT);
+ goto out;
+ }
+ tcp_setpersist(tp);
+ tp->t_flags |= TF_FORCEDATA;
+ (void) tcp_output(tp);
+ tp->t_flags &= ~TF_FORCEDATA;
+
+out:
+#ifdef TCPDEBUG
+ if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
+ tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
+#endif
+ if (tp != NULL)
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+}
+
+void
+tcp_timer_rexmt(void * xtp)
+{
+ struct tcpcb *tp = xtp;
+ CURVNET_SET(tp->t_vnet);
+ int rexmt;
+ int headlocked;
+ struct inpcb *inp;
+#ifdef TCPDEBUG
+ int ostate;
+
+ ostate = tp->t_state;
+#endif
+ INP_INFO_WLOCK(&V_tcbinfo);
+ headlocked = 1;
+ inp = tp->t_inpcb;
+ /*
+ * XXXRW: While this assert is in fact correct, bugs in the tcpcb
+ * tear-down mean we need it as a work-around for races between
+ * timers and tcp_discardcb().
+ *
+ * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL"));
+ */
+ if (inp == NULL) {
+ tcp_timer_race++;
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
+ INP_WLOCK(inp);
+ if ((inp->inp_flags & INP_DROPPED) || callout_pending(&tp->t_timers->tt_rexmt)
+ || !callout_active(&tp->t_timers->tt_rexmt)) {
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
+ callout_deactivate(&tp->t_timers->tt_rexmt);
+ tcp_free_sackholes(tp);
+ /*
+ * Retransmission timer went off. Message has not
+ * been acked within retransmit interval. Back off
+ * to a longer retransmit interval and retransmit one segment.
+ */
+ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
+ tp->t_rxtshift = TCP_MAXRXTSHIFT;
+ TCPSTAT_INC(tcps_timeoutdrop);
+ tp = tcp_drop(tp, tp->t_softerror ?
+ tp->t_softerror : ETIMEDOUT);
+ goto out;
+ }
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ headlocked = 0;
+ if (tp->t_rxtshift == 1) {
+ /*
+ * first retransmit; record ssthresh and cwnd so they can
+ * be recovered if this turns out to be a "bad" retransmit.
+ * A retransmit is considered "bad" if an ACK for this
+ * segment is received within RTT/2 interval; the assumption
+ * here is that the ACK was already in flight. See
+ * "On Estimating End-to-End Network Path Properties" by
+ * Allman and Paxson for more details.
+ */
+ tp->snd_cwnd_prev = tp->snd_cwnd;
+ tp->snd_ssthresh_prev = tp->snd_ssthresh;
+ tp->snd_recover_prev = tp->snd_recover;
+ if (IN_FASTRECOVERY(tp))
+ tp->t_flags |= TF_WASFRECOVERY;
+ else
+ tp->t_flags &= ~TF_WASFRECOVERY;
+ tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+ }
+ TCPSTAT_INC(tcps_rexmttimeo);
+ if (tp->t_state == TCPS_SYN_SENT)
+ rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
+ else
+ rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
+ TCPT_RANGESET(tp->t_rxtcur, rexmt,
+ tp->t_rttmin, TCPTV_REXMTMAX);
+ /*
+ * Disable rfc1323 if we haven't got any response to
+ * our third SYN to work around some broken terminal servers
+ * (most of which have hopefully been retired) that have bad VJ
+ * header compression code which trashes TCP segments containing
+ * unknown-to-them TCP options.
+ */
+ if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3))
+ tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP);
+ /*
+ * If we backed off this far, our srtt estimate is probably bogus.
+ * Clobber it so we'll take the next rtt measurement as our srtt;
+ * move the current srtt into rttvar to keep the current
+ * retransmit times until then.
+ */
+ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
+#ifdef INET6
+ if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
+ in6_losing(tp->t_inpcb);
+ else
+#endif
+ tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
+ tp->t_srtt = 0;
+ }
+ tp->snd_nxt = tp->snd_una;
+ tp->snd_recover = tp->snd_max;
+ /*
+ * Force a segment to be sent.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ /*
+ * If timing a segment in this window, stop the timer.
+ */
+ tp->t_rtttime = 0;
+ /*
+ * Close the congestion window down to one segment
+ * (we'll open it by one segment for each ack we get).
+ * Since we probably have a window's worth of unacked
+ * data accumulated, this "slow start" keeps us from
+ * dumping all that data as back-to-back packets (which
+ * might overwhelm an intermediate gateway).
+ *
+ * There are two phases to the opening: Initially we
+ * open by one mss on each ack. This makes the window
+ * size increase exponentially with time. If the
+ * window is larger than the path can handle, this
+ * exponential growth results in dropped packet(s)
+ * almost immediately. To get more time between
+ * drops but still "push" the network to take advantage
+ * of improving conditions, we switch from exponential
+ * to linear window opening at some threshold size.
+ * For a threshold, we use half the current window
+ * size, truncated to a multiple of the mss.
+ *
+ * (the minimum cwnd that will give us exponential
+ * growth is 2 mss. We don't allow the threshold
+ * to go below this.)
+ */
+ {
+ u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+ if (win < 2)
+ win = 2;
+ tp->snd_cwnd = tp->t_maxseg;
+ tp->snd_ssthresh = win * tp->t_maxseg;
+ tp->t_dupacks = 0;
+ }
+ EXIT_FASTRECOVERY(tp);
+ tp->t_bytes_acked = 0;
+ (void) tcp_output(tp);
+
+out:
+#ifdef TCPDEBUG
+ if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
+ PRU_SLOWTIMO);
+#endif
+ if (tp != NULL)
+ INP_WUNLOCK(inp);
+ if (headlocked)
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+}
+
+void
+tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
+{
+ struct callout *t_callout;
+ void *f_callout;
+
+ switch (timer_type) {
+ case TT_DELACK:
+ t_callout = &tp->t_timers->tt_delack;
+ f_callout = tcp_timer_delack;
+ break;
+ case TT_REXMT:
+ t_callout = &tp->t_timers->tt_rexmt;
+ f_callout = tcp_timer_rexmt;
+ break;
+ case TT_PERSIST:
+ t_callout = &tp->t_timers->tt_persist;
+ f_callout = tcp_timer_persist;
+ break;
+ case TT_KEEP:
+ t_callout = &tp->t_timers->tt_keep;
+ f_callout = tcp_timer_keep;
+ break;
+ case TT_2MSL:
+ t_callout = &tp->t_timers->tt_2msl;
+ f_callout = tcp_timer_2msl;
+ break;
+ default:
+ panic("bad timer_type");
+ }
+ if (delta == 0) {
+ callout_stop(t_callout);
+ } else {
+ callout_reset(t_callout, delta, f_callout, tp);
+ }
+}
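+
+/*
+ * Usage sketch (illustrative only): callers arm, rearm or stop a timer
+ * through tcp_timer_activate(); for example, the syncache code arms the
+ * connection-establishment keepalive with
+ *
+ *	tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
+ *
+ * while a delta of 0 stops the corresponding callout, e.g.
+ *
+ *	tcp_timer_activate(tp, TT_REXMT, 0);
+ */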
+
+int
+tcp_timer_active(struct tcpcb *tp, int timer_type)
+{
+ struct callout *t_callout;
+
+ switch (timer_type) {
+ case TT_DELACK:
+ t_callout = &tp->t_timers->tt_delack;
+ break;
+ case TT_REXMT:
+ t_callout = &tp->t_timers->tt_rexmt;
+ break;
+ case TT_PERSIST:
+ t_callout = &tp->t_timers->tt_persist;
+ break;
+ case TT_KEEP:
+ t_callout = &tp->t_timers->tt_keep;
+ break;
+ case TT_2MSL:
+ t_callout = &tp->t_timers->tt_2msl;
+ break;
+ default:
+ panic("bad timer_type");
+ }
+ return callout_active(t_callout);
+}
diff --git a/freebsd/sys/netinet/tcp_timer.h b/freebsd/sys/netinet/tcp_timer.h
new file mode 100644
index 00000000..1514a293
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_timer.h
@@ -0,0 +1,183 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_TIMER_HH_
+#define _NETINET_TCP_TIMER_HH_
+
+/*
+ * The TCPT_REXMT timer is used to force retransmissions.
+ * The TCP has the TCPT_REXMT timer set whenever segments
+ * have been sent for which ACKs are expected but not yet
+ * received. If an ACK is received which advances tp->snd_una,
+ * then the retransmit timer is cleared (if there are no more
+ * outstanding segments) or reset to the base value (if there
+ * are more ACKs expected). Whenever the retransmit timer goes off,
+ * we retransmit one unacknowledged segment, and do a backoff
+ * on the retransmit timer.
+ *
+ * The TCPT_PERSIST timer is used to keep window size information
+ * flowing even if the window goes shut. If all previous transmissions
+ * have been acknowledged (so that there are no retransmissions in progress),
+ * and the window is too small to bother sending anything, then we start
+ * the TCPT_PERSIST timer. When it expires, if the window is nonzero,
+ * we go to transmit state. Otherwise, at intervals send a single byte
+ * into the peer's window to force him to update our window information.
+ * We do this at most as often as TCPT_PERSMIN time intervals,
+ * but no more frequently than the current estimate of round-trip
+ * packet time. The TCPT_PERSIST timer is cleared whenever we receive
+ * a window update from the peer.
+ *
+ * The TCPT_KEEP timer is used to keep connections alive. If a
+ * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time,
+ * but not yet established, then we drop the connection. Once the connection
+ * is established, if the connection is idle for TCPTV_KEEP_IDLE time
+ * (and keepalives have been enabled on the socket), we begin to probe
+ * the connection. We force the peer to send us a segment by sending:
+ * <SEQ=SND.UNA-1><ACK=RCV.NXT><CTL=ACK>
+ * This segment is (deliberately) outside the window, and should elicit
+ * an ack segment in response from the peer. If, despite the TCPT_KEEP
+ * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE
+ * amount of time probing, then we drop the connection.
+ */
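
Editor's note: the retransmit backoff described above can be made concrete with a small
user-space sketch. This is an illustration, not code from this tree: it assumes hz = 1000,
a fixed 3-second base RTO (TCPTV_RTOBASE) instead of the live smoothed RTO, and a backoff
table mirroring the values commonly used for tcp_backoff[]; the clamping follows the
TCPT_RANGESET() bounds defined further down in this header.

/* Illustrative sketch only: prints a simplified retransmit schedule. */
#include <stdio.h>

#define HZ		1000		/* assumed ticks per second */
#define REXMT_MIN	(HZ / 33)	/* cf. TCPTV_MIN */
#define REXMT_SLOP	(HZ / 5)	/* cf. TCPTV_CPU_VAR */
#define REXMT_MAX	(64 * HZ)	/* cf. TCPTV_REXMTMAX */
#define MAXRXTSHIFT	12		/* cf. TCP_MAXRXTSHIFT */

static const int backoff[MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };	/* assumed table */

int
main(void)
{
	int base = 3 * HZ;	/* assumed base RTO, cf. TCPTV_RTOBASE */
	int shift, rexmt;

	for (shift = 0; shift <= MAXRXTSHIFT; shift++) {
		/* back off, add the slop, then clamp to [min, max] */
		rexmt = base * backoff[shift] + REXMT_SLOP;
		if (rexmt < REXMT_MIN)
			rexmt = REXMT_MIN;
		if (rexmt > REXMT_MAX)
			rexmt = REXMT_MAX;
		printf("retransmit %2d: timer = %d ticks (%.1f s)\n",
		    shift, rexmt, (double)rexmt / HZ);
	}
	return (0);
}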
+
+/*
+ * Time constants.
+ */
+#define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */
+#define TCPTV_SRTTBASE 0 /* base roundtrip time;
+ if 0, no idea yet */
+#define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */
+#define TCPTV_SRTTDFLT ( 3*hz) /* assumed RTT if no info */
+
+#define TCPTV_PERSMIN ( 5*hz) /* retransmit persistence */
+#define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */
+
+#define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */
+#define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */
+#define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */
+#define TCPTV_KEEPCNT 8 /* max probes before drop */
+
+#define TCPTV_INFLIGHT_RTTTHRESH (10*hz/1000) /* below which inflight
+ disengages, in msec */
+
+#define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */
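
Editor's note: taken together, the keepalive constants above imply a timeline that is easy to
miscount. A minimal sketch, assuming hz = 1000 and that the drop threshold is derived as
keepcnt * keepintvl (how the timer code elsewhere in this tree computes tcp_maxidle); the
variable names are local to the example.

#include <stdio.h>

int
main(void)
{
	const int hz = 1000;			/* assumed */
	const int keep_idle = 120 * 60 * hz;	/* cf. TCPTV_KEEP_IDLE */
	const int keepintvl = 75 * hz;		/* cf. TCPTV_KEEPINTVL */
	const int keepcnt = 8;			/* cf. TCPTV_KEEPCNT */
	const int maxidle = keepcnt * keepintvl;	/* assumed derivation */

	printf("idle before first probe: %d s\n", keep_idle / hz);
	printf("probe interval:          %d s\n", keepintvl / hz);
	printf("drop after probing for:  %d s\n", maxidle / hz);
	printf("worst-case total idle:   %d s\n", (keep_idle + maxidle) / hz);
	return (0);
}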
+
+/*
+ * Minimum retransmit timer is 3 ticks, for algorithmic stability.
+ * TCPT_RANGESET() will add another TCPTV_CPU_VAR to deal with
+ * the expected worst-case processing variances by the kernels
+ * representing the end points. Such variances do not always show
+ * up in the srtt because the timestamp is often calculated at
+ * the interface rather than at the TCP layer. This value is
+ * typically 50ms. However, it is also possible that delayed
+ * acks (typically 100ms) could create issues so we set the slop
+ * to 200ms to try to cover it. Note that, properly speaking,
+ * delayed-acks should not create a major issue for interactive
+ * environments which 'P'ush the last segment, at least as
+ * long as implementations do the required 'at least one ack
+ * for every two packets' for the non-interactive streaming case.
+ * (maybe the RTO calculation should use 2*RTT instead of RTT
+ * to handle the ack-every-other-packet case).
+ *
+ * The prior minimum of 1*hz (1 second) badly breaks throughput on any
+ * network faster than a modem that has minor (e.g. 1%) packet loss.
+ */
+#define TCPTV_MIN ( hz/33 ) /* minimum allowable value */
+#define TCPTV_CPU_VAR ( hz/5 ) /* cpu variance allowed (200ms) */
+#define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */
+
+#define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */
+
+#define TCP_LINGERTIME 120 /* linger at most 2 minutes */
+
+#define TCP_MAXRXTSHIFT 12 /* maximum retransmits */
+
+#define TCPTV_DELACK (hz / PR_FASTHZ / 2) /* 100ms timeout */
+
+#ifdef TCPTIMERS
+static const char *tcptimers[] =
+ { "REXMT", "PERSIST", "KEEP", "2MSL" };
+#endif
+
+/*
+ * Force a time value to be in a certain range.
+ */
+#define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \
+ (tv) = (value) + tcp_rexmit_slop; \
+ if ((u_long)(tv) < (u_long)(tvmin)) \
+ (tv) = (tvmin); \
+ if ((u_long)(tv) > (u_long)(tvmax)) \
+ (tv) = (tvmax); \
+} while(0)
+
+#ifdef _KERNEL
+
+struct tcp_timer {
+ struct callout tt_rexmt; /* retransmit timer */
+ struct callout tt_persist; /* retransmit persistence */
+ struct callout tt_keep; /* keepalive */
+ struct callout tt_2msl; /* 2*msl TIME_WAIT timer */
+ struct callout tt_delack; /* delayed ACK timer */
+};
+#define TT_DELACK 0x01
+#define TT_REXMT 0x02
+#define TT_PERSIST 0x04
+#define TT_KEEP 0x08
+#define TT_2MSL 0x10
+
+extern int tcp_keepinit; /* time to establish connection */
+extern int tcp_keepidle; /* time before keepalive probes begin */
+extern int tcp_keepintvl; /* time between keepalive probes */
+extern int tcp_maxidle; /* time to drop after starting probes */
+extern int tcp_delacktime; /* time before sending a delayed ACK */
+extern int tcp_maxpersistidle;
+extern int tcp_rexmit_min;
+extern int tcp_rexmit_slop;
+extern int tcp_msl;
+extern int tcp_ttl; /* time to live for TCP segs */
+extern int tcp_backoff[];
+
+extern int tcp_finwait2_timeout;
+extern int tcp_fast_finwait2_recycle;
+
+void tcp_timer_init(void);
+void tcp_timer_2msl(void *xtp);
+struct tcptw *
+ tcp_tw_2msl_scan(int _reuse); /* XXX temporary */
+void tcp_timer_keep(void *xtp);
+void tcp_timer_persist(void *xtp);
+void tcp_timer_rexmt(void *xtp);
+void tcp_timer_delack(void *xtp);
+
+#endif /* _KERNEL */
+
+#endif /* !_NETINET_TCP_TIMER_HH_ */
diff --git a/freebsd/sys/netinet/tcp_timewait.c b/freebsd/sys/netinet/tcp_timewait.c
new file mode 100644
index 00000000..92643d0a
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_timewait.c
@@ -0,0 +1,618 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_tcpdebug.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/callout.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/random.h>
+
+#include <freebsd/vm/uma.h>
+
+#include <freebsd/net/route.h>
+#include <freebsd/net/if.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/ip.h>
+#ifdef INET6
+#include <freebsd/netinet/ip6.h>
+#endif
+#include <freebsd/netinet/in_pcb.h>
+#ifdef INET6
+#include <freebsd/netinet6/in6_pcb.h>
+#endif
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip_var.h>
+#ifdef INET6
+#include <freebsd/netinet6/ip6_var.h>
+#include <freebsd/netinet6/scope6_var.h>
+#include <freebsd/netinet6/nd6.h>
+#endif
+#include <freebsd/netinet/ip_icmp.h>
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_fsm.h>
+#include <freebsd/netinet/tcp_seq.h>
+#include <freebsd/netinet/tcp_timer.h>
+#include <freebsd/netinet/tcp_var.h>
+#ifdef INET6
+#include <freebsd/netinet6/tcp6_var.h>
+#endif
+#include <freebsd/netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <freebsd/netinet/tcp_debug.h>
+#endif
+#include <freebsd/netinet6/ip6protosw.h>
+
+#include <freebsd/machine/in_cksum.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+static VNET_DEFINE(uma_zone_t, tcptw_zone);
+#define V_tcptw_zone VNET(tcptw_zone)
+static int maxtcptw;
+
+/*
+ * The timed wait queue contains references to each of the TCP sessions
+ * currently in the TIME_WAIT state. The queue pointers, including the
+ * queue pointers in each tcptw structure, are protected using the global
+ * tcbinfo lock, which must be held over queue iteration and modification.
+ */
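
Editor's note: the locking rule spelled out above (every traversal and every link update of the
queue happens with one write lock held) is a general discipline. A self-contained sketch of the
same shape, with a pthread mutex standing in for the tcbinfo lock and a trivial element type
standing in for struct tcptw; none of these names come from the kernel sources.

#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	int id;
	TAILQ_ENTRY(entry) link;
};

static TAILQ_HEAD(, entry) head = TAILQ_HEAD_INITIALIZER(head);
static pthread_mutex_t head_lock = PTHREAD_MUTEX_INITIALIZER;

static void
insert_tail(int id)
{
	struct entry *e = malloc(sizeof(*e));

	if (e == NULL)
		return;
	e->id = id;
	pthread_mutex_lock(&head_lock);		/* lock held for modification */
	TAILQ_INSERT_TAIL(&head, e, link);
	pthread_mutex_unlock(&head_lock);
}

int
main(void)
{
	struct entry *e;

	insert_tail(1);
	insert_tail(2);
	pthread_mutex_lock(&head_lock);		/* and held over iteration */
	TAILQ_FOREACH(e, &head, link)
		printf("entry %d\n", e->id);
	pthread_mutex_unlock(&head_lock);
	return (0);
}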
+static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
+#define V_twq_2msl VNET(twq_2msl)
+
+static void tcp_tw_2msl_reset(struct tcptw *, int);
+static void tcp_tw_2msl_stop(struct tcptw *);
+
+static int
+tcptw_auto_size(void)
+{
+ int halfrange;
+
+ /*
+ * Max out at half the ephemeral port range so that TIME_WAIT
+ * sockets don't tie up too many ephemeral ports.
+ */
+ if (V_ipport_lastauto > V_ipport_firstauto)
+ halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2;
+ else
+ halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2;
+ /* Protect against goofy port ranges smaller than 32. */
+ return (imin(imax(halfrange, 32), maxsockets / 5));
+}
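
Editor's note: for concreteness, the sizing rule of tcptw_auto_size() can be exercised in user
space. The port ranges and maxsockets value below are invented examples for illustration, not
system defaults.

#include <stdio.h>

/* Same rule as tcptw_auto_size(): half the ephemeral port range,
 * at least 32, at most maxsockets / 5. */
static int
auto_size(int first, int last, int maxsockets)
{
	int halfrange;

	halfrange = (last > first ? last - first : first - last) / 2;
	if (halfrange < 32)
		halfrange = 32;
	if (halfrange > maxsockets / 5)
		halfrange = maxsockets / 5;
	return (halfrange);
}

int
main(void)
{
	/* illustrative numbers only */
	printf("%d\n", auto_size(49152, 65535, 25600));	/* -> 5120 */
	printf("%d\n", auto_size(10000, 10020, 25600));	/* -> 32 */
	return (0);
}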
+
+static int
+sysctl_maxtcptw(SYSCTL_HANDLER_ARGS)
+{
+ int error, new;
+
+ if (maxtcptw == 0)
+ new = tcptw_auto_size();
+ else
+ new = maxtcptw;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error == 0 && req->newptr)
+ if (new >= 32) {
+ maxtcptw = new;
+ uma_zone_set_max(V_tcptw_zone, maxtcptw);
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW,
+ &maxtcptw, 0, sysctl_maxtcptw, "IU",
+ "Maximum number of compressed TCP TIME_WAIT entries");
+
+VNET_DEFINE(int, nolocaltimewait) = 0;
+#define V_nolocaltimewait VNET(nolocaltimewait)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_RW,
+ &VNET_NAME(nolocaltimewait), 0,
+ "Do not create compressed TCP TIME_WAIT entries for local connections");
+
+void
+tcp_tw_zone_change(void)
+{
+
+ if (maxtcptw == 0)
+ uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
+}
+
+void
+tcp_tw_init(void)
+{
+
+ V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw);
+ if (maxtcptw == 0)
+ uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
+ else
+ uma_zone_set_max(V_tcptw_zone, maxtcptw);
+ TAILQ_INIT(&V_twq_2msl);
+}
+
+#ifdef VIMAGE
+void
+tcp_tw_destroy(void)
+{
+ struct tcptw *tw;
+
+ INP_INFO_WLOCK(&V_tcbinfo);
+ while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
+ tcp_twclose(tw, 0);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+
+ uma_zdestroy(V_tcptw_zone);
+}
+#endif
+
+/*
+ * Move a TCP connection into TIME_WAIT state.
+ * tcbinfo is locked.
+ * inp is locked, and is unlocked before returning.
+ */
+void
+tcp_twstart(struct tcpcb *tp)
+{
+ struct tcptw *tw;
+ struct inpcb *inp = tp->t_inpcb;
+ int acknow;
+ struct socket *so;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */
+ INP_WLOCK_ASSERT(inp);
+
+ if (V_nolocaltimewait && in_localip(inp->inp_faddr)) {
+ tp = tcp_close(tp);
+ if (tp != NULL)
+ INP_WUNLOCK(inp);
+ return;
+ }
+
+ tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
+ if (tw == NULL) {
+ tw = tcp_tw_2msl_scan(1);
+ if (tw == NULL) {
+ tp = tcp_close(tp);
+ if (tp != NULL)
+ INP_WUNLOCK(inp);
+ return;
+ }
+ }
+ tw->tw_inpcb = inp;
+
+ /*
+ * Recover last window size sent.
+ */
+ tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;
+
+ /*
+ * Set t_recent if timestamps are used on the connection.
+ */
+ if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
+ (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
+ tw->t_recent = tp->ts_recent;
+ tw->ts_offset = tp->ts_offset;
+ } else {
+ tw->t_recent = 0;
+ tw->ts_offset = 0;
+ }
+
+ tw->snd_nxt = tp->snd_nxt;
+ tw->rcv_nxt = tp->rcv_nxt;
+ tw->iss = tp->iss;
+ tw->irs = tp->irs;
+ tw->t_starttime = tp->t_starttime;
+ tw->tw_time = 0;
+
+/* XXX
+ * If this code will
+ * be used for fin-wait-2 state also, then we may need
+ * a ts_recent from the last segment.
+ */
+ acknow = tp->t_flags & TF_ACKNOW;
+
+ /*
+ * First, discard tcpcb state, which includes stopping its timers and
+ * freeing it. tcp_discardcb() used to also release the inpcb, but
+ * that work is now done in the caller.
+ *
+ * Note: soisdisconnected() call used to be made in tcp_discardcb(),
+ * and might not be needed here any longer.
+ */
+ tcp_discardcb(tp);
+ so = inp->inp_socket;
+ soisdisconnected(so);
+ tw->tw_cred = crhold(so->so_cred);
+ SOCK_LOCK(so);
+ tw->tw_so_options = so->so_options;
+ SOCK_UNLOCK(so);
+ if (acknow)
+ tcp_twrespond(tw, TH_ACK);
+ inp->inp_ppcb = tw;
+ inp->inp_flags |= INP_TIMEWAIT;
+ tcp_tw_2msl_reset(tw, 0);
+
+ /*
+ * If the inpcb owns the sole reference to the socket, then we can
+ * detach and free the socket as it is not needed in time wait.
+ */
+ if (inp->inp_flags & INP_SOCKREF) {
+ KASSERT(so->so_state & SS_PROTOREF,
+ ("tcp_twstart: !SS_PROTOREF"));
+ inp->inp_flags &= ~INP_SOCKREF;
+ INP_WUNLOCK(inp);
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_PROTOREF;
+ sofree(so);
+ } else
+ INP_WUNLOCK(inp);
+}
+
+#if 0
+/*
+ * The approximate rate of ISN increase of Microsoft TCP stacks;
+ * the actual rate is slightly higher due to the addition of
+ * random positive increments.
+ *
+ * Most other new OSes use semi-randomized ISN values, so we
+ * do not need to worry about them.
+ */
+#define MS_ISN_BYTES_PER_SECOND 250000
+
+/*
+ * Determine if the ISN we will generate has advanced beyond the last
+ * sequence number used by the previous connection. If so, indicate
+ * that it is safe to recycle this tw socket by returning 1.
+ */
+int
+tcp_twrecycleable(struct tcptw *tw)
+{
+ tcp_seq new_iss = tw->iss;
+ tcp_seq new_irs = tw->irs;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz);
+ new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz);
+
+ if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt))
+ return (1);
+ else
+ return (0);
+}
+#endif
+
+/*
+ * Returns 1 if the TIME_WAIT state was killed and we should start over,
+ * looking for a pcb in the listen state. Returns 0 otherwise.
+ */
+int
+tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
+ struct mbuf *m, int tlen)
+{
+ struct tcptw *tw;
+ int thflags;
+ tcp_seq seq;
+
+ /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ /*
+ * XXXRW: Time wait state for inpcb has been recycled, but inpcb is
+ * still present. This is undesirable, but temporarily necessary
+ * until we work out how to handle inpcbs whose timewait state has
+ * been removed.
+ */
+ tw = intotw(inp);
+ if (tw == NULL)
+ goto drop;
+
+ thflags = th->th_flags;
+
+ /*
+ * NOTE: for FIN_WAIT_2 (to be added later),
+ * must validate sequence number before accepting RST
+ */
+
+ /*
+ * If the segment contains RST:
+ * Drop the segment - see Stevens, vol. 2, p. 964 and
+ * RFC 1337.
+ */
+ if (thflags & TH_RST)
+ goto drop;
+
+#if 0
+/* PAWS not needed at the moment */
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment
+ * and it's less than ts_recent, drop it.
+ */
+ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to.to_tsval, tp->ts_recent)) {
+ if ((thflags & TH_ACK) == 0)
+ goto drop;
+ goto ack;
+ }
+ /*
+ * ts_recent is never updated because we never accept new segments.
+ */
+#endif
+
+ /*
+ * If a new connection request is received
+ * while in TIME_WAIT, drop the old connection
+ * and start over if the sequence numbers
+ * are above the previous ones.
+ */
+ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
+ tcp_twclose(tw, 0);
+ return (1);
+ }
+
+ /*
+ * Drop the segment if it does not contain an ACK.
+ */
+ if ((thflags & TH_ACK) == 0)
+ goto drop;
+
+ /*
+ * Reset the 2MSL timer if this is a duplicate FIN.
+ */
+ if (thflags & TH_FIN) {
+ seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0);
+ if (seq + 1 == tw->rcv_nxt)
+ tcp_tw_2msl_reset(tw, 1);
+ }
+
+ /*
+ * Acknowledge the segment if it has data or is not a duplicate ACK.
+ */
+ if (thflags != TH_ACK || tlen != 0 ||
+ th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt)
+ tcp_twrespond(tw, TH_ACK);
+drop:
+ INP_WUNLOCK(inp);
+ m_freem(m);
+ return (0);
+}
+
+void
+tcp_twclose(struct tcptw *tw, int reuse)
+{
+ struct socket *so;
+ struct inpcb *inp;
+
+ /*
+ * At this point, we are in one of two situations:
+ *
+ * (1) We have no socket, just an inpcb<->twtcp pair. We can free
+ * all state.
+ *
+ * (2) We have a socket -- if we own a reference, release it and
+ * notify the socket layer.
+ */
+ inp = tw->tw_inpcb;
+ KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
+ KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */
+ INP_WLOCK_ASSERT(inp);
+
+ tw->tw_inpcb = NULL;
+ tcp_tw_2msl_stop(tw);
+ inp->inp_ppcb = NULL;
+ in_pcbdrop(inp);
+
+ so = inp->inp_socket;
+ if (so != NULL) {
+ /*
+ * If there's a socket, handle two cases: first, we own a
+ * strong reference, which we will now release, or we don't,
+ * in which case another reference exists (XXXRW: think
+ * about this more), and we don't need to take action.
+ */
+ if (inp->inp_flags & INP_SOCKREF) {
+ inp->inp_flags &= ~INP_SOCKREF;
+ INP_WUNLOCK(inp);
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ KASSERT(so->so_state & SS_PROTOREF,
+ ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF"));
+ so->so_state &= ~SS_PROTOREF;
+ sofree(so);
+ } else {
+ /*
+ * If we don't own the only reference, the socket and
+ * inpcb need to be left around to be handled by
+ * tcp_usr_detach() later.
+ */
+ INP_WUNLOCK(inp);
+ }
+ } else
+ in_pcbfree(inp);
+ TCPSTAT_INC(tcps_closed);
+ crfree(tw->tw_cred);
+ tw->tw_cred = NULL;
+ if (reuse)
+ return;
+ uma_zfree(V_tcptw_zone, tw);
+}
+
+int
+tcp_twrespond(struct tcptw *tw, int flags)
+{
+ struct inpcb *inp = tw->tw_inpcb;
+ struct tcphdr *th;
+ struct mbuf *m;
+ struct ip *ip = NULL;
+ u_int hdrlen, optlen;
+ int error;
+ struct tcpopt to;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+ int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
+#endif
+
+ INP_WLOCK_ASSERT(inp);
+
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
+ if (m == NULL)
+ return (ENOBUFS);
+ m->m_data += max_linkhdr;
+
+#ifdef MAC
+ mac_inpcb_create_mbuf(inp, m);
+#endif
+
+#ifdef INET6
+ if (isipv6) {
+ hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ ip6 = mtod(m, struct ip6_hdr *);
+ th = (struct tcphdr *)(ip6 + 1);
+ tcpip_fillheaders(inp, ip6, th);
+ } else
+#endif
+ {
+ hdrlen = sizeof(struct tcpiphdr);
+ ip = mtod(m, struct ip *);
+ th = (struct tcphdr *)(ip + 1);
+ tcpip_fillheaders(inp, ip, th);
+ }
+ to.to_flags = 0;
+
+ /*
+ * Send a timestamp and echo-reply if both our side and our peer
+ * have sent timestamps in our SYN's and this is not a RST.
+ */
+ if (tw->t_recent && flags == TH_ACK) {
+ to.to_flags |= TOF_TS;
+ to.to_tsval = ticks + tw->ts_offset;
+ to.to_tsecr = tw->t_recent;
+ }
+ optlen = tcp_addoptions(&to, (u_char *)(th + 1));
+
+ m->m_len = hdrlen + optlen;
+ m->m_pkthdr.len = m->m_len;
+
+ KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));
+
+ th->th_seq = htonl(tw->snd_nxt);
+ th->th_ack = htonl(tw->rcv_nxt);
+ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
+ th->th_flags = flags;
+ th->th_win = htons(tw->last_win);
+
+#ifdef INET6
+ if (isipv6) {
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
+ sizeof(struct tcphdr) + optlen);
+ ip6->ip6_hlim = in6_selecthlim(inp, NULL);
+ error = ip6_output(m, inp->in6p_outputopts, NULL,
+ (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
+ } else
+#endif
+ {
+ th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ ip->ip_len = m->m_pkthdr.len;
+ if (V_path_mtu_discovery)
+ ip->ip_off |= IP_DF;
+ error = ip_output(m, inp->inp_options, NULL,
+ ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
+ NULL, inp);
+ }
+ if (flags & TH_ACK)
+ TCPSTAT_INC(tcps_sndacks);
+ else
+ TCPSTAT_INC(tcps_sndctrl);
+ TCPSTAT_INC(tcps_sndtotal);
+ return (error);
+}
+
+static void
+tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
+{
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tw->tw_inpcb);
+ if (rearm)
+ TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
+ tw->tw_time = ticks + 2 * tcp_msl;
+ TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
+}
+
+static void
+tcp_tw_2msl_stop(struct tcptw *tw)
+{
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
+}
+
+struct tcptw *
+tcp_tw_2msl_scan(int reuse)
+{
+ struct tcptw *tw;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ for (;;) {
+ tw = TAILQ_FIRST(&V_twq_2msl);
+ if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0))
+ break;
+ INP_WLOCK(tw->tw_inpcb);
+ tcp_twclose(tw, reuse);
+ if (reuse)
+ return (tw);
+ }
+ return (NULL);
+}
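
Editor's note: the expiry test in the loop above, (tw->tw_time - ticks) > 0, uses a signed
difference so the comparison stays meaningful as the tick counter advances past (and around)
the stored deadline, which is the usual tick-comparison idiom. A stand-alone sketch of the
same idiom, written against an unsigned counter so the wraparound arithmetic is well defined;
the function name is invented for the example.

#include <stdio.h>

/* "Deadline not yet reached" test in the style of tcp_tw_2msl_scan(). */
static int
deadline_pending(unsigned int deadline, unsigned int now)
{
	/* The difference wraps modulo 2^32; interpreting it as signed
	 * works as long as deadlines are well under 2^31 ticks away. */
	return ((int)(deadline - now) > 0);
}

int
main(void)
{
	printf("%d\n", deadline_pending(1000u, 500u));		/* 1: still pending */
	printf("%d\n", deadline_pending(1000u, 1500u));		/* 0: expired */
	/* deadline scheduled just after the counter wraps past zero */
	printf("%d\n", deadline_pending(100u, 4294967000u));	/* 1: still pending */
	return (0);
}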
diff --git a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c
new file mode 100644
index 00000000..fc083e05
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_usrreq.c
@@ -0,0 +1,1886 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2006-2007 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_ddb.h>
+#include <freebsd/local/opt_inet.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_tcpdebug.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/systm.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/mbuf.h>
+#ifdef INET6
+#include <freebsd/sys/domain.h>
+#endif /* INET6 */
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/jail.h>
+
+#ifdef DDB
+#include <freebsd/ddb/ddb.h>
+#endif
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+#include <freebsd/net/vnet.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_systm.h>
+#ifdef INET6
+#include <freebsd/netinet/ip6.h>
+#endif
+#include <freebsd/netinet/in_pcb.h>
+#ifdef INET6
+#include <freebsd/netinet6/in6_pcb.h>
+#endif
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip_var.h>
+#ifdef INET6
+#include <freebsd/netinet6/ip6_var.h>
+#include <freebsd/netinet6/scope6_var.h>
+#endif
+#include <freebsd/netinet/tcp.h>
+#include <freebsd/netinet/tcp_fsm.h>
+#include <freebsd/netinet/tcp_seq.h>
+#include <freebsd/netinet/tcp_timer.h>
+#include <freebsd/netinet/tcp_var.h>
+#include <freebsd/netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <freebsd/netinet/tcp_debug.h>
+#endif
+#include <freebsd/netinet/tcp_offload.h>
+
+/*
+ * TCP protocol interface to socket abstraction.
+ */
+static int tcp_attach(struct socket *);
+static int tcp_connect(struct tcpcb *, struct sockaddr *,
+ struct thread *td);
+#ifdef INET6
+static int tcp6_connect(struct tcpcb *, struct sockaddr *,
+ struct thread *td);
+#endif /* INET6 */
+static void tcp_disconnect(struct tcpcb *);
+static void tcp_usrclosed(struct tcpcb *);
+static void tcp_fill_info(struct tcpcb *, struct tcp_info *);
+
+#ifdef TCPDEBUG
+#define TCPDEBUG0 int ostate = 0
+#define TCPDEBUG1() ostate = tp ? tp->t_state : 0
+#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \
+ tcp_trace(TA_USER, ostate, tp, 0, 0, req)
+#else
+#define TCPDEBUG0
+#define TCPDEBUG1()
+#define TCPDEBUG2(req)
+#endif
+
+/*
+ * TCP attaches to socket via pru_attach(), reserving space,
+ * and an internet control block.
+ */
+static int
+tcp_usr_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ int error;
+ TCPDEBUG0;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
+ TCPDEBUG1();
+
+ error = tcp_attach(so);
+ if (error)
+ goto out;
+
+ if ((so->so_options & SO_LINGER) && so->so_linger == 0)
+ so->so_linger = TCP_LINGERTIME;
+
+ inp = sotoinpcb(so);
+ tp = intotcpcb(inp);
+out:
+ TCPDEBUG2(PRU_ATTACH);
+ return error;
+}
+
+/*
+ * tcp_detach is called when the socket layer loses its final reference
+ * to the socket, be it a file descriptor reference, a reference from TCP,
+ * etc. At this point, there is only one case in which we will keep around
+ * inpcb state: time wait.
+ *
+ * This function can probably be re-absorbed back into tcp_usr_detach() now
+ * that there is a single detach path.
+ */
+static void
+tcp_detach(struct socket *so, struct inpcb *inp)
+{
+ struct tcpcb *tp;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
+ KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so"));
+
+ tp = intotcpcb(inp);
+
+ if (inp->inp_flags & INP_TIMEWAIT) {
+ /*
+ * There are two cases to handle: one in which the time wait
+ * state is being discarded (INP_DROPPED), and one in which
+ * this connection will remain in timewait. In the former,
+ * it is time to discard all state (except tcptw, which has
+ * already been discarded by the timewait close code, which
+ * should be further up the call stack somewhere). In the
+ * latter case, we detach from the socket, but leave the pcb
+ * present until timewait ends.
+ *
+ * XXXRW: Would it be cleaner to free the tcptw here?
+ */
+ if (inp->inp_flags & INP_DROPPED) {
+ KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
+ "INP_DROPPED && tp != NULL"));
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+ } else {
+ in_pcbdetach(inp);
+ INP_WUNLOCK(inp);
+ }
+ } else {
+ /*
+		 * If the connection is not in timewait, we consider two
+		 * conditions: one in which no further processing is
+ * necessary (dropped || embryonic), and one in which TCP is
+ * not yet done, but no longer requires the socket, so the
+ * pcb will persist for the time being.
+ *
+ * XXXRW: Does the second case still occur?
+ */
+ if (inp->inp_flags & INP_DROPPED ||
+ tp->t_state < TCPS_SYN_SENT) {
+ tcp_discardcb(tp);
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+ } else
+ in_pcbdetach(inp);
+ }
+}
+
+/*
+ * pru_detach() detaches the TCP protocol from the socket.
+ * If the protocol state is non-embryonic, then can't
+ * do this directly: have to initiate a pru_disconnect(),
+ * which may finish later; embryonic TCB's can just
+ * be discarded here.
+ */
+static void
+tcp_usr_detach(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ KASSERT(inp->inp_socket != NULL,
+ ("tcp_usr_detach: inp_socket == NULL"));
+ tcp_detach(so, inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+}
+
+/*
+ * Give the socket an address.
+ */
+static int
+tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in *sinp;
+
+ sinp = (struct sockaddr_in *)nam;
+ if (nam->sa_len != sizeof (*sinp))
+ return (EINVAL);
+ /*
+ * Must check for multicast addresses and disallow binding
+ * to them.
+ */
+ if (sinp->sin_family == AF_INET &&
+ IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
+ return (EAFNOSUPPORT);
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ error = in_pcbbind(inp, nam, td->td_ucred);
+out:
+ TCPDEBUG2(PRU_BIND);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+
+ return (error);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in6 *sin6p;
+
+ sin6p = (struct sockaddr_in6 *)nam;
+ if (nam->sa_len != sizeof (*sin6p))
+ return (EINVAL);
+ /*
+ * Must check for multicast addresses and disallow binding
+ * to them.
+ */
+ if (sin6p->sin6_family == AF_INET6 &&
+ IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
+ return (EAFNOSUPPORT);
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ inp->inp_vflag &= ~INP_IPV4;
+ inp->inp_vflag |= INP_IPV6;
+ if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
+ inp->inp_vflag |= INP_IPV4;
+ else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
+ struct sockaddr_in sin;
+
+ in6_sin6_2_sin(&sin, sin6p);
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_vflag &= ~INP_IPV6;
+ error = in_pcbbind(inp, (struct sockaddr *)&sin,
+ td->td_ucred);
+ goto out;
+ }
+ }
+ error = in6_pcbbind(inp, nam, td->td_ucred);
+out:
+ TCPDEBUG2(PRU_BIND);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (error);
+}
+#endif /* INET6 */
+
+/*
+ * Prepare to accept connections.
+ */
+static int
+tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ SOCK_LOCK(so);
+ error = solisten_proto_check(so);
+ if (error == 0 && inp->inp_lport == 0)
+ error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ if (error == 0) {
+ tp->t_state = TCPS_LISTEN;
+ solisten_proto(so, backlog);
+ tcp_offload_listen_open(tp);
+ }
+ SOCK_UNLOCK(so);
+
+out:
+ TCPDEBUG2(PRU_LISTEN);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (error);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ SOCK_LOCK(so);
+ error = solisten_proto_check(so);
+ if (error == 0 && inp->inp_lport == 0) {
+ inp->inp_vflag &= ~INP_IPV4;
+ if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
+ inp->inp_vflag |= INP_IPV4;
+ error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ }
+ if (error == 0) {
+ tp->t_state = TCPS_LISTEN;
+ solisten_proto(so, backlog);
+ }
+ SOCK_UNLOCK(so);
+
+out:
+ TCPDEBUG2(PRU_LISTEN);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (error);
+}
+#endif /* INET6 */
+
+/*
+ * Initiate connection to peer.
+ * Create a template for use in transmissions on this connection.
+ * Enter SYN_SENT state, and mark socket as connecting.
+ * Start keep-alive timer, and seed output sequence space.
+ * Send initial segment on connection.
+ */
+static int
+tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in *sinp;
+
+ sinp = (struct sockaddr_in *)nam;
+ if (nam->sa_len != sizeof (*sinp))
+ return (EINVAL);
+ /*
+ * Must disallow TCP ``connections'' to multicast addresses.
+ */
+ if (sinp->sin_family == AF_INET
+ && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
+ return (EAFNOSUPPORT);
+ if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0)
+ return (error);
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if ((error = tcp_connect(tp, nam, td)) != 0)
+ goto out;
+ error = tcp_output_connect(so, nam);
+out:
+ TCPDEBUG2(PRU_CONNECT);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (error);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in6 *sin6p;
+
+ TCPDEBUG0;
+
+ sin6p = (struct sockaddr_in6 *)nam;
+ if (nam->sa_len != sizeof (*sin6p))
+ return (EINVAL);
+ /*
+ * Must disallow TCP ``connections'' to multicast addresses.
+ */
+ if (sin6p->sin6_family == AF_INET6
+ && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
+ return (EAFNOSUPPORT);
+
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
+ struct sockaddr_in sin;
+
+ if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ in6_sin6_2_sin(&sin, sin6p);
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_vflag &= ~INP_IPV6;
+ if ((error = prison_remote_ip4(td->td_ucred,
+ &sin.sin_addr)) != 0)
+ goto out;
+ if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
+ goto out;
+ error = tcp_output_connect(so, nam);
+ goto out;
+ }
+ inp->inp_vflag &= ~INP_IPV4;
+ inp->inp_vflag |= INP_IPV6;
+ inp->inp_inc.inc_flags |= INC_ISIPV6;
+ if ((error = prison_remote_ip6(td->td_ucred, &sin6p->sin6_addr)) != 0)
+ goto out;
+ if ((error = tcp6_connect(tp, nam, td)) != 0)
+ goto out;
+ error = tcp_output_connect(so, nam);
+
+out:
+ TCPDEBUG2(PRU_CONNECT);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (error);
+}
+#endif /* INET6 */
+
+/*
+ * Initiate disconnect from peer.
+ * If connection never passed embryonic stage, just drop;
+ * else if we don't need to let data drain, then we can just drop anyway,
+ * else have to begin TCP shutdown process: mark socket disconnecting,
+ * drain unread data, state switch to reflect user close, and
+ * send segment (e.g. FIN) to peer. Socket will be really disconnected
+ * when peer sends FIN and acks ours.
+ *
+ * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
+ */
+static int
+tcp_usr_disconnect(struct socket *so)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ int error = 0;
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ tcp_disconnect(tp);
+out:
+ TCPDEBUG2(PRU_DISCONNECT);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (error);
+}
+
+/*
+ * Accept a connection. Essentially all the work is done at higher levels;
+ * just return the address of the peer, storing through addr.
+ *
+ * The rationale for acquiring the tcbinfo lock here is somewhat complicated,
+ * and is described in detail in the commit log entry for r175612. Acquiring
+ * it delays an accept(2) racing with sonewconn(), which inserts the socket
+ * before the inpcb address/port fields are initialized. A better fix would
+ * prevent the socket from being placed in the listen queue until all fields
+ * are fully initialized.
+ */
+static int
+tcp_usr_accept(struct socket *so, struct sockaddr **nam)
+{
+ int error = 0;
+ struct inpcb *inp = NULL;
+ struct tcpcb *tp = NULL;
+ struct in_addr addr;
+ in_port_t port = 0;
+ TCPDEBUG0;
+
+ if (so->so_state & SS_ISDISCONNECTED)
+ return (ECONNABORTED);
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
+ INP_INFO_RLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNABORTED;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+
+ /*
+ * We inline in_getpeeraddr and COMMON_END here, so that we can
+ * copy the data of interest and defer the malloc until after we
+ * release the lock.
+ */
+ port = inp->inp_fport;
+ addr = inp->inp_faddr;
+
+out:
+ TCPDEBUG2(PRU_ACCEPT);
+ INP_WUNLOCK(inp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ if (error == 0)
+ *nam = in_sockaddr(port, &addr);
+ return error;
+}
+
+#ifdef INET6
+static int
+tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
+{
+ struct inpcb *inp = NULL;
+ int error = 0;
+ struct tcpcb *tp = NULL;
+ struct in_addr addr;
+ struct in6_addr addr6;
+ in_port_t port = 0;
+ int v4 = 0;
+ TCPDEBUG0;
+
+ if (so->so_state & SS_ISDISCONNECTED)
+ return (ECONNABORTED);
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNABORTED;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+
+ /*
+ * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
+ * copy the data of interest and defer the malloc until after we
+ * release the lock.
+ */
+ if (inp->inp_vflag & INP_IPV4) {
+ v4 = 1;
+ port = inp->inp_fport;
+ addr = inp->inp_faddr;
+ } else {
+ port = inp->inp_fport;
+ addr6 = inp->in6p_faddr;
+ }
+
+out:
+ TCPDEBUG2(PRU_ACCEPT);
+ INP_WUNLOCK(inp);
+ if (error == 0) {
+ if (v4)
+ *nam = in6_v4mapsin6_sockaddr(port, &addr);
+ else
+ *nam = in6_sockaddr(port, &addr6);
+ }
+ return error;
+}
+#endif /* INET6 */
+
+/*
+ * Mark the connection as being incapable of further output.
+ */
+static int
+tcp_usr_shutdown(struct socket *so)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ socantsendmore(so);
+ tcp_usrclosed(tp);
+ if (!(inp->inp_flags & INP_DROPPED))
+ error = tcp_output_disconnect(tp);
+
+out:
+ TCPDEBUG2(PRU_SHUTDOWN);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+
+ return (error);
+}
+
+/*
+ * After a receive, possibly send window update to peer.
+ */
+static int
+tcp_usr_rcvd(struct socket *so, int flags)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ int error = 0;
+
+ TCPDEBUG0;
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ tcp_output_rcvd(tp);
+
+out:
+ TCPDEBUG2(PRU_RCVD);
+ INP_WUNLOCK(inp);
+ return (error);
+}
+
+/*
+ * Do a send by putting data in output queue and updating urgent
+ * marker if URG set. Possibly send more data. Unlike the other
+ * pru_*() routines, the mbuf chains are our responsibility. We
+ * must either enqueue them or free them. The other pru_* routines
+ * generally are caller-frees.
+ */
+static int
+tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
+ struct sockaddr *nam, struct mbuf *control, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ int headlocked = 0;
+#ifdef INET6
+ int isipv6;
+#endif
+ TCPDEBUG0;
+
+ /*
+ * We require the pcbinfo lock in two cases:
+ *
+ * (1) An implied connect is taking place, which can result in
+ * binding IPs and ports and hence modification of the pcb hash
+ * chains.
+ *
+ * (2) PRUS_EOF is set, resulting in explicit close on the send.
+ */
+ if ((nam != NULL) || (flags & PRUS_EOF)) {
+ INP_INFO_WLOCK(&V_tcbinfo);
+ headlocked = 1;
+ }
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ if (control)
+ m_freem(control);
+ if (m)
+ m_freem(m);
+ error = ECONNRESET;
+ goto out;
+ }
+#ifdef INET6
+ isipv6 = nam && nam->sa_family == AF_INET6;
+#endif /* INET6 */
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if (control) {
+ /* TCP doesn't do control messages (rights, creds, etc) */
+ if (control->m_len) {
+ m_freem(control);
+ if (m)
+ m_freem(m);
+ error = EINVAL;
+ goto out;
+ }
+ m_freem(control); /* empty control, just free it */
+ }
+ if (!(flags & PRUS_OOB)) {
+ sbappendstream(&so->so_snd, m);
+ if (nam && tp->t_state < TCPS_SYN_SENT) {
+ /*
+ * Do implied connect if not yet connected,
+ * initialize window to default value, and
+ * initialize maxseg/maxopd using peer's cached
+ * MSS.
+ */
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+#ifdef INET6
+ if (isipv6)
+ error = tcp6_connect(tp, nam, td);
+ else
+#endif /* INET6 */
+ error = tcp_connect(tp, nam, td);
+ if (error)
+ goto out;
+ tp->snd_wnd = TTCP_CLIENT_SND_WND;
+ tcp_mss(tp, -1);
+ }
+ if (flags & PRUS_EOF) {
+ /*
+ * Close the send side of the connection after
+ * the data is sent.
+ */
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ socantsendmore(so);
+ tcp_usrclosed(tp);
+ }
+ if (headlocked) {
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ headlocked = 0;
+ }
+ if (!(inp->inp_flags & INP_DROPPED)) {
+ if (flags & PRUS_MORETOCOME)
+ tp->t_flags |= TF_MORETOCOME;
+ error = tcp_output_send(tp);
+ if (flags & PRUS_MORETOCOME)
+ tp->t_flags &= ~TF_MORETOCOME;
+ }
+ } else {
+ /*
+ * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
+ */
+ SOCKBUF_LOCK(&so->so_snd);
+ if (sbspace(&so->so_snd) < -512) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ m_freem(m);
+ error = ENOBUFS;
+ goto out;
+ }
+ /*
+ * According to RFC961 (Assigned Protocols),
+ * the urgent pointer points to the last octet
+ * of urgent data. We continue, however,
+ * to consider it to indicate the first octet
+ * of data past the urgent section.
+ * Otherwise, snd_up should be one lower.
+ */
+ sbappendstream_locked(&so->so_snd, m);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (nam && tp->t_state < TCPS_SYN_SENT) {
+ /*
+ * Do implied connect if not yet connected,
+ * initialize window to default value, and
+ * initialize maxseg/maxopd using peer's cached
+ * MSS.
+ */
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+#ifdef INET6
+ if (isipv6)
+ error = tcp6_connect(tp, nam, td);
+ else
+#endif /* INET6 */
+ error = tcp_connect(tp, nam, td);
+ if (error)
+ goto out;
+ tp->snd_wnd = TTCP_CLIENT_SND_WND;
+ tcp_mss(tp, -1);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ headlocked = 0;
+ } else if (nam) {
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ headlocked = 0;
+ }
+ tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
+ tp->t_flags |= TF_FORCEDATA;
+ error = tcp_output_send(tp);
+ tp->t_flags &= ~TF_FORCEDATA;
+ }
+out:
+ TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
+ ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
+ INP_WUNLOCK(inp);
+ if (headlocked)
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (error);
+}
+
+/*
+ * Abort the TCP. Drop the connection abruptly.
+ */
+static void
+tcp_usr_abort(struct socket *so)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ TCPDEBUG0;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
+
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ KASSERT(inp->inp_socket != NULL,
+ ("tcp_usr_abort: inp_socket == NULL"));
+
+ /*
+ * If we still have full TCP state, and we're not dropped, drop.
+ */
+ if (!(inp->inp_flags & INP_TIMEWAIT) &&
+ !(inp->inp_flags & INP_DROPPED)) {
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ tcp_drop(tp, ECONNABORTED);
+ TCPDEBUG2(PRU_ABORT);
+ }
+ if (!(inp->inp_flags & INP_DROPPED)) {
+ SOCK_LOCK(so);
+ so->so_state |= SS_PROTOREF;
+ SOCK_UNLOCK(so);
+ inp->inp_flags |= INP_SOCKREF;
+ }
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+}
+
+/*
+ * TCP socket is closed. Start friendly disconnect.
+ */
+static void
+tcp_usr_close(struct socket *so)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ TCPDEBUG0;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
+
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ KASSERT(inp->inp_socket != NULL,
+ ("tcp_usr_close: inp_socket == NULL"));
+
+ /*
+ * If we still have full TCP state, and we're not dropped, initiate
+ * a disconnect.
+ */
+ if (!(inp->inp_flags & INP_TIMEWAIT) &&
+ !(inp->inp_flags & INP_DROPPED)) {
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ tcp_disconnect(tp);
+ TCPDEBUG2(PRU_CLOSE);
+ }
+ if (!(inp->inp_flags & INP_DROPPED)) {
+ SOCK_LOCK(so);
+ so->so_state |= SS_PROTOREF;
+ SOCK_UNLOCK(so);
+ inp->inp_flags |= INP_SOCKREF;
+ }
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+}
+
+/*
+ * Receive out-of-band data.
+ */
+static int
+tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+
+ TCPDEBUG0;
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if ((so->so_oobmark == 0 &&
+ (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
+ so->so_options & SO_OOBINLINE ||
+ tp->t_oobflags & TCPOOB_HADDATA) {
+ error = EINVAL;
+ goto out;
+ }
+ if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ m->m_len = 1;
+ *mtod(m, caddr_t) = tp->t_iobc;
+ if ((flags & MSG_PEEK) == 0)
+ tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
+
+out:
+ TCPDEBUG2(PRU_RCVOOB);
+ INP_WUNLOCK(inp);
+ return (error);
+}
+
+struct pr_usrreqs tcp_usrreqs = {
+ .pru_abort = tcp_usr_abort,
+ .pru_accept = tcp_usr_accept,
+ .pru_attach = tcp_usr_attach,
+ .pru_bind = tcp_usr_bind,
+ .pru_connect = tcp_usr_connect,
+ .pru_control = in_control,
+ .pru_detach = tcp_usr_detach,
+ .pru_disconnect = tcp_usr_disconnect,
+ .pru_listen = tcp_usr_listen,
+ .pru_peeraddr = in_getpeeraddr,
+ .pru_rcvd = tcp_usr_rcvd,
+ .pru_rcvoob = tcp_usr_rcvoob,
+ .pru_send = tcp_usr_send,
+ .pru_shutdown = tcp_usr_shutdown,
+ .pru_sockaddr = in_getsockaddr,
+#if 0
+ .pru_soreceive = soreceive_stream,
+#endif
+ .pru_sosetlabel = in_pcbsosetlabel,
+ .pru_close = tcp_usr_close,
+};
+
+#ifdef INET6
+struct pr_usrreqs tcp6_usrreqs = {
+ .pru_abort = tcp_usr_abort,
+ .pru_accept = tcp6_usr_accept,
+ .pru_attach = tcp_usr_attach,
+ .pru_bind = tcp6_usr_bind,
+ .pru_connect = tcp6_usr_connect,
+ .pru_control = in6_control,
+ .pru_detach = tcp_usr_detach,
+ .pru_disconnect = tcp_usr_disconnect,
+ .pru_listen = tcp6_usr_listen,
+ .pru_peeraddr = in6_mapped_peeraddr,
+ .pru_rcvd = tcp_usr_rcvd,
+ .pru_rcvoob = tcp_usr_rcvoob,
+ .pru_send = tcp_usr_send,
+ .pru_shutdown = tcp_usr_shutdown,
+ .pru_sockaddr = in6_mapped_sockaddr,
+#if 0
+ .pru_soreceive = soreceive_stream,
+#endif
+ .pru_sosetlabel = in_pcbsosetlabel,
+ .pru_close = tcp_usr_close,
+};
+#endif /* INET6 */
+
+/*
+ * Common subroutine to open a TCP connection to remote host specified
+ * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local
+ * port number if needed. Call in_pcbconnect_setup to do the routing and
+ * to choose a local host address (interface). If there is an existing
+ * incarnation of the same connection in TIME-WAIT state and if the remote
+ * host was sending CC options and if the connection duration was < MSL, then
+ * truncate the previous TIME-WAIT state and proceed.
+ * Initialize connection parameters and enter SYN-SENT state.
+ */
+static int
+tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp = tp->t_inpcb, *oinp;
+ struct socket *so = inp->inp_socket;
+ struct in_addr laddr;
+ u_short lport;
+ int error;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ if (inp->inp_lport == 0) {
+ error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Cannot simply call in_pcbconnect, because there might be an
+ * earlier incarnation of this same connection still in
+ * TIME_WAIT state, creating an ADDRINUSE error.
+ */
+ laddr = inp->inp_laddr;
+ lport = inp->inp_lport;
+ error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
+ &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
+ if (error && oinp == NULL)
+ return error;
+ if (oinp)
+ return EADDRINUSE;
+ inp->inp_laddr = laddr;
+ in_pcbrehash(inp);
+
+ /*
+ * Compute window scaling to request:
+ * Scale to fit into sweet spot. See tcp_syncache.c.
+ * XXX: This should move to tcp_output().
+ */
+ while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
+ (TCP_MAXWIN << tp->request_r_scale) < sb_max)
+ tp->request_r_scale++;
+
+ soisconnecting(so);
+ TCPSTAT_INC(tcps_connattempt);
+ tp->t_state = TCPS_SYN_SENT;
+ tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
+ tp->iss = tcp_new_isn(tp);
+ tp->t_bw_rtseq = tp->iss;
+ tcp_sendseqinit(tp);
+
+ return 0;
+}
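
Editor's note: the request_r_scale loop above picks the smallest window-scale shift such that
TCP_MAXWIN shifted by it covers sb_max. A worked example as a sketch; the 2 MB sb_max is an
assumed figure for illustration, not a quoted kernel default.

#include <stdio.h>

#define TCP_MAXWIN		65535
#define TCP_MAX_WINSHIFT	14

int
main(void)
{
	unsigned long sb_max = 2UL * 1024 * 1024;	/* assumed value */
	int scale = 0;

	while (scale < TCP_MAX_WINSHIFT &&
	    ((unsigned long)TCP_MAXWIN << scale) < sb_max)
		scale++;
	printf("request_r_scale = %d (window up to %lu bytes)\n",
	    scale, (unsigned long)TCP_MAXWIN << scale);
	return (0);
}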
+
+#ifdef INET6
+static int
+tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp = tp->t_inpcb, *oinp;
+ struct socket *so = inp->inp_socket;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
+ struct in6_addr addr6;
+ int error;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ if (inp->inp_lport == 0) {
+ error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Cannot simply call in_pcbconnect, because there might be an
+ * earlier incarnation of this same connection still in
+ * TIME_WAIT state, creating an ADDRINUSE error.
+ * in6_pcbladdr() also handles scope zone IDs.
+ */
+ error = in6_pcbladdr(inp, nam, &addr6);
+ if (error)
+ return error;
+ oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
+ &sin6->sin6_addr, sin6->sin6_port,
+ IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
+ ? &addr6
+ : &inp->in6p_laddr,
+ inp->inp_lport, 0, NULL);
+ if (oinp)
+ return EADDRINUSE;
+ if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
+ inp->in6p_laddr = addr6;
+ inp->in6p_faddr = sin6->sin6_addr;
+ inp->inp_fport = sin6->sin6_port;
+ /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
+ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
+ if (inp->inp_flags & IN6P_AUTOFLOWLABEL)
+ inp->inp_flow |=
+ (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
+ in_pcbrehash(inp);
+
+ /* Compute window scaling to request. */
+ while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
+ (TCP_MAXWIN << tp->request_r_scale) < sb_max)
+ tp->request_r_scale++;
+
+ soisconnecting(so);
+ TCPSTAT_INC(tcps_connattempt);
+ tp->t_state = TCPS_SYN_SENT;
+ tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
+ tp->iss = tcp_new_isn(tp);
+ tp->t_bw_rtseq = tp->iss;
+ tcp_sendseqinit(tp);
+
+ return 0;
+}
+#endif /* INET6 */
+
+/*
+ * Export TCP internal state information via a struct tcp_info, based on the
+ * Linux 2.6 API. Not ABI compatible as our constants are mapped differently
+ * (TCP state machine, etc). We export all information using FreeBSD-native
+ * constants -- for example, the numeric values for tcpi_state will differ
+ * from Linux.
+ */
+static void
+tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
+{
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ bzero(ti, sizeof(*ti));
+
+ ti->tcpi_state = tp->t_state;
+ if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
+ ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+ if (tp->t_flags & TF_SACK_PERMIT)
+ ti->tcpi_options |= TCPI_OPT_SACK;
+ if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
+ ti->tcpi_options |= TCPI_OPT_WSCALE;
+ ti->tcpi_snd_wscale = tp->snd_scale;
+ ti->tcpi_rcv_wscale = tp->rcv_scale;
+ }
+
+ ti->tcpi_rto = tp->t_rxtcur * tick;
+ ti->tcpi_last_data_recv = (long)(ticks - (int)tp->t_rcvtime) * tick;
+ ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
+ ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
+
+ ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
+ ti->tcpi_snd_cwnd = tp->snd_cwnd;
+
+ /*
+ * FreeBSD-specific extension fields for tcp_info.
+ */
+ ti->tcpi_rcv_space = tp->rcv_wnd;
+ ti->tcpi_rcv_nxt = tp->rcv_nxt;
+ ti->tcpi_snd_wnd = tp->snd_wnd;
+ ti->tcpi_snd_bwnd = tp->snd_bwnd;
+ ti->tcpi_snd_nxt = tp->snd_nxt;
+ ti->tcpi_snd_mss = tp->t_maxseg;
+ ti->tcpi_rcv_mss = tp->t_maxseg;
+ if (tp->t_flags & TF_TOE)
+ ti->tcpi_options |= TCPI_OPT_TOE;
+}
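
Editor's note: tcp_fill_info() is what ultimately services the TCP_INFO socket option handled
in tcp_ctloutput() below. A minimal user-space consumer might look like the following sketch;
it assumes a connected TCP socket and that the tcp_info field names match this tree's
<netinet/tcp.h>, and it trims error handling to the essentials.

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>

/* Print a few of the fields filled in by tcp_fill_info();
 * 'fd' must be a connected TCP socket. */
void
print_tcp_info(int fd)
{
	struct tcp_info ti;
	socklen_t len = sizeof(ti);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == -1) {
		perror("getsockopt(TCP_INFO)");
		return;
	}
	printf("state=%u rto=%uus snd_cwnd=%u snd_mss=%u\n",
	    (unsigned)ti.tcpi_state, (unsigned)ti.tcpi_rto,
	    (unsigned)ti.tcpi_snd_cwnd, (unsigned)ti.tcpi_snd_mss);
}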
+
+/*
+ * tcp_ctloutput() must drop the inpcb lock before performing copyin on
+ * socket option arguments. When it re-acquires the lock after the copy, it
+ * has to revalidate that the connection is still valid for the socket
+ * option.
+ */
+#define INP_WLOCK_RECHECK(inp) do { \
+ INP_WLOCK(inp); \
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \
+ INP_WUNLOCK(inp); \
+ return (ECONNRESET); \
+ } \
+ tp = intotcpcb(inp); \
+} while(0)
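
Editor's note: the macro above encodes a recurring pattern: release the pcb lock before a
copyin() that may sleep, then re-take the lock and confirm the connection was not dropped or
moved to timewait in the meantime. A generic user-space rendering of that shape; struct conn,
conn_set_nodelay, and the pthread mutex are inventions of this sketch, standing in for the
inpcb and its lock.

#include <pthread.h>
#include <errno.h>
#include <string.h>

struct conn {
	pthread_mutex_t lock;
	int dropped;		/* set when the connection is torn down */
	int nodelay;
};

/* Entered with c->lock held; always returns with it released.
 * Copies the argument while unlocked, then re-locks and rechecks
 * that the connection still exists before applying the option. */
int
conn_set_nodelay(struct conn *c, const int *user_val)
{
	int val;

	pthread_mutex_unlock(&c->lock);		/* drop lock for the copy */
	memcpy(&val, user_val, sizeof(val));	/* stand-in for copyin() */
	pthread_mutex_lock(&c->lock);		/* re-lock ... */
	if (c->dropped) {			/* ... and revalidate */
		pthread_mutex_unlock(&c->lock);
		return (ECONNRESET);
	}
	c->nodelay = (val != 0);
	pthread_mutex_unlock(&c->lock);
	return (0);
}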
+
+int
+tcp_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ int error, opt, optval;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct tcp_info ti;
+
+ error = 0;
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
+ INP_WLOCK(inp);
+ if (sopt->sopt_level != IPPROTO_TCP) {
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO) {
+ INP_WUNLOCK(inp);
+ error = ip6_ctloutput(so, sopt);
+ } else {
+#endif /* INET6 */
+ INP_WUNLOCK(inp);
+ error = ip_ctloutput(so, sopt);
+#ifdef INET6
+ }
+#endif
+ return (error);
+ }
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ INP_WUNLOCK(inp);
+ return (ECONNRESET);
+ }
+
+ switch (sopt->sopt_dir) {
+ case SOPT_SET:
+ switch (sopt->sopt_name) {
+#ifdef TCP_SIGNATURE
+ case TCP_MD5SIG:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ INP_WLOCK_RECHECK(inp);
+ if (optval > 0)
+ tp->t_flags |= TF_SIGNATURE;
+ else
+ tp->t_flags &= ~TF_SIGNATURE;
+ INP_WUNLOCK(inp);
+ break;
+#endif /* TCP_SIGNATURE */
+ case TCP_NODELAY:
+ case TCP_NOOPT:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ INP_WLOCK_RECHECK(inp);
+ switch (sopt->sopt_name) {
+ case TCP_NODELAY:
+ opt = TF_NODELAY;
+ break;
+ case TCP_NOOPT:
+ opt = TF_NOOPT;
+ break;
+ default:
+ opt = 0; /* dead code to fool gcc */
+ break;
+ }
+
+ if (optval)
+ tp->t_flags |= opt;
+ else
+ tp->t_flags &= ~opt;
+ INP_WUNLOCK(inp);
+ break;
+
+ case TCP_NOPUSH:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ INP_WLOCK_RECHECK(inp);
+ if (optval)
+ tp->t_flags |= TF_NOPUSH;
+ else if (tp->t_flags & TF_NOPUSH) {
+ tp->t_flags &= ~TF_NOPUSH;
+ if (TCPS_HAVEESTABLISHED(tp->t_state))
+ error = tcp_output(tp);
+ }
+ INP_WUNLOCK(inp);
+ break;
+
+ case TCP_MAXSEG:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ INP_WLOCK_RECHECK(inp);
+ if (optval > 0 && optval <= tp->t_maxseg &&
+ optval + 40 >= V_tcp_minmss)
+ tp->t_maxseg = optval;
+ else
+ error = EINVAL;
+ INP_WUNLOCK(inp);
+ break;
+
+ case TCP_INFO:
+ INP_WUNLOCK(inp);
+ error = EINVAL;
+ break;
+
+ default:
+ INP_WUNLOCK(inp);
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+
+ case SOPT_GET:
+ tp = intotcpcb(inp);
+ switch (sopt->sopt_name) {
+#ifdef TCP_SIGNATURE
+ case TCP_MD5SIG:
+ optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+#endif
+
+ case TCP_NODELAY:
+ optval = tp->t_flags & TF_NODELAY;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+ case TCP_MAXSEG:
+ optval = tp->t_maxseg;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+ case TCP_NOOPT:
+ optval = tp->t_flags & TF_NOOPT;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+ case TCP_NOPUSH:
+ optval = tp->t_flags & TF_NOPUSH;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+ case TCP_INFO:
+ tcp_fill_info(tp, &ti);
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &ti, sizeof ti);
+ break;
+ default:
+ INP_WUNLOCK(inp);
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+ }
+ return (error);
+}
+#undef INP_WLOCK_RECHECK
+
+/*
+ * tcp_sendspace and tcp_recvspace are the default send and receive window
+ * sizes, respectively. These are obsolescent (this information should
+ * be set by the route).
+ */
+u_long tcp_sendspace = 1024*32;
+SYSCTL_ULONG(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
+    &tcp_sendspace, 0, "Maximum outgoing TCP datagram size");
+u_long tcp_recvspace = 1024*64;
+SYSCTL_ULONG(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
+    &tcp_recvspace, 0, "Maximum incoming TCP datagram size");
+
+/*
+ * Attach TCP protocol to socket, allocating
+ * internet protocol control block, tcp control block,
+ * buffer space, and entering LISTEN state if connections are to be accepted.
+ */
+static int
+tcp_attach(struct socket *so)
+{
+ struct tcpcb *tp;
+ struct inpcb *inp;
+ int error;
+
+ if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+ error = soreserve(so, tcp_sendspace, tcp_recvspace);
+ if (error)
+ return (error);
+ }
+ so->so_rcv.sb_flags |= SB_AUTOSIZE;
+ so->so_snd.sb_flags |= SB_AUTOSIZE;
+ INP_INFO_WLOCK(&V_tcbinfo);
+ error = in_pcballoc(so, &V_tcbinfo);
+ if (error) {
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (error);
+ }
+ inp = sotoinpcb(so);
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO) {
+ inp->inp_vflag |= INP_IPV6;
+ inp->in6p_hops = -1; /* use kernel default */
+ }
+ else
+#endif
+ inp->inp_vflag |= INP_IPV4;
+ tp = tcp_newtcpcb(inp);
+ if (tp == NULL) {
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (ENOBUFS);
+ }
+ tp->t_state = TCPS_CLOSED;
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (0);
+}
+
+/*
+ * Initiate (or continue) disconnect.
+ * If embryonic state, just send reset (once).
+ * If in ``let data drain'' option and linger null, just drop.
+ * Otherwise (hard), mark socket disconnecting and drop
+ * current input data; switch states based on user close, and
+ * send segment to peer (with FIN).
+ */
+static void
+tcp_disconnect(struct tcpcb *tp)
+{
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(inp);
+
+ /*
+ * Neither tcp_close() nor tcp_drop() should return NULL, as the
+ * socket is still open.
+ */
+ if (tp->t_state < TCPS_ESTABLISHED) {
+ tp = tcp_close(tp);
+ KASSERT(tp != NULL,
+ ("tcp_disconnect: tcp_close() returned NULL"));
+ } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
+ tp = tcp_drop(tp, 0);
+ KASSERT(tp != NULL,
+ ("tcp_disconnect: tcp_drop() returned NULL"));
+ } else {
+ soisdisconnecting(so);
+ sbflush(&so->so_rcv);
+ tcp_usrclosed(tp);
+ if (!(inp->inp_flags & INP_DROPPED))
+ tcp_output_disconnect(tp);
+ }
+}
+
+/*
+ * User issued close, and wish to trail through shutdown states:
+ * if never received SYN, just forget it. If got a SYN from peer,
+ * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
+ * If already got a FIN from peer, then almost done; go to LAST_ACK
+ * state. In all other cases, have already sent FIN to peer (e.g.
+ * after PRU_SHUTDOWN), and just have to play tedious game waiting
+ * for peer to send FIN or not respond to keep-alives, etc.
+ * We can let the user exit from the close as soon as the FIN is acked.
+ */
+static void
+tcp_usrclosed(struct tcpcb *tp)
+{
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ switch (tp->t_state) {
+ case TCPS_LISTEN:
+ tcp_offload_listen_close(tp);
+ /* FALLTHROUGH */
+ case TCPS_CLOSED:
+ tp->t_state = TCPS_CLOSED;
+ tp = tcp_close(tp);
+ /*
+ * tcp_close() should never return NULL here as the socket is
+ * still open.
+ */
+ KASSERT(tp != NULL,
+ ("tcp_usrclosed: tcp_close() returned NULL"));
+ break;
+
+ case TCPS_SYN_SENT:
+ case TCPS_SYN_RECEIVED:
+ tp->t_flags |= TF_NEEDFIN;
+ break;
+
+ case TCPS_ESTABLISHED:
+ tp->t_state = TCPS_FIN_WAIT_1;
+ break;
+
+ case TCPS_CLOSE_WAIT:
+ tp->t_state = TCPS_LAST_ACK;
+ break;
+ }
+ if (tp->t_state >= TCPS_FIN_WAIT_2) {
+ soisdisconnected(tp->t_inpcb->inp_socket);
+ /* Prevent the connection hanging in FIN_WAIT_2 forever. */
+ if (tp->t_state == TCPS_FIN_WAIT_2) {
+ int timeout;
+
+ timeout = (tcp_fast_finwait2_recycle) ?
+ tcp_finwait2_timeout : tcp_maxidle;
+ tcp_timer_activate(tp, TT_2MSL, timeout);
+ }
+ }
+}
+
+#ifdef DDB
+static void
+db_print_indent(int indent)
+{
+ int i;
+
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+}
+
+static void
+db_print_tstate(int t_state)
+{
+
+ switch (t_state) {
+ case TCPS_CLOSED:
+ db_printf("TCPS_CLOSED");
+ return;
+
+ case TCPS_LISTEN:
+ db_printf("TCPS_LISTEN");
+ return;
+
+ case TCPS_SYN_SENT:
+ db_printf("TCPS_SYN_SENT");
+ return;
+
+ case TCPS_SYN_RECEIVED:
+ db_printf("TCPS_SYN_RECEIVED");
+ return;
+
+ case TCPS_ESTABLISHED:
+ db_printf("TCPS_ESTABLISHED");
+ return;
+
+ case TCPS_CLOSE_WAIT:
+ db_printf("TCPS_CLOSE_WAIT");
+ return;
+
+ case TCPS_FIN_WAIT_1:
+ db_printf("TCPS_FIN_WAIT_1");
+ return;
+
+ case TCPS_CLOSING:
+ db_printf("TCPS_CLOSING");
+ return;
+
+ case TCPS_LAST_ACK:
+ db_printf("TCPS_LAST_ACK");
+ return;
+
+ case TCPS_FIN_WAIT_2:
+ db_printf("TCPS_FIN_WAIT_2");
+ return;
+
+ case TCPS_TIME_WAIT:
+ db_printf("TCPS_TIME_WAIT");
+ return;
+
+ default:
+ db_printf("unknown");
+ return;
+ }
+}
+
+static void
+db_print_tflags(u_int t_flags)
+{
+ int comma;
+
+ comma = 0;
+ if (t_flags & TF_ACKNOW) {
+ db_printf("%sTF_ACKNOW", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_DELACK) {
+ db_printf("%sTF_DELACK", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_NODELAY) {
+ db_printf("%sTF_NODELAY", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_NOOPT) {
+ db_printf("%sTF_NOOPT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_SENTFIN) {
+ db_printf("%sTF_SENTFIN", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_REQ_SCALE) {
+ db_printf("%sTF_REQ_SCALE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_RCVD_SCALE) {
+		db_printf("%sTF_RCVD_SCALE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_REQ_TSTMP) {
+ db_printf("%sTF_REQ_TSTMP", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_RCVD_TSTMP) {
+ db_printf("%sTF_RCVD_TSTMP", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_SACK_PERMIT) {
+ db_printf("%sTF_SACK_PERMIT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_NEEDSYN) {
+ db_printf("%sTF_NEEDSYN", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_NEEDFIN) {
+ db_printf("%sTF_NEEDFIN", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_NOPUSH) {
+ db_printf("%sTF_NOPUSH", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_MORETOCOME) {
+ db_printf("%sTF_MORETOCOME", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_LQ_OVERFLOW) {
+ db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_LASTIDLE) {
+ db_printf("%sTF_LASTIDLE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_RXWIN0SENT) {
+ db_printf("%sTF_RXWIN0SENT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_FASTRECOVERY) {
+ db_printf("%sTF_FASTRECOVERY", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_WASFRECOVERY) {
+ db_printf("%sTF_WASFRECOVERY", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_SIGNATURE) {
+ db_printf("%sTF_SIGNATURE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_FORCEDATA) {
+ db_printf("%sTF_FORCEDATA", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_TSO) {
+ db_printf("%sTF_TSO", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_flags & TF_ECN_PERMIT) {
+ db_printf("%sTF_ECN_PERMIT", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_toobflags(char t_oobflags)
+{
+ int comma;
+
+ comma = 0;
+ if (t_oobflags & TCPOOB_HAVEDATA) {
+ db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : "");
+ comma = 1;
+ }
+ if (t_oobflags & TCPOOB_HADDATA) {
+ db_printf("%sTCPOOB_HADDATA", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", name, tp);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n",
+ LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
+
+ db_print_indent(indent);
+ db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n",
+ &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep);
+
+ db_print_indent(indent);
+ db_printf("tt_2msl: %p tt_delack: %p t_inpcb: %p\n", &tp->t_timers->tt_2msl,
+ &tp->t_timers->tt_delack, tp->t_inpcb);
+
+ db_print_indent(indent);
+ db_printf("t_state: %d (", tp->t_state);
+ db_print_tstate(tp->t_state);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("t_flags: 0x%x (", tp->t_flags);
+ db_print_tflags(tp->t_flags);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+	db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: 0x%08x\n",
+ tp->snd_una, tp->snd_max, tp->snd_nxt);
+
+ db_print_indent(indent);
+ db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n",
+ tp->snd_up, tp->snd_wl1, tp->snd_wl2);
+
+ db_print_indent(indent);
+ db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n",
+ tp->iss, tp->irs, tp->rcv_nxt);
+
+ db_print_indent(indent);
+ db_printf("rcv_adv: 0x%08x rcv_wnd: %lu rcv_up: 0x%08x\n",
+ tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
+
+ db_print_indent(indent);
+ db_printf("snd_wnd: %lu snd_cwnd: %lu snd_bwnd: %lu\n",
+ tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd);
+
+ db_print_indent(indent);
+ db_printf("snd_ssthresh: %lu snd_bandwidth: %lu snd_recover: "
+ "0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth,
+ tp->snd_recover);
+
+ db_print_indent(indent);
+	db_printf("t_maxopd: %u t_rcvtime: %u t_starttime: %u\n",
+ tp->t_maxopd, tp->t_rcvtime, tp->t_starttime);
+
+ db_print_indent(indent);
+	db_printf("t_rtttime: %u t_rtseq: 0x%08x t_bw_rtttime: %u\n",
+ tp->t_rtttime, tp->t_rtseq, tp->t_bw_rtttime);
+
+ db_print_indent(indent);
+ db_printf("t_bw_rtseq: 0x%08x t_rxtcur: %d t_maxseg: %u "
+ "t_srtt: %d\n", tp->t_bw_rtseq, tp->t_rxtcur, tp->t_maxseg,
+ tp->t_srtt);
+
+ db_print_indent(indent);
+ db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u "
+ "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin,
+ tp->t_rttbest);
+
+ db_print_indent(indent);
+ db_printf("t_rttupdated: %lu max_sndwnd: %lu t_softerror: %d\n",
+ tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror);
+
+ db_print_indent(indent);
+ db_printf("t_oobflags: 0x%x (", tp->t_oobflags);
+ db_print_toobflags(tp->t_oobflags);
+ db_printf(") t_iobc: 0x%02x\n", tp->t_iobc);
+
+ db_print_indent(indent);
+ db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n",
+ tp->snd_scale, tp->rcv_scale, tp->request_r_scale);
+
+ db_print_indent(indent);
+ db_printf("ts_recent: %u ts_recent_age: %u\n",
+ tp->ts_recent, tp->ts_recent_age);
+
+ db_print_indent(indent);
+ db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: "
+ "%lu\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);
+
+ db_print_indent(indent);
+ db_printf("snd_ssthresh_prev: %lu snd_recover_prev: 0x%08x "
+ "t_badrxtwin: %u\n", tp->snd_ssthresh_prev,
+ tp->snd_recover_prev, tp->t_badrxtwin);
+
+ db_print_indent(indent);
+ db_printf("snd_numholes: %d snd_holes first: %p\n",
+ tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes));
+
+ db_print_indent(indent);
+ db_printf("snd_fack: 0x%08x rcv_numsacks: %d sack_newdata: "
+ "0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata);
+
+ /* Skip sackblks, sackhint. */
+
+ db_print_indent(indent);
+ db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n",
+ tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt);
+}
+
+DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
+{
+ struct tcpcb *tp;
+
+ if (!have_addr) {
+ db_printf("usage: show tcpcb <addr>\n");
+ return;
+ }
+ tp = (struct tcpcb *)addr;
+
+ db_print_tcpcb(tp, "tcpcb", 0);
+}
+#endif
diff --git a/freebsd/sys/netinet/tcp_var.h b/freebsd/sys/netinet/tcp_var.h
new file mode 100644
index 00000000..77586144
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_var.h
@@ -0,0 +1,687 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_VAR_HH_
+#define _NETINET_TCP_VAR_HH_
+
+#include <freebsd/netinet/tcp.h>
+
+#ifdef _KERNEL
+#include <freebsd/net/vnet.h>
+
+/*
+ * Kernel variables for tcp.
+ */
+VNET_DECLARE(int, tcp_do_rfc1323);
+#define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323)
+
+#endif /* _KERNEL */
+
+/* TCP segment queue entry */
+struct tseg_qent {
+ LIST_ENTRY(tseg_qent) tqe_q;
+ int tqe_len; /* TCP segment data length */
+ struct tcphdr *tqe_th; /* a pointer to tcp header */
+ struct mbuf *tqe_m; /* mbuf contains packet */
+};
+LIST_HEAD(tsegqe_head, tseg_qent);
+
+struct sackblk {
+ tcp_seq start; /* start seq no. of sack block */
+ tcp_seq end; /* end seq no. */
+};
+
+struct sackhole {
+ tcp_seq start; /* start seq no. of hole */
+ tcp_seq end; /* end seq no. */
+ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */
+ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */
+};
+
+struct sackhint {
+ struct sackhole *nexthole;
+ int sack_bytes_rexmit;
+
+ int ispare; /* explicit pad for 64bit alignment */
+ uint64_t _pad[2]; /* 1 sacked_bytes, 1 TBD */
+};
+
+struct tcptemp {
+	u_char	tt_ipgen[40]; /* must be the size of the max IP header, now IPv6 */
+ struct tcphdr tt_t;
+};
+
+#define tcp6cb tcpcb /* for KAME src sync over BSD*'s */
+
+/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
+#ifdef INET6
+#define ND6_HINT(tp) \
+do { \
+ if ((tp) && (tp)->t_inpcb && \
+ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
+ nd6_nud_hint(NULL, NULL, 0); \
+} while (0)
+#else
+#define ND6_HINT(tp)
+#endif
+
+/*
+ * Tcp control block, one per tcp; fields:
+ * Organized for 16 byte cacheline efficiency.
+ */
+struct tcpcb {
+ struct tsegqe_head t_segq; /* segment reassembly queue */
+ void *t_pspare[2]; /* new reassembly queue */
+ int t_segqlen; /* segment reassembly queue length */
+ int t_dupacks; /* consecutive dup acks recd */
+
+ struct tcp_timer *t_timers; /* All the TCP timers in one struct */
+
+ struct inpcb *t_inpcb; /* back pointer to internet pcb */
+ int t_state; /* state of this connection */
+ u_int t_flags;
+
+ struct vnet *t_vnet; /* back pointer to parent vnet */
+
+ tcp_seq snd_una; /* send unacknowledged */
+ tcp_seq snd_max; /* highest sequence number sent;
+ * used to recognize retransmits
+ */
+ tcp_seq snd_nxt; /* send next */
+ tcp_seq snd_up; /* send urgent pointer */
+
+ tcp_seq snd_wl1; /* window update seg seq number */
+ tcp_seq snd_wl2; /* window update seg ack number */
+ tcp_seq iss; /* initial send sequence number */
+ tcp_seq irs; /* initial receive sequence number */
+
+ tcp_seq rcv_nxt; /* receive next */
+ tcp_seq rcv_adv; /* advertised window */
+ u_long rcv_wnd; /* receive window */
+ tcp_seq rcv_up; /* receive urgent pointer */
+
+ u_long snd_wnd; /* send window */
+ u_long snd_cwnd; /* congestion-controlled window */
+ u_long snd_bwnd; /* bandwidth-controlled window */
+ u_long snd_ssthresh; /* snd_cwnd size threshold for
+					 * slow start exponential to
+ * linear switch
+ */
+ u_long snd_bandwidth; /* calculated bandwidth or 0 */
+ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */
+
+ u_int t_maxopd; /* mss plus options */
+
+ u_int t_rcvtime; /* inactivity time */
+ u_int t_starttime; /* time connection was established */
+ u_int t_rtttime; /* RTT measurement start time */
+ tcp_seq t_rtseq; /* sequence number being timed */
+
+ u_int t_bw_rtttime; /* used for bandwidth calculation */
+ tcp_seq t_bw_rtseq; /* used for bandwidth calculation */
+
+ int t_rxtcur; /* current retransmit value (ticks) */
+ u_int t_maxseg; /* maximum segment size */
+ int t_srtt; /* smoothed round-trip time */
+ int t_rttvar; /* variance in round-trip time */
+
+ int t_rxtshift; /* log(2) of rexmt exp. backoff */
+ u_int t_rttmin; /* minimum rtt allowed */
+ u_int t_rttbest; /* best rtt we've seen */
+ u_long t_rttupdated; /* number of times rtt sampled */
+ u_long max_sndwnd; /* largest window peer has offered */
+
+ int t_softerror; /* possible error not yet reported */
+/* out-of-band data */
+ char t_oobflags; /* have some */
+ char t_iobc; /* input character */
+/* RFC 1323 variables */
+ u_char snd_scale; /* window scaling for send window */
+ u_char rcv_scale; /* window scaling for recv window */
+ u_char request_r_scale; /* pending window scaling */
+ u_int32_t ts_recent; /* timestamp echo data */
+ u_int ts_recent_age; /* when last updated */
+ u_int32_t ts_offset; /* our timestamp offset */
+
+ tcp_seq last_ack_sent;
+/* experimental */
+ u_long snd_cwnd_prev; /* cwnd prior to retransmit */
+ u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */
+ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */
+ u_int t_badrxtwin; /* window for retransmit recovery */
+ u_char snd_limited; /* segments limited transmitted */
+/* SACK related state */
+ int snd_numholes; /* number of holes seen by sender */
+ TAILQ_HEAD(sackhole_head, sackhole) snd_holes;
+ /* SACK scoreboard (sorted) */
+ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/
+ int rcv_numsacks; /* # distinct sack blks present */
+ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
+ tcp_seq sack_newdata; /* New data xmitted in this recovery
+ episode starts at this seq number */
+ struct sackhint sackhint; /* SACK scoreboard hint */
+	int	t_rttlow;		/* smallest observed RTT */
+ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
+ int rfbuf_cnt; /* recv buffer autoscaling byte count */
+ struct toe_usrreqs *t_tu; /* offload operations vector */
+ void *t_toe; /* TOE pcb pointer */
+ int t_bytes_acked; /* # bytes acked during current RTT */
+
+ int t_ispare; /* explicit pad for 64bit alignment */
+ void *t_pspare2[6]; /* 2 CC / 4 TBD */
+ uint64_t _pad[12]; /* 7 UTO, 5 TBD (1-2 CC/RTT?) */
+};
+
+/*
+ * Flags and utility macros for the t_flags field.
+ */
+#define TF_ACKNOW 0x000001 /* ack peer immediately */
+#define TF_DELACK 0x000002 /* ack, but try to delay it */
+#define TF_NODELAY 0x000004 /* don't delay packets to coalesce */
+#define TF_NOOPT 0x000008 /* don't use tcp options */
+#define TF_SENTFIN 0x000010 /* have sent FIN */
+#define TF_REQ_SCALE 0x000020 /* have/will request window scaling */
+#define TF_RCVD_SCALE 0x000040 /* other side has requested scaling */
+#define TF_REQ_TSTMP 0x000080 /* have/will request timestamps */
+#define TF_RCVD_TSTMP 0x000100 /* a timestamp was received in SYN */
+#define TF_SACK_PERMIT 0x000200 /* other side said I could SACK */
+#define TF_NEEDSYN 0x000400 /* send SYN (implicit state) */
+#define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */
+#define TF_NOPUSH 0x001000 /* don't push */
+#define TF_MORETOCOME 0x010000 /* More data to be appended to sock */
+#define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */
+#define TF_LASTIDLE 0x040000 /* connection was previously idle */
+#define TF_RXWIN0SENT 0x080000 /* sent a receiver win 0 in response */
+#define TF_FASTRECOVERY 0x100000 /* in NewReno Fast Recovery */
+#define TF_WASFRECOVERY 0x200000 /* was in NewReno Fast Recovery */
+#define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */
+#define TF_FORCEDATA 0x800000 /* force out a byte */
+#define TF_TSO 0x1000000 /* TSO enabled on this connection */
+#define TF_TOE 0x2000000 /* this connection is offloaded */
+#define TF_ECN_PERMIT 0x4000000 /* connection ECN-ready */
+#define TF_ECN_SND_CWR 0x8000000 /* ECN CWR in queue */
+#define TF_ECN_SND_ECE 0x10000000 /* ECN ECE in queue */
+
+#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
+#define ENTER_FASTRECOVERY(tp) tp->t_flags |= TF_FASTRECOVERY
+#define EXIT_FASTRECOVERY(tp) tp->t_flags &= ~TF_FASTRECOVERY
+
+/*
+ * Flags for the t_oobflags field.
+ */
+#define TCPOOB_HAVEDATA 0x01
+#define TCPOOB_HADDATA 0x02
+
+#ifdef TCP_SIGNATURE
+/*
+ * Defines which are needed by the xform_tcp module and tcp_[in|out]put
+ * for SADB verification and lookup.
+ */
+#define TCP_SIGLEN 16 /* length of computed digest in bytes */
+#define TCP_KEYLEN_MIN 1 /* minimum length of TCP-MD5 key */
+#define TCP_KEYLEN_MAX 80 /* maximum length of TCP-MD5 key */
+/*
+ * Only a single SA per host may be specified at this time. An SPI is
+ * needed in order for the KEY_ALLOCSA() lookup to work.
+ */
+#define TCP_SIG_SPI 0x1000
+#endif /* TCP_SIGNATURE */
+
+/*
+ * Structure to hold TCP options that are only used during segment
+ * processing (in tcp_input), but not held in the tcpcb.
+ * It's basically used to reduce the number of parameters
+ * to tcp_dooptions and tcp_addoptions.
+ * The binary order of the to_flags is relevant for packing of the
+ * options in tcp_addoptions.
+ */
+struct tcpopt {
+ u_int64_t to_flags; /* which options are present */
+#define TOF_MSS 0x0001 /* maximum segment size */
+#define TOF_SCALE 0x0002 /* window scaling */
+#define TOF_SACKPERM 0x0004 /* SACK permitted */
+#define TOF_TS 0x0010 /* timestamp */
+#define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */
+#define TOF_SACK 0x0080 /* Peer sent SACK option */
+#define TOF_MAXOPT 0x0100
+ u_int32_t to_tsval; /* new timestamp */
+ u_int32_t to_tsecr; /* reflected timestamp */
+ u_char *to_sacks; /* pointer to the first SACK blocks */
+ u_char *to_signature; /* pointer to the TCP-MD5 signature */
+ u_int16_t to_mss; /* maximum segment size */
+ u_int8_t to_wscale; /* window scaling */
+ u_int8_t to_nsacks; /* number of SACK blocks */
+};
+
+/*
+ * Flags for tcp_dooptions.
+ */
+#define TO_SYN 0x01 /* parse SYN-only options */
+
+struct hc_metrics_lite { /* must stay in sync with hc_metrics */
+ u_long rmx_mtu; /* MTU for this path */
+ u_long rmx_ssthresh; /* outbound gateway buffer limit */
+ u_long rmx_rtt; /* estimated round trip time */
+ u_long rmx_rttvar; /* estimated rtt variance */
+ u_long rmx_bandwidth; /* estimated bandwidth */
+ u_long rmx_cwnd; /* congestion window */
+ u_long rmx_sendpipe; /* outbound delay-bandwidth product */
+ u_long rmx_recvpipe; /* inbound delay-bandwidth product */
+};
+
+#ifndef _NETINET_IN_PCB_HH_
+struct in_conninfo;
+#endif /* _NETINET_IN_PCB_HH_ */
+
+struct tcptw {
+ struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */
+ tcp_seq snd_nxt;
+ tcp_seq rcv_nxt;
+ tcp_seq iss;
+ tcp_seq irs;
+ u_short last_win; /* cached window value */
+ u_short tw_so_options; /* copy of so_options */
+ struct ucred *tw_cred; /* user credentials */
+ u_int32_t t_recent;
+ u_int32_t ts_offset; /* our timestamp offset */
+ u_int t_starttime;
+ int tw_time;
+ TAILQ_ENTRY(tcptw) tw_2msl;
+};
+
+#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
+#define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb)
+#define sototcpcb(so) (intotcpcb(sotoinpcb(so)))
+
+/*
+ * The smoothed round-trip time and estimated variance
+ * are stored as fixed point numbers scaled by the values below.
+ * For convenience, these scales are also used in smoothing the average
+ * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed).
+ * With these scales, srtt has 3 bits to the right of the binary point,
+ * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the
+ * binary point, and is smoothed with an ALPHA of 0.75.
+ */
+#define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */
+#define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */
+#define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */
+#define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */
+#define TCP_DELTA_SHIFT 2 /* see tcp_input.c */
+
+/*
+ * The initial retransmission should happen at rtt + 4 * rttvar.
+ * Because of the way we do the smoothing, srtt and rttvar
+ * will each average +1/2 tick of bias. When we compute
+ * the retransmit timer, we want 1/2 tick of rounding and
+ * 1 extra tick because of +-1/2 tick uncertainty in the
+ * firing of the timer. The bias will give us exactly the
+ * 1.5 tick we need. But, because the bias is
+ * statistical, we have to test that we don't drop below
+ * the minimum feasible timer (which is 2 ticks).
+ * This version of the macro adapted from a paper by Lawrence
+ * Brakmo and Larry Peterson which outlines a problem caused
+ * by insufficient precision in the original implementation,
+ * which results in inappropriately large RTO values for very
+ * fast networks.
+ */
+#define TCP_REXMTVAL(tp) \
+ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \
+ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT)
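+
+/*
+ * With TCP_RTT_SHIFT == 5 and TCP_RTTVAR_SHIFT == 4 this works out to
+ * max(t_rttmin, ((t_srtt >> 3) + t_rttvar) >> 2), i.e. srtt + 4 * rttvar
+ * converted back to ticks and clamped to the minimum feasible timer.
+ */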
+
+/*
+ * TCP statistics.
+ * Many of these should be kept per connection,
+ * but that's inconvenient at the moment.
+ */
+struct tcpstat {
+ u_long tcps_connattempt; /* connections initiated */
+ u_long tcps_accepts; /* connections accepted */
+ u_long tcps_connects; /* connections established */
+ u_long tcps_drops; /* connections dropped */
+ u_long tcps_conndrops; /* embryonic connections dropped */
+ u_long tcps_minmssdrops; /* average minmss too low drops */
+ u_long tcps_closed; /* conn. closed (includes drops) */
+ u_long tcps_segstimed; /* segs where we tried to get rtt */
+ u_long tcps_rttupdated; /* times we succeeded */
+ u_long tcps_delack; /* delayed acks sent */
+ u_long tcps_timeoutdrop; /* conn. dropped in rxmt timeout */
+ u_long tcps_rexmttimeo; /* retransmit timeouts */
+ u_long tcps_persisttimeo; /* persist timeouts */
+ u_long tcps_keeptimeo; /* keepalive timeouts */
+ u_long tcps_keepprobe; /* keepalive probes sent */
+ u_long tcps_keepdrops; /* connections dropped in keepalive */
+
+ u_long tcps_sndtotal; /* total packets sent */
+ u_long tcps_sndpack; /* data packets sent */
+ u_long tcps_sndbyte; /* data bytes sent */
+ u_long tcps_sndrexmitpack; /* data packets retransmitted */
+ u_long tcps_sndrexmitbyte; /* data bytes retransmitted */
+ u_long tcps_sndrexmitbad; /* unnecessary packet retransmissions */
+ u_long tcps_sndacks; /* ack-only packets sent */
+ u_long tcps_sndprobe; /* window probes sent */
+ u_long tcps_sndurg; /* packets sent with URG only */
+ u_long tcps_sndwinup; /* window update-only packets sent */
+ u_long tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */
+
+ u_long tcps_rcvtotal; /* total packets received */
+ u_long tcps_rcvpack; /* packets received in sequence */
+ u_long tcps_rcvbyte; /* bytes received in sequence */
+	u_long	tcps_rcvbadsum;		/* packets received with cksum errs */
+ u_long tcps_rcvbadoff; /* packets received with bad offset */
+ u_long tcps_rcvmemdrop; /* packets dropped for lack of memory */
+ u_long tcps_rcvshort; /* packets received too short */
+ u_long tcps_rcvduppack; /* duplicate-only packets received */
+ u_long tcps_rcvdupbyte; /* duplicate-only bytes received */
+ u_long tcps_rcvpartduppack; /* packets with some duplicate data */
+ u_long tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */
+ u_long tcps_rcvoopack; /* out-of-order packets received */
+ u_long tcps_rcvoobyte; /* out-of-order bytes received */
+ u_long tcps_rcvpackafterwin; /* packets with data after window */
+ u_long tcps_rcvbyteafterwin; /* bytes rcvd after window */
+ u_long tcps_rcvafterclose; /* packets rcvd after "close" */
+ u_long tcps_rcvwinprobe; /* rcvd window probe packets */
+ u_long tcps_rcvdupack; /* rcvd duplicate acks */
+ u_long tcps_rcvacktoomuch; /* rcvd acks for unsent data */
+ u_long tcps_rcvackpack; /* rcvd ack packets */
+ u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */
+ u_long tcps_rcvwinupd; /* rcvd window update packets */
+ u_long tcps_pawsdrop; /* segments dropped due to PAWS */
+ u_long tcps_predack; /* times hdr predict ok for acks */
+ u_long tcps_preddat; /* times hdr predict ok for data pkts */
+ u_long tcps_pcbcachemiss;
+ u_long tcps_cachedrtt; /* times cached RTT in route updated */
+ u_long tcps_cachedrttvar; /* times cached rttvar updated */
+ u_long tcps_cachedssthresh; /* times cached ssthresh updated */
+ u_long tcps_usedrtt; /* times RTT initialized from route */
+ u_long tcps_usedrttvar; /* times RTTVAR initialized from rt */
+ u_long tcps_usedssthresh; /* times ssthresh initialized from rt*/
+ u_long tcps_persistdrop; /* timeout in persist state */
+ u_long tcps_badsyn; /* bogus SYN, e.g. premature ACK */
+ u_long tcps_mturesent; /* resends due to MTU discovery */
+ u_long tcps_listendrop; /* listen queue overflows */
+ u_long tcps_badrst; /* ignored RSTs in the window */
+
+ u_long tcps_sc_added; /* entry added to syncache */
+ u_long tcps_sc_retransmitted; /* syncache entry was retransmitted */
+ u_long tcps_sc_dupsyn; /* duplicate SYN packet */
+ u_long tcps_sc_dropped; /* could not reply to packet */
+ u_long tcps_sc_completed; /* successful extraction of entry */
+ u_long tcps_sc_bucketoverflow; /* syncache per-bucket limit hit */
+ u_long tcps_sc_cacheoverflow; /* syncache cache limit hit */
+ u_long tcps_sc_reset; /* RST removed entry from syncache */
+ u_long tcps_sc_stale; /* timed out or listen socket gone */
+ u_long tcps_sc_aborted; /* syncache entry aborted */
+ u_long tcps_sc_badack; /* removed due to bad ACK */
+ u_long tcps_sc_unreach; /* ICMP unreachable received */
+ u_long tcps_sc_zonefail; /* zalloc() failed */
+ u_long tcps_sc_sendcookie; /* SYN cookie sent */
+ u_long tcps_sc_recvcookie; /* SYN cookie received */
+
+ u_long tcps_hc_added; /* entry added to hostcache */
+ u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */
+
+ u_long tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */
+
+ /* SACK related stats */
+ u_long tcps_sack_recovery_episode; /* SACK recovery episodes */
+ u_long tcps_sack_rexmits; /* SACK rexmit segments */
+ u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */
+ u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */
+ u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */
+ u_long tcps_sack_sboverflow; /* times scoreboard overflowed */
+
+ /* ECN related stats */
+ u_long tcps_ecn_ce; /* ECN Congestion Experienced */
+ u_long tcps_ecn_ect0; /* ECN Capable Transport */
+ u_long tcps_ecn_ect1; /* ECN Capable Transport */
+ u_long tcps_ecn_shs; /* ECN successful handshakes */
+ u_long tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */
+
+ u_long _pad[12]; /* 6 UTO, 6 TBD */
+};
+
+#ifdef _KERNEL
+/*
+ * In-kernel consumers can use these accessor macros directly to update
+ * stats.
+ */
+#define TCPSTAT_ADD(name, val) V_tcpstat.name += (val)
+#define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1)
+
+/*
+ * Kernel module consumers must use this accessor macro.
+ */
+void kmod_tcpstat_inc(int statnum);
+#define KMOD_TCPSTAT_INC(name) \
+ kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(u_long))
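+
+/*
+ * For example, KMOD_TCPSTAT_INC(tcps_rcvtotal) becomes
+ * kmod_tcpstat_inc(offsetof(struct tcpstat, tcps_rcvtotal) / sizeof(u_long)),
+ * so modules address the counter by index rather than by absolute location.
+ */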
+#endif
+
+/*
+ * TCB structure exported to user-land via sysctl(3).
+ * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been
+ * included. Not all of our clients do.
+ */
+#if defined(_NETINET_IN_PCB_HH_) && defined(_SYS_SOCKETVAR_HH_)
+struct xtcpcb {
+ size_t xt_len;
+ struct inpcb xt_inp;
+ struct tcpcb xt_tp;
+ struct xsocket xt_socket;
+ u_quad_t xt_alignment_hack;
+};
+#endif
+
+/*
+ * Names for TCP sysctl objects
+ */
+#define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */
+#define TCPCTL_MSSDFLT 3 /* MSS default */
+#define TCPCTL_STATS 4 /* statistics (read-only) */
+#define TCPCTL_RTTDFLT 5 /* default RTT estimate */
+#define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */
+#define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */
+#define TCPCTL_SENDSPACE 8 /* send buffer space */
+#define TCPCTL_RECVSPACE 9 /* receive buffer space */
+#define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */
+#define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */
+#define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */
+#define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */
+#define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */
+#define TCPCTL_DROP 15 /* drop tcp connection */
+#define TCPCTL_MAXID 16
+#define TCPCTL_FINWAIT2_TIMEOUT 17
+
+#define TCPCTL_NAMES { \
+ { 0, 0 }, \
+ { "rfc1323", CTLTYPE_INT }, \
+ { "mssdflt", CTLTYPE_INT }, \
+ { "stats", CTLTYPE_STRUCT }, \
+ { "rttdflt", CTLTYPE_INT }, \
+ { "keepidle", CTLTYPE_INT }, \
+ { "keepintvl", CTLTYPE_INT }, \
+ { "sendspace", CTLTYPE_INT }, \
+ { "recvspace", CTLTYPE_INT }, \
+ { "keepinit", CTLTYPE_INT }, \
+ { "pcblist", CTLTYPE_STRUCT }, \
+ { "delacktime", CTLTYPE_INT }, \
+ { "v6mssdflt", CTLTYPE_INT }, \
+ { "maxid", CTLTYPE_INT }, \
+}
+
+#ifdef _KERNEL
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_net_inet_tcp);
+SYSCTL_DECL(_net_inet_tcp_sack);
+MALLOC_DECLARE(M_TCPLOG);
+#endif
+
+VNET_DECLARE(struct inpcbhead, tcb); /* queue of active tcpcb's */
+VNET_DECLARE(struct inpcbinfo, tcbinfo);
+VNET_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */
+extern int tcp_log_in_vain;
+VNET_DECLARE(int, tcp_mssdflt); /* XXX */
+VNET_DECLARE(int, tcp_minmss);
+VNET_DECLARE(int, tcp_delack_enabled);
+VNET_DECLARE(int, tcp_do_rfc3390);
+VNET_DECLARE(int, tcp_do_newreno);
+VNET_DECLARE(int, path_mtu_discovery);
+VNET_DECLARE(int, ss_fltsz);
+VNET_DECLARE(int, ss_fltsz_local);
+#define V_tcb VNET(tcb)
+#define V_tcbinfo VNET(tcbinfo)
+#define V_tcpstat VNET(tcpstat)
+#define V_tcp_mssdflt VNET(tcp_mssdflt)
+#define V_tcp_minmss VNET(tcp_minmss)
+#define V_tcp_delack_enabled VNET(tcp_delack_enabled)
+#define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390)
+#define V_tcp_do_newreno VNET(tcp_do_newreno)
+#define V_path_mtu_discovery VNET(path_mtu_discovery)
+#define V_ss_fltsz VNET(ss_fltsz)
+#define V_ss_fltsz_local VNET(ss_fltsz_local)
+
+VNET_DECLARE(int, tcp_do_sack); /* SACK enabled/disabled */
+VNET_DECLARE(int, tcp_sc_rst_sock_fail); /* RST on sock alloc failure */
+#define V_tcp_do_sack VNET(tcp_do_sack)
+#define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail)
+
+VNET_DECLARE(int, tcp_do_ecn); /* TCP ECN enabled/disabled */
+VNET_DECLARE(int, tcp_ecn_maxretries);
+#define V_tcp_do_ecn VNET(tcp_do_ecn)
+#define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries)
+
+int tcp_addoptions(struct tcpopt *, u_char *);
+struct tcpcb *
+ tcp_close(struct tcpcb *);
+void tcp_discardcb(struct tcpcb *);
+void tcp_twstart(struct tcpcb *);
+#if 0
+int tcp_twrecycleable(struct tcptw *tw);
+#endif
+void tcp_twclose(struct tcptw *_tw, int _reuse);
+void tcp_ctlinput(int, struct sockaddr *, void *);
+int tcp_ctloutput(struct socket *, struct sockopt *);
+#ifndef __rtems__
+struct tcpcb *
+ tcp_drop(struct tcpcb *, int);
+#else
+struct tcpcb *
+tcp_drop(struct tcpcb *tp, int errno);
+#endif
+void tcp_drain(void);
+void tcp_init(void);
+#ifdef VIMAGE
+void tcp_destroy(void);
+#endif
+void tcp_fini(void *);
+char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *,
+ const void *);
+char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
+ const void *);
+int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
+void tcp_reass_init(void);
+void tcp_reass_flush(struct tcpcb *);
+#ifdef VIMAGE
+void tcp_reass_destroy(void);
+#endif
+void tcp_input(struct mbuf *, int);
+u_long tcp_maxmtu(struct in_conninfo *, int *);
+u_long tcp_maxmtu6(struct in_conninfo *, int *);
+void tcp_mss_update(struct tcpcb *, int, struct hc_metrics_lite *, int *);
+void tcp_mss(struct tcpcb *, int);
+int tcp_mssopt(struct in_conninfo *);
+#ifndef __rtems__
+struct inpcb *
+ tcp_drop_syn_sent(struct inpcb *, int);
+struct inpcb *
+ tcp_mtudisc(struct inpcb *, int);
+#else
+struct inpcb *
+tcp_drop_syn_sent(struct inpcb *inp, int errno);
+struct inpcb *
+tcp_mtudisc(struct inpcb *inp, int errno);
+#endif
+struct tcpcb *
+ tcp_newtcpcb(struct inpcb *);
+int tcp_output(struct tcpcb *);
+void tcp_respond(struct tcpcb *, void *,
+ struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int);
+void tcp_tw_init(void);
+#ifdef VIMAGE
+void tcp_tw_destroy(void);
+#endif
+void tcp_tw_zone_change(void);
+int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *,
+ struct mbuf *, int);
+int tcp_twrespond(struct tcptw *, int);
+void tcp_setpersist(struct tcpcb *);
+#ifdef TCP_SIGNATURE
+int tcp_signature_compute(struct mbuf *, int, int, int, u_char *, u_int);
+#endif
+void tcp_slowtimo(void);
+struct tcptemp *
+ tcpip_maketemplate(struct inpcb *);
+void tcpip_fillheaders(struct inpcb *, void *, void *);
+void tcp_timer_activate(struct tcpcb *, int, u_int);
+int tcp_timer_active(struct tcpcb *, int);
+void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
+void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq);
+/*
+ * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
+ */
+void tcp_hc_init(void);
+#ifdef VIMAGE
+void tcp_hc_destroy(void);
+#endif
+void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *);
+u_long tcp_hc_getmtu(struct in_conninfo *);
+void tcp_hc_updatemtu(struct in_conninfo *, u_long);
+void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *);
+
+extern struct pr_usrreqs tcp_usrreqs;
+extern u_long tcp_sendspace;
+extern u_long tcp_recvspace;
+tcp_seq tcp_new_isn(struct tcpcb *);
+
+void tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
+void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend);
+void tcp_clean_sackreport(struct tcpcb *tp);
+void tcp_sack_adjust(struct tcpcb *tp);
+struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
+void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
+void tcp_free_sackholes(struct tcpcb *tp);
+int tcp_newreno(struct tcpcb *, struct tcphdr *);
+u_long tcp_seq_subtract(u_long, u_long );
+
+#endif /* _KERNEL */
+
+#endif /* _NETINET_TCP_VAR_HH_ */
diff --git a/freebsd/sys/netinet/tcpip.h b/freebsd/sys/netinet/tcpip.h
new file mode 100644
index 00000000..337c07a6
--- /dev/null
+++ b/freebsd/sys/netinet/tcpip.h
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcpip.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCPIP_HH_
+#define _NETINET_TCPIP_HH_
+
+/*
+ * Tcp+ip header, after ip options removed.
+ */
+struct tcpiphdr {
+ struct ipovly ti_i; /* overlaid ip structure */
+ struct tcphdr ti_t; /* tcp header */
+};
+#define ti_x1 ti_i.ih_x1
+#define ti_pr ti_i.ih_pr
+#define ti_len ti_i.ih_len
+#define ti_src ti_i.ih_src
+#define ti_dst ti_i.ih_dst
+#define ti_sport ti_t.th_sport
+#define ti_dport ti_t.th_dport
+#define ti_seq ti_t.th_seq
+#define ti_ack ti_t.th_ack
+#define ti_x2 ti_t.th_x2
+#define ti_off ti_t.th_off
+#define ti_flags ti_t.th_flags
+#define ti_win ti_t.th_win
+#define ti_sum ti_t.th_sum
+#define ti_urp ti_t.th_urp
+
+#endif
diff --git a/freebsd/sys/netinet/toedev.h b/freebsd/sys/netinet/toedev.h
new file mode 100644
index 00000000..4623845c
--- /dev/null
+++ b/freebsd/sys/netinet/toedev.h
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 2007, Chelsio Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of the Chelsio Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TOEDEV_HH_
+#define _NETINET_TOEDEV_HH_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+extern uint32_t toedev_registration_count;
+
+/* Parameter values for offload_get_phys_egress(). */
+enum {
+ TOE_OPEN,
+ TOE_FAILOVER,
+};
+
+/* Parameter values for toe_failover(). */
+enum {
+ TOE_ACTIVE_SLAVE,
+ TOE_LINK_DOWN,
+ TOE_LINK_UP,
+ TOE_RELEASE,
+ TOE_RELEASE_ALL,
+};
+
+#define TOENAMSIZ 16
+
+/* Get the toedev associated with a ifnet. */
+#define TOEDEV(ifp) ((ifp)->if_llsoftc)
+
+struct offload_id {
+ unsigned int id;
+ unsigned long data;
+};
+
+struct ifnet;
+struct rt_entry;
+struct tom_info;
+struct sysctl_oid;
+struct socket;
+struct mbuf;
+
+struct toedev {
+ TAILQ_ENTRY(toedev) entry;
+ char tod_name[TOENAMSIZ]; /* TOE device name */
+ unsigned int tod_ttid; /* TOE type id */
+ unsigned long tod_flags; /* device flags */
+ unsigned int tod_mtu; /* max TX offloaded data */
+ unsigned int tod_nconn; /* max # of offloaded
+ * connections
+ */
+ struct ifnet *tod_lldev; /* first interface */
+ const struct tom_info *tod_offload_mod; /* TCP offload module */
+
+ /*
+ * This TOE device is capable of offloading the connection for socket so
+	 * Check whether this TOE device can offload the connection for socket so.
+ int (*tod_can_offload)(struct toedev *dev, struct socket *so);
+
+ /*
+ * Establish a connection to nam using the TOE device dev
+ */
+ int (*tod_connect)(struct toedev *dev, struct socket *so,
+ struct rtentry *rt, struct sockaddr *nam);
+ /*
+ * Send an mbuf down to the toe device
+ */
+ int (*tod_send)(struct toedev *dev, struct mbuf *m);
+ /*
+ * Receive an array of mbufs from the TOE device dev
+ */
+ int (*tod_recv)(struct toedev *dev, struct mbuf **m, int n);
+ /*
+ * Device specific ioctl interface
+ */
+ int (*tod_ctl)(struct toedev *dev, unsigned int req, void *data);
+ /*
+ * Update L2 entry in toedev
+ */
+ void (*tod_arp_update)(struct toedev *dev, struct rtentry *neigh);
+ /*
+ * Failover from one toe device to another
+ */
+ void (*tod_failover)(struct toedev *dev, struct ifnet *bond_ifp,
+ struct ifnet *ndev, int event);
+ void *tod_priv; /* driver private data */
+ void *tod_l2opt; /* optional layer 2 data */
+ void *tod_l3opt; /* optional layer 3 data */
+ void *tod_l4opt; /* optional layer 4 data */
+	void *tod_ulp;		/* upper level protocol */
+};
+
+struct tom_info {
+ TAILQ_ENTRY(tom_info) entry;
+ int (*ti_attach)(struct toedev *dev,
+ const struct offload_id *entry);
+ int (*ti_detach)(struct toedev *dev);
+ const char *ti_name;
+ const struct offload_id *ti_id_table;
+};
+
+static __inline void
+init_offload_dev(struct toedev *dev)
+{
+}
+
+int register_tom(struct tom_info *t);
+int unregister_tom(struct tom_info *t);
+int register_toedev(struct toedev *dev, const char *name);
+int unregister_toedev(struct toedev *dev);
+int activate_offload(struct toedev *dev);
+int toe_send(struct toedev *dev, struct mbuf *m);
+void toe_arp_update(struct rtentry *rt);
+struct ifnet *offload_get_phys_egress(struct ifnet *ifp,
+ struct socket *so, int context);
+int toe_receive_mbuf(struct toedev *dev, struct mbuf **m, int n);
+
+static __inline void
+toe_neigh_update(struct ifnet *ifp)
+{
+}
+
+static __inline void
+toe_failover(struct ifnet *bond_ifp, struct ifnet *fail_ifp, int event)
+{
+}
+
+static __inline int
+toe_enslave(struct ifnet *bond_ifp, struct ifnet *slave_ifp)
+{
+ return (0);
+}
+
+#endif /* _NETINET_TOEDEV_HH_ */
diff --git a/freebsd/sys/netinet/udp.h b/freebsd/sys/netinet/udp.h
new file mode 100644
index 00000000..e7010ac5
--- /dev/null
+++ b/freebsd/sys/netinet/udp.h
@@ -0,0 +1,2 @@
+#include <freebsd/bsd.h>
+#include <freebsd/netinet/udp.h>
diff --git a/freebsd/sys/netinet/udp_usrreq.c b/freebsd/sys/netinet/udp_usrreq.c
new file mode 100644
index 00000000..f992f5f6
--- /dev/null
+++ b/freebsd/sys/netinet/udp_usrreq.c
@@ -0,0 +1,1633 @@
+#include <freebsd/machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California.
+ * Copyright (c) 2008 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95
+ */
+
+#include <freebsd/sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <freebsd/local/opt_ipfw.h>
+#include <freebsd/local/opt_inet6.h>
+#include <freebsd/local/opt_ipsec.h>
+
+#include <freebsd/sys/param.h>
+#include <freebsd/sys/domain.h>
+#include <freebsd/sys/eventhandler.h>
+#include <freebsd/sys/jail.h>
+#include <freebsd/sys/kernel.h>
+#include <freebsd/sys/lock.h>
+#include <freebsd/sys/malloc.h>
+#include <freebsd/sys/mbuf.h>
+#include <freebsd/sys/priv.h>
+#include <freebsd/sys/proc.h>
+#include <freebsd/sys/protosw.h>
+#include <freebsd/sys/signalvar.h>
+#include <freebsd/sys/socket.h>
+#include <freebsd/sys/socketvar.h>
+#include <freebsd/sys/sx.h>
+#include <freebsd/sys/sysctl.h>
+#include <freebsd/sys/syslog.h>
+#include <freebsd/sys/systm.h>
+
+#include <freebsd/vm/uma.h>
+
+#include <freebsd/net/if.h>
+#include <freebsd/net/route.h>
+
+#include <freebsd/netinet/in.h>
+#include <freebsd/netinet/in_pcb.h>
+#include <freebsd/netinet/in_systm.h>
+#include <freebsd/netinet/in_var.h>
+#include <freebsd/netinet/ip.h>
+#ifdef INET6
+#include <freebsd/netinet/ip6.h>
+#endif
+#include <freebsd/netinet/ip_icmp.h>
+#include <freebsd/netinet/icmp_var.h>
+#include <freebsd/netinet/ip_var.h>
+#include <freebsd/netinet/ip_options.h>
+#ifdef INET6
+#include <freebsd/netinet6/ip6_var.h>
+#endif
+#include <freebsd/netinet/udp.h>
+#include <freebsd/netinet/udp_var.h>
+
+#ifdef IPSEC
+#include <freebsd/netipsec/ipsec.h>
+#include <freebsd/netipsec/esp.h>
+#endif
+
+#include <freebsd/machine/in_cksum.h>
+
+#include <freebsd/security/mac/mac_framework.h>
+
+/*
+ * UDP protocol implementation.
+ * Per RFC 768, August, 1980.
+ */
+
+/*
+ * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
+ * removes the only data integrity mechanism for packets; malformed packets
+ * that would otherwise be discarded due to bad checksums are then delivered
+ * and may cause problems (especially for NFS data blocks).
+ */
+static int udp_cksum = 1;
+SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, &udp_cksum,
+ 0, "compute udp checksum");
+
+int udp_log_in_vain = 0;
+SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
+ &udp_log_in_vain, 0, "Log all incoming UDP packets");
+
+VNET_DEFINE(int, udp_blackhole) = 0;
+SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
+ &VNET_NAME(udp_blackhole), 0,
+ "Do not send port unreachables for refused connects");
+
+u_long udp_sendspace = 9216; /* really max datagram size */
+SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
+ &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
+
+u_long	udp_recvspace = 40 * (1024 +	/* 40 1K datagrams */
+#ifdef INET6
+ sizeof(struct sockaddr_in6)
+#else
+ sizeof(struct sockaddr_in)
+#endif
+ );
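+
+/*
+ * The sizeof(sockaddr_in{,6}) term accounts for the source address that
+ * udp_append() queues in the receive buffer alongside each datagram.
+ */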
+
+SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
+ &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
+
+VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */
+VNET_DEFINE(struct inpcbinfo, udbinfo);
+static VNET_DEFINE(uma_zone_t, udpcb_zone);
+#define V_udpcb_zone VNET(udpcb_zone)
+
+#ifndef UDBHASHSIZE
+#define UDBHASHSIZE 128
+#endif
+
+VNET_DEFINE(struct udpstat, udpstat); /* from udp_var.h */
+SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW,
+ &VNET_NAME(udpstat), udpstat,
+ "UDP statistics (struct udpstat, netinet/udp_var.h)");
+
+static void udp_detach(struct socket *so);
+static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
+ struct mbuf *, struct thread *);
+#ifdef IPSEC
+#ifdef IPSEC_NAT_T
+#define UF_ESPINUDP_ALL (UF_ESPINUDP_NON_IKE|UF_ESPINUDP)
+#ifdef INET
+static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int);
+#endif
+#endif /* IPSEC_NAT_T */
+#endif /* IPSEC */
+
+static void
+udp_zone_change(void *tag)
+{
+
+ uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
+ uma_zone_set_max(V_udpcb_zone, maxsockets);
+}
+
+static int
+udp_inpcb_init(void *mem, int size, int flags)
+{
+ struct inpcb *inp;
+
+ inp = mem;
+ INP_LOCK_INIT(inp, "inp", "udpinp");
+ return (0);
+}
+
+void
+udp_init(void)
+{
+
+ INP_INFO_LOCK_INIT(&V_udbinfo, "udp");
+ LIST_INIT(&V_udb);
+#ifdef VIMAGE
+ V_udbinfo.ipi_vnet = curvnet;
+#endif
+ V_udbinfo.ipi_listhead = &V_udb;
+ V_udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB,
+ &V_udbinfo.ipi_hashmask);
+ V_udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB,
+ &V_udbinfo.ipi_porthashmask);
+ V_udbinfo.ipi_zone = uma_zcreate("udp_inpcb", sizeof(struct inpcb),
+ NULL, NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
+
+ V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(V_udpcb_zone, maxsockets);
+
+ EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
+ EVENTHANDLER_PRI_ANY);
+}
+
+/*
+ * Kernel module interface for updating udpstat. The argument is an index
+ * into udpstat treated as an array of u_long. While this encodes the
+ * general layout of udpstat into the caller, it doesn't encode its location,
+ * so that future changes to add, for example, per-CPU stats support won't
+ * cause binary compatibility problems for kernel modules.
+ */
+void
+kmod_udpstat_inc(int statnum)
+{
+
+ (*((u_long *)&V_udpstat + statnum))++;
+}
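+
+/*
+ * udp_var.h is expected to provide a KMOD_UDPSTAT_INC() wrapper, analogous
+ * to KMOD_TCPSTAT_INC() in tcp_var.h, that computes this index for module
+ * callers.
+ */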
+
+int
+udp_newudpcb(struct inpcb *inp)
+{
+ struct udpcb *up;
+
+ up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
+ if (up == NULL)
+ return (ENOBUFS);
+ inp->inp_ppcb = up;
+ return (0);
+}
+
+void
+udp_discardcb(struct udpcb *up)
+{
+
+ uma_zfree(V_udpcb_zone, up);
+}
+
+#ifdef VIMAGE
+void
+udp_destroy(void)
+{
+
+ hashdestroy(V_udbinfo.ipi_hashbase, M_PCB,
+ V_udbinfo.ipi_hashmask);
+ hashdestroy(V_udbinfo.ipi_porthashbase, M_PCB,
+ V_udbinfo.ipi_porthashmask);
+
+ uma_zdestroy(V_udpcb_zone);
+ uma_zdestroy(V_udbinfo.ipi_zone);
+ INP_INFO_LOCK_DESTROY(&V_udbinfo);
+}
+#endif
+
+/*
+ * Subroutine of udp_input(), which appends the provided mbuf chain to the
+ * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that
+ * contains the source address. If the socket ends up being an IPv6 socket,
+ * udp_append() will convert to a sockaddr_in6 before passing the address
+ * into the socket code.
+ */
+static void
+udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
+ struct sockaddr_in *udp_in)
+{
+ struct sockaddr *append_sa;
+ struct socket *so;
+ struct mbuf *opts = 0;
+#ifdef INET6
+ struct sockaddr_in6 udp_in6;
+#endif
+#ifdef IPSEC
+#ifdef IPSEC_NAT_T
+#ifdef INET
+ struct udpcb *up;
+#endif
+#endif
+#endif
+
+ INP_RLOCK_ASSERT(inp);
+
+#ifdef IPSEC
+ /* Check AH/ESP integrity. */
+ if (ipsec4_in_reject(n, inp)) {
+ m_freem(n);
+ V_ipsec4stat.in_polvio++;
+ return;
+ }
+#ifdef IPSEC_NAT_T
+#ifdef INET
+ up = intoudpcb(inp);
+ KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
+ if (up->u_flags & UF_ESPINUDP_ALL) { /* IPSec UDP encaps. */
+ n = udp4_espdecap(inp, n, off);
+ if (n == NULL) /* Consumed. */
+ return;
+ }
+#endif /* INET */
+#endif /* IPSEC_NAT_T */
+#endif /* IPSEC */
+#ifdef MAC
+ if (mac_inpcb_check_deliver(inp, n) != 0) {
+ m_freem(n);
+ return;
+ }
+#endif
+ if (inp->inp_flags & INP_CONTROLOPTS ||
+ inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6)
+ (void)ip6_savecontrol_v4(inp, n, &opts, NULL);
+ else
+#endif
+ ip_savecontrol(inp, &opts, ip, n);
+ }
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6) {
+ bzero(&udp_in6, sizeof(udp_in6));
+ udp_in6.sin6_len = sizeof(udp_in6);
+ udp_in6.sin6_family = AF_INET6;
+ in6_sin_2_v4mapsin6(udp_in, &udp_in6);
+ append_sa = (struct sockaddr *)&udp_in6;
+ } else
+#endif
+ append_sa = (struct sockaddr *)udp_in;
+ m_adj(n, off);
+
+ so = inp->inp_socket;
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ m_freem(n);
+ if (opts)
+ m_freem(opts);
+ UDPSTAT_INC(udps_fullsock);
+ } else
+ sorwakeup_locked(so);
+}
+
+void
+udp_input(struct mbuf *m, int off)
+{
+ int iphlen = off;
+ struct ip *ip;
+ struct udphdr *uh;
+ struct ifnet *ifp;
+ struct inpcb *inp;
+ struct udpcb *up;
+ int len;
+ struct ip save_ip;
+ struct sockaddr_in udp_in;
+#ifdef IPFIREWALL_FORWARD
+ struct m_tag *fwd_tag;
+#endif
+
+ ifp = m->m_pkthdr.rcvif;
+ UDPSTAT_INC(udps_ipackets);
+
+ /*
+ * Strip IP options, if any. Ideally we would skip this, make the
+ * options available to the user, and use them on returned packets,
+ * but we don't yet have a way to check the checksum with options
+ * still present.
+ */
+ if (iphlen > sizeof (struct ip)) {
+ ip_stripoptions(m, (struct mbuf *)0);
+ iphlen = sizeof(struct ip);
+ }
+
+ /*
+ * Get IP and UDP header together in first mbuf.
+ */
+ ip = mtod(m, struct ip *);
+ if (m->m_len < iphlen + sizeof(struct udphdr)) {
+ if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) {
+ UDPSTAT_INC(udps_hdrops);
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ }
+ uh = (struct udphdr *)((caddr_t)ip + iphlen);
+
+ /*
+ * Destination port of 0 is illegal, based on RFC768.
+ */
+ if (uh->uh_dport == 0)
+ goto badunlocked;
+
+ /*
+ * Construct sockaddr format source address. Stuff source address
+ * and datagram in user buffer.
+ */
+ bzero(&udp_in, sizeof(udp_in));
+ udp_in.sin_len = sizeof(udp_in);
+ udp_in.sin_family = AF_INET;
+ udp_in.sin_port = uh->uh_sport;
+ udp_in.sin_addr = ip->ip_src;
+
+ /*
+ * Make mbuf data length reflect UDP length. If not enough data to
+ * reflect UDP length, drop.
+ */
+ len = ntohs((u_short)uh->uh_ulen);
+ if (ip->ip_len != len) {
+ if (len > ip->ip_len || len < sizeof(struct udphdr)) {
+ UDPSTAT_INC(udps_badlen);
+ goto badunlocked;
+ }
+ m_adj(m, len - ip->ip_len);
+ /* ip->ip_len = len; */
+ }
+
+ /*
+ * Save a copy of the IP header in case we want to restore it for
+ * sending an ICMP error message in response.
+ */
+ if (!V_udp_blackhole)
+ save_ip = *ip;
+ else
+ memset(&save_ip, 0, sizeof(save_ip));
+
+ /*
+ * Checksum extended UDP header and data.
+ */
+ if (uh->uh_sum) {
+ u_short uh_sum;
+
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ uh_sum = m->m_pkthdr.csum_data;
+ else
+ uh_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htonl((u_short)len +
+ m->m_pkthdr.csum_data + IPPROTO_UDP));
+ uh_sum ^= 0xffff;
+ } else {
+ char b[9];
+
+ bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
+ bzero(((struct ipovly *)ip)->ih_x1, 9);
+ ((struct ipovly *)ip)->ih_len = uh->uh_ulen;
+ uh_sum = in_cksum(m, len + sizeof (struct ip));
+ bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
+ }
+ if (uh_sum) {
+ UDPSTAT_INC(udps_badsum);
+ m_freem(m);
+ return;
+ }
+ } else
+ UDPSTAT_INC(udps_nosum);
+
+#ifdef IPFIREWALL_FORWARD
+ /*
+ * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
+ */
+ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
+ if (fwd_tag != NULL) {
+ struct sockaddr_in *next_hop;
+
+ /*
+ * Do the hack.
+ */
+ next_hop = (struct sockaddr_in *)(fwd_tag + 1);
+ ip->ip_dst = next_hop->sin_addr;
+ uh->uh_dport = ntohs(next_hop->sin_port);
+
+ /*
+ * Remove the tag from the packet. We don't need it anymore.
+ */
+ m_tag_delete(m, fwd_tag);
+ }
+#endif
+
+ INP_INFO_RLOCK(&V_udbinfo);
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
+ in_broadcast(ip->ip_dst, ifp)) {
+ struct inpcb *last;
+ struct ip_moptions *imo;
+
+ last = NULL;
+ LIST_FOREACH(inp, &V_udb, inp_list) {
+ if (inp->inp_lport != uh->uh_dport)
+ continue;
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_laddr.s_addr != INADDR_ANY &&
+ inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
+ continue;
+ if (inp->inp_faddr.s_addr != INADDR_ANY &&
+ inp->inp_faddr.s_addr != ip->ip_src.s_addr)
+ continue;
+ if (inp->inp_fport != 0 &&
+ inp->inp_fport != uh->uh_sport)
+ continue;
+
+ INP_RLOCK(inp);
+
+ /*
+ * Handle socket delivery policy for any-source
+ * and source-specific multicast. [RFC3678]
+ */
+ imo = inp->inp_moptions;
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
+ imo != NULL) {
+ struct sockaddr_in group;
+ int blocked;
+
+ bzero(&group, sizeof(struct sockaddr_in));
+ group.sin_len = sizeof(struct sockaddr_in);
+ group.sin_family = AF_INET;
+ group.sin_addr = ip->ip_dst;
+
+ blocked = imo_multi_filter(imo, ifp,
+ (struct sockaddr *)&group,
+ (struct sockaddr *)&udp_in);
+ if (blocked != MCAST_PASS) {
+ if (blocked == MCAST_NOTGMEMBER)
+ IPSTAT_INC(ips_notmember);
+ if (blocked == MCAST_NOTSMEMBER ||
+ blocked == MCAST_MUTED)
+ UDPSTAT_INC(udps_filtermcast);
+ INP_RUNLOCK(inp);
+ continue;
+ }
+ }
+ if (last != NULL) {
+ struct mbuf *n;
+
+ n = m_copy(m, 0, M_COPYALL);
+ up = intoudpcb(last);
+ if (up->u_tun_func == NULL) {
+ if (n != NULL)
+ udp_append(last,
+ ip, n,
+ iphlen +
+ sizeof(struct udphdr),
+ &udp_in);
+ } else {
+ /*
+ * Engage the tunneling protocol.  We
+ * have to leave the info_lock up,
+ * since we are hunting through
+ * multiple UDP PCBs.
+ */
+ (*up->u_tun_func)(n, iphlen, last);
+ }
+ INP_RUNLOCK(last);
+ }
+ last = inp;
+ /*
+ * Don't look for additional matches if this one does
+ * not have either the SO_REUSEPORT or SO_REUSEADDR
+ * socket options set. This heuristic avoids
+ * searching through all pcbs in the common case of a
+ * non-shared port. It assumes that an application
+ * will never clear these options after setting them.
+ */
+ if ((last->inp_socket->so_options &
+ (SO_REUSEPORT|SO_REUSEADDR)) == 0)
+ break;
+ }
+
+ if (last == NULL) {
+ /*
+ * No matching pcb found; discard datagram. (No need
+ * to send an ICMP Port Unreachable for a broadcast
+ * or multicast datagram.)
+ */
+ UDPSTAT_INC(udps_noportbcast);
+ goto badheadlocked;
+ }
+ up = intoudpcb(last);
+ if (up->u_tun_func == NULL) {
+ udp_append(last, ip, m, iphlen + sizeof(struct udphdr),
+ &udp_in);
+ } else {
+ /*
+ * Engage the tunneling protocol.
+ */
+ (*up->u_tun_func)(m, iphlen, last);
+ }
+ INP_RUNLOCK(last);
+ INP_INFO_RUNLOCK(&V_udbinfo);
+ return;
+ }
+
+ /*
+ * Locate pcb for datagram.
+ */
+ inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport,
+ ip->ip_dst, uh->uh_dport, 1, ifp);
+ if (inp == NULL) {
+ if (udp_log_in_vain) {
+ char buf[4*sizeof "123"];
+
+ strcpy(buf, inet_ntoa(ip->ip_dst));
+ log(LOG_INFO,
+ "Connection attempt to UDP %s:%d from %s:%d\n",
+ buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
+ ntohs(uh->uh_sport));
+ }
+ UDPSTAT_INC(udps_noport);
+ if (m->m_flags & (M_BCAST | M_MCAST)) {
+ UDPSTAT_INC(udps_noportbcast);
+ goto badheadlocked;
+ }
+ if (V_udp_blackhole)
+ goto badheadlocked;
+ if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
+ goto badheadlocked;
+ *ip = save_ip;
+ ip->ip_len += iphlen;
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
+ INP_INFO_RUNLOCK(&V_udbinfo);
+ return;
+ }
+
+ /*
+ * Check the minimum TTL for socket.
+ */
+ INP_RLOCK(inp);
+ INP_INFO_RUNLOCK(&V_udbinfo);
+ if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
+ INP_RUNLOCK(inp);
+ goto badunlocked;
+ }
+ up = intoudpcb(inp);
+ if (up->u_tun_func == NULL) {
+ udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in);
+ } else {
+ /*
+ * Engage the tunneling protocol.
+ */
+ (*up->u_tun_func)(m, iphlen, inp);
+ }
+ INP_RUNLOCK(inp);
+ return;
+
+badheadlocked:
+ if (inp)
+ INP_RUNLOCK(inp);
+ INP_INFO_RUNLOCK(&V_udbinfo);
+badunlocked:
+ m_freem(m);
+}
+
+/*
+ * Notify a udp user of an asynchronous error; just wake up so that they can
+ * collect error status.
+ */
+struct inpcb *
+udp_notify(struct inpcb *inp, int errno)
+{
+
+ /*
+ * While udp_ctlinput() always calls udp_notify() with a read lock
+ * when invoking it directly, in_pcbnotifyall() currently uses write
+ * locks due to sharing code with TCP. For now, accept either a read
+ * or a write lock, but a read lock is sufficient.
+ */
+ INP_LOCK_ASSERT(inp);
+
+ inp->inp_socket->so_error = errno;
+ sorwakeup(inp->inp_socket);
+ sowwakeup(inp->inp_socket);
+ return (inp);
+}
+
+void
+udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
+{
+ struct ip *ip = vip;
+ struct udphdr *uh;
+ struct in_addr faddr;
+ struct inpcb *inp;
+
+ faddr = ((struct sockaddr_in *)sa)->sin_addr;
+ if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
+ return;
+
+ /*
+ * Redirects don't need to be handled up here.
+ */
+ if (PRC_IS_REDIRECT(cmd))
+ return;
+
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ *
+ * XXX: We never get this from ICMP, otherwise it makes an excellent
+ * DoS attack on machines with many connections.
+ */
+ if (cmd == PRC_HOSTDEAD)
+ ip = NULL;
+ else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
+ return;
+ if (ip != NULL) {
+ uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
+ INP_INFO_RLOCK(&V_udbinfo);
+ inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport,
+ ip->ip_src, uh->uh_sport, 0, NULL);
+ if (inp != NULL) {
+ INP_RLOCK(inp);
+ if (inp->inp_socket != NULL) {
+ udp_notify(inp, inetctlerrmap[cmd]);
+ }
+ INP_RUNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(&V_udbinfo);
+ } else
+ in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd],
+ udp_notify);
+}
+
+static int
+udp_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, i, n;
+ struct inpcb *inp, **inp_list;
+ inp_gen_t gencnt;
+ struct xinpgen xig;
+
+ /*
+ * The process of preparing the PCB list is too time-consuming and
+ * resource-intensive to repeat twice on every request.
+ */
+ if (req->oldptr == 0) {
+ n = V_udbinfo.ipi_count;
+ n += imax(n / 8, 10);
+ req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
+ return (0);
+ }
+
+ if (req->newptr != 0)
+ return (EPERM);
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ INP_INFO_RLOCK(&V_udbinfo);
+ gencnt = V_udbinfo.ipi_gencnt;
+ n = V_udbinfo.ipi_count;
+ INP_INFO_RUNLOCK(&V_udbinfo);
+
+ error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
+ + n * sizeof(struct xinpcb));
+ if (error != 0)
+ return (error);
+
+ xig.xig_len = sizeof xig;
+ xig.xig_count = n;
+ xig.xig_gen = gencnt;
+ xig.xig_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ if (error)
+ return (error);
+
+ inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
+ if (inp_list == 0)
+ return (ENOMEM);
+
+ INP_INFO_RLOCK(&V_udbinfo);
+ for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n;
+ inp = LIST_NEXT(inp, inp_list)) {
+ INP_WLOCK(inp);
+ if (inp->inp_gencnt <= gencnt &&
+ cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
+ in_pcbref(inp);
+ inp_list[i++] = inp;
+ }
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(&V_udbinfo);
+ n = i;
+
+ error = 0;
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ INP_RLOCK(inp);
+ if (inp->inp_gencnt <= gencnt) {
+ struct xinpcb xi;
+
+ bzero(&xi, sizeof(xi));
+ xi.xi_len = sizeof xi;
+ /* XXX should avoid extra copy */
+ bcopy(inp, &xi.xi_inp, sizeof *inp);
+ if (inp->inp_socket)
+ sotoxsocket(inp->inp_socket, &xi.xi_socket);
+ xi.xi_inp.inp_gencnt = inp->inp_gencnt;
+ INP_RUNLOCK(inp);
+ error = SYSCTL_OUT(req, &xi, sizeof xi);
+ } else
+ INP_RUNLOCK(inp);
+ }
+ INP_INFO_WLOCK(&V_udbinfo);
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ INP_WLOCK(inp);
+ if (!in_pcbrele(inp))
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_WUNLOCK(&V_udbinfo);
+
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state. If the
+ * generation differs from what we told her before, she knows
+ * that something happened while we were processing this
+ * request, and it might be necessary to retry.
+ */
+ INP_INFO_RLOCK(&V_udbinfo);
+ xig.xig_gen = V_udbinfo.ipi_gencnt;
+ xig.xig_sogen = so_gencnt;
+ xig.xig_count = V_udbinfo.ipi_count;
+ INP_INFO_RUNLOCK(&V_udbinfo);
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ }
+ free(inp_list, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
+ udp_pcblist, "S,xinpcb", "List of active UDP sockets");
+
+static int
+udp_getcred(SYSCTL_HANDLER_ARGS)
+{
+ struct xucred xuc;
+ struct sockaddr_in addrs[2];
+ struct inpcb *inp;
+ int error;
+
+ error = priv_check(req->td, PRIV_NETINET_GETCRED);
+ if (error)
+ return (error);
+ error = SYSCTL_IN(req, addrs, sizeof(addrs));
+ if (error)
+ return (error);
+ INP_INFO_RLOCK(&V_udbinfo);
+ inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
+ addrs[0].sin_addr, addrs[0].sin_port, 1, NULL);
+ if (inp != NULL) {
+ INP_RLOCK(inp);
+ INP_INFO_RUNLOCK(&V_udbinfo);
+ if (inp->inp_socket == NULL)
+ error = ENOENT;
+ if (error == 0)
+ error = cr_canseeinpcb(req->td->td_ucred, inp);
+ if (error == 0)
+ cru2x(inp->inp_cred, &xuc);
+ INP_RUNLOCK(inp);
+ } else {
+ INP_INFO_RUNLOCK(&V_udbinfo);
+ error = ENOENT;
+ }
+ if (error == 0)
+ error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
+ CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
+ udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
+
+int
+udp_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ int error = 0, optval;
+ struct inpcb *inp;
+#ifdef IPSEC_NAT_T
+ struct udpcb *up;
+#endif
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
+ INP_WLOCK(inp);
+ if (sopt->sopt_level != IPPROTO_UDP) {
+#ifdef INET6
+ if (INP_CHECK_SOCKAF(so, AF_INET6)) {
+ INP_WUNLOCK(inp);
+ error = ip6_ctloutput(so, sopt);
+ } else {
+#endif
+ INP_WUNLOCK(inp);
+ error = ip_ctloutput(so, sopt);
+#ifdef INET6
+ }
+#endif
+ return (error);
+ }
+
+ switch (sopt->sopt_dir) {
+ case SOPT_SET:
+ switch (sopt->sopt_name) {
+ case UDP_ENCAP:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ break;
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
+ INP_WLOCK(inp);
+#ifdef IPSEC_NAT_T
+ up = intoudpcb(inp);
+ KASSERT(up != NULL, ("%s: up == NULL", __func__));
+#endif
+ switch (optval) {
+ case 0:
+ /* Clear all UDP encap. */
+#ifdef IPSEC_NAT_T
+ up->u_flags &= ~UF_ESPINUDP_ALL;
+#endif
+ break;
+#ifdef IPSEC_NAT_T
+ case UDP_ENCAP_ESPINUDP:
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ up->u_flags &= ~UF_ESPINUDP_ALL;
+ if (optval == UDP_ENCAP_ESPINUDP)
+ up->u_flags |= UF_ESPINUDP;
+ else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE)
+ up->u_flags |= UF_ESPINUDP_NON_IKE;
+ break;
+#endif
+ default:
+ error = EINVAL;
+ break;
+ }
+ INP_WUNLOCK(inp);
+ break;
+ default:
+ INP_WUNLOCK(inp);
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+ case SOPT_GET:
+ switch (sopt->sopt_name) {
+#ifdef IPSEC_NAT_T
+ case UDP_ENCAP:
+ up = intoudpcb(inp);
+ KASSERT(up != NULL, ("%s: up == NULL", __func__));
+ optval = up->u_flags & UF_ESPINUDP_ALL;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+#endif
+ default:
+ INP_WUNLOCK(inp);
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+ }
+ return (error);
+}
+
+static int
+udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
+ struct mbuf *control, struct thread *td)
+{
+ struct udpiphdr *ui;
+ int len = m->m_pkthdr.len;
+ struct in_addr faddr, laddr;
+ struct cmsghdr *cm;
+ struct sockaddr_in *sin, src;
+ int error = 0;
+ int ipflags;
+ u_short fport, lport;
+ int unlock_udbinfo;
+
+ /*
+ * udp_output() may need to temporarily bind or connect the current
+ * inpcb. As such, we don't know up front whether we will need the
+ * pcbinfo lock or not. Do any work to decide what is needed up
+ * front before acquiring any locks.
+ */
+ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
+ if (control)
+ m_freem(control);
+ m_freem(m);
+ return (EMSGSIZE);
+ }
+
+ src.sin_family = 0;
+ if (control != NULL) {
+ /*
+ * XXX: Currently, we assume all the optional information is
+ * stored in a single mbuf.
+ */
+ if (control->m_next) {
+ m_freem(control);
+ m_freem(m);
+ return (EINVAL);
+ }
+ for (; control->m_len > 0;
+ control->m_data += CMSG_ALIGN(cm->cmsg_len),
+ control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
+ cm = mtod(control, struct cmsghdr *);
+ if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
+ || cm->cmsg_len > control->m_len) {
+ error = EINVAL;
+ break;
+ }
+ if (cm->cmsg_level != IPPROTO_IP)
+ continue;
+
+ switch (cm->cmsg_type) {
+ case IP_SENDSRCADDR:
+ if (cm->cmsg_len !=
+ CMSG_LEN(sizeof(struct in_addr))) {
+ error = EINVAL;
+ break;
+ }
+ bzero(&src, sizeof(src));
+ src.sin_family = AF_INET;
+ src.sin_len = sizeof(src);
+ src.sin_port = inp->inp_lport;
+ src.sin_addr =
+ *(struct in_addr *)CMSG_DATA(cm);
+ break;
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ if (error)
+ break;
+ }
+ m_freem(control);
+ }
+ if (error) {
+ m_freem(m);
+ return (error);
+ }
+
+ /*
+ * Depending on whether or not the application has bound or connected
+ * the socket, we may have to do varying levels of work. The optimal
+ * case is for a connected UDP socket, as a global lock isn't
+ * required at all.
+ *
+ * In order to decide which we need, we require stability of the
+ * inpcb binding, which we ensure by acquiring a read lock on the
+ * inpcb. This doesn't strictly follow the lock order, so we play
+ * the trylock and retry game; note that we may end up with more
+ * conservative locks than required the second time around, so later
+ * assertions have to accept that. Further analysis of the number of
+ * misses under contention is required.
+ */
+ sin = (struct sockaddr_in *)addr;
+ INP_RLOCK(inp);
+ if (sin != NULL &&
+ (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
+ INP_RUNLOCK(inp);
+ INP_INFO_WLOCK(&V_udbinfo);
+ INP_WLOCK(inp);
+ unlock_udbinfo = 2;
+ } else if ((sin != NULL && (
+ (sin->sin_addr.s_addr == INADDR_ANY) ||
+ (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
+ (inp->inp_laddr.s_addr == INADDR_ANY) ||
+ (inp->inp_lport == 0))) ||
+ (src.sin_family == AF_INET)) {
+ if (!INP_INFO_TRY_RLOCK(&V_udbinfo)) {
+ INP_RUNLOCK(inp);
+ INP_INFO_RLOCK(&V_udbinfo);
+ INP_RLOCK(inp);
+ }
+ unlock_udbinfo = 1;
+ } else
+ unlock_udbinfo = 0;
+
+ /*
+ * If the IP_SENDSRCADDR control message was specified, override the
+ * source address for this datagram. Its use is invalidated if the
+ * address thus specified is incomplete or clobbers other inpcbs.
+ */
+ laddr = inp->inp_laddr;
+ lport = inp->inp_lport;
+ if (src.sin_family == AF_INET) {
+ INP_INFO_LOCK_ASSERT(&V_udbinfo);
+ if ((lport == 0) ||
+ (laddr.s_addr == INADDR_ANY &&
+ src.sin_addr.s_addr == INADDR_ANY)) {
+ error = EINVAL;
+ goto release;
+ }
+ error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
+ &laddr.s_addr, &lport, td->td_ucred);
+ if (error)
+ goto release;
+ }
+
+ /*
+ * If a UDP socket has been connected, then a local address/port will
+ * have been selected and bound.
+ *
+ * If a UDP socket has not been connected to, then an explicit
+ * destination address must be used, in which case a local
+ * address/port may not have been selected and bound.
+ */
+ if (sin != NULL) {
+ INP_LOCK_ASSERT(inp);
+ if (inp->inp_faddr.s_addr != INADDR_ANY) {
+ error = EISCONN;
+ goto release;
+ }
+
+ /*
+ * Jail may rewrite the destination address, so let it do
+ * that before we use it.
+ */
+ error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
+ if (error)
+ goto release;
+
+ /*
+ * If a local address or port hasn't yet been selected, or if
+ * the destination address needs to be rewritten due to using
+ * a special INADDR_ constant, invoke in_pcbconnect_setup()
+ * to do the heavy lifting. Once a port is selected, we
+ * commit the binding back to the socket; we also commit the
+ * binding of the address if in jail.
+ *
+ * If we already have a valid binding and we're not
+ * requesting a destination address rewrite, use a fast path.
+ */
+ if (inp->inp_laddr.s_addr == INADDR_ANY ||
+ inp->inp_lport == 0 ||
+ sin->sin_addr.s_addr == INADDR_ANY ||
+ sin->sin_addr.s_addr == INADDR_BROADCAST) {
+ INP_INFO_LOCK_ASSERT(&V_udbinfo);
+ error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
+ &lport, &faddr.s_addr, &fport, NULL,
+ td->td_ucred);
+ if (error)
+ goto release;
+
+ /*
+ * XXXRW: Why not commit the port if the address is
+ * !INADDR_ANY?
+ */
+ /* Commit the local port if newly assigned. */
+ if (inp->inp_laddr.s_addr == INADDR_ANY &&
+ inp->inp_lport == 0) {
+ INP_INFO_WLOCK_ASSERT(&V_udbinfo);
+ INP_WLOCK_ASSERT(inp);
+ /*
+ * Remember addr if jailed, to prevent
+ * rebinding.
+ */
+ if (prison_flag(td->td_ucred, PR_IP4))
+ inp->inp_laddr = laddr;
+ inp->inp_lport = lport;
+ if (in_pcbinshash(inp) != 0) {
+ inp->inp_lport = 0;
+ error = EAGAIN;
+ goto release;
+ }
+ inp->inp_flags |= INP_ANONPORT;
+ }
+ } else {
+ faddr = sin->sin_addr;
+ fport = sin->sin_port;
+ }
+ } else {
+ INP_LOCK_ASSERT(inp);
+ faddr = inp->inp_faddr;
+ fport = inp->inp_fport;
+ if (faddr.s_addr == INADDR_ANY) {
+ error = ENOTCONN;
+ goto release;
+ }
+ }
+
+ /*
+ * Calculate the data length and get an mbuf for the UDP, IP, and
+ * possible link-layer headers.  Immediately slide the data pointer
+ * forward again, since we won't use that space at this layer.
+ */
+ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto release;
+ }
+ m->m_data += max_linkhdr;
+ m->m_len -= max_linkhdr;
+ m->m_pkthdr.len -= max_linkhdr;
+
+ /*
+ * Fill in the mbuf with an extended UDP header, with the addresses
+ * and length in network byte order.
+ */
+ ui = mtod(m, struct udpiphdr *);
+ bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */
+ ui->ui_pr = IPPROTO_UDP;
+ ui->ui_src = laddr;
+ ui->ui_dst = faddr;
+ ui->ui_sport = lport;
+ ui->ui_dport = fport;
+ ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
+
+ /*
+ * Set the Don't Fragment bit in the IP header.
+ */
+ if (inp->inp_flags & INP_DONTFRAG) {
+ struct ip *ip;
+
+ ip = (struct ip *)&ui->ui_i;
+ ip->ip_off |= IP_DF;
+ }
+
+ ipflags = 0;
+ if (inp->inp_socket->so_options & SO_DONTROUTE)
+ ipflags |= IP_ROUTETOIF;
+ if (inp->inp_socket->so_options & SO_BROADCAST)
+ ipflags |= IP_ALLOWBROADCAST;
+ if (inp->inp_flags & INP_ONESBCAST)
+ ipflags |= IP_SENDONES;
+
+#ifdef MAC
+ mac_inpcb_create_mbuf(inp, m);
+#endif
+
+ /*
+ * Set up checksum and output datagram.
+ */
+ if (udp_cksum) {
+ if (inp->inp_flags & INP_ONESBCAST)
+ faddr.s_addr = INADDR_BROADCAST;
+ ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
+ htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
+ m->m_pkthdr.csum_flags = CSUM_UDP;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ } else
+ ui->ui_sum = 0;
+ ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len;
+ ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */
+ ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */
+ UDPSTAT_INC(udps_opackets);
+
+ if (unlock_udbinfo == 2)
+ INP_INFO_WUNLOCK(&V_udbinfo);
+ else if (unlock_udbinfo == 1)
+ INP_INFO_RUNLOCK(&V_udbinfo);
+ error = ip_output(m, inp->inp_options, NULL, ipflags,
+ inp->inp_moptions, inp);
+ if (unlock_udbinfo == 2)
+ INP_WUNLOCK(inp);
+ else
+ INP_RUNLOCK(inp);
+ return (error);
+
+release:
+ if (unlock_udbinfo == 2) {
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_udbinfo);
+ } else if (unlock_udbinfo == 1) {
+ INP_RUNLOCK(inp);
+ INP_INFO_RUNLOCK(&V_udbinfo);
+ } else
+ INP_RUNLOCK(inp);
+ m_freem(m);
+ return (error);
+}
+
+
+#if defined(IPSEC) && defined(IPSEC_NAT_T)
+#ifdef INET
+/*
+ * Potentially decap ESP in UDP frame. Check for an ESP header
+ * and optional marker; if present, strip the UDP header and
+ * push the result through IPSec.
+ *
+ * Returns mbuf to be processed (potentially re-allocated) or
+ * NULL if consumed and/or processed.
+ */
+static struct mbuf *
+udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
+{
+ size_t minlen, payload, skip, iphlen;
+ caddr_t data;
+ struct udpcb *up;
+ struct m_tag *tag;
+ struct udphdr *udphdr;
+ struct ip *ip;
+
+ INP_RLOCK_ASSERT(inp);
+
+ /*
+ * Pull up data so the longest case is contiguous:
+ * IP/UDP hdr + non-ESP marker + ESP hdr.
+ */
+ minlen = off + sizeof(uint64_t) + sizeof(struct esp);
+ if (minlen > m->m_pkthdr.len)
+ minlen = m->m_pkthdr.len;
+ if ((m = m_pullup(m, minlen)) == NULL) {
+ V_ipsec4stat.in_inval++;
+ return (NULL); /* Bypass caller processing. */
+ }
+ data = mtod(m, caddr_t); /* Points to ip header. */
+ payload = m->m_len - off; /* Size of payload. */
+
+ if (payload == 1 && data[off] == '\xff')
+ return (m); /* NB: keepalive packet, no decap. */
+
+ up = intoudpcb(inp);
+ KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
+ KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0,
+ ("u_flags 0x%x", up->u_flags));
+
+ /*
+ * Check that the payload is large enough to hold an
+ * ESP header and compute the amount of data to remove.
+ *
+ * NB: the caller has already done a pullup for us.
+ * XXX can we assume alignment and eliminate bcopys?
+ */
+ if (up->u_flags & UF_ESPINUDP_NON_IKE) {
+ /*
+ * draft-ietf-ipsec-nat-t-ike-0[01].txt and
+ * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring
+ * possible AH mode non-IKE marker+non-ESP marker
+ * from draft-ietf-ipsec-udp-encaps-00.txt.
+ */
+ uint64_t marker;
+
+ if (payload <= sizeof(uint64_t) + sizeof(struct esp))
+ return (m); /* NB: no decap. */
+ bcopy(data + off, &marker, sizeof(uint64_t));
+ if (marker != 0) /* Non-IKE marker. */
+ return (m); /* NB: no decap. */
+ skip = sizeof(uint64_t) + sizeof(struct udphdr);
+ } else {
+ uint32_t spi;
+
+ if (payload <= sizeof(struct esp)) {
+ V_ipsec4stat.in_inval++;
+ m_freem(m);
+ return (NULL); /* Discard. */
+ }
+ bcopy(data + off, &spi, sizeof(uint32_t));
+ if (spi == 0) /* Non-ESP marker. */
+ return (m); /* NB: no decap. */
+ skip = sizeof(struct udphdr);
+ }
+
+ /*
+ * Set up a PACKET_TAG_IPSEC_NAT_T_PORTS tag to remember
+ * the UDP ports.  This is required if we want to select
+ * the right SPD for multiple hosts behind same NAT.
+ *
+ * NB: ports are maintained in network byte order everywhere
+ * in the NAT-T code.
+ */
+ tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
+ 2 * sizeof(uint16_t), M_NOWAIT);
+ if (tag == NULL) {
+ V_ipsec4stat.in_nomem++;
+ m_freem(m);
+ return (NULL); /* Discard. */
+ }
+ iphlen = off - sizeof(struct udphdr);
+ udphdr = (struct udphdr *)(data + iphlen);
+ ((uint16_t *)(tag + 1))[0] = udphdr->uh_sport;
+ ((uint16_t *)(tag + 1))[1] = udphdr->uh_dport;
+ m_tag_prepend(m, tag);
+
+ /*
+ * Remove the UDP header (and possibly the non-ESP marker).
+ * The IP header length is iphlen.
+ * Before:
+ * <--- off --->
+ * +----+------+-----+
+ * | IP | UDP | ESP |
+ * +----+------+-----+
+ * <-skip->
+ * After:
+ * +----+-----+
+ * | IP | ESP |
+ * +----+-----+
+ * <-skip->
+ */
+ ovbcopy(data, data + skip, iphlen);
+ m_adj(m, skip);
+
+ ip = mtod(m, struct ip *);
+ ip->ip_len -= skip;
+ ip->ip_p = IPPROTO_ESP;
+
+ /*
+ * We cannot yet update the cksums so clear any
+ * h/w cksum flags as they are no longer valid.
+ */
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)
+ m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
+
+ (void) ipsec4_common_input(m, iphlen, ip->ip_p);
+ return (NULL); /* NB: consumed, bypass processing. */
+}
+#endif /* INET */
+#endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */
+
+static void
+udp_abort(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
+ INP_INFO_WLOCK(&V_udbinfo);
+ INP_WLOCK(inp);
+ if (inp->inp_faddr.s_addr != INADDR_ANY) {
+ in_pcbdisconnect(inp);
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ soisdisconnected(so);
+ }
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_udbinfo);
+}
+
+static int
+udp_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct inpcb *inp;
+ int error;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
+ error = soreserve(so, udp_sendspace, udp_recvspace);
+ if (error)
+ return (error);
+ INP_INFO_WLOCK(&V_udbinfo);
+ error = in_pcballoc(so, &V_udbinfo);
+ if (error) {
+ INP_INFO_WUNLOCK(&V_udbinfo);
+ return (error);
+ }
+
+ inp = sotoinpcb(so);
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_ip_ttl = V_ip_defttl;
+
+ error = udp_newudpcb(inp);
+ if (error) {
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+ INP_INFO_WUNLOCK(&V_udbinfo);
+ return (error);
+ }
+
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_udbinfo);
+ return (0);
+}
+
+int
+udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f)
+{
+ struct inpcb *inp;
+ struct udpcb *up;
+
+ KASSERT(so->so_type == SOCK_DGRAM,
+ ("udp_set_kernel_tunneling: !dgram"));
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
+ INP_WLOCK(inp);
+ up = intoudpcb(inp);
+ if (up->u_tun_func != NULL) {
+ INP_WUNLOCK(inp);
+ return (EBUSY);
+ }
+ up->u_tun_func = f;
+ INP_WUNLOCK(inp);
+ return (0);
+}
+
+static int
+udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp;
+ int error;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
+ INP_INFO_WLOCK(&V_udbinfo);
+ INP_WLOCK(inp);
+ error = in_pcbbind(inp, nam, td->td_ucred);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_udbinfo);
+ return (error);
+}
+
+static void
+udp_close(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("udp_close: inp == NULL"));
+ INP_INFO_WLOCK(&V_udbinfo);
+ INP_WLOCK(inp);
+ if (inp->inp_faddr.s_addr != INADDR_ANY) {
+ in_pcbdisconnect(inp);
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ soisdisconnected(so);
+ }
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_udbinfo);
+}
+
+static int
+udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp;
+ int error;
+ struct sockaddr_in *sin;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
+ INP_INFO_WLOCK(&V_udbinfo);
+ INP_WLOCK(inp);
+ if (inp->inp_faddr.s_addr != INADDR_ANY) {
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_udbinfo);
+ return (EISCONN);
+ }
+ sin = (struct sockaddr_in *)nam;
+ error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
+ if (error != 0) {
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_udbinfo);
+ return (error);
+ }
+ error = in_pcbconnect(inp, nam, td->td_ucred);
+ if (error == 0)
+ soisconnected(so);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_udbinfo);
+ return (error);
+}
+
+static void
+udp_detach(struct socket *so)
+{
+ struct inpcb *inp;
+ struct udpcb *up;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
+ KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
+ ("udp_detach: not disconnected"));
+ INP_INFO_WLOCK(&V_udbinfo);
+ INP_WLOCK(inp);
+ up = intoudpcb(inp);
+ KASSERT(up != NULL, ("%s: up == NULL", __func__));
+ inp->inp_ppcb = NULL;
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+ INP_INFO_WUNLOCK(&V_udbinfo);
+ udp_discardcb(up);
+}
+
+static int
+udp_disconnect(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
+ INP_INFO_WLOCK(&V_udbinfo);
+ INP_WLOCK(inp);
+ if (inp->inp_faddr.s_addr == INADDR_ANY) {
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_udbinfo);
+ return (ENOTCONN);
+ }
+
+ in_pcbdisconnect(inp);
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_ISCONNECTED; /* XXX */
+ SOCK_UNLOCK(so);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_udbinfo);
+ return (0);
+}
+
+static int
+udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
+ struct mbuf *control, struct thread *td)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("udp_send: inp == NULL"));
+ return (udp_output(inp, m, addr, control, td));
+}
+
+int
+udp_shutdown(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
+ INP_WLOCK(inp);
+ socantsendmore(so);
+ INP_WUNLOCK(inp);
+ return (0);
+}
+
+struct pr_usrreqs udp_usrreqs = {
+ .pru_abort = udp_abort,
+ .pru_attach = udp_attach,
+ .pru_bind = udp_bind,
+ .pru_connect = udp_connect,
+ .pru_control = in_control,
+ .pru_detach = udp_detach,
+ .pru_disconnect = udp_disconnect,
+ .pru_peeraddr = in_getpeeraddr,
+ .pru_send = udp_send,
+ .pru_soreceive = soreceive_dgram,
+ .pru_sosend = sosend_dgram,
+ .pru_shutdown = udp_shutdown,
+ .pru_sockaddr = in_getsockaddr,
+ .pru_sosetlabel = in_pcbsosetlabel,
+ .pru_close = udp_close,
+};
diff --git a/freebsd/sys/netinet/udp_var.h b/freebsd/sys/netinet/udp_var.h
new file mode 100644
index 00000000..0bff6ea9
--- /dev/null
+++ b/freebsd/sys/netinet/udp_var.h
@@ -0,0 +1,161 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)udp_var.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_UDP_VAR_HH_
+#define _NETINET_UDP_VAR_HH_
+
+/*
+ * UDP kernel structures and variables.
+ */
+struct udpiphdr {
+ struct ipovly ui_i; /* overlaid ip structure */
+ struct udphdr ui_u; /* udp header */
+};
+#define ui_x1 ui_i.ih_x1
+#define ui_pr ui_i.ih_pr
+#define ui_len ui_i.ih_len
+#define ui_src ui_i.ih_src
+#define ui_dst ui_i.ih_dst
+#define ui_sport ui_u.uh_sport
+#define ui_dport ui_u.uh_dport
+#define ui_ulen ui_u.uh_ulen
+#define ui_sum ui_u.uh_sum
+
+typedef void(*udp_tun_func_t)(struct mbuf *, int off, struct inpcb *);
+
+/*
+ * UDP control block; one per udp.
+ */
+struct udpcb {
+ udp_tun_func_t u_tun_func; /* UDP kernel tunneling callback. */
+ u_int u_flags; /* Generic UDP flags. */
+};
+
+#define intoudpcb(ip) ((struct udpcb *)(ip)->inp_ppcb)
+#define sotoudpcb(so) (intoudpcb(sotoinpcb(so)))
+
+ /* IPsec: ESP in UDP tunneling: */
+#define UF_ESPINUDP_NON_IKE 0x00000001 /* w/ non-IKE marker .. */
+ /* .. per draft-ietf-ipsec-nat-t-ike-0[01],
+ * and draft-ietf-ipsec-udp-encaps-(00/)01.txt */
+#define UF_ESPINUDP 0x00000002 /* w/ non-ESP marker. */
+
+struct udpstat {
+ /* input statistics: */
+ u_long udps_ipackets; /* total input packets */
+ u_long udps_hdrops; /* packet shorter than header */
+ u_long udps_badsum; /* checksum error */
+ u_long udps_nosum; /* no checksum */
+ u_long udps_badlen; /* data length larger than packet */
+ u_long udps_noport; /* no socket on port */
+ u_long udps_noportbcast; /* of above, arrived as broadcast */
+ u_long udps_fullsock; /* not delivered, input socket full */
+ u_long udpps_pcbcachemiss; /* input packets missing pcb cache */
+ u_long udpps_pcbhashmiss; /* input packets not for hashed pcb */
+ /* output statistics: */
+ u_long udps_opackets; /* total output packets */
+ u_long udps_fastout; /* output packets on fast path */
+ /* of no socket on port, arrived as multicast */
+ u_long udps_noportmcast;
+ u_long udps_filtermcast; /* blocked by multicast filter */
+};
+
+#ifdef _KERNEL
+/*
+ * In-kernel consumers can use these accessor macros directly to update
+ * stats.
+ */
+#define UDPSTAT_ADD(name, val) V_udpstat.name += (val)
+#define UDPSTAT_INC(name) UDPSTAT_ADD(name, 1)
+
+/*
+ * Kernel module consumers must use this accessor macro.
+ */
+void kmod_udpstat_inc(int statnum);
+#define KMOD_UDPSTAT_INC(name) \
+ kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(u_long))
+#endif
+
+/*
+ * Names for UDP sysctl objects.
+ */
+#define UDPCTL_CHECKSUM 1 /* checksum UDP packets */
+#define UDPCTL_STATS 2 /* statistics (read-only) */
+#define UDPCTL_MAXDGRAM 3 /* max datagram size */
+#define UDPCTL_RECVSPACE 4 /* default receive buffer space */
+#define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */
+#define UDPCTL_MAXID 6
+
+#define UDPCTL_NAMES { \
+ { 0, 0 }, \
+ { "checksum", CTLTYPE_INT }, \
+ { "stats", CTLTYPE_STRUCT }, \
+ { "maxdgram", CTLTYPE_INT }, \
+ { "recvspace", CTLTYPE_INT }, \
+ { "pcblist", CTLTYPE_STRUCT }, \
+}
+
+#ifdef _KERNEL
+SYSCTL_DECL(_net_inet_udp);
+
+extern struct pr_usrreqs udp_usrreqs;
+VNET_DECLARE(struct inpcbhead, udb);
+VNET_DECLARE(struct inpcbinfo, udbinfo);
+#define V_udb VNET(udb)
+#define V_udbinfo VNET(udbinfo)
+
+extern u_long udp_sendspace;
+extern u_long udp_recvspace;
+VNET_DECLARE(struct udpstat, udpstat);
+VNET_DECLARE(int, udp_blackhole);
+#define V_udpstat VNET(udpstat)
+#define V_udp_blackhole VNET(udp_blackhole)
+extern int udp_log_in_vain;
+
+int udp_newudpcb(struct inpcb *);
+void udp_discardcb(struct udpcb *);
+
+void udp_ctlinput(int, struct sockaddr *, void *);
+int udp_ctloutput(struct socket *, struct sockopt *);
+void udp_init(void);
+#ifdef VIMAGE
+void udp_destroy(void);
+#endif
+void udp_input(struct mbuf *, int);
+struct inpcb *udp_notify(struct inpcb *inp, int errno);
+int udp_shutdown(struct socket *so);
+
+int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f);
+#endif
+
+#endif